diff --git a/src/RealCV.Application/Data/UKHistoricalEmployers.cs b/src/RealCV.Application/Data/UKHistoricalEmployers.cs
new file mode 100644
index 0000000..7175812
--- /dev/null
+++ b/src/RealCV.Application/Data/UKHistoricalEmployers.cs
@@ -0,0 +1,448 @@
+namespace RealCV.Application.Data;
+
+/// <summary>
+/// Database of historical UK employers that may no longer exist under their original names.
+/// Includes companies that were acquired, merged, dissolved, or renamed.
+/// Also includes public sector bodies and internal divisions of larger organisations.
+/// </summary>
+public static class UKHistoricalEmployers
+{
+ /// <summary>
+ /// Maps historical company names to their current/successor company information.
+ /// Key: historical name (case-insensitive, <see cref="StringComparer.OrdinalIgnoreCase"/>).
+ /// Value: <see cref="HistoricalEmployerInfo"/>; its company number is null where none is recorded.
+ /// </summary>
+ public static readonly Dictionary<string, HistoricalEmployerInfo> HistoricalCompanies =
+     new(StringComparer.OrdinalIgnoreCase)
+     {
+         // Engineering & Construction
+         ["Foster Wheeler"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
+         ["Foster Wheeler Ltd"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
+         ["Foster Wheeler Limited"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
+         ["Foster Wheeler PLC"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
+         ["Sir Alexander Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy (founded 1922), acquired by Jacobs", null),
+         ["Alexander Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy (founded 1922), acquired by Jacobs", null),
+         ["Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy, acquired by Jacobs", null),
+         ["Mott MacDonald"] = new("Mott MacDonald", "Still trading - major engineering consultancy", "01243967"),
+         ["Ove Arup"] = new("Arup", "Still trading as Arup", "01312453"),
+         ["Arup"] = new("Arup", "Major engineering consultancy", "01312453"),
+         ["WS Atkins"] = new("SNC-Lavalin / Atkins", "Acquired by SNC-Lavalin in 2017", "01885586"),
+         ["Atkins"] = new("SNC-Lavalin / Atkins", "Acquired by SNC-Lavalin in 2017", "01885586"),
+
+         // Pharmaceuticals
+         ["Glaxo"] = new("GlaxoSmithKline (GSK)", "Merged with SmithKline Beecham in 2000 to form GSK", "03888792"),
+         ["Glaxo Research & Development"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
+         ["Glaxo Research & Development Ltd"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
+         ["Glaxo Research and Development"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
+         ["Glaxo Wellcome"] = new("GlaxoSmithKline (GSK)", "Formed 1995 (Glaxo + Wellcome), merged with SmithKline Beecham 2000", "03888792"),
+         ["SmithKline Beecham"] = new("GlaxoSmithKline (GSK)", "Merged with Glaxo Wellcome in 2000 to form GSK", "03888792"),
+         ["Beecham"] = new("GlaxoSmithKline (GSK)", "Merged to form SmithKline Beecham, then GSK", "03888792"),
+         ["Wellcome"] = new("GlaxoSmithKline (GSK)", "Acquired by Glaxo in 1995", "03888792"),
+         ["ICI Pharmaceuticals"] = new("AstraZeneca", "ICI pharma division became Zeneca, merged with Astra 1999", "02723534"),
+         ["Zeneca"] = new("AstraZeneca", "Merged with Astra in 1999", "02723534"),
+
+         // Banking & Finance (historical names)
+         ["Midland Bank"] = new("HSBC UK", "Acquired by HSBC in 1992", "00014259"),
+         ["National Westminster Bank"] = new("NatWest (RBS Group)", "Acquired by RBS in 2000", "00929027"),
+         ["NatWest"] = new("NatWest Group", "Part of NatWest Group (formerly RBS)", "00929027"),
+         ["Lloyds Bank"] = new("Lloyds Banking Group", "Part of Lloyds Banking Group", "00002065"),
+         ["Lloyds TSB"] = new("Lloyds Banking Group", "Rebranded to Lloyds Bank in 2013", "00002065"),
+         ["TSB"] = new("TSB Bank", "Demerged from Lloyds in 2013, acquired by Sabadell", "SC205310"),
+         ["Halifax"] = new("Halifax (Lloyds Banking Group)", "Part of Lloyds Banking Group since 2009", "02367076"),
+         ["HBOS"] = new("Lloyds Banking Group", "Acquired by Lloyds in 2009", "SC218813"),
+         ["Bank of Scotland"] = new("Bank of Scotland (Lloyds Banking Group)", "Part of Lloyds Banking Group", "SC327000"),
+         ["Abbey National"] = new("Santander UK", "Acquired by Santander in 2004", "02294747"),
+         ["Alliance & Leicester"] = new("Santander UK", "Acquired by Santander in 2008", "03263713"),
+         ["Bradford & Bingley"] = new("Santander UK (savings) / UKAR (mortgages)", "Nationalised 2008, split up", "00189520"),
+         ["Northern Rock"] = new("Virgin Money UK", "Nationalised 2008, sold to Virgin Money 2012", "03273685"),
+
+         // Retail
+         ["Woolworths"] = new("Dissolved", "UK Woolworths went into administration in 2008", "00106966"),
+         ["British Home Stores"] = new("Dissolved", "BHS went into administration in 2016", "00229606"),
+         ["BHS"] = new("Dissolved", "BHS went into administration in 2016", "00229606"),
+         ["Littlewoods"] = new("Shop Direct / The Very Group", "Stores closed, online business continued", null),
+         ["Comet"] = new("Dissolved", "Electrical retailer went into administration in 2012", null),
+         ["MFI"] = new("Dissolved", "Furniture retailer went into administration in 2008", null),
+         ["Courts"] = new("Dissolved", "Furniture retailer ceased UK operations", null),
+         ["Safeway"] = new("Morrisons", "UK stores acquired by Morrisons in 2004", "00358949"),
+         ["Kwik Save"] = new("Dissolved", "Supermarket chain dissolved in 2007", null),
+         ["Fine Fare"] = new("Dissolved", "Supermarket chain - stores sold to various buyers", null),
+         ["Gateway"] = new("Somerfield / Co-op", "Became Somerfield, then acquired by Co-op", null),
+         ["Somerfield"] = new("Co-operative Group", "Acquired by Co-op in 2009", null),
+
+         // Telecoms
+         ["British Telecom"] = new("BT Group", "Rebranded to BT", "01800000"),
+         ["GPO Telephones"] = new("BT Group", "Became British Telecom, then BT", "01800000"),
+         ["Mercury Communications"] = new("Cable & Wireless / Vodafone", "Merged into Cable & Wireless, later Vodafone", null),
+         ["Cellnet"] = new("O2 (Virgin Media O2)", "Became BT Cellnet, then O2", null),
+         ["Orange"] = new("EE (BT)", "Merged with T-Mobile to form EE, acquired by BT", null),
+         ["T-Mobile UK"] = new("EE (BT)", "Merged with Orange to form EE", null),
+         ["One2One"] = new("EE (BT)", "Became T-Mobile UK, then EE", null),
+
+         // Utilities
+         ["Central Electricity Generating Board"] = new("National Grid / Various generators", "CEGB privatised and split in 1990", null),
+         ["CEGB"] = new("National Grid / Various generators", "CEGB privatised and split in 1990", null),
+         ["British Gas"] = new("Centrica / National Grid", "Demerged in 1997", "00029782"),
+         ["Eastern Electricity"] = new("EDF Energy", "Privatised, now part of EDF", null),
+         ["London Electricity"] = new("EDF Energy", "Privatised, now part of EDF", null),
+         ["SEEBOARD"] = new("EDF Energy", "Privatised, now part of EDF", null),
+         ["PowerGen"] = new("E.ON UK", "Acquired by E.ON", null),
+         ["National Power"] = new("RWE npower / Innogy", "Split and acquired", null),
+
+         // Manufacturing & Industrial
+         ["British Steel"] = new("Tata Steel UK / British Steel (2016)", "Privatised, acquired by Corus then Tata, British Steel name revived 2016", "12303256"),
+         ["British Steel Corporation"] = new("Tata Steel UK / British Steel (2016)", "Nationalised steel industry, privatised 1988", "12303256"),
+         ["British Steel plc"] = new("Tata Steel UK / British Steel (2016)", "Merged with Hoogovens to form Corus 1999", "12303256"),
+         ["Corus"] = new("Tata Steel UK", "Acquired by Tata Steel in 2007", null),
+         ["British Leyland"] = new("Various (BMW, Tata, etc.)", "Split up - brands went to various owners", null),
+         ["Rover Group"] = new("Dissolved", "Final owner MG Rover went bankrupt 2005", null),
+         ["MG Rover"] = new("Dissolved", "Went into administration in 2005", null),
+         ["Austin Rover"] = new("Dissolved", "Part of British Leyland, became Rover Group", null),
+         ["British Aerospace"] = new("BAE Systems", "Merged with Marconi Electronic Systems in 1999", "01470151"),
+         ["BAe"] = new("BAE Systems", "Merged with Marconi Electronic Systems in 1999", "01470151"),
+         ["Marconi"] = new("BAE Systems / Ericsson", "Defence division to BAE, telecoms to Ericsson", null),
+         ["GEC"] = new("Various", "General Electric Company (UK) - broken up", null),
+         ["GEC Marconi"] = new("BAE Systems", "Defence business became part of BAE Systems", "01470151"),
+         ["Plessey"] = new("Siemens / various", "Broken up in 1989", null),
+         ["ICL"] = new("Fujitsu", "Acquired by Fujitsu", null),
+         ["International Computers Limited"] = new("Fujitsu", "Acquired by Fujitsu in 2002", null),
+         ["Ferranti"] = new("Dissolved", "Collapsed in 1993 after fraud scandal", null),
+
+         // Oil & Gas
+         ["British Petroleum"] = new("BP", "Rebranded to BP", "00102498"),
+         ["BP Amoco"] = new("BP", "Merged 1998, rebranded to just BP", "00102498"),
+         ["Enterprise Oil"] = new("Shell", "Acquired by Shell in 2002", null),
+         ["Lasmo"] = new("Eni", "Acquired by Eni in 2001", null),
+         ["Britoil"] = new("BP", "Acquired by BP in 1988", null),
+
+         // Transport
+         ["British Rail"] = new("Various (Network Rail, TOCs)", "Privatised and split in 1990s", null),
+         ["British Railways"] = new("Various (Network Rail, TOCs)", "Became British Rail, then privatised", null),
+         ["Railtrack"] = new("Network Rail", "Replaced by Network Rail in 2002", "04402220"),
+         ["British Airways"] = new("British Airways (IAG)", "Now part of International Airlines Group", "01777777"),
+         ["British Caledonian"] = new("British Airways", "Acquired by BA in 1987", null),
+         ["British European Airways"] = new("British Airways", "Merged with BOAC to form BA in 1974", null),
+         ["BEA"] = new("British Airways", "Merged with BOAC to form BA in 1974", null),
+         ["BOAC"] = new("British Airways", "Merged with BEA to form BA in 1974", null),
+         ["British Overseas Airways Corporation"] = new("British Airways", "Merged with BEA to form BA in 1974", null),
+         ["Dan-Air"] = new("British Airways", "Acquired by BA in 1992", null),
+
+         // Media
+         ["Thames Television"] = new("Fremantle", "Lost franchise 1991, production continued", null),
+         ["Granada Television"] = new("ITV plc", "Merged to form ITV plc", "04967001"),
+         ["Carlton Television"] = new("ITV plc", "Merged with Granada to form ITV", "04967001"),
+         ["Yorkshire Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
+         ["Tyne Tees Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
+         ["Central Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
+         ["Anglia Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
+         ["HTV"] = new("ITV plc", "Part of ITV plc", "04967001"),
+         ["LWT"] = new("ITV plc", "London Weekend Television, part of ITV", "04967001"),
+         ["London Weekend Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
+
+         // Construction
+         ["Wimpey"] = new("Taylor Wimpey", "Merged with Taylor Woodrow in 2007", "00296805"),
+         ["Taylor Woodrow"] = new("Taylor Wimpey", "Merged with Wimpey in 2007", "00296805"),
+         ["John Laing"] = new("John Laing Group (infrastructure)", "Construction sold, now infrastructure investor", "05975300"),
+         ["Costain Group"] = new("Costain", "Still trading", "00102921"),
+         ["Tarmac"] = new("Tarmac (CRH)", "Construction now part of CRH", null),
+         ["Alfred McAlpine"] = new("Carillion (dissolved)", "Acquired by Carillion, which collapsed 2018", null),
+         ["Carillion"] = new("Dissolved", "Collapsed into liquidation in 2018", "03782379"),
+         ["Mowlem"] = new("Carillion (dissolved)", "Acquired by Carillion in 2006", null),
+         ["Balfour Beatty"] = new("Balfour Beatty", "Still trading", "00395826"),
+
+         // Insurance
+         ["Royal Insurance"] = new("RSA Insurance Group", "Merged with Sun Alliance", "02339826"),
+         ["Sun Alliance"] = new("RSA Insurance Group", "Merged with Royal Insurance", "02339826"),
+         ["Guardian Royal Exchange"] = new("AXA", "Acquired by AXA in 1999", null),
+         ["Commercial Union"] = new("Aviva", "Merged to form CGU, then Aviva", "02468686"),
+         ["General Accident"] = new("Aviva", "Merged to form CGU, then Aviva", "02468686"),
+         ["CGU"] = new("Aviva", "Rebranded to Aviva in 2002", "02468686"),
+         ["Norwich Union"] = new("Aviva", "Rebranded to Aviva in 2009", "02468686"),
+         ["Eagle Star"] = new("Zurich", "Acquired by Zurich", null),
+         ["Prudential"] = new("Prudential plc / M&G", "UK business demerged as M&G plc", "01397169"),
+     };
+
+ /// <summary>
+ /// Major UK charities and non-profit organisations.
+ /// These are legitimate employers but may not be found via standard company search.
+ /// Names are compared case-insensitively (<see cref="StringComparer.OrdinalIgnoreCase"/>).
+ /// </summary>
+ public static readonly HashSet<string> CharityEmployers = new(StringComparer.OrdinalIgnoreCase)
+ {
+     // Youth organisations
+     "Girlguiding",
+     "Girlguiding UK",
+     "Girlguiding North East England",
+     "Girl Guides",
+     "Scouts",
+     "Scout Association",
+     "Boys Brigade",
+     "Girls Brigade",
+     "Cadets",
+     "Sea Cadets",
+     "Air Cadets",
+     "Army Cadets",
+
+     // Major charities
+     "British Red Cross",
+     "Oxfam",
+     "Save the Children",
+     "NSPCC",
+     "Barnardo's",
+     "RSPCA",
+     "RSPB",
+     "National Trust",
+     "Cancer Research UK",
+     "British Heart Foundation",
+     "Macmillan Cancer Support",
+     "Marie Curie",
+     "Age UK",
+     "Mind",
+     "Samaritans",
+     "Shelter",
+     "Citizens Advice",
+     "Citizens Advice Bureau",
+     "CAB",
+     "St John Ambulance",
+     "Salvation Army",
+     "YMCA",
+     "YWCA",
+
+     // Religious organisations ("Salvation Army" is already listed under major charities)
+     "Church of England",
+     "Catholic Church",
+     "Methodist Church",
+     "Baptist Church",
+ };
+
+ /// <summary>
+ /// Public sector organisations and government bodies.
+ /// These are legitimate employers but not registered at Companies House.
+ /// </summary>
+ public static readonly HashSet<string> PublicSectorEmployers = new(StringComparer.OrdinalIgnoreCase)
+ {
+     // Emergency Services
+     "Metropolitan Police",
+     "Metropolitan Police Service",
+     "Metropolitan Police Engineers",
+     "Met Police",
+     "City of London Police",
+     "British Transport Police",
+     "Police Scotland",
+     "Police Service of Northern Ireland",
+     "PSNI",
+     "London Fire Brigade",
+     "London Ambulance Service",
+     "NHS",
+     "National Health Service",
+
+     // Government Departments
+     "HM Treasury",
+     "Home Office",
+     "Foreign Office",
+     "Ministry of Defence",
+     "MOD",
+     "Department of Health",
+     "Department for Education",
+     "DfE",
+     "Department for Work and Pensions",
+     "DWP",
+     "HMRC",
+     "HM Revenue and Customs",
+     "Cabinet Office",
+     "DVLA",
+     "DVSA",
+     "Environment Agency",
+     "Highways Agency",
+     "Highways England",
+     "National Highways",
+
+     // Armed Forces
+     "British Army",
+     "Royal Navy",
+     "Royal Air Force",
+     "RAF",
+     "Royal Marines",
+
+     // Local Government (name fragments, e.g. "London Borough" within "London Borough of X")
+     "London Borough",
+     "County Council",
+     "City Council",
+     "District Council",
+     "Metropolitan Borough",
+     "Borough Council",
+     "Town Council",
+     "Parish Council",
+     "Greater London Council",
+     "GLC",
+
+     // Education (name fragments rather than complete institution names)
+     "University of",
+     "College of",
+     "School of",
+
+     // Other Public Bodies
+     "BBC",
+     "British Broadcasting Corporation",
+     "Channel 4",
+     "Bank of England",
+     "Royal Mail",
+     "Post Office",
+     "Transport for London",
+     "TfL",
+     "Network Rail",
+     "Ordnance Survey",
+     "Land Registry",
+     "Companies House",
+     "National Archives",
+     "British Library",
+     "British Museum",
+     "National Gallery",
+     "Tate",
+     "Natural History Museum",
+     "Science Museum",
+     "V&A",
+     "Victoria and Albert Museum",
+ };
+
+ /// <summary>
+ /// Patterns that indicate an internal division or department of a larger company.
+ /// Key: division name (case-insensitive). Value: name of the parent organisation.
+ /// </summary>
+ public static readonly Dictionary<string, string> DivisionPatterns = new(StringComparer.OrdinalIgnoreCase)
+ {
+     // Airlines
+     ["British Airways Technical Support"] = "British Airways",
+     ["BA Technical Support"] = "British Airways",
+     ["BA Engineering"] = "British Airways",
+     ["British Airways Engineering"] = "British Airways",
+     ["FBA - British Airways"] = "British Airways",
+
+     // Major employers with divisions
+     ["BBC News"] = "BBC",
+     ["BBC World Service"] = "BBC",
+     ["BBC Studios"] = "BBC",
+     ["ITV News"] = "ITV plc",
+     ["Sky News"] = "Sky UK",
+     ["BT Openreach"] = "BT Group",
+     ["Openreach"] = "BT Group",
+     ["BT Research"] = "BT Group",
+     ["Shell Research"] = "Shell",
+     ["BP Research"] = "BP",
+     ["Rolls-Royce Aerospace"] = "Rolls-Royce",
+     ["Rolls-Royce Marine"] = "Rolls-Royce",
+     ["BAE Systems Naval Ships"] = "BAE Systems",
+     ["BAE Systems Submarines"] = "BAE Systems",
+
+     // Banks - divisions
+     ["Barclays Investment Bank"] = "Barclays",
+     ["Barclays Capital"] = "Barclays",
+     ["HSBC Investment Bank"] = "HSBC",
+     ["Lloyds Commercial Banking"] = "Lloyds Banking Group",
+     ["NatWest Markets"] = "NatWest Group",
+     ["RBS Markets"] = "NatWest Group",
+ };
+
+ /// <summary>
+ /// Reports whether the employer name is a key of <see cref="HistoricalCompanies"/>.
+ /// </summary>
+ /// <param name="employerName">Employer name to look up; it is trimmed before the lookup.</param>
+ /// <returns>
+ /// <c>true</c> when the trimmed name is a known historical company; <c>false</c> otherwise,
+ /// including when the input is null, empty or only whitespace.
+ /// </returns>
+ public static bool IsHistoricalEmployer(string employerName) =>
+     !string.IsNullOrWhiteSpace(employerName) && HistoricalCompanies.ContainsKey(employerName.Trim());
+
+ /// <summary>
+ /// Looks up the successor details recorded for a historical employer.
+ /// </summary>
+ /// <param name="employerName">Employer name to look up; it is trimmed before the lookup.</param>
+ /// <returns>
+ /// The entry of <see cref="HistoricalCompanies"/> stored under the trimmed name, or <c>null</c>
+ /// when there is no such entry or the input is null, empty or only whitespace.
+ /// </returns>
+ public static HistoricalEmployerInfo? GetHistoricalEmployerInfo(string employerName) =>
+     !string.IsNullOrWhiteSpace(employerName) && HistoricalCompanies.TryGetValue(employerName.Trim(), out var info) ? info : null;
+
+ /// <summary>
+ /// Check if an employer is a public sector organisation.
+ /// </summary>
+ /// <returns><c>true</c> on an exact or whole-phrase match against <see cref="PublicSectorEmployers"/>.</returns>
+ public static bool IsPublicSectorEmployer(string employerName) =>
+     MatchesKnownName(employerName, PublicSectorEmployers);
+
+ /// <summary>
+ /// Check if an employer is a charity or non-profit organisation.
+ /// </summary>
+ /// <returns><c>true</c> on an exact or whole-phrase match against <see cref="CharityEmployers"/>.</returns>
+ public static bool IsCharityEmployer(string employerName) =>
+     MatchesKnownName(employerName, CharityEmployers);
+
+ /// <summary>
+ /// Check if an employer name is an internal division and get the parent company.
+ /// </summary>
+ /// <returns>The mapped parent company, or <c>null</c> for blank input or when no pattern matches.</returns>
+ public static string? GetParentCompanyForDivision(string employerName)
+ {
+     if (string.IsNullOrWhiteSpace(employerName))
+         return null;
+
+     var name = employerName.Trim();
+     if (DivisionPatterns.TryGetValue(name, out var parent))
+         return parent;
+
+     foreach (var (pattern, parentCompany) in DivisionPatterns)
+     {
+         if (ContainsWholePhrase(name, pattern))
+             return parentCompany;
+     }
+
+     return null;
+ }
+
+ /// <summary>Exact lookup first, then whole-phrase containment (e.g. "London Borough of X").</summary>
+ private static bool MatchesKnownName(string employerName, HashSet<string> knownNames)
+ {
+     if (string.IsNullOrWhiteSpace(employerName))
+         return false;
+
+     var name = employerName.Trim();
+     if (knownNames.Contains(name))
+         return true;
+
+     foreach (var known in knownNames)
+     {
+         if (ContainsWholePhrase(name, known))
+             return true;
+     }
+
+     return false;
+ }
+
+ /// <summary>
+ /// Case-insensitive search that only accepts an occurrence of <paramref name="phrase"/> bounded by
+ /// the ends of the string or a non-alphanumeric character, so "RAF" does not match inside "Craft".
+ /// </summary>
+ private static bool ContainsWholePhrase(string name, string phrase)
+ {
+     var at = phrase.Length == 0 ? -1 : name.IndexOf(phrase, StringComparison.OrdinalIgnoreCase);
+     while (at >= 0)
+     {
+         var end = at + phrase.Length;
+         if ((at == 0 || !char.IsLetterOrDigit(name[at - 1])) && (end == name.Length || !char.IsLetterOrDigit(name[end])))
+             return true;
+         at = name.IndexOf(phrase, at + 1, StringComparison.OrdinalIgnoreCase);
+     }
+     return false;
+ }
+}
+
+/// <summary>
+/// Successor details recorded for a historical employer.
+/// </summary>
+/// <param name="SuccessorName">Name of the current or successor organisation (the data also uses values such as "Dissolved").</param>
+/// <param name="Notes">Free-text explanation of what happened to the employer.</param>
+/// <param name="CompanyNumber">Company number when one is recorded, otherwise <c>null</c>.</param>
+public sealed record HistoricalEmployerInfo(
+    string SuccessorName, string Notes, string? CompanyNumber);
diff --git a/src/RealCV.Application/Data/UKInstitutions.cs b/src/RealCV.Application/Data/UKInstitutions.cs
index c43a284..dc66d1f 100644
--- a/src/RealCV.Application/Data/UKInstitutions.cs
+++ b/src/RealCV.Application/Data/UKInstitutions.cs
@@ -122,6 +122,28 @@ public static class UKInstitutions
"Wrexham University",
"York St John University",
+ // Post-1992 Universities (former polytechnics)
+ "Leeds Beckett University",
+ "Birmingham City University",
+ "University of Bedfordshire",
+ "Anglia Ruskin University",
+ "University of Central Lancashire",
+ "University of West London",
+ "University of Northampton",
+ "University of Chichester",
+ "Plymouth Marjon University",
+ "Bath Spa University",
+ "Solent University",
+ "University of Bolton",
+ "University of Cumbria",
+ "University of Chester",
+ "University of Gloucestershire",
+ "University of Suffolk",
+ "Newman University",
+ "Bishop Grosseteste University",
+ "Harper Adams University",
+ "Royal Agricultural University",
+
// Scottish Universities
"University of Aberdeen",
"Abertay University",
@@ -134,6 +156,8 @@ public static class UKInstitutions
"Bangor University",
"University of South Wales",
"Wrexham Glyndwr University",
+ "Wrexham University",
+ "Cardiff Metropolitan University",
// Northern Ireland
"Ulster University",
@@ -304,6 +328,112 @@ public static class UKInstitutions
["South Bank University"] = "London South Bank University",
["LSBU"] = "London South Bank University",
+ // Historical polytechnic names (became universities in 1992)
+ // These are legitimate institutions that existed under different names
+ ["South Bank Polytechnic"] = "London South Bank University",
+ ["Polytechnic of the South Bank"] = "London South Bank University",
+ ["Thames Polytechnic"] = "University of Greenwich",
+ ["Woolwich Polytechnic"] = "University of Greenwich",
+ ["Polytechnic of Central London"] = "University of Westminster",
+ ["PCL"] = "University of Westminster",
+ ["Polytechnic of North London"] = "London Metropolitan University",
+ ["City of London Polytechnic"] = "London Metropolitan University",
+ ["London Guildhall University"] = "London Metropolitan University",
+ ["University of North London"] = "London Metropolitan University",
+ ["Polytechnic of East London"] = "University of East London",
+ ["North East London Polytechnic"] = "University of East London",
+ ["Middlesex Polytechnic"] = "Middlesex University",
+ ["Hatfield Polytechnic"] = "University of Hertfordshire",
+ ["Sheffield Polytechnic"] = "Sheffield Hallam University",
+ ["Sheffield City Polytechnic"] = "Sheffield Hallam University",
+ ["Manchester Polytechnic"] = "Manchester Metropolitan University",
+ ["Leeds Polytechnic"] = "Leeds Beckett University",
+ ["Leeds Metropolitan University"] = "Leeds Beckett University",
+ ["Leicester Polytechnic"] = "De Montfort University",
+ ["Coventry Polytechnic"] = "Coventry University",
+ ["Lanchester Polytechnic"] = "Coventry University",
+ ["Brighton Polytechnic"] = "University of Brighton",
+ ["Portsmouth Polytechnic"] = "University of Portsmouth",
+ ["Plymouth Polytechnic"] = "University of Plymouth",
+ ["Polytechnic South West"] = "University of Plymouth",
+ ["Oxford Polytechnic"] = "Oxford Brookes University",
+ ["Newcastle Polytechnic"] = "Northumbria University",
+ ["Newcastle upon Tyne Polytechnic"] = "Northumbria University",
+ ["Sunderland Polytechnic"] = "University of Sunderland",
+ ["Teesside Polytechnic"] = "Teesside University",
+ ["Huddersfield Polytechnic"] = "University of Huddersfield",
+ ["Wolverhampton Polytechnic"] = "University of Wolverhampton",
+ ["Liverpool Polytechnic"] = "Liverpool John Moores University",
+ ["Bristol Polytechnic"] = "University of the West of England",
+ ["Kingston Polytechnic"] = "Kingston University",
+ ["Nottingham Polytechnic"] = "Nottingham Trent University",
+ ["Trent Polytechnic"] = "Nottingham Trent University",
+ ["Birmingham Polytechnic"] = "Birmingham City University",
+ ["City of Birmingham Polytechnic"] = "Birmingham City University",
+ ["University of Central England"] = "Birmingham City University",
+ ["UCE Birmingham"] = "Birmingham City University",
+ ["Staffordshire Polytechnic"] = "Staffordshire University",
+ ["North Staffordshire Polytechnic"] = "Staffordshire University",
+ ["Luton College of Higher Education"] = "University of Bedfordshire",
+ ["University of Luton"] = "University of Bedfordshire",
+ ["Anglia Polytechnic"] = "Anglia Ruskin University",
+ ["Anglia Polytechnic University"] = "Anglia Ruskin University",
+ ["APU"] = "Anglia Ruskin University",
+ ["Cambridgeshire College of Arts and Technology"] = "Anglia Ruskin University",
+ ["CCAT"] = "Anglia Ruskin University",
+ ["Bournemouth Polytechnic"] = "Bournemouth University",
+ ["Dorset Institute of Higher Education"] = "Bournemouth University",
+ ["Derby College of Higher Education"] = "University of Derby",
+ ["Derbyshire College of Higher Education"] = "University of Derby",
+ ["Humberside Polytechnic"] = "University of Lincoln",
+ ["Humberside College of Higher Education"] = "University of Lincoln",
+ ["University of Humberside"] = "University of Lincoln",
+ ["University of Lincolnshire and Humberside"] = "University of Lincoln",
+ ["Central Lancashire Polytechnic"] = "University of Central Lancashire",
+ ["Preston Polytechnic"] = "University of Central Lancashire",
+ ["Lancashire Polytechnic"] = "University of Central Lancashire",
+ ["Glamorgan Polytechnic"] = "University of South Wales",
+ ["Polytechnic of Wales"] = "University of South Wales",
+ ["University of Glamorgan"] = "University of South Wales",
+ ["Robert Gordon Institute of Technology"] = "Robert Gordon University",
+ ["RGIT"] = "Robert Gordon University",
+ ["Napier Polytechnic"] = "Edinburgh Napier University",
+ ["Napier College"] = "Edinburgh Napier University",
+ ["Glasgow Polytechnic"] = "Glasgow Caledonian University",
+ ["Queen's College Glasgow"] = "Glasgow Caledonian University",
+ ["Dundee Institute of Technology"] = "Abertay University",
+ ["Dundee College of Technology"] = "Abertay University",
+
+ // Other historical name changes
+ ["Roehampton Institute"] = "Roehampton University",
+ ["University of Surrey Roehampton"] = "Roehampton University",
+ ["Thames Valley University"] = "University of West London",
+ ["Polytechnic of West London"] = "University of West London",
+ ["Ealing College of Higher Education"] = "University of West London",
+ ["London College of Music and Media"] = "University of West London",
+ ["University College Northampton"] = "University of Northampton",
+ ["Nene College"] = "University of Northampton",
+ ["University College Worcester"] = "University of Worcester",
+ ["Worcester College of Higher Education"] = "University of Worcester",
+ ["University College Chichester"] = "University of Chichester",
+ ["Chichester Institute of Higher Education"] = "University of Chichester",
+ ["College of St Mark and St John"] = "Plymouth Marjon University",
+ ["Marjon"] = "Plymouth Marjon University",
+ ["University of St Mark and St John"] = "Plymouth Marjon University",
+ ["University College Falmouth"] = "Falmouth University",
+ ["Falmouth College of Arts"] = "Falmouth University",
+ ["Bath College of Higher Education"] = "Bath Spa University",
+ ["Bath Spa University College"] = "Bath Spa University",
+ ["Liverpool Institute of Higher Education"] = "Liverpool Hope University",
+ ["Liverpool Hope University College"] = "Liverpool Hope University",
+ ["University of Wales, Newport"] = "University of South Wales",
+ ["University of Wales Institute, Cardiff"] = "Cardiff Metropolitan University",
+ ["UWIC"] = "Cardiff Metropolitan University",
+ ["North East Wales Institute"] = "Wrexham University",
+ ["NEWI"] = "Wrexham University",
+ ["Glyndwr University"] = "Wrexham University",
+ ["Wrexham Glyndwr University"] = "Wrexham University",
+
// Other common variations
["Open University"] = "The Open University",
["OU"] = "The Open University",
diff --git a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
index ea8c506..b96d87a 100644
--- a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
+++ b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
@@ -39,22 +39,33 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
Determine which candidate (if any) is the SAME company as the CV entry.
- Rules:
- 1. A match requires the companies to be the SAME organisation, not just similar names
- 2. "Families First CiC" is NOT the same as "FAMILIES AGAINST CONFORMITY LTD" - different words = different companies
- 3. Trading names should match their registered entity (e.g., "Tesco" matches "TESCO PLC")
- 4. Subsidiaries can match if clearly the same organisation (e.g., "ASDA" could match "ASDA STORES LIMITED")
- 5. Acronyms in parentheses are abbreviations of the full name (e.g., "North Halifax Partnership (NHP)" = "NORTH HALIFAX PARTNERSHIP")
- 6. CiC/CIC = Community Interest Company, LLP = Limited Liability Partnership - these are legal suffixes
- 7. If the CV name contains all the key words of a candidate (ignoring Ltd/Limited/CIC/etc.), it's likely a match
- 8. If NO candidate is clearly the same company, return "NONE" as the best match
+ Matching Guidelines:
+ 1. MATCH if the CV name is clearly the same organisation as a candidate:
+ - "Royal Bank of Scotland" → "THE ROYAL BANK OF SCOTLAND PUBLIC LIMITED COMPANY" ✓ (same bank)
+ - "Yorkshire Electricity" → "YORKSHIRE ELECTRICITY GROUP PLC" ✓ (same utility)
+ - "Tesco" → "TESCO PLC" ✓ (trading name = registered name)
+ - "ASDA" → "ASDA STORES LIMITED" ✓ (brand = operating company)
+
+ 2. DO NOT MATCH if the words are fundamentally different:
+ - "Families First" ≠ "FAMILIES AGAINST CONFORMITY" (different words after "Families")
+ - "Royal Bank" ≠ "Royal Academy" (Bank ≠ Academy)
+ - "Storm Ideas" ≠ "STORM LIMITED" (missing "Ideas" - could be different company)
+
+ 3. Legal suffixes (Ltd, Limited, PLC, LLP, CiC) should be ignored when comparing names
+
+ 4. Adding "THE" or "GROUP" to a name doesn't make it a different company
+
+ 5. If unsure, prefer matching over rejecting when core identifying words match
+
+ CRITICAL: Return the COMPLETE company number exactly as shown (e.g., "SC083026", "02366995").
+ Do NOT truncate or abbreviate the company number.
Respond with this exact JSON structure:
{
- "bestMatchCompanyNumber": "string (company number of best match, or 'NONE' if no valid match)",
+ "bestMatchCompanyNumber": "COMPLETE company number from the list above, or 'NONE' if no valid match",
"confidenceScore": number (0-100, where 100 = certain match, 0 = no match),
- "matchType": "string (Exact, TradingName, Subsidiary, Parent, NoMatch)",
- "reasoning": "string (brief explanation of why this is or isn't a match)"
+ "matchType": "Exact|TradingName|Subsidiary|Parent|NoMatch",
+ "reasoning": "brief explanation"
}
""";
@@ -81,8 +92,9 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
try
{
+ // Format candidates with company number prominently displayed to prevent truncation
var candidatesText = string.Join("\n", candidates.Select((c, i) =>
- $"{i + 1}. {c.CompanyName} (Number: {c.CompanyNumber}, Status: {c.CompanyStatus ?? "Unknown"})"));
+ $"[{c.CompanyNumber}] {c.CompanyName} (Status: {c.CompanyStatus ?? "Unknown"})"));
var prompt = MatchingPrompt
.Replace("{CV_COMPANY}", cvCompanyName)
@@ -127,7 +139,8 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
aiResponse.BestMatchCompanyNumber, aiResponse.ConfidenceScore, aiResponse.Reasoning);
// Find the matched candidate
- if (aiResponse.BestMatchCompanyNumber == "NONE" || aiResponse.ConfidenceScore < 50)
+ // Lower threshold to 30 - we have fuzzy validation as backup
+ if (aiResponse.BestMatchCompanyNumber == "NONE" || aiResponse.ConfidenceScore < 30)
{
return new SemanticMatchResult
{
@@ -142,10 +155,40 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
var matchedCandidate = candidates.FirstOrDefault(c =>
c.CompanyNumber.Equals(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase));
+ // If exact match not found, try to find a candidate that starts with the returned number
+ // This handles cases where AI truncates "09052626" to "09" or similar
+ if (matchedCandidate is null && !string.IsNullOrWhiteSpace(aiResponse.BestMatchCompanyNumber)
+ && aiResponse.BestMatchCompanyNumber != "NONE")
+ {
+ var partialMatch = candidates.FirstOrDefault(c =>
+ c.CompanyNumber.StartsWith(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase));
+
+ if (partialMatch is not null)
+ {
+ _logger.LogDebug("AI returned partial company number '{Partial}', matched to full number '{Full}'",
+ aiResponse.BestMatchCompanyNumber, partialMatch.CompanyNumber);
+ matchedCandidate = partialMatch;
+ }
+ else
+ {
+ // Try reverse - maybe AI returned a longer string that contains the actual number
+ var reverseMatch = candidates.FirstOrDefault(c =>
+ aiResponse.BestMatchCompanyNumber.Contains(c.CompanyNumber, StringComparison.OrdinalIgnoreCase));
+
+ if (reverseMatch is not null)
+ {
+ _logger.LogDebug("AI returned string containing company number '{Number}'",
+ reverseMatch.CompanyNumber);
+ matchedCandidate = reverseMatch;
+ }
+ }
+ }
+
if (matchedCandidate is null)
{
- _logger.LogWarning("AI returned company number {Number} not in candidates list",
- aiResponse.BestMatchCompanyNumber);
+ _logger.LogWarning("AI returned company number '{Number}' not in candidates list. Candidates: {Candidates}",
+ aiResponse.BestMatchCompanyNumber,
+ string.Join(", ", candidates.Select(c => c.CompanyNumber)));
return null;
}
diff --git a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
index 5206b4c..3df890f 100644
--- a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
+++ b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
@@ -2,6 +2,7 @@ using System.Text.Json;
using FuzzySharp;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
+using RealCV.Application.Data;
using RealCV.Application.DTOs;
using RealCV.Application.Helpers;
using RealCV.Application.Interfaces;
@@ -93,11 +94,140 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
{
ArgumentException.ThrowIfNullOrWhiteSpace(companyName);
- _logger.LogDebug("Verifying company: {CompanyName}", companyName);
+ // Normalize company name - strip trailing punctuation that causes matching issues
+ var normalizedName = NormalizeCompanyName(companyName);
+ _logger.LogDebug("Verifying company: {CompanyName} (normalized: {NormalizedName})", companyName, normalizedName);
var flags = new List();
+ // Check 1a: Is this a public sector employer?
+ if (UKHistoricalEmployers.IsPublicSectorEmployer(normalizedName))
+ {
+ _logger.LogInformation("Recognised public sector employer: {CompanyName}", companyName);
+ return new CompanyVerificationResult
+ {
+ ClaimedCompany = companyName,
+ MatchedCompanyName = companyName,
+ MatchedCompanyNumber = null,
+ MatchScore = 100,
+ IsVerified = true,
+ VerificationNotes = "Public sector employer - not registered at Companies House",
+ ClaimedStartDate = startDate,
+ ClaimedEndDate = endDate,
+ CompanyType = "public-sector",
+ CompanyStatus = "active",
+ ClaimedJobTitle = jobTitle,
+ Flags = flags
+ };
+ }
+
+ // Check 1b: Is this a charity or non-profit organisation?
+ if (UKHistoricalEmployers.IsCharityEmployer(normalizedName))
+ {
+ _logger.LogInformation("Recognised charity employer: {CompanyName}", companyName);
+ return new CompanyVerificationResult
+ {
+ ClaimedCompany = companyName,
+ MatchedCompanyName = companyName,
+ MatchedCompanyNumber = null,
+ MatchScore = 100,
+ IsVerified = true,
+ VerificationNotes = "Charity/non-profit organisation",
+ ClaimedStartDate = startDate,
+ ClaimedEndDate = endDate,
+ CompanyType = "charity",
+ CompanyStatus = "active",
+ ClaimedJobTitle = jobTitle,
+ Flags = flags
+ };
+ }
+
+ // Check 2: Is this an internal division of a larger company?
+ var parentCompany = UKHistoricalEmployers.GetParentCompanyForDivision(normalizedName);
+ if (parentCompany != null)
+ {
+ _logger.LogInformation("Recognised division '{CompanyName}' of parent company '{ParentCompany}'", companyName, parentCompany);
+ // Try to verify the parent company instead
+ var parentResult = await VerifyCompanyAsync(parentCompany, startDate, endDate, jobTitle);
+ if (parentResult.IsVerified)
+ {
+ return parentResult with
+ {
+ ClaimedCompany = companyName,
+ VerificationNotes = $"Internal division of {parentResult.MatchedCompanyName}"
+ };
+ }
+ // If parent verification failed, return a partial match
+ return new CompanyVerificationResult
+ {
+ ClaimedCompany = companyName,
+ MatchedCompanyName = parentCompany,
+ MatchedCompanyNumber = null,
+ MatchScore = 85,
+ IsVerified = true,
+ VerificationNotes = $"Recognised as division of {parentCompany}",
+ ClaimedStartDate = startDate,
+ ClaimedEndDate = endDate,
+ ClaimedJobTitle = jobTitle,
+ Flags = flags
+ };
+ }
+
+ // Check 3: Is this a known historical employer?
+ var historicalInfo = UKHistoricalEmployers.GetHistoricalEmployerInfo(normalizedName);
+ if (historicalInfo != null)
+ {
+ _logger.LogInformation("Recognised historical employer: {CompanyName} -> {Successor}", companyName, historicalInfo.SuccessorName);
+
+ // If we have a company number for the successor, try to get current details
+ if (!string.IsNullOrEmpty(historicalInfo.CompanyNumber))
+ {
+ try
+ {
+ var successorDetails = await _companiesHouseClient.GetCompanyAsync(historicalInfo.CompanyNumber);
+ if (successorDetails != null)
+ {
+ return new CompanyVerificationResult
+ {
+ ClaimedCompany = companyName,
+ MatchedCompanyName = $"{companyName} (now {successorDetails.CompanyName})",
+ MatchedCompanyNumber = historicalInfo.CompanyNumber,
+ MatchScore = 90,
+ IsVerified = true,
+ VerificationNotes = $"Historical company. {historicalInfo.Notes}",
+ ClaimedStartDate = startDate,
+ ClaimedEndDate = endDate,
+ CompanyType = successorDetails.Type,
+ CompanyStatus = "historical",
+ ClaimedJobTitle = jobTitle,
+ Flags = flags
+ };
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(ex, "Failed to fetch successor company details for {CompanyNumber}", historicalInfo.CompanyNumber);
+ }
+ }
+
+ // Return historical match without successor details
+ return new CompanyVerificationResult
+ {
+ ClaimedCompany = companyName,
+ MatchedCompanyName = $"{companyName} (now {historicalInfo.SuccessorName})",
+ MatchedCompanyNumber = historicalInfo.CompanyNumber,
+ MatchScore = 90,
+ IsVerified = true,
+ VerificationNotes = $"Historical company. {historicalInfo.Notes}",
+ ClaimedStartDate = startDate,
+ ClaimedEndDate = endDate,
+ CompanyStatus = "historical",
+ ClaimedJobTitle = jobTitle,
+ Flags = flags
+ };
+ }
+
// Try to find a cached match first (but only if it existed at claimed start date)
- var cachedMatch = await FindCachedMatchAsync(companyName);
+ var cachedMatch = await FindCachedMatchAsync(normalizedName);
if (cachedMatch is not null)
{
// Check if cached company existed at the claimed start date
@@ -119,9 +249,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// Search Companies House with fallback queries
try
{
- var searchQueries = GenerateSearchQueries(companyName);
+ var searchQueries = GenerateSearchQueries(normalizedName);
_logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
- searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));
+ searchQueries.Count, normalizedName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));
// Collect all candidates from all search queries for AI matching
var allCandidates = new Dictionary();
@@ -148,7 +278,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
}
// Find fuzzy matches (as before) for fallback
- var fuzzyMatch = FindBestMatch(companyName, query, searchResponse.Items, startDate);
+ var fuzzyMatch = FindBestMatch(normalizedName, query, searchResponse.Items, startDate);
if (fuzzyMatch is not null)
{
fuzzyMatches.Add(fuzzyMatch.Value);
@@ -157,30 +287,47 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
if (allCandidates.Count == 0)
{
- _logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count);
+ _logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", normalizedName, searchQueries.Count);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records");
}
// Use AI to find the best semantic match from all candidates
- _logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", companyName, allCandidates.Count);
+ _logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", normalizedName, allCandidates.Count);
+ // Sort candidates by fuzzy relevance to the search term before taking top 10
+ // This ensures the most likely matches are sent to the AI, not just arbitrary entries
+ var normalizedUpper = normalizedName.ToUpperInvariant();
var candidatesForAI = allCandidates.Values
- .Take(10) // Limit to top 10 candidates to reduce AI cost
- .Select(c => new CompanyCandidate
+ .Select(c => new
{
- CompanyName = c.Title,
- CompanyNumber = c.CompanyNumber,
- CompanyStatus = c.CompanyStatus,
- DateOfCreation = c.DateOfCreation
+ Item = c,
+ Score = Fuzz.TokenSetRatio(normalizedUpper, c.Title.ToUpperInvariant())
+ })
+ .OrderByDescending(x => x.Score)
+ .Take(10)
+ .Select(x => new CompanyCandidate
+ {
+ CompanyName = x.Item.Title,
+ CompanyNumber = x.Item.CompanyNumber,
+ CompanyStatus = x.Item.CompanyStatus,
+ DateOfCreation = x.Item.DateOfCreation
})
.ToList();
- var aiResult = await _aiMatcher.FindBestMatchAsync(companyName, candidatesForAI);
+ _logger.LogDebug("Top candidates for AI matching (sorted by relevance): {Candidates}",
+ string.Join(", ", candidatesForAI.Select(c => $"{c.CompanyName} [{c.CompanyNumber}]")));
+
+ var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI);
CompaniesHouseSearchItem? matchedItem = null;
int matchScore;
+ // Get best fuzzy match for potential fallback
+ var bestFuzzy = fuzzyMatches.Count > 0
+ ? fuzzyMatches.OrderByDescending(m => m.Score).First()
+ : ((CompaniesHouseSearchItem Item, int Score)?)null;
+
if (aiResult is not null && aiResult.IsMatch)
{
// AI found a valid match
@@ -195,21 +342,63 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// AI didn't find a match - check if it explicitly rejected or just failed
if (aiResult?.MatchType == "NoMatch")
{
- _logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}",
- companyName, aiResult?.Reasoning ?? "No match found");
- return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
- "Company name could not be verified - no matching company found in official records");
- }
+ // AI explicitly rejected. Only override if fuzzy match passes strict validation:
+ // 1. High fuzzy score (>= 90%)
+ // 2. ALL core identifying words from original name appear in the match
+ // 3. Match doesn't have significantly more core words (prevents partial word matches)
+ if (bestFuzzy.HasValue && bestFuzzy.Value.Score >= 90)
+ {
+ var originalCores = ExtractCoreIdentifiers(normalizedName);
+ var matchCores = ExtractCoreIdentifiers(bestFuzzy.Value.Item.Title);
- // AI failed (API error, etc.) - fall back to fuzzy matching
- _logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", companyName);
- var bestFuzzy = fuzzyMatches.OrderByDescending(m => m.Score).First();
- matchedItem = bestFuzzy.Item;
- matchScore = bestFuzzy.Score;
+ // All original core words must appear in the match
+ var allCoresPresent = originalCores.Count == 0 ||
+ originalCores.All(c => bestFuzzy.Value.Item.Title.Contains(c, StringComparison.OrdinalIgnoreCase));
+
+ // Match shouldn't have too many extra core words (max 2 extra, e.g., "GROUP PLC")
+ var extraCores = matchCores.Count(c => !originalCores.Any(o =>
+ c.Equals(o, StringComparison.OrdinalIgnoreCase)));
+ var reasonableExtras = extraCores <= 2;
+
+ if (allCoresPresent && reasonableExtras)
+ {
+ _logger.LogInformation(
+ "AI rejected '{CompanyName}' but fuzzy match '{MatchedName}' ({Score}%) passes validation. " +
+ "Original cores: [{OriginalCores}], Match cores: [{MatchCores}]",
+ normalizedName, bestFuzzy.Value.Item.Title, bestFuzzy.Value.Score,
+ string.Join(", ", originalCores), string.Join(", ", matchCores));
+ matchedItem = bestFuzzy.Value.Item;
+ matchScore = bestFuzzy.Value.Score;
+ }
+ else
+ {
+ _logger.LogDebug(
+ "AI rejected '{CompanyName}' and fuzzy match '{MatchedName}' fails validation. " +
+ "AllCoresPresent: {AllCores}, ExtraCores: {Extra}",
+ normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, extraCores);
+ return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
+ "Company name could not be verified - no matching company found in official records");
+ }
+ }
+ else
+ {
+ _logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}",
+ normalizedName, aiResult?.Reasoning ?? "No match found");
+ return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
+ "Company name could not be verified - no matching company found in official records");
+ }
+ }
+ else
+ {
+ // AI failed (API error, etc.) - fall back to fuzzy matching
+ _logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", normalizedName);
+ matchedItem = bestFuzzy!.Value.Item;
+ matchScore = bestFuzzy!.Value.Score;
+ }
}
else
{
- _logger.LogDebug("No valid match found for: {CompanyName}", companyName);
+ _logger.LogDebug("No valid match found for: {CompanyName}", normalizedName);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records");
}
@@ -624,6 +813,26 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
#region Helper Methods
+ /// <summary>
+ /// Normalizes a company name for matching: trims surrounding whitespace, collapses runs of
+ /// whitespace to a single space, and strips trailing punctuation.
+ /// </summary>
+ /// <param name="companyName">The raw company name to clean up.</param>
+ /// <returns>
+ /// The cleaned name. A null, empty, or whitespace-only input is returned unchanged. If the
+ /// name consists solely of punctuation, the whitespace-collapsed name is returned rather
+ /// than an empty string.
+ /// </returns>
+ private static string NormalizeCompanyName(string companyName)
+ {
+     if (string.IsNullOrWhiteSpace(companyName))
+     {
+         return companyName;
+     }
+
+     // Collapse whitespace first so the trailing trim below only ever has to deal with
+     // single spaces.
+     var collapsed = System.Text.RegularExpressions.Regex.Replace(companyName.Trim(), @"\s+", " ");
+
+     // Remove trailing punctuation together with any spaces interleaved with it; stripping
+     // punctuation alone would leave a dangling space behind.
+     // e.g., "Glaxo Research & Development Ltd ." -> "Glaxo Research & Development Ltd"
+     var normalized = collapsed.TrimEnd('.', ',', ';', ':', '!', '?', ' ');
+
+     // Never hand an empty string to callers: a punctuation-only input keeps its
+     // collapsed form instead.
+     return normalized.Length > 0 ? normalized : collapsed;
+ }
+
private async Task FindCachedMatchAsync(string companyName)
{
var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);
@@ -790,12 +999,13 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var searchText = originalLower + " " + queryLower;
// Penalize subsidiary indicators (unless search explicitly included them)
+ // Use word boundary matching to avoid "SCOTLAND" matching "land"
foreach (var indicator in SubsidiaryIndicators)
{
- if (itemTitleLower.Contains(indicator))
+ if (ContainsWholeWord(itemTitleLower, indicator))
{
// Only penalize if the search didn't explicitly include this indicator
- if (!searchText.Contains(indicator))
+ if (!ContainsWholeWord(searchText, indicator))
{
score -= 10; // Significant penalty for subsidiaries
}
@@ -806,7 +1016,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// Boost main company indicators
foreach (var indicator in MainCompanyIndicators)
{
- if (itemTitleLower.Contains(indicator))
+ if (ContainsWholeWord(itemTitleLower, indicator))
{
score += 5; // Boost for main trading companies
break; // Only apply one boost
@@ -1168,7 +1378,10 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
}
// Check if the item matches any pattern in this non-employment category
- if (patterns.Any(pattern => itemTitleLower.Contains(pattern)))
+ // Use whole-word matching for single words, substring for multi-word patterns
+ if (patterns.Any(pattern => pattern.Contains(' ')
+ ? itemTitleLower.Contains(pattern)
+ : ContainsWholeWord(itemTitleLower, pattern)))
{
return false; // This is a non-employment entity type that wasn't explicitly searched for
}
@@ -1177,6 +1390,19 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
return true; // No non-employment patterns matched, this is likely a valid employment entity
}
+ /// <summary>
+ /// Checks if a string contains a word as a whole word (not as a substring of another word).
+ /// E.g., "scotland" does NOT contain whole word "land", but "land holdings" does.
+ /// </summary>
+ /// <param name="text">The text to search.</param>
+ /// <param name="word">The word or phrase to look for; matched literally, ignoring case.</param>
+ /// <returns>
+ /// <c>true</c> when <paramref name="word"/> occurs in <paramref name="text"/> with no word
+ /// character immediately before or after it; <c>false</c> otherwise, including when either
+ /// argument is null or empty.
+ /// </returns>
+ private static bool ContainsWholeWord(string text, string word)
+ {
+     if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(word))
+     {
+         return false;
+     }
+
+     // Explicit lookarounds rather than \b: \b only asserts a boundary when the adjacent
+     // pattern character is itself a word character, so a word that starts or ends with
+     // punctuation (e.g. "(uk)") would never match after a space. For ordinary words the
+     // two forms are equivalent.
+     var pattern = $@"(?<!\w){System.Text.RegularExpressions.Regex.Escape(word)}(?!\w)";
+
+     // CultureInvariant keeps case-insensitive matching independent of the server culture.
+     return System.Text.RegularExpressions.Regex.IsMatch(
+         text,
+         pattern,
+         System.Text.RegularExpressions.RegexOptions.IgnoreCase
+             | System.Text.RegularExpressions.RegexOptions.CultureInvariant);
+ }
+
// Expanded skip words list for core identifier extraction
// These words are too common to be meaningful differentiators between companies
private static readonly HashSet SkipWords = new(StringComparer.OrdinalIgnoreCase)
@@ -1220,8 +1446,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
"new", "modern", "advanced", "innovative", "premier", "elite", "premium",
"quality", "superior", "excellent", "best", "top", "leading", "major",
- // Ownership indicators
- "royal", "imperial", "crown", "state", "public", "private", "independent",
+ // Ownership indicators (excluding "royal" as it's a meaningful company identifier)
+ "imperial", "crown", "state", "public", "private", "independent",
"mutual", "cooperative", "coop", "community",
// Time-related
@@ -1235,7 +1461,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
/// Extracts ALL core identifying words from a company name.
/// These are significant words that aren't common prefixes/suffixes.
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"]
- /// "Bank of Scotland" -> ["BANK", "SCOTLAND"]
+ /// "Royal Bank of Scotland" -> ["ROYAL", "BANK"] (Scotland is a geographic skipWord)
///
private static List ExtractCoreIdentifiers(string companyName)
{