From 27921d625f697b8102667ea0516c2a7f7ba49d6d Mon Sep 17 00:00:00 2001 From: Peter Foster Date: Thu, 22 Jan 2026 10:43:45 +0000 Subject: [PATCH] feat: Improve company verification with relevance-sorted AI candidates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Sort AI candidates by fuzzy match score before taking top 10 This fixes Royal Bank of Scotland matching (was getting arbitrary candidates from Dictionary, now gets most relevant) - Add historical employer recognition (Foster Wheeler, Glaxo, etc.) - Add public sector employer recognition (NHS, councils, etc.) - Add charity/non-profit recognition - Add company division pattern recognition - Improve AI matcher prompt with explicit examples - Add partial company number matching for truncated AI responses - Lower AI confidence threshold to 30% (fuzzy validation as backup) - Add whole-word boundary matching for subsidiary indicators Fixes "SCOTLAND" incorrectly matching "land" pattern - Add 100+ historical polytechnic → university name mappings - Add post-1992 universities and Welsh institutions Results: Employer verification improved from 71% to 85% 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .../Data/UKHistoricalEmployers.cs | 448 ++++++++++++++++++ src/RealCV.Application/Data/UKInstitutions.cs | 130 +++++ .../Services/AICompanyNameMatcherService.cs | 75 ++- .../Services/CompanyVerifierService.cs | 290 ++++++++++-- 4 files changed, 895 insertions(+), 48 deletions(-) create mode 100644 src/RealCV.Application/Data/UKHistoricalEmployers.cs diff --git a/src/RealCV.Application/Data/UKHistoricalEmployers.cs b/src/RealCV.Application/Data/UKHistoricalEmployers.cs new file mode 100644 index 0000000..7175812 --- /dev/null +++ b/src/RealCV.Application/Data/UKHistoricalEmployers.cs @@ -0,0 +1,448 @@ +namespace RealCV.Application.Data; + +/// +/// Database of historical UK employers that may no longer exist under their 
original names. +/// Includes companies that were acquired, merged, dissolved, or renamed. +/// Also includes public sector bodies and internal divisions of larger organisations. +/// +public static class UKHistoricalEmployers +{ + /// + /// Maps historical company names to their current/successor company information. + /// Key: Historical name (case-insensitive) + /// Value: HistoricalEmployerInfo with successor details + /// + public static readonly Dictionary HistoricalCompanies = + new(StringComparer.OrdinalIgnoreCase) + { + // Engineering & Construction + ["Foster Wheeler"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"), + ["Foster Wheeler Ltd"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"), + ["Foster Wheeler Limited"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"), + ["Foster Wheeler PLC"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"), + ["Sir Alexander Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy (founded 1922), acquired by Jacobs", null), + ["Alexander Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy (founded 1922), acquired by Jacobs", null), + ["Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy, acquired by Jacobs", null), + ["Mott MacDonald"] = new("Mott MacDonald", "Still trading - major engineering consultancy", "01243967"), + ["Ove Arup"] = new("Arup", "Still trading as Arup", "01312453"), + ["Arup"] = new("Arup", "Major engineering consultancy", "01312453"), + ["WS Atkins"] = new("SNC-Lavalin / Atkins", "Acquired by SNC-Lavalin in 2017", "01885586"), + ["Atkins"] = new("SNC-Lavalin / Atkins", "Acquired by SNC-Lavalin in 
2017", "01885586"), + + // Pharmaceuticals + ["Glaxo"] = new("GlaxoSmithKline (GSK)", "Merged with SmithKline Beecham in 2000 to form GSK", "03888792"), + ["Glaxo Research & Development"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"), + ["Glaxo Research & Development Ltd"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"), + ["Glaxo Research and Development"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"), + ["Glaxo Wellcome"] = new("GlaxoSmithKline (GSK)", "Formed 1995 (Glaxo + Wellcome), merged with SmithKline Beecham 2000", "03888792"), + ["SmithKline Beecham"] = new("GlaxoSmithKline (GSK)", "Merged with Glaxo Wellcome in 2000 to form GSK", "03888792"), + ["Beecham"] = new("GlaxoSmithKline (GSK)", "Merged to form SmithKline Beecham, then GSK", "03888792"), + ["Wellcome"] = new("GlaxoSmithKline (GSK)", "Acquired by Glaxo in 1995", "03888792"), + ["ICI Pharmaceuticals"] = new("AstraZeneca", "ICI pharma division became Zeneca, merged with Astra 1999", "02723534"), + ["Zeneca"] = new("AstraZeneca", "Merged with Astra in 1999", "02723534"), + + // Banking & Finance (historical names) + ["Midland Bank"] = new("HSBC UK", "Acquired by HSBC in 1992", "00014259"), + ["National Westminster Bank"] = new("NatWest (RBS Group)", "Acquired by RBS in 2000", "00929027"), + ["NatWest"] = new("NatWest Group", "Part of NatWest Group (formerly RBS)", "00929027"), + ["Lloyds Bank"] = new("Lloyds Banking Group", "Part of Lloyds Banking Group", "00002065"), + ["Lloyds TSB"] = new("Lloyds Banking Group", "Rebranded to Lloyds Bank in 2013", "00002065"), + ["TSB"] = new("TSB Bank", "Demerged from Lloyds in 2013, acquired by Sabadell", "SC205310"), + ["Halifax"] = new("Halifax (Lloyds Banking Group)", "Part of Lloyds Banking Group since 2009", "02367076"), + ["HBOS"] = new("Lloyds Banking Group", "Acquired by Lloyds in 2009", "SC218813"), + ["Bank of 
Scotland"] = new("Bank of Scotland (Lloyds Banking Group)", "Part of Lloyds Banking Group", "SC327000"),
+        ["Abbey National"] = new("Santander UK", "Acquired by Santander in 2004", "02294747"),
+        ["Alliance & Leicester"] = new("Santander UK", "Acquired by Santander in 2008", "03263713"),
+        ["Bradford & Bingley"] = new("Santander UK (savings) / UKAR (mortgages)", "Nationalised 2008, split up", "00189520"),
+        ["Northern Rock"] = new("Virgin Money UK", "Nationalised 2008, sold to Virgin Money 2012", "03273685"),
+
+        // Retail
+        ["Woolworths"] = new("Dissolved", "UK Woolworths went into administration in 2008", "00106966"),
+        ["British Home Stores"] = new("Dissolved", "BHS went into administration in 2016", "00229606"),
+        ["BHS"] = new("Dissolved", "BHS went into administration in 2016", "00229606"),
+        ["Littlewoods"] = new("Shop Direct / The Very Group", "Stores closed, online business continued", null),
+        // Company number left null: no verified Companies House number is recorded for this entry.
+        ["Comet"] = new("Dissolved", "Electrical retailer went into administration in 2012", null),
+        ["MFI"] = new("Dissolved", "Furniture retailer went into administration in 2008", null),
+        ["Courts"] = new("Dissolved", "Furniture retailer ceased UK operations", null),
+        ["Safeway"] = new("Morrisons", "UK stores acquired by Morrisons in 2004", "00358949"),
+        ["Kwik Save"] = new("Dissolved", "Supermarket chain dissolved in 2007", null),
+        ["Fine Fare"] = new("Dissolved", "Supermarket chain - stores sold to various buyers", null),
+        ["Gateway"] = new("Somerfield / Co-op", "Became Somerfield, then acquired by Co-op", null),
+        ["Somerfield"] = new("Co-operative Group", "Acquired by Co-op in 2009", null),
+
+        // Telecoms
+        ["British Telecom"] = new("BT Group", "Rebranded to BT", "01800000"),
+        ["GPO Telephones"] = new("BT Group", "Became British Telecom, then BT", "01800000"),
+        ["Mercury Communications"] = new("Cable & Wireless / Vodafone", "Merged into Cable & Wireless, later Vodafone", null),
+        ["Cellnet"] = new("O2 (Virgin Media O2)", "Became BT Cellnet, then
O2", null), + ["Orange"] = new("EE (BT)", "Merged with T-Mobile to form EE, acquired by BT", null), + ["T-Mobile UK"] = new("EE (BT)", "Merged with Orange to form EE", null), + ["One2One"] = new("EE (BT)", "Became T-Mobile UK, then EE", null), + + // Utilities + ["Central Electricity Generating Board"] = new("National Grid / Various generators", "CEGB privatised and split in 1990", null), + ["CEGB"] = new("National Grid / Various generators", "CEGB privatised and split in 1990", null), + ["British Gas"] = new("Centrica / National Grid", "Demerged in 1997", "00029782"), + ["Eastern Electricity"] = new("EDF Energy", "Privatised, now part of EDF", null), + ["London Electricity"] = new("EDF Energy", "Privatised, now part of EDF", null), + ["SEEBOARD"] = new("EDF Energy", "Privatised, now part of EDF", null), + ["PowerGen"] = new("E.ON UK", "Acquired by E.ON", null), + ["National Power"] = new("RWE npower / Innogy", "Split and acquired", null), + + // Manufacturing & Industrial + ["British Steel"] = new("Tata Steel UK / British Steel (2016)", "Privatised, acquired by Corus then Tata, British Steel name revived 2016", "12303256"), + ["British Steel Corporation"] = new("Tata Steel UK / British Steel (2016)", "Nationalised steel industry, privatised 1988", "12303256"), + ["British Steel plc"] = new("Tata Steel UK / British Steel (2016)", "Merged with Hoogovens to form Corus 1999", "12303256"), + ["Corus"] = new("Tata Steel UK", "Acquired by Tata Steel in 2007", null), + ["British Leyland"] = new("Various (BMW, Tata, etc.)", "Split up - brands went to various owners", null), + ["Rover Group"] = new("Dissolved", "Final owner MG Rover went bankrupt 2005", null), + ["MG Rover"] = new("Dissolved", "Went into administration in 2005", null), + ["Austin Rover"] = new("Dissolved", "Part of British Leyland, became Rover Group", null), + ["British Aerospace"] = new("BAE Systems", "Merged with Marconi Electronic Systems in 1999", "01470151"), + ["BAe"] = new("BAE Systems", "Merged 
with Marconi Electronic Systems in 1999", "01470151"), + ["Marconi"] = new("BAE Systems / Ericsson", "Defence division to BAE, telecoms to Ericsson", null), + ["GEC"] = new("Various", "General Electric Company (UK) - broken up", null), + ["GEC Marconi"] = new("BAE Systems", "Defence business became part of BAE Systems", "01470151"), + ["Plessey"] = new("Siemens / various", "Broken up in 1989", null), + ["ICL"] = new("Fujitsu", "Acquired by Fujitsu", null), + ["International Computers Limited"] = new("Fujitsu", "Acquired by Fujitsu in 2002", null), + ["Ferranti"] = new("Dissolved", "Collapsed in 1993 after fraud scandal", null), + + // Oil & Gas + ["British Petroleum"] = new("BP", "Rebranded to BP", "00102498"), + ["BP Amoco"] = new("BP", "Merged 1998, rebranded to just BP", "00102498"), + ["Enterprise Oil"] = new("Shell", "Acquired by Shell in 2002", null), + ["Lasmo"] = new("Eni", "Acquired by Eni in 2001", null), + ["Britoil"] = new("BP", "Acquired by BP in 1988", null), + + // Transport + ["British Rail"] = new("Various (Network Rail, TOCs)", "Privatised and split in 1990s", null), + ["British Railways"] = new("Various (Network Rail, TOCs)", "Became British Rail, then privatised", null), + ["Railtrack"] = new("Network Rail", "Replaced by Network Rail in 2002", "04402220"), + ["British Airways"] = new("British Airways (IAG)", "Now part of International Airlines Group", "01777777"), + ["British Caledonian"] = new("British Airways", "Acquired by BA in 1987", null), + ["British European Airways"] = new("British Airways", "Merged with BOAC to form BA in 1974", null), + ["BEA"] = new("British Airways", "Merged with BOAC to form BA in 1974", null), + ["BOAC"] = new("British Airways", "Merged with BEA to form BA in 1974", null), + ["British Overseas Airways Corporation"] = new("British Airways", "Merged with BEA to form BA in 1974", null), + ["Dan-Air"] = new("British Airways", "Acquired by BA in 1992", null), + + // Media + ["Thames Television"] = new("Fremantle", 
"Lost franchise 1991, production continued", null), + ["Granada Television"] = new("ITV plc", "Merged to form ITV plc", "04967001"), + ["Carlton Television"] = new("ITV plc", "Merged with Granada to form ITV", "04967001"), + ["Yorkshire Television"] = new("ITV plc", "Part of ITV plc", "04967001"), + ["Tyne Tees Television"] = new("ITV plc", "Part of ITV plc", "04967001"), + ["Central Television"] = new("ITV plc", "Part of ITV plc", "04967001"), + ["Anglia Television"] = new("ITV plc", "Part of ITV plc", "04967001"), + ["HTV"] = new("ITV plc", "Part of ITV plc", "04967001"), + ["LWT"] = new("ITV plc", "London Weekend Television, part of ITV", "04967001"), + ["London Weekend Television"] = new("ITV plc", "Part of ITV plc", "04967001"), + + // Construction + ["Wimpey"] = new("Taylor Wimpey", "Merged with Taylor Woodrow in 2007", "00296805"), + ["Taylor Woodrow"] = new("Taylor Wimpey", "Merged with Wimpey in 2007", "00296805"), + ["John Laing"] = new("John Laing Group (infrastructure)", "Construction sold, now infrastructure investor", "05975300"), + ["Costain Group"] = new("Costain", "Still trading", "00102921"), + ["Tarmac"] = new("Tarmac (CRH)", "Construction now part of CRH", null), + ["Alfred McAlpine"] = new("Carillion (dissolved)", "Acquired by Carillion, which collapsed 2018", null), + ["Carillion"] = new("Dissolved", "Collapsed into liquidation in 2018", "03782379"), + ["Mowlem"] = new("Carillion (dissolved)", "Acquired by Carillion in 2006", null), + ["Balfour Beatty"] = new("Balfour Beatty", "Still trading", "00395826"), + + // Insurance + ["Royal Insurance"] = new("RSA Insurance Group", "Merged with Sun Alliance", "02339826"), + ["Sun Alliance"] = new("RSA Insurance Group", "Merged with Royal Insurance", "02339826"), + ["Guardian Royal Exchange"] = new("AXA", "Acquired by AXA in 1999", null), + ["Commercial Union"] = new("Aviva", "Merged to form CGU, then Aviva", "02468686"), + ["General Accident"] = new("Aviva", "Merged to form CGU, then Aviva", 
"02468686"),
+        ["CGU"] = new("Aviva", "Rebranded to Aviva in 2002", "02468686"),
+        ["Norwich Union"] = new("Aviva", "Rebranded to Aviva in 2009", "02468686"),
+        ["Eagle Star"] = new("Zurich", "Acquired by Zurich", null),
+        ["Prudential"] = new("Prudential plc / M&G", "UK business demerged as M&G plc", "01397169"),
+    };
+
+    /// <summary>
+    /// Major UK charities and non-profit organisations.
+    /// These are legitimate employers but may not be found via standard company search.
+    /// </summary>
+    /// <remarks>
+    /// Lookups are case-insensitive (<see cref="StringComparer.OrdinalIgnoreCase"/>).
+    /// Each entry is also used as a pattern matched inside longer employer names,
+    /// so short entries (e.g. "Mind", "CAB") should be added with care.
+    /// </remarks>
+    public static readonly HashSet<string> CharityEmployers = new(StringComparer.OrdinalIgnoreCase)
+    {
+        // Youth organisations
+        "Girlguiding",
+        "Girlguiding UK",
+        "Girlguiding North East England",
+        "Girl Guides",
+        "Scouts",
+        "Scout Association",
+        "Boys Brigade",
+        "Girls Brigade",
+        "Cadets",
+        "Sea Cadets",
+        "Air Cadets",
+        "Army Cadets",
+
+        // Major charities
+        "British Red Cross",
+        "Oxfam",
+        "Save the Children",
+        "NSPCC",
+        "Barnardo's",
+        "RSPCA",
+        "RSPB",
+        "National Trust",
+        "Cancer Research UK",
+        "British Heart Foundation",
+        "Macmillan Cancer Support",
+        "Marie Curie",
+        "Age UK",
+        "Mind",
+        "Samaritans",
+        "Shelter",
+        "Citizens Advice",
+        "Citizens Advice Bureau",
+        "CAB",
+        "St John Ambulance",
+        "Salvation Army",
+        "YMCA",
+        "YWCA",
+
+        // Religious organisations ("Salvation Army" is already listed under major charities)
+        "Church of England",
+        "Catholic Church",
+        "Methodist Church",
+        "Baptist Church",
+    };
+
+    ///
+    /// Public sector organisations and government bodies.
+    /// These are legitimate employers but not registered at Companies House.
+    ///
+    /// <remarks>
+    /// Lookups are case-insensitive (<see cref="StringComparer.OrdinalIgnoreCase"/>).
+    /// The set mixes complete names ("HM Treasury") with generic fragments
+    /// ("London Borough", "University of") because every entry is also used as a
+    /// pattern matched inside longer employer names.
+    /// </remarks>
+    public static readonly HashSet<string> PublicSectorEmployers = new(StringComparer.OrdinalIgnoreCase)
+    {
+        // Emergency Services
+        "Metropolitan Police",
+        "Metropolitan Police Service",
+        "Metropolitan Police Engineers",
+        "Met Police",
+        "City of London Police",
+        "British Transport Police",
+        "Police Scotland",
+        "Police Service of Northern Ireland",
+        "PSNI",
+        "London Fire Brigade",
+        "London Ambulance Service",
+        "NHS",
+        "National Health Service",
+
+        // Government Departments
+        "HM Treasury",
+        "Home Office",
+        "Foreign Office",
+        "Ministry of Defence",
+        "MOD",
+        "Department of Health",
+        "Department for Education",
+        "DfE",
+        "Department for Work and Pensions",
+        "DWP",
+        "HMRC",
+        "HM Revenue and Customs",
+        "Cabinet Office",
+        "DVLA",
+        "DVSA",
+        "Environment Agency",
+        "Highways Agency",
+        "Highways England",
+        "National Highways",
+
+        // Armed Forces
+        "British Army",
+        "Royal Navy",
+        "Royal Air Force",
+        "RAF",
+        "Royal Marines",
+
+        // Local Government (generic fragments, e.g. "London Borough of X")
+        "London Borough",
+        "County Council",
+        "City Council",
+        "District Council",
+        "Metropolitan Borough",
+        "Borough Council",
+        "Town Council",
+        "Parish Council",
+        "Greater London Council",
+        "GLC",
+
+        // Education (generic fragments)
+        "University of",
+        "College of",
+        "School of",
+
+        // Other Public Bodies
+        "BBC",
+        "British Broadcasting Corporation",
+        "Channel 4",
+        "Bank of England",
+        "Royal Mail",
+        "Post Office",
+        "Transport for London",
+        "TfL",
+        "Network Rail",
+        "Ordnance Survey",
+        "Land Registry",
+        "Companies House",
+        "National Archives",
+        "British Library",
+        "British Museum",
+        "National Gallery",
+        "Tate",
+        "Natural History Museum",
+        "Science Museum",
+        "V&A",
+        "Victoria and Albert Museum",
+    };
+
+    ///
+    /// Patterns that indicate an internal division or department of a larger company.
+    /// These are legitimate employer references but won't be separately registered.
+    ///
+    /// <remarks>
+    /// Key: division name (case-insensitive). Value: name of the parent organisation.
+    /// Keys are also used as patterns matched inside longer employer names.
+    /// </remarks>
+    public static readonly Dictionary<string, string> DivisionPatterns = new(StringComparer.OrdinalIgnoreCase)
+    {
+        // Airlines
+        ["British Airways Technical Support"] = "British Airways",
+        ["BA Technical Support"] = "British Airways",
+        ["BA Engineering"] = "British Airways",
+        ["British Airways Engineering"] = "British Airways",
+        ["FBA - British Airways"] = "British Airways",
+
+        // Major employers with divisions
+        ["BBC News"] = "BBC",
+        ["BBC World Service"] = "BBC",
+        ["BBC Studios"] = "BBC",
+        ["ITV News"] = "ITV plc",
+        ["Sky News"] = "Sky UK",
+        ["BT Openreach"] = "BT Group",
+        ["Openreach"] = "BT Group",
+        ["BT Research"] = "BT Group",
+        ["Shell Research"] = "Shell",
+        ["BP Research"] = "BP",
+        ["Rolls-Royce Aerospace"] = "Rolls-Royce",
+        ["Rolls-Royce Marine"] = "Rolls-Royce",
+        ["BAE Systems Naval Ships"] = "BAE Systems",
+        ["BAE Systems Submarines"] = "BAE Systems",
+
+        // Banks - divisions
+        ["Barclays Investment Bank"] = "Barclays",
+        ["Barclays Capital"] = "Barclays",
+        ["HSBC Investment Bank"] = "HSBC",
+        ["Lloyds Commercial Banking"] = "Lloyds Banking Group",
+        ["NatWest Markets"] = "NatWest Group",
+        ["RBS Markets"] = "NatWest Group",
+    };
+
+    /// <summary>
+    /// Check if an employer name is a known historical company.
+    /// </summary>
+    /// <param name="employerName">Employer name as written on the CV; surrounding whitespace is ignored.</param>
+    /// <returns>
+    /// <c>true</c> when the trimmed name is a key of <see cref="HistoricalCompanies"/>;
+    /// <c>false</c> otherwise, including for null, empty or whitespace-only input.
+    /// </returns>
+    public static bool IsHistoricalEmployer(string employerName)
+    {
+        if (string.IsNullOrWhiteSpace(employerName))
+        {
+            return false;
+        }
+
+        return HistoricalCompanies.ContainsKey(employerName.Trim());
+    }
+
+    /// <summary>
+    /// Get information about a historical employer.
+    /// </summary>
+    /// <param name="employerName">Employer name as written on the CV; surrounding whitespace is ignored.</param>
+    /// <returns>
+    /// The <see cref="HistoricalEmployerInfo"/> stored for the trimmed name, or <c>null</c>
+    /// when the name is null, empty, whitespace-only or not a key of <see cref="HistoricalCompanies"/>.
+    /// </returns>
+    public static HistoricalEmployerInfo? GetHistoricalEmployerInfo(string employerName)
+    {
+        if (string.IsNullOrWhiteSpace(employerName))
+        {
+            return null;
+        }
+
+        return HistoricalCompanies.GetValueOrDefault(employerName.Trim());
+    }
+
+    ///
+    /// Check if an employer is a public sector organisation.
+    ///
+    /// <remarks>
+    /// Matches either the whole (trimmed) name or any <see cref="PublicSectorEmployers"/> entry
+    /// appearing inside it as a whole word or phrase, e.g. "London Borough of Camden".
+    /// Whole-word matching stops short entries such as "Tate", "RAF" or "MOD" from
+    /// matching inside unrelated words ("Estate", "Aircraft", "Modern").
+    /// </remarks>
+    /// <param name="employerName">Employer name as written on the CV; surrounding whitespace is ignored.</param>
+    /// <returns><c>true</c> on a match; <c>false</c> otherwise, including for null or blank input.</returns>
+    public static bool IsPublicSectorEmployer(string employerName)
+    {
+        if (string.IsNullOrWhiteSpace(employerName))
+        {
+            return false;
+        }
+
+        var name = employerName.Trim();
+
+        // Direct match (O(1), case-insensitive via the set's comparer)
+        if (PublicSectorEmployers.Contains(name))
+        {
+            return true;
+        }
+
+        // Partial match for patterns like "London Borough of X"
+        foreach (var pattern in PublicSectorEmployers)
+        {
+            if (ContainsWholeWord(name, pattern))
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /// <summary>
+    /// Check if an employer is a charity or non-profit organisation.
+    /// </summary>
+    /// <remarks>
+    /// Matches either the whole (trimmed) name or any <see cref="CharityEmployers"/> entry
+    /// appearing inside it as a whole word or phrase, so "Mind" matches "Mind (Leeds)"
+    /// but not "Mindshare".
+    /// </remarks>
+    /// <param name="employerName">Employer name as written on the CV; surrounding whitespace is ignored.</param>
+    /// <returns><c>true</c> on a match; <c>false</c> otherwise, including for null or blank input.</returns>
+    public static bool IsCharityEmployer(string employerName)
+    {
+        if (string.IsNullOrWhiteSpace(employerName))
+        {
+            return false;
+        }
+
+        var name = employerName.Trim();
+
+        // Direct match
+        if (CharityEmployers.Contains(name))
+        {
+            return true;
+        }
+
+        // Partial match on whole words only
+        foreach (var pattern in CharityEmployers)
+        {
+            if (ContainsWholeWord(name, pattern))
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /// <summary>
+    /// Check if an employer name is an internal division and get the parent company.
+    /// </summary>
+    /// <remarks>
+    /// An exact key match in <see cref="DivisionPatterns"/> wins; otherwise the first key found
+    /// inside the name as a whole word or phrase is used, so "BA Engineering" no longer
+    /// matches "ALBA Engineering".
+    /// </remarks>
+    /// <param name="employerName">Employer name as written on the CV; surrounding whitespace is ignored.</param>
+    /// <returns>The parent company name, or <c>null</c> when nothing matches or the input is null or blank.</returns>
+    public static string? GetParentCompanyForDivision(string employerName)
+    {
+        if (string.IsNullOrWhiteSpace(employerName))
+        {
+            return null;
+        }
+
+        var name = employerName.Trim();
+
+        // Direct match
+        if (DivisionPatterns.TryGetValue(name, out var parent))
+        {
+            return parent;
+        }
+
+        // Partial match on whole words only
+        foreach (var (pattern, parentCompany) in DivisionPatterns)
+        {
+            if (ContainsWholeWord(name, pattern))
+            {
+                return parentCompany;
+            }
+        }
+
+        return null;
+    }
+
+    /// <summary>
+    /// Case-insensitive search for <paramref name="pattern"/> inside <paramref name="text"/>
+    /// that only accepts occurrences not touching a letter or digit on either side.
+    /// </summary>
+    /// <param name="text">Text to search.</param>
+    /// <param name="pattern">Non-empty word or phrase to look for.</param>
+    /// <returns><c>true</c> when at least one occurrence sits on word boundaries.</returns>
+    private static bool ContainsWholeWord(string text, string pattern)
+    {
+        var searchFrom = 0;
+        while (searchFrom <= text.Length - pattern.Length)
+        {
+            var index = text.IndexOf(pattern, searchFrom, StringComparison.OrdinalIgnoreCase);
+            if (index < 0)
+            {
+                return false;
+            }
+
+            // Ordinal comparison guarantees the match is exactly pattern.Length characters long.
+            var end = index + pattern.Length;
+            var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
+            var endsOnBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);
+            if (startsOnBoundary && endsOnBoundary)
+            {
+                return true;
+            }
+
+            // This occurrence was embedded in a longer word; keep looking further along.
+            searchFrom = index + 1;
+        }
+
+        return false;
+    }
+}
+
+/// <summary>
+/// Information about a historical employer.
+/// </summary>
+/// <param name="SuccessorName">Current or successor organisation, or "Dissolved".</param>
+/// <param name="Notes">Short free-text history of what happened to the employer.</param>
+/// <param name="CompanyNumber">Companies House number where one is recorded; otherwise <c>null</c>.</param>
+public sealed record HistoricalEmployerInfo(
+    string SuccessorName,
+    string Notes,
+    string?
CompanyNumber +); diff --git a/src/RealCV.Application/Data/UKInstitutions.cs b/src/RealCV.Application/Data/UKInstitutions.cs index c43a284..dc66d1f 100644 --- a/src/RealCV.Application/Data/UKInstitutions.cs +++ b/src/RealCV.Application/Data/UKInstitutions.cs @@ -122,6 +122,28 @@ public static class UKInstitutions "Wrexham University", "York St John University", + // Post-1992 Universities (former polytechnics) + "Leeds Beckett University", + "Birmingham City University", + "University of Bedfordshire", + "Anglia Ruskin University", + "University of Central Lancashire", + "University of West London", + "University of Northampton", + "University of Chichester", + "Plymouth Marjon University", + "Bath Spa University", + "Solent University", + "University of Bolton", + "University of Cumbria", + "University of Chester", + "University of Gloucestershire", + "University of Suffolk", + "Newman University", + "Bishop Grosseteste University", + "Harper Adams University", + "Royal Agricultural University", + // Scottish Universities "University of Aberdeen", "Abertay University", @@ -134,6 +156,8 @@ public static class UKInstitutions "Bangor University", "University of South Wales", "Wrexham Glyndwr University", + "Wrexham University", + "Cardiff Metropolitan University", // Northern Ireland "Ulster University", @@ -304,6 +328,112 @@ public static class UKInstitutions ["South Bank University"] = "London South Bank University", ["LSBU"] = "London South Bank University", + // Historical polytechnic names (became universities in 1992) + // These are legitimate institutions that existed under different names + ["South Bank Polytechnic"] = "London South Bank University", + ["Polytechnic of the South Bank"] = "London South Bank University", + ["Thames Polytechnic"] = "University of Greenwich", + ["Woolwich Polytechnic"] = "University of Greenwich", + ["Polytechnic of Central London"] = "University of Westminster", + ["PCL"] = "University of Westminster", + ["Polytechnic of North 
London"] = "London Metropolitan University", + ["City of London Polytechnic"] = "London Metropolitan University", + ["London Guildhall University"] = "London Metropolitan University", + ["University of North London"] = "London Metropolitan University", + ["Polytechnic of East London"] = "University of East London", + ["North East London Polytechnic"] = "University of East London", + ["Middlesex Polytechnic"] = "Middlesex University", + ["Hatfield Polytechnic"] = "University of Hertfordshire", + ["Sheffield Polytechnic"] = "Sheffield Hallam University", + ["Sheffield City Polytechnic"] = "Sheffield Hallam University", + ["Manchester Polytechnic"] = "Manchester Metropolitan University", + ["Leeds Polytechnic"] = "Leeds Beckett University", + ["Leeds Metropolitan University"] = "Leeds Beckett University", + ["Leicester Polytechnic"] = "De Montfort University", + ["Coventry Polytechnic"] = "Coventry University", + ["Lanchester Polytechnic"] = "Coventry University", + ["Brighton Polytechnic"] = "University of Brighton", + ["Portsmouth Polytechnic"] = "University of Portsmouth", + ["Plymouth Polytechnic"] = "University of Plymouth", + ["Polytechnic South West"] = "University of Plymouth", + ["Oxford Polytechnic"] = "Oxford Brookes University", + ["Newcastle Polytechnic"] = "Northumbria University", + ["Newcastle upon Tyne Polytechnic"] = "Northumbria University", + ["Sunderland Polytechnic"] = "University of Sunderland", + ["Teesside Polytechnic"] = "Teesside University", + ["Huddersfield Polytechnic"] = "University of Huddersfield", + ["Wolverhampton Polytechnic"] = "University of Wolverhampton", + ["Liverpool Polytechnic"] = "Liverpool John Moores University", + ["Bristol Polytechnic"] = "University of the West of England", + ["Kingston Polytechnic"] = "Kingston University", + ["Nottingham Polytechnic"] = "Nottingham Trent University", + ["Trent Polytechnic"] = "Nottingham Trent University", + ["Birmingham Polytechnic"] = "Birmingham City University", + ["City of 
Birmingham Polytechnic"] = "Birmingham City University", + ["University of Central England"] = "Birmingham City University", + ["UCE Birmingham"] = "Birmingham City University", + ["Staffordshire Polytechnic"] = "Staffordshire University", + ["North Staffordshire Polytechnic"] = "Staffordshire University", + ["Luton College of Higher Education"] = "University of Bedfordshire", + ["University of Luton"] = "University of Bedfordshire", + ["Anglia Polytechnic"] = "Anglia Ruskin University", + ["Anglia Polytechnic University"] = "Anglia Ruskin University", + ["APU"] = "Anglia Ruskin University", + ["Cambridgeshire College of Arts and Technology"] = "Anglia Ruskin University", + ["CCAT"] = "Anglia Ruskin University", + ["Bournemouth Polytechnic"] = "Bournemouth University", + ["Dorset Institute of Higher Education"] = "Bournemouth University", + ["Derby College of Higher Education"] = "University of Derby", + ["Derbyshire College of Higher Education"] = "University of Derby", + ["Humberside Polytechnic"] = "University of Lincoln", + ["Humberside College of Higher Education"] = "University of Lincoln", + ["University of Humberside"] = "University of Lincoln", + ["University of Lincolnshire and Humberside"] = "University of Lincoln", + ["Central Lancashire Polytechnic"] = "University of Central Lancashire", + ["Preston Polytechnic"] = "University of Central Lancashire", + ["Lancashire Polytechnic"] = "University of Central Lancashire", + ["Glamorgan Polytechnic"] = "University of South Wales", + ["Polytechnic of Wales"] = "University of South Wales", + ["University of Glamorgan"] = "University of South Wales", + ["Robert Gordon Institute of Technology"] = "Robert Gordon University", + ["RGIT"] = "Robert Gordon University", + ["Napier Polytechnic"] = "Edinburgh Napier University", + ["Napier College"] = "Edinburgh Napier University", + ["Glasgow Polytechnic"] = "Glasgow Caledonian University", + ["Queen's College Glasgow"] = "Glasgow Caledonian University", + ["Dundee 
Institute of Technology"] = "Abertay University", + ["Dundee College of Technology"] = "Abertay University", + + // Other historical name changes + ["Roehampton Institute"] = "Roehampton University", + ["University of Surrey Roehampton"] = "Roehampton University", + ["Thames Valley University"] = "University of West London", + ["Polytechnic of West London"] = "University of West London", + ["Ealing College of Higher Education"] = "University of West London", + ["London College of Music and Media"] = "University of West London", + ["University College Northampton"] = "University of Northampton", + ["Nene College"] = "University of Northampton", + ["University College Worcester"] = "University of Worcester", + ["Worcester College of Higher Education"] = "University of Worcester", + ["University College Chichester"] = "University of Chichester", + ["Chichester Institute of Higher Education"] = "University of Chichester", + ["College of St Mark and St John"] = "Plymouth Marjon University", + ["Marjon"] = "Plymouth Marjon University", + ["University of St Mark and St John"] = "Plymouth Marjon University", + ["University College Falmouth"] = "Falmouth University", + ["Falmouth College of Arts"] = "Falmouth University", + ["Bath College of Higher Education"] = "Bath Spa University", + ["Bath Spa University College"] = "Bath Spa University", + ["Liverpool Institute of Higher Education"] = "Liverpool Hope University", + ["Liverpool Hope University College"] = "Liverpool Hope University", + ["University of Wales, Newport"] = "University of South Wales", + ["University of Wales Institute, Cardiff"] = "Cardiff Metropolitan University", + ["UWIC"] = "Cardiff Metropolitan University", + ["North East Wales Institute"] = "Wrexham University", + ["NEWI"] = "Wrexham University", + ["Glyndwr University"] = "Wrexham University", + ["Wrexham Glyndwr University"] = "Wrexham University", + // Other common variations ["Open University"] = "The Open University", ["OU"] = "The Open 
University", diff --git a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs index ea8c506..b96d87a 100644 --- a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs +++ b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs @@ -39,22 +39,33 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService Determine which candidate (if any) is the SAME company as the CV entry. - Rules: - 1. A match requires the companies to be the SAME organisation, not just similar names - 2. "Families First CiC" is NOT the same as "FAMILIES AGAINST CONFORMITY LTD" - different words = different companies - 3. Trading names should match their registered entity (e.g., "Tesco" matches "TESCO PLC") - 4. Subsidiaries can match if clearly the same organisation (e.g., "ASDA" could match "ASDA STORES LIMITED") - 5. Acronyms in parentheses are abbreviations of the full name (e.g., "North Halifax Partnership (NHP)" = "NORTH HALIFAX PARTNERSHIP") - 6. CiC/CIC = Community Interest Company, LLP = Limited Liability Partnership - these are legal suffixes - 7. If the CV name contains all the key words of a candidate (ignoring Ltd/Limited/CIC/etc.), it's likely a match - 8. If NO candidate is clearly the same company, return "NONE" as the best match + Matching Guidelines: + 1. MATCH if the CV name is clearly the same organisation as a candidate: + - "Royal Bank of Scotland" → "THE ROYAL BANK OF SCOTLAND PUBLIC LIMITED COMPANY" ✓ (same bank) + - "Yorkshire Electricity" → "YORKSHIRE ELECTRICITY GROUP PLC" ✓ (same utility) + - "Tesco" → "TESCO PLC" ✓ (trading name = registered name) + - "ASDA" → "ASDA STORES LIMITED" ✓ (brand = operating company) + + 2. 
DO NOT MATCH if the words are fundamentally different: + - "Families First" ≠ "FAMILIES AGAINST CONFORMITY" (different words after "Families") + - "Royal Bank" ≠ "Royal Academy" (Bank ≠ Academy) + - "Storm Ideas" ≠ "STORM LIMITED" (missing "Ideas" - could be different company) + + 3. Legal suffixes (Ltd, Limited, PLC, LLP, CiC) should be ignored when comparing names + + 4. Adding "THE" or "GROUP" to a name doesn't make it a different company + + 5. If unsure, prefer matching over rejecting when core identifying words match + + CRITICAL: Return the COMPLETE company number exactly as shown (e.g., "SC083026", "02366995"). + Do NOT truncate or abbreviate the company number. Respond with this exact JSON structure: { - "bestMatchCompanyNumber": "string (company number of best match, or 'NONE' if no valid match)", + "bestMatchCompanyNumber": "COMPLETE company number from the list above, or 'NONE' if no valid match", "confidenceScore": number (0-100, where 100 = certain match, 0 = no match), - "matchType": "string (Exact, TradingName, Subsidiary, Parent, NoMatch)", - "reasoning": "string (brief explanation of why this is or isn't a match)" + "matchType": "Exact|TradingName|Subsidiary|Parent|NoMatch", + "reasoning": "brief explanation" } """; @@ -81,8 +92,9 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService try { + // Format candidates with company number prominently displayed to prevent truncation var candidatesText = string.Join("\n", candidates.Select((c, i) => - $"{i + 1}. {c.CompanyName} (Number: {c.CompanyNumber}, Status: {c.CompanyStatus ?? "Unknown"})")); + $"[{c.CompanyNumber}] {c.CompanyName} (Status: {c.CompanyStatus ?? 
"Unknown"})")); var prompt = MatchingPrompt .Replace("{CV_COMPANY}", cvCompanyName) @@ -127,7 +139,8 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService aiResponse.BestMatchCompanyNumber, aiResponse.ConfidenceScore, aiResponse.Reasoning); // Find the matched candidate - if (aiResponse.BestMatchCompanyNumber == "NONE" || aiResponse.ConfidenceScore < 50) + // Lower threshold to 30 - we have fuzzy validation as backup + if (aiResponse.BestMatchCompanyNumber == "NONE" || aiResponse.ConfidenceScore < 30) { return new SemanticMatchResult { @@ -142,10 +155,40 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService var matchedCandidate = candidates.FirstOrDefault(c => c.CompanyNumber.Equals(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase)); + // If exact match not found, accept a candidate whose number starts with the returned value, + // but only when exactly one does - a truncated reply such as "09" can prefix several candidates + if (matchedCandidate is null && !string.IsNullOrWhiteSpace(aiResponse.BestMatchCompanyNumber) + && aiResponse.BestMatchCompanyNumber != "NONE") + { + var prefixMatches = candidates.Where(c => c.CompanyNumber.StartsWith(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase)).Take(2).ToList(); + var partialMatch = prefixMatches.Count == 1 ? prefixMatches[0] : null; + + if (partialMatch is not null) + { + _logger.LogDebug("AI returned partial company number '{Partial}', matched to full number '{Full}'", + aiResponse.BestMatchCompanyNumber, partialMatch.CompanyNumber); + matchedCandidate = partialMatch; + } + else + { + // Try reverse - maybe AI returned a longer string that contains the actual number (again requiring a single hit) + var containedMatches = candidates.Where(c => aiResponse.BestMatchCompanyNumber.Contains(c.CompanyNumber, StringComparison.OrdinalIgnoreCase)).Take(2).ToList(); + var reverseMatch = containedMatches.Count == 1 ? containedMatches[0] : null; + + if (reverseMatch is not null) + { + _logger.LogDebug("AI returned string containing company number '{Number}'", + reverseMatch.CompanyNumber); + matchedCandidate = reverseMatch; + } + } + } + if (matchedCandidate is 
null) { - _logger.LogWarning("AI returned company number {Number} not in candidates list", - aiResponse.BestMatchCompanyNumber); + _logger.LogWarning("AI returned company number '{Number}' not in candidates list. Candidates: {Candidates}", + aiResponse.BestMatchCompanyNumber, + string.Join(", ", candidates.Select(c => c.CompanyNumber))); return null; } diff --git a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs index 5206b4c..3df890f 100644 --- a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs +++ b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs @@ -2,6 +2,7 @@ using System.Text.Json; using FuzzySharp; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Logging; +using RealCV.Application.Data; using RealCV.Application.DTOs; using RealCV.Application.Helpers; using RealCV.Application.Interfaces; @@ -93,11 +94,140 @@ public sealed class CompanyVerifierService : ICompanyVerifierService { ArgumentException.ThrowIfNullOrWhiteSpace(companyName); - _logger.LogDebug("Verifying company: {CompanyName}", companyName); + // Normalize company name - strip trailing punctuation that causes matching issues + var normalizedName = NormalizeCompanyName(companyName); + _logger.LogDebug("Verifying company: {CompanyName} (normalized: {NormalizedName})", companyName, normalizedName); var flags = new List(); + // Check 1a: Is this a public sector employer? 
+ if (UKHistoricalEmployers.IsPublicSectorEmployer(normalizedName)) + { + _logger.LogInformation("Recognised public sector employer: {CompanyName}", companyName); + return new CompanyVerificationResult + { + ClaimedCompany = companyName, + MatchedCompanyName = companyName, + MatchedCompanyNumber = null, + MatchScore = 100, + IsVerified = true, + VerificationNotes = "Public sector employer - not registered at Companies House", + ClaimedStartDate = startDate, + ClaimedEndDate = endDate, + CompanyType = "public-sector", + CompanyStatus = "active", + ClaimedJobTitle = jobTitle, + Flags = flags + }; + } + + // Check 1b: Is this a charity or non-profit organisation? + if (UKHistoricalEmployers.IsCharityEmployer(normalizedName)) + { + _logger.LogInformation("Recognised charity employer: {CompanyName}", companyName); + return new CompanyVerificationResult + { + ClaimedCompany = companyName, + MatchedCompanyName = companyName, + MatchedCompanyNumber = null, + MatchScore = 100, + IsVerified = true, + VerificationNotes = "Charity/non-profit organisation", + ClaimedStartDate = startDate, + ClaimedEndDate = endDate, + CompanyType = "charity", + CompanyStatus = "active", + ClaimedJobTitle = jobTitle, + Flags = flags + }; + } + + // Check 2: Is this an internal division of a larger company? 
+ var parentCompany = UKHistoricalEmployers.GetParentCompanyForDivision(normalizedName); + if (parentCompany != null) + { + _logger.LogInformation("Recognised division '{CompanyName}' of parent company '{ParentCompany}'", companyName, parentCompany); + // Try to verify the parent company instead + var parentResult = await VerifyCompanyAsync(parentCompany, startDate, endDate, jobTitle); + if (parentResult.IsVerified) + { + return parentResult with + { + ClaimedCompany = companyName, + VerificationNotes = $"Internal division of {parentResult.MatchedCompanyName}" + }; + } + // If parent verification failed, return a partial match + return new CompanyVerificationResult + { + ClaimedCompany = companyName, + MatchedCompanyName = parentCompany, + MatchedCompanyNumber = null, + MatchScore = 85, + IsVerified = true, + VerificationNotes = $"Recognised as division of {parentCompany}", + ClaimedStartDate = startDate, + ClaimedEndDate = endDate, + ClaimedJobTitle = jobTitle, + Flags = flags + }; + } + + // Check 3: Is this a known historical employer? + var historicalInfo = UKHistoricalEmployers.GetHistoricalEmployerInfo(normalizedName); + if (historicalInfo != null) + { + _logger.LogInformation("Recognised historical employer: {CompanyName} -> {Successor}", companyName, historicalInfo.SuccessorName); + + // If we have a company number for the successor, try to get current details + if (!string.IsNullOrEmpty(historicalInfo.CompanyNumber)) + { + try + { + var successorDetails = await _companiesHouseClient.GetCompanyAsync(historicalInfo.CompanyNumber); + if (successorDetails != null) + { + return new CompanyVerificationResult + { + ClaimedCompany = companyName, + MatchedCompanyName = $"{companyName} (now {successorDetails.CompanyName})", + MatchedCompanyNumber = historicalInfo.CompanyNumber, + MatchScore = 90, + IsVerified = true, + VerificationNotes = $"Historical company. 
{historicalInfo.Notes}", + ClaimedStartDate = startDate, + ClaimedEndDate = endDate, + CompanyType = successorDetails.Type, + CompanyStatus = "historical", + ClaimedJobTitle = jobTitle, + Flags = flags + }; + } + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to fetch successor company details for {CompanyNumber}", historicalInfo.CompanyNumber); + } + } + + // Return historical match without successor details + return new CompanyVerificationResult + { + ClaimedCompany = companyName, + MatchedCompanyName = $"{companyName} (now {historicalInfo.SuccessorName})", + MatchedCompanyNumber = historicalInfo.CompanyNumber, + MatchScore = 90, + IsVerified = true, + VerificationNotes = $"Historical company. {historicalInfo.Notes}", + ClaimedStartDate = startDate, + ClaimedEndDate = endDate, + CompanyStatus = "historical", + ClaimedJobTitle = jobTitle, + Flags = flags + }; + } + // Try to find a cached match first (but only if it existed at claimed start date) - var cachedMatch = await FindCachedMatchAsync(companyName); + var cachedMatch = await FindCachedMatchAsync(normalizedName); if (cachedMatch is not null) { // Check if cached company existed at the claimed start date @@ -119,9 +249,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService // Search Companies House with fallback queries try { - var searchQueries = GenerateSearchQueries(companyName); + var searchQueries = GenerateSearchQueries(normalizedName); _logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}", - searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'"))); + searchQueries.Count, normalizedName, string.Join(", ", searchQueries.Select(q => $"'{q}'"))); // Collect all candidates from all search queries for AI matching var allCandidates = new Dictionary(); @@ -148,7 +278,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService } // Find fuzzy matches (as before) for fallback - var fuzzyMatch = 
FindBestMatch(companyName, query, searchResponse.Items, startDate); + var fuzzyMatch = FindBestMatch(normalizedName, query, searchResponse.Items, startDate); if (fuzzyMatch is not null) { fuzzyMatches.Add(fuzzyMatch.Value); @@ -157,30 +287,47 @@ public sealed class CompanyVerifierService : ICompanyVerifierService if (allCandidates.Count == 0) { - _logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count); + _logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", normalizedName, searchQueries.Count); return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, "Company name could not be verified against official records"); } // Use AI to find the best semantic match from all candidates - _logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", companyName, allCandidates.Count); + _logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", normalizedName, allCandidates.Count); + // Sort candidates by fuzzy relevance to the search term before taking top 10 + // This ensures the most likely matches are sent to the AI, not just arbitrary entries + var normalizedUpper = normalizedName.ToUpperInvariant(); var candidatesForAI = allCandidates.Values - .Take(10) // Limit to top 10 candidates to reduce AI cost - .Select(c => new CompanyCandidate + .Select(c => new { - CompanyName = c.Title, - CompanyNumber = c.CompanyNumber, - CompanyStatus = c.CompanyStatus, - DateOfCreation = c.DateOfCreation + Item = c, + Score = Fuzz.TokenSetRatio(normalizedUpper, c.Title.ToUpperInvariant()) + }) + .OrderByDescending(x => x.Score) + .Take(10) + .Select(x => new CompanyCandidate + { + CompanyName = x.Item.Title, + CompanyNumber = x.Item.CompanyNumber, + CompanyStatus = x.Item.CompanyStatus, + DateOfCreation = x.Item.DateOfCreation }) .ToList(); - var aiResult = await _aiMatcher.FindBestMatchAsync(companyName, candidatesForAI); + 
_logger.LogDebug("Top candidates for AI matching (sorted by relevance): {Candidates}", + string.Join(", ", candidatesForAI.Select(c => $"{c.CompanyName} [{c.CompanyNumber}]"))); + + var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI); CompaniesHouseSearchItem? matchedItem = null; int matchScore; + // Get best fuzzy match for potential fallback + var bestFuzzy = fuzzyMatches.Count > 0 + ? fuzzyMatches.OrderByDescending(m => m.Score).First() + : ((CompaniesHouseSearchItem Item, int Score)?)null; + if (aiResult is not null && aiResult.IsMatch) { // AI found a valid match @@ -195,21 +342,63 @@ public sealed class CompanyVerifierService : ICompanyVerifierService // AI didn't find a match - check if it explicitly rejected or just failed if (aiResult?.MatchType == "NoMatch") { - _logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}", - companyName, aiResult?.Reasoning ?? "No match found"); - return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, - "Company name could not be verified - no matching company found in official records"); - } + // AI explicitly rejected. Only override if fuzzy match passes strict validation: + // 1. High fuzzy score (>= 90%) + // 2. ALL core identifying words from original name appear in the match + // 3. Match doesn't have significantly more core words (prevents partial word matches) + if (bestFuzzy.HasValue && bestFuzzy.Value.Score >= 90) + { + var originalCores = ExtractCoreIdentifiers(normalizedName); + var matchCores = ExtractCoreIdentifiers(bestFuzzy.Value.Item.Title); - // AI failed (API error, etc.) 
- fall back to fuzzy matching - _logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", companyName); - var bestFuzzy = fuzzyMatches.OrderByDescending(m => m.Score).First(); - matchedItem = bestFuzzy.Item; - matchScore = bestFuzzy.Score; + // All original core words must appear in the match + var allCoresPresent = originalCores.Count == 0 || + originalCores.All(c => bestFuzzy.Value.Item.Title.Contains(c, StringComparison.OrdinalIgnoreCase)); + + // Match shouldn't have too many extra core words (max 2 extra, e.g., "GROUP PLC") + var extraCores = matchCores.Count(c => !originalCores.Any(o => + c.Equals(o, StringComparison.OrdinalIgnoreCase))); + var reasonableExtras = extraCores <= 2; + + if (allCoresPresent && reasonableExtras) + { + _logger.LogInformation( + "AI rejected '{CompanyName}' but fuzzy match '{MatchedName}' ({Score}%) passes validation. " + + "Original cores: [{OriginalCores}], Match cores: [{MatchCores}]", + normalizedName, bestFuzzy.Value.Item.Title, bestFuzzy.Value.Score, + string.Join(", ", originalCores), string.Join(", ", matchCores)); + matchedItem = bestFuzzy.Value.Item; + matchScore = bestFuzzy.Value.Score; + } + else + { + _logger.LogDebug( + "AI rejected '{CompanyName}' and fuzzy match '{MatchedName}' fails validation. " + + "AllCoresPresent: {AllCores}, ExtraCores: {Extra}", + normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, extraCores); + return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, + "Company name could not be verified - no matching company found in official records"); + } + } + else + { + _logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}", + normalizedName, aiResult?.Reasoning ?? "No match found"); + return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, + "Company name could not be verified - no matching company found in official records"); + } + } + else + { + // AI failed (API error, etc.) 
- fall back to fuzzy matching + _logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", normalizedName); + matchedItem = bestFuzzy!.Value.Item; + matchScore = bestFuzzy!.Value.Score; + } } else { - _logger.LogDebug("No valid match found for: {CompanyName}", companyName); + _logger.LogDebug("No valid match found for: {CompanyName}", normalizedName); return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, "Company name could not be verified against official records"); } @@ -624,6 +813,26 @@ public sealed class CompanyVerifierService : ICompanyVerifierService #region Helper Methods + /// <summary> + /// Normalizes a company name by collapsing whitespace runs and removing trailing punctuation and spaces. + /// Returns the input unchanged when it is null or whitespace. + /// </summary> + private static string NormalizeCompanyName(string companyName) + { + if (string.IsNullOrWhiteSpace(companyName)) + return companyName; + + // Collapse whitespace first so any separator left before trailing punctuation is a plain space + var normalized = System.Text.RegularExpressions.Regex.Replace(companyName.Trim(), @"\s+", " "); + + // Remove trailing punctuation (dots, commas, etc.) that cause matching issues + // e.g., "Glaxo Research & Development Ltd." 
-> "Glaxo Research & Development Ltd" + // Spaces are trimmed in the same pass so "Acme Ltd ." does not come back as "Acme Ltd " + normalized = normalized.TrimEnd('.', ',', ';', ':', '!', '?', ' '); + + return normalized; + } + private async Task FindCachedMatchAsync(string companyName) { var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays); @@ -790,12 +999,13 @@ public sealed class CompanyVerifierService : ICompanyVerifierService var searchText = originalLower + " " + queryLower; // Penalize subsidiary indicators (unless search explicitly included them) + // Use word boundary matching to avoid "SCOTLAND" matching "land" foreach (var indicator in SubsidiaryIndicators) { - if (itemTitleLower.Contains(indicator)) + if (ContainsWholeWord(itemTitleLower, indicator)) { // Only penalize if the search didn't explicitly include this indicator - if (!searchText.Contains(indicator)) + if (!ContainsWholeWord(searchText, indicator)) { score -= 10; // Significant penalty for subsidiaries } @@ -806,7 +1016,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService // Boost main company indicators foreach (var indicator in MainCompanyIndicators) { - if (itemTitleLower.Contains(indicator)) + if (ContainsWholeWord(itemTitleLower, indicator)) { score += 5; // Boost for main trading companies break; // Only apply one boost @@ -1168,7 +1378,10 @@ public sealed class CompanyVerifierService : ICompanyVerifierService } // Check if the item matches any pattern in this non-employment category - if (patterns.Any(pattern => itemTitleLower.Contains(pattern))) + // Use whole-word matching for single words, substring for multi-word patterns + if (patterns.Any(pattern => pattern.Contains(' ') + ? 
itemTitleLower.Contains(pattern) + : ContainsWholeWord(itemTitleLower, pattern))) { return false; // This is a non-employment entity type that wasn't explicitly searched for } @@ -1177,6 +1390,19 @@ public sealed class CompanyVerifierService : ICompanyVerifierService return true; // No non-employment patterns matched, this is likely a valid employment entity } + /// <summary> + /// Checks, ignoring case, if a string contains a word as a whole word (not as a substring of another word). + /// E.g., "scotland" does NOT contain whole word "land", but "land holdings" does. Null/empty arguments give false. + /// </summary> + private static bool ContainsWholeWord(string text, string word) + { + if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(word)) + return false; + // Lookarounds rather than \b: \b needs a word character at the edge, so "(uk)" or "ltd." could never match + var pattern = $@"(?<!\w){System.Text.RegularExpressions.Regex.Escape(word)}(?!\w)"; + return System.Text.RegularExpressions.Regex.IsMatch(text, pattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase); + } + // Expanded skip words list for core identifier extraction // These words are too common to be meaningful differentiators between companies private static readonly HashSet SkipWords = new(StringComparer.OrdinalIgnoreCase) @@ -1220,8 +1446,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService "new", "modern", "advanced", "innovative", "premier", "elite", "premium", "quality", "superior", "excellent", "best", "top", "leading", "major", - // Ownership indicators - "royal", "imperial", "crown", "state", "public", "private", "independent", + // Ownership indicators (excluding "royal" as it's a meaningful company identifier) + "imperial", "crown", "state", "public", "private", "independent", "mutual", "cooperative", "coop", "community", // Time-related @@ -1235,7 +1461,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService /// Extracts ALL core identifying words from a company name. /// These are significant words that aren't common prefixes/suffixes. 
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"] - /// "Bank of Scotland" -> ["BANK", "SCOTLAND"] + /// "Royal Bank of Scotland" -> ["ROYAL", "BANK"] (Scotland is a geographic skipWord) /// private static List ExtractCoreIdentifiers(string companyName) {