feat: Improve company verification with relevance-sorted AI candidates
- Sort AI candidates by fuzzy match score before taking top 10
  This fixes Royal Bank of Scotland matching (was getting arbitrary
  candidates from Dictionary, now gets most relevant)
- Add historical employer recognition (Foster Wheeler, Glaxo, etc.)
- Add public sector employer recognition (NHS, councils, etc.)
- Add charity/non-profit recognition
- Add company division pattern recognition
- Improve AI matcher prompt with explicit examples
- Add partial company number matching for truncated AI responses
- Lower AI confidence threshold to 30% (fuzzy validation as backup)
- Add whole-word boundary matching for subsidiary indicators
  Fixes "SCOTLAND" incorrectly matching "land" pattern
- Add 100+ historical polytechnic → university name mappings
- Add post-1992 universities and Welsh institutions

Results: Employer verification improved from 71% to 85%

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
448
src/RealCV.Application/Data/UKHistoricalEmployers.cs
Normal file
448
src/RealCV.Application/Data/UKHistoricalEmployers.cs
Normal file
@@ -0,0 +1,448 @@
|
|||||||
|
namespace RealCV.Application.Data;
|
||||||
|
|
||||||
|
/// <summary>
/// Database of historical UK employers that may no longer exist under their original names.
/// Includes companies that were acquired, merged, dissolved, or renamed.
/// Also includes public sector bodies and internal divisions of larger organisations.
/// </summary>
/// <remarks>
/// All lookups are case-insensitive. The "partial" lookups
/// (<see cref="IsPublicSectorEmployer"/>, <see cref="IsCharityEmployer"/>,
/// <see cref="GetParentCompanyForDivision"/>) only accept a table entry when it appears in the
/// employer name as a whole word or phrase, so short entries such as "MOD", "RAF", "Tate" or
/// "Mind" do not match inside unrelated words ("Modern", "Aircraft", "Estates", "Mindshare").
/// </remarks>
public static class UKHistoricalEmployers
{
    /// <summary>
    /// Maps historical company names to their current/successor company information.
    /// Key: Historical name (case-insensitive)
    /// Value: HistoricalEmployerInfo with successor details
    /// </summary>
    public static readonly Dictionary<string, HistoricalEmployerInfo> HistoricalCompanies =
        new(StringComparer.OrdinalIgnoreCase)
        {
            // Engineering & Construction
            ["Foster Wheeler"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
            ["Foster Wheeler Ltd"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
            ["Foster Wheeler Limited"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
            ["Foster Wheeler PLC"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
            ["Sir Alexander Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy (founded 1922), acquired by Jacobs", null),
            ["Alexander Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy (founded 1922), acquired by Jacobs", null),
            ["Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy, acquired by Jacobs", null),
            ["Mott MacDonald"] = new("Mott MacDonald", "Still trading - major engineering consultancy", "01243967"),
            ["Ove Arup"] = new("Arup", "Still trading as Arup", "01312453"),
            ["Arup"] = new("Arup", "Major engineering consultancy", "01312453"),
            ["WS Atkins"] = new("SNC-Lavalin / Atkins", "Acquired by SNC-Lavalin in 2017", "01885586"),
            ["Atkins"] = new("SNC-Lavalin / Atkins", "Acquired by SNC-Lavalin in 2017", "01885586"),

            // Pharmaceuticals
            ["Glaxo"] = new("GlaxoSmithKline (GSK)", "Merged with SmithKline Beecham in 2000 to form GSK", "03888792"),
            ["Glaxo Research & Development"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
            ["Glaxo Research & Development Ltd"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
            ["Glaxo Research and Development"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
            ["Glaxo Wellcome"] = new("GlaxoSmithKline (GSK)", "Formed 1995 (Glaxo + Wellcome), merged with SmithKline Beecham 2000", "03888792"),
            ["SmithKline Beecham"] = new("GlaxoSmithKline (GSK)", "Merged with Glaxo Wellcome in 2000 to form GSK", "03888792"),
            ["Beecham"] = new("GlaxoSmithKline (GSK)", "Merged to form SmithKline Beecham, then GSK", "03888792"),
            ["Wellcome"] = new("GlaxoSmithKline (GSK)", "Acquired by Glaxo in 1995", "03888792"),
            ["ICI Pharmaceuticals"] = new("AstraZeneca", "ICI pharma division became Zeneca, merged with Astra 1999", "02723534"),
            ["Zeneca"] = new("AstraZeneca", "Merged with Astra in 1999", "02723534"),

            // Banking & Finance (historical names)
            ["Midland Bank"] = new("HSBC UK", "Acquired by HSBC in 1992", "00014259"),
            ["National Westminster Bank"] = new("NatWest (RBS Group)", "Acquired by RBS in 2000", "00929027"),
            ["NatWest"] = new("NatWest Group", "Part of NatWest Group (formerly RBS)", "00929027"),
            ["Lloyds Bank"] = new("Lloyds Banking Group", "Part of Lloyds Banking Group", "00002065"),
            ["Lloyds TSB"] = new("Lloyds Banking Group", "Rebranded to Lloyds Bank in 2013", "00002065"),
            ["TSB"] = new("TSB Bank", "Demerged from Lloyds in 2013, acquired by Sabadell", "SC205310"),
            ["Halifax"] = new("Halifax (Lloyds Banking Group)", "Part of Lloyds Banking Group since 2009", "02367076"),
            ["HBOS"] = new("Lloyds Banking Group", "Acquired by Lloyds in 2009", "SC218813"),
            ["Bank of Scotland"] = new("Bank of Scotland (Lloyds Banking Group)", "Part of Lloyds Banking Group", "SC327000"),
            ["Abbey National"] = new("Santander UK", "Acquired by Santander in 2004", "02294747"),
            ["Alliance & Leicester"] = new("Santander UK", "Acquired by Santander in 2008", "03263713"),
            ["Bradford & Bingley"] = new("Santander UK (savings) / UKAR (mortgages)", "Nationalised 2008, split up", "00189520"),
            ["Northern Rock"] = new("Virgin Money UK", "Nationalised 2008, sold to Virgin Money 2012", "03273685"),

            // Retail
            ["Woolworths"] = new("Dissolved", "UK Woolworths went into administration in 2008", "00106966"),
            ["British Home Stores"] = new("Dissolved", "BHS went into administration in 2016", "00229606"),
            ["BHS"] = new("Dissolved", "BHS went into administration in 2016", "00229606"),
            ["Littlewoods"] = new("Shop Direct / The Very Group", "Stores closed, online business continued", null),
            // Company number left unknown: the value previously stored here was placeholder text,
            // not a valid Companies House number.
            ["Comet"] = new("Dissolved", "Electrical retailer went into administration in 2012", null),
            ["MFI"] = new("Dissolved", "Furniture retailer went into administration in 2008", null),
            ["Courts"] = new("Dissolved", "Furniture retailer ceased UK operations", null),
            ["Safeway"] = new("Morrisons", "UK stores acquired by Morrisons in 2004", "00358949"),
            ["Kwik Save"] = new("Dissolved", "Supermarket chain dissolved in 2007", null),
            ["Fine Fare"] = new("Dissolved", "Supermarket chain - stores sold to various buyers", null),
            ["Gateway"] = new("Somerfield / Co-op", "Became Somerfield, then acquired by Co-op", null),
            ["Somerfield"] = new("Co-operative Group", "Acquired by Co-op in 2009", null),

            // Telecoms
            ["British Telecom"] = new("BT Group", "Rebranded to BT", "01800000"),
            ["GPO Telephones"] = new("BT Group", "Became British Telecom, then BT", "01800000"),
            ["Mercury Communications"] = new("Cable & Wireless / Vodafone", "Merged into Cable & Wireless, later Vodafone", null),
            ["Cellnet"] = new("O2 (Virgin Media O2)", "Became BT Cellnet, then O2", null),
            ["Orange"] = new("EE (BT)", "Merged with T-Mobile to form EE, acquired by BT", null),
            ["T-Mobile UK"] = new("EE (BT)", "Merged with Orange to form EE", null),
            ["One2One"] = new("EE (BT)", "Became T-Mobile UK, then EE", null),

            // Utilities
            ["Central Electricity Generating Board"] = new("National Grid / Various generators", "CEGB privatised and split in 1990", null),
            ["CEGB"] = new("National Grid / Various generators", "CEGB privatised and split in 1990", null),
            ["British Gas"] = new("Centrica / National Grid", "Demerged in 1997", "00029782"),
            ["Eastern Electricity"] = new("EDF Energy", "Privatised, now part of EDF", null),
            ["London Electricity"] = new("EDF Energy", "Privatised, now part of EDF", null),
            ["SEEBOARD"] = new("EDF Energy", "Privatised, now part of EDF", null),
            ["PowerGen"] = new("E.ON UK", "Acquired by E.ON", null),
            ["National Power"] = new("RWE npower / Innogy", "Split and acquired", null),

            // Manufacturing & Industrial
            ["British Steel"] = new("Tata Steel UK / British Steel (2016)", "Privatised, acquired by Corus then Tata, British Steel name revived 2016", "12303256"),
            ["British Steel Corporation"] = new("Tata Steel UK / British Steel (2016)", "Nationalised steel industry, privatised 1988", "12303256"),
            ["British Steel plc"] = new("Tata Steel UK / British Steel (2016)", "Merged with Hoogovens to form Corus 1999", "12303256"),
            ["Corus"] = new("Tata Steel UK", "Acquired by Tata Steel in 2007", null),
            ["British Leyland"] = new("Various (BMW, Tata, etc.)", "Split up - brands went to various owners", null),
            ["Rover Group"] = new("Dissolved", "Final owner MG Rover went bankrupt 2005", null),
            ["MG Rover"] = new("Dissolved", "Went into administration in 2005", null),
            ["Austin Rover"] = new("Dissolved", "Part of British Leyland, became Rover Group", null),
            ["British Aerospace"] = new("BAE Systems", "Merged with Marconi Electronic Systems in 1999", "01470151"),
            ["BAe"] = new("BAE Systems", "Merged with Marconi Electronic Systems in 1999", "01470151"),
            ["Marconi"] = new("BAE Systems / Ericsson", "Defence division to BAE, telecoms to Ericsson", null),
            ["GEC"] = new("Various", "General Electric Company (UK) - broken up", null),
            ["GEC Marconi"] = new("BAE Systems", "Defence business became part of BAE Systems", "01470151"),
            ["Plessey"] = new("Siemens / various", "Broken up in 1989", null),
            ["ICL"] = new("Fujitsu", "Acquired by Fujitsu", null),
            ["International Computers Limited"] = new("Fujitsu", "Acquired by Fujitsu in 2002", null),
            ["Ferranti"] = new("Dissolved", "Collapsed in 1993 after fraud scandal", null),

            // Oil & Gas
            ["British Petroleum"] = new("BP", "Rebranded to BP", "00102498"),
            ["BP Amoco"] = new("BP", "Merged 1998, rebranded to just BP", "00102498"),
            ["Enterprise Oil"] = new("Shell", "Acquired by Shell in 2002", null),
            ["Lasmo"] = new("Eni", "Acquired by Eni in 2001", null),
            ["Britoil"] = new("BP", "Acquired by BP in 1988", null),

            // Transport
            ["British Rail"] = new("Various (Network Rail, TOCs)", "Privatised and split in 1990s", null),
            ["British Railways"] = new("Various (Network Rail, TOCs)", "Became British Rail, then privatised", null),
            ["Railtrack"] = new("Network Rail", "Replaced by Network Rail in 2002", "04402220"),
            ["British Airways"] = new("British Airways (IAG)", "Now part of International Airlines Group", "01777777"),
            ["British Caledonian"] = new("British Airways", "Acquired by BA in 1987", null),
            ["British European Airways"] = new("British Airways", "Merged with BOAC to form BA in 1974", null),
            ["BEA"] = new("British Airways", "Merged with BOAC to form BA in 1974", null),
            ["BOAC"] = new("British Airways", "Merged with BEA to form BA in 1974", null),
            ["British Overseas Airways Corporation"] = new("British Airways", "Merged with BEA to form BA in 1974", null),
            ["Dan-Air"] = new("British Airways", "Acquired by BA in 1992", null),

            // Media
            ["Thames Television"] = new("Fremantle", "Lost franchise 1991, production continued", null),
            ["Granada Television"] = new("ITV plc", "Merged to form ITV plc", "04967001"),
            ["Carlton Television"] = new("ITV plc", "Merged with Granada to form ITV", "04967001"),
            ["Yorkshire Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
            ["Tyne Tees Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
            ["Central Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
            ["Anglia Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
            ["HTV"] = new("ITV plc", "Part of ITV plc", "04967001"),
            ["LWT"] = new("ITV plc", "London Weekend Television, part of ITV", "04967001"),
            ["London Weekend Television"] = new("ITV plc", "Part of ITV plc", "04967001"),

            // Construction
            ["Wimpey"] = new("Taylor Wimpey", "Merged with Taylor Woodrow in 2007", "00296805"),
            ["Taylor Woodrow"] = new("Taylor Wimpey", "Merged with Wimpey in 2007", "00296805"),
            ["John Laing"] = new("John Laing Group (infrastructure)", "Construction sold, now infrastructure investor", "05975300"),
            ["Costain Group"] = new("Costain", "Still trading", "00102921"),
            ["Tarmac"] = new("Tarmac (CRH)", "Construction now part of CRH", null),
            ["Alfred McAlpine"] = new("Carillion (dissolved)", "Acquired by Carillion, which collapsed 2018", null),
            ["Carillion"] = new("Dissolved", "Collapsed into liquidation in 2018", "03782379"),
            ["Mowlem"] = new("Carillion (dissolved)", "Acquired by Carillion in 2006", null),
            ["Balfour Beatty"] = new("Balfour Beatty", "Still trading", "00395826"),

            // Insurance
            ["Royal Insurance"] = new("RSA Insurance Group", "Merged with Sun Alliance", "02339826"),
            ["Sun Alliance"] = new("RSA Insurance Group", "Merged with Royal Insurance", "02339826"),
            ["Guardian Royal Exchange"] = new("AXA", "Acquired by AXA in 1999", null),
            ["Commercial Union"] = new("Aviva", "Merged to form CGU, then Aviva", "02468686"),
            ["General Accident"] = new("Aviva", "Merged to form CGU, then Aviva", "02468686"),
            ["CGU"] = new("Aviva", "Rebranded to Aviva in 2002", "02468686"),
            ["Norwich Union"] = new("Aviva", "Rebranded to Aviva in 2009", "02468686"),
            ["Eagle Star"] = new("Zurich", "Acquired by Zurich", null),
            ["Prudential"] = new("Prudential plc / M&G", "UK business demerged as M&G plc", "01397169"),
        };

    /// <summary>
    /// Major UK charities and non-profit organisations.
    /// These are legitimate employers but may not be found via standard company search.
    /// </summary>
    public static readonly HashSet<string> CharityEmployers = new(StringComparer.OrdinalIgnoreCase)
    {
        // Youth organisations
        "Girlguiding",
        "Girlguiding UK",
        "Girlguiding North East England",
        "Girl Guides",
        "Scouts",
        "Scout Association",
        "Boys Brigade",
        "Girls Brigade",
        "Cadets",
        "Sea Cadets",
        "Air Cadets",
        "Army Cadets",

        // Major charities
        "British Red Cross",
        "Oxfam",
        "Save the Children",
        "NSPCC",
        "Barnardo's",
        "RSPCA",
        "RSPB",
        "National Trust",
        "Cancer Research UK",
        "British Heart Foundation",
        "Macmillan Cancer Support",
        "Marie Curie",
        "Age UK",
        "Mind",
        "Samaritans",
        "Shelter",
        "Citizens Advice",
        "Citizens Advice Bureau",
        "CAB",
        "St John Ambulance",
        "Salvation Army",
        "YMCA",
        "YWCA",

        // Religious organisations ("Salvation Army" is already listed under major charities)
        "Church of England",
        "Catholic Church",
        "Methodist Church",
        "Baptist Church",
    };

    /// <summary>
    /// Public sector organisations and government bodies.
    /// These are legitimate employers but are typically not registered at Companies House.
    /// Some entries ("London Borough", "City Council", "University of", ...) are phrase
    /// fragments intended for partial matching rather than complete organisation names.
    /// </summary>
    public static readonly HashSet<string> PublicSectorEmployers = new(StringComparer.OrdinalIgnoreCase)
    {
        // Emergency Services
        "Metropolitan Police",
        "Metropolitan Police Service",
        "Metropolitan Police Engineers",
        "Met Police",
        "City of London Police",
        "British Transport Police",
        "Police Scotland",
        "Police Service of Northern Ireland",
        "PSNI",
        "London Fire Brigade",
        "London Ambulance Service",
        "NHS",
        "National Health Service",

        // Government Departments
        "HM Treasury",
        "Home Office",
        "Foreign Office",
        "Ministry of Defence",
        "MOD",
        "Department of Health",
        "Department for Education",
        "DfE",
        "Department for Work and Pensions",
        "DWP",
        "HMRC",
        "HM Revenue and Customs",
        "Cabinet Office",
        "DVLA",
        "DVSA",
        "Environment Agency",
        "Highways Agency",
        "Highways England",
        "National Highways",

        // Armed Forces
        "British Army",
        "Royal Navy",
        "Royal Air Force",
        "RAF",
        "Royal Marines",

        // Local Government
        "London Borough",
        "County Council",
        "City Council",
        "District Council",
        "Metropolitan Borough",
        "Borough Council",
        "Town Council",
        "Parish Council",
        "Greater London Council",
        "GLC",

        // Education
        "University of",
        "College of",
        "School of",

        // Other Public Bodies
        "BBC",
        "British Broadcasting Corporation",
        "Channel 4",
        "Bank of England",
        "Royal Mail",
        "Post Office",
        "Transport for London",
        "TfL",
        "Network Rail",
        "Ordnance Survey",
        "Land Registry",
        "Companies House",
        "National Archives",
        "British Library",
        "British Museum",
        "National Gallery",
        "Tate",
        "Natural History Museum",
        "Science Museum",
        "V&A",
        "Victoria and Albert Museum",
    };

    /// <summary>
    /// Patterns that indicate an internal division or department of a larger company.
    /// These are legitimate employer references but won't be separately registered.
    /// Key: division name (case-insensitive). Value: parent company name.
    /// </summary>
    public static readonly Dictionary<string, string> DivisionPatterns = new(StringComparer.OrdinalIgnoreCase)
    {
        // Airlines
        ["British Airways Technical Support"] = "British Airways",
        ["BA Technical Support"] = "British Airways",
        ["BA Engineering"] = "British Airways",
        ["British Airways Engineering"] = "British Airways",
        ["FBA - British Airways"] = "British Airways",

        // Major employers with divisions
        ["BBC News"] = "BBC",
        ["BBC World Service"] = "BBC",
        ["BBC Studios"] = "BBC",
        ["ITV News"] = "ITV plc",
        ["Sky News"] = "Sky UK",
        ["BT Openreach"] = "BT Group",
        ["Openreach"] = "BT Group",
        ["BT Research"] = "BT Group",
        ["Shell Research"] = "Shell",
        ["BP Research"] = "BP",
        ["Rolls-Royce Aerospace"] = "Rolls-Royce",
        ["Rolls-Royce Marine"] = "Rolls-Royce",
        ["BAE Systems Naval Ships"] = "BAE Systems",
        ["BAE Systems Submarines"] = "BAE Systems",

        // Banks - divisions
        ["Barclays Investment Bank"] = "Barclays",
        ["Barclays Capital"] = "Barclays",
        ["HSBC Investment Bank"] = "HSBC",
        ["Lloyds Commercial Banking"] = "Lloyds Banking Group",
        ["NatWest Markets"] = "NatWest Group",
        ["RBS Markets"] = "NatWest Group",
    };

    /// <summary>
    /// Check if an employer name is a known historical company.
    /// </summary>
    /// <param name="employerName">Employer name to look up; surrounding whitespace is ignored.</param>
    /// <returns>
    /// <c>true</c> when the trimmed name is an exact (case-insensitive) key of
    /// <see cref="HistoricalCompanies"/>; <c>false</c> otherwise, including for null or blank input.
    /// </returns>
    public static bool IsHistoricalEmployer(string employerName)
    {
        if (string.IsNullOrWhiteSpace(employerName))
        {
            return false;
        }

        return HistoricalCompanies.ContainsKey(employerName.Trim());
    }

    /// <summary>
    /// Get information about a historical employer.
    /// </summary>
    /// <param name="employerName">Employer name to look up; surrounding whitespace is ignored.</param>
    /// <returns>
    /// The entry stored under the trimmed name in <see cref="HistoricalCompanies"/>, or
    /// <c>null</c> when the name is null, blank, or not an exact (case-insensitive) key.
    /// </returns>
    public static HistoricalEmployerInfo? GetHistoricalEmployerInfo(string employerName)
    {
        if (string.IsNullOrWhiteSpace(employerName))
        {
            return null;
        }

        return HistoricalCompanies.GetValueOrDefault(employerName.Trim());
    }

    /// <summary>
    /// Check if an employer is a public sector organisation.
    /// </summary>
    /// <param name="employerName">Employer name to classify; surrounding whitespace is ignored.</param>
    /// <returns>
    /// <c>true</c> when the trimmed name equals an entry of <see cref="PublicSectorEmployers"/>,
    /// or contains one as a whole word/phrase (e.g. "London Borough of Camden");
    /// <c>false</c> otherwise, including for null or blank input.
    /// </returns>
    public static bool IsPublicSectorEmployer(string employerName)
    {
        if (string.IsNullOrWhiteSpace(employerName))
        {
            return false;
        }

        return MatchesAnyPhrase(PublicSectorEmployers, employerName.Trim());
    }

    /// <summary>
    /// Check if an employer is a charity or non-profit organisation.
    /// </summary>
    /// <param name="employerName">Employer name to classify; surrounding whitespace is ignored.</param>
    /// <returns>
    /// <c>true</c> when the trimmed name equals an entry of <see cref="CharityEmployers"/>,
    /// or contains one as a whole word/phrase (e.g. "Oxfam GB");
    /// <c>false</c> otherwise, including for null or blank input.
    /// </returns>
    public static bool IsCharityEmployer(string employerName)
    {
        if (string.IsNullOrWhiteSpace(employerName))
        {
            return false;
        }

        return MatchesAnyPhrase(CharityEmployers, employerName.Trim());
    }

    /// <summary>
    /// Check if an employer name is an internal division and get the parent company.
    /// </summary>
    /// <param name="employerName">Employer name to resolve; surrounding whitespace is ignored.</param>
    /// <returns>
    /// The parent company from <see cref="DivisionPatterns"/> when the trimmed name equals a
    /// division key or contains one as a whole word/phrase; <c>null</c> otherwise, including
    /// for null or blank input.
    /// </returns>
    public static string? GetParentCompanyForDivision(string employerName)
    {
        if (string.IsNullOrWhiteSpace(employerName))
        {
            return null;
        }

        var name = employerName.Trim();

        // Direct match
        if (DivisionPatterns.TryGetValue(name, out var parent))
        {
            return parent;
        }

        // Partial match, restricted to whole phrases so that e.g. "Toshiba Engineering"
        // is not mistaken for the "BA Engineering" division of British Airways.
        foreach (var (pattern, parentCompany) in DivisionPatterns)
        {
            if (ContainsWholePhrase(name, pattern))
            {
                return parentCompany;
            }
        }

        return null;
    }

    /// <summary>
    /// Returns true when <paramref name="name"/> equals one of <paramref name="phrases"/> or
    /// contains one of them as a whole word/phrase.
    /// </summary>
    private static bool MatchesAnyPhrase(HashSet<string> phrases, string name)
    {
        // Direct match: O(1) via the set's case-insensitive comparer.
        if (phrases.Contains(name))
        {
            return true;
        }

        // Partial match for patterns like "London Borough of X".
        foreach (var phrase in phrases)
        {
            if (ContainsWholePhrase(name, phrase))
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// Case-insensitive search for <paramref name="phrase"/> inside <paramref name="text"/> that
    /// only accepts occurrences not directly preceded or followed by a letter or digit.
    /// </summary>
    private static bool ContainsWholePhrase(string text, string phrase)
    {
        if (phrase.Length == 0)
        {
            return false;
        }

        var searchFrom = 0;
        while (searchFrom <= text.Length - phrase.Length)
        {
            var hit = text.IndexOf(phrase, searchFrom, StringComparison.OrdinalIgnoreCase);
            if (hit < 0)
            {
                return false;
            }

            var end = hit + phrase.Length;
            var startsOnBoundary = hit == 0 || !char.IsLetterOrDigit(text[hit - 1]);
            var endsOnBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);
            if (startsOnBoundary && endsOnBoundary)
            {
                return true;
            }

            // This occurrence sits inside a longer word; keep looking for a later one.
            searchFrom = hit + 1;
        }

        return false;
    }
}

/// <summary>
/// Information about a historical employer.
/// </summary>
/// <param name="SuccessorName">
/// Name of the current or successor organisation; the data uses "Dissolved" when there is none.
/// </param>
/// <param name="Notes">Short free-text description of what happened to the employer.</param>
/// <param name="CompanyNumber">
/// Company registration number associated with the entry, or <c>null</c> when not known.
/// </param>
public sealed record HistoricalEmployerInfo(
    string SuccessorName,
    string Notes,
    string? CompanyNumber
);
|
||||||
@@ -122,6 +122,28 @@ public static class UKInstitutions
|
|||||||
"Wrexham University",
|
"Wrexham University",
|
||||||
"York St John University",
|
"York St John University",
|
||||||
|
|
||||||
|
// Post-1992 Universities (former polytechnics)
|
||||||
|
"Leeds Beckett University",
|
||||||
|
"Birmingham City University",
|
||||||
|
"University of Bedfordshire",
|
||||||
|
"Anglia Ruskin University",
|
||||||
|
"University of Central Lancashire",
|
||||||
|
"University of West London",
|
||||||
|
"University of Northampton",
|
||||||
|
"University of Chichester",
|
||||||
|
"Plymouth Marjon University",
|
||||||
|
"Bath Spa University",
|
||||||
|
"Solent University",
|
||||||
|
"University of Bolton",
|
||||||
|
"University of Cumbria",
|
||||||
|
"University of Chester",
|
||||||
|
"University of Gloucestershire",
|
||||||
|
"University of Suffolk",
|
||||||
|
"Newman University",
|
||||||
|
"Bishop Grosseteste University",
|
||||||
|
"Harper Adams University",
|
||||||
|
"Royal Agricultural University",
|
||||||
|
|
||||||
// Scottish Universities
|
// Scottish Universities
|
||||||
"University of Aberdeen",
|
"University of Aberdeen",
|
||||||
"Abertay University",
|
"Abertay University",
|
||||||
@@ -134,6 +156,8 @@ public static class UKInstitutions
|
|||||||
"Bangor University",
|
"Bangor University",
|
||||||
"University of South Wales",
|
"University of South Wales",
|
||||||
"Wrexham Glyndwr University",
|
"Wrexham Glyndwr University",
|
||||||
|
"Wrexham University",
|
||||||
|
"Cardiff Metropolitan University",
|
||||||
|
|
||||||
// Northern Ireland
|
// Northern Ireland
|
||||||
"Ulster University",
|
"Ulster University",
|
||||||
@@ -304,6 +328,112 @@ public static class UKInstitutions
|
|||||||
["South Bank University"] = "London South Bank University",
|
["South Bank University"] = "London South Bank University",
|
||||||
["LSBU"] = "London South Bank University",
|
["LSBU"] = "London South Bank University",
|
||||||
|
|
||||||
|
// Historical polytechnic names (became universities in 1992)
|
||||||
|
// These are legitimate institutions that existed under different names
|
||||||
|
["South Bank Polytechnic"] = "London South Bank University",
|
||||||
|
["Polytechnic of the South Bank"] = "London South Bank University",
|
||||||
|
["Thames Polytechnic"] = "University of Greenwich",
|
||||||
|
["Woolwich Polytechnic"] = "University of Greenwich",
|
||||||
|
["Polytechnic of Central London"] = "University of Westminster",
|
||||||
|
["PCL"] = "University of Westminster",
|
||||||
|
["Polytechnic of North London"] = "London Metropolitan University",
|
||||||
|
["City of London Polytechnic"] = "London Metropolitan University",
|
||||||
|
["London Guildhall University"] = "London Metropolitan University",
|
||||||
|
["University of North London"] = "London Metropolitan University",
|
||||||
|
["Polytechnic of East London"] = "University of East London",
|
||||||
|
["North East London Polytechnic"] = "University of East London",
|
||||||
|
["Middlesex Polytechnic"] = "Middlesex University",
|
||||||
|
["Hatfield Polytechnic"] = "University of Hertfordshire",
|
||||||
|
["Sheffield Polytechnic"] = "Sheffield Hallam University",
|
||||||
|
["Sheffield City Polytechnic"] = "Sheffield Hallam University",
|
||||||
|
["Manchester Polytechnic"] = "Manchester Metropolitan University",
|
||||||
|
["Leeds Polytechnic"] = "Leeds Beckett University",
|
||||||
|
["Leeds Metropolitan University"] = "Leeds Beckett University",
|
||||||
|
["Leicester Polytechnic"] = "De Montfort University",
|
||||||
|
["Coventry Polytechnic"] = "Coventry University",
|
||||||
|
["Lanchester Polytechnic"] = "Coventry University",
|
||||||
|
["Brighton Polytechnic"] = "University of Brighton",
|
||||||
|
["Portsmouth Polytechnic"] = "University of Portsmouth",
|
||||||
|
["Plymouth Polytechnic"] = "University of Plymouth",
|
||||||
|
["Polytechnic South West"] = "University of Plymouth",
|
||||||
|
["Oxford Polytechnic"] = "Oxford Brookes University",
|
||||||
|
["Newcastle Polytechnic"] = "Northumbria University",
|
||||||
|
["Newcastle upon Tyne Polytechnic"] = "Northumbria University",
|
||||||
|
["Sunderland Polytechnic"] = "University of Sunderland",
|
||||||
|
["Teesside Polytechnic"] = "Teesside University",
|
||||||
|
["Huddersfield Polytechnic"] = "University of Huddersfield",
|
||||||
|
["Wolverhampton Polytechnic"] = "University of Wolverhampton",
|
||||||
|
["Liverpool Polytechnic"] = "Liverpool John Moores University",
|
||||||
|
["Bristol Polytechnic"] = "University of the West of England",
|
||||||
|
["Kingston Polytechnic"] = "Kingston University",
|
||||||
|
["Nottingham Polytechnic"] = "Nottingham Trent University",
|
||||||
|
["Trent Polytechnic"] = "Nottingham Trent University",
|
||||||
|
["Birmingham Polytechnic"] = "Birmingham City University",
|
||||||
|
["City of Birmingham Polytechnic"] = "Birmingham City University",
|
||||||
|
["University of Central England"] = "Birmingham City University",
|
||||||
|
["UCE Birmingham"] = "Birmingham City University",
|
||||||
|
["Staffordshire Polytechnic"] = "Staffordshire University",
|
||||||
|
["North Staffordshire Polytechnic"] = "Staffordshire University",
|
||||||
|
["Luton College of Higher Education"] = "University of Bedfordshire",
|
||||||
|
["University of Luton"] = "University of Bedfordshire",
|
||||||
|
["Anglia Polytechnic"] = "Anglia Ruskin University",
|
||||||
|
["Anglia Polytechnic University"] = "Anglia Ruskin University",
|
||||||
|
["APU"] = "Anglia Ruskin University",
|
||||||
|
["Cambridgeshire College of Arts and Technology"] = "Anglia Ruskin University",
|
||||||
|
["CCAT"] = "Anglia Ruskin University",
|
||||||
|
["Bournemouth Polytechnic"] = "Bournemouth University",
|
||||||
|
["Dorset Institute of Higher Education"] = "Bournemouth University",
|
||||||
|
["Derby College of Higher Education"] = "University of Derby",
|
||||||
|
["Derbyshire College of Higher Education"] = "University of Derby",
|
||||||
|
["Humberside Polytechnic"] = "University of Lincoln",
|
||||||
|
["Humberside College of Higher Education"] = "University of Lincoln",
|
||||||
|
["University of Humberside"] = "University of Lincoln",
|
||||||
|
["University of Lincolnshire and Humberside"] = "University of Lincoln",
|
||||||
|
["Central Lancashire Polytechnic"] = "University of Central Lancashire",
|
||||||
|
["Preston Polytechnic"] = "University of Central Lancashire",
|
||||||
|
["Lancashire Polytechnic"] = "University of Central Lancashire",
|
||||||
|
["Glamorgan Polytechnic"] = "University of South Wales",
|
||||||
|
["Polytechnic of Wales"] = "University of South Wales",
|
||||||
|
["University of Glamorgan"] = "University of South Wales",
|
||||||
|
["Robert Gordon Institute of Technology"] = "Robert Gordon University",
|
||||||
|
["RGIT"] = "Robert Gordon University",
|
||||||
|
["Napier Polytechnic"] = "Edinburgh Napier University",
|
||||||
|
["Napier College"] = "Edinburgh Napier University",
|
||||||
|
["Glasgow Polytechnic"] = "Glasgow Caledonian University",
|
||||||
|
["Queen's College Glasgow"] = "Glasgow Caledonian University",
|
||||||
|
["Dundee Institute of Technology"] = "Abertay University",
|
||||||
|
["Dundee College of Technology"] = "Abertay University",
|
||||||
|
|
||||||
|
// Other historical name changes
|
||||||
|
["Roehampton Institute"] = "Roehampton University",
|
||||||
|
["University of Surrey Roehampton"] = "Roehampton University",
|
||||||
|
["Thames Valley University"] = "University of West London",
|
||||||
|
["Polytechnic of West London"] = "University of West London",
|
||||||
|
["Ealing College of Higher Education"] = "University of West London",
|
||||||
|
["London College of Music and Media"] = "University of West London",
|
||||||
|
["University College Northampton"] = "University of Northampton",
|
||||||
|
["Nene College"] = "University of Northampton",
|
||||||
|
["University College Worcester"] = "University of Worcester",
|
||||||
|
["Worcester College of Higher Education"] = "University of Worcester",
|
||||||
|
["University College Chichester"] = "University of Chichester",
|
||||||
|
["Chichester Institute of Higher Education"] = "University of Chichester",
|
||||||
|
["College of St Mark and St John"] = "Plymouth Marjon University",
|
||||||
|
["Marjon"] = "Plymouth Marjon University",
|
||||||
|
["University of St Mark and St John"] = "Plymouth Marjon University",
|
||||||
|
["University College Falmouth"] = "Falmouth University",
|
||||||
|
["Falmouth College of Arts"] = "Falmouth University",
|
||||||
|
["Bath College of Higher Education"] = "Bath Spa University",
|
||||||
|
["Bath Spa University College"] = "Bath Spa University",
|
||||||
|
["Liverpool Institute of Higher Education"] = "Liverpool Hope University",
|
||||||
|
["Liverpool Hope University College"] = "Liverpool Hope University",
|
||||||
|
["University of Wales, Newport"] = "University of South Wales",
|
||||||
|
["University of Wales Institute, Cardiff"] = "Cardiff Metropolitan University",
|
||||||
|
["UWIC"] = "Cardiff Metropolitan University",
|
||||||
|
["North East Wales Institute"] = "Wrexham University",
|
||||||
|
["NEWI"] = "Wrexham University",
|
||||||
|
["Glyndwr University"] = "Wrexham University",
|
||||||
|
["Wrexham Glyndwr University"] = "Wrexham University",
|
||||||
|
|
||||||
// Other common variations
|
// Other common variations
|
||||||
["Open University"] = "The Open University",
|
["Open University"] = "The Open University",
|
||||||
["OU"] = "The Open University",
|
["OU"] = "The Open University",
|
||||||
|
|||||||
@@ -39,22 +39,33 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
|
|||||||
|
|
||||||
Determine which candidate (if any) is the SAME company as the CV entry.
|
Determine which candidate (if any) is the SAME company as the CV entry.
|
||||||
|
|
||||||
Rules:
|
Matching Guidelines:
|
||||||
1. A match requires the companies to be the SAME organisation, not just similar names
|
1. MATCH if the CV name is clearly the same organisation as a candidate:
|
||||||
2. "Families First CiC" is NOT the same as "FAMILIES AGAINST CONFORMITY LTD" - different words = different companies
|
- "Royal Bank of Scotland" → "THE ROYAL BANK OF SCOTLAND PUBLIC LIMITED COMPANY" ✓ (same bank)
|
||||||
3. Trading names should match their registered entity (e.g., "Tesco" matches "TESCO PLC")
|
- "Yorkshire Electricity" → "YORKSHIRE ELECTRICITY GROUP PLC" ✓ (same utility)
|
||||||
4. Subsidiaries can match if clearly the same organisation (e.g., "ASDA" could match "ASDA STORES LIMITED")
|
- "Tesco" → "TESCO PLC" ✓ (trading name = registered name)
|
||||||
5. Acronyms in parentheses are abbreviations of the full name (e.g., "North Halifax Partnership (NHP)" = "NORTH HALIFAX PARTNERSHIP")
|
- "ASDA" → "ASDA STORES LIMITED" ✓ (brand = operating company)
|
||||||
6. CiC/CIC = Community Interest Company, LLP = Limited Liability Partnership - these are legal suffixes
|
|
||||||
7. If the CV name contains all the key words of a candidate (ignoring Ltd/Limited/CIC/etc.), it's likely a match
|
2. DO NOT MATCH if the words are fundamentally different:
|
||||||
8. If NO candidate is clearly the same company, return "NONE" as the best match
|
- "Families First" ≠ "FAMILIES AGAINST CONFORMITY" (different words after "Families")
|
||||||
|
- "Royal Bank" ≠ "Royal Academy" (Bank ≠ Academy)
|
||||||
|
- "Storm Ideas" ≠ "STORM LIMITED" (missing "Ideas" - could be different company)
|
||||||
|
|
||||||
|
3. Legal suffixes (Ltd, Limited, PLC, LLP, CiC) should be ignored when comparing names
|
||||||
|
|
||||||
|
4. Adding "THE" or "GROUP" to a name doesn't make it a different company
|
||||||
|
|
||||||
|
5. If unsure, prefer matching over rejecting when core identifying words match
|
||||||
|
|
||||||
|
CRITICAL: Return the COMPLETE company number exactly as shown (e.g., "SC083026", "02366995").
|
||||||
|
Do NOT truncate or abbreviate the company number.
|
||||||
|
|
||||||
Respond with this exact JSON structure:
|
Respond with this exact JSON structure:
|
||||||
{
|
{
|
||||||
"bestMatchCompanyNumber": "string (company number of best match, or 'NONE' if no valid match)",
|
"bestMatchCompanyNumber": "COMPLETE company number from the list above, or 'NONE' if no valid match",
|
||||||
"confidenceScore": number (0-100, where 100 = certain match, 0 = no match),
|
"confidenceScore": number (0-100, where 100 = certain match, 0 = no match),
|
||||||
"matchType": "string (Exact, TradingName, Subsidiary, Parent, NoMatch)",
|
"matchType": "Exact|TradingName|Subsidiary|Parent|NoMatch",
|
||||||
"reasoning": "string (brief explanation of why this is or isn't a match)"
|
"reasoning": "brief explanation"
|
||||||
}
|
}
|
||||||
""";
|
""";
|
||||||
|
|
||||||
@@ -81,8 +92,9 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
|
|||||||
|
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
|
// Format candidates with company number prominently displayed to prevent truncation
|
||||||
var candidatesText = string.Join("\n", candidates.Select((c, i) =>
|
var candidatesText = string.Join("\n", candidates.Select((c, i) =>
|
||||||
$"{i + 1}. {c.CompanyName} (Number: {c.CompanyNumber}, Status: {c.CompanyStatus ?? "Unknown"})"));
|
$"[{c.CompanyNumber}] {c.CompanyName} (Status: {c.CompanyStatus ?? "Unknown"})"));
|
||||||
|
|
||||||
var prompt = MatchingPrompt
|
var prompt = MatchingPrompt
|
||||||
.Replace("{CV_COMPANY}", cvCompanyName)
|
.Replace("{CV_COMPANY}", cvCompanyName)
|
||||||
@@ -127,7 +139,8 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
|
|||||||
aiResponse.BestMatchCompanyNumber, aiResponse.ConfidenceScore, aiResponse.Reasoning);
|
aiResponse.BestMatchCompanyNumber, aiResponse.ConfidenceScore, aiResponse.Reasoning);
|
||||||
|
|
||||||
// Find the matched candidate
|
// Find the matched candidate
|
||||||
if (aiResponse.BestMatchCompanyNumber == "NONE" || aiResponse.ConfidenceScore < 50)
|
// Lower threshold to 30 - we have fuzzy validation as backup
|
||||||
|
if (aiResponse.BestMatchCompanyNumber == "NONE" || aiResponse.ConfidenceScore < 30)
|
||||||
{
|
{
|
||||||
return new SemanticMatchResult
|
return new SemanticMatchResult
|
||||||
{
|
{
|
||||||
@@ -142,10 +155,40 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
|
|||||||
var matchedCandidate = candidates.FirstOrDefault(c =>
|
var matchedCandidate = candidates.FirstOrDefault(c =>
|
||||||
c.CompanyNumber.Equals(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase));
|
c.CompanyNumber.Equals(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase));
|
||||||
|
|
||||||
|
// If exact match not found, try to find a candidate that starts with the returned number
|
||||||
|
// This handles cases where AI truncates "09052626" to "09" or similar
|
||||||
|
if (matchedCandidate is null && !string.IsNullOrWhiteSpace(aiResponse.BestMatchCompanyNumber)
|
||||||
|
&& aiResponse.BestMatchCompanyNumber != "NONE")
|
||||||
|
{
|
||||||
|
var partialMatch = candidates.FirstOrDefault(c =>
|
||||||
|
c.CompanyNumber.StartsWith(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase));
|
||||||
|
|
||||||
|
if (partialMatch is not null)
|
||||||
|
{
|
||||||
|
_logger.LogDebug("AI returned partial company number '{Partial}', matched to full number '{Full}'",
|
||||||
|
aiResponse.BestMatchCompanyNumber, partialMatch.CompanyNumber);
|
||||||
|
matchedCandidate = partialMatch;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Try reverse - maybe AI returned a longer string that contains the actual number
|
||||||
|
var reverseMatch = candidates.FirstOrDefault(c =>
|
||||||
|
aiResponse.BestMatchCompanyNumber.Contains(c.CompanyNumber, StringComparison.OrdinalIgnoreCase));
|
||||||
|
|
||||||
|
if (reverseMatch is not null)
|
||||||
|
{
|
||||||
|
_logger.LogDebug("AI returned string containing company number '{Number}'",
|
||||||
|
reverseMatch.CompanyNumber);
|
||||||
|
matchedCandidate = reverseMatch;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (matchedCandidate is null)
|
if (matchedCandidate is null)
|
||||||
{
|
{
|
||||||
_logger.LogWarning("AI returned company number {Number} not in candidates list",
|
_logger.LogWarning("AI returned company number '{Number}' not in candidates list. Candidates: {Candidates}",
|
||||||
aiResponse.BestMatchCompanyNumber);
|
aiResponse.BestMatchCompanyNumber,
|
||||||
|
string.Join(", ", candidates.Select(c => c.CompanyNumber)));
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ using System.Text.Json;
|
|||||||
using FuzzySharp;
|
using FuzzySharp;
|
||||||
using Microsoft.EntityFrameworkCore;
|
using Microsoft.EntityFrameworkCore;
|
||||||
using Microsoft.Extensions.Logging;
|
using Microsoft.Extensions.Logging;
|
||||||
|
using RealCV.Application.Data;
|
||||||
using RealCV.Application.DTOs;
|
using RealCV.Application.DTOs;
|
||||||
using RealCV.Application.Helpers;
|
using RealCV.Application.Helpers;
|
||||||
using RealCV.Application.Interfaces;
|
using RealCV.Application.Interfaces;
|
||||||
@@ -93,11 +94,140 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
{
|
{
|
||||||
ArgumentException.ThrowIfNullOrWhiteSpace(companyName);
|
ArgumentException.ThrowIfNullOrWhiteSpace(companyName);
|
||||||
|
|
||||||
_logger.LogDebug("Verifying company: {CompanyName}", companyName);
|
// Normalize company name - strip trailing punctuation that causes matching issues
|
||||||
|
var normalizedName = NormalizeCompanyName(companyName);
|
||||||
|
_logger.LogDebug("Verifying company: {CompanyName} (normalized: {NormalizedName})", companyName, normalizedName);
|
||||||
var flags = new List<CompanyVerificationFlag>();
|
var flags = new List<CompanyVerificationFlag>();
|
||||||
|
|
||||||
|
// Check 1a: Is this a public sector employer?
|
||||||
|
if (UKHistoricalEmployers.IsPublicSectorEmployer(normalizedName))
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Recognised public sector employer: {CompanyName}", companyName);
|
||||||
|
return new CompanyVerificationResult
|
||||||
|
{
|
||||||
|
ClaimedCompany = companyName,
|
||||||
|
MatchedCompanyName = companyName,
|
||||||
|
MatchedCompanyNumber = null,
|
||||||
|
MatchScore = 100,
|
||||||
|
IsVerified = true,
|
||||||
|
VerificationNotes = "Public sector employer - not registered at Companies House",
|
||||||
|
ClaimedStartDate = startDate,
|
||||||
|
ClaimedEndDate = endDate,
|
||||||
|
CompanyType = "public-sector",
|
||||||
|
CompanyStatus = "active",
|
||||||
|
ClaimedJobTitle = jobTitle,
|
||||||
|
Flags = flags
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check 1b: Is this a charity or non-profit organisation?
|
||||||
|
if (UKHistoricalEmployers.IsCharityEmployer(normalizedName))
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Recognised charity employer: {CompanyName}", companyName);
|
||||||
|
return new CompanyVerificationResult
|
||||||
|
{
|
||||||
|
ClaimedCompany = companyName,
|
||||||
|
MatchedCompanyName = companyName,
|
||||||
|
MatchedCompanyNumber = null,
|
||||||
|
MatchScore = 100,
|
||||||
|
IsVerified = true,
|
||||||
|
VerificationNotes = "Charity/non-profit organisation",
|
||||||
|
ClaimedStartDate = startDate,
|
||||||
|
ClaimedEndDate = endDate,
|
||||||
|
CompanyType = "charity",
|
||||||
|
CompanyStatus = "active",
|
||||||
|
ClaimedJobTitle = jobTitle,
|
||||||
|
Flags = flags
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check 2: Is this an internal division of a larger company?
|
||||||
|
var parentCompany = UKHistoricalEmployers.GetParentCompanyForDivision(normalizedName);
|
||||||
|
if (parentCompany != null)
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Recognised division '{CompanyName}' of parent company '{ParentCompany}'", companyName, parentCompany);
|
||||||
|
// Try to verify the parent company instead
|
||||||
|
var parentResult = await VerifyCompanyAsync(parentCompany, startDate, endDate, jobTitle);
|
||||||
|
if (parentResult.IsVerified)
|
||||||
|
{
|
||||||
|
return parentResult with
|
||||||
|
{
|
||||||
|
ClaimedCompany = companyName,
|
||||||
|
VerificationNotes = $"Internal division of {parentResult.MatchedCompanyName}"
|
||||||
|
};
|
||||||
|
}
|
||||||
|
// If parent verification failed, return a partial match
|
||||||
|
return new CompanyVerificationResult
|
||||||
|
{
|
||||||
|
ClaimedCompany = companyName,
|
||||||
|
MatchedCompanyName = parentCompany,
|
||||||
|
MatchedCompanyNumber = null,
|
||||||
|
MatchScore = 85,
|
||||||
|
IsVerified = true,
|
||||||
|
VerificationNotes = $"Recognised as division of {parentCompany}",
|
||||||
|
ClaimedStartDate = startDate,
|
||||||
|
ClaimedEndDate = endDate,
|
||||||
|
ClaimedJobTitle = jobTitle,
|
||||||
|
Flags = flags
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check 3: Is this a known historical employer?
|
||||||
|
var historicalInfo = UKHistoricalEmployers.GetHistoricalEmployerInfo(normalizedName);
|
||||||
|
if (historicalInfo != null)
|
||||||
|
{
|
||||||
|
_logger.LogInformation("Recognised historical employer: {CompanyName} -> {Successor}", companyName, historicalInfo.SuccessorName);
|
||||||
|
|
||||||
|
// If we have a company number for the successor, try to get current details
|
||||||
|
if (!string.IsNullOrEmpty(historicalInfo.CompanyNumber))
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
var successorDetails = await _companiesHouseClient.GetCompanyAsync(historicalInfo.CompanyNumber);
|
||||||
|
if (successorDetails != null)
|
||||||
|
{
|
||||||
|
return new CompanyVerificationResult
|
||||||
|
{
|
||||||
|
ClaimedCompany = companyName,
|
||||||
|
MatchedCompanyName = $"{companyName} (now {successorDetails.CompanyName})",
|
||||||
|
MatchedCompanyNumber = historicalInfo.CompanyNumber,
|
||||||
|
MatchScore = 90,
|
||||||
|
IsVerified = true,
|
||||||
|
VerificationNotes = $"Historical company. {historicalInfo.Notes}",
|
||||||
|
ClaimedStartDate = startDate,
|
||||||
|
ClaimedEndDate = endDate,
|
||||||
|
CompanyType = successorDetails.Type,
|
||||||
|
CompanyStatus = "historical",
|
||||||
|
ClaimedJobTitle = jobTitle,
|
||||||
|
Flags = flags
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
_logger.LogWarning(ex, "Failed to fetch successor company details for {CompanyNumber}", historicalInfo.CompanyNumber);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return historical match without successor details
|
||||||
|
return new CompanyVerificationResult
|
||||||
|
{
|
||||||
|
ClaimedCompany = companyName,
|
||||||
|
MatchedCompanyName = $"{companyName} (now {historicalInfo.SuccessorName})",
|
||||||
|
MatchedCompanyNumber = historicalInfo.CompanyNumber,
|
||||||
|
MatchScore = 90,
|
||||||
|
IsVerified = true,
|
||||||
|
VerificationNotes = $"Historical company. {historicalInfo.Notes}",
|
||||||
|
ClaimedStartDate = startDate,
|
||||||
|
ClaimedEndDate = endDate,
|
||||||
|
CompanyStatus = "historical",
|
||||||
|
ClaimedJobTitle = jobTitle,
|
||||||
|
Flags = flags
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
// Try to find a cached match first (but only if it existed at claimed start date)
|
// Try to find a cached match first (but only if it existed at claimed start date)
|
||||||
var cachedMatch = await FindCachedMatchAsync(companyName);
|
var cachedMatch = await FindCachedMatchAsync(normalizedName);
|
||||||
if (cachedMatch is not null)
|
if (cachedMatch is not null)
|
||||||
{
|
{
|
||||||
// Check if cached company existed at the claimed start date
|
// Check if cached company existed at the claimed start date
|
||||||
@@ -119,9 +249,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
// Search Companies House with fallback queries
|
// Search Companies House with fallback queries
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var searchQueries = GenerateSearchQueries(companyName);
|
var searchQueries = GenerateSearchQueries(normalizedName);
|
||||||
_logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
|
_logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
|
||||||
searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));
|
searchQueries.Count, normalizedName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));
|
||||||
|
|
||||||
// Collect all candidates from all search queries for AI matching
|
// Collect all candidates from all search queries for AI matching
|
||||||
var allCandidates = new Dictionary<string, CompaniesHouseSearchItem>();
|
var allCandidates = new Dictionary<string, CompaniesHouseSearchItem>();
|
||||||
@@ -148,7 +278,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Find fuzzy matches (as before) for fallback
|
// Find fuzzy matches (as before) for fallback
|
||||||
var fuzzyMatch = FindBestMatch(companyName, query, searchResponse.Items, startDate);
|
var fuzzyMatch = FindBestMatch(normalizedName, query, searchResponse.Items, startDate);
|
||||||
if (fuzzyMatch is not null)
|
if (fuzzyMatch is not null)
|
||||||
{
|
{
|
||||||
fuzzyMatches.Add(fuzzyMatch.Value);
|
fuzzyMatches.Add(fuzzyMatch.Value);
|
||||||
@@ -157,30 +287,47 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
|
|
||||||
if (allCandidates.Count == 0)
|
if (allCandidates.Count == 0)
|
||||||
{
|
{
|
||||||
_logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count);
|
_logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", normalizedName, searchQueries.Count);
|
||||||
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
|
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
|
||||||
"Company name could not be verified against official records");
|
"Company name could not be verified against official records");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use AI to find the best semantic match from all candidates
|
// Use AI to find the best semantic match from all candidates
|
||||||
_logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", companyName, allCandidates.Count);
|
_logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", normalizedName, allCandidates.Count);
|
||||||
|
|
||||||
|
// Sort candidates by fuzzy relevance to the search term before taking top 10
|
||||||
|
// This ensures the most likely matches are sent to the AI, not just arbitrary entries
|
||||||
|
var normalizedUpper = normalizedName.ToUpperInvariant();
|
||||||
var candidatesForAI = allCandidates.Values
|
var candidatesForAI = allCandidates.Values
|
||||||
.Take(10) // Limit to top 10 candidates to reduce AI cost
|
.Select(c => new
|
||||||
.Select(c => new CompanyCandidate
|
|
||||||
{
|
{
|
||||||
CompanyName = c.Title,
|
Item = c,
|
||||||
CompanyNumber = c.CompanyNumber,
|
Score = Fuzz.TokenSetRatio(normalizedUpper, c.Title.ToUpperInvariant())
|
||||||
CompanyStatus = c.CompanyStatus,
|
})
|
||||||
DateOfCreation = c.DateOfCreation
|
.OrderByDescending(x => x.Score)
|
||||||
|
.Take(10)
|
||||||
|
.Select(x => new CompanyCandidate
|
||||||
|
{
|
||||||
|
CompanyName = x.Item.Title,
|
||||||
|
CompanyNumber = x.Item.CompanyNumber,
|
||||||
|
CompanyStatus = x.Item.CompanyStatus,
|
||||||
|
DateOfCreation = x.Item.DateOfCreation
|
||||||
})
|
})
|
||||||
.ToList();
|
.ToList();
|
||||||
|
|
||||||
var aiResult = await _aiMatcher.FindBestMatchAsync(companyName, candidatesForAI);
|
_logger.LogDebug("Top candidates for AI matching (sorted by relevance): {Candidates}",
|
||||||
|
string.Join(", ", candidatesForAI.Select(c => $"{c.CompanyName} [{c.CompanyNumber}]")));
|
||||||
|
|
||||||
|
var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI);
|
||||||
|
|
||||||
CompaniesHouseSearchItem? matchedItem = null;
|
CompaniesHouseSearchItem? matchedItem = null;
|
||||||
int matchScore;
|
int matchScore;
|
||||||
|
|
||||||
|
// Get best fuzzy match for potential fallback
|
||||||
|
var bestFuzzy = fuzzyMatches.Count > 0
|
||||||
|
? fuzzyMatches.OrderByDescending(m => m.Score).First()
|
||||||
|
: ((CompaniesHouseSearchItem Item, int Score)?)null;
|
||||||
|
|
||||||
if (aiResult is not null && aiResult.IsMatch)
|
if (aiResult is not null && aiResult.IsMatch)
|
||||||
{
|
{
|
||||||
// AI found a valid match
|
// AI found a valid match
|
||||||
@@ -195,21 +342,63 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
// AI didn't find a match - check if it explicitly rejected or just failed
|
// AI didn't find a match - check if it explicitly rejected or just failed
|
||||||
if (aiResult?.MatchType == "NoMatch")
|
if (aiResult?.MatchType == "NoMatch")
|
||||||
{
|
{
|
||||||
_logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}",
|
// AI explicitly rejected. Only override if fuzzy match passes strict validation:
|
||||||
companyName, aiResult?.Reasoning ?? "No match found");
|
// 1. High fuzzy score (>= 90%)
|
||||||
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
|
// 2. ALL core identifying words from original name appear in the match
|
||||||
"Company name could not be verified - no matching company found in official records");
|
// 3. Match doesn't have significantly more core words (prevents partial word matches)
|
||||||
}
|
if (bestFuzzy.HasValue && bestFuzzy.Value.Score >= 90)
|
||||||
|
{
|
||||||
|
var originalCores = ExtractCoreIdentifiers(normalizedName);
|
||||||
|
var matchCores = ExtractCoreIdentifiers(bestFuzzy.Value.Item.Title);
|
||||||
|
|
||||||
// AI failed (API error, etc.) - fall back to fuzzy matching
|
// All original core words must appear in the match
|
||||||
_logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", companyName);
|
var allCoresPresent = originalCores.Count == 0 ||
|
||||||
var bestFuzzy = fuzzyMatches.OrderByDescending(m => m.Score).First();
|
originalCores.All(c => bestFuzzy.Value.Item.Title.Contains(c, StringComparison.OrdinalIgnoreCase));
|
||||||
matchedItem = bestFuzzy.Item;
|
|
||||||
matchScore = bestFuzzy.Score;
|
// Match shouldn't have too many extra core words (max 2 extra, e.g., "GROUP PLC")
|
||||||
|
var extraCores = matchCores.Count(c => !originalCores.Any(o =>
|
||||||
|
c.Equals(o, StringComparison.OrdinalIgnoreCase)));
|
||||||
|
var reasonableExtras = extraCores <= 2;
|
||||||
|
|
||||||
|
if (allCoresPresent && reasonableExtras)
|
||||||
|
{
|
||||||
|
_logger.LogInformation(
|
||||||
|
"AI rejected '{CompanyName}' but fuzzy match '{MatchedName}' ({Score}%) passes validation. " +
|
||||||
|
"Original cores: [{OriginalCores}], Match cores: [{MatchCores}]",
|
||||||
|
normalizedName, bestFuzzy.Value.Item.Title, bestFuzzy.Value.Score,
|
||||||
|
string.Join(", ", originalCores), string.Join(", ", matchCores));
|
||||||
|
matchedItem = bestFuzzy.Value.Item;
|
||||||
|
matchScore = bestFuzzy.Value.Score;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_logger.LogDebug(
|
||||||
|
"AI rejected '{CompanyName}' and fuzzy match '{MatchedName}' fails validation. " +
|
||||||
|
"AllCoresPresent: {AllCores}, ExtraCores: {Extra}",
|
||||||
|
normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, extraCores);
|
||||||
|
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
|
||||||
|
"Company name could not be verified - no matching company found in official records");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}",
|
||||||
|
normalizedName, aiResult?.Reasoning ?? "No match found");
|
||||||
|
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
|
||||||
|
"Company name could not be verified - no matching company found in official records");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// AI failed (API error, etc.) - fall back to fuzzy matching
|
||||||
|
_logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", normalizedName);
|
||||||
|
matchedItem = bestFuzzy!.Value.Item;
|
||||||
|
matchScore = bestFuzzy!.Value.Score;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
_logger.LogDebug("No valid match found for: {CompanyName}", companyName);
|
_logger.LogDebug("No valid match found for: {CompanyName}", normalizedName);
|
||||||
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
|
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
|
||||||
"Company name could not be verified against official records");
|
"Company name could not be verified against official records");
|
||||||
}
|
}
|
||||||
@@ -624,6 +813,26 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
|
|
||||||
#region Helper Methods
|
#region Helper Methods
|
||||||
|
|
||||||
|
/// <summary>
/// Normalizes a company name for matching: trims surrounding whitespace, collapses
/// internal whitespace runs to a single space, and strips trailing punctuation.
/// E.g., "Glaxo Research &amp; Development Ltd." -> "Glaxo Research &amp; Development Ltd".
/// </summary>
/// <param name="companyName">The raw company name as supplied by the caller.</param>
/// <returns>
/// The cleaned name. Null, empty or whitespace-only input is returned unchanged.
/// If stripping punctuation would leave nothing (e.g. "..."), the trimmed,
/// whitespace-collapsed input is returned instead so the result is never blank
/// for non-blank input.
/// </returns>
private static string NormalizeCompanyName(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return companyName;
    }

    // Collapse whitespace first so the trailing trim below sees the final shape of the name.
    var collapsed = System.Text.RegularExpressions.Regex.Replace(companyName.Trim(), @"\s+", " ");

    // Strip trailing punctuation together with any spaces sitting between the marks,
    // so "Acme Ltd ." and "Acme Ltd. ," both become "Acme Ltd" without a dangling space.
    var normalized = collapsed.TrimEnd('.', ',', ';', ':', '!', '?', ' ');

    // A name made only of punctuation would otherwise become empty and poison later lookups.
    return normalized.Length > 0 ? normalized : collapsed;
}
|
||||||
|
|
||||||
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
|
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
|
||||||
{
|
{
|
||||||
var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);
|
var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);
|
||||||
@@ -790,12 +999,13 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
var searchText = originalLower + " " + queryLower;
|
var searchText = originalLower + " " + queryLower;
|
||||||
|
|
||||||
// Penalize subsidiary indicators (unless search explicitly included them)
|
// Penalize subsidiary indicators (unless search explicitly included them)
|
||||||
|
// Use word boundary matching to avoid "SCOTLAND" matching "land"
|
||||||
foreach (var indicator in SubsidiaryIndicators)
|
foreach (var indicator in SubsidiaryIndicators)
|
||||||
{
|
{
|
||||||
if (itemTitleLower.Contains(indicator))
|
if (ContainsWholeWord(itemTitleLower, indicator))
|
||||||
{
|
{
|
||||||
// Only penalize if the search didn't explicitly include this indicator
|
// Only penalize if the search didn't explicitly include this indicator
|
||||||
if (!searchText.Contains(indicator))
|
if (!ContainsWholeWord(searchText, indicator))
|
||||||
{
|
{
|
||||||
score -= 10; // Significant penalty for subsidiaries
|
score -= 10; // Significant penalty for subsidiaries
|
||||||
}
|
}
|
||||||
@@ -806,7 +1016,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
// Boost main company indicators
|
// Boost main company indicators
|
||||||
foreach (var indicator in MainCompanyIndicators)
|
foreach (var indicator in MainCompanyIndicators)
|
||||||
{
|
{
|
||||||
if (itemTitleLower.Contains(indicator))
|
if (ContainsWholeWord(itemTitleLower, indicator))
|
||||||
{
|
{
|
||||||
score += 5; // Boost for main trading companies
|
score += 5; // Boost for main trading companies
|
||||||
break; // Only apply one boost
|
break; // Only apply one boost
|
||||||
@@ -1168,7 +1378,10 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check if the item matches any pattern in this non-employment category
|
// Check if the item matches any pattern in this non-employment category
|
||||||
if (patterns.Any(pattern => itemTitleLower.Contains(pattern)))
|
// Use whole-word matching for single words, substring for multi-word patterns
|
||||||
|
if (patterns.Any(pattern => pattern.Contains(' ')
|
||||||
|
? itemTitleLower.Contains(pattern)
|
||||||
|
: ContainsWholeWord(itemTitleLower, pattern)))
|
||||||
{
|
{
|
||||||
return false; // This is a non-employment entity type that wasn't explicitly searched for
|
return false; // This is a non-employment entity type that wasn't explicitly searched for
|
||||||
}
|
}
|
||||||
@@ -1177,6 +1390,19 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
return true; // No non-employment patterns matched, this is likely a valid employment entity
|
return true; // No non-employment patterns matched, this is likely a valid employment entity
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Checks if a string contains a word as a whole word (not as a substring of another word).
/// E.g., "scotland" does NOT contain whole word "land", but "land holdings" does.
/// </summary>
/// <param name="text">The text to search in.</param>
/// <param name="word">The word or phrase to look for; it is matched literally, not as a regex.</param>
/// <returns>
/// <c>true</c> when <paramref name="word"/> occurs in <paramref name="text"/> with no
/// letter, digit or underscore immediately before or after it (case-insensitive);
/// <c>false</c> otherwise, including when either argument is null or empty.
/// </returns>
private static bool ContainsWholeWord(string text, string word)
{
    if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(word))
    {
        return false;
    }

    // Lookarounds instead of \b: \b only behaves as a "whole word" edge when the search term
    // itself starts/ends with a word character. For a term such as "co." a trailing \b would
    // require a word character to follow, matching "co.uk" but not "co. ltd".
    var pattern = $@"(?<!\w){System.Text.RegularExpressions.Regex.Escape(word)}(?!\w)";

    // CultureInvariant keeps case-insensitive matching stable regardless of the thread culture.
    return System.Text.RegularExpressions.Regex.IsMatch(
        text,
        pattern,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase
            | System.Text.RegularExpressions.RegexOptions.CultureInvariant);
}
|
||||||
|
|
||||||
// Expanded skip words list for core identifier extraction
|
// Expanded skip words list for core identifier extraction
|
||||||
// These words are too common to be meaningful differentiators between companies
|
// These words are too common to be meaningful differentiators between companies
|
||||||
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
|
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
|
||||||
@@ -1220,8 +1446,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
"new", "modern", "advanced", "innovative", "premier", "elite", "premium",
|
"new", "modern", "advanced", "innovative", "premier", "elite", "premium",
|
||||||
"quality", "superior", "excellent", "best", "top", "leading", "major",
|
"quality", "superior", "excellent", "best", "top", "leading", "major",
|
||||||
|
|
||||||
// Ownership indicators
|
// Ownership indicators (excluding "royal" as it's a meaningful company identifier)
|
||||||
"royal", "imperial", "crown", "state", "public", "private", "independent",
|
"imperial", "crown", "state", "public", "private", "independent",
|
||||||
"mutual", "cooperative", "coop", "community",
|
"mutual", "cooperative", "coop", "community",
|
||||||
|
|
||||||
// Time-related
|
// Time-related
|
||||||
@@ -1235,7 +1461,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
/// Extracts ALL core identifying words from a company name.
|
/// Extracts ALL core identifying words from a company name.
|
||||||
/// These are significant words that aren't common prefixes/suffixes.
|
/// These are significant words that aren't common prefixes/suffixes.
|
||||||
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"]
|
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"]
|
||||||
/// "Bank of Scotland" -> ["BANK", "SCOTLAND"]
|
/// "Royal Bank of Scotland" -> ["ROYAL", "BANK"] (Scotland is a geographic skipWord)
|
||||||
/// </summary>
|
/// </summary>
|
||||||
private static List<string> ExtractCoreIdentifiers(string companyName)
|
private static List<string> ExtractCoreIdentifiers(string companyName)
|
||||||
{
|
{
|
||||||
|
|||||||
Reference in New Issue
Block a user