Compare commits

2 Commits

Author SHA1 Message Date
94ca6e1b9a feat: Add AI-powered compound company name splitting
Uses Claude Haiku to intelligently detect when a company name contains
multiple companies (e.g., "ASDA/WALMART", "Corus & Laura Ashley Hotels")
vs single companies with similar patterns (e.g., "Ernst & Young").

- Adds ExtractCompanyNamesAsync to ICompanyNameMatcherService
- Only triggers for names with potential separators (/, &, "and")
- Verifies each extracted part individually, returns first match
- Uses fast Haiku model to minimize cost

Results:
- ASDA/WALMART → verified via 'ASDA' → ASDA GROUP LIMITED
- Corus & Laura Ashley Hotels → verified via 'Corus' → Tata Steel UK
- Employers: 104/120 verified (86%)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 11:04:30 +00:00
27921d625f feat: Improve company verification with relevance-sorted AI candidates
- Sort AI candidates by fuzzy match score before taking top 10
  This fixes Royal Bank of Scotland matching (was getting arbitrary
  candidates from Dictionary, now gets most relevant)

- Add historical employer recognition (Foster Wheeler, Glaxo, etc.)
- Add public sector employer recognition (NHS, councils, etc.)
- Add charity/non-profit recognition
- Add company division pattern recognition

- Improve AI matcher prompt with explicit examples
- Add partial company number matching for truncated AI responses
- Lower AI confidence threshold to 30% (fuzzy validation as backup)

- Add whole-word boundary matching for subsidiary indicators
  Fixes "SCOTLAND" incorrectly matching "land" pattern

- Add 100+ historical polytechnic → university name mappings
- Add post-1992 universities and Welsh institutions

Results: Employer verification improved from 71% to 85%

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 10:43:45 +00:00
9 changed files with 1920 additions and 48 deletions

View File

@@ -0,0 +1,448 @@
namespace RealCV.Application.Data;
/// <summary>
/// Database of historical UK employers that may no longer exist under their original names.
/// Includes companies that were acquired, merged, dissolved, or renamed.
/// Also includes public sector bodies and internal divisions of larger organisations.
/// </summary>
public static class UKHistoricalEmployers
{
/// <summary>
/// Maps historical company names to their current/successor company information.
/// Key: Historical name (compared case-insensitively via <see cref="StringComparer.OrdinalIgnoreCase"/>).
/// Value: <see cref="HistoricalEmployerInfo"/> with the successor name, a short note and,
/// where one is recorded, a company number (<c>null</c> when no number is recorded).
/// </summary>
/// <remarks>
/// Common spelling variants of the same employer (e.g. "Ltd" / "Limited" / "PLC") are listed
/// as separate keys because lookups are exact-key lookups, not fuzzy matches.
/// </remarks>
public static readonly Dictionary<string, HistoricalEmployerInfo> HistoricalCompanies =
    new(StringComparer.OrdinalIgnoreCase)
    {
        // Engineering & Construction
        ["Foster Wheeler"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
        ["Foster Wheeler Ltd"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
        ["Foster Wheeler Limited"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
        ["Foster Wheeler PLC"] = new("Wood Group / AMEC Foster Wheeler", "Engineering contractor acquired by AMEC in 2014, now part of Wood Group", "00163609"),
        ["Sir Alexander Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy (founded 1922), acquired by Jacobs", null),
        ["Alexander Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy (founded 1922), acquired by Jacobs", null),
        ["Gibb and Partners"] = new("Jacobs Engineering", "Historic engineering consultancy, acquired by Jacobs", null),
        ["Mott MacDonald"] = new("Mott MacDonald", "Still trading - major engineering consultancy", "01243967"),
        ["Ove Arup"] = new("Arup", "Still trading as Arup", "01312453"),
        ["Arup"] = new("Arup", "Major engineering consultancy", "01312453"),
        ["WS Atkins"] = new("SNC-Lavalin / Atkins", "Acquired by SNC-Lavalin in 2017", "01885586"),
        ["Atkins"] = new("SNC-Lavalin / Atkins", "Acquired by SNC-Lavalin in 2017", "01885586"),

        // Pharmaceuticals
        ["Glaxo"] = new("GlaxoSmithKline (GSK)", "Merged with SmithKline Beecham in 2000 to form GSK", "03888792"),
        ["Glaxo Research & Development"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
        ["Glaxo Research & Development Ltd"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
        ["Glaxo Research and Development"] = new("GlaxoSmithKline (GSK)", "Glaxo R&D subsidiary, merged into GSK in 2000", "03888792"),
        ["Glaxo Wellcome"] = new("GlaxoSmithKline (GSK)", "Formed 1995 (Glaxo + Wellcome), merged with SmithKline Beecham 2000", "03888792"),
        ["SmithKline Beecham"] = new("GlaxoSmithKline (GSK)", "Merged with Glaxo Wellcome in 2000 to form GSK", "03888792"),
        ["Beecham"] = new("GlaxoSmithKline (GSK)", "Merged to form SmithKline Beecham, then GSK", "03888792"),
        ["Wellcome"] = new("GlaxoSmithKline (GSK)", "Acquired by Glaxo in 1995", "03888792"),
        ["ICI Pharmaceuticals"] = new("AstraZeneca", "ICI pharma division became Zeneca, merged with Astra 1999", "02723534"),
        ["Zeneca"] = new("AstraZeneca", "Merged with Astra in 1999", "02723534"),

        // Banking & Finance (historical names)
        ["Midland Bank"] = new("HSBC UK", "Acquired by HSBC in 1992", "00014259"),
        ["National Westminster Bank"] = new("NatWest (RBS Group)", "Acquired by RBS in 2000", "00929027"),
        ["NatWest"] = new("NatWest Group", "Part of NatWest Group (formerly RBS)", "00929027"),
        ["Lloyds Bank"] = new("Lloyds Banking Group", "Part of Lloyds Banking Group", "00002065"),
        ["Lloyds TSB"] = new("Lloyds Banking Group", "Rebranded to Lloyds Bank in 2013", "00002065"),
        ["TSB"] = new("TSB Bank", "Demerged from Lloyds in 2013, acquired by Sabadell", "SC205310"),
        ["Halifax"] = new("Halifax (Lloyds Banking Group)", "Part of Lloyds Banking Group since 2009", "02367076"),
        ["HBOS"] = new("Lloyds Banking Group", "Acquired by Lloyds in 2009", "SC218813"),
        ["Bank of Scotland"] = new("Bank of Scotland (Lloyds Banking Group)", "Part of Lloyds Banking Group", "SC327000"),
        ["Abbey National"] = new("Santander UK", "Acquired by Santander in 2004", "02294747"),
        ["Alliance & Leicester"] = new("Santander UK", "Acquired by Santander in 2008", "03263713"),
        ["Bradford & Bingley"] = new("Santander UK (savings) / UKAR (mortgages)", "Nationalised 2008, split up", "00189520"),
        ["Northern Rock"] = new("Virgin Money UK", "Nationalised 2008, sold to Virgin Money 2012", "03273685"),

        // Retail
        ["Woolworths"] = new("Dissolved", "UK Woolworths went into administration in 2008", "00106966"),
        ["British Home Stores"] = new("Dissolved", "BHS went into administration in 2016", "00229606"),
        ["BHS"] = new("Dissolved", "BHS went into administration in 2016", "00229606"),
        ["Littlewoods"] = new("Shop Direct / The Very Group", "Stores closed, online business continued", null),
        // Company number left null: the value previously stored here ("00abortedte") was a
        // placeholder, not a valid 8-character company number, and must not be used for lookups.
        ["Comet"] = new("Dissolved", "Electrical retailer went into administration in 2012", null),
        ["MFI"] = new("Dissolved", "Furniture retailer went into administration in 2008", null),
        ["Courts"] = new("Dissolved", "Furniture retailer ceased UK operations", null),
        ["Safeway"] = new("Morrisons", "UK stores acquired by Morrisons in 2004", "00358949"),
        ["Kwik Save"] = new("Dissolved", "Supermarket chain dissolved in 2007", null),
        ["Fine Fare"] = new("Dissolved", "Supermarket chain - stores sold to various buyers", null),
        ["Gateway"] = new("Somerfield / Co-op", "Became Somerfield, then acquired by Co-op", null),
        ["Somerfield"] = new("Co-operative Group", "Acquired by Co-op in 2009", null),

        // Telecoms
        ["British Telecom"] = new("BT Group", "Rebranded to BT", "01800000"),
        ["GPO Telephones"] = new("BT Group", "Became British Telecom, then BT", "01800000"),
        ["Mercury Communications"] = new("Cable & Wireless / Vodafone", "Merged into Cable & Wireless, later Vodafone", null),
        ["Cellnet"] = new("O2 (Virgin Media O2)", "Became BT Cellnet, then O2", null),
        ["Orange"] = new("EE (BT)", "Merged with T-Mobile to form EE, acquired by BT", null),
        ["T-Mobile UK"] = new("EE (BT)", "Merged with Orange to form EE", null),
        ["One2One"] = new("EE (BT)", "Became T-Mobile UK, then EE", null),

        // Utilities
        ["Central Electricity Generating Board"] = new("National Grid / Various generators", "CEGB privatised and split in 1990", null),
        ["CEGB"] = new("National Grid / Various generators", "CEGB privatised and split in 1990", null),
        ["British Gas"] = new("Centrica / National Grid", "Demerged in 1997", "00029782"),
        ["Eastern Electricity"] = new("EDF Energy", "Privatised, now part of EDF", null),
        ["London Electricity"] = new("EDF Energy", "Privatised, now part of EDF", null),
        ["SEEBOARD"] = new("EDF Energy", "Privatised, now part of EDF", null),
        ["PowerGen"] = new("E.ON UK", "Acquired by E.ON", null),
        ["National Power"] = new("RWE npower / Innogy", "Split and acquired", null),

        // Manufacturing & Industrial
        ["British Steel"] = new("Tata Steel UK / British Steel (2016)", "Privatised, acquired by Corus then Tata, British Steel name revived 2016", "12303256"),
        ["British Steel Corporation"] = new("Tata Steel UK / British Steel (2016)", "Nationalised steel industry, privatised 1988", "12303256"),
        ["British Steel plc"] = new("Tata Steel UK / British Steel (2016)", "Merged with Hoogovens to form Corus 1999", "12303256"),
        ["Corus"] = new("Tata Steel UK", "Acquired by Tata Steel in 2007", null),
        ["British Leyland"] = new("Various (BMW, Tata, etc.)", "Split up - brands went to various owners", null),
        ["Rover Group"] = new("Dissolved", "Final owner MG Rover went bankrupt 2005", null),
        ["MG Rover"] = new("Dissolved", "Went into administration in 2005", null),
        ["Austin Rover"] = new("Dissolved", "Part of British Leyland, became Rover Group", null),
        ["British Aerospace"] = new("BAE Systems", "Merged with Marconi Electronic Systems in 1999", "01470151"),
        ["BAe"] = new("BAE Systems", "Merged with Marconi Electronic Systems in 1999", "01470151"),
        ["Marconi"] = new("BAE Systems / Ericsson", "Defence division to BAE, telecoms to Ericsson", null),
        ["GEC"] = new("Various", "General Electric Company (UK) - broken up", null),
        ["GEC Marconi"] = new("BAE Systems", "Defence business became part of BAE Systems", "01470151"),
        ["Plessey"] = new("Siemens / various", "Broken up in 1989", null),
        ["ICL"] = new("Fujitsu", "Acquired by Fujitsu", null),
        ["International Computers Limited"] = new("Fujitsu", "Acquired by Fujitsu in 2002", null),
        ["Ferranti"] = new("Dissolved", "Collapsed in 1993 after fraud scandal", null),

        // Oil & Gas
        ["British Petroleum"] = new("BP", "Rebranded to BP", "00102498"),
        ["BP Amoco"] = new("BP", "Merged 1998, rebranded to just BP", "00102498"),
        ["Enterprise Oil"] = new("Shell", "Acquired by Shell in 2002", null),
        ["Lasmo"] = new("Eni", "Acquired by Eni in 2001", null),
        ["Britoil"] = new("BP", "Acquired by BP in 1988", null),

        // Transport
        ["British Rail"] = new("Various (Network Rail, TOCs)", "Privatised and split in 1990s", null),
        ["British Railways"] = new("Various (Network Rail, TOCs)", "Became British Rail, then privatised", null),
        ["Railtrack"] = new("Network Rail", "Replaced by Network Rail in 2002", "04402220"),
        ["British Airways"] = new("British Airways (IAG)", "Now part of International Airlines Group", "01777777"),
        ["British Caledonian"] = new("British Airways", "Acquired by BA in 1987", null),
        ["British European Airways"] = new("British Airways", "Merged with BOAC to form BA in 1974", null),
        ["BEA"] = new("British Airways", "Merged with BOAC to form BA in 1974", null),
        ["BOAC"] = new("British Airways", "Merged with BEA to form BA in 1974", null),
        ["British Overseas Airways Corporation"] = new("British Airways", "Merged with BEA to form BA in 1974", null),
        ["Dan-Air"] = new("British Airways", "Acquired by BA in 1992", null),

        // Media
        ["Thames Television"] = new("Fremantle", "Lost franchise 1991, production continued", null),
        ["Granada Television"] = new("ITV plc", "Merged to form ITV plc", "04967001"),
        ["Carlton Television"] = new("ITV plc", "Merged with Granada to form ITV", "04967001"),
        ["Yorkshire Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
        ["Tyne Tees Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
        ["Central Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
        ["Anglia Television"] = new("ITV plc", "Part of ITV plc", "04967001"),
        ["HTV"] = new("ITV plc", "Part of ITV plc", "04967001"),
        ["LWT"] = new("ITV plc", "London Weekend Television, part of ITV", "04967001"),
        ["London Weekend Television"] = new("ITV plc", "Part of ITV plc", "04967001"),

        // Construction
        ["Wimpey"] = new("Taylor Wimpey", "Merged with Taylor Woodrow in 2007", "00296805"),
        ["Taylor Woodrow"] = new("Taylor Wimpey", "Merged with Wimpey in 2007", "00296805"),
        ["John Laing"] = new("John Laing Group (infrastructure)", "Construction sold, now infrastructure investor", "05975300"),
        ["Costain Group"] = new("Costain", "Still trading", "00102921"),
        ["Tarmac"] = new("Tarmac (CRH)", "Construction now part of CRH", null),
        ["Alfred McAlpine"] = new("Carillion (dissolved)", "Acquired by Carillion, which collapsed 2018", null),
        ["Carillion"] = new("Dissolved", "Collapsed into liquidation in 2018", "03782379"),
        ["Mowlem"] = new("Carillion (dissolved)", "Acquired by Carillion in 2006", null),
        ["Balfour Beatty"] = new("Balfour Beatty", "Still trading", "00395826"),

        // Insurance
        ["Royal Insurance"] = new("RSA Insurance Group", "Merged with Sun Alliance", "02339826"),
        ["Sun Alliance"] = new("RSA Insurance Group", "Merged with Royal Insurance", "02339826"),
        ["Guardian Royal Exchange"] = new("AXA", "Acquired by AXA in 1999", null),
        ["Commercial Union"] = new("Aviva", "Merged to form CGU, then Aviva", "02468686"),
        ["General Accident"] = new("Aviva", "Merged to form CGU, then Aviva", "02468686"),
        ["CGU"] = new("Aviva", "Rebranded to Aviva in 2002", "02468686"),
        ["Norwich Union"] = new("Aviva", "Rebranded to Aviva in 2009", "02468686"),
        ["Eagle Star"] = new("Zurich", "Acquired by Zurich", null),
        ["Prudential"] = new("Prudential plc / M&G", "UK business demerged as M&G plc", "01397169"),
    };
/// <summary>
/// Major UK charities and non-profit organisations.
/// These are legitimate employers but may not be found via standard company search.
/// </summary>
/// <remarks>
/// Entries are compared case-insensitively. Besides exact membership checks, each entry is
/// also used as a partial-match pattern by <see cref="IsCharityEmployer"/>, so short or
/// generic entries (e.g. "Mind", "CAB") widen what that method accepts.
/// </remarks>
public static readonly HashSet<string> CharityEmployers = new(StringComparer.OrdinalIgnoreCase)
{
    // Youth organisations
    "Girlguiding",
    "Girlguiding UK",
    "Girlguiding North East England",
    "Girl Guides",
    "Scouts",
    "Scout Association",
    "Boys Brigade",
    "Girls Brigade",
    "Cadets",
    "Sea Cadets",
    "Air Cadets",
    "Army Cadets",

    // Major charities
    "British Red Cross",
    "Oxfam",
    "Save the Children",
    "NSPCC",
    "Barnardo's",
    "RSPCA",
    "RSPB",
    "National Trust",
    "Cancer Research UK",
    "British Heart Foundation",
    "Macmillan Cancer Support",
    "Marie Curie",
    "Age UK",
    "Mind",
    "Samaritans",
    "Shelter",
    "Citizens Advice",
    "Citizens Advice Bureau",
    "CAB",
    "St John Ambulance",
    "Salvation Army",
    "YMCA",
    "YWCA",

    // Religious organisations
    // ("Salvation Army" is already listed under major charities; the duplicate entry that
    // used to sit here was silently ignored by the set and has been removed.)
    "Church of England",
    "Catholic Church",
    "Methodist Church",
    "Baptist Church",
};
/// <summary>
/// Names (and name fragments) of public sector organisations and government bodies.
/// These are treated as legitimate employers even when a company search does not find them.
/// </summary>
/// <remarks>
/// Comparison is case-insensitive. Entries such as "University of" or "County Council" are
/// fragments rather than full names; they only make sense as partial-match patterns, which is
/// how <see cref="IsPublicSectorEmployer"/> uses every entry in this set.
/// </remarks>
public static readonly HashSet<string> PublicSectorEmployers = new(StringComparer.OrdinalIgnoreCase)
{
    // Emergency Services
    "Metropolitan Police", "Metropolitan Police Service", "Metropolitan Police Engineers", "Met Police",
    "City of London Police", "British Transport Police", "Police Scotland",
    "Police Service of Northern Ireland", "PSNI",
    "London Fire Brigade", "London Ambulance Service",
    "NHS", "National Health Service",

    // Government Departments
    "HM Treasury", "Home Office", "Foreign Office",
    "Ministry of Defence", "MOD",
    "Department of Health",
    "Department for Education", "DfE",
    "Department for Work and Pensions", "DWP",
    "HMRC", "HM Revenue and Customs",
    "Cabinet Office",
    "DVLA", "DVSA",
    "Environment Agency",
    "Highways Agency", "Highways England", "National Highways",

    // Armed Forces
    "British Army", "Royal Navy", "Royal Air Force", "RAF", "Royal Marines",

    // Local Government
    "London Borough", "County Council", "City Council", "District Council",
    "Metropolitan Borough", "Borough Council", "Town Council", "Parish Council",
    "Greater London Council", "GLC",

    // Education
    "University of", "College of", "School of",

    // Other Public Bodies
    "BBC", "British Broadcasting Corporation", "Channel 4",
    "Bank of England",
    "Royal Mail", "Post Office",
    "Transport for London", "TfL",
    "Network Rail",
    "Ordnance Survey", "Land Registry", "Companies House", "National Archives",
    "British Library", "British Museum", "National Gallery", "Tate",
    "Natural History Museum", "Science Museum",
    "V&A", "Victoria and Albert Museum",
};
/// <summary>
/// Known names of internal divisions or departments, mapped to the parent organisation.
/// Key: division name as it may appear on a CV (case-insensitive).
/// Value: name of the parent company the division belongs to.
/// </summary>
/// <remarks>
/// A division is a legitimate employer reference but is not expected to be registered
/// separately, so verification should be carried out against the parent name instead.
/// </remarks>
public static readonly Dictionary<string, string> DivisionPatterns = new(StringComparer.OrdinalIgnoreCase)
{
    // Airlines
    { "British Airways Technical Support", "British Airways" },
    { "BA Technical Support", "British Airways" },
    { "BA Engineering", "British Airways" },
    { "British Airways Engineering", "British Airways" },
    { "FBA - British Airways", "British Airways" },

    // Major employers with divisions
    { "BBC News", "BBC" },
    { "BBC World Service", "BBC" },
    { "BBC Studios", "BBC" },
    { "ITV News", "ITV plc" },
    { "Sky News", "Sky UK" },
    { "BT Openreach", "BT Group" },
    { "Openreach", "BT Group" },
    { "BT Research", "BT Group" },
    { "Shell Research", "Shell" },
    { "BP Research", "BP" },
    { "Rolls-Royce Aerospace", "Rolls-Royce" },
    { "Rolls-Royce Marine", "Rolls-Royce" },
    { "BAE Systems Naval Ships", "BAE Systems" },
    { "BAE Systems Submarines", "BAE Systems" },

    // Banks - divisions
    { "Barclays Investment Bank", "Barclays" },
    { "Barclays Capital", "Barclays" },
    { "HSBC Investment Bank", "HSBC" },
    { "Lloyds Commercial Banking", "Lloyds Banking Group" },
    { "NatWest Markets", "NatWest Group" },
    { "RBS Markets", "NatWest Group" },
};
/// <summary>
/// Reports whether the given employer name is a key of <see cref="HistoricalCompanies"/>.
/// </summary>
/// <param name="employerName">Employer name to look up; surrounding whitespace is ignored.</param>
/// <returns>
/// <c>true</c> when the trimmed name is a known historical company (case-insensitive exact
/// match); <c>false</c> otherwise, including for null, empty or whitespace-only input.
/// </returns>
public static bool IsHistoricalEmployer(string employerName) =>
    !string.IsNullOrWhiteSpace(employerName)
    && HistoricalCompanies.ContainsKey(employerName.Trim());
/// <summary>
/// Looks up the successor details recorded for a historical employer.
/// </summary>
/// <param name="employerName">Employer name to look up; surrounding whitespace is ignored.</param>
/// <returns>
/// The matching <see cref="HistoricalEmployerInfo"/> from <see cref="HistoricalCompanies"/>,
/// or <c>null</c> when the name is null/blank or is not a known historical company.
/// </returns>
public static HistoricalEmployerInfo? GetHistoricalEmployerInfo(string employerName)
{
    if (string.IsNullOrWhiteSpace(employerName))
    {
        return null;
    }

    var key = employerName.Trim();
    return HistoricalCompanies.TryGetValue(key, out var info) ? info : null;
}
/// <summary>
/// Check if an employer is a public sector organisation.
/// </summary>
/// <param name="employerName">Employer name to classify; surrounding whitespace is ignored.</param>
/// <returns>
/// <c>true</c> when the trimmed name equals an entry of <see cref="PublicSectorEmployers"/>
/// or contains one as a whole word/phrase (e.g. "London Borough of X"); otherwise <c>false</c>.
/// Null, empty or whitespace-only input yields <c>false</c>.
/// </returns>
public static bool IsPublicSectorEmployer(string employerName)
{
    if (string.IsNullOrWhiteSpace(employerName))
    {
        return false;
    }

    var name = employerName.Trim();

    // Direct match (O(1) fast path; the set is case-insensitive).
    if (PublicSectorEmployers.Contains(name))
    {
        return true;
    }

    // Partial match for patterns like "London Borough of X".
    // Whole-word matching is required: a raw substring test makes "MOD" match "Modern ...",
    // "RAF" match "Aircraft ..." and "Tate" match "Estate ...".
    foreach (var pattern in PublicSectorEmployers)
    {
        if (ContainsWholeWord(name, pattern))
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Check if an employer is a charity or non-profit organisation.
/// </summary>
/// <param name="employerName">Employer name to classify; surrounding whitespace is ignored.</param>
/// <returns>
/// <c>true</c> when the trimmed name equals an entry of <see cref="CharityEmployers"/> or
/// contains one as a whole word/phrase; otherwise <c>false</c>. Null, empty or
/// whitespace-only input yields <c>false</c>.
/// </returns>
public static bool IsCharityEmployer(string employerName)
{
    if (string.IsNullOrWhiteSpace(employerName))
    {
        return false;
    }

    var name = employerName.Trim();

    // Direct match (O(1) fast path; the set is case-insensitive).
    if (CharityEmployers.Contains(name))
    {
        return true;
    }

    // Partial match on whole words only, so that "Mind" does not match "Mindshare"
    // and "CAB" does not match "Cable & Wireless".
    foreach (var pattern in CharityEmployers)
    {
        if (ContainsWholeWord(name, pattern))
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Check if an employer name is an internal division and get the parent company.
/// </summary>
/// <param name="employerName">Employer name to inspect; surrounding whitespace is ignored.</param>
/// <returns>
/// The parent company name from <see cref="DivisionPatterns"/> when the trimmed name equals a
/// known division or contains one as a whole word/phrase; otherwise <c>null</c>.
/// </returns>
public static string? GetParentCompanyForDivision(string employerName)
{
    if (string.IsNullOrWhiteSpace(employerName))
    {
        return null;
    }

    var name = employerName.Trim();

    // Direct match
    if (DivisionPatterns.TryGetValue(name, out var parent))
    {
        return parent;
    }

    // Partial match on whole words only, so that "BA Engineering" does not match
    // "Alibaba Engineering".
    foreach (var (pattern, parentCompany) in DivisionPatterns)
    {
        if (ContainsWholeWord(name, pattern))
        {
            return parentCompany;
        }
    }

    return null;
}

/// <summary>
/// Case-insensitive search for <paramref name="pattern"/> inside <paramref name="text"/>
/// that only accepts occurrences delimited by word boundaries.
/// </summary>
/// <remarks>
/// An occurrence counts when the character before it and the character after it are each
/// either absent (start/end of <paramref name="text"/>) or not a letter or digit.
/// This narrows, but does not eliminate, false positives: a name that genuinely contains a
/// pattern as a separate word (e.g. "Tate &amp; Lyle" and the pattern "Tate") still matches.
/// </remarks>
/// <param name="text">The text to search in.</param>
/// <param name="pattern">The word or phrase to look for.</param>
/// <returns><c>true</c> if a boundary-delimited occurrence exists; otherwise <c>false</c>.</returns>
private static bool ContainsWholeWord(string text, string pattern)
{
    if (pattern.Length == 0)
    {
        return false;
    }

    var searchFrom = 0;
    while (searchFrom <= text.Length - pattern.Length)
    {
        // Ordinal comparison guarantees the matched span is exactly pattern.Length chars long.
        var index = text.IndexOf(pattern, searchFrom, StringComparison.OrdinalIgnoreCase);
        if (index < 0)
        {
            return false;
        }

        var end = index + pattern.Length;
        var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
        var endsOnBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);
        if (startsOnBoundary && endsOnBoundary)
        {
            return true;
        }

        // This occurrence sat inside a longer word; keep looking for a later one.
        searchFrom = index + 1;
    }

    return false;
}
}
/// <summary>
/// Information about a historical employer: who it became, a short explanatory note and,
/// where recorded, a company number.
/// </summary>
/// <param name="SuccessorName">
/// Name of the current or successor organisation. The data in this file also uses free-text
/// values here such as "Dissolved" or "Various" when there is no single successor.
/// </param>
/// <param name="Notes">Short human-readable description of what happened to the employer.</param>
/// <param name="CompanyNumber">
/// Company number associated with the entry, or <c>null</c> when none is recorded.
/// NOTE(review): presumably a Companies House registration number - confirm against callers.
/// </param>
public sealed record HistoricalEmployerInfo(
string SuccessorName,
string Notes,
string? CompanyNumber
);

View File

@@ -122,6 +122,28 @@ public static class UKInstitutions
"Wrexham University", "Wrexham University",
"York St John University", "York St John University",
// Post-1992 Universities (former polytechnics)
"Leeds Beckett University",
"Birmingham City University",
"University of Bedfordshire",
"Anglia Ruskin University",
"University of Central Lancashire",
"University of West London",
"University of Northampton",
"University of Chichester",
"Plymouth Marjon University",
"Bath Spa University",
"Solent University",
"University of Bolton",
"University of Cumbria",
"University of Chester",
"University of Gloucestershire",
"University of Suffolk",
"Newman University",
"Bishop Grosseteste University",
"Harper Adams University",
"Royal Agricultural University",
// Scottish Universities // Scottish Universities
"University of Aberdeen", "University of Aberdeen",
"Abertay University", "Abertay University",
@@ -134,6 +156,8 @@ public static class UKInstitutions
"Bangor University", "Bangor University",
"University of South Wales", "University of South Wales",
"Wrexham Glyndwr University", "Wrexham Glyndwr University",
"Wrexham University",
"Cardiff Metropolitan University",
// Northern Ireland // Northern Ireland
"Ulster University", "Ulster University",
@@ -304,6 +328,112 @@ public static class UKInstitutions
["South Bank University"] = "London South Bank University", ["South Bank University"] = "London South Bank University",
["LSBU"] = "London South Bank University", ["LSBU"] = "London South Bank University",
// Historical polytechnic names (became universities in 1992)
// These are legitimate institutions that existed under different names
["South Bank Polytechnic"] = "London South Bank University",
["Polytechnic of the South Bank"] = "London South Bank University",
["Thames Polytechnic"] = "University of Greenwich",
["Woolwich Polytechnic"] = "University of Greenwich",
["Polytechnic of Central London"] = "University of Westminster",
["PCL"] = "University of Westminster",
["Polytechnic of North London"] = "London Metropolitan University",
["City of London Polytechnic"] = "London Metropolitan University",
["London Guildhall University"] = "London Metropolitan University",
["University of North London"] = "London Metropolitan University",
["Polytechnic of East London"] = "University of East London",
["North East London Polytechnic"] = "University of East London",
["Middlesex Polytechnic"] = "Middlesex University",
["Hatfield Polytechnic"] = "University of Hertfordshire",
["Sheffield Polytechnic"] = "Sheffield Hallam University",
["Sheffield City Polytechnic"] = "Sheffield Hallam University",
["Manchester Polytechnic"] = "Manchester Metropolitan University",
["Leeds Polytechnic"] = "Leeds Beckett University",
["Leeds Metropolitan University"] = "Leeds Beckett University",
["Leicester Polytechnic"] = "De Montfort University",
["Coventry Polytechnic"] = "Coventry University",
["Lanchester Polytechnic"] = "Coventry University",
["Brighton Polytechnic"] = "University of Brighton",
["Portsmouth Polytechnic"] = "University of Portsmouth",
["Plymouth Polytechnic"] = "University of Plymouth",
["Polytechnic South West"] = "University of Plymouth",
["Oxford Polytechnic"] = "Oxford Brookes University",
["Newcastle Polytechnic"] = "Northumbria University",
["Newcastle upon Tyne Polytechnic"] = "Northumbria University",
["Sunderland Polytechnic"] = "University of Sunderland",
["Teesside Polytechnic"] = "Teesside University",
["Huddersfield Polytechnic"] = "University of Huddersfield",
["Wolverhampton Polytechnic"] = "University of Wolverhampton",
["Liverpool Polytechnic"] = "Liverpool John Moores University",
["Bristol Polytechnic"] = "University of the West of England",
["Kingston Polytechnic"] = "Kingston University",
["Nottingham Polytechnic"] = "Nottingham Trent University",
["Trent Polytechnic"] = "Nottingham Trent University",
["Birmingham Polytechnic"] = "Birmingham City University",
["City of Birmingham Polytechnic"] = "Birmingham City University",
["University of Central England"] = "Birmingham City University",
["UCE Birmingham"] = "Birmingham City University",
["Staffordshire Polytechnic"] = "Staffordshire University",
["North Staffordshire Polytechnic"] = "Staffordshire University",
["Luton College of Higher Education"] = "University of Bedfordshire",
["University of Luton"] = "University of Bedfordshire",
["Anglia Polytechnic"] = "Anglia Ruskin University",
["Anglia Polytechnic University"] = "Anglia Ruskin University",
["APU"] = "Anglia Ruskin University",
["Cambridgeshire College of Arts and Technology"] = "Anglia Ruskin University",
["CCAT"] = "Anglia Ruskin University",
["Bournemouth Polytechnic"] = "Bournemouth University",
["Dorset Institute of Higher Education"] = "Bournemouth University",
["Derby College of Higher Education"] = "University of Derby",
["Derbyshire College of Higher Education"] = "University of Derby",
["Humberside Polytechnic"] = "University of Lincoln",
["Humberside College of Higher Education"] = "University of Lincoln",
["University of Humberside"] = "University of Lincoln",
["University of Lincolnshire and Humberside"] = "University of Lincoln",
["Central Lancashire Polytechnic"] = "University of Central Lancashire",
["Preston Polytechnic"] = "University of Central Lancashire",
["Lancashire Polytechnic"] = "University of Central Lancashire",
["Glamorgan Polytechnic"] = "University of South Wales",
["Polytechnic of Wales"] = "University of South Wales",
["University of Glamorgan"] = "University of South Wales",
["Robert Gordon Institute of Technology"] = "Robert Gordon University",
["RGIT"] = "Robert Gordon University",
["Napier Polytechnic"] = "Edinburgh Napier University",
["Napier College"] = "Edinburgh Napier University",
["Glasgow Polytechnic"] = "Glasgow Caledonian University",
["Queen's College Glasgow"] = "Glasgow Caledonian University",
["Dundee Institute of Technology"] = "Abertay University",
["Dundee College of Technology"] = "Abertay University",
// Other historical name changes
["Roehampton Institute"] = "Roehampton University",
["University of Surrey Roehampton"] = "Roehampton University",
["Thames Valley University"] = "University of West London",
["Polytechnic of West London"] = "University of West London",
["Ealing College of Higher Education"] = "University of West London",
["London College of Music and Media"] = "University of West London",
["University College Northampton"] = "University of Northampton",
["Nene College"] = "University of Northampton",
["University College Worcester"] = "University of Worcester",
["Worcester College of Higher Education"] = "University of Worcester",
["University College Chichester"] = "University of Chichester",
["Chichester Institute of Higher Education"] = "University of Chichester",
["College of St Mark and St John"] = "Plymouth Marjon University",
["Marjon"] = "Plymouth Marjon University",
["University of St Mark and St John"] = "Plymouth Marjon University",
["University College Falmouth"] = "Falmouth University",
["Falmouth College of Arts"] = "Falmouth University",
["Bath College of Higher Education"] = "Bath Spa University",
["Bath Spa University College"] = "Bath Spa University",
["Liverpool Institute of Higher Education"] = "Liverpool Hope University",
["Liverpool Hope University College"] = "Liverpool Hope University",
["University of Wales, Newport"] = "University of South Wales",
["University of Wales Institute, Cardiff"] = "Cardiff Metropolitan University",
["UWIC"] = "Cardiff Metropolitan University",
["North East Wales Institute"] = "Wrexham University",
["NEWI"] = "Wrexham University",
["Glyndwr University"] = "Wrexham University",
["Wrexham Glyndwr University"] = "Wrexham University",
// Other common variations // Other common variations
["Open University"] = "The Open University", ["Open University"] = "The Open University",
["OU"] = "The Open University", ["OU"] = "The Open University",

View File

@@ -12,4 +12,13 @@ public interface ICompanyNameMatcherService
string cvCompanyName, string cvCompanyName,
List<CompanyCandidate> candidates, List<CompanyCandidate> candidates,
CancellationToken cancellationToken = default); CancellationToken cancellationToken = default);
/// <summary>
/// Uses AI to detect if a company name contains multiple companies and extract them.
/// Returns null or single-item list if it's a single company (e.g., "Ernst & Young").
/// Returns multiple items if compound (e.g., "ASDA/WALMART" -> ["ASDA", "WALMART"]).
/// </summary>
/// <param name="companyName">The company name to analyse, which may combine several companies.</param>
/// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
/// <returns>
/// A task producing the extracted company names. Callers must treat both <c>null</c> and a
/// single-item list as "this is one company", and only a list with two or more items as a
/// compound name.
/// </returns>
Task<List<string>?> ExtractCompanyNamesAsync(
string companyName,
CancellationToken cancellationToken = default);
} }

View File

@@ -39,22 +39,33 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
Determine which candidate (if any) is the SAME company as the CV entry. Determine which candidate (if any) is the SAME company as the CV entry.
Rules: Matching Guidelines:
1. A match requires the companies to be the SAME organisation, not just similar names 1. MATCH if the CV name is clearly the same organisation as a candidate:
2. "Families First CiC" is NOT the same as "FAMILIES AGAINST CONFORMITY LTD" - different words = different companies - "Royal Bank of Scotland" "THE ROYAL BANK OF SCOTLAND PUBLIC LIMITED COMPANY" (same bank)
3. Trading names should match their registered entity (e.g., "Tesco" matches "TESCO PLC") - "Yorkshire Electricity" "YORKSHIRE ELECTRICITY GROUP PLC" (same utility)
4. Subsidiaries can match if clearly the same organisation (e.g., "ASDA" could match "ASDA STORES LIMITED") - "Tesco" "TESCO PLC" (trading name = registered name)
5. Acronyms in parentheses are abbreviations of the full name (e.g., "North Halifax Partnership (NHP)" = "NORTH HALIFAX PARTNERSHIP") - "ASDA" "ASDA STORES LIMITED" (brand = operating company)
6. CiC/CIC = Community Interest Company, LLP = Limited Liability Partnership - these are legal suffixes
7. If the CV name contains all the key words of a candidate (ignoring Ltd/Limited/CIC/etc.), it's likely a match 2. DO NOT MATCH if the words are fundamentally different:
8. If NO candidate is clearly the same company, return "NONE" as the best match - "Families First" "FAMILIES AGAINST CONFORMITY" (different words after "Families")
- "Royal Bank" "Royal Academy" (Bank Academy)
- "Storm Ideas" "STORM LIMITED" (missing "Ideas" - could be different company)
3. Legal suffixes (Ltd, Limited, PLC, LLP, CiC) should be ignored when comparing names
4. Adding "THE" or "GROUP" to a name doesn't make it a different company
5. If unsure, prefer matching over rejecting when core identifying words match
CRITICAL: Return the COMPLETE company number exactly as shown (e.g., "SC083026", "02366995").
Do NOT truncate or abbreviate the company number.
Respond with this exact JSON structure: Respond with this exact JSON structure:
{ {
"bestMatchCompanyNumber": "string (company number of best match, or 'NONE' if no valid match)", "bestMatchCompanyNumber": "COMPLETE company number from the list above, or 'NONE' if no valid match",
"confidenceScore": number (0-100, where 100 = certain match, 0 = no match), "confidenceScore": number (0-100, where 100 = certain match, 0 = no match),
"matchType": "string (Exact, TradingName, Subsidiary, Parent, NoMatch)", "matchType": "Exact|TradingName|Subsidiary|Parent|NoMatch",
"reasoning": "string (brief explanation of why this is or isn't a match)" "reasoning": "brief explanation"
} }
"""; """;
@@ -81,8 +92,9 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
try try
{ {
// Format candidates with company number prominently displayed to prevent truncation
var candidatesText = string.Join("\n", candidates.Select((c, i) => var candidatesText = string.Join("\n", candidates.Select((c, i) =>
$"{i + 1}. {c.CompanyName} (Number: {c.CompanyNumber}, Status: {c.CompanyStatus ?? "Unknown"})")); $"[{c.CompanyNumber}] {c.CompanyName} (Status: {c.CompanyStatus ?? "Unknown"})"));
var prompt = MatchingPrompt var prompt = MatchingPrompt
.Replace("{CV_COMPANY}", cvCompanyName) .Replace("{CV_COMPANY}", cvCompanyName)
@@ -127,7 +139,8 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
aiResponse.BestMatchCompanyNumber, aiResponse.ConfidenceScore, aiResponse.Reasoning); aiResponse.BestMatchCompanyNumber, aiResponse.ConfidenceScore, aiResponse.Reasoning);
// Find the matched candidate // Find the matched candidate
if (aiResponse.BestMatchCompanyNumber == "NONE" || aiResponse.ConfidenceScore < 50) // Lower threshold to 30 - we have fuzzy validation as backup
if (aiResponse.BestMatchCompanyNumber == "NONE" || aiResponse.ConfidenceScore < 30)
{ {
return new SemanticMatchResult return new SemanticMatchResult
{ {
@@ -142,10 +155,40 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
var matchedCandidate = candidates.FirstOrDefault(c => var matchedCandidate = candidates.FirstOrDefault(c =>
c.CompanyNumber.Equals(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase)); c.CompanyNumber.Equals(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase));
// If exact match not found, try to find a candidate that starts with the returned number
// This handles cases where AI truncates "09052626" to "09" or similar
if (matchedCandidate is null && !string.IsNullOrWhiteSpace(aiResponse.BestMatchCompanyNumber)
&& aiResponse.BestMatchCompanyNumber != "NONE")
{
var partialMatch = candidates.FirstOrDefault(c =>
c.CompanyNumber.StartsWith(aiResponse.BestMatchCompanyNumber, StringComparison.OrdinalIgnoreCase));
if (partialMatch is not null)
{
_logger.LogDebug("AI returned partial company number '{Partial}', matched to full number '{Full}'",
aiResponse.BestMatchCompanyNumber, partialMatch.CompanyNumber);
matchedCandidate = partialMatch;
}
else
{
// Try reverse - maybe AI returned a longer string that contains the actual number
var reverseMatch = candidates.FirstOrDefault(c =>
aiResponse.BestMatchCompanyNumber.Contains(c.CompanyNumber, StringComparison.OrdinalIgnoreCase));
if (reverseMatch is not null)
{
_logger.LogDebug("AI returned string containing company number '{Number}'",
reverseMatch.CompanyNumber);
matchedCandidate = reverseMatch;
}
}
}
if (matchedCandidate is null) if (matchedCandidate is null)
{ {
_logger.LogWarning("AI returned company number {Number} not in candidates list", _logger.LogWarning("AI returned company number '{Number}' not in candidates list. Candidates: {Candidates}",
aiResponse.BestMatchCompanyNumber); aiResponse.BestMatchCompanyNumber,
string.Join(", ", candidates.Select(c => c.CompanyNumber)));
return null; return null;
} }
@@ -164,4 +207,107 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
return null; // Fall back to fuzzy matching return null; // Fall back to fuzzy matching
} }
} }
/// <summary>
/// Prompt template sent to the AI model to decide whether a CV company name refers to one
/// company or to several. The <c>{COMPANY_NAME}</c> placeholder is substituted with
/// <c>string.Replace</c> by <c>ExtractCompanyNamesAsync</c> before the request is made.
/// The model is asked to answer with a JSON object carrying <c>isSingleCompany</c>,
/// <c>companies</c> and <c>reasoning</c>, which is deserialized into <c>CompoundNameResponse</c>.
/// </summary>
private const string CompoundNamePrompt = """
Analyze this company name from a CV and determine if it refers to ONE company or MULTIPLE companies.
Company name: "{COMPANY_NAME}"
Examples:
- "Ernst & Young" ONE company (it's the full name of the accounting firm)
- "Marks & Spencer" ONE company (it's the full name of the retailer)
- "ASDA/WALMART" TWO companies: ["ASDA", "WALMART"] (person worked at both or it's showing ownership)
- "Corus & Laura Ashley Hotels" TWO companies: ["Corus", "Laura Ashley Hotels"] (different industries)
- "PwC" ONE company
- "Deloitte and Touche" ONE company (historical name of Deloitte)
- "BMW Group Ireland" ONE company
- "Tesco Stores and Distribution" ONE company (departments of same company)
Rules:
1. Well-known company names with "&" or "and" are SINGLE companies (Ernst & Young, Marks & Spencer, Procter & Gamble)
2. A "/" usually indicates multiple companies or ownership relationship
3. If the parts are in completely different industries, they're likely separate companies
4. If one part is clearly a subsidiary/department of the other, treat as ONE company
Respond with ONLY valid JSON:
{
"isSingleCompany": boolean,
"companies": ["company1", "company2"] or ["single company name"],
"reasoning": "brief explanation"
}
""";
/// <summary>
/// Asks the AI model whether <paramref name="companyName"/> names a single company or several,
/// and returns the individual names when it is a compound (e.g. "ASDA/WALMART").
/// </summary>
/// <param name="companyName">Company name as it appears on the CV.</param>
/// <param name="cancellationToken">Token used to cancel the AI request.</param>
/// <returns>
/// The extracted company names (trimmed, without blank or duplicate entries, at least two of them)
/// when the model reports a compound name; <c>null</c> when the input is blank, the model reports a
/// single company, fewer than two usable names remain, or the request/parsing fails.
/// </returns>
/// <exception cref="OperationCanceledException">
/// The supplied <paramref name="cancellationToken"/> was cancelled.
/// </exception>
public async Task<List<string>?> ExtractCompanyNamesAsync(
    string companyName,
    CancellationToken cancellationToken = default)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return null;
    }

    _logger.LogDebug("Using AI to check if '{CompanyName}' is a compound name", companyName);

    try
    {
        var prompt = CompoundNamePrompt.Replace("{COMPANY_NAME}", companyName);
        var messages = new List<Message>
        {
            new(RoleType.User, prompt)
        };
        var parameters = new MessageParameters
        {
            Model = "claude-3-5-haiku-20241022",
            MaxTokens = 256,
            Messages = messages,
            System = [new SystemMessage("You are a company name parser. Respond only with valid JSON.")]
        };

        var response = await _anthropicClient.Messages.GetClaudeMessageAsync(parameters, cancellationToken);
        var responseText = response.Content
            .OfType<TextContent>()
            .FirstOrDefault()?.Text;
        if (string.IsNullOrWhiteSpace(responseText))
        {
            _logger.LogWarning("AI returned empty response for compound name check");
            return null;
        }

        responseText = JsonResponseHelper.CleanJsonResponse(responseText);
        var result = JsonSerializer.Deserialize<CompoundNameResponse>(responseText, JsonDefaults.CamelCase);
        if (result is null)
        {
            _logger.LogWarning("Failed to deserialize compound name response: {Response}", responseText);
            return null;
        }

        // The model output is untrusted: drop null/blank entries, trim the rest and collapse
        // duplicates so callers never receive a part they cannot verify.
        var companies = result.Companies?
            .Where(name => !string.IsNullOrWhiteSpace(name))
            .Select(name => name.Trim())
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();

        _logger.LogDebug("AI compound name result: IsSingle={IsSingle}, Companies=[{Companies}], Reasoning={Reasoning}",
            result.IsSingleCompany, string.Join(", ", companies ?? []), result.Reasoning);

        if (result.IsSingleCompany || companies is null || companies.Count < 2)
        {
            return null; // Single company, no splitting needed
        }

        return companies;
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // A caller-requested cancellation is not an AI failure; let it propagate instead of
        // logging it as an error and silently returning "not a compound name".
        throw;
    }
    catch (Exception ex)
    {
        // Best-effort feature: any other failure falls back to treating the name as a single company.
        _logger.LogError(ex, "AI compound name detection failed for '{CompanyName}'", companyName);
        return null;
    }
}
/// <summary>
/// Shape of the JSON object requested by <c>CompoundNamePrompt</c>; populated by
/// <c>JsonSerializer.Deserialize</c> in <c>ExtractCompanyNamesAsync</c>.
/// </summary>
private sealed class CompoundNameResponse
{
/// <summary>True when the model judged the input to be one company.</summary>
public bool IsSingleCompany { get; set; }
/// <summary>Company names returned by the model; may be absent from the response.</summary>
public List<string>? Companies { get; set; }
/// <summary>The model's short explanation; only used for debug logging.</summary>
public string? Reasoning { get; set; }
}
} }

View File

@@ -2,6 +2,7 @@ using System.Text.Json;
using FuzzySharp; using FuzzySharp;
using Microsoft.EntityFrameworkCore; using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging; using Microsoft.Extensions.Logging;
using RealCV.Application.Data;
using RealCV.Application.DTOs; using RealCV.Application.DTOs;
using RealCV.Application.Helpers; using RealCV.Application.Helpers;
using RealCV.Application.Interfaces; using RealCV.Application.Interfaces;
@@ -93,11 +94,148 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
{ {
ArgumentException.ThrowIfNullOrWhiteSpace(companyName); ArgumentException.ThrowIfNullOrWhiteSpace(companyName);
_logger.LogDebug("Verifying company: {CompanyName}", companyName); // Normalize company name - strip trailing punctuation that causes matching issues
var normalizedName = NormalizeCompanyName(companyName);
_logger.LogDebug("Verifying company: {CompanyName} (normalized: {NormalizedName})", companyName, normalizedName);
var flags = new List<CompanyVerificationFlag>(); var flags = new List<CompanyVerificationFlag>();
// Check 1a: Is this a public sector employer?
if (UKHistoricalEmployers.IsPublicSectorEmployer(normalizedName))
{
_logger.LogInformation("Recognised public sector employer: {CompanyName}", companyName);
return new CompanyVerificationResult
{
ClaimedCompany = companyName,
MatchedCompanyName = companyName,
MatchedCompanyNumber = null,
MatchScore = 100,
IsVerified = true,
VerificationNotes = "Public sector employer - not registered at Companies House",
ClaimedStartDate = startDate,
ClaimedEndDate = endDate,
CompanyType = "public-sector",
CompanyStatus = "active",
ClaimedJobTitle = jobTitle,
Flags = flags
};
}
// Check 1b: Is this a charity or non-profit organisation?
if (UKHistoricalEmployers.IsCharityEmployer(normalizedName))
{
_logger.LogInformation("Recognised charity employer: {CompanyName}", companyName);
return new CompanyVerificationResult
{
ClaimedCompany = companyName,
MatchedCompanyName = companyName,
MatchedCompanyNumber = null,
MatchScore = 100,
IsVerified = true,
VerificationNotes = "Charity/non-profit organisation",
ClaimedStartDate = startDate,
ClaimedEndDate = endDate,
CompanyType = "charity",
CompanyStatus = "active",
ClaimedJobTitle = jobTitle,
Flags = flags
};
}
// Check 2: Is this an internal division of a larger company?
var parentCompany = UKHistoricalEmployers.GetParentCompanyForDivision(normalizedName);
if (parentCompany != null)
{
_logger.LogInformation("Recognised division '{CompanyName}' of parent company '{ParentCompany}'", companyName, parentCompany);
// Try to verify the parent company instead
var parentResult = await VerifyCompanyAsync(parentCompany, startDate, endDate, jobTitle);
if (parentResult.IsVerified)
{
return parentResult with
{
ClaimedCompany = companyName,
VerificationNotes = $"Internal division of {parentResult.MatchedCompanyName}"
};
}
// If parent verification failed, return a partial match
return new CompanyVerificationResult
{
ClaimedCompany = companyName,
MatchedCompanyName = parentCompany,
MatchedCompanyNumber = null,
MatchScore = 85,
IsVerified = true,
VerificationNotes = $"Recognised as division of {parentCompany}",
ClaimedStartDate = startDate,
ClaimedEndDate = endDate,
ClaimedJobTitle = jobTitle,
Flags = flags
};
}
// Check 3: Is this a known historical employer?
var historicalInfo = UKHistoricalEmployers.GetHistoricalEmployerInfo(normalizedName);
if (historicalInfo != null)
{
_logger.LogInformation("Recognised historical employer: {CompanyName} -> {Successor}", companyName, historicalInfo.SuccessorName);
// If we have a company number for the successor, try to get current details
if (!string.IsNullOrEmpty(historicalInfo.CompanyNumber))
{
try
{
var successorDetails = await _companiesHouseClient.GetCompanyAsync(historicalInfo.CompanyNumber);
if (successorDetails != null)
{
return new CompanyVerificationResult
{
ClaimedCompany = companyName,
MatchedCompanyName = $"{companyName} (now {successorDetails.CompanyName})",
MatchedCompanyNumber = historicalInfo.CompanyNumber,
MatchScore = 90,
IsVerified = true,
VerificationNotes = $"Historical company. {historicalInfo.Notes}",
ClaimedStartDate = startDate,
ClaimedEndDate = endDate,
CompanyType = successorDetails.Type,
CompanyStatus = "historical",
ClaimedJobTitle = jobTitle,
Flags = flags
};
}
}
catch (Exception ex)
{
_logger.LogWarning(ex, "Failed to fetch successor company details for {CompanyNumber}", historicalInfo.CompanyNumber);
}
}
// Return historical match without successor details
return new CompanyVerificationResult
{
ClaimedCompany = companyName,
MatchedCompanyName = $"{companyName} (now {historicalInfo.SuccessorName})",
MatchedCompanyNumber = historicalInfo.CompanyNumber,
MatchScore = 90,
IsVerified = true,
VerificationNotes = $"Historical company. {historicalInfo.Notes}",
ClaimedStartDate = startDate,
ClaimedEndDate = endDate,
CompanyStatus = "historical",
ClaimedJobTitle = jobTitle,
Flags = flags
};
}
// Check 4: Is this a compound company name (e.g., "ASDA/WALMART", "Corus & Laura Ashley Hotels")?
// Try to verify each part individually
var compoundResult = await TryVerifyCompoundNameAsync(normalizedName, companyName, startDate, endDate, jobTitle, flags);
if (compoundResult is not null)
{
return compoundResult;
}
// Try to find a cached match first (but only if it existed at claimed start date) // Try to find a cached match first (but only if it existed at claimed start date)
var cachedMatch = await FindCachedMatchAsync(companyName); var cachedMatch = await FindCachedMatchAsync(normalizedName);
if (cachedMatch is not null) if (cachedMatch is not null)
{ {
// Check if cached company existed at the claimed start date // Check if cached company existed at the claimed start date
@@ -119,9 +257,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// Search Companies House with fallback queries // Search Companies House with fallback queries
try try
{ {
var searchQueries = GenerateSearchQueries(companyName); var searchQueries = GenerateSearchQueries(normalizedName);
_logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}", _logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'"))); searchQueries.Count, normalizedName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));
// Collect all candidates from all search queries for AI matching // Collect all candidates from all search queries for AI matching
var allCandidates = new Dictionary<string, CompaniesHouseSearchItem>(); var allCandidates = new Dictionary<string, CompaniesHouseSearchItem>();
@@ -148,7 +286,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
} }
// Find fuzzy matches (as before) for fallback // Find fuzzy matches (as before) for fallback
var fuzzyMatch = FindBestMatch(companyName, query, searchResponse.Items, startDate); var fuzzyMatch = FindBestMatch(normalizedName, query, searchResponse.Items, startDate);
if (fuzzyMatch is not null) if (fuzzyMatch is not null)
{ {
fuzzyMatches.Add(fuzzyMatch.Value); fuzzyMatches.Add(fuzzyMatch.Value);
@@ -157,30 +295,47 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
if (allCandidates.Count == 0) if (allCandidates.Count == 0)
{ {
_logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count); _logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", normalizedName, searchQueries.Count);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records"); "Company name could not be verified against official records");
} }
// Use AI to find the best semantic match from all candidates // Use AI to find the best semantic match from all candidates
_logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", companyName, allCandidates.Count); _logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", normalizedName, allCandidates.Count);
// Sort candidates by fuzzy relevance to the search term before taking top 10
// This ensures the most likely matches are sent to the AI, not just arbitrary entries
var normalizedUpper = normalizedName.ToUpperInvariant();
var candidatesForAI = allCandidates.Values var candidatesForAI = allCandidates.Values
.Take(10) // Limit to top 10 candidates to reduce AI cost .Select(c => new
.Select(c => new CompanyCandidate
{ {
CompanyName = c.Title, Item = c,
CompanyNumber = c.CompanyNumber, Score = Fuzz.TokenSetRatio(normalizedUpper, c.Title.ToUpperInvariant())
CompanyStatus = c.CompanyStatus, })
DateOfCreation = c.DateOfCreation .OrderByDescending(x => x.Score)
.Take(10)
.Select(x => new CompanyCandidate
{
CompanyName = x.Item.Title,
CompanyNumber = x.Item.CompanyNumber,
CompanyStatus = x.Item.CompanyStatus,
DateOfCreation = x.Item.DateOfCreation
}) })
.ToList(); .ToList();
var aiResult = await _aiMatcher.FindBestMatchAsync(companyName, candidatesForAI); _logger.LogDebug("Top candidates for AI matching (sorted by relevance): {Candidates}",
string.Join(", ", candidatesForAI.Select(c => $"{c.CompanyName} [{c.CompanyNumber}]")));
var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI);
CompaniesHouseSearchItem? matchedItem = null; CompaniesHouseSearchItem? matchedItem = null;
int matchScore; int matchScore;
// Get best fuzzy match for potential fallback
var bestFuzzy = fuzzyMatches.Count > 0
? fuzzyMatches.OrderByDescending(m => m.Score).First()
: ((CompaniesHouseSearchItem Item, int Score)?)null;
if (aiResult is not null && aiResult.IsMatch) if (aiResult is not null && aiResult.IsMatch)
{ {
// AI found a valid match // AI found a valid match
@@ -195,21 +350,63 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// AI didn't find a match - check if it explicitly rejected or just failed // AI didn't find a match - check if it explicitly rejected or just failed
if (aiResult?.MatchType == "NoMatch") if (aiResult?.MatchType == "NoMatch")
{ {
_logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}", // AI explicitly rejected. Only override if fuzzy match passes strict validation:
companyName, aiResult?.Reasoning ?? "No match found"); // 1. High fuzzy score (>= 90%)
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, // 2. ALL core identifying words from original name appear in the match
"Company name could not be verified - no matching company found in official records"); // 3. Match doesn't have significantly more core words (prevents partial word matches)
} if (bestFuzzy.HasValue && bestFuzzy.Value.Score >= 90)
{
var originalCores = ExtractCoreIdentifiers(normalizedName);
var matchCores = ExtractCoreIdentifiers(bestFuzzy.Value.Item.Title);
// AI failed (API error, etc.) - fall back to fuzzy matching // All original core words must appear in the match
_logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", companyName); var allCoresPresent = originalCores.Count == 0 ||
var bestFuzzy = fuzzyMatches.OrderByDescending(m => m.Score).First(); originalCores.All(c => bestFuzzy.Value.Item.Title.Contains(c, StringComparison.OrdinalIgnoreCase));
matchedItem = bestFuzzy.Item;
matchScore = bestFuzzy.Score; // Match shouldn't have too many extra core words (max 2 extra, e.g., "GROUP PLC")
var extraCores = matchCores.Count(c => !originalCores.Any(o =>
c.Equals(o, StringComparison.OrdinalIgnoreCase)));
var reasonableExtras = extraCores <= 2;
if (allCoresPresent && reasonableExtras)
{
_logger.LogInformation(
"AI rejected '{CompanyName}' but fuzzy match '{MatchedName}' ({Score}%) passes validation. " +
"Original cores: [{OriginalCores}], Match cores: [{MatchCores}]",
normalizedName, bestFuzzy.Value.Item.Title, bestFuzzy.Value.Score,
string.Join(", ", originalCores), string.Join(", ", matchCores));
matchedItem = bestFuzzy.Value.Item;
matchScore = bestFuzzy.Value.Score;
}
else
{
_logger.LogDebug(
"AI rejected '{CompanyName}' and fuzzy match '{MatchedName}' fails validation. " +
"AllCoresPresent: {AllCores}, ExtraCores: {Extra}",
normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, extraCores);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified - no matching company found in official records");
}
}
else
{
_logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}",
normalizedName, aiResult?.Reasoning ?? "No match found");
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified - no matching company found in official records");
}
}
else
{
// AI failed (API error, etc.) - fall back to fuzzy matching
_logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", normalizedName);
matchedItem = bestFuzzy!.Value.Item;
matchScore = bestFuzzy!.Value.Score;
}
} }
else else
{ {
_logger.LogDebug("No valid match found for: {CompanyName}", companyName); _logger.LogDebug("No valid match found for: {CompanyName}", normalizedName);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records"); "Company name could not be verified against official records");
} }
@@ -624,6 +821,90 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
#region Helper Methods #region Helper Methods
/// <summary>
/// Normalizes a company name for matching: trims it, collapses every run of whitespace to a
/// single space and strips trailing punctuation (together with any space in front of it).
/// </summary>
/// <param name="companyName">Company name as written on the CV.</param>
/// <returns>
/// The cleaned-up name, e.g. "Glaxo Research &amp; Development Ltd." becomes
/// "Glaxo Research &amp; Development Ltd". A null, empty or whitespace-only input is returned unchanged.
/// </returns>
private static string NormalizeCompanyName(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return companyName;
    }

    // Collapse whitespace first so the trailing trim below only has to deal with plain spaces.
    var normalized = System.Text.RegularExpressions.Regex.Replace(companyName.Trim(), @"\s+", " ");

    // Strip trailing punctuation AND the spaces interleaved with it in a single pass, so that
    // inputs such as "Acme Ltd ." or "Acme Ltd. ," do not keep a dangling space or dot.
    normalized = normalized.TrimEnd('.', ',', ';', ':', '!', '?', ' ');

    return normalized;
}
/// <summary>
/// Attempts to verify compound company names by detecting if multiple companies are mentioned.
/// Only triggers for names with potential separators (/, &amp;, "and") to avoid unnecessary AI calls.
/// </summary>
/// <param name="normalizedName">Normalized company name; used for separator detection and sent to the AI.</param>
/// <param name="originalName">Name exactly as claimed on the CV; used for logging and as <c>ClaimedCompany</c>.</param>
/// <param name="startDate">Claimed employment start date, forwarded to the per-part verification.</param>
/// <param name="endDate">Claimed employment end date, forwarded to the per-part verification.</param>
/// <param name="jobTitle">Claimed job title, forwarded to the per-part verification.</param>
/// <param name="flags">Flags collected so far by the caller; not read by this method.</param>
/// <returns>
/// The verification result of the first extracted part that verifies, re-labelled with the original
/// claimed name; <c>null</c> when the name has no separator, the AI does not report a compound name,
/// or no part can be verified.
/// </returns>
private async Task<CompanyVerificationResult?> TryVerifyCompoundNameAsync(
    string normalizedName,
    string originalName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    List<CompanyVerificationFlag> flags)
{
    // Quick check: only process names that might be compound
    // Look for separators that could indicate multiple companies
    var hasPotentialSeparator = normalizedName.Contains('/')
        || normalizedName.Contains(" & ")
        || normalizedName.Contains(" and ", StringComparison.OrdinalIgnoreCase);
    if (!hasPotentialSeparator)
    {
        return null;
    }

    // Use AI to determine if this is a compound name and extract parts
    var extractedParts = await _aiMatcher.ExtractCompanyNamesAsync(normalizedName);
    if (extractedParts is null || extractedParts.Count < 2)
    {
        // AI determined this is a single company (e.g., "Ernst & Young")
        return null;
    }

    _logger.LogDebug("AI detected compound company name '{Name}', extracted parts: {Parts}",
        originalName, string.Join(", ", extractedParts.Select(p => $"'{p}'")));

    // Try to verify each extracted part - return success on first match
    foreach (var rawPart in extractedParts)
    {
        // The parts come from model output: tolerate null/blank entries and stray whitespace,
        // which would otherwise throw here or inside VerifyCompanyAsync's argument check.
        var part = rawPart?.Trim();

        // Skip parts that are missing or too short
        if (string.IsNullOrEmpty(part) || part.Length < 3)
        {
            continue;
        }

        // A part identical to the name being split would send VerifyCompanyAsync straight back
        // into this method with the same input, recursing without bound.
        if (NormalizeCompanyName(part).Equals(normalizedName, StringComparison.OrdinalIgnoreCase))
        {
            _logger.LogDebug("Skipping compound part '{Part}' because it is identical to the full name", part);
            continue;
        }

        _logger.LogDebug("Trying to verify compound part: '{Part}'", part);

        // Recursively verify this part
        var partResult = await VerifyCompanyAsync(part, startDate, endDate, jobTitle);
        if (partResult.IsVerified)
        {
            _logger.LogInformation("Compound name '{Original}' verified via part '{Part}' -> {Match}",
                originalName, part, partResult.MatchedCompanyName);
            return partResult with
            {
                ClaimedCompany = originalName,
                VerificationNotes = $"Verified via '{part}': {partResult.VerificationNotes ?? partResult.MatchedCompanyName}"
            };
        }
    }

    // None of the parts could be verified
    _logger.LogDebug("No parts of compound name '{Name}' could be verified", originalName);
    return null;
}
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName) private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
{ {
var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays); var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);
@@ -790,12 +1071,13 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var searchText = originalLower + " " + queryLower; var searchText = originalLower + " " + queryLower;
// Penalize subsidiary indicators (unless search explicitly included them) // Penalize subsidiary indicators (unless search explicitly included them)
// Use word boundary matching to avoid "SCOTLAND" matching "land"
foreach (var indicator in SubsidiaryIndicators) foreach (var indicator in SubsidiaryIndicators)
{ {
if (itemTitleLower.Contains(indicator)) if (ContainsWholeWord(itemTitleLower, indicator))
{ {
// Only penalize if the search didn't explicitly include this indicator // Only penalize if the search didn't explicitly include this indicator
if (!searchText.Contains(indicator)) if (!ContainsWholeWord(searchText, indicator))
{ {
score -= 10; // Significant penalty for subsidiaries score -= 10; // Significant penalty for subsidiaries
} }
@@ -806,7 +1088,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// Boost main company indicators // Boost main company indicators
foreach (var indicator in MainCompanyIndicators) foreach (var indicator in MainCompanyIndicators)
{ {
if (itemTitleLower.Contains(indicator)) if (ContainsWholeWord(itemTitleLower, indicator))
{ {
score += 5; // Boost for main trading companies score += 5; // Boost for main trading companies
break; // Only apply one boost break; // Only apply one boost
@@ -1168,7 +1450,10 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
} }
// Check if the item matches any pattern in this non-employment category // Check if the item matches any pattern in this non-employment category
if (patterns.Any(pattern => itemTitleLower.Contains(pattern))) // Use whole-word matching for single words, substring for multi-word patterns
if (patterns.Any(pattern => pattern.Contains(' ')
? itemTitleLower.Contains(pattern)
: ContainsWholeWord(itemTitleLower, pattern)))
{ {
return false; // This is a non-employment entity type that wasn't explicitly searched for return false; // This is a non-employment entity type that wasn't explicitly searched for
} }
@@ -1177,6 +1462,19 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
return true; // No non-employment patterns matched, this is likely a valid employment entity return true; // No non-employment patterns matched, this is likely a valid employment entity
} }
/// <summary>
/// Checks if a string contains a word as a whole word (not as a substring of another word).
/// E.g., "scotland" does NOT contain whole word "land", but "land holdings" does.
/// </summary>
/// <param name="text">Text to search; matching is case-insensitive.</param>
/// <param name="word">Word or phrase to look for; it is regex-escaped, so it is matched literally.</param>
/// <returns>
/// <c>true</c> when <paramref name="word"/> occurs in <paramref name="text"/> without a word
/// character (letter, digit or underscore) directly before or after it; <c>false</c> otherwise,
/// including when either argument is null or empty.
/// </returns>
private static bool ContainsWholeWord(string text, string word)
{
    if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(word))
    {
        return false;
    }

    // Lookarounds instead of \b: \b needs a word character on one side, so it misbehaves when the
    // searched word itself starts or ends with punctuation ("co." was not found in "acme co. ltd"
    // yet was found inside "acme.co.uk"). For plain alphanumeric words the two forms are equivalent.
    var pattern = $@"(?<!\w){System.Text.RegularExpressions.Regex.Escape(word)}(?!\w)";
    return System.Text.RegularExpressions.Regex.IsMatch(text, pattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}
// Expanded skip words list for core identifier extraction // Expanded skip words list for core identifier extraction
// These words are too common to be meaningful differentiators between companies // These words are too common to be meaningful differentiators between companies
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase) private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
@@ -1220,8 +1518,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
"new", "modern", "advanced", "innovative", "premier", "elite", "premium", "new", "modern", "advanced", "innovative", "premier", "elite", "premium",
"quality", "superior", "excellent", "best", "top", "leading", "major", "quality", "superior", "excellent", "best", "top", "leading", "major",
// Ownership indicators // Ownership indicators (excluding "royal" as it's a meaningful company identifier)
"royal", "imperial", "crown", "state", "public", "private", "independent", "imperial", "crown", "state", "public", "private", "independent",
"mutual", "cooperative", "coop", "community", "mutual", "cooperative", "coop", "community",
// Time-related // Time-related
@@ -1235,7 +1533,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
/// Extracts ALL core identifying words from a company name. /// Extracts ALL core identifying words from a company name.
/// These are significant words that aren't common prefixes/suffixes. /// These are significant words that aren't common prefixes/suffixes.
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"] /// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"]
/// "Bank of Scotland" -> ["BANK", "SCOTLAND"] /// "Royal Bank of Scotland" -> ["ROYAL", "BANK"] (Scotland is a geographic skipWord)
/// </summary> /// </summary>
private static List<string> ExtractCoreIdentifiers(string companyName) private static List<string> ExtractCoreIdentifiers(string companyName)
{ {

View File

@@ -0,0 +1,329 @@
using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Data;
using RealCV.Infrastructure.ExternalApis;
using RealCV.Infrastructure.Services;
using RealCV.Infrastructure.Configuration;
namespace RealCV.Tests.Integration;
/// <summary>
/// Test utility to batch process CVs and output verification findings.
/// Run with: dotnet test --filter "FullyQualifiedName~CVBatchTester" -- TestRunParameters.Parameter(name=\"CvFolder\", value=\"/path/to/cvs\")
/// Or use the ProcessFolder method directly.
/// </summary>
public class CVBatchTester
{
private readonly IServiceProvider _serviceProvider;
/// <summary>
/// Builds the dependency-injection container used by the batch tester: registers everything
/// through <c>ConfigureServices</c> and keeps the resulting provider for later scope creation.
/// </summary>
public CVBatchTester()
{
    IServiceCollection registrations = new ServiceCollection();
    ConfigureServices(registrations);

    _serviceProvider = registrations.BuildServiceProvider();
}
/// <summary>
/// Registers logging, the database context factory, the Companies House and Anthropic clients
/// and the verification services needed to process CVs.
/// </summary>
/// <param name="services">Service collection to populate.</param>
/// <exception cref="InvalidOperationException">
/// No <c>DefaultConnection</c> connection string is configured.
/// </exception>
private static void ConfigureServices(IServiceCollection services)
{
    // Load configuration: JSON files in the current directory first, environment variables last
    // (so environment variables override the files).
    var configuration = new ConfigurationBuilder()
        .SetBasePath(Directory.GetCurrentDirectory())
        .AddJsonFile("appsettings.json", optional: true)
        .AddJsonFile("appsettings.Development.json", optional: true)
        .AddEnvironmentVariables()
        .Build();

    // Logging
    services.AddLogging(builder =>
    {
        builder.AddConsole();
        builder.SetMinimumLevel(LogLevel.Information);
    });

    // Database
    // Credentials must never live in source control: require the connection string from
    // configuration and fail fast with an actionable message instead of falling back to a
    // hard-coded SA password.
    var connectionString = configuration.GetConnectionString("DefaultConnection");
    if (string.IsNullOrWhiteSpace(connectionString))
    {
        throw new InvalidOperationException(
            "Connection string 'DefaultConnection' is not configured. Set it in appsettings.json, " +
            "appsettings.Development.json or the ConnectionStrings__DefaultConnection environment variable.");
    }

    services.AddDbContextFactory<ApplicationDbContext>(options =>
        options.UseSqlServer(connectionString));

    // Companies House
    services.Configure<CompaniesHouseSettings>(options =>
    {
        options.BaseUrl = configuration["CompaniesHouse:BaseUrl"] ?? "https://api.company-information.service.gov.uk";
        options.ApiKey = configuration["CompaniesHouse:ApiKey"] ?? "";
    });
    services.AddHttpClient<CompaniesHouseClient>();

    // Anthropic (for AI matching)
    services.Configure<AnthropicSettings>(options =>
    {
        options.ApiKey = configuration["Anthropic:ApiKey"] ?? "";
    });
    services.AddHttpClient<AnthropicClient>();
    // NOTE(review): confirm CompanyNameMatcherService is the intended implementation here; the
    // matcher that implements ExtractCompanyNamesAsync appears to be AICompanyNameMatcherService.
    services.AddScoped<ICompanyNameMatcherService, CompanyNameMatcherService>();

    // Services
    services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
    services.AddScoped<IEducationVerifierService, EducationVerifierService>();
    services.AddScoped<ICVParserService, CVParserService>();
}
/// <summary>
/// Runs verification over every CV document (.pdf, .docx, .doc) found directly inside a folder,
/// printing per-file and overall summaries to the console as it goes.
/// </summary>
/// <param name="folderPath">Folder to scan; sub-folders are not searched.</param>
/// <returns>
/// One summary per CV file; a file whose processing threw gets a summary carrying the error message.
/// </returns>
/// <exception cref="DirectoryNotFoundException"><paramref name="folderPath"/> does not exist.</exception>
public async Task<List<CVVerificationSummary>> ProcessFolderAsync(string folderPath)
{
    if (!Directory.Exists(folderPath))
    {
        throw new DirectoryNotFoundException($"Folder not found: {folderPath}");
    }

    var supportedExtensions = new[] { ".pdf", ".docx", ".doc" };
    var heavyRule = new string('=', 80);
    var lightRule = new string('-', 60);

    var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
        .Where(path => supportedExtensions.Any(ext => path.EndsWith(ext, StringComparison.OrdinalIgnoreCase)))
        .ToList();

    Console.WriteLine($"Found {cvFiles.Count} CV files in {folderPath}");
    Console.WriteLine(heavyRule);

    var summaries = new List<CVVerificationSummary>();
    foreach (var cvFile in cvFiles)
    {
        var fileName = Path.GetFileName(cvFile);
        Console.WriteLine($"\nProcessing: {fileName}");
        Console.WriteLine(lightRule);

        try
        {
            var summary = await ProcessSingleCVAsync(cvFile);
            summaries.Add(summary);
            PrintSummary(summary);
        }
        catch (Exception ex)
        {
            // Keep the batch going: record the failure against this file and move on.
            Console.WriteLine($"ERROR: {ex.Message}");
            summaries.Add(new CVVerificationSummary
            {
                FileName = fileName,
                Error = ex.Message
            });
        }
    }

    // Print overall summary
    Console.WriteLine("\n" + heavyRule);
    Console.WriteLine("OVERALL SUMMARY");
    Console.WriteLine(heavyRule);
    PrintOverallSummary(summaries);

    return summaries;
}
/// <summary>
/// Parses a single CV file and verifies the employers and education entries it lists.
/// </summary>
/// <param name="filePath">Path of the CV document to open and parse.</param>
/// <returns>The verification summary for this CV.</returns>
private async Task<CVVerificationSummary> ProcessSingleCVAsync(string filePath)
{
    // Services are resolved from a scope that lives only for this one CV.
    using var scope = _serviceProvider.CreateScope();
    var scoped = scope.ServiceProvider;
    var cvParser = scoped.GetRequiredService<ICVParserService>();
    var companyVerifier = scoped.GetRequiredService<ICompanyVerifierService>();
    var educationVerifier = scoped.GetRequiredService<IEducationVerifierService>();

    // Parse the document.
    await using var fileStream = File.OpenRead(filePath);
    var fileName = Path.GetFileName(filePath);
    var parsedCV = await cvParser.ParseAsync(fileStream, fileName);

    var summary = new CVVerificationSummary
    {
        FileName = fileName,
        CandidateName = parsedCV.PersonalInfo?.FullName ?? "Unknown"
    };

    // Employers: each entry is verified independently so one failure is
    // recorded against that employer only.
    if (parsedCV.Employment is { Count: > 0 } jobs)
    {
        foreach (var job in jobs)
        {
            EmployerVerificationSummary entry;
            try
            {
                var check = await companyVerifier.VerifyCompanyAsync(
                    job.CompanyName,
                    job.StartDate,
                    job.EndDate,
                    job.JobTitle);

                entry = new EmployerVerificationSummary
                {
                    ClaimedName = job.CompanyName,
                    MatchedName = check.MatchedCompanyName,
                    CompanyNumber = check.MatchedCompanyNumber,
                    IsVerified = check.IsVerified,
                    MatchScore = check.MatchScore,
                    Notes = check.VerificationNotes,
                    Status = check.CompanyStatus
                };
            }
            catch (Exception ex)
            {
                entry = new EmployerVerificationSummary
                {
                    ClaimedName = job.CompanyName,
                    IsVerified = false,
                    Notes = $"Error: {ex.Message}"
                };
            }

            summary.EmployerResults.Add(entry);
        }
    }

    // Education: all entries are handed to the verifier in a single call.
    if (parsedCV.Education is { Count: > 0 } courses)
    {
        var entries = courses
            .Select(course => new EducationEntry
            {
                Institution = course.Institution,
                Qualification = course.Qualification,
                Subject = course.Subject,
                StartDate = course.StartDate,
                EndDate = course.EndDate
            })
            .ToList();

        foreach (var outcome in educationVerifier.VerifyAll(entries))
        {
            summary.EducationResults.Add(new EducationVerificationSummary
            {
                ClaimedInstitution = outcome.ClaimedInstitution,
                MatchedInstitution = outcome.MatchedInstitution,
                Qualification = outcome.ClaimedQualification,
                IsVerified = outcome.IsVerified,
                Status = outcome.Status,
                Notes = outcome.VerificationNotes
            });
        }
    }

    return summary;
}
/// <summary>
/// Writes the per-CV report to the console: the candidate name, then one
/// tick/cross entry per employer and per education record.
/// </summary>
/// <param name="summary">Verification results for a single CV.</param>
private static void PrintSummary(CVVerificationSummary summary)
{
    const string tick = "✓";
    const string cross = "✗";

    Console.WriteLine($"Candidate: {summary.CandidateName}");

    Console.WriteLine($"\n EMPLOYERS ({summary.EmployerResults.Count}):");
    foreach (var employer in summary.EmployerResults)
    {
        string marker;
        string detail;
        if (employer.IsVerified)
        {
            marker = tick;
            detail = $"-> {employer.MatchedName} ({employer.MatchScore}%)";
        }
        else
        {
            marker = cross;
            detail = employer.Notes ?? "Not found";
        }

        Console.WriteLine($" {marker} {employer.ClaimedName}");
        Console.WriteLine($" {detail}");
    }

    Console.WriteLine($"\n EDUCATION ({summary.EducationResults.Count}):");
    foreach (var course in summary.EducationResults)
    {
        var marker = course.IsVerified ? tick : cross;

        // The matched name is shown only for verified entries that have one;
        // otherwise fall back to the notes, then the status.
        string? detail;
        if (course.IsVerified && course.MatchedInstitution != null)
        {
            detail = $"-> {course.MatchedInstitution}";
        }
        else
        {
            detail = course.Notes ?? course.Status;
        }

        Console.WriteLine($" {marker} {course.ClaimedInstitution}");
        Console.WriteLine($" {course.Qualification}");
        Console.WriteLine($" {detail}");
    }
}
/// <summary>
/// Writes batch-wide totals to the console, followed by the most frequently
/// occurring unverified employers and institutions (at most 20 of each).
/// </summary>
/// <param name="results">Summaries for every CV in the batch, including failed ones.</param>
private static void PrintOverallSummary(List<CVVerificationSummary> results)
{
    // Whole-number percentage; 0 when there is nothing to divide by.
    static int Percent(int part, int whole) => whole > 0 ? part * 100 / whole : 0;

    // Groups identical names and orders the groups by how often they occur.
    static List<IGrouping<string, string>> RankByFrequency(IEnumerable<string> names) =>
        names.GroupBy(name => name)
             .OrderByDescending(bucket => bucket.Count())
             .ToList();

    // Prints the first 20 groups with their occurrence counts.
    static void PrintTop(List<IGrouping<string, string>> ranked)
    {
        foreach (var group in ranked.Take(20))
        {
            Console.WriteLine($" - {group.Key} (x{group.Count()})");
        }
    }

    var successfulCVs = results.Count(r => r.Error == null);

    var employers = results.SelectMany(r => r.EmployerResults).ToList();
    var totalEmployers = employers.Count;
    var verifiedEmployers = employers.Count(e => e.IsVerified);

    var education = results.SelectMany(r => r.EducationResults).ToList();
    var totalEducation = education.Count;
    var verifiedEducation = education.Count(e => e.IsVerified);

    Console.WriteLine($"CVs Processed: {successfulCVs}/{results.Count}");
    Console.WriteLine($"Employers: {verifiedEmployers}/{totalEmployers} verified ({Percent(verifiedEmployers, totalEmployers)}%)");
    Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({Percent(verifiedEducation, totalEducation)}%)");

    // Unverified employers, most frequent first.
    var unverifiedEmployers = RankByFrequency(
        employers.Where(e => !e.IsVerified).Select(e => e.ClaimedName));
    if (unverifiedEmployers.Count > 0)
    {
        Console.WriteLine($"\nUNVERIFIED EMPLOYERS ({unverifiedEmployers.Count} unique):");
        PrintTop(unverifiedEmployers);
    }

    // Unverified institutions, most frequent first.
    var unverifiedEducation = RankByFrequency(
        education.Where(e => !e.IsVerified).Select(e => e.ClaimedInstitution));
    if (unverifiedEducation.Count > 0)
    {
        Console.WriteLine($"\nUNVERIFIED INSTITUTIONS ({unverifiedEducation.Count} unique):");
        PrintTop(unverifiedEducation);
    }
}
/// <summary>
/// Serializes the batch results as indented JSON and writes them to a file.
/// </summary>
/// <param name="results">Summaries to serialize.</param>
/// <param name="outputPath">Destination file path.</param>
public static void ExportToJson(List<CVVerificationSummary> results, string outputPath)
{
    var indented = new JsonSerializerOptions { WriteIndented = true };
    var payload = JsonSerializer.Serialize(results, indented);

    File.WriteAllText(outputPath, payload);
    Console.WriteLine($"\nResults exported to: {outputPath}");
}
}
/// <summary>
/// Verification outcome for one CV file: who the candidate is, what was
/// checked, and the error message if the file could not be processed.
/// </summary>
public class CVVerificationSummary
{
    /// <summary>File name of the CV.</summary>
    public string FileName { get; set; } = string.Empty;

    /// <summary>Candidate name; empty until populated.</summary>
    public string CandidateName { get; set; } = string.Empty;

    /// <summary>Error message when processing failed; <c>null</c> otherwise.</summary>
    public string? Error { get; set; }

    /// <summary>One entry per employer listed on the CV.</summary>
    public List<EmployerVerificationSummary> EmployerResults { get; set; } = new List<EmployerVerificationSummary>();

    /// <summary>One entry per education record listed on the CV.</summary>
    public List<EducationVerificationSummary> EducationResults { get; set; } = new List<EducationVerificationSummary>();
}
/// <summary>
/// Result of checking a single employer claimed on a CV.
/// </summary>
public class EmployerVerificationSummary
{
    /// <summary>Employer name as claimed.</summary>
    public string ClaimedName { get; set; } = string.Empty;

    /// <summary>Name of the matched company, if any.</summary>
    public string? MatchedName { get; set; }

    /// <summary>Company number of the matched company, if any.</summary>
    public string? CompanyNumber { get; set; }

    /// <summary>Whether the employer was verified.</summary>
    public bool IsVerified { get; set; }

    /// <summary>Match score; printed as a percentage by the console report.</summary>
    public int MatchScore { get; set; }

    /// <summary>Free-text verification notes, or an error description.</summary>
    public string? Notes { get; set; }

    /// <summary>Status of the matched company, if known.</summary>
    public string? Status { get; set; }
}
/// <summary>
/// Result of checking a single education record claimed on a CV.
/// </summary>
public class EducationVerificationSummary
{
    /// <summary>Institution name as claimed.</summary>
    public string ClaimedInstitution { get; set; } = string.Empty;

    /// <summary>Name of the matched institution, if any.</summary>
    public string? MatchedInstitution { get; set; }

    /// <summary>Qualification as claimed.</summary>
    public string? Qualification { get; set; }

    /// <summary>Whether the record was verified.</summary>
    public bool IsVerified { get; set; }

    /// <summary>Verification status text, if any.</summary>
    public string? Status { get; set; }

    /// <summary>Free-text verification notes, if any.</summary>
    public string? Notes { get; set; }
}

View File

@@ -0,0 +1,15 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Builds a console executable targeting .NET 8, with implicit global usings and nullable reference types enabled. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- Project references (paths relative to this file) to the RealCV application and infrastructure projects. -->
<ItemGroup>
<ProjectReference Include="../../src/RealCV.Application/RealCV.Application.csproj" />
<ProjectReference Include="../../src/RealCV.Infrastructure/RealCV.Infrastructure.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,302 @@
using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Configuration;
using RealCV.Infrastructure.Data;
using RealCV.Infrastructure.ExternalApis;
using RealCV.Infrastructure.Services;
namespace CVBatchTester;
class Program
{
/// <summary>
/// Entry point. Verifies every CV in a folder, printing per-file employer and
/// education results followed by an overall summary.
/// </summary>
/// <param name="args">
/// The first argument, when present, is the folder to scan; otherwise the user
/// is prompted for it.
/// </param>
/// <returns>
/// 0 when the batch ran to completion (individual files may still have failed);
/// 1 when the folder does not exist or contains no CV files.
/// </returns>
static async Task<int> Main(string[] args)
{
    var folderPath = args.FirstOrDefault() ?? AskForFolder();
    if (string.IsNullOrEmpty(folderPath) || !Directory.Exists(folderPath))
    {
        Console.WriteLine($"Error: Folder not found: {folderPath}");
        Console.WriteLine("Usage: CVBatchTester <folder-path>");
        Console.WriteLine(" e.g. CVBatchTester /home/user/cvs");
        return 1;
    }

    Console.WriteLine($"CV Batch Verification Tester");
    Console.WriteLine($"Processing CVs from: {folderPath}");
    Console.WriteLine(new string('=', 80));

    // Setup DI. The provider owns the disposable services it creates, so it
    // must be disposed on every exit path, including the early return below.
    var services = new ServiceCollection();
    ConfigureServices(services);
    await using var provider = services.BuildServiceProvider();

    // Find CV files (top level only), in a stable alphabetical order.
    var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
        .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
                    f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase) ||
                    f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
        .OrderBy(f => f)
        .ToList();

    Console.WriteLine($"Found {cvFiles.Count} CV files\n");
    if (cvFiles.Count == 0)
    {
        Console.WriteLine("No CV files found (.pdf, .docx, .doc)");
        return 1;
    }

    // Track results across the whole batch.
    var allUnverifiedEmployers = new List<string>();
    var allUnverifiedInstitutions = new List<string>();
    var totalEmployers = 0;
    var verifiedEmployers = 0;
    var totalEducation = 0;
    var verifiedEducation = 0;
    var processedCount = 0;
    var errorCount = 0;

    foreach (var cvFile in cvFiles)
    {
        Console.WriteLine($"\n{new string('=', 80)}");
        Console.WriteLine($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
        Console.WriteLine(new string('=', 80));

        try
        {
            // Services are resolved from a scope that lives only for this one CV.
            using var scope = provider.CreateScope();
            var parser = scope.ServiceProvider.GetRequiredService<ICVParserService>();
            var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
            var eduVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();

            // Parse CV
            await using var stream = File.OpenRead(cvFile);
            var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
            Console.WriteLine($"Candidate: {cv.FullName}");

            // Verify Employers
            if (cv.Employment?.Count > 0)
            {
                Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
                Console.WriteLine(new string('-', 60));

                foreach (var emp in cv.Employment)
                {
                    totalEmployers++;
                    try
                    {
                        var result = await companyVerifier.VerifyCompanyAsync(
                            emp.CompanyName,
                            emp.StartDate,
                            emp.EndDate,
                            emp.JobTitle);

                        var icon = result.IsVerified ? "✓" : "✗";
                        var period = FormatPeriod(emp.StartDate, emp.EndDate);
                        Console.WriteLine($"\n {icon} {emp.CompanyName}");
                        Console.WriteLine($" Period: {period}");
                        Console.WriteLine($" Role: {emp.JobTitle}");

                        if (result.IsVerified)
                        {
                            verifiedEmployers++;
                            Console.WriteLine($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
                            if (!string.IsNullOrEmpty(result.MatchedCompanyNumber))
                            {
                                Console.WriteLine($" Company #: {result.MatchedCompanyNumber}");
                            }
                            if (!string.IsNullOrEmpty(result.CompanyStatus))
                            {
                                Console.WriteLine($" Status: {result.CompanyStatus}");
                            }
                        }
                        else
                        {
                            allUnverifiedEmployers.Add(emp.CompanyName);
                        }

                        if (!string.IsNullOrEmpty(result.VerificationNotes))
                        {
                            Console.WriteLine($" Note: {result.VerificationNotes}");
                        }
                    }
                    catch (Exception ex)
                    {
                        // A failed lookup counts as unverified; the remaining
                        // employers on this CV are still processed.
                        Console.WriteLine($"\n ✗ {emp.CompanyName}");
                        Console.WriteLine($" ERROR: {ex.Message}");
                        allUnverifiedEmployers.Add(emp.CompanyName);
                    }
                }
            }

            // Verify Education
            if (cv.Education?.Count > 0)
            {
                Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");
                Console.WriteLine(new string('-', 60));

                var eduEntries = cv.Education.Select(e => new EducationEntry
                {
                    Institution = e.Institution,
                    Qualification = e.Qualification,
                    Subject = e.Subject,
                    StartDate = e.StartDate,
                    EndDate = e.EndDate
                }).ToList();

                var eduResults = eduVerifier.VerifyAll(eduEntries);
                foreach (var result in eduResults)
                {
                    totalEducation++;
                    var icon = result.IsVerified ? "✓" : "✗";
                    Console.WriteLine($"\n {icon} {result.ClaimedInstitution}");
                    Console.WriteLine($" Qualification: {result.ClaimedQualification}");
                    if (!string.IsNullOrEmpty(result.ClaimedSubject))
                    {
                        Console.WriteLine($" Subject: {result.ClaimedSubject}");
                    }

                    if (result.IsVerified)
                    {
                        verifiedEducation++;
                        // Only show the match when it differs from what was claimed.
                        if (result.MatchedInstitution != null &&
                            !result.MatchedInstitution.Equals(result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase))
                        {
                            Console.WriteLine($" Match: {result.MatchedInstitution}");
                        }
                    }
                    else
                    {
                        allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
                        Console.WriteLine($" Status: {result.Status}");
                    }

                    if (!string.IsNullOrEmpty(result.VerificationNotes))
                    {
                        Console.WriteLine($" Note: {result.VerificationNotes}");
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // A failing file must not abort the batch.
            errorCount++;
            Console.WriteLine($"ERROR processing file: {ex.Message}");
        }
    }

    // Print Summary
    Console.WriteLine($"\n\n{new string('=', 80)}");
    Console.WriteLine("VERIFICATION SUMMARY");
    Console.WriteLine(new string('=', 80));
    Console.WriteLine($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}");
    if (errorCount > 0)
    {
        Console.WriteLine($"Errors: {errorCount}");
    }

    // Whole-number percentages; 0 when there is nothing to divide by.
    var empRate = totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0;
    var eduRate = totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0;
    Console.WriteLine($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)");
    Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)");

    // List unverified employers: grouped case-insensitively, most frequent first.
    var uniqueUnverifiedEmployers = allUnverifiedEmployers
        .GroupBy(e => e, StringComparer.OrdinalIgnoreCase)
        .OrderByDescending(g => g.Count())
        .ThenBy(g => g.Key)
        .ToList();
    if (uniqueUnverifiedEmployers.Count > 0)
    {
        Console.WriteLine($"\n{new string('-', 60)}");
        Console.WriteLine($"UNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count} unique):");
        foreach (var group in uniqueUnverifiedEmployers)
        {
            var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
            Console.WriteLine($" - {group.Key}{count}");
        }
    }

    // List unverified institutions: grouped case-insensitively, most frequent first.
    var uniqueUnverifiedInstitutions = allUnverifiedInstitutions
        .GroupBy(i => i, StringComparer.OrdinalIgnoreCase)
        .OrderByDescending(g => g.Count())
        .ThenBy(g => g.Key)
        .ToList();
    if (uniqueUnverifiedInstitutions.Count > 0)
    {
        Console.WriteLine($"\n{new string('-', 60)}");
        Console.WriteLine($"UNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count} unique):");
        foreach (var group in uniqueUnverifiedInstitutions)
        {
            var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
            Console.WriteLine($" - {group.Key}{count}");
        }
    }

    Console.WriteLine($"\n{new string('=', 80)}");
    return 0;
}
/// <summary>
/// Prompts on the console for the CV folder and returns what the user typed.
/// </summary>
/// <returns>The entered line, or an empty string when standard input has ended.</returns>
static string AskForFolder()
{
    Console.Write("Enter CV folder path: ");
    var typed = Console.ReadLine();
    return typed is null ? "" : typed;
}
/// <summary>
/// Formats an employment period as "start - end" using abbreviated month and year.
/// </summary>
/// <param name="start">Start date; rendered as "?" when missing.</param>
/// <param name="end">End date; rendered as "Present" when missing.</param>
/// <returns>The formatted period, e.g. "? - Present" when both dates are missing.</returns>
static string FormatPeriod(DateOnly? start, DateOnly? end)
{
    const string monthYear = "MMM yyyy";

    var from = start.HasValue ? start.Value.ToString(monthYear) : "?";
    var to = end.HasValue ? end.Value.ToString(monthYear) : "Present";

    return from + " - " + to;
}
/// <summary>
/// Registers everything the batch tester needs: logging, the database context
/// factory, the external API clients and the CV verification services.
/// Configuration is read from the web project's appsettings files.
/// </summary>
/// <param name="services">Service collection to populate.</param>
/// <exception cref="InvalidOperationException">
/// Thrown when no database connection string is available from configuration
/// or from the <c>ConnectionStrings__DefaultConnection</c> environment variable.
/// </exception>
static void ConfigureServices(IServiceCollection services)
{
    // Load configuration - try multiple locations, first existing directory wins.
    var configPaths = new[]
    {
        "/var/www/realcv",
        "/git/RealCV/src/RealCV.Web",
        Path.GetFullPath(Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "..", "src", "RealCV.Web"))
    };
    var webProjectPath = configPaths.FirstOrDefault(Directory.Exists) ?? "/git/RealCV/src/RealCV.Web";
    Console.WriteLine($"Loading config from: {webProjectPath}");

    // Later files override earlier ones; all are optional.
    var configuration = new ConfigurationBuilder()
        .SetBasePath(webProjectPath)
        .AddJsonFile("appsettings.json", optional: true)
        .AddJsonFile("appsettings.Development.json", optional: true)
        .AddJsonFile("appsettings.Production.json", optional: true)
        .Build();

    // Logging - minimal output
    services.AddLogging(builder =>
    {
        builder.AddConsole();
        builder.SetMinimumLevel(LogLevel.Warning);
    });

    // Database. Credentials must come from configuration or the environment;
    // a password embedded in source ends up in version control and in every build.
    var connectionString = configuration.GetConnectionString("DefaultConnection");
    if (string.IsNullOrWhiteSpace(connectionString))
    {
        connectionString = Environment.GetEnvironmentVariable("ConnectionStrings__DefaultConnection");
    }
    if (string.IsNullOrWhiteSpace(connectionString))
    {
        throw new InvalidOperationException(
            "No database connection string configured. Set ConnectionStrings:DefaultConnection in appsettings " +
            "or the ConnectionStrings__DefaultConnection environment variable.");
    }
    services.AddDbContextFactory<ApplicationDbContext>(options =>
        options.UseSqlServer(connectionString));

    // Companies House - use configuration binding
    services.Configure<CompaniesHouseSettings>(configuration.GetSection(CompaniesHouseSettings.SectionName));
    services.AddHttpClient<CompaniesHouseClient>();

    // Anthropic - use configuration binding
    services.Configure<AnthropicSettings>(configuration.GetSection(AnthropicSettings.SectionName));
    services.AddScoped<ICompanyNameMatcherService, AICompanyNameMatcherService>();

    // Services
    services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
    services.AddScoped<IEducationVerifierService, EducationVerifierService>();
    services.AddScoped<ICVParserService, CVParserService>();
}
}

195
tools/batch-test-cvs.cs Normal file
View File

@@ -0,0 +1,195 @@
#!/usr/bin/env dotnet-script
#r "nuget: Microsoft.EntityFrameworkCore.SqlServer, 8.0.0"
#r "nuget: Microsoft.Extensions.Configuration.Json, 8.0.0"
#r "nuget: Microsoft.Extensions.DependencyInjection, 8.0.0"
#r "nuget: Microsoft.Extensions.Logging.Console, 8.0.0"
#r "../src/RealCV.Application/bin/Debug/net8.0/RealCV.Application.dll"
#r "../src/RealCV.Infrastructure/bin/Debug/net8.0/RealCV.Infrastructure.dll"
#r "../src/RealCV.Domain/bin/Debug/net8.0/RealCV.Domain.dll"
// This is a dotnet-script file. Run with: dotnet script batch-test-cvs.cs -- /path/to/cvs
// Install dotnet-script: dotnet tool install -g dotnet-script
using System;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using System.Collections.Generic;
using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Data;
using RealCV.Infrastructure.Services;
using RealCV.Infrastructure.ExternalApis;
using RealCV.Infrastructure.Configuration;
// Folder to scan: first script argument, or a default test location.
var folderPath = Args.FirstOrDefault() ?? "/tmp/test-cvs";
if (!Directory.Exists(folderPath))
{
    Console.WriteLine($"Error: Folder not found: {folderPath}");
    Console.WriteLine("Usage: dotnet script batch-test-cvs.cs -- /path/to/cvs");
    return 1;
}

// 80-character rule used between sections. An interpolation such as {'=',-80}
// is an alignment specifier and would print a single '=' padded with spaces.
var separator = new string('=', 80);

Console.WriteLine($"Processing CVs from: {folderPath}");
Console.WriteLine(separator);

// Setup DI. Configuration is read from the web project, located relative to
// the current working directory.
var services = new ServiceCollection();
var configuration = new ConfigurationBuilder()
    .SetBasePath(Path.Combine(Directory.GetCurrentDirectory(), "../src/RealCV.Web"))
    .AddJsonFile("appsettings.json", optional: true)
    .AddJsonFile("appsettings.Development.json", optional: true)
    .Build();

services.AddLogging(b => b.AddConsole().SetMinimumLevel(LogLevel.Warning));
services.AddDbContextFactory<ApplicationDbContext>(options =>
    options.UseSqlServer(configuration.GetConnectionString("DefaultConnection")));
services.Configure<CompaniesHouseSettings>(configuration.GetSection("CompaniesHouse"));
services.Configure<AnthropicSettings>(configuration.GetSection("Anthropic"));
services.AddHttpClient<CompaniesHouseClient>();
services.AddHttpClient<AnthropicClient>();
services.AddScoped<ICompanyNameMatcherService, CompanyNameMatcherService>();
services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
services.AddScoped<IEducationVerifierService, EducationVerifierService>();
services.AddScoped<ICVParserService, CVParserService>();
var provider = services.BuildServiceProvider();

// Only .pdf and .docx files are picked up by this script.
var cvFiles = Directory.GetFiles(folderPath, "*.*")
    .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
                f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase))
    .ToList();
Console.WriteLine($"Found {cvFiles.Count} CV files\n");

// Batch-wide tallies.
var allUnverifiedEmployers = new List<string>();
var allUnverifiedInstitutions = new List<string>();
var totalEmployers = 0;
var verifiedEmployers = 0;
var totalEducation = 0;
var verifiedEducation = 0;

foreach (var cvFile in cvFiles)
{
    Console.WriteLine($"\n{separator}");
    Console.WriteLine($"FILE: {Path.GetFileName(cvFile)}");
    Console.WriteLine(separator);

    try
    {
        // Services are resolved from a scope that lives only for this one CV.
        using var scope = provider.CreateScope();
        var parser = scope.ServiceProvider.GetRequiredService<ICVParserService>();
        var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
        var eduVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();

        await using var stream = File.OpenRead(cvFile);
        var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
        Console.WriteLine($"Candidate: {cv.PersonalInfo?.FullName ?? "Unknown"}");

        // Employers
        if (cv.Employment?.Count > 0)
        {
            Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
            foreach (var emp in cv.Employment)
            {
                totalEmployers++;
                try
                {
                    var result = await companyVerifier.VerifyCompanyAsync(
                        emp.CompanyName, emp.StartDate, emp.EndDate, emp.JobTitle);

                    var icon = result.IsVerified ? "✓" : "✗";
                    Console.WriteLine($" {icon} {emp.CompanyName}");
                    if (result.IsVerified)
                    {
                        verifiedEmployers++;
                        Console.WriteLine($" → {result.MatchedCompanyName} ({result.MatchScore}%)");
                        if (!string.IsNullOrEmpty(result.VerificationNotes))
                        {
                            Console.WriteLine($" Note: {result.VerificationNotes}");
                        }
                    }
                    else
                    {
                        allUnverifiedEmployers.Add(emp.CompanyName);
                        Console.WriteLine($" Note: {result.VerificationNotes ?? "Not found"}");
                    }
                }
                catch (Exception ex)
                {
                    // One failed lookup must not discard the remaining employers
                    // and the education section of this CV; count it as unverified.
                    Console.WriteLine($" ✗ {emp.CompanyName}");
                    Console.WriteLine($" ERROR: {ex.Message}");
                    allUnverifiedEmployers.Add(emp.CompanyName);
                }
            }
        }

        // Education
        if (cv.Education?.Count > 0)
        {
            Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");
            var eduEntries = cv.Education.Select(e => new EducationEntry
            {
                Institution = e.Institution,
                Qualification = e.Qualification,
                Subject = e.Subject,
                StartDate = e.StartDate,
                EndDate = e.EndDate
            }).ToList();

            var eduResults = eduVerifier.VerifyAll(eduEntries);
            foreach (var result in eduResults)
            {
                totalEducation++;
                var icon = result.IsVerified ? "✓" : "✗";
                Console.WriteLine($" {icon} {result.ClaimedInstitution}");
                Console.WriteLine($" {result.ClaimedQualification}");
                if (result.IsVerified)
                {
                    verifiedEducation++;
                    // Only show the match when it differs from what was claimed.
                    if (result.MatchedInstitution != null && result.MatchedInstitution != result.ClaimedInstitution)
                    {
                        Console.WriteLine($" → {result.MatchedInstitution}");
                    }
                }
                else
                {
                    allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
                    Console.WriteLine($" Status: {result.Status}");
                    if (!string.IsNullOrEmpty(result.VerificationNotes))
                    {
                        Console.WriteLine($" Note: {result.VerificationNotes}");
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        // A failing file must not abort the batch.
        Console.WriteLine($"ERROR: {ex.Message}");
    }
}

// Summary
Console.WriteLine($"\n\n{separator}");
Console.WriteLine("SUMMARY");
Console.WriteLine(separator);
Console.WriteLine($"CVs Processed: {cvFiles.Count}");
Console.WriteLine($"Employers: {verifiedEmployers}/{totalEmployers} verified ({(totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0)}%)");
Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({(totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0)}%)");

var uniqueUnverifiedEmployers = allUnverifiedEmployers.Distinct().OrderBy(x => x).ToList();
if (uniqueUnverifiedEmployers.Count > 0)
{
    Console.WriteLine($"\nUNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count}):");
    foreach (var emp in uniqueUnverifiedEmployers)
    {
        Console.WriteLine($" - {emp}");
    }
}

var uniqueUnverifiedInstitutions = allUnverifiedInstitutions.Distinct().OrderBy(x => x).ToList();
if (uniqueUnverifiedInstitutions.Count > 0)
{
    Console.WriteLine($"\nUNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count}):");
    foreach (var inst in uniqueUnverifiedInstitutions)
    {
        Console.WriteLine($" - {inst}");
    }
}

return 0;