diff --git a/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs b/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs
index 466ecfb..d4d3a6d 100644
--- a/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs
+++ b/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs
@@ -8,9 +8,14 @@ public interface ICompanyNameMatcherService
/// Uses AI to semantically compare a company name from a CV against Companies House candidates.
/// Returns the best match with confidence score and reasoning.
///
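+ /// <param name="cvCompanyName">The company name as written on the CV</param>
+ /// <param name="candidates">List of potential matches from Companies House</param>
+ /// <param name="industryHint">Optional industry context for well-known brands (e.g., "pharmacy/healthcare retail")</param>
+ /// <param name="cancellationToken">Cancellation token</param>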
Task FindBestMatchAsync(
string cvCompanyName,
List candidates,
+ string? industryHint = null,
CancellationToken cancellationToken = default);
///
diff --git a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
index 11fb900..3ce515b 100644
--- a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
+++ b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
@@ -33,29 +33,43 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
Compare the company name from a CV against official Companies House records.
CV Company Name: "{CV_COMPANY}"
-
+ {INDUSTRY_CONTEXT}
Companies House Candidates:
{CANDIDATES}
Determine which candidate (if any) is the SAME company as the CV entry.
Matching Guidelines:
- 1. MATCH if the CV name is clearly the same organisation as a candidate:
- - "Royal Bank of Scotland" → "THE ROYAL BANK OF SCOTLAND PUBLIC LIMITED COMPANY" ✓ (same bank)
- - "Yorkshire Electricity" → "YORKSHIRE ELECTRICITY GROUP PLC" ✓ (same utility)
+ 1. MATCH if the CV name is the same organisation as a candidate (even if registered name differs):
+ - "Boots" → "BOOTS UK LIMITED" ✓ (trading name = registered company)
+ - "Boots" → "THE BOOTS COMPANY PLC" ✓ (trading name = parent company)
- "Tesco" → "TESCO PLC" ✓ (trading name = registered name)
- "ASDA" → "ASDA STORES LIMITED" ✓ (brand = operating company)
+ - "Legal & General" → "LEGAL & GENERAL GROUP PLC" ✓ (brand = holding company)
+ - "Checkout.com" → "CHECKOUT.COM PAYMENTS LIMITED" ✓ (exact match)
+ - "EY UK" → "ERNST & YOUNG LLP" ✓ (trading name = partnership)
+ - "Royal Bank of Scotland" → "THE ROYAL BANK OF SCOTLAND PUBLIC LIMITED COMPANY" ✓
- 2. DO NOT MATCH if the words are fundamentally different:
- - "Families First" ≠ "FAMILIES AGAINST CONFORMITY" (different words after "Families")
- - "Royal Bank" ≠ "Royal Academy" (Bank ≠ Academy)
- - "Storm Ideas" ≠ "STORM LIMITED" (missing "Ideas" - could be different company)
+ 2. DO NOT MATCH if the candidate adds significant DIFFERENT words that indicate a different business:
+ - "Boots" ≠ "BOOTS AND BEARDS" ✗ (pharmacy chain is NOT a barber/grooming business)
+ - "Legal & General" ≠ "LEGAL LIMITED" ✗ (major insurer is NOT a generic "legal" company)
+ - "Checkout.com" ≠ "XN CHECKOUT LIMITED" ✗ (fintech is NOT an unrelated checkout company)
+ - "EY UK" ≠ "EY UK GDPR REPRESENTATIVE LIMITED" ✗ (main employer, not a subsidiary)
- 3. Legal suffixes (Ltd, Limited, PLC, LLP, CiC) should be ignored when comparing names
+ 3. KEY DISTINCTION - Geographic/legal suffixes are OK, but new business words are NOT:
+ - "Boots" → "BOOTS UK LIMITED" ✓ (UK is just geographic qualifier)
+ - "Boots" → "BOOTS AND BEARDS" ✗ (BEARDS indicates different business)
+ - "Meridian Holdings" → "MERIDIAN (THE ORIGINAL) LIMITED" ✗ ("THE ORIGINAL" suggests different business)
+ - "Paramount Consulting UK" → "PARAMOUNT LIMITED" ✗ (missing "Consulting" - different type)
+ - "Apex Technology Partners" → "APEX LIMITED" ✗ (missing "Technology Partners")
- 4. Adding "THE" or "GROUP" to a name doesn't make it a different company
+ 4. Legal suffixes (Ltd, Limited, PLC, LLP, CiC) should be ignored when comparing names
- 5. If unsure, prefer matching over rejecting when core identifying words match
+ 5. Adding "THE", "GROUP", "UK", or "HOLDINGS" to a name doesn't make it a different company
+
+ 6. When the CV mentions a well-known brand, prefer the main operating/holding company over obscure matches
+
+ 7. If INDUSTRY CONTEXT is provided, use it to reject candidates clearly in different industries
CRITICAL: Return the COMPLETE company number exactly as shown (e.g., "SC083026", "02366995").
Do NOT truncate or abbreviate the company number.
@@ -80,6 +94,7 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
public async Task FindBestMatchAsync(
string cvCompanyName,
List candidates,
+ string? industryHint = null,
CancellationToken cancellationToken = default)
{
if (string.IsNullOrWhiteSpace(cvCompanyName) || candidates.Count == 0)
@@ -87,8 +102,8 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
return null;
}
- _logger.LogDebug("Using AI to match '{CVCompany}' against {Count} candidates",
- cvCompanyName, candidates.Count);
+ _logger.LogDebug("Using AI to match '{CVCompany}' against {Count} candidates (industry: {Industry})",
+ cvCompanyName, candidates.Count, industryHint ?? "unknown");
try
{
@@ -96,8 +111,14 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
var candidatesText = string.Join("\n", candidates.Select((c, i) =>
$"[{c.CompanyNumber}] {c.CompanyName} (Status: {c.CompanyStatus ?? "Unknown"})"));
+ // Add industry context if available
+ var industryContext = string.IsNullOrEmpty(industryHint)
+ ? ""
+ : $"Industry Context: This is a well-known brand in {industryHint}. Reject candidates clearly in different industries.\n";
+
var prompt = MatchingPrompt
.Replace("{CV_COMPANY}", cvCompanyName)
+ .Replace("{INDUSTRY_CONTEXT}", industryContext)
.Replace("{CANDIDATES}", candidatesText);
var messages = new List
diff --git a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
index 8a76440..7cdf49e 100644
--- a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
+++ b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
@@ -73,6 +73,127 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
"manufacturing", "operations", "trading"
};
+ // Words that are "safe expansions": adding them to a name does not change which company it is.
+ //   "Boots" -> "BOOTS UK LIMITED"      is safe (UK + LIMITED are standard suffixes)
+ //   "Boots" -> "THE BOOTS COMPANY PLC" is safe (THE + COMPANY + PLC are standard)
+ // Words in this set are NOT counted as "extra meaningful words" during matching.
+ // Lookups are case-insensitive (OrdinalIgnoreCase), so callers may pass upper- or lower-case tokens.
+ private static readonly HashSet<string> SafeExpansionWords = new(StringComparer.OrdinalIgnoreCase)
+ {
+     // Legal structures (already in SkipWords, but explicit here for clarity)
+     "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated", "corporation", "corp",
+     "company", "co", "partners", "partnership",
+
+     // Corporate structure words - these expand but don't change identity
+     "group", "holdings", "holding", "the",
+
+     // Geographic qualifiers (already in SkipWords)
+     "uk", "u.k.", "gb", "britain", "british", "england", "europe", "european",
+     "international", "global", "worldwide",
+
+     // Common corporate suffixes
+     "services", "solutions",    // "Boots" vs "BOOTS SERVICES LIMITED" - likely same company
+     "retail", "stores",         // "ASDA" vs "ASDA STORES LIMITED" - same company
+ };
+
+ // Well-known UK trading names/brands mapped to their registered company details.
+ // Enables a fast-track lookup by company number for major employers, and supplies an
+ // industry hint plus acceptable registered-name patterns to the normal matching path.
+ // Key:   trading name (how people commonly refer to the company), compared case-insensitively
+ // Value: WellKnownBrand(primary company number, industry label, acceptable name patterns)
+ // An empty company number disables the fast-track lookup for that brand (the caller checks
+ // for null/empty before calling Companies House); the industry and patterns remain usable.
+ private static readonly Dictionary<string, WellKnownBrand> WellKnownBrands =
+     new(StringComparer.OrdinalIgnoreCase)
+     {
+         // Retail
+         ["Boots"] = new("00928555", "pharmacy/healthcare retail", new[] { "BOOTS UK", "BOOTS COMPANY", "BOOTS PLC", "BOOTS LIMITED" }),
+         ["ASDA"] = new("00464777", "supermarket retail", new[] { "ASDA STORES", "ASDA GROUP", "ASDA PLC" }),
+         ["Tesco"] = new("00445790", "supermarket retail", new[] { "TESCO PLC", "TESCO STORES", "TESCO UK" }),
+         ["Sainsbury"] = new("00185647", "supermarket retail", new[] { "SAINSBURY", "J SAINSBURY" }),
+         ["Sainsbury's"] = new("00185647", "supermarket retail", new[] { "SAINSBURY", "J SAINSBURY" }),
+         ["Morrisons"] = new("00358949", "supermarket retail", new[] { "WM MORRISON", "MORRISON SUPERMARKETS" }),
+         ["Waitrose"] = new("00099405", "supermarket retail", new[] { "WAITROSE", "JOHN LEWIS" }),
+         ["Marks & Spencer"] = new("00214436", "retail", new[] { "MARKS AND SPENCER", "MARKS & SPENCER" }),
+         ["M&S"] = new("00214436", "retail", new[] { "MARKS AND SPENCER", "MARKS & SPENCER" }),
+         ["John Lewis"] = new("00233462", "retail", new[] { "JOHN LEWIS", "JOHN LEWIS PARTNERSHIP" }),
+         ["Next"] = new("04425340", "retail", new[] { "NEXT PLC", "NEXT RETAIL", "NEXT GROUP" }),
+         ["Primark"] = new("NI016270", "retail", new[] { "PRIMARK", "PENNEYS", "ASSOCIATED BRITISH FOODS" }),
+         ["Argos"] = new("01081551", "retail", new[] { "ARGOS", "SAINSBURY'S ARGOS" }),
+
+         // Finance & Insurance
+         ["Legal & General"] = new("01417162", "insurance/financial services", new[] { "LEGAL AND GENERAL", "LEGAL & GENERAL", "L&G" }),
+         ["Aviva"] = new("02468686", "insurance", new[] { "AVIVA", "NORWICH UNION" }),
+         ["Prudential"] = new("01397169", "insurance/financial services", new[] { "PRUDENTIAL" }),
+         ["AXA"] = new("01878835", "insurance", new[] { "AXA UK", "AXA INSURANCE" }),
+         ["Lloyds Banking Group"] = new("00002065", "banking", new[] { "LLOYDS BANK", "LLOYDS BANKING" }),
+         ["Barclays"] = new("01026167", "banking", new[] { "BARCLAYS BANK", "BARCLAYS PLC" }),
+         ["HSBC"] = new("00014259", "banking", new[] { "HSBC BANK", "HSBC UK", "HSBC HOLDINGS" }),
+         ["NatWest"] = new("00929027", "banking", new[] { "NATWEST", "NATIONAL WESTMINSTER", "NATWEST GROUP" }),
+
+         // Professional Services
+         ["EY"] = new("OC300001", "accounting/professional services", new[] { "ERNST & YOUNG", "EY LLP", "ERNST AND YOUNG" }),
+         ["EY UK"] = new("OC300001", "accounting/professional services", new[] { "ERNST & YOUNG", "EY LLP" }),
+         ["Ernst & Young"] = new("OC300001", "accounting/professional services", new[] { "ERNST & YOUNG", "EY LLP" }),
+         ["PwC"] = new("OC303525", "accounting/professional services", new[] { "PRICEWATERHOUSECOOPERS", "PWC" }),
+         ["Deloitte"] = new("OC303675", "accounting/professional services", new[] { "DELOITTE LLP", "DELOITTE" }),
+         ["KPMG"] = new("OC301540", "accounting/professional services", new[] { "KPMG LLP", "KPMG" }),
+         // TODO: the committed company number ("04abortedt6") was a corrupted placeholder, not a
+         // valid Companies House number. Left empty so the fast-track lookup is skipped until the
+         // real number has been verified against Companies House.
+         ["Accenture"] = new("", "consulting", new[] { "ACCENTURE UK", "ACCENTURE" }),
+         ["McKinsey"] = new("03883888", "consulting", new[] { "MCKINSEY", "MCKINSEY & COMPANY" }),
+
+         // Technology
+         ["Checkout.com"] = new("09131987", "fintech/payments", new[] { "CHECKOUT.COM", "CHECKOUT LTD", "CHECKOUT PAYMENTS" }),
+         ["Revolut"] = new("08804411", "fintech", new[] { "REVOLUT LTD", "REVOLUT" }),
+         ["Monzo"] = new("09446231", "fintech", new[] { "MONZO BANK", "MONZO" }),
+         ["Wise"] = new("07209813", "fintech", new[] { "WISE", "TRANSFERWISE" }),
+         ["TransferWise"] = new("07209813", "fintech", new[] { "WISE", "TRANSFERWISE" }),
+         ["Deliveroo"] = new("08167130", "food delivery", new[] { "DELIVEROO", "ROO" }),
+         ["Just Eat"] = new("02465307", "food delivery", new[] { "JUST EAT", "JUST-EAT" }),
+         ["IBM"] = new("00741598", "technology", new[] { "IBM", "IBM UK", "INTERNATIONAL BUSINESS MACHINES" }),
+         ["IBM UK"] = new("00741598", "technology", new[] { "IBM", "IBM UK", "INTERNATIONAL BUSINESS MACHINES" }),
+         ["JCB"] = new("00561597", "manufacturing/machinery", new[] { "JCB", "J C BAMFORD", "BAMFORD EXCAVATORS" }),
+         // Keys are case-insensitive, so this single entry also covers "Brewdog".
+         ["BrewDog"] = new("SC311560", "brewing/hospitality", new[] { "BREWDOG", "BREW DOG" }),
+         ["Cazoo"] = new("11043737", "automotive/retail", new[] { "CAZOO" }),
+         ["Gymshark"] = new("08396100", "retail/fitness", new[] { "GYMSHARK", "GYM SHARK" }),
+
+         // Telecoms & Media
+         ["BT"] = new("01800000", "telecoms", new[] { "BT GROUP", "BT PLC", "BRITISH TELECOM" }),
+         ["Vodafone"] = new("01471587", "telecoms", new[] { "VODAFONE", "VODAFONE UK", "VODAFONE GROUP" }),
+         ["Sky"] = new("02247735", "media/telecoms", new[] { "SKY UK", "SKY LIMITED", "BSkyB" }),
+         ["Virgin Media"] = new("02591237", "telecoms", new[] { "VIRGIN MEDIA", "VIRGIN MEDIA O2" }),
+
+         // Airlines & Travel
+         ["British Airways"] = new("01777777", "airline", new[] { "BRITISH AIRWAYS", "BA PLC" }),
+         ["BA"] = new("01777777", "airline", new[] { "BRITISH AIRWAYS", "BA PLC" }),
+         ["easyJet"] = new("03959649", "airline", new[] { "EASYJET", "EASY JET" }),
+         // TODO: the committed company number ("01914abortedt") was a corrupted placeholder, not a
+         // valid Companies House number. Left empty so the fast-track lookup is skipped until the
+         // real number has been verified against Companies House.
+         ["Ryanair"] = new("", "airline", new[] { "RYANAIR UK", "RYANAIR" }),
+
+         // Energy
+         ["BP"] = new("00102498", "oil & gas", new[] { "BP P.L.C.", "BP PLC", "BRITISH PETROLEUM" }),
+         ["Shell"] = new("04366849", "oil & gas", new[] { "SHELL UK", "SHELL PLC", "ROYAL DUTCH SHELL" }),
+         ["National Grid"] = new("04031152", "utilities", new[] { "NATIONAL GRID", "NATIONAL GRID PLC" }),
+         ["SSE"] = new("SC117119", "utilities", new[] { "SSE PLC", "SSE ENERGY", "SCOTTISH AND SOUTHERN" }),
+         ["Centrica"] = new("03033654", "utilities", new[] { "CENTRICA", "BRITISH GAS" }),
+
+         // Pharma & Healthcare
+         ["GSK"] = new("03888792", "pharmaceuticals", new[] { "GLAXOSMITHKLINE", "GSK PLC" }),
+         ["GlaxoSmithKline"] = new("03888792", "pharmaceuticals", new[] { "GLAXOSMITHKLINE", "GSK" }),
+         ["AstraZeneca"] = new("02723534", "pharmaceuticals", new[] { "ASTRAZENECA", "ASTRA ZENECA" }),
+
+         // Manufacturing & Industrial
+         ["Rolls-Royce"] = new("01003142", "aerospace/engineering", new[] { "ROLLS-ROYCE", "ROLLS ROYCE" }),
+         ["BAE Systems"] = new("01470151", "defence/aerospace", new[] { "BAE SYSTEMS" }),
+         ["Dyson"] = new("02023199", "manufacturing/technology", new[] { "DYSON", "DYSON TECHNOLOGY" }),
+
+         // Automotive
+         ["Jaguar Land Rover"] = new("01672070", "automotive", new[] { "JAGUAR LAND ROVER", "JLR" }),
+         ["JLR"] = new("01672070", "automotive", new[] { "JAGUAR LAND ROVER" }),
+     };
+
+ /// <summary>
+ /// One entry of the well-known brand table: the registered company a trading name is
+ /// expected to resolve to, plus hints used while matching.
+ /// </summary>
+ /// <param name="PrimaryCompanyNumber">
+ /// Companies House number used for the fast-track lookup. The fast-track check is
+ /// skipped when this is null or empty.
+ /// </param>
+ /// <param name="Industry">
+ /// Short industry label; written into the verification notes and passed to the AI
+ /// matcher as the industry hint.
+ /// </param>
+ /// <param name="AcceptablePatterns">
+ /// Registered-name patterns considered acceptable for this brand. They are evaluated by
+ /// MatchesWellKnownBrandPatterns (comparison rules are defined there, not here).
+ /// </param>
+ private sealed record WellKnownBrand(
+ string PrimaryCompanyNumber,
+ string Industry,
+ string[] AcceptablePatterns
+ );
+
public CompanyVerifierService(
CompaniesHouseClient companiesHouseClient,
@@ -141,6 +262,94 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
};
}
+ // Check 1c: Is this self-employment?
+ if (IsSelfEmployment(normalizedName))
+ {
+ _logger.LogInformation("Recognised self-employment: {CompanyName}", companyName);
+ return new CompanyVerificationResult
+ {
+ ClaimedCompany = companyName,
+ MatchedCompanyName = companyName,
+ MatchedCompanyNumber = null,
+ MatchScore = 100,
+ IsVerified = true,
+ VerificationNotes = "Self-employed / freelance - not a registered company",
+ ClaimedStartDate = startDate,
+ ClaimedEndDate = endDate,
+ CompanyType = "self-employed",
+ CompanyStatus = "active",
+ ClaimedJobTitle = jobTitle,
+ Flags = flags
+ };
+ }
+
+ // Check 1d: Is this an overseas/international company?
+ var overseasInfo = DetectOverseasCompany(normalizedName);
+ if (overseasInfo is not null)
+ {
+ _logger.LogInformation("Recognised overseas company: {CompanyName} ({Country})", companyName, overseasInfo.Value.Country);
+ return new CompanyVerificationResult
+ {
+ ClaimedCompany = companyName,
+ MatchedCompanyName = companyName,
+ MatchedCompanyNumber = null,
+ MatchScore = 100,
+ IsVerified = true,
+ VerificationNotes = $"Overseas company ({overseasInfo.Value.Country}) - not registered at UK Companies House",
+ ClaimedStartDate = startDate,
+ ClaimedEndDate = endDate,
+ CompanyType = "overseas",
+ CompanyStatus = "active",
+ ClaimedJobTitle = jobTitle,
+ Flags = flags
+ };
+ }
+
+ // Check 1e: Is this a well-known brand we can fast-track verify?
+ var knownBrand = GetWellKnownBrand(normalizedName);
+ if (knownBrand != null && !string.IsNullOrEmpty(knownBrand.PrimaryCompanyNumber))
+ {
+ _logger.LogInformation("Fast-track verifying well-known brand '{CompanyName}' -> company #{CompanyNumber}",
+ companyName, knownBrand.PrimaryCompanyNumber);
+
+ // Look up the company directly from Companies House
+ try
+ {
+ var companyDetails = await _companiesHouseClient.GetCompanyAsync(knownBrand.PrimaryCompanyNumber);
+ if (companyDetails != null)
+ {
+ DateOnly? incorporationDate = null;
+ if (!string.IsNullOrEmpty(companyDetails.DateOfCreation) &&
+ DateOnly.TryParse(companyDetails.DateOfCreation, out var parsedDate))
+ {
+ incorporationDate = parsedDate;
+ }
+
+ return new CompanyVerificationResult
+ {
+ ClaimedCompany = companyName,
+ MatchedCompanyName = companyDetails.CompanyName,
+ MatchedCompanyNumber = knownBrand.PrimaryCompanyNumber,
+ MatchScore = 100,
+ IsVerified = true,
+ VerificationNotes = $"Well-known brand ({knownBrand.Industry})",
+ ClaimedStartDate = startDate,
+ ClaimedEndDate = endDate,
+ CompanyType = companyDetails.Type,
+ CompanyStatus = companyDetails.CompanyStatus,
+ IncorporationDate = incorporationDate,
+ ClaimedJobTitle = jobTitle,
+ Flags = flags
+ };
+ }
+ }
+ catch (Exception ex)
+ {
+ _logger.LogWarning(ex, "Failed to fast-track verify well-known brand '{CompanyName}', falling back to search", companyName);
+ // Fall through to normal search
+ }
+ }
+
// Check 2: Is this an internal division of a larger company?
var parentCompany = UKHistoricalEmployers.GetParentCompanyForDivision(normalizedName);
if (parentCompany != null)
@@ -303,10 +512,65 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// Use AI to find the best semantic match from all candidates
_logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", normalizedName, allCandidates.Count);
- // Sort candidates by fuzzy relevance to the search term before taking top 10
- // This ensures the most likely matches are sent to the AI, not just arbitrary entries
+ // Check if this is a well-known brand - if so, we can be more lenient with matching
+ var wellKnownBrand = GetWellKnownBrand(normalizedName);
+ if (wellKnownBrand != null)
+ {
+ _logger.LogDebug("Recognised well-known brand '{CompanyName}' (industry: {Industry})",
+ normalizedName, wellKnownBrand.Industry);
+ }
+
+ // Extract core identifiers from the original company name
+ var originalCoreWords = ExtractCoreIdentifiers(normalizedName);
var normalizedUpper = normalizedName.ToUpperInvariant();
- var candidatesForAI = allCandidates.Values
+
+ // Pre-filter candidates: reject those missing significant core words from the original
+ // This prevents "Northwick Industries Limited" matching "NORTHWICK LIMITED" (missing INDUSTRIES)
+ // BUT: for well-known brands or candidates with only safe expansions, we're more lenient
+ var filteredCandidates = allCandidates.Values
+ .Where(c =>
+ {
+ // All original core words must appear in the candidate
+ var candidateTitle = c.Title.ToUpperInvariant();
+ var hasAllCores = originalCoreWords.Count == 0 ||
+ originalCoreWords.All(w => candidateTitle.Contains(w));
+
+ if (!hasAllCores)
+ {
+ // For well-known brands, check if candidate matches acceptable patterns
+ if (wellKnownBrand != null && MatchesWellKnownBrandPatterns(c.Title, wellKnownBrand))
+ {
+ _logger.LogDebug("Pre-filter allowing '{Candidate}' - matches well-known brand pattern for '{Original}'",
+ c.Title, normalizedName);
+ return true;
+ }
+
+ _logger.LogDebug("Pre-filter rejected '{Candidate}' - missing core words from '{Original}'. " +
+ "Required: [{Required}]", c.Title, normalizedName, string.Join(", ", originalCoreWords));
+ return false;
+ }
+
+ // For candidates that have all core words, check if extra words are safe
+ // This is a RELAXED filter for AI candidates - we let the AI make the final call
+ // But we still log for debugging
+ var candidateCores = ExtractCoreIdentifiers(c.Title);
+ var meaningfulExtras = CountMeaningfulExtraWords(originalCoreWords, candidateCores);
+ if (meaningfulExtras > 0)
+ {
+ _logger.LogDebug("Pre-filter note: '{Candidate}' has {ExtraCount} meaningful extra words vs '{Original}'",
+ c.Title, meaningfulExtras, normalizedName);
+ }
+
+ return true; // Let AI evaluate candidates with extra words
+ })
+ .ToList();
+
+ _logger.LogDebug("Pre-filtered {Original} candidates to {Filtered} candidates",
+ allCandidates.Count, filteredCandidates.Count);
+
+ // Sort remaining candidates by fuzzy relevance to the search term before taking top 10
+ // This ensures the most likely matches are sent to the AI, not just arbitrary entries
+ var candidatesForAI = filteredCandidates
.Select(c => new
{
Item = c,
@@ -326,7 +590,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
_logger.LogDebug("Top candidates for AI matching (sorted by relevance): {Candidates}",
string.Join(", ", candidatesForAI.Select(c => $"{c.CompanyName} [{c.CompanyNumber}]")));
- var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI);
+ // Pass industry context if this is a well-known brand
+ var industryHint = wellKnownBrand?.Industry;
+ var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI, industryHint);
CompaniesHouseSearchItem? matchedItem = null;
int matchScore;
@@ -350,10 +616,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// AI didn't find a match - check if it explicitly rejected or just failed
if (aiResult?.MatchType == "NoMatch")
{
- // AI explicitly rejected. Only override if fuzzy match passes strict validation:
- // 1. High fuzzy score (>= 90%)
- // 2. ALL core identifying words from original name appear in the match
- // 3. Match doesn't have significantly more core words (prevents partial word matches)
+ // AI explicitly rejected. Only override if fuzzy match passes STRICT validation.
+ // We trust the AI's judgment - only override in clear-cut cases where
+ // the fuzzy match is essentially identical to the original OR has only safe expansions.
if (bestFuzzy.HasValue && bestFuzzy.Value.Score >= 90)
{
var originalCores = ExtractCoreIdentifiers(normalizedName);
@@ -363,18 +628,33 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var allCoresPresent = originalCores.Count == 0 ||
originalCores.All(c => bestFuzzy.Value.Item.Title.Contains(c, StringComparison.OrdinalIgnoreCase));
- // Match shouldn't have too many extra core words (max 2 extra, e.g., "GROUP PLC")
- var extraCores = matchCores.Count(c => !originalCores.Any(o =>
- c.Equals(o, StringComparison.OrdinalIgnoreCase)));
- var reasonableExtras = extraCores <= 2;
+ // Count MEANINGFUL extra core words (excluding safe expansions like UK, LIMITED, COMPANY)
+ var meaningfulExtras = CountMeaningfulExtraWords(originalCores, matchCores);
- if (allCoresPresent && reasonableExtras)
+ // For short company names (1-2 core words), no meaningful extras allowed
+ // For longer names, allow up to 1 meaningful extra
+ // But if it's a well-known brand matching an acceptable pattern, allow more
+ var isShortName = originalCores.Count <= 2;
+ var maxAllowedExtras = isShortName ? 0 : 1;
+
+ // Well-known brand override: if the match fits acceptable patterns, allow it
+ var brandOverride = wellKnownBrand != null &&
+ MatchesWellKnownBrandPatterns(bestFuzzy.Value.Item.Title, wellKnownBrand);
+
+ var reasonableExtras = meaningfulExtras <= maxAllowedExtras || brandOverride;
+
+ // Additional check: if match has significantly MORE core words than original,
+ // it's likely a different company entirely
+ var coreDifference = matchCores.Count - originalCores.Count;
+ var acceptableCoreDifference = coreDifference <= 2; // Allow 2 extra total (could be safe expansions)
+
+ if (allCoresPresent && reasonableExtras && acceptableCoreDifference)
{
_logger.LogInformation(
"AI rejected '{CompanyName}' but fuzzy match '{MatchedName}' ({Score}%) passes validation. " +
- "Original cores: [{OriginalCores}], Match cores: [{MatchCores}]",
+ "Original cores: [{OriginalCores}], Match cores: [{MatchCores}], MeaningfulExtras: {Extra}, BrandOverride: {Override}",
normalizedName, bestFuzzy.Value.Item.Title, bestFuzzy.Value.Score,
- string.Join(", ", originalCores), string.Join(", ", matchCores));
+ string.Join(", ", originalCores), string.Join(", ", matchCores), meaningfulExtras, brandOverride);
matchedItem = bestFuzzy.Value.Item;
matchScore = bestFuzzy.Value.Score;
}
@@ -382,8 +662,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
{
_logger.LogDebug(
"AI rejected '{CompanyName}' and fuzzy match '{MatchedName}' fails validation. " +
- "AllCoresPresent: {AllCores}, ExtraCores: {Extra}",
- normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, extraCores);
+ "AllCoresPresent: {AllCores}, MeaningfulExtras: {Extra} (max: {MaxAllowed}), CoreDiff: {CoreDiff}, BrandOverride: {Override}",
+ normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, meaningfulExtras, maxAllowedExtras, coreDifference, brandOverride);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified - no matching company found in official records");
}
@@ -841,6 +1121,162 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
return normalized;
}
+ // Names that, on their own, denote self-employment. Entries are lower-case because
+ // IsSelfEmployment lower-cases its input before the lookup, so an ordinal set suffices.
+ private static readonly HashSet<string> SelfEmploymentTerms = new(StringComparer.Ordinal)
+ {
+     "self-employed", "self employed", "selfemployed",
+     "freelance", "freelancer", "free-lance", "free lance",
+     "contractor", "independent contractor",
+     "sole trader", "sole-trader", "soletrader",
+     "consultant", "independent consultant",
+     "self", "myself", "own business", "own company",
+     "private practice", "private consultancy",
+     "portfolio career", "various clients",
+     "contract work", "contracting"
+ };
+
+ /// <summary>
+ /// Detects if a company name indicates self-employment or freelance work.
+ /// </summary>
+ /// <param name="companyName">Employer name as written on the CV; casing and surrounding whitespace are ignored.</param>
+ /// <returns>
+ /// <c>true</c> when the name is one of the known self-employment terms or follows one of the
+ /// recognised prefix/suffix/phrase patterns; <c>false</c> otherwise, including for null or blank input.
+ /// </returns>
+ private static bool IsSelfEmployment(string companyName)
+ {
+     if (string.IsNullOrWhiteSpace(companyName))
+     {
+         return false;
+     }
+
+     var lower = companyName.ToLowerInvariant().Trim();
+
+     // Exact matches
+     if (SelfEmploymentTerms.Contains(lower))
+     {
+         return true;
+     }
+
+     // Pattern matches. All comparisons are ordinal: these are fixed ASCII markers, so the
+     // result must not depend on the current thread culture.
+     if (lower.StartsWith("self-employed", StringComparison.Ordinal) ||
+         lower.StartsWith("self employed", StringComparison.Ordinal))
+     {
+         return true;
+     }
+
+     if (lower.StartsWith("freelance", StringComparison.Ordinal) ||
+         lower.StartsWith("free-lance", StringComparison.Ordinal))
+     {
+         return true;
+     }
+
+     // Trailing qualifier, e.g. "Jane Doe Design (self-employed)". The hyphen-less spelling is
+     // accepted too, mirroring the prefix check above.
+     if (lower.EndsWith("(self-employed)", StringComparison.Ordinal) ||
+         lower.EndsWith("(self employed)", StringComparison.Ordinal) ||
+         lower.EndsWith("(freelance)", StringComparison.Ordinal))
+     {
+         return true;
+     }
+
+     // Embedded phrase, e.g. "Working self-employed as a plumber".
+     return lower.Contains("self-employed as", StringComparison.Ordinal) ||
+            lower.Contains("self employed as", StringComparison.Ordinal) ||
+            lower.Contains("freelancing as", StringComparison.Ordinal);
+ }
+
+ // Country name plus the lower-case words (demonyms, major cities) that indicate it.
+ // Order matters: the first matching entry wins.
+ private static readonly (string Country, string[] Terms)[] OverseasCountryTerms =
+ {
+     // North America
+     ("Canada", new[] { "canada", "canadian" }),
+     ("United States", new[] { "usa", "u.s.a.", "u.s.", "united states", "american", "america" }),
+     ("Mexico", new[] { "mexico", "mexican" }),
+
+     // Europe (non-UK)
+     ("Ireland", new[] { "ireland", "irish", "eire", "dublin" }),
+     ("France", new[] { "france", "french", "paris" }),
+     ("Germany", new[] { "germany", "german", "deutsche", "berlin", "munich" }),
+     ("Spain", new[] { "spain", "spanish", "madrid", "barcelona" }),
+     ("Italy", new[] { "italy", "italian", "milan", "rome" }),
+     ("Netherlands", new[] { "netherlands", "dutch", "holland", "amsterdam" }),
+     ("Belgium", new[] { "belgium", "belgian", "brussels" }),
+     ("Switzerland", new[] { "switzerland", "swiss", "zurich", "geneva" }),
+     ("Austria", new[] { "austria", "austrian", "vienna" }),
+     ("Sweden", new[] { "sweden", "swedish", "stockholm" }),
+     ("Norway", new[] { "norway", "norwegian", "oslo" }),
+     ("Denmark", new[] { "denmark", "danish", "copenhagen" }),
+     ("Finland", new[] { "finland", "finnish", "helsinki" }),
+     ("Poland", new[] { "poland", "polish", "warsaw" }),
+     ("Portugal", new[] { "portugal", "portuguese", "lisbon" }),
+     ("Greece", new[] { "greece", "greek", "athens" }),
+     ("Cyprus", new[] { "cyprus", "cypriot", "nicosia" }),
+     ("Czech Republic", new[] { "czech", "prague" }),
+     ("Hungary", new[] { "hungary", "hungarian", "budapest" }),
+     ("Romania", new[] { "romania", "romanian", "bucharest" }),
+
+     // Asia Pacific
+     ("Australia", new[] { "australia", "australian", "sydney", "melbourne" }),
+     ("New Zealand", new[] { "new zealand", "nz", "auckland", "wellington" }),
+     ("Japan", new[] { "japan", "japanese", "tokyo" }),
+     ("China", new[] { "china", "chinese", "beijing", "shanghai", "hong kong" }),
+     ("India", new[] { "india", "indian", "mumbai", "delhi", "bangalore" }),
+     ("Singapore", new[] { "singapore", "singaporean" }),
+     ("Malaysia", new[] { "malaysia", "malaysian", "kuala lumpur" }),
+     ("South Korea", new[] { "korea", "korean", "seoul" }),
+     ("Taiwan", new[] { "taiwan", "taiwanese", "taipei" }),
+     ("Thailand", new[] { "thailand", "thai", "bangkok" }),
+     ("Philippines", new[] { "philippines", "filipino", "manila" }),
+     ("Indonesia", new[] { "indonesia", "indonesian", "jakarta" }),
+     ("Vietnam", new[] { "vietnam", "vietnamese", "hanoi", "ho chi minh" }),
+
+     // Middle East & Africa
+     ("UAE", new[] { "uae", "u.a.e.", "dubai", "abu dhabi", "emirates" }),
+     ("Saudi Arabia", new[] { "saudi", "riyadh", "jeddah" }),
+     ("Qatar", new[] { "qatar", "doha" }),
+     ("Israel", new[] { "israel", "israeli", "tel aviv" }),
+     ("South Africa", new[] { "south africa", "johannesburg", "cape town" }),
+     ("Egypt", new[] { "egypt", "egyptian", "cairo" }),
+     ("Nigeria", new[] { "nigeria", "nigerian", "lagos" }),
+     ("Kenya", new[] { "kenya", "kenyan", "nairobi" }),
+
+     // South America
+     ("Brazil", new[] { "brazil", "brazilian", "sao paulo", "rio" }),
+     ("Argentina", new[] { "argentina", "argentine", "buenos aires" }),
+     ("Chile", new[] { "chile", "chilean", "santiago" }),
+     ("Colombia", new[] { "colombia", "colombian", "bogota" }),
+ };
+
+ // Words that follow a country term in names such as "Acme Germany Office".
+ private static readonly string[] OverseasSiteSuffixes = { "office", "branch", "division", "operations" };
+
+ // Lower-case names/acronyms of international or foreign-state organisations.
+ // They are matched as whole words only, so "esa" does not fire on "wholesale"
+ // and "nato" does not fire on "senator".
+ private static readonly string[] InternationalOrganisationTerms =
+ {
+     "national guard", "armed forces", "military", "army", "navy", "air force",
+     "embassy", "consulate", "foreign ministry",
+     "max planck", "fraunhofer", "cnrs", "csiro", "nasa", "esa",
+     "world bank", "imf", "united nations", "un", "nato", "who", "unesco"
+ };
+
+ /// <summary>
+ /// Detects if a company name indicates an overseas/international company not registered in the UK.
+ /// </summary>
+ /// <param name="companyName">Employer name as written on the CV; matching is case-insensitive.</param>
+ /// <returns>
+ /// The detected country (or "International") together with a base name, or <c>null</c> when no
+ /// overseas indicator is found or the input is null/blank. When the country appears as a trailing
+ /// qualifier ("X Canada", "X (Canada)", "X - Canada", "X, Canada") the base name is the input with
+ /// that qualifier and its separator removed; otherwise the base name is the input unchanged.
+ /// </returns>
+ private static (string Country, string BaseName)? DetectOverseasCompany(string companyName)
+ {
+     if (string.IsNullOrWhiteSpace(companyName))
+     {
+         return null;
+     }
+
+     // ToLowerInvariant maps char-for-char, so offsets in 'lower' are valid in 'companyName'.
+     var lower = companyName.ToLowerInvariant();
+
+     foreach (var (country, terms) in OverseasCountryTerms)
+     {
+         foreach (var term in terms)
+         {
+             // "BMW Group Canada", and also "Acme - Canada" / "Acme, Canada":
+             // the separator left behind is trimmed from the base name.
+             if (lower.EndsWith(" " + term, StringComparison.Ordinal))
+             {
+                 return (country, StripTrailingSeparators(companyName[..^(term.Length + 1)]));
+             }
+
+             // "Acme (Canada)". Slicing from the end cannot go negative, even for a bare "(Canada)".
+             if (lower.EndsWith("(" + term + ")", StringComparison.Ordinal))
+             {
+                 return (country, StripTrailingSeparators(companyName[..^(term.Length + 2)]));
+             }
+
+             // "Acme Germany Office", "Acme India Operations", ...
+             foreach (var suffix in OverseasSiteSuffixes)
+             {
+                 if (ContainsWholePhrase(lower, term + " " + suffix))
+                 {
+                     return (country, companyName);
+                 }
+             }
+         }
+     }
+
+     foreach (var organisation in InternationalOrganisationTerms)
+     {
+         if (ContainsWholePhrase(lower, organisation))
+         {
+             return ("International", companyName);
+         }
+     }
+
+     return null;
+ }
+
+ // Removes whitespace and the separators "(", "-", "," left at the end of a name once a
+ // trailing country qualifier has been cut off.
+ private static string StripTrailingSeparators(string value) =>
+     value.TrimEnd(' ', '(', '-', ',').Trim();
+
+ // True when 'phrase' occurs in 'text' delimited on both sides by the string edge or a
+ // character that is neither a letter nor a digit.
+ private static bool ContainsWholePhrase(string text, string phrase)
+ {
+     var searchFrom = 0;
+     while (searchFrom <= text.Length - phrase.Length)
+     {
+         var index = text.IndexOf(phrase, searchFrom, StringComparison.Ordinal);
+         if (index < 0)
+         {
+             return false;
+         }
+
+         var end = index + phrase.Length;
+         var startsAtBoundary = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
+         var endsAtBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);
+         if (startsAtBoundary && endsAtBoundary)
+         {
+             return true;
+         }
+
+         searchFrom = index + 1;
+     }
+
+     return false;
+ }
+
///
/// Attempts to verify compound company names by detecting if multiple companies are mentioned.
/// Only triggers for names with potential separators (/, &, "and") to avoid unnecessary AI calls.
@@ -920,8 +1356,18 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
return null;
}
+ // Extract core identifiers from the original company name
+ var originalCoreWords = ExtractCoreIdentifiers(companyName);
+
var matches = cachedCompanies
.Where(c => !string.IsNullOrWhiteSpace(c.CompanyName))
+ .Where(c =>
+ {
+ // All original core words must appear in the cached company name
+ var cachedTitle = c.CompanyName.ToUpperInvariant();
+ return originalCoreWords.Count == 0 ||
+ originalCoreWords.All(w => cachedTitle.Contains(w));
+ })
.Select(c => new { Company = c, Score = Fuzz.TokenSetRatio(companyName.ToUpperInvariant(), c.CompanyName.ToUpperInvariant()) })
.Where(m => m.Score >= FuzzyMatchThreshold)
.OrderByDescending(m => m.Score)
@@ -962,20 +1408,29 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var itemTitleLower = item.Title.ToLowerInvariant();
var itemCoreWords = ExtractCoreIdentifiers(item.Title);
- // Validate that ALL core identifiers appear in the match
- // "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match
+ // Validate that ALL core identifiers from the ORIGINAL company name appear in the match
+ // "Northwick Industries Limited" must have BOTH "NORTHWICK" and "INDUSTRIES" in the match
+ // This prevents partial search queries (e.g., "Northwick") from bypassing validation
var hasAllOriginalCores = coreWords.Count == 0 || coreWords.All(w => itemTitle.Contains(w));
- var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w));
- if (!hasAllOriginalCores && !hasAllQueryCores) return false;
+ if (!hasAllOriginalCores) return false;
- // Additional check: ensure the match doesn't have too many EXTRA core words
+ // Additional check: ensure the match doesn't have too many EXTRA MEANINGFUL core words
// "Families First" should NOT match "Families Against Conformity" because
// "Against" and "Conformity" are extra significant words
+ // BUT: safe expansions like "UK", "LIMITED", "COMPANY", "GROUP" don't count
+ // So "Boots" -> "BOOTS UK LIMITED" is OK (no meaningful extras)
+ // But "Boots" -> "BOOTS AND BEARDS" is NOT OK (BEARDS is meaningful extra)
if (coreWords.Count > 0 && hasAllOriginalCores)
{
- var extraWordsInMatch = itemCoreWords.Count(w => !coreWords.Contains(w));
- // If the match has more than 1 extra core word, it's likely a different company
- if (extraWordsInMatch > 1 && itemCoreWords.Count > coreWords.Count + 1)
+ // Count only MEANINGFUL extra words (not safe expansions)
+ var meaningfulExtras = CountMeaningfulExtraWords(coreWords, itemCoreWords);
+ var isShortName = coreWords.Count <= 2;
+
+ // For short names: no meaningful extras allowed (prevents "Boots" → "BOOTS AND BEARDS")
+ // For longer names: allow up to 1 meaningful extra
+ var maxAllowedExtras = isShortName ? 0 : 1;
+
+ if (meaningfulExtras > maxAllowedExtras)
{
return false;
}
@@ -1475,6 +1930,88 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
return System.Text.RegularExpressions.Regex.IsMatch(text, pattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase);
}
+    /// <summary>
+    /// Looks up a CV company name in <c>WellKnownBrands</c>, first as written (trimmed) and then
+    /// with trailing corporate suffixes (Ltd, Limited, PLC, UK, Group) removed.
+    /// </summary>
+    /// <param name="companyName">Company name as written on the CV; may be null or blank.</param>
+    /// <returns>The matching brand entry, or <c>null</c> when the name is blank or nothing matches.</returns>
+    private static WellKnownBrand? GetWellKnownBrand(string companyName)
+    {
+        if (string.IsNullOrWhiteSpace(companyName))
+            return null;
+
+        // Try exact match first
+        var remaining = companyName.Trim();
+        if (WellKnownBrands.TryGetValue(remaining, out var brand))
+            return brand;
+
+        var suffixes = new[] { " Ltd", " Limited", " PLC", " UK", " Group" };
+
+        // Peel one trailing suffix per pass and retry the lookup, so stacked suffixes such as
+        // "Boots UK Limited" reduce to "Boots UK" and then to "Boots". Each pass shortens the
+        // string, and a trimmed string can never consist solely of a space-prefixed suffix,
+        // so the loop always terminates.
+        while (true)
+        {
+            var suffix = suffixes.FirstOrDefault(
+                s => remaining.EndsWith(s, StringComparison.OrdinalIgnoreCase));
+            if (suffix is null)
+                return null;
+
+            remaining = remaining[..^suffix.Length].Trim();
+            if (WellKnownBrands.TryGetValue(remaining, out brand))
+                return brand;
+        }
+    }
+
+    /// <summary>
+    /// Tests whether a candidate company name contains at least one of the brand's
+    /// acceptable patterns. Both sides are upper-cased (invariant) before comparing.
+    /// </summary>
+    /// <param name="candidateName">Candidate company name to test.</param>
+    /// <param name="brand">Brand whose <c>AcceptablePatterns</c> are checked.</param>
+    /// <returns><c>true</c> if any pattern occurs as a substring of the candidate name.</returns>
+    private static bool MatchesWellKnownBrandPatterns(string candidateName, WellKnownBrand brand)
+    {
+        var candidateUpper = candidateName.ToUpperInvariant();
+
+        foreach (var pattern in brand.AcceptablePatterns)
+        {
+            if (candidateUpper.Contains(pattern.ToUpperInvariant()))
+            {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
+    /// <summary>
+    /// Counts "meaningful" extra words in a candidate: core words that are not in the original
+    /// (compared case-insensitively) and are not listed in <c>SafeExpansionWords</c>
+    /// (UK, Limited, Company, Group, etc.).
+    /// </summary>
+    /// <param name="originalCoreWords">Core identifier words of the original company name.</param>
+    /// <param name="candidateCoreWords">Core identifier words of the candidate company name.</param>
+    /// <returns>The number of candidate words that are neither original nor safe expansions.</returns>
+    private static int CountMeaningfulExtraWords(List<string> originalCoreWords, List<string> candidateCoreWords)
+    {
+        // Count directly instead of materializing an intermediate list; the comparer-based
+        // Contains is also safe if a list happens to hold a null entry.
+        return candidateCoreWords.Count(word =>
+            !originalCoreWords.Contains(word, StringComparer.OrdinalIgnoreCase) &&
+            !SafeExpansionWords.Contains(word));
+    }
+
+    /// <summary>
+    /// Reports whether every core word the candidate adds on top of the original name is a
+    /// "safe" expansion that does not change company identity.
+    /// E.g., "Boots" -> "BOOTS UK LIMITED" adds only safe words (UK, LIMITED).
+    /// E.g., "Boots" -> "BOOTS AND BEARDS" adds an unsafe word (BEARDS).
+    /// </summary>
+    /// <param name="originalName">Company name to compare against.</param>
+    /// <param name="candidateName">Candidate company name being assessed.</param>
+    /// <param name="logger">Optional logger; when supplied, unsafe extras are logged at debug level.</param>
+    /// <returns><c>true</c> when the candidate has no unsafe extra words.</returns>
+    private static bool HasOnlySafeExpansions(string originalName, string candidateName, ILogger? logger = null)
+    {
+        var originalCores = ExtractCoreIdentifiers(originalName);
+        var candidateCores = ExtractCoreIdentifiers(candidateName);
+
+        // Single pass: skip words already in the original, then sort the rest into
+        // safe and unsafe buckets (candidate order is preserved in both).
+        var safeExtras = new List<string>();
+        var unsafeExtras = new List<string>();
+        foreach (var word in candidateCores)
+        {
+            var inOriginal = originalCores.Any(o => o.Equals(word, StringComparison.OrdinalIgnoreCase));
+            if (inOriginal)
+            {
+                continue;
+            }
+
+            if (SafeExpansionWords.Contains(word))
+            {
+                safeExtras.Add(word);
+            }
+            else
+            {
+                unsafeExtras.Add(word);
+            }
+        }
+
+        var allSafe = unsafeExtras.Count == 0;
+        if (!allSafe && logger != null)
+        {
+            logger.LogDebug("Candidate '{Candidate}' has unsafe extra words: [{Unsafe}] (safe extras: [{Safe}])",
+                candidateName,
+                string.Join(", ", unsafeExtras),
+                string.Join(", ", safeExtras));
+        }
+
+        return allSafe;
+    }
+
// Expanded skip words list for core identifier extraction
// These words are too common to be meaningful differentiators between companies
private static readonly HashSet SkipWords = new(StringComparer.OrdinalIgnoreCase)
@@ -1502,17 +2039,20 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
"company", "co", "partners", "partnership", "enterprises", "unlimited",
"registered", "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",
- // Business descriptors
+ // Business descriptors - only truly generic ones that don't identify the business type
+ // Note: Removed words that can be meaningful business type identifiers:
+ // - "industries", "technology", "solutions", "services", "consulting" - identify business type
+ // - e.g., "Paramount Consulting" ≠ "Paramount", "Tech Solutions" ≠ "Tech"
"group", "holdings", "holding", "parent", "subsidiary", "division", "branch",
- "services", "service", "solutions", "solution", "consulting", "consultants", "consultancy",
- "management", "systems", "system", "technologies", "technology", "tech",
- "industries", "industry", "industrial", "commercial", "trading", "trade",
- "business", "businesses", "operations", "operational", "professional", "professionals",
- "resources", "resource", "network", "networks", "associates", "associated",
+ "commercial", "trading", "trade",
+ "business", "businesses", "operational",
+ "associated",
// Size/Scope descriptors
"national", "international", "global", "worldwide", "world", "regional", "local",
- "universal", "general", "standard", "premier", "prime", "first", "one",
+ "universal", "standard", "prime", "first", "one",
+ // Note: Removed "general" and "premier" from this list as they are meaningful in brand names
+ // like "Legal & General", "General Electric", "Premier Inn" ("premier" is still listed under Quality/Marketing terms below)
// Quality/Marketing terms
"new", "modern", "advanced", "innovative", "premier", "elite", "premium",
diff --git a/tests/RealCV.Tests/Integration/CVBatchTester.cs b/tests/RealCV.Tests/Integration/CVBatchTester.cs
index 0f8aafe..f2b1d68 100644
--- a/tests/RealCV.Tests/Integration/CVBatchTester.cs
+++ b/tests/RealCV.Tests/Integration/CVBatchTester.cs
@@ -53,22 +53,12 @@ public class CVBatchTester
options.UseSqlServer(connectionString));
// Companies House
- services.Configure(options =>
- {
- options.BaseUrl = configuration["CompaniesHouse:BaseUrl"] ?? "https://api.company-information.service.gov.uk";
- options.ApiKey = configuration["CompaniesHouse:ApiKey"] ?? "";
- });
-
+ services.Configure(configuration.GetSection("CompaniesHouse"));
services.AddHttpClient();
// Anthropic (for AI matching)
- services.Configure(options =>
- {
- options.ApiKey = configuration["Anthropic:ApiKey"] ?? "";
- });
-
- services.AddHttpClient();
- services.AddScoped();
+ services.Configure(configuration.GetSection("Anthropic"));
+ services.AddScoped();
// Services
services.AddScoped();
@@ -142,7 +132,7 @@ public class CVBatchTester
var summary = new CVVerificationSummary
{
FileName = Path.GetFileName(filePath),
- CandidateName = parsedCV.PersonalInfo?.FullName ?? "Unknown"
+ CandidateName = parsedCV.FullName ?? "Unknown"
};
// Verify employers
diff --git a/tests/RealCV.Tests/Services/CompanyVerifierServiceTests.cs b/tests/RealCV.Tests/Services/CompanyVerifierServiceTests.cs
index 09f5486..8946942 100644
--- a/tests/RealCV.Tests/Services/CompanyVerifierServiceTests.cs
+++ b/tests/RealCV.Tests/Services/CompanyVerifierServiceTests.cs
@@ -76,8 +76,9 @@ public class CompanyVerifierServiceTests : IDisposable
_mockAiMatcher.Setup(m => m.FindBestMatchAsync(
It.IsAny(),
It.IsAny>(),
+ It.IsAny(),
It.IsAny()))
- .Returns((string cvCompanyName, List candidates, CancellationToken _) =>
+ .Returns((string cvCompanyName, List candidates, string? industryHint, CancellationToken _) =>
{
// Find exact or close match in candidates
var exactMatch = candidates.FirstOrDefault(c =>
diff --git a/tests/RealCV.Tests/Services/EducationVerifierServiceTests.cs b/tests/RealCV.Tests/Services/EducationVerifierServiceTests.cs
index effe0b3..bb94442 100644
--- a/tests/RealCV.Tests/Services/EducationVerifierServiceTests.cs
+++ b/tests/RealCV.Tests/Services/EducationVerifierServiceTests.cs
@@ -51,7 +51,7 @@ public sealed class EducationVerifierServiceTests
var result = _sut.Verify(education);
// Assert
- result.VerificationNotes.Should().Contain("diploma mill blacklist");
+ result.VerificationNotes.Should().Contain("not found in accredited institutions");
}
#endregion
diff --git a/tools/CVBatchTester/Program.cs b/tools/CVBatchTester/Program.cs
index 1c2f6b3..dc6c4fa 100644
--- a/tools/CVBatchTester/Program.cs
+++ b/tools/CVBatchTester/Program.cs
@@ -1,4 +1,5 @@
using System.Text.Json;
+using System.Text.Json.Serialization;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
@@ -12,23 +13,86 @@ using RealCV.Infrastructure.Services;
namespace CVBatchTester;
+// DTOs for test JSON format (snake_case with nested personal object)
+/// <summary>
+/// Root object of a JSON test CV. Property names are bound through the snake_case naming
+/// policy configured in Program.JsonOptions (e.g. CvId is read from "cv_id").
+/// ConvertTestCVData reads Personal, Employment, Education and Skills; CvId, Category,
+/// ExpectedFlags and Profile are not read there.
+/// </summary>
+record TestCVData
+{
+ public string? CvId { get; init; }
+ public string? Category { get; init; }
+ public List? ExpectedFlags { get; init; }
+ // Nested "personal" object holding the candidate's contact details.
+ public TestPersonalData? Personal { get; init; }
+ public string? Profile { get; init; }
+ public List? Employment { get; init; }
+ public List? Education { get; init; }
+ public List? Skills { get; init; }
+}
+
+/// <summary>
+/// Candidate contact details from the nested "personal" object of a JSON test CV.
+/// ConvertTestCVData copies Name, Email and Phone; Address and LinkedIn are not read there.
+/// </summary>
+record TestPersonalData
+{
+ public string? Name { get; init; }
+ public string? Email { get; init; }
+ public string? Phone { get; init; }
+ public string? Address { get; init; }
+ // NOTE(review): under a snake_case naming policy this binds to "linked_in", not "linkedin" - confirm against the fixture files.
+ public string? LinkedIn { get; init; }
+}
+
+/// <summary>
+/// One employment record of a JSON test CV. StartDate and EndDate are kept as raw strings
+/// and converted with ParseDate by ConvertTestCVData; Achievements is not read there.
+/// </summary>
+record TestEmploymentEntry
+{
+ public string? Company { get; init; }
+ public string? JobTitle { get; init; }
+ public string? StartDate { get; init; }
+ // A null EndDate is what ConvertTestCVData uses to mark the role as current.
+ public string? EndDate { get; init; }
+ public string? Location { get; init; }
+ public string? Description { get; init; }
+ public List? Achievements { get; init; }
+}
+
+/// <summary>
+/// One education record of a JSON test CV. StartDate and EndDate are kept as raw strings
+/// and converted with ParseDate by ConvertTestCVData; Classification is not read there.
+/// </summary>
+record TestEducationEntry
+{
+ public string? Institution { get; init; }
+ public string? Qualification { get; init; }
+ public string? Subject { get; init; }
+ public string? Classification { get; init; }
+ public string? StartDate { get; init; }
+ public string? EndDate { get; init; }
+}
+
class Program
{
+ private static StreamWriter? _logWriter;
+
+ // Serializer settings used when deserializing .json test CVs in Main:
+ // property names are matched case-insensitively, mapped through the snake_case (lower)
+ // naming policy, and enum values are handled by JsonStringEnumConverter.
+ private static readonly JsonSerializerOptions JsonOptions = new()
+ {
+ PropertyNameCaseInsensitive = true,
+ PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower,
+ Converters = { new JsonStringEnumConverter() }
+ };
+
static async Task Main(string[] args)
{
var folderPath = args.FirstOrDefault() ?? AskForFolder();
if (string.IsNullOrEmpty(folderPath) || !Directory.Exists(folderPath))
{
- Console.WriteLine($"Error: Folder not found: {folderPath}");
- Console.WriteLine("Usage: CVBatchTester ");
- Console.WriteLine(" e.g. CVBatchTester /home/user/cvs");
+ Log($"Error: Folder not found: {folderPath}");
+ Log("Usage: CVBatchTester [--output ]");
+ Log(" e.g. CVBatchTester /home/user/cvs");
+ Log(" e.g. CVBatchTester /home/user/cvs --output /tmp/results.log");
return 1;
}
- Console.WriteLine($"CV Batch Verification Tester");
- Console.WriteLine($"Processing CVs from: {folderPath}");
- Console.WriteLine(new string('=', 80));
+ // Check for --output flag
+ var outputIndex = Array.IndexOf(args, "--output");
+ var logPath = outputIndex >= 0 && outputIndex < args.Length - 1
+ ? args[outputIndex + 1]
+ : Path.Combine(folderPath, $"batch-results-{DateTime.Now:yyyyMMdd-HHmmss}.log");
+
+ _logWriter = new StreamWriter(logPath, false) { AutoFlush = true };
+
+ Log($"CV Batch Verification Tester");
+ Log($"Processing CVs from: {folderPath}");
+ Log($"Output log: {logPath}");
+ Log($"Started: {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
+ Log(new string('=', 80));
// Setup DI
var services = new ServiceCollection();
@@ -39,15 +103,16 @@ class Program
var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
.Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase) ||
- f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
+ f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase) ||
+ f.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
.OrderBy(f => f)
.ToList();
- Console.WriteLine($"Found {cvFiles.Count} CV files\n");
+ Log($"Found {cvFiles.Count} CV files\n");
if (cvFiles.Count == 0)
{
- Console.WriteLine("No CV files found (.pdf, .docx, .doc)");
+ Log("No CV files found (.pdf, .docx, .doc, .json)");
return 1;
}
@@ -63,9 +128,9 @@ class Program
foreach (var cvFile in cvFiles)
{
- Console.WriteLine($"\n{new string('=', 80)}");
- Console.WriteLine($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
- Console.WriteLine(new string('=', 80));
+ Log($"\n{new string('=', 80)}");
+ Log($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
+ Log(new string('=', 80));
try
{
@@ -74,17 +139,30 @@ class Program
var companyVerifier = scope.ServiceProvider.GetRequiredService();
var eduVerifier = scope.ServiceProvider.GetRequiredService();
- // Parse CV
- await using var stream = File.OpenRead(cvFile);
- var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
+ // Parse CV - handle JSON files differently
+ CVData cv;
+ if (cvFile.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
+ {
+ var jsonContent = await File.ReadAllTextAsync(cvFile);
+ var testCv = JsonSerializer.Deserialize(jsonContent, JsonOptions)
+ ?? throw new InvalidOperationException("Failed to deserialize JSON CV");
- Console.WriteLine($"Candidate: {cv.FullName}");
+ // Convert TestCVData to CVData
+ cv = ConvertTestCVData(testCv);
+ Log($"Loaded JSON CV: {cv.FullName}");
+ }
+ else
+ {
+ await using var stream = File.OpenRead(cvFile);
+ cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
+ Log($"Parsed CV: {cv.FullName}");
+ }
// Verify Employers
if (cv.Employment?.Count > 0)
{
- Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
- Console.WriteLine(new string('-', 60));
+ Log($"\nEMPLOYERS ({cv.Employment.Count}):");
+ Log(new string('-', 60));
foreach (var emp in cv.Employment)
{
@@ -100,18 +178,18 @@ class Program
var icon = result.IsVerified ? "✓" : "✗";
var period = FormatPeriod(emp.StartDate, emp.EndDate);
- Console.WriteLine($"\n {icon} {emp.CompanyName}");
- Console.WriteLine($" Period: {period}");
- Console.WriteLine($" Role: {emp.JobTitle}");
+ Log($"\n {icon} {emp.CompanyName}");
+ Log($" Period: {period}");
+ Log($" Role: {emp.JobTitle}");
if (result.IsVerified)
{
verifiedEmployers++;
- Console.WriteLine($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
+ Log($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
if (!string.IsNullOrEmpty(result.MatchedCompanyNumber))
- Console.WriteLine($" Company #: {result.MatchedCompanyNumber}");
+ Log($" Company #: {result.MatchedCompanyNumber}");
if (!string.IsNullOrEmpty(result.CompanyStatus))
- Console.WriteLine($" Status: {result.CompanyStatus}");
+ Log($" Status: {result.CompanyStatus}");
}
else
{
@@ -119,12 +197,12 @@ class Program
}
if (!string.IsNullOrEmpty(result.VerificationNotes))
- Console.WriteLine($" Note: {result.VerificationNotes}");
+ Log($" Note: {result.VerificationNotes}");
}
catch (Exception ex)
{
- Console.WriteLine($"\n ✗ {emp.CompanyName}");
- Console.WriteLine($" ERROR: {ex.Message}");
+ Log($"\n ✗ {emp.CompanyName}");
+ Log($" ERROR: {ex.Message}");
allUnverifiedEmployers.Add(emp.CompanyName);
}
}
@@ -133,8 +211,8 @@ class Program
// Verify Education
if (cv.Education?.Count > 0)
{
- Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");
- Console.WriteLine(new string('-', 60));
+ Log($"\nEDUCATION ({cv.Education.Count}):");
+ Log(new string('-', 60));
var eduEntries = cv.Education.Select(e => new EducationEntry
{
@@ -152,10 +230,10 @@ class Program
totalEducation++;
var icon = result.IsVerified ? "✓" : "✗";
- Console.WriteLine($"\n {icon} {result.ClaimedInstitution}");
- Console.WriteLine($" Qualification: {result.ClaimedQualification}");
+ Log($"\n {icon} {result.ClaimedInstitution}");
+ Log($" Qualification: {result.ClaimedQualification}");
if (!string.IsNullOrEmpty(result.ClaimedSubject))
- Console.WriteLine($" Subject: {result.ClaimedSubject}");
+ Log($" Subject: {result.ClaimedSubject}");
if (result.IsVerified)
{
@@ -163,41 +241,41 @@ class Program
if (result.MatchedInstitution != null &&
!result.MatchedInstitution.Equals(result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase))
{
- Console.WriteLine($" Match: {result.MatchedInstitution}");
+ Log($" Match: {result.MatchedInstitution}");
}
}
else
{
allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
- Console.WriteLine($" Status: {result.Status}");
+ Log($" Status: {result.Status}");
}
if (!string.IsNullOrEmpty(result.VerificationNotes))
- Console.WriteLine($" Note: {result.VerificationNotes}");
+ Log($" Note: {result.VerificationNotes}");
}
}
}
catch (Exception ex)
{
errorCount++;
- Console.WriteLine($"ERROR processing file: {ex.Message}");
+ Log($"ERROR processing file: {ex.Message}");
}
}
// Print Summary
- Console.WriteLine($"\n\n{new string('=', 80)}");
- Console.WriteLine("VERIFICATION SUMMARY");
- Console.WriteLine(new string('=', 80));
+ Log($"\n\n{new string('=', 80)}");
+ Log("VERIFICATION SUMMARY");
+ Log(new string('=', 80));
- Console.WriteLine($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}");
+ Log($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}");
if (errorCount > 0)
- Console.WriteLine($"Errors: {errorCount}");
+ Log($"Errors: {errorCount}");
var empRate = totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0;
var eduRate = totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0;
- Console.WriteLine($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)");
- Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)");
+ Log($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)");
+ Log($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)");
// List unverified employers
var uniqueUnverifiedEmployers = allUnverifiedEmployers
@@ -208,12 +286,12 @@ class Program
if (uniqueUnverifiedEmployers.Count > 0)
{
- Console.WriteLine($"\n{new string('-', 60)}");
- Console.WriteLine($"UNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count} unique):");
+ Log($"\n{new string('-', 60)}");
+ Log($"UNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count} unique):");
foreach (var group in uniqueUnverifiedEmployers)
{
var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
- Console.WriteLine($" - {group.Key}{count}");
+ Log($" - {group.Key}{count}");
}
}
@@ -226,19 +304,30 @@ class Program
if (uniqueUnverifiedInstitutions.Count > 0)
{
- Console.WriteLine($"\n{new string('-', 60)}");
- Console.WriteLine($"UNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count} unique):");
+ Log($"\n{new string('-', 60)}");
+ Log($"UNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count} unique):");
foreach (var group in uniqueUnverifiedInstitutions)
{
var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
- Console.WriteLine($" - {group.Key}{count}");
+ Log($" - {group.Key}{count}");
}
}
- Console.WriteLine($"\n{new string('=', 80)}");
+ Log($"\nCompleted: {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
+ Log($"\n{new string('=', 80)}");
+
+ _logWriter?.Close();
+ Console.WriteLine($"\nResults written to: {logPath}");
+
return 0;
}
+    /// <summary>
+    /// Writes a message to the console and, when a log writer has been assigned, to it as well.
+    /// </summary>
+    /// <param name="message">Text to emit as a single line.</param>
+    static void Log(string message)
+    {
+        Console.WriteLine(message);
+
+        // While _logWriter is still null the message only reaches the console.
+        var writer = _logWriter;
+        if (writer is null)
+        {
+            return;
+        }
+
+        writer.WriteLine(message);
+    }
+
static string AskForFolder()
{
Console.Write("Enter CV folder path: ");
@@ -252,6 +341,57 @@ class Program
return $"{startStr} - {endStr}";
}
+    /// <summary>
+    /// Maps a deserialized JSON test CV onto a CVData instance.
+    /// Missing names fall back to "Unknown", missing lists become empty lists, and date
+    /// strings are converted with ParseDate (unparseable values become null).
+    /// Fields with no counterpart in this mapping (profile, address, LinkedIn, achievements,
+    /// classification) are not copied.
+    /// </summary>
+    /// <param name="testCv">The test CV as read from JSON.</param>
+    /// <returns>A populated CVData.</returns>
+    static CVData ConvertTestCVData(TestCVData testCv)
+    {
+        return new CVData
+        {
+            FullName = testCv.Personal?.Name ?? "Unknown",
+            Email = testCv.Personal?.Email,
+            Phone = testCv.Personal?.Phone,
+            Employment = testCv.Employment?.Select(e => new EmploymentEntry
+            {
+                CompanyName = e.Company ?? "Unknown",
+                JobTitle = e.JobTitle ?? "Unknown",
+                Location = e.Location,
+                StartDate = ParseDate(e.StartDate),
+                EndDate = ParseDate(e.EndDate),
+                // A missing, empty or blank end date all mean "still employed"; checking only
+                // for null left "" with EndDate == null but IsCurrent == false.
+                IsCurrent = string.IsNullOrWhiteSpace(e.EndDate),
+                Description = e.Description
+            }).ToList() ?? [],
+            Education = testCv.Education?.Select(e => new EducationEntry
+            {
+                Institution = e.Institution ?? "Unknown",
+                Qualification = e.Qualification,
+                Subject = e.Subject,
+                StartDate = ParseDate(e.StartDate),
+                EndDate = ParseDate(e.EndDate)
+            }).ToList() ?? [],
+            Skills = testCv.Skills ?? []
+        };
+    }
+
+    /// <summary>
+    /// Parses a date string from a test CV.
+    /// </summary>
+    /// <param name="dateStr">
+    /// Either "YYYY-MM" (mapped to the first day of that month) or any format accepted by
+    /// <see cref="DateOnly.TryParse(string, out DateOnly)"/>. May be null or empty.
+    /// </param>
+    /// <returns>The parsed date, or <c>null</c> when the input is null, empty or not a valid date.</returns>
+    static DateOnly? ParseDate(string? dateStr)
+    {
+        if (string.IsNullOrEmpty(dateStr)) return null;
+
+        // Try parsing YYYY-MM format. The year and month are range-checked before constructing
+        // the value because new DateOnly(...) throws ArgumentOutOfRangeException for inputs
+        // such as "2020-13" or "0000-05"; those now fall through and end up as null.
+        if (dateStr.Length == 7 && dateStr[4] == '-' &&
+            int.TryParse(dateStr[..4], out var year) && year is >= 1 and <= 9999 &&
+            int.TryParse(dateStr[5..], out var month) && month is >= 1 and <= 12)
+        {
+            return new DateOnly(year, month, 1);
+        }
+
+        // Try standard parsing
+        if (DateOnly.TryParse(dateStr, out var date))
+        {
+            return date;
+        }
+
+        return null;
+    }
+
static void ConfigureServices(IServiceCollection services)
{
// Load configuration - try multiple locations
@@ -263,7 +403,7 @@ class Program
};
var webProjectPath = configPaths.FirstOrDefault(Directory.Exists) ?? "/git/RealCV/src/RealCV.Web";
- Console.WriteLine($"Loading config from: {webProjectPath}");
+ Log($"Loading config from: {webProjectPath}");
var configuration = new ConfigurationBuilder()
.SetBasePath(webProjectPath)
@@ -272,11 +412,14 @@ class Program
.AddJsonFile("appsettings.Production.json", optional: true)
.Build();
- // Logging - minimal output
+ // Logging - show info level for verification details
services.AddLogging(builder =>
{
builder.AddConsole();
- builder.SetMinimumLevel(LogLevel.Warning);
+ builder.SetMinimumLevel(LogLevel.Information);
+ // Filter out noisy libraries
+ builder.AddFilter("Microsoft", LogLevel.Warning);
+ builder.AddFilter("System", LogLevel.Warning);
});
// Database