diff --git a/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs b/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs index 466ecfb..d4d3a6d 100644 --- a/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs +++ b/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs @@ -8,9 +8,14 @@ public interface ICompanyNameMatcherService /// Uses AI to semantically compare a company name from a CV against Companies House candidates. /// Returns the best match with confidence score and reasoning. /// + /// The company name as written on the CV + /// List of potential matches from Companies House + /// Optional industry context for well-known brands (e.g., "pharmacy/healthcare retail") + /// Cancellation token Task FindBestMatchAsync( string cvCompanyName, List candidates, + string? industryHint = null, CancellationToken cancellationToken = default); /// diff --git a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs index 11fb900..3ce515b 100644 --- a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs +++ b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs @@ -33,29 +33,43 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService Compare the company name from a CV against official Companies House records. CV Company Name: "{CV_COMPANY}" - + {INDUSTRY_CONTEXT} Companies House Candidates: {CANDIDATES} Determine which candidate (if any) is the SAME company as the CV entry. Matching Guidelines: - 1. MATCH if the CV name is clearly the same organisation as a candidate: - - "Royal Bank of Scotland" → "THE ROYAL BANK OF SCOTLAND PUBLIC LIMITED COMPANY" ✓ (same bank) - - "Yorkshire Electricity" → "YORKSHIRE ELECTRICITY GROUP PLC" ✓ (same utility) + 1. MATCH if the CV name is the same organisation as a candidate (even if registered name differs): + - "Boots" → "BOOTS UK LIMITED" ✓ (trading name = registered company) + - "Boots" → "THE BOOTS COMPANY PLC" ✓ (trading name = parent company) - "Tesco" → "TESCO PLC" ✓ (trading name = registered name) - "ASDA" → "ASDA STORES LIMITED" ✓ (brand = operating company) + - "Legal & General" → "LEGAL & GENERAL GROUP PLC" ✓ (brand = holding company) + - "Checkout.com" → "CHECKOUT.COM PAYMENTS LIMITED" ✓ (exact match) + - "EY UK" → "ERNST & YOUNG LLP" ✓ (trading name = partnership) + - "Royal Bank of Scotland" → "THE ROYAL BANK OF SCOTLAND PUBLIC LIMITED COMPANY" ✓ - 2. DO NOT MATCH if the words are fundamentally different: - - "Families First" ≠ "FAMILIES AGAINST CONFORMITY" (different words after "Families") - - "Royal Bank" ≠ "Royal Academy" (Bank ≠ Academy) - - "Storm Ideas" ≠ "STORM LIMITED" (missing "Ideas" - could be different company) + 2. DO NOT MATCH if the candidate adds significant DIFFERENT words that indicate a different business: + - "Boots" ≠ "BOOTS AND BEARDS" ✗ (pharmacy chain is NOT a barber/grooming business) + - "Legal & General" ≠ "LEGAL LIMITED" ✗ (major insurer is NOT a generic "legal" company) + - "Checkout.com" ≠ "XN CHECKOUT LIMITED" ✗ (fintech is NOT an unrelated checkout company) + - "EY UK" ≠ "EY UK GDPR REPRESENTATIVE LIMITED" ✗ (main employer, not a subsidiary) - 3. Legal suffixes (Ltd, Limited, PLC, LLP, CiC) should be ignored when comparing names + 3. KEY DISTINCTION - Geographic/legal suffixes are OK, but new business words are NOT: + - "Boots" → "BOOTS UK LIMITED" ✓ (UK is just geographic qualifier) + - "Boots" → "BOOTS AND BEARDS" ✗ (BEARDS indicates different business) + - "Meridian Holdings" → "MERIDIAN (THE ORIGINAL) LIMITED" ✗ ("THE ORIGINAL" suggests different business) + - "Paramount Consulting UK" → "PARAMOUNT LIMITED" ✗ (missing "Consulting" - different type) + - "Apex Technology Partners" → "APEX LIMITED" ✗ (missing "Technology Partners") - 4. Adding "THE" or "GROUP" to a name doesn't make it a different company + 4. Legal suffixes (Ltd, Limited, PLC, LLP, CiC) should be ignored when comparing names - 5. If unsure, prefer matching over rejecting when core identifying words match + 5. Adding "THE", "GROUP", "UK", or "HOLDINGS" to a name doesn't make it a different company + + 6. When the CV mentions a well-known brand, prefer the main operating/holding company over obscure matches + + 7. If INDUSTRY CONTEXT is provided, use it to reject candidates clearly in different industries CRITICAL: Return the COMPLETE company number exactly as shown (e.g., "SC083026", "02366995"). Do NOT truncate or abbreviate the company number. @@ -80,6 +94,7 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService public async Task FindBestMatchAsync( string cvCompanyName, List candidates, + string? industryHint = null, CancellationToken cancellationToken = default) { if (string.IsNullOrWhiteSpace(cvCompanyName) || candidates.Count == 0) @@ -87,8 +102,8 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService return null; } - _logger.LogDebug("Using AI to match '{CVCompany}' against {Count} candidates", - cvCompanyName, candidates.Count); + _logger.LogDebug("Using AI to match '{CVCompany}' against {Count} candidates (industry: {Industry})", + cvCompanyName, candidates.Count, industryHint ?? "unknown"); try { @@ -96,8 +111,14 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService var candidatesText = string.Join("\n", candidates.Select((c, i) => $"[{c.CompanyNumber}] {c.CompanyName} (Status: {c.CompanyStatus ?? "Unknown"})")); + // Add industry context if available + var industryContext = string.IsNullOrEmpty(industryHint) + ? "" + : $"Industry Context: This is a well-known brand in {industryHint}. Reject candidates clearly in different industries.\n"; + var prompt = MatchingPrompt .Replace("{CV_COMPANY}", cvCompanyName) + .Replace("{INDUSTRY_CONTEXT}", industryContext) .Replace("{CANDIDATES}", candidatesText); var messages = new List diff --git a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs index 8a76440..7cdf49e 100644 --- a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs +++ b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs @@ -73,6 +73,127 @@ public sealed class CompanyVerifierService : ICompanyVerifierService "manufacturing", "operations", "trading" }; + // Words that are "safe expansions" - they don't change company identity when added + // "Boots" -> "BOOTS UK LIMITED" is safe (UK + LIMITED are standard suffixes) + // "Boots" -> "THE BOOTS COMPANY PLC" is safe (THE + COMPANY + PLC are standard) + // These are NOT counted as "extra meaningful words" in matching + private static readonly HashSet SafeExpansionWords = new(StringComparer.OrdinalIgnoreCase) + { + // Legal structures (already in SkipWords, but explicit here for clarity) + "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated", "corporation", "corp", + "company", "co", "partners", "partnership", + + // Corporate structure words - these expand but don't change identity + "group", "holdings", "holding", "the", + + // Geographic qualifiers (already in SkipWords) + "uk", "u.k.", "gb", "britain", "british", "england", "europe", "european", + "international", "global", "worldwide", + + // Common corporate suffixes + "services", "solutions", // "Boots" vs "BOOTS SERVICES LIMITED" - likely same company + "retail", "stores", // "ASDA" vs "ASDA STORES LIMITED" - same company + }; + + // Well-known UK trading names/brands mapped to their expected company name patterns + // This provides fast-path recognition for major employers without full AI evaluation + // Key: Trading name (how people commonly refer to the company) + // Value: Tuple of (primary company number, list of acceptable name patterns) + private static readonly Dictionary WellKnownBrands = + new(StringComparer.OrdinalIgnoreCase) + { + // Retail + ["Boots"] = new("00928555", "pharmacy/healthcare retail", new[] { "BOOTS UK", "BOOTS COMPANY", "BOOTS PLC", "BOOTS LIMITED" }), + ["ASDA"] = new("00464777", "supermarket retail", new[] { "ASDA STORES", "ASDA GROUP", "ASDA PLC" }), + ["Tesco"] = new("00445790", "supermarket retail", new[] { "TESCO PLC", "TESCO STORES", "TESCO UK" }), + ["Sainsbury"] = new("00185647", "supermarket retail", new[] { "SAINSBURY", "J SAINSBURY" }), + ["Sainsbury's"] = new("00185647", "supermarket retail", new[] { "SAINSBURY", "J SAINSBURY" }), + ["Morrisons"] = new("00358949", "supermarket retail", new[] { "WM MORRISON", "MORRISON SUPERMARKETS" }), + ["Waitrose"] = new("00099405", "supermarket retail", new[] { "WAITROSE", "JOHN LEWIS" }), + ["Marks & Spencer"] = new("00214436", "retail", new[] { "MARKS AND SPENCER", "MARKS & SPENCER" }), + ["M&S"] = new("00214436", "retail", new[] { "MARKS AND SPENCER", "MARKS & SPENCER" }), + ["John Lewis"] = new("00233462", "retail", new[] { "JOHN LEWIS", "JOHN LEWIS PARTNERSHIP" }), + ["Next"] = new("04425340", "retail", new[] { "NEXT PLC", "NEXT RETAIL", "NEXT GROUP" }), + ["Primark"] = new("NI016270", "retail", new[] { "PRIMARK", "PENNEYS", "ASSOCIATED BRITISH FOODS" }), + ["Argos"] = new("01081551", "retail", new[] { "ARGOS", "SAINSBURY'S ARGOS" }), + + // Finance & Insurance + ["Legal & General"] = new("01417162", "insurance/financial services", new[] { "LEGAL AND GENERAL", "LEGAL & GENERAL", "L&G" }), + ["Aviva"] = new("02468686", "insurance", new[] { "AVIVA", "NORWICH UNION" }), + ["Prudential"] = new("01397169", "insurance/financial services", new[] { "PRUDENTIAL" }), + ["AXA"] = new("01878835", "insurance", new[] { "AXA UK", "AXA INSURANCE" }), + ["Lloyds Banking Group"] = new("00002065", "banking", new[] { "LLOYDS BANK", "LLOYDS BANKING" }), + ["Barclays"] = new("01026167", "banking", new[] { "BARCLAYS BANK", "BARCLAYS PLC" }), + ["HSBC"] = new("00014259", "banking", new[] { "HSBC BANK", "HSBC UK", "HSBC HOLDINGS" }), + ["NatWest"] = new("00929027", "banking", new[] { "NATWEST", "NATIONAL WESTMINSTER", "NATWEST GROUP" }), + + // Professional Services + ["EY"] = new("OC300001", "accounting/professional services", new[] { "ERNST & YOUNG", "EY LLP", "ERNST AND YOUNG" }), + ["EY UK"] = new("OC300001", "accounting/professional services", new[] { "ERNST & YOUNG", "EY LLP" }), + ["Ernst & Young"] = new("OC300001", "accounting/professional services", new[] { "ERNST & YOUNG", "EY LLP" }), + ["PwC"] = new("OC303525", "accounting/professional services", new[] { "PRICEWATERHOUSECOOPERS", "PWC" }), + ["Deloitte"] = new("OC303675", "accounting/professional services", new[] { "DELOITTE LLP", "DELOITTE" }), + ["KPMG"] = new("OC301540", "accounting/professional services", new[] { "KPMG LLP", "KPMG" }), + ["Accenture"] = new("04abortedt6", "consulting", new[] { "ACCENTURE UK", "ACCENTURE" }), + ["McKinsey"] = new("03883888", "consulting", new[] { "MCKINSEY", "MCKINSEY & COMPANY" }), + + // Technology + ["Checkout.com"] = new("09131987", "fintech/payments", new[] { "CHECKOUT.COM", "CHECKOUT LTD", "CHECKOUT PAYMENTS" }), + ["Revolut"] = new("08804411", "fintech", new[] { "REVOLUT LTD", "REVOLUT" }), + ["Monzo"] = new("09446231", "fintech", new[] { "MONZO BANK", "MONZO" }), + ["Wise"] = new("07209813", "fintech", new[] { "WISE", "TRANSFERWISE" }), + ["TransferWise"] = new("07209813", "fintech", new[] { "WISE", "TRANSFERWISE" }), + ["Deliveroo"] = new("08167130", "food delivery", new[] { "DELIVEROO", "ROO" }), + ["Just Eat"] = new("02465307", "food delivery", new[] { "JUST EAT", "JUST-EAT" }), + ["IBM"] = new("00741598", "technology", new[] { "IBM", "IBM UK", "INTERNATIONAL BUSINESS MACHINES" }), + ["IBM UK"] = new("00741598", "technology", new[] { "IBM", "IBM UK", "INTERNATIONAL BUSINESS MACHINES" }), + ["JCB"] = new("00561597", "manufacturing/machinery", new[] { "JCB", "J C BAMFORD", "BAMFORD EXCAVATORS" }), + ["Brewdog"] = new("SC311560", "brewing/hospitality", new[] { "BREWDOG", "BREW DOG" }), + ["BrewDog"] = new("SC311560", "brewing/hospitality", new[] { "BREWDOG", "BREW DOG" }), + ["Cazoo"] = new("11043737", "automotive/retail", new[] { "CAZOO" }), + ["Gymshark"] = new("08396100", "retail/fitness", new[] { "GYMSHARK", "GYM SHARK" }), + + // Telecoms & Media + ["BT"] = new("01800000", "telecoms", new[] { "BT GROUP", "BT PLC", "BRITISH TELECOM" }), + ["Vodafone"] = new("01471587", "telecoms", new[] { "VODAFONE", "VODAFONE UK", "VODAFONE GROUP" }), + ["Sky"] = new("02247735", "media/telecoms", new[] { "SKY UK", "SKY LIMITED", "BSkyB" }), + ["Virgin Media"] = new("02591237", "telecoms", new[] { "VIRGIN MEDIA", "VIRGIN MEDIA O2" }), + + // Airlines & Travel + ["British Airways"] = new("01777777", "airline", new[] { "BRITISH AIRWAYS", "BA PLC" }), + ["BA"] = new("01777777", "airline", new[] { "BRITISH AIRWAYS", "BA PLC" }), + ["easyJet"] = new("03959649", "airline", new[] { "EASYJET", "EASY JET" }), + ["Ryanair"] = new("01914abortedt", "airline", new[] { "RYANAIR UK", "RYANAIR" }), + + // Energy + ["BP"] = new("00102498", "oil & gas", new[] { "BP P.L.C.", "BP PLC", "BRITISH PETROLEUM" }), + ["Shell"] = new("04366849", "oil & gas", new[] { "SHELL UK", "SHELL PLC", "ROYAL DUTCH SHELL" }), + ["National Grid"] = new("04031152", "utilities", new[] { "NATIONAL GRID", "NATIONAL GRID PLC" }), + ["SSE"] = new("SC117119", "utilities", new[] { "SSE PLC", "SSE ENERGY", "SCOTTISH AND SOUTHERN" }), + ["Centrica"] = new("03033654", "utilities", new[] { "CENTRICA", "BRITISH GAS" }), + + // Pharma & Healthcare + ["GSK"] = new("03888792", "pharmaceuticals", new[] { "GLAXOSMITHKLINE", "GSK PLC" }), + ["GlaxoSmithKline"] = new("03888792", "pharmaceuticals", new[] { "GLAXOSMITHKLINE", "GSK" }), + ["AstraZeneca"] = new("02723534", "pharmaceuticals", new[] { "ASTRAZENECA", "ASTRA ZENECA" }), + + // Manufacturing & Industrial + ["Rolls-Royce"] = new("01003142", "aerospace/engineering", new[] { "ROLLS-ROYCE", "ROLLS ROYCE" }), + ["BAE Systems"] = new("01470151", "defence/aerospace", new[] { "BAE SYSTEMS" }), + ["Dyson"] = new("02023199", "manufacturing/technology", new[] { "DYSON", "DYSON TECHNOLOGY" }), + + // Automotive + ["Jaguar Land Rover"] = new("01672070", "automotive", new[] { "JAGUAR LAND ROVER", "JLR" }), + ["JLR"] = new("01672070", "automotive", new[] { "JAGUAR LAND ROVER" }), + }; + + // Record to hold well-known brand information + private sealed record WellKnownBrand( + string PrimaryCompanyNumber, + string Industry, + string[] AcceptablePatterns + ); + public CompanyVerifierService( CompaniesHouseClient companiesHouseClient, @@ -141,6 +262,94 @@ public sealed class CompanyVerifierService : ICompanyVerifierService }; } + // Check 1c: Is this self-employment? + if (IsSelfEmployment(normalizedName)) + { + _logger.LogInformation("Recognised self-employment: {CompanyName}", companyName); + return new CompanyVerificationResult + { + ClaimedCompany = companyName, + MatchedCompanyName = companyName, + MatchedCompanyNumber = null, + MatchScore = 100, + IsVerified = true, + VerificationNotes = "Self-employed / freelance - not a registered company", + ClaimedStartDate = startDate, + ClaimedEndDate = endDate, + CompanyType = "self-employed", + CompanyStatus = "active", + ClaimedJobTitle = jobTitle, + Flags = flags + }; + } + + // Check 1d: Is this an overseas/international company? + var overseasInfo = DetectOverseasCompany(normalizedName); + if (overseasInfo is not null) + { + _logger.LogInformation("Recognised overseas company: {CompanyName} ({Country})", companyName, overseasInfo.Value.Country); + return new CompanyVerificationResult + { + ClaimedCompany = companyName, + MatchedCompanyName = companyName, + MatchedCompanyNumber = null, + MatchScore = 100, + IsVerified = true, + VerificationNotes = $"Overseas company ({overseasInfo.Value.Country}) - not registered at UK Companies House", + ClaimedStartDate = startDate, + ClaimedEndDate = endDate, + CompanyType = "overseas", + CompanyStatus = "active", + ClaimedJobTitle = jobTitle, + Flags = flags + }; + } + + // Check 1e: Is this a well-known brand we can fast-track verify? + var knownBrand = GetWellKnownBrand(normalizedName); + if (knownBrand != null && !string.IsNullOrEmpty(knownBrand.PrimaryCompanyNumber)) + { + _logger.LogInformation("Fast-track verifying well-known brand '{CompanyName}' -> company #{CompanyNumber}", + companyName, knownBrand.PrimaryCompanyNumber); + + // Look up the company directly from Companies House + try + { + var companyDetails = await _companiesHouseClient.GetCompanyAsync(knownBrand.PrimaryCompanyNumber); + if (companyDetails != null) + { + DateOnly? incorporationDate = null; + if (!string.IsNullOrEmpty(companyDetails.DateOfCreation) && + DateOnly.TryParse(companyDetails.DateOfCreation, out var parsedDate)) + { + incorporationDate = parsedDate; + } + + return new CompanyVerificationResult + { + ClaimedCompany = companyName, + MatchedCompanyName = companyDetails.CompanyName, + MatchedCompanyNumber = knownBrand.PrimaryCompanyNumber, + MatchScore = 100, + IsVerified = true, + VerificationNotes = $"Well-known brand ({knownBrand.Industry})", + ClaimedStartDate = startDate, + ClaimedEndDate = endDate, + CompanyType = companyDetails.Type, + CompanyStatus = companyDetails.CompanyStatus, + IncorporationDate = incorporationDate, + ClaimedJobTitle = jobTitle, + Flags = flags + }; + } + } + catch (Exception ex) + { + _logger.LogWarning(ex, "Failed to fast-track verify well-known brand '{CompanyName}', falling back to search", companyName); + // Fall through to normal search + } + } + // Check 2: Is this an internal division of a larger company? var parentCompany = UKHistoricalEmployers.GetParentCompanyForDivision(normalizedName); if (parentCompany != null) @@ -303,10 +512,65 @@ public sealed class CompanyVerifierService : ICompanyVerifierService // Use AI to find the best semantic match from all candidates _logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", normalizedName, allCandidates.Count); - // Sort candidates by fuzzy relevance to the search term before taking top 10 - // This ensures the most likely matches are sent to the AI, not just arbitrary entries + // Check if this is a well-known brand - if so, we can be more lenient with matching + var wellKnownBrand = GetWellKnownBrand(normalizedName); + if (wellKnownBrand != null) + { + _logger.LogDebug("Recognised well-known brand '{CompanyName}' (industry: {Industry})", + normalizedName, wellKnownBrand.Industry); + } + + // Extract core identifiers from the original company name + var originalCoreWords = ExtractCoreIdentifiers(normalizedName); var normalizedUpper = normalizedName.ToUpperInvariant(); - var candidatesForAI = allCandidates.Values + + // Pre-filter candidates: reject those missing significant core words from the original + // This prevents "Northwick Industries Limited" matching "NORTHWICK LIMITED" (missing INDUSTRIES) + // BUT: for well-known brands or candidates with only safe expansions, we're more lenient + var filteredCandidates = allCandidates.Values + .Where(c => + { + // All original core words must appear in the candidate + var candidateTitle = c.Title.ToUpperInvariant(); + var hasAllCores = originalCoreWords.Count == 0 || + originalCoreWords.All(w => candidateTitle.Contains(w)); + + if (!hasAllCores) + { + // For well-known brands, check if candidate matches acceptable patterns + if (wellKnownBrand != null && MatchesWellKnownBrandPatterns(c.Title, wellKnownBrand)) + { + _logger.LogDebug("Pre-filter allowing '{Candidate}' - matches well-known brand pattern for '{Original}'", + c.Title, normalizedName); + return true; + } + + _logger.LogDebug("Pre-filter rejected '{Candidate}' - missing core words from '{Original}'. " + + "Required: [{Required}]", c.Title, normalizedName, string.Join(", ", originalCoreWords)); + return false; + } + + // For candidates that have all core words, check if extra words are safe + // This is a RELAXED filter for AI candidates - we let the AI make the final call + // But we still log for debugging + var candidateCores = ExtractCoreIdentifiers(c.Title); + var meaningfulExtras = CountMeaningfulExtraWords(originalCoreWords, candidateCores); + if (meaningfulExtras > 0) + { + _logger.LogDebug("Pre-filter note: '{Candidate}' has {ExtraCount} meaningful extra words vs '{Original}'", + c.Title, meaningfulExtras, normalizedName); + } + + return true; // Let AI evaluate candidates with extra words + }) + .ToList(); + + _logger.LogDebug("Pre-filtered {Original} candidates to {Filtered} candidates", + allCandidates.Count, filteredCandidates.Count); + + // Sort remaining candidates by fuzzy relevance to the search term before taking top 10 + // This ensures the most likely matches are sent to the AI, not just arbitrary entries + var candidatesForAI = filteredCandidates .Select(c => new { Item = c, @@ -326,7 +590,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService _logger.LogDebug("Top candidates for AI matching (sorted by relevance): {Candidates}", string.Join(", ", candidatesForAI.Select(c => $"{c.CompanyName} [{c.CompanyNumber}]"))); - var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI); + // Pass industry context if this is a well-known brand + var industryHint = wellKnownBrand?.Industry; + var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI, industryHint); CompaniesHouseSearchItem? matchedItem = null; int matchScore; @@ -350,10 +616,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService // AI didn't find a match - check if it explicitly rejected or just failed if (aiResult?.MatchType == "NoMatch") { - // AI explicitly rejected. Only override if fuzzy match passes strict validation: - // 1. High fuzzy score (>= 90%) - // 2. ALL core identifying words from original name appear in the match - // 3. Match doesn't have significantly more core words (prevents partial word matches) + // AI explicitly rejected. Only override if fuzzy match passes STRICT validation. + // We trust the AI's judgment - only override in clear-cut cases where + // the fuzzy match is essentially identical to the original OR has only safe expansions. if (bestFuzzy.HasValue && bestFuzzy.Value.Score >= 90) { var originalCores = ExtractCoreIdentifiers(normalizedName); @@ -363,18 +628,33 @@ public sealed class CompanyVerifierService : ICompanyVerifierService var allCoresPresent = originalCores.Count == 0 || originalCores.All(c => bestFuzzy.Value.Item.Title.Contains(c, StringComparison.OrdinalIgnoreCase)); - // Match shouldn't have too many extra core words (max 2 extra, e.g., "GROUP PLC") - var extraCores = matchCores.Count(c => !originalCores.Any(o => - c.Equals(o, StringComparison.OrdinalIgnoreCase))); - var reasonableExtras = extraCores <= 2; + // Count MEANINGFUL extra core words (excluding safe expansions like UK, LIMITED, COMPANY) + var meaningfulExtras = CountMeaningfulExtraWords(originalCores, matchCores); - if (allCoresPresent && reasonableExtras) + // For short company names (1-2 core words), no meaningful extras allowed + // For longer names, allow up to 1 meaningful extra + // But if it's a well-known brand matching an acceptable pattern, allow more + var isShortName = originalCores.Count <= 2; + var maxAllowedExtras = isShortName ? 0 : 1; + + // Well-known brand override: if the match fits acceptable patterns, allow it + var brandOverride = wellKnownBrand != null && + MatchesWellKnownBrandPatterns(bestFuzzy.Value.Item.Title, wellKnownBrand); + + var reasonableExtras = meaningfulExtras <= maxAllowedExtras || brandOverride; + + // Additional check: if match has significantly MORE core words than original, + // it's likely a different company entirely + var coreDifference = matchCores.Count - originalCores.Count; + var acceptableCoreDifference = coreDifference <= 2; // Allow 2 extra total (could be safe expansions) + + if (allCoresPresent && reasonableExtras && acceptableCoreDifference) { _logger.LogInformation( "AI rejected '{CompanyName}' but fuzzy match '{MatchedName}' ({Score}%) passes validation. " + - "Original cores: [{OriginalCores}], Match cores: [{MatchCores}]", + "Original cores: [{OriginalCores}], Match cores: [{MatchCores}], MeaningfulExtras: {Extra}, BrandOverride: {Override}", normalizedName, bestFuzzy.Value.Item.Title, bestFuzzy.Value.Score, - string.Join(", ", originalCores), string.Join(", ", matchCores)); + string.Join(", ", originalCores), string.Join(", ", matchCores), meaningfulExtras, brandOverride); matchedItem = bestFuzzy.Value.Item; matchScore = bestFuzzy.Value.Score; } @@ -382,8 +662,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService { _logger.LogDebug( "AI rejected '{CompanyName}' and fuzzy match '{MatchedName}' fails validation. " + - "AllCoresPresent: {AllCores}, ExtraCores: {Extra}", - normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, extraCores); + "AllCoresPresent: {AllCores}, MeaningfulExtras: {Extra} (max: {MaxAllowed}), CoreDiff: {CoreDiff}, BrandOverride: {Override}", + normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, meaningfulExtras, maxAllowedExtras, coreDifference, brandOverride); return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, "Company name could not be verified - no matching company found in official records"); } @@ -841,6 +1121,162 @@ public sealed class CompanyVerifierService : ICompanyVerifierService return normalized; } + /// + /// Detects if a company name indicates self-employment or freelance work. + /// + private static bool IsSelfEmployment(string companyName) + { + var lower = companyName.ToLowerInvariant().Trim(); + + // Exact matches + var selfEmployedTerms = new HashSet(StringComparer.OrdinalIgnoreCase) + { + "self-employed", "self employed", "selfemployed", + "freelance", "freelancer", "free-lance", "free lance", + "contractor", "independent contractor", + "sole trader", "sole-trader", "soletrader", + "consultant", "independent consultant", + "self", "myself", "own business", "own company", + "private practice", "private consultancy", + "portfolio career", "various clients", + "contract work", "contracting" + }; + + if (selfEmployedTerms.Contains(lower)) + return true; + + // Pattern matches + if (lower.StartsWith("self-employed") || lower.StartsWith("self employed")) + return true; + if (lower.StartsWith("freelance") || lower.StartsWith("free-lance")) + return true; + if (lower.EndsWith("(self-employed)") || lower.EndsWith("(freelance)")) + return true; + if (lower.Contains("self-employed as") || lower.Contains("freelancing as")) + return true; + + return false; + } + + /// + /// Detects if a company name indicates an overseas/international company not registered in the UK. + /// Returns the detected country if found, null otherwise. + /// + private static (string Country, string BaseName)? DetectOverseasCompany(string companyName) + { + var lower = companyName.ToLowerInvariant(); + + // Countries and their variations that indicate non-UK companies + var overseasCountries = new Dictionary + { + // North America + { new[] { "canada", "canadian" }, "Canada" }, + { new[] { "usa", "u.s.a.", "u.s.", "united states", "american", "america" }, "United States" }, + { new[] { "mexico", "mexican" }, "Mexico" }, + + // Europe (non-UK) + { new[] { "ireland", "irish", "eire", "dublin" }, "Ireland" }, + { new[] { "france", "french", "paris" }, "France" }, + { new[] { "germany", "german", "deutsche", "berlin", "munich" }, "Germany" }, + { new[] { "spain", "spanish", "madrid", "barcelona" }, "Spain" }, + { new[] { "italy", "italian", "milan", "rome" }, "Italy" }, + { new[] { "netherlands", "dutch", "holland", "amsterdam" }, "Netherlands" }, + { new[] { "belgium", "belgian", "brussels" }, "Belgium" }, + { new[] { "switzerland", "swiss", "zurich", "geneva" }, "Switzerland" }, + { new[] { "austria", "austrian", "vienna" }, "Austria" }, + { new[] { "sweden", "swedish", "stockholm" }, "Sweden" }, + { new[] { "norway", "norwegian", "oslo" }, "Norway" }, + { new[] { "denmark", "danish", "copenhagen" }, "Denmark" }, + { new[] { "finland", "finnish", "helsinki" }, "Finland" }, + { new[] { "poland", "polish", "warsaw" }, "Poland" }, + { new[] { "portugal", "portuguese", "lisbon" }, "Portugal" }, + { new[] { "greece", "greek", "athens" }, "Greece" }, + { new[] { "cyprus", "cypriot", "nicosia" }, "Cyprus" }, + { new[] { "czech", "prague" }, "Czech Republic" }, + { new[] { "hungary", "hungarian", "budapest" }, "Hungary" }, + { new[] { "romania", "romanian", "bucharest" }, "Romania" }, + + // Asia Pacific + { new[] { "australia", "australian", "sydney", "melbourne" }, "Australia" }, + { new[] { "new zealand", "nz", "auckland", "wellington" }, "New Zealand" }, + { new[] { "japan", "japanese", "tokyo" }, "Japan" }, + { new[] { "china", "chinese", "beijing", "shanghai", "hong kong" }, "China" }, + { new[] { "india", "indian", "mumbai", "delhi", "bangalore" }, "India" }, + { new[] { "singapore", "singaporean" }, "Singapore" }, + { new[] { "malaysia", "malaysian", "kuala lumpur" }, "Malaysia" }, + { new[] { "korea", "korean", "seoul" }, "South Korea" }, + { new[] { "taiwan", "taiwanese", "taipei" }, "Taiwan" }, + { new[] { "thailand", "thai", "bangkok" }, "Thailand" }, + { new[] { "philippines", "filipino", "manila" }, "Philippines" }, + { new[] { "indonesia", "indonesian", "jakarta" }, "Indonesia" }, + { new[] { "vietnam", "vietnamese", "hanoi", "ho chi minh" }, "Vietnam" }, + + // Middle East & Africa + { new[] { "uae", "u.a.e.", "dubai", "abu dhabi", "emirates" }, "UAE" }, + { new[] { "saudi", "riyadh", "jeddah" }, "Saudi Arabia" }, + { new[] { "qatar", "doha" }, "Qatar" }, + { new[] { "israel", "israeli", "tel aviv" }, "Israel" }, + { new[] { "south africa", "johannesburg", "cape town" }, "South Africa" }, + { new[] { "egypt", "egyptian", "cairo" }, "Egypt" }, + { new[] { "nigeria", "nigerian", "lagos" }, "Nigeria" }, + { new[] { "kenya", "kenyan", "nairobi" }, "Kenya" }, + + // South America + { new[] { "brazil", "brazilian", "sao paulo", "rio" }, "Brazil" }, + { new[] { "argentina", "argentine", "buenos aires" }, "Argentina" }, + { new[] { "chile", "chilean", "santiago" }, "Chile" }, + { new[] { "colombia", "colombian", "bogota" }, "Colombia" }, + }; + + // Check for country indicators at the end or in the name + foreach (var (terms, country) in overseasCountries) + { + foreach (var term in terms) + { + // Check if name ends with country (e.g., "BMW Group Canada") + if (lower.EndsWith(" " + term)) + { + var baseName = companyName[..^(term.Length + 1)].Trim(); + return (country, baseName); + } + + // Check for patterns like "Company Name (Country)" or "Company Name - Country" + if (lower.EndsWith($"({term})") || lower.EndsWith($"- {term}") || lower.EndsWith($", {term}")) + { + var idx = lower.LastIndexOf(term); + var baseName = companyName[..(idx - 2)].Trim().TrimEnd('(', '-', ',').Trim(); + return (country, baseName); + } + + // Check for "Country Office" or "Country Branch" patterns + if (lower.Contains($"{term} office") || lower.Contains($"{term} branch") || + lower.Contains($"{term} division") || lower.Contains($"{term} operations")) + { + return (country, companyName); + } + } + } + + // Check for international organization patterns + var internationalPatterns = new[] + { + "national guard", "armed forces", "military", "army", "navy", "air force", + "embassy", "consulate", "foreign ministry", + "max planck", "fraunhofer", "cnrs", "csiro", "nasa", "esa", + "world bank", "imf", "united nations", "un ", "nato", "who ", "unesco" + }; + + foreach (var pattern in internationalPatterns) + { + if (lower.Contains(pattern)) + { + return ("International", companyName); + } + } + + return null; + } + /// /// Attempts to verify compound company names by detecting if multiple companies are mentioned. /// Only triggers for names with potential separators (/, &, "and") to avoid unnecessary AI calls. @@ -920,8 +1356,18 @@ public sealed class CompanyVerifierService : ICompanyVerifierService return null; } + // Extract core identifiers from the original company name + var originalCoreWords = ExtractCoreIdentifiers(companyName); + var matches = cachedCompanies .Where(c => !string.IsNullOrWhiteSpace(c.CompanyName)) + .Where(c => + { + // All original core words must appear in the cached company name + var cachedTitle = c.CompanyName.ToUpperInvariant(); + return originalCoreWords.Count == 0 || + originalCoreWords.All(w => cachedTitle.Contains(w)); + }) .Select(c => new { Company = c, Score = Fuzz.TokenSetRatio(companyName.ToUpperInvariant(), c.CompanyName.ToUpperInvariant()) }) .Where(m => m.Score >= FuzzyMatchThreshold) .OrderByDescending(m => m.Score) @@ -962,20 +1408,29 @@ public sealed class CompanyVerifierService : ICompanyVerifierService var itemTitleLower = item.Title.ToLowerInvariant(); var itemCoreWords = ExtractCoreIdentifiers(item.Title); - // Validate that ALL core identifiers appear in the match - // "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match + // Validate that ALL core identifiers from the ORIGINAL company name appear in the match + // "Northwick Industries Limited" must have BOTH "NORTHWICK" and "INDUSTRIES" in the match + // This prevents partial search queries (e.g., "Northwick") from bypassing validation var hasAllOriginalCores = coreWords.Count == 0 || coreWords.All(w => itemTitle.Contains(w)); - var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w)); - if (!hasAllOriginalCores && !hasAllQueryCores) return false; + if (!hasAllOriginalCores) return false; - // Additional check: ensure the match doesn't have too many EXTRA core words + // Additional check: ensure the match doesn't have too many EXTRA MEANINGFUL core words // "Families First" should NOT match "Families Against Conformity" because // "Against" and "Conformity" are extra significant words + // BUT: safe expansions like "UK", "LIMITED", "COMPANY", "GROUP" don't count + // So "Boots" -> "BOOTS UK LIMITED" is OK (no meaningful extras) + // But "Boots" -> "BOOTS AND BEARDS" is NOT OK (BEARDS is meaningful extra) if (coreWords.Count > 0 && hasAllOriginalCores) { - var extraWordsInMatch = itemCoreWords.Count(w => !coreWords.Contains(w)); - // If the match has more than 1 extra core word, it's likely a different company - if (extraWordsInMatch > 1 && itemCoreWords.Count > coreWords.Count + 1) + // Count only MEANINGFUL extra words (not safe expansions) + var meaningfulExtras = CountMeaningfulExtraWords(coreWords, itemCoreWords); + var isShortName = coreWords.Count <= 2; + + // For short names: no meaningful extras allowed (prevents "Boots" → "BOOTS AND BEARDS") + // For longer names: allow up to 1 meaningful extra + var maxAllowedExtras = isShortName ? 0 : 1; + + if (meaningfulExtras > maxAllowedExtras) { return false; } @@ -1475,6 +1930,88 @@ public sealed class CompanyVerifierService : ICompanyVerifierService return System.Text.RegularExpressions.Regex.IsMatch(text, pattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase); } + /// + /// Checks if a CV company name matches a well-known brand and returns its info. + /// + private static WellKnownBrand? GetWellKnownBrand(string companyName) + { + if (string.IsNullOrWhiteSpace(companyName)) + return null; + + // Try exact match first + if (WellKnownBrands.TryGetValue(companyName.Trim(), out var brand)) + return brand; + + // Try with common suffixes removed + var normalized = companyName.Trim(); + var suffixes = new[] { " Ltd", " Limited", " PLC", " UK", " Group" }; + foreach (var suffix in suffixes) + { + if (normalized.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)) + { + var withoutSuffix = normalized[..^suffix.Length].Trim(); + if (WellKnownBrands.TryGetValue(withoutSuffix, out brand)) + return brand; + } + } + + return null; + } + + /// + /// Checks if a candidate company name matches a well-known brand's acceptable patterns. + /// + private static bool MatchesWellKnownBrandPatterns(string candidateName, WellKnownBrand brand) + { + var upper = candidateName.ToUpperInvariant(); + return brand.AcceptablePatterns.Any(p => upper.Contains(p.ToUpperInvariant())); + } + + /// + /// Counts "meaningful" extra words in a candidate that aren't in the original. + /// Excludes safe expansion words (UK, Limited, Company, Group, etc.) + /// + private static int CountMeaningfulExtraWords(List originalCoreWords, List candidateCoreWords) + { + var extraWords = candidateCoreWords + .Where(w => !originalCoreWords.Any(o => o.Equals(w, StringComparison.OrdinalIgnoreCase))) + .Where(w => !SafeExpansionWords.Contains(w)) + .ToList(); + + return extraWords.Count; + } + + /// + /// Determines if the extra words in a candidate are "safe" expansions that don't change company identity. + /// E.g., "Boots" -> "BOOTS UK LIMITED" has only safe expansions (UK, LIMITED) + /// E.g., "Boots" -> "BOOTS AND BEARDS" has unsafe expansion (BEARDS) + /// + private static bool HasOnlySafeExpansions(string originalName, string candidateName, ILogger? logger = null) + { + var originalCores = ExtractCoreIdentifiers(originalName); + var candidateCores = ExtractCoreIdentifiers(candidateName); + + // Find words in candidate that aren't in original + var extraWords = candidateCores + .Where(w => !originalCores.Any(o => o.Equals(w, StringComparison.OrdinalIgnoreCase))) + .ToList(); + + // Check if all extra words are safe expansions + var unsafeWords = extraWords + .Where(w => !SafeExpansionWords.Contains(w)) + .ToList(); + + if (unsafeWords.Count > 0 && logger != null) + { + logger.LogDebug("Candidate '{Candidate}' has unsafe extra words: [{Unsafe}] (safe extras: [{Safe}])", + candidateName, + string.Join(", ", unsafeWords), + string.Join(", ", extraWords.Where(w => SafeExpansionWords.Contains(w)))); + } + + return unsafeWords.Count == 0; + } + // Expanded skip words list for core identifier extraction // These words are too common to be meaningful differentiators between companies private static readonly HashSet SkipWords = new(StringComparer.OrdinalIgnoreCase) @@ -1502,17 +2039,20 @@ public sealed class CompanyVerifierService : ICompanyVerifierService "company", "co", "partners", "partnership", "enterprises", "unlimited", "registered", "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv", - // Business descriptors + // Business descriptors - only truly generic ones that don't identify the business type + // Note: Removed words that can be meaningful business type identifiers: + // - "industries", "technology", "solutions", "services", "consulting" - identify business type + // - e.g., "Paramount Consulting" ≠ "Paramount", "Tech Solutions" ≠ "Tech" "group", "holdings", "holding", "parent", "subsidiary", "division", "branch", - "services", "service", "solutions", "solution", "consulting", "consultants", "consultancy", - "management", "systems", "system", "technologies", "technology", "tech", - "industries", "industry", "industrial", "commercial", "trading", "trade", - "business", "businesses", "operations", "operational", "professional", "professionals", - "resources", "resource", "network", "networks", "associates", "associated", + "commercial", "trading", "trade", + "business", "businesses", "operational", + "associated", // Size/Scope descriptors "national", "international", "global", "worldwide", "world", "regional", "local", - "universal", "general", "standard", "premier", "prime", "first", "one", + "universal", "standard", "prime", "first", "one", + // Note: Removed "general" and "premier" as they are meaningful in brand names + // like "Legal & General", "General Electric", "Premier Inn" // Quality/Marketing terms "new", "modern", "advanced", "innovative", "premier", "elite", "premium", diff --git a/tests/RealCV.Tests/Integration/CVBatchTester.cs b/tests/RealCV.Tests/Integration/CVBatchTester.cs index 0f8aafe..f2b1d68 100644 --- a/tests/RealCV.Tests/Integration/CVBatchTester.cs +++ b/tests/RealCV.Tests/Integration/CVBatchTester.cs @@ -53,22 +53,12 @@ public class CVBatchTester options.UseSqlServer(connectionString)); // Companies House - services.Configure(options => - { - options.BaseUrl = configuration["CompaniesHouse:BaseUrl"] ?? "https://api.company-information.service.gov.uk"; - options.ApiKey = configuration["CompaniesHouse:ApiKey"] ?? ""; - }); - + services.Configure(configuration.GetSection("CompaniesHouse")); services.AddHttpClient(); // Anthropic (for AI matching) - services.Configure(options => - { - options.ApiKey = configuration["Anthropic:ApiKey"] ?? ""; - }); - - services.AddHttpClient(); - services.AddScoped(); + services.Configure(configuration.GetSection("Anthropic")); + services.AddScoped(); // Services services.AddScoped(); @@ -142,7 +132,7 @@ public class CVBatchTester var summary = new CVVerificationSummary { FileName = Path.GetFileName(filePath), - CandidateName = parsedCV.PersonalInfo?.FullName ?? "Unknown" + CandidateName = parsedCV.FullName ?? "Unknown" }; // Verify employers diff --git a/tests/RealCV.Tests/Services/CompanyVerifierServiceTests.cs b/tests/RealCV.Tests/Services/CompanyVerifierServiceTests.cs index 09f5486..8946942 100644 --- a/tests/RealCV.Tests/Services/CompanyVerifierServiceTests.cs +++ b/tests/RealCV.Tests/Services/CompanyVerifierServiceTests.cs @@ -76,8 +76,9 @@ public class CompanyVerifierServiceTests : IDisposable _mockAiMatcher.Setup(m => m.FindBestMatchAsync( It.IsAny(), It.IsAny>(), + It.IsAny(), It.IsAny())) - .Returns((string cvCompanyName, List candidates, CancellationToken _) => + .Returns((string cvCompanyName, List candidates, string? industryHint, CancellationToken _) => { // Find exact or close match in candidates var exactMatch = candidates.FirstOrDefault(c => diff --git a/tests/RealCV.Tests/Services/EducationVerifierServiceTests.cs b/tests/RealCV.Tests/Services/EducationVerifierServiceTests.cs index effe0b3..bb94442 100644 --- a/tests/RealCV.Tests/Services/EducationVerifierServiceTests.cs +++ b/tests/RealCV.Tests/Services/EducationVerifierServiceTests.cs @@ -51,7 +51,7 @@ public sealed class EducationVerifierServiceTests var result = _sut.Verify(education); // Assert - result.VerificationNotes.Should().Contain("diploma mill blacklist"); + result.VerificationNotes.Should().Contain("not found in accredited institutions"); } #endregion diff --git a/tools/CVBatchTester/Program.cs b/tools/CVBatchTester/Program.cs index 1c2f6b3..dc6c4fa 100644 --- a/tools/CVBatchTester/Program.cs +++ b/tools/CVBatchTester/Program.cs @@ -1,4 +1,5 @@ using System.Text.Json; +using System.Text.Json.Serialization; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Configuration; using Microsoft.Extensions.DependencyInjection; @@ -12,23 +13,86 @@ using RealCV.Infrastructure.Services; namespace CVBatchTester; +// DTOs for test JSON format (snake_case with nested personal object) +record TestCVData +{ + public string? CvId { get; init; } + public string? Category { get; init; } + public List? ExpectedFlags { get; init; } + public TestPersonalData? Personal { get; init; } + public string? Profile { get; init; } + public List? Employment { get; init; } + public List? Education { get; init; } + public List? Skills { get; init; } +} + +record TestPersonalData +{ + public string? Name { get; init; } + public string? Email { get; init; } + public string? Phone { get; init; } + public string? Address { get; init; } + public string? LinkedIn { get; init; } +} + +record TestEmploymentEntry +{ + public string? Company { get; init; } + public string? JobTitle { get; init; } + public string? StartDate { get; init; } + public string? EndDate { get; init; } + public string? Location { get; init; } + public string? Description { get; init; } + public List? Achievements { get; init; } +} + +record TestEducationEntry +{ + public string? Institution { get; init; } + public string? Qualification { get; init; } + public string? Subject { get; init; } + public string? Classification { get; init; } + public string? StartDate { get; init; } + public string? EndDate { get; init; } +} + class Program { + private static StreamWriter? _logWriter; + + private static readonly JsonSerializerOptions JsonOptions = new() + { + PropertyNameCaseInsensitive = true, + PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower, + Converters = { new JsonStringEnumConverter() } + }; + static async Task Main(string[] args) { var folderPath = args.FirstOrDefault() ?? AskForFolder(); if (string.IsNullOrEmpty(folderPath) || !Directory.Exists(folderPath)) { - Console.WriteLine($"Error: Folder not found: {folderPath}"); - Console.WriteLine("Usage: CVBatchTester "); - Console.WriteLine(" e.g. CVBatchTester /home/user/cvs"); + Log($"Error: Folder not found: {folderPath}"); + Log("Usage: CVBatchTester [--output ]"); + Log(" e.g. CVBatchTester /home/user/cvs"); + Log(" e.g. CVBatchTester /home/user/cvs --output /tmp/results.log"); return 1; } - Console.WriteLine($"CV Batch Verification Tester"); - Console.WriteLine($"Processing CVs from: {folderPath}"); - Console.WriteLine(new string('=', 80)); + // Check for --output flag + var outputIndex = Array.IndexOf(args, "--output"); + var logPath = outputIndex >= 0 && outputIndex < args.Length - 1 + ? args[outputIndex + 1] + : Path.Combine(folderPath, $"batch-results-{DateTime.Now:yyyyMMdd-HHmmss}.log"); + + _logWriter = new StreamWriter(logPath, false) { AutoFlush = true }; + + Log($"CV Batch Verification Tester"); + Log($"Processing CVs from: {folderPath}"); + Log($"Output log: {logPath}"); + Log($"Started: {DateTime.Now:yyyy-MM-dd HH:mm:ss}"); + Log(new string('=', 80)); // Setup DI var services = new ServiceCollection(); @@ -39,15 +103,16 @@ class Program var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly) .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) || f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase) || - f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase)) + f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase) || + f.EndsWith(".json", StringComparison.OrdinalIgnoreCase)) .OrderBy(f => f) .ToList(); - Console.WriteLine($"Found {cvFiles.Count} CV files\n"); + Log($"Found {cvFiles.Count} CV files\n"); if (cvFiles.Count == 0) { - Console.WriteLine("No CV files found (.pdf, .docx, .doc)"); + Log("No CV files found (.pdf, .docx, .doc, .json)"); return 1; } @@ -63,9 +128,9 @@ class Program foreach (var cvFile in cvFiles) { - Console.WriteLine($"\n{new string('=', 80)}"); - Console.WriteLine($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}"); - Console.WriteLine(new string('=', 80)); + Log($"\n{new string('=', 80)}"); + Log($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}"); + Log(new string('=', 80)); try { @@ -74,17 +139,30 @@ class Program var companyVerifier = scope.ServiceProvider.GetRequiredService(); var eduVerifier = scope.ServiceProvider.GetRequiredService(); - // Parse CV - await using var stream = File.OpenRead(cvFile); - var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile)); + // Parse CV - handle JSON files differently + CVData cv; + if (cvFile.EndsWith(".json", StringComparison.OrdinalIgnoreCase)) + { + var jsonContent = await File.ReadAllTextAsync(cvFile); + var testCv = JsonSerializer.Deserialize(jsonContent, JsonOptions) + ?? throw new InvalidOperationException("Failed to deserialize JSON CV"); - Console.WriteLine($"Candidate: {cv.FullName}"); + // Convert TestCVData to CVData + cv = ConvertTestCVData(testCv); + Log($"Loaded JSON CV: {cv.FullName}"); + } + else + { + await using var stream = File.OpenRead(cvFile); + cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile)); + Log($"Parsed CV: {cv.FullName}"); + } // Verify Employers if (cv.Employment?.Count > 0) { - Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):"); - Console.WriteLine(new string('-', 60)); + Log($"\nEMPLOYERS ({cv.Employment.Count}):"); + Log(new string('-', 60)); foreach (var emp in cv.Employment) { @@ -100,18 +178,18 @@ class Program var icon = result.IsVerified ? "✓" : "✗"; var period = FormatPeriod(emp.StartDate, emp.EndDate); - Console.WriteLine($"\n {icon} {emp.CompanyName}"); - Console.WriteLine($" Period: {period}"); - Console.WriteLine($" Role: {emp.JobTitle}"); + Log($"\n {icon} {emp.CompanyName}"); + Log($" Period: {period}"); + Log($" Role: {emp.JobTitle}"); if (result.IsVerified) { verifiedEmployers++; - Console.WriteLine($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)"); + Log($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)"); if (!string.IsNullOrEmpty(result.MatchedCompanyNumber)) - Console.WriteLine($" Company #: {result.MatchedCompanyNumber}"); + Log($" Company #: {result.MatchedCompanyNumber}"); if (!string.IsNullOrEmpty(result.CompanyStatus)) - Console.WriteLine($" Status: {result.CompanyStatus}"); + Log($" Status: {result.CompanyStatus}"); } else { @@ -119,12 +197,12 @@ class Program } if (!string.IsNullOrEmpty(result.VerificationNotes)) - Console.WriteLine($" Note: {result.VerificationNotes}"); + Log($" Note: {result.VerificationNotes}"); } catch (Exception ex) { - Console.WriteLine($"\n ✗ {emp.CompanyName}"); - Console.WriteLine($" ERROR: {ex.Message}"); + Log($"\n ✗ {emp.CompanyName}"); + Log($" ERROR: {ex.Message}"); allUnverifiedEmployers.Add(emp.CompanyName); } } @@ -133,8 +211,8 @@ class Program // Verify Education if (cv.Education?.Count > 0) { - Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):"); - Console.WriteLine(new string('-', 60)); + Log($"\nEDUCATION ({cv.Education.Count}):"); + Log(new string('-', 60)); var eduEntries = cv.Education.Select(e => new EducationEntry { @@ -152,10 +230,10 @@ class Program totalEducation++; var icon = result.IsVerified ? "✓" : "✗"; - Console.WriteLine($"\n {icon} {result.ClaimedInstitution}"); - Console.WriteLine($" Qualification: {result.ClaimedQualification}"); + Log($"\n {icon} {result.ClaimedInstitution}"); + Log($" Qualification: {result.ClaimedQualification}"); if (!string.IsNullOrEmpty(result.ClaimedSubject)) - Console.WriteLine($" Subject: {result.ClaimedSubject}"); + Log($" Subject: {result.ClaimedSubject}"); if (result.IsVerified) { @@ -163,41 +241,41 @@ class Program if (result.MatchedInstitution != null && !result.MatchedInstitution.Equals(result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase)) { - Console.WriteLine($" Match: {result.MatchedInstitution}"); + Log($" Match: {result.MatchedInstitution}"); } } else { allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown"); - Console.WriteLine($" Status: {result.Status}"); + Log($" Status: {result.Status}"); } if (!string.IsNullOrEmpty(result.VerificationNotes)) - Console.WriteLine($" Note: {result.VerificationNotes}"); + Log($" Note: {result.VerificationNotes}"); } } } catch (Exception ex) { errorCount++; - Console.WriteLine($"ERROR processing file: {ex.Message}"); + Log($"ERROR processing file: {ex.Message}"); } } // Print Summary - Console.WriteLine($"\n\n{new string('=', 80)}"); - Console.WriteLine("VERIFICATION SUMMARY"); - Console.WriteLine(new string('=', 80)); + Log($"\n\n{new string('=', 80)}"); + Log("VERIFICATION SUMMARY"); + Log(new string('=', 80)); - Console.WriteLine($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}"); + Log($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}"); if (errorCount > 0) - Console.WriteLine($"Errors: {errorCount}"); + Log($"Errors: {errorCount}"); var empRate = totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0; var eduRate = totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0; - Console.WriteLine($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)"); - Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)"); + Log($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)"); + Log($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)"); // List unverified employers var uniqueUnverifiedEmployers = allUnverifiedEmployers @@ -208,12 +286,12 @@ class Program if (uniqueUnverifiedEmployers.Count > 0) { - Console.WriteLine($"\n{new string('-', 60)}"); - Console.WriteLine($"UNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count} unique):"); + Log($"\n{new string('-', 60)}"); + Log($"UNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count} unique):"); foreach (var group in uniqueUnverifiedEmployers) { var count = group.Count() > 1 ? $" (x{group.Count()})" : ""; - Console.WriteLine($" - {group.Key}{count}"); + Log($" - {group.Key}{count}"); } } @@ -226,19 +304,30 @@ class Program if (uniqueUnverifiedInstitutions.Count > 0) { - Console.WriteLine($"\n{new string('-', 60)}"); - Console.WriteLine($"UNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count} unique):"); + Log($"\n{new string('-', 60)}"); + Log($"UNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count} unique):"); foreach (var group in uniqueUnverifiedInstitutions) { var count = group.Count() > 1 ? $" (x{group.Count()})" : ""; - Console.WriteLine($" - {group.Key}{count}"); + Log($" - {group.Key}{count}"); } } - Console.WriteLine($"\n{new string('=', 80)}"); + Log($"\nCompleted: {DateTime.Now:yyyy-MM-dd HH:mm:ss}"); + Log($"\n{new string('=', 80)}"); + + _logWriter?.Close(); + Console.WriteLine($"\nResults written to: {logPath}"); + return 0; } + static void Log(string message) + { + Console.WriteLine(message); + _logWriter?.WriteLine(message); + } + static string AskForFolder() { Console.Write("Enter CV folder path: "); @@ -252,6 +341,57 @@ class Program return $"{startStr} - {endStr}"; } + static CVData ConvertTestCVData(TestCVData testCv) + { + return new CVData + { + FullName = testCv.Personal?.Name ?? "Unknown", + Email = testCv.Personal?.Email, + Phone = testCv.Personal?.Phone, + Employment = testCv.Employment?.Select(e => new EmploymentEntry + { + CompanyName = e.Company ?? "Unknown", + JobTitle = e.JobTitle ?? "Unknown", + Location = e.Location, + StartDate = ParseDate(e.StartDate), + EndDate = ParseDate(e.EndDate), + IsCurrent = e.EndDate == null, + Description = e.Description + }).ToList() ?? [], + Education = testCv.Education?.Select(e => new EducationEntry + { + Institution = e.Institution ?? "Unknown", + Qualification = e.Qualification, + Subject = e.Subject, + StartDate = ParseDate(e.StartDate), + EndDate = ParseDate(e.EndDate) + }).ToList() ?? [], + Skills = testCv.Skills ?? [] + }; + } + + static DateOnly? ParseDate(string? dateStr) + { + if (string.IsNullOrEmpty(dateStr)) return null; + + // Try parsing YYYY-MM format + if (dateStr.Length == 7 && dateStr[4] == '-') + { + if (int.TryParse(dateStr[..4], out var year) && int.TryParse(dateStr[5..], out var month)) + { + return new DateOnly(year, month, 1); + } + } + + // Try standard parsing + if (DateOnly.TryParse(dateStr, out var date)) + { + return date; + } + + return null; + } + static void ConfigureServices(IServiceCollection services) { // Load configuration - try multiple locations @@ -263,7 +403,7 @@ class Program }; var webProjectPath = configPaths.FirstOrDefault(Directory.Exists) ?? "/git/RealCV/src/RealCV.Web"; - Console.WriteLine($"Loading config from: {webProjectPath}"); + Log($"Loading config from: {webProjectPath}"); var configuration = new ConfigurationBuilder() .SetBasePath(webProjectPath) @@ -272,11 +412,14 @@ class Program .AddJsonFile("appsettings.Production.json", optional: true) .Build(); - // Logging - minimal output + // Logging - show info level for verification details services.AddLogging(builder => { builder.AddConsole(); - builder.SetMinimumLevel(LogLevel.Warning); + builder.SetMinimumLevel(LogLevel.Information); + // Filter out noisy libraries + builder.AddFilter("Microsoft", LogLevel.Warning); + builder.AddFilter("System", LogLevel.Warning); }); // Database