From 58c0e79a85692dba9c813c7d058a3237eaa884e0 Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 20 Jan 2026 21:32:02 +0100 Subject: [PATCH] Add alternative search queries for company name variations When searching Companies House, now generates multiple query variations to find companies registered with different naming conventions (e.g., "U.K." vs "UK", "Limited" vs "Ltd"). This helps match older companies like "MATTEL U.K. LIMITED" when CVs list "Mattel UK Ltd". Co-Authored-By: Claude Opus 4.5 --- .../Services/CompanyVerifierService.cs | 110 ++++++++++++++++-- 1 file changed, 101 insertions(+), 9 deletions(-) diff --git a/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs b/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs index 766b7b5..cdc856a 100644 --- a/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs +++ b/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs @@ -71,23 +71,37 @@ public sealed class CompanyVerifierService : ICompanyVerifierService } } - // Search Companies House + // Search Companies House with fallback queries try { - var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(companyName); + var searchQueries = GenerateSearchQueries(companyName); + _logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}", + searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'"))); + (CompaniesHouseSearchItem Item, int Score)? 
bestMatch = null; - if (searchResponse?.Items is null || searchResponse.Items.Count == 0) + foreach (var query in searchQueries) { - _logger.LogDebug("No companies found for: {CompanyName}", companyName); - return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, "No matching company found in Companies House"); - } + _logger.LogDebug("Searching Companies House with query: {Query}", query); + var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(query); - // Find best fuzzy match, preferring companies that existed at claimed start date - var bestMatch = FindBestMatch(companyName, searchResponse.Items, startDate); + if (searchResponse?.Items is null || searchResponse.Items.Count == 0) + { + continue; + } + + // Find best fuzzy match, preferring companies that existed at claimed start date + bestMatch = FindBestMatch(companyName, searchResponse.Items, startDate); + + if (bestMatch is not null) + { + _logger.LogDebug("Found match with query '{Query}': {Company}", query, bestMatch.Value.Item.Title); + break; + } + } if (bestMatch is null) { - _logger.LogDebug("No fuzzy match above threshold for: {CompanyName}", companyName); + _logger.LogDebug("No valid match found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count); return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, "Company name could not be verified against official records"); } @@ -768,5 +782,83 @@ public sealed class CompanyVerifierService : ICompanyVerifierService }; } + /// + /// Generates alternative search queries to find companies that may be registered + /// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd"). 
+    ///
+    /// <returns>Distinct queries (compared case-insensitively); the first is always <paramref name="companyName"/> as given.</returns>
+    private static List<string> GenerateSearchQueries(string companyName)
+    {
+        // Ordered list plus a case-insensitive seen-set: HashSet<T> does not guarantee
+        // enumeration order, and the caller should try the name as given first.
+        var queries = new List<string>();
+        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
+        void AddQuery(string query)
+        {
+            if (seen.Add(query))
+            {
+                queries.Add(query);
+            }
+        }
+
+        AddQuery(companyName);
+        var normalized = companyName.Trim();
+
+        // Legal-form suffixes, swapped in both directions (short form <-> long form)
+        var suffixPatterns = new[]
+        {
+            (" Ltd", " Limited"),
+            (" Limited", " Ltd"),
+            (" PLC", " Public Limited Company"),
+            (" Public Limited Company", " PLC"),
+            (" LLP", " Limited Liability Partnership"),
+            (" Limited Liability Partnership", " LLP"),
+        };
+
+        foreach (var (from, to) in suffixPatterns)
+        {
+            if (normalized.EndsWith(from, StringComparison.OrdinalIgnoreCase))
+            {
+                AddQuery(normalized[..^from.Length] + to);
+            }
+        }
+
+        // Two-letter upper-case codes gain periods (UK -> U.K., US -> U.S.)
+        AddQuery(System.Text.RegularExpressions.Regex.Replace(
+            normalized,
+            @"\b([A-Z])([A-Z])\b",
+            "$1.$2."));
+
+        // ...and dotted codes lose them (U.K. -> UK). The pattern ends in (?!\w) rather
+        // than \b: \b cannot match between '.' and a space or the end of input, so a
+        // trailing \b would never fire on names such as "MATTEL U.K. LIMITED".
+        AddQuery(System.Text.RegularExpressions.Regex.Replace(
+            normalized,
+            @"(?<![\w.])([A-Z])\.([A-Z])\.(?!\w)",
+            "$1$2"));
+
+        // Case-insensitive UK <-> U.K. swap, limited to a whole token after a space;
+        // a plain Replace(" UK", ...) would also rewrite words such as " Ukraine".
+        var ignoreCase = System.Text.RegularExpressions.RegexOptions.IgnoreCase;
+        AddQuery(System.Text.RegularExpressions.Regex.Replace(
+            normalized, @"(?<= )UK(?= |$)", "U.K.", ignoreCase));
+        AddQuery(System.Text.RegularExpressions.Regex.Replace(
+            normalized, @"(?<= )U\.K\.(?= |$)", "UK", ignoreCase));
+
+        // Strip one trailing legal-form suffix to get the core name
+        var suffixesToRemove = new[] { " Ltd", " Limited", " PLC", " LLP", " Inc", " Corporation", " Corp" };
+        var matchedSuffix = suffixesToRemove.FirstOrDefault(
+            s => normalized.EndsWith(s, StringComparison.OrdinalIgnoreCase));
+        var coreName = matchedSuffix is null ? normalized : normalized[..^matchedSuffix.Length].Trim();
+        if (coreName != normalized && coreName.Length >= 3)
+        {
+            AddQuery(coreName);
+            // Also try core name with "Limited" appended
+            AddQuery(coreName + " Limited");
+        }
+
+        return queries;
+    }
+
     #endregion
 }