Add alternative search queries for company name variations

When searching Companies House, the verifier now generates multiple query
variations to find companies registered under different naming conventions
(e.g., "U.K." vs "UK", "Limited" vs "Ltd"). This helps match older companies
such as "MATTEL U.K. LIMITED" when CVs list "Mattel UK Ltd".

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-20 21:32:02 +01:00
parent 12da40496e
commit 58c0e79a85

View File

@@ -71,23 +71,37 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
}
}
// Search Companies House
// Search Companies House with fallback queries
try
{
var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(companyName);
var searchQueries = GenerateSearchQueries(companyName);
_logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));
(CompaniesHouseSearchItem Item, int Score)? bestMatch = null;
if (searchResponse?.Items is null || searchResponse.Items.Count == 0)
foreach (var query in searchQueries)
{
_logger.LogDebug("No companies found for: {CompanyName}", companyName);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, "No matching company found in Companies House");
}
_logger.LogDebug("Searching Companies House with query: {Query}", query);
var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(query);
// Find best fuzzy match, preferring companies that existed at claimed start date
var bestMatch = FindBestMatch(companyName, searchResponse.Items, startDate);
if (searchResponse?.Items is null || searchResponse.Items.Count == 0)
{
continue;
}
// Find best fuzzy match, preferring companies that existed at claimed start date
bestMatch = FindBestMatch(companyName, searchResponse.Items, startDate);
if (bestMatch is not null)
{
_logger.LogDebug("Found match with query '{Query}': {Company}", query, bestMatch.Value.Item.Title);
break;
}
}
if (bestMatch is null)
{
_logger.LogDebug("No fuzzy match above threshold for: {CompanyName}", companyName);
_logger.LogDebug("No valid match found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records");
}
@@ -768,5 +782,83 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
};
}
/// <summary>
/// Generates alternative search queries to find companies that may be registered
/// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd").
/// </summary>
/// <param name="companyName">The company name to build search queries for.</param>
/// <returns>
/// The distinct queries (compared case-insensitively) in priority order: the trimmed
/// input name first, followed by suffix variants, dotted/undotted abbreviation
/// variants, and finally the name with its legal suffix stripped.
/// </returns>
private static List<string> GenerateSearchQueries(string companyName)
{
    // A list keeps the insertion order explicit (HashSet<T> enumeration order is not
    // guaranteed); the set is only used to drop case-insensitive duplicates.
    var queries = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    void AddQuery(string candidate)
    {
        if (seen.Add(candidate))
        {
            queries.Add(candidate);
        }
    }

    // Seed with the trimmed name so whitespace-padded input does not produce a
    // second, effectively identical query.
    var normalized = companyName.Trim();
    AddQuery(normalized);

    // Common suffixes to try variations of
    var suffixPatterns = new[]
    {
        (" Ltd", " Limited"),
        (" Limited", " Ltd"),
        (" PLC", " Public Limited Company"),
        (" Public Limited Company", " PLC"),
        (" LLP", " Limited Liability Partnership"),
        (" Limited Liability Partnership", " LLP"),
    };

    // Try suffix variations
    foreach (var (from, to) in suffixPatterns)
    {
        if (normalized.EndsWith(from, StringComparison.OrdinalIgnoreCase))
        {
            AddQuery(normalized[..^from.Length] + to);
        }
    }

    // Add periods to standalone two-letter upper-case words (UK -> U.K., US -> U.S.)
    AddQuery(System.Text.RegularExpressions.Regex.Replace(
        normalized,
        @"\b([A-Z])([A-Z])\b",
        "$1.$2."));

    // Remove periods from standalone two-letter dotted abbreviations (U.K. -> UK).
    // A trailing \b cannot be used here: there is no word boundary between '.' and a
    // following space or end of string, so the abbreviation would never match.
    AddQuery(System.Text.RegularExpressions.Regex.Replace(
        normalized,
        @"(?<![\w.])([A-Z])\.([A-Z])\.(?!\w)",
        "$1$2"));

    // Also swap "UK" <-> "U.K." regardless of casing. Only whole whitespace-delimited
    // tokens are replaced so that words merely starting with "Uk" are left intact.
    AddQuery(System.Text.RegularExpressions.Regex.Replace(
        normalized,
        @"(?<!\S)UK(?!\S)",
        "U.K.",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase));
    AddQuery(System.Text.RegularExpressions.Regex.Replace(
        normalized,
        @"(?<!\S)U\.K\.(?!\S)",
        "UK",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase));

    // Remove common suffixes to get core name (only the first matching suffix is stripped)
    var suffixesToRemove = new[] { " Ltd", " Limited", " PLC", " LLP", " Inc", " Corporation", " Corp" };
    var coreName = normalized;
    foreach (var suffix in suffixesToRemove)
    {
        if (coreName.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
        {
            coreName = coreName[..^suffix.Length].Trim();
            break;
        }
    }

    // Skip very short core names: they are too generic to be a useful query.
    if (coreName != normalized && coreName.Length >= 3)
    {
        AddQuery(coreName);
        // Also try core name with "Limited" appended
        AddQuery(coreName + " Limited");
    }

    return queries;
}
#endregion
}