Add AI-powered company name matching using Claude API

Replace fuzzy string matching with semantic AI matching to fix false
positives where similar-sounding but different companies were matched
(e.g., "Families First CiC" incorrectly matching "FAMILIES AGAINST
CONFORMITY LTD").

Changes:
- Add ICompanyNameMatcherService interface and AICompanyNameMatcherService
  implementation using Claude Sonnet 4 for semantic company name comparison
- Add SemanticMatchResult and related models for AI match results
- Update CompanyVerifierService to use AI matching; when the AI yields no
  match, fall back to fuzzy matching unless it explicitly rejected all
  candidates ("NoMatch")
- Add detection for public sector employers, charities, and self-employed
  entries that cannot be verified via Companies House
- Update tests to work with new AI matcher integration

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-21 00:51:24 +01:00
parent 030ede9e77
commit d047de1c84
7 changed files with 586 additions and 28 deletions

View File

@@ -16,6 +16,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
{
private readonly CompaniesHouseClient _companiesHouseClient;
private readonly IDbContextFactory<ApplicationDbContext> _dbContextFactory;
private readonly ICompanyNameMatcherService _aiMatcher;
private readonly ILogger<CompanyVerifierService> _logger;
private const int FuzzyMatchThreshold = 85;
@@ -75,10 +76,12 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
/// <summary>
/// Initializes the verifier by storing each injected dependency in its private readonly field.
/// No null checks are performed on the arguments here.
/// </summary>
/// <param name="companiesHouseClient">Client for Companies House lookups (this class calls <c>GetCompanyAsync</c> on it).</param>
/// <param name="dbContextFactory">Factory for <see cref="ApplicationDbContext"/> instances.</param>
/// <param name="aiMatcher">Company-name matcher; this class calls <c>FindBestMatchAsync</c> on it to choose among candidates.</param>
/// <param name="logger">Logger used by this service.</param>
public CompanyVerifierService(
CompaniesHouseClient companiesHouseClient,
IDbContextFactory<ApplicationDbContext> dbContextFactory,
ICompanyNameMatcherService aiMatcher,
ILogger<CompanyVerifierService> logger)
{
_companiesHouseClient = companiesHouseClient;
_dbContextFactory = dbContextFactory;
_aiMatcher = aiMatcher;
_logger = logger;
}
@@ -119,7 +122,10 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var searchQueries = GenerateSearchQueries(companyName);
_logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));
(CompaniesHouseSearchItem Item, int Score)? bestMatch = null;
// Collect all candidates from all search queries for AI matching
var allCandidates = new Dictionary<string, CompaniesHouseSearchItem>();
var fuzzyMatches = new List<(CompaniesHouseSearchItem Item, int Score)>();
foreach (var query in searchQueries)
{
@@ -131,25 +137,91 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
continue;
}
// Find best fuzzy match, preferring companies that existed at claimed start date
// Pass both original name and search query for matching flexibility
bestMatch = FindBestMatch(companyName, query, searchResponse.Items, startDate);
if (bestMatch is not null)
// Collect unique candidates
foreach (var item in searchResponse.Items)
{
_logger.LogDebug("Found match with query '{Query}': {Company}", query, bestMatch.Value.Item.Title);
break;
if (!string.IsNullOrWhiteSpace(item.CompanyNumber) &&
!allCandidates.ContainsKey(item.CompanyNumber))
{
allCandidates[item.CompanyNumber] = item;
}
}
// Find fuzzy matches (as before) for fallback
var fuzzyMatch = FindBestMatch(companyName, query, searchResponse.Items, startDate);
if (fuzzyMatch is not null)
{
fuzzyMatches.Add(fuzzyMatch.Value);
}
}
if (bestMatch is null)
if (allCandidates.Count == 0)
{
_logger.LogDebug("No valid match found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count);
_logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records");
}
var match = bestMatch.Value;
// Use AI to find the best semantic match from all candidates
_logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", companyName, allCandidates.Count);
var candidatesForAI = allCandidates.Values
.Take(10) // Limit to the first 10 collected candidates (enumeration order, not ranked by relevance) to reduce AI cost
.Select(c => new CompanyCandidate
{
CompanyName = c.Title,
CompanyNumber = c.CompanyNumber,
CompanyStatus = c.CompanyStatus,
DateOfCreation = c.DateOfCreation
})
.ToList();
var aiResult = await _aiMatcher.FindBestMatchAsync(companyName, candidatesForAI);
CompaniesHouseSearchItem? matchedItem = null;
int matchScore;
if (aiResult is not null && aiResult.IsMatch)
{
// AI found a valid match
matchedItem = allCandidates.GetValueOrDefault(aiResult.CandidateCompanyNumber);
matchScore = aiResult.ConfidenceScore;
_logger.LogInformation(
"AI matched '{ClaimedName}' to '{MatchedName}' with {Score}% confidence. Reasoning: {Reasoning}",
companyName, aiResult.CandidateCompanyName, aiResult.ConfidenceScore, aiResult.Reasoning);
}
else if (fuzzyMatches.Count > 0)
{
// AI didn't find a match - check if it explicitly rejected or just failed
if (aiResult?.MatchType == "NoMatch")
{
_logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}",
companyName, aiResult?.Reasoning ?? "No match found");
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified - no matching company found in official records");
}
// AI failed (API error, etc.) - fall back to fuzzy matching
_logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", companyName);
var bestFuzzy = fuzzyMatches.OrderByDescending(m => m.Score).First();
matchedItem = bestFuzzy.Item;
matchScore = bestFuzzy.Score;
}
else
{
_logger.LogDebug("No valid match found for: {CompanyName}", companyName);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records");
}
if (matchedItem is null)
{
_logger.LogDebug("No valid match found for: {CompanyName}", companyName);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records");
}
var match = (Item: matchedItem, Score: matchScore);
// Fetch full company details for additional data
var companyDetails = await _companiesHouseClient.GetCompanyAsync(match.Item.CompanyNumber);
@@ -607,6 +679,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
{
var itemTitle = item.Title.ToUpperInvariant();
var itemTitleLower = item.Title.ToLowerInvariant();
var itemCoreWords = ExtractCoreIdentifiers(item.Title);
// Validate that ALL core identifiers appear in the match
// "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match
@@ -614,6 +687,19 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w));
if (!hasAllOriginalCores && !hasAllQueryCores) return false;
// Additional check: ensure the match doesn't have too many EXTRA core words
// "Families First" should NOT match "Families Against Conformity" because
// "Against" and "Conformity" are extra significant words
if (coreWords.Count > 0 && hasAllOriginalCores)
{
var extraWordsInMatch = itemCoreWords.Count(w => !coreWords.Contains(w));
// If the match has more than 1 extra core word, it's likely a different company
if (extraWordsInMatch > 1 && itemCoreWords.Count > coreWords.Count + 1)
{
return false;
}
}
// Filter out non-employment entities unless explicitly searching for that type
if (!IsValidEmploymentEntity(itemTitleLower, searchEntityTypes))
{