using System.Text.Json; using FuzzySharp; using Microsoft.EntityFrameworkCore; using Microsoft.Extensions.Logging; using TrueCV.Application.DTOs; using TrueCV.Application.Helpers; using TrueCV.Application.Interfaces; using TrueCV.Application.Models; using TrueCV.Domain.Entities; using TrueCV.Infrastructure.Data; using TrueCV.Infrastructure.ExternalApis; namespace TrueCV.Infrastructure.Services; public sealed class CompanyVerifierService : ICompanyVerifierService { private readonly CompaniesHouseClient _companiesHouseClient; private readonly IDbContextFactory _dbContextFactory; private readonly ILogger _logger; private const int FuzzyMatchThreshold = 85; private const int CacheExpirationDays = 30; // Non-employment entity patterns organized by category // These are entities that exist in Companies House but are not typical employers private static readonly Dictionary NonEmploymentEntityPatterns = new() { ["Clubs"] = new[] { "club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados" }, ["Associations"] = new[] { "association", "society", "federation", "institute", "institution", "guild", "chamber of commerce" }, ["Trusts"] = new[] { "benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit", "superannuation", "provident" }, ["Charities"] = new[] { "charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic" }, ["Investment"] = new[] { "nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners" }, ["Property"] = new[] { "freehold", "leasehold", "property management", "residents association", "management company rtm", "commonhold" }, ["Religious"] = new[] { "church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry" }, ["Sports"] = new[] { "football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club" }, ["Educational"] = new[] { "old boys", "old girls", "alumni", 
"school association", "pta", "parent teacher" }, ["Professional"] = new[] { "chartered institute", "royal college", "professional body", "trade body", "regulatory body" } }; // SIC codes that indicate non-trading or non-employment entities private static readonly HashSet NonTradingSicCodes = new() { "99999", // Dormant company "64209", // Activities of holding companies (shell companies) "68100", // Buying and selling of own real estate (often shell) }; // SIC codes for tech/software companies private static readonly HashSet TechSicCodes = new() { "62011", "62012", "62020", "62030", "62090", // Computer programming and consultancy "63110", "63120", // Data processing, hosting "58210", "58290", // Publishing of computer games, other software "61100", "61200", "61300", "61900" // Telecommunications }; public CompanyVerifierService( CompaniesHouseClient companiesHouseClient, IDbContextFactory dbContextFactory, ILogger logger) { _companiesHouseClient = companiesHouseClient; _dbContextFactory = dbContextFactory; _logger = logger; } public async Task VerifyCompanyAsync( string companyName, DateOnly? startDate, DateOnly? endDate, string? 
jobTitle = null) { ArgumentException.ThrowIfNullOrWhiteSpace(companyName); _logger.LogDebug("Verifying company: {CompanyName}", companyName); var flags = new List(); // Try to find a cached match first (but only if it existed at claimed start date) var cachedMatch = await FindCachedMatchAsync(companyName); if (cachedMatch is not null) { // Check if cached company existed at the claimed start date var cacheValid = !startDate.HasValue || cachedMatch.IncorporationDate == null || cachedMatch.IncorporationDate <= startDate.Value; if (cacheValid) { _logger.LogDebug("Found cached company match for: {CompanyName}", companyName); return CreateResultFromCache(cachedMatch, companyName, startDate, endDate, jobTitle, flags); } else { _logger.LogDebug("Cached company {CachedName} was incorporated after claimed start date, searching for alternatives", cachedMatch.CompanyName); } } // Search Companies House with fallback queries try { var searchQueries = GenerateSearchQueries(companyName); _logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}", searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'"))); (CompaniesHouseSearchItem Item, int Score)? 
bestMatch = null; foreach (var query in searchQueries) { _logger.LogDebug("Searching Companies House with query: {Query}", query); var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(query); if (searchResponse?.Items is null || searchResponse.Items.Count == 0) { continue; } // Find best fuzzy match, preferring companies that existed at claimed start date // Pass both original name and search query for matching flexibility bestMatch = FindBestMatch(companyName, query, searchResponse.Items, startDate); if (bestMatch is not null) { _logger.LogDebug("Found match with query '{Query}': {Company}", query, bestMatch.Value.Item.Title); break; } } if (bestMatch is null) { _logger.LogDebug("No valid match found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count); return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, "Company name could not be verified against official records"); } var match = bestMatch.Value; // Fetch full company details for additional data var companyDetails = await _companiesHouseClient.GetCompanyAsync(match.Item.CompanyNumber); // Cache the matched company with full details await CacheCompanyAsync(match.Item, companyDetails); _logger.LogInformation( "Verified company {ClaimedName} matched to {MatchedName} with score {Score}%", companyName, match.Item.Title, match.Score); // Run all verification checks var incorporationDate = DateHelpers.ParseDate(match.Item.DateOfCreation); var dissolutionDate = DateHelpers.ParseDate(match.Item.DateOfCessation); var companyStatus = match.Item.CompanyStatus; var companyType = match.Item.CompanyType; var sicCodes = companyDetails?.SicCodes ?? 
match.Item.SicCodes; var accountsCategory = companyDetails?.Accounts?.LastAccounts?.Type; // Check 1: Employment before company incorporation CheckIncorporationDate(flags, startDate, incorporationDate, match.Item.Title); // Check 2: Employment at dissolved company CheckDissolutionDate(flags, endDate, dissolutionDate, companyStatus, match.Item.Title); // Check 3: Dormant company check CheckDormantCompany(flags, accountsCategory, jobTitle, match.Item.Title); // Check 4: Company size vs job title CheckCompanySizeVsRole(flags, accountsCategory, jobTitle, match.Item.Title); // Check 5: SIC code vs job title mismatch CheckSicCodeMismatch(flags, sicCodes, jobTitle, match.Item.Title); // Check 6: Job title plausibility for PLCs var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, companyType); if (jobPlausible == false) { flags.Add(new CompanyVerificationFlag { Type = "ImplausibleJobTitle", Severity = "Critical", Message = jobNotes ?? "Job title requires verification", ScoreImpact = -15 }); } return new CompanyVerificationResult { ClaimedCompany = companyName, MatchedCompanyName = match.Item.Title, MatchedCompanyNumber = match.Item.CompanyNumber, MatchScore = match.Score, IsVerified = true, VerificationNotes = null, ClaimedStartDate = startDate, ClaimedEndDate = endDate, CompanyType = companyType, CompanyStatus = companyStatus, IncorporationDate = incorporationDate, DissolutionDate = dissolutionDate, AccountsCategory = accountsCategory, SicCodes = sicCodes, ClaimedJobTitle = jobTitle, JobTitlePlausible = jobPlausible, JobTitleNotes = jobNotes, Flags = flags }; } catch (CompaniesHouseRateLimitException ex) { _logger.LogWarning(ex, "Rate limit hit while verifying company: {CompanyName}", companyName); return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle, "Verification temporarily unavailable due to rate limiting"); } } public async Task> SearchCompaniesAsync(string query) { ArgumentException.ThrowIfNullOrWhiteSpace(query); 
_logger.LogDebug("Searching companies for query: {Query}", query); var response = await _companiesHouseClient.SearchCompaniesAsync(query); if (response?.Items is null) { return []; } return response.Items.Select(item => new CompanySearchResult { CompanyNumber = item.CompanyNumber, CompanyName = item.Title, CompanyStatus = item.CompanyStatus ?? "Unknown", IncorporationDate = DateHelpers.ParseDate(item.DateOfCreation), AddressSnippet = item.AddressSnippet }).ToList(); } public async Task VerifyDirectorAsync( string companyNumber, string candidateName, DateOnly? startDate, DateOnly? endDate) { if (string.IsNullOrWhiteSpace(companyNumber) || string.IsNullOrWhiteSpace(candidateName)) { return null; } try { var officers = await _companiesHouseClient.GetOfficersAsync(companyNumber); if (officers?.Items is null || officers.Items.Count == 0) { _logger.LogDebug("No officers found for company {CompanyNumber}", companyNumber); return null; } // Normalize candidate name for comparison var normalizedCandidate = NormalizeName(candidateName); foreach (var officer in officers.Items) { // Check if officer role is director-like var role = officer.OfficerRole?.ToLowerInvariant() ?? 
""; if (!role.Contains("director") && !role.Contains("secretary")) { continue; } // Fuzzy match the name var normalizedOfficer = NormalizeName(officer.Name); var matchScore = Fuzz.Ratio(normalizedCandidate, normalizedOfficer); if (matchScore >= 80) // High threshold for name matching { // Check date overlap var appointedOn = DateHelpers.ParseDate(officer.AppointedOn); var resignedOn = DateHelpers.ParseDate(officer.ResignedOn); // If no claimed dates, just check if names match if (!startDate.HasValue && !endDate.HasValue) { _logger.LogDebug( "Found matching director {OfficerName} for candidate {CandidateName} at company {CompanyNumber}", officer.Name, candidateName, companyNumber); return true; } // Check if employment period overlaps with directorship var datesOverlap = DatesOverlap( startDate, endDate, appointedOn, resignedOn); if (datesOverlap) { _logger.LogDebug( "Verified director {OfficerName} matches candidate {CandidateName} with overlapping dates", officer.Name, candidateName); return true; } } } _logger.LogDebug( "No matching director found for candidate {CandidateName} at company {CompanyNumber}", candidateName, companyNumber); return false; } catch (CompaniesHouseRateLimitException) { _logger.LogWarning("Rate limit hit while verifying director for company {CompanyNumber}", companyNumber); return null; } catch (Exception ex) { _logger.LogError(ex, "Error verifying director for company {CompanyNumber}", companyNumber); return null; } } private static string NormalizeName(string name) { if (string.IsNullOrWhiteSpace(name)) return ""; // Companies House often stores names as "SURNAME, Firstname" // Convert to "Firstname Surname" format for comparison var normalized = name.ToUpperInvariant().Trim(); if (normalized.Contains(',')) { var parts = normalized.Split(',', 2); if (parts.Length == 2) { normalized = $"{parts[1].Trim()} {parts[0].Trim()}"; } } return normalized; } private static bool DatesOverlap(DateOnly? start1, DateOnly? end1, DateOnly? 
start2, DateOnly? end2) { // If no dates, assume overlap if (!start1.HasValue && !end1.HasValue) return true; if (!start2.HasValue && !end2.HasValue) return true; // Use default dates for missing values var s1 = start1 ?? DateOnly.MinValue; var e1 = end1 ?? DateOnly.MaxValue; var s2 = start2 ?? DateOnly.MinValue; var e2 = end2 ?? DateOnly.MaxValue; // Check overlap: periods overlap if one starts before the other ends return s1 <= e2 && s2 <= e1; } #region Verification Checks private static void CheckIncorporationDate( List flags, DateOnly? claimedStartDate, DateOnly? incorporationDate, string companyName) { if (claimedStartDate.HasValue && incorporationDate.HasValue) { if (claimedStartDate.Value < incorporationDate.Value) { flags.Add(new CompanyVerificationFlag { Type = "EmploymentBeforeIncorporation", Severity = "Critical", Message = $"Claimed employment at '{companyName}' starting {claimedStartDate:MMM yyyy} is before company incorporation date {incorporationDate:MMM yyyy}", ScoreImpact = -20 }); } } } private static void CheckDissolutionDate( List flags, DateOnly? claimedEndDate, DateOnly? dissolutionDate, string? 
companyStatus, string companyName) { var isDissolvedStatus = companyStatus?.ToLowerInvariant() is "dissolved" or "liquidation" or "administration"; if (dissolutionDate.HasValue && isDissolvedStatus) { // Allow 3 month buffer for wind-down var bufferDate = dissolutionDate.Value.AddMonths(3); if (claimedEndDate.HasValue && claimedEndDate.Value > bufferDate) { flags.Add(new CompanyVerificationFlag { Type = "EmploymentAtDissolvedCompany", Severity = "Critical", Message = $"Claimed employment at '{companyName}' until {claimedEndDate:MMM yyyy} but company was dissolved on {dissolutionDate:MMM yyyy}", ScoreImpact = -20 }); } else if (!claimedEndDate.HasValue) // Current employment at dissolved company { flags.Add(new CompanyVerificationFlag { Type = "CurrentEmploymentAtDissolvedCompany", Severity = "Critical", Message = $"Claims current employment at '{companyName}' but company was dissolved on {dissolutionDate:MMM yyyy}", ScoreImpact = -25 }); } } } private static void CheckDormantCompany( List flags, string? accountsCategory, string? jobTitle, string companyName) { if (string.IsNullOrWhiteSpace(accountsCategory)) return; var isDormant = accountsCategory.ToLowerInvariant().Contains("dormant"); if (!isDormant) return; // Directors can maintain dormant companies, but other roles are suspicious var title = jobTitle?.ToLowerInvariant() ?? ""; var isDirectorRole = title.Contains("director") || title.Contains("company secretary"); if (!isDirectorRole) { flags.Add(new CompanyVerificationFlag { Type = "EmploymentAtDormantCompany", Severity = "Warning", Message = $"Claimed active employment as '{jobTitle}' at '{companyName}' which files dormant accounts", ScoreImpact = -10 }); } } private static void CheckCompanySizeVsRole( List flags, string? accountsCategory, string? 
jobTitle, string companyName) { if (string.IsNullOrWhiteSpace(accountsCategory) || string.IsNullOrWhiteSpace(jobTitle)) return; var category = accountsCategory.ToLowerInvariant(); var title = jobTitle.ToLowerInvariant(); // Micro-entity: < 10 employees, < £632k turnover var isMicroEntity = category.Contains("micro"); // Check for senior management roles at micro companies var isSeniorRole = title.Contains("vp") || title.Contains("vice president") || title.Contains("head of") || title.Contains("chief") || title.Contains("director of") || title.Contains("senior director"); // At micro companies, having many senior roles is suspicious if (isMicroEntity && isSeniorRole) { flags.Add(new CompanyVerificationFlag { Type = "SeniorRoleAtMicroCompany", Severity = "Warning", Message = $"Claimed senior role '{jobTitle}' at '{companyName}' which files micro-entity accounts (typically <10 employees)", ScoreImpact = -10 }); } } private static void CheckSicCodeMismatch( List flags, List? sicCodes, string? jobTitle, string companyName) { if (sicCodes is null || sicCodes.Count == 0 || string.IsNullOrWhiteSpace(jobTitle)) return; var title = jobTitle.ToLowerInvariant(); // Check if this is a tech role var isTechRole = title.Contains("software") || title.Contains("developer") || title.Contains("engineer") || title.Contains("programmer") || title.Contains("data scientist") || title.Contains("data analyst") || title.Contains("devops") || title.Contains("cloud") || title.Contains("machine learning") || title.Contains("ai ") || title.Contains("frontend") || title.Contains("backend") || title.Contains("full stack") || title.Contains("fullstack"); if (isTechRole) { // Check if company has any tech SIC codes var hasTechSic = sicCodes.Any(s => TechSicCodes.Contains(s)); if (!hasTechSic) { // Get the primary SIC code description (simplified - just show code) var primarySic = sicCodes.FirstOrDefault() ?? 
"Unknown"; flags.Add(new CompanyVerificationFlag { Type = "SicCodeMismatch", Severity = "Info", Message = $"Tech role '{jobTitle}' at '{companyName}' (SIC: {primarySic}) - company is not registered as a technology business", ScoreImpact = -5 }); } } } private static (bool? IsPlausible, string? Notes) CheckJobTitlePlausibility(string? jobTitle, string? companyType) { if (string.IsNullOrWhiteSpace(jobTitle) || string.IsNullOrWhiteSpace(companyType)) { return (null, null); } var title = jobTitle.Trim().ToLowerInvariant(); var type = companyType.Trim().ToLowerInvariant(); // Check if this is a PLC (Public Limited Company) - these are large companies var isPlc = type.Contains("plc") || type.Contains("public limited"); // Check for C-suite / very senior roles var isCsuiteRole = title.Contains("ceo") || title.Contains("chief executive") || title.Contains("cto") || title.Contains("chief technology") || title.Contains("cfo") || title.Contains("chief financial") || title.Contains("coo") || title.Contains("chief operating") || title.Contains("cio") || title.Contains("chief information") || title.Contains("managing director") || title == "md" || title.Contains("chairman") || title.Contains("chairwoman") || title.Contains("chairperson") || title.Contains("president"); // Check for board-level roles var isBoardRole = title.Contains("board member") || title.Contains("non-executive director") || title.Contains("executive director") || (title == "director" && !title.Contains("of")); if (isPlc && (isCsuiteRole || isBoardRole)) { return (false, $"Claimed senior role '{jobTitle}' at a PLC requires verification - C-suite positions at public companies are publicly disclosed"); } // Check for VP/SVP at PLCs (also usually disclosed) var isVpRole = title.Contains("vice president") || title.Contains("vp ") || title.StartsWith("vp") || title.Contains("svp") || title.Contains("senior vice president") || title.Contains("evp") || title.Contains("executive vice president"); if (isPlc && 
isVpRole) { return (false, $"Claimed VP-level role '{jobTitle}' at a PLC - senior positions at public companies should be verifiable"); } return (true, null); } #endregion #region Helper Methods private async Task FindCachedMatchAsync(string companyName) { var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays); await using var dbContext = await _dbContextFactory.CreateDbContextAsync(); var cachedCompanies = await dbContext.CompanyCache .Where(c => c.CachedAt >= cutoffDate) .ToListAsync(); if (cachedCompanies.Count == 0) { return null; } var matches = cachedCompanies .Where(c => !string.IsNullOrWhiteSpace(c.CompanyName)) .Select(c => new { Company = c, Score = Fuzz.TokenSetRatio(companyName.ToUpperInvariant(), c.CompanyName.ToUpperInvariant()) }) .Where(m => m.Score >= FuzzyMatchThreshold) .OrderByDescending(m => m.Score) .FirstOrDefault(); return matches?.Company; } private (CompaniesHouseSearchItem Item, int Score)? FindBestMatch( string companyName, string searchQuery, List items, DateOnly? 
claimedStartDate) { var normalizedOriginal = companyName.ToUpperInvariant(); var normalizedQuery = searchQuery.ToUpperInvariant(); // Extract core identifying words that MUST appear in any valid match // This prevents "BMW Group Canada" matching "CANADA LIFE GROUP" just because of common words // and "Lloyds Bowmaker" matching "LLOYDS ALARMS" (missing "Bowmaker") var coreWords = ExtractCoreIdentifiers(companyName); var queryCoreWords = ExtractCoreIdentifiers(searchQuery); var originalLower = companyName.ToLowerInvariant(); var queryLower = searchQuery.ToLowerInvariant(); // Determine which entity types the search is explicitly looking for var searchEntityTypes = GetSearchEntityTypes(originalLower, queryLower); // Match against both the original company name AND the search query used // This handles cases like "Matthew Walker (Northern Foods Plc)" where we // search for "Northern Foods Plc" but need to match against it, not the full name var matches = items .Where(item => !string.IsNullOrWhiteSpace(item.Title)) .Where(item => { var itemTitle = item.Title.ToUpperInvariant(); var itemTitleLower = item.Title.ToLowerInvariant(); // Validate that ALL core identifiers appear in the match // "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match var hasAllOriginalCores = coreWords.Count == 0 || coreWords.All(w => itemTitle.Contains(w)); var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w)); if (!hasAllOriginalCores && !hasAllQueryCores) return false; // Filter out non-employment entities unless explicitly searching for that type if (!IsValidEmploymentEntity(itemTitleLower, searchEntityTypes)) { return false; } return true; }) .Select(item => { var itemTitle = item.Title.ToUpperInvariant(); var scoreVsOriginal = Fuzz.TokenSetRatio(normalizedOriginal, itemTitle); var scoreVsQuery = Fuzz.TokenSetRatio(normalizedQuery, itemTitle); return (Item: item, Score: Math.Max(scoreVsOriginal, scoreVsQuery)); }) .Where(m => 
m.Score >= FuzzyMatchThreshold) .ToList(); _logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery); foreach (var m in matches.Take(5)) { _logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, DateOfCreation: {Date}", m.Item.Title, m.Item.CompanyNumber, m.Score, m.Item.DateOfCreation ?? "null"); } if (matches.Count == 0) return null; // If we have a claimed start date, prefer companies that existed at that time if (claimedStartDate.HasValue) { _logger.LogDebug("Filtering for companies that existed at claimed start date: {StartDate}", claimedStartDate.Value); var existedAtStartDate = matches .Where(m => { var incDate = DateHelpers.ParseDate(m.Item.DateOfCreation); var existed = incDate == null || incDate <= claimedStartDate.Value; _logger.LogDebug(" {Title}: IncDate={IncDate}, Existed={Existed}", m.Item.Title, incDate?.ToString() ?? "null", existed); return existed; }) .OrderByDescending(m => m.Score) .ToList(); _logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count); // If any matches existed at the start date, prefer those if (existedAtStartDate.Count > 0) { _logger.LogDebug("Selected: {Title} ({Number})", existedAtStartDate[0].Item.Title, existedAtStartDate[0].Item.CompanyNumber); return existedAtStartDate[0]; } // No companies existed at the claimed start date - don't match a wrong company _logger.LogDebug("No companies found that existed at claimed start date {StartDate}, returning no match", claimedStartDate.Value); return null; } // No start date provided - just use highest score var fallback = matches.OrderByDescending(m => m.Score).First(); _logger.LogDebug("No start date filter, using highest score: {Title} ({Number})", fallback.Item.Title, fallback.Item.CompanyNumber); return fallback; } private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? 
details) { try { await using var dbContext = await _dbContextFactory.CreateDbContextAsync(); var existingCache = await dbContext.CompanyCache .FirstOrDefaultAsync(c => c.CompanyNumber == item.CompanyNumber); var sicCodes = details?.SicCodes ?? item.SicCodes; var sicCodesJson = sicCodes != null ? JsonSerializer.Serialize(sicCodes) : null; var accountsCategory = details?.Accounts?.LastAccounts?.Type; if (existingCache is not null) { existingCache.CompanyName = item.Title; existingCache.Status = item.CompanyStatus ?? "Unknown"; existingCache.CompanyType = item.CompanyType; existingCache.IncorporationDate = DateHelpers.ParseDate(item.DateOfCreation); existingCache.DissolutionDate = DateHelpers.ParseDate(item.DateOfCessation); existingCache.AccountsCategory = accountsCategory; existingCache.SicCodesJson = sicCodesJson; existingCache.CachedAt = DateTime.UtcNow; } else { var cacheEntry = new CompanyCache { CompanyNumber = item.CompanyNumber, CompanyName = item.Title, Status = item.CompanyStatus ?? "Unknown", CompanyType = item.CompanyType, IncorporationDate = DateHelpers.ParseDate(item.DateOfCreation), DissolutionDate = DateHelpers.ParseDate(item.DateOfCessation), AccountsCategory = accountsCategory, SicCodesJson = sicCodesJson, CachedAt = DateTime.UtcNow }; dbContext.CompanyCache.Add(cacheEntry); } await dbContext.SaveChangesAsync(); } catch (DbUpdateException ex) when (ex.InnerException?.Message.Contains("PK_CompanyCache") == true) { // Race condition: another task already cached this company - ignore _logger.LogDebug("Company {CompanyNumber} already cached by another task", item.CompanyNumber); } } private CompanyVerificationResult CreateResultFromCache( CompanyCache cached, string claimedCompany, DateOnly? startDate, DateOnly? endDate, string? jobTitle, List flags) { var matchScore = Fuzz.TokenSetRatio( claimedCompany.ToUpperInvariant(), cached.CompanyName.ToUpperInvariant()); List? 
sicCodes = null; if (!string.IsNullOrEmpty(cached.SicCodesJson)) { try { sicCodes = JsonSerializer.Deserialize>(cached.SicCodesJson); } catch (JsonException) { // Ignore malformed JSON in cache } } // Run all verification checks CheckIncorporationDate(flags, startDate, cached.IncorporationDate, cached.CompanyName); CheckDissolutionDate(flags, endDate, cached.DissolutionDate, cached.Status, cached.CompanyName); CheckDormantCompany(flags, cached.AccountsCategory, jobTitle, cached.CompanyName); CheckCompanySizeVsRole(flags, cached.AccountsCategory, jobTitle, cached.CompanyName); CheckSicCodeMismatch(flags, sicCodes, jobTitle, cached.CompanyName); var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, cached.CompanyType); if (jobPlausible == false) { flags.Add(new CompanyVerificationFlag { Type = "ImplausibleJobTitle", Severity = "Critical", Message = jobNotes ?? "Job title requires verification", ScoreImpact = -15 }); } return new CompanyVerificationResult { ClaimedCompany = claimedCompany, MatchedCompanyName = cached.CompanyName, MatchedCompanyNumber = cached.CompanyNumber, MatchScore = matchScore, IsVerified = true, VerificationNotes = null, ClaimedStartDate = startDate, ClaimedEndDate = endDate, CompanyType = cached.CompanyType, CompanyStatus = cached.Status, IncorporationDate = cached.IncorporationDate, DissolutionDate = cached.DissolutionDate, AccountsCategory = cached.AccountsCategory, SicCodes = sicCodes, ClaimedJobTitle = jobTitle, JobTitlePlausible = jobPlausible, JobTitleNotes = jobNotes, Flags = flags }; } private static CompanyVerificationResult CreateUnverifiedResult( string companyName, DateOnly? startDate, DateOnly? endDate, string? 
jobTitle, string reason) { return new CompanyVerificationResult { ClaimedCompany = companyName, MatchedCompanyName = null, MatchedCompanyNumber = null, MatchScore = 0, IsVerified = false, VerificationNotes = reason, ClaimedStartDate = startDate, ClaimedEndDate = endDate, ClaimedJobTitle = jobTitle }; } /// /// Generates alternative search queries to find companies that may be registered /// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd"). /// Also handles "Brand (Parent Company)" format by extracting and prioritizing the parent. /// private static List GenerateSearchQueries(string companyName) { var queries = new HashSet(StringComparer.OrdinalIgnoreCase); var normalized = companyName.Trim(); // Step 0a: Check for "Brand (Parent Company)" format and extract parent company // Parent company is more likely to be the registered name, so search it first var parentMatch = System.Text.RegularExpressions.Regex.Match(normalized, @"\(([^)]+)\)\s*$"); if (parentMatch.Success) { var parentCompany = parentMatch.Groups[1].Value.Trim(); // Generate queries for parent company first (higher priority) foreach (var parentQuery in GenerateNameVariations(parentCompany)) { queries.Add(parentQuery); } // Also try the brand name without parenthetical var brandName = normalized[..parentMatch.Index].Trim(); if (brandName.Length >= 3) { foreach (var brandQuery in GenerateNameVariations(brandName)) { queries.Add(brandQuery); } } } // Step 0b: Check for "Name1/Name2" format (e.g., "ASDA/WALMART") // Try each part separately as they may be different registered names if (normalized.Contains('/')) { var parts = normalized.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); foreach (var part in parts) { if (part.Length >= 3) { foreach (var partQuery in GenerateNameVariations(part)) { queries.Add(partQuery); } } } } // Step 0c: Try first word as potential parent company (e.g., "UNILEVER BESTFOOD" -> "UNILEVER") // Many company names are 
"ParentCompany Division" or "ParentCompany Brand" var words = normalized.Split(' ', StringSplitOptions.RemoveEmptyEntries); if (words.Length >= 2) { var firstWord = words[0]; // Only try if first word is substantial (not "The", "A", common prefixes) var skipWords = new HashSet(StringComparer.OrdinalIgnoreCase) { "the", "a", "an", "uk", "british", "national", "international", "global", "new" }; if (firstWord.Length >= 4 && !skipWords.Contains(firstWord)) { foreach (var firstWordQuery in GenerateNameVariations(firstWord)) { queries.Add(firstWordQuery); } // Also try first word + PLC/Limited for major corporations queries.Add(firstWord + " PLC"); queries.Add(firstWord + " Limited"); } } // Also add variations of the full original name foreach (var query in GenerateNameVariations(normalized)) { queries.Add(query); } return queries.ToList(); } /// /// Generates name variations for a single company name (UK/U.K., Ltd/Limited, etc.) /// private static List GenerateNameVariations(string name) { var variations = new HashSet(StringComparer.OrdinalIgnoreCase) { name }; // Step 1: Generate UK/U.K. variations var ukVariants = new List { name }; if (name.Contains(" UK", StringComparison.OrdinalIgnoreCase)) { // Add U.K. variant var withDots = name .Replace(" UK ", " U.K. ", StringComparison.OrdinalIgnoreCase) .Replace(" UK", " U.K.", StringComparison.OrdinalIgnoreCase); if (withDots != name) ukVariants.Add(withDots); } if (name.Contains(" U.K.", StringComparison.OrdinalIgnoreCase)) { // Add UK variant (no dots) var withoutDots = name .Replace(" U.K. 
", " UK ", StringComparison.OrdinalIgnoreCase) .Replace(" U.K.", " UK", StringComparison.OrdinalIgnoreCase); if (withoutDots != name) ukVariants.Add(withoutDots); } // Step 2: For each UK variant, generate suffix variations (Ltd/Limited) foreach (var variant in ukVariants) { variations.Add(variant); // Try Ltd -> Limited if (variant.EndsWith(" Ltd", StringComparison.OrdinalIgnoreCase)) { variations.Add(variant[..^4] + " Limited"); } // Try Limited -> Ltd else if (variant.EndsWith(" Limited", StringComparison.OrdinalIgnoreCase)) { variations.Add(variant[..^8] + " Ltd"); } // Try PLC variations else if (variant.EndsWith(" PLC", StringComparison.OrdinalIgnoreCase)) { variations.Add(variant[..^4] + " Public Limited Company"); } else if (variant.EndsWith(" Public Limited Company", StringComparison.OrdinalIgnoreCase)) { variations.Add(variant[..^24] + " PLC"); } // Try Plc (mixed case) variations else if (variant.EndsWith(" Plc", StringComparison.Ordinal)) { variations.Add(variant[..^4] + " PLC"); variations.Add(variant[..^4] + " Public Limited Company"); } } // Step 3: Try core name without suffix var suffixesToRemove = new[] { " Ltd", " Limited", " PLC", " Plc", " LLP", " Inc", " Corporation", " Corp" }; var coreName = name; foreach (var suffix in suffixesToRemove) { if (coreName.EndsWith(suffix, StringComparison.OrdinalIgnoreCase)) { coreName = coreName[..^suffix.Length].Trim(); break; } } if (coreName != name && coreName.Length >= 3) { variations.Add(coreName); variations.Add(coreName + " Limited"); variations.Add(coreName + " PLC"); // Also add U.K. variant of core name if applicable if (coreName.Contains(" UK", StringComparison.OrdinalIgnoreCase)) { var coreWithDots = coreName .Replace(" UK ", " U.K. 
", StringComparison.OrdinalIgnoreCase) .Replace(" UK", " U.K.", StringComparison.OrdinalIgnoreCase); variations.Add(coreWithDots); variations.Add(coreWithDots + " Limited"); } } return variations.ToList(); } /// /// Determines which non-employment entity categories the search query is explicitly looking for. /// Returns a set of category names (e.g., "Clubs", "Trusts") that should NOT be filtered out. /// private static HashSet GetSearchEntityTypes(string originalLower, string queryLower) { var allowedCategories = new HashSet(); var searchTerms = originalLower + " " + queryLower; foreach (var (category, patterns) in NonEmploymentEntityPatterns) { // If any pattern from this category appears in the search, allow matches from that category if (patterns.Any(pattern => searchTerms.Contains(pattern))) { allowedCategories.Add(category); } } return allowedCategories; } /// /// Checks if a company title represents a valid employment entity. /// Filters out non-employment entities (clubs, trusts, etc.) unless the search explicitly targets that type. 
/// </summary>
    /// <param name="itemTitleLower">
    /// Candidate company title. Matching is a case-sensitive substring test against
    /// lower-case patterns, so the title is expected to be lower-cased already.
    /// </param>
    /// <param name="allowedCategories">Categories exempt from filtering because the search targeted them.</param>
    /// <returns>
    /// <c>false</c> when the title contains a pattern from any category that is not exempt;
    /// otherwise <c>true</c>.
    /// </returns>
    private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
    {
        // The title is rejected as soon as one non-exempt category has a pattern inside it.
        return !NonEmploymentEntityPatterns.Any(entry =>
            !allowedCategories.Contains(entry.Key)
            && entry.Value.Any(pattern => itemTitleLower.Contains(pattern)));
    }

    // Words ignored when extracting core identifiers from a company name.
    // They are too common to tell one company from another. Lookups are case-insensitive.
    private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
    {
        // Articles and conjunctions
        "the", "a", "an", "and", "or", "of", "for", "in", "at", "on", "by", "to", "with",

        // Geographic - Countries and regions
        "uk", "u.k.", "gb", "british", "britain", "england", "english",
        "scotland", "scottish", "wales", "welsh", "ireland", "irish", "northern",
        "europe", "european", "america", "american", "usa", "us", "u.s.", "u.s.a.",
        "canada", "canadian", "asia", "asian", "pacific", "atlantic",
        "australia", "australian", "africa", "african", "india", "indian",
        "france", "french", "germany", "german", "spain", "spanish",
        "italy", "italian", "japan", "japanese", "china", "chinese", "korea", "korean",
        "middle", "east", "west", "north", "south", "central", "western", "eastern",

        // Geographic - Cities
        "london", "manchester", "birmingham", "leeds", "glasgow", "edinburgh",
        "bristol", "liverpool", "sheffield", "newcastle", "cardiff", "belfast",
        "nottingham", "southampton", "portsmouth", "brighton", "leicester",
        "coventry", "hull",

        // Legal suffixes
        "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated",
        "corporation", "corp", "company", "co", "partners", "partnership",
        "enterprises", "unlimited", "registered", "cic", "cio",
        "se", "ag", "gmbh", "sarl", "bv", "nv",

        // Business descriptors
        "group", "holdings", "holding", "parent", "subsidiary", "division", "branch",
        "services", "service", "solutions", "solution",
        "consulting", "consultants", "consultancy", "management",
        "systems", "system", "technologies", "technology", "tech",
        "industries", "industry", "industrial", "commercial",
        "trading", "trade", "business", "businesses",
        "operations", "operational", "professional", "professionals",
        "resources", "resource", "network", "networks", "associates", "associated",

        // Size/Scope descriptors
        "national", "international", "global", "worldwide", "world",
        "regional", "local", "universal", "general", "standard",
        "premier", "prime", "first", "one",

        // Quality/Marketing terms ("premier" is listed under Size/Scope)
        "new", "modern", "advanced", "innovative", "elite", "premium",
        "quality", "superior", "excellent", "best", "top", "leading", "major",

        // Ownership indicators
        "royal", "imperial", "crown", "state", "public", "private",
        "independent", "mutual", "cooperative", "coop", "community",

        // Time-related
        "century", "millennium", "annual", "year", "years",

        // Numbers as words ("one" and "first" are listed under Size/Scope)
        "two", "three", "four", "five", "second", "third"
    };

    /// <summary>
    /// Extracts ALL core identifying words from a company name: the words that remain once
    /// parenthetical content, words shorter than two characters and skip words are removed.
    /// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"],
    /// "Bank of Scotland" -> ["BANK"] ("of" and "scotland" are both skip words).
    /// </summary>
    /// <param name="companyName">Raw company name; may be null, empty or whitespace.</param>
    /// <returns>
    /// The significant words, upper-cased with the invariant culture, in their original order.
    /// Empty when the name is blank or contains no significant word.
    /// </returns>
    private static List<string> ExtractCoreIdentifiers(string companyName)
    {
        var coreWords = new List<string>();
        if (string.IsNullOrWhiteSpace(companyName))
        {
            return coreWords;
        }

        // Remove parenthetical content first, e.g. "Acme (UK) Ltd" -> "Acme  Ltd".
        var withoutParentheses = System.Text.RegularExpressions.Regex
            .Replace(companyName, @"\([^)]*\)", "")
            .Trim();

        var tokens = withoutParentheses.Split(
            new[] { ' ', '-', '/', '&' },
            StringSplitOptions.RemoveEmptyEntries);

        foreach (var token in tokens)
        {
            // Strip punctuation that clings to the edges of a word ("Ltd.", "Sons,", "'Acme'").
            var candidate = token.Trim('.', ',', '\'');
            if (candidate.Length < 2 || IsSkipWord(candidate))
            {
                continue;
            }

            coreWords.Add(candidate.ToUpperInvariant());
        }

        return coreWords;

        // The trim above removes the final dot of dotted abbreviations ("U.K." -> "U.K"),
        // while SkipWords stores them with it ("u.k.", "u.s.", "u.s.a."). Probing only the
        // trimmed form would let those abbreviations through as identifiers, so the form
        // with the dot restored is checked as well.
        static bool IsSkipWord(string word) =>
            SkipWords.Contains(word)
            || (word.Contains('.') && SkipWords.Contains(word + "."));
    }

    #endregion
}