using System.Text.Json;
using FuzzySharp;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using TrueCV.Application.DTOs;
using TrueCV.Application.Helpers;
using TrueCV.Application.Interfaces;
using TrueCV.Application.Models;
using TrueCV.Domain.Entities;
using TrueCV.Infrastructure.Data;
using TrueCV.Infrastructure.ExternalApis;

namespace TrueCV.Infrastructure.Services;

/// <summary>
/// Verifies a claimed employer against Companies House search results (with a local
/// database cache in front of it) and raises flags when the claimed dates or job title
/// are inconsistent with the matched company's record.
/// </summary>
public sealed class CompanyVerifierService : ICompanyVerifierService
{
    /// <summary>Minimum FuzzySharp token-set score (0-100) for a company name to count as a match.</summary>
    private const int FuzzyMatchThreshold = 85;

    /// <summary>Minimum FuzzySharp ratio (0-100) for an officer name to count as the candidate.</summary>
    private const int DirectorNameMatchThreshold = 80;

    /// <summary>Cache rows older than this many days are ignored when looking for a cached match.</summary>
    private const int CacheExpirationDays = 30;

    private const string PublicLimitedCompanySuffix = " Public Limited Company";

    // SIC codes for tech/software companies
    private static readonly HashSet<string> TechSicCodes = new()
    {
        "62011", "62012", "62020", "62030", "62090", // Computer programming and consultancy
        "63110", "63120",                            // Data processing, hosting
        "58210", "58290",                            // Publishing of computer games, other software
        "61100", "61200", "61300", "61900"           // Telecommunications
    };

    // Job-title fragments that mark a role as technical (matched as substrings of the lower-cased title).
    private static readonly string[] TechTitleKeywords =
    {
        "software", "developer", "engineer", "programmer", "data scientist", "data analyst",
        "devops", "cloud", "machine learning", "frontend", "backend", "full stack", "fullstack"
    };

    // C-suite acronyms. These MUST be matched as whole words: as plain substrings
    // "cto" occurs in "director" and "coo" occurs in "coordinator".
    private static readonly string[] CsuiteAcronyms = { "ceo", "cto", "cfo", "coo", "cio" };

    // Longer C-suite phrases that are safe to match as substrings.
    private static readonly string[] CsuitePhrases =
    {
        "chief executive", "chief technology", "chief financial", "chief operating", "chief information",
        "managing director", "chairman", "chairwoman", "chairperson"
    };

    // Leading words that are too generic to be tried on their own as a "parent company" query.
    private static readonly HashSet<string> GenericLeadingWords = new(StringComparer.OrdinalIgnoreCase)
    {
        "the", "a", "an", "uk", "british", "national", "international", "global", "new"
    };

    // Matches a trailing "(...)" group, e.g. the "(Northern Foods Plc)" in "Matthew Walker (Northern Foods Plc)".
    private static readonly System.Text.RegularExpressions.Regex TrailingParenthetical =
        new(@"\(([^)]+)\)\s*$");

    // Matches a standalone "UK" token that follows a space. The trailing \b stops it
    // from firing inside longer words such as "Ukraine".
    private static readonly System.Text.RegularExpressions.Regex StandaloneUk =
        new(@"(?<= )UK\b",
            System.Text.RegularExpressions.RegexOptions.IgnoreCase |
            System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    private readonly CompaniesHouseClient _companiesHouseClient;

    // NOTE(review): the generic type argument was lost from the reviewed text;
    // ApplicationDbContext is assumed - confirm against TrueCV.Infrastructure.Data.
    private readonly IDbContextFactory<ApplicationDbContext> _dbContextFactory;

    private readonly ILogger<CompanyVerifierService> _logger;

    public CompanyVerifierService(
        CompaniesHouseClient companiesHouseClient,
        IDbContextFactory<ApplicationDbContext> dbContextFactory,
        ILogger<CompanyVerifierService> logger)
    {
        _companiesHouseClient = companiesHouseClient;
        _dbContextFactory = dbContextFactory;
        _logger = logger;
    }

    /// <summary>
    /// Matches <paramref name="companyName"/> to a company (cache first, then Companies House
    /// search with fallback query variations) and runs the verification checks against it.
    /// </summary>
    /// <param name="companyName">Employer name as claimed; must not be null or whitespace.</param>
    /// <param name="startDate">Claimed employment start, if known.</param>
    /// <param name="endDate">Claimed employment end; <c>null</c> is treated as current employment.</param>
    /// <param name="jobTitle">Claimed job title, if known.</param>
    /// <returns>
    /// A verified result with any flags raised, or an unverified result when no company
    /// matched or the Companies House rate limit was hit.
    /// </returns>
    /// <exception cref="ArgumentException"><paramref name="companyName"/> is null or whitespace.</exception>
    public async Task<CompanyVerificationResult> VerifyCompanyAsync(
        string companyName,
        DateOnly? startDate,
        DateOnly? endDate,
        string? jobTitle = null)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(companyName);

        _logger.LogDebug("Verifying company: {CompanyName}", companyName);

        var flags = new List<CompanyVerificationFlag>();

        // Try to find a cached match first (but only if it existed at claimed start date)
        var cachedMatch = await FindCachedMatchAsync(companyName);
        if (cachedMatch is not null)
        {
            var cacheValid = !startDate.HasValue
                || cachedMatch.IncorporationDate == null
                || cachedMatch.IncorporationDate <= startDate.Value;

            if (cacheValid)
            {
                _logger.LogDebug("Found cached company match for: {CompanyName}", companyName);
                return CreateResultFromCache(cachedMatch, companyName, startDate, endDate, jobTitle, flags);
            }

            _logger.LogDebug(
                "Cached company {CachedName} was incorporated after claimed start date, searching for alternatives",
                cachedMatch.CompanyName);
        }

        // Search Companies House with fallback queries
        try
        {
            var searchQueries = GenerateSearchQueries(companyName);

            // Guarded so the joined query string is only built when it will actually be logged.
            if (_logger.IsEnabled(LogLevel.Debug))
            {
                _logger.LogDebug(
                    "Generated {Count} search queries for '{CompanyName}': {Queries}",
                    searchQueries.Count,
                    companyName,
                    string.Join(", ", searchQueries.Select(q => $"'{q}'")));
            }

            (CompaniesHouseSearchItem Item, int Score)? bestMatch = null;

            foreach (var query in searchQueries)
            {
                _logger.LogDebug("Searching Companies House with query: {Query}", query);

                var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(query);
                if (searchResponse?.Items is null || searchResponse.Items.Count == 0)
                {
                    continue;
                }

                // Find best fuzzy match, preferring companies that existed at claimed start date.
                // Both the original name and the query are passed for matching flexibility.
                bestMatch = FindBestMatch(companyName, query, searchResponse.Items, startDate);
                if (bestMatch is not null)
                {
                    _logger.LogDebug("Found match with query '{Query}': {Company}", query, bestMatch.Value.Item.Title);
                    break;
                }
            }

            if (bestMatch is null)
            {
                _logger.LogDebug(
                    "No valid match found for: {CompanyName} after trying {Count} queries",
                    companyName,
                    searchQueries.Count);
                return CreateUnverifiedResult(
                    companyName, startDate, endDate, jobTitle,
                    "Company name could not be verified against official records");
            }

            var match = bestMatch.Value;

            // Fetch full company details for additional data
            var companyDetails = await _companiesHouseClient.GetCompanyAsync(match.Item.CompanyNumber);

            // Cache the matched company with full details
            await CacheCompanyAsync(match.Item, companyDetails);

            _logger.LogInformation(
                "Verified company {ClaimedName} matched to {MatchedName} with score {Score}%",
                companyName, match.Item.Title, match.Score);

            var incorporationDate = DateHelpers.ParseDate(match.Item.DateOfCreation);
            var dissolutionDate = DateHelpers.ParseDate(match.Item.DateOfCessation);
            var companyStatus = match.Item.CompanyStatus;
            var companyType = match.Item.CompanyType;
            var sicCodes = companyDetails?.SicCodes ?? match.Item.SicCodes;
            var accountsCategory = companyDetails?.Accounts?.LastAccounts?.Type;

            var (jobPlausible, jobNotes) = RunVerificationChecks(
                flags, startDate, endDate, jobTitle, match.Item.Title,
                incorporationDate, dissolutionDate, companyStatus, companyType, accountsCategory, sicCodes);

            return new CompanyVerificationResult
            {
                ClaimedCompany = companyName,
                MatchedCompanyName = match.Item.Title,
                MatchedCompanyNumber = match.Item.CompanyNumber,
                MatchScore = match.Score,
                IsVerified = true,
                VerificationNotes = null,
                ClaimedStartDate = startDate,
                ClaimedEndDate = endDate,
                CompanyType = companyType,
                CompanyStatus = companyStatus,
                IncorporationDate = incorporationDate,
                DissolutionDate = dissolutionDate,
                AccountsCategory = accountsCategory,
                SicCodes = sicCodes,
                ClaimedJobTitle = jobTitle,
                JobTitlePlausible = jobPlausible,
                JobTitleNotes = jobNotes,
                Flags = flags
            };
        }
        catch (CompaniesHouseRateLimitException ex)
        {
            _logger.LogWarning(ex, "Rate limit hit while verifying company: {CompanyName}", companyName);
            return CreateUnverifiedResult(
                companyName, startDate, endDate, jobTitle,
                "Verification temporarily unavailable due to rate limiting");
        }
    }

    /// <summary>
    /// Runs a Companies House search and projects the hits to <see cref="CompanySearchResult"/>.
    /// </summary>
    /// <param name="query">Search text; must not be null or whitespace.</param>
    /// <returns>The projected hits, or an empty list when the response has no items.</returns>
    /// <exception cref="ArgumentException"><paramref name="query"/> is null or whitespace.</exception>
    // NOTE(review): the return type's generic arguments were lost from the reviewed text;
    // Task<List<CompanySearchResult>> is inferred from the ToList() return - confirm against ICompanyVerifierService.
    public async Task<List<CompanySearchResult>> SearchCompaniesAsync(string query)
    {
        ArgumentException.ThrowIfNullOrWhiteSpace(query);

        _logger.LogDebug("Searching companies for query: {Query}", query);

        var response = await _companiesHouseClient.SearchCompaniesAsync(query);
        if (response?.Items is null)
        {
            return [];
        }

        return response.Items
            .Select(item => new CompanySearchResult
            {
                CompanyNumber = item.CompanyNumber,
                CompanyName = item.Title,
                CompanyStatus = item.CompanyStatus ?? "Unknown",
                IncorporationDate = DateHelpers.ParseDate(item.DateOfCreation),
                AddressSnippet = item.AddressSnippet
            })
            .ToList();
    }

    /// <summary>
    /// Checks whether <paramref name="candidateName"/> appears among the company's officers
    /// (director or secretary roles) with an appointment overlapping the claimed period.
    /// </summary>
    /// <returns>
    /// <c>true</c> when a matching officer is found; <c>false</c> when officers were retrieved
    /// but none matched; <c>null</c> when the check could not be performed (blank input,
    /// no officers returned, rate limit, or any other error).
    /// </returns>
    public async Task<bool?> VerifyDirectorAsync(
        string companyNumber,
        string candidateName,
        DateOnly? startDate,
        DateOnly? endDate)
    {
        if (string.IsNullOrWhiteSpace(companyNumber) || string.IsNullOrWhiteSpace(candidateName))
        {
            return null;
        }

        try
        {
            var officers = await _companiesHouseClient.GetOfficersAsync(companyNumber);
            if (officers?.Items is null || officers.Items.Count == 0)
            {
                _logger.LogDebug("No officers found for company {CompanyNumber}", companyNumber);
                return null;
            }

            var normalizedCandidate = NormalizeName(candidateName);

            foreach (var officer in officers.Items)
            {
                // Only director-like and secretary roles are considered
                var role = officer.OfficerRole?.ToLowerInvariant() ?? "";
                if (!role.Contains("director") && !role.Contains("secretary"))
                {
                    continue;
                }

                var normalizedOfficer = NormalizeName(officer.Name);
                var matchScore = Fuzz.Ratio(normalizedCandidate, normalizedOfficer);
                if (matchScore < DirectorNameMatchThreshold)
                {
                    continue;
                }

                // If no claimed dates, a name match alone is sufficient
                if (!startDate.HasValue && !endDate.HasValue)
                {
                    _logger.LogDebug(
                        "Found matching director {OfficerName} for candidate {CandidateName} at company {CompanyNumber}",
                        officer.Name, candidateName, companyNumber);
                    return true;
                }

                var appointedOn = DateHelpers.ParseDate(officer.AppointedOn);
                var resignedOn = DateHelpers.ParseDate(officer.ResignedOn);

                if (DatesOverlap(startDate, endDate, appointedOn, resignedOn))
                {
                    _logger.LogDebug(
                        "Verified director {OfficerName} matches candidate {CandidateName} with overlapping dates",
                        officer.Name, candidateName);
                    return true;
                }
            }

            _logger.LogDebug(
                "No matching director found for candidate {CandidateName} at company {CompanyNumber}",
                candidateName, companyNumber);
            return false;
        }
        catch (CompaniesHouseRateLimitException)
        {
            _logger.LogWarning("Rate limit hit while verifying director for company {CompanyNumber}", companyNumber);
            return null;
        }
        catch (Exception ex)
        {
            // Deliberate best-effort: a failed director lookup yields "unknown" instead of failing the caller.
            _logger.LogError(ex, "Error verifying director for company {CompanyNumber}", companyNumber);
            return null;
        }
    }

    /// <summary>
    /// Upper-cases and trims a person name, rewriting "SURNAME, Firstname" as "FIRSTNAME SURNAME".
    /// Returns an empty string for null or whitespace input.
    /// </summary>
    private static string NormalizeName(string? name)
    {
        if (string.IsNullOrWhiteSpace(name))
        {
            return "";
        }

        // Companies House often stores names as "SURNAME, Firstname"
        var normalized = name.ToUpperInvariant().Trim();
        if (normalized.Contains(','))
        {
            var parts = normalized.Split(',', 2);
            if (parts.Length == 2)
            {
                normalized = $"{parts[1].Trim()} {parts[0].Trim()}";
            }
        }

        return normalized;
    }

    /// <summary>
    /// Returns whether two inclusive date ranges overlap. A missing bound is treated as
    /// open-ended, and a range with neither bound is assumed to overlap anything.
    /// </summary>
    private static bool DatesOverlap(DateOnly? start1, DateOnly? end1, DateOnly? start2, DateOnly? end2)
    {
        if (!start1.HasValue && !end1.HasValue)
        {
            return true;
        }

        if (!start2.HasValue && !end2.HasValue)
        {
            return true;
        }

        var s1 = start1 ?? DateOnly.MinValue;
        var e1 = end1 ?? DateOnly.MaxValue;
        var s2 = start2 ?? DateOnly.MinValue;
        var e2 = end2 ?? DateOnly.MaxValue;

        // Two ranges overlap when each one starts no later than the other ends
        return s1 <= e2 && s2 <= e1;
    }

    #region Verification Checks

    /// <summary>
    /// Runs every verification check against one matched company, appending any flags to
    /// <paramref name="flags"/>. Shared by the live-lookup and cached paths so both apply
    /// the same checks in the same order.
    /// </summary>
    /// <returns>The job-title plausibility verdict and its note, for inclusion in the result.</returns>
    private static (bool? IsPlausible, string? Notes) RunVerificationChecks(
        List<CompanyVerificationFlag> flags,
        DateOnly? startDate,
        DateOnly? endDate,
        string? jobTitle,
        string matchedCompanyName,
        DateOnly? incorporationDate,
        DateOnly? dissolutionDate,
        string? companyStatus,
        string? companyType,
        string? accountsCategory,
        List<string>? sicCodes)
    {
        // Check 1: Employment before company incorporation
        CheckIncorporationDate(flags, startDate, incorporationDate, matchedCompanyName);

        // Check 2: Employment at dissolved company
        CheckDissolutionDate(flags, endDate, dissolutionDate, companyStatus, matchedCompanyName);

        // Check 3: Dormant company check
        CheckDormantCompany(flags, accountsCategory, jobTitle, matchedCompanyName);

        // Check 4: Company size vs job title
        CheckCompanySizeVsRole(flags, accountsCategory, jobTitle, matchedCompanyName);

        // Check 5: SIC code vs job title mismatch
        CheckSicCodeMismatch(flags, sicCodes, jobTitle, matchedCompanyName);

        // Check 6: Job title plausibility for PLCs
        var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, companyType);
        if (jobPlausible == false)
        {
            flags.Add(new CompanyVerificationFlag
            {
                Type = "ImplausibleJobTitle",
                Severity = "Critical",
                Message = jobNotes ?? "Job title requires verification",
                ScoreImpact = -15
            });
        }

        return (jobPlausible, jobNotes);
    }

    /// <summary>
    /// Returns whether <paramref name="word"/> occurs in <paramref name="text"/> as a whole
    /// token, i.e. not directly preceded or followed by a letter or digit.
    /// </summary>
    private static bool ContainsWord(string text, string word)
    {
        var index = text.IndexOf(word, StringComparison.Ordinal);
        while (index >= 0)
        {
            var end = index + word.Length;
            var cleanStart = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
            var cleanEnd = end == text.Length || !char.IsLetterOrDigit(text[end]);
            if (cleanStart && cleanEnd)
            {
                return true;
            }

            index = text.IndexOf(word, index + 1, StringComparison.Ordinal);
        }

        return false;
    }

    /// <summary>Flags employment claimed to start before the company was incorporated.</summary>
    private static void CheckIncorporationDate(
        List<CompanyVerificationFlag> flags,
        DateOnly? claimedStartDate,
        DateOnly? incorporationDate,
        string companyName)
    {
        if (!claimedStartDate.HasValue || !incorporationDate.HasValue)
        {
            return;
        }

        if (claimedStartDate.Value < incorporationDate.Value)
        {
            flags.Add(new CompanyVerificationFlag
            {
                Type = "EmploymentBeforeIncorporation",
                Severity = "Critical",
                Message = $"Claimed employment at '{companyName}' starting {claimedStartDate:MMM yyyy} is before company incorporation date {incorporationDate:MMM yyyy}",
                ScoreImpact = -20
            });
        }
    }

    /// <summary>
    /// Flags employment that continues more than three months past the company's cessation
    /// date, or that is claimed as current, when the company status is dissolved,
    /// liquidation or administration.
    /// </summary>
    private static void CheckDissolutionDate(
        List<CompanyVerificationFlag> flags,
        DateOnly? claimedEndDate,
        DateOnly? dissolutionDate,
        string? companyStatus,
        string companyName)
    {
        var isDissolvedStatus = companyStatus?.ToLowerInvariant() is "dissolved" or "liquidation" or "administration";
        if (!dissolutionDate.HasValue || !isDissolvedStatus)
        {
            return;
        }

        // Allow 3 month buffer for wind-down
        var bufferDate = dissolutionDate.Value.AddMonths(3);

        if (claimedEndDate.HasValue && claimedEndDate.Value > bufferDate)
        {
            flags.Add(new CompanyVerificationFlag
            {
                Type = "EmploymentAtDissolvedCompany",
                Severity = "Critical",
                Message = $"Claimed employment at '{companyName}' until {claimedEndDate:MMM yyyy} but company was dissolved on {dissolutionDate:MMM yyyy}",
                ScoreImpact = -20
            });
        }
        else if (!claimedEndDate.HasValue)
        {
            // No end date means the candidate claims to still work there
            flags.Add(new CompanyVerificationFlag
            {
                Type = "CurrentEmploymentAtDissolvedCompany",
                Severity = "Critical",
                Message = $"Claims current employment at '{companyName}' but company was dissolved on {dissolutionDate:MMM yyyy}",
                ScoreImpact = -25
            });
        }
    }

    /// <summary>
    /// Flags non-director roles at a company whose last accounts type contains "dormant".
    /// </summary>
    private static void CheckDormantCompany(
        List<CompanyVerificationFlag> flags,
        string? accountsCategory,
        string? jobTitle,
        string companyName)
    {
        if (string.IsNullOrWhiteSpace(accountsCategory))
        {
            return;
        }

        if (!accountsCategory.ToLowerInvariant().Contains("dormant"))
        {
            return;
        }

        // Directors can maintain dormant companies, but other roles are suspicious
        var title = jobTitle?.ToLowerInvariant() ?? "";
        var isDirectorRole = title.Contains("director") || title.Contains("company secretary");
        if (isDirectorRole)
        {
            return;
        }

        flags.Add(new CompanyVerificationFlag
        {
            Type = "EmploymentAtDormantCompany",
            Severity = "Warning",
            Message = $"Claimed active employment as '{jobTitle}' at '{companyName}' which files dormant accounts",
            ScoreImpact = -10
        });
    }

    /// <summary>
    /// Flags senior-management titles at a company whose last accounts type contains "micro".
    /// </summary>
    private static void CheckCompanySizeVsRole(
        List<CompanyVerificationFlag> flags,
        string? accountsCategory,
        string? jobTitle,
        string companyName)
    {
        if (string.IsNullOrWhiteSpace(accountsCategory) || string.IsNullOrWhiteSpace(jobTitle))
        {
            return;
        }

        var category = accountsCategory.ToLowerInvariant();
        var title = jobTitle.ToLowerInvariant();

        // Micro-entity: < 10 employees, < £632k turnover
        var isMicroEntity = category.Contains("micro");

        var isSeniorRole =
            title.Contains("vp") ||
            title.Contains("vice president") ||
            title.Contains("head of") ||
            title.Contains("chief") ||
            title.Contains("director of") ||
            title.Contains("senior director");

        if (isMicroEntity && isSeniorRole)
        {
            flags.Add(new CompanyVerificationFlag
            {
                Type = "SeniorRoleAtMicroCompany",
                Severity = "Warning",
                Message = $"Claimed senior role '{jobTitle}' at '{companyName}' which files micro-entity accounts (typically <10 employees)",
                ScoreImpact = -10
            });
        }
    }

    /// <summary>
    /// Adds an informational flag when a technical job title is claimed at a company that
    /// has none of the <see cref="TechSicCodes"/>.
    /// </summary>
    private static void CheckSicCodeMismatch(
        List<CompanyVerificationFlag> flags,
        List<string>? sicCodes,
        string? jobTitle,
        string companyName)
    {
        if (sicCodes is null || sicCodes.Count == 0 || string.IsNullOrWhiteSpace(jobTitle))
        {
            return;
        }

        var title = jobTitle.ToLowerInvariant();

        // "ai" is matched as a whole word so that "Head of AI" counts but "Thai ..." does not.
        var isTechRole =
            TechTitleKeywords.Any(keyword => title.Contains(keyword, StringComparison.Ordinal)) ||
            ContainsWord(title, "ai");

        if (!isTechRole || sicCodes.Any(code => TechSicCodes.Contains(code)))
        {
            return;
        }

        // Only the code itself is shown; no description lookup is done here
        var primarySic = sicCodes.FirstOrDefault() ?? "Unknown";
        flags.Add(new CompanyVerificationFlag
        {
            Type = "SicCodeMismatch",
            Severity = "Info",
            Message = $"Tech role '{jobTitle}' at '{companyName}' (SIC: {primarySic}) - company is not registered as a technology business",
            ScoreImpact = -5
        });
    }

    /// <summary>
    /// Judges whether a claimed title is plausible for the company type. Only PLCs are
    /// scrutinised: C-suite, board-level and VP-level titles there are reported as
    /// implausible-until-verified.
    /// </summary>
    /// <returns>
    /// <c>(null, null)</c> when either input is blank; <c>(false, note)</c> for a senior
    /// title at a PLC; otherwise <c>(true, null)</c>.
    /// </returns>
    private static (bool? IsPlausible, string? Notes) CheckJobTitlePlausibility(string? jobTitle, string? companyType)
    {
        if (string.IsNullOrWhiteSpace(jobTitle) || string.IsNullOrWhiteSpace(companyType))
        {
            return (null, null);
        }

        var title = jobTitle.Trim().ToLowerInvariant();
        var type = companyType.Trim().ToLowerInvariant();

        // Only PLCs (Public Limited Companies) are checked; every other type is plausible
        var isPlc = type.Contains("plc") || type.Contains("public limited");
        if (!isPlc)
        {
            return (true, null);
        }

        var isVpRole =
            title.Contains("vice president") ||
            title.Contains("vp ") ||
            title.StartsWith("vp", StringComparison.Ordinal) ||
            title.Contains("svp") ||
            title.Contains("evp");

        // Acronyms are matched as whole words (see CsuiteAcronyms). "president" counts as
        // C-suite only when the title is not a vice-president variant, so VP titles get
        // the VP-specific note below.
        var isCsuiteRole =
            CsuiteAcronyms.Any(acronym => ContainsWord(title, acronym)) ||
            CsuitePhrases.Any(phrase => title.Contains(phrase, StringComparison.Ordinal)) ||
            title == "md" ||
            (title.Contains("president") && !isVpRole);

        // A bare "director" is board-level; "director of X" style titles are not
        var isBoardRole =
            title.Contains("board member") ||
            title.Contains("non-executive director") ||
            title.Contains("executive director") ||
            title == "director";

        if (isCsuiteRole || isBoardRole)
        {
            return (false, $"Claimed senior role '{jobTitle}' at a PLC requires verification - C-suite positions at public companies are publicly disclosed");
        }

        if (isVpRole)
        {
            return (false, $"Claimed VP-level role '{jobTitle}' at a PLC - senior positions at public companies should be verifiable");
        }

        return (true, null);
    }

    #endregion

    #region Helper Methods

    /// <summary>
    /// Returns the best fuzzy match for <paramref name="companyName"/> among cache rows
    /// written within the last <see cref="CacheExpirationDays"/> days, or <c>null</c>
    /// when none reaches <see cref="FuzzyMatchThreshold"/>.
    /// </summary>
    private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
    {
        var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);

        await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

        // Read-only query: the rows are scored in memory and never saved back through this context
        var cachedCompanies = await dbContext.CompanyCache
            .AsNoTracking()
            .Where(c => c.CachedAt >= cutoffDate)
            .ToListAsync();

        if (cachedCompanies.Count == 0)
        {
            return null;
        }

        var normalizedName = companyName.ToUpperInvariant();

        var best = cachedCompanies
            .Where(c => !string.IsNullOrWhiteSpace(c.CompanyName))
            .Select(c => new
            {
                Company = c,
                Score = Fuzz.TokenSetRatio(normalizedName, c.CompanyName.ToUpperInvariant())
            })
            .Where(m => m.Score >= FuzzyMatchThreshold)
            .OrderByDescending(m => m.Score)
            .FirstOrDefault();

        return best?.Company;
    }

    /// <summary>
    /// Picks the highest-scoring search hit at or above <see cref="FuzzyMatchThreshold"/>.
    /// Each hit is scored against both the original claimed name and the query that
    /// produced it, keeping the higher score. When a claimed start date is given, only
    /// hits incorporated on or before it (or with no parseable incorporation date) qualify.
    /// </summary>
    /// <returns>The chosen hit and its score, or <c>null</c> when nothing qualifies.</returns>
    private (CompaniesHouseSearchItem Item, int Score)? FindBestMatch(
        string companyName,
        string searchQuery,
        List<CompaniesHouseSearchItem> items,
        DateOnly? claimedStartDate)
    {
        var normalizedOriginal = companyName.ToUpperInvariant();
        var normalizedQuery = searchQuery.ToUpperInvariant();

        // Match against both the original company name AND the search query used.
        // This handles cases like "Matthew Walker (Northern Foods Plc)" where we
        // search for "Northern Foods Plc" and must score against that, not the full name.
        var matches = items
            .Where(item => !string.IsNullOrWhiteSpace(item.Title))
            .Select(item =>
            {
                var itemTitle = item.Title.ToUpperInvariant();
                var scoreVsOriginal = Fuzz.TokenSetRatio(normalizedOriginal, itemTitle);
                var scoreVsQuery = Fuzz.TokenSetRatio(normalizedQuery, itemTitle);
                return (Item: item, Score: Math.Max(scoreVsOriginal, scoreVsQuery));
            })
            .Where(m => m.Score >= FuzzyMatchThreshold)
            .ToList();

        _logger.LogDebug(
            "Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')",
            matches.Count, companyName, searchQuery);

        foreach (var m in matches.Take(5))
        {
            _logger.LogDebug(
                "  Match: {Title} ({Number}), Score: {Score}, DateOfCreation: {Date}",
                m.Item.Title, m.Item.CompanyNumber, m.Score, m.Item.DateOfCreation ?? "null");
        }

        if (matches.Count == 0)
        {
            return null;
        }

        if (!claimedStartDate.HasValue)
        {
            // No start date provided - just use highest score
            var fallback = matches.OrderByDescending(m => m.Score).First();
            _logger.LogDebug(
                "No start date filter, using highest score: {Title} ({Number})",
                fallback.Item.Title, fallback.Item.CompanyNumber);
            return fallback;
        }

        _logger.LogDebug(
            "Filtering for companies that existed at claimed start date: {StartDate}",
            claimedStartDate.Value);

        var existedAtStartDate = matches
            .Where(m =>
            {
                var incDate = DateHelpers.ParseDate(m.Item.DateOfCreation);
                var existed = incDate == null || incDate <= claimedStartDate.Value;
                _logger.LogDebug(
                    "  {Title}: IncDate={IncDate}, Existed={Existed}",
                    m.Item.Title, incDate?.ToString() ?? "null", existed);
                return existed;
            })
            .OrderByDescending(m => m.Score)
            .ToList();

        _logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count);

        if (existedAtStartDate.Count > 0)
        {
            _logger.LogDebug(
                "Selected: {Title} ({Number})",
                existedAtStartDate[0].Item.Title, existedAtStartDate[0].Item.CompanyNumber);
            return existedAtStartDate[0];
        }

        // No companies existed at the claimed start date - don't match a wrong company
        _logger.LogDebug(
            "No companies found that existed at claimed start date {StartDate}, returning no match",
            claimedStartDate.Value);
        return null;
    }

    /// <summary>
    /// Inserts or refreshes the cache row for a matched company. A primary-key conflict
    /// caused by a concurrent insert of the same company is logged and ignored.
    /// </summary>
    private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)
    {
        try
        {
            await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

            var existingCache = await dbContext.CompanyCache
                .FirstOrDefaultAsync(c => c.CompanyNumber == item.CompanyNumber);

            var sicCodes = details?.SicCodes ?? item.SicCodes;
            var sicCodesJson = sicCodes != null ? JsonSerializer.Serialize(sicCodes) : null;
            var accountsCategory = details?.Accounts?.LastAccounts?.Type;

            if (existingCache is not null)
            {
                existingCache.CompanyName = item.Title;
                existingCache.Status = item.CompanyStatus ?? "Unknown";
                existingCache.CompanyType = item.CompanyType;
                existingCache.IncorporationDate = DateHelpers.ParseDate(item.DateOfCreation);
                existingCache.DissolutionDate = DateHelpers.ParseDate(item.DateOfCessation);
                existingCache.AccountsCategory = accountsCategory;
                existingCache.SicCodesJson = sicCodesJson;
                existingCache.CachedAt = DateTime.UtcNow;
            }
            else
            {
                var cacheEntry = new CompanyCache
                {
                    CompanyNumber = item.CompanyNumber,
                    CompanyName = item.Title,
                    Status = item.CompanyStatus ?? "Unknown",
                    CompanyType = item.CompanyType,
                    IncorporationDate = DateHelpers.ParseDate(item.DateOfCreation),
                    DissolutionDate = DateHelpers.ParseDate(item.DateOfCessation),
                    AccountsCategory = accountsCategory,
                    SicCodesJson = sicCodesJson,
                    CachedAt = DateTime.UtcNow
                };

                dbContext.CompanyCache.Add(cacheEntry);
            }

            await dbContext.SaveChangesAsync();
        }
        catch (DbUpdateException ex) when (ex.InnerException?.Message.Contains("PK_CompanyCache") == true)
        {
            // Race condition: another task already cached this company - ignore
            _logger.LogDebug("Company {CompanyNumber} already cached by another task", item.CompanyNumber);
        }
    }

    /// <summary>
    /// Builds a verified result from a cached company row, running the same verification
    /// checks as the live-lookup path.
    /// </summary>
    private CompanyVerificationResult CreateResultFromCache(
        CompanyCache cached,
        string claimedCompany,
        DateOnly? startDate,
        DateOnly? endDate,
        string? jobTitle,
        List<CompanyVerificationFlag> flags)
    {
        var matchScore = Fuzz.TokenSetRatio(
            claimedCompany.ToUpperInvariant(),
            cached.CompanyName.ToUpperInvariant());

        List<string>? sicCodes = null;
        if (!string.IsNullOrEmpty(cached.SicCodesJson))
        {
            try
            {
                sicCodes = JsonSerializer.Deserialize<List<string>>(cached.SicCodesJson);
            }
            catch (JsonException ex)
            {
                // Malformed cache JSON is not fatal: continue without SIC codes, but leave a trace
                _logger.LogWarning(
                    ex,
                    "Ignoring malformed SIC code JSON in cache for company {CompanyNumber}",
                    cached.CompanyNumber);
            }
        }

        var (jobPlausible, jobNotes) = RunVerificationChecks(
            flags, startDate, endDate, jobTitle, cached.CompanyName,
            cached.IncorporationDate, cached.DissolutionDate, cached.Status,
            cached.CompanyType, cached.AccountsCategory, sicCodes);

        return new CompanyVerificationResult
        {
            ClaimedCompany = claimedCompany,
            MatchedCompanyName = cached.CompanyName,
            MatchedCompanyNumber = cached.CompanyNumber,
            MatchScore = matchScore,
            IsVerified = true,
            VerificationNotes = null,
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            CompanyType = cached.CompanyType,
            CompanyStatus = cached.Status,
            IncorporationDate = cached.IncorporationDate,
            DissolutionDate = cached.DissolutionDate,
            AccountsCategory = cached.AccountsCategory,
            SicCodes = sicCodes,
            ClaimedJobTitle = jobTitle,
            JobTitlePlausible = jobPlausible,
            JobTitleNotes = jobNotes,
            Flags = flags
        };
    }

    /// <summary>
    /// Builds an unverified result that echoes the claim and carries <paramref name="reason"/>
    /// as the verification note.
    /// </summary>
    private static CompanyVerificationResult CreateUnverifiedResult(
        string companyName,
        DateOnly? startDate,
        DateOnly? endDate,
        string? jobTitle,
        string reason)
    {
        return new CompanyVerificationResult
        {
            ClaimedCompany = companyName,
            MatchedCompanyName = null,
            MatchedCompanyNumber = null,
            MatchScore = 0,
            IsVerified = false,
            VerificationNotes = reason,
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            ClaimedJobTitle = jobTitle
        };
    }

    /// <summary>
    /// Generates alternative search queries to find companies that may be registered
    /// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd").
    /// Also handles "Brand (Parent Company)" format by extracting and prioritizing the parent.
    /// </summary>
    /// <returns>De-duplicated (case-insensitive) queries in the order they were generated.</returns>
    private static List<string> GenerateSearchQueries(string companyName)
    {
        var queries = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        var normalized = companyName.Trim();

        // Step 0a: "Brand (Parent Company)" - the parent is more likely to be the
        // registered name, so its variations are queued first
        var parentMatch = TrailingParenthetical.Match(normalized);
        if (parentMatch.Success)
        {
            var parentCompany = parentMatch.Groups[1].Value.Trim();
            queries.UnionWith(GenerateNameVariations(parentCompany));

            // Also try the brand name without the parenthetical
            var brandName = normalized[..parentMatch.Index].Trim();
            if (brandName.Length >= 3)
            {
                queries.UnionWith(GenerateNameVariations(brandName));
            }
        }

        // Step 0b: "Name1/Name2" (e.g., "ASDA/WALMART") - each part may be a separate registered name
        if (normalized.Contains('/'))
        {
            var parts = normalized.Split(
                '/',
                StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

            foreach (var part in parts.Where(p => p.Length >= 3))
            {
                queries.UnionWith(GenerateNameVariations(part));
            }
        }

        // Step 0c: first word as potential parent company (e.g., "UNILEVER BESTFOOD" -> "UNILEVER").
        // Many company names are "ParentCompany Division" or "ParentCompany Brand".
        var words = normalized.Split(' ', StringSplitOptions.RemoveEmptyEntries);
        if (words.Length >= 2)
        {
            var firstWord = words[0];

            // Only try if the first word is substantial (not "The", "A", common prefixes)
            if (firstWord.Length >= 4 && !GenericLeadingWords.Contains(firstWord))
            {
                queries.UnionWith(GenerateNameVariations(firstWord));

                // Also try first word + PLC/Limited for major corporations
                queries.Add(firstWord + " PLC");
                queries.Add(firstWord + " Limited");
            }
        }

        // Finally, variations of the full original name
        queries.UnionWith(GenerateNameVariations(normalized));

        return queries.ToList();
    }

    /// <summary>
    /// Generates name variations for a single company name (UK/U.K., Ltd/Limited, etc.)
    /// </summary>
    /// <returns>De-duplicated (case-insensitive) variations, always including <paramref name="name"/> itself.</returns>
    private static List<string> GenerateNameVariations(string name)
    {
        var variations = new HashSet<string>(StringComparer.OrdinalIgnoreCase) { name };

        // Step 1: Generate UK/U.K. variations
        var ukVariants = new List<string> { name };

        var withDots = StandaloneUk.Replace(name, "U.K.");
        if (withDots != name)
        {
            ukVariants.Add(withDots);
        }

        var withoutDots = name.Replace(" U.K.", " UK", StringComparison.OrdinalIgnoreCase);
        if (withoutDots != name)
        {
            ukVariants.Add(withoutDots);
        }

        // Step 2: For each UK variant, generate suffix variations
        foreach (var variant in ukVariants)
        {
            variations.Add(variant);

            if (variant.EndsWith(" Ltd", StringComparison.OrdinalIgnoreCase))
            {
                variations.Add(variant[..^" Ltd".Length] + " Limited");
            }
            else if (variant.EndsWith(" Limited", StringComparison.OrdinalIgnoreCase))
            {
                variations.Add(variant[..^" Limited".Length] + " Ltd");
            }
            else if (variant.EndsWith(" PLC", StringComparison.OrdinalIgnoreCase))
            {
                // Case-insensitive, so this also covers "Plc" and "plc"
                variations.Add(variant[..^" PLC".Length] + PublicLimitedCompanySuffix);
            }
            else if (variant.EndsWith(PublicLimitedCompanySuffix, StringComparison.OrdinalIgnoreCase))
            {
                // Slice by the suffix's own length so the core name is kept intact
                variations.Add(variant[..^PublicLimitedCompanySuffix.Length] + " PLC");
            }
        }

        // Step 3: Try core name without suffix (first matching suffix wins)
        var suffixesToRemove = new[] { " Ltd", " Limited", " PLC", " LLP", " Inc", " Corporation", " Corp" };
        var coreName = name;
        foreach (var suffix in suffixesToRemove)
        {
            if (coreName.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
            {
                coreName = coreName[..^suffix.Length].Trim();
                break;
            }
        }

        if (coreName != name && coreName.Length >= 3)
        {
            variations.Add(coreName);
            variations.Add(coreName + " Limited");
            variations.Add(coreName + " PLC");

            // Also add U.K. variant of core name if applicable
            if (StandaloneUk.IsMatch(coreName))
            {
                var coreWithDots = StandaloneUk.Replace(coreName, "U.K.");
                variations.Add(coreWithDots);
                variations.Add(coreWithDots + " Limited");
            }
        }

        return variations.ToList();
    }

    #endregion
}