diff --git a/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs b/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs index 1b29755..c05b1ae 100644 --- a/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs +++ b/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs @@ -45,6 +45,32 @@ public sealed class CompanyVerifierService : ICompanyVerifierService "68100", // Buying and selling of own real estate (often shell) }; + // Words that indicate a subsidiary rather than the main trading company + // When someone says they worked for "ASDA", they likely mean ASDA STORES LIMITED, + // not ASDA DELIVERY LIMITED or ASDA PROPERTY HOLDINGS LIMITED + private static readonly HashSet SubsidiaryIndicators = new(StringComparer.OrdinalIgnoreCase) + { + // Logistics/Operations subsidiaries + "delivery", "distribution", "logistics", "transport", "fleet", "haulage", "warehousing", "fulfilment", + // Property subsidiaries + "property", "properties", "estates", "land", "real estate", "developments", + // Financial/Holding subsidiaries + "holdings", "holding", "investments", "capital", "finance", "financial", "treasury", + // Administrative subsidiaries + "nominees", "nominee", "trustees", "trustee", "secretarial", "registrars", + // Insurance subsidiaries + "insurance", "assurance", "underwriting", + // Specific function subsidiaries + "leasing", "rentals", "procurement", "sourcing" + }; + + // Words that indicate a main trading/employer company (prefer these) + private static readonly HashSet MainCompanyIndicators = new(StringComparer.OrdinalIgnoreCase) + { + "stores", "retail", "supermarkets", "superstores", "hypermarkets", + "manufacturing", "operations", "trading" + }; + // SIC codes for tech/software companies private static readonly HashSet TechSicCodes = new() { @@ -657,9 +683,15 @@ public sealed class CompanyVerifierService : ICompanyVerifierService .Select(item => { var itemTitle = item.Title.ToUpperInvariant(); + var itemTitleLower = 
item.Title.ToLowerInvariant(); var scoreVsOriginal = Fuzz.TokenSetRatio(normalizedOriginal, itemTitle); var scoreVsQuery = Fuzz.TokenSetRatio(normalizedQuery, itemTitle); - return (Item: item, Score: Math.Max(scoreVsOriginal, scoreVsQuery)); + var baseScore = Math.Max(scoreVsOriginal, scoreVsQuery); + + // Calculate priority adjustment for main company vs subsidiary + var priorityScore = CalculateCompanyPriorityScore(itemTitleLower, originalLower, queryLower); + + return (Item: item, Score: baseScore, PriorityScore: priorityScore); }) .Where(m => m.Score >= FuzzyMatchThreshold) .ToList(); @@ -667,8 +699,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService _logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery); foreach (var m in matches.Take(5)) { - _logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, DateOfCreation: {Date}", - m.Item.Title, m.Item.CompanyNumber, m.Score, m.Item.DateOfCreation ?? "null"); + _logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, Priority: {Priority}, DateOfCreation: {Date}", + m.Item.Title, m.Item.CompanyNumber, m.Score, m.PriorityScore, m.Item.DateOfCreation ?? "null"); } if (matches.Count == 0) return null; @@ -687,7 +719,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService m.Item.Title, incDate?.ToString() ?? 
"null", existed); return existed; }) - .OrderByDescending(m => m.Score) + // Sort by priority first, then by fuzzy score + .OrderByDescending(m => m.PriorityScore) + .ThenByDescending(m => m.Score) .ToList(); _logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count); @@ -695,8 +729,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService // If any matches existed at the start date, prefer those if (existedAtStartDate.Count > 0) { - _logger.LogDebug("Selected: {Title} ({Number})", existedAtStartDate[0].Item.Title, existedAtStartDate[0].Item.CompanyNumber); - return existedAtStartDate[0]; + var selected = existedAtStartDate[0]; + _logger.LogDebug("Selected: {Title} ({Number}), Priority: {Priority}", selected.Item.Title, selected.Item.CompanyNumber, selected.PriorityScore); + return (selected.Item, selected.Score); } // No companies existed at the claimed start date - don't match a wrong company @@ -704,10 +739,59 @@ public sealed class CompanyVerifierService : ICompanyVerifierService return null; } - // No start date provided - just use highest score - var fallback = matches.OrderByDescending(m => m.Score).First(); - _logger.LogDebug("No start date filter, using highest score: {Title} ({Number})", fallback.Item.Title, fallback.Item.CompanyNumber); - return fallback; + // No start date provided - sort by priority then score + var fallback = matches + .OrderByDescending(m => m.PriorityScore) + .ThenByDescending(m => m.Score) + .First(); + _logger.LogDebug("No start date filter, using highest priority: {Title} ({Number}), Priority: {Priority}", fallback.Item.Title, fallback.Item.CompanyNumber, fallback.PriorityScore); + return (fallback.Item, fallback.Score); + } + + /// + /// Calculates a priority score for company matching. + /// Higher scores = more likely to be the main employer company. + /// Penalizes subsidiaries (delivery, property, holdings, etc.) unless explicitly searched for. 
/// Boosts main trading companies (stores, retail, etc.) and gives a smaller boost to titles ending in " plc".
/// Indicators are matched as whole words/phrases (case-insensitively), so "land" does not
/// match "Iceland" or "Poundland" and "fleet" does not match "Fleetwood".
/// At most one subsidiary penalty and one main-company boost are applied per title.
/// The penalty applies when the title contains ANY subsidiary indicator that the search text
/// did not mention, so the result does not depend on the enumeration order of the indicator set.
/// <param name="itemTitleLower">Candidate company title (the visible caller passes it lower-cased). A null or blank title scores 0.</param>
/// <param name="originalLower">The original company name searched for (presumably lower-cased by the caller); may be null.</param>
/// <param name="queryLower">The search query actually issued (presumably lower-cased by the caller); may be null.</param>
/// <returns>A priority adjustment: negative for likely subsidiaries, positive for likely main employer companies.</returns>
private static int CalculateCompanyPriorityScore(string itemTitleLower, string originalLower, string queryLower)
{
    const int SubsidiaryPenalty = 10;
    const int MainCompanyBoost = 5;
    const int PlcBoost = 3;

    if (string.IsNullOrWhiteSpace(itemTitleLower))
    {
        return 0;
    }

    // Anything the user typed, in either form, counts as "explicitly searched for".
    // String concatenation treats null as empty, so null inputs are safe here.
    var searchText = originalLower + " " + queryLower;
    var score = 0;

    // Penalize a subsidiary-looking title unless every indicator it carries was asked for.
    foreach (var indicator in SubsidiaryIndicators)
    {
        if (ContainsWholeWord(itemTitleLower, indicator) && !ContainsWholeWord(searchText, indicator))
        {
            score -= SubsidiaryPenalty;
            break; // Only apply one subsidiary penalty
        }
    }

    // Boost titles that look like the main trading/employer company.
    foreach (var indicator in MainCompanyIndicators)
    {
        if (ContainsWholeWord(itemTitleLower, indicator))
        {
            score += MainCompanyBoost;
            break; // Only apply one boost
        }
    }

    // Slight boost for PLC (usually the parent/main company).
    // Explicit ordinal comparison: the default EndsWith(string) overload is culture-sensitive.
    if (itemTitleLower.EndsWith(" plc", StringComparison.OrdinalIgnoreCase))
    {
        score += PlcBoost;
    }

    return score;

    // True when `word` (which may be a multi-word phrase such as "real estate") occurs in `text`
    // with no letter or digit immediately before or after it.
    static bool ContainsWholeWord(string text, string word)
    {
        if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(word))
        {
            return false;
        }

        var index = text.IndexOf(word, StringComparison.OrdinalIgnoreCase);
        while (index >= 0)
        {
            var end = index + word.Length;
            var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
            var endsOnBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);
            if (startsOnBoundary && endsOnBoundary)
            {
                return true;
            }

            // Keep scanning: a later occurrence may sit on word boundaries.
            index = text.IndexOf(word, index + 1, StringComparison.OrdinalIgnoreCase);
        }

        return false;
    }
}

private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)