diff --git a/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs b/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs index 3edad7e..f254b7d 100644 --- a/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs +++ b/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs @@ -594,11 +594,60 @@ public sealed class CompanyVerifierService : ICompanyVerifierService var normalizedOriginal = companyName.ToUpperInvariant(); var normalizedQuery = searchQuery.ToUpperInvariant(); + // Extract core identifying words that MUST appear in any valid match + // This prevents "BMW Group Canada" matching "CANADA LIFE GROUP" just because of common words + // and "Lloyds Bowmaker" matching "LLOYDS ALARMS" (missing "Bowmaker") + var coreWords = ExtractCoreIdentifiers(companyName); + var queryCoreWords = ExtractCoreIdentifiers(searchQuery); + + // Check if the search is looking for a club/association type entity + var originalLower = companyName.ToLowerInvariant(); + var queryLower = searchQuery.ToLowerInvariant(); + var searchingForClub = originalLower.Contains("club") || originalLower.Contains("association") || + originalLower.Contains("society") || originalLower.Contains("owners") || + queryLower.Contains("club") || queryLower.Contains("association") || + queryLower.Contains("society") || queryLower.Contains("owners"); + // Match against both the original company name AND the search query used // This handles cases like "Matthew Walker (Northern Foods Plc)" where we // search for "Northern Foods Plc" but need to match against it, not the full name var matches = items .Where(item => !string.IsNullOrWhiteSpace(item.Title)) + .Where(item => + { + var itemTitle = item.Title.ToUpperInvariant(); + var itemTitleLower = item.Title.ToLowerInvariant(); + + // Validate that ALL core identifiers appear in the match + // "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match + var hasAllOriginalCores = coreWords.Count == 0 || coreWords.All(w => 
itemTitle.Contains(w)); + var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w)); + if (!hasAllOriginalCores && !hasAllQueryCores) return false; + + // Filter out club/association/society entities unless explicitly searching for one + // This prevents "BMW" matching "BMW CAR CLUB LIMITED" + if (!searchingForClub) + { + var isClubType = itemTitleLower.Contains("club") || itemTitleLower.Contains("association") || + itemTitleLower.Contains("society") || itemTitleLower.Contains("owners") || + itemTitleLower.Contains("enthusiast") || itemTitleLower.Contains("fan "); + if (isClubType) return false; + } + + // Filter out benefit/trust/pension entities unless explicitly searching for one + // This prevents "BMW Group" matching "BMW GROUP BENEFIT TRUST LIMITED" + var searchingForTrust = originalLower.Contains("trust") || originalLower.Contains("benefit") || + originalLower.Contains("pension") || queryLower.Contains("trust") || + queryLower.Contains("benefit") || queryLower.Contains("pension"); + if (!searchingForTrust) + { + var isTrustType = itemTitleLower.Contains("benefit trust") || itemTitleLower.Contains("pension") || + itemTitleLower.Contains("employee trust") || itemTitleLower.Contains("retirement"); + if (isTrustType) return false; + } + + return true; + }) .Select(item => { var itemTitle = item.Title.ToUpperInvariant(); @@ -966,5 +1015,47 @@ public sealed class CompanyVerifierService : ICompanyVerifierService return variations.ToList(); } + /// + /// Extracts ALL core identifying words from a company name. + /// These are significant words that aren't common prefixes/suffixes. 
// Words that never identify a company on their own: articles and prepositions,
// geography, generic descriptors and legal-form suffixes. Lookups are
// case-insensitive. Built once rather than on every call.
private static readonly HashSet<string> CoreIdentifierSkipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "the", "a", "an", "and", "of", "for", "in", "at", "on", "by",
    // "u.k" is what "U.K." looks like once the trailing '.' has been trimmed
    // from the token; without it "U.K." would slip through as a core identifier.
    "uk", "u.k.", "u.k", "gb", "british", "england", "scotland", "wales", "ireland",
    "national", "international", "global", "worldwide", "world",
    "group", "holdings", "services", "solutions", "consulting", "management",
    "limited", "ltd", "plc", "llp", "inc", "corporation", "corp",
    "company", "co", "partners", "partnership", "enterprises",
    "europe", "european", "america", "american", "canada", "canadian", "asia", "asian",
    "north", "south", "east", "west", "central", "new"
};

/// <summary>
/// Extracts ALL core identifying words from a company name, upper-cased.
/// Parenthesised content is removed first, the remainder is split on spaces,
/// '-', '/' and '&amp;', and each token is kept only if it is at least two
/// characters long (after trimming '.', ',' and apostrophes from its ends)
/// and is not one of the common skip words.
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"],
/// "Bank of Scotland" -> ["BANK"] ("of" and "Scotland" are skip words),
/// "Acme U.K. Ltd" -> ["ACME"].
/// </summary>
/// <param name="companyName">Company name to analyse; may be null, empty or whitespace.</param>
/// <returns>
/// The significant words in their original order (duplicates preserved);
/// an empty list when the input is blank or contains only skip words.
/// </returns>
private static List<string> ExtractCoreIdentifiers(string companyName)
{
    var coreWords = new List<string>();
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return coreWords;
    }

    // Remove parenthetical content first, e.g. "Matthew Walker (Northern Foods Plc)".
    var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();

    // Split into words and collect every significant one.
    var words = cleanName.Split(new[] { ' ', '-', '/', '&' }, StringSplitOptions.RemoveEmptyEntries);
    foreach (var word in words)
    {
        // Strip punctuation stuck to either end of the token ("Ltd.", "Foods,", "'Acme'").
        var cleanWord = word.Trim('.', ',', '\'');
        if (cleanWord.Length < 2 || CoreIdentifierSkipWords.Contains(cleanWord))
        {
            continue;
        }

        coreWords.Add(cleanWord.ToUpperInvariant());
    }

    return coreWords;
}

#endregion
}