Fix incorrect company matching with stricter validation

- Require ALL core identifier words to be present in match
  (e.g., "Lloyds Bowmaker" needs both "LLOYDS" and "BOWMAKER")
- Filter out club/association/society type entities
  (prevents "BMW" -> "BMW CAR CLUB LIMITED")
- Filter out benefit/trust/pension type entities
  (prevents "BMW Group" -> "BMW GROUP BENEFIT TRUST LIMITED")
- Core word extraction now returns all significant words, not just first

Fixes false matches like:
- "BMW Group Canada" -> "CANADA LIFE GROUP" (missing BMW)
- "Bank of Scotland" -> "BANK AND CLIENTS PLC" (missing Scotland)
- "Lloyds Bowmaker" -> "LLOYDS ALARMS" (missing Bowmaker)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-20 22:13:23 +01:00
parent be2f738e58
commit 4bd7f1cef1

View File

@@ -594,11 +594,60 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var normalizedOriginal = companyName.ToUpperInvariant();
var normalizedQuery = searchQuery.ToUpperInvariant();
// Extract core identifying words that MUST appear in any valid match
// This prevents "BMW Group Canada" matching "CANADA LIFE GROUP" just because of common words
// and "Lloyds Bowmaker" matching "LLOYDS ALARMS" (missing "Bowmaker")
var coreWords = ExtractCoreIdentifiers(companyName);
var queryCoreWords = ExtractCoreIdentifiers(searchQuery);
// Check if the search is looking for a club/association type entity
var originalLower = companyName.ToLowerInvariant();
var queryLower = searchQuery.ToLowerInvariant();
var searchingForClub = originalLower.Contains("club") || originalLower.Contains("association") ||
originalLower.Contains("society") || originalLower.Contains("owners") ||
queryLower.Contains("club") || queryLower.Contains("association") ||
queryLower.Contains("society") || queryLower.Contains("owners");
// Match against both the original company name AND the search query used
// This handles cases like "Matthew Walker (Northern Foods Plc)" where we
// search for "Northern Foods Plc" but need to match against it, not the full name
var matches = items
.Where(item => !string.IsNullOrWhiteSpace(item.Title))
.Where(item =>
{
var itemTitle = item.Title.ToUpperInvariant();
var itemTitleLower = item.Title.ToLowerInvariant();
// Validate that ALL core identifiers appear in the match
// "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match
var hasAllOriginalCores = coreWords.Count == 0 || coreWords.All(w => itemTitle.Contains(w));
var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w));
if (!hasAllOriginalCores && !hasAllQueryCores) return false;
// Filter out club/association/society entities unless explicitly searching for one
// This prevents "BMW" matching "BMW CAR CLUB LIMITED"
if (!searchingForClub)
{
var isClubType = itemTitleLower.Contains("club") || itemTitleLower.Contains("association") ||
itemTitleLower.Contains("society") || itemTitleLower.Contains("owners") ||
itemTitleLower.Contains("enthusiast") || itemTitleLower.Contains("fan ");
if (isClubType) return false;
}
// Filter out benefit/trust/pension entities unless explicitly searching for one
// This prevents "BMW Group" matching "BMW GROUP BENEFIT TRUST LIMITED"
var searchingForTrust = originalLower.Contains("trust") || originalLower.Contains("benefit") ||
originalLower.Contains("pension") || queryLower.Contains("trust") ||
queryLower.Contains("benefit") || queryLower.Contains("pension");
if (!searchingForTrust)
{
var isTrustType = itemTitleLower.Contains("benefit trust") || itemTitleLower.Contains("pension") ||
itemTitleLower.Contains("employee trust") || itemTitleLower.Contains("retirement");
if (isTrustType) return false;
}
return true;
})
.Select(item =>
{
var itemTitle = item.Title.ToUpperInvariant();
@@ -966,5 +1015,47 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
return variations.ToList();
}
/// <summary>
/// Extracts ALL core identifying words from a company name.
/// These are significant words that aren't common prefixes/suffixes.
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"]
/// "Bank of Scotland" -> ["BANK", "SCOTLAND"]
/// </summary>
private static List<string> ExtractCoreIdentifiers(string companyName)
{
if (string.IsNullOrWhiteSpace(companyName)) return new List<string>();
// Common words to skip when finding core identifiers
var skipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
"the", "a", "an", "and", "of", "for", "in", "at", "on", "by",
"uk", "u.k.", "gb", "british", "england", "scotland", "wales", "ireland",
"national", "international", "global", "worldwide", "world",
"group", "holdings", "services", "solutions", "consulting", "management",
"limited", "ltd", "plc", "llp", "inc", "corporation", "corp",
"company", "co", "partners", "partnership", "enterprises",
"europe", "european", "america", "american", "canada", "canadian", "asia", "asian",
"north", "south", "east", "west", "central", "new"
};
// Remove parenthetical content first
var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();
// Split into words and collect all significant words
var words = cleanName.Split(new[] { ' ', '-', '/', '&' }, StringSplitOptions.RemoveEmptyEntries);
var coreWords = new List<string>();
foreach (var word in words)
{
var cleanWord = word.Trim('.', ',', '\'');
if (cleanWord.Length >= 2 && !skipWords.Contains(cleanWord))
{
coreWords.Add(cleanWord.ToUpperInvariant());
}
}
return coreWords;
}
#endregion
}