Fix incorrect company matching with stricter validation
- Require ALL core identifier words to be present in match (e.g., "Lloyds Bowmaker" needs both "LLOYDS" and "BOWMAKER") - Filter out club/association/society type entities (prevents "BMW" -> "BMW CAR CLUB LIMITED") - Filter out benefit/trust/pension type entities (prevents "BMW Group" -> "BMW GROUP BENEFIT TRUST LIMITED") - Core word extraction now returns all significant words, not just first Fixes false matches like: - "BMW Group Canada" -> "CANADA LIFE GROUP" (missing BMW) - "Bank of Scotland" -> "BANK AND CLIENTS PLC" (missing Scotland) - "Lloyds Bowmaker" -> "LLOYDS ALARMS" (missing Bowmaker) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -594,11 +594,60 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
var normalizedOriginal = companyName.ToUpperInvariant();
|
var normalizedOriginal = companyName.ToUpperInvariant();
|
||||||
var normalizedQuery = searchQuery.ToUpperInvariant();
|
var normalizedQuery = searchQuery.ToUpperInvariant();
|
||||||
|
|
||||||
|
// Extract core identifying words that MUST appear in any valid match
|
||||||
|
// This prevents "BMW Group Canada" matching "CANADA LIFE GROUP" just because of common words
|
||||||
|
// and "Lloyds Bowmaker" matching "LLOYDS ALARMS" (missing "Bowmaker")
|
||||||
|
var coreWords = ExtractCoreIdentifiers(companyName);
|
||||||
|
var queryCoreWords = ExtractCoreIdentifiers(searchQuery);
|
||||||
|
|
||||||
|
// Check if the search is looking for a club/association type entity
|
||||||
|
var originalLower = companyName.ToLowerInvariant();
|
||||||
|
var queryLower = searchQuery.ToLowerInvariant();
|
||||||
|
var searchingForClub = originalLower.Contains("club") || originalLower.Contains("association") ||
|
||||||
|
originalLower.Contains("society") || originalLower.Contains("owners") ||
|
||||||
|
queryLower.Contains("club") || queryLower.Contains("association") ||
|
||||||
|
queryLower.Contains("society") || queryLower.Contains("owners");
|
||||||
|
|
||||||
// Match against both the original company name AND the search query used
|
// Match against both the original company name AND the search query used
|
||||||
// This handles cases like "Matthew Walker (Northern Foods Plc)" where we
|
// This handles cases like "Matthew Walker (Northern Foods Plc)" where we
|
||||||
// search for "Northern Foods Plc" but need to match against it, not the full name
|
// search for "Northern Foods Plc" but need to match against it, not the full name
|
||||||
var matches = items
|
var matches = items
|
||||||
.Where(item => !string.IsNullOrWhiteSpace(item.Title))
|
.Where(item => !string.IsNullOrWhiteSpace(item.Title))
|
||||||
|
.Where(item =>
|
||||||
|
{
|
||||||
|
var itemTitle = item.Title.ToUpperInvariant();
|
||||||
|
var itemTitleLower = item.Title.ToLowerInvariant();
|
||||||
|
|
||||||
|
// Validate that ALL core identifiers appear in the match
|
||||||
|
// "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match
|
||||||
|
var hasAllOriginalCores = coreWords.Count == 0 || coreWords.All(w => itemTitle.Contains(w));
|
||||||
|
var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w));
|
||||||
|
if (!hasAllOriginalCores && !hasAllQueryCores) return false;
|
||||||
|
|
||||||
|
// Filter out club/association/society entities unless explicitly searching for one
|
||||||
|
// This prevents "BMW" matching "BMW CAR CLUB LIMITED"
|
||||||
|
if (!searchingForClub)
|
||||||
|
{
|
||||||
|
var isClubType = itemTitleLower.Contains("club") || itemTitleLower.Contains("association") ||
|
||||||
|
itemTitleLower.Contains("society") || itemTitleLower.Contains("owners") ||
|
||||||
|
itemTitleLower.Contains("enthusiast") || itemTitleLower.Contains("fan ");
|
||||||
|
if (isClubType) return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Filter out benefit/trust/pension entities unless explicitly searching for one
|
||||||
|
// This prevents "BMW Group" matching "BMW GROUP BENEFIT TRUST LIMITED"
|
||||||
|
var searchingForTrust = originalLower.Contains("trust") || originalLower.Contains("benefit") ||
|
||||||
|
originalLower.Contains("pension") || queryLower.Contains("trust") ||
|
||||||
|
queryLower.Contains("benefit") || queryLower.Contains("pension");
|
||||||
|
if (!searchingForTrust)
|
||||||
|
{
|
||||||
|
var isTrustType = itemTitleLower.Contains("benefit trust") || itemTitleLower.Contains("pension") ||
|
||||||
|
itemTitleLower.Contains("employee trust") || itemTitleLower.Contains("retirement");
|
||||||
|
if (isTrustType) return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
})
|
||||||
.Select(item =>
|
.Select(item =>
|
||||||
{
|
{
|
||||||
var itemTitle = item.Title.ToUpperInvariant();
|
var itemTitle = item.Title.ToUpperInvariant();
|
||||||
@@ -966,5 +1015,47 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
|||||||
return variations.ToList();
|
return variations.ToList();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
|
||||||
|
/// Extracts ALL core identifying words from a company name.
|
||||||
|
/// These are significant words that aren't common prefixes/suffixes.
|
||||||
|
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"]
|
||||||
|
/// "Bank of Scotland" -> ["BANK", "SCOTLAND"]
|
||||||
|
/// </summary>
|
||||||
|
private static List<string> ExtractCoreIdentifiers(string companyName)
|
||||||
|
{
|
||||||
|
if (string.IsNullOrWhiteSpace(companyName)) return new List<string>();
|
||||||
|
|
||||||
|
// Common words to skip when finding core identifiers
|
||||||
|
var skipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
|
||||||
|
{
|
||||||
|
"the", "a", "an", "and", "of", "for", "in", "at", "on", "by",
|
||||||
|
"uk", "u.k.", "gb", "british", "england", "scotland", "wales", "ireland",
|
||||||
|
"national", "international", "global", "worldwide", "world",
|
||||||
|
"group", "holdings", "services", "solutions", "consulting", "management",
|
||||||
|
"limited", "ltd", "plc", "llp", "inc", "corporation", "corp",
|
||||||
|
"company", "co", "partners", "partnership", "enterprises",
|
||||||
|
"europe", "european", "america", "american", "canada", "canadian", "asia", "asian",
|
||||||
|
"north", "south", "east", "west", "central", "new"
|
||||||
|
};
|
||||||
|
|
||||||
|
// Remove parenthetical content first
|
||||||
|
var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();
|
||||||
|
|
||||||
|
// Split into words and collect all significant words
|
||||||
|
var words = cleanName.Split(new[] { ' ', '-', '/', '&' }, StringSplitOptions.RemoveEmptyEntries);
|
||||||
|
var coreWords = new List<string>();
|
||||||
|
|
||||||
|
foreach (var word in words)
|
||||||
|
{
|
||||||
|
var cleanWord = word.Trim('.', ',', '\'');
|
||||||
|
if (cleanWord.Length >= 2 && !skipWords.Contains(cleanWord))
|
||||||
|
{
|
||||||
|
coreWords.Add(cleanWord.ToUpperInvariant());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return coreWords;
|
||||||
|
}
|
||||||
|
|
||||||
#endregion
|
#endregion
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user