Improve company verification filtering and fix duplicate points display

- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
  (clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~55 to roughly 200 words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
  (now only shows points on first occurrence, subsequent rows show 0)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-20 22:33:16 +01:00
parent 4bd7f1cef1
commit 55c0aebdaa
2 changed files with 161 additions and 41 deletions

View File

@@ -21,6 +21,30 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
private const int FuzzyMatchThreshold = 85; private const int FuzzyMatchThreshold = 85;
private const int CacheExpirationDays = 30; private const int CacheExpirationDays = 30;
// Name fragments, grouped by category, that mark a Companies House entity as something
// other than a typical employer. Keys are category names; values are the lowercase
// fragments looked for in lowercased company titles and search text.
private static readonly Dictionary<string, string[]> NonEmploymentEntityPatterns = new Dictionary<string, string[]>
{
    { "Clubs", new[] { "club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados" } },
    { "Associations", new[] { "association", "society", "federation", "institute", "institution", "guild", "chamber of commerce" } },
    { "Trusts", new[] { "benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit", "superannuation", "provident" } },
    { "Charities", new[] { "charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic" } },
    { "Investment", new[] { "nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners" } },
    { "Property", new[] { "freehold", "leasehold", "property management", "residents association", "management company rtm", "commonhold" } },
    { "Religious", new[] { "church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry" } },
    { "Sports", new[] { "football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club" } },
    { "Educational", new[] { "old boys", "old girls", "alumni", "school association", "pta", "parent teacher" } },
    { "Professional", new[] { "chartered institute", "royal college", "professional body", "trade body", "regulatory body" } },
};
// SIC codes whose presence signals a non-trading or non-employing entity
private static readonly HashSet<string> NonTradingSicCodes = new HashSet<string>
{
    "99999", // Dormant company
    "64209", // Holding-company activities (shell companies)
    "68100", // Buying and selling of own real estate (frequently a shell)
};
// SIC codes for tech/software companies // SIC codes for tech/software companies
private static readonly HashSet<string> TechSicCodes = new() private static readonly HashSet<string> TechSicCodes = new()
{ {
@@ -600,13 +624,11 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var coreWords = ExtractCoreIdentifiers(companyName); var coreWords = ExtractCoreIdentifiers(companyName);
var queryCoreWords = ExtractCoreIdentifiers(searchQuery); var queryCoreWords = ExtractCoreIdentifiers(searchQuery);
// Check if the search is looking for a club/association type entity
var originalLower = companyName.ToLowerInvariant(); var originalLower = companyName.ToLowerInvariant();
var queryLower = searchQuery.ToLowerInvariant(); var queryLower = searchQuery.ToLowerInvariant();
var searchingForClub = originalLower.Contains("club") || originalLower.Contains("association") ||
originalLower.Contains("society") || originalLower.Contains("owners") || // Determine which entity types the search is explicitly looking for
queryLower.Contains("club") || queryLower.Contains("association") || var searchEntityTypes = GetSearchEntityTypes(originalLower, queryLower);
queryLower.Contains("society") || queryLower.Contains("owners");
// Match against both the original company name AND the search query used // Match against both the original company name AND the search query used
// This handles cases like "Matthew Walker (Northern Foods Plc)" where we // This handles cases like "Matthew Walker (Northern Foods Plc)" where we
@@ -624,26 +646,10 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w)); var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w));
if (!hasAllOriginalCores && !hasAllQueryCores) return false; if (!hasAllOriginalCores && !hasAllQueryCores) return false;
// Filter out club/association/society entities unless explicitly searching for one // Filter out non-employment entities unless explicitly searching for that type
// This prevents "BMW" matching "BMW CAR CLUB LIMITED" if (!IsValidEmploymentEntity(itemTitleLower, searchEntityTypes))
if (!searchingForClub)
{ {
var isClubType = itemTitleLower.Contains("club") || itemTitleLower.Contains("association") || return false;
itemTitleLower.Contains("society") || itemTitleLower.Contains("owners") ||
itemTitleLower.Contains("enthusiast") || itemTitleLower.Contains("fan ");
if (isClubType) return false;
}
// Filter out benefit/trust/pension entities unless explicitly searching for one
// This prevents "BMW Group" matching "BMW GROUP BENEFIT TRUST LIMITED"
var searchingForTrust = originalLower.Contains("trust") || originalLower.Contains("benefit") ||
originalLower.Contains("pension") || queryLower.Contains("trust") ||
queryLower.Contains("benefit") || queryLower.Contains("pension");
if (!searchingForTrust)
{
var isTrustType = itemTitleLower.Contains("benefit trust") || itemTitleLower.Contains("pension") ||
itemTitleLower.Contains("employee trust") || itemTitleLower.Contains("retirement");
if (isTrustType) return false;
} }
return true; return true;
@@ -1015,6 +1021,105 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
return variations.ToList(); return variations.ToList();
} }
/// <summary>
/// Determines which non-employment entity categories the search is explicitly looking for.
/// A category is allowed when any of its patterns occurs as a whole word or phrase in the
/// claimed company name or in the search query.
/// </summary>
/// <param name="originalLower">Lowercased company name as originally supplied.</param>
/// <param name="queryLower">Lowercased search query that was issued.</param>
/// <returns>
/// Category names (keys of <c>NonEmploymentEntityPatterns</c>, e.g. "Clubs", "Trusts")
/// that should NOT be filtered out of the results.
/// </returns>
private static HashSet<string> GetSearchEntityTypes(string originalLower, string queryLower)
{
    // A single haystack, so a pattern found in either input counts.
    var searchTerms = originalLower + " " + queryLower;
    var allowedCategories = new HashSet<string>();

    foreach (var (category, patterns) in NonEmploymentEntityPatterns)
    {
        if (patterns.Any(pattern => ContainsEntityPattern(searchTerms, pattern)))
        {
            allowedCategories.Add(category);
        }
    }

    return allowedCategories;
}

/// <summary>
/// Checks whether a company title represents a valid employment entity.
/// Titles matching a non-employment category (clubs, trusts, etc.) are rejected unless
/// that category is listed in <paramref name="allowedCategories"/>.
/// </summary>
/// <param name="itemTitleLower">Lowercased title of the candidate company.</param>
/// <param name="allowedCategories">Categories the search explicitly targets.</param>
/// <returns>
/// <c>false</c> when the title matches a pattern from a category that is not allowed;
/// otherwise <c>true</c>.
/// </returns>
private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
{
    foreach (var (category, patterns) in NonEmploymentEntityPatterns)
    {
        // The search asked for this kind of entity, so it must not be filtered.
        if (allowedCategories.Contains(category))
        {
            continue;
        }

        if (patterns.Any(pattern => ContainsEntityPattern(itemTitleLower, pattern)))
        {
            return false;
        }
    }

    return true;
}

/// <summary>
/// Reports whether <paramref name="pattern"/> occurs in <paramref name="text"/> as a whole
/// word or phrase, optionally followed by a plain plural ending ("s" or "es").
/// </summary>
/// <remarks>
/// A raw substring test flags unrelated names: "churchill" contains "church", "guildford"
/// contains "guild", "suspension" contains "pension" and "acceptance" contains "pta".
/// Requiring a non-alphanumeric character (or the string edge) on both sides avoids those
/// hits while still matching "clubs", "trustees" or "churches".
/// </remarks>
private static bool ContainsEntityPattern(string text, string pattern)
{
    if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(pattern))
    {
        return false;
    }

    bool IsWordEdge(int index) =>
        index < 0 || index >= text.Length || !char.IsLetterOrDigit(text[index]);

    bool HasEndingAt(int index, string ending) =>
        string.CompareOrdinal(text, index, ending, 0, ending.Length) == 0
        && IsWordEdge(index + ending.Length);

    var searchFrom = 0;
    while (true)
    {
        var hit = text.IndexOf(pattern, searchFrom, StringComparison.Ordinal);
        if (hit < 0)
        {
            return false;
        }

        var end = hit + pattern.Length;
        var startsWord = IsWordEdge(hit - 1);
        var endsWord = IsWordEdge(end) || HasEndingAt(end, "s") || HasEndingAt(end, "es");
        if (startsWord && endsWord)
        {
            return true;
        }

        // This occurrence was embedded in a longer word; keep scanning for another one.
        searchFrom = hit + 1;
    }
}
// Words ignored when extracting core identifiers from a company name.
// They are too common to be meaningful differentiators between companies.
// Lookups are case-insensitive. Each entry appears once, under the heading that fits it best.
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
{
    // Articles and conjunctions
    "the", "a", "an", "and", "or", "of", "for", "in", "at", "on", "by", "to", "with",

    // Geographic - Countries and regions
    "uk", "u.k.", "gb", "british", "britain", "england", "english", "scotland", "scottish",
    "wales", "welsh", "ireland", "irish", "northern",
    "europe", "european", "america", "american", "usa", "us", "u.s.", "u.s.a.",
    "canada", "canadian", "asia", "asian", "pacific", "atlantic",
    "australia", "australian", "africa", "african", "india", "indian",
    "france", "french", "germany", "german", "spain", "spanish", "italy", "italian",
    "japan", "japanese", "china", "chinese", "korea", "korean",
    "middle", "east", "west", "north", "south", "central", "western", "eastern",

    // Dotted abbreviations as they look once leading/trailing dots have been trimmed.
    // The identifier extraction trims '.', ',' and '\'' from each word before the lookup,
    // so "U.K." arrives here as "U.K" and would miss the "u.k." entry above.
    "u.k", "u.s", "u.s.a",

    // Geographic - Cities
    "london", "manchester", "birmingham", "leeds", "glasgow", "edinburgh", "bristol",
    "liverpool", "sheffield", "newcastle", "cardiff", "belfast", "nottingham",
    "southampton", "portsmouth", "brighton", "leicester", "coventry", "hull",

    // Legal suffixes
    "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated", "corporation", "corp",
    "company", "co", "partners", "partnership", "enterprises", "unlimited",
    "registered", "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",

    // Business descriptors
    "group", "holdings", "holding", "parent", "subsidiary", "division", "branch",
    "services", "service", "solutions", "solution", "consulting", "consultants", "consultancy",
    "management", "systems", "system", "technologies", "technology", "tech",
    "industries", "industry", "industrial", "commercial", "trading", "trade",
    "business", "businesses", "operations", "operational", "professional", "professionals",
    "resources", "resource", "network", "networks", "associates", "associated",

    // Size/Scope descriptors
    "national", "international", "global", "worldwide", "world", "regional", "local",
    "universal", "general", "standard", "prime",

    // Quality/Marketing terms
    "new", "modern", "advanced", "innovative", "premier", "elite", "premium",
    "quality", "superior", "excellent", "best", "top", "leading", "major",

    // Ownership indicators
    "royal", "imperial", "crown", "state", "public", "private", "independent",
    "mutual", "cooperative", "coop", "community",

    // Time-related
    "century", "millennium", "annual", "year", "years",

    // Numbers as words
    "one", "two", "three", "four", "five", "first", "second", "third"
};
/// <summary> /// <summary>
/// Extracts ALL core identifying words from a company name. /// Extracts ALL core identifying words from a company name.
/// These are significant words that aren't common prefixes/suffixes. /// These are significant words that aren't common prefixes/suffixes.
@@ -1025,19 +1130,6 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
{ {
if (string.IsNullOrWhiteSpace(companyName)) return new List<string>(); if (string.IsNullOrWhiteSpace(companyName)) return new List<string>();
// Common words to skip when finding core identifiers
var skipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
"the", "a", "an", "and", "of", "for", "in", "at", "on", "by",
"uk", "u.k.", "gb", "british", "england", "scotland", "wales", "ireland",
"national", "international", "global", "worldwide", "world",
"group", "holdings", "services", "solutions", "consulting", "management",
"limited", "ltd", "plc", "llp", "inc", "corporation", "corp",
"company", "co", "partners", "partnership", "enterprises",
"europe", "european", "america", "american", "canada", "canadian", "asia", "asian",
"north", "south", "east", "west", "central", "new"
};
// Remove parenthetical content first // Remove parenthetical content first
var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim(); var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();
@@ -1048,7 +1140,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
foreach (var word in words) foreach (var word in words)
{ {
var cleanWord = word.Trim('.', ',', '\''); var cleanWord = word.Trim('.', ',', '\'');
if (cleanWord.Length >= 2 && !skipWords.Contains(cleanWord)) if (cleanWord.Length >= 2 && !SkipWords.Contains(cleanWord))
{ {
coreWords.Add(cleanWord.ToUpperInvariant()); coreWords.Add(cleanWord.ToUpperInvariant());
} }

View File

@@ -169,8 +169,10 @@
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
@foreach (var verification in _report.EmploymentVerifications) @for (int i = 0; i < _report.EmploymentVerifications.Count; i++)
{ {
var verification = _report.EmploymentVerifications[i];
var index = i;
<tr> <tr>
<td class="fw-medium">@verification.ClaimedCompany</td> <td class="fw-medium">@verification.ClaimedCompany</td>
<td> <td>
@@ -223,7 +225,7 @@
</td> </td>
<td class="text-center"> <td class="text-center">
@{ @{
var companyPoints = GetPointsForCompany(verification.ClaimedCompany, verification.MatchedCompanyName); var companyPoints = GetPointsForCompany(verification.ClaimedCompany, verification.MatchedCompanyName, index);
} }
@if (companyPoints < 0) @if (companyPoints < 0)
{ {
@@ -574,6 +576,7 @@
} }
else else
{ {
ComputeFirstOccurrences(); // Pre-compute which companies are first occurrences
await AuditService.LogAsync(_userId, AuditActions.ReportViewed, "CVCheck", Id, $"Score: {_report.OverallScore}"); await AuditService.LogAsync(_userId, AuditActions.ReportViewed, "CVCheck", Id, $"Score: {_report.OverallScore}");
} }
} }
@@ -703,10 +706,35 @@
}; };
} }
private int GetPointsForCompany(string claimedCompany, string? matchedCompany) // Lookup for first occurrence of each company (pre-computed when report loads)
private HashSet<int> _firstOccurrenceIndices = new();

// Rebuilds _firstOccurrenceIndices: the row positions in _report.EmploymentVerifications
// at which each claimed company name (compared case-insensitively) appears for the first time.
// Leaves the set empty when there is no report or no verification list.
private void ComputeFirstOccurrences()
{
    _firstOccurrenceIndices.Clear();

    var verifications = _report?.EmploymentVerifications;
    if (verifications is null)
    {
        return;
    }

    var alreadyListed = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var position = 0;
    foreach (var entry in verifications)
    {
        // Add returns true only the first time a given name is encountered.
        if (alreadyListed.Add(entry.ClaimedCompany))
        {
            _firstOccurrenceIndices.Add(position);
        }

        position++;
    }
}
private int GetPointsForCompany(string claimedCompany, string? matchedCompany, int index)
{ {
if (_report?.Flags is null) return 0; if (_report?.Flags is null) return 0;
// Only show points for the first occurrence of each company
if (!_firstOccurrenceIndices.Contains(index))
{
return 0;
}
// Sum up all flags that mention this company in their description // Sum up all flags that mention this company in their description
var companyFlags = _report.Flags var companyFlags = _report.Flags
.Where(f => f.ScoreImpact < 0 && .Where(f => f.ScoreImpact < 0 &&