Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities (clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for the same company appearing multiple times (points are now shown only on the first occurrence; subsequent rows show 0)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,30 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
private const int FuzzyMatchThreshold = 85;
|
||||
private const int CacheExpirationDays = 30;
|
||||
|
||||
// Lookup table of lowercase terms, grouped under a category name, that mark a
// Companies House entry as something other than a typical employer
// (clubs, trusts, charities, and so on).
private static readonly Dictionary<string, string[]> NonEmploymentEntityPatterns =
    new Dictionary<string, string[]>
    {
        {
            "Clubs",
            new[] { "club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados" }
        },
        {
            "Associations",
            new[] { "association", "society", "federation", "institute", "institution", "guild", "chamber of commerce" }
        },
        {
            "Trusts",
            new[] { "benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit", "superannuation", "provident" }
        },
        {
            "Charities",
            new[] { "charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic" }
        },
        {
            "Investment",
            new[] { "nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners" }
        },
        {
            "Property",
            new[] { "freehold", "leasehold", "property management", "residents association", "management company rtm", "commonhold" }
        },
        {
            "Religious",
            new[] { "church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry" }
        },
        {
            "Sports",
            new[] { "football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club" }
        },
        {
            "Educational",
            new[] { "old boys", "old girls", "alumni", "school association", "pta", "parent teacher" }
        },
        {
            "Professional",
            new[] { "chartered institute", "royal college", "professional body", "trade body", "regulatory body" }
        },
    };
|
||||
|
||||
// SIC codes treated as markers of a non-trading or non-employment entity.
private static readonly HashSet<string> NonTradingSicCodes = new HashSet<string>
{
    // Dormant company
    "99999",

    // Activities of holding companies (shell companies)
    "64209",

    // Buying and selling of own real estate (often shell)
    "68100",
};
|
||||
|
||||
// SIC codes for tech/software companies
|
||||
private static readonly HashSet<string> TechSicCodes = new()
|
||||
{
|
||||
@@ -600,13 +624,11 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
var coreWords = ExtractCoreIdentifiers(companyName);
|
||||
var queryCoreWords = ExtractCoreIdentifiers(searchQuery);
|
||||
|
||||
// Check if the search is looking for a club/association type entity
|
||||
var originalLower = companyName.ToLowerInvariant();
|
||||
var queryLower = searchQuery.ToLowerInvariant();
|
||||
var searchingForClub = originalLower.Contains("club") || originalLower.Contains("association") ||
|
||||
originalLower.Contains("society") || originalLower.Contains("owners") ||
|
||||
queryLower.Contains("club") || queryLower.Contains("association") ||
|
||||
queryLower.Contains("society") || queryLower.Contains("owners");
|
||||
|
||||
// Determine which entity types the search is explicitly looking for
|
||||
var searchEntityTypes = GetSearchEntityTypes(originalLower, queryLower);
|
||||
|
||||
// Match against both the original company name AND the search query used
|
||||
// This handles cases like "Matthew Walker (Northern Foods Plc)" where we
|
||||
@@ -624,26 +646,10 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w));
|
||||
if (!hasAllOriginalCores && !hasAllQueryCores) return false;
|
||||
|
||||
// Filter out club/association/society entities unless explicitly searching for one
|
||||
// This prevents "BMW" matching "BMW CAR CLUB LIMITED"
|
||||
if (!searchingForClub)
|
||||
// Filter out non-employment entities unless explicitly searching for that type
|
||||
if (!IsValidEmploymentEntity(itemTitleLower, searchEntityTypes))
|
||||
{
|
||||
var isClubType = itemTitleLower.Contains("club") || itemTitleLower.Contains("association") ||
|
||||
itemTitleLower.Contains("society") || itemTitleLower.Contains("owners") ||
|
||||
itemTitleLower.Contains("enthusiast") || itemTitleLower.Contains("fan ");
|
||||
if (isClubType) return false;
|
||||
}
|
||||
|
||||
// Filter out benefit/trust/pension entities unless explicitly searching for one
|
||||
// This prevents "BMW Group" matching "BMW GROUP BENEFIT TRUST LIMITED"
|
||||
var searchingForTrust = originalLower.Contains("trust") || originalLower.Contains("benefit") ||
|
||||
originalLower.Contains("pension") || queryLower.Contains("trust") ||
|
||||
queryLower.Contains("benefit") || queryLower.Contains("pension");
|
||||
if (!searchingForTrust)
|
||||
{
|
||||
var isTrustType = itemTitleLower.Contains("benefit trust") || itemTitleLower.Contains("pension") ||
|
||||
itemTitleLower.Contains("employee trust") || itemTitleLower.Contains("retirement");
|
||||
if (isTrustType) return false;
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
@@ -1015,6 +1021,105 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
return variations.ToList();
|
||||
}
|
||||
|
||||
/// <summary>
/// Works out which non-employment categories the caller is deliberately searching for.
/// A category is included when any of its patterns occurs as a substring of the
/// claimed company name or of the search query.
/// </summary>
/// <param name="originalLower">Lower-cased claimed company name.</param>
/// <param name="queryLower">Lower-cased search query that was issued.</param>
/// <returns>Category names (e.g., "Clubs", "Trusts") that must not be filtered out.</returns>
private static HashSet<string> GetSearchEntityTypes(string originalLower, string queryLower)
{
    // Both inputs are inspected together; the space keeps a pattern from
    // matching across the join between the two strings.
    var combinedText = $"{originalLower} {queryLower}";

    var requestedCategories = NonEmploymentEntityPatterns
        .Where(entry => entry.Value.Any(term => combinedText.Contains(term)))
        .Select(entry => entry.Key);

    return new HashSet<string>(requestedCategories);
}
|
||||
|
||||
/// <summary>
/// Checks if a company title represents a valid employment entity.
/// Filters out non-employment entities (clubs, trusts, etc.) unless the search explicitly targets that type.
/// </summary>
/// <remarks>
/// Patterns are matched as whole words or phrases (a trailing plural "s"/"es" is tolerated)
/// rather than as raw substrings. Raw substring matching rejected genuine employers whose
/// names merely embed a pattern, e.g. "church" in "churchill", "guild" in "guildford",
/// "pension" in "suspension" or "pta" in "acceptance".
/// </remarks>
/// <param name="itemTitleLower">Lower-cased title of the candidate company.</param>
/// <param name="allowedCategories">Categories the search explicitly asked for; these are never filtered.</param>
/// <returns><c>false</c> when the title matches a pattern from a category that was not explicitly searched for; otherwise <c>true</c>.</returns>
private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
{
    foreach (var (category, patterns) in NonEmploymentEntityPatterns)
    {
        // Skip this category if the search explicitly allows it
        if (allowedCategories.Contains(category))
        {
            continue;
        }

        // Reject as soon as the title contains any pattern of a category that was not requested
        if (patterns.Any(pattern => ContainsPatternAsWholeWords(itemTitleLower, pattern)))
        {
            return false;
        }
    }

    // No non-employment pattern matched
    return true;
}

/// <summary>
/// Returns true when <paramref name="pattern"/> occurs in <paramref name="text"/> starting at a
/// word boundary and ending at a word boundary or a simple plural suffix.
/// An empty pattern never matches.
/// </summary>
private static bool ContainsPatternAsWholeWords(string text, string pattern)
{
    if (string.IsNullOrEmpty(pattern))
    {
        return false;
    }

    var searchFrom = 0;
    while (searchFrom <= text.Length - pattern.Length)
    {
        var start = text.IndexOf(pattern, searchFrom, StringComparison.Ordinal);
        if (start < 0)
        {
            return false;
        }

        var startsOnBoundary = start == 0 || !char.IsLetterOrDigit(text[start - 1]);
        if (startsOnBoundary && EndsWordOrPlural(text, start + pattern.Length))
        {
            return true;
        }

        // This occurrence was embedded in a longer word; look for a later one
        searchFrom = start + 1;
    }

    return false;
}

/// <summary>
/// Returns true when position <paramref name="index"/> in <paramref name="text"/> is a word
/// boundary, or is followed only by a plural suffix ("s" or "es") and then a word boundary.
/// </summary>
private static bool EndsWordOrPlural(string text, int index)
{
    if (IsWordBoundary(text, index))
    {
        return true;
    }

    // "enthusiast" should still catch "enthusiasts"
    if (text[index] == 's' && IsWordBoundary(text, index + 1))
    {
        return true;
    }

    // "church" should still catch "churches"
    return text[index] == 'e'
        && index + 1 < text.Length
        && text[index + 1] == 's'
        && IsWordBoundary(text, index + 2);
}

/// <summary>
/// Returns true when <paramref name="index"/> is at or past the end of <paramref name="text"/>,
/// or the character there is neither a letter nor a digit.
/// </summary>
private static bool IsWordBoundary(string text, int index)
{
    return index >= text.Length || !char.IsLetterOrDigit(text[index]);
}
|
||||
|
||||
// Expanded skip words list for core identifier extraction.
// These words are too common to be meaningful differentiators between companies.
// Lookups are case-insensitive (OrdinalIgnoreCase).
//
// The visible lookup site trims '.', ',' and '\'' from both ends of a word before checking
// this set, so a dotted abbreviation such as "u.k." arrives as "u.k". The trimmed forms are
// therefore listed next to the dotted ones; the dotted forms are kept in case another
// caller checks untrimmed words.
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
{
    // Articles and conjunctions
    "the", "a", "an", "and", "or", "of", "for", "in", "at", "on", "by", "to", "with",

    // Geographic - Countries and regions
    "uk", "u.k.", "u.k", "gb", "british", "britain", "england", "english", "scotland", "scottish",
    "wales", "welsh", "ireland", "irish", "northern",
    "europe", "european", "america", "american", "usa", "us", "u.s.", "u.s", "u.s.a.", "u.s.a",
    "canada", "canadian", "asia", "asian", "pacific", "atlantic",
    "australia", "australian", "africa", "african", "india", "indian",
    "france", "french", "germany", "german", "spain", "spanish", "italy", "italian",
    "japan", "japanese", "china", "chinese", "korea", "korean",
    "middle", "east", "west", "north", "south", "central", "western", "eastern",

    // Geographic - Cities
    "london", "manchester", "birmingham", "leeds", "glasgow", "edinburgh", "bristol",
    "liverpool", "sheffield", "newcastle", "cardiff", "belfast", "nottingham",
    "southampton", "portsmouth", "brighton", "leicester", "coventry", "hull",

    // Legal suffixes
    "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated", "corporation", "corp",
    "company", "co", "partners", "partnership", "enterprises", "unlimited",
    "registered", "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",

    // Business descriptors
    "group", "holdings", "holding", "parent", "subsidiary", "division", "branch",
    "services", "service", "solutions", "solution", "consulting", "consultants", "consultancy",
    "management", "systems", "system", "technologies", "technology", "tech",
    "industries", "industry", "industrial", "commercial", "trading", "trade",
    "business", "businesses", "operations", "operational", "professional", "professionals",
    "resources", "resource", "network", "networks", "associates", "associated",

    // Size/Scope descriptors ("first" and "one" are listed under numbers below)
    "national", "international", "global", "worldwide", "world", "regional", "local",
    "universal", "general", "standard", "premier", "prime",

    // Quality/Marketing terms ("premier" is listed under size/scope above)
    "new", "modern", "advanced", "innovative", "elite", "premium",
    "quality", "superior", "excellent", "best", "top", "leading", "major",

    // Ownership indicators
    "royal", "imperial", "crown", "state", "public", "private", "independent",
    "mutual", "cooperative", "coop", "community",

    // Time-related
    "century", "millennium", "annual", "year", "years",

    // Numbers as words
    "one", "two", "three", "four", "five", "first", "second", "third"
};
|
||||
|
||||
/// <summary>
|
||||
/// Extracts ALL core identifying words from a company name.
|
||||
/// These are significant words that aren't common prefixes/suffixes.
|
||||
@@ -1025,19 +1130,6 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(companyName)) return new List<string>();
|
||||
|
||||
// Common words to skip when finding core identifiers
|
||||
var skipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
|
||||
{
|
||||
"the", "a", "an", "and", "of", "for", "in", "at", "on", "by",
|
||||
"uk", "u.k.", "gb", "british", "england", "scotland", "wales", "ireland",
|
||||
"national", "international", "global", "worldwide", "world",
|
||||
"group", "holdings", "services", "solutions", "consulting", "management",
|
||||
"limited", "ltd", "plc", "llp", "inc", "corporation", "corp",
|
||||
"company", "co", "partners", "partnership", "enterprises",
|
||||
"europe", "european", "america", "american", "canada", "canadian", "asia", "asian",
|
||||
"north", "south", "east", "west", "central", "new"
|
||||
};
|
||||
|
||||
// Remove parenthetical content first
|
||||
var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();
|
||||
|
||||
@@ -1048,7 +1140,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
foreach (var word in words)
|
||||
{
|
||||
var cleanWord = word.Trim('.', ',', '\'');
|
||||
if (cleanWord.Length >= 2 && !skipWords.Contains(cleanWord))
|
||||
if (cleanWord.Length >= 2 && !SkipWords.Contains(cleanWord))
|
||||
{
|
||||
coreWords.Add(cleanWord.ToUpperInvariant());
|
||||
}
|
||||
|
||||
@@ -169,8 +169,10 @@
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
@foreach (var verification in _report.EmploymentVerifications)
|
||||
@for (int i = 0; i < _report.EmploymentVerifications.Count; i++)
|
||||
{
|
||||
var verification = _report.EmploymentVerifications[i];
|
||||
var index = i;
|
||||
<tr>
|
||||
<td class="fw-medium">@verification.ClaimedCompany</td>
|
||||
<td>
|
||||
@@ -223,7 +225,7 @@
|
||||
</td>
|
||||
<td class="text-center">
|
||||
@{
|
||||
var companyPoints = GetPointsForCompany(verification.ClaimedCompany, verification.MatchedCompanyName);
|
||||
var companyPoints = GetPointsForCompany(verification.ClaimedCompany, verification.MatchedCompanyName, index);
|
||||
}
|
||||
@if (companyPoints < 0)
|
||||
{
|
||||
@@ -574,6 +576,7 @@
|
||||
}
|
||||
else
|
||||
{
|
||||
ComputeFirstOccurrences(); // Pre-compute which companies are first occurrences
|
||||
await AuditService.LogAsync(_userId, AuditActions.ReportViewed, "CVCheck", Id, $"Score: {_report.OverallScore}");
|
||||
}
|
||||
}
|
||||
@@ -703,10 +706,35 @@
|
||||
};
|
||||
}
|
||||
|
||||
private int GetPointsForCompany(string claimedCompany, string? matchedCompany)
|
||||
// Row indices at which a claimed company name appears for the first time.
// Rebuilt by ComputeFirstOccurrences().
private HashSet<int> _firstOccurrenceIndices = new();

// Rebuilds _firstOccurrenceIndices from the report's employment verifications.
// Claimed company names are compared case-insensitively; only the earliest row
// for each name is recorded.
private void ComputeFirstOccurrences()
{
    _firstOccurrenceIndices.Clear();

    var verifications = _report?.EmploymentVerifications;
    if (verifications is null)
    {
        return;
    }

    var alreadyListed = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    for (var row = 0; row < verifications.Count; row++)
    {
        // Add() returns false when the name was already recorded, i.e. this row is a repeat
        var isRepeat = !alreadyListed.Add(verifications[row].ClaimedCompany);
        if (isRepeat)
        {
            continue;
        }

        _firstOccurrenceIndices.Add(row);
    }
}
|
||||
|
||||
private int GetPointsForCompany(string claimedCompany, string? matchedCompany, int index)
|
||||
{
|
||||
if (_report?.Flags is null) return 0;
|
||||
|
||||
// Only show points for the first occurrence of each company
|
||||
if (!_firstOccurrenceIndices.Contains(index))
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Sum up all flags that mention this company in their description
|
||||
var companyFlags = _report.Flags
|
||||
.Where(f => f.ScoreImpact < 0 &&
|
||||
|
||||
Reference in New Issue
Block a user