private const int FuzzyMatchThreshold = 85;
private const int CacheExpirationDays = 30;

// Non-employment entity patterns organized by category.
// These are entities that exist in Companies House but are not typical employers.
// Every pattern is a lower-case word or phrase. Patterns are matched as whole
// words/phrases (see ContainsWholePhrase), never as raw substrings, so that
// "pta" does not hit "captain" and "church" does not hit "churchill".
private static readonly Dictionary<string, string[]> NonEmploymentEntityPatterns = new()
{
    ["Clubs"] = new[] { "club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados" },
    ["Associations"] = new[] { "association", "society", "federation", "institute", "institution", "guild", "chamber of commerce" },
    ["Trusts"] = new[] { "benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit", "superannuation", "provident" },
    ["Charities"] = new[] { "charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic" },
    ["Investment"] = new[] { "nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners" },
    ["Property"] = new[] { "freehold", "leasehold", "property management", "residents association", "management company rtm", "commonhold" },
    ["Religious"] = new[] { "church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry" },
    ["Sports"] = new[] { "football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club" },
    ["Educational"] = new[] { "old boys", "old girls", "alumni", "school association", "pta", "parent teacher" },
    ["Professional"] = new[] { "chartered institute", "royal college", "professional body", "trade body", "regulatory body" }
};

// SIC codes that indicate non-trading or non-employment entities
private static readonly HashSet<string> NonTradingSicCodes = new()
{
    "99999", // Dormant company
    "64209", // Activities of holding companies (shell companies)
    "68100", // Buying and selling of own real estate (often shell)
};

/// <summary>
/// Determines which non-employment entity categories the search is explicitly looking for.
/// </summary>
/// <remarks>
/// The search-result filter calls this once per search with the lower-cased original
/// company name and the lower-cased search query, then passes the result to
/// <see cref="IsValidEmploymentEntity"/> for every candidate title.
/// </remarks>
/// <param name="originalLower">The claimed company name, already lower-cased.</param>
/// <param name="queryLower">The search query actually used, already lower-cased.</param>
/// <returns>
/// Category names (keys of <see cref="NonEmploymentEntityPatterns"/>, e.g. "Clubs", "Trusts")
/// that should NOT be filtered out because the search itself mentions one of their patterns.
/// </returns>
private static HashSet<string> GetSearchEntityTypes(string originalLower, string queryLower)
{
    // Both inputs are inspected together: a pattern in either one whitelists its category.
    var searchTerms = originalLower + " " + queryLower;

    return NonEmploymentEntityPatterns
        .Where(entry => entry.Value.Any(pattern => ContainsWholePhrase(searchTerms, pattern)))
        .Select(entry => entry.Key)
        .ToHashSet();
}

/// <summary>
/// Checks whether a company title represents a valid employment entity.
/// Filters out non-employment entities (clubs, trusts, etc.) unless the search
/// explicitly targets that category.
/// </summary>
/// <param name="itemTitleLower">
/// Candidate company title. Matching is ordinal against lower-case patterns, so the
/// caller is expected to pass a lower-cased title.
/// </param>
/// <param name="allowedCategories">
/// Categories returned by <see cref="GetSearchEntityTypes"/>; these are never filtered.
/// </param>
/// <returns>
/// <c>false</c> when the title contains a pattern from any category that is not allowed;
/// otherwise <c>true</c>.
/// </returns>
private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
{
    foreach (var (category, patterns) in NonEmploymentEntityPatterns)
    {
        // The search explicitly asked for this kind of entity, so keep it.
        if (allowedCategories.Contains(category))
        {
            continue;
        }

        // A whole-word hit marks the title as a non-employment entity that was not searched for.
        if (patterns.Any(pattern => ContainsWholePhrase(itemTitleLower, pattern)))
        {
            return false;
        }
    }

    return true;
}

/// <summary>
/// Returns <c>true</c> when <paramref name="phrase"/> occurs in <paramref name="text"/>
/// as a whole word or phrase rather than as part of a longer word.
/// </summary>
/// <remarks>
/// An occurrence counts only if it starts at the beginning of the text or after a
/// non-alphanumeric character, and ends at the end of the text or before a
/// non-alphanumeric character. A trailing plural "s" or "es" is tolerated, so
/// "enthusiast" still matches "enthusiasts" and "church" still matches "churches".
/// Comparison is ordinal and case-sensitive.
/// </remarks>
private static bool ContainsWholePhrase(string text, string phrase)
{
    if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(phrase))
    {
        return false;
    }

    // True when index is past the end of the text or sits on a non-alphanumeric character.
    bool IsBoundary(int index) => index >= text.Length || !char.IsLetterOrDigit(text[index]);

    for (var start = text.IndexOf(phrase, StringComparison.Ordinal);
         start >= 0;
         start = text.IndexOf(phrase, start + 1, StringComparison.Ordinal))
    {
        // Reject hits that begin in the middle of a word ("pta" inside "captain").
        if (start > 0 && char.IsLetterOrDigit(text[start - 1]))
        {
            continue;
        }

        var end = start + phrase.Length;
        if (IsBoundary(end)
            || (text[end] == 's' && IsBoundary(end + 1))
            || (text[end] == 'e' && end + 1 < text.Length && text[end + 1] == 's' && IsBoundary(end + 2)))
        {
            return true;
        }
    }

    return false;
}

// Expanded skip words list for core identifier extraction.
// These words are too common to be meaningful differentiators between companies.
// Lookup is case-insensitive.
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
{
    // Articles and conjunctions
    "the", "a", "an", "and", "or", "of", "for", "in", "at", "on", "by", "to", "with",

    // Geographic - Countries and regions
    // The core-identifier extraction trims trailing '.' from each word before the
    // lookup, so the dotted abbreviations are also listed without the final dot.
    "uk", "u.k.", "u.k", "gb", "british", "britain", "england", "english", "scotland", "scottish",
    "wales", "welsh", "ireland", "irish", "northern",
    "europe", "european", "america", "american", "usa", "us", "u.s.", "u.s", "u.s.a.", "u.s.a",
    "canada", "canadian", "asia", "asian", "pacific", "atlantic",
    "australia", "australian", "africa", "african", "india", "indian",
    "france", "french", "germany", "german", "spain", "spanish", "italy", "italian",
    "japan", "japanese", "china", "chinese", "korea", "korean",
    "middle", "east", "west", "north", "south", "central", "western", "eastern",

    // Geographic - Cities
    "london", "manchester", "birmingham", "leeds", "glasgow", "edinburgh", "bristol",
    "liverpool", "sheffield", "newcastle", "cardiff", "belfast", "nottingham",
    "southampton", "portsmouth", "brighton", "leicester", "coventry", "hull",

    // Legal suffixes
    "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated", "corporation", "corp",
    "company", "co", "partners", "partnership", "enterprises", "unlimited",
    "registered", "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",

    // Business descriptors
    "group", "holdings", "holding", "parent", "subsidiary", "division", "branch",
    "services", "service", "solutions", "solution", "consulting", "consultants", "consultancy",
    "management", "systems", "system", "technologies", "technology", "tech",
    "industries", "industry", "industrial", "commercial", "trading", "trade",
    "business", "businesses", "operations", "operational", "professional", "professionals",
    "resources", "resource", "network", "networks", "associates", "associated",

    // Size/Scope descriptors
    "national", "international", "global", "worldwide", "world", "regional", "local",
    "universal", "general", "standard", "premier", "prime",

    // Quality/Marketing terms
    "new", "modern", "advanced", "innovative", "elite", "premium",
    "quality", "superior", "excellent", "best", "top", "leading", "major",

    // Ownership indicators
    "royal", "imperial", "crown", "state", "public", "private", "independent",
    "mutual", "cooperative", "coop", "community",

    // Time-related
    "century", "millennium", "annual", "year", "years",

    // Numbers as words
    "one", "two", "three", "four", "five", "first", "second", "third"
};
@@ -1025,19 +1130,6 @@ public sealed class CompanyVerifierService : ICompanyVerifierService { if (string.IsNullOrWhiteSpace(companyName)) return new List(); - // Common words to skip when finding core identifiers - var skipWords = new HashSet(StringComparer.OrdinalIgnoreCase) - { - "the", "a", "an", "and", "of", "for", "in", "at", "on", "by", - "uk", "u.k.", "gb", "british", "england", "scotland", "wales", "ireland", - "national", "international", "global", "worldwide", "world", - "group", "holdings", "services", "solutions", "consulting", "management", - "limited", "ltd", "plc", "llp", "inc", "corporation", "corp", - "company", "co", "partners", "partnership", "enterprises", - "europe", "european", "america", "american", "canada", "canadian", "asia", "asian", - "north", "south", "east", "west", "central", "new" - }; - // Remove parenthetical content first var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim(); @@ -1048,7 +1140,7 @@ public sealed class CompanyVerifierService : ICompanyVerifierService foreach (var word in words) { var cleanWord = word.Trim('.', ',', '\''); - if (cleanWord.Length >= 2 && !skipWords.Contains(cleanWord)) + if (cleanWord.Length >= 2 && !SkipWords.Contains(cleanWord)) { coreWords.Add(cleanWord.ToUpperInvariant()); } diff --git a/src/TrueCV.Web/Components/Pages/Report.razor b/src/TrueCV.Web/Components/Pages/Report.razor index 2f77264..46abbb7 100644 --- a/src/TrueCV.Web/Components/Pages/Report.razor +++ b/src/TrueCV.Web/Components/Pages/Report.razor @@ -169,8 +169,10 @@ - @foreach (var verification in _report.EmploymentVerifications) + @for (int i = 0; i < _report.EmploymentVerifications.Count; i++) { + var verification = _report.EmploymentVerifications[i]; + var index = i; @verification.ClaimedCompany @@ -223,7 +225,7 @@ @{ - var companyPoints = GetPointsForCompany(verification.ClaimedCompany, verification.MatchedCompanyName); + var companyPoints = 
GetPointsForCompany(verification.ClaimedCompany, verification.MatchedCompanyName, index); } @if (companyPoints < 0) { @@ -574,6 +576,7 @@ } else { + ComputeFirstOccurrences(); // Pre-compute which companies are first occurrences await AuditService.LogAsync(_userId, AuditActions.ReportViewed, "CVCheck", Id, $"Score: {_report.OverallScore}"); } } @@ -703,10 +706,35 @@ }; } - private int GetPointsForCompany(string claimedCompany, string? matchedCompany) + // Lookup for first occurrence of each company (pre-computed when report loads) + private HashSet _firstOccurrenceIndices = new(); + + private void ComputeFirstOccurrences() + { + _firstOccurrenceIndices.Clear(); + if (_report?.EmploymentVerifications is null) return; + + var seenCompanies = new HashSet(StringComparer.OrdinalIgnoreCase); + for (int i = 0; i < _report.EmploymentVerifications.Count; i++) + { + var company = _report.EmploymentVerifications[i].ClaimedCompany; + if (seenCompanies.Add(company)) + { + _firstOccurrenceIndices.Add(i); + } + } + } + + private int GetPointsForCompany(string claimedCompany, string? matchedCompany, int index) { if (_report?.Flags is null) return 0; + // Only show points for the first occurrence of each company + if (!_firstOccurrenceIndices.Contains(index)) + { + return 0; + } + // Sum up all flags that mention this company in their description var companyFlags = _report.Flags .Where(f => f.ScoreImpact < 0 &&