2026-01-20 20:00:24 +01:00
using System.Text.Json ;
2026-01-18 19:20:50 +01:00
using FuzzySharp ;
using Microsoft.EntityFrameworkCore ;
using Microsoft.Extensions.Logging ;
using TrueCV.Application.DTOs ;
2026-01-20 16:45:43 +01:00
using TrueCV.Application.Helpers ;
2026-01-18 19:20:50 +01:00
using TrueCV.Application.Interfaces ;
using TrueCV.Application.Models ;
using TrueCV.Domain.Entities ;
using TrueCV.Infrastructure.Data ;
using TrueCV.Infrastructure.ExternalApis ;
namespace TrueCV.Infrastructure.Services ;
public sealed class CompanyVerifierService : ICompanyVerifierService
{
/// <summary>Minimum fuzzy score a candidate name must reach to be accepted as a match.</summary>
private const int FuzzyMatchThreshold = 85;

/// <summary>Cached company rows older than this many days are ignored by cache lookups.</summary>
private const int CacheExpirationDays = 30;

/// <summary>Client used for Companies House search, company-detail and officer lookups.</summary>
private readonly CompaniesHouseClient _companiesHouseClient;

/// <summary>Factory that creates a fresh <see cref="ApplicationDbContext"/> per cache operation.</summary>
private readonly IDbContextFactory<ApplicationDbContext> _dbContextFactory;

/// <summary>Logger for verification diagnostics.</summary>
private readonly ILogger<CompanyVerifierService> _logger;
Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
(clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
(now only shows points on first occurrence, subsequent rows show 0)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:33:16 +01:00
// Name fragments, grouped by category, for entities that are registered at
// Companies House but are not typical employers.
private static readonly Dictionary<string, string[]> NonEmploymentEntityPatterns = new()
{
    ["Clubs"] =
        ["club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados"],
    ["Associations"] =
        ["association", "society", "federation", "institute", "institution", "guild", "chamber of commerce"],
    ["Trusts"] =
        ["benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit", "superannuation", "provident"],
    ["Charities"] =
        ["charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic"],
    ["Investment"] =
        ["nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners"],
    ["Property"] =
        ["freehold", "leasehold", "property management", "residents association", "management company rtm", "commonhold"],
    ["Religious"] =
        ["church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry"],
    ["Sports"] =
        ["football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club"],
    ["Educational"] =
        ["old boys", "old girls", "alumni", "school association", "pta", "parent teacher"],
    ["Professional"] =
        ["chartered institute", "royal college", "professional body", "trade body", "regulatory body"],
};
// SIC codes treated as markers of non-trading or non-employment entities.
private static readonly HashSet<string> NonTradingSicCodes =
[
    // Dormant company
    "99999",
    // Activities of holding companies (shell companies)
    "64209",
    // Buying and selling of own real estate (often shell)
    "68100",
];
Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).
- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:41:27 +01:00
// Name words that suggest a subsidiary rather than the main trading company.
// A candidate who says they worked for "ASDA" most likely means ASDA STORES LIMITED,
// not ASDA DELIVERY LIMITED or ASDA PROPERTY HOLDINGS LIMITED.
// Lookups are case-insensitive.
// NOTE(review): "real estate" is the only two-word entry; confirm the consumer
// matches on phrases rather than single words, otherwise it can never hit.
private static readonly HashSet<string> SubsidiaryIndicators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    // Logistics / operations
    "delivery",
    "distribution",
    "logistics",
    "transport",
    "fleet",
    "haulage",
    "warehousing",
    "fulfilment",

    // Property
    "property",
    "properties",
    "estates",
    "land",
    "real estate",
    "developments",

    // Financial / holding
    "holdings",
    "holding",
    "investments",
    "capital",
    "finance",
    "financial",
    "treasury",

    // Administrative
    "nominees",
    "nominee",
    "trustees",
    "trustee",
    "secretarial",
    "registrars",

    // Insurance
    "insurance",
    "assurance",
    "underwriting",

    // Specific functions
    "leasing",
    "rentals",
    "procurement",
    "sourcing",
};
// Name words that suggest the main trading/employer company; these are preferred.
// Lookups are case-insensitive.
private static readonly HashSet<string> MainCompanyIndicators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "stores",
    "retail",
    "supermarkets",
    "superstores",
    "hypermarkets",
    "manufacturing",
    "operations",
    "trading",
};
2026-01-20 20:00:24 +01:00
2026-01-18 19:20:50 +01:00
/// <summary>
/// Creates the service and stores its collaborators.
/// </summary>
/// <param name="companiesHouseClient">Client for Companies House lookups.</param>
/// <param name="dbContextFactory">Factory used to open a database context for cache access.</param>
/// <param name="logger">Logger for diagnostics.</param>
public CompanyVerifierService(
    CompaniesHouseClient companiesHouseClient,
    IDbContextFactory<ApplicationDbContext> dbContextFactory,
    ILogger<CompanyVerifierService> logger)
{
    (_companiesHouseClient, _dbContextFactory, _logger) =
        (companiesHouseClient, dbContextFactory, logger);
}
/// <summary>
/// Verifies a claimed employer: first against the local cache, then against
/// Companies House using one or more generated search queries, and finally runs
/// the date, dormancy, size and job-title checks on the matched company.
/// </summary>
/// <param name="companyName">Employer name as claimed; must not be null or whitespace.</param>
/// <param name="startDate">Claimed employment start date, if any.</param>
/// <param name="endDate">Claimed employment end date, if any.</param>
/// <param name="jobTitle">Claimed job title, if any.</param>
/// <returns>
/// A verified result with any raised flags, or an unverified result when no match
/// is found or the Companies House rate limit is hit.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="companyName"/> is null or whitespace.</exception>
public async Task<CompanyVerificationResult> VerifyCompanyAsync(
    string companyName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle = null)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(companyName);
    _logger.LogDebug("Verifying company: {CompanyName}", companyName);

    var flags = new List<CompanyVerificationFlag>();

    // Step 1: cache. A cached company is only usable if it already existed at the
    // claimed start date (or if either date is unknown).
    var cachedMatch = await FindCachedMatchAsync(companyName);
    if (cachedMatch is not null)
    {
        var usableForClaim = !startDate.HasValue
            || cachedMatch.IncorporationDate == null
            || cachedMatch.IncorporationDate <= startDate.Value;

        if (usableForClaim)
        {
            _logger.LogDebug("Found cached company match for: {CompanyName}", companyName);
            return CreateResultFromCache(cachedMatch, companyName, startDate, endDate, jobTitle, flags);
        }

        _logger.LogDebug(
            "Cached company {CachedName} was incorporated after claimed start date, searching for alternatives",
            cachedMatch.CompanyName);
    }

    // Step 2: Companies House, trying each fallback query until one yields a match.
    try
    {
        var searchQueries = GenerateSearchQueries(companyName);
        var quotedQueries = string.Join(", ", searchQueries.Select(q => $"'{q}'"));
        _logger.LogDebug(
            "Generated {Count} search queries for '{CompanyName}': {Queries}",
            searchQueries.Count, companyName, quotedQueries);

        (CompaniesHouseSearchItem Item, int Score)? bestMatch = null;

        foreach (var query in searchQueries)
        {
            _logger.LogDebug("Searching Companies House with query: {Query}", query);

            var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(query);
            var candidates = searchResponse?.Items;
            if (candidates is null || candidates.Count == 0)
            {
                continue;
            }

            // Both the original name and the query that produced these results are
            // passed so the matcher can score against either; the claimed start date
            // lets it prefer companies that existed at that time.
            bestMatch = FindBestMatch(companyName, query, candidates, startDate);
            if (bestMatch is null)
            {
                continue;
            }

            _logger.LogDebug("Found match with query '{Query}': {Company}", query, bestMatch.Value.Item.Title);
            break;
        }

        if (!bestMatch.HasValue)
        {
            _logger.LogDebug(
                "No valid match found for: {CompanyName} after trying {Count} queries",
                companyName, searchQueries.Count);
            return CreateUnverifiedResult(
                companyName, startDate, endDate, jobTitle,
                "Company name could not be verified against official records");
        }

        var match = bestMatch.Value;
        var matchedItem = match.Item;

        // Step 3: fetch the full record and cache it together with the search item.
        var companyDetails = await _companiesHouseClient.GetCompanyAsync(matchedItem.CompanyNumber);
        await CacheCompanyAsync(matchedItem, companyDetails);

        _logger.LogInformation(
            "Verified company {ClaimedName} matched to {MatchedName} with score {Score}%",
            companyName, matchedItem.Title, match.Score);

        // Step 4: gather the facts the checks need. SIC codes prefer the detail
        // record and fall back to the search item.
        var incorporationDate = DateHelpers.ParseDate(matchedItem.DateOfCreation);
        var dissolutionDate = DateHelpers.ParseDate(matchedItem.DateOfCessation);
        var companyStatus = matchedItem.CompanyStatus;
        var companyType = matchedItem.CompanyType;
        var sicCodes = companyDetails?.SicCodes ?? matchedItem.SicCodes;
        var accountsCategory = companyDetails?.Accounts?.LastAccounts?.Type;

        // Check 1: employment before company incorporation
        CheckIncorporationDate(flags, startDate, incorporationDate, matchedItem.Title);

        // Check 2: employment at dissolved company
        CheckDissolutionDate(flags, endDate, dissolutionDate, companyStatus, matchedItem.Title);

        // Check 3: dormant company
        CheckDormantCompany(flags, accountsCategory, jobTitle, matchedItem.Title);

        // Check 4: company size vs job title
        CheckCompanySizeVsRole(flags, accountsCategory, jobTitle, matchedItem.Title);

        // Check 5: job title plausibility for PLCs
        var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, companyType);
        if (jobPlausible is false)
        {
            var implausibleTitleFlag = new CompanyVerificationFlag
            {
                Type = "ImplausibleJobTitle",
                Severity = "Critical",
                Message = jobNotes ?? "Job title requires verification",
                ScoreImpact = -15
            };
            flags.Add(implausibleTitleFlag);
        }

        return new CompanyVerificationResult
        {
            ClaimedCompany = companyName,
            MatchedCompanyName = matchedItem.Title,
            MatchedCompanyNumber = matchedItem.CompanyNumber,
            MatchScore = match.Score,
            IsVerified = true,
            VerificationNotes = null,
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            CompanyType = companyType,
            CompanyStatus = companyStatus,
            IncorporationDate = incorporationDate,
            DissolutionDate = dissolutionDate,
            AccountsCategory = accountsCategory,
            SicCodes = sicCodes,
            ClaimedJobTitle = jobTitle,
            JobTitlePlausible = jobPlausible,
            JobTitleNotes = jobNotes,
            Flags = flags
        };
    }
    catch (CompaniesHouseRateLimitException ex)
    {
        _logger.LogWarning(ex, "Rate limit hit while verifying company: {CompanyName}", companyName);
        return CreateUnverifiedResult(
            companyName, startDate, endDate, jobTitle,
            "Verification temporarily unavailable due to rate limiting");
    }
}
/// <summary>
/// Runs a Companies House search and maps every returned item to a
/// <see cref="CompanySearchResult"/>.
/// </summary>
/// <param name="query">Search text; must not be null or whitespace.</param>
/// <returns>The mapped results, or an empty list when the response has no items.</returns>
/// <exception cref="ArgumentException"><paramref name="query"/> is null or whitespace.</exception>
public async Task<List<CompanySearchResult>> SearchCompaniesAsync(string query)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(query);
    _logger.LogDebug("Searching companies for query: {Query}", query);

    var response = await _companiesHouseClient.SearchCompaniesAsync(query);
    var items = response?.Items;
    if (items is null)
    {
        return [];
    }

    var results = new List<CompanySearchResult>();
    foreach (var item in items)
    {
        results.Add(new CompanySearchResult
        {
            CompanyNumber = item.CompanyNumber,
            CompanyName = item.Title,
            // A missing status is reported as "Unknown" rather than null.
            CompanyStatus = item.CompanyStatus ?? "Unknown",
            IncorporationDate = DateHelpers.ParseDate(item.DateOfCreation),
            AddressSnippet = item.AddressSnippet
        });
    }

    return results;
}
2026-01-20 20:00:24 +01:00
/// <summary>
/// Checks whether a candidate appears among a company's officers (director or
/// secretary roles) with a name that fuzzy-matches and, when claimed dates are
/// supplied, an appointment period that overlaps the claimed employment.
/// </summary>
/// <param name="companyNumber">Companies House company number.</param>
/// <param name="candidateName">Candidate's name as claimed.</param>
/// <param name="startDate">Claimed start date, if any.</param>
/// <param name="endDate">Claimed end date, if any.</param>
/// <returns>
/// <c>true</c> when a matching officer is found; <c>false</c> when officers were
/// retrieved but none matched; <c>null</c> when the check could not be performed
/// (blank input, no officers returned, rate limiting, or any other error).
/// </returns>
public async Task<bool?> VerifyDirectorAsync(
    string companyNumber,
    string candidateName,
    DateOnly? startDate,
    DateOnly? endDate)
{
    // High threshold: person names are short, so loose matches are unreliable.
    const int OfficerNameMatchThreshold = 80;

    if (string.IsNullOrWhiteSpace(companyNumber) || string.IsNullOrWhiteSpace(candidateName))
    {
        return null;
    }

    try
    {
        var officers = await _companiesHouseClient.GetOfficersAsync(companyNumber);
        if (officers?.Items is null || officers.Items.Count == 0)
        {
            _logger.LogDebug("No officers found for company {CompanyNumber}", companyNumber);
            return null;
        }

        // Normalize candidate name once for comparison against every officer.
        var normalizedCandidate = NormalizeName(candidateName);

        foreach (var officer in officers.Items)
        {
            // Only director and secretary roles are considered.
            var role = officer.OfficerRole?.ToLowerInvariant() ?? "";
            if (!role.Contains("director") && !role.Contains("secretary"))
            {
                continue;
            }

            // Fuzzy match the name
            var normalizedOfficer = NormalizeName(officer.Name);
            var matchScore = Fuzz.Ratio(normalizedCandidate, normalizedOfficer);
            if (matchScore < OfficerNameMatchThreshold)
            {
                continue;
            }

            var appointedOn = DateHelpers.ParseDate(officer.AppointedOn);
            var resignedOn = DateHelpers.ParseDate(officer.ResignedOn);

            // If no claimed dates, a name match alone is sufficient.
            if (!startDate.HasValue && !endDate.HasValue)
            {
                _logger.LogDebug(
                    "Found matching director {OfficerName} for candidate {CandidateName} at company {CompanyNumber}",
                    officer.Name, candidateName, companyNumber);
                return true;
            }

            // Otherwise the claimed period must overlap the appointment period.
            if (DatesOverlap(startDate, endDate, appointedOn, resignedOn))
            {
                _logger.LogDebug(
                    "Verified director {OfficerName} matches candidate {CandidateName} with overlapping dates",
                    officer.Name, candidateName);
                return true;
            }
        }

        _logger.LogDebug(
            "No matching director found for candidate {CandidateName} at company {CompanyNumber}",
            candidateName, companyNumber);
        return false;
    }
    catch (CompaniesHouseRateLimitException ex)
    {
        // Include the exception so the rate-limit details are not lost, matching
        // the equivalent handler in VerifyCompanyAsync.
        _logger.LogWarning(ex, "Rate limit hit while verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
    catch (Exception ex)
    {
        // Best-effort check: any other failure is logged and reported as "unknown".
        _logger.LogError(ex, "Error verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
}
/// <summary>
/// Normalizes a person's name for comparison: upper-cases and trims it, and
/// converts the "SURNAME, Firstname" form into "FIRSTNAME SURNAME".
/// </summary>
/// <param name="name">Raw name; null, empty or whitespace yields an empty string.</param>
/// <returns>The normalized name with no leading or trailing whitespace.</returns>
private static string NormalizeName(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return "";
    }

    var normalized = name.ToUpperInvariant().Trim();

    // Only the first comma splits surname from forenames; anything after it
    // (including further commas) stays with the forenames.
    var commaIndex = normalized.IndexOf(',');
    if (commaIndex >= 0)
    {
        var surname = normalized[..commaIndex].Trim();
        var forenames = normalized[(commaIndex + 1)..].Trim();

        // Trim the joined result so an empty side ("SMITH," or ", JOHN") does not
        // leave a stray leading/trailing space that would skew fuzzy scores.
        normalized = $"{forenames} {surname}".Trim();
    }

    return normalized;
}
/// <summary>
/// Tests whether two date ranges intersect. A missing bound is treated as
/// open-ended, and a range with neither bound is considered to overlap anything.
/// </summary>
/// <returns><c>true</c> when the ranges share at least one day (end points inclusive).</returns>
private static bool DatesOverlap(DateOnly? start1, DateOnly? end1, DateOnly? start2, DateOnly? end2)
{
    var firstIsUnbounded = start1 is null && end1 is null;
    var secondIsUnbounded = start2 is null && end2 is null;
    if (firstIsUnbounded || secondIsUnbounded)
    {
        // Nothing to compare against: assume overlap.
        return true;
    }

    // Fill in open ends with the extreme representable dates.
    var firstStart = start1.GetValueOrDefault(DateOnly.MinValue);
    var firstEnd = end1.GetValueOrDefault(DateOnly.MaxValue);
    var secondStart = start2.GetValueOrDefault(DateOnly.MinValue);
    var secondEnd = end2.GetValueOrDefault(DateOnly.MaxValue);

    // The ranges are disjoint only if one begins after the other has finished.
    var disjoint = firstStart > secondEnd || secondStart > firstEnd;
    return !disjoint;
}
#region Verification Checks
/// <summary>
/// Adds a critical flag when the claimed start date is earlier than the
/// company's incorporation date. Does nothing if either date is unknown.
/// </summary>
private static void CheckIncorporationDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedStartDate,
    DateOnly? incorporationDate,
    string companyName)
{
    if (!claimedStartDate.HasValue || !incorporationDate.HasValue)
    {
        return;
    }

    var startedBeforeIncorporation = claimedStartDate.Value < incorporationDate.Value;
    if (!startedBeforeIncorporation)
    {
        return;
    }

    var flag = new CompanyVerificationFlag
    {
        Type = "EmploymentBeforeIncorporation",
        Severity = "Critical",
        Message = $"Claimed employment at '{companyName}' starting {claimedStartDate:MMM yyyy} is before company incorporation date {incorporationDate:MMM yyyy}",
        ScoreImpact = -20
    };
    flags.Add(flag);
}
/// <summary>
/// Flags employment claims that extend past a company's dissolution. Applies
/// only when a dissolution date is known and the status is dissolved,
/// liquidation or administration.
/// </summary>
private static void CheckDissolutionDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedEndDate,
    DateOnly? dissolutionDate,
    string? companyStatus,
    string companyName)
{
    var normalizedStatus = companyStatus?.ToLowerInvariant();
    var isDissolvedStatus = normalizedStatus is "dissolved" or "liquidation" or "administration";
    if (!dissolutionDate.HasValue || !isDissolvedStatus)
    {
        return;
    }

    if (!claimedEndDate.HasValue)
    {
        // No end date means the candidate claims to still work there.
        flags.Add(new CompanyVerificationFlag
        {
            Type = "CurrentEmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claims current employment at '{companyName}' but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -25
        });
        return;
    }

    // Allow a 3 month grace period after dissolution for wind-down.
    var windDownCutoff = dissolutionDate.Value.AddMonths(3);
    if (claimedEndDate.Value > windDownCutoff)
    {
        flags.Add(new CompanyVerificationFlag
        {
            Type = "EmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claimed employment at '{companyName}' until {claimedEndDate:MMM yyyy} but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -20
        });
    }
}
/// <summary>
/// Adds a warning when the company's accounts category contains "dormant" and
/// the claimed job title is not a director or company-secretary role.
/// </summary>
private static void CheckDormantCompany(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    if (string.IsNullOrWhiteSpace(accountsCategory))
    {
        return;
    }

    var filesDormantAccounts = accountsCategory.ToLowerInvariant().Contains("dormant");
    if (!filesDormantAccounts)
    {
        return;
    }

    // Directors and company secretaries can legitimately maintain a dormant
    // company; any other role there is suspicious. A missing title is not exempt.
    var loweredTitle = (jobTitle ?? "").ToLowerInvariant();
    if (loweredTitle.Contains("director") || loweredTitle.Contains("company secretary"))
    {
        return;
    }

    flags.Add(new CompanyVerificationFlag
    {
        Type = "EmploymentAtDormantCompany",
        Severity = "Warning",
        Message = $"Claimed active employment as '{jobTitle}' at '{companyName}' which files dormant accounts",
        ScoreImpact = -10
    });
}
/// <summary>
/// Adds a warning when a senior-sounding job title is claimed at a company
/// whose accounts category contains "micro". Does nothing if either the
/// accounts category or the job title is missing.
/// </summary>
private static void CheckCompanySizeVsRole(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    if (string.IsNullOrWhiteSpace(accountsCategory) || string.IsNullOrWhiteSpace(jobTitle))
    {
        return;
    }

    // Micro-entity: < 10 employees, < £632k turnover
    var filesMicroAccounts = accountsCategory.ToLowerInvariant().Contains("micro");
    if (!filesMicroAccounts)
    {
        return;
    }

    // Title fragments treated as senior-management markers.
    string[] seniorTitleMarkers =
    [
        "vp",
        "vice president",
        "head of",
        "chief",
        "director of",
        "senior director",
    ];

    var loweredTitle = jobTitle.ToLowerInvariant();
    var claimsSeniorRole = seniorTitleMarkers.Any(marker => loweredTitle.Contains(marker));
    if (!claimsSeniorRole)
    {
        return;
    }

    // A layered management title at a company this small is suspicious.
    flags.Add(new CompanyVerificationFlag
    {
        Type = "SeniorRoleAtMicroCompany",
        Severity = "Warning",
        Message = $"Claimed senior role '{jobTitle}' at '{companyName}' which files micro-entity accounts (typically <10 employees)",
        ScoreImpact = -10
    });
}
/// <summary>
/// Judges whether a claimed job title is plausible for the company type.
/// Only PLCs are scrutinised: C-suite, board-level and VP-level titles there
/// are reported as implausible (requiring verification).
/// </summary>
/// <param name="jobTitle">Claimed job title.</param>
/// <param name="companyType">Company type string from the matched company.</param>
/// <returns>
/// <c>(null, null)</c> when either input is missing; <c>(false, notes)</c> for a
/// C-suite, board or VP-level title at a PLC; otherwise <c>(true, null)</c>.
/// </returns>
private static (bool? IsPlausible, string? Notes) CheckJobTitlePlausibility(string? jobTitle, string? companyType)
{
    if (string.IsNullOrWhiteSpace(jobTitle) || string.IsNullOrWhiteSpace(companyType))
    {
        return (null, null);
    }

    var title = jobTitle.Trim().ToLowerInvariant();
    var type = companyType.Trim().ToLowerInvariant();

    // Check if this is a PLC (Public Limited Company); nothing else is flagged here.
    var isPlc = type.Contains("plc") || type.Contains("public limited");
    if (!isPlc)
    {
        return (true, null);
    }

    // Split the title into alphanumeric words so acronyms are matched as whole
    // words. Substring matching is wrong for them: "director" contains "cto",
    // "coordinator" contains "coo", "precious" contains "cio".
    var words = new string(title.Select(c => char.IsLetterOrDigit(c) ? c : ' ').ToArray())
        .Split(' ', StringSplitOptions.RemoveEmptyEntries);
    bool HasWord(string word) => Array.IndexOf(words, word) >= 0;

    var isVicePresident = title.Contains("vice president") || title.Contains("vice-president");

    // Check for C-suite / very senior roles
    var isCsuiteRole = HasWord("ceo") ||
        title.Contains("chief executive") ||
        HasWord("cto") ||
        title.Contains("chief technology") ||
        HasWord("cfo") ||
        title.Contains("chief financial") ||
        HasWord("coo") ||
        title.Contains("chief operating") ||
        HasWord("cio") ||
        title.Contains("chief information") ||
        title.Contains("managing director") ||
        title == "md" ||
        title.Contains("chairman") ||
        title.Contains("chairwoman") ||
        title.Contains("chairperson") ||
        // "president" alone is C-suite; "vice president" is handled as VP-level
        // below so that it receives the VP-specific note.
        (title.Contains("president") && !isVicePresident);

    // Check for board-level roles. A bare "director" counts; "director of X"
    // and similar functional titles deliberately do not.
    var isBoardRole = title.Contains("board member") ||
        title.Contains("non-executive director") ||
        title.Contains("executive director") ||
        title == "director";

    if (isCsuiteRole || isBoardRole)
    {
        return (false, $"Claimed senior role '{jobTitle}' at a PLC requires verification - C-suite positions at public companies are publicly disclosed");
    }

    // Check for VP/SVP/EVP at PLCs (also usually disclosed)
    var isVpRole = isVicePresident ||
        HasWord("vp") ||
        HasWord("svp") ||
        HasWord("evp");

    if (isVpRole)
    {
        return (false, $"Claimed VP-level role '{jobTitle}' at a PLC - senior positions at public companies should be verifiable");
    }

    return (true, null);
}
#endregion
#region Helper Methods
2026-01-18 19:20:50 +01:00
/// <summary>
/// Looks for a recently cached company whose name fuzzy-matches the claimed name.
/// </summary>
/// <param name="companyName">Claimed company name.</param>
/// <returns>
/// The highest-scoring cached company at or above <c>FuzzyMatchThreshold</c> among
/// rows cached within the last <c>CacheExpirationDays</c> days, or <c>null</c>
/// when none qualifies.
/// </returns>
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
{
    var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);

    // Upper-case the claimed name once instead of once per cached row.
    var normalizedName = companyName.ToUpperInvariant();

    await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

    // Read-only lookup on a context that is disposed on return, so change
    // tracking would be pure overhead.
    var cachedCompanies = await dbContext.CompanyCache
        .AsNoTracking()
        .Where(c => c.CachedAt >= cutoffDate)
        .ToListAsync();

    // Fuzzy scoring cannot be translated to SQL, so it runs in memory over the
    // non-expired rows. An empty list simply yields null.
    var bestMatch = cachedCompanies
        .Where(c => !string.IsNullOrWhiteSpace(c.CompanyName))
        .Select(c => new { Company = c, Score = Fuzz.TokenSetRatio(normalizedName, c.CompanyName.ToUpperInvariant()) })
        .Where(m => m.Score >= FuzzyMatchThreshold)
        .OrderByDescending(m => m.Score)
        .FirstOrDefault();

    return bestMatch?.Company;
}
2026-01-20 21:21:21 +01:00
private ( CompaniesHouseSearchItem Item , int Score ) ? FindBestMatch (
2026-01-18 19:20:50 +01:00
string companyName ,
2026-01-20 21:49:26 +01:00
string searchQuery ,
2026-01-20 21:14:01 +01:00
List < CompaniesHouseSearchItem > items ,
DateOnly ? claimedStartDate )
2026-01-18 19:20:50 +01:00
{
2026-01-20 21:49:26 +01:00
var normalizedOriginal = companyName . ToUpperInvariant ( ) ;
var normalizedQuery = searchQuery . ToUpperInvariant ( ) ;
2026-01-18 19:20:50 +01:00
2026-01-20 22:13:23 +01:00
// Extract core identifying words that MUST appear in any valid match
// This prevents "BMW Group Canada" matching "CANADA LIFE GROUP" just because of common words
// and "Lloyds Bowmaker" matching "LLOYDS ALARMS" (missing "Bowmaker")
var coreWords = ExtractCoreIdentifiers ( companyName ) ;
var queryCoreWords = ExtractCoreIdentifiers ( searchQuery ) ;
var originalLower = companyName . ToLowerInvariant ( ) ;
var queryLower = searchQuery . ToLowerInvariant ( ) ;
Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
(clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
(now only shows points on first occurrence, subsequent rows show 0)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:33:16 +01:00
// Determine which entity types the search is explicitly looking for
var searchEntityTypes = GetSearchEntityTypes ( originalLower , queryLower ) ;
2026-01-20 22:13:23 +01:00
2026-01-20 21:49:26 +01:00
// Match against both the original company name AND the search query used
// This handles cases like "Matthew Walker (Northern Foods Plc)" where we
// search for "Northern Foods Plc" but need to match against it, not the full name
2026-01-18 19:20:50 +01:00
var matches = items
2026-01-20 20:58:12 +01:00
. Where ( item = > ! string . IsNullOrWhiteSpace ( item . Title ) )
2026-01-20 22:13:23 +01:00
. Where ( item = >
{
var itemTitle = item . Title . ToUpperInvariant ( ) ;
var itemTitleLower = item . Title . ToLowerInvariant ( ) ;
// Validate that ALL core identifiers appear in the match
// "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match
var hasAllOriginalCores = coreWords . Count = = 0 | | coreWords . All ( w = > itemTitle . Contains ( w ) ) ;
var hasAllQueryCores = queryCoreWords . Count = = 0 | | queryCoreWords . All ( w = > itemTitle . Contains ( w ) ) ;
if ( ! hasAllOriginalCores & & ! hasAllQueryCores ) return false ;
Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
(clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
(now only shows points on first occurrence, subsequent rows show 0)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:33:16 +01:00
// Filter out non-employment entities unless explicitly searching for that type
if ( ! IsValidEmploymentEntity ( itemTitleLower , searchEntityTypes ) )
2026-01-20 22:13:23 +01:00
{
Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
(clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
(now only shows points on first occurrence, subsequent rows show 0)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:33:16 +01:00
return false ;
2026-01-20 22:13:23 +01:00
}
return true ;
} )
2026-01-20 21:49:26 +01:00
. Select ( item = >
{
var itemTitle = item . Title . ToUpperInvariant ( ) ;
Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).
- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:41:27 +01:00
var itemTitleLower = item . Title . ToLowerInvariant ( ) ;
2026-01-20 21:49:26 +01:00
var scoreVsOriginal = Fuzz . TokenSetRatio ( normalizedOriginal , itemTitle ) ;
var scoreVsQuery = Fuzz . TokenSetRatio ( normalizedQuery , itemTitle ) ;
Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).
- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:41:27 +01:00
var baseScore = Math . Max ( scoreVsOriginal , scoreVsQuery ) ;
// Calculate priority adjustment for main company vs subsidiary
var priorityScore = CalculateCompanyPriorityScore ( itemTitleLower , originalLower , queryLower ) ;
return ( Item : item , Score : baseScore , PriorityScore : priorityScore ) ;
2026-01-20 21:49:26 +01:00
} )
2026-01-18 19:20:50 +01:00
. Where ( m = > m . Score > = FuzzyMatchThreshold )
. ToList ( ) ;
2026-01-20 21:49:26 +01:00
_logger . LogDebug ( "Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')" , matches . Count , companyName , searchQuery ) ;
2026-01-20 21:21:21 +01:00
foreach ( var m in matches . Take ( 5 ) )
{
Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).
- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:41:27 +01:00
_logger . LogDebug ( " Match: {Title} ({Number}), Score: {Score}, Priority: {Priority}, DateOfCreation: {Date}" ,
m . Item . Title , m . Item . CompanyNumber , m . Score , m . PriorityScore , m . Item . DateOfCreation ? ? "null" ) ;
2026-01-20 21:21:21 +01:00
}
2026-01-20 21:14:01 +01:00
if ( matches . Count = = 0 ) return null ;
// If we have a claimed start date, prefer companies that existed at that time
if ( claimedStartDate . HasValue )
{
2026-01-20 21:21:21 +01:00
_logger . LogDebug ( "Filtering for companies that existed at claimed start date: {StartDate}" , claimedStartDate . Value ) ;
2026-01-20 21:14:01 +01:00
var existedAtStartDate = matches
. Where ( m = >
{
var incDate = DateHelpers . ParseDate ( m . Item . DateOfCreation ) ;
2026-01-20 21:21:21 +01:00
var existed = incDate = = null | | incDate < = claimedStartDate . Value ;
_logger . LogDebug ( " {Title}: IncDate={IncDate}, Existed={Existed}" ,
m . Item . Title , incDate ? . ToString ( ) ? ? "null" , existed ) ;
return existed ;
2026-01-20 21:14:01 +01:00
} )
Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).
- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:41:27 +01:00
// Sort by priority first, then by fuzzy score
. OrderByDescending ( m = > m . PriorityScore )
. ThenByDescending ( m = > m . Score )
2026-01-20 21:14:01 +01:00
. ToList ( ) ;
2026-01-20 21:21:21 +01:00
_logger . LogDebug ( "Companies that existed at start date: {Count}" , existedAtStartDate . Count ) ;
2026-01-20 21:14:01 +01:00
// If any matches existed at the start date, prefer those
if ( existedAtStartDate . Count > 0 )
{
Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).
- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:41:27 +01:00
var selected = existedAtStartDate [ 0 ] ;
_logger . LogDebug ( "Selected: {Title} ({Number}), Priority: {Priority}" , selected . Item . Title , selected . Item . CompanyNumber , selected . PriorityScore ) ;
return ( selected . Item , selected . Score ) ;
2026-01-20 21:14:01 +01:00
}
2026-01-20 21:26:36 +01:00
// No companies existed at the claimed start date - don't match a wrong company
_logger . LogDebug ( "No companies found that existed at claimed start date {StartDate}, returning no match" , claimedStartDate . Value ) ;
return null ;
2026-01-20 21:14:01 +01:00
}
Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).
- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:41:27 +01:00
// No start date provided - sort by priority then score
var fallback = matches
. OrderByDescending ( m = > m . PriorityScore )
. ThenByDescending ( m = > m . Score )
. First ( ) ;
_logger . LogDebug ( "No start date filter, using highest priority: {Title} ({Number}), Priority: {Priority}" , fallback . Item . Title , fallback . Item . CompanyNumber , fallback . PriorityScore ) ;
return ( fallback . Item , fallback . Score ) ;
}
/// <summary>
/// Calculates a priority score used to rank candidate companies.
/// Higher scores mean the candidate is more likely to be the main employer company.
/// </summary>
/// <param name="itemTitleLower">Candidate company title; callers are expected to pass it lower-cased.</param>
/// <param name="originalLower">Original claimed company name; callers are expected to pass it lower-cased.</param>
/// <param name="queryLower">Search query that produced the candidate; callers are expected to pass it lower-cased.</param>
/// <returns>
/// The sum of: -10 when the title contains a subsidiary indicator the search text did not mention,
/// +5 when the title contains a main-company indicator, and +3 when the title ends with " plc".
/// </returns>
private static int CalculateCompanyPriorityScore(string itemTitleLower, string originalLower, string queryLower)
{
    const int SubsidiaryPenalty = 10;
    const int MainCompanyBoost = 5;
    const int PlcBoost = 3;

    var score = 0;

    // The search "explicitly mentions" an indicator if it appears in either the original name or the query.
    var searchText = originalLower + " " + queryLower;

    // Only the first subsidiary indicator found in the title is considered (at most one penalty).
    foreach (var indicator in SubsidiaryIndicators)
    {
        if (!itemTitleLower.Contains(indicator, StringComparison.Ordinal))
        {
            continue;
        }

        // No penalty when the search itself asked for this kind of entity.
        if (!searchText.Contains(indicator, StringComparison.Ordinal))
        {
            score -= SubsidiaryPenalty;
        }

        break;
    }

    // At most one boost, regardless of how many main-company indicators match.
    foreach (var indicator in MainCompanyIndicators)
    {
        if (itemTitleLower.Contains(indicator, StringComparison.Ordinal))
        {
            score += MainCompanyBoost;
            break;
        }
    }

    // Slight boost for PLC (usually the parent/main company).
    // Ordinal comparison: the suffix is an identifier, not linguistic text.
    if (itemTitleLower.EndsWith(" plc", StringComparison.Ordinal))
    {
        score += PlcBoost;
    }

    return score;
}
2026-01-20 20:00:24 +01:00
/// <summary>
/// Inserts or refreshes the <c>CompanyCache</c> row for the given search result.
/// </summary>
/// <param name="item">Search result supplying the company number, name, status, type and dates.</param>
/// <param name="details">Optional company details; when present they supply SIC codes and the last-accounts type.</param>
/// <remarks>
/// A <see cref="DbUpdateException"/> whose inner exception message mentions "PK_CompanyCache"
/// is treated as a concurrent insert of the same company and is logged at debug level instead of propagating.
/// </remarks>
private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)
{
    try
    {
        await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

        var cachedRow = await dbContext.CompanyCache
            .FirstOrDefaultAsync(c => c.CompanyNumber == item.CompanyNumber);

        // Details take precedence over the search item for SIC codes.
        var sicCodes = details?.SicCodes ?? item.SicCodes;
        var sicCodesJson = sicCodes != null ? JsonSerializer.Serialize(sicCodes) : null;
        var accountsCategory = details?.Accounts?.LastAccounts?.Type;

        // Values shared by the insert and update paths.
        var status = item.CompanyStatus ?? "Unknown";
        var incorporationDate = DateHelpers.ParseDate(item.DateOfCreation);
        var dissolutionDate = DateHelpers.ParseDate(item.DateOfCessation);
        var cachedAt = DateTime.UtcNow;

        if (cachedRow is null)
        {
            dbContext.CompanyCache.Add(new CompanyCache
            {
                CompanyNumber = item.CompanyNumber,
                CompanyName = item.Title,
                Status = status,
                CompanyType = item.CompanyType,
                IncorporationDate = incorporationDate,
                DissolutionDate = dissolutionDate,
                AccountsCategory = accountsCategory,
                SicCodesJson = sicCodesJson,
                CachedAt = cachedAt
            });
        }
        else
        {
            cachedRow.CompanyName = item.Title;
            cachedRow.Status = status;
            cachedRow.CompanyType = item.CompanyType;
            cachedRow.IncorporationDate = incorporationDate;
            cachedRow.DissolutionDate = dissolutionDate;
            cachedRow.AccountsCategory = accountsCategory;
            cachedRow.SicCodesJson = sicCodesJson;
            cachedRow.CachedAt = cachedAt;
        }

        await dbContext.SaveChangesAsync();
    }
    catch (DbUpdateException ex) when (ex.InnerException?.Message.Contains("PK_CompanyCache") == true)
    {
        // Race condition: another task already cached this company - ignore
        _logger.LogDebug("Company {CompanyNumber} already cached by another task", item.CompanyNumber);
    }
}
2026-01-20 20:00:24 +01:00
/// <summary>
/// Builds a verified result from a cached company record and runs the verification checks against it.
/// </summary>
/// <param name="cached">Cached company record to report on.</param>
/// <param name="claimedCompany">Company name as claimed; fuzzy-compared (upper-cased) with the cached name.</param>
/// <param name="startDate">Claimed employment start date, if any.</param>
/// <param name="endDate">Claimed employment end date, if any.</param>
/// <param name="jobTitle">Claimed job title, if any.</param>
/// <param name="flags">Flag list that the checks append to; the same instance is attached to the result.</param>
/// <returns>A result with <c>IsVerified = true</c> populated from the cached record.</returns>
private CompanyVerificationResult CreateResultFromCache(
    CompanyCache cached,
    string claimedCompany,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    List<CompanyVerificationFlag> flags)
{
    var matchScore = Fuzz.TokenSetRatio(
        claimedCompany.ToUpperInvariant(),
        cached.CompanyName.ToUpperInvariant());

    List<string>? sicCodes = null;
    if (!string.IsNullOrEmpty(cached.SicCodesJson))
    {
        try
        {
            sicCodes = JsonSerializer.Deserialize<List<string>>(cached.SicCodesJson);
        }
        catch (JsonException ex)
        {
            // Malformed cache content must not fail verification, but it should be visible to operators.
            _logger.LogWarning(
                ex,
                "Ignoring malformed SIC codes JSON in cache for company {CompanyNumber}",
                cached.CompanyNumber);
        }
    }

    // Run all verification checks; each one may append to flags.
    CheckIncorporationDate(flags, startDate, cached.IncorporationDate, cached.CompanyName);
    CheckDissolutionDate(flags, endDate, cached.DissolutionDate, cached.Status, cached.CompanyName);
    CheckDormantCompany(flags, cached.AccountsCategory, jobTitle, cached.CompanyName);
    CheckCompanySizeVsRole(flags, cached.AccountsCategory, jobTitle, cached.CompanyName);

    var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, cached.CompanyType);
    if (jobPlausible == false)
    {
        flags.Add(new CompanyVerificationFlag
        {
            Type = "ImplausibleJobTitle",
            Severity = "Critical",
            Message = jobNotes ?? "Job title requires verification",
            ScoreImpact = -15
        });
    }

    return new CompanyVerificationResult
    {
        ClaimedCompany = claimedCompany,
        MatchedCompanyName = cached.CompanyName,
        MatchedCompanyNumber = cached.CompanyNumber,
        MatchScore = matchScore,
        IsVerified = true,
        VerificationNotes = null,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        CompanyType = cached.CompanyType,
        CompanyStatus = cached.Status,
        IncorporationDate = cached.IncorporationDate,
        DissolutionDate = cached.DissolutionDate,
        AccountsCategory = cached.AccountsCategory,
        SicCodes = sicCodes,
        ClaimedJobTitle = jobTitle,
        JobTitlePlausible = jobPlausible,
        JobTitleNotes = jobNotes,
        Flags = flags
    };
}
/// <summary>
/// Builds the result returned when no company could be matched.
/// </summary>
/// <param name="companyName">Company name as claimed.</param>
/// <param name="startDate">Claimed employment start date, if any.</param>
/// <param name="endDate">Claimed employment end date, if any.</param>
/// <param name="jobTitle">Claimed job title, if any.</param>
/// <param name="reason">Explanation stored in <c>VerificationNotes</c>.</param>
/// <returns>A result with <c>IsVerified = false</c>, a match score of 0 and no matched company.</returns>
private static CompanyVerificationResult CreateUnverifiedResult(
    string companyName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    string reason) =>
    new CompanyVerificationResult
    {
        ClaimedCompany = companyName,
        MatchedCompanyName = null,
        MatchedCompanyNumber = null,
        MatchScore = 0,
        IsVerified = false,
        VerificationNotes = reason,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        ClaimedJobTitle = jobTitle
    };
2026-01-20 21:32:02 +01:00
/// <summary>
/// Generates alternative search queries to find companies that may be registered
/// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd").
/// Also handles "Brand (Parent Company)" and "Name1/Name2" formats, and tries the first word
/// of a multi-word name as a potential parent company.
/// </summary>
/// <param name="companyName">Company name as claimed; surrounding whitespace is trimmed.</param>
/// <returns>
/// Distinct queries (case-insensitive) in priority order: parent company from a trailing parenthetical,
/// then the brand part, then each "/"-separated part, then the first word, then the full name.
/// </returns>
private static List<string> GenerateSearchQueries(string companyName)
{
    // A list preserves priority order; the set only de-duplicates.
    // (Enumeration order of a HashSet is unspecified, so it cannot carry the priority on its own.)
    var orderedQueries = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    void AddQueries(IEnumerable<string> candidates)
    {
        foreach (var candidate in candidates)
        {
            if (seen.Add(candidate))
            {
                orderedQueries.Add(candidate);
            }
        }
    }

    var normalized = companyName.Trim();

    // Step 0a: "Brand (Parent Company)" - the parent is more likely to be the registered name, so it goes first.
    var parentMatch = System.Text.RegularExpressions.Regex.Match(normalized, @"\(([^)]+)\)\s*$");
    if (parentMatch.Success)
    {
        var parentCompany = parentMatch.Groups[1].Value.Trim();
        AddQueries(GenerateNameVariations(parentCompany));

        // Also try the brand name without the parenthetical.
        var brandName = normalized[..parentMatch.Index].Trim();
        if (brandName.Length >= 3)
        {
            AddQueries(GenerateNameVariations(brandName));
        }
    }

    // Step 0b: "Name1/Name2" (e.g., "ASDA/WALMART") - each part may be a separately registered name.
    if (normalized.Contains('/'))
    {
        var parts = normalized.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        foreach (var part in parts)
        {
            if (part.Length >= 3)
            {
                AddQueries(GenerateNameVariations(part));
            }
        }
    }

    // Step 0c: first word as potential parent company (e.g., "UNILEVER BESTFOOD" -> "UNILEVER").
    var words = normalized.Split(' ', StringSplitOptions.RemoveEmptyEntries);
    if (words.Length >= 2)
    {
        var firstWord = words[0];

        // Only try the first word if it is substantial (not "The", "A", common prefixes).
        var skipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            { "the", "a", "an", "uk", "british", "national", "international", "global", "new" };

        if (firstWord.Length >= 4 && !skipWords.Contains(firstWord))
        {
            AddQueries(GenerateNameVariations(firstWord));

            // Also try first word + PLC/Limited for major corporations.
            AddQueries(new[] { firstWord + " PLC", firstWord + " Limited" });
        }
    }

    // Finally, variations of the full original name.
    AddQueries(GenerateNameVariations(normalized));

    return orderedQueries;
}
/// <summary>
/// Generates name variations for a single company name
/// (UK/U.K., Ltd/Limited, PLC/Public Limited Company, and the core name without a legal suffix).
/// </summary>
/// <param name="name">Company name to expand.</param>
/// <returns>
/// Distinct variations (case-insensitive) in the order they were generated; the first entry is
/// <paramref name="name"/> itself.
/// </returns>
private static List<string> GenerateNameVariations(string name)
{
    const StringComparison IgnoreCase = StringComparison.OrdinalIgnoreCase;
    const string LtdSuffix = " Ltd";
    const string LimitedSuffix = " Limited";
    const string PlcSuffix = " PLC";
    const string PublicLimitedCompanySuffix = " Public Limited Company";

    // List keeps generation order deterministic; the set only de-duplicates.
    var variations = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    void AddVariation(string candidate)
    {
        if (seen.Add(candidate))
        {
            variations.Add(candidate);
        }
    }

    // Rewrites " UK" as " U.K." only when UK is a whole word, so "Ukraine" is left alone.
    static string WithDottedUk(string value) =>
        System.Text.RegularExpressions.Regex.Replace(
            value,
            @"(?<= )UK(?![\p{L}\p{N}])",
            "U.K.",
            System.Text.RegularExpressions.RegexOptions.IgnoreCase
                | System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    AddVariation(name);

    // Step 1: Generate UK/U.K. variations
    var ukVariants = new List<string> { name };

    var withDots = WithDottedUk(name);
    if (withDots != name)
    {
        ukVariants.Add(withDots);
    }

    if (name.Contains(" U.K.", IgnoreCase))
    {
        var withoutDots = name.Replace(" U.K.", " UK", IgnoreCase);
        if (withoutDots != name)
        {
            ukVariants.Add(withoutDots);
        }
    }

    // Step 2: For each UK variant, swap the legal suffix for its long/short form.
    // Suffix lengths come from the constants so the slice can never be off by one.
    foreach (var variant in ukVariants)
    {
        AddVariation(variant);

        if (variant.EndsWith(LtdSuffix, IgnoreCase))
        {
            AddVariation(variant[..^LtdSuffix.Length] + LimitedSuffix);
        }
        else if (variant.EndsWith(LimitedSuffix, IgnoreCase))
        {
            AddVariation(variant[..^LimitedSuffix.Length] + LtdSuffix);
        }
        else if (variant.EndsWith(PlcSuffix, IgnoreCase))
        {
            // Case-insensitive, so this also covers "Plc" and "plc".
            AddVariation(variant[..^PlcSuffix.Length] + PublicLimitedCompanySuffix);
        }
        else if (variant.EndsWith(PublicLimitedCompanySuffix, IgnoreCase))
        {
            AddVariation(variant[..^PublicLimitedCompanySuffix.Length] + PlcSuffix);
        }
    }

    // Step 3: Try core name without suffix (first matching suffix only).
    var suffixesToRemove = new[] { LtdSuffix, LimitedSuffix, PlcSuffix, " LLP", " Inc", " Corporation", " Corp" };
    var coreName = name;
    foreach (var suffix in suffixesToRemove)
    {
        if (coreName.EndsWith(suffix, IgnoreCase))
        {
            coreName = coreName[..^suffix.Length].Trim();
            break;
        }
    }

    if (coreName != name && coreName.Length >= 3)
    {
        AddVariation(coreName);
        AddVariation(coreName + LimitedSuffix);
        AddVariation(coreName + PlcSuffix);

        // Also add U.K. variant of core name if applicable
        var coreWithDots = WithDottedUk(coreName);
        if (coreWithDots != coreName)
        {
            AddVariation(coreWithDots);
            AddVariation(coreWithDots + LimitedSuffix);
        }
    }

    return variations;
}
Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
(clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
(now only shows points on first occurrence, subsequent rows show 0)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:33:16 +01:00
/// <summary>
/// Works out which non-employment entity categories the search itself is asking for.
/// </summary>
/// <param name="originalLower">Original claimed company name; callers are expected to pass it lower-cased.</param>
/// <param name="queryLower">Search query; callers are expected to pass it lower-cased.</param>
/// <returns>
/// The category names (e.g., "Clubs", "Trusts") that have at least one pattern occurring as a
/// substring of the combined search text; these categories should NOT be filtered out.
/// </returns>
private static HashSet<string> GetSearchEntityTypes(string originalLower, string queryLower)
{
    var combinedSearchText = originalLower + " " + queryLower;

    return NonEmploymentEntityPatterns
        .Where(entry => entry.Value.Any(pattern => combinedSearchText.Contains(pattern)))
        .Select(entry => entry.Key)
        .ToHashSet();
}
/// <summary>
/// Decides whether a company title looks like a genuine employer.
/// </summary>
/// <param name="itemTitleLower">Candidate company title; callers are expected to pass it lower-cased.</param>
/// <param name="allowedCategories">Categories the search explicitly targets; these are exempt from filtering.</param>
/// <returns>
/// <c>false</c> when the title contains a pattern from any non-employment category that is not in
/// <paramref name="allowedCategories"/>; otherwise <c>true</c>.
/// </returns>
private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
{
    var hitsDisallowedCategory = NonEmploymentEntityPatterns.Any(entry =>
        !allowedCategories.Contains(entry.Key)
        && entry.Value.Any(pattern => itemTitleLower.Contains(pattern)));

    return !hitsDisallowedCategory;
}
// Skip words for core identifier extraction (matched case-insensitively).
// These words are too common to be meaningful differentiators between companies.
// Each word is listed exactly once, under the first category it fits.
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
{
    // Articles and conjunctions
    "the", "a", "an", "and", "or", "of", "for", "in", "at", "on", "by", "to", "with",

    // Geographic - Countries and regions
    "uk", "u.k.", "gb", "british", "britain", "england", "english", "scotland", "scottish",
    "wales", "welsh", "ireland", "irish", "northern",
    "europe", "european", "america", "american", "usa", "us", "u.s.", "u.s.a.",
    "canada", "canadian", "asia", "asian", "pacific", "atlantic",
    "australia", "australian", "africa", "african", "india", "indian",
    "france", "french", "germany", "german", "spain", "spanish", "italy", "italian",
    "japan", "japanese", "china", "chinese", "korea", "korean",
    "middle", "east", "west", "north", "south", "central", "western", "eastern",

    // Geographic - Cities
    "london", "manchester", "birmingham", "leeds", "glasgow", "edinburgh", "bristol",
    "liverpool", "sheffield", "newcastle", "cardiff", "belfast", "nottingham",
    "southampton", "portsmouth", "brighton", "leicester", "coventry", "hull",

    // Legal suffixes
    "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated", "corporation", "corp",
    "company", "co", "partners", "partnership", "enterprises", "unlimited",
    "registered", "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",

    // Business descriptors
    "group", "holdings", "holding", "parent", "subsidiary", "division", "branch",
    "services", "service", "solutions", "solution", "consulting", "consultants", "consultancy",
    "management", "systems", "system", "technologies", "technology", "tech",
    "industries", "industry", "industrial", "commercial", "trading", "trade",
    "business", "businesses", "operations", "operational", "professional", "professionals",
    "resources", "resource", "network", "networks", "associates", "associated",

    // Size/Scope descriptors
    "national", "international", "global", "worldwide", "world", "regional", "local",
    "universal", "general", "standard", "premier", "prime", "first", "one",

    // Quality/Marketing terms ("premier" is listed under Size/Scope)
    "new", "modern", "advanced", "innovative", "elite", "premium",
    "quality", "superior", "excellent", "best", "top", "leading", "major",

    // Ownership indicators
    "royal", "imperial", "crown", "state", "public", "private", "independent",
    "mutual", "cooperative", "coop", "community",

    // Time-related
    "century", "millennium", "annual", "year", "years",

    // Numbers as words ("one" and "first" are listed under Size/Scope)
    "two", "three", "four", "five", "second", "third"
};
2026-01-20 22:13:23 +01:00
/// <summary>
/// Extracts ALL core identifying words from a company name.
/// These are the words left after removing parenthetical content and anything in <c>SkipWords</c>.
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"],
/// "Bank of Scotland" -> ["BANK"] ("of" and "scotland" are skip words).
/// </summary>
/// <param name="companyName">Company name to analyse; null or whitespace yields an empty list.</param>
/// <returns>Upper-cased significant words in their original order (duplicates are kept).</returns>
private static List<string> ExtractCoreIdentifiers(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return new List<string>();
    }

    // Remove parenthetical content first
    var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();

    // Split into words and collect all significant words
    var words = cleanName.Split(new[] { ' ', '-', '/', '&' }, StringSplitOptions.RemoveEmptyEntries);
    var coreWords = new List<string>();

    foreach (var word in words)
    {
        var cleanWord = word.Trim('.', ',', '\'');
        if (cleanWord.Length < 2)
        {
            continue;
        }

        // Trimming turns "U.K." into "U.K", which would not match a skip word on its own.
        // Checking the dot-free form as well makes dotted abbreviations (U.K., U.S.A., P.L.C.) skip correctly.
        var withoutDots = cleanWord.Replace(".", string.Empty);
        if (SkipWords.Contains(cleanWord) || SkipWords.Contains(withoutDots))
        {
            continue;
        }

        coreWords.Add(cleanWord.ToUpperInvariant());
    }

    return coreWords;
}
2026-01-20 20:00:24 +01:00
#endregion
2026-01-18 19:20:50 +01:00
}