2026-01-20 20:00:24 +01:00
using System.Text.Json ;
2026-01-18 19:20:50 +01:00
using FuzzySharp ;
using Microsoft.EntityFrameworkCore ;
using Microsoft.Extensions.Logging ;
2026-01-22 10:43:45 +00:00
using RealCV.Application.Data ;
2026-01-21 15:07:20 +00:00
using RealCV.Application.DTOs ;
using RealCV.Application.Helpers ;
using RealCV.Application.Interfaces ;
using RealCV.Application.Models ;
using RealCV.Domain.Entities ;
using RealCV.Infrastructure.Data ;
using RealCV.Infrastructure.ExternalApis ;
2026-01-18 19:20:50 +01:00
2026-01-21 15:07:20 +00:00
namespace RealCV.Infrastructure.Services ;
2026-01-18 19:20:50 +01:00
public sealed class CompanyVerifierService : ICompanyVerifierService
{
// Minimum fuzzy score treated as a match (usage is outside the visible part of this file).
private const int FuzzyMatchThreshold = 85;

// Age limit, in days, used to compute the cut-off date for cached company lookups.
private const int CacheExpirationDays = 30;

// Collaborators supplied through the constructor.
private readonly ILogger<CompanyVerifierService> _logger;
private readonly ICompanyNameMatcherService _aiMatcher;
private readonly IDbContextFactory<ApplicationDbContext> _dbContextFactory;
private readonly CompaniesHouseClient _companiesHouseClient;
Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
(clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
(now only shows points on first occurrence, subsequent rows show 0)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:33:16 +01:00
// Name fragments, grouped by category, for entities that are registered at Companies House
// but are not typical employers.
// NOTE(review): how these fragments are matched is not visible here. If it is a substring
// test, "club" already covers every "... club" entry - confirm before pruning.
private static readonly Dictionary<string, string[]> NonEmploymentEntityPatterns = new()
{
    ["Clubs"] =
        ["club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados"],
    ["Associations"] =
        ["association", "society", "federation", "institute", "institution", "guild", "chamber of commerce"],
    ["Trusts"] =
        ["benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit", "superannuation", "provident"],
    ["Charities"] =
        ["charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic"],
    ["Investment"] =
        ["nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners"],
    ["Property"] =
        ["freehold", "leasehold", "property management", "residents association", "management company rtm", "commonhold"],
    ["Religious"] =
        ["church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry"],
    ["Sports"] =
        ["football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club"],
    ["Educational"] =
        ["old boys", "old girls", "alumni", "school association", "pta", "parent teacher"],
    ["Professional"] =
        ["chartered institute", "royal college", "professional body", "trade body", "regulatory body"],
};
// SIC codes marking a company as non-trading or unlikely to employ staff.
private static readonly HashSet<string> NonTradingSicCodes =
[
    "99999", // Dormant company
    "64209", // Activities of holding companies (shell companies)
    "68100", // Buying and selling of own real estate (often shell)
];
Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).
- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:41:27 +01:00
// Case-insensitive words suggesting a subsidiary rather than the main trading company.
// Someone who "worked for ASDA" most likely means ASDA STORES LIMITED, not
// ASDA DELIVERY LIMITED or ASDA PROPERTY HOLDINGS LIMITED.
// NOTE(review): "real estate" is a two-word entry; if callers look up single words it can
// never match - confirm against the lookup code.
private static readonly HashSet<string> SubsidiaryIndicators = new(
    [
        // Logistics / operations
        "delivery", "distribution", "logistics", "transport", "fleet", "haulage", "warehousing", "fulfilment",
        // Property
        "property", "properties", "estates", "land", "real estate", "developments",
        // Financial / holding
        "holdings", "holding", "investments", "capital", "finance", "financial", "treasury",
        // Administrative
        "nominees", "nominee", "trustees", "trustee", "secretarial", "registrars",
        // Insurance
        "insurance", "assurance", "underwriting",
        // Specific functions
        "leasing", "rentals", "procurement", "sourcing",
    ],
    StringComparer.OrdinalIgnoreCase);
// Case-insensitive words suggesting the main trading / employer company (preferred matches).
private static readonly HashSet<string> MainCompanyIndicators = new(
    [
        "stores", "retail", "supermarkets", "superstores", "hypermarkets",
        "manufacturing", "operations", "trading",
    ],
    StringComparer.OrdinalIgnoreCase);
2026-01-20 20:00:24 +01:00
2026-01-18 19:20:50 +01:00
/// <summary>
/// Creates the verifier and stores its collaborators as given (no validation is performed).
/// </summary>
/// <param name="companiesHouseClient">Client used for Companies House searches and lookups.</param>
/// <param name="dbContextFactory">Factory for the database contexts used by the company cache.</param>
/// <param name="aiMatcher">Semantic company-name matcher.</param>
/// <param name="logger">Logger for diagnostics.</param>
public CompanyVerifierService(
    CompaniesHouseClient companiesHouseClient,
    IDbContextFactory<ApplicationDbContext> dbContextFactory,
    ICompanyNameMatcherService aiMatcher,
    ILogger<CompanyVerifierService> logger)
    => (_companiesHouseClient, _dbContextFactory, _aiMatcher, _logger)
        = (companiesHouseClient, dbContextFactory, aiMatcher, logger);
/// <summary>
/// Verifies a claimed employer name against built-in employer lists, the local cache and
/// Companies House search results, and attaches plausibility flags to the result.
/// </summary>
/// <remarks>
/// The checks run in this order and the first one that applies returns:
/// 1. Public-sector and charity lookups: verified, score 100, no company number.
/// 2. Known division of a parent company: the parent is verified recursively; if that
///    fails, a result with score 85 is still returned as verified.
/// 3. Known historical employer: score 90; successor details are fetched when a company
///    number is known, and any failure while fetching them is logged and ignored.
/// 4. Cached match, used only when its incorporation date is unknown or not after the
///    claimed start date.
/// 5. Companies House search over every generated query. The 10 candidates with the
///    highest token-set ratio go to the AI matcher; the best fuzzy match is the fallback.
/// Only <c>CompaniesHouseRateLimitException</c> is caught around step 5; other exceptions
/// thrown there propagate to the caller.
/// </remarks>
/// <param name="companyName">Employer name as claimed; must not be null or whitespace.</param>
/// <param name="startDate">Claimed employment start date, if known.</param>
/// <param name="endDate">Claimed employment end date, or null.</param>
/// <param name="jobTitle">Claimed job title, used by the plausibility checks.</param>
/// <returns>The verification result, including any flags raised.</returns>
/// <exception cref="ArgumentException">When <paramref name="companyName"/> is null or whitespace.</exception>
public async Task < CompanyVerificationResult > VerifyCompanyAsync (
string companyName ,
DateOnly ? startDate ,
2026-01-20 20:00:24 +01:00
DateOnly ? endDate ,
string? jobTitle = null )
2026-01-18 19:20:50 +01:00
{
ArgumentException . ThrowIfNullOrWhiteSpace ( companyName ) ;
2026-01-22 10:43:45 +00:00
// Normalize company name - strip trailing punctuation that causes matching issues
var normalizedName = NormalizeCompanyName ( companyName ) ;
_logger . LogDebug ( "Verifying company: {CompanyName} (normalized: {NormalizedName})" , companyName , normalizedName ) ;
2026-01-20 20:00:24 +01:00
var flags = new List < CompanyVerificationFlag > ( ) ;
2026-01-18 19:20:50 +01:00
2026-01-22 10:43:45 +00:00
// Check 1a: Is this a public sector employer?
if ( UKHistoricalEmployers . IsPublicSectorEmployer ( normalizedName ) )
{
_logger . LogInformation ( "Recognised public sector employer: {CompanyName}" , companyName ) ;
return new CompanyVerificationResult
{
ClaimedCompany = companyName ,
MatchedCompanyName = companyName ,
MatchedCompanyNumber = null ,
MatchScore = 100 ,
IsVerified = true ,
VerificationNotes = "Public sector employer - not registered at Companies House" ,
ClaimedStartDate = startDate ,
ClaimedEndDate = endDate ,
CompanyType = "public-sector" ,
CompanyStatus = "active" ,
ClaimedJobTitle = jobTitle ,
Flags = flags
} ;
}
// Check 1b: Is this a charity or non-profit organisation?
if ( UKHistoricalEmployers . IsCharityEmployer ( normalizedName ) )
{
_logger . LogInformation ( "Recognised charity employer: {CompanyName}" , companyName ) ;
return new CompanyVerificationResult
{
ClaimedCompany = companyName ,
MatchedCompanyName = companyName ,
MatchedCompanyNumber = null ,
MatchScore = 100 ,
IsVerified = true ,
VerificationNotes = "Charity/non-profit organisation" ,
ClaimedStartDate = startDate ,
ClaimedEndDate = endDate ,
CompanyType = "charity" ,
CompanyStatus = "active" ,
ClaimedJobTitle = jobTitle ,
Flags = flags
} ;
}
// Check 2: Is this an internal division of a larger company?
var parentCompany = UKHistoricalEmployers . GetParentCompanyForDivision ( normalizedName ) ;
if ( parentCompany ! = null )
{
_logger . LogInformation ( "Recognised division '{CompanyName}' of parent company '{ParentCompany}'" , companyName , parentCompany ) ;
// Try to verify the parent company instead
// NOTE(review): this recursion only terminates if the parent name is not itself mapped
// to a parent that leads back here - confirm against UKHistoricalEmployers.
var parentResult = await VerifyCompanyAsync ( parentCompany , startDate , endDate , jobTitle ) ;
if ( parentResult . IsVerified )
{
return parentResult with
{
ClaimedCompany = companyName ,
VerificationNotes = $"Internal division of {parentResult.MatchedCompanyName}"
} ;
}
// If parent verification failed, return a partial match
// Note that this fallback still reports IsVerified = true, with a fixed score of 85.
return new CompanyVerificationResult
{
ClaimedCompany = companyName ,
MatchedCompanyName = parentCompany ,
MatchedCompanyNumber = null ,
MatchScore = 85 ,
IsVerified = true ,
VerificationNotes = $"Recognised as division of {parentCompany}" ,
ClaimedStartDate = startDate ,
ClaimedEndDate = endDate ,
ClaimedJobTitle = jobTitle ,
Flags = flags
} ;
}
// Check 3: Is this a known historical employer?
var historicalInfo = UKHistoricalEmployers . GetHistoricalEmployerInfo ( normalizedName ) ;
if ( historicalInfo ! = null )
{
_logger . LogInformation ( "Recognised historical employer: {CompanyName} -> {Successor}" , companyName , historicalInfo . SuccessorName ) ;
// If we have a company number for the successor, try to get current details
if ( ! string . IsNullOrEmpty ( historicalInfo . CompanyNumber ) )
{
try
{
var successorDetails = await _companiesHouseClient . GetCompanyAsync ( historicalInfo . CompanyNumber ) ;
if ( successorDetails ! = null )
{
return new CompanyVerificationResult
{
ClaimedCompany = companyName ,
MatchedCompanyName = $"{companyName} (now {successorDetails.CompanyName})" ,
MatchedCompanyNumber = historicalInfo . CompanyNumber ,
MatchScore = 90 ,
IsVerified = true ,
VerificationNotes = $"Historical company. {historicalInfo.Notes}" ,
ClaimedStartDate = startDate ,
ClaimedEndDate = endDate ,
CompanyType = successorDetails . Type ,
CompanyStatus = "historical" ,
ClaimedJobTitle = jobTitle ,
Flags = flags
} ;
}
}
// Best effort: every exception type is caught here, so any lookup failure falls
// through to the result without successor details below.
catch ( Exception ex )
{
_logger . LogWarning ( ex , "Failed to fetch successor company details for {CompanyNumber}" , historicalInfo . CompanyNumber ) ;
}
}
// Return historical match without successor details
return new CompanyVerificationResult
{
ClaimedCompany = companyName ,
MatchedCompanyName = $"{companyName} (now {historicalInfo.SuccessorName})" ,
MatchedCompanyNumber = historicalInfo . CompanyNumber ,
MatchScore = 90 ,
IsVerified = true ,
VerificationNotes = $"Historical company. {historicalInfo.Notes}" ,
ClaimedStartDate = startDate ,
ClaimedEndDate = endDate ,
CompanyStatus = "historical" ,
ClaimedJobTitle = jobTitle ,
Flags = flags
} ;
}
2026-01-20 21:17:11 +01:00
// Try to find a cached match first (but only if it existed at claimed start date)
2026-01-22 10:43:45 +00:00
var cachedMatch = await FindCachedMatchAsync ( normalizedName ) ;
2026-01-18 19:20:50 +01:00
if ( cachedMatch is not null )
{
2026-01-20 21:17:11 +01:00
// Check if cached company existed at the claimed start date
// A missing start date or a missing incorporation date both count as valid.
var cacheValid = ! startDate . HasValue | |
cachedMatch . IncorporationDate = = null | |
cachedMatch . IncorporationDate < = startDate . Value ;
if ( cacheValid )
{
_logger . LogDebug ( "Found cached company match for: {CompanyName}" , companyName ) ;
return CreateResultFromCache ( cachedMatch , companyName , startDate , endDate , jobTitle , flags ) ;
}
else
{
_logger . LogDebug ( "Cached company {CachedName} was incorporated after claimed start date, searching for alternatives" , cachedMatch . CompanyName ) ;
}
2026-01-18 19:20:50 +01:00
}
2026-01-20 21:32:02 +01:00
// Search Companies House with fallback queries
2026-01-18 19:20:50 +01:00
try
{
2026-01-22 10:43:45 +00:00
var searchQueries = GenerateSearchQueries ( normalizedName ) ;
2026-01-20 21:32:02 +01:00
_logger . LogDebug ( "Generated {Count} search queries for '{CompanyName}': {Queries}" ,
2026-01-22 10:43:45 +00:00
searchQueries . Count , normalizedName , string . Join ( ", " , searchQueries . Select ( q = > $"'{q}'" ) ) ) ;
2026-01-21 00:51:24 +01:00
// Collect all candidates from all search queries for AI matching
// allCandidates is keyed by company number, so a company returned by several queries is kept once.
var allCandidates = new Dictionary < string , CompaniesHouseSearchItem > ( ) ;
var fuzzyMatches = new List < ( CompaniesHouseSearchItem Item , int Score ) > ( ) ;
2026-01-18 19:20:50 +01:00
2026-01-20 21:32:02 +01:00
foreach ( var query in searchQueries )
2026-01-18 19:20:50 +01:00
{
2026-01-20 21:32:02 +01:00
_logger . LogDebug ( "Searching Companies House with query: {Query}" , query ) ;
var searchResponse = await _companiesHouseClient . SearchCompaniesAsync ( query ) ;
if ( searchResponse ? . Items is null | | searchResponse . Items . Count = = 0 )
{
continue ;
}
2026-01-18 19:20:50 +01:00
2026-01-21 00:51:24 +01:00
// Collect unique candidates
foreach ( var item in searchResponse . Items )
{
if ( ! string . IsNullOrWhiteSpace ( item . CompanyNumber ) & &
! allCandidates . ContainsKey ( item . CompanyNumber ) )
{
allCandidates [ item . CompanyNumber ] = item ;
}
}
2026-01-20 21:32:02 +01:00
2026-01-21 00:51:24 +01:00
// Find fuzzy matches (as before) for fallback
2026-01-22 10:43:45 +00:00
var fuzzyMatch = FindBestMatch ( normalizedName , query , searchResponse . Items , startDate ) ;
2026-01-21 00:51:24 +01:00
if ( fuzzyMatch is not null )
2026-01-20 21:32:02 +01:00
{
2026-01-21 00:51:24 +01:00
fuzzyMatches . Add ( fuzzyMatch . Value ) ;
2026-01-20 21:32:02 +01:00
}
}
2026-01-18 19:20:50 +01:00
2026-01-21 00:51:24 +01:00
if ( allCandidates . Count = = 0 )
2026-01-18 19:20:50 +01:00
{
2026-01-22 10:43:45 +00:00
_logger . LogDebug ( "No candidates found for: {CompanyName} after trying {Count} queries" , normalizedName , searchQueries . Count ) ;
2026-01-20 20:00:24 +01:00
return CreateUnverifiedResult ( companyName , startDate , endDate , jobTitle ,
2026-01-20 21:04:30 +01:00
"Company name could not be verified against official records" ) ;
2026-01-18 19:20:50 +01:00
}
2026-01-21 00:51:24 +01:00
// Use AI to find the best semantic match from all candidates
2026-01-22 10:43:45 +00:00
_logger . LogDebug ( "Using AI to match '{CompanyName}' against {Count} candidates" , normalizedName , allCandidates . Count ) ;
2026-01-21 00:51:24 +01:00
2026-01-22 10:43:45 +00:00
// Sort candidates by fuzzy relevance to the search term before taking top 10
// This ensures the most likely matches are sent to the AI, not just arbitrary entries
var normalizedUpper = normalizedName . ToUpperInvariant ( ) ;
2026-01-21 00:51:24 +01:00
var candidatesForAI = allCandidates . Values
2026-01-22 10:43:45 +00:00
. Select ( c = > new
{
Item = c ,
Score = Fuzz . TokenSetRatio ( normalizedUpper , c . Title . ToUpperInvariant ( ) )
} )
. OrderByDescending ( x = > x . Score )
. Take ( 10 )
. Select ( x = > new CompanyCandidate
2026-01-21 00:51:24 +01:00
{
2026-01-22 10:43:45 +00:00
CompanyName = x . Item . Title ,
CompanyNumber = x . Item . CompanyNumber ,
CompanyStatus = x . Item . CompanyStatus ,
DateOfCreation = x . Item . DateOfCreation
2026-01-21 00:51:24 +01:00
} )
. ToList ( ) ;
2026-01-22 10:43:45 +00:00
_logger . LogDebug ( "Top candidates for AI matching (sorted by relevance): {Candidates}" ,
string . Join ( ", " , candidatesForAI . Select ( c = > $"{c.CompanyName} [{c.CompanyNumber}]" ) ) ) ;
var aiResult = await _aiMatcher . FindBestMatchAsync ( normalizedName , candidatesForAI ) ;
2026-01-21 00:51:24 +01:00
CompaniesHouseSearchItem ? matchedItem = null ;
int matchScore ;
2026-01-22 10:43:45 +00:00
// Get best fuzzy match for potential fallback
var bestFuzzy = fuzzyMatches . Count > 0
? fuzzyMatches . OrderByDescending ( m = > m . Score ) . First ( )
: ( ( CompaniesHouseSearchItem Item , int Score ) ? ) null ;
2026-01-21 00:51:24 +01:00
if ( aiResult is not null & & aiResult . IsMatch )
{
// AI found a valid match
// GetValueOrDefault yields null when the returned number is not among the candidates;
// that case is handled by the matchedItem null check further down.
matchedItem = allCandidates . GetValueOrDefault ( aiResult . CandidateCompanyNumber ) ;
matchScore = aiResult . ConfidenceScore ;
_logger . LogInformation (
"AI matched '{ClaimedName}' to '{MatchedName}' with {Score}% confidence. Reasoning: {Reasoning}" ,
companyName , aiResult . CandidateCompanyName , aiResult . ConfidenceScore , aiResult . Reasoning ) ;
}
else if ( fuzzyMatches . Count > 0 )
{
// AI didn't find a match - check if it explicitly rejected or just failed
if ( aiResult ? . MatchType = = "NoMatch" )
{
2026-01-22 10:43:45 +00:00
// AI explicitly rejected. Only override if fuzzy match passes strict validation:
// 1. High fuzzy score (>= 90%)
// 2. ALL core identifying words from original name appear in the match
// 3. Match doesn't have significantly more core words (prevents partial word matches)
if ( bestFuzzy . HasValue & & bestFuzzy . Value . Score > = 90 )
{
var originalCores = ExtractCoreIdentifiers ( normalizedName ) ;
var matchCores = ExtractCoreIdentifiers ( bestFuzzy . Value . Item . Title ) ;
// All original core words must appear in the match
// (this is a case-insensitive substring test against the matched title)
var allCoresPresent = originalCores . Count = = 0 | |
originalCores . All ( c = > bestFuzzy . Value . Item . Title . Contains ( c , StringComparison . OrdinalIgnoreCase ) ) ;
// Match shouldn't have too many extra core words (max 2 extra, e.g., "GROUP PLC")
var extraCores = matchCores . Count ( c = > ! originalCores . Any ( o = >
c . Equals ( o , StringComparison . OrdinalIgnoreCase ) ) ) ;
var reasonableExtras = extraCores < = 2 ;
if ( allCoresPresent & & reasonableExtras )
{
_logger . LogInformation (
"AI rejected '{CompanyName}' but fuzzy match '{MatchedName}' ({Score}%) passes validation. " +
"Original cores: [{OriginalCores}], Match cores: [{MatchCores}]" ,
normalizedName , bestFuzzy . Value . Item . Title , bestFuzzy . Value . Score ,
string . Join ( ", " , originalCores ) , string . Join ( ", " , matchCores ) ) ;
matchedItem = bestFuzzy . Value . Item ;
matchScore = bestFuzzy . Value . Score ;
}
else
{
_logger . LogDebug (
"AI rejected '{CompanyName}' and fuzzy match '{MatchedName}' fails validation. " +
"AllCoresPresent: {AllCores}, ExtraCores: {Extra}" ,
normalizedName , bestFuzzy . Value . Item . Title , allCoresPresent , extraCores ) ;
return CreateUnverifiedResult ( companyName , startDate , endDate , jobTitle ,
"Company name could not be verified - no matching company found in official records" ) ;
}
}
else
{
_logger . LogDebug ( "AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}" ,
normalizedName , aiResult ? . Reasoning ? ? "No match found" ) ;
return CreateUnverifiedResult ( companyName , startDate , endDate , jobTitle ,
"Company name could not be verified - no matching company found in official records" ) ;
}
}
else
{
// AI failed (API error, etc.) - fall back to fuzzy matching
// bestFuzzy is non-null here because this branch requires fuzzyMatches.Count > 0.
_logger . LogWarning ( "AI matching failed for '{CompanyName}', falling back to fuzzy matching" , normalizedName ) ;
matchedItem = bestFuzzy ! . Value . Item ;
matchScore = bestFuzzy ! . Value . Score ;
2026-01-21 00:51:24 +01:00
}
}
else
{
2026-01-22 10:43:45 +00:00
_logger . LogDebug ( "No valid match found for: {CompanyName}" , normalizedName ) ;
2026-01-21 00:51:24 +01:00
return CreateUnverifiedResult ( companyName , startDate , endDate , jobTitle ,
"Company name could not be verified against official records" ) ;
}
if ( matchedItem is null )
{
_logger . LogDebug ( "No valid match found for: {CompanyName}" , companyName ) ;
return CreateUnverifiedResult ( companyName , startDate , endDate , jobTitle ,
"Company name could not be verified against official records" ) ;
}
var match = ( Item : matchedItem , Score : matchScore ) ;
2026-01-20 20:00:24 +01:00
// Fetch full company details for additional data
var companyDetails = await _companiesHouseClient . GetCompanyAsync ( match . Item . CompanyNumber ) ;
// Cache the matched company with full details
await CacheCompanyAsync ( match . Item , companyDetails ) ;
2026-01-18 19:20:50 +01:00
_logger . LogInformation (
"Verified company {ClaimedName} matched to {MatchedName} with score {Score}%" ,
companyName , match . Item . Title , match . Score ) ;
2026-01-20 20:00:24 +01:00
// Run all verification checks
// Dates, status and type come from the search item; SIC codes and the accounts
// category prefer the full company details when those were returned.
var incorporationDate = DateHelpers . ParseDate ( match . Item . DateOfCreation ) ;
var dissolutionDate = DateHelpers . ParseDate ( match . Item . DateOfCessation ) ;
var companyStatus = match . Item . CompanyStatus ;
var companyType = match . Item . CompanyType ;
var sicCodes = companyDetails ? . SicCodes ? ? match . Item . SicCodes ;
var accountsCategory = companyDetails ? . Accounts ? . LastAccounts ? . Type ;
// Check 1: Employment before company incorporation
CheckIncorporationDate ( flags , startDate , incorporationDate , match . Item . Title ) ;
// Check 2: Employment at dissolved company
CheckDissolutionDate ( flags , endDate , dissolutionDate , companyStatus , match . Item . Title ) ;
// Check 3: Dormant company check
CheckDormantCompany ( flags , accountsCategory , jobTitle , match . Item . Title ) ;
// Check 4: Company size vs job title
CheckCompanySizeVsRole ( flags , accountsCategory , jobTitle , match . Item . Title ) ;
2026-01-20 23:07:36 +01:00
// Check 5: Job title plausibility for PLCs
2026-01-20 20:00:24 +01:00
var ( jobPlausible , jobNotes ) = CheckJobTitlePlausibility ( jobTitle , companyType ) ;
if ( jobPlausible = = false )
{
flags . Add ( new CompanyVerificationFlag
{
Type = "ImplausibleJobTitle" ,
Severity = "Critical" ,
Message = jobNotes ? ? "Job title requires verification" ,
ScoreImpact = - 15
} ) ;
}
2026-01-18 19:20:50 +01:00
// Flags do not change IsVerified: the match is reported as verified and the flags ride along.
return new CompanyVerificationResult
{
ClaimedCompany = companyName ,
MatchedCompanyName = match . Item . Title ,
MatchedCompanyNumber = match . Item . CompanyNumber ,
MatchScore = match . Score ,
IsVerified = true ,
2026-01-20 20:58:12 +01:00
VerificationNotes = null ,
2026-01-18 19:20:50 +01:00
ClaimedStartDate = startDate ,
2026-01-20 20:00:24 +01:00
ClaimedEndDate = endDate ,
CompanyType = companyType ,
CompanyStatus = companyStatus ,
IncorporationDate = incorporationDate ,
DissolutionDate = dissolutionDate ,
AccountsCategory = accountsCategory ,
SicCodes = sicCodes ,
ClaimedJobTitle = jobTitle ,
JobTitlePlausible = jobPlausible ,
JobTitleNotes = jobNotes ,
Flags = flags
2026-01-18 19:20:50 +01:00
} ;
}
// Only rate limiting is handled here; any other exception from the search, the AI
// matcher or the cache write propagates to the caller.
catch ( CompaniesHouseRateLimitException ex )
{
_logger . LogWarning ( ex , "Rate limit hit while verifying company: {CompanyName}" , companyName ) ;
2026-01-20 20:00:24 +01:00
return CreateUnverifiedResult ( companyName , startDate , endDate , jobTitle ,
2026-01-18 19:20:50 +01:00
"Verification temporarily unavailable due to rate limiting" ) ;
}
}
/// <summary>
/// Runs a Companies House search and maps each hit to a <see cref="CompanySearchResult"/>.
/// </summary>
/// <param name="query">Search text; must not be null or whitespace.</param>
/// <returns>The mapped hits, or an empty list when the response carries no items.</returns>
/// <exception cref="ArgumentException">When <paramref name="query"/> is null or whitespace.</exception>
public async Task<List<CompanySearchResult>> SearchCompaniesAsync(string query)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(query);
    _logger.LogDebug("Searching companies for query: {Query}", query);

    var hits = (await _companiesHouseClient.SearchCompaniesAsync(query))?.Items;
    if (hits is null)
    {
        return [];
    }

    var results = new List<CompanySearchResult>();
    foreach (var hit in hits)
    {
        results.Add(new CompanySearchResult
        {
            CompanyNumber = hit.CompanyNumber,
            CompanyName = hit.Title,
            // A missing status is surfaced as the literal "Unknown".
            CompanyStatus = hit.CompanyStatus ?? "Unknown",
            IncorporationDate = DateHelpers.ParseDate(hit.DateOfCreation),
            AddressSnippet = hit.AddressSnippet
        });
    }

    return results;
}
2026-01-20 20:00:24 +01:00
/// <summary>
/// Checks whether the candidate appears among a company's officers in a director or
/// secretary role, optionally requiring the appointment to overlap the claimed dates.
/// </summary>
/// <param name="companyNumber">Companies House number of the company to inspect.</param>
/// <param name="candidateName">Candidate's name as claimed.</param>
/// <param name="startDate">Claimed start date, if any.</param>
/// <param name="endDate">Claimed end date, if any.</param>
/// <returns>
/// <c>true</c> when a matching officer is found, <c>false</c> when officers exist but none
/// match, and <c>null</c> when the check could not be performed (blank input, no officers
/// returned, rate limiting or any other error).
/// </returns>
public async Task<bool?> VerifyDirectorAsync(
    string companyNumber,
    string candidateName,
    DateOnly? startDate,
    DateOnly? endDate)
{
    if (string.IsNullOrWhiteSpace(companyNumber) || string.IsNullOrWhiteSpace(candidateName))
    {
        return null;
    }

    try
    {
        var officerResponse = await _companiesHouseClient.GetOfficersAsync(companyNumber);
        var officerItems = officerResponse?.Items;
        if (officerItems is null || officerItems.Count == 0)
        {
            _logger.LogDebug("No officers found for company {CompanyNumber}", companyNumber);
            return null;
        }

        // The candidate's name is normalized once and compared against every officer.
        var candidateKey = NormalizeName(candidateName);
        var noClaimedDates = !startDate.HasValue && !endDate.HasValue;

        foreach (var officer in officerItems)
        {
            // Only director-like roles count; secretaries are included as well.
            var roleText = officer.OfficerRole?.ToLowerInvariant() ?? "";
            var isRelevantRole = roleText.Contains("director") || roleText.Contains("secretary");
            if (!isRelevantRole)
            {
                continue;
            }

            // Names must reach a fuzzy ratio of at least 80 to be considered the same person.
            var similarity = Fuzz.Ratio(candidateKey, NormalizeName(officer.Name));
            if (similarity < 80)
            {
                continue;
            }

            var appointedOn = DateHelpers.ParseDate(officer.AppointedOn);
            var resignedOn = DateHelpers.ParseDate(officer.ResignedOn);

            // Without any claimed dates, a name match alone is sufficient.
            if (noClaimedDates)
            {
                _logger.LogDebug(
                    "Found matching director {OfficerName} for candidate {CandidateName} at company {CompanyNumber}",
                    officer.Name, candidateName, companyNumber);
                return true;
            }

            // Otherwise the claimed period has to overlap the appointment.
            if (DatesOverlap(startDate, endDate, appointedOn, resignedOn))
            {
                _logger.LogDebug(
                    "Verified director {OfficerName} matches candidate {CandidateName} with overlapping dates",
                    officer.Name, candidateName);
                return true;
            }
        }

        _logger.LogDebug(
            "No matching director found for candidate {CandidateName} at company {CompanyNumber}",
            candidateName, companyNumber);
        return false;
    }
    catch (CompaniesHouseRateLimitException)
    {
        _logger.LogWarning("Rate limit hit while verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
}
/// <summary>
/// Upper-cases and trims a person's name, turning "SURNAME, Firstname" (the form
/// Companies House often uses) into "FIRSTNAME SURNAME".
/// </summary>
/// <param name="name">Raw name; null, empty or whitespace yields an empty string.</param>
/// <returns>The normalized name. Only the first comma is treated as the separator.</returns>
private static string NormalizeName(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return "";
    }

    var upper = name.ToUpperInvariant().Trim();
    var commaAt = upper.IndexOf(',');
    if (commaAt < 0)
    {
        return upper;
    }

    // Everything before the first comma is the surname; the remainder is the given name(s).
    var surname = upper[..commaAt].Trim();
    var givenNames = upper[(commaAt + 1)..].Trim();
    return $"{givenNames} {surname}";
}
/// <summary>
/// Tells whether two date ranges overlap; both ranges are inclusive and may be open-ended.
/// </summary>
/// <remarks>
/// A range with neither bound counts as overlapping everything. A single missing bound is
/// replaced by <see cref="DateOnly.MinValue"/> (start) or <see cref="DateOnly.MaxValue"/> (end).
/// </remarks>
private static bool DatesOverlap(DateOnly? start1, DateOnly? end1, DateOnly? start2, DateOnly? end2)
{
    var firstIsUnbounded = start1 is null && end1 is null;
    var secondIsUnbounded = start2 is null && end2 is null;
    if (firstIsUnbounded || secondIsUnbounded)
    {
        return true;
    }

    var firstStart = start1.GetValueOrDefault(DateOnly.MinValue);
    var firstEnd = end1.GetValueOrDefault(DateOnly.MaxValue);
    var secondStart = start2.GetValueOrDefault(DateOnly.MinValue);
    var secondEnd = end2.GetValueOrDefault(DateOnly.MaxValue);

    // The ranges are disjoint exactly when one of them ends before the other begins.
    var firstEndsTooEarly = firstEnd < secondStart;
    var secondEndsTooEarly = secondEnd < firstStart;
    return !(firstEndsTooEarly || secondEndsTooEarly);
}
#region Verification Checks
/// <summary>
/// Adds a critical flag when the claimed start date is earlier than the company's
/// incorporation date. Nothing is added when either date is missing.
/// </summary>
/// <param name="flags">Collection that receives the flag.</param>
/// <param name="claimedStartDate">Claimed employment start date.</param>
/// <param name="incorporationDate">Incorporation date of the matched company.</param>
/// <param name="companyName">Matched company name, used in the message.</param>
private static void CheckIncorporationDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedStartDate,
    DateOnly? incorporationDate,
    string companyName)
{
    if (claimedStartDate is not { } claimedStart || incorporationDate is not { } incorporatedOn)
    {
        return;
    }

    if (claimedStart >= incorporatedOn)
    {
        return;
    }

    var flag = new CompanyVerificationFlag
    {
        Type = "EmploymentBeforeIncorporation",
        Severity = "Critical",
        Message = $"Claimed employment at '{companyName}' starting {claimedStartDate:MMM yyyy} is before company incorporation date {incorporationDate:MMM yyyy}",
        ScoreImpact = -20
    };
    flags.Add(flag);
}
/// <summary>
/// Flags employment claims that extend beyond the end of a company whose status is
/// dissolved, liquidation or administration.
/// </summary>
/// <remarks>
/// Requires a known dissolution date. A claimed end date more than three months after it
/// raises "EmploymentAtDissolvedCompany" (-20); a missing end date is treated as current
/// employment and raises "CurrentEmploymentAtDissolvedCompany" (-25).
/// </remarks>
private static void CheckDissolutionDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedEndDate,
    DateOnly? dissolutionDate,
    string? companyStatus,
    string companyName)
{
    var statusIndicatesClosure = companyStatus?.ToLowerInvariant() switch
    {
        "dissolved" or "liquidation" or "administration" => true,
        _ => false
    };

    if (dissolutionDate is not { } dissolvedOn || !statusIndicatesClosure)
    {
        return;
    }

    // Three months of grace to allow for wind-down.
    var latestPlausibleEnd = dissolvedOn.AddMonths(3);

    if (claimedEndDate is not { } claimedEnd)
    {
        // No end date means the candidate claims to still work there.
        flags.Add(new CompanyVerificationFlag
        {
            Type = "CurrentEmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claims current employment at '{companyName}' but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -25
        });
        return;
    }

    if (claimedEnd > latestPlausibleEnd)
    {
        flags.Add(new CompanyVerificationFlag
        {
            Type = "EmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claimed employment at '{companyName}' until {claimedEndDate:MMM yyyy} but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -20
        });
    }
}
/// <summary>
/// Adds a warning when the company files dormant accounts and the claimed role is not a
/// director or company secretary.
/// </summary>
/// <remarks>
/// A missing job title is treated as a non-director role, so the warning is still raised.
/// </remarks>
private static void CheckDormantCompany(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    var filesDormantAccounts = !string.IsNullOrWhiteSpace(accountsCategory)
        && accountsCategory.ToLowerInvariant().Contains("dormant");
    if (!filesDormantAccounts)
    {
        return;
    }

    // Directors and company secretaries can legitimately maintain a dormant company.
    var loweredTitle = jobTitle?.ToLowerInvariant() ?? "";
    if (loweredTitle.Contains("director") || loweredTitle.Contains("company secretary"))
    {
        return;
    }

    flags.Add(new CompanyVerificationFlag
    {
        Type = "EmploymentAtDormantCompany",
        Severity = "Warning",
        Message = $"Claimed active employment as '{jobTitle}' at '{companyName}' which files dormant accounts",
        ScoreImpact = -10
    });
}
/// <summary>
/// Adds a warning when a senior-management title is claimed at a company that files
/// micro-entity accounts. Does nothing when either input is blank.
/// </summary>
private static void CheckCompanySizeVsRole(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    if (string.IsNullOrWhiteSpace(accountsCategory) || string.IsNullOrWhiteSpace(jobTitle))
    {
        return;
    }

    var loweredCategory = accountsCategory.ToLowerInvariant();
    var loweredTitle = jobTitle.ToLowerInvariant();

    // Substrings that mark a title as senior management.
    string[] seniorMarkers =
    [
        "vp", "vice president", "head of", "chief", "director of", "senior director",
    ];
    var claimsSeniorRole = seniorMarkers.Any(marker => loweredTitle.Contains(marker));

    // Micro-entity: < 10 employees, < £632k turnover
    var filesMicroAccounts = loweredCategory.Contains("micro");

    // A single senior title at a micro-entity is enough to raise the warning.
    if (!(filesMicroAccounts && claimsSeniorRole))
    {
        return;
    }

    flags.Add(new CompanyVerificationFlag
    {
        Type = "SeniorRoleAtMicroCompany",
        Severity = "Warning",
        Message = $"Claimed senior role '{jobTitle}' at '{companyName}' which files micro-entity accounts (typically <10 employees)",
        ScoreImpact = -10
    });
}
// Acronyms that denote a C-suite role when they appear as a whole word in a title.
private static readonly string[] JobTitleCSuiteAcronyms = ["ceo", "cto", "cfo", "coo", "cio"];

// Phrases that denote a C-suite role when they appear anywhere in a title.
private static readonly string[] JobTitleCSuitePhrases =
[
    "chief executive", "chief technology", "chief financial", "chief operating",
    "chief information", "managing director", "chairman", "chairwoman", "chairperson", "president",
];

// Acronyms that denote a vice-president level role when they appear as a whole word.
private static readonly string[] JobTitleVpAcronyms = ["vp", "svp", "evp"];

/// <summary>
/// Judges whether a claimed job title is plausible for the matched company type.
/// </summary>
/// <remarks>
/// Only PLCs are scrutinised: C-suite, board-level and VP-level titles are reported as
/// implausible, every other title passes. Acronyms (CEO, CTO, VP, ...) are matched as
/// whole words, because substring matching treats "director", "contractor" or "inspector"
/// as "CTO" and "coordinator" as "COO".
/// </remarks>
/// <param name="jobTitle">Claimed job title.</param>
/// <param name="companyType">Company type of the matched company.</param>
/// <returns>
/// <c>(null, null)</c> when either argument is blank; <c>(false, reason)</c> for a senior
/// title at a PLC; otherwise <c>(true, null)</c>.
/// </returns>
private static (bool? IsPlausible, string? Notes) CheckJobTitlePlausibility(string? jobTitle, string? companyType)
{
    if (string.IsNullOrWhiteSpace(jobTitle) || string.IsNullOrWhiteSpace(companyType))
    {
        return (null, null);
    }

    var title = jobTitle.Trim().ToLowerInvariant();
    var type = companyType.Trim().ToLowerInvariant();

    // Every rule below applies to PLCs (Public Limited Companies) only.
    var isPlc = type.Contains("plc") || type.Contains("public limited");
    if (!isPlc)
    {
        return (true, null);
    }

    var words = TokenizeJobTitle(title);

    // C-suite / very senior roles. "md" counts only when it is the entire title.
    var isCsuiteRole = words.Overlaps(JobTitleCSuiteAcronyms)
        || title == "md"
        || JobTitleCSuitePhrases.Any(phrase => title.Contains(phrase));

    // Board-level roles. A bare "director" counts; "director of X" deliberately does not.
    // "executive director" also covers "non-executive director".
    var isBoardRole = title.Contains("board member")
        || title.Contains("executive director")
        || title == "director";

    if (isCsuiteRole || isBoardRole)
    {
        return (false, $"Claimed senior role '{jobTitle}' at a PLC requires verification - C-suite positions at public companies are publicly disclosed");
    }

    // VP / SVP / EVP at PLCs (also usually disclosed).
    var isVpRole = title.Contains("vice president") || words.Overlaps(JobTitleVpAcronyms);
    if (isVpRole)
    {
        return (false, $"Claimed VP-level role '{jobTitle}' at a PLC - senior positions at public companies should be verifiable");
    }

    return (true, null);
}

/// <summary>
/// Splits an already lower-cased job title into its distinct alphanumeric words.
/// </summary>
private static HashSet<string> TokenizeJobTitle(string lowerCasedTitle) =>
    System.Text.RegularExpressions.Regex
        .Split(lowerCasedTitle, "[^a-z0-9]+")
        .Where(word => word.Length > 0)
        .ToHashSet(StringComparer.Ordinal);
#endregion
#region Helper Methods
2026-01-22 10:43:45 +00:00
/// <summary>
/// Normalizes a company name: trims it, collapses every run of whitespace to a single space,
/// and strips trailing punctuation together with any spaces mixed in with it.
/// </summary>
/// <param name="companyName">Raw company name.</param>
/// <returns>
/// The normalized name. Null, empty or whitespace-only input is returned unchanged.
/// </returns>
private static string NormalizeCompanyName(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return companyName;
    }

    // Collapse whitespace first so that the only whitespace that can sit between
    // trailing punctuation marks is a plain space, which TrimEnd can then remove.
    var collapsed = System.Text.RegularExpressions.Regex.Replace(companyName.Trim(), @"\s+", " ");

    // Remove trailing punctuation that causes matching issues,
    // e.g. "Glaxo Research & Development Ltd." -> "Glaxo Research & Development Ltd".
    // The space is included so "Acme Ltd ." and "Acme Ltd. ," are fully cleaned as well.
    return collapsed.TrimEnd('.', ',', ';', ':', '!', '?', ' ');
}
2026-01-18 19:20:50 +01:00
/// <summary>
/// Looks in the local company cache for a non-expired entry whose name fuzzy-matches
/// <paramref name="companyName"/>.
/// </summary>
/// <param name="companyName">Company name to look up.</param>
/// <returns>
/// The cached entry with the highest token-set ratio at or above <c>FuzzyMatchThreshold</c>,
/// or <c>null</c> when nothing cached within the last <c>CacheExpirationDays</c> days matches.
/// </returns>
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
{
    var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);

    await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

    // Read-only lookup: the context is disposed when this method returns, so change tracking is wasted work.
    var cachedCompanies = await dbContext.CompanyCache
        .AsNoTracking()
        .Where(c => c.CachedAt >= cutoffDate)
        .ToListAsync();

    if (cachedCompanies.Count == 0)
    {
        return null;
    }

    // Upper-case the search name once instead of once per cached row.
    var target = companyName.ToUpperInvariant();

    var best = cachedCompanies
        .Where(c => !string.IsNullOrWhiteSpace(c.CompanyName))
        .Select(c => new { Company = c, Score = Fuzz.TokenSetRatio(target, c.CompanyName.ToUpperInvariant()) })
        .Where(m => m.Score >= FuzzyMatchThreshold)
        .OrderByDescending(m => m.Score)
        .FirstOrDefault();

    return best?.Company;
}
2026-01-20 21:21:21 +01:00
/// <summary>
/// Chooses the best Companies House search result for a claimed employer.
/// </summary>
/// <remarks>
/// A result survives only if it has a non-blank title, contains all core identifiers of the
/// original name or of the search query, does not carry too many extra core words, passes
/// <c>IsValidEmploymentEntity</c>, and reaches <c>FuzzyMatchThreshold</c> against the original
/// name or the query (whichever scores higher). Survivors are ranked by priority score, then fuzzy score.
/// When a claimed start date is supplied, only companies with no parseable incorporation date or one
/// on/before that date are eligible; if none remain, no match is returned.
/// </remarks>
/// <param name="companyName">Company name as claimed.</param>
/// <param name="searchQuery">The query that produced <paramref name="items"/>.</param>
/// <param name="items">Search results to choose from.</param>
/// <param name="claimedStartDate">Claimed employment start date, if any.</param>
/// <returns>The chosen item with its fuzzy score, or <c>null</c> when nothing qualifies.</returns>
private (CompaniesHouseSearchItem Item, int Score)? FindBestMatch(
    string companyName,
    string searchQuery,
    List<CompaniesHouseSearchItem> items,
    DateOnly? claimedStartDate)
{
    var upperOriginal = companyName.ToUpperInvariant();
    var upperQuery = searchQuery.ToUpperInvariant();

    // Core identifying words that MUST appear in any valid match.
    // This prevents "BMW Group Canada" matching "CANADA LIFE GROUP" just because of common words
    // and "Lloyds Bowmaker" matching "LLOYDS ALARMS" (missing "Bowmaker").
    var originalCores = ExtractCoreIdentifiers(companyName);
    var queryCores = ExtractCoreIdentifiers(searchQuery);

    var originalLower = companyName.ToLowerInvariant();
    var queryLower = searchQuery.ToLowerInvariant();

    // Entity categories the search explicitly asks for; these are exempt from the non-employer filter.
    var allowedEntityTypes = GetSearchEntityTypes(originalLower, queryLower);

    // Name-based admission test for a single search result (title already known to be non-blank).
    bool PassesNameFilters(CompaniesHouseSearchItem candidate)
    {
        var upperTitle = candidate.Title.ToUpperInvariant();
        var lowerTitle = candidate.Title.ToLowerInvariant();
        var candidateCores = ExtractCoreIdentifiers(candidate.Title);

        // ALL core identifiers of the original name, or ALL of the query, must appear in the title.
        var coversOriginal = originalCores.Count == 0 || originalCores.All(w => upperTitle.Contains(w));
        var coversQuery = queryCores.Count == 0 || queryCores.All(w => upperTitle.Contains(w));
        if (!(coversOriginal || coversQuery))
        {
            return false;
        }

        // "Families First" should NOT match "Families Against Conformity": more than one extra
        // significant word in the candidate suggests a different company.
        if (originalCores.Count > 0 && coversOriginal)
        {
            var extraCoreWords = candidateCores.Count(w => !originalCores.Contains(w));
            if (extraCoreWords > 1 && candidateCores.Count > originalCores.Count + 1)
            {
                return false;
            }
        }

        // Drop non-employment entities unless the search explicitly targets that type.
        return IsValidEmploymentEntity(lowerTitle, allowedEntityTypes);
    }

    // Score against both the original company name AND the search query used. This handles
    // "Matthew Walker (Northern Foods Plc)" where the query is only "Northern Foods Plc".
    var matches = new List<(CompaniesHouseSearchItem Item, int Score, int PriorityScore)>();
    foreach (var candidate in items)
    {
        if (string.IsNullOrWhiteSpace(candidate.Title) || !PassesNameFilters(candidate))
        {
            continue;
        }

        var upperTitle = candidate.Title.ToUpperInvariant();
        var scoreVsOriginal = Fuzz.TokenSetRatio(upperOriginal, upperTitle);
        var scoreVsQuery = Fuzz.TokenSetRatio(upperQuery, upperTitle);
        var fuzzyScore = Math.Max(scoreVsOriginal, scoreVsQuery);

        // Priority adjustment for main company vs subsidiary.
        var priority = CalculateCompanyPriorityScore(candidate.Title.ToLowerInvariant(), originalLower, queryLower);

        if (fuzzyScore >= FuzzyMatchThreshold)
        {
            matches.Add((candidate, fuzzyScore, priority));
        }
    }

    _logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery);
    foreach (var logged in matches.Take(5))
    {
        _logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, Priority: {Priority}, DateOfCreation: {Date}",
            logged.Item.Title, logged.Item.CompanyNumber, logged.Score, logged.PriorityScore, logged.Item.DateOfCreation ?? "null");
    }

    if (matches.Count == 0)
    {
        return null;
    }

    if (claimedStartDate is null)
    {
        // No start date provided - rank by priority, then by fuzzy score.
        var fallback = matches
            .OrderByDescending(m => m.PriorityScore)
            .ThenByDescending(m => m.Score)
            .First();
        _logger.LogDebug("No start date filter, using highest priority: {Title} ({Number}), Priority: {Priority}", fallback.Item.Title, fallback.Item.CompanyNumber, fallback.PriorityScore);
        return (fallback.Item, fallback.Score);
    }

    // A claimed start date was given: only companies that existed at that time are eligible.
    var startDate = claimedStartDate.Value;
    _logger.LogDebug("Filtering for companies that existed at claimed start date: {StartDate}", startDate);

    var eligible = new List<(CompaniesHouseSearchItem Item, int Score, int PriorityScore)>();
    foreach (var match in matches)
    {
        var incDate = DateHelpers.ParseDate(match.Item.DateOfCreation);
        // An unparseable/missing incorporation date is treated as "existed".
        var existed = incDate == null || incDate <= startDate;
        _logger.LogDebug(" {Title}: IncDate={IncDate}, Existed={Existed}",
            match.Item.Title, incDate?.ToString() ?? "null", existed);
        if (existed)
        {
            eligible.Add(match);
        }
    }

    // Sort by priority first, then by fuzzy score.
    var existedAtStartDate = eligible
        .OrderByDescending(m => m.PriorityScore)
        .ThenByDescending(m => m.Score)
        .ToList();

    _logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count);

    if (existedAtStartDate.Count == 0)
    {
        // No companies existed at the claimed start date - don't match a wrong company.
        _logger.LogDebug("No companies found that existed at claimed start date {StartDate}, returning no match", startDate);
        return null;
    }

    var selected = existedAtStartDate[0];
    _logger.LogDebug("Selected: {Title} ({Number}), Priority: {Priority}", selected.Item.Title, selected.Item.CompanyNumber, selected.PriorityScore);
    return (selected.Item, selected.Score);
}
/// <summary>
/// Calculates a priority score for company matching.
/// Higher scores = more likely to be the main employer company.
/// Penalizes subsidiaries (delivery, property, holdings, etc.) unless explicitly searched for.
/// Boosts main trading companies (stores, retail, etc.).
/// </summary>
/// <param name="itemTitleLower">Lower-cased title of the candidate company.</param>
/// <param name="originalLower">Lower-cased company name as claimed.</param>
/// <param name="queryLower">Lower-cased search query that was used.</param>
/// <returns>
/// The sum of: -10 if the title contains a subsidiary indicator the search did not mention,
/// +5 if it contains a main-company indicator, +3 if it ends with " plc".
/// </returns>
private static int CalculateCompanyPriorityScore(string itemTitleLower, string originalLower, string queryLower)
{
    var score = 0;

    // Anything the user typed, in either form, counts as "explicitly searched for".
    var searchText = originalLower + " " + queryLower;

    // Penalize subsidiary indicators (unless the search explicitly included them).
    // Whole-word matching avoids "SCOTLAND" matching "land".
    // Every indicator is examined so the outcome does not depend on the enumeration order of
    // the indicator set; the penalty is still applied at most once.
    var hasUnrequestedSubsidiaryIndicator = SubsidiaryIndicators.Any(indicator =>
        ContainsWholeWord(itemTitleLower, indicator) && !ContainsWholeWord(searchText, indicator));
    if (hasUnrequestedSubsidiaryIndicator)
    {
        score -= 10; // Significant penalty for subsidiaries
    }

    // Boost main company indicators (at most one boost).
    if (MainCompanyIndicators.Any(indicator => ContainsWholeWord(itemTitleLower, indicator)))
    {
        score += 5;
    }

    // Slight boost for PLC (usually the parent/main company).
    // Ordinal comparison: this is an identifier-style suffix test, not linguistic text.
    if (itemTitleLower.EndsWith(" plc", StringComparison.Ordinal))
    {
        score += 3;
    }

    return score;
}
2026-01-20 20:00:24 +01:00
/// <summary>
/// Inserts or refreshes the cache row for a Companies House search result.
/// </summary>
/// <param name="item">Search result supplying number, name, status, type and dates.</param>
/// <param name="details">
/// Optional full company record; when present its SIC codes take precedence over the search
/// item's, and it supplies the last-accounts type.
/// </param>
private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)
{
    try
    {
        await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

        var existingCache = await dbContext.CompanyCache
            .FirstOrDefaultAsync(c => c.CompanyNumber == item.CompanyNumber);

        // Values shared by the insert and the update path.
        var sicCodes = details?.SicCodes ?? item.SicCodes;
        var sicCodesJson = sicCodes is null ? null : JsonSerializer.Serialize(sicCodes);
        var accountsCategory = details?.Accounts?.LastAccounts?.Type;
        var status = item.CompanyStatus ?? "Unknown";
        var incorporationDate = DateHelpers.ParseDate(item.DateOfCreation);
        var dissolutionDate = DateHelpers.ParseDate(item.DateOfCessation);

        if (existingCache is null)
        {
            dbContext.CompanyCache.Add(new CompanyCache
            {
                CompanyNumber = item.CompanyNumber,
                CompanyName = item.Title,
                Status = status,
                CompanyType = item.CompanyType,
                IncorporationDate = incorporationDate,
                DissolutionDate = dissolutionDate,
                AccountsCategory = accountsCategory,
                SicCodesJson = sicCodesJson,
                CachedAt = DateTime.UtcNow
            });
        }
        else
        {
            // Refresh every cached field and restart the expiry clock.
            existingCache.CompanyName = item.Title;
            existingCache.Status = status;
            existingCache.CompanyType = item.CompanyType;
            existingCache.IncorporationDate = incorporationDate;
            existingCache.DissolutionDate = dissolutionDate;
            existingCache.AccountsCategory = accountsCategory;
            existingCache.SicCodesJson = sicCodesJson;
            existingCache.CachedAt = DateTime.UtcNow;
        }

        await dbContext.SaveChangesAsync();
    }
    catch (DbUpdateException ex) when (ex.InnerException?.Message.Contains("PK_CompanyCache") is true)
    {
        // Race condition: another task already cached this company - ignore
        _logger.LogDebug("Company {CompanyNumber} already cached by another task", item.CompanyNumber);
    }
}
2026-01-20 20:00:24 +01:00
/// <summary>
/// Builds a verified result from a cached company record, running the verification checks
/// against the cached data and appending any resulting flags to <paramref name="flags"/>.
/// </summary>
/// <param name="cached">Cached company record that matched the claim.</param>
/// <param name="claimedCompany">Company name as claimed.</param>
/// <param name="startDate">Claimed employment start date, if any.</param>
/// <param name="endDate">Claimed employment end date, if any.</param>
/// <param name="jobTitle">Claimed job title, if any.</param>
/// <param name="flags">Flag list to append to; the same instance is placed on the result.</param>
/// <returns>A result with <c>IsVerified = true</c> populated from the cache entry.</returns>
private CompanyVerificationResult CreateResultFromCache(
    CompanyCache cached,
    string claimedCompany,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    List<CompanyVerificationFlag> flags)
{
    var similarity = Fuzz.TokenSetRatio(
        claimedCompany.ToUpperInvariant(),
        cached.CompanyName.ToUpperInvariant());

    List<string>? parsedSicCodes = null;
    if (!string.IsNullOrEmpty(cached.SicCodesJson))
    {
        try
        {
            parsedSicCodes = JsonSerializer.Deserialize<List<string>>(cached.SicCodesJson);
        }
        catch (JsonException)
        {
            // Malformed JSON in the cache is tolerated; SIC codes are simply left unset.
        }
    }

    // Run all verification checks; each may append to 'flags'.
    CheckIncorporationDate(flags, startDate, cached.IncorporationDate, cached.CompanyName);
    CheckDissolutionDate(flags, endDate, cached.DissolutionDate, cached.Status, cached.CompanyName);
    CheckDormantCompany(flags, cached.AccountsCategory, jobTitle, cached.CompanyName);
    CheckCompanySizeVsRole(flags, cached.AccountsCategory, jobTitle, cached.CompanyName);

    var plausibility = CheckJobTitlePlausibility(jobTitle, cached.CompanyType);
    if (plausibility.IsPlausible is false)
    {
        var implausibleTitleFlag = new CompanyVerificationFlag
        {
            Type = "ImplausibleJobTitle",
            Severity = "Critical",
            Message = plausibility.Notes ?? "Job title requires verification",
            ScoreImpact = -15
        };
        flags.Add(implausibleTitleFlag);
    }

    var result = new CompanyVerificationResult
    {
        ClaimedCompany = claimedCompany,
        MatchedCompanyName = cached.CompanyName,
        MatchedCompanyNumber = cached.CompanyNumber,
        MatchScore = similarity,
        IsVerified = true,
        VerificationNotes = null,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        CompanyType = cached.CompanyType,
        CompanyStatus = cached.Status,
        IncorporationDate = cached.IncorporationDate,
        DissolutionDate = cached.DissolutionDate,
        AccountsCategory = cached.AccountsCategory,
        SicCodes = parsedSicCodes,
        ClaimedJobTitle = jobTitle,
        JobTitlePlausible = plausibility.IsPlausible,
        JobTitleNotes = plausibility.Notes,
        Flags = flags
    };
    return result;
}
/// <summary>
/// Builds the result returned when no company could be matched: no matched name or number,
/// a match score of 0, <c>IsVerified = false</c>, and <paramref name="reason"/> as the notes.
/// </summary>
/// <param name="companyName">Company name as claimed.</param>
/// <param name="startDate">Claimed employment start date, if any.</param>
/// <param name="endDate">Claimed employment end date, if any.</param>
/// <param name="jobTitle">Claimed job title, if any.</param>
/// <param name="reason">Explanation stored in <c>VerificationNotes</c>.</param>
private static CompanyVerificationResult CreateUnverifiedResult(
    string companyName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    string reason) =>
    new CompanyVerificationResult
    {
        // What was claimed
        ClaimedCompany = companyName,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        ClaimedJobTitle = jobTitle,

        // Nothing was matched
        MatchedCompanyName = null,
        MatchedCompanyNumber = null,
        MatchScore = 0,
        IsVerified = false,
        VerificationNotes = reason
    };
2026-01-20 21:32:02 +01:00
/// <summary>
/// Generates alternative search queries to find companies that may be registered
/// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd").
/// Also handles "Brand (Parent Company)" format by extracting and prioritizing the parent,
/// "Name1/Name2" format by trying each part, and tries the first word on its own as a
/// possible parent company.
/// </summary>
/// <param name="companyName">Company name as claimed.</param>
/// <returns>
/// Distinct queries (case-insensitive) in priority order: parent company, brand name,
/// slash-separated parts, first word, then variations of the full name.
/// </returns>
private static List<string> GenerateSearchQueries(string companyName)
{
    // A HashSet alone does not guarantee enumeration order, and the order matters here
    // (parent-company queries are meant to be tried first). The set is used only for
    // de-duplication; the list carries the order.
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var queries = new List<string>();

    void AddQuery(string query)
    {
        if (seen.Add(query))
        {
            queries.Add(query);
        }
    }

    void AddVariationsOf(string candidateName)
    {
        foreach (var variation in GenerateNameVariations(candidateName))
        {
            AddQuery(variation);
        }
    }

    var normalized = companyName.Trim();

    // Step 0a: "Brand (Parent Company)" - the parent is more likely to be the registered name,
    // so its queries go first.
    var parentMatch = System.Text.RegularExpressions.Regex.Match(normalized, @"\(([^)]+)\)\s*$");
    if (parentMatch.Success)
    {
        AddVariationsOf(parentMatch.Groups[1].Value.Trim());

        // Also try the brand name without the parenthetical.
        var brandName = normalized[..parentMatch.Index].Trim();
        if (brandName.Length >= 3)
        {
            AddVariationsOf(brandName);
        }
    }

    // Step 0b: "Name1/Name2" (e.g., "ASDA/WALMART") - each part may be a separately registered name.
    if (normalized.Contains('/'))
    {
        var parts = normalized.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        foreach (var part in parts.Where(p => p.Length >= 3))
        {
            AddVariationsOf(part);
        }
    }

    // Step 0c: first word as potential parent company (e.g., "UNILEVER BESTFOOD" -> "UNILEVER").
    // Many company names are "ParentCompany Division" or "ParentCompany Brand".
    var words = normalized.Split(' ', StringSplitOptions.RemoveEmptyEntries);
    if (words.Length >= 2)
    {
        var firstWord = words[0];

        // Only try if the first word is substantial (not "The", "A", common prefixes).
        var skipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            { "the", "a", "an", "uk", "british", "national", "international", "global", "new" };

        if (firstWord.Length >= 4 && !skipWords.Contains(firstWord))
        {
            AddVariationsOf(firstWord);

            // Also try first word + PLC/Limited for major corporations.
            AddQuery(firstWord + " PLC");
            AddQuery(firstWord + " Limited");
        }
    }

    // Finally, variations of the full original name.
    AddVariationsOf(normalized);

    return queries;
}
/// <summary>
/// Generates name variations for a single company name (UK/U.K., Ltd/Limited,
/// PLC/Public Limited Company, and the core name with its legal suffix removed).
/// </summary>
/// <param name="name">Company name to vary; expected to be trimmed already.</param>
/// <returns>
/// Distinct variations (case-insensitive) in the order they were generated; the first
/// element is always <paramref name="name"/> itself.
/// </returns>
private static List<string> GenerateNameVariations(string name)
{
    // The set de-duplicates case-insensitively; the list preserves generation order.
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
    var variations = new List<string>();

    void Add(string variation)
    {
        if (seen.Add(variation))
        {
            variations.Add(variation);
        }
    }

    // Rewrites a space-preceded, whole-word "UK" as "U.K.". The trailing word boundary stops
    // names such as "Acme Ukraine" from turning into "Acme U.K.raine".
    static string WithUkDots(string value) =>
        System.Text.RegularExpressions.Regex.Replace(
            value,
            @"(?<= )UK\b",
            "U.K.",
            System.Text.RegularExpressions.RegexOptions.IgnoreCase |
            System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    // Splits off a case-insensitive suffix; the stem length is derived from the suffix itself
    // so it can never be off by one.
    static bool TryStripSuffix(string value, string suffix, out string stem)
    {
        if (value.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
        {
            stem = value[..^suffix.Length];
            return true;
        }

        stem = value;
        return false;
    }

    Add(name);

    // Step 1: Generate UK/U.K. variations
    var ukVariants = new List<string> { name };

    var withDots = WithUkDots(name);
    if (withDots != name)
    {
        ukVariants.Add(withDots);
    }

    if (name.Contains(" U.K.", StringComparison.OrdinalIgnoreCase))
    {
        var withoutDots = name.Replace(" U.K.", " UK", StringComparison.OrdinalIgnoreCase);
        if (withoutDots != name)
        {
            ukVariants.Add(withoutDots);
        }
    }

    // Step 2: For each UK variant, swap the legal suffix for its long/short form.
    // The " PLC" test is case-insensitive, so it also covers "Plc".
    foreach (var variant in ukVariants)
    {
        Add(variant);

        if (TryStripSuffix(variant, " Ltd", out var stem))
        {
            Add(stem + " Limited");
        }
        else if (TryStripSuffix(variant, " Limited", out stem))
        {
            Add(stem + " Ltd");
        }
        else if (TryStripSuffix(variant, " PLC", out stem))
        {
            Add(stem + " Public Limited Company");
        }
        else if (TryStripSuffix(variant, " Public Limited Company", out stem))
        {
            Add(stem + " PLC");
        }
    }

    // Step 3: Try the core name without its suffix (first matching suffix only).
    var suffixesToRemove = new[] { " Ltd", " Limited", " PLC", " LLP", " Inc", " Corporation", " Corp" };
    var coreName = name;
    foreach (var suffix in suffixesToRemove)
    {
        if (TryStripSuffix(coreName, suffix, out var stripped))
        {
            coreName = stripped.Trim();
            break;
        }
    }

    if (coreName != name && coreName.Length >= 3)
    {
        Add(coreName);
        Add(coreName + " Limited");
        Add(coreName + " PLC");

        // Also add the U.K. variant of the core name if applicable.
        var coreWithDots = WithUkDots(coreName);
        if (coreWithDots != coreName)
        {
            Add(coreWithDots);
            Add(coreWithDots + " Limited");
        }
    }

    return variations;
}
Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
(clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
(now only shows points on first occurrence, subsequent rows show 0)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:33:16 +01:00
/// <summary>
/// Works out which non-employment entity categories the search text itself mentions.
/// A category is reported when at least one of its patterns occurs as a substring of
/// "<paramref name="originalLower"/> <paramref name="queryLower"/>" (the two inputs joined by a space).
/// </summary>
/// <param name="originalLower">The original search text; the name suggests it is already lower-cased.</param>
/// <param name="queryLower">The query text actually searched for; the name suggests it is already lower-cased.</param>
/// <returns>
/// The set of category names (e.g., "Clubs", "Trusts") that should NOT be filtered out of the results.
/// Empty when the search mentions none of the known patterns.
/// </returns>
private static HashSet<string> GetSearchEntityTypes(string originalLower, string queryLower)
{
    // Search both inputs in one pass by scanning their concatenation.
    var combinedSearchText = originalLower + " " + queryLower;
    var mentionedCategories = new HashSet<string>();

    foreach (var (categoryName, categoryPatterns) in NonEmploymentEntityPatterns)
    {
        // NOTE(review): this is a plain substring test, whereas IsValidEmploymentEntity uses
        // whole-word matching for single-word patterns - confirm the asymmetry is intended.
        foreach (var candidatePattern in categoryPatterns)
        {
            if (!combinedSearchText.Contains(candidatePattern))
            {
                continue;
            }

            // One hit is enough to allow the whole category.
            mentionedCategories.Add(categoryName);
            break;
        }
    }

    return mentionedCategories;
}
/// <summary>
/// Decides whether a company title looks like a genuine employer.
/// A title is rejected when it matches a pattern from any non-employment category
/// (clubs, trusts, etc.) that the search did not explicitly ask for.
/// </summary>
/// <param name="itemTitleLower">The candidate company title; the name suggests it is already lower-cased.</param>
/// <param name="allowedCategories">Category names that the search explicitly targets and that must not be filtered.</param>
/// <returns>
/// <c>false</c> if the title matches a pattern of a non-allowed category; otherwise <c>true</c>.
/// </returns>
private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
{
    foreach (var (categoryName, categoryPatterns) in NonEmploymentEntityPatterns)
    {
        // Categories the search explicitly asked for are never used to reject a title.
        if (allowedCategories.Contains(categoryName))
        {
            continue;
        }

        foreach (var candidatePattern in categoryPatterns)
        {
            bool titleMatches;
            if (candidatePattern.Contains(' '))
            {
                // Multi-word patterns are matched as a plain substring.
                titleMatches = itemTitleLower.Contains(candidatePattern);
            }
            else
            {
                // Single-word patterns must appear as a whole word.
                titleMatches = ContainsWholeWord(itemTitleLower, candidatePattern);
            }

            if (titleMatches)
            {
                // Non-employment entity type that wasn't explicitly searched for.
                return false;
            }
        }
    }

    // No non-employment pattern matched, so this is likely a valid employment entity.
    return true;
}
2026-01-22 10:43:45 +00:00
/// <summary>
/// Checks if a string contains a word as a whole word (not as a substring of another word).
/// E.g., "scotland" does NOT contain whole word "land", but "land holdings" does.
/// </summary>
/// <remarks>
/// Matching is case-insensitive and culture-invariant. A "whole word" occurrence is one that is
/// neither preceded nor followed by a regex word character (<c>\w</c>), so words that begin or end
/// with punctuation (e.g. "c.i.c.") are also found.
/// </remarks>
/// <param name="text">The text to search in.</param>
/// <param name="word">The word to look for; it is regex-escaped, so it is treated literally.</param>
/// <returns><c>true</c> if <paramref name="word"/> occurs as a whole word; <c>false</c> otherwise or when either argument is null or empty.</returns>
private static bool ContainsWholeWord(string text, string word)
{
    if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(word))
    {
        return false;
    }

    // Lookarounds are used instead of \b because \b only asserts a boundary next to a word
    // character: with \b, a word ending in '.' could never match at the end of the text, and a
    // word starting with '.' could never match after a space. For purely alphanumeric words the
    // two forms behave identically.
    var pattern = $@"(?<!\w){System.Text.RegularExpressions.Regex.Escape(word)}(?!\w)";

    // CultureInvariant keeps the case-insensitive comparison independent of the thread culture.
    return System.Text.RegularExpressions.Regex.IsMatch(
        text,
        pattern,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase
            | System.Text.RegularExpressions.RegexOptions.CultureInvariant);
}
Improve company verification filtering and fix duplicate points display
- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities
(clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for same company appearing multiple times
(now only shows points on first occurrence, subsequent rows show 0)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:33:16 +01:00
// Words ignored when extracting a company's core identifiers.
// They are too common to tell one company apart from another.
// Lookups are case-insensitive (ordinal).
private static readonly HashSet<string> SkipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    // Articles and conjunctions
    "the", "a", "an", "and", "or", "of", "for",
    "in", "at", "on", "by", "to", "with",

    // Geographic - Countries and regions
    "uk", "u.k.", "gb", "british", "britain",
    "england", "english", "scotland", "scottish",
    "wales", "welsh", "ireland", "irish", "northern",
    "europe", "european", "america", "american",
    "usa", "us", "u.s.", "u.s.a.",
    "canada", "canadian", "asia", "asian", "pacific", "atlantic",
    "australia", "australian", "africa", "african", "india", "indian",
    "france", "french", "germany", "german",
    "spain", "spanish", "italy", "italian",
    "japan", "japanese", "china", "chinese", "korea", "korean",
    "middle", "east", "west", "north", "south",
    "central", "western", "eastern",

    // Geographic - Cities
    "london", "manchester", "birmingham", "leeds", "glasgow",
    "edinburgh", "bristol", "liverpool", "sheffield", "newcastle",
    "cardiff", "belfast", "nottingham", "southampton", "portsmouth",
    "brighton", "leicester", "coventry", "hull",

    // Legal suffixes
    "limited", "ltd", "plc", "llp", "llc",
    "inc", "incorporated", "corporation", "corp",
    "company", "co", "partners", "partnership",
    "enterprises", "unlimited", "registered",
    "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",

    // Business descriptors
    "group", "holdings", "holding", "parent",
    "subsidiary", "division", "branch",
    "services", "service", "solutions", "solution",
    "consulting", "consultants", "consultancy",
    "management", "systems", "system",
    "technologies", "technology", "tech",
    "industries", "industry", "industrial",
    "commercial", "trading", "trade",
    "business", "businesses", "operations", "operational",
    "professional", "professionals",
    "resources", "resource", "network", "networks",
    "associates", "associated",

    // Size/Scope descriptors
    "national", "international", "global", "worldwide", "world",
    "regional", "local", "universal", "general", "standard",
    "premier", "prime", "first", "one",

    // Quality/Marketing terms ("premier" is already listed under Size/Scope)
    "new", "modern", "advanced", "innovative",
    "elite", "premium", "quality", "superior",
    "excellent", "best", "top", "leading", "major",

    // Ownership indicators (excluding "royal" as it's a meaningful company identifier)
    "imperial", "crown", "state", "public", "private", "independent",
    "mutual", "cooperative", "coop", "community",

    // Time-related
    "century", "millennium", "annual", "year", "years",

    // Numbers as words ("one" and "first" are already listed under Size/Scope)
    "two", "three", "four", "five", "second", "third"
};
2026-01-20 22:13:23 +01:00
/// <summary>
/// Extracts ALL core identifying words from a company name.
/// These are significant words that aren't common prefixes/suffixes (see <c>SkipWords</c>).
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"]
/// "Royal Bank of Scotland" -> ["ROYAL", "BANK"] (Scotland is a geographic skip word)
/// "Acme U.K. Limited" -> ["ACME"] (dotted abbreviations are recognised as skip words)
/// </summary>
/// <param name="companyName">The company name to analyse; may be null, empty or whitespace.</param>
/// <returns>
/// The significant words in their original order, upper-cased with the invariant culture.
/// Empty when <paramref name="companyName"/> is null/blank or contains no significant words.
/// </returns>
private static List<string> ExtractCoreIdentifiers(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return new List<string>();
    }

    // Remove parenthetical content first
    var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();

    // Split into words and collect all significant words
    var words = cleanName.Split(new[] { ' ', '-', '/', '&' }, StringSplitOptions.RemoveEmptyEntries);
    var coreWords = new List<string>();

    foreach (var word in words)
    {
        var cleanWord = word.Trim('.', ',', '\'');
        if (cleanWord.Length < 2)
        {
            continue;
        }

        // Trimming turns "U.K." into "U.K", which matches neither "uk" nor "u.k." in SkipWords.
        // Comparing the dot-free form as well ("UK") keeps dotted abbreviations such as
        // "U.K.", "U.S.A." or "B.V." from being reported as core identifiers.
        var dotFreeWord = cleanWord.Replace(".", string.Empty);
        if (SkipWords.Contains(cleanWord) || SkipWords.Contains(dotFreeWord))
        {
            continue;
        }

        coreWords.Add(cleanWord.ToUpperInvariant());
    }

    return coreWords;
}
2026-01-20 20:00:24 +01:00
#endregion
2026-01-18 19:20:50 +01:00
}