Files
RealCV/src/TrueCV.Infrastructure/Services/CompanyVerifierService.cs

1179 lines
49 KiB
C#
Raw Normal View History

using System.Text.Json;
using FuzzySharp;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using TrueCV.Application.DTOs;
using TrueCV.Application.Helpers;
using TrueCV.Application.Interfaces;
using TrueCV.Application.Models;
using TrueCV.Domain.Entities;
using TrueCV.Infrastructure.Data;
using TrueCV.Infrastructure.ExternalApis;
namespace TrueCV.Infrastructure.Services;
public sealed class CompanyVerifierService : ICompanyVerifierService
{
// Client used for Companies House company search, company detail and officer lookups.
private readonly CompaniesHouseClient _companiesHouseClient;
// Factory for short-lived ApplicationDbContext instances; each cache read/write creates and disposes its own context.
private readonly IDbContextFactory<ApplicationDbContext> _dbContextFactory;
private readonly ILogger<CompanyVerifierService> _logger;
// Minimum fuzzy-match score a cached or searched company name must reach to be accepted as a match.
private const int FuzzyMatchThreshold = 85;
// Cache rows whose CachedAt is older than this many days are ignored by cache lookups.
private const int CacheExpirationDays = 30;
// Non-employment entity patterns organized by category
// These are entities that exist in Companies House but are not typical employers
// NOTE(review): the consumers of this table are not visible in this part of the file
// (presumably GetSearchEntityTypes / IsValidEmploymentEntity) - confirm how patterns are matched.
private static readonly Dictionary<string, string[]> NonEmploymentEntityPatterns = new()
{
["Clubs"] = new[] { "club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados" },
["Associations"] = new[] { "association", "society", "federation", "institute", "institution", "guild", "chamber of commerce" },
["Trusts"] = new[] { "benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit", "superannuation", "provident" },
["Charities"] = new[] { "charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic" },
["Investment"] = new[] { "nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners" },
["Property"] = new[] { "freehold", "leasehold", "property management", "residents association", "management company rtm", "commonhold" },
["Religious"] = new[] { "church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry" },
["Sports"] = new[] { "football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club" },
["Educational"] = new[] { "old boys", "old girls", "alumni", "school association", "pta", "parent teacher" },
["Professional"] = new[] { "chartered institute", "royal college", "professional body", "trade body", "regulatory body" }
};
// SIC codes that indicate non-trading or non-employment entities
// NOTE(review): no use of this set is visible in this part of the file - confirm it is still referenced.
private static readonly HashSet<string> NonTradingSicCodes = new()
{
"99999", // Dormant company
"64209", // Activities of holding companies (shell companies)
"68100", // Buying and selling of own real estate (often shell)
};
// Words that indicate a subsidiary rather than the main trading company
// When someone says they worked for "ASDA", they likely mean ASDA STORES LIMITED,
// not ASDA DELIVERY LIMITED or ASDA PROPERTY HOLDINGS LIMITED
// Enumerated by CalculateCompanyPriorityScore to down-rank candidate titles containing one of these words.
private static readonly HashSet<string> SubsidiaryIndicators = new(StringComparer.OrdinalIgnoreCase)
{
// Logistics/Operations subsidiaries
"delivery", "distribution", "logistics", "transport", "fleet", "haulage", "warehousing", "fulfilment",
// Property subsidiaries
"property", "properties", "estates", "land", "real estate", "developments",
// Financial/Holding subsidiaries
"holdings", "holding", "investments", "capital", "finance", "financial", "treasury",
// Administrative subsidiaries
"nominees", "nominee", "trustees", "trustee", "secretarial", "registrars",
// Insurance subsidiaries
"insurance", "assurance", "underwriting",
// Specific function subsidiaries
"leasing", "rentals", "procurement", "sourcing"
};
// Words that indicate a main trading/employer company (prefer these)
// Enumerated by CalculateCompanyPriorityScore to boost candidate titles containing one of these words.
private static readonly HashSet<string> MainCompanyIndicators = new(StringComparer.OrdinalIgnoreCase)
{
"stores", "retail", "supermarkets", "superstores", "hypermarkets",
"manufacturing", "operations", "trading"
};
/// <summary>
/// Creates the verifier from its three collaborators: the Companies House client,
/// the DbContext factory used for the company cache, and the logger.
/// </summary>
public CompanyVerifierService(
    CompaniesHouseClient companiesHouseClient,
    IDbContextFactory<ApplicationDbContext> dbContextFactory,
    ILogger<CompanyVerifierService> logger)
{
    // Single tuple assignment; no validation is performed on the arguments.
    (_companiesHouseClient, _dbContextFactory, _logger) =
        (companiesHouseClient, dbContextFactory, logger);
}
/// <summary>
/// Verifies a claimed employer against the local company cache and, failing that, Companies House,
/// then runs the date / dormancy / size / job-title checks and returns the outcome with any flags raised.
/// </summary>
/// <param name="companyName">Claimed employer name; must not be null, empty or whitespace.</param>
/// <param name="startDate">Claimed employment start date, if known.</param>
/// <param name="endDate">Claimed employment end date; null is treated by the checks as current employment.</param>
/// <param name="jobTitle">Claimed job title, used by the dormancy, size and plausibility checks.</param>
/// <returns>
/// A verified result (with flags) when a cached or searched company matches; otherwise an unverified result
/// whose VerificationNotes explains why (no match, or rate limiting).
/// </returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="companyName"/> is null, empty or whitespace.</exception>
public async Task<CompanyVerificationResult> VerifyCompanyAsync(
string companyName,
DateOnly? startDate,
DateOnly? endDate,
string? jobTitle = null)
{
ArgumentException.ThrowIfNullOrWhiteSpace(companyName);
_logger.LogDebug("Verifying company: {CompanyName}", companyName);
var flags = new List<CompanyVerificationFlag>();
// Try to find a cached match first (but only if it existed at claimed start date)
var cachedMatch = await FindCachedMatchAsync(companyName);
if (cachedMatch is not null)
{
// Check if cached company existed at the claimed start date
// (an unknown start date or unknown incorporation date is treated as "existed")
var cacheValid = !startDate.HasValue ||
cachedMatch.IncorporationDate == null ||
cachedMatch.IncorporationDate <= startDate.Value;
if (cacheValid)
{
_logger.LogDebug("Found cached company match for: {CompanyName}", companyName);
return CreateResultFromCache(cachedMatch, companyName, startDate, endDate, jobTitle, flags);
}
else
{
_logger.LogDebug("Cached company {CachedName} was incorporated after claimed start date, searching for alternatives", cachedMatch.CompanyName);
}
}
// Search Companies House with fallback queries
try
{
var searchQueries = GenerateSearchQueries(companyName);
_logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
searchQueries.Count, companyName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));
// Queries are tried in list order; the first query that yields an acceptable match wins.
(CompaniesHouseSearchItem Item, int Score)? bestMatch = null;
foreach (var query in searchQueries)
{
_logger.LogDebug("Searching Companies House with query: {Query}", query);
var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(query);
if (searchResponse?.Items is null || searchResponse.Items.Count == 0)
{
continue;
}
// Find best fuzzy match, preferring companies that existed at claimed start date
// Pass both original name and search query for matching flexibility
bestMatch = FindBestMatch(companyName, query, searchResponse.Items, startDate);
if (bestMatch is not null)
{
_logger.LogDebug("Found match with query '{Query}': {Company}", query, bestMatch.Value.Item.Title);
break;
}
}
if (bestMatch is null)
{
_logger.LogDebug("No valid match found for: {CompanyName} after trying {Count} queries", companyName, searchQueries.Count);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Company name could not be verified against official records");
}
var match = bestMatch.Value;
// Fetch full company details for additional data
var companyDetails = await _companiesHouseClient.GetCompanyAsync(match.Item.CompanyNumber);
// Cache the matched company with full details
await CacheCompanyAsync(match.Item, companyDetails);
_logger.LogInformation(
"Verified company {ClaimedName} matched to {MatchedName} with score {Score}%",
companyName, match.Item.Title, match.Score);
// Run all verification checks
// Dates, status and type are taken from the search item; SIC codes prefer the detail record
// and the accounts category is only available from the detail record.
var incorporationDate = DateHelpers.ParseDate(match.Item.DateOfCreation);
var dissolutionDate = DateHelpers.ParseDate(match.Item.DateOfCessation);
var companyStatus = match.Item.CompanyStatus;
var companyType = match.Item.CompanyType;
var sicCodes = companyDetails?.SicCodes ?? match.Item.SicCodes;
var accountsCategory = companyDetails?.Accounts?.LastAccounts?.Type;
// Check 1: Employment before company incorporation
CheckIncorporationDate(flags, startDate, incorporationDate, match.Item.Title);
// Check 2: Employment at dissolved company
CheckDissolutionDate(flags, endDate, dissolutionDate, companyStatus, match.Item.Title);
// Check 3: Dormant company check
CheckDormantCompany(flags, accountsCategory, jobTitle, match.Item.Title);
// Check 4: Company size vs job title
CheckCompanySizeVsRole(flags, accountsCategory, jobTitle, match.Item.Title);
// Check 5: Job title plausibility for PLCs
var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, companyType);
if (jobPlausible == false)
{
flags.Add(new CompanyVerificationFlag
{
Type = "ImplausibleJobTitle",
Severity = "Critical",
Message = jobNotes ?? "Job title requires verification",
ScoreImpact = -15
});
}
// A match is reported as verified even when flags were raised; the flags carry the concerns.
return new CompanyVerificationResult
{
ClaimedCompany = companyName,
MatchedCompanyName = match.Item.Title,
MatchedCompanyNumber = match.Item.CompanyNumber,
MatchScore = match.Score,
IsVerified = true,
VerificationNotes = null,
ClaimedStartDate = startDate,
ClaimedEndDate = endDate,
CompanyType = companyType,
CompanyStatus = companyStatus,
IncorporationDate = incorporationDate,
DissolutionDate = dissolutionDate,
AccountsCategory = accountsCategory,
SicCodes = sicCodes,
ClaimedJobTitle = jobTitle,
JobTitlePlausible = jobPlausible,
JobTitleNotes = jobNotes,
Flags = flags
};
}
// Only rate limiting is handled here; any other exception from the search, detail fetch
// or cache write propagates to the caller.
catch (CompaniesHouseRateLimitException ex)
{
_logger.LogWarning(ex, "Rate limit hit while verifying company: {CompanyName}", companyName);
return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
"Verification temporarily unavailable due to rate limiting");
}
}
/// <summary>
/// Runs a raw Companies House search and projects each hit into a <see cref="CompanySearchResult"/>.
/// </summary>
/// <param name="query">Search text; must not be null, empty or whitespace.</param>
/// <returns>One result per returned item, or an empty list when the response has no item collection.</returns>
/// <exception cref="ArgumentException">Thrown when <paramref name="query"/> is null, empty or whitespace.</exception>
public async Task<List<CompanySearchResult>> SearchCompaniesAsync(string query)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(query);
    _logger.LogDebug("Searching companies for query: {Query}", query);

    var hits = (await _companiesHouseClient.SearchCompaniesAsync(query))?.Items;
    if (hits is null)
    {
        return [];
    }

    var results = new List<CompanySearchResult>();
    foreach (var hit in hits)
    {
        results.Add(new CompanySearchResult
        {
            CompanyNumber = hit.CompanyNumber,
            CompanyName = hit.Title,
            // A missing status is surfaced as the literal "Unknown".
            CompanyStatus = hit.CompanyStatus ?? "Unknown",
            IncorporationDate = DateHelpers.ParseDate(hit.DateOfCreation),
            AddressSnippet = hit.AddressSnippet
        });
    }

    return results;
}
/// <summary>
/// Checks whether the candidate appears among a company's officers (director or secretary roles)
/// with an appointment period that overlaps the claimed dates.
/// </summary>
/// <param name="companyNumber">Companies House company number to look up officers for.</param>
/// <param name="candidateName">Candidate's name as claimed.</param>
/// <param name="startDate">Claimed start date, if known.</param>
/// <param name="endDate">Claimed end date, if known.</param>
/// <returns>
/// <c>true</c> when a matching officer is found; <c>false</c> when officers exist but none match;
/// <c>null</c> when inputs are blank, no officers are returned, or the lookup fails.
/// </returns>
public async Task<bool?> VerifyDirectorAsync(
    string companyNumber,
    string candidateName,
    DateOnly? startDate,
    DateOnly? endDate)
{
    if (string.IsNullOrWhiteSpace(companyNumber) || string.IsNullOrWhiteSpace(candidateName))
    {
        return null;
    }

    try
    {
        var officerResponse = await _companiesHouseClient.GetOfficersAsync(companyNumber);
        var officerItems = officerResponse?.Items;
        if (officerItems is null || officerItems.Count == 0)
        {
            _logger.LogDebug("No officers found for company {CompanyNumber}", companyNumber);
            return null;
        }

        var claimHasDates = startDate.HasValue || endDate.HasValue;
        var candidateKey = NormalizeName(candidateName);

        foreach (var officer in officerItems)
        {
            // Only director-like and secretary roles are considered.
            var roleText = officer.OfficerRole?.ToLowerInvariant() ?? "";
            var isRelevantRole = roleText.Contains("director") || roleText.Contains("secretary");
            if (!isRelevantRole)
            {
                continue;
            }

            // Names must be a close fuzzy match (score of at least 80).
            var nameScore = Fuzz.Ratio(candidateKey, NormalizeName(officer.Name));
            if (nameScore < 80)
            {
                continue;
            }

            var appointedOn = DateHelpers.ParseDate(officer.AppointedOn);
            var resignedOn = DateHelpers.ParseDate(officer.ResignedOn);

            // Without any claimed dates a name match alone is sufficient.
            if (!claimHasDates)
            {
                _logger.LogDebug(
                    "Found matching director {OfficerName} for candidate {CandidateName} at company {CompanyNumber}",
                    officer.Name, candidateName, companyNumber);
                return true;
            }

            if (DatesOverlap(startDate, endDate, appointedOn, resignedOn))
            {
                _logger.LogDebug(
                    "Verified director {OfficerName} matches candidate {CandidateName} with overlapping dates",
                    officer.Name, candidateName);
                return true;
            }
        }

        _logger.LogDebug(
            "No matching director found for candidate {CandidateName} at company {CompanyNumber}",
            candidateName, companyNumber);
        return false;
    }
    catch (CompaniesHouseRateLimitException)
    {
        _logger.LogWarning("Rate limit hit while verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
    catch (Exception ex)
    {
        // Any other failure is logged and reported as "unknown" rather than thrown.
        _logger.LogError(ex, "Error verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
}
/// <summary>
/// Upper-cases and trims a person's name and, when it contains a comma, moves the text after the
/// first comma in front of the text before it ("SURNAME, Firstname" becomes "FIRSTNAME SURNAME").
/// </summary>
/// <param name="name">Raw name; null, empty or whitespace yields an empty string.</param>
/// <returns>The normalized name used for fuzzy comparison.</returns>
private static string NormalizeName(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return "";
    }

    var upper = name.ToUpperInvariant().Trim();

    // Companies House often stores names as "SURNAME, Firstname"; only the first comma splits.
    var commaAt = upper.IndexOf(',');
    if (commaAt < 0)
    {
        return upper;
    }

    var beforeComma = upper[..commaAt].Trim();
    var afterComma = upper[(commaAt + 1)..].Trim();
    return $"{afterComma} {beforeComma}";
}
/// <summary>
/// Tests whether two date ranges overlap, treating a missing bound as open-ended.
/// A range with neither bound set is considered to overlap everything.
/// </summary>
/// <returns><c>true</c> when the ranges share at least one day (bounds inclusive).</returns>
private static bool DatesOverlap(DateOnly? start1, DateOnly? end1, DateOnly? start2, DateOnly? end2)
{
    var firstIsUnbounded = start1 is null && end1 is null;
    var secondIsUnbounded = start2 is null && end2 is null;
    if (firstIsUnbounded || secondIsUnbounded)
    {
        return true;
    }

    // Missing bounds extend to the earliest / latest representable date.
    var firstStart = start1.GetValueOrDefault(DateOnly.MinValue);
    var firstEnd = end1.GetValueOrDefault(DateOnly.MaxValue);
    var secondStart = start2.GetValueOrDefault(DateOnly.MinValue);
    var secondEnd = end2.GetValueOrDefault(DateOnly.MaxValue);

    // Disjoint only when one range starts after the other has ended.
    var disjoint = firstStart > secondEnd || secondStart > firstEnd;
    return !disjoint;
}
#region Verification Checks
/// <summary>
/// Adds a critical "EmploymentBeforeIncorporation" flag when the claimed start date precedes the
/// company's incorporation date. Does nothing when either date is unknown.
/// </summary>
private static void CheckIncorporationDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedStartDate,
    DateOnly? incorporationDate,
    string companyName)
{
    var startsBeforeIncorporation =
        claimedStartDate.HasValue &&
        incorporationDate.HasValue &&
        claimedStartDate.Value < incorporationDate.Value;

    if (!startsBeforeIncorporation)
    {
        return;
    }

    var flag = new CompanyVerificationFlag
    {
        Type = "EmploymentBeforeIncorporation",
        Severity = "Critical",
        Message = $"Claimed employment at '{companyName}' starting {claimedStartDate:MMM yyyy} is before company incorporation date {incorporationDate:MMM yyyy}",
        ScoreImpact = -20
    };
    flags.Add(flag);
}
/// <summary>
/// Flags employment claimed at a company after it ceased to exist. Applies only when a dissolution
/// date is known and the status is "dissolved", "liquidation" or "administration" (case-insensitive).
/// A claimed end date more than three months after dissolution, or no end date at all
/// (current employment), each raises a critical flag.
/// </summary>
private static void CheckDissolutionDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedEndDate,
    DateOnly? dissolutionDate,
    string? companyStatus,
    string companyName)
{
    var status = companyStatus?.ToLowerInvariant();
    var isDissolvedStatus = status == "dissolved" || status == "liquidation" || status == "administration";
    if (!dissolutionDate.HasValue || !isDissolvedStatus)
    {
        return;
    }

    // Allow 3 month buffer for wind-down
    var latestPlausibleEnd = dissolutionDate.Value.AddMonths(3);

    if (!claimedEndDate.HasValue)
    {
        // No end date means the candidate claims to still work there.
        flags.Add(new CompanyVerificationFlag
        {
            Type = "CurrentEmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claims current employment at '{companyName}' but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -25
        });
        return;
    }

    if (claimedEndDate.Value > latestPlausibleEnd)
    {
        flags.Add(new CompanyVerificationFlag
        {
            Type = "EmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claimed employment at '{companyName}' until {claimedEndDate:MMM yyyy} but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -20
        });
    }
}
/// <summary>
/// Adds an "EmploymentAtDormantCompany" warning when the company files dormant accounts and the
/// claimed job title is not a director or company secretary role. A missing job title is treated
/// as a non-director role.
/// </summary>
private static void CheckDormantCompany(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    if (string.IsNullOrWhiteSpace(accountsCategory))
    {
        return;
    }

    if (!accountsCategory.ToLowerInvariant().Contains("dormant"))
    {
        return;
    }

    // Directors can maintain dormant companies, but other roles are suspicious
    var loweredTitle = jobTitle?.ToLowerInvariant() ?? "";
    if (loweredTitle.Contains("director") || loweredTitle.Contains("company secretary"))
    {
        return;
    }

    flags.Add(new CompanyVerificationFlag
    {
        Type = "EmploymentAtDormantCompany",
        Severity = "Warning",
        Message = $"Claimed active employment as '{jobTitle}' at '{companyName}' which files dormant accounts",
        ScoreImpact = -10
    });
}
/// <summary>
/// Adds a "SeniorRoleAtMicroCompany" warning when the company files micro-entity accounts and the
/// claimed title contains a senior-management marker (VP, head of, chief, director of, ...).
/// Does nothing when either the accounts category or the job title is blank.
/// </summary>
private static void CheckCompanySizeVsRole(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    if (string.IsNullOrWhiteSpace(accountsCategory) || string.IsNullOrWhiteSpace(jobTitle))
    {
        return;
    }

    // Micro-entity: < 10 employees, < £632k turnover
    if (!accountsCategory.ToLowerInvariant().Contains("micro"))
    {
        return;
    }

    // Substrings that mark a senior management title.
    string[] seniorMarkers =
    {
        "vp", "vice president", "head of", "chief", "director of", "senior director"
    };
    var loweredTitle = jobTitle.ToLowerInvariant();
    var isSeniorRole = seniorMarkers.Any(marker => loweredTitle.Contains(marker));
    if (!isSeniorRole)
    {
        return;
    }

    flags.Add(new CompanyVerificationFlag
    {
        Type = "SeniorRoleAtMicroCompany",
        Severity = "Warning",
        Message = $"Claimed senior role '{jobTitle}' at '{companyName}' which files micro-entity accounts (typically <10 employees)",
        ScoreImpact = -10
    });
}
/// <summary>
/// Judges whether a claimed job title is plausible for the matched company type.
/// Only PLCs are scrutinised: C-suite, board-level and VP-level titles at a PLC are reported as
/// implausible (needing verification); every other combination is reported as plausible.
/// </summary>
/// <param name="jobTitle">Claimed job title.</param>
/// <param name="companyType">Company type text; a value containing "plc" or "public limited" marks a PLC.</param>
/// <returns>
/// <c>(null, null)</c> when either input is blank; <c>(false, notes)</c> for a senior title at a PLC;
/// otherwise <c>(true, null)</c>.
/// </returns>
private static (bool? IsPlausible, string? Notes) CheckJobTitlePlausibility(string? jobTitle, string? companyType)
{
    if (string.IsNullOrWhiteSpace(jobTitle) || string.IsNullOrWhiteSpace(companyType))
    {
        return (null, null);
    }

    var title = jobTitle.Trim().ToLowerInvariant();
    var type = companyType.Trim().ToLowerInvariant();

    // Check if this is a PLC (Public Limited Company) - these are large companies
    var isPlc = type.Contains("plc") || type.Contains("public limited");
    if (!isPlc)
    {
        return (true, null);
    }

    // Acronyms are matched as whole words. Substring matching would make "cto" hit "director",
    // "coo" hit "coordinator", "cio" hit "social" and "vp" hit "vpn".
    var words = new HashSet<string>(
        new string(title.Select(c => char.IsLetterOrDigit(c) ? c : ' ').ToArray())
            .Split(' ', StringSplitOptions.RemoveEmptyEntries),
        StringComparer.Ordinal);

    bool HasWord(params string[] candidates) => candidates.Any(candidate => words.Contains(candidate));
    bool HasPhrase(params string[] phrases) => phrases.Any(phrase => title.Contains(phrase));

    // "Vice president" is a VP-level title, not a president, so it is excluded here
    // and picked up by the VP check below.
    var isPresident = words.Contains("president") && !title.Contains("vice president");

    // Check for C-suite / very senior roles
    var isCsuiteRole =
        HasWord("ceo", "cto", "cfo", "coo", "cio") ||
        HasPhrase(
            "chief executive", "chief technology", "chief financial", "chief operating",
            "chief information", "managing director", "chairman", "chairwoman", "chairperson") ||
        title == "md" ||
        isPresident;

    // Check for board-level roles; a bare "director" counts, "director of X" does not
    var isBoardRole =
        HasPhrase("board member", "non-executive director", "executive director") ||
        title == "director";

    if (isCsuiteRole || isBoardRole)
    {
        return (false, $"Claimed senior role '{jobTitle}' at a PLC requires verification - C-suite positions at public companies are publicly disclosed");
    }

    // Check for VP/SVP at PLCs (also usually disclosed)
    var isVpRole = HasPhrase("vice president") || HasWord("vp", "svp", "evp");
    if (isVpRole)
    {
        return (false, $"Claimed VP-level role '{jobTitle}' at a PLC - senior positions at public companies should be verifiable");
    }

    return (true, null);
}
#endregion
#region Helper Methods
/// <summary>
/// Looks for a previously cached company whose name fuzzy-matches the claimed name.
/// Only cache rows written within the last <c>CacheExpirationDays</c> days are considered; the
/// highest-scoring row at or above <c>FuzzyMatchThreshold</c> wins (first row wins on ties).
/// </summary>
/// <param name="companyName">Claimed company name.</param>
/// <returns>The best matching cache row, or <c>null</c> when nothing qualifies.</returns>
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
{
    var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);

    // Upper-case the claimed name once rather than once per cached row.
    var claimedUpper = companyName.ToUpperInvariant();

    await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

    // Rows are only read here and the context is disposed on return, so change tracking is unnecessary.
    var cachedCompanies = await dbContext.CompanyCache
        .AsNoTracking()
        .Where(c => c.CachedAt >= cutoffDate)
        .ToListAsync();

    // NOTE(review): this lookup uses only the fuzzy score; it does not apply the core-identifier or
    // subsidiary-priority rules that FindBestMatch applies to live search results - confirm that a
    // cached subsidiary cannot shadow the intended company.
    CompanyCache? best = null;
    var bestScore = FuzzyMatchThreshold - 1;
    foreach (var candidate in cachedCompanies)
    {
        if (string.IsNullOrWhiteSpace(candidate.CompanyName))
        {
            continue;
        }

        var score = Fuzz.TokenSetRatio(claimedUpper, candidate.CompanyName.ToUpperInvariant());

        // Strictly greater keeps the earliest row on ties, matching a stable descending sort.
        if (score > bestScore)
        {
            best = candidate;
            bestScore = score;
        }
    }

    return best;
}
/// <summary>
/// Picks the best Companies House search hit for a claimed company name.
/// Hits are first filtered (core identifiers present, valid employment entity), then scored with
/// a fuzzy ratio against both the claimed name and the query actually searched, and finally
/// ordered by priority score and fuzzy score. When a claimed start date is supplied, only
/// companies whose creation date is unknown or not later than that date are eligible.
/// </summary>
/// <param name="companyName">Company name as claimed.</param>
/// <param name="searchQuery">The query string that produced <paramref name="items"/>.</param>
/// <param name="items">Search hits to choose from.</param>
/// <param name="claimedStartDate">Claimed employment start date, if known.</param>
/// <returns>The selected item with its fuzzy score, or <c>null</c> when no hit qualifies.</returns>
private (CompaniesHouseSearchItem Item, int Score)? FindBestMatch(
string companyName,
string searchQuery,
List<CompaniesHouseSearchItem> items,
DateOnly? claimedStartDate)
{
var normalizedOriginal = companyName.ToUpperInvariant();
var normalizedQuery = searchQuery.ToUpperInvariant();
// Extract core identifying words that MUST appear in any valid match
// This prevents "BMW Group Canada" matching "CANADA LIFE GROUP" just because of common words
// and "Lloyds Bowmaker" matching "LLOYDS ALARMS" (missing "Bowmaker")
// NOTE(review): ExtractCoreIdentifiers is defined outside this view; the Contains checks below
// compare against an upper-cased title, so it presumably returns upper-case words - confirm.
var coreWords = ExtractCoreIdentifiers(companyName);
var queryCoreWords = ExtractCoreIdentifiers(searchQuery);
var originalLower = companyName.ToLowerInvariant();
var queryLower = searchQuery.ToLowerInvariant();
// Determine which entity types the search is explicitly looking for
var searchEntityTypes = GetSearchEntityTypes(originalLower, queryLower);
// Match against both the original company name AND the search query used
// This handles cases like "Matthew Walker (Northern Foods Plc)" where we
// search for "Northern Foods Plc" but need to match against it, not the full name
var matches = items
.Where(item => !string.IsNullOrWhiteSpace(item.Title))
.Where(item =>
{
var itemTitle = item.Title.ToUpperInvariant();
var itemTitleLower = item.Title.ToLowerInvariant();
// Validate that ALL core identifiers appear in the match
// "Lloyds Bowmaker" must have BOTH "LLOYDS" and "BOWMAKER" in the match
var hasAllOriginalCores = coreWords.Count == 0 || coreWords.All(w => itemTitle.Contains(w));
var hasAllQueryCores = queryCoreWords.Count == 0 || queryCoreWords.All(w => itemTitle.Contains(w));
// Rejected only when BOTH sets are incomplete: satisfying either the claimed name's
// or the query's core identifiers is enough to pass this filter.
if (!hasAllOriginalCores && !hasAllQueryCores) return false;
// Filter out non-employment entities unless explicitly searching for that type
if (!IsValidEmploymentEntity(itemTitleLower, searchEntityTypes))
{
return false;
}
return true;
})
.Select(item =>
{
var itemTitle = item.Title.ToUpperInvariant();
var itemTitleLower = item.Title.ToLowerInvariant();
var scoreVsOriginal = Fuzz.TokenSetRatio(normalizedOriginal, itemTitle);
var scoreVsQuery = Fuzz.TokenSetRatio(normalizedQuery, itemTitle);
// The better of the two scores is the item's fuzzy score.
var baseScore = Math.Max(scoreVsOriginal, scoreVsQuery);
// Calculate priority adjustment for main company vs subsidiary
var priorityScore = CalculateCompanyPriorityScore(itemTitleLower, originalLower, queryLower);
return (Item: item, Score: baseScore, PriorityScore: priorityScore);
})
.Where(m => m.Score >= FuzzyMatchThreshold)
.ToList();
_logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery);
// Only the first five candidates are logged, in search-result order (not yet sorted).
foreach (var m in matches.Take(5))
{
_logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, Priority: {Priority}, DateOfCreation: {Date}",
m.Item.Title, m.Item.CompanyNumber, m.Score, m.PriorityScore, m.Item.DateOfCreation ?? "null");
}
if (matches.Count == 0) return null;
// If we have a claimed start date, prefer companies that existed at that time
if (claimedStartDate.HasValue)
{
_logger.LogDebug("Filtering for companies that existed at claimed start date: {StartDate}", claimedStartDate.Value);
var existedAtStartDate = matches
.Where(m =>
{
var incDate = DateHelpers.ParseDate(m.Item.DateOfCreation);
// A company with no parseable creation date is treated as having existed.
var existed = incDate == null || incDate <= claimedStartDate.Value;
_logger.LogDebug(" {Title}: IncDate={IncDate}, Existed={Existed}",
m.Item.Title, incDate?.ToString() ?? "null", existed);
return existed;
})
// Sort by priority first, then by fuzzy score
.OrderByDescending(m => m.PriorityScore)
.ThenByDescending(m => m.Score)
.ToList();
_logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count);
// If any matches existed at the start date, prefer those
if (existedAtStartDate.Count > 0)
{
var selected = existedAtStartDate[0];
_logger.LogDebug("Selected: {Title} ({Number}), Priority: {Priority}", selected.Item.Title, selected.Item.CompanyNumber, selected.PriorityScore);
return (selected.Item, selected.Score);
}
// No companies existed at the claimed start date - don't match a wrong company
_logger.LogDebug("No companies found that existed at claimed start date {StartDate}, returning no match", claimedStartDate.Value);
return null;
}
// No start date provided - sort by priority then score
var fallback = matches
.OrderByDescending(m => m.PriorityScore)
.ThenByDescending(m => m.Score)
.First();
_logger.LogDebug("No start date filter, using highest priority: {Title} ({Number}), Priority: {Priority}", fallback.Item.Title, fallback.Item.CompanyNumber, fallback.PriorityScore);
return (fallback.Item, fallback.Score);
}
/// <summary>
/// Calculates a priority score for company matching.
/// Higher scores = more likely to be the main employer company.
/// Penalizes subsidiaries (delivery, property, holdings, etc.) unless explicitly searched for.
/// Boosts main trading companies (stores, retail, etc.) and names ending in " plc".
/// </summary>
/// <param name="itemTitleLower">Lower-cased title of the candidate company.</param>
/// <param name="originalLower">Lower-cased company name as claimed.</param>
/// <param name="queryLower">Lower-cased search query that produced the candidate.</param>
/// <returns>-10 for an unrequested subsidiary word, +5 for a main-company word, +3 for a PLC; summed.</returns>
private static int CalculateCompanyPriorityScore(string itemTitleLower, string originalLower, string queryLower)
{
    // True when phrase occurs in text with no letter or digit directly before or after it.
    static bool ContainsWholeWord(string text, string phrase)
    {
        var from = 0;
        while (from <= text.Length - phrase.Length)
        {
            var at = text.IndexOf(phrase, from, StringComparison.Ordinal);
            if (at < 0)
            {
                return false;
            }

            var end = at + phrase.Length;
            var cleanStart = at == 0 || !char.IsLetterOrDigit(text[at - 1]);
            var cleanEnd = end == text.Length || !char.IsLetterOrDigit(text[end]);
            if (cleanStart && cleanEnd)
            {
                return true;
            }

            from = at + 1;
        }

        return false;
    }

    var score = 0;

    // Check if search explicitly mentions subsidiary indicators
    var searchText = originalLower + " " + queryLower;

    // Penalize subsidiary indicators (unless search explicitly included them).
    // The title is matched on whole words so that "land" does not penalize "iceland" or
    // "scotland", and "fleet" does not penalize "fleetwood".
    foreach (var indicator in SubsidiaryIndicators)
    {
        if (!ContainsWholeWord(itemTitleLower, indicator))
        {
            continue;
        }

        // Only penalize if the search didn't explicitly include this indicator
        if (!searchText.Contains(indicator))
        {
            score -= 10; // Significant penalty for subsidiaries
        }

        break; // Only the first indicator found in the title is considered
    }

    // Boost main company indicators (substring match, so "retailers" also counts as "retail")
    foreach (var indicator in MainCompanyIndicators)
    {
        if (itemTitleLower.Contains(indicator))
        {
            score += 5; // Boost for main trading companies
            break; // Only apply one boost
        }
    }

    // Slight boost for PLC (usually the parent/main company); ordinal so culture cannot affect it
    if (itemTitleLower.EndsWith(" plc", StringComparison.Ordinal))
    {
        score += 3;
    }

    return score;
}
/// <summary>
/// Inserts or refreshes the cache row for a matched company, keyed by company number.
/// Name, status, type and dates come from the search item; SIC codes prefer the detail record
/// and the accounts category comes only from the detail record.
/// A primary-key conflict caused by a concurrent insert is logged and ignored.
/// </summary>
/// <param name="item">Search item that was matched.</param>
/// <param name="details">Full company record, when it could be fetched.</param>
private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)
{
    try
    {
        await using var db = await _dbContextFactory.CreateDbContextAsync();
        var row = await db.CompanyCache
            .FirstOrDefaultAsync(c => c.CompanyNumber == item.CompanyNumber);

        var sicCodes = details?.SicCodes ?? item.SicCodes;
        var serializedSicCodes = sicCodes is null ? null : JsonSerializer.Serialize(sicCodes);
        var accountsType = details?.Accounts?.LastAccounts?.Type;

        // Values shared by the insert and update paths.
        var status = item.CompanyStatus ?? "Unknown";
        var incorporatedOn = DateHelpers.ParseDate(item.DateOfCreation);
        var dissolvedOn = DateHelpers.ParseDate(item.DateOfCessation);

        if (row is null)
        {
            db.CompanyCache.Add(new CompanyCache
            {
                CompanyNumber = item.CompanyNumber,
                CompanyName = item.Title,
                Status = status,
                CompanyType = item.CompanyType,
                IncorporationDate = incorporatedOn,
                DissolutionDate = dissolvedOn,
                AccountsCategory = accountsType,
                SicCodesJson = serializedSicCodes,
                CachedAt = DateTime.UtcNow
            });
        }
        else
        {
            row.CompanyName = item.Title;
            row.Status = status;
            row.CompanyType = item.CompanyType;
            row.IncorporationDate = incorporatedOn;
            row.DissolutionDate = dissolvedOn;
            row.AccountsCategory = accountsType;
            row.SicCodesJson = serializedSicCodes;
            row.CachedAt = DateTime.UtcNow;
        }

        await db.SaveChangesAsync();
    }
    catch (DbUpdateException ex) when (ex.InnerException?.Message.Contains("PK_CompanyCache") == true)
    {
        // Race condition: another task already cached this company - ignore
        _logger.LogDebug("Company {CompanyNumber} already cached by another task", item.CompanyNumber);
    }
}
/// <summary>
/// Builds a verified result from a cached company row, running the same incorporation,
/// dissolution, dormancy, size and job-title checks that a live match goes through.
/// </summary>
/// <param name="cached">Cache row that matched the claimed name.</param>
/// <param name="claimedCompany">Company name as claimed.</param>
/// <param name="startDate">Claimed start date, if known.</param>
/// <param name="endDate">Claimed end date, if known.</param>
/// <param name="jobTitle">Claimed job title, if known.</param>
/// <param name="flags">Flag list to append to; it is also attached to the returned result.</param>
/// <returns>A result with <c>IsVerified = true</c> populated from the cache row.</returns>
private CompanyVerificationResult CreateResultFromCache(
    CompanyCache cached,
    string claimedCompany,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    List<CompanyVerificationFlag> flags)
{
    var matchScore = Fuzz.TokenSetRatio(
        claimedCompany.ToUpperInvariant(),
        cached.CompanyName.ToUpperInvariant());

    List<string>? sicCodes = null;
    if (!string.IsNullOrEmpty(cached.SicCodesJson))
    {
        try
        {
            sicCodes = JsonSerializer.Deserialize<List<string>>(cached.SicCodesJson);
        }
        catch (JsonException ex)
        {
            // Malformed cache content must not block verification: record it and carry on
            // without SIC codes instead of swallowing the error silently.
            _logger.LogWarning(ex,
                "Ignoring malformed SIC codes JSON in cache for company {CompanyNumber}",
                cached.CompanyNumber);
        }
    }

    // Run all verification checks
    CheckIncorporationDate(flags, startDate, cached.IncorporationDate, cached.CompanyName);
    CheckDissolutionDate(flags, endDate, cached.DissolutionDate, cached.Status, cached.CompanyName);
    CheckDormantCompany(flags, cached.AccountsCategory, jobTitle, cached.CompanyName);
    CheckCompanySizeVsRole(flags, cached.AccountsCategory, jobTitle, cached.CompanyName);

    var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, cached.CompanyType);
    if (jobPlausible == false)
    {
        flags.Add(new CompanyVerificationFlag
        {
            Type = "ImplausibleJobTitle",
            Severity = "Critical",
            Message = jobNotes ?? "Job title requires verification",
            ScoreImpact = -15
        });
    }

    // A cached match is reported as verified even when flags were raised.
    return new CompanyVerificationResult
    {
        ClaimedCompany = claimedCompany,
        MatchedCompanyName = cached.CompanyName,
        MatchedCompanyNumber = cached.CompanyNumber,
        MatchScore = matchScore,
        IsVerified = true,
        VerificationNotes = null,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        CompanyType = cached.CompanyType,
        CompanyStatus = cached.Status,
        IncorporationDate = cached.IncorporationDate,
        DissolutionDate = cached.DissolutionDate,
        AccountsCategory = cached.AccountsCategory,
        SicCodes = sicCodes,
        ClaimedJobTitle = jobTitle,
        JobTitlePlausible = jobPlausible,
        JobTitleNotes = jobNotes,
        Flags = flags
    };
}
/// <summary>
/// Builds the result returned when no company could be matched: no matched name or number,
/// a match score of 0, <c>IsVerified = false</c>, and the reason carried in VerificationNotes.
/// </summary>
private static CompanyVerificationResult CreateUnverifiedResult(
    string companyName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    string reason) =>
    new CompanyVerificationResult
    {
        // Claim details are echoed back unchanged.
        ClaimedCompany = companyName,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        ClaimedJobTitle = jobTitle,

        // Nothing was matched.
        MatchedCompanyName = null,
        MatchedCompanyNumber = null,
        MatchScore = 0,
        IsVerified = false,
        VerificationNotes = reason
    };
/// <summary>
/// Generates alternative search queries to find companies that may be registered
/// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd").
/// Also handles "Brand (Parent Company)" format by extracting and prioritizing the parent,
/// "Name1/Name2" format by trying each part, and the first word as a possible parent company.
/// </summary>
/// <param name="companyName">Company name as claimed.</param>
/// <returns>
/// Distinct (case-insensitive) queries in priority order: parent, brand, slash parts,
/// first word, then the full name. Callers try them in this order.
/// </returns>
private static List<string> GenerateSearchQueries(string companyName)
{
    // Priority order matters to the caller, so keep it in a list and use the set only for
    // de-duplication; HashSet enumeration order is not a documented guarantee.
    var ordered = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    void AddAll(IEnumerable<string> candidates)
    {
        foreach (var candidate in candidates)
        {
            if (seen.Add(candidate))
            {
                ordered.Add(candidate);
            }
        }
    }

    var normalized = companyName.Trim();

    // Step 0a: Check for "Brand (Parent Company)" format and extract parent company
    // Parent company is more likely to be the registered name, so search it first
    var parentMatch = System.Text.RegularExpressions.Regex.Match(normalized, @"\(([^)]+)\)\s*$");
    if (parentMatch.Success)
    {
        var parentCompany = parentMatch.Groups[1].Value.Trim();

        // Generate queries for parent company first (higher priority)
        AddAll(GenerateNameVariations(parentCompany));

        // Also try the brand name without parenthetical
        var brandName = normalized[..parentMatch.Index].Trim();
        if (brandName.Length >= 3)
        {
            AddAll(GenerateNameVariations(brandName));
        }
    }

    // Step 0b: Check for "Name1/Name2" format (e.g., "ASDA/WALMART")
    // Try each part separately as they may be different registered names
    if (normalized.Contains('/'))
    {
        var parts = normalized.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        foreach (var part in parts)
        {
            if (part.Length >= 3)
            {
                AddAll(GenerateNameVariations(part));
            }
        }
    }

    // Step 0c: Try first word as potential parent company (e.g., "UNILEVER BESTFOOD" -> "UNILEVER")
    // Many company names are "ParentCompany Division" or "ParentCompany Brand"
    // NOTE(review): these broad first-word queries are tried BEFORE the full claimed name below,
    // so a first-word hit can win over a more specific full-name match - confirm this is intended.
    var words = normalized.Split(' ', StringSplitOptions.RemoveEmptyEntries);
    if (words.Length >= 2)
    {
        var firstWord = words[0];

        // Only try if first word is substantial (not "The", "A", common prefixes)
        var skipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            { "the", "a", "an", "uk", "british", "national", "international", "global", "new" };
        if (firstWord.Length >= 4 && !skipWords.Contains(firstWord))
        {
            AddAll(GenerateNameVariations(firstWord));

            // Also try first word + PLC/Limited for major corporations
            AddAll(new[] { firstWord + " PLC", firstWord + " Limited" });
        }
    }

    // Also add variations of the full original name
    AddAll(GenerateNameVariations(normalized));

    return ordered;
}
/// <summary>
/// Generates name variations for a single company name: "UK" vs "U.K." spellings,
/// "Ltd"/"Limited" and "PLC"/"Public Limited Company" suffix swaps, and the bare core name
/// (suffix removed) with " Limited" and " PLC" re-attached.
/// </summary>
/// <param name="name">Company name to vary; it is always the first entry returned.</param>
/// <returns>Distinct (case-insensitive) variations in the order they were generated.</returns>
private static List<string> GenerateNameVariations(string name)
{
    const string LtdSuffix = " Ltd";
    const string LimitedSuffix = " Limited";
    const string PlcSuffix = " PLC";
    const string PublicLimitedCompanySuffix = " Public Limited Company";

    // Keep generation order explicit; the set is used only for case-insensitive de-duplication.
    var ordered = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    void Add(string candidate)
    {
        if (seen.Add(candidate))
        {
            ordered.Add(candidate);
        }
    }

    // Rewrites a standalone "UK" that follows a space as "U.K.". The trailing lookahead stops
    // words that merely start with "uk" (e.g. "Ukraine") from being mangled into "U.K.raine".
    static string WithDottedUk(string value) =>
        System.Text.RegularExpressions.Regex.Replace(
            value,
            @"(?<= )UK(?![\p{L}\p{N}])",
            "U.K.",
            System.Text.RegularExpressions.RegexOptions.IgnoreCase |
            System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    Add(name);

    // Step 1: Generate UK/U.K. variations
    var ukVariants = new List<string> { name };
    var withDots = WithDottedUk(name);
    if (withDots != name)
    {
        ukVariants.Add(withDots);
    }

    if (name.Contains(" U.K.", StringComparison.OrdinalIgnoreCase))
    {
        // Add UK variant (no dots)
        var withoutDots = name
            .Replace(" U.K. ", " UK ", StringComparison.OrdinalIgnoreCase)
            .Replace(" U.K.", " UK", StringComparison.OrdinalIgnoreCase);
        if (withoutDots != name)
        {
            ukVariants.Add(withoutDots);
        }
    }

    // Step 2: For each UK variant, generate suffix variations.
    // Slice lengths come from the suffix constants so they cannot drift out of sync with the text
    // being removed. Comparisons ignore case, so " Plc" and " plc" are handled by the PLC branch.
    foreach (var variant in ukVariants)
    {
        Add(variant);

        if (variant.EndsWith(LtdSuffix, StringComparison.OrdinalIgnoreCase))
        {
            Add(variant[..^LtdSuffix.Length] + " Limited");
        }
        else if (variant.EndsWith(LimitedSuffix, StringComparison.OrdinalIgnoreCase))
        {
            Add(variant[..^LimitedSuffix.Length] + " Ltd");
        }
        else if (variant.EndsWith(PlcSuffix, StringComparison.OrdinalIgnoreCase))
        {
            Add(variant[..^PlcSuffix.Length] + " Public Limited Company");
        }
        else if (variant.EndsWith(PublicLimitedCompanySuffix, StringComparison.OrdinalIgnoreCase))
        {
            Add(variant[..^PublicLimitedCompanySuffix.Length] + " PLC");
        }
    }

    // Step 3: Try core name without suffix (only the first matching suffix is removed)
    var suffixesToRemove = new[] { " Ltd", " Limited", " PLC", " Plc", " LLP", " Inc", " Corporation", " Corp" };
    var coreName = name;
    foreach (var suffix in suffixesToRemove)
    {
        if (coreName.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
        {
            coreName = coreName[..^suffix.Length].Trim();
            break;
        }
    }

    if (coreName != name && coreName.Length >= 3)
    {
        Add(coreName);
        Add(coreName + " Limited");
        Add(coreName + " PLC");

        // Also add U.K. variant of core name if applicable
        var coreWithDots = WithDottedUk(coreName);
        if (coreWithDots != coreName)
        {
            Add(coreWithDots);
            Add(coreWithDots + " Limited");
        }
    }

    return ordered;
}
/// <summary>
/// Works out which non-employment categories the caller is deliberately searching for.
/// A category is selected when any of its patterns occurs as a substring of the original
/// name or of the query; the returned category names (e.g., "Clubs", "Trusts") are the ones
/// that should NOT be filtered out of the results.
/// </summary>
private static HashSet<string> GetSearchEntityTypes(string originalLower, string queryLower)
{
    // Both inputs are inspected together, separated by a space so they cannot fuse into a new word.
    var combinedSearchText = string.Concat(originalLower, " ", queryLower);

    return NonEmploymentEntityPatterns
        .Where(entry => entry.Value.Any(keyword => combinedSearchText.Contains(keyword)))
        .Select(entry => entry.Key)
        .ToHashSet();
}
/// <summary>
/// Decides whether a company title looks like a genuine employer.
/// Returns false as soon as the title contains a pattern from any non-employment category
/// (clubs, trusts, etc.) that is not listed in <paramref name="allowedCategories"/>;
/// returns true when no such pattern is found.
/// </summary>
private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
{
    // Categories the search explicitly targets are exempt from filtering;
    // a hit in any remaining category disqualifies the title.
    var matchesDisallowedCategory = NonEmploymentEntityPatterns
        .Where(entry => !allowedCategories.Contains(entry.Key))
        .Any(entry => entry.Value.Any(keyword => itemTitleLower.Contains(keyword)));

    return !matchesDisallowedCategory;
}
// Words ignored when extracting a company's core identifiers.
// They occur in so many company names that they cannot tell one company from another.
// Lookups are case-insensitive.
private static readonly HashSet<string> SkipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    // Articles, conjunctions and prepositions
    "the", "a", "an", "and", "or",
    "of", "for", "in", "at", "on", "by", "to", "with",

    // Geography: UK and Ireland
    "uk", "u.k.", "gb", "british", "britain",
    "england", "english", "scotland", "scottish",
    "wales", "welsh", "ireland", "irish", "northern",

    // Geography: rest of the world
    "europe", "european",
    "america", "american", "usa", "us", "u.s.", "u.s.a.",
    "canada", "canadian",
    "asia", "asian", "pacific", "atlantic",
    "australia", "australian", "africa", "african", "india", "indian",
    "france", "french", "germany", "german",
    "spain", "spanish", "italy", "italian",
    "japan", "japanese", "china", "chinese", "korea", "korean",

    // Geography: compass points and areas
    "middle", "east", "west", "north", "south",
    "central", "western", "eastern",

    // Geography: cities
    "london", "manchester", "birmingham", "leeds",
    "glasgow", "edinburgh", "bristol", "liverpool",
    "sheffield", "newcastle", "cardiff", "belfast",
    "nottingham", "southampton", "portsmouth", "brighton",
    "leicester", "coventry", "hull",

    // Legal forms and suffixes
    "limited", "ltd", "plc", "llp", "llc",
    "inc", "incorporated", "corporation", "corp",
    "company", "co", "partners", "partnership",
    "enterprises", "unlimited", "registered",
    "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",

    // Corporate structure
    "group", "holdings", "holding", "parent",
    "subsidiary", "division", "branch",

    // Generic business descriptors
    "services", "service", "solutions", "solution",
    "consulting", "consultants", "consultancy",
    "management", "systems", "system",
    "technologies", "technology", "tech",
    "industries", "industry", "industrial",
    "commercial", "trading", "trade",
    "business", "businesses", "operations", "operational",
    "professional", "professionals",
    "resources", "resource", "network", "networks",
    "associates", "associated",

    // Size and scope
    "national", "international", "global", "worldwide", "world",
    "regional", "local", "universal", "general", "standard",

    // Quality and marketing terms
    "premier", "prime", "new", "modern", "advanced", "innovative",
    "elite", "premium", "quality", "superior", "excellent",
    "best", "top", "leading", "major",

    // Ownership indicators
    "royal", "imperial", "crown", "state",
    "public", "private", "independent",
    "mutual", "cooperative", "coop", "community",

    // Time-related
    "century", "millennium", "annual", "year", "years",

    // Cardinal and ordinal numbers as words
    "one", "two", "three", "four", "five",
    "first", "second", "third"
};
/// <summary>
/// Extracts ALL core identifying words from a company name, upper-cased and in their original order.
/// Parenthesised text is removed, the name is split on spaces, hyphens, slashes and ampersands,
/// and words that are shorter than two characters or listed in <see cref="SkipWords"/> are dropped.
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"],
/// "Bank of Scotland" -> ["BANK"] ("of" and "Scotland" are skip words).
/// </summary>
/// <param name="companyName">Company name to analyse; may be null, empty or whitespace.</param>
/// <returns>The significant words, or an empty list when there are none.</returns>
private static List<string> ExtractCoreIdentifiers(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return new List<string>();
    }

    // Remove parenthetical content first
    var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();

    // Split into words and collect all significant words
    var words = cleanName.Split(new[] { ' ', '-', '/', '&' }, StringSplitOptions.RemoveEmptyEntries);
    var coreWords = new List<string>();

    foreach (var word in words)
    {
        var cleanWord = word.Trim('.', ',', '\'');
        if (cleanWord.Length < 2)
        {
            continue;
        }

        // Trimming strips the final period from dotted abbreviations ("U.K." -> "U.K"),
        // so the dotted skip words ("u.k.", "u.s.", "u.s.a.") are also checked with the
        // period restored; otherwise they could never match.
        if (SkipWords.Contains(cleanWord) || SkipWords.Contains(cleanWord + "."))
        {
            continue;
        }

        coreWords.Add(cleanWord.ToUpperInvariant());
    }

    return coreWords;
}
#endregion
}