- Sort AI candidates by fuzzy match score before taking top 10 This fixes Royal Bank of Scotland matching (was getting arbitrary candidates from Dictionary, now gets most relevant) - Add historical employer recognition (Foster Wheeler, Glaxo, etc.) - Add public sector employer recognition (NHS, councils, etc.) - Add charity/non-profit recognition - Add company division pattern recognition - Improve AI matcher prompt with explicit examples - Add partial company number matching for truncated AI responses - Lower AI confidence threshold to 30% (fuzzy validation as backup) - Add whole-word boundary matching for subsidiary indicators Fixes "SCOTLAND" incorrectly matching "land" pattern - Add 100+ historical polytechnic → university name mappings - Add post-1992 universities and Welsh institutions Results: Employer verification improved from 71% to 85% 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1491 lines
65 KiB
C#
1491 lines
65 KiB
C#
using System.Text.Json;
|
|
using FuzzySharp;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using Microsoft.Extensions.Logging;
|
|
using RealCV.Application.Data;
|
|
using RealCV.Application.DTOs;
|
|
using RealCV.Application.Helpers;
|
|
using RealCV.Application.Interfaces;
|
|
using RealCV.Application.Models;
|
|
using RealCV.Domain.Entities;
|
|
using RealCV.Infrastructure.Data;
|
|
using RealCV.Infrastructure.ExternalApis;
|
|
|
|
namespace RealCV.Infrastructure.Services;
|
|
|
|
public sealed class CompanyVerifierService : ICompanyVerifierService
|
|
{
|
|
/// <summary>Minimum fuzzy score (0-100) a cached company name must reach to count as a match.</summary>
private const int FuzzyMatchThreshold = 85;

/// <summary>Cached company rows older than this many days are ignored.</summary>
private const int CacheExpirationDays = 30;

/// <summary>Client used for Companies House searches, company-profile lookups and officer lookups.</summary>
private readonly CompaniesHouseClient _companiesHouseClient;

/// <summary>Creates a short-lived <see cref="ApplicationDbContext"/> per cache lookup.</summary>
private readonly IDbContextFactory<ApplicationDbContext> _dbContextFactory;

/// <summary>AI-backed matcher that picks the best candidate for a claimed company name.</summary>
private readonly ICompanyNameMatcherService _aiMatcher;

/// <summary>Structured logger for this service.</summary>
private readonly ILogger<CompanyVerifierService> _logger;
|
|
|
|
// Name fragments, grouped by category, for entities that are registered at
// Companies House but are not typical employers (clubs, pension trusts, etc.).
private static readonly Dictionary<string, string[]> NonEmploymentEntityPatterns = new Dictionary<string, string[]>
{
    ["Clubs"] =
    [
        "club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados",
    ],
    ["Associations"] =
    [
        "association", "society", "federation", "institute", "institution", "guild", "chamber of commerce",
    ],
    ["Trusts"] =
    [
        "benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit",
        "superannuation", "provident",
    ],
    ["Charities"] =
    [
        "charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic",
    ],
    ["Investment"] =
    [
        "nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners",
    ],
    ["Property"] =
    [
        "freehold", "leasehold", "property management", "residents association", "management company rtm",
        "commonhold",
    ],
    ["Religious"] =
    [
        "church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry",
    ],
    ["Sports"] =
    [
        "football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club",
    ],
    ["Educational"] =
    [
        "old boys", "old girls", "alumni", "school association", "pta", "parent teacher",
    ],
    ["Professional"] =
    [
        "chartered institute", "royal college", "professional body", "trade body", "regulatory body",
    ],
};
|
|
|
|
// SIC codes treated as markers of a non-trading / non-employment entity.
private static readonly HashSet<string> NonTradingSicCodes = new HashSet<string>
{
    // Dormant company
    "99999",
    // Activities of holding companies (shell companies)
    "64209",
    // Buying and selling of own real estate (often shell)
    "68100",
};
|
|
|
|
// Terms that suggest a registered entity is a subsidiary rather than the main
// trading company. Someone who says they worked for "ASDA" most likely means
// ASDA STORES LIMITED, not ASDA DELIVERY LIMITED or ASDA PROPERTY HOLDINGS LIMITED.
// Lookups are case-insensitive (ordinal).
private static readonly HashSet<string> SubsidiaryIndicators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    // Logistics / operations
    "delivery",
    "distribution",
    "logistics",
    "transport",
    "fleet",
    "haulage",
    "warehousing",
    "fulfilment",

    // Property
    "property",
    "properties",
    "estates",
    "land",
    "real estate",
    "developments",

    // Financial / holding
    "holdings",
    "holding",
    "investments",
    "capital",
    "finance",
    "financial",
    "treasury",

    // Administrative
    "nominees",
    "nominee",
    "trustees",
    "trustee",
    "secretarial",
    "registrars",

    // Insurance
    "insurance",
    "assurance",
    "underwriting",

    // Specific functions
    "leasing",
    "rentals",
    "procurement",
    "sourcing",
};
|
|
|
|
// Terms that suggest the main trading / employer company (these are preferred).
// Lookups are case-insensitive (ordinal).
private static readonly HashSet<string> MainCompanyIndicators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "stores",
    "retail",
    "supermarkets",
    "superstores",
    "hypermarkets",
    "manufacturing",
    "operations",
    "trading",
};
|
|
|
|
|
|
/// <summary>
/// Creates the verifier from its collaborators; each argument is stored as-is.
/// </summary>
/// <param name="companiesHouseClient">Companies House API client.</param>
/// <param name="dbContextFactory">Factory for database contexts used by the company cache.</param>
/// <param name="aiMatcher">AI company-name matcher.</param>
/// <param name="logger">Logger for this service.</param>
public CompanyVerifierService(
    CompaniesHouseClient companiesHouseClient,
    IDbContextFactory<ApplicationDbContext> dbContextFactory,
    ICompanyNameMatcherService aiMatcher,
    ILogger<CompanyVerifierService> logger)
{
    (_companiesHouseClient, _dbContextFactory, _aiMatcher, _logger) =
        (companiesHouseClient, dbContextFactory, aiMatcher, logger);
}
|
|
|
|
/// <summary>
/// Verifies a claimed employer.
/// </summary>
/// <remarks>
/// Order of evaluation:
/// <list type="number">
/// <item>Public-sector and charity employers recognised by <c>UKHistoricalEmployers</c> are accepted without a Companies House lookup.</item>
/// <item>A recognised internal division is verified via its parent company (recursive call).</item>
/// <item>A recognised historical employer is reported against its successor.</item>
/// <item>A recent cached match is used if the cached company was incorporated on or before the claimed start date.</item>
/// <item>Otherwise Companies House is searched; the AI matcher picks among the ten most relevant candidates,
/// with the best fuzzy match used as a validated override or as a fallback when the AI gives no usable answer.</item>
/// </list>
/// Only <see cref="CompaniesHouseRateLimitException"/> is caught around the Companies House search; other
/// exceptions from the client or the matcher propagate to the caller.
/// </remarks>
/// <param name="companyName">Employer name as claimed; must not be null or whitespace.</param>
/// <param name="startDate">Claimed employment start date, if known.</param>
/// <param name="endDate">Claimed employment end date; <c>null</c> is treated as current employment by the dissolution check.</param>
/// <param name="jobTitle">Claimed job title, used for the plausibility checks.</param>
/// <returns>The verification result, including any flags raised by the checks.</returns>
/// <exception cref="ArgumentException"><paramref name="companyName"/> is null, empty or whitespace.</exception>
public async Task<CompanyVerificationResult> VerifyCompanyAsync(
    string companyName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle = null)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(companyName);

    // Normalize company name - strip trailing punctuation that causes matching issues
    var normalizedName = NormalizeCompanyName(companyName);
    _logger.LogDebug("Verifying company: {CompanyName} (normalized: {NormalizedName})", companyName, normalizedName);
    var flags = new List<CompanyVerificationFlag>();

    // Check 1a: Is this a public sector employer?
    if (UKHistoricalEmployers.IsPublicSectorEmployer(normalizedName))
    {
        _logger.LogInformation("Recognised public sector employer: {CompanyName}", companyName);
        return new CompanyVerificationResult
        {
            ClaimedCompany = companyName,
            MatchedCompanyName = companyName,
            MatchedCompanyNumber = null,
            MatchScore = 100,
            IsVerified = true,
            VerificationNotes = "Public sector employer - not registered at Companies House",
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            CompanyType = "public-sector",
            CompanyStatus = "active",
            ClaimedJobTitle = jobTitle,
            Flags = flags
        };
    }

    // Check 1b: Is this a charity or non-profit organisation?
    if (UKHistoricalEmployers.IsCharityEmployer(normalizedName))
    {
        _logger.LogInformation("Recognised charity employer: {CompanyName}", companyName);
        return new CompanyVerificationResult
        {
            ClaimedCompany = companyName,
            MatchedCompanyName = companyName,
            MatchedCompanyNumber = null,
            MatchScore = 100,
            IsVerified = true,
            VerificationNotes = "Charity/non-profit organisation",
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            CompanyType = "charity",
            CompanyStatus = "active",
            ClaimedJobTitle = jobTitle,
            Flags = flags
        };
    }

    // Check 2: Is this an internal division of a larger company?
    var parentCompany = UKHistoricalEmployers.GetParentCompanyForDivision(normalizedName);
    if (parentCompany != null)
    {
        _logger.LogInformation("Recognised division '{CompanyName}' of parent company '{ParentCompany}'", companyName, parentCompany);

        // Try to verify the parent company instead
        var parentResult = await VerifyCompanyAsync(parentCompany, startDate, endDate, jobTitle);
        if (parentResult.IsVerified)
        {
            return parentResult with
            {
                ClaimedCompany = companyName,
                VerificationNotes = $"Internal division of {parentResult.MatchedCompanyName}"
            };
        }

        // If parent verification failed, return a partial match
        return new CompanyVerificationResult
        {
            ClaimedCompany = companyName,
            MatchedCompanyName = parentCompany,
            MatchedCompanyNumber = null,
            MatchScore = 85,
            IsVerified = true,
            VerificationNotes = $"Recognised as division of {parentCompany}",
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            ClaimedJobTitle = jobTitle,
            Flags = flags
        };
    }

    // Check 3: Is this a known historical employer?
    var historicalInfo = UKHistoricalEmployers.GetHistoricalEmployerInfo(normalizedName);
    if (historicalInfo != null)
    {
        _logger.LogInformation("Recognised historical employer: {CompanyName} -> {Successor}", companyName, historicalInfo.SuccessorName);

        // If we have a company number for the successor, try to get current details
        if (!string.IsNullOrEmpty(historicalInfo.CompanyNumber))
        {
            try
            {
                var successorDetails = await _companiesHouseClient.GetCompanyAsync(historicalInfo.CompanyNumber);
                if (successorDetails != null)
                {
                    return new CompanyVerificationResult
                    {
                        ClaimedCompany = companyName,
                        MatchedCompanyName = $"{companyName} (now {successorDetails.CompanyName})",
                        MatchedCompanyNumber = historicalInfo.CompanyNumber,
                        MatchScore = 90,
                        IsVerified = true,
                        VerificationNotes = $"Historical company. {historicalInfo.Notes}",
                        ClaimedStartDate = startDate,
                        ClaimedEndDate = endDate,
                        CompanyType = successorDetails.Type,
                        CompanyStatus = "historical",
                        ClaimedJobTitle = jobTitle,
                        Flags = flags
                    };
                }
            }
            catch (Exception ex)
            {
                // Best effort: a failed successor lookup falls through to the
                // historical result below, which uses the locally known successor name.
                _logger.LogWarning(ex, "Failed to fetch successor company details for {CompanyNumber}", historicalInfo.CompanyNumber);
            }
        }

        // Return historical match without successor details
        return new CompanyVerificationResult
        {
            ClaimedCompany = companyName,
            MatchedCompanyName = $"{companyName} (now {historicalInfo.SuccessorName})",
            MatchedCompanyNumber = historicalInfo.CompanyNumber,
            MatchScore = 90,
            IsVerified = true,
            VerificationNotes = $"Historical company. {historicalInfo.Notes}",
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            CompanyStatus = "historical",
            ClaimedJobTitle = jobTitle,
            Flags = flags
        };
    }

    // Try to find a cached match first (but only if it existed at claimed start date)
    var cachedMatch = await FindCachedMatchAsync(normalizedName);
    if (cachedMatch is not null)
    {
        // Check if cached company existed at the claimed start date
        var cacheValid = !startDate.HasValue ||
                         cachedMatch.IncorporationDate == null ||
                         cachedMatch.IncorporationDate <= startDate.Value;

        if (cacheValid)
        {
            _logger.LogDebug("Found cached company match for: {CompanyName}", companyName);
            return CreateResultFromCache(cachedMatch, companyName, startDate, endDate, jobTitle, flags);
        }
        else
        {
            _logger.LogDebug("Cached company {CachedName} was incorporated after claimed start date, searching for alternatives", cachedMatch.CompanyName);
        }
    }

    // Search Companies House with fallback queries
    try
    {
        var searchQueries = GenerateSearchQueries(normalizedName);
        _logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
            searchQueries.Count, normalizedName, string.Join(", ", searchQueries.Select(q => $"'{q}'")));

        // Collect all candidates from all search queries for AI matching
        var allCandidates = new Dictionary<string, CompaniesHouseSearchItem>();
        var fuzzyMatches = new List<(CompaniesHouseSearchItem Item, int Score)>();

        foreach (var query in searchQueries)
        {
            _logger.LogDebug("Searching Companies House with query: {Query}", query);
            var searchResponse = await _companiesHouseClient.SearchCompaniesAsync(query);

            if (searchResponse?.Items is null || searchResponse.Items.Count == 0)
            {
                continue;
            }

            // Collect unique candidates (first occurrence of each company number wins)
            foreach (var item in searchResponse.Items)
            {
                if (!string.IsNullOrWhiteSpace(item.CompanyNumber) &&
                    !allCandidates.ContainsKey(item.CompanyNumber))
                {
                    allCandidates[item.CompanyNumber] = item;
                }
            }

            // Find fuzzy matches (as before) for fallback
            var fuzzyMatch = FindBestMatch(normalizedName, query, searchResponse.Items, startDate);
            if (fuzzyMatch is not null)
            {
                fuzzyMatches.Add(fuzzyMatch.Value);
            }
        }

        if (allCandidates.Count == 0)
        {
            _logger.LogDebug("No candidates found for: {CompanyName} after trying {Count} queries", normalizedName, searchQueries.Count);
            return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
                "Company name could not be verified against official records");
        }

        // Use AI to find the best semantic match from all candidates
        _logger.LogDebug("Using AI to match '{CompanyName}' against {Count} candidates", normalizedName, allCandidates.Count);

        // Sort candidates by fuzzy relevance to the search term before taking top 10
        // This ensures the most likely matches are sent to the AI, not just arbitrary entries
        var normalizedUpper = normalizedName.ToUpperInvariant();
        var candidatesForAI = allCandidates.Values
            .Select(c => new
            {
                Item = c,
                Score = Fuzz.TokenSetRatio(normalizedUpper, c.Title.ToUpperInvariant())
            })
            .OrderByDescending(x => x.Score)
            .Take(10)
            .Select(x => new CompanyCandidate
            {
                CompanyName = x.Item.Title,
                CompanyNumber = x.Item.CompanyNumber,
                CompanyStatus = x.Item.CompanyStatus,
                DateOfCreation = x.Item.DateOfCreation
            })
            .ToList();

        _logger.LogDebug("Top candidates for AI matching (sorted by relevance): {Candidates}",
            string.Join(", ", candidatesForAI.Select(c => $"{c.CompanyName} [{c.CompanyNumber}]")));

        var aiResult = await _aiMatcher.FindBestMatchAsync(normalizedName, candidatesForAI);

        CompaniesHouseSearchItem? matchedItem = null;
        int matchScore;

        // Get best fuzzy match for potential fallback
        var bestFuzzy = fuzzyMatches.Count > 0
            ? fuzzyMatches.OrderByDescending(m => m.Score).First()
            : ((CompaniesHouseSearchItem Item, int Score)?)null;

        if (aiResult is not null && aiResult.IsMatch)
        {
            // AI found a valid match.
            // Guard the dictionary lookup: a null key would throw ArgumentNullException.
            // A missing or unknown company number leaves matchedItem null, which is
            // reported as "could not be verified" further down.
            var aiCompanyNumber = aiResult.CandidateCompanyNumber;
            matchedItem = string.IsNullOrWhiteSpace(aiCompanyNumber)
                ? null
                : allCandidates.GetValueOrDefault(aiCompanyNumber);
            matchScore = aiResult.ConfidenceScore;
            _logger.LogInformation(
                "AI matched '{ClaimedName}' to '{MatchedName}' with {Score}% confidence. Reasoning: {Reasoning}",
                companyName, aiResult.CandidateCompanyName, aiResult.ConfidenceScore, aiResult.Reasoning);
        }
        else if (fuzzyMatches.Count > 0)
        {
            // AI didn't find a match - check if it explicitly rejected or just failed
            if (aiResult?.MatchType == "NoMatch")
            {
                // AI explicitly rejected. Only override if fuzzy match passes strict validation:
                // 1. High fuzzy score (>= 90%)
                // 2. ALL core identifying words from original name appear in the match
                // 3. Match doesn't have significantly more core words (prevents partial word matches)
                if (bestFuzzy.HasValue && bestFuzzy.Value.Score >= 90)
                {
                    var originalCores = ExtractCoreIdentifiers(normalizedName);
                    var matchCores = ExtractCoreIdentifiers(bestFuzzy.Value.Item.Title);

                    // All original core words must appear in the match
                    var allCoresPresent = originalCores.Count == 0 ||
                        originalCores.All(c => bestFuzzy.Value.Item.Title.Contains(c, StringComparison.OrdinalIgnoreCase));

                    // Match shouldn't have too many extra core words (max 2 extra, e.g., "GROUP PLC")
                    var extraCores = matchCores.Count(c => !originalCores.Any(o =>
                        c.Equals(o, StringComparison.OrdinalIgnoreCase)));
                    var reasonableExtras = extraCores <= 2;

                    if (allCoresPresent && reasonableExtras)
                    {
                        _logger.LogInformation(
                            "AI rejected '{CompanyName}' but fuzzy match '{MatchedName}' ({Score}%) passes validation. " +
                            "Original cores: [{OriginalCores}], Match cores: [{MatchCores}]",
                            normalizedName, bestFuzzy.Value.Item.Title, bestFuzzy.Value.Score,
                            string.Join(", ", originalCores), string.Join(", ", matchCores));
                        matchedItem = bestFuzzy.Value.Item;
                        matchScore = bestFuzzy.Value.Score;
                    }
                    else
                    {
                        _logger.LogDebug(
                            "AI rejected '{CompanyName}' and fuzzy match '{MatchedName}' fails validation. " +
                            "AllCoresPresent: {AllCores}, ExtraCores: {Extra}",
                            normalizedName, bestFuzzy.Value.Item.Title, allCoresPresent, extraCores);
                        return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
                            "Company name could not be verified - no matching company found in official records");
                    }
                }
                else
                {
                    _logger.LogDebug("AI explicitly rejected all candidates for '{CompanyName}'. Reasoning: {Reasoning}",
                        normalizedName, aiResult?.Reasoning ?? "No match found");
                    return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
                        "Company name could not be verified - no matching company found in official records");
                }
            }
            else
            {
                // AI failed (API error, etc.) - fall back to fuzzy matching.
                // bestFuzzy is non-null here because fuzzyMatches.Count > 0.
                _logger.LogWarning("AI matching failed for '{CompanyName}', falling back to fuzzy matching", normalizedName);
                matchedItem = bestFuzzy!.Value.Item;
                matchScore = bestFuzzy!.Value.Score;
            }
        }
        else
        {
            _logger.LogDebug("No valid match found for: {CompanyName}", normalizedName);
            return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
                "Company name could not be verified against official records");
        }

        if (matchedItem is null)
        {
            _logger.LogDebug("No valid match found for: {CompanyName}", companyName);
            return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
                "Company name could not be verified against official records");
        }

        var match = (Item: matchedItem, Score: matchScore);

        // Fetch full company details for additional data
        var companyDetails = await _companiesHouseClient.GetCompanyAsync(match.Item.CompanyNumber);

        // Cache the matched company with full details
        await CacheCompanyAsync(match.Item, companyDetails);

        _logger.LogInformation(
            "Verified company {ClaimedName} matched to {MatchedName} with score {Score}%",
            companyName, match.Item.Title, match.Score);

        // Run all verification checks
        var incorporationDate = DateHelpers.ParseDate(match.Item.DateOfCreation);
        var dissolutionDate = DateHelpers.ParseDate(match.Item.DateOfCessation);
        var companyStatus = match.Item.CompanyStatus;
        var companyType = match.Item.CompanyType;
        var sicCodes = companyDetails?.SicCodes ?? match.Item.SicCodes;
        var accountsCategory = companyDetails?.Accounts?.LastAccounts?.Type;

        // Check 1: Employment before company incorporation
        CheckIncorporationDate(flags, startDate, incorporationDate, match.Item.Title);

        // Check 2: Employment at dissolved company
        CheckDissolutionDate(flags, endDate, dissolutionDate, companyStatus, match.Item.Title);

        // Check 3: Dormant company check
        CheckDormantCompany(flags, accountsCategory, jobTitle, match.Item.Title);

        // Check 4: Company size vs job title
        CheckCompanySizeVsRole(flags, accountsCategory, jobTitle, match.Item.Title);

        // Check 5: Job title plausibility for PLCs
        var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, companyType);
        if (jobPlausible == false)
        {
            flags.Add(new CompanyVerificationFlag
            {
                Type = "ImplausibleJobTitle",
                Severity = "Critical",
                Message = jobNotes ?? "Job title requires verification",
                ScoreImpact = -15
            });
        }

        return new CompanyVerificationResult
        {
            ClaimedCompany = companyName,
            MatchedCompanyName = match.Item.Title,
            MatchedCompanyNumber = match.Item.CompanyNumber,
            MatchScore = match.Score,
            IsVerified = true,
            VerificationNotes = null,
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            CompanyType = companyType,
            CompanyStatus = companyStatus,
            IncorporationDate = incorporationDate,
            DissolutionDate = dissolutionDate,
            AccountsCategory = accountsCategory,
            SicCodes = sicCodes,
            ClaimedJobTitle = jobTitle,
            JobTitlePlausible = jobPlausible,
            JobTitleNotes = jobNotes,
            Flags = flags
        };
    }
    catch (CompaniesHouseRateLimitException ex)
    {
        _logger.LogWarning(ex, "Rate limit hit while verifying company: {CompanyName}", companyName);
        return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
            "Verification temporarily unavailable due to rate limiting");
    }
}
|
|
|
|
/// <summary>
/// Runs a Companies House search and maps each hit to a <see cref="CompanySearchResult"/>.
/// </summary>
/// <param name="query">Search text; must not be null or whitespace.</param>
/// <returns>The mapped hits, or an empty list when the response carries no item list.</returns>
/// <exception cref="ArgumentException"><paramref name="query"/> is null, empty or whitespace.</exception>
public async Task<List<CompanySearchResult>> SearchCompaniesAsync(string query)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(query);

    _logger.LogDebug("Searching companies for query: {Query}", query);

    var hits = (await _companiesHouseClient.SearchCompaniesAsync(query))?.Items;
    if (hits is null)
    {
        return [];
    }

    var results = new List<CompanySearchResult>();
    foreach (var hit in hits)
    {
        results.Add(new CompanySearchResult
        {
            CompanyNumber = hit.CompanyNumber,
            CompanyName = hit.Title,
            // A missing status is surfaced to callers as "Unknown".
            CompanyStatus = hit.CompanyStatus ?? "Unknown",
            IncorporationDate = DateHelpers.ParseDate(hit.DateOfCreation),
            AddressSnippet = hit.AddressSnippet
        });
    }

    return results;
}
|
|
|
|
/// <summary>
/// Checks whether the candidate appears among a company's officers in a director or secretary role.
/// </summary>
/// <param name="companyNumber">Companies House number of the company to inspect.</param>
/// <param name="candidateName">Candidate's name as claimed.</param>
/// <param name="startDate">Claimed start of the role, if known.</param>
/// <param name="endDate">Claimed end of the role, if known.</param>
/// <returns>
/// <c>true</c> when an officer whose role contains "director" or "secretary" matches the name
/// (fuzzy ratio of at least 80) and either no dates were claimed or the periods overlap;
/// <c>false</c> when officers were returned but none qualified;
/// <c>null</c> when an input is blank, no officers were returned, or the lookup failed.
/// </returns>
public async Task<bool?> VerifyDirectorAsync(
    string companyNumber,
    string candidateName,
    DateOnly? startDate,
    DateOnly? endDate)
{
    var inputsMissing = string.IsNullOrWhiteSpace(companyNumber) || string.IsNullOrWhiteSpace(candidateName);
    if (inputsMissing)
    {
        return null;
    }

    try
    {
        var officers = await _companiesHouseClient.GetOfficersAsync(companyNumber);

        if (officers?.Items is not { Count: > 0 } officerList)
        {
            _logger.LogDebug("No officers found for company {CompanyNumber}", companyNumber);
            return null;
        }

        // Normalize candidate name once for all comparisons
        var candidateKey = NormalizeName(candidateName);
        var noClaimedDates = !startDate.HasValue && !endDate.HasValue;

        foreach (var officer in officerList)
        {
            // Only director-like roles (directors and secretaries) are considered
            var roleText = officer.OfficerRole?.ToLowerInvariant() ?? "";
            var isRelevantRole = roleText.Contains("director") || roleText.Contains("secretary");
            if (!isRelevantRole)
            {
                continue;
            }

            // Fuzzy match the name; 80 is a deliberately high bar for personal names
            var nameScore = Fuzz.Ratio(candidateKey, NormalizeName(officer.Name));
            if (nameScore < 80)
            {
                continue;
            }

            var appointedOn = DateHelpers.ParseDate(officer.AppointedOn);
            var resignedOn = DateHelpers.ParseDate(officer.ResignedOn);

            // Without claimed dates a name match is sufficient
            if (noClaimedDates)
            {
                _logger.LogDebug(
                    "Found matching director {OfficerName} for candidate {CandidateName} at company {CompanyNumber}",
                    officer.Name, candidateName, companyNumber);
                return true;
            }

            // Otherwise the claimed period must overlap the appointment period
            if (DatesOverlap(startDate, endDate, appointedOn, resignedOn))
            {
                _logger.LogDebug(
                    "Verified director {OfficerName} matches candidate {CandidateName} with overlapping dates",
                    officer.Name, candidateName);
                return true;
            }
        }

        _logger.LogDebug(
            "No matching director found for candidate {CandidateName} at company {CompanyNumber}",
            candidateName, companyNumber);
        return false;
    }
    catch (CompaniesHouseRateLimitException)
    {
        _logger.LogWarning("Rate limit hit while verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "Error verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
}
|
|
|
|
/// <summary>
/// Upper-cases and trims a personal name, and rewrites the "SURNAME, Firstname"
/// form that Companies House often uses into "FIRSTNAME SURNAME".
/// </summary>
/// <param name="name">Name to normalise.</param>
/// <returns>The normalised name, or an empty string when the input is null or whitespace.</returns>
private static string NormalizeName(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return "";
    }

    var upper = name.ToUpperInvariant().Trim();

    // Only the first comma separates surname from forenames; later commas stay in the forename part.
    var commaAt = upper.IndexOf(',');
    if (commaAt < 0)
    {
        return upper;
    }

    var surname = upper[..commaAt].Trim();
    var forenames = upper[(commaAt + 1)..].Trim();
    return $"{forenames} {surname}";
}
|
|
|
|
/// <summary>
/// Tells whether two date ranges overlap, treating a missing bound as open-ended.
/// </summary>
/// <param name="start1">Start of the first range, or <c>null</c> for unbounded.</param>
/// <param name="end1">End of the first range, or <c>null</c> for unbounded.</param>
/// <param name="start2">Start of the second range, or <c>null</c> for unbounded.</param>
/// <param name="end2">End of the second range, or <c>null</c> for unbounded.</param>
/// <returns><c>true</c> when the ranges share at least one day, or when either range has no bounds at all.</returns>
private static bool DatesOverlap(DateOnly? start1, DateOnly? end1, DateOnly? start2, DateOnly? end2)
{
    // A range with neither bound is assumed to overlap anything
    var firstHasNoDates = start1 is null && end1 is null;
    var secondHasNoDates = start2 is null && end2 is null;
    if (firstHasNoDates || secondHasNoDates)
    {
        return true;
    }

    // Missing bounds extend to the earliest / latest representable date
    var firstStart = start1.GetValueOrDefault(DateOnly.MinValue);
    var firstEnd = end1.GetValueOrDefault(DateOnly.MaxValue);
    var secondStart = start2.GetValueOrDefault(DateOnly.MinValue);
    var secondEnd = end2.GetValueOrDefault(DateOnly.MaxValue);

    // The ranges are disjoint only if one starts after the other has ended
    var disjoint = firstStart > secondEnd || secondStart > firstEnd;
    return !disjoint;
}
|
|
|
|
#region Verification Checks
|
|
|
|
/// <summary>
/// Adds a critical flag when the claimed start date is earlier than the company's incorporation date.
/// Does nothing when either date is missing.
/// </summary>
/// <param name="flags">Flag list to append to.</param>
/// <param name="claimedStartDate">Claimed employment start date.</param>
/// <param name="incorporationDate">Company incorporation date.</param>
/// <param name="companyName">Matched company name, used in the message.</param>
private static void CheckIncorporationDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedStartDate,
    DateOnly? incorporationDate,
    string companyName)
{
    if (!claimedStartDate.HasValue || !incorporationDate.HasValue)
    {
        return;
    }

    var startedBeforeCompanyExisted = claimedStartDate.Value < incorporationDate.Value;
    if (!startedBeforeCompanyExisted)
    {
        return;
    }

    var flag = new CompanyVerificationFlag
    {
        Type = "EmploymentBeforeIncorporation",
        Severity = "Critical",
        Message = $"Claimed employment at '{companyName}' starting {claimedStartDate:MMM yyyy} is before company incorporation date {incorporationDate:MMM yyyy}",
        ScoreImpact = -20
    };
    flags.Add(flag);
}
|
|
|
|
/// <summary>
/// Flags employment claimed at a company after it was dissolved.
/// </summary>
/// <remarks>
/// Applies only when a dissolution date is known and the status is "dissolved", "liquidation"
/// or "administration" (case-insensitive). A three-month wind-down buffer is allowed after the
/// dissolution date; a missing end date is treated as a claim of current employment.
/// </remarks>
/// <param name="flags">Flag list to append to.</param>
/// <param name="claimedEndDate">Claimed employment end date, or <c>null</c> for current employment.</param>
/// <param name="dissolutionDate">Company dissolution date, if any.</param>
/// <param name="companyStatus">Company status text.</param>
/// <param name="companyName">Matched company name, used in the message.</param>
private static void CheckDissolutionDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedEndDate,
    DateOnly? dissolutionDate,
    string? companyStatus,
    string companyName)
{
    var loweredStatus = companyStatus?.ToLowerInvariant();
    var isDissolvedStatus = loweredStatus is "dissolved" or "liquidation" or "administration";

    if (!dissolutionDate.HasValue || !isDissolvedStatus)
    {
        return;
    }

    // Allow 3 month buffer for wind-down
    var bufferDate = dissolutionDate.Value.AddMonths(3);

    if (!claimedEndDate.HasValue)
    {
        // Current employment at dissolved company
        flags.Add(new CompanyVerificationFlag
        {
            Type = "CurrentEmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claims current employment at '{companyName}' but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -25
        });
        return;
    }

    if (claimedEndDate.Value > bufferDate)
    {
        flags.Add(new CompanyVerificationFlag
        {
            Type = "EmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claimed employment at '{companyName}' until {claimedEndDate:MMM yyyy} but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -20
        });
    }
}
|
|
|
|
/// <summary>
/// Adds a warning when a non-director role is claimed at a company whose accounts type contains "dormant".
/// </summary>
/// <param name="flags">Flag list to append to.</param>
/// <param name="accountsCategory">Accounts type of the company's last accounts; nothing is checked when blank.</param>
/// <param name="jobTitle">Claimed job title.</param>
/// <param name="companyName">Matched company name, used in the message.</param>
private static void CheckDormantCompany(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    var filesDormantAccounts = !string.IsNullOrWhiteSpace(accountsCategory)
        && accountsCategory.ToLowerInvariant().Contains("dormant");
    if (!filesDormantAccounts)
    {
        return;
    }

    // Directors can maintain dormant companies, but other roles are suspicious
    var loweredTitle = (jobTitle ?? "").ToLowerInvariant();
    if (loweredTitle.Contains("director") || loweredTitle.Contains("company secretary"))
    {
        return;
    }

    flags.Add(new CompanyVerificationFlag
    {
        Type = "EmploymentAtDormantCompany",
        Severity = "Warning",
        Message = $"Claimed active employment as '{jobTitle}' at '{companyName}' which files dormant accounts",
        ScoreImpact = -10
    });
}
|
|
|
|
/// <summary>
/// Adds a warning when a senior-sounding title is claimed at a company that files micro-entity accounts.
/// Does nothing when the accounts type or the job title is blank.
/// </summary>
/// <param name="flags">Flag list to append to.</param>
/// <param name="accountsCategory">Accounts type of the company's last accounts.</param>
/// <param name="jobTitle">Claimed job title.</param>
/// <param name="companyName">Matched company name, used in the message.</param>
private static void CheckCompanySizeVsRole(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    if (string.IsNullOrWhiteSpace(accountsCategory) || string.IsNullOrWhiteSpace(jobTitle))
    {
        return;
    }

    // Micro-entity: < 10 employees, < £632k turnover
    if (!accountsCategory.ToLowerInvariant().Contains("micro"))
    {
        return;
    }

    // Title fragments that indicate a senior management role (substring match)
    string[] seniorMarkers = ["vp", "vice president", "head of", "chief", "director of", "senior director"];
    var loweredTitle = jobTitle.ToLowerInvariant();
    if (!seniorMarkers.Any(marker => loweredTitle.Contains(marker)))
    {
        return;
    }

    // At micro companies, having many senior roles is suspicious
    flags.Add(new CompanyVerificationFlag
    {
        Type = "SeniorRoleAtMicroCompany",
        Severity = "Warning",
        Message = $"Claimed senior role '{jobTitle}' at '{companyName}' which files micro-entity accounts (typically <10 employees)",
        ScoreImpact = -10
    });
}
|
|
|
|
/// <summary>
/// Judges whether a claimed job title is plausible for the matched company type.
/// Only PLCs are scrutinised: C-suite, board-level and VP-level titles there are reported as needing verification.
/// </summary>
/// <remarks>
/// Acronyms (CEO, CTO, CFO, COO, CIO, VP, SVP, EVP) are matched as whole words. Plain substring
/// matching would flag "Director of Marketing" (contains "cto"), "Project Coordinator" (contains "coo")
/// and "VPN Engineer" (starts with "vp"). Multi-word phrases are still matched as substrings.
/// </remarks>
/// <param name="jobTitle">Claimed job title.</param>
/// <param name="companyType">Company type text of the matched company.</param>
/// <returns>
/// <c>(null, null)</c> when either input is blank; <c>(false, notes)</c> for a senior title at a PLC;
/// otherwise <c>(true, null)</c>.
/// </returns>
private static (bool? IsPlausible, string? Notes) CheckJobTitlePlausibility(string? jobTitle, string? companyType)
{
    if (string.IsNullOrWhiteSpace(jobTitle) || string.IsNullOrWhiteSpace(companyType))
    {
        return (null, null);
    }

    var type = companyType.Trim().ToLowerInvariant();

    // Check if this is a PLC (Public Limited Company) - these are large companies
    var isPlc = type.Contains("plc") || type.Contains("public limited");
    if (!isPlc)
    {
        // Every rule below applies to PLCs only.
        return (true, null);
    }

    var title = jobTitle.Trim().ToLowerInvariant();

    // Whole words of the title (split on anything that is not a letter or digit),
    // used for acronyms so they cannot match inside longer words.
    var words = new HashSet<string>(
        System.Text.RegularExpressions.Regex.Split(title, @"[^\p{L}\p{Nd}]+"),
        StringComparer.Ordinal);

    bool HasWord(params string[] candidates) => candidates.Any(words.Contains);
    bool HasPhrase(params string[] phrases) => phrases.Any(phrase => title.Contains(phrase));

    var isVicePresident = HasPhrase("vice president");

    // Check for C-suite / very senior roles
    var isCsuiteRole =
        HasWord("ceo", "cto", "cfo", "coo", "cio") ||
        HasPhrase(
            "chief executive",
            "chief technology",
            "chief financial",
            "chief operating",
            "chief information",
            "managing director",
            "chairman",
            "chairwoman",
            "chairperson") ||
        title == "md" ||
        // "vice president" is handled by the VP rule below, not as a company president.
        (title.Contains("president") && !isVicePresident);

    // Check for board-level roles; a bare "director" counts, "director of X" does not
    var isBoardRole =
        HasPhrase("board member", "non-executive director", "executive director") ||
        title == "director";

    if (isCsuiteRole || isBoardRole)
    {
        return (false, $"Claimed senior role '{jobTitle}' at a PLC requires verification - C-suite positions at public companies are publicly disclosed");
    }

    // Check for VP/SVP/EVP at PLCs (also usually disclosed)
    var isVpRole = isVicePresident || HasWord("vp", "svp", "evp");

    if (isVpRole)
    {
        return (false, $"Claimed VP-level role '{jobTitle}' at a PLC - senior positions at public companies should be verifiable");
    }

    return (true, null);
}
|
|
|
|
#endregion
|
|
|
|
#region Helper Methods
|
|
|
|
/// <summary>
/// Normalizes a company name: collapses every run of whitespace to a single space, trims the ends,
/// and removes trailing punctuation (<c>. , ; : ! ?</c>) together with any spaces in front of it.
/// </summary>
/// <remarks>
/// Whitespace is collapsed before the trailing punctuation is stripped so that input such as
/// "Acme Ltd ." yields "Acme Ltd" rather than "Acme Ltd " with a dangling space.
/// </remarks>
/// <param name="companyName">Company name as supplied.</param>
/// <returns>The normalized name; a null or whitespace-only input is returned unchanged.</returns>
private static string NormalizeCompanyName(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return companyName;
    }

    // Normalize multiple spaces (and tabs/newlines) to a single space
    var collapsed = System.Text.RegularExpressions.Regex.Replace(companyName, @"\s+", " ").Trim();

    // Remove trailing punctuation that causes matching issues, plus spaces between or before it,
    // e.g., "Glaxo Research & Development Ltd." -> "Glaxo Research & Development Ltd"
    return collapsed.TrimEnd('.', ',', ';', ':', '!', '?', ' ');
}
|
|
|
|
/// <summary>
/// Looks for a non-expired cache entry whose company name fuzzy-matches
/// <paramref name="companyName"/>.
/// </summary>
/// <param name="companyName">Company name to look up.</param>
/// <returns>
/// The cached company with the highest token-set ratio at or above
/// <c>FuzzyMatchThreshold</c>, or <c>null</c> when nothing qualifies.
/// </returns>
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
{
    // Entries cached before this instant are considered expired.
    var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);

    await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

    // Read-only lookup: the context is disposed when this method returns,
    // so change tracking would only add overhead.
    var cachedCompanies = await dbContext.CompanyCache
        .AsNoTracking()
        .Where(c => c.CachedAt >= cutoffDate)
        .ToListAsync();

    if (cachedCompanies.Count == 0)
    {
        return null;
    }

    // Upper-case the search term once instead of once per cached row.
    var target = companyName.ToUpperInvariant();

    var best = cachedCompanies
        .Where(c => !string.IsNullOrWhiteSpace(c.CompanyName))
        .Select(c => new { Company = c, Score = Fuzz.TokenSetRatio(target, c.CompanyName.ToUpperInvariant()) })
        .Where(m => m.Score >= FuzzyMatchThreshold)
        .OrderByDescending(m => m.Score)
        .FirstOrDefault();

    return best?.Company;
}
|
|
|
|
/// <summary>
/// Selects the best search result for a claimed company name.
/// </summary>
/// <remarks>
/// Candidates are first filtered (core identifiers present, not too many extra core
/// words, not a non-employment entity), then scored with a token-set ratio against both
/// the original name and the search query, and finally ranked by priority score and
/// then fuzzy score. When a start date is supplied, only companies whose creation date
/// is unknown or not later than that date are eligible.
/// </remarks>
/// <param name="companyName">The company name as originally claimed.</param>
/// <param name="searchQuery">The query string that produced <paramref name="items"/>.</param>
/// <param name="items">Search results to choose from.</param>
/// <param name="claimedStartDate">Claimed employment start date, if known.</param>
/// <returns>The chosen item with its fuzzy score, or <c>null</c> when nothing is acceptable.</returns>
private (CompaniesHouseSearchItem Item, int Score)? FindBestMatch(
    string companyName,
    string searchQuery,
    List<CompaniesHouseSearchItem> items,
    DateOnly? claimedStartDate)
{
    var upperOriginal = companyName.ToUpperInvariant();
    var upperQuery = searchQuery.ToUpperInvariant();
    var lowerOriginal = companyName.ToLowerInvariant();
    var lowerQuery = searchQuery.ToLowerInvariant();

    // Core identifying words that MUST appear in a valid match. This stops
    // "BMW Group Canada" matching "CANADA LIFE GROUP" on common words alone, and
    // "Lloyds Bowmaker" matching "LLOYDS ALARMS" (missing "Bowmaker").
    var originalCores = ExtractCoreIdentifiers(companyName);
    var queryCores = ExtractCoreIdentifiers(searchQuery);

    // Entity categories the search explicitly asks for (these are not filtered out).
    var allowedEntityTypes = GetSearchEntityTypes(lowerOriginal, lowerQuery);

    // Decides whether a search result is eligible for scoring at all.
    bool IsCandidate(CompaniesHouseSearchItem item)
    {
        if (string.IsNullOrWhiteSpace(item.Title))
        {
            return false;
        }

        var upperTitle = item.Title.ToUpperInvariant();

        // ALL core identifiers of either the original name or the query must be present,
        // e.g. "Lloyds Bowmaker" needs BOTH "LLOYDS" and "BOWMAKER" in the title.
        var coversOriginal = originalCores.Count == 0 || originalCores.All(w => upperTitle.Contains(w));
        var coversQuery = queryCores.Count == 0 || queryCores.All(w => upperTitle.Contains(w));
        if (!coversOriginal && !coversQuery)
        {
            return false;
        }

        // Reject titles carrying too many EXTRA core words: "Families First" should not
        // match "Families Against Conformity".
        if (originalCores.Count > 0 && coversOriginal)
        {
            var titleCores = ExtractCoreIdentifiers(item.Title);
            var extraCount = titleCores.Count(w => !originalCores.Contains(w));
            if (extraCount > 1 && titleCores.Count > originalCores.Count + 1)
            {
                return false;
            }
        }

        // Drop non-employment entities unless that type was explicitly searched for.
        return IsValidEmploymentEntity(item.Title.ToLowerInvariant(), allowedEntityTypes);
    }

    // Scores a candidate against both the original name AND the query actually used.
    // This covers "Matthew Walker (Northern Foods Plc)", where the search was for
    // "Northern Foods Plc" and the full name would score poorly.
    (CompaniesHouseSearchItem Item, int Score, int PriorityScore) Rate(CompaniesHouseSearchItem item)
    {
        var upperTitle = item.Title.ToUpperInvariant();
        var fuzzyScore = Math.Max(
            Fuzz.TokenSetRatio(upperOriginal, upperTitle),
            Fuzz.TokenSetRatio(upperQuery, upperTitle));

        // Main company vs subsidiary adjustment.
        var priority = CalculateCompanyPriorityScore(item.Title.ToLowerInvariant(), lowerOriginal, lowerQuery);

        return (Item: item, Score: fuzzyScore, PriorityScore: priority);
    }

    var matches = items
        .Where(item => IsCandidate(item))
        .Select(item => Rate(item))
        .Where(m => m.Score >= FuzzyMatchThreshold)
        .ToList();

    _logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery);
    foreach (var m in matches.Take(5))
    {
        _logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, Priority: {Priority}, DateOfCreation: {Date}",
            m.Item.Title, m.Item.CompanyNumber, m.Score, m.PriorityScore, m.Item.DateOfCreation ?? "null");
    }

    if (matches.Count == 0)
    {
        return null;
    }

    // No start date provided - rank by priority, then by fuzzy score.
    if (!claimedStartDate.HasValue)
    {
        var fallback = matches
            .OrderByDescending(m => m.PriorityScore)
            .ThenByDescending(m => m.Score)
            .First();
        _logger.LogDebug("No start date filter, using highest priority: {Title} ({Number}), Priority: {Priority}", fallback.Item.Title, fallback.Item.CompanyNumber, fallback.PriorityScore);
        return (fallback.Item, fallback.Score);
    }

    // A start date was claimed: only companies that existed at that time are eligible.
    _logger.LogDebug("Filtering for companies that existed at claimed start date: {StartDate}", claimedStartDate.Value);

    var existedAtStartDate = matches
        .Where(m =>
        {
            var incDate = DateHelpers.ParseDate(m.Item.DateOfCreation);
            // An unknown creation date is given the benefit of the doubt.
            var existed = incDate == null || incDate <= claimedStartDate.Value;
            _logger.LogDebug(" {Title}: IncDate={IncDate}, Existed={Existed}",
                m.Item.Title, incDate?.ToString() ?? "null", existed);
            return existed;
        })
        .OrderByDescending(m => m.PriorityScore)
        .ThenByDescending(m => m.Score)
        .ToList();

    _logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count);

    if (existedAtStartDate.Count == 0)
    {
        // Nothing existed at the claimed start date - do not match a wrong company.
        _logger.LogDebug("No companies found that existed at claimed start date {StartDate}, returning no match", claimedStartDate.Value);
        return null;
    }

    var selected = existedAtStartDate[0];
    _logger.LogDebug("Selected: {Title} ({Number}), Priority: {Priority}", selected.Item.Title, selected.Item.CompanyNumber, selected.PriorityScore);
    return (selected.Item, selected.Score);
}
|
|
|
|
/// <summary>
/// Calculates a priority score used to rank otherwise similar company matches.
/// Higher scores mean the candidate is more likely to be the main employer company.
/// </summary>
/// <remarks>
/// Scoring rules, applied in order:
/// the first subsidiary indicator found in the title costs 10 points unless the search
/// text contains that same indicator; the first main-company indicator found adds 5;
/// a title ending in " plc" adds 3.
/// </remarks>
/// <param name="itemTitleLower">Lower-cased title of the candidate company.</param>
/// <param name="originalLower">Lower-cased original company name.</param>
/// <param name="queryLower">Lower-cased search query.</param>
/// <returns>The signed priority adjustment.</returns>
private static int CalculateCompanyPriorityScore(string itemTitleLower, string originalLower, string queryLower)
{
    var score = 0;

    // Everything the search said, used to detect explicitly requested subsidiaries.
    var searchText = originalLower + " " + queryLower;

    // Whole-word matching avoids "SCOTLAND" matching the indicator "land".
    foreach (var indicator in SubsidiaryIndicators)
    {
        if (!ContainsWholeWord(itemTitleLower, indicator))
        {
            continue;
        }

        // Only penalize when the search did not explicitly include this indicator.
        if (!ContainsWholeWord(searchText, indicator))
        {
            score -= 10;
        }

        break; // Only the first indicator found in the title is considered
    }

    foreach (var indicator in MainCompanyIndicators)
    {
        if (ContainsWholeWord(itemTitleLower, indicator))
        {
            score += 5;
            break; // Only apply one boost
        }
    }

    // Slight boost for PLC (usually the parent/main company). Ordinal comparison keeps
    // the suffix check independent of the current culture.
    if (itemTitleLower.EndsWith(" plc", StringComparison.Ordinal))
    {
        score += 3;
    }

    return score;
}
|
|
|
|
/// <summary>
/// Inserts or refreshes the cache row for a company, keyed by its company number.
/// </summary>
/// <param name="item">Search result supplying name, status, type and dates.</param>
/// <param name="details">
/// Optional company details; when present they supply the SIC codes (falling back to
/// the search item's) and the last-accounts type.
/// </param>
private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)
{
    try
    {
        await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

        var existing = await dbContext.CompanyCache
            .FirstOrDefaultAsync(c => c.CompanyNumber == item.CompanyNumber);

        // Values shared by the insert and update paths.
        var sicCodes = details?.SicCodes ?? item.SicCodes;
        var serializedSicCodes = sicCodes != null ? JsonSerializer.Serialize(sicCodes) : null;
        var lastAccountsType = details?.Accounts?.LastAccounts?.Type;
        var status = item.CompanyStatus ?? "Unknown";
        var incorporatedOn = DateHelpers.ParseDate(item.DateOfCreation);
        var dissolvedOn = DateHelpers.ParseDate(item.DateOfCessation);

        if (existing is null)
        {
            dbContext.CompanyCache.Add(new CompanyCache
            {
                CompanyNumber = item.CompanyNumber,
                CompanyName = item.Title,
                Status = status,
                CompanyType = item.CompanyType,
                IncorporationDate = incorporatedOn,
                DissolutionDate = dissolvedOn,
                AccountsCategory = lastAccountsType,
                SicCodesJson = serializedSicCodes,
                CachedAt = DateTime.UtcNow
            });
        }
        else
        {
            existing.CompanyName = item.Title;
            existing.Status = status;
            existing.CompanyType = item.CompanyType;
            existing.IncorporationDate = incorporatedOn;
            existing.DissolutionDate = dissolvedOn;
            existing.AccountsCategory = lastAccountsType;
            existing.SicCodesJson = serializedSicCodes;
            existing.CachedAt = DateTime.UtcNow;
        }

        await dbContext.SaveChangesAsync();
    }
    catch (DbUpdateException ex) when (ex.InnerException?.Message.Contains("PK_CompanyCache") == true)
    {
        // Race condition: another task inserted the same company first - safe to ignore.
        _logger.LogDebug("Company {CompanyNumber} already cached by another task", item.CompanyNumber);
    }
}
|
|
|
|
/// <summary>
/// Builds a verified result from a cached company and runs the verification checks
/// against the claimed dates and job title.
/// </summary>
/// <param name="cached">Cache entry that matched the claimed company.</param>
/// <param name="claimedCompany">Company name as claimed.</param>
/// <param name="startDate">Claimed start date, if any.</param>
/// <param name="endDate">Claimed end date, if any.</param>
/// <param name="jobTitle">Claimed job title, if any.</param>
/// <param name="flags">Flag list that the checks append to; it is returned on the result.</param>
/// <returns>A result with <c>IsVerified</c> set to <c>true</c>.</returns>
private CompanyVerificationResult CreateResultFromCache(
    CompanyCache cached,
    string claimedCompany,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    List<CompanyVerificationFlag> flags)
{
    var matchScore = Fuzz.TokenSetRatio(
        claimedCompany.ToUpperInvariant(),
        cached.CompanyName.ToUpperInvariant());

    List<string>? sicCodes = null;
    if (!string.IsNullOrEmpty(cached.SicCodesJson))
    {
        try
        {
            sicCodes = JsonSerializer.Deserialize<List<string>>(cached.SicCodesJson);
        }
        catch (JsonException ex)
        {
            // Best effort: a corrupt cache value must not fail verification, but it
            // should be visible so the bad row can be found and refreshed.
            _logger.LogWarning(ex, "Ignoring malformed SIC codes JSON in cache for company {CompanyNumber}", cached.CompanyNumber);
        }
    }

    // Run all verification checks; each may append to flags.
    CheckIncorporationDate(flags, startDate, cached.IncorporationDate, cached.CompanyName);
    CheckDissolutionDate(flags, endDate, cached.DissolutionDate, cached.Status, cached.CompanyName);
    CheckDormantCompany(flags, cached.AccountsCategory, jobTitle, cached.CompanyName);
    CheckCompanySizeVsRole(flags, cached.AccountsCategory, jobTitle, cached.CompanyName);

    var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, cached.CompanyType);
    if (jobPlausible == false)
    {
        flags.Add(new CompanyVerificationFlag
        {
            Type = "ImplausibleJobTitle",
            Severity = "Critical",
            Message = jobNotes ?? "Job title requires verification",
            ScoreImpact = -15
        });
    }

    return new CompanyVerificationResult
    {
        ClaimedCompany = claimedCompany,
        MatchedCompanyName = cached.CompanyName,
        MatchedCompanyNumber = cached.CompanyNumber,
        MatchScore = matchScore,
        IsVerified = true,
        VerificationNotes = null,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        CompanyType = cached.CompanyType,
        CompanyStatus = cached.Status,
        IncorporationDate = cached.IncorporationDate,
        DissolutionDate = cached.DissolutionDate,
        AccountsCategory = cached.AccountsCategory,
        SicCodes = sicCodes,
        ClaimedJobTitle = jobTitle,
        JobTitlePlausible = jobPlausible,
        JobTitleNotes = jobNotes,
        Flags = flags
    };
}
|
|
|
|
/// <summary>
/// Builds the result returned when a claimed company could not be verified.
/// </summary>
/// <param name="companyName">Company name as claimed.</param>
/// <param name="startDate">Claimed start date, if any.</param>
/// <param name="endDate">Claimed end date, if any.</param>
/// <param name="jobTitle">Claimed job title, if any.</param>
/// <param name="reason">Explanation stored in <c>VerificationNotes</c>.</param>
/// <returns>A result with no matched company, a match score of 0 and <c>IsVerified</c> false.</returns>
private static CompanyVerificationResult CreateUnverifiedResult(
    string companyName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    string reason) =>
    new()
    {
        // What was claimed
        ClaimedCompany = companyName,

        // Nothing was matched
        MatchedCompanyName = null,
        MatchedCompanyNumber = null,
        MatchScore = 0,
        IsVerified = false,
        VerificationNotes = reason,

        // Remaining claim details, passed through untouched
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        ClaimedJobTitle = jobTitle
    };
|
|
|
|
/// <summary>
/// Generates alternative search queries to find companies that may be registered
/// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd").
/// Also handles "Brand (Parent Company)" and "Name1/Name2" formats.
/// </summary>
/// <remarks>
/// Queries are returned de-duplicated (case-insensitively) in priority order:
/// parent company from a trailing parenthetical, the brand in front of it, each
/// "/"-separated part, the first word as a potential parent, and finally the full name.
/// </remarks>
/// <param name="companyName">Company name to build queries for.</param>
/// <returns>Distinct queries in the order they should be tried.</returns>
private static List<string> GenerateSearchQueries(string companyName)
{
    // A list keeps the priority order explicit; HashSet<T> enumeration order is not
    // guaranteed, so the set is used for de-duplication only.
    var orderedQueries = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    void AddAll(IEnumerable<string> candidates)
    {
        foreach (var candidate in candidates)
        {
            if (seen.Add(candidate))
            {
                orderedQueries.Add(candidate);
            }
        }
    }

    var normalized = companyName.Trim();

    // Step 0a: "Brand (Parent Company)" - the parent is more likely to be the
    // registered name, so its variations go first.
    var parentMatch = System.Text.RegularExpressions.Regex.Match(normalized, @"\(([^)]+)\)\s*$");
    if (parentMatch.Success)
    {
        var parentCompany = parentMatch.Groups[1].Value.Trim();
        AddAll(GenerateNameVariations(parentCompany));

        // Also try the brand name without the parenthetical.
        var brandName = normalized[..parentMatch.Index].Trim();
        if (brandName.Length >= 3)
        {
            AddAll(GenerateNameVariations(brandName));
        }
    }

    // Step 0b: "Name1/Name2" (e.g., "ASDA/WALMART") - each part may be a
    // separately registered name.
    if (normalized.Contains('/'))
    {
        var parts = normalized.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        foreach (var part in parts)
        {
            if (part.Length >= 3)
            {
                AddAll(GenerateNameVariations(part));
            }
        }
    }

    // Step 0c: first word as potential parent company
    // (e.g., "UNILEVER BESTFOOD" -> "UNILEVER").
    var words = normalized.Split(' ', StringSplitOptions.RemoveEmptyEntries);
    if (words.Length >= 2)
    {
        var firstWord = words[0];

        // Generic leading words say nothing about the parent company.
        var skipWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
            { "the", "a", "an", "uk", "british", "national", "international", "global", "new" };

        if (firstWord.Length >= 4 && !skipWords.Contains(firstWord))
        {
            AddAll(GenerateNameVariations(firstWord));

            // Also try first word + PLC/Limited for major corporations.
            AddAll(new[] { firstWord + " PLC", firstWord + " Limited" });
        }
    }

    // Finally, variations of the full original name.
    AddAll(GenerateNameVariations(normalized));

    return orderedQueries;
}
|
|
|
|
/// <summary>
/// Generates name variations for a single company name (UK/U.K., Ltd/Limited,
/// PLC/Public Limited Company, and the core name without its legal suffix).
/// </summary>
/// <param name="name">Company name to vary.</param>
/// <returns>
/// Distinct variations (case-insensitive), in generation order with
/// <paramref name="name"/> itself first.
/// </returns>
private static List<string> GenerateNameVariations(string name)
{
    const string LtdSuffix = " Ltd";
    const string LimitedSuffix = " Limited";
    const string PlcSuffix = " PLC";
    const string PublicLimitedCompanySuffix = " Public Limited Company";

    // The list keeps generation order; the set only de-duplicates.
    var ordered = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    void Add(string candidate)
    {
        if (seen.Add(candidate))
        {
            ordered.Add(candidate);
        }
    }

    // Rewrites the standalone token "UK" (preceded by a space) as "U.K.".
    // The word boundary keeps names such as "Ukraine" intact.
    static string WithDottedUk(string text) =>
        System.Text.RegularExpressions.Regex.Replace(
            text,
            @"(?<= )UK\b",
            "U.K.",
            System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.CultureInvariant);

    Add(name);

    // Step 1: UK <-> U.K. variations
    var ukVariants = new List<string> { name };

    var dotted = WithDottedUk(name);
    if (dotted != name)
    {
        ukVariants.Add(dotted);
    }

    var undotted = name.Replace(" U.K.", " UK", StringComparison.OrdinalIgnoreCase);
    if (undotted != name)
    {
        ukVariants.Add(undotted);
    }

    // Step 2: for each UK variant, swap the legal suffix for its alternative spelling.
    // Suffix lengths come from the constants so the slice can never drift out of sync.
    foreach (var variant in ukVariants)
    {
        Add(variant);

        if (variant.EndsWith(LtdSuffix, StringComparison.OrdinalIgnoreCase))
        {
            Add(variant[..^LtdSuffix.Length] + LimitedSuffix);
        }
        else if (variant.EndsWith(LimitedSuffix, StringComparison.OrdinalIgnoreCase))
        {
            Add(variant[..^LimitedSuffix.Length] + LtdSuffix);
        }
        else if (variant.EndsWith(PlcSuffix, StringComparison.OrdinalIgnoreCase))
        {
            // Case-insensitive, so this also covers the mixed-case "Plc" spelling.
            Add(variant[..^PlcSuffix.Length] + PublicLimitedCompanySuffix);
        }
        else if (variant.EndsWith(PublicLimitedCompanySuffix, StringComparison.OrdinalIgnoreCase))
        {
            Add(variant[..^PublicLimitedCompanySuffix.Length] + PlcSuffix);
        }
    }

    // Step 3: core name without its suffix (first matching suffix only;
    // the comparison is case-insensitive).
    var suffixesToRemove = new[] { " Ltd", " Limited", " PLC", " LLP", " Inc", " Corporation", " Corp" };
    var coreName = name;
    foreach (var suffix in suffixesToRemove)
    {
        if (coreName.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
        {
            coreName = coreName[..^suffix.Length].Trim();
            break;
        }
    }

    if (coreName != name && coreName.Length >= 3)
    {
        Add(coreName);
        Add(coreName + " Limited");
        Add(coreName + " PLC");

        // Also add the U.K. variant of the core name if applicable.
        var coreWithDots = WithDottedUk(coreName);
        if (coreWithDots != coreName)
        {
            Add(coreWithDots);
            Add(coreWithDots + " Limited");
        }
    }

    return ordered;
}
|
|
|
|
/// <summary>
/// Works out which non-employment entity categories the search itself mentions.
/// A category is included when any of its patterns occurs as a substring of the
/// original name or the query; such categories (e.g., "Clubs", "Trusts") should
/// NOT be filtered out of the results.
/// </summary>
/// <param name="originalLower">Lower-cased original company name.</param>
/// <param name="queryLower">Lower-cased search query.</param>
/// <returns>The set of category names the search explicitly targets.</returns>
private static HashSet<string> GetSearchEntityTypes(string originalLower, string queryLower)
{
    var combinedSearch = originalLower + " " + queryLower;
    var mentioned = new HashSet<string>();

    foreach (var (category, patterns) in NonEmploymentEntityPatterns)
    {
        foreach (var pattern in patterns)
        {
            if (!combinedSearch.Contains(pattern))
            {
                continue;
            }

            // One hit is enough to allow the whole category.
            mentioned.Add(category);
            break;
        }
    }

    return mentioned;
}
|
|
|
|
/// <summary>
/// Decides whether a company title looks like a valid employment entity.
/// Titles matching a non-employment category (clubs, trusts, etc.) are rejected
/// unless that category is listed in <paramref name="allowedCategories"/>.
/// </summary>
/// <param name="itemTitleLower">Lower-cased company title.</param>
/// <param name="allowedCategories">Categories the search explicitly targets.</param>
/// <returns><c>false</c> when the title matches a category that was not allowed; otherwise <c>true</c>.</returns>
private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
{
    foreach (var (category, patterns) in NonEmploymentEntityPatterns)
    {
        // The search explicitly asked for this category - nothing to filter.
        if (allowedCategories.Contains(category))
        {
            continue;
        }

        foreach (var pattern in patterns)
        {
            // Multi-word patterns are matched as substrings, single words as whole words.
            var matchesPattern = pattern.Contains(' ')
                ? itemTitleLower.Contains(pattern)
                : ContainsWholeWord(itemTitleLower, pattern);

            if (matchesPattern)
            {
                return false; // Non-employment entity type that wasn't explicitly searched for
            }
        }
    }

    // No non-employment pattern matched.
    return true;
}
|
|
|
|
/// <summary>
/// Checks if a string contains a word as a whole word (not as a substring of another word).
/// E.g., "scotland" does NOT contain whole word "land", but "land holdings" does.
/// </summary>
/// <remarks>
/// The match is case-insensitive. A hit must not be directly preceded or followed by a
/// word character, which also works for terms that begin or end with punctuation
/// (e.g. "u.k."), where a plain <c>\b</c> anchor would never match at the end of the text.
/// </remarks>
/// <param name="text">Text to search.</param>
/// <param name="word">Word or short term to look for; surrounding whitespace is ignored.</param>
/// <returns><c>true</c> when <paramref name="word"/> occurs as a whole word; <c>false</c> otherwise or when either argument is empty.</returns>
private static bool ContainsWholeWord(string text, string word)
{
    if (string.IsNullOrEmpty(text) || string.IsNullOrWhiteSpace(word))
    {
        return false;
    }

    // Lookarounds instead of \b: identical for ordinary words, but still correct
    // when the term itself starts or ends with a non-word character.
    var escaped = System.Text.RegularExpressions.Regex.Escape(word.Trim());
    var pattern = $@"(?<!\w){escaped}(?!\w)";

    return System.Text.RegularExpressions.Regex.IsMatch(
        text,
        pattern,
        System.Text.RegularExpressions.RegexOptions.IgnoreCase | System.Text.RegularExpressions.RegexOptions.CultureInvariant);
}
|
|
|
|
// Words that are too common to be meaningful differentiators between companies.
// Used by ExtractCoreIdentifiers; lookups are case-insensitive.
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
{
    // Articles and conjunctions
    "the", "a", "an", "and", "or", "of", "for", "in", "at", "on", "by", "to", "with",

    // Geographic - Countries and regions
    "uk", "u.k.", "gb", "british", "britain", "england", "english", "scotland", "scottish",
    "wales", "welsh", "ireland", "irish", "northern",
    "europe", "european", "america", "american", "usa", "us", "u.s.", "u.s.a.",
    "canada", "canadian", "asia", "asian", "pacific", "atlantic",
    "australia", "australian", "africa", "african", "india", "indian",
    "france", "french", "germany", "german", "spain", "spanish", "italy", "italian",
    "japan", "japanese", "china", "chinese", "korea", "korean",
    "middle", "east", "west", "north", "south", "central", "western", "eastern",

    // Geographic - Cities
    "london", "manchester", "birmingham", "leeds", "glasgow", "edinburgh", "bristol",
    "liverpool", "sheffield", "newcastle", "cardiff", "belfast", "nottingham",
    "southampton", "portsmouth", "brighton", "leicester", "coventry", "hull",

    // Legal suffixes
    "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated", "corporation", "corp",
    "company", "co", "partners", "partnership", "enterprises", "unlimited",
    "registered", "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",

    // Business descriptors
    "group", "holdings", "holding", "parent", "subsidiary", "division", "branch",
    "services", "service", "solutions", "solution", "consulting", "consultants", "consultancy",
    "management", "systems", "system", "technologies", "technology", "tech",
    "industries", "industry", "industrial", "commercial", "trading", "trade",
    "business", "businesses", "operations", "operational", "professional", "professionals",
    "resources", "resource", "network", "networks", "associates", "associated",

    // Size/Scope descriptors
    "national", "international", "global", "worldwide", "world", "regional", "local",
    "universal", "general", "standard", "premier", "prime", "first", "one",

    // Quality/Marketing terms
    "new", "modern", "advanced", "innovative", "elite", "premium",
    "quality", "superior", "excellent", "best", "top", "leading", "major",

    // Ownership indicators (excluding "royal" as it's a meaningful company identifier)
    "imperial", "crown", "state", "public", "private", "independent",
    "mutual", "cooperative", "coop", "community",

    // Time-related
    "century", "millennium", "annual", "year", "years",

    // Numbers as words ("one" and "first" are listed under Size/Scope above)
    "two", "three", "four", "five", "second", "third"
};

/// <summary>
/// Extracts ALL core identifying words from a company name.
/// These are significant words that aren't common prefixes/suffixes.
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"]
/// "Royal Bank of Scotland" -> ["ROYAL", "BANK"] (Scotland is a geographic skip word)
/// </summary>
/// <remarks>
/// Parenthetical content is removed first. Words are split on space, '-', '/' and '&amp;',
/// stripped of leading/trailing '.', ',' and apostrophes, and dropped when shorter than
/// two characters or present in <c>SkipWords</c>. Dotted abbreviations are also checked
/// without their dots, so "U.K." and "U.S.A." are skipped just like "UK" and "USA".
/// </remarks>
/// <param name="companyName">Company name to analyse.</param>
/// <returns>Upper-cased core words in their original order; empty for a blank name.</returns>
private static List<string> ExtractCoreIdentifiers(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return new List<string>();
    }

    // Remove parenthetical content first
    var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();

    var words = cleanName.Split(new[] { ' ', '-', '/', '&' }, StringSplitOptions.RemoveEmptyEntries);
    var coreWords = new List<string>();

    foreach (var word in words)
    {
        var cleanWord = word.Trim('.', ',', '\'');
        if (cleanWord.Length < 2)
        {
            continue;
        }

        // Trimming turns "U.K." into "U.K", which matches neither "uk" nor "u.k." in the
        // skip list; comparing the dot-free form as well closes that gap.
        var dotless = cleanWord.Replace(".", string.Empty);
        if (SkipWords.Contains(cleanWord) || SkipWords.Contains(dotless))
        {
            continue;
        }

        coreWords.Add(cleanWord.ToUpperInvariant());
    }

    return coreWords;
}
|
|
|
|
#endregion
|
|
}
|