- Add NonEmploymentEntityPatterns dictionary with 10 categories of non-employer entities (clubs, associations, trusts, charities, investment, property, religious, sports, educational, professional)
- Expand SkipWords from ~30 to 120+ words for better core identifier extraction
- Add GetSearchEntityTypes() and IsValidEmploymentEntity() helper methods
- Refactor FindBestMatch() to use pattern-based filtering instead of hardcoded checks
- Fix UI showing duplicate points for the same company appearing multiple times (points are now shown only on the first occurrence; subsequent rows show 0)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1154 lines
48 KiB
C#
1154 lines
48 KiB
C#
using System.Text.Json;
|
|
using FuzzySharp;
|
|
using Microsoft.EntityFrameworkCore;
|
|
using Microsoft.Extensions.Logging;
|
|
using TrueCV.Application.DTOs;
|
|
using TrueCV.Application.Helpers;
|
|
using TrueCV.Application.Interfaces;
|
|
using TrueCV.Application.Models;
|
|
using TrueCV.Domain.Entities;
|
|
using TrueCV.Infrastructure.Data;
|
|
using TrueCV.Infrastructure.ExternalApis;
|
|
|
|
namespace TrueCV.Infrastructure.Services;
|
|
|
|
public sealed class CompanyVerifierService : ICompanyVerifierService
|
|
{
|
|
/// <summary>Minimum fuzzy score a candidate company name must reach to count as a match.</summary>
private const int FuzzyMatchThreshold = 85;

/// <summary>Age in days after which a cached company entry is no longer considered.</summary>
private const int CacheExpirationDays = 30;

/// <summary>Client used for Companies House search, company detail and officer lookups.</summary>
private readonly CompaniesHouseClient _companiesHouseClient;

/// <summary>Factory for the short-lived database contexts used to read and write the company cache.</summary>
private readonly IDbContextFactory<ApplicationDbContext> _dbContextFactory;

/// <summary>Logger for diagnostic output from this service.</summary>
private readonly ILogger<CompanyVerifierService> _logger;
|
|
|
|
/// <summary>
/// Lower-case name fragments, grouped by category, for entities that exist in
/// Companies House but are not typical employers.
/// </summary>
private static readonly Dictionary<string, string[]> NonEmploymentEntityPatterns = new()
{
    ["Clubs"] = ["club", "fan club", "owners club", "car club", "supporters", "enthusiast", "aficionados"],
    ["Associations"] = ["association", "society", "federation", "institute", "institution", "guild", "chamber of commerce"],
    ["Trusts"] = ["benefit trust", "pension", "retirement", "employee trust", "share trust", "employee benefit", "superannuation", "provident"],
    ["Charities"] = ["charity", "charitable", "foundation", "relief fund", "benevolent", "philanthropic"],
    ["Investment"] = ["nominee", "custodian", "trustee", "investment trust", "unit trust", "investment fund", "capital partners"],
    ["Property"] = ["freehold", "leasehold", "property management", "residents association", "management company rtm", "commonhold"],
    ["Religious"] = ["church", "chapel", "mosque", "synagogue", "temple", "parish", "diocese", "ministry"],
    ["Sports"] = ["football club", "cricket club", "rugby club", "golf club", "tennis club", "sports club", "athletic club"],
    ["Educational"] = ["old boys", "old girls", "alumni", "school association", "pta", "parent teacher"],
    ["Professional"] = ["chartered institute", "royal college", "professional body", "trade body", "regulatory body"],
};

/// <summary>SIC codes that indicate non-trading or non-employment entities.</summary>
private static readonly HashSet<string> NonTradingSicCodes =
[
    "99999", // Dormant company
    "64209", // Activities of holding companies (shell companies)
    "68100", // Buying and selling of own real estate (often shell)
];

/// <summary>SIC codes for tech/software companies.</summary>
private static readonly HashSet<string> TechSicCodes =
[
    // Computer programming and consultancy
    "62011", "62012", "62020", "62030", "62090",
    // Data processing, hosting
    "63110", "63120",
    // Publishing of computer games, other software
    "58210", "58290",
    // Telecommunications
    "61100", "61200", "61300", "61900",
];
|
|
|
|
/// <summary>
/// Creates the service from its Companies House client, database context factory and logger.
/// </summary>
/// <param name="companiesHouseClient">Client used for company search, detail and officer lookups.</param>
/// <param name="dbContextFactory">Factory for the database contexts that back the company cache.</param>
/// <param name="logger">Logger for this service.</param>
public CompanyVerifierService(
    CompaniesHouseClient companiesHouseClient,
    IDbContextFactory<ApplicationDbContext> dbContextFactory,
    ILogger<CompanyVerifierService> logger)
{
    (_companiesHouseClient, _dbContextFactory, _logger) =
        (companiesHouseClient, dbContextFactory, logger);
}
|
|
|
|
/// <summary>
/// Verifies a claimed employer, first against the local company cache and then
/// against Companies House search results, and runs the plausibility checks on the match.
/// </summary>
/// <param name="companyName">Company name as claimed; must not be null or whitespace.</param>
/// <param name="startDate">Claimed employment start date, if known.</param>
/// <param name="endDate">Claimed employment end date, if known.</param>
/// <param name="jobTitle">Claimed job title, used by the role-related checks.</param>
/// <returns>
/// A verified result carrying any flags raised by the checks, or an unverified result
/// whose notes say that no match was found or that the lookup was rate limited.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="companyName"/> is null or whitespace.</exception>
public async Task<CompanyVerificationResult> VerifyCompanyAsync(
    string companyName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle = null)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(companyName);

    _logger.LogDebug("Verifying company: {CompanyName}", companyName);
    var flags = new List<CompanyVerificationFlag>();

    // A cached match is only usable when the company already existed at the claimed start date.
    var cached = await FindCachedMatchAsync(companyName);
    if (cached is not null)
    {
        var existedAtStart = !startDate.HasValue
            || cached.IncorporationDate == null
            || cached.IncorporationDate <= startDate.Value;

        if (existedAtStart)
        {
            _logger.LogDebug("Found cached company match for: {CompanyName}", companyName);
            return CreateResultFromCache(cached, companyName, startDate, endDate, jobTitle, flags);
        }

        _logger.LogDebug("Cached company {CachedName} was incorporated after claimed start date, searching for alternatives", cached.CompanyName);
    }

    try
    {
        // Try each generated query in order and stop at the first one that yields a valid match.
        var queries = GenerateSearchQueries(companyName);
        _logger.LogDebug("Generated {Count} search queries for '{CompanyName}': {Queries}",
            queries.Count, companyName, string.Join(", ", queries.Select(q => $"'{q}'")));

        (CompaniesHouseSearchItem Item, int Score)? found = null;
        foreach (var query in queries)
        {
            _logger.LogDebug("Searching Companies House with query: {Query}", query);
            var response = await _companiesHouseClient.SearchCompaniesAsync(query);

            if (response?.Items is not { Count: > 0 } candidates)
            {
                continue;
            }

            // Both the original name and the query are passed so either may justify the match.
            found = FindBestMatch(companyName, query, candidates, startDate);
            if (found is null)
            {
                continue;
            }

            _logger.LogDebug("Found match with query '{Query}': {Company}", query, found.Value.Item.Title);
            break;
        }

        if (found is null)
        {
            _logger.LogDebug("No valid match found for: {CompanyName} after trying {Count} queries", companyName, queries.Count);
            return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
                "Company name could not be verified against official records");
        }

        var match = found.Value;
        var company = match.Item;

        // Full company details supply SIC codes and the accounts category; the match is cached with them.
        var details = await _companiesHouseClient.GetCompanyAsync(company.CompanyNumber);
        await CacheCompanyAsync(company, details);

        _logger.LogInformation(
            "Verified company {ClaimedName} matched to {MatchedName} with score {Score}%",
            companyName, company.Title, match.Score);

        var incorporationDate = DateHelpers.ParseDate(company.DateOfCreation);
        var dissolutionDate = DateHelpers.ParseDate(company.DateOfCessation);
        var companyStatus = company.CompanyStatus;
        var companyType = company.CompanyType;
        var sicCodes = details?.SicCodes ?? company.SicCodes;
        var accountsCategory = details?.Accounts?.LastAccounts?.Type;

        // Checks 1-5 append to flags directly.
        CheckIncorporationDate(flags, startDate, incorporationDate, company.Title);
        CheckDissolutionDate(flags, endDate, dissolutionDate, companyStatus, company.Title);
        CheckDormantCompany(flags, accountsCategory, jobTitle, company.Title);
        CheckCompanySizeVsRole(flags, accountsCategory, jobTitle, company.Title);
        CheckSicCodeMismatch(flags, sicCodes, jobTitle, company.Title);

        // Check 6 returns its verdict, which is turned into a flag here when implausible.
        var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, companyType);
        if (jobPlausible is false)
        {
            var implausibleTitle = new CompanyVerificationFlag
            {
                Type = "ImplausibleJobTitle",
                Severity = "Critical",
                Message = jobNotes ?? "Job title requires verification",
                ScoreImpact = -15
            };
            flags.Add(implausibleTitle);
        }

        return new CompanyVerificationResult
        {
            ClaimedCompany = companyName,
            MatchedCompanyName = company.Title,
            MatchedCompanyNumber = company.CompanyNumber,
            MatchScore = match.Score,
            IsVerified = true,
            VerificationNotes = null,
            ClaimedStartDate = startDate,
            ClaimedEndDate = endDate,
            CompanyType = companyType,
            CompanyStatus = companyStatus,
            IncorporationDate = incorporationDate,
            DissolutionDate = dissolutionDate,
            AccountsCategory = accountsCategory,
            SicCodes = sicCodes,
            ClaimedJobTitle = jobTitle,
            JobTitlePlausible = jobPlausible,
            JobTitleNotes = jobNotes,
            Flags = flags
        };
    }
    catch (CompaniesHouseRateLimitException ex)
    {
        _logger.LogWarning(ex, "Rate limit hit while verifying company: {CompanyName}", companyName);
        return CreateUnverifiedResult(companyName, startDate, endDate, jobTitle,
            "Verification temporarily unavailable due to rate limiting");
    }
}
|
|
|
|
/// <summary>
/// Searches Companies House and maps each returned item to a <see cref="CompanySearchResult"/>.
/// </summary>
/// <param name="query">Search text; must not be null or whitespace.</param>
/// <returns>The mapped results, or an empty list when the response has no items.</returns>
/// <exception cref="ArgumentException"><paramref name="query"/> is null or whitespace.</exception>
public async Task<List<CompanySearchResult>> SearchCompaniesAsync(string query)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(query);

    _logger.LogDebug("Searching companies for query: {Query}", query);

    var response = await _companiesHouseClient.SearchCompaniesAsync(query);

    var results = new List<CompanySearchResult>();
    if (response?.Items is not { } items)
    {
        return results;
    }

    foreach (var item in items)
    {
        results.Add(new CompanySearchResult
        {
            CompanyNumber = item.CompanyNumber,
            CompanyName = item.Title,
            // A missing status is reported as "Unknown".
            CompanyStatus = item.CompanyStatus ?? "Unknown",
            IncorporationDate = DateHelpers.ParseDate(item.DateOfCreation),
            AddressSnippet = item.AddressSnippet
        });
    }

    return results;
}
|
|
|
|
/// <summary>
/// Checks whether a candidate appears among a company's officers (roles containing
/// "director" or "secretary") with an appointment that overlaps the claimed period.
/// </summary>
/// <param name="companyNumber">Companies House company number to look up.</param>
/// <param name="candidateName">Candidate's name, fuzzy-matched against officer names.</param>
/// <param name="startDate">Claimed start date, if known.</param>
/// <param name="endDate">Claimed end date, if known.</param>
/// <returns>
/// <c>true</c> when a matching officer is found (and, if dates were claimed, the periods overlap);
/// <c>false</c> when officers were returned but none matched; <c>null</c> when the answer could
/// not be determined (blank input, no officers returned, rate limiting, or any other error).
/// </returns>
public async Task<bool?> VerifyDirectorAsync(
    string companyNumber,
    string candidateName,
    DateOnly? startDate,
    DateOnly? endDate)
{
    // Minimum Fuzz.Ratio score for an officer name to count as the candidate.
    const int NameMatchThreshold = 80;

    if (string.IsNullOrWhiteSpace(companyNumber) || string.IsNullOrWhiteSpace(candidateName))
    {
        return null;
    }

    try
    {
        var officers = await _companiesHouseClient.GetOfficersAsync(companyNumber);

        if (officers?.Items is null || officers.Items.Count == 0)
        {
            _logger.LogDebug("No officers found for company {CompanyNumber}", companyNumber);
            return null;
        }

        // Normalize once; officer names are normalized the same way before comparison.
        var normalizedCandidate = NormalizeName(candidateName);

        foreach (var officer in officers.Items)
        {
            // Only director-like and secretary roles are considered.
            var role = officer.OfficerRole?.ToLowerInvariant() ?? "";
            if (!role.Contains("director") && !role.Contains("secretary"))
            {
                continue;
            }

            var normalizedOfficer = NormalizeName(officer.Name);
            var matchScore = Fuzz.Ratio(normalizedCandidate, normalizedOfficer);
            if (matchScore < NameMatchThreshold)
            {
                continue;
            }

            // With no claimed dates at all, a name match alone is sufficient.
            if (!startDate.HasValue && !endDate.HasValue)
            {
                _logger.LogDebug(
                    "Found matching director {OfficerName} for candidate {CandidateName} at company {CompanyNumber}",
                    officer.Name, candidateName, companyNumber);
                return true;
            }

            var appointedOn = DateHelpers.ParseDate(officer.AppointedOn);
            var resignedOn = DateHelpers.ParseDate(officer.ResignedOn);

            // Otherwise the claimed period must overlap the appointment; keep scanning if it
            // does not, because the same person may hold several appointments.
            if (DatesOverlap(startDate, endDate, appointedOn, resignedOn))
            {
                _logger.LogDebug(
                    "Verified director {OfficerName} matches candidate {CandidateName} with overlapping dates",
                    officer.Name, candidateName);
                return true;
            }
        }

        _logger.LogDebug(
            "No matching director found for candidate {CandidateName} at company {CompanyNumber}",
            candidateName, companyNumber);
        return false;
    }
    catch (CompaniesHouseRateLimitException ex)
    {
        // Include the exception so the log carries the same detail as the rate-limit
        // warning written by VerifyCompanyAsync.
        _logger.LogWarning(ex, "Rate limit hit while verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
    catch (Exception ex)
    {
        // Best effort: any other failure is logged and reported as "could not determine".
        _logger.LogError(ex, "Error verifying director for company {CompanyNumber}", companyNumber);
        return null;
    }
}
|
|
|
|
/// <summary>
/// Normalizes a person's name for comparison: upper-cases and trims it, and rewrites
/// the "SURNAME, Firstname" form as "FIRSTNAME SURNAME".
/// </summary>
/// <param name="name">Name to normalize; null, empty or whitespace yields an empty string.</param>
/// <returns>The normalized name with no leading or trailing whitespace.</returns>
private static string NormalizeName(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return "";
    }

    var normalized = name.ToUpperInvariant().Trim();

    // Companies House often stores names as "SURNAME, Firstname"; only the first comma splits.
    var commaIndex = normalized.IndexOf(',');
    if (commaIndex < 0)
    {
        return normalized;
    }

    var surname = normalized[..commaIndex].Trim();
    var forenames = normalized[(commaIndex + 1)..].Trim();

    // Trim the joined result so an empty side ("SMITH," or ",JOHN") does not leave a stray
    // space that would lower the fuzzy-match score.
    return $"{forenames} {surname}".Trim();
}
|
|
|
|
/// <summary>
/// Determines whether two date periods overlap, treating a missing bound as open-ended.
/// </summary>
/// <param name="start1">Start of the first period, or null for no lower bound.</param>
/// <param name="end1">End of the first period, or null for no upper bound.</param>
/// <param name="start2">Start of the second period, or null for no lower bound.</param>
/// <param name="end2">End of the second period, or null for no upper bound.</param>
/// <returns>
/// <c>true</c> when either period has no bounds at all, or when the periods share at least
/// one day (bounds are inclusive); otherwise <c>false</c>.
/// </returns>
private static bool DatesOverlap(DateOnly? start1, DateOnly? end1, DateOnly? start2, DateOnly? end2)
{
    // A period with neither bound cannot rule anything out, so overlap is assumed.
    var firstIsUnbounded = start1 is null && end1 is null;
    var secondIsUnbounded = start2 is null && end2 is null;
    if (firstIsUnbounded || secondIsUnbounded)
    {
        return true;
    }

    // Substitute the widest possible bound for whichever side is missing.
    var firstStart = start1.GetValueOrDefault(DateOnly.MinValue);
    var firstEnd = end1.GetValueOrDefault(DateOnly.MaxValue);
    var secondStart = start2.GetValueOrDefault(DateOnly.MinValue);
    var secondEnd = end2.GetValueOrDefault(DateOnly.MaxValue);

    // The periods are disjoint only when one begins after the other has ended.
    var disjoint = firstStart > secondEnd || secondStart > firstEnd;
    return !disjoint;
}
|
|
|
|
#region Verification Checks
|
|
|
|
/// <summary>
/// Adds a critical flag when the claimed start date is earlier than the company's
/// incorporation date. Does nothing when either date is missing.
/// </summary>
/// <param name="flags">List the flag is appended to.</param>
/// <param name="claimedStartDate">Claimed employment start date.</param>
/// <param name="incorporationDate">Company incorporation date.</param>
/// <param name="companyName">Company name used in the flag message.</param>
private static void CheckIncorporationDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedStartDate,
    DateOnly? incorporationDate,
    string companyName)
{
    if (!claimedStartDate.HasValue || !incorporationDate.HasValue)
    {
        return;
    }

    // Starting on or after incorporation is fine.
    if (claimedStartDate.Value >= incorporationDate.Value)
    {
        return;
    }

    var flag = new CompanyVerificationFlag
    {
        Type = "EmploymentBeforeIncorporation",
        Severity = "Critical",
        Message = $"Claimed employment at '{companyName}' starting {claimedStartDate:MMM yyyy} is before company incorporation date {incorporationDate:MMM yyyy}",
        ScoreImpact = -20
    };
    flags.Add(flag);
}
|
|
|
|
/// <summary>
/// Flags employment that is claimed to continue after a company ceased to exist.
/// Applies only when a dissolution date is known and the status is dissolved,
/// liquidation or administration.
/// </summary>
/// <param name="flags">List the flag is appended to.</param>
/// <param name="claimedEndDate">Claimed end date; null means the employment is claimed as current.</param>
/// <param name="dissolutionDate">Date the company was dissolved, if any.</param>
/// <param name="companyStatus">Company status text, compared case-insensitively.</param>
/// <param name="companyName">Company name used in the flag message.</param>
private static void CheckDissolutionDate(
    List<CompanyVerificationFlag> flags,
    DateOnly? claimedEndDate,
    DateOnly? dissolutionDate,
    string? companyStatus,
    string companyName)
{
    var status = companyStatus?.ToLowerInvariant();
    var hasCeased = status == "dissolved" || status == "liquidation" || status == "administration";
    if (!dissolutionDate.HasValue || !hasCeased)
    {
        return;
    }

    // Employment may plausibly run up to three months past dissolution (wind-down).
    var windDownLimit = dissolutionDate.Value.AddMonths(3);

    if (!claimedEndDate.HasValue)
    {
        // No end date: the candidate claims to still work at a company that no longer exists.
        flags.Add(new CompanyVerificationFlag
        {
            Type = "CurrentEmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claims current employment at '{companyName}' but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -25
        });
        return;
    }

    if (claimedEndDate.Value > windDownLimit)
    {
        flags.Add(new CompanyVerificationFlag
        {
            Type = "EmploymentAtDissolvedCompany",
            Severity = "Critical",
            Message = $"Claimed employment at '{companyName}' until {claimedEndDate:MMM yyyy} but company was dissolved on {dissolutionDate:MMM yyyy}",
            ScoreImpact = -20
        });
    }
}
|
|
|
|
/// <summary>
/// Adds a warning flag when a non-director role is claimed at a company whose
/// accounts category mentions "dormant".
/// </summary>
/// <param name="flags">List the flag is appended to.</param>
/// <param name="accountsCategory">Accounts category of the company; blank skips the check.</param>
/// <param name="jobTitle">Claimed job title; a missing title is treated as a non-director role.</param>
/// <param name="companyName">Company name used in the flag message.</param>
private static void CheckDormantCompany(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    if (string.IsNullOrWhiteSpace(accountsCategory))
    {
        return;
    }

    if (!accountsCategory.ToLowerInvariant().Contains("dormant"))
    {
        return;
    }

    // Directors and company secretaries can legitimately maintain a dormant company.
    var loweredTitle = jobTitle?.ToLowerInvariant() ?? "";
    if (loweredTitle.Contains("director") || loweredTitle.Contains("company secretary"))
    {
        return;
    }

    var flag = new CompanyVerificationFlag
    {
        Type = "EmploymentAtDormantCompany",
        Severity = "Warning",
        Message = $"Claimed active employment as '{jobTitle}' at '{companyName}' which files dormant accounts",
        ScoreImpact = -10
    };
    flags.Add(flag);
}
|
|
|
|
/// <summary>
/// Adds a warning flag when a senior-sounding title is claimed at a company whose
/// accounts category mentions "micro".
/// </summary>
/// <param name="flags">List the flag is appended to.</param>
/// <param name="accountsCategory">Accounts category of the company; blank skips the check.</param>
/// <param name="jobTitle">Claimed job title; blank skips the check.</param>
/// <param name="companyName">Company name used in the flag message.</param>
private static void CheckCompanySizeVsRole(
    List<CompanyVerificationFlag> flags,
    string? accountsCategory,
    string? jobTitle,
    string companyName)
{
    if (string.IsNullOrWhiteSpace(accountsCategory) || string.IsNullOrWhiteSpace(jobTitle))
    {
        return;
    }

    // Micro-entity: < 10 employees, < £632k turnover
    if (!accountsCategory.ToLowerInvariant().Contains("micro"))
    {
        return;
    }

    // Title fragments treated as senior management.
    string[] seniorMarkers =
    [
        "vp",
        "vice president",
        "head of",
        "chief",
        "director of",
        "senior director",
    ];

    var loweredTitle = jobTitle.ToLowerInvariant();
    if (!seniorMarkers.Any(marker => loweredTitle.Contains(marker)))
    {
        return;
    }

    flags.Add(new CompanyVerificationFlag
    {
        Type = "SeniorRoleAtMicroCompany",
        Severity = "Warning",
        Message = $"Claimed senior role '{jobTitle}' at '{companyName}' which files micro-entity accounts (typically <10 employees)",
        ScoreImpact = -10
    });
}
|
|
|
|
/// <summary>
/// Adds an informational flag when a technology job title is claimed at a company
/// that has none of the SIC codes in <c>TechSicCodes</c>.
/// </summary>
/// <param name="flags">List the flag is appended to.</param>
/// <param name="sicCodes">SIC codes of the company; null or empty skips the check.</param>
/// <param name="jobTitle">Claimed job title; blank skips the check.</param>
/// <param name="companyName">Company name used in the flag message.</param>
private static void CheckSicCodeMismatch(
    List<CompanyVerificationFlag> flags,
    List<string>? sicCodes,
    string? jobTitle,
    string companyName)
{
    if (sicCodes is null || sicCodes.Count == 0 || string.IsNullOrWhiteSpace(jobTitle))
    {
        return;
    }

    var title = jobTitle.ToLowerInvariant();

    // "ai" must be matched as a whole word. A substring test on "ai " fires inside ordinary
    // words ("thai chef") and misses titles that end with the acronym ("head of ai").
    var titleWords = System.Text.RegularExpressions.Regex.Split(title, @"[^\p{L}\p{N}]+");
    var mentionsAi = Array.IndexOf(titleWords, "ai") >= 0;

    // Longer keywords are distinctive enough for a plain substring test.
    string[] techKeywords =
    [
        "software",
        "developer",
        "engineer",
        "programmer",
        "data scientist",
        "data analyst",
        "devops",
        "cloud",
        "machine learning",
        "frontend",
        "backend",
        "full stack",
        "fullstack",
    ];

    var isTechRole = mentionsAi || techKeywords.Any(keyword => title.Contains(keyword));
    if (!isTechRole)
    {
        return;
    }

    // Any one technology SIC code is enough to accept the role.
    if (sicCodes.Any(code => TechSicCodes.Contains(code)))
    {
        return;
    }

    // Only the first SIC code is shown, as a raw code rather than a description.
    var primarySic = sicCodes.FirstOrDefault() ?? "Unknown";

    flags.Add(new CompanyVerificationFlag
    {
        Type = "SicCodeMismatch",
        Severity = "Info",
        Message = $"Tech role '{jobTitle}' at '{companyName}' (SIC: {primarySic}) - company is not registered as a technology business",
        ScoreImpact = -5
    });
}
|
|
|
|
/// <summary>
/// Judges whether a claimed job title is plausible for the given company type.
/// Only PLCs are scrutinised: C-suite, board-level and VP-level titles there are
/// reported as implausible pending verification.
/// </summary>
/// <param name="jobTitle">Claimed job title.</param>
/// <param name="companyType">Company type text; matched on "plc" or "public limited".</param>
/// <returns>
/// <c>(null, null)</c> when either input is blank; <c>(false, notes)</c> for a senior
/// title at a PLC; <c>(true, null)</c> otherwise.
/// </returns>
private static (bool? IsPlausible, string? Notes) CheckJobTitlePlausibility(string? jobTitle, string? companyType)
{
    if (string.IsNullOrWhiteSpace(jobTitle) || string.IsNullOrWhiteSpace(companyType))
    {
        return (null, null);
    }

    var type = companyType.Trim().ToLowerInvariant();

    // Check if this is a PLC (Public Limited Company); nothing is flagged for other types.
    var isPlc = type.Contains("plc") || type.Contains("public limited");
    if (!isPlc)
    {
        return (true, null);
    }

    // Split the title into words so that acronyms are matched as whole words. Substring
    // matching misfires badly here: "cto" occurs in "director", "contractor" and "doctor",
    // "coo" in "coordinator" and "cook", and "vp" at the start of "vpn".
    var words = System.Text.RegularExpressions.Regex
        .Split(jobTitle.Trim().ToLowerInvariant(), @"[^\p{L}\p{N}]+")
        .Where(word => word.Length > 0)
        .ToArray();
    var wordSet = new HashSet<string>(words, StringComparer.Ordinal);

    // Phrases are matched against the words re-joined with single spaces, so punctuation
    // variants such as "vice-president" or "non-executive director" match too.
    var title = string.Join(' ', words);

    bool HasWord(string word) => wordSet.Contains(word);

    // "president" on its own is C-suite, but "vice president" belongs to the VP tier below.
    var isPresident = title.Contains("president") && !title.Contains("vice president");

    // Check for C-suite / very senior roles
    var isCsuiteRole = HasWord("ceo") ||
                       title.Contains("chief executive") ||
                       HasWord("cto") ||
                       title.Contains("chief technology") ||
                       HasWord("cfo") ||
                       title.Contains("chief financial") ||
                       HasWord("coo") ||
                       title.Contains("chief operating") ||
                       HasWord("cio") ||
                       title.Contains("chief information") ||
                       title.Contains("managing director") ||
                       title == "md" ||
                       title.Contains("chairman") ||
                       title.Contains("chairwoman") ||
                       title.Contains("chairperson") ||
                       isPresident;

    // Check for board-level roles. A bare "director" counts; "director of X" does not.
    var isBoardRole = title.Contains("board member") ||
                      title.Contains("executive director") ||
                      title == "director";

    if (isCsuiteRole || isBoardRole)
    {
        return (false, $"Claimed senior role '{jobTitle}' at a PLC requires verification - C-suite positions at public companies are publicly disclosed");
    }

    // Check for VP/SVP/EVP at PLCs (also usually disclosed)
    var isVpRole = title.Contains("vice president") ||
                   HasWord("vp") ||
                   HasWord("svp") ||
                   HasWord("evp");

    if (isVpRole)
    {
        return (false, $"Claimed VP-level role '{jobTitle}' at a PLC - senior positions at public companies should be verifiable");
    }

    return (true, null);
}
|
|
|
|
#endregion
|
|
|
|
#region Helper Methods
|
|
|
|
/// <summary>
/// Looks for a non-expired cache entry whose name fuzzy-matches the claimed company name.
/// </summary>
/// <param name="companyName">Claimed company name.</param>
/// <returns>
/// The cache entry with the highest token-set score at or above <c>FuzzyMatchThreshold</c>,
/// or <c>null</c> when no entry cached within <c>CacheExpirationDays</c> qualifies.
/// </returns>
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
{
    var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);

    await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

    // The entries are only read and the context is disposed on return, so change
    // tracking would be wasted work. Fuzzy scoring happens in memory, which is why
    // every non-expired entry is loaded.
    var cachedCompanies = await dbContext.CompanyCache
        .AsNoTracking()
        .Where(c => c.CachedAt >= cutoffDate)
        .ToListAsync();

    if (cachedCompanies.Count == 0)
    {
        return null;
    }

    // Upper-case the claimed name once instead of once per cached entry.
    var claimedUpper = companyName.ToUpperInvariant();

    var best = cachedCompanies
        .Where(c => !string.IsNullOrWhiteSpace(c.CompanyName))
        .Select(c => new { Company = c, Score = Fuzz.TokenSetRatio(claimedUpper, c.CompanyName.ToUpperInvariant()) })
        .Where(m => m.Score >= FuzzyMatchThreshold)
        .OrderByDescending(m => m.Score)
        .FirstOrDefault();

    return best?.Company;
}
|
|
|
|
/// <summary>
/// Picks the best Companies House search item for a claimed company name.
/// </summary>
/// <remarks>
/// An item qualifies when its title contains every core identifier of either the claimed
/// name or the search query, passes the non-employment entity filter, and scores at least
/// <c>FuzzyMatchThreshold</c> against the claimed name or the query (whichever is higher).
/// </remarks>
/// <param name="companyName">Company name as claimed.</param>
/// <param name="searchQuery">Query that produced <paramref name="items"/>.</param>
/// <param name="items">Search results to choose from.</param>
/// <param name="claimedStartDate">Claimed start date; when set, only companies incorporated on or before it (or with no known date) are eligible.</param>
/// <returns>The highest-scoring eligible item with its score, or <c>null</c> when there is none.</returns>
private (CompaniesHouseSearchItem Item, int Score)? FindBestMatch(
    string companyName,
    string searchQuery,
    List<CompaniesHouseSearchItem> items,
    DateOnly? claimedStartDate)
{
    var upperName = companyName.ToUpperInvariant();
    var upperQuery = searchQuery.ToUpperInvariant();

    // Core identifiers must all appear in a valid match. This stops "BMW Group Canada"
    // matching "CANADA LIFE GROUP" on common words alone, and "Lloyds Bowmaker" matching
    // "LLOYDS ALARMS" (which lacks "Bowmaker").
    var nameCores = ExtractCoreIdentifiers(companyName);
    var queryCores = ExtractCoreIdentifiers(searchQuery);

    // Entity types the search explicitly asks for are exempt from the non-employer filter.
    var requestedEntityTypes = GetSearchEntityTypes(companyName.ToLowerInvariant(), searchQuery.ToLowerInvariant());

    // Items are judged against both the claimed name and the query, so that for
    // "Matthew Walker (Northern Foods Plc)" a hit on "Northern Foods Plc" still counts.
    var matches = new List<(CompaniesHouseSearchItem Item, int Score)>();
    foreach (var item in items)
    {
        if (string.IsNullOrWhiteSpace(item.Title))
        {
            continue;
        }

        var upperTitle = item.Title.ToUpperInvariant();

        var coversName = nameCores.Count == 0 || nameCores.All(w => upperTitle.Contains(w));
        var coversQuery = queryCores.Count == 0 || queryCores.All(w => upperTitle.Contains(w));
        if (!coversName && !coversQuery)
        {
            continue;
        }

        if (!IsValidEmploymentEntity(item.Title.ToLowerInvariant(), requestedEntityTypes))
        {
            continue;
        }

        var scoreVsName = Fuzz.TokenSetRatio(upperName, upperTitle);
        var scoreVsQuery = Fuzz.TokenSetRatio(upperQuery, upperTitle);
        var score = Math.Max(scoreVsName, scoreVsQuery);
        if (score >= FuzzyMatchThreshold)
        {
            matches.Add((item, score));
        }
    }

    _logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery);
    foreach (var m in matches.Take(5))
    {
        _logger.LogDebug("  Match: {Title} ({Number}), Score: {Score}, DateOfCreation: {Date}",
            m.Item.Title, m.Item.CompanyNumber, m.Score, m.Item.DateOfCreation ?? "null");
    }

    if (matches.Count == 0)
    {
        return null;
    }

    if (!claimedStartDate.HasValue)
    {
        // No start date provided - just use highest score
        var fallback = matches.OrderByDescending(m => m.Score).First();
        _logger.LogDebug("No start date filter, using highest score: {Title} ({Number})", fallback.Item.Title, fallback.Item.CompanyNumber);
        return fallback;
    }

    _logger.LogDebug("Filtering for companies that existed at claimed start date: {StartDate}", claimedStartDate.Value);

    var eligible = new List<(CompaniesHouseSearchItem Item, int Score)>();
    foreach (var candidate in matches)
    {
        // An unknown incorporation date is given the benefit of the doubt.
        var incDate = DateHelpers.ParseDate(candidate.Item.DateOfCreation);
        var existed = incDate == null || incDate <= claimedStartDate.Value;
        _logger.LogDebug("  {Title}: IncDate={IncDate}, Existed={Existed}",
            candidate.Item.Title, incDate?.ToString() ?? "null", existed);

        if (existed)
        {
            eligible.Add(candidate);
        }
    }

    // Stable ordering: ties keep their search-result order.
    var ranked = eligible.OrderByDescending(m => m.Score).ToList();

    _logger.LogDebug("Companies that existed at start date: {Count}", ranked.Count);

    if (ranked.Count > 0)
    {
        _logger.LogDebug("Selected: {Title} ({Number})", ranked[0].Item.Title, ranked[0].Item.CompanyNumber);
        return ranked[0];
    }

    // Nothing existed at the claimed start date: better no match than a wrong company.
    _logger.LogDebug("No companies found that existed at claimed start date {StartDate}, returning no match", claimedStartDate.Value);
    return null;
}
|
|
|
|
/// <summary>
/// Inserts or refreshes the cache entry for a matched company, keyed by company number.
/// </summary>
/// <param name="item">Search item that was matched.</param>
/// <param name="details">Full company details, if they were retrieved; preferred for SIC codes and the source of the accounts category.</param>
private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)
{
    try
    {
        await using var dbContext = await _dbContextFactory.CreateDbContextAsync();

        var entry = await dbContext.CompanyCache
            .FirstOrDefaultAsync(c => c.CompanyNumber == item.CompanyNumber);

        // Values shared by the insert and update paths.
        var sicCodes = details?.SicCodes ?? item.SicCodes;
        var serializedSicCodes = sicCodes is null ? null : JsonSerializer.Serialize(sicCodes);
        var accountsCategory = details?.Accounts?.LastAccounts?.Type;
        var status = item.CompanyStatus ?? "Unknown";
        var incorporatedOn = DateHelpers.ParseDate(item.DateOfCreation);
        var dissolvedOn = DateHelpers.ParseDate(item.DateOfCessation);

        if (entry is null)
        {
            dbContext.CompanyCache.Add(new CompanyCache
            {
                CompanyNumber = item.CompanyNumber,
                CompanyName = item.Title,
                Status = status,
                CompanyType = item.CompanyType,
                IncorporationDate = incorporatedOn,
                DissolutionDate = dissolvedOn,
                AccountsCategory = accountsCategory,
                SicCodesJson = serializedSicCodes,
                CachedAt = DateTime.UtcNow
            });
        }
        else
        {
            // Refresh every field and restart the expiry clock.
            entry.CompanyName = item.Title;
            entry.Status = status;
            entry.CompanyType = item.CompanyType;
            entry.IncorporationDate = incorporatedOn;
            entry.DissolutionDate = dissolvedOn;
            entry.AccountsCategory = accountsCategory;
            entry.SicCodesJson = serializedSicCodes;
            entry.CachedAt = DateTime.UtcNow;
        }

        await dbContext.SaveChangesAsync();
    }
    catch (DbUpdateException ex) when (ex.InnerException?.Message.Contains("PK_CompanyCache") == true)
    {
        // Race condition: another task inserted the same company first - safe to ignore.
        _logger.LogDebug("Company {CompanyNumber} already cached by another task", item.CompanyNumber);
    }
}
|
|
|
|
/// <summary>
/// Builds a verified result from a cached company, running the same verification
/// checks that are applied to a fresh Companies House match.
/// </summary>
/// <param name="cached">Cache entry that matched the claimed company.</param>
/// <param name="claimedCompany">Company name as claimed.</param>
/// <param name="startDate">Claimed start date, if known.</param>
/// <param name="endDate">Claimed end date, if known.</param>
/// <param name="jobTitle">Claimed job title, if known.</param>
/// <param name="flags">List that receives any flags raised; also attached to the result.</param>
/// <returns>A result with <c>IsVerified</c> set to <c>true</c>.</returns>
private CompanyVerificationResult CreateResultFromCache(
    CompanyCache cached,
    string claimedCompany,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    List<CompanyVerificationFlag> flags)
{
    var matchScore = Fuzz.TokenSetRatio(
        claimedCompany.ToUpperInvariant(),
        cached.CompanyName.ToUpperInvariant());

    var sicCodes = default(List<string>);
    if (!string.IsNullOrEmpty(cached.SicCodesJson))
    {
        try
        {
            sicCodes = JsonSerializer.Deserialize<List<string>>(cached.SicCodesJson);
        }
        catch (JsonException)
        {
            // Malformed cached JSON is treated as "no SIC codes" rather than failing verification.
        }
    }

    var matchedName = cached.CompanyName;
    CheckIncorporationDate(flags, startDate, cached.IncorporationDate, matchedName);
    CheckDissolutionDate(flags, endDate, cached.DissolutionDate, cached.Status, matchedName);
    CheckDormantCompany(flags, cached.AccountsCategory, jobTitle, matchedName);
    CheckCompanySizeVsRole(flags, cached.AccountsCategory, jobTitle, matchedName);
    CheckSicCodeMismatch(flags, sicCodes, jobTitle, matchedName);

    // The job-title check returns its verdict; an implausible one becomes a flag here.
    var (jobPlausible, jobNotes) = CheckJobTitlePlausibility(jobTitle, cached.CompanyType);
    if (jobPlausible is false)
    {
        var implausibleTitle = new CompanyVerificationFlag
        {
            Type = "ImplausibleJobTitle",
            Severity = "Critical",
            Message = jobNotes ?? "Job title requires verification",
            ScoreImpact = -15
        };
        flags.Add(implausibleTitle);
    }

    return new CompanyVerificationResult
    {
        ClaimedCompany = claimedCompany,
        MatchedCompanyName = matchedName,
        MatchedCompanyNumber = cached.CompanyNumber,
        MatchScore = matchScore,
        IsVerified = true,
        VerificationNotes = null,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        CompanyType = cached.CompanyType,
        CompanyStatus = cached.Status,
        IncorporationDate = cached.IncorporationDate,
        DissolutionDate = cached.DissolutionDate,
        AccountsCategory = cached.AccountsCategory,
        SicCodes = sicCodes,
        ClaimedJobTitle = jobTitle,
        JobTitlePlausible = jobPlausible,
        JobTitleNotes = jobNotes,
        Flags = flags
    };
}
|
|
|
|
/// <summary>
/// Builds the result returned when a company could not be verified.
/// </summary>
/// <param name="companyName">Company name as claimed.</param>
/// <param name="startDate">Claimed start date, if known.</param>
/// <param name="endDate">Claimed end date, if known.</param>
/// <param name="jobTitle">Claimed job title, if known.</param>
/// <param name="reason">Explanation stored in the result's verification notes.</param>
/// <returns>A result with <c>IsVerified</c> false, a match score of 0 and no matched company.</returns>
private static CompanyVerificationResult CreateUnverifiedResult(
    string companyName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    string reason) =>
    new()
    {
        ClaimedCompany = companyName,
        MatchedCompanyName = null,
        MatchedCompanyNumber = null,
        MatchScore = 0,
        IsVerified = false,
        VerificationNotes = reason,
        ClaimedStartDate = startDate,
        ClaimedEndDate = endDate,
        ClaimedJobTitle = jobTitle
    };
|
|
|
|
/// <summary>
/// Generates alternative search queries to find companies that may be registered
/// with slightly different names (e.g., "U.K." vs "UK", "Limited" vs "Ltd").
/// Also handles "Brand (Parent Company)" format by extracting and prioritizing the parent,
/// "Name1/Name2" format by trying each part, and "Parent Division" names by trying the first word.
/// </summary>
/// <param name="companyName">Company name as claimed; surrounding whitespace is trimmed before processing.</param>
/// <returns>
/// Case-insensitively distinct queries in the order they were generated: parent-company variations,
/// brand variations, each "/"-separated part, first-word guesses, then variations of the full name.
/// </returns>
private static List<string> GenerateSearchQueries(string companyName)
{
    // HashSet<T> enumeration order is not defined, so the priority order is kept in a list
    // and the set is used only for case-insensitive de-duplication.
    var orderedQueries = new List<string>();
    var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    void AddQuery(string query)
    {
        if (seen.Add(query))
        {
            orderedQueries.Add(query);
        }
    }

    void AddVariationsOf(string name)
    {
        foreach (var variation in GenerateNameVariations(name))
        {
            AddQuery(variation);
        }
    }

    var normalized = companyName.Trim();

    // Step 0a: "Brand (Parent Company)" - a trailing parenthetical is treated as the parent company.
    // The parent is more likely to be the registered name, so its queries go first.
    var parentMatch = System.Text.RegularExpressions.Regex.Match(normalized, @"\(([^)]+)\)\s*$");
    if (parentMatch.Success)
    {
        AddVariationsOf(parentMatch.Groups[1].Value.Trim());

        // Also try the brand name on its own (everything before the parenthetical).
        var brandName = normalized[..parentMatch.Index].Trim();
        if (brandName.Length >= 3)
        {
            AddVariationsOf(brandName);
        }
    }

    // Step 0b: "Name1/Name2" (e.g., "ASDA/WALMART") - each part may be a separately registered name.
    if (normalized.Contains('/'))
    {
        var parts = normalized.Split('/', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
        foreach (var part in parts)
        {
            if (part.Length >= 3)
            {
                AddVariationsOf(part);
            }
        }
    }

    // Step 0c: first word as potential parent company (e.g., "UNILEVER BESTFOOD" -> "UNILEVER").
    // Many company names are "ParentCompany Division" or "ParentCompany Brand".
    var words = normalized.Split(' ', StringSplitOptions.RemoveEmptyEntries);
    if (words.Length >= 2)
    {
        var firstWord = words[0];

        // Generic leading words that say nothing about the parent company.
        var genericLeadingWords = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "the", "a", "an", "uk", "british", "national", "international", "global", "new"
        };

        if (firstWord.Length >= 4 && !genericLeadingWords.Contains(firstWord))
        {
            AddVariationsOf(firstWord);

            // Also try first word + PLC/Limited for major corporations.
            AddQuery(firstWord + " PLC");
            AddQuery(firstWord + " Limited");
        }
    }

    // Finally, variations of the full original name.
    AddVariationsOf(normalized);

    return orderedQueries;
}
|
|
|
|
/// <summary>
/// Generates name variations for a single company name: "UK" vs "U.K.", "Ltd" vs "Limited",
/// "PLC" vs "Public Limited Company", and the core name with its legal suffix removed.
/// </summary>
/// <param name="name">Company name to expand; it is always part of the result.</param>
/// <returns>The case-insensitively distinct variations, including <paramref name="name"/> itself.</returns>
private static List<string> GenerateNameVariations(string name)
{
    // "UK" counts only as a standalone token: preceded by a space and not followed by a letter
    // or digit. This keeps words such as "Ukraine" from being rewritten to "U.K.raine".
    const string StandaloneUkPattern = @"(?<= )UK(?![\p{L}\p{N}])";
    const System.Text.RegularExpressions.RegexOptions UkOptions =
        System.Text.RegularExpressions.RegexOptions.IgnoreCase |
        System.Text.RegularExpressions.RegexOptions.CultureInvariant;

    static bool HasStandaloneUk(string text) =>
        System.Text.RegularExpressions.Regex.IsMatch(text, StandaloneUkPattern, UkOptions);

    static string WithDottedUk(string text) =>
        System.Text.RegularExpressions.Regex.Replace(text, StandaloneUkPattern, "U.K.", UkOptions);

    // Swaps a trailing suffix; the length is taken from the suffix itself so it cannot drift.
    static string SwapSuffix(string text, string suffix, string replacement) =>
        text[..^suffix.Length] + replacement;

    var variations = new HashSet<string>(StringComparer.OrdinalIgnoreCase) { name };

    // Step 1: Generate UK/U.K. variations
    var ukVariants = new List<string> { name };

    if (HasStandaloneUk(name))
    {
        var withDots = WithDottedUk(name);
        if (withDots != name)
        {
            ukVariants.Add(withDots);
        }
    }

    if (name.Contains(" U.K.", StringComparison.OrdinalIgnoreCase))
    {
        var withoutDots = name
            .Replace(" U.K. ", " UK ", StringComparison.OrdinalIgnoreCase)
            .Replace(" U.K.", " UK", StringComparison.OrdinalIgnoreCase);
        if (withoutDots != name)
        {
            ukVariants.Add(withoutDots);
        }
    }

    // Step 2: For each UK variant, generate suffix variations
    foreach (var variant in ukVariants)
    {
        variations.Add(variant);

        if (variant.EndsWith(" Ltd", StringComparison.OrdinalIgnoreCase))
        {
            variations.Add(SwapSuffix(variant, " Ltd", " Limited"));
        }
        else if (variant.EndsWith(" Limited", StringComparison.OrdinalIgnoreCase))
        {
            variations.Add(SwapSuffix(variant, " Limited", " Ltd"));
        }
        else if (variant.EndsWith(" PLC", StringComparison.OrdinalIgnoreCase))
        {
            // Case-insensitive, so this also covers "Plc"/"plc"; the set itself is
            // case-insensitive, so no separate upper-case "PLC" entry is needed.
            variations.Add(SwapSuffix(variant, " PLC", " Public Limited Company"));
        }
        else if (variant.EndsWith(" Public Limited Company", StringComparison.OrdinalIgnoreCase))
        {
            variations.Add(SwapSuffix(variant, " Public Limited Company", " PLC"));
        }
    }

    // Step 3: Try core name without suffix (only the first matching suffix is removed)
    var suffixesToRemove = new[] { " Ltd", " Limited", " PLC", " LLP", " Inc", " Corporation", " Corp" };
    var coreName = name;
    foreach (var suffix in suffixesToRemove)
    {
        if (coreName.EndsWith(suffix, StringComparison.OrdinalIgnoreCase))
        {
            coreName = coreName[..^suffix.Length].Trim();
            break;
        }
    }

    if (coreName != name && coreName.Length >= 3)
    {
        variations.Add(coreName);
        variations.Add(coreName + " Limited");
        variations.Add(coreName + " PLC");

        // Also add U.K. variant of core name if applicable
        if (HasStandaloneUk(coreName))
        {
            var coreWithDots = WithDottedUk(coreName);
            variations.Add(coreWithDots);
            variations.Add(coreWithDots + " Limited");
        }
    }

    return variations.ToList();
}
|
|
|
|
/// <summary>
/// Works out which non-employment entity categories the search itself mentions.
/// A category is returned when any of its patterns occurs in either search string;
/// those categories (e.g., "Clubs", "Trusts") should NOT be filtered out of the results.
/// </summary>
/// <param name="originalLower">The original company name being searched for.</param>
/// <param name="queryLower">The query variation currently in use.</param>
/// <returns>The names of every category with at least one pattern found in the search text.</returns>
/// <remarks>
/// Matching is a case-sensitive substring test, so both arguments need to be lower-cased
/// already for the lower-case patterns to match.
/// </remarks>
private static HashSet<string> GetSearchEntityTypes(string originalLower, string queryLower)
{
    // Both strings are scanned together so a pattern in either one counts.
    var combinedSearchText = $"{originalLower} {queryLower}";

    var mentionedCategories =
        from entry in NonEmploymentEntityPatterns
        where entry.Value.Any(keyword => combinedSearchText.Contains(keyword))
        select entry.Key;

    return new HashSet<string>(mentionedCategories);
}
|
|
|
|
/// <summary>
/// Decides whether a company title looks like a valid employment entity.
/// A title is rejected when it contains a pattern from any non-employment category
/// (clubs, trusts, etc.) that the search did not explicitly target.
/// </summary>
/// <param name="itemTitleLower">Candidate company title; matching is a case-sensitive substring test.</param>
/// <param name="allowedCategories">Categories exempt from filtering because the search asked for them.</param>
/// <returns>
/// <c>false</c> if the title matches a pattern in a category that is not allowed; otherwise <c>true</c>.
/// </returns>
private static bool IsValidEmploymentEntity(string itemTitleLower, HashSet<string> allowedCategories)
{
    // Only categories the search did not ask for can disqualify a title.
    var matchesFilteredCategory = NonEmploymentEntityPatterns
        .Where(entry => !allowedCategories.Contains(entry.Key))
        .Any(entry => entry.Value.Any(keyword => itemTitleLower.Contains(keyword)));

    return !matchesFilteredCategory;
}
|
|
|
|
// Skip words for core identifier extraction.
// These words are too common to be meaningful differentiators between companies.
// Lookups are case-insensitive. Each word is listed once, under the first group it fits.
private static readonly HashSet<string> SkipWords = new(StringComparer.OrdinalIgnoreCase)
{
    // Articles, conjunctions and prepositions
    "the", "a", "an", "and", "or", "of", "for", "in", "at", "on", "by", "to", "with",

    // Geographic - countries and regions
    "uk", "u.k.", "gb", "british", "britain", "england", "english", "scotland", "scottish",
    "wales", "welsh", "ireland", "irish", "northern",
    "europe", "european", "america", "american", "usa", "us", "u.s.", "u.s.a.",
    "canada", "canadian", "asia", "asian", "pacific", "atlantic",
    "australia", "australian", "africa", "african", "india", "indian",
    "france", "french", "germany", "german", "spain", "spanish", "italy", "italian",
    "japan", "japanese", "china", "chinese", "korea", "korean",
    "middle", "east", "west", "north", "south", "central", "western", "eastern",

    // Geographic - cities
    "london", "manchester", "birmingham", "leeds", "glasgow", "edinburgh", "bristol",
    "liverpool", "sheffield", "newcastle", "cardiff", "belfast", "nottingham",
    "southampton", "portsmouth", "brighton", "leicester", "coventry", "hull",

    // Legal suffixes
    "limited", "ltd", "plc", "llp", "llc", "inc", "incorporated", "corporation", "corp",
    "company", "co", "partners", "partnership", "enterprises", "unlimited",
    "registered", "cic", "cio", "se", "ag", "gmbh", "sarl", "bv", "nv",

    // Business descriptors
    "group", "holdings", "holding", "parent", "subsidiary", "division", "branch",
    "services", "service", "solutions", "solution", "consulting", "consultants", "consultancy",
    "management", "systems", "system", "technologies", "technology", "tech",
    "industries", "industry", "industrial", "commercial", "trading", "trade",
    "business", "businesses", "operations", "operational", "professional", "professionals",
    "resources", "resource", "network", "networks", "associates", "associated",

    // Size/scope descriptors
    "national", "international", "global", "worldwide", "world", "regional", "local",
    "universal", "general", "standard", "premier", "prime", "first", "one",

    // Quality/marketing terms
    "new", "modern", "advanced", "innovative", "elite", "premium",
    "quality", "superior", "excellent", "best", "top", "leading", "major",

    // Ownership indicators
    "royal", "imperial", "crown", "state", "public", "private", "independent",
    "mutual", "cooperative", "coop", "community",

    // Time-related
    "century", "millennium", "annual", "year", "years",

    // Numbers as words ("one" and "first" are already listed under size/scope)
    "two", "three", "four", "five", "second", "third"
};

/// <summary>
/// Extracts ALL core identifying words from a company name.
/// These are significant words that aren't common prefixes/suffixes (see <c>SkipWords</c>).
/// E.g., "BMW Group Canada" -> ["BMW"], "Lloyds Bowmaker" -> ["LLOYDS", "BOWMAKER"],
/// "Bank of Scotland" -> ["BANK"], "Acme U.K. Ltd" -> ["ACME"]
/// </summary>
/// <param name="companyName">Company name to analyse; may be null, empty or whitespace.</param>
/// <returns>
/// The significant words in their original order, upper-cased with the invariant culture.
/// Empty when the name is blank or contains only skip words.
/// </returns>
private static List<string> ExtractCoreIdentifiers(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return new List<string>();
    }

    // Remove parenthetical content first
    var cleanName = System.Text.RegularExpressions.Regex.Replace(companyName, @"\([^)]*\)", "").Trim();

    // Split into words and collect all significant words
    var words = cleanName.Split(new[] { ' ', '-', '/', '&' }, StringSplitOptions.RemoveEmptyEntries);
    var coreWords = new List<string>();

    foreach (var word in words)
    {
        var cleanWord = word.Trim('.', ',', '\'');
        if (cleanWord.Length < 2)
        {
            continue;
        }

        // Trimming strips the trailing dot from abbreviations ("U.K." -> "U.K"), so the dotted
        // skip words ("u.k.", "u.s.", "u.s.a.") are also checked with the dot restored.
        if (SkipWords.Contains(cleanWord) || SkipWords.Contains(cleanWord + "."))
        {
            continue;
        }

        coreWords.Add(cleanWord.ToUpperInvariant());
    }

    return coreWords;
}
|
|
|
|
#endregion
|
|
}
|