Prefer main trading companies over subsidiaries in company matching
When matching brand names like "ASDA", prefer the main employer company (ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).

- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -45,6 +45,32 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
"68100", // Buying and selling of own real estate (often shell)
|
||||
};
|
||||
|
||||
// Name fragments that mark a company as a subsidiary rather than the main trading entity.
// A person who says they worked for "ASDA" most likely means ASDA STORES LIMITED,
// not ASDA DELIVERY LIMITED or ASDA PROPERTY HOLDINGS LIMITED.
// The set is compared case-insensitively (ordinal).
private static readonly HashSet<string> SubsidiaryIndicators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    // Logistics / operations
    "delivery",
    "distribution",
    "logistics",
    "transport",
    "fleet",
    "haulage",
    "warehousing",
    "fulfilment",

    // Property
    "property",
    "properties",
    "estates",
    "land",
    "real estate",
    "developments",

    // Financial / holding vehicles
    "holdings",
    "holding",
    "investments",
    "capital",
    "finance",
    "financial",
    "treasury",

    // Administrative
    "nominees",
    "nominee",
    "trustees",
    "trustee",
    "secretarial",
    "registrars",

    // Insurance
    "insurance",
    "assurance",
    "underwriting",

    // Other single-function entities
    "leasing",
    "rentals",
    "procurement",
    "sourcing",
};
|
||||
|
||||
// Name fragments that mark a company as the main trading/employer entity.
// Candidates carrying one of these are preferred when ranking matches.
// The set is compared case-insensitively (ordinal).
private static readonly HashSet<string> MainCompanyIndicators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "stores",
    "retail",
    "supermarkets",
    "superstores",
    "hypermarkets",
    "manufacturing",
    "operations",
    "trading",
};
|
||||
|
||||
// SIC codes for tech/software companies
|
||||
private static readonly HashSet<string> TechSicCodes = new()
|
||||
{
|
||||
@@ -657,9 +683,15 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
.Select(item =>
|
||||
{
|
||||
var itemTitle = item.Title.ToUpperInvariant();
|
||||
var itemTitleLower = item.Title.ToLowerInvariant();
|
||||
var scoreVsOriginal = Fuzz.TokenSetRatio(normalizedOriginal, itemTitle);
|
||||
var scoreVsQuery = Fuzz.TokenSetRatio(normalizedQuery, itemTitle);
|
||||
return (Item: item, Score: Math.Max(scoreVsOriginal, scoreVsQuery));
|
||||
var baseScore = Math.Max(scoreVsOriginal, scoreVsQuery);
|
||||
|
||||
// Calculate priority adjustment for main company vs subsidiary
|
||||
var priorityScore = CalculateCompanyPriorityScore(itemTitleLower, originalLower, queryLower);
|
||||
|
||||
return (Item: item, Score: baseScore, PriorityScore: priorityScore);
|
||||
})
|
||||
.Where(m => m.Score >= FuzzyMatchThreshold)
|
||||
.ToList();
|
||||
@@ -667,8 +699,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
_logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery);
|
||||
foreach (var m in matches.Take(5))
|
||||
{
|
||||
_logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, DateOfCreation: {Date}",
|
||||
m.Item.Title, m.Item.CompanyNumber, m.Score, m.Item.DateOfCreation ?? "null");
|
||||
_logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, Priority: {Priority}, DateOfCreation: {Date}",
|
||||
m.Item.Title, m.Item.CompanyNumber, m.Score, m.PriorityScore, m.Item.DateOfCreation ?? "null");
|
||||
}
|
||||
|
||||
if (matches.Count == 0) return null;
|
||||
@@ -687,7 +719,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
m.Item.Title, incDate?.ToString() ?? "null", existed);
|
||||
return existed;
|
||||
})
|
||||
.OrderByDescending(m => m.Score)
|
||||
// Sort by priority first, then by fuzzy score
|
||||
.OrderByDescending(m => m.PriorityScore)
|
||||
.ThenByDescending(m => m.Score)
|
||||
.ToList();
|
||||
|
||||
_logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count);
|
||||
@@ -695,8 +729,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
// If any matches existed at the start date, prefer those
|
||||
if (existedAtStartDate.Count > 0)
|
||||
{
|
||||
_logger.LogDebug("Selected: {Title} ({Number})", existedAtStartDate[0].Item.Title, existedAtStartDate[0].Item.CompanyNumber);
|
||||
return existedAtStartDate[0];
|
||||
var selected = existedAtStartDate[0];
|
||||
_logger.LogDebug("Selected: {Title} ({Number}), Priority: {Priority}", selected.Item.Title, selected.Item.CompanyNumber, selected.PriorityScore);
|
||||
return (selected.Item, selected.Score);
|
||||
}
|
||||
|
||||
// No companies existed at the claimed start date - don't match a wrong company
|
||||
@@ -704,10 +739,59 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
return null;
|
||||
}
|
||||
|
||||
// No start date provided - just use highest score
|
||||
var fallback = matches.OrderByDescending(m => m.Score).First();
|
||||
_logger.LogDebug("No start date filter, using highest score: {Title} ({Number})", fallback.Item.Title, fallback.Item.CompanyNumber);
|
||||
return fallback;
|
||||
// No start date provided - sort by priority then score
|
||||
var fallback = matches
|
||||
.OrderByDescending(m => m.PriorityScore)
|
||||
.ThenByDescending(m => m.Score)
|
||||
.First();
|
||||
_logger.LogDebug("No start date filter, using highest priority: {Title} ({Number}), Priority: {Priority}", fallback.Item.Title, fallback.Item.CompanyNumber, fallback.PriorityScore);
|
||||
return (fallback.Item, fallback.Score);
|
||||
}
|
||||
|
||||
/// <summary>
/// Calculates a priority adjustment for a candidate company, used to rank matches
/// ahead of the fuzzy score. Higher values mean the candidate is more likely to be
/// the main employer company.
/// </summary>
/// <remarks>
/// The adjustment is the sum of three independent rules:
/// <list type="bullet">
/// <item><description>-10 (applied at most once) when the title contains a subsidiary indicator
/// that the search text did not itself mention.</description></item>
/// <item><description>+5 (applied at most once) when the title contains a main-company indicator.</description></item>
/// <item><description>+3 when the title ends with " plc".</description></item>
/// </list>
/// Indicators are matched as whole words or phrases, so "land" does not fire on
/// "scotland" and "retail" does not fire on "retailers".
/// </remarks>
/// <param name="itemTitleLower">Candidate company title; the caller passes it lower-cased.</param>
/// <param name="originalLower">The company name as originally supplied; the caller passes it lower-cased.</param>
/// <param name="queryLower">The search query actually issued; the caller passes it lower-cased.</param>
/// <returns>The priority adjustment; negative for likely subsidiaries, positive for likely main companies.</returns>
private static int CalculateCompanyPriorityScore(string itemTitleLower, string originalLower, string queryLower)
{
    var score = 0;

    // Everything the user asked for, so an explicitly requested subsidiary is not penalised.
    var searchText = originalLower + " " + queryLower;

    // Penalise once if the title carries any subsidiary indicator the search did not mention.
    // Testing every indicator (rather than stopping at the first one found in the title)
    // keeps the outcome independent of the set's enumeration order.
    foreach (var indicator in SubsidiaryIndicators)
    {
        if (ContainsWholeTerm(itemTitleLower, indicator) && !ContainsWholeTerm(searchText, indicator))
        {
            score -= 10; // Significant penalty for subsidiaries
            break;       // Only apply one subsidiary penalty
        }
    }

    // Boost once if the title carries any main-company indicator.
    foreach (var indicator in MainCompanyIndicators)
    {
        if (ContainsWholeTerm(itemTitleLower, indicator))
        {
            score += 5; // Boost for main trading companies
            break;      // Only apply one boost
        }
    }

    // Slight boost for PLC (usually the parent/main company).
    // Ordinal comparison: company suffixes are identifiers, not linguistic text.
    if (itemTitleLower.EndsWith(" plc", StringComparison.OrdinalIgnoreCase))
    {
        score += 3;
    }

    return score;
}

/// <summary>
/// Returns true when <paramref name="term"/> occurs in <paramref name="text"/> as a whole
/// word or phrase, i.e. it is not immediately preceded or followed by a letter or digit.
/// Comparison is ordinal and case-insensitive.
/// </summary>
private static bool ContainsWholeTerm(string text, string term)
{
    if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(term))
    {
        return false;
    }

    var searchFrom = 0;
    while (searchFrom <= text.Length - term.Length)
    {
        var index = text.IndexOf(term, searchFrom, StringComparison.OrdinalIgnoreCase);
        if (index < 0)
        {
            return false;
        }

        var end = index + term.Length;
        var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
        var endsOnBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);
        if (startsOnBoundary && endsOnBoundary)
        {
            return true;
        }

        // Embedded occurrence (e.g. "land" inside "scotland"); keep looking further along.
        searchFrom = index + 1;
    }

    return false;
}
|
||||
|
||||
private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)
|
||||
|
||||
Reference in New Issue
Block a user