Prefer main trading companies over subsidiaries in company matching

When matching brand names like "ASDA", prefer the main employer company
(ASDA STORES LIMITED) over subsidiaries (ASDA DELIVERY LIMITED).

- Add SubsidiaryIndicators set (delivery, distribution, holdings, property, etc.)
- Add MainCompanyIndicators set (stores, retail, manufacturing, etc.)
- Add CalculateCompanyPriorityScore() method for ranking matches
- Sort matches by priority score first, then by fuzzy score
- Subsidiaries get -10 priority unless explicitly searched for
- Main trading companies get +5 priority, PLCs get +3

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-20 22:41:27 +01:00
parent 55c0aebdaa
commit 1a53431757

View File

@@ -45,6 +45,32 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
"68100", // Buying and selling of own real estate (often shell) "68100", // Buying and selling of own real estate (often shell)
}; };
// Name fragments that mark a company as a subsidiary rather than the main trading entity.
// A candidate who says they worked for "ASDA" almost certainly means ASDA STORES LIMITED,
// not ASDA DELIVERY LIMITED or ASDA PROPERTY HOLDINGS LIMITED.
// Entries are grouped by the kind of subsidiary they usually denote; the set is created
// with a case-insensitive ordinal comparer.
private static readonly HashSet<string> SubsidiaryIndicators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    // Logistics / operations
    "delivery",
    "distribution",
    "logistics",
    "transport",
    "fleet",
    "haulage",
    "warehousing",
    "fulfilment",

    // Property
    "property",
    "properties",
    "estates",
    "land",
    "real estate",
    "developments",

    // Financial / holding
    "holdings",
    "holding",
    "investments",
    "capital",
    "finance",
    "financial",
    "treasury",

    // Administrative
    "nominees",
    "nominee",
    "trustees",
    "trustee",
    "secretarial",
    "registrars",

    // Insurance
    "insurance",
    "assurance",
    "underwriting",

    // Single-function entities
    "leasing",
    "rentals",
    "procurement",
    "sourcing",
};
// Name fragments that suggest the main trading / employing company; titles containing
// one of these are preferred when ranking matches. Lookups on the set itself are
// case-insensitive (ordinal).
private static readonly HashSet<string> MainCompanyIndicators = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
{
    "stores",
    "retail",
    "supermarkets",
    "superstores",
    "hypermarkets",
    "manufacturing",
    "operations",
    "trading",
};
// SIC codes for tech/software companies // SIC codes for tech/software companies
private static readonly HashSet<string> TechSicCodes = new() private static readonly HashSet<string> TechSicCodes = new()
{ {
@@ -657,9 +683,15 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
.Select(item => .Select(item =>
{ {
var itemTitle = item.Title.ToUpperInvariant(); var itemTitle = item.Title.ToUpperInvariant();
var itemTitleLower = item.Title.ToLowerInvariant();
var scoreVsOriginal = Fuzz.TokenSetRatio(normalizedOriginal, itemTitle); var scoreVsOriginal = Fuzz.TokenSetRatio(normalizedOriginal, itemTitle);
var scoreVsQuery = Fuzz.TokenSetRatio(normalizedQuery, itemTitle); var scoreVsQuery = Fuzz.TokenSetRatio(normalizedQuery, itemTitle);
return (Item: item, Score: Math.Max(scoreVsOriginal, scoreVsQuery)); var baseScore = Math.Max(scoreVsOriginal, scoreVsQuery);
// Calculate priority adjustment for main company vs subsidiary
var priorityScore = CalculateCompanyPriorityScore(itemTitleLower, originalLower, queryLower);
return (Item: item, Score: baseScore, PriorityScore: priorityScore);
}) })
.Where(m => m.Score >= FuzzyMatchThreshold) .Where(m => m.Score >= FuzzyMatchThreshold)
.ToList(); .ToList();
@@ -667,8 +699,8 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
_logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery); _logger.LogDebug("Found {Count} matches above threshold for '{CompanyName}' (query: '{Query}')", matches.Count, companyName, searchQuery);
foreach (var m in matches.Take(5)) foreach (var m in matches.Take(5))
{ {
_logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, DateOfCreation: {Date}", _logger.LogDebug(" Match: {Title} ({Number}), Score: {Score}, Priority: {Priority}, DateOfCreation: {Date}",
m.Item.Title, m.Item.CompanyNumber, m.Score, m.Item.DateOfCreation ?? "null"); m.Item.Title, m.Item.CompanyNumber, m.Score, m.PriorityScore, m.Item.DateOfCreation ?? "null");
} }
if (matches.Count == 0) return null; if (matches.Count == 0) return null;
@@ -687,7 +719,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
m.Item.Title, incDate?.ToString() ?? "null", existed); m.Item.Title, incDate?.ToString() ?? "null", existed);
return existed; return existed;
}) })
.OrderByDescending(m => m.Score) // Sort by priority first, then by fuzzy score
.OrderByDescending(m => m.PriorityScore)
.ThenByDescending(m => m.Score)
.ToList(); .ToList();
_logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count); _logger.LogDebug("Companies that existed at start date: {Count}", existedAtStartDate.Count);
@@ -695,8 +729,9 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
// If any matches existed at the start date, prefer those // If any matches existed at the start date, prefer those
if (existedAtStartDate.Count > 0) if (existedAtStartDate.Count > 0)
{ {
_logger.LogDebug("Selected: {Title} ({Number})", existedAtStartDate[0].Item.Title, existedAtStartDate[0].Item.CompanyNumber); var selected = existedAtStartDate[0];
return existedAtStartDate[0]; _logger.LogDebug("Selected: {Title} ({Number}), Priority: {Priority}", selected.Item.Title, selected.Item.CompanyNumber, selected.PriorityScore);
return (selected.Item, selected.Score);
} }
// No companies existed at the claimed start date - don't match a wrong company // No companies existed at the claimed start date - don't match a wrong company
@@ -704,10 +739,59 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
return null; return null;
} }
// No start date provided - just use highest score // No start date provided - sort by priority then score
var fallback = matches.OrderByDescending(m => m.Score).First(); var fallback = matches
_logger.LogDebug("No start date filter, using highest score: {Title} ({Number})", fallback.Item.Title, fallback.Item.CompanyNumber); .OrderByDescending(m => m.PriorityScore)
return fallback; .ThenByDescending(m => m.Score)
.First();
_logger.LogDebug("No start date filter, using highest priority: {Title} ({Number}), Priority: {Priority}", fallback.Item.Title, fallback.Item.CompanyNumber, fallback.PriorityScore);
return (fallback.Item, fallback.Score);
}
/// <summary>
/// Calculates a ranking adjustment used to prefer the main employer company over its
/// subsidiaries when several registered companies match the same search.
/// </summary>
/// <remarks>
/// <para>The score is the sum of three independent adjustments:</para>
/// <list type="bullet">
/// <item><description>-10 (applied at most once) when the title contains a subsidiary indicator
/// that does not also appear in the search text, i.e. the user did not ask for that kind of entity.</description></item>
/// <item><description>+5 (applied at most once) when the title contains a main-company indicator.</description></item>
/// <item><description>+3 when the title ends with " plc".</description></item>
/// </list>
/// <para>Indicators are matched as whole words/phrases, not raw substrings, so that "land"
/// does not fire on names such as "iceland" or "poundland". The subsidiary decision does not
/// depend on the enumeration order of the indicator set.</para>
/// </remarks>
/// <param name="itemTitleLower">Candidate company title; expected lower-cased (matching is case-insensitive regardless).</param>
/// <param name="originalLower">The company name as originally supplied, lower-cased.</param>
/// <param name="queryLower">The query actually sent to the search, lower-cased.</param>
/// <returns>A signed priority; higher means more likely to be the main employer company.</returns>
private static int CalculateCompanyPriorityScore(string itemTitleLower, string originalLower, string queryLower)
{
    const int SubsidiaryPenalty = 10;
    const int MainCompanyBoost = 5;
    const int PlcBoost = 3;

    // An indicator counts as "explicitly searched for" if it occurs in either form of the search.
    var searchText = originalLower + " " + queryLower;
    var score = 0;

    // Penalise once if ANY subsidiary indicator in the title was not asked for. Stopping only
    // on a penalising hit (rather than on the first indicator found) keeps the outcome
    // independent of HashSet enumeration order when a title carries several indicators.
    foreach (var indicator in SubsidiaryIndicators)
    {
        if (ContainsWholePhrase(itemTitleLower, indicator) && !ContainsWholePhrase(searchText, indicator))
        {
            score -= SubsidiaryPenalty;
            break;
        }
    }

    // Boost once for main trading company wording.
    foreach (var indicator in MainCompanyIndicators)
    {
        if (ContainsWholePhrase(itemTitleLower, indicator))
        {
            score += MainCompanyBoost;
            break;
        }
    }

    // Slight boost for PLCs. Ordinal comparison: this is an identifier-style suffix check,
    // not linguistic text, so it must not vary with the current culture.
    if (itemTitleLower.EndsWith(" plc", StringComparison.Ordinal))
    {
        score += PlcBoost;
    }

    return score;

    // True when `phrase` occurs in `text` delimited on both sides by the start/end of the
    // string or a character that is not a letter or digit. Works for multi-word phrases
    // such as "real estate".
    static bool ContainsWholePhrase(string text, string phrase)
    {
        if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(phrase))
        {
            return false;
        }

        var searchFrom = 0;
        while (searchFrom <= text.Length - phrase.Length)
        {
            var start = text.IndexOf(phrase, searchFrom, StringComparison.OrdinalIgnoreCase);
            if (start < 0)
            {
                return false;
            }

            var end = start + phrase.Length;
            var boundaryBefore = start == 0 || !char.IsLetterOrDigit(text[start - 1]);
            var boundaryAfter = end == text.Length || !char.IsLetterOrDigit(text[end]);
            if (boundaryBefore && boundaryAfter)
            {
                return true;
            }

            // Embedded occurrence (e.g. "land" in "iceland"); keep looking further right.
            searchFrom = start + 1;
        }

        return false;
    }
}
private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details) private async Task CacheCompanyAsync(CompaniesHouseSearchItem item, CompaniesHouseCompany? details)