feat: Replace AI compound name detection with rule-based approach
Eliminates one Anthropic API call entirely by using pattern matching:
- Add 120+ known single-company names (Ernst & Young, M&S, law firms, etc.)
- Detect the "/" separator as a clear indicator of multiple companies
- Use company suffixes (Ltd, PLC) to identify when "&" means two companies
- Conservative approach: don't split ambiguous cases

Added 40 unit tests for compound name detection covering:
- Known single companies with "&" and "and"
- Slash-separated company names
- Ambiguous cases
- Edge cases (empty, null, short names)

Estimated savings: ~$0.01 per CV check, 100% elimination of this API call

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -229,106 +229,359 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
|
||||
}
|
||||
}
|
||||
|
||||
private const string CompoundNamePrompt = """
|
||||
Analyze this company name from a CV and determine if it refers to ONE company or MULTIPLE companies.
|
||||
/// <summary>
|
||||
/// Well-known company names that contain "&" or "and" but are SINGLE companies.
|
||||
/// These should NOT be split into multiple parts.
|
||||
/// </summary>
|
||||
private static readonly HashSet<string> KnownSingleCompanyNames = new(StringComparer.OrdinalIgnoreCase)
|
||||
{
|
||||
// Big 4 / Professional Services
|
||||
"Ernst & Young", "Ernst and Young", "EY",
|
||||
"Deloitte and Touche", "Deloitte & Touche",
|
||||
"PricewaterhouseCoopers", "Price Waterhouse",
|
||||
"KPMG",
|
||||
"Accenture",
|
||||
|
||||
Company name: "{COMPANY_NAME}"
|
||||
// Retail
|
||||
"Marks & Spencer", "Marks and Spencer", "M&S",
|
||||
"Fortnum & Mason", "Fortnum and Mason",
|
||||
"Crabtree & Evelyn",
|
||||
"Holland & Barrett", "Holland and Barrett",
|
||||
"Past Times & Present",
|
||||
"Barnes & Noble",
|
||||
"Abercrombie & Fitch",
|
||||
"Dolce & Gabbana",
|
||||
"Bang & Olufsen",
|
||||
"Crate & Barrel",
|
||||
"Bed Bath & Beyond",
|
||||
"Bath & Body Works",
|
||||
|
||||
Examples:
|
||||
- "Ernst & Young" → ONE company (it's the full name of the accounting firm)
|
||||
- "Marks & Spencer" → ONE company (it's the full name of the retailer)
|
||||
- "ASDA/WALMART" → TWO companies: ["ASDA", "WALMART"] (person worked at both or it's showing ownership)
|
||||
- "Corus & Laura Ashley Hotels" → TWO companies: ["Corus", "Laura Ashley Hotels"] (different industries)
|
||||
- "PwC" → ONE company
|
||||
- "Deloitte and Touche" → ONE company (historical name of Deloitte)
|
||||
- "BMW Group Ireland" → ONE company
|
||||
- "Tesco Stores and Distribution" → ONE company (departments of same company)
|
||||
// Consumer Goods
|
||||
"Procter & Gamble", "Procter and Gamble", "P&G",
|
||||
"Johnson & Johnson", "Johnson and Johnson", "J&J",
|
||||
"Reckitt & Colman", "Reckitt and Colman",
|
||||
"Colgate-Palmolive",
|
||||
"Unilever",
|
||||
"Henkel",
|
||||
|
||||
Rules:
|
||||
1. Well-known company names with "&" or "and" are SINGLE companies (Ernst & Young, Marks & Spencer, Procter & Gamble)
|
||||
2. A "/" usually indicates multiple companies or ownership relationship
|
||||
3. If the parts are in completely different industries, they're likely separate companies
|
||||
4. If one part is clearly a subsidiary/department of the other, treat as ONE company
|
||||
// Food & Beverage
|
||||
"Prêt A Manger", "Pret A Manger",
|
||||
"Fortnum and Mason",
|
||||
"Lyle & Scott",
|
||||
"Ben & Jerry's", "Ben and Jerry's",
|
||||
"Baskin & Robbins",
|
||||
"Haribo",
|
||||
|
||||
Respond with ONLY valid JSON:
|
||||
{
|
||||
"isSingleCompany": boolean,
|
||||
"companies": ["company1", "company2"] or ["single company name"],
|
||||
"reasoning": "brief explanation"
|
||||
}
|
||||
""";
|
||||
// Finance & Insurance
|
||||
"Standard & Poor's", "Standard and Poor's", "S&P",
|
||||
"Moody's",
|
||||
"Fitch Ratings",
|
||||
"Lloyd's of London",
|
||||
"Coutts & Co", "Coutts and Co",
|
||||
"Brown Shipley & Co",
|
||||
"Schroders",
|
||||
|
||||
public async Task<List<string>?> ExtractCompanyNamesAsync(
|
||||
// Law Firms (common patterns)
|
||||
"Allen & Overy", "Allen and Overy",
|
||||
"Clifford Chance",
|
||||
"Freshfields Bruckhaus Deringer",
|
||||
"Linklaters",
|
||||
"Slaughter and May", "Slaughter & May",
|
||||
"Herbert Smith Freehills",
|
||||
"Hogan Lovells",
|
||||
"Norton Rose Fulbright",
|
||||
"DLA Piper",
|
||||
"Baker & McKenzie", "Baker McKenzie",
|
||||
"Eversheds Sutherland",
|
||||
"Ashurst",
|
||||
"CMS",
|
||||
"Simmons & Simmons",
|
||||
"Travers Smith",
|
||||
"Macfarlanes",
|
||||
"Addleshaw Goddard",
|
||||
"Pinsent Masons",
|
||||
"Shoosmiths",
|
||||
"Irwin Mitchell",
|
||||
"DAC Beachcroft",
|
||||
"Weightmans",
|
||||
"Browne Jacobson",
|
||||
"Mills & Reeve", "Mills and Reeve",
|
||||
"Taylor Wessing",
|
||||
"Osborne Clarke",
|
||||
"Bird & Bird", "Bird and Bird",
|
||||
"Withers",
|
||||
"Charles Russell Speechlys",
|
||||
"Stephenson Harwood",
|
||||
"Watson Farley & Williams",
|
||||
"Clyde & Co", "Clyde and Co",
|
||||
"Reed Smith",
|
||||
"Kennedys",
|
||||
"Fieldfisher",
|
||||
"RPC",
|
||||
"Womble Bond Dickinson",
|
||||
"Burges Salmon",
|
||||
"Trowers & Hamlins", "Trowers and Hamlins",
|
||||
"Bevan Brittan",
|
||||
"Veale Wasbrough Vizards",
|
||||
|
||||
// Media & Entertainment
|
||||
"Simon & Schuster",
|
||||
"Warner Bros", "Warner Brothers",
|
||||
"William Morris Endeavor",
|
||||
"Creative Artists Agency",
|
||||
|
||||
// Automotive
|
||||
"Rolls-Royce",
|
||||
"Aston Martin",
|
||||
"Jaguar Land Rover",
|
||||
|
||||
// Pharmaceuticals
|
||||
"GlaxoSmithKline", "GSK",
|
||||
"AstraZeneca",
|
||||
"Smith & Nephew",
|
||||
"Roche",
|
||||
|
||||
// Engineering & Construction
|
||||
"Mott MacDonald",
|
||||
"Arup",
|
||||
"Laing O'Rourke",
|
||||
"Kier",
|
||||
"Balfour Beatty",
|
||||
"Taylor Wimpey",
|
||||
"Persimmon",
|
||||
"Bellway",
|
||||
"Berkeley",
|
||||
|
||||
// Technology
|
||||
"Hewlett-Packard", "HP",
|
||||
"Texas Instruments",
|
||||
"AT&T",
|
||||
"T-Mobile",
|
||||
|
||||
// Other
|
||||
"Young & Co", "Young and Co",
|
||||
"Smith & Williamson",
|
||||
"Grant Thornton",
|
||||
"BDO",
|
||||
"RSM",
|
||||
"Mazars",
|
||||
"Moore Kingston Smith",
|
||||
"Crowe",
|
||||
"PKF",
|
||||
"Saffery Champness",
|
||||
"Buzzacott",
|
||||
"HW Fisher",
|
||||
"Haysmacintyre",
|
||||
"Menzies",
|
||||
"MHA",
|
||||
"Azets",
|
||||
"Dains",
|
||||
"Streets",
|
||||
"Armstrong Watson",
|
||||
|
||||
// Common department/division patterns (not to be split)
|
||||
"Sales and Marketing",
|
||||
"Research and Development", "R&D",
|
||||
"Human Resources",
|
||||
"Finance and Operations",
|
||||
"Legal and Compliance",
|
||||
"IT and Digital",
|
||||
"Supply Chain and Logistics",
|
||||
};
|
||||
|
||||
/// <summary>
/// Lower-case fragments which, when found inside a name, mark the "and" as joining
/// divisions, departments or regions of ONE company rather than two employers.
/// </summary>
/// <remarks>
/// Each fragment keeps its surrounding spaces because it is matched by plain substring
/// search against the lower-cased name.
/// Legal-entity suffixes ("plc", "ltd", "limited") are intentionally not listed: a name
/// such as "Acme Ltd and Beta Ltd" names two legal entities, and listing those fragments
/// made the "and" form behave differently from the equivalent "Acme Ltd &amp; Beta Ltd".
/// </remarks>
private static readonly string[] SingleCompanyPatterns =
[
    " stores and ",      // "Tesco Stores and Distribution"
    " retail and ",      // "Next Retail and Online"
    " uk and ",          // "BMW UK and Ireland"
    " europe and ",      // "Google Europe and Middle East"
    " division and ",
    " department and ",
    " services and ",
    " group and ",
];
|
||||
|
||||
/// <summary>
|
||||
/// Determines if a company name refers to multiple companies and extracts them.
|
||||
/// Uses rule-based detection instead of AI for better performance and cost savings.
|
||||
/// </summary>
|
||||
public Task<List<string>?> ExtractCompanyNamesAsync(
|
||||
string companyName,
|
||||
CancellationToken cancellationToken = default)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(companyName))
|
||||
{
|
||||
return null;
|
||||
return Task.FromResult<List<string>?>(null);
|
||||
}
|
||||
|
||||
_logger.LogDebug("Using AI to check if '{CompanyName}' is a compound name", companyName);
|
||||
_logger.LogDebug("Checking if '{CompanyName}' is a compound name (rule-based)", companyName);
|
||||
|
||||
try
|
||||
var result = DetectCompoundName(companyName);
|
||||
|
||||
if (result is null)
|
||||
{
|
||||
var prompt = CompoundNamePrompt.Replace("{COMPANY_NAME}", companyName);
|
||||
|
||||
var messages = new List<Message>
|
||||
{
|
||||
new(RoleType.User, prompt)
|
||||
};
|
||||
|
||||
var parameters = new MessageParameters
|
||||
{
|
||||
Model = "claude-3-5-haiku-20241022",
|
||||
MaxTokens = 256,
|
||||
Messages = messages,
|
||||
System = [new SystemMessage("You are a company name parser. Respond only with valid JSON.")]
|
||||
};
|
||||
|
||||
var response = await _anthropicClient.Messages.GetClaudeMessageAsync(parameters, cancellationToken);
|
||||
|
||||
var responseText = response.Content
|
||||
.OfType<TextContent>()
|
||||
.FirstOrDefault()?.Text;
|
||||
|
||||
if (string.IsNullOrWhiteSpace(responseText))
|
||||
{
|
||||
_logger.LogWarning("AI returned empty response for compound name check");
|
||||
return null;
|
||||
}
|
||||
|
||||
responseText = JsonResponseHelper.CleanJsonResponse(responseText);
|
||||
|
||||
var result = JsonSerializer.Deserialize<CompoundNameResponse>(responseText, JsonDefaults.CamelCase);
|
||||
|
||||
if (result is null)
|
||||
{
|
||||
_logger.LogWarning("Failed to deserialize compound name response: {Response}", responseText);
|
||||
return null;
|
||||
}
|
||||
|
||||
_logger.LogDebug("AI compound name result: IsSingle={IsSingle}, Companies=[{Companies}], Reasoning={Reasoning}",
|
||||
result.IsSingleCompany, string.Join(", ", result.Companies ?? []), result.Reasoning);
|
||||
|
||||
if (result.IsSingleCompany || result.Companies is null || result.Companies.Count < 2)
|
||||
{
|
||||
return null; // Single company, no splitting needed
|
||||
}
|
||||
|
||||
return result.Companies;
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
_logger.LogError(ex, "AI compound name detection failed for '{CompanyName}'", companyName);
|
||||
return null;
|
||||
_logger.LogDebug("'{CompanyName}' is a single company", companyName);
|
||||
return Task.FromResult<List<string>?>(null);
|
||||
}
|
||||
|
||||
_logger.LogDebug("'{CompanyName}' detected as compound, parts: [{Parts}]",
|
||||
companyName, string.Join(", ", result));
|
||||
|
||||
return Task.FromResult<List<string>?>(result);
|
||||
}
|
||||
|
||||
private sealed class CompoundNameResponse
|
||||
/// <summary>
/// Rule-based detection of compound company names.
/// </summary>
/// <param name="name">Company name to inspect; surrounding whitespace is ignored.</param>
/// <returns>
/// <c>null</c> when the name is treated as a single company; otherwise the individual
/// company names in the order they appear in <paramref name="name"/>.
/// </returns>
private List<string>? DetectCompoundName(string name)
{
    var trimmedName = name.Trim();

    // Rule 1: division/department wording ("... Stores and Distribution") is one company.
    if (MatchesSingleCompanyPattern(trimmedName))
    {
        return null;
    }

    // Rule 2: "/" is a strong indicator of multiple companies. It is evaluated before the
    // known-name guard because that guard also fires when a known name is merely embedded
    // in a longer string, which would stop "Marks & Spencer/Tesco" from ever being split.
    if (trimmedName.Contains('/'))
    {
        var slashParts = trimmedName
            .Split('/')
            .Select(p => p.Trim())
            .Where(p => p.Length >= 2)
            .ToList();

        if (slashParts.Count >= 2)
        {
            return slashParts;
        }
    }

    // Rule 3: well-known single companies ("Ernst & Young LLP") are never split on "&"/"and".
    if (IsKnownSingleCompany(trimmedName))
    {
        return null;
    }

    // Rule 4: " & " or " and " between what look like two separate company names.
    // The lazy first group splits at the FIRST conjunction in the name.
    var andMatch = System.Text.RegularExpressions.Regex.Match(
        trimmedName,
        @"^(.+?)\s+(?:&|and)\s+(.+)$",
        System.Text.RegularExpressions.RegexOptions.IgnoreCase);

    if (!andMatch.Success)
    {
        return null;
    }

    var part1 = andMatch.Groups[1].Value.Trim();
    var part2 = andMatch.Groups[2].Value.Trim();

    // Very short halves are most likely initials ("P & O"), not separate employers.
    if (part1.Length < 3 || part2.Length < 3)
    {
        return null;
    }

    // "X and Marketing" style names describe a department of X, not a second company.
    if (IsDepartmentOrRole(part2))
    {
        return null;
    }

    // Split only when both halves independently look like company names;
    // anything else is ambiguous and stays a single company.
    if (LooksLikeCompanyName(part1) && LooksLikeCompanyName(part2))
    {
        return [part1, part2];
    }

    return null;
}
|
||||
|
||||
/// <summary>
/// Reports whether <paramref name="name"/> is, or contains as whole words, one of the
/// entries in <c>KnownSingleCompanyNames</c>.
/// </summary>
/// <param name="name">Company name to test (already trimmed by the caller).</param>
/// <returns>
/// <c>true</c> for an exact case-insensitive match, or when a known name occurs inside
/// <paramref name="name"/> delimited by non-alphanumeric characters or the string edges
/// (e.g. "Ernst &amp; Young LLP"); otherwise <c>false</c>.
/// </returns>
private static bool IsKnownSingleCompany(string name)
{
    // Fast path: the set itself compares case-insensitively.
    if (KnownSingleCompanyNames.Contains(name))
    {
        return true;
    }

    // Embedded match, but only on word boundaries. A raw substring test let short
    // entries such as "EY" or "HP" match inside unrelated names ("Disney", "BHP").
    foreach (var known in KnownSingleCompanyNames)
    {
        if (ContainsWholePhrase(name, known))
        {
            return true;
        }
    }

    return false;

    static bool ContainsWholePhrase(string text, string phrase)
    {
        if (phrase.Length == 0)
        {
            return false;
        }

        var searchFrom = 0;
        while (searchFrom <= text.Length - phrase.Length)
        {
            var index = text.IndexOf(phrase, searchFrom, StringComparison.OrdinalIgnoreCase);
            if (index < 0)
            {
                return false;
            }

            var end = index + phrase.Length;
            var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
            var endsOnBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);
            if (startsOnBoundary && endsOnBoundary)
            {
                return true;
            }

            // This occurrence sits inside a longer word; look for a later one.
            searchFrom = index + 1;
        }

        return false;
    }
}
|
||||
|
||||
/// <summary>
/// Reports whether the name contains any fragment listed in <c>SingleCompanyPatterns</c>.
/// </summary>
/// <param name="name">Company name to test.</param>
/// <returns><c>true</c> as soon as one fragment is found; otherwise <c>false</c>.</returns>
private static bool MatchesSingleCompanyPattern(string name)
{
    // The fragments are written in lower case, so normalise the input once up front.
    var normalised = name.ToLowerInvariant();

    foreach (var fragment in SingleCompanyPatterns)
    {
        if (normalised.Contains(fragment))
        {
            return true;
        }
    }

    return false;
}
|
||||
|
||||
/// <summary>
/// Reports whether <paramref name="text"/> reads like a department, division or function
/// name rather than the name of a separate company.
/// </summary>
/// <param name="text">The text to classify, e.g. the right-hand side of an "and"/"&amp;" split.</param>
/// <returns>
/// <c>true</c> when any department keyword occurs in <paramref name="text"/> as a whole
/// word or phrase (case-insensitive); otherwise <c>false</c>.
/// </returns>
private static bool IsDepartmentOrRole(string text)
{
    string[] departmentKeywords =
    [
        "department", "division", "team", "group", "unit",
        "services", "solutions", "operations", "logistics",
        "distribution", "manufacturing", "production",
        "marketing", "sales", "finance", "accounting",
        "hr", "human resources", "it", "technology",
        "research", "development", "r&d", "engineering",
        "retail", "wholesale", "stores", "online",
        "consulting", "advisory", "support"
    ];

    // Keywords must match whole words. A raw substring test classified real company
    // names as departments: "it" is inside "Whitbread", "hr" inside "Chrysler",
    // "sales" inside "Salesforce", "unit" inside "United".
    return departmentKeywords.Any(keyword => ContainsWholePhrase(text, keyword));

    static bool ContainsWholePhrase(string haystack, string phrase)
    {
        var searchFrom = 0;
        while (searchFrom <= haystack.Length - phrase.Length)
        {
            var index = haystack.IndexOf(phrase, searchFrom, StringComparison.OrdinalIgnoreCase);
            if (index < 0)
            {
                return false;
            }

            var end = index + phrase.Length;
            var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(haystack[index - 1]);
            var endsOnBoundary = end == haystack.Length || !char.IsLetterOrDigit(haystack[end]);
            if (startsOnBoundary && endsOnBoundary)
            {
                return true;
            }

            // This occurrence sits inside a longer word; look for a later one.
            searchFrom = index + 1;
        }

        return false;
    }
}
|
||||
|
||||
/// <summary>
/// Heuristic check that <paramref name="text"/> could stand on its own as a company name.
/// </summary>
/// <param name="text">Candidate name, e.g. one half of an "and"/"&amp;" split.</param>
/// <returns>
/// <c>true</c> when the text contains a legal-entity word (Ltd, PLC, Inc, ...) as a whole
/// word, or when it starts with an upper-case character and is at least three characters
/// long; <c>false</c> otherwise, including for anything shorter than two characters.
/// </returns>
private static bool LooksLikeCompanyName(string text)
{
    if (text.Length < 2)
    {
        return false;
    }

    // Legal-entity words. The long forms are listed explicitly because matching is on
    // whole words: "inc" no longer matches inside "incorporated" (nor inside "zinc").
    string[] companySuffixes =
    [
        "ltd", "limited", "plc", "inc", "incorporated", "corp", "corporation",
        "llp", "llc", "group", "holdings"
    ];

    if (companySuffixes.Any(suffix => ContainsWholeWord(text, suffix)))
    {
        return true;
    }

    // Otherwise fall back to shape: leading capital and a reasonable length.
    return char.IsUpper(text[0]) && text.Length >= 3;

    static bool ContainsWholeWord(string haystack, string word)
    {
        var searchFrom = 0;
        while (searchFrom <= haystack.Length - word.Length)
        {
            // Ordinal comparison: suffix detection must not vary with the current culture.
            var index = haystack.IndexOf(word, searchFrom, StringComparison.OrdinalIgnoreCase);
            if (index < 0)
            {
                return false;
            }

            var end = index + word.Length;
            var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(haystack[index - 1]);
            var endsOnBoundary = end == haystack.Length || !char.IsLetterOrDigit(haystack[end]);
            if (startsOnBoundary && endsOnBoundary)
            {
                return true;
            }

            // This occurrence sits inside a longer word; look for a later one.
            searchFrom = index + 1;
        }

        return false;
    }
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user