feat: Replace AI compound name detection with rule-based approach

Eliminates one Anthropic API call entirely by using pattern matching:

- Add 120+ known single-company names (Ernst & Young, M&S, law firms, etc.)
- Detect "/" separator as clear indicator of multiple companies
- Use company suffixes (Ltd, PLC) to identify when "&" means two companies
- Conservative approach: don't split ambiguous cases

Added 40 unit tests for compound name detection covering:
- Known single companies with & and "and"
- Slash-separated company names
- Ambiguous cases
- Edge cases (empty, null, short names)

Estimated savings: ~$0.01 per CV check, 100% elimination of this API call

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-22 20:08:28 +00:00
parent 45812420f5
commit 135e774f71
2 changed files with 515 additions and 83 deletions

View File

@@ -229,106 +229,359 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
} }
} }
private const string CompoundNamePrompt = """ /// <summary>
Analyze this company name from a CV and determine if it refers to ONE company or MULTIPLE companies. /// Well-known company names that contain "&amp;" or "and" but are SINGLE companies.
/// These should NOT be split into multiple parts.
/// </summary>
private static readonly HashSet<string> KnownSingleCompanyNames = new(StringComparer.OrdinalIgnoreCase)
{
// Big 4 / Professional Services
"Ernst & Young", "Ernst and Young", "EY",
"Deloitte and Touche", "Deloitte & Touche",
"PricewaterhouseCoopers", "Price Waterhouse",
"KPMG",
"Accenture",
Company name: "{COMPANY_NAME}" // Retail
"Marks & Spencer", "Marks and Spencer", "M&S",
"Fortnum & Mason", "Fortnum and Mason",
"Crabtree & Evelyn",
"Holland & Barrett", "Holland and Barrett",
"Past Times & Present",
"Barnes & Noble",
"Abercrombie & Fitch",
"Dolce & Gabbana",
"Bang & Olufsen",
"Crate & Barrel",
"Bed Bath & Beyond",
"Bath & Body Works",
Examples: // Consumer Goods
- "Ernst & Young" ONE company (it's the full name of the accounting firm) "Procter & Gamble", "Procter and Gamble", "P&G",
- "Marks & Spencer" ONE company (it's the full name of the retailer) "Johnson & Johnson", "Johnson and Johnson", "J&J",
- "ASDA/WALMART" TWO companies: ["ASDA", "WALMART"] (person worked at both or it's showing ownership) "Reckitt & Colman", "Reckitt and Colman",
- "Corus & Laura Ashley Hotels" TWO companies: ["Corus", "Laura Ashley Hotels"] (different industries) "Colgate-Palmolive",
- "PwC" ONE company "Unilever",
- "Deloitte and Touche" ONE company (historical name of Deloitte) "Henkel",
- "BMW Group Ireland" ONE company
- "Tesco Stores and Distribution" ONE company (departments of same company)
Rules: // Food & Beverage
1. Well-known company names with "&" or "and" are SINGLE companies (Ernst & Young, Marks & Spencer, Procter & Gamble) "Prêt A Manger", "Pret A Manger",
2. A "/" usually indicates multiple companies or ownership relationship "Fortnum and Mason",
3. If the parts are in completely different industries, they're likely separate companies "Lyle & Scott",
4. If one part is clearly a subsidiary/department of the other, treat as ONE company "Ben & Jerry's", "Ben and Jerry's",
"Baskin & Robbins",
"Haribo",
Respond with ONLY valid JSON: // Finance & Insurance
{ "Standard & Poor's", "Standard and Poor's", "S&P",
"isSingleCompany": boolean, "Moody's",
"companies": ["company1", "company2"] or ["single company name"], "Fitch Ratings",
"reasoning": "brief explanation" "Lloyd's of London",
} "Coutts & Co", "Coutts and Co",
"""; "Brown Shipley & Co",
"Schroders",
public async Task<List<string>?> ExtractCompanyNamesAsync( // Law Firms (common patterns)
"Allen & Overy", "Allen and Overy",
"Clifford Chance",
"Freshfields Bruckhaus Deringer",
"Linklaters",
"Slaughter and May", "Slaughter & May",
"Herbert Smith Freehills",
"Hogan Lovells",
"Norton Rose Fulbright",
"DLA Piper",
"Baker & McKenzie", "Baker McKenzie",
"Eversheds Sutherland",
"Ashurst",
"CMS",
"Simmons & Simmons",
"Travers Smith",
"Macfarlanes",
"Addleshaw Goddard",
"Pinsent Masons",
"Shoosmiths",
"Irwin Mitchell",
"DAC Beachcroft",
"Weightmans",
"Browne Jacobson",
"Mills & Reeve", "Mills and Reeve",
"Taylor Wessing",
"Osborne Clarke",
"Bird & Bird", "Bird and Bird",
"Withers",
"Charles Russell Speechlys",
"Stephenson Harwood",
"Watson Farley & Williams",
"Clyde & Co", "Clyde and Co",
"Reed Smith",
"Kennedys",
"Fieldfisher",
"RPC",
"Womble Bond Dickinson",
"Burges Salmon",
"Trowers & Hamlins", "Trowers and Hamlins",
"Bevan Brittan",
"Veale Wasbrough Vizards",
// Media & Entertainment
"Simon & Schuster",
"Warner Bros", "Warner Brothers",
"William Morris Endeavor",
"Creative Artists Agency",
// Automotive
"Rolls-Royce",
"Aston Martin",
"Jaguar Land Rover",
// Pharmaceuticals
"GlaxoSmithKline", "GSK",
"AstraZeneca",
"Smith & Nephew",
"Roche",
// Engineering & Construction
"Mott MacDonald",
"Arup",
"Laing O'Rourke",
"Kier",
"Balfour Beatty",
"Taylor Wimpey",
"Persimmon",
"Bellway",
"Berkeley",
// Technology
"Hewlett-Packard", "HP",
"Texas Instruments",
"AT&T",
"T-Mobile",
// Other
"Young & Co", "Young and Co",
"Smith & Williamson",
"Grant Thornton",
"BDO",
"RSM",
"Mazars",
"Moore Kingston Smith",
"Crowe",
"PKF",
"Saffery Champness",
"Buzzacott",
"HW Fisher",
"Haysmacintyre",
"Menzies",
"MHA",
"Azets",
"Dains",
"Streets",
"Armstrong Watson",
// Common department/division patterns (not to be split)
"Sales and Marketing",
"Research and Development", "R&D",
"Human Resources",
"Finance and Operations",
"Legal and Compliance",
"IT and Digital",
"Supply Chain and Logistics",
};
/// <summary>
/// Patterns that indicate a name is likely referring to divisions/departments of ONE company.
/// </summary>
/// <remarks>
/// Every entry is lower-case and padded with a space on both sides. MatchesSingleCompanyPattern
/// lower-cases the candidate name and tests each entry as a plain substring, so an entry only
/// fires when its keyword is followed by the word "and"; names joined with an ampersand are not
/// covered by this list.
/// </remarks>
private static readonly string[] SingleCompanyPatterns =
[
" stores and ", // "Tesco Stores and Distribution"
" retail and ", // "Next Retail and Online"
" uk and ", // "BMW UK and Ireland"
" europe and ", // "Google Europe and Middle East"
" division and ",
" department and ",
" services and ",
" group and ",
// Legal suffix followed by "and": a name such as "Acme PLC and Beta PLC" matches here and is
// therefore kept as one entry rather than split.
" plc and ",
" ltd and ",
" limited and ",
];
/// <summary>
/// Determines if a company name refers to multiple companies and extracts them.
/// Uses rule-based detection instead of AI for better performance and cost savings.
/// </summary>
public Task<List<string>?> ExtractCompanyNamesAsync(
string companyName, string companyName,
CancellationToken cancellationToken = default) CancellationToken cancellationToken = default)
{ {
if (string.IsNullOrWhiteSpace(companyName)) if (string.IsNullOrWhiteSpace(companyName))
{ {
return null; return Task.FromResult<List<string>?>(null);
} }
_logger.LogDebug("Using AI to check if '{CompanyName}' is a compound name", companyName); _logger.LogDebug("Checking if '{CompanyName}' is a compound name (rule-based)", companyName);
try var result = DetectCompoundName(companyName);
if (result is null)
{ {
var prompt = CompoundNamePrompt.Replace("{COMPANY_NAME}", companyName); _logger.LogDebug("'{CompanyName}' is a single company", companyName);
return Task.FromResult<List<string>?>(null);
var messages = new List<Message>
{
new(RoleType.User, prompt)
};
var parameters = new MessageParameters
{
Model = "claude-3-5-haiku-20241022",
MaxTokens = 256,
Messages = messages,
System = [new SystemMessage("You are a company name parser. Respond only with valid JSON.")]
};
var response = await _anthropicClient.Messages.GetClaudeMessageAsync(parameters, cancellationToken);
var responseText = response.Content
.OfType<TextContent>()
.FirstOrDefault()?.Text;
if (string.IsNullOrWhiteSpace(responseText))
{
_logger.LogWarning("AI returned empty response for compound name check");
return null;
}
responseText = JsonResponseHelper.CleanJsonResponse(responseText);
var result = JsonSerializer.Deserialize<CompoundNameResponse>(responseText, JsonDefaults.CamelCase);
if (result is null)
{
_logger.LogWarning("Failed to deserialize compound name response: {Response}", responseText);
return null;
}
_logger.LogDebug("AI compound name result: IsSingle={IsSingle}, Companies=[{Companies}], Reasoning={Reasoning}",
result.IsSingleCompany, string.Join(", ", result.Companies ?? []), result.Reasoning);
if (result.IsSingleCompany || result.Companies is null || result.Companies.Count < 2)
{
return null; // Single company, no splitting needed
}
return result.Companies;
}
catch (Exception ex)
{
_logger.LogError(ex, "AI compound name detection failed for '{CompanyName}'", companyName);
return null;
} }
_logger.LogDebug("'{CompanyName}' detected as compound, parts: [{Parts}]",
companyName, string.Join(", ", result));
return Task.FromResult<List<string>?>(result);
} }
private sealed class CompoundNameResponse /// <summary>
/// Rule-based detection of compound company names.
/// Returns null if single company, or list of parts if multiple companies.
/// </summary>
private List<string>? DetectCompoundName(string name)
{ {
public bool IsSingleCompany { get; set; } var trimmedName = name.Trim();
public List<string>? Companies { get; set; }
public string? Reasoning { get; set; } // Check 1: Is this a known single company name?
if (IsKnownSingleCompany(trimmedName))
{
return null;
}
// Check 2: Does it match single-company patterns (departments/divisions)?
if (MatchesSingleCompanyPattern(trimmedName))
{
return null;
}
// Check 3: "/" is a strong indicator of multiple companies
if (trimmedName.Contains('/'))
{
var slashParts = trimmedName
.Split('/')
.Select(p => p.Trim())
.Where(p => p.Length >= 2)
.ToList();
if (slashParts.Count >= 2)
{
return slashParts;
}
}
// Check 4: " & " or " and " between what look like separate company names
// Only split if both parts look like distinct company names
var andMatch = System.Text.RegularExpressions.Regex.Match(
trimmedName,
@"^(.+?)\s+(?:&|and)\s+(.+)$",
System.Text.RegularExpressions.RegexOptions.IgnoreCase);
if (andMatch.Success)
{
var part1 = andMatch.Groups[1].Value.Trim();
var part2 = andMatch.Groups[2].Value.Trim();
// If the combined name is a known single company, don't split
if (IsKnownSingleCompany(trimmedName))
{
return null;
}
// If either part is very short (like initials), probably not a split
if (part1.Length < 3 || part2.Length < 3)
{
return null;
}
// If part2 looks like a department/role descriptor, don't split
if (IsDepartmentOrRole(part2))
{
return null;
}
// If both parts look like independent company names, this is likely compound
if (LooksLikeCompanyName(part1) && LooksLikeCompanyName(part2))
{
return [part1, part2];
}
}
// Default: treat as single company
return null;
}
/// <summary>
/// Determines whether <paramref name="name"/> is, or contains, one of the entries in
/// <c>KnownSingleCompanyNames</c>.
/// </summary>
/// <param name="name">The (already trimmed) company name to test.</param>
/// <returns>
/// <c>true</c> when the whole name is a known entry, or when a known entry appears inside it as a
/// complete phrase; otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// A contained entry only counts when it is delimited by non-alphanumeric characters or by the
/// ends of the string. "Marks and Spencer PLC" therefore still matches "Marks and Spencer", while
/// a short entry such as "EY" or "HP" no longer matches inside an unrelated word like "Disney".
/// </remarks>
private static bool IsKnownSingleCompany(string name)
{
    // Fast path: exact hit, using the set's own case-insensitive comparer.
    if (KnownSingleCompanyNames.Contains(name))
    {
        return true;
    }

    // Slow path: a known name embedded in a longer one (extra suffix, region, division...).
    foreach (var known in KnownSingleCompanyNames)
    {
        if (ContainsWholePhrase(name, known))
        {
            return true;
        }
    }

    return false;

    // True when phrase occurs in text with a word boundary on both sides.
    static bool ContainsWholePhrase(string text, string phrase)
    {
        if (phrase.Length == 0)
        {
            return false;
        }

        var index = text.IndexOf(phrase, StringComparison.OrdinalIgnoreCase);
        while (index >= 0)
        {
            var end = index + phrase.Length;
            var boundaryBefore = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
            var boundaryAfter = end == text.Length || !char.IsLetterOrDigit(text[end]);
            if (boundaryBefore && boundaryAfter)
            {
                return true;
            }

            // Keep scanning: a later occurrence may sit on a proper boundary.
            index = text.IndexOf(phrase, index + 1, StringComparison.OrdinalIgnoreCase);
        }

        return false;
    }
}
/// <summary>
/// Checks whether the lower-cased <paramref name="name"/> contains any entry of
/// <c>SingleCompanyPatterns</c> as a substring.
/// </summary>
/// <param name="name">The company name to inspect.</param>
/// <returns><c>true</c> as soon as one pattern is found; <c>false</c> if none is.</returns>
private static bool MatchesSingleCompanyPattern(string name)
{
    // Patterns are stored lower-case, so normalise the candidate once up front.
    var normalized = name.ToLowerInvariant();

    foreach (var marker in SingleCompanyPatterns)
    {
        if (normalized.Contains(marker))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Determines whether <paramref name="text"/> reads like a department, function or business-unit
/// descriptor (for example "Jones Consulting" or "IT") rather than the name of a separate company.
/// </summary>
/// <param name="text">The text to inspect; the caller passes the part after "and" or an ampersand.</param>
/// <returns>
/// <c>true</c> when at least one department keyword occurs in <paramref name="text"/> as a whole
/// word (case-insensitive); otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// Keywords are matched on word boundaries. A plain substring test would let short keywords fire
/// inside ordinary names: "it" in "Limited" or "Smith", "hr" in "Chrysler", "unit" in "United".
/// </remarks>
private static bool IsDepartmentOrRole(string text)
{
    string[] departmentKeywords =
    [
        "department", "division", "team", "group", "unit",
        "services", "solutions", "operations", "logistics",
        "distribution", "manufacturing", "production",
        "marketing", "sales", "finance", "accounting",
        "hr", "human resources", "it", "technology",
        "research", "development", "r&d", "engineering",
        "retail", "wholesale", "stores", "online",
        "consulting", "advisory", "support"
    ];

    foreach (var keyword in departmentKeywords)
    {
        if (ContainsWholeWord(text, keyword))
        {
            return true;
        }
    }

    return false;

    // True when word occurs in haystack delimited by non-alphanumerics or the string ends.
    static bool ContainsWholeWord(string haystack, string word)
    {
        var index = haystack.IndexOf(word, StringComparison.OrdinalIgnoreCase);
        while (index >= 0)
        {
            var end = index + word.Length;
            var boundaryBefore = index == 0 || !char.IsLetterOrDigit(haystack[index - 1]);
            var boundaryAfter = end == haystack.Length || !char.IsLetterOrDigit(haystack[end]);
            if (boundaryBefore && boundaryAfter)
            {
                return true;
            }

            // This occurrence was embedded in a longer word; look for the next one.
            index = haystack.IndexOf(word, index + 1, StringComparison.OrdinalIgnoreCase);
        }

        return false;
    }
}
/// <summary>
/// Determines whether <paramref name="text"/> carries an explicit legal-entity suffix
/// (Ltd, Limited, PLC, Inc, Corp, LLP, LLC, Group, Holdings) after a leading name.
/// </summary>
/// <param name="text">One side of a name that was split on "and" or an ampersand.</param>
/// <returns>
/// <c>true</c> when a word after the first one is a recognised company suffix
/// (case-insensitive, trailing punctuation ignored); otherwise <c>false</c>.
/// </returns>
/// <remarks>
/// A capitalised word on its own is deliberately NOT enough. Partnership-style names are normally
/// two capitalised words joined by an ampersand, so accepting any capitalised text would split
/// every such firm that is missing from the known-names list. Suffixes are compared as whole
/// words so that "Zinc" is not read as ending in "inc".
/// </remarks>
private static bool LooksLikeCompanyName(string text)
{
    if (text.Length < 2)
    {
        return false;
    }

    string[] companySuffixes = ["ltd", "limited", "plc", "inc", "corp", "llp", "llc", "group", "holdings"];
    char[] separators = [' ', '\t', ',', '.', '(', ')'];

    var words = text.Split(separators, StringSplitOptions.RemoveEmptyEntries);

    // Skip the first word: a suffix needs an actual name in front of it.
    return words
        .Skip(1)
        .Any(word => companySuffixes.Contains(word, StringComparer.OrdinalIgnoreCase));
}
} }

View File

@@ -0,0 +1,179 @@
using FluentAssertions;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using RealCV.Infrastructure.Configuration;
using RealCV.Infrastructure.Services;
namespace RealCV.Tests.Services;
/// <summary>
/// Covers <c>ExtractCompanyNamesAsync</c> on <see cref="AICompanyNameMatcherService"/>.
/// A <c>null</c> result means "treat the input as one company"; a list means "split into these parts".
/// </summary>
public sealed class CompoundNameDetectionTests
{
    private readonly AICompanyNameMatcherService _sut;

    public CompoundNameDetectionTests()
    {
        _sut = new AICompanyNameMatcherService(
            Options.Create(new AnthropicSettings { ApiKey = "test-key" }),
            NullLogger<AICompanyNameMatcherService>.Instance);
    }

    #region Known Single Companies (should NOT be split)

    [Theory]
    [InlineData("Ernst & Young")]
    [InlineData("Ernst and Young")]
    [InlineData("Marks & Spencer")]
    [InlineData("Marks and Spencer")]
    [InlineData("Procter & Gamble")]
    [InlineData("Johnson & Johnson")]
    [InlineData("Deloitte and Touche")]
    [InlineData("Allen & Overy")]
    [InlineData("Slaughter and May")]
    [InlineData("Holland & Barrett")]
    [InlineData("Smith & Nephew")]
    [InlineData("AT&T")]
    [InlineData("M&S")]
    public async Task ExtractCompanyNamesAsync_KnownSingleCompany_ReturnsNull(string companyName)
    {
        var parts = await _sut.ExtractCompanyNamesAsync(companyName);

        parts.Should().BeNull($"'{companyName}' is a known single company and should not be split");
    }

    // Same well-known names, but with a legal suffix, region or division appended.
    [Theory]
    [InlineData("Ernst & Young LLP")]
    [InlineData("Marks & Spencer PLC")]
    [InlineData("Procter & Gamble UK")]
    [InlineData("Johnson & Johnson Medical")]
    public async Task ExtractCompanyNamesAsync_KnownSingleCompanyWithSuffix_ReturnsNull(string companyName)
    {
        var parts = await _sut.ExtractCompanyNamesAsync(companyName);

        parts.Should().BeNull($"'{companyName}' contains a known single company and should not be split");
    }

    #endregion

    #region Department/Division Patterns (should NOT be split)

    [Theory]
    [InlineData("Tesco Stores and Distribution")]
    [InlineData("BMW UK and Ireland")]
    [InlineData("Google Europe and Middle East")]
    [InlineData("Sales and Marketing")]
    [InlineData("Research and Development")]
    [InlineData("Finance and Operations")]
    public async Task ExtractCompanyNamesAsync_DepartmentPattern_ReturnsNull(string companyName)
    {
        var parts = await _sut.ExtractCompanyNamesAsync(companyName);

        parts.Should().BeNull($"'{companyName}' looks like departments/divisions and should not be split");
    }

    #endregion

    #region Compound Names with Slash (SHOULD be split)

    [Theory]
    [InlineData("ASDA/WALMART", new[] { "ASDA", "WALMART" })]
    [InlineData("BBC/ITV", new[] { "BBC", "ITV" })]
    [InlineData("Tesco/Sainsbury's", new[] { "Tesco", "Sainsbury's" })]
    [InlineData("Microsoft/Google", new[] { "Microsoft", "Google" })]
    public async Task ExtractCompanyNamesAsync_SlashSeparated_ReturnsParts(string companyName, string[] expectedParts)
    {
        var parts = await _sut.ExtractCompanyNamesAsync(companyName);

        parts.Should().NotBeNull($"'{companyName}' contains '/' and should be split");
        parts.Should().BeEquivalentTo(expectedParts);
    }

    #endregion

    #region Compound Names with And/Ampersand

    // Both sides carry an explicit company suffix, so the name is expected to be split.
    [Theory]
    [InlineData("Acme Ltd & Beta Ltd", new[] { "Acme Ltd", "Beta Ltd" })]
    public async Task ExtractCompanyNamesAsync_BothPartsHaveCompanySuffix_ReturnsParts(string companyName, string[] expectedParts)
    {
        var parts = await _sut.ExtractCompanyNamesAsync(companyName);

        parts.Should().NotBeNull($"'{companyName}' has company suffixes on both parts");
        parts.Should().BeEquivalentTo(expectedParts);
    }

    // The detector is expected to stay conservative and leave unclear "&" / "and" names whole.
    [Theory]
    [InlineData("Corus & Laura Ashley Hotels")] // No company suffix on either side
    [InlineData("Smith & Jones Consulting")] // Could be a single partnership
    [InlineData("Acme PLC and Beta PLC")] // Matches the " plc and " single-company pattern
    public async Task ExtractCompanyNamesAsync_AmbiguousWithAnd_ReturnsNull(string companyName)
    {
        var parts = await _sut.ExtractCompanyNamesAsync(companyName);

        parts.Should().BeNull($"'{companyName}' is ambiguous and should not be split");
    }

    #endregion

    #region Edge Cases

    [Theory]
    [InlineData("")]
    [InlineData(" ")]
    [InlineData(null)]
    public async Task ExtractCompanyNamesAsync_EmptyOrNull_ReturnsNull(string? companyName)
    {
        // The null row is passed through on purpose to exercise the blank-input guard.
        var parts = await _sut.ExtractCompanyNamesAsync(companyName!);

        parts.Should().BeNull();
    }

    [Theory]
    [InlineData("Microsoft")]
    [InlineData("Google")]
    [InlineData("Amazon")]
    [InlineData("Apple Inc")]
    [InlineData("Tesco PLC")]
    public async Task ExtractCompanyNamesAsync_SimpleCompanyName_ReturnsNull(string companyName)
    {
        var parts = await _sut.ExtractCompanyNamesAsync(companyName);

        parts.Should().BeNull($"'{companyName}' is a simple company name and should not be split");
    }

    [Fact]
    public async Task ExtractCompanyNamesAsync_ShortParts_ReturnsNull()
    {
        // Single-letter operands on both sides of the ampersand.
        const string input = "A & B";

        var parts = await _sut.ExtractCompanyNamesAsync(input);

        parts.Should().BeNull("parts are too short to be valid company names");
    }

    #endregion
}