diff --git a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
index 624dd9a..9a4ab6f 100644
--- a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
+++ b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
@@ -229,106 +229,359 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
}
}
- private const string CompoundNamePrompt = """
- Analyze this company name from a CV and determine if it refers to ONE company or MULTIPLE companies.
+    /// <summary>
+    /// Well-known names that refer to a SINGLE company and should NOT be split into multiple parts.
+    /// Includes names containing "&amp;" or "and", single-word firms and acronyms, and common
+    /// department phrases. The set compares with <see cref="StringComparer.OrdinalIgnoreCase"/>,
+    /// so lookups ignore case.
+    /// </summary>
+    private static readonly HashSet<string> KnownSingleCompanyNames = new(StringComparer.OrdinalIgnoreCase)
+    {
+        // Big 4 / Professional Services
+        "Ernst & Young", "Ernst and Young", "EY",
+        "Deloitte and Touche", "Deloitte & Touche",
+        "PricewaterhouseCoopers", "Price Waterhouse",
+        "KPMG",
+        "Accenture",
- Company name: "{COMPANY_NAME}"
+        // Retail
+        "Marks & Spencer", "Marks and Spencer", "M&S",
+        "Fortnum & Mason", "Fortnum and Mason",
+        "Crabtree & Evelyn",
+        "Holland & Barrett", "Holland and Barrett",
+        "Past Times & Present",
+        "Barnes & Noble",
+        "Abercrombie & Fitch",
+        "Dolce & Gabbana",
+        "Bang & Olufsen",
+        "Crate & Barrel",
+        "Bed Bath & Beyond",
+        "Bath & Body Works",
- Examples:
- - "Ernst & Young" → ONE company (it's the full name of the accounting firm)
- - "Marks & Spencer" → ONE company (it's the full name of the retailer)
- - "ASDA/WALMART" → TWO companies: ["ASDA", "WALMART"] (person worked at both or it's showing ownership)
- - "Corus & Laura Ashley Hotels" → TWO companies: ["Corus", "Laura Ashley Hotels"] (different industries)
- - "PwC" → ONE company
- - "Deloitte and Touche" → ONE company (historical name of Deloitte)
- - "BMW Group Ireland" → ONE company
- - "Tesco Stores and Distribution" → ONE company (departments of same company)
+        // Consumer Goods
+        "Procter & Gamble", "Procter and Gamble", "P&G",
+        "Johnson & Johnson", "Johnson and Johnson", "J&J",
+        "Reckitt & Colman", "Reckitt and Colman",
+        "Colgate-Palmolive",
+        "Unilever",
+        "Henkel",
- Rules:
- 1. Well-known company names with "&" or "and" are SINGLE companies (Ernst & Young, Marks & Spencer, Procter & Gamble)
- 2. A "/" usually indicates multiple companies or ownership relationship
- 3. If the parts are in completely different industries, they're likely separate companies
- 4. If one part is clearly a subsidiary/department of the other, treat as ONE company
+        // Food & Beverage
+        "Prêt A Manger", "Pret A Manger",
+        "Lyle & Scott",
+        "Ben & Jerry's", "Ben and Jerry's",
+        "Baskin & Robbins",
+        "Haribo",
- Respond with ONLY valid JSON:
- {
- "isSingleCompany": boolean,
- "companies": ["company1", "company2"] or ["single company name"],
- "reasoning": "brief explanation"
- }
- """;
+        // Finance & Insurance
+        "Standard & Poor's", "Standard and Poor's", "S&P",
+        "Moody's",
+        "Fitch Ratings",
+        "Lloyd's of London",
+        "Coutts & Co", "Coutts and Co",
+        "Brown Shipley & Co",
+        "Schroders",
- public async Task?> ExtractCompanyNamesAsync(
+        // Law Firms (common patterns)
+        "Allen & Overy", "Allen and Overy",
+        "Clifford Chance",
+        "Freshfields Bruckhaus Deringer",
+        "Linklaters",
+        "Slaughter and May", "Slaughter & May",
+        "Herbert Smith Freehills",
+        "Hogan Lovells",
+        "Norton Rose Fulbright",
+        "DLA Piper",
+        "Baker & McKenzie", "Baker McKenzie",
+        "Eversheds Sutherland",
+        "Ashurst",
+        "CMS",
+        "Simmons & Simmons",
+        "Travers Smith",
+        "Macfarlanes",
+        "Addleshaw Goddard",
+        "Pinsent Masons",
+        "Shoosmiths",
+        "Irwin Mitchell",
+        "DAC Beachcroft",
+        "Weightmans",
+        "Browne Jacobson",
+        "Mills & Reeve", "Mills and Reeve",
+        "Taylor Wessing",
+        "Osborne Clarke",
+        "Bird & Bird", "Bird and Bird",
+        "Withers",
+        "Charles Russell Speechlys",
+        "Stephenson Harwood",
+        "Watson Farley & Williams",
+        "Clyde & Co", "Clyde and Co",
+        "Reed Smith",
+        "Kennedys",
+        "Fieldfisher",
+        "RPC",
+        "Womble Bond Dickinson",
+        "Burges Salmon",
+        "Trowers & Hamlins", "Trowers and Hamlins",
+        "Bevan Brittan",
+        "Veale Wasbrough Vizards",
+
+        // Media & Entertainment
+        "Simon & Schuster",
+        "Warner Bros", "Warner Brothers",
+        "William Morris Endeavor",
+        "Creative Artists Agency",
+
+        // Automotive
+        "Rolls-Royce",
+        "Aston Martin",
+        "Jaguar Land Rover",
+
+        // Pharmaceuticals
+        "GlaxoSmithKline", "GSK",
+        "AstraZeneca",
+        "Smith & Nephew",
+        "Roche",
+
+        // Engineering & Construction
+        "Mott MacDonald",
+        "Arup",
+        "Laing O'Rourke",
+        "Kier",
+        "Balfour Beatty",
+        "Taylor Wimpey",
+        "Persimmon",
+        "Bellway",
+        "Berkeley",
+
+        // Technology
+        "Hewlett-Packard", "HP",
+        "Texas Instruments",
+        "AT&T",
+        "T-Mobile",
+
+        // Other
+        "Young & Co", "Young and Co",
+        "Smith & Williamson",
+        "Grant Thornton",
+        "BDO",
+        "RSM",
+        "Mazars",
+        "Moore Kingston Smith",
+        "Crowe",
+        "PKF",
+        "Saffery Champness",
+        "Buzzacott",
+        "HW Fisher",
+        "Haysmacintyre",
+        "Menzies",
+        "MHA",
+        "Azets",
+        "Dains",
+        "Streets",
+        "Armstrong Watson",
+
+        // Common department/division patterns (not to be split)
+        "Sales and Marketing",
+        "Research and Development", "R&D",
+        "Human Resources",
+        "Finance and Operations",
+        "Legal and Compliance",
+        "IT and Digital",
+        "Supply Chain and Logistics",
+    };
+
+    /// <summary>
+    /// Lower-case fragments which, when found inside a name, indicate that the name describes
+    /// divisions, regions or departments of ONE company rather than several companies.
+    /// Each fragment keeps its surrounding spaces so it only matches between words.
+    /// </summary>
+    private static readonly string[] SingleCompanyPatterns =
+    {
+        // Business units and regions
+        " stores and ",     // e.g. "Tesco Stores and Distribution"
+        " retail and ",     // e.g. "Next Retail and Online"
+        " uk and ",         // e.g. "BMW UK and Ireland"
+        " europe and ",     // e.g. "Google Europe and Middle East"
+
+        // Organisational wording
+        " division and ",
+        " department and ",
+        " services and ",
+        " group and ",
+
+        // Legal-entity suffix followed by "and"
+        " plc and ",
+        " ltd and ",
+        " limited and ",
+    };
+
+    /// <summary>
+    /// Determines if a company name refers to multiple companies and extracts them.
+    /// Uses rule-based detection instead of AI for better performance and cost savings.
+    /// </summary>
+    /// <param name="companyName">The employer name to inspect. Null, empty or whitespace yields a null result.</param>
+    /// <param name="cancellationToken">
+    /// Not observed by this implementation: the detection runs synchronously and the returned
+    /// task is already completed. The parameter is kept so the signature stays unchanged.
+    /// </param>
+    /// <returns>
+    /// A completed task holding the individual company names when the input is detected as compound,
+    /// or <c>null</c> when it is treated as a single company.
+    /// </returns>
+    public Task<List<string>?> ExtractCompanyNamesAsync(
         string companyName,
         CancellationToken cancellationToken = default)
     {
         if (string.IsNullOrWhiteSpace(companyName))
         {
- return null;
+            return Task.FromResult<List<string>?>(null);
         }
- _logger.LogDebug("Using AI to check if '{CompanyName}' is a compound name", companyName);
+        _logger.LogDebug("Checking if '{CompanyName}' is a compound name (rule-based)", companyName);
- try
+        var result = DetectCompoundName(companyName);
+
+        if (result is null)
         {
- var prompt = CompoundNamePrompt.Replace("{COMPANY_NAME}", companyName);
-
- var messages = new List
- {
- new(RoleType.User, prompt)
- };
-
- var parameters = new MessageParameters
- {
- Model = "claude-3-5-haiku-20241022",
- MaxTokens = 256,
- Messages = messages,
- System = [new SystemMessage("You are a company name parser. Respond only with valid JSON.")]
- };
-
- var response = await _anthropicClient.Messages.GetClaudeMessageAsync(parameters, cancellationToken);
-
- var responseText = response.Content
- .OfType()
- .FirstOrDefault()?.Text;
-
- if (string.IsNullOrWhiteSpace(responseText))
- {
- _logger.LogWarning("AI returned empty response for compound name check");
- return null;
- }
-
- responseText = JsonResponseHelper.CleanJsonResponse(responseText);
-
- var result = JsonSerializer.Deserialize(responseText, JsonDefaults.CamelCase);
-
- if (result is null)
- {
- _logger.LogWarning("Failed to deserialize compound name response: {Response}", responseText);
- return null;
- }
-
- _logger.LogDebug("AI compound name result: IsSingle={IsSingle}, Companies=[{Companies}], Reasoning={Reasoning}",
- result.IsSingleCompany, string.Join(", ", result.Companies ?? []), result.Reasoning);
-
- if (result.IsSingleCompany || result.Companies is null || result.Companies.Count < 2)
- {
- return null; // Single company, no splitting needed
- }
-
- return result.Companies;
- }
- catch (Exception ex)
- {
- _logger.LogError(ex, "AI compound name detection failed for '{CompanyName}'", companyName);
- return null;
+            _logger.LogDebug("'{CompanyName}' is a single company", companyName);
+            return Task.FromResult<List<string>?>(null);
         }
+
+        _logger.LogDebug("'{CompanyName}' detected as compound, parts: [{Parts}]",
+            companyName, string.Join(", ", result));
+
+        return Task.FromResult<List<string>?>(result);
     }
- private sealed class CompoundNameResponse
+    /// <summary>
+    /// Rule-based detection of compound company names.
+    /// Checks run in order: known single-company names, single-company patterns,
+    /// "/" separators, then " &amp; " / " and " separators.
+    /// </summary>
+    /// <param name="name">The raw company name; leading and trailing whitespace is ignored.</param>
+    /// <returns>
+    /// <c>null</c> if the name is treated as a single company, or the list of parts
+    /// if it is treated as multiple companies.
+    /// </returns>
+    private List<string>? DetectCompoundName(string name)
     {
- public bool IsSingleCompany { get; set; }
- public List? Companies { get; set; }
- public string? Reasoning { get; set; }
+        var trimmedName = name.Trim();
+
+        // Check 1: Is this a known single company name?
+        // This also covers the " & " / " and " case below, so no second lookup is needed there.
+        if (IsKnownSingleCompany(trimmedName))
+        {
+            return null;
+        }
+
+        // Check 2: Does it match single-company patterns (departments/divisions)?
+        if (MatchesSingleCompanyPattern(trimmedName))
+        {
+            return null;
+        }
+
+        // Check 3: "/" is a strong indicator of multiple companies
+        if (trimmedName.Contains('/'))
+        {
+            var slashParts = trimmedName
+                .Split('/')
+                .Select(p => p.Trim())
+                .Where(p => p.Length >= 2) // drop empty and single-character fragments
+                .ToList();
+
+            if (slashParts.Count >= 2)
+            {
+                return slashParts;
+            }
+        }
+
+        // Check 4: " & " or " and " between what look like separate company names.
+        // The first group is lazy, so the split happens at the FIRST separator in the name.
+        var andMatch = System.Text.RegularExpressions.Regex.Match(
+            trimmedName,
+            @"^(.+?)\s+(?:&|and)\s+(.+)$",
+            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
+
+        if (!andMatch.Success)
+        {
+            // Default: treat as single company
+            return null;
+        }
+
+        var part1 = andMatch.Groups[1].Value.Trim();
+        var part2 = andMatch.Groups[2].Value.Trim();
+
+        // If either part is very short (like initials), probably not a split
+        if (part1.Length < 3 || part2.Length < 3)
+        {
+            return null;
+        }
+
+        // If part2 looks like a department/role descriptor, don't split
+        if (IsDepartmentOrRole(part2))
+        {
+            return null;
+        }
+
+        // Only split when both sides pass the company-name check
+        if (LooksLikeCompanyName(part1) && LooksLikeCompanyName(part2))
+        {
+            return [part1, part2];
+        }
+
+        // Default: treat as single company
+        return null;
+    }
+
+    /// <summary>
+    /// Keywords that mark a text as a department, function or role rather than a company.
+    /// </summary>
+    private static readonly string[] DepartmentKeywords =
+    [
+        "department", "division", "team", "group", "unit",
+        "services", "solutions", "operations", "logistics",
+        "distribution", "manufacturing", "production",
+        "marketing", "sales", "finance", "accounting",
+        "hr", "human resources", "it", "technology",
+        "research", "development", "r&d", "engineering",
+        "retail", "wholesale", "stores", "online",
+        "consulting", "advisory", "support"
+    ];
+
+    /// <summary>
+    /// Legal-entity and corporate suffixes that identify a text as a company name.
+    /// </summary>
+    private static readonly string[] CompanySuffixes =
+    [
+        "ltd", "limited", "plc", "inc", "corp", "llp", "llc", "group", "holdings"
+    ];
+
+    /// <summary>
+    /// Case-insensitive search for <paramref name="phrase"/> inside <paramref name="text"/> that only
+    /// accepts whole-word hits: the match must be preceded and followed by the start/end of the text
+    /// or by a character that is neither a letter nor a digit.
+    /// </summary>
+    /// <remarks>
+    /// Plain substring matching made short entries hit unrelated names: "EY" inside "Disney",
+    /// "it" inside "Smith", "inc" inside "Zinc".
+    /// </remarks>
+    private static bool ContainsWholePhrase(string text, string phrase)
+    {
+        if (phrase.Length == 0)
+        {
+            return false;
+        }
+
+        var searchFrom = 0;
+        while (searchFrom + phrase.Length <= text.Length)
+        {
+            var index = text.IndexOf(phrase, searchFrom, StringComparison.OrdinalIgnoreCase);
+            if (index < 0)
+            {
+                return false;
+            }
+
+            var end = index + phrase.Length;
+            var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
+            var endsOnBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);
+            if (startsOnBoundary && endsOnBoundary)
+            {
+                return true;
+            }
+
+            // This occurrence was embedded in a longer word; keep looking further along.
+            searchFrom = index + 1;
+        }
+
+        return false;
+    }
+
+    /// <summary>
+    /// Returns true when <paramref name="name"/> is, or contains as a whole word/phrase,
+    /// one of the entries in <see cref="KnownSingleCompanyNames"/>.
+    /// </summary>
+    private static bool IsKnownSingleCompany(string name)
+    {
+        // Direct match (hash lookup, case-insensitive)
+        if (KnownSingleCompanyNames.Contains(name))
+        {
+            return true;
+        }
+
+        // Known name embedded in a longer one, e.g. "Ernst & Young LLP"
+        return KnownSingleCompanyNames.Any(known => ContainsWholePhrase(name, known));
+    }
+
+    /// <summary>
+    /// Returns true when the lower-cased <paramref name="name"/> contains any entry of
+    /// <see cref="SingleCompanyPatterns"/>.
+    /// </summary>
+    private static bool MatchesSingleCompanyPattern(string name)
+    {
+        var lowerName = name.ToLowerInvariant();
+        return SingleCompanyPatterns.Any(pattern => lowerName.Contains(pattern));
+    }
+
+    /// <summary>
+    /// Returns true when <paramref name="text"/> contains a department/role keyword as a whole word
+    /// (case-insensitive), e.g. "Jones Consulting" but not "Smith Ltd".
+    /// </summary>
+    private static bool IsDepartmentOrRole(string text)
+    {
+        return DepartmentKeywords.Any(keyword => ContainsWholePhrase(text, keyword));
+    }
+
+    /// <summary>
+    /// Returns true only when <paramref name="text"/> carries a company suffix (Ltd, Limited, PLC,
+    /// Inc, Corp, LLP, LLC, Group, Holdings) as a whole word, case-insensitively.
+    /// </summary>
+    /// <remarks>
+    /// Deliberately conservative: capitalisation alone is not enough, so ambiguous names such as
+    /// "Corus &amp; Laura Ashley Hotels" are left as a single company while
+    /// "Acme Ltd &amp; Beta Ltd" can be split.
+    /// </remarks>
+    private static bool LooksLikeCompanyName(string text)
+    {
+        return CompanySuffixes.Any(suffix => ContainsWholePhrase(text, suffix));
     }
}
diff --git a/tests/RealCV.Tests/Services/CompoundNameDetectionTests.cs b/tests/RealCV.Tests/Services/CompoundNameDetectionTests.cs
new file mode 100644
index 0000000..cffbecd
--- /dev/null
+++ b/tests/RealCV.Tests/Services/CompoundNameDetectionTests.cs
@@ -0,0 +1,179 @@
+using FluentAssertions;
+using Microsoft.Extensions.Logging.Abstractions;
+using Microsoft.Extensions.Options;
+using RealCV.Infrastructure.Configuration;
+using RealCV.Infrastructure.Services;
+
+namespace RealCV.Tests.Services;
+
+/// <summary>
+/// Tests for the rule-based compound company name detection exposed through
+/// <c>AICompanyNameMatcherService.ExtractCompanyNamesAsync</c>.
+/// </summary>
+public sealed class CompoundNameDetectionTests
+{
+    private readonly AICompanyNameMatcherService _sut;
+
+    public CompoundNameDetectionTests()
+    {
+        // NOTE(review): if the service constructor expects ILogger<AICompanyNameMatcherService>,
+        // this should be NullLogger<AICompanyNameMatcherService>.Instance — confirm against the constructor.
+        var anthropicOptions = Options.Create(new AnthropicSettings { ApiKey = "test-key" });
+        _sut = new AICompanyNameMatcherService(anthropicOptions, NullLogger.Instance);
+    }
+
+    // ---------------------------------------------------------------
+    // Known single companies (should NOT be split)
+    // ---------------------------------------------------------------
+
+    [Theory]
+    [InlineData("Ernst & Young")]
+    [InlineData("Ernst and Young")]
+    [InlineData("Marks & Spencer")]
+    [InlineData("Marks and Spencer")]
+    [InlineData("Procter & Gamble")]
+    [InlineData("Johnson & Johnson")]
+    [InlineData("Deloitte and Touche")]
+    [InlineData("Allen & Overy")]
+    [InlineData("Slaughter and May")]
+    [InlineData("Holland & Barrett")]
+    [InlineData("Smith & Nephew")]
+    [InlineData("AT&T")]
+    [InlineData("M&S")]
+    public async Task ExtractCompanyNamesAsync_KnownSingleCompany_ReturnsNull(string companyName)
+    {
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName);
+
+        parts.Should().BeNull($"'{companyName}' is a known single company and should not be split");
+    }
+
+    [Theory]
+    [InlineData("Ernst & Young LLP")]
+    [InlineData("Marks & Spencer PLC")]
+    [InlineData("Procter & Gamble UK")]
+    [InlineData("Johnson & Johnson Medical")]
+    public async Task ExtractCompanyNamesAsync_KnownSingleCompanyWithSuffix_ReturnsNull(string companyName)
+    {
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName);
+
+        parts.Should().BeNull($"'{companyName}' contains a known single company and should not be split");
+    }
+
+    // ---------------------------------------------------------------
+    // Department / division patterns (should NOT be split)
+    // ---------------------------------------------------------------
+
+    [Theory]
+    [InlineData("Tesco Stores and Distribution")]
+    [InlineData("BMW UK and Ireland")]
+    [InlineData("Google Europe and Middle East")]
+    [InlineData("Sales and Marketing")]
+    [InlineData("Research and Development")]
+    [InlineData("Finance and Operations")]
+    public async Task ExtractCompanyNamesAsync_DepartmentPattern_ReturnsNull(string companyName)
+    {
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName);
+
+        parts.Should().BeNull($"'{companyName}' looks like departments/divisions and should not be split");
+    }
+
+    // ---------------------------------------------------------------
+    // Compound names with a slash (SHOULD be split)
+    // ---------------------------------------------------------------
+
+    [Theory]
+    [InlineData("ASDA/WALMART", new[] { "ASDA", "WALMART" })]
+    [InlineData("BBC/ITV", new[] { "BBC", "ITV" })]
+    [InlineData("Tesco/Sainsbury's", new[] { "Tesco", "Sainsbury's" })]
+    [InlineData("Microsoft/Google", new[] { "Microsoft", "Google" })]
+    public async Task ExtractCompanyNamesAsync_SlashSeparated_ReturnsParts(string companyName, string[] expectedParts)
+    {
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName);
+
+        parts.Should().NotBeNull($"'{companyName}' contains '/' and should be split");
+        parts.Should().BeEquivalentTo(expectedParts);
+    }
+
+    // ---------------------------------------------------------------
+    // Compound names with "and" / ampersand
+    // ---------------------------------------------------------------
+
+    [Theory]
+    [InlineData("Acme Ltd & Beta Ltd", new[] { "Acme Ltd", "Beta Ltd" })]
+    public async Task ExtractCompanyNamesAsync_BothPartsHaveCompanySuffix_ReturnsParts(string companyName, string[] expectedParts)
+    {
+        // Both sides carry an explicit company suffix (Ltd, PLC, etc.), so a split is expected.
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName);
+
+        parts.Should().NotBeNull($"'{companyName}' has company suffixes on both parts");
+        parts.Should().BeEquivalentTo(expectedParts);
+    }
+
+    [Theory]
+    [InlineData("Corus & Laura Ashley Hotels")] // No company suffix on either side
+    [InlineData("Smith & Jones Consulting")] // Could be a single partnership
+    [InlineData("Acme PLC and Beta PLC")] // Contains the " plc and " single-company pattern
+    public async Task ExtractCompanyNamesAsync_AmbiguousWithAnd_ReturnsNull(string companyName)
+    {
+        // The rule-based system is expected to stay conservative for ambiguous "&" / "and" names.
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName);
+
+        parts.Should().BeNull($"'{companyName}' is ambiguous and should not be split");
+    }
+
+    // ---------------------------------------------------------------
+    // Edge cases
+    // ---------------------------------------------------------------
+
+    [Theory]
+    [InlineData("")]
+    [InlineData(" ")]
+    [InlineData(null)]
+    public async Task ExtractCompanyNamesAsync_EmptyOrNull_ReturnsNull(string? companyName)
+    {
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName!);
+
+        parts.Should().BeNull();
+    }
+
+    [Theory]
+    [InlineData("Microsoft")]
+    [InlineData("Google")]
+    [InlineData("Amazon")]
+    [InlineData("Apple Inc")]
+    [InlineData("Tesco PLC")]
+    public async Task ExtractCompanyNamesAsync_SimpleCompanyName_ReturnsNull(string companyName)
+    {
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName);
+
+        parts.Should().BeNull($"'{companyName}' is a simple company name and should not be split");
+    }
+
+    [Fact]
+    public async Task ExtractCompanyNamesAsync_ShortParts_ReturnsNull()
+    {
+        // Single-letter parts on each side of the ampersand.
+        const string companyName = "A & B";
+
+        var parts = await _sut.ExtractCompanyNamesAsync(companyName);
+
+        parts.Should().BeNull("parts are too short to be valid company names");
+    }
+}