/// <summary>
/// Well-known names that must never be split into multiple companies.
/// Most entries contain "&amp;" or "and" but denote a SINGLE company; the remaining
/// entries are well-known firms and department phrases kept for exact lookups.
/// Lookups are case-insensitive (<see cref="StringComparer.OrdinalIgnoreCase"/>).
/// </summary>
private static readonly HashSet<string> KnownSingleCompanyNames = new(StringComparer.OrdinalIgnoreCase)
{
    // Big 4 / Professional Services
    "Ernst & Young", "Ernst and Young", "EY",
    "Deloitte and Touche", "Deloitte & Touche",
    "PricewaterhouseCoopers", "Price Waterhouse",
    "KPMG",
    "Accenture",

    // Retail & Fashion
    "Marks & Spencer", "Marks and Spencer", "M&S",
    "Fortnum & Mason", "Fortnum and Mason",
    "Crabtree & Evelyn",
    "Holland & Barrett", "Holland and Barrett",
    "Past Times & Present",
    "Barnes & Noble",
    "Abercrombie & Fitch",
    "Dolce & Gabbana",
    "Bang & Olufsen",
    "Crate & Barrel",
    "Bed Bath & Beyond",
    "Bath & Body Works",
    "Lyle & Scott",

    // Consumer Goods
    "Procter & Gamble", "Procter and Gamble", "P&G",
    "Johnson & Johnson", "Johnson and Johnson", "J&J",
    "Reckitt & Colman", "Reckitt and Colman",
    "Colgate-Palmolive",
    "Unilever",
    "Henkel",

    // Food & Beverage
    "Prêt A Manger", "Pret A Manger",
    "Ben & Jerry's", "Ben and Jerry's",
    "Baskin & Robbins",
    "Haribo",

    // Finance & Insurance
    "Standard & Poor's", "Standard and Poor's", "S&P",
    "Moody's",
    "Fitch Ratings",
    "Lloyd's of London",
    "Coutts & Co", "Coutts and Co",
    "Brown Shipley & Co",
    "Schroders",

    // Law Firms (common patterns)
    "Allen & Overy", "Allen and Overy",
    "Clifford Chance",
    "Freshfields Bruckhaus Deringer",
    "Linklaters",
    "Slaughter and May", "Slaughter & May",
    "Herbert Smith Freehills",
    "Hogan Lovells",
    "Norton Rose Fulbright",
    "DLA Piper",
    "Baker & McKenzie", "Baker McKenzie",
    "Eversheds Sutherland",
    "Ashurst",
    "CMS",
    "Simmons & Simmons",
    "Travers Smith",
    "Macfarlanes",
    "Addleshaw Goddard",
    "Pinsent Masons",
    "Shoosmiths",
    "Irwin Mitchell",
    "DAC Beachcroft",
    "Weightmans",
    "Browne Jacobson",
    "Mills & Reeve", "Mills and Reeve",
    "Taylor Wessing",
    "Osborne Clarke",
    "Bird & Bird", "Bird and Bird",
    "Withers",
    "Charles Russell Speechlys",
    "Stephenson Harwood",
    "Watson Farley & Williams",
    "Clyde & Co", "Clyde and Co",
    "Reed Smith",
    "Kennedys",
    "Fieldfisher",
    "RPC",
    "Womble Bond Dickinson",
    "Burges Salmon",
    "Trowers & Hamlins", "Trowers and Hamlins",
    "Bevan Brittan",
    "Veale Wasbrough Vizards",

    // Media & Entertainment
    "Simon & Schuster",
    "Warner Bros", "Warner Brothers",
    "William Morris Endeavor",
    "Creative Artists Agency",

    // Automotive
    "Rolls-Royce",
    "Aston Martin",
    "Jaguar Land Rover",

    // Pharmaceuticals
    "GlaxoSmithKline", "GSK",
    "AstraZeneca",
    "Smith & Nephew",
    "Roche",

    // Engineering & Construction
    "Mott MacDonald",
    "Arup",
    "Laing O'Rourke",
    "Kier",
    "Balfour Beatty",
    "Taylor Wimpey",
    "Persimmon",
    "Bellway",
    "Berkeley",

    // Technology
    "Hewlett-Packard", "HP",
    "Texas Instruments",
    "AT&T",
    "T-Mobile",

    // Other
    "Young & Co", "Young and Co",
    "Smith & Williamson",
    "Grant Thornton",
    "BDO",
    "RSM",
    "Mazars",
    "Moore Kingston Smith",
    "Crowe",
    "PKF",
    "Saffery Champness",
    "Buzzacott",
    "HW Fisher",
    "Haysmacintyre",
    "Menzies",
    "MHA",
    "Azets",
    "Dains",
    "Streets",
    "Armstrong Watson",

    // Common department/division phrases (not to be split)
    "Sales and Marketing",
    "Research and Development", "R&D",
    "Human Resources",
    "Finance and Operations",
    "Legal and Compliance",
    "IT and Digital",
    "Supply Chain and Logistics",
};

/// <summary>
/// Lower-case fragments that indicate a name is describing divisions/departments of ONE company.
/// Each fragment includes its surrounding spaces, so it only matches between words.
/// </summary>
private static readonly string[] SingleCompanyPatterns =
[
    " stores and ",      // "Tesco Stores and Distribution"
    " retail and ",      // "Next Retail and Online"
    " uk and ",          // "BMW UK and Ireland"
    " europe and ",      // "Google Europe and Middle East"
    " division and ",
    " department and ",
    " services and ",
    " group and ",
    " plc and ",
    " ltd and ",
    " limited and ",
];
/// <summary>
/// Determines whether a company name refers to several companies and, if so, extracts them.
/// Uses rule-based detection instead of AI for better performance and cost savings.
/// </summary>
/// <param name="companyName">Company name as written on the CV.</param>
/// <param name="cancellationToken">Not observed; the rule-based check runs synchronously.</param>
/// <returns>
/// A completed task holding the individual company names when the input is compound,
/// or <c>null</c> when the input is blank or is treated as a single company.
/// </returns>
public Task<List<string>?> ExtractCompanyNamesAsync(
    string companyName,
    CancellationToken cancellationToken = default)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return Task.FromResult<List<string>?>(null);
    }

    _logger.LogDebug("Checking if '{CompanyName}' is a compound name (rule-based)", companyName);

    var result = DetectCompoundName(companyName);

    if (result is null)
    {
        _logger.LogDebug("'{CompanyName}' is a single company", companyName);
        return Task.FromResult<List<string>?>(null);
    }

    _logger.LogDebug("'{CompanyName}' detected as compound, parts: [{Parts}]",
        companyName, string.Join(", ", result));

    return Task.FromResult<List<string>?>(result);
}

/// <summary>
/// Splits "X &amp; Y" / "X and Y" into its two sides (lazy left side, so the first connector wins).
/// Built once instead of on every call.
/// </summary>
private static readonly System.Text.RegularExpressions.Regex AndConnectorRegex = new(
    @"^(.+?)\s+(?:&|and)\s+(.+)$",
    System.Text.RegularExpressions.RegexOptions.IgnoreCase
        | System.Text.RegularExpressions.RegexOptions.CultureInvariant
        | System.Text.RegularExpressions.RegexOptions.Compiled);

/// <summary>
/// Words that mark the right-hand side of an "and" as a department or function rather than a company.
/// </summary>
private static readonly string[] DepartmentKeywords =
[
    "department", "division", "team", "group", "unit",
    "services", "solutions", "operations", "logistics",
    "distribution", "manufacturing", "production",
    "marketing", "sales", "finance", "accounting",
    "hr", "human resources", "it", "technology",
    "research", "development", "r&d", "engineering",
    "retail", "wholesale", "stores", "online",
    "consulting", "advisory", "support"
];

/// <summary>
/// Legal-entity words that make a fragment unambiguously a company name.
/// "corporation" and "incorporated" are listed explicitly because matching is now whole-word;
/// the previous prefix match on " corp" / " inc" used to cover them implicitly.
/// </summary>
private static readonly string[] CompanySuffixes =
[
    "ltd", "limited", "plc", "inc", "incorporated", "corp", "corporation",
    "llp", "llc", "group", "holdings"
];

/// <summary>
/// Rule-based detection of compound company names.
/// Returns null if single company, or the list of parts if multiple companies.
/// </summary>
/// <param name="name">Non-blank company name; surrounding whitespace is ignored.</param>
private static List<string>? DetectCompoundName(string name)
{
    var trimmedName = name.Trim();

    // Checks 1 & 2: known single companies and department/division phrasing are never split.
    if (IsKnownSingleCompany(trimmedName) || MatchesSingleCompanyPattern(trimmedName))
    {
        return null;
    }

    // Check 3: "/" is a strong indicator of multiple companies.
    if (trimmedName.Contains('/'))
    {
        var slashParts = trimmedName
            .Split('/')
            .Select(p => p.Trim())
            .Where(p => p.Length >= 2)
            .ToList();

        if (slashParts.Count >= 2)
        {
            return slashParts;
        }
    }

    // Check 4: " & " or " and " between two fragments that are each clearly a company.
    var andMatch = AndConnectorRegex.Match(trimmedName);
    if (!andMatch.Success)
    {
        return null;
    }

    var part1 = andMatch.Groups[1].Value.Trim();
    var part2 = andMatch.Groups[2].Value.Trim();

    // Very short sides (initials such as "A & B") are not treated as separate companies.
    if (part1.Length < 3 || part2.Length < 3)
    {
        return null;
    }

    // "X and Marketing", "X & Jones Consulting": the right side describes a function.
    if (IsDepartmentOrRole(part2))
    {
        return null;
    }

    // Stay conservative: "Smith & Jones" is far more often one partnership than two firms,
    // so only split when BOTH sides carry a legal-entity word.
    if (LooksLikeCompanyName(part1) && LooksLikeCompanyName(part2))
    {
        return [part1, part2];
    }

    // Default: treat as single company.
    return null;
}

/// <summary>
/// Returns true when <paramref name="name"/> is, or contains as a whole phrase,
/// a known single-company name.
/// </summary>
private static bool IsKnownSingleCompany(string name)
{
    // Direct (case-insensitive) match against the whole name.
    if (KnownSingleCompanyNames.Contains(name))
    {
        return true;
    }

    // Embedded match, e.g. "Ernst & Young LLP". Only known names that themselves contain a
    // connector are considered (they are the ones at risk of being split), and only on word
    // boundaries. A raw substring scan made "EY" match "Disney" and stopped "KPMG/Deloitte"
    // from ever being split.
    foreach (var known in KnownSingleCompanyNames)
    {
        if (ContainsConnector(known) && ContainsWholePhrase(name, known))
        {
            return true;
        }
    }

    return false;
}

/// <summary>
/// Returns true when the name contains one of the division/department fragments
/// in <c>SingleCompanyPatterns</c> (case-insensitive).
/// </summary>
private static bool MatchesSingleCompanyPattern(string name)
{
    return SingleCompanyPatterns.Any(
        pattern => name.Contains(pattern, StringComparison.OrdinalIgnoreCase));
}

/// <summary>
/// Returns true when <paramref name="text"/> contains a department/function keyword as a whole word.
/// Whole-word matching keeps "it" and "hr" from matching inside "Whitbread" or "Christie's".
/// </summary>
private static bool IsDepartmentOrRole(string text)
{
    return DepartmentKeywords.Any(keyword => ContainsWholePhrase(text, keyword));
}

/// <summary>
/// Returns true when <paramref name="text"/> carries a legal-entity word (Ltd, PLC, Inc, ...)
/// as a whole word. Capitalisation alone is not enough: almost every CV fragment starts with
/// a capital letter, which made the previous heuristic accept everything.
/// </summary>
private static bool LooksLikeCompanyName(string text)
{
    if (text.Length < 2)
    {
        return false;
    }

    return CompanySuffixes.Any(suffix => ContainsWholePhrase(text, suffix));
}

/// <summary>True when <paramref name="name"/> contains "&amp;" or the word "and" between spaces.</summary>
private static bool ContainsConnector(string name)
{
    return name.Contains('&') || name.Contains(" and ", StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Case-insensitive search for <paramref name="phrase"/> inside <paramref name="text"/>,
/// where an occurrence only counts if it is not glued to a letter or digit on either side.
/// </summary>
private static bool ContainsWholePhrase(string text, string phrase)
{
    if (phrase.Length == 0)
    {
        return false;
    }

    var index = text.IndexOf(phrase, StringComparison.OrdinalIgnoreCase);
    while (index >= 0)
    {
        var end = index + phrase.Length;
        var startsOnBoundary = index == 0 || !char.IsLetterOrDigit(text[index - 1]);
        var endsOnBoundary = end == text.Length || !char.IsLetterOrDigit(text[end]);

        if (startsOnBoundary && endsOnBoundary)
        {
            return true;
        }

        // Keep scanning: a later occurrence may sit on word boundaries.
        index = text.IndexOf(phrase, index + 1, StringComparison.OrdinalIgnoreCase);
    }

    return false;
}
using FluentAssertions;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Options;
using RealCV.Infrastructure.Configuration;
using RealCV.Infrastructure.Services;

namespace RealCV.Tests.Services;

/// <summary>
/// Tests for the rule-based compound company name detection exposed through
/// <c>AICompanyNameMatcherService.ExtractCompanyNamesAsync</c>.
/// A <c>null</c> result means "single company, do not split".
/// </summary>
public sealed class CompoundNameDetectionTests
{
    private readonly AICompanyNameMatcherService _sut;

    public CompoundNameDetectionTests()
    {
        // The API key is a placeholder: compound-name detection is rule-based and makes no API call.
        var settings = Options.Create(new AnthropicSettings { ApiKey = "test-key" });
        _sut = new AICompanyNameMatcherService(settings, NullLogger<AICompanyNameMatcherService>.Instance);
    }

    #region Known Single Companies (should NOT be split)

    [Theory]
    [InlineData("Ernst & Young")]
    [InlineData("Ernst and Young")]
    [InlineData("Marks & Spencer")]
    [InlineData("Marks and Spencer")]
    [InlineData("Procter & Gamble")]
    [InlineData("Johnson & Johnson")]
    [InlineData("Deloitte and Touche")]
    [InlineData("Allen & Overy")]
    [InlineData("Slaughter and May")]
    [InlineData("Holland & Barrett")]
    [InlineData("Smith & Nephew")]
    [InlineData("AT&T")]
    [InlineData("M&S")]
    public async Task ExtractCompanyNamesAsync_KnownSingleCompany_ReturnsNull(string companyName)
    {
        // Act
        var result = await _sut.ExtractCompanyNamesAsync(companyName);

        // Assert
        result.Should().BeNull($"'{companyName}' is a known single company and should not be split");
    }

    [Theory]
    [InlineData("Ernst & Young LLP")]
    [InlineData("Marks & Spencer PLC")]
    [InlineData("Procter & Gamble UK")]
    [InlineData("Johnson & Johnson Medical")]
    public async Task ExtractCompanyNamesAsync_KnownSingleCompanyWithSuffix_ReturnsNull(string companyName)
    {
        // Act
        var result = await _sut.ExtractCompanyNamesAsync(companyName);

        // Assert
        result.Should().BeNull($"'{companyName}' contains a known single company and should not be split");
    }

    #endregion

    #region Department/Division Patterns (should NOT be split)

    [Theory]
    [InlineData("Tesco Stores and Distribution")]
    [InlineData("BMW UK and Ireland")]
    [InlineData("Google Europe and Middle East")]
    [InlineData("Sales and Marketing")]
    [InlineData("Research and Development")]
    [InlineData("Finance and Operations")]
    public async Task ExtractCompanyNamesAsync_DepartmentPattern_ReturnsNull(string companyName)
    {
        // Act
        var result = await _sut.ExtractCompanyNamesAsync(companyName);

        // Assert
        result.Should().BeNull($"'{companyName}' looks like departments/divisions and should not be split");
    }

    #endregion

    #region Compound Names with Slash (SHOULD be split)

    [Theory]
    [InlineData("ASDA/WALMART", new[] { "ASDA", "WALMART" })]
    [InlineData("BBC/ITV", new[] { "BBC", "ITV" })]
    [InlineData("Tesco/Sainsbury's", new[] { "Tesco", "Sainsbury's" })]
    [InlineData("Microsoft/Google", new[] { "Microsoft", "Google" })]
    public async Task ExtractCompanyNamesAsync_SlashSeparated_ReturnsParts(string companyName, string[] expectedParts)
    {
        // Act
        var result = await _sut.ExtractCompanyNamesAsync(companyName);

        // Assert
        result.Should().NotBeNull($"'{companyName}' contains '/' and should be split");
        result.Should().BeEquivalentTo(expectedParts);
    }

    #endregion

    #region Compound Names with And/Ampersand

    [Theory]
    [InlineData("Acme Ltd & Beta Ltd", new[] { "Acme Ltd", "Beta Ltd" })]
    public async Task ExtractCompanyNamesAsync_BothPartsHaveCompanySuffix_ReturnsParts(string companyName, string[] expectedParts)
    {
        // When both parts clearly have company suffixes (Ltd, PLC, etc.), split them

        // Act
        var result = await _sut.ExtractCompanyNamesAsync(companyName);

        // Assert
        result.Should().NotBeNull($"'{companyName}' has company suffixes on both parts");
        result.Should().BeEquivalentTo(expectedParts);
    }

    [Theory]
    [InlineData("Corus & Laura Ashley Hotels")] // Ambiguous - neither side has a company suffix
    [InlineData("Smith & Jones Consulting")]    // Could be a single partnership
    [InlineData("Acme PLC and Beta PLC")]       // Not split: contains the " plc and " single-company pattern
    public async Task ExtractCompanyNamesAsync_AmbiguousWithAnd_ReturnsNull(string companyName)
    {
        // Rule-based system is conservative with ambiguous & and "and" cases

        // Act
        var result = await _sut.ExtractCompanyNamesAsync(companyName);

        // Assert
        result.Should().BeNull($"'{companyName}' is ambiguous and should not be split");
    }

    #endregion

    #region Edge Cases

    [Theory]
    [InlineData("")]
    [InlineData(" ")]
    [InlineData(null)]
    public async Task ExtractCompanyNamesAsync_EmptyOrNull_ReturnsNull(string? companyName)
    {
        // Act - null is passed deliberately to exercise the blank-input guard
        var result = await _sut.ExtractCompanyNamesAsync(companyName!);

        // Assert
        result.Should().BeNull();
    }

    [Theory]
    [InlineData("Microsoft")]
    [InlineData("Google")]
    [InlineData("Amazon")]
    [InlineData("Apple Inc")]
    [InlineData("Tesco PLC")]
    public async Task ExtractCompanyNamesAsync_SimpleCompanyName_ReturnsNull(string companyName)
    {
        // Act
        var result = await _sut.ExtractCompanyNamesAsync(companyName);

        // Assert
        result.Should().BeNull($"'{companyName}' is a simple company name and should not be split");
    }

    [Fact]
    public async Task ExtractCompanyNamesAsync_ShortParts_ReturnsNull()
    {
        // Arrange - Parts too short to be valid company names
        var companyName = "A & B";

        // Act
        var result = await _sut.ExtractCompanyNamesAsync(companyName);

        // Assert
        result.Should().BeNull("parts are too short to be valid company names");
    }

    #endregion
}