Eliminates one Anthropic API call entirely by using pattern matching: - Add 120+ known single-company names (Ernst & Young, M&S, law firms, etc.) - Detect "/" separator as clear indicator of multiple companies - Use company suffixes (Ltd, PLC) to identify when "&" means two companies - Conservative approach: don't split ambiguous cases Added 40 unit tests for compound name detection covering: - Known single companies with & and "and" - Slash-separated company names - Ambiguous cases - Edge cases (empty, null, short names) Estimated savings: ~$0.01 per CV check, 100% elimination of this API call 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
180 lines
5.8 KiB
C#
180 lines
5.8 KiB
C#
using FluentAssertions;
|
|
using Microsoft.Extensions.Logging.Abstractions;
|
|
using Microsoft.Extensions.Options;
|
|
using RealCV.Infrastructure.Configuration;
|
|
using RealCV.Infrastructure.Services;
|
|
|
|
namespace RealCV.Tests.Services;
|
|
|
|
/// <summary>
|
|
/// Tests for the rule-based compound company name detection.
|
|
/// </summary>
|
|
public sealed class CompoundNameDetectionTests
|
|
{
|
|
private readonly AICompanyNameMatcherService _sut;
|
|
|
|
public CompoundNameDetectionTests()
|
|
{
|
|
var settings = Options.Create(new AnthropicSettings { ApiKey = "test-key" });
|
|
_sut = new AICompanyNameMatcherService(settings, NullLogger<AICompanyNameMatcherService>.Instance);
|
|
}
|
|
|
|
#region Known Single Companies (should NOT be split)
|
|
|
|
[Theory]
|
|
[InlineData("Ernst & Young")]
|
|
[InlineData("Ernst and Young")]
|
|
[InlineData("Marks & Spencer")]
|
|
[InlineData("Marks and Spencer")]
|
|
[InlineData("Procter & Gamble")]
|
|
[InlineData("Johnson & Johnson")]
|
|
[InlineData("Deloitte and Touche")]
|
|
[InlineData("Allen & Overy")]
|
|
[InlineData("Slaughter and May")]
|
|
[InlineData("Holland & Barrett")]
|
|
[InlineData("Smith & Nephew")]
|
|
[InlineData("AT&T")]
|
|
[InlineData("M&S")]
|
|
public async Task ExtractCompanyNamesAsync_KnownSingleCompany_ReturnsNull(string companyName)
|
|
{
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName);
|
|
|
|
// Assert
|
|
result.Should().BeNull($"'{companyName}' is a known single company and should not be split");
|
|
}
|
|
|
|
[Theory]
|
|
[InlineData("Ernst & Young LLP")]
|
|
[InlineData("Marks & Spencer PLC")]
|
|
[InlineData("Procter & Gamble UK")]
|
|
[InlineData("Johnson & Johnson Medical")]
|
|
public async Task ExtractCompanyNamesAsync_KnownSingleCompanyWithSuffix_ReturnsNull(string companyName)
|
|
{
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName);
|
|
|
|
// Assert
|
|
result.Should().BeNull($"'{companyName}' contains a known single company and should not be split");
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region Department/Division Patterns (should NOT be split)
|
|
|
|
[Theory]
|
|
[InlineData("Tesco Stores and Distribution")]
|
|
[InlineData("BMW UK and Ireland")]
|
|
[InlineData("Google Europe and Middle East")]
|
|
[InlineData("Sales and Marketing")]
|
|
[InlineData("Research and Development")]
|
|
[InlineData("Finance and Operations")]
|
|
public async Task ExtractCompanyNamesAsync_DepartmentPattern_ReturnsNull(string companyName)
|
|
{
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName);
|
|
|
|
// Assert
|
|
result.Should().BeNull($"'{companyName}' looks like departments/divisions and should not be split");
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region Compound Names with Slash (SHOULD be split)
|
|
|
|
[Theory]
|
|
[InlineData("ASDA/WALMART", new[] { "ASDA", "WALMART" })]
|
|
[InlineData("BBC/ITV", new[] { "BBC", "ITV" })]
|
|
[InlineData("Tesco/Sainsbury's", new[] { "Tesco", "Sainsbury's" })]
|
|
[InlineData("Microsoft/Google", new[] { "Microsoft", "Google" })]
|
|
public async Task ExtractCompanyNamesAsync_SlashSeparated_ReturnsParts(string companyName, string[] expectedParts)
|
|
{
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName);
|
|
|
|
// Assert
|
|
result.Should().NotBeNull($"'{companyName}' contains '/' and should be split");
|
|
result.Should().BeEquivalentTo(expectedParts);
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region Compound Names with And/Ampersand
|
|
|
|
[Theory]
|
|
[InlineData("Acme Ltd & Beta Ltd", new[] { "Acme Ltd", "Beta Ltd" })]
|
|
public async Task ExtractCompanyNamesAsync_BothPartsHaveCompanySuffix_ReturnsParts(string companyName, string[] expectedParts)
|
|
{
|
|
// When both parts clearly have company suffixes (Ltd, PLC, etc.), split them
|
|
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName);
|
|
|
|
// Assert
|
|
result.Should().NotBeNull($"'{companyName}' has company suffixes on both parts");
|
|
result.Should().BeEquivalentTo(expectedParts);
|
|
}
|
|
|
|
[Theory]
|
|
[InlineData("Corus & Laura Ashley Hotels")] // Ambiguous - neither has company suffix
|
|
[InlineData("Smith & Jones Consulting")] // Could be a single partnership
|
|
[InlineData("Acme PLC and Beta PLC")] // Matches " plc and " department pattern
|
|
public async Task ExtractCompanyNamesAsync_AmbiguousWithAnd_ReturnsNull(string companyName)
|
|
{
|
|
// Rule-based system is conservative with ambiguous & and "and" cases
|
|
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName);
|
|
|
|
// Assert
|
|
result.Should().BeNull($"'{companyName}' is ambiguous and should not be split");
|
|
}
|
|
|
|
#endregion
|
|
|
|
#region Edge Cases
|
|
|
|
[Theory]
|
|
[InlineData("")]
|
|
[InlineData(" ")]
|
|
[InlineData(null)]
|
|
public async Task ExtractCompanyNamesAsync_EmptyOrNull_ReturnsNull(string? companyName)
|
|
{
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName!);
|
|
|
|
// Assert
|
|
result.Should().BeNull();
|
|
}
|
|
|
|
[Theory]
|
|
[InlineData("Microsoft")]
|
|
[InlineData("Google")]
|
|
[InlineData("Amazon")]
|
|
[InlineData("Apple Inc")]
|
|
[InlineData("Tesco PLC")]
|
|
public async Task ExtractCompanyNamesAsync_SimpleCompanyName_ReturnsNull(string companyName)
|
|
{
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName);
|
|
|
|
// Assert
|
|
result.Should().BeNull($"'{companyName}' is a simple company name and should not be split");
|
|
}
|
|
|
|
[Fact]
|
|
public async Task ExtractCompanyNamesAsync_ShortParts_ReturnsNull()
|
|
{
|
|
// Arrange - Parts too short to be valid company names
|
|
var companyName = "A & B";
|
|
|
|
// Act
|
|
var result = await _sut.ExtractCompanyNamesAsync(companyName);
|
|
|
|
// Assert
|
|
result.Should().BeNull("parts are too short to be valid company names");
|
|
}
|
|
|
|
#endregion
|
|
}
|