feat: Add AI-powered compound company name splitting
Uses Claude Haiku to intelligently detect when a company name contains multiple companies (e.g., "ASDA/WALMART", "Corus & Laura Ashley Hotels") vs. a single company whose name follows a similar pattern (e.g., "Ernst & Young").

- Adds ExtractCompanyNamesAsync to ICompanyNameMatcherService
- Only triggers for names with potential separators (/, &, "and")
- Verifies each extracted part individually, returns first match
- Uses fast Haiku model to minimize cost

Results:
- ASDA/WALMART → verified via 'ASDA' → ASDA GROUP LIMITED
- Corus & Laura Ashley Hotels → verified via 'Corus' → Tata Steel UK
- Employers: 104/120 verified (86%)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -12,4 +12,13 @@ public interface ICompanyNameMatcherService
|
||||
string cvCompanyName,
|
||||
List<CompanyCandidate> candidates,
|
||||
CancellationToken cancellationToken = default);
|
||||
|
||||
/// <summary>
|
||||
/// Uses AI to detect whether a company name contains multiple companies and, if so, extracts them.
|
||||
/// Returns null or a single-item list if it's a single company (e.g., "Ernst &amp; Young").
|
||||
/// Returns multiple items if compound (e.g., "ASDA/WALMART" -&gt; ["ASDA", "WALMART"]).
|
||||
/// </summary>
/// <param name="companyName">The company name to analyze.</param>
/// <param name="cancellationToken">Token that can be used to cancel the operation.</param>
/// <returns>
/// The extracted company names when the name is compound; null or a single-item list otherwise.
/// </returns>
|
||||
Task<List<string>?> ExtractCompanyNamesAsync(
|
||||
string companyName,
|
||||
CancellationToken cancellationToken = default);
|
||||
}
|
||||
|
||||
@@ -207,4 +207,107 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
|
||||
return null; // Fall back to fuzzy matching
|
||||
}
|
||||
}
|
||||
|
||||
// Prompt template for the compound-name check. The literal token {COMPANY_NAME} is a
// placeholder that ExtractCompanyNamesAsync substitutes via string.Replace before sending.
// The prompt asks the model to answer with a JSON object carrying "isSingleCompany",
// "companies" and "reasoning"; the reply is deserialized into CompoundNameResponse.
private const string CompoundNamePrompt = """
|
||||
Analyze this company name from a CV and determine if it refers to ONE company or MULTIPLE companies.
|
||||

||||
Company name: "{COMPANY_NAME}"
|
||||

||||
Examples:
|
||||
- "Ernst & Young" → ONE company (it's the full name of the accounting firm)
|
||||
- "Marks & Spencer" → ONE company (it's the full name of the retailer)
|
||||
- "ASDA/WALMART" → TWO companies: ["ASDA", "WALMART"] (person worked at both or it's showing ownership)
|
||||
- "Corus & Laura Ashley Hotels" → TWO companies: ["Corus", "Laura Ashley Hotels"] (different industries)
|
||||
- "PwC" → ONE company
|
||||
- "Deloitte and Touche" → ONE company (historical name of Deloitte)
|
||||
- "BMW Group Ireland" → ONE company
|
||||
- "Tesco Stores and Distribution" → ONE company (departments of same company)
|
||||

||||
Rules:
|
||||
1. Well-known company names with "&" or "and" are SINGLE companies (Ernst & Young, Marks & Spencer, Procter & Gamble)
|
||||
2. A "/" usually indicates multiple companies or ownership relationship
|
||||
3. If the parts are in completely different industries, they're likely separate companies
|
||||
4. If one part is clearly a subsidiary/department of the other, treat as ONE company
|
||||

||||
Respond with ONLY valid JSON:
|
||||
{
|
||||
"isSingleCompany": boolean,
|
||||
"companies": ["company1", "company2"] or ["single company name"],
|
||||
"reasoning": "brief explanation"
|
||||
}
|
||||
""";
|
||||
|
||||
/// <summary>
/// Asks the AI model whether <paramref name="companyName"/> refers to one company or to
/// several, and returns the individual names when it is a compound.
/// </summary>
/// <param name="companyName">The company name to analyze.</param>
/// <param name="cancellationToken">Token used to cancel the AI request.</param>
/// <returns>
/// Two or more company names (trimmed, with blank and duplicate entries removed) when the
/// model reports a compound name. <c>null</c> for blank input, a single company, an empty or
/// unparseable response, or any failure while calling the model.
/// </returns>
/// <exception cref="OperationCanceledException">
/// Thrown when <paramref name="cancellationToken"/> is cancelled while the request is in flight.
/// </exception>
public async Task<List<string>?> ExtractCompanyNamesAsync(
    string companyName,
    CancellationToken cancellationToken = default)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return null;
    }

    _logger.LogDebug("Using AI to check if '{CompanyName}' is a compound name", companyName);

    try
    {
        var prompt = CompoundNamePrompt.Replace("{COMPANY_NAME}", companyName);

        var messages = new List<Message>
        {
            new(RoleType.User, prompt)
        };

        var parameters = new MessageParameters
        {
            Model = "claude-3-5-haiku-20241022",
            MaxTokens = 256,
            Messages = messages,
            System = [new SystemMessage("You are a company name parser. Respond only with valid JSON.")]
        };

        var response = await _anthropicClient.Messages.GetClaudeMessageAsync(parameters, cancellationToken);

        var responseText = response.Content
            .OfType<TextContent>()
            .FirstOrDefault()?.Text;

        if (string.IsNullOrWhiteSpace(responseText))
        {
            _logger.LogWarning("AI returned empty response for compound name check");
            return null;
        }

        responseText = JsonResponseHelper.CleanJsonResponse(responseText);

        var result = JsonSerializer.Deserialize<CompoundNameResponse>(responseText, JsonDefaults.CamelCase);

        if (result is null)
        {
            _logger.LogWarning("Failed to deserialize compound name response: {Response}", responseText);
            return null;
        }

        _logger.LogDebug("AI compound name result: IsSingle={IsSingle}, Companies=[{Companies}], Reasoning={Reasoning}",
            result.IsSingleCompany, string.Join(", ", result.Companies ?? []), result.Reasoning);

        if (result.IsSingleCompany || result.Companies is null)
        {
            return null; // Single company, no splitting needed
        }

        // The list comes straight from model output: drop null/blank entries and
        // case-insensitive duplicates so callers never dereference a null part or
        // verify the same name twice.
        var companies = result.Companies
            .Where(name => !string.IsNullOrWhiteSpace(name))
            .Select(name => name.Trim())
            .Distinct(StringComparer.OrdinalIgnoreCase)
            .ToList();

        if (companies.Count < 2)
        {
            return null; // Nothing left to split after cleaning
        }

        return companies;
    }
    catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
    {
        // The caller asked to stop: propagate instead of logging an error and
        // pretending the name is a single company. Cancellations that did not
        // originate from the caller's token (e.g. a client-side timeout) still fall
        // through to the best-effort handler below.
        throw;
    }
    catch (Exception ex)
    {
        _logger.LogError(ex, "AI compound name detection failed for '{CompanyName}'", companyName);
        return null;
    }
}
|
||||
|
||||
/// <summary>
/// Deserialization target for the JSON object returned by the compound-name prompt.
/// </summary>
private sealed class CompoundNameResponse
{
    /// <summary>Value of the "isSingleCompany" field of the response.</summary>
    public bool IsSingleCompany { get; set; }

    /// <summary>Value of the "companies" field of the response; null when absent.</summary>
    public List<string>? Companies { get; set; }

    /// <summary>Value of the "reasoning" field of the response; null when absent.</summary>
    public string? Reasoning { get; set; }
}
|
||||
}
|
||||
|
||||
@@ -226,6 +226,14 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
};
|
||||
}
|
||||
|
||||
// Check 4: Is this a compound company name (e.g., "ASDA/WALMART", "Corus & Laura Ashley Hotels")?
|
||||
// Try to verify each part individually
|
||||
var compoundResult = await TryVerifyCompoundNameAsync(normalizedName, companyName, startDate, endDate, jobTitle, flags);
|
||||
if (compoundResult is not null)
|
||||
{
|
||||
return compoundResult;
|
||||
}
|
||||
|
||||
// Try to find a cached match first (but only if it existed at claimed start date)
|
||||
var cachedMatch = await FindCachedMatchAsync(normalizedName);
|
||||
if (cachedMatch is not null)
|
||||
@@ -833,6 +841,70 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/// <summary>
/// Attempts to verify a compound company name (e.g., "ASDA/WALMART") by asking the AI matcher
/// to split it and then verifying each extracted part on its own.
/// Only triggers for names with potential separators (/, &amp;, "and") to avoid unnecessary AI calls.
/// </summary>
/// <param name="normalizedName">Normalized name; checked for separators and sent to the AI matcher.</param>
/// <param name="originalName">Name as originally claimed; used in logs and as <c>ClaimedCompany</c> on the result.</param>
/// <param name="startDate">Claimed start date, forwarded to <c>VerifyCompanyAsync</c> for each part.</param>
/// <param name="endDate">Claimed end date, forwarded to <c>VerifyCompanyAsync</c> for each part.</param>
/// <param name="jobTitle">Claimed job title, forwarded to <c>VerifyCompanyAsync</c> for each part.</param>
/// <param name="flags">Verification flags collected by the caller; not read or modified by this method.</param>
/// <returns>
/// The verification result of the first part that verifies, re-labelled with the original
/// claimed name; <c>null</c> when the name has no separator, the AI matcher does not report
/// a compound, or no part verifies.
/// </returns>
private async Task<CompanyVerificationResult?> TryVerifyCompoundNameAsync(
    string normalizedName,
    string originalName,
    DateOnly? startDate,
    DateOnly? endDate,
    string? jobTitle,
    List<CompanyVerificationFlag> flags)
{
    // Quick check: only process names that might be compound.
    // Look for separators that could indicate multiple companies.
    var hasPotentialSeparator = normalizedName.Contains('/')
        || normalizedName.Contains(" & ")
        || normalizedName.Contains(" and ", StringComparison.OrdinalIgnoreCase);

    if (!hasPotentialSeparator)
    {
        return null;
    }

    // Use AI to determine if this is a compound name and extract parts
    var extractedParts = await _aiMatcher.ExtractCompanyNamesAsync(normalizedName);

    if (extractedParts is null || extractedParts.Count < 2)
    {
        // AI determined this is a single company (e.g., "Ernst & Young")
        return null;
    }

    _logger.LogDebug("AI detected compound company name '{Name}', extracted parts: {Parts}",
        originalName, string.Join(", ", extractedParts.Select(p => $"'{p}'")));

    // Try to verify each extracted part - return success on first match
    foreach (var rawPart in extractedParts)
    {
        // Parts come from model output, so tolerate null entries and stray whitespace.
        var part = rawPart?.Trim();

        // Skip parts that are missing or too short
        if (part is null || part.Length < 3)
        {
            continue;
        }

        // Recursion guard: VerifyCompanyAsync calls back into this method, so a part that
        // is just the whole name again would be re-sent to the AI and re-split without end.
        if (part.Equals(normalizedName, StringComparison.OrdinalIgnoreCase)
            || part.Equals(originalName, StringComparison.OrdinalIgnoreCase))
        {
            _logger.LogDebug("Skipping compound part '{Part}' because it is identical to the full name", part);
            continue;
        }

        _logger.LogDebug("Trying to verify compound part: '{Part}'", part);

        // Recursively verify this part
        var partResult = await VerifyCompanyAsync(part, startDate, endDate, jobTitle);

        if (partResult.IsVerified)
        {
            _logger.LogInformation("Compound name '{Original}' verified via part '{Part}' -> {Match}",
                originalName, part, partResult.MatchedCompanyName);

            return partResult with
            {
                ClaimedCompany = originalName,
                VerificationNotes = $"Verified via '{part}': {partResult.VerificationNotes ?? partResult.MatchedCompanyName}"
            };
        }
    }

    // None of the parts could be verified
    _logger.LogDebug("No parts of compound name '{Name}' could be verified", originalName);
    return null;
}
|
||||
|
||||
private async Task<CompanyCache?> FindCachedMatchAsync(string companyName)
|
||||
{
|
||||
var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);
|
||||
|
||||
Reference in New Issue
Block a user