feat: Add AI-powered compound company name splitting
Uses Claude Haiku to intelligently detect when a company name contains multiple companies (e.g., "ASDA/WALMART", "Corus & Laura Ashley Hotels") vs single companies with similar patterns (e.g., "Ernst & Young").

- Adds ExtractCompanyNamesAsync to ICompanyNameMatcherService
- Only triggers for names with potential separators (/, &, "and")
- Verifies each extracted part individually, returns first match
- Uses fast Haiku model to minimize cost

Results:
- ASDA/WALMART → verified via 'ASDA' → ASDA GROUP LIMITED
- Corus & Laura Ashley Hotels → verified via 'Corus' → Tata Steel UK
- Employers: 104/120 verified (86%)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
329
tests/RealCV.Tests/Integration/CVBatchTester.cs
Normal file
329
tests/RealCV.Tests/Integration/CVBatchTester.cs
Normal file
@@ -0,0 +1,329 @@
|
||||
using System.Text.Json;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using RealCV.Application.Interfaces;
|
||||
using RealCV.Application.Models;
|
||||
using RealCV.Infrastructure.Data;
|
||||
using RealCV.Infrastructure.ExternalApis;
|
||||
using RealCV.Infrastructure.Services;
|
||||
using RealCV.Infrastructure.Configuration;
|
||||
|
||||
namespace RealCV.Tests.Integration;
|
||||
|
||||
/// <summary>
/// Test utility to batch process CVs and output verification findings.
/// Run with: dotnet test --filter "FullyQualifiedName~CVBatchTester" -- TestRunParameters.Parameter(name=\"CvFolder\", value=\"/path/to/cvs\")
/// Or use the <see cref="ProcessFolderAsync"/> method directly.
/// </summary>
/// <remarks>
/// NOTE(review): this class declares no test methods of its own, so the <c>dotnet test</c> filter above
/// only does something if a test elsewhere drives this utility — confirm.
/// <para>
/// The database connection string must be supplied through configuration
/// (<c>ConnectionStrings:DefaultConnection</c> in appsettings.json / appsettings.Development.json, or the
/// <c>ConnectionStrings__DefaultConnection</c> environment variable). No credential is embedded in source.
/// </para>
/// <para>
/// The instance owns the dependency-injection container it builds; dispose it when finished so that
/// container-owned services are released.
/// </para>
/// </remarks>
public class CVBatchTester : IDisposable
{
    /// <summary>Maximum number of distinct unverified names listed in the overall summary.</summary>
    private const int MaxUnverifiedListed = 20;

    /// <summary>File extensions (compared case-insensitively) that are treated as CV documents.</summary>
    private static readonly HashSet<string> SupportedExtensions =
        new(StringComparer.OrdinalIgnoreCase) { ".pdf", ".docx", ".doc" };

    // Held as the concrete ServiceProvider (not IServiceProvider) so that it can be disposed.
    private readonly ServiceProvider _serviceProvider;

    /// <summary>
    /// Builds the service container used to parse and verify CVs.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// No <c>DefaultConnection</c> connection string is configured.
    /// </exception>
    public CVBatchTester()
    {
        var services = new ServiceCollection();
        ConfigureServices(services);
        _serviceProvider = services.BuildServiceProvider();
    }

    /// <summary>
    /// Disposes the service container built by the constructor, together with the services it owns.
    /// </summary>
    public void Dispose()
    {
        _serviceProvider.Dispose();
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Registers configuration, logging, the database context factory, the Companies House and Anthropic
    /// HTTP clients, and the parser/verifier services.
    /// </summary>
    /// <param name="services">Collection that receives the registrations.</param>
    /// <exception cref="InvalidOperationException">
    /// No <c>DefaultConnection</c> connection string is configured.
    /// </exception>
    private static void ConfigureServices(IServiceCollection services)
    {
        // Load configuration; later sources override earlier ones, so environment variables win.
        var configuration = new ConfigurationBuilder()
            .SetBasePath(Directory.GetCurrentDirectory())
            .AddJsonFile("appsettings.json", optional: true)
            .AddJsonFile("appsettings.Development.json", optional: true)
            .AddEnvironmentVariables()
            .Build();

        // Logging
        services.AddLogging(builder =>
        {
            builder.AddConsole();
            builder.SetMinimumLevel(LogLevel.Information);
        });

        // Database: the connection string carries credentials, so it must come from configuration
        // rather than from a literal committed to the repository.
        var connectionString = configuration.GetConnectionString("DefaultConnection");
        if (string.IsNullOrWhiteSpace(connectionString))
        {
            throw new InvalidOperationException(
                "No database connection string configured. Set ConnectionStrings:DefaultConnection in " +
                "appsettings.json or appsettings.Development.json, or set the " +
                "ConnectionStrings__DefaultConnection environment variable.");
        }

        services.AddDbContextFactory<ApplicationDbContext>(options =>
            options.UseSqlServer(connectionString));

        // Companies House
        services.Configure<CompaniesHouseSettings>(options =>
        {
            options.BaseUrl = configuration["CompaniesHouse:BaseUrl"] ?? "https://api.company-information.service.gov.uk";
            options.ApiKey = configuration["CompaniesHouse:ApiKey"] ?? "";
        });

        services.AddHttpClient<CompaniesHouseClient>();

        // Anthropic (for AI matching)
        services.Configure<AnthropicSettings>(options =>
        {
            options.ApiKey = configuration["Anthropic:ApiKey"] ?? "";
        });

        services.AddHttpClient<AnthropicClient>();
        services.AddScoped<ICompanyNameMatcherService, CompanyNameMatcherService>();

        // Services
        services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
        services.AddScoped<IEducationVerifierService, EducationVerifierService>();
        services.AddScoped<ICVParserService, CVParserService>();
    }

    /// <summary>
    /// Process all CVs in a folder and return verification results.
    /// </summary>
    /// <param name="folderPath">
    /// Folder to scan. Only its top level is searched, for .pdf, .docx and .doc files.
    /// </param>
    /// <returns>
    /// One summary per CV file, in file-name order. A file whose processing threw is still represented,
    /// with <see cref="CVVerificationSummary.Error"/> set to the exception message.
    /// </returns>
    /// <exception cref="DirectoryNotFoundException"><paramref name="folderPath"/> does not exist.</exception>
    public async Task<List<CVVerificationSummary>> ProcessFolderAsync(string folderPath)
    {
        if (!Directory.Exists(folderPath))
        {
            throw new DirectoryNotFoundException($"Folder not found: {folderPath}");
        }

        // Directory.GetFiles makes no ordering promise; sort so repeated runs produce comparable output.
        var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
            .Where(f => SupportedExtensions.Contains(Path.GetExtension(f)))
            .OrderBy(f => f, StringComparer.OrdinalIgnoreCase)
            .ToList();

        Console.WriteLine($"Found {cvFiles.Count} CV files in {folderPath}");
        Console.WriteLine(new string('=', 80));

        var results = new List<CVVerificationSummary>();

        foreach (var cvFile in cvFiles)
        {
            Console.WriteLine($"\nProcessing: {Path.GetFileName(cvFile)}");
            Console.WriteLine(new string('-', 60));

            try
            {
                var result = await ProcessSingleCVAsync(cvFile);
                results.Add(result);
                PrintSummary(result);
            }
            catch (Exception ex)
            {
                // Best effort: one unreadable CV must not abort the batch. Record the failure and continue.
                Console.WriteLine($"ERROR: {ex.Message}");
                results.Add(new CVVerificationSummary
                {
                    FileName = Path.GetFileName(cvFile),
                    Error = ex.Message
                });
            }
        }

        // Print overall summary
        Console.WriteLine("\n" + new string('=', 80));
        Console.WriteLine("OVERALL SUMMARY");
        Console.WriteLine(new string('=', 80));
        PrintOverallSummary(results);

        return results;
    }

    /// <summary>
    /// Parses one CV file and verifies every employment and education entry found in it.
    /// </summary>
    /// <param name="filePath">Path of the CV file to process.</param>
    /// <returns>The verification summary for the file.</returns>
    private async Task<CVVerificationSummary> ProcessSingleCVAsync(string filePath)
    {
        // A fresh scope per CV gives each file its own instances of the scoped services.
        using var scope = _serviceProvider.CreateScope();
        var cvParser = scope.ServiceProvider.GetRequiredService<ICVParserService>();
        var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
        var educationVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();

        var fileName = Path.GetFileName(filePath);

        // Parse the CV
        await using var fileStream = File.OpenRead(filePath);
        var parsedCV = await cvParser.ParseAsync(fileStream, fileName);

        var summary = new CVVerificationSummary
        {
            FileName = fileName,
            CandidateName = parsedCV.PersonalInfo?.FullName ?? "Unknown"
        };

        // Verify employers
        if (parsedCV.Employment?.Count > 0)
        {
            foreach (var employment in parsedCV.Employment)
            {
                try
                {
                    var result = await companyVerifier.VerifyCompanyAsync(
                        employment.CompanyName,
                        employment.StartDate,
                        employment.EndDate,
                        employment.JobTitle);

                    summary.EmployerResults.Add(new EmployerVerificationSummary
                    {
                        ClaimedName = employment.CompanyName,
                        MatchedName = result.MatchedCompanyName,
                        CompanyNumber = result.MatchedCompanyNumber,
                        IsVerified = result.IsVerified,
                        MatchScore = result.MatchScore,
                        Notes = result.VerificationNotes,
                        Status = result.CompanyStatus
                    });
                }
                catch (Exception ex)
                {
                    // A failed lookup is reported as an unverified employer so the remaining entries still run.
                    summary.EmployerResults.Add(new EmployerVerificationSummary
                    {
                        ClaimedName = employment.CompanyName,
                        IsVerified = false,
                        Notes = $"Error: {ex.Message}"
                    });
                }
            }
        }

        // Verify education
        if (parsedCV.Education?.Count > 0)
        {
            var educationResults = educationVerifier.VerifyAll(
                parsedCV.Education.Select(e => new EducationEntry
                {
                    Institution = e.Institution,
                    Qualification = e.Qualification,
                    Subject = e.Subject,
                    StartDate = e.StartDate,
                    EndDate = e.EndDate
                }).ToList());

            foreach (var result in educationResults)
            {
                summary.EducationResults.Add(new EducationVerificationSummary
                {
                    ClaimedInstitution = result.ClaimedInstitution,
                    MatchedInstitution = result.MatchedInstitution,
                    Qualification = result.ClaimedQualification,
                    IsVerified = result.IsVerified,
                    Status = result.Status,
                    Notes = result.VerificationNotes
                });
            }
        }

        return summary;
    }

    /// <summary>
    /// Writes the per-CV report (candidate, employers, education) to the console.
    /// </summary>
    /// <param name="summary">Summary of the CV to print.</param>
    private static void PrintSummary(CVVerificationSummary summary)
    {
        Console.WriteLine($"Candidate: {summary.CandidateName}");

        Console.WriteLine($"\n EMPLOYERS ({summary.EmployerResults.Count}):");
        foreach (var emp in summary.EmployerResults)
        {
            var status = emp.IsVerified ? "✓" : "✗";
            var matchInfo = emp.IsVerified
                ? $"-> {emp.MatchedName} ({emp.MatchScore}%)"
                : emp.Notes ?? "Not found";
            Console.WriteLine($" {status} {emp.ClaimedName}");
            Console.WriteLine($" {matchInfo}");
        }

        Console.WriteLine($"\n EDUCATION ({summary.EducationResults.Count}):");
        foreach (var edu in summary.EducationResults)
        {
            var status = edu.IsVerified ? "✓" : "✗";
            var matchInfo = edu.IsVerified && edu.MatchedInstitution is not null
                ? $"-> {edu.MatchedInstitution}"
                : edu.Notes ?? edu.Status;
            Console.WriteLine($" {status} {edu.ClaimedInstitution}");
            Console.WriteLine($" {edu.Qualification}");
            Console.WriteLine($" {matchInfo}");
        }
    }

    /// <summary>
    /// Writes batch-wide totals to the console, followed by the most frequent unverified employers
    /// and institutions.
    /// </summary>
    /// <param name="results">Summaries of every CV in the batch, including failed ones.</param>
    private static void PrintOverallSummary(List<CVVerificationSummary> results)
    {
        var successfulCVs = results.Count(r => r.Error == null);
        var totalEmployers = results.Sum(r => r.EmployerResults.Count);
        var verifiedEmployers = results.Sum(r => r.EmployerResults.Count(e => e.IsVerified));
        var totalEducation = results.Sum(r => r.EducationResults.Count);
        var verifiedEducation = results.Sum(r => r.EducationResults.Count(e => e.IsVerified));

        Console.WriteLine($"CVs Processed: {successfulCVs}/{results.Count}");
        Console.WriteLine($"Employers: {verifiedEmployers}/{totalEmployers} verified ({Percentage(verifiedEmployers, totalEmployers)}%)");
        Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({Percentage(verifiedEducation, totalEducation)}%)");

        // List unverified employers
        PrintUnverified(
            "UNVERIFIED EMPLOYERS",
            results.SelectMany(r => r.EmployerResults.Where(e => !e.IsVerified)).Select(e => e.ClaimedName));

        // List unverified institutions
        PrintUnverified(
            "UNVERIFIED INSTITUTIONS",
            results.SelectMany(r => r.EducationResults.Where(e => !e.IsVerified)).Select(e => e.ClaimedInstitution));
    }

    /// <summary>
    /// Returns <paramref name="part"/> as a whole-number percentage of <paramref name="total"/>
    /// (integer division, so the result is truncated), or 0 when <paramref name="total"/> is not positive.
    /// </summary>
    private static int Percentage(int part, int total) => total > 0 ? part * 100 / total : 0;

    /// <summary>
    /// Prints the distinct names in <paramref name="claimedNames"/>, most frequent first, capped at
    /// <see cref="MaxUnverifiedListed"/> entries. Prints nothing when the sequence is empty.
    /// </summary>
    /// <param name="heading">Heading text printed before the "(N unique)" count.</param>
    /// <param name="claimedNames">One entry per unverified item; duplicates are counted.</param>
    private static void PrintUnverified(string heading, IEnumerable<string> claimedNames)
    {
        var groups = claimedNames
            .GroupBy(name => name)
            .OrderByDescending(g => g.Count())
            .ToList();

        if (groups.Count == 0)
        {
            return;
        }

        Console.WriteLine($"\n{heading} ({groups.Count} unique):");
        foreach (var group in groups.Take(MaxUnverifiedListed))
        {
            Console.WriteLine($" - {group.Key} (x{group.Count()})");
        }
    }

    /// <summary>
    /// Export results to JSON for further analysis.
    /// </summary>
    /// <param name="results">Summaries to serialize.</param>
    /// <param name="outputPath">Path of the file to write; its content is replaced with indented JSON.</param>
    public static void ExportToJson(List<CVVerificationSummary> results, string outputPath)
    {
        var json = JsonSerializer.Serialize(results, new JsonSerializerOptions
        {
            WriteIndented = true
        });
        File.WriteAllText(outputPath, json);
        Console.WriteLine($"\nResults exported to: {outputPath}");
    }
}
|
||||
|
||||
/// <summary>
/// Verification outcome for a single CV file: who the candidate is, what was checked,
/// and whether processing failed.
/// </summary>
public class CVVerificationSummary
{
    /// <summary>Name of the CV file, without its directory.</summary>
    public string FileName { get; set; } = string.Empty;

    /// <summary>Candidate's full name as parsed from the CV.</summary>
    public string CandidateName { get; set; } = string.Empty;

    /// <summary>
    /// Message of the exception that stopped this CV from being processed;
    /// <c>null</c> when processing completed.
    /// </summary>
    public string? Error { get; set; }

    /// <summary>One entry per employer verified for this CV.</summary>
    public List<EmployerVerificationSummary> EmployerResults { get; set; } = new List<EmployerVerificationSummary>();

    /// <summary>One entry per education record verified for this CV.</summary>
    public List<EducationVerificationSummary> EducationResults { get; set; } = new List<EducationVerificationSummary>();
}
|
||||
|
||||
/// <summary>
/// Flattened result of verifying one employer claimed on a CV.
/// </summary>
public class EmployerVerificationSummary
{
    /// <summary>Company name as claimed on the CV.</summary>
    public string ClaimedName { get; set; } = string.Empty;

    /// <summary>Name of the company the claim was matched to, if any.</summary>
    public string? MatchedName { get; set; }

    /// <summary>Company number of the matched company, if any.</summary>
    public string? CompanyNumber { get; set; }

    /// <summary>Whether the employer was verified.</summary>
    public bool IsVerified { get; set; }

    /// <summary>Match score reported by the verifier; shown as a percentage in console output.</summary>
    public int MatchScore { get; set; }

    /// <summary>Verifier notes, or an "Error: ..." message when verification threw.</summary>
    public string? Notes { get; set; }

    /// <summary>Status of the matched company as reported by the verifier.</summary>
    public string? Status { get; set; }
}
|
||||
|
||||
/// <summary>
/// Flattened result of verifying one education entry claimed on a CV.
/// </summary>
public class EducationVerificationSummary
{
    /// <summary>Institution name as claimed on the CV.</summary>
    public string ClaimedInstitution { get; set; } = string.Empty;

    /// <summary>Name of the institution the claim was matched to, if any.</summary>
    public string? MatchedInstitution { get; set; }

    /// <summary>Qualification as claimed on the CV.</summary>
    public string? Qualification { get; set; }

    /// <summary>Whether the education entry was verified.</summary>
    public bool IsVerified { get; set; }

    /// <summary>Status reported by the education verifier; used as fallback text when there are no notes.</summary>
    public string? Status { get; set; }

    /// <summary>Verifier notes for this entry.</summary>
    public string? Notes { get; set; }
}
|
||||
Reference in New Issue
Block a user