feat: Add AI-powered compound company name splitting

Uses Claude Haiku to intelligently detect when a company name contains
multiple companies (e.g., "ASDA/WALMART", "Corus & Laura Ashley Hotels")
vs single companies with similar patterns (e.g., "Ernst & Young").

- Adds ExtractCompanyNamesAsync to ICompanyNameMatcherService
- Only triggers for names with potential separators (/, &, "and")
- Verifies each extracted part individually, returns first match
- Uses fast Haiku model to minimize cost

Results:
- ASDA/WALMART → verified via 'ASDA' → ASDA GROUP LIMITED
- Corus & Laura Ashley Hotels → verified via 'Corus' → Tata Steel UK
- Employers: 104/120 verified (86%)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-22 11:04:30 +00:00
parent 27921d625f
commit 94ca6e1b9a
7 changed files with 1025 additions and 0 deletions

View File

@@ -0,0 +1,15 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings: a console executable targeting .NET 8, with implicit global usings and nullable reference types enabled. -->
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<!-- Project references to the RealCV Application and Infrastructure projects, resolved relative to this project file's folder. -->
<ItemGroup>
<ProjectReference Include="../../src/RealCV.Application/RealCV.Application.csproj" />
<ProjectReference Include="../../src/RealCV.Infrastructure/RealCV.Infrastructure.csproj" />
</ItemGroup>
</Project>

View File

@@ -0,0 +1,302 @@
using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Configuration;
using RealCV.Infrastructure.Data;
using RealCV.Infrastructure.ExternalApis;
using RealCV.Infrastructure.Services;
namespace CVBatchTester;
class Program
{
/// <summary>
/// Entry point of the batch tester: parses every CV file (.pdf, .docx, .doc) in a folder,
/// verifies each employer and education entry, and prints per-file results followed by an
/// overall summary.
/// </summary>
/// <param name="args">Command-line arguments; the first one, when present, is the CV folder path.
/// When absent the path is requested interactively.</param>
/// <returns>
/// <c>1</c> when the folder does not exist or contains no CV files; otherwise <c>0</c>
/// (including when individual files failed to process — those are reported in the summary).
/// </returns>
static async Task<int> Main(string[] args)
{
    var folderPath = args.FirstOrDefault() ?? AskForFolder();
    if (string.IsNullOrEmpty(folderPath) || !Directory.Exists(folderPath))
    {
        Console.WriteLine($"Error: Folder not found: {folderPath}");
        Console.WriteLine("Usage: CVBatchTester <folder-path>");
        Console.WriteLine(" e.g. CVBatchTester /home/user/cvs");
        return 1;
    }

    var heavyRule = new string('=', 80);
    var lightRule = new string('-', 60);

    Console.WriteLine("CV Batch Verification Tester");
    Console.WriteLine($"Processing CVs from: {folderPath}");
    Console.WriteLine(heavyRule);

    // Setup DI. The provider is disposed when Main returns so that services it owns are
    // released and buffered console-logger output is flushed before the process exits.
    var services = new ServiceCollection();
    ConfigureServices(services);
    await using var provider = services.BuildServiceProvider();

    // Find CV files (top-level only), in a stable alphabetical order.
    var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
        .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
                    f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase) ||
                    f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
        .OrderBy(f => f)
        .ToList();

    Console.WriteLine($"Found {cvFiles.Count} CV files\n");
    if (cvFiles.Count == 0)
    {
        Console.WriteLine("No CV files found (.pdf, .docx, .doc)");
        return 1;
    }

    // Running totals across all CVs.
    var allUnverifiedEmployers = new List<string>();
    var allUnverifiedInstitutions = new List<string>();
    var totalEmployers = 0;
    var verifiedEmployers = 0;
    var totalEducation = 0;
    var verifiedEducation = 0;
    var processedCount = 0;
    var errorCount = 0;

    foreach (var cvFile in cvFiles)
    {
        Console.WriteLine($"\n{heavyRule}");
        Console.WriteLine($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
        Console.WriteLine(heavyRule);

        try
        {
            // One DI scope per CV so scoped services do not leak state between files.
            using var scope = provider.CreateScope();
            var parser = scope.ServiceProvider.GetRequiredService<ICVParserService>();
            var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
            var eduVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();

            // Parse CV
            await using var stream = File.OpenRead(cvFile);
            var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
            Console.WriteLine($"Candidate: {cv.FullName}");

            // Verify Employers
            if (cv.Employment?.Count > 0)
            {
                Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
                Console.WriteLine(lightRule);

                foreach (var emp in cv.Employment)
                {
                    totalEmployers++;
                    try
                    {
                        var result = await companyVerifier.VerifyCompanyAsync(
                            emp.CompanyName,
                            emp.StartDate,
                            emp.EndDate,
                            emp.JobTitle);

                        var icon = result.IsVerified ? "✓" : "✗";
                        var period = FormatPeriod(emp.StartDate, emp.EndDate);
                        Console.WriteLine($"\n {icon} {emp.CompanyName}");
                        Console.WriteLine($" Period: {period}");
                        Console.WriteLine($" Role: {emp.JobTitle}");

                        if (result.IsVerified)
                        {
                            verifiedEmployers++;
                            Console.WriteLine($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
                            if (!string.IsNullOrEmpty(result.MatchedCompanyNumber))
                            {
                                Console.WriteLine($" Company #: {result.MatchedCompanyNumber}");
                            }
                            if (!string.IsNullOrEmpty(result.CompanyStatus))
                            {
                                Console.WriteLine($" Status: {result.CompanyStatus}");
                            }
                        }
                        else
                        {
                            allUnverifiedEmployers.Add(emp.CompanyName);
                        }

                        if (!string.IsNullOrEmpty(result.VerificationNotes))
                        {
                            Console.WriteLine($" Note: {result.VerificationNotes}");
                        }
                    }
                    catch (Exception ex)
                    {
                        // A failed lookup for one employer must not abort the rest of the CV;
                        // it is reported and counted as unverified.
                        Console.WriteLine($"\n ✗ {emp.CompanyName}");
                        Console.WriteLine($" ERROR: {ex.Message}");
                        allUnverifiedEmployers.Add(emp.CompanyName);
                    }
                }
            }

            // Verify Education
            if (cv.Education?.Count > 0)
            {
                Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");
                Console.WriteLine(lightRule);

                var eduEntries = cv.Education.Select(e => new EducationEntry
                {
                    Institution = e.Institution,
                    Qualification = e.Qualification,
                    Subject = e.Subject,
                    StartDate = e.StartDate,
                    EndDate = e.EndDate
                }).ToList();

                var eduResults = eduVerifier.VerifyAll(eduEntries);
                foreach (var result in eduResults)
                {
                    totalEducation++;
                    var icon = result.IsVerified ? "✓" : "✗";
                    Console.WriteLine($"\n {icon} {result.ClaimedInstitution}");
                    Console.WriteLine($" Qualification: {result.ClaimedQualification}");
                    if (!string.IsNullOrEmpty(result.ClaimedSubject))
                    {
                        Console.WriteLine($" Subject: {result.ClaimedSubject}");
                    }

                    if (result.IsVerified)
                    {
                        verifiedEducation++;
                        // Only show the matched name when it differs from what the CV claimed.
                        if (result.MatchedInstitution != null &&
                            !result.MatchedInstitution.Equals(result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase))
                        {
                            Console.WriteLine($" Match: {result.MatchedInstitution}");
                        }
                    }
                    else
                    {
                        allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
                        Console.WriteLine($" Status: {result.Status}");
                    }

                    if (!string.IsNullOrEmpty(result.VerificationNotes))
                    {
                        Console.WriteLine($" Note: {result.VerificationNotes}");
                    }
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort batch run: report the failing file and continue with the next one.
            errorCount++;
            Console.WriteLine($"ERROR processing file: {ex.Message}");
        }
    }

    // Print Summary
    Console.WriteLine($"\n\n{heavyRule}");
    Console.WriteLine("VERIFICATION SUMMARY");
    Console.WriteLine(heavyRule);
    Console.WriteLine($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}");
    if (errorCount > 0)
    {
        Console.WriteLine($"Errors: {errorCount}");
    }

    // Integer percentages (truncated), guarded against division by zero.
    var empRate = totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0;
    var eduRate = totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0;
    Console.WriteLine($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)");
    Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)");

    PrintUnverified("UNVERIFIED EMPLOYERS", allUnverifiedEmployers);
    PrintUnverified("UNVERIFIED INSTITUTIONS", allUnverifiedInstitutions);

    Console.WriteLine($"\n{heavyRule}");
    return 0;
}

/// <summary>
/// Prints the distinct (case-insensitive) names under a heading, most frequent first and then
/// alphabetically, with an occurrence count for names seen more than once. Prints nothing when
/// the sequence is empty.
/// </summary>
/// <param name="heading">Section title, e.g. "UNVERIFIED EMPLOYERS".</param>
/// <param name="names">All unverified names collected during the run, duplicates included.</param>
static void PrintUnverified(string heading, IEnumerable<string> names)
{
    var groups = names
        .GroupBy(n => n, StringComparer.OrdinalIgnoreCase)
        .Select(g => (Name: g.Key, Count: g.Count()))
        .OrderByDescending(g => g.Count)
        .ThenBy(g => g.Name)
        .ToList();

    if (groups.Count == 0)
    {
        return;
    }

    Console.WriteLine($"\n{new string('-', 60)}");
    Console.WriteLine($"{heading} ({groups.Count} unique):");
    foreach (var (name, occurrences) in groups)
    {
        var suffix = occurrences > 1 ? $" (x{occurrences})" : "";
        Console.WriteLine($" - {name}{suffix}");
    }
}
/// <summary>
/// Prompts on the console for the CV folder path and reads one line of input.
/// </summary>
/// <returns>The line entered, or an empty string when standard input has no more lines.</returns>
static string AskForFolder()
{
    Console.Write("Enter CV folder path: ");
    var typed = Console.ReadLine();
    return typed is null ? "" : typed;
}
/// <summary>
/// Renders an employment period as "start - end" using abbreviated month and four-digit year.
/// </summary>
/// <param name="start">Start date; rendered as "?" when missing.</param>
/// <param name="end">End date; rendered as "Present" when missing.</param>
/// <returns>The formatted period, e.g. "Jan 2020 - Present".</returns>
static string FormatPeriod(DateOnly? start, DateOnly? end)
{
    const string monthYear = "MMM yyyy";
    var from = start.HasValue ? start.Value.ToString(monthYear) : "?";
    var until = end.HasValue ? end.Value.ToString(monthYear) : "Present";
    return from + " - " + until;
}
/// <summary>
/// Registers everything the batch tester resolves from DI: configuration-bound settings,
/// console logging (warnings and above), the SQL Server <c>ApplicationDbContext</c> factory,
/// the Companies House HTTP client, and the CV parsing / verification services.
/// </summary>
/// <param name="services">The service collection to populate.</param>
/// <exception cref="InvalidOperationException">
/// No database connection string is available from the appsettings files or from the
/// <c>ConnectionStrings__DefaultConnection</c> environment variable.
/// </exception>
static void ConfigureServices(IServiceCollection services)
{
    // Load configuration - try multiple locations, first existing folder wins.
    var configPaths = new[]
    {
        "/var/www/realcv",
        "/git/RealCV/src/RealCV.Web",
        Path.GetFullPath(Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "..", "src", "RealCV.Web"))
    };

    var webProjectPath = configPaths.FirstOrDefault(Directory.Exists);
    IConfigurationBuilder configurationBuilder = new ConfigurationBuilder();
    if (webProjectPath is null)
    {
        // SetBasePath throws DirectoryNotFoundException for a missing folder, so falling back
        // to one of the paths that was just found not to exist can only crash. Continue with
        // an empty configuration instead; required values are validated below.
        Console.WriteLine($"Warning: no configuration folder found (tried: {string.Join(", ", configPaths)})");
    }
    else
    {
        Console.WriteLine($"Loading config from: {webProjectPath}");
        configurationBuilder = configurationBuilder
            .SetBasePath(webProjectPath)
            .AddJsonFile("appsettings.json", optional: true)
            .AddJsonFile("appsettings.Development.json", optional: true)
            .AddJsonFile("appsettings.Production.json", optional: true);
    }

    var configuration = configurationBuilder.Build();

    // Logging - minimal output
    services.AddLogging(builder =>
    {
        builder.AddConsole();
        builder.SetMinimumLevel(LogLevel.Warning);
    });

    // Database. Credentials must come from configuration or the environment, never from
    // source code.
    // NOTE(review): an SA password was previously embedded here as a fallback; it remains in
    // version history and should be rotated.
    var configuredConnection = configuration.GetConnectionString("DefaultConnection");
    var environmentConnection = Environment.GetEnvironmentVariable("ConnectionStrings__DefaultConnection");
    string connectionString =
        !string.IsNullOrWhiteSpace(configuredConnection) ? configuredConnection
        : !string.IsNullOrWhiteSpace(environmentConnection) ? environmentConnection
        : throw new InvalidOperationException(
            "No database connection string configured. Set ConnectionStrings:DefaultConnection in appsettings " +
            "or the ConnectionStrings__DefaultConnection environment variable.");

    services.AddDbContextFactory<ApplicationDbContext>(options =>
        options.UseSqlServer(connectionString));

    // Companies House - use configuration binding
    services.Configure<CompaniesHouseSettings>(configuration.GetSection(CompaniesHouseSettings.SectionName));
    services.AddHttpClient<CompaniesHouseClient>();

    // Anthropic - use configuration binding
    services.Configure<AnthropicSettings>(configuration.GetSection(AnthropicSettings.SectionName));
    services.AddScoped<ICompanyNameMatcherService, AICompanyNameMatcherService>();

    // Services
    services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
    services.AddScoped<IEducationVerifierService, EducationVerifierService>();
    services.AddScoped<ICVParserService, CVParserService>();
}
}

195
tools/batch-test-cvs.cs Normal file
View File

@@ -0,0 +1,195 @@
#!/usr/bin/env dotnet-script
#r "nuget: Microsoft.EntityFrameworkCore.SqlServer, 8.0.0"
#r "nuget: Microsoft.Extensions.Configuration.Json, 8.0.0"
#r "nuget: Microsoft.Extensions.DependencyInjection, 8.0.0"
#r "nuget: Microsoft.Extensions.Logging.Console, 8.0.0"
#r "../src/RealCV.Application/bin/Debug/net8.0/RealCV.Application.dll"
#r "../src/RealCV.Infrastructure/bin/Debug/net8.0/RealCV.Infrastructure.dll"
#r "../src/RealCV.Domain/bin/Debug/net8.0/RealCV.Domain.dll"
// This is a dotnet-script file. Run with: dotnet script batch-test-cvs.cs -- /path/to/cvs
// Install dotnet-script: dotnet tool install -g dotnet-script
using System;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using System.Collections.Generic;
using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Data;
using RealCV.Infrastructure.Services;
using RealCV.Infrastructure.ExternalApis;
using RealCV.Infrastructure.Configuration;
// Batch-verifies every .pdf / .docx CV in a folder: parses each file, checks employers and
// education entries, prints per-file results and a final summary.
// Returns 1 when the folder does not exist, otherwise 0.
//
// NOTE(review): the compiled CVBatchTester in this change reads cv.FullName and registers
// AICompanyNameMatcherService, while this script uses cv.PersonalInfo?.FullName,
// CompanyNameMatcherService and AnthropicClient — confirm which API shape is current.
var folderPath = Args.FirstOrDefault() ?? "/tmp/test-cvs";
if (!Directory.Exists(folderPath))
{
    Console.WriteLine($"Error: Folder not found: {folderPath}");
    Console.WriteLine("Usage: dotnet script batch-test-cvs.cs -- /path/to/cvs");
    return 1;
}

// An 80-character rule. (An interpolation such as {'=',-80} is an alignment specifier: it
// prints a single '=' padded with spaces, not a line of '=' characters.)
var separator = new string('=', 80);

Console.WriteLine($"Processing CVs from: {folderPath}");
Console.WriteLine(separator);

// Setup DI
var services = new ServiceCollection();
var configuration = new ConfigurationBuilder()
    .SetBasePath(Path.Combine(Directory.GetCurrentDirectory(), "../src/RealCV.Web"))
    .AddJsonFile("appsettings.json", optional: true)
    .AddJsonFile("appsettings.Development.json", optional: true)
    .Build();

services.AddLogging(b => b.AddConsole().SetMinimumLevel(LogLevel.Warning));
services.AddDbContextFactory<ApplicationDbContext>(options =>
    options.UseSqlServer(configuration.GetConnectionString("DefaultConnection")));
services.Configure<CompaniesHouseSettings>(configuration.GetSection("CompaniesHouse"));
services.Configure<AnthropicSettings>(configuration.GetSection("Anthropic"));
services.AddHttpClient<CompaniesHouseClient>();
services.AddHttpClient<AnthropicClient>();
services.AddScoped<ICompanyNameMatcherService, CompanyNameMatcherService>();
services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
services.AddScoped<IEducationVerifierService, EducationVerifierService>();
services.AddScoped<ICVParserService, CVParserService>();
var provider = services.BuildServiceProvider();

var cvFiles = Directory.GetFiles(folderPath, "*.*")
    .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
                f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase))
    .ToList();
Console.WriteLine($"Found {cvFiles.Count} CV files\n");

// Running totals across all CVs.
var allUnverifiedEmployers = new List<string>();
var allUnverifiedInstitutions = new List<string>();
var totalEmployers = 0;
var verifiedEmployers = 0;
var totalEducation = 0;
var verifiedEducation = 0;

foreach (var cvFile in cvFiles)
{
    Console.WriteLine($"\n{separator}");
    Console.WriteLine($"FILE: {Path.GetFileName(cvFile)}");
    Console.WriteLine(separator);

    try
    {
        // One DI scope per CV.
        using var scope = provider.CreateScope();
        var parser = scope.ServiceProvider.GetRequiredService<ICVParserService>();
        var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
        var eduVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();

        await using var stream = File.OpenRead(cvFile);
        var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
        Console.WriteLine($"Candidate: {cv.PersonalInfo?.FullName ?? "Unknown"}");

        // Employers
        if (cv.Employment?.Count > 0)
        {
            Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
            foreach (var emp in cv.Employment)
            {
                totalEmployers++;
                try
                {
                    var result = await companyVerifier.VerifyCompanyAsync(
                        emp.CompanyName, emp.StartDate, emp.EndDate, emp.JobTitle);
                    var icon = result.IsVerified ? "✓" : "✗";
                    Console.WriteLine($" {icon} {emp.CompanyName}");
                    if (result.IsVerified)
                    {
                        verifiedEmployers++;
                        Console.WriteLine($" → {result.MatchedCompanyName} ({result.MatchScore}%)");
                        if (!string.IsNullOrEmpty(result.VerificationNotes))
                        {
                            Console.WriteLine($" Note: {result.VerificationNotes}");
                        }
                    }
                    else
                    {
                        allUnverifiedEmployers.Add(emp.CompanyName);
                        Console.WriteLine($" Note: {result.VerificationNotes ?? "Not found"}");
                    }
                }
                catch (Exception ex)
                {
                    // A failed lookup for one employer must not abort the rest of the CV;
                    // report it and count the employer as unverified.
                    Console.WriteLine($" ✗ {emp.CompanyName}");
                    Console.WriteLine($" ERROR: {ex.Message}");
                    allUnverifiedEmployers.Add(emp.CompanyName);
                }
            }
        }

        // Education
        if (cv.Education?.Count > 0)
        {
            Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");
            var eduEntries = cv.Education.Select(e => new EducationEntry
            {
                Institution = e.Institution,
                Qualification = e.Qualification,
                Subject = e.Subject,
                StartDate = e.StartDate,
                EndDate = e.EndDate
            }).ToList();

            var eduResults = eduVerifier.VerifyAll(eduEntries);
            foreach (var result in eduResults)
            {
                totalEducation++;
                var icon = result.IsVerified ? "✓" : "✗";
                Console.WriteLine($" {icon} {result.ClaimedInstitution}");
                Console.WriteLine($" {result.ClaimedQualification}");
                if (result.IsVerified)
                {
                    verifiedEducation++;
                    // Only show the matched name when it differs (ignoring case) from the claim.
                    if (result.MatchedInstitution != null &&
                        !string.Equals(result.MatchedInstitution, result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase))
                    {
                        Console.WriteLine($" → {result.MatchedInstitution}");
                    }
                }
                else
                {
                    allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
                    Console.WriteLine($" Status: {result.Status}");
                    if (!string.IsNullOrEmpty(result.VerificationNotes))
                    {
                        Console.WriteLine($" Note: {result.VerificationNotes}");
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort batch run: report the failing file and move on to the next one.
        Console.WriteLine($"ERROR: {ex.Message}");
    }
}

// Summary
Console.WriteLine($"\n\n{separator}");
Console.WriteLine("SUMMARY");
Console.WriteLine(separator);
Console.WriteLine($"CVs Processed: {cvFiles.Count}");
// Integer percentages (truncated), guarded against division by zero.
Console.WriteLine($"Employers: {verifiedEmployers}/{totalEmployers} verified ({(totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0)}%)");
Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({(totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0)}%)");

// De-duplicate ignoring case so "ACME Ltd" and "Acme Ltd" are listed once.
var uniqueUnverifiedEmployers = allUnverifiedEmployers
    .Distinct(StringComparer.OrdinalIgnoreCase)
    .OrderBy(x => x)
    .ToList();
if (uniqueUnverifiedEmployers.Count > 0)
{
    Console.WriteLine($"\nUNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count}):");
    foreach (var emp in uniqueUnverifiedEmployers)
    {
        Console.WriteLine($" - {emp}");
    }
}

var uniqueUnverifiedInstitutions = allUnverifiedInstitutions
    .Distinct(StringComparer.OrdinalIgnoreCase)
    .OrderBy(x => x)
    .ToList();
if (uniqueUnverifiedInstitutions.Count > 0)
{
    Console.WriteLine($"\nUNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count}):");
    foreach (var inst in uniqueUnverifiedInstitutions)
    {
        Console.WriteLine($" - {inst}");
    }
}

return 0;