feat: Add AI-powered compound company name splitting
Uses Claude Haiku to intelligently detect when a company name contains multiple companies (e.g., "ASDA/WALMART", "Corus & Laura Ashley Hotels") vs single companies with similar patterns (e.g., "Ernst & Young").

- Adds ExtractCompanyNamesAsync to ICompanyNameMatcherService
- Only triggers for names with potential separators (/, &, "and")
- Verifies each extracted part individually, returns first match
- Uses fast Haiku model to minimize cost

Results:
- ASDA/WALMART → verified via 'ASDA' → ASDA GROUP LIMITED
- Corus & Laura Ashley Hotels → verified via 'Corus' → Tata Steel UK
- Employers: 104/120 verified (86%)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
195
tools/batch-test-cvs.cs
Normal file
195
tools/batch-test-cvs.cs
Normal file
@@ -0,0 +1,195 @@
|
||||
#!/usr/bin/env dotnet-script
|
||||
#r "nuget: Microsoft.EntityFrameworkCore.SqlServer, 8.0.0"
|
||||
#r "nuget: Microsoft.Extensions.Configuration.Json, 8.0.0"
|
||||
#r "nuget: Microsoft.Extensions.DependencyInjection, 8.0.0"
|
||||
#r "nuget: Microsoft.Extensions.Logging.Console, 8.0.0"
|
||||
#r "../src/RealCV.Application/bin/Debug/net8.0/RealCV.Application.dll"
|
||||
#r "../src/RealCV.Infrastructure/bin/Debug/net8.0/RealCV.Infrastructure.dll"
|
||||
#r "../src/RealCV.Domain/bin/Debug/net8.0/RealCV.Domain.dll"
|
||||
|
||||
// This is a dotnet-script file. Run with: dotnet script batch-test-cvs.cs -- /path/to/cvs
|
||||
// Install dotnet-script: dotnet tool install -g dotnet-script
|
||||
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Threading.Tasks;
|
||||
using System.Collections.Generic;
|
||||
using System.Text.Json;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using RealCV.Application.Interfaces;
|
||||
using RealCV.Application.Models;
|
||||
using RealCV.Infrastructure.Data;
|
||||
using RealCV.Infrastructure.Services;
|
||||
using RealCV.Infrastructure.ExternalApis;
|
||||
using RealCV.Infrastructure.Configuration;
|
||||
|
||||
// Resolve the folder to scan: the first script argument when one is given,
// otherwise the fixed default location.
var requestedFolder = Args.FirstOrDefault();
var folderPath = requestedFolder ?? "/tmp/test-cvs";

// Stop with exit code 1 (after printing usage) when the folder does not exist.
if (Directory.Exists(folderPath) == false)
{
    Console.WriteLine("Error: Folder not found: " + folderPath);
    Console.WriteLine("Usage: dotnet script batch-test-cvs.cs -- /path/to/cvs");
    return 1;
}

// Announce the run, followed by an 80-character rule.
Console.WriteLine("Processing CVs from: " + folderPath);
Console.WriteLine(new string('=', 80));
|
||||
|
||||
// Setup DI
var services = new ServiceCollection();

// Settings come from the web project's JSON files, located relative to the
// current working directory. Both files are optional, so a missing file does
// not fail here; the corresponding settings are simply absent.
var webProjectDirectory = Path.Combine(Directory.GetCurrentDirectory(), "../src/RealCV.Web");
var configurationBuilder = new ConfigurationBuilder();
configurationBuilder.SetBasePath(webProjectDirectory);
configurationBuilder.AddJsonFile("appsettings.json", optional: true);
configurationBuilder.AddJsonFile("appsettings.Development.json", optional: true);
var configuration = configurationBuilder.Build();

// Console logging, limited to warnings and above.
services.AddLogging(logging =>
{
    logging.AddConsole();
    logging.SetMinimumLevel(LogLevel.Warning);
});

// SQL Server context factory using the "DefaultConnection" connection string.
services.AddDbContextFactory<ApplicationDbContext>(options =>
{
    var connectionString = configuration.GetConnectionString("DefaultConnection");
    options.UseSqlServer(connectionString);
});

// Bind the settings sections consumed by the external API clients.
var companiesHouseSection = configuration.GetSection("CompaniesHouse");
var anthropicSection = configuration.GetSection("Anthropic");
services.Configure<CompaniesHouseSettings>(companiesHouseSection);
services.Configure<AnthropicSettings>(anthropicSection);

// HTTP clients for the external APIs.
services.AddHttpClient<CompaniesHouseClient>();
services.AddHttpClient<AnthropicClient>();

// Scoped application services; the processing loop resolves these from a
// fresh scope for each CV file.
services.AddScoped<ICompanyNameMatcherService, CompanyNameMatcherService>();
services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
services.AddScoped<IEducationVerifierService, EducationVerifierService>();
services.AddScoped<ICVParserService, CVParserService>();

var provider = services.BuildServiceProvider();
|
||||
|
||||
// Collect the CV files to process: every file directly inside the folder
// whose name ends in .pdf or .docx, compared case-insensitively.
var supportedExtensions = new[] { ".pdf", ".docx" };
var cvFiles = Directory.GetFiles(folderPath, "*.*")
    .Where(path => supportedExtensions.Any(
        extension => path.EndsWith(extension, StringComparison.OrdinalIgnoreCase)))
    .ToList();

Console.WriteLine($"Found {cvFiles.Count} CV files\n");

// Names that failed verification, gathered across all CVs for the summary.
List<string> allUnverifiedEmployers = new List<string>();
List<string> allUnverifiedInstitutions = new List<string>();

// Running totals across all CVs.
int totalEmployers = 0;
int verifiedEmployers = 0;
int totalEducation = 0;
int verifiedEducation = 0;
|
||||
|
||||
// 80-character rule printed above and below each file header.
// The previous form, $"{'=',-80}", is an interpolation *alignment* specifier:
// it printed a single '=' padded with 79 spaces rather than a full rule.
var fileHeaderRule = new string('=', 80);

// Process each CV in turn: parse it, verify every employer and education
// entry, print the per-entry outcome, and update the running totals.
foreach (var cvFile in cvFiles)
{
    Console.WriteLine($"\n{fileHeaderRule}");
    Console.WriteLine($"FILE: {Path.GetFileName(cvFile)}");
    Console.WriteLine(fileHeaderRule);

    try
    {
        // Resolve the scoped services from a fresh scope for every file.
        using var scope = provider.CreateScope();
        var parser = scope.ServiceProvider.GetRequiredService<ICVParserService>();
        var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
        var eduVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();

        await using var stream = File.OpenRead(cvFile);
        var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));

        Console.WriteLine($"Candidate: {cv.PersonalInfo?.FullName ?? "Unknown"}");

        // Employers
        if (cv.Employment?.Count > 0)
        {
            Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
            foreach (var emp in cv.Employment)
            {
                totalEmployers++;
                var result = await companyVerifier.VerifyCompanyAsync(
                    emp.CompanyName, emp.StartDate, emp.EndDate, emp.JobTitle);

                var icon = result.IsVerified ? "✓" : "✗";
                Console.WriteLine($" {icon} {emp.CompanyName}");

                if (result.IsVerified)
                {
                    verifiedEmployers++;
                    Console.WriteLine($" → {result.MatchedCompanyName} ({result.MatchScore}%)");
                    if (!string.IsNullOrEmpty(result.VerificationNotes))
                    {
                        Console.WriteLine($" Note: {result.VerificationNotes}");
                    }
                }
                else
                {
                    // Remember the claimed name for the end-of-run summary.
                    allUnverifiedEmployers.Add(emp.CompanyName);
                    Console.WriteLine($" Note: {result.VerificationNotes ?? "Not found"}");
                }
            }
        }

        // Education
        if (cv.Education?.Count > 0)
        {
            Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");

            // Copy the parsed education items into the entry type passed to the verifier.
            var eduEntries = cv.Education.Select(e => new EducationEntry
            {
                Institution = e.Institution,
                Qualification = e.Qualification,
                Subject = e.Subject,
                StartDate = e.StartDate,
                EndDate = e.EndDate
            }).ToList();

            var eduResults = eduVerifier.VerifyAll(eduEntries);
            foreach (var result in eduResults)
            {
                totalEducation++;
                var icon = result.IsVerified ? "✓" : "✗";
                Console.WriteLine($" {icon} {result.ClaimedInstitution}");
                Console.WriteLine($" {result.ClaimedQualification}");

                if (result.IsVerified)
                {
                    verifiedEducation++;

                    // Only show the matched name when it differs from the claimed one.
                    if (result.MatchedInstitution != null && result.MatchedInstitution != result.ClaimedInstitution)
                    {
                        Console.WriteLine($" → {result.MatchedInstitution}");
                    }
                }
                else
                {
                    allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
                    Console.WriteLine($" Status: {result.Status}");
                    if (!string.IsNullOrEmpty(result.VerificationNotes))
                    {
                        Console.WriteLine($" Note: {result.VerificationNotes}");
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort batch run: report the failure and continue with the next file.
        Console.WriteLine($"ERROR: {ex.Message}");
    }
}
|
||||
|
||||
// Summary
// Print the totals for the whole run, then the distinct, alphabetically
// sorted names that could not be verified. The script ends with exit code 0.

// 80-character rule. The previous form, $"{'=',-80}", is an interpolation
// *alignment* specifier and printed one '=' followed by 79 spaces.
var summaryRule = new string('=', 80);

// Whole-number percentages (integer division truncates); 0 when nothing was checked.
var employerPercent = totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0;
var educationPercent = totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0;

Console.WriteLine($"\n\n{summaryRule}");
Console.WriteLine("SUMMARY");
Console.WriteLine(summaryRule);

// Counts every discovered file, including any that reported an error.
Console.WriteLine($"CVs Processed: {cvFiles.Count}");
Console.WriteLine($"Employers: {verifiedEmployers}/{totalEmployers} verified ({employerPercent}%)");
Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({educationPercent}%)");

var uniqueUnverifiedEmployers = allUnverifiedEmployers.Distinct().OrderBy(x => x).ToList();
if (uniqueUnverifiedEmployers.Count > 0)
{
    Console.WriteLine($"\nUNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count}):");
    foreach (var emp in uniqueUnverifiedEmployers)
    {
        Console.WriteLine($" - {emp}");
    }
}

var uniqueUnverifiedInstitutions = allUnverifiedInstitutions.Distinct().OrderBy(x => x).ToList();
if (uniqueUnverifiedInstitutions.Count > 0)
{
    Console.WriteLine($"\nUNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count}):");
    foreach (var inst in uniqueUnverifiedInstitutions)
    {
        Console.WriteLine($" - {inst}");
    }
}

return 0;
|
||||
Reference in New Issue
Block a user