Files
RealCV/tools/CVBatchTester/Program.cs
Peter Foster 1a06d60f2d feat: Add company name aliases and fix education verification
- Add trading name aliases for major UK companies (Boots, BBC, Lloyds, etc.)
  mapping to their official Companies House registered names
- Add Leeds Beckett University (and former name Leeds Metropolitan) to
  recognised UK institutions
- This improves company verification from 65% to 84% on test data
- CVBatchTester tool for testing verification against JSON CVs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 21:48:15 +00:00

446 lines
17 KiB
C#

using System.Text.Json;
using System.Text.Json.Serialization;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Configuration;
using RealCV.Infrastructure.Data;
using RealCV.Infrastructure.ExternalApis;
using RealCV.Infrastructure.Services;
namespace CVBatchTester;
// DTOs for test JSON format (snake_case with nested personal object)
/// <summary>
/// Root object of a JSON test CV, as read by <c>JsonSerializer.Deserialize</c> with
/// <c>Program.JsonOptions</c> (snake_case keys, e.g. <c>CvId</c> is read from <c>cv_id</c>).
/// Every member is nullable because any key may be absent from a test file.
/// </summary>
record TestCVData
{
// Identifier of the test CV; not read by the conversion to CVData in this file.
public string? CvId { get; init; }
// Test category label; not read by the conversion to CVData in this file.
public string? Category { get; init; }
// Flags the test author expects for this CV; not read by the conversion to CVData in this file.
public List<string>? ExpectedFlags { get; init; }
// Nested "personal" object (name, email, phone, ...).
public TestPersonalData? Personal { get; init; }
// Free-text profile; not read by the conversion to CVData in this file.
public string? Profile { get; init; }
// Employment history entries; each becomes an EmploymentEntry in ConvertTestCVData.
public List<TestEmploymentEntry>? Employment { get; init; }
// Education history entries; each becomes an EducationEntry in ConvertTestCVData.
public List<TestEducationEntry>? Education { get; init; }
// Skill names, copied as-is into CVData.Skills.
public List<string>? Skills { get; init; }
}
/// <summary>
/// The nested "personal" object of a JSON test CV. Only <c>Name</c>, <c>Email</c> and
/// <c>Phone</c> are carried over into <c>CVData</c> by <c>ConvertTestCVData</c>.
/// </summary>
record TestPersonalData
{
// Candidate name; ConvertTestCVData substitutes "Unknown" when it is missing.
public string? Name { get; init; }
public string? Email { get; init; }
public string? Phone { get; init; }
// Not read by ConvertTestCVData.
public string? Address { get; init; }
// Not read by ConvertTestCVData.
// NOTE(review): with JsonNamingPolicy.SnakeCaseLower this property presumably binds to the
// key "linked_in", not "linkedin" - confirm against the test JSON files.
public string? LinkedIn { get; init; }
}
/// <summary>
/// One employment entry of a JSON test CV. Dates are kept as raw strings here and are
/// converted with <c>ParseDate</c> when the entry is mapped to an <c>EmploymentEntry</c>.
/// </summary>
record TestEmploymentEntry
{
// Employer name as written on the CV; mapped to EmploymentEntry.CompanyName ("Unknown" if missing).
public string? Company { get; init; }
// Mapped to EmploymentEntry.JobTitle ("Unknown" if missing).
public string? JobTitle { get; init; }
// Raw date text; ParseDate handles "YYYY-MM" explicitly and otherwise tries DateOnly.TryParse.
public string? StartDate { get; init; }
// Raw date text; a missing value marks the role as current in ConvertTestCVData.
public string? EndDate { get; init; }
public string? Location { get; init; }
public string? Description { get; init; }
// Not read by ConvertTestCVData.
public List<string>? Achievements { get; init; }
}
/// <summary>
/// One education entry of a JSON test CV. Dates are kept as raw strings here and are
/// converted with <c>ParseDate</c> when the entry is mapped to an <c>EducationEntry</c>.
/// </summary>
record TestEducationEntry
{
// Institution name as written on the CV; mapped with "Unknown" as the fallback.
public string? Institution { get; init; }
public string? Qualification { get; init; }
public string? Subject { get; init; }
// Not read by ConvertTestCVData.
public string? Classification { get; init; }
// Raw date text, converted with ParseDate.
public string? StartDate { get; init; }
// Raw date text, converted with ParseDate.
public string? EndDate { get; init; }
}
class Program
{
// Writer for the results log file. It stays null until Main opens the file, so Log calls
// made before that point (e.g. usage errors) only reach the console.
private static StreamWriter? _logWriter;
// Serializer options used to read the JSON test CVs into TestCVData.
private static readonly JsonSerializerOptions JsonOptions = new()
{
PropertyNameCaseInsensitive = true,
// PascalCase property names are matched against snake_case JSON keys (JobTitle <-> job_title).
PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower,
// Allows enum values to be written as strings; none of the DTOs in this file declare an enum.
Converters = { new JsonStringEnumConverter() }
};
/// <summary>
/// Entry point: verifies the employers and education entries of every CV in a folder and
/// writes the report to the console and to a log file.
/// </summary>
/// <param name="args">
/// The CV folder path, optionally combined with <c>--output &lt;file&gt;</c> in either order.
/// When no folder is given the user is prompted for one.
/// </param>
/// <returns>
/// 0 when the batch ran (even if individual CVs failed); 1 when the folder does not exist,
/// the log file cannot be opened, or the folder contains no CV files.
/// </returns>
static async Task<int> Main(string[] args)
{
    // Locate --output first so that neither the flag nor its value is mistaken for the folder.
    var outputIndex = Array.IndexOf(args, "--output");
    var hasOutputValue = outputIndex >= 0 && outputIndex < args.Length - 1;
    var folderPath = args
        .Where((_, index) => index != outputIndex && !(hasOutputValue && index == outputIndex + 1))
        .FirstOrDefault() ?? AskForFolder();

    if (string.IsNullOrEmpty(folderPath) || !Directory.Exists(folderPath))
    {
        Log($"Error: Folder not found: {folderPath}");
        Log("Usage: CVBatchTester <folder-path> [--output <file>]");
        Log(" e.g. CVBatchTester /home/user/cvs");
        Log(" e.g. CVBatchTester /home/user/cvs --output /tmp/results.log");
        return 1;
    }

    var logPath = hasOutputValue
        ? args[outputIndex + 1]
        : Path.Combine(folderPath, $"batch-results-{DateTime.Now:yyyyMMdd-HHmmss}.log");

    try
    {
        _logWriter = new StreamWriter(logPath, false) { AutoFlush = true };
    }
    catch (Exception ex) when (ex is IOException or UnauthorizedAccessException or ArgumentException or NotSupportedException)
    {
        // A bad --output value should produce a readable error, not an unhandled crash.
        Log($"Error: Cannot open log file '{logPath}': {ex.Message}");
        return 1;
    }

    int exitCode;
    try
    {
        exitCode = await RunBatchAsync(folderPath, logPath);
    }
    finally
    {
        // Close the log on every path, including early returns and unexpected exceptions.
        _logWriter?.Dispose();
        _logWriter = null;
    }

    if (exitCode == 0)
    {
        // Printed after the log is closed, so this line goes to the console only.
        Console.WriteLine($"\nResults written to: {logPath}");
    }
    return exitCode;
}

/// <summary>Running totals gathered while the batch is processed.</summary>
private sealed class BatchStats
{
    public List<string> UnverifiedEmployers { get; } = [];
    public List<string> UnverifiedInstitutions { get; } = [];
    public int TotalEmployers { get; set; }
    public int VerifiedEmployers { get; set; }
    public int TotalEducation { get; set; }
    public int VerifiedEducation { get; set; }
    public int ProcessedCount { get; set; }
    public int ErrorCount { get; set; }
}

/// <summary>Writes the report header, processes every CV file in the folder and writes the summary.</summary>
/// <returns>0 when at least one CV file was found; 1 otherwise.</returns>
private static async Task<int> RunBatchAsync(string folderPath, string logPath)
{
    Log($"CV Batch Verification Tester");
    Log($"Processing CVs from: {folderPath}");
    Log($"Output log: {logPath}");
    Log($"Started: {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
    Log(new string('=', 80));

    // Setup DI; the provider is disposed when this method exits so the services it owns are released.
    var services = new ServiceCollection();
    ConfigureServices(services);
    await using var provider = services.BuildServiceProvider();

    var cvFiles = FindCvFiles(folderPath);
    Log($"Found {cvFiles.Count} CV files\n");
    if (cvFiles.Count == 0)
    {
        Log("No CV files found (.pdf, .docx, .doc, .json)");
        return 1;
    }

    var stats = new BatchStats();
    foreach (var cvFile in cvFiles)
    {
        stats.ProcessedCount++;
        Log($"\n{new string('=', 80)}");
        Log($"[{stats.ProcessedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
        Log(new string('=', 80));
        try
        {
            await ProcessCvAsync(provider, cvFile, stats);
        }
        catch (Exception ex)
        {
            // One broken CV must not stop the batch; record it and move on to the next file.
            stats.ErrorCount++;
            Log($"ERROR processing file: {ex.Message}");
        }
    }

    LogSummary(stats, cvFiles.Count);
    return 0;
}

/// <summary>Returns the .pdf, .docx, .doc and .json files directly inside the folder, sorted by path.</summary>
private static List<string> FindCvFiles(string folderPath)
{
    string[] extensions = [".pdf", ".docx", ".doc", ".json"];
    return Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
        .Where(file => extensions.Any(ext => file.EndsWith(ext, StringComparison.OrdinalIgnoreCase)))
        .OrderBy(file => file)
        .ToList();
}

/// <summary>Loads one CV inside its own DI scope and runs the employer and education checks on it.</summary>
private static async Task ProcessCvAsync(IServiceProvider provider, string cvFile, BatchStats stats)
{
    using var scope = provider.CreateScope();
    var parser = scope.ServiceProvider.GetRequiredService<ICVParserService>();
    var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
    var eduVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();

    var cv = await LoadCvAsync(parser, cvFile);
    await VerifyEmployersAsync(companyVerifier, cv, stats);
    VerifyEducation(eduVerifier, cv, stats);
}

/// <summary>Reads a CV: .json files are deserialized as test CVs, every other file goes through the parser service.</summary>
private static async Task<CVData> LoadCvAsync(ICVParserService parser, string cvFile)
{
    CVData cv;
    if (cvFile.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
    {
        var jsonContent = await File.ReadAllTextAsync(cvFile);
        var testCv = JsonSerializer.Deserialize<TestCVData>(jsonContent, JsonOptions)
            ?? throw new InvalidOperationException("Failed to deserialize JSON CV");
        cv = ConvertTestCVData(testCv);
        Log($"Loaded JSON CV: {cv.FullName}");
    }
    else
    {
        await using var stream = File.OpenRead(cvFile);
        cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
        Log($"Parsed CV: {cv.FullName}");
    }
    return cv;
}

/// <summary>Verifies each employer on the CV, logs the outcome and updates the employer totals.</summary>
private static async Task VerifyEmployersAsync(ICompanyVerifierService companyVerifier, CVData cv, BatchStats stats)
{
    if (!(cv.Employment?.Count > 0))
    {
        return;
    }

    Log($"\nEMPLOYERS ({cv.Employment.Count}):");
    Log(new string('-', 60));
    foreach (var emp in cv.Employment)
    {
        stats.TotalEmployers++;
        try
        {
            var result = await companyVerifier.VerifyCompanyAsync(
                emp.CompanyName,
                emp.StartDate,
                emp.EndDate,
                emp.JobTitle);
            var icon = result.IsVerified ? "✓" : "✗";
            var period = FormatPeriod(emp.StartDate, emp.EndDate);
            Log($"\n {icon} {emp.CompanyName}");
            Log($" Period: {period}");
            Log($" Role: {emp.JobTitle}");
            if (result.IsVerified)
            {
                stats.VerifiedEmployers++;
                Log($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
                if (!string.IsNullOrEmpty(result.MatchedCompanyNumber))
                    Log($" Company #: {result.MatchedCompanyNumber}");
                if (!string.IsNullOrEmpty(result.CompanyStatus))
                    Log($" Status: {result.CompanyStatus}");
            }
            else
            {
                stats.UnverifiedEmployers.Add(emp.CompanyName);
            }
            if (!string.IsNullOrEmpty(result.VerificationNotes))
                Log($" Note: {result.VerificationNotes}");
        }
        catch (Exception ex)
        {
            // A failed lookup counts as unverified so it still shows up in the summary.
            Log($"\n ✗ {emp.CompanyName}");
            Log($" ERROR: {ex.Message}");
            stats.UnverifiedEmployers.Add(emp.CompanyName);
        }
    }
}

/// <summary>Verifies the education entries on the CV, logs the outcome and updates the education totals.</summary>
private static void VerifyEducation(IEducationVerifierService eduVerifier, CVData cv, BatchStats stats)
{
    if (!(cv.Education?.Count > 0))
    {
        return;
    }

    Log($"\nEDUCATION ({cv.Education.Count}):");
    Log(new string('-', 60));
    var eduEntries = cv.Education.Select(e => new EducationEntry
    {
        Institution = e.Institution,
        Qualification = e.Qualification,
        Subject = e.Subject,
        StartDate = e.StartDate,
        EndDate = e.EndDate
    }).ToList();

    foreach (var result in eduVerifier.VerifyAll(eduEntries))
    {
        stats.TotalEducation++;
        var icon = result.IsVerified ? "✓" : "✗";
        Log($"\n {icon} {result.ClaimedInstitution}");
        Log($" Qualification: {result.ClaimedQualification}");
        if (!string.IsNullOrEmpty(result.ClaimedSubject))
            Log($" Subject: {result.ClaimedSubject}");
        if (result.IsVerified)
        {
            stats.VerifiedEducation++;
            // Only show the match when it differs from what the CV claimed.
            if (result.MatchedInstitution != null &&
                !result.MatchedInstitution.Equals(result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase))
            {
                Log($" Match: {result.MatchedInstitution}");
            }
        }
        else
        {
            stats.UnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
            Log($" Status: {result.Status}");
        }
        if (!string.IsNullOrEmpty(result.VerificationNotes))
            Log($" Note: {result.VerificationNotes}");
    }
}

/// <summary>Writes the totals, the verification rates and the lists of unverified names.</summary>
private static void LogSummary(BatchStats stats, int fileCount)
{
    Log($"\n\n{new string('=', 80)}");
    Log("VERIFICATION SUMMARY");
    Log(new string('=', 80));
    Log($"\nCVs Processed: {stats.ProcessedCount - stats.ErrorCount}/{fileCount}");
    if (stats.ErrorCount > 0)
        Log($"Errors: {stats.ErrorCount}");

    // Integer percentages, rounded down; 0 when there was nothing to verify.
    var empRate = stats.TotalEmployers > 0 ? stats.VerifiedEmployers * 100 / stats.TotalEmployers : 0;
    var eduRate = stats.TotalEducation > 0 ? stats.VerifiedEducation * 100 / stats.TotalEducation : 0;
    Log($"\nEmployers: {stats.VerifiedEmployers}/{stats.TotalEmployers} verified ({empRate}%)");
    Log($"Education: {stats.VerifiedEducation}/{stats.TotalEducation} verified ({eduRate}%)");

    LogUnverified("UNVERIFIED EMPLOYERS", stats.UnverifiedEmployers);
    LogUnverified("UNVERIFIED INSTITUTIONS", stats.UnverifiedInstitutions);

    Log($"\nCompleted: {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
    Log($"\n{new string('=', 80)}");
}

/// <summary>Lists the distinct names (case-insensitive), most frequent first, with a repeat count when above one.</summary>
private static void LogUnverified(string heading, List<string> names)
{
    var groups = names
        .GroupBy(name => name, StringComparer.OrdinalIgnoreCase)
        .OrderByDescending(g => g.Count())
        .ThenBy(g => g.Key)
        .ToList();
    if (groups.Count == 0)
    {
        return;
    }

    Log($"\n{new string('-', 60)}");
    Log($"{heading} ({groups.Count} unique):");
    foreach (var group in groups)
    {
        var occurrences = group.Count();
        var count = occurrences > 1 ? $" (x{occurrences})" : "";
        Log($" - {group.Key}{count}");
    }
}
/// <summary>
/// Writes one line to the console and, when the log file is open, to the log file as well.
/// </summary>
/// <param name="message">Text of the line to write.</param>
static void Log(string message)
{
    Console.WriteLine(message);

    // Before Main opens the log file there is no writer, so only the console gets the line.
    if (_logWriter is { } writer)
    {
        writer.WriteLine(message);
    }
}
/// <summary>Prompts on the console for the CV folder path.</summary>
/// <returns>The line the user typed, or an empty string when standard input has ended.</returns>
static string AskForFolder()
{
    Console.Write("Enter CV folder path: ");

    var typed = Console.ReadLine();
    if (typed is null)
    {
        return "";
    }

    return typed;
}
/// <summary>Formats an employment period for the report, e.g. <c>Mar 2020 - Present</c>.</summary>
/// <param name="start">Start date; shown as <c>?</c> when unknown.</param>
/// <param name="end">End date; shown as <c>Present</c> when missing.</param>
/// <returns>The period as <c>"{start} - {end}"</c> with English month abbreviations.</returns>
static string FormatPeriod(DateOnly? start, DateOnly? end)
{
    // Invariant culture keeps the month names identical on every machine, so reports
    // produced on differently configured hosts stay comparable.
    var culture = System.Globalization.CultureInfo.InvariantCulture;
    var startStr = start?.ToString("MMM yyyy", culture) ?? "?";
    var endStr = end?.ToString("MMM yyyy", culture) ?? "Present";
    return $"{startStr} - {endStr}";
}
/// <summary>Maps a deserialized JSON test CV onto the <c>CVData</c> model used by the verifiers.</summary>
/// <param name="testCv">The test CV; missing sections become empty lists.</param>
/// <returns>
/// A <c>CVData</c> with "Unknown" substituted for a missing name, company, job title or
/// institution, and with date strings converted by <c>ParseDate</c>.
/// </returns>
static CVData ConvertTestCVData(TestCVData testCv)
{
    // A role is current when no end date is given. Blank text and the words "present" /
    // "current" count as "no end date" too, because they also yield a null EndDate below.
    static bool IsOpenEnded(string? endDate)
    {
        if (string.IsNullOrWhiteSpace(endDate))
        {
            return true;
        }

        var text = endDate.Trim();
        return text.Equals("present", StringComparison.OrdinalIgnoreCase)
            || text.Equals("current", StringComparison.OrdinalIgnoreCase);
    }

    return new CVData
    {
        FullName = testCv.Personal?.Name ?? "Unknown",
        Email = testCv.Personal?.Email,
        Phone = testCv.Personal?.Phone,
        Employment = testCv.Employment?.Select(e => new EmploymentEntry
        {
            CompanyName = e.Company ?? "Unknown",
            JobTitle = e.JobTitle ?? "Unknown",
            Location = e.Location,
            StartDate = ParseDate(e.StartDate),
            EndDate = ParseDate(e.EndDate),
            IsCurrent = IsOpenEnded(e.EndDate),
            Description = e.Description
        }).ToList() ?? [],
        Education = testCv.Education?.Select(e => new EducationEntry
        {
            Institution = e.Institution ?? "Unknown",
            Qualification = e.Qualification,
            Subject = e.Subject,
            StartDate = ParseDate(e.StartDate),
            EndDate = ParseDate(e.EndDate)
        }).ToList() ?? [],
        Skills = testCv.Skills ?? []
    };
}
/// <summary>Converts a date string from a test CV into a <see cref="DateOnly"/>.</summary>
/// <param name="dateStr">
/// Either <c>YYYY-MM</c> (interpreted as the first day of that month) or any text accepted by
/// <see cref="DateOnly.TryParse(string, out DateOnly)"/>. Surrounding whitespace is ignored.
/// </param>
/// <returns>The parsed date, or <c>null</c> when the text is missing, blank or not a valid date.</returns>
static DateOnly? ParseDate(string? dateStr)
{
    if (string.IsNullOrWhiteSpace(dateStr)) return null;

    var text = dateStr.Trim();

    // Try parsing YYYY-MM format
    if (text.Length == 7 && text[4] == '-'
        && int.TryParse(text[..4], out var year)
        && int.TryParse(text[5..], out var month))
    {
        // The DateOnly constructor throws for values such as month 13 or year 0;
        // treat those as unparseable instead of crashing the whole CV.
        var inRange = year is >= 1 and <= 9999 && month is >= 1 and <= 12;
        return inRange ? new DateOnly(year, month, 1) : null;
    }

    // Try standard parsing
    if (DateOnly.TryParse(text, out var date))
    {
        return date;
    }

    return null;
}
/// <summary>
/// Registers configuration-bound settings, logging, the database context factory and the
/// verification services used by the batch run.
/// </summary>
/// <param name="services">The service collection to add the registrations to.</param>
static void ConfigureServices(IServiceCollection services)
{
    // Load configuration - try multiple locations
    var configPaths = new[]
    {
        "/var/www/realcv",
        "/git/RealCV/src/RealCV.Web",
        Path.GetFullPath(Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "..", "src", "RealCV.Web"))
    };
    // When none of the candidates exists, fall back to the tool's own directory. Falling back
    // to one of the missing candidates would make SetBasePath throw, even though every
    // settings file below is optional.
    var webProjectPath = configPaths.FirstOrDefault(Directory.Exists) ?? AppContext.BaseDirectory;
    Log($"Loading config from: {webProjectPath}");
    var configuration = new ConfigurationBuilder()
        .SetBasePath(webProjectPath)
        .AddJsonFile("appsettings.json", optional: true)
        .AddJsonFile("appsettings.Development.json", optional: true)
        .AddJsonFile("appsettings.Production.json", optional: true)
        .Build();

    // Logging - show info level for verification details
    services.AddLogging(builder =>
    {
        builder.AddConsole();
        builder.SetMinimumLevel(LogLevel.Information);
        // Filter out noisy libraries
        builder.AddFilter("Microsoft", LogLevel.Warning);
        builder.AddFilter("System", LogLevel.Warning);
    });

    // Database
    // SECURITY NOTE(review): the fallback below embeds SQL Server SA credentials in source.
    // It is kept so existing setups keep working, but the password should be rotated and
    // the connection string supplied through configuration instead.
    var connectionString = configuration.GetConnectionString("DefaultConnection")
        ?? "Server=127.0.0.1;Database=RealCV;User Id=SA;Password=TrueCV_Sql2024!;TrustServerCertificate=True";
    services.AddDbContextFactory<ApplicationDbContext>(options =>
        options.UseSqlServer(connectionString));

    // Companies House - use configuration binding
    services.Configure<CompaniesHouseSettings>(configuration.GetSection(CompaniesHouseSettings.SectionName));
    services.AddHttpClient<CompaniesHouseClient>();

    // Anthropic - use configuration binding
    services.Configure<AnthropicSettings>(configuration.GetSection(AnthropicSettings.SectionName));
    services.AddScoped<ICompanyNameMatcherService, AICompanyNameMatcherService>();

    // Services
    services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
    services.AddScoped<IEducationVerifierService, EducationVerifierService>();
    services.AddScoped<ICVParserService, CVParserService>();
}
}