Files
RealCV/tools/CVBatchTester/Program.cs
Peter Foster 8a4e46d872 feat: Improve company alias matching and add batch tester flags display
- Add Deliveroo alias (ROOFOODS LTD) to fix matching to wrong company
- Add JCB alias (J.C. BAMFORD EXCAVATORS LIMITED)
- Improve FindDirectAliasMatch to prefer active companies over dissolved
- Display verification flags in CVBatchTester output
- Employer verification improved from 65% to 86%

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-24 05:43:04 +00:00

456 lines
17 KiB
C#
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using System.Text.Json;
using System.Text.Json.Serialization;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Configuration;
using RealCV.Infrastructure.Data;
using RealCV.Infrastructure.ExternalApis;
using RealCV.Infrastructure.Services;
namespace CVBatchTester;
// DTOs for test JSON format (snake_case with nested personal object)
/// <summary>
/// Root shape of a <c>.json</c> test CV. Program deserializes it with <c>Program.JsonOptions</c>
/// (snake_case naming policy, case-insensitive matching) and then maps it to <c>CVData</c>
/// via <c>ConvertTestCVData</c>.
/// </summary>
/// <remarks>
/// NOTE(review): with the snake_case policy, <c>CvId</c> and <c>ExpectedFlags</c> presumably bind to
/// <c>cv_id</c> and <c>expected_flags</c> - confirm against the test files.
/// </remarks>
record TestCVData
{
/// <summary>Not copied into <c>CVData</c> by <c>ConvertTestCVData</c>.</summary>
public string? CvId { get; init; }
/// <summary>Not copied into <c>CVData</c> by <c>ConvertTestCVData</c>.</summary>
public string? Category { get; init; }
/// <summary>Not copied into <c>CVData</c> by <c>ConvertTestCVData</c>.</summary>
public List<string>? ExpectedFlags { get; init; }
/// <summary>Nested personal details; supplies the name, email and phone of the converted CV.</summary>
public TestPersonalData? Personal { get; init; }
/// <summary>Not copied into <c>CVData</c> by <c>ConvertTestCVData</c>.</summary>
public string? Profile { get; init; }
/// <summary>Employment history; a null list is converted to an empty one.</summary>
public List<TestEmploymentEntry>? Employment { get; init; }
/// <summary>Education history; a null list is converted to an empty one.</summary>
public List<TestEducationEntry>? Education { get; init; }
/// <summary>Skills; a null list is converted to an empty one.</summary>
public List<string>? Skills { get; init; }
}
/// <summary>
/// The nested <c>personal</c> object of a test CV.
/// </summary>
record TestPersonalData
{
/// <summary>Becomes <c>CVData.FullName</c>; <c>ConvertTestCVData</c> substitutes "Unknown" when null.</summary>
public string? Name { get; init; }
/// <summary>Copied to <c>CVData.Email</c>.</summary>
public string? Email { get; init; }
/// <summary>Copied to <c>CVData.Phone</c>.</summary>
public string? Phone { get; init; }
/// <summary>Not copied into <c>CVData</c> by <c>ConvertTestCVData</c>.</summary>
public string? Address { get; init; }
/// <summary>Not copied into <c>CVData</c> by <c>ConvertTestCVData</c>.</summary>
// NOTE(review): under the snake_case naming policy this presumably binds to "linked_in",
// not "linkedin" - confirm against the test files.
public string? LinkedIn { get; init; }
}
/// <summary>
/// One employment item of a test CV; mapped to <c>EmploymentEntry</c> by <c>ConvertTestCVData</c>.
/// </summary>
record TestEmploymentEntry
{
/// <summary>Becomes <c>CompanyName</c>; "Unknown" is substituted when null.</summary>
public string? Company { get; init; }
/// <summary>Becomes <c>JobTitle</c>; "Unknown" is substituted when null.</summary>
public string? JobTitle { get; init; }
/// <summary>Date text run through <c>ParseDate</c>; unparseable text yields a null start date.</summary>
public string? StartDate { get; init; }
/// <summary>
/// Date text run through <c>ParseDate</c>. The converted entry is marked current only when
/// this string itself is null.
/// </summary>
public string? EndDate { get; init; }
/// <summary>Copied to <c>EmploymentEntry.Location</c>.</summary>
public string? Location { get; init; }
/// <summary>Copied to <c>EmploymentEntry.Description</c>.</summary>
public string? Description { get; init; }
/// <summary>Not copied into <c>EmploymentEntry</c> by <c>ConvertTestCVData</c>.</summary>
public List<string>? Achievements { get; init; }
}
/// <summary>
/// One education item of a test CV; mapped to <c>EducationEntry</c> by <c>ConvertTestCVData</c>.
/// </summary>
record TestEducationEntry
{
/// <summary>Becomes <c>Institution</c>; "Unknown" is substituted when null.</summary>
public string? Institution { get; init; }
/// <summary>Copied to <c>EducationEntry.Qualification</c>.</summary>
public string? Qualification { get; init; }
/// <summary>Copied to <c>EducationEntry.Subject</c>.</summary>
public string? Subject { get; init; }
/// <summary>Not copied into <c>EducationEntry</c> by <c>ConvertTestCVData</c>.</summary>
public string? Classification { get; init; }
/// <summary>Date text run through <c>ParseDate</c>.</summary>
public string? StartDate { get; init; }
/// <summary>Date text run through <c>ParseDate</c>.</summary>
public string? EndDate { get; init; }
}
class Program
{
// Log-file sink used by Log(); while it is null, Log() writes to the console only.
private static StreamWriter? _logWriter;
// Serializer settings for .json test CVs: property names are matched case-insensitively
// against their snake_case form, and enums are read from their string names.
private static readonly JsonSerializerOptions JsonOptions = new()
{
PropertyNameCaseInsensitive = true,
PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower,
Converters = { new JsonStringEnumConverter() }
};
/// <summary>
/// Entry point: verifies every CV found in a folder and reports the results to the console
/// and to a log file.
/// </summary>
/// <param name="args">
/// The first argument is the folder to scan (the user is prompted when there are no arguments).
/// An optional <c>--output &lt;file&gt;</c> pair overrides the default log path, which is a
/// timestamped <c>batch-results-*.log</c> file inside the scanned folder.
/// </param>
/// <returns>0 when the batch ran; 1 when the folder is missing or contains no CV files.</returns>
static async Task<int> Main(string[] args)
{
    var folderPath = args.FirstOrDefault() ?? AskForFolder();
    if (string.IsNullOrEmpty(folderPath) || !Directory.Exists(folderPath))
    {
        Log($"Error: Folder not found: {folderPath}");
        Log("Usage: CVBatchTester <folder-path> [--output <file>]");
        Log(" e.g. CVBatchTester /home/user/cvs");
        Log(" e.g. CVBatchTester /home/user/cvs --output /tmp/results.log");
        return 1;
    }

    // Check for --output flag
    var outputIndex = Array.IndexOf(args, "--output");
    var logPath = outputIndex >= 0 && outputIndex < args.Length - 1
        ? args[outputIndex + 1]
        : Path.Combine(folderPath, $"batch-results-{DateTime.Now:yyyyMMdd-HHmmss}.log");

    _logWriter = new StreamWriter(logPath, false) { AutoFlush = true };
    int exitCode;
    try
    {
        exitCode = await RunBatchAsync(folderPath, logPath);
    }
    finally
    {
        // Release the log file on every exit path (early return or exception), and reset the
        // field so a later Log() call falls back to console-only output.
        _logWriter?.Dispose();
        _logWriter = null;
    }

    // Only a completed run announces the log location; the "no CV files" exit stays silent.
    if (exitCode == 0)
    {
        Console.WriteLine($"\nResults written to: {logPath}");
    }

    return exitCode;
}

/// <summary>Running totals accumulated across all CVs of one batch run.</summary>
private sealed class BatchStats
{
    public int TotalEmployers { get; set; }
    public int VerifiedEmployers { get; set; }
    public int TotalEducation { get; set; }
    public int VerifiedEducation { get; set; }

    /// <summary>One entry per failed employer check; duplicates are kept so they can be counted.</summary>
    public List<string> UnverifiedEmployers { get; } = new();

    /// <summary>One entry per failed education check; duplicates are kept so they can be counted.</summary>
    public List<string> UnverifiedInstitutions { get; } = new();
}

/// <summary>
/// Builds the service container, verifies each CV file in <paramref name="folderPath"/> and
/// logs the summary.
/// </summary>
/// <returns>0 when files were processed; 1 when the folder holds no CV files.</returns>
private static async Task<int> RunBatchAsync(string folderPath, string logPath)
{
    Log("CV Batch Verification Tester");
    Log($"Processing CVs from: {folderPath}");
    Log($"Output log: {logPath}");
    Log($"Started: {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
    Log(new string('=', 80));

    // Setup DI. The provider owns disposable services, so it is disposed when this method exits.
    var services = new ServiceCollection();
    ConfigureServices(services);
    await using var provider = services.BuildServiceProvider();

    // Find CV files
    var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
        .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
                    f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase) ||
                    f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase) ||
                    f.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
        .OrderBy(f => f)
        .ToList();
    Log($"Found {cvFiles.Count} CV files\n");
    if (cvFiles.Count == 0)
    {
        Log("No CV files found (.pdf, .docx, .doc, .json)");
        return 1;
    }

    // Track results
    var stats = new BatchStats();
    var processedCount = 0;
    var errorCount = 0;
    foreach (var cvFile in cvFiles)
    {
        Log($"\n{new string('=', 80)}");
        Log($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
        Log(new string('=', 80));
        try
        {
            // One DI scope per CV so scoped services are not shared between files.
            using var scope = provider.CreateScope();
            var parser = scope.ServiceProvider.GetRequiredService<ICVParserService>();
            var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
            var eduVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();

            var cv = await LoadCvAsync(parser, cvFile);
            await VerifyEmployersAsync(companyVerifier, cv, stats);
            VerifyEducation(eduVerifier, cv, stats);
        }
        catch (Exception ex)
        {
            // A broken file is counted and reported, but must not stop the rest of the batch.
            errorCount++;
            Log($"ERROR processing file: {ex.Message}");
        }
    }

    LogSummary(stats, processedCount, errorCount, cvFiles.Count);
    return 0;
}

/// <summary>
/// Loads one CV: <c>.json</c> files are deserialized as test data and converted, every other
/// file is handed to the CV parser service.
/// </summary>
private static async Task<CVData> LoadCvAsync(ICVParserService parser, string cvFile)
{
    CVData cv;
    if (cvFile.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
    {
        var jsonContent = await File.ReadAllTextAsync(cvFile);
        var testCv = JsonSerializer.Deserialize<TestCVData>(jsonContent, JsonOptions)
            ?? throw new InvalidOperationException("Failed to deserialize JSON CV");
        // Convert TestCVData to CVData
        cv = ConvertTestCVData(testCv);
        Log($"Loaded JSON CV: {cv.FullName}");
        return cv;
    }

    await using var stream = File.OpenRead(cvFile);
    cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
    Log($"Parsed CV: {cv.FullName}");
    return cv;
}

/// <summary>
/// Verifies each employment entry of <paramref name="cv"/>, logs the outcome and updates
/// <paramref name="stats"/>. A failing verification call is logged and counted as unverified.
/// </summary>
private static async Task VerifyEmployersAsync(ICompanyVerifierService companyVerifier, CVData cv, BatchStats stats)
{
    if (cv.Employment?.Count > 0)
    {
        Log($"\nEMPLOYERS ({cv.Employment.Count}):");
        Log(new string('-', 60));
        foreach (var emp in cv.Employment)
        {
            stats.TotalEmployers++;
            try
            {
                var result = await companyVerifier.VerifyCompanyAsync(
                    emp.CompanyName,
                    emp.StartDate,
                    emp.EndDate,
                    emp.JobTitle);
                var icon = result.IsVerified ? "✓" : "✗";
                var period = FormatPeriod(emp.StartDate, emp.EndDate);
                Log($"\n {icon} {emp.CompanyName}");
                Log($" Period: {period}");
                Log($" Role: {emp.JobTitle}");
                if (result.IsVerified)
                {
                    stats.VerifiedEmployers++;
                    Log($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
                    if (!string.IsNullOrEmpty(result.MatchedCompanyNumber))
                    {
                        Log($" Company #: {result.MatchedCompanyNumber}");
                    }
                    if (!string.IsNullOrEmpty(result.CompanyStatus))
                    {
                        Log($" Status: {result.CompanyStatus}");
                    }
                }
                else
                {
                    stats.UnverifiedEmployers.Add(emp.CompanyName);
                }
                if (!string.IsNullOrEmpty(result.VerificationNotes))
                {
                    Log($" Note: {result.VerificationNotes}");
                }
                // Display any flags (warnings/issues)
                if (result.Flags?.Count > 0)
                {
                    foreach (var flag in result.Flags)
                    {
                        var flagIcon = flag.Severity == "Critical" ? "⚠️" : "";
                        Log($" {flagIcon} FLAG [{flag.Type}]: {flag.Message}");
                    }
                }
            }
            catch (Exception ex)
            {
                Log($"\n ✗ {emp.CompanyName}");
                Log($" ERROR: {ex.Message}");
                stats.UnverifiedEmployers.Add(emp.CompanyName);
            }
        }
    }
}

/// <summary>
/// Verifies the education entries of <paramref name="cv"/> in one call, logs the outcome and
/// updates <paramref name="stats"/>.
/// </summary>
private static void VerifyEducation(IEducationVerifierService eduVerifier, CVData cv, BatchStats stats)
{
    if (cv.Education?.Count > 0)
    {
        Log($"\nEDUCATION ({cv.Education.Count}):");
        Log(new string('-', 60));
        var eduEntries = cv.Education.Select(e => new EducationEntry
        {
            Institution = e.Institution,
            Qualification = e.Qualification,
            Subject = e.Subject,
            StartDate = e.StartDate,
            EndDate = e.EndDate
        }).ToList();
        var eduResults = eduVerifier.VerifyAll(eduEntries);
        foreach (var result in eduResults)
        {
            stats.TotalEducation++;
            var icon = result.IsVerified ? "✓" : "✗";
            Log($"\n {icon} {result.ClaimedInstitution}");
            Log($" Qualification: {result.ClaimedQualification}");
            if (!string.IsNullOrEmpty(result.ClaimedSubject))
            {
                Log($" Subject: {result.ClaimedSubject}");
            }
            if (result.IsVerified)
            {
                stats.VerifiedEducation++;
                // Only show the matched name when it differs from what the CV claimed.
                if (result.MatchedInstitution != null &&
                    !result.MatchedInstitution.Equals(result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase))
                {
                    Log($" Match: {result.MatchedInstitution}");
                }
            }
            else
            {
                stats.UnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
                Log($" Status: {result.Status}");
            }
            if (!string.IsNullOrEmpty(result.VerificationNotes))
            {
                Log($" Note: {result.VerificationNotes}");
            }
        }
    }
}

/// <summary>Logs the end-of-run totals, verification rates and the unverified name lists.</summary>
private static void LogSummary(BatchStats stats, int processedCount, int errorCount, int totalFiles)
{
    Log($"\n\n{new string('=', 80)}");
    Log("VERIFICATION SUMMARY");
    Log(new string('=', 80));
    Log($"\nCVs Processed: {processedCount - errorCount}/{totalFiles}");
    if (errorCount > 0)
    {
        Log($"Errors: {errorCount}");
    }

    // Whole-number percentages (integer division); 0 when nothing was checked.
    var empRate = stats.TotalEmployers > 0 ? stats.VerifiedEmployers * 100 / stats.TotalEmployers : 0;
    var eduRate = stats.TotalEducation > 0 ? stats.VerifiedEducation * 100 / stats.TotalEducation : 0;
    Log($"\nEmployers: {stats.VerifiedEmployers}/{stats.TotalEmployers} verified ({empRate}%)");
    Log($"Education: {stats.VerifiedEducation}/{stats.TotalEducation} verified ({eduRate}%)");

    LogUnverified("UNVERIFIED EMPLOYERS", stats.UnverifiedEmployers);
    LogUnverified("UNVERIFIED INSTITUTIONS", stats.UnverifiedInstitutions);

    Log($"\nCompleted: {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
    Log($"\n{new string('=', 80)}");
}

/// <summary>
/// Logs the distinct (case-insensitive) names under <paramref name="heading"/>, most frequent
/// first and then alphabetically; logs nothing when <paramref name="names"/> is empty.
/// </summary>
private static void LogUnverified(string heading, IEnumerable<string> names)
{
    var groups = names
        .GroupBy(n => n, StringComparer.OrdinalIgnoreCase)
        .OrderByDescending(g => g.Count())
        .ThenBy(g => g.Key)
        .ToList();
    if (groups.Count == 0)
    {
        return;
    }

    Log($"\n{new string('-', 60)}");
    Log($"{heading} ({groups.Count} unique):");
    foreach (var group in groups)
    {
        var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
        Log($" - {group.Key}{count}");
    }
}
/// <summary>
/// Writes <paramref name="message"/> as a line to the console and, when a log file is open,
/// to that file as well.
/// </summary>
static void Log(string message)
{
    Console.WriteLine(message);

    var fileSink = _logWriter;
    if (fileSink is null)
    {
        return;
    }

    fileSink.WriteLine(message);
}
/// <summary>
/// Prompts on the console for the CV folder.
/// </summary>
/// <returns>The line the user typed, or an empty string when input has ended.</returns>
static string AskForFolder()
{
    Console.Write("Enter CV folder path: ");
    var typed = Console.ReadLine();
    return typed is null ? "" : typed;
}
/// <summary>
/// Renders an employment period as "start - end" using abbreviated month and year.
/// </summary>
/// <param name="start">First month of the period; shown as "?" when null.</param>
/// <param name="end">Last month of the period; shown as "Present" when null.</param>
/// <returns>The formatted period text.</returns>
static string FormatPeriod(DateOnly? start, DateOnly? end)
{
    const string monthYear = "MMM yyyy";

    string from = start.HasValue ? start.Value.ToString(monthYear) : "?";
    string to = end.HasValue ? end.Value.ToString(monthYear) : "Present";

    return from + " - " + to;
}
/// <summary>
/// Maps a deserialized test CV onto the <c>CVData</c> model used by the verifiers.
/// </summary>
/// <param name="testCv">The test CV read from a <c>.json</c> file.</param>
/// <returns>
/// A <c>CVData</c> whose missing name, company, job title and institution are replaced by
/// "Unknown" and whose missing lists are replaced by empty lists.
/// </returns>
static CVData ConvertTestCVData(TestCVData testCv)
{
    var personal = testCv.Personal;

    return new CVData
    {
        FullName = personal?.Name ?? "Unknown",
        Email = personal?.Email,
        Phone = personal?.Phone,
        Employment = testCv.Employment?.Select(ToEmployment).ToList() ?? [],
        Education = testCv.Education?.Select(ToEducation).ToList() ?? [],
        Skills = testCv.Skills ?? []
    };

    // One employment item; an entry counts as current only when the raw end-date string is absent.
    static EmploymentEntry ToEmployment(TestEmploymentEntry job)
    {
        return new EmploymentEntry
        {
            CompanyName = job.Company ?? "Unknown",
            JobTitle = job.JobTitle ?? "Unknown",
            Location = job.Location,
            StartDate = ParseDate(job.StartDate),
            EndDate = ParseDate(job.EndDate),
            IsCurrent = job.EndDate is null,
            Description = job.Description
        };
    }

    // One education item; date strings go through ParseDate.
    static EducationEntry ToEducation(TestEducationEntry study)
    {
        return new EducationEntry
        {
            Institution = study.Institution ?? "Unknown",
            Qualification = study.Qualification,
            Subject = study.Subject,
            StartDate = ParseDate(study.StartDate),
            EndDate = ParseDate(study.EndDate)
        };
    }
}
/// <summary>
/// Parses a date string from a test CV into a <see cref="DateOnly"/>.
/// </summary>
/// <param name="dateStr">
/// Either <c>YYYY-MM</c> (mapped to the first day of that month) or any text accepted by
/// <c>DateOnly.TryParse</c>.
/// </param>
/// <returns>
/// The parsed date, or <c>null</c> when the input is null, empty or not a valid date.
/// This method never throws for malformed input.
/// </returns>
static DateOnly? ParseDate(string? dateStr)
{
    if (string.IsNullOrEmpty(dateStr))
    {
        return null;
    }

    // Try parsing YYYY-MM format. The range checks matter: the DateOnly constructor throws
    // ArgumentOutOfRangeException for values such as month 13 or year 0, and one bad date
    // in a test file should yield null rather than abort the whole CV.
    if (dateStr.Length == 7 && dateStr[4] == '-'
        && int.TryParse(dateStr[..4], out var year)
        && int.TryParse(dateStr[5..], out var month)
        && year is >= 1 and <= 9999
        && month is >= 1 and <= 12)
    {
        return new DateOnly(year, month, 1);
    }

    // Try standard parsing
    if (DateOnly.TryParse(dateStr, out var date))
    {
        return date;
    }

    return null;
}
/// <summary>
/// Registers configuration-bound settings, logging, the database context factory and the
/// parsing/verification services used by the batch run.
/// </summary>
/// <param name="services">Service collection that receives the registrations.</param>
static void ConfigureServices(IServiceCollection services)
{
    // Load configuration - try multiple locations
    var configPaths = new[]
    {
        "/var/www/realcv",
        "/git/RealCV/src/RealCV.Web",
        Path.GetFullPath(Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "..", "src", "RealCV.Web"))
    };
    var webProjectPath = configPaths.FirstOrDefault(Directory.Exists);

    IConfigurationBuilder configurationBuilder = new ConfigurationBuilder();
    if (webProjectPath is not null)
    {
        Log($"Loading config from: {webProjectPath}");
        configurationBuilder
            .SetBasePath(webProjectPath)
            .AddJsonFile("appsettings.json", optional: true)
            .AddJsonFile("appsettings.Development.json", optional: true)
            .AddJsonFile("appsettings.Production.json", optional: true);
    }
    else
    {
        // SetBasePath builds a PhysicalFileProvider, which throws DirectoryNotFoundException for
        // a missing directory. Every JSON file is optional and the connection string has a
        // fallback, so run with an empty configuration instead of crashing at startup.
        Log("No config directory found; continuing with built-in defaults");
    }
    var configuration = configurationBuilder.Build();

    // Logging - show info level for verification details
    services.AddLogging(builder =>
    {
        builder.AddConsole();
        builder.SetMinimumLevel(LogLevel.Information);
        // Filter out noisy libraries
        builder.AddFilter("Microsoft", LogLevel.Warning);
        builder.AddFilter("System", LogLevel.Warning);
    });

    // Database
    // NOTE(review): the fallback below embeds a database password in source control; prefer
    // supplying the connection string through configuration and rotating this credential.
    var connectionString = configuration.GetConnectionString("DefaultConnection")
        ?? "Server=127.0.0.1;Database=RealCV;User Id=SA;Password=TrueCV_Sql2024!;TrustServerCertificate=True";
    services.AddDbContextFactory<ApplicationDbContext>(options =>
        options.UseSqlServer(connectionString));

    // Companies House - use configuration binding
    services.Configure<CompaniesHouseSettings>(configuration.GetSection(CompaniesHouseSettings.SectionName));
    services.AddHttpClient<CompaniesHouseClient>();

    // Anthropic - use configuration binding
    services.Configure<AnthropicSettings>(configuration.GetSection(AnthropicSettings.SectionName));
    services.AddScoped<ICompanyNameMatcherService, AICompanyNameMatcherService>();

    // Services
    services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
    services.AddScoped<IEducationVerifierService, EducationVerifierService>();
    services.AddScoped<ICVParserService, CVParserService>();
}
}