Files
RealCV/tests/RealCV.Tests/Integration/CVBatchTester.cs
Peter Foster 3d666d5f9c feat: Reduce false positives in company verification
Major improvements to company name matching accuracy:

- Add well-known brands dictionary with correct Companies House numbers
  for fast-track verification (Boots, Legal & General, EY, etc.)
- Add safe expansion words (UK, LIMITED, GROUP, PLC) that don't change
  company identity
- Fix core word validation to require original company's core words
- Remove overly aggressive skip words that removed meaningful identifiers
  (industries, technology, consulting, services, etc.)
- Add industry context hints for AI matching
- Fix CVBatchTester JSON deserialization for test files

Before: 98% verified but with false positives like:
- Boots → BOOTS AND BEARDS (wrong)
- Legal & General → LEGAL LIMITED (wrong)

After: 97% verified with correct matches:
- Boots → BOOTS UK LIMITED (correct)
- Legal & General → fast-tracked to correct company

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-22 19:07:17 +00:00

320 lines
12 KiB
C#

using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Logging;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Data;
using RealCV.Infrastructure.ExternalApis;
using RealCV.Infrastructure.Services;
using RealCV.Infrastructure.Configuration;
namespace RealCV.Tests.Integration;
/// <summary>
/// Test utility to batch process CVs and output verification findings.
/// Run with: dotnet test --filter "FullyQualifiedName~CVBatchTester" -- TestRunParameters.Parameter(name=\"CvFolder\", value=\"/path/to/cvs\")
/// Or use the ProcessFolderAsync method directly.
/// </summary>
public class CVBatchTester
{
private readonly IServiceProvider _serviceProvider;
/// <summary>
/// Builds the dependency-injection container that the tester resolves its services from.
/// </summary>
public CVBatchTester()
{
    IServiceCollection registrations = new ServiceCollection();
    ConfigureServices(registrations);

    // Built once here; each processed CV later opens its own scope on this provider.
    _serviceProvider = registrations.BuildServiceProvider();
}
/// <summary>
/// Registers logging, the database context factory, the Companies House client, the AI name
/// matcher and the verification/parsing services on <paramref name="services"/>.
/// </summary>
/// <param name="services">Collection that receives the registrations.</param>
/// <exception cref="InvalidOperationException">
/// No <c>DefaultConnection</c> connection string is configured.
/// </exception>
private static void ConfigureServices(IServiceCollection services)
{
    // Load configuration. Both JSON files are optional; environment variables are added last.
    var configuration = new ConfigurationBuilder()
        .SetBasePath(Directory.GetCurrentDirectory())
        .AddJsonFile("appsettings.json", optional: true)
        .AddJsonFile("appsettings.Development.json", optional: true)
        .AddEnvironmentVariables()
        .Build();

    // Logging
    services.AddLogging(builder =>
    {
        builder.AddConsole();
        builder.SetMinimumLevel(LogLevel.Information);
    });

    // Database. The connection string (and its credentials) must come from configuration:
    // a password baked into source ends up in version control, so there is deliberately
    // no built-in fallback. Fail fast with a message that says where to set it.
    var connectionString = configuration.GetConnectionString("DefaultConnection");
    if (string.IsNullOrWhiteSpace(connectionString))
    {
        throw new InvalidOperationException(
            "Connection string 'DefaultConnection' is not configured. Set it in appsettings.json, " +
            "appsettings.Development.json, or the ConnectionStrings__DefaultConnection environment variable.");
    }

    services.AddDbContextFactory<ApplicationDbContext>(options =>
        options.UseSqlServer(connectionString));

    // Companies House
    services.Configure<CompaniesHouseSettings>(configuration.GetSection("CompaniesHouse"));
    services.AddHttpClient<CompaniesHouseClient>();

    // Anthropic (for AI matching)
    services.Configure<AnthropicSettings>(configuration.GetSection("Anthropic"));
    services.AddScoped<ICompanyNameMatcherService, AICompanyNameMatcherService>();

    // Services
    services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
    services.AddScoped<IEducationVerifierService, EducationVerifierService>();
    services.AddScoped<ICVParserService, CVParserService>();
}
/// <summary>
/// Verifies every <c>.pdf</c>, <c>.docx</c> and <c>.doc</c> file found directly inside
/// <paramref name="folderPath"/>, printing per-file findings and a closing summary to the console.
/// </summary>
/// <param name="folderPath">Folder to scan; sub-folders are not searched.</param>
/// <returns>
/// One summary per CV file. A file whose processing threw is still listed, with the
/// exception message in <c>Error</c>.
/// </returns>
/// <exception cref="DirectoryNotFoundException"><paramref name="folderPath"/> does not exist.</exception>
public async Task<List<CVVerificationSummary>> ProcessFolderAsync(string folderPath)
{
    if (!Directory.Exists(folderPath))
    {
        throw new DirectoryNotFoundException($"Folder not found: {folderPath}");
    }

    // Extensions are matched case-insensitively against the end of the full path.
    string[] cvExtensions = { ".pdf", ".docx", ".doc" };
    var cvFiles = new List<string>();
    foreach (var candidate in Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly))
    {
        if (cvExtensions.Any(ext => candidate.EndsWith(ext, StringComparison.OrdinalIgnoreCase)))
        {
            cvFiles.Add(candidate);
        }
    }

    var doubleRule = new string('=', 80);
    Console.WriteLine($"Found {cvFiles.Count} CV files in {folderPath}");
    Console.WriteLine(doubleRule);

    var summaries = new List<CVVerificationSummary>();
    foreach (var path in cvFiles)
    {
        var fileName = Path.GetFileName(path);
        Console.WriteLine($"\nProcessing: {fileName}");
        Console.WriteLine(new string('-', 60));

        try
        {
            var summary = await ProcessSingleCVAsync(path);
            summaries.Add(summary);
            PrintSummary(summary);
        }
        catch (Exception ex)
        {
            // Best effort: one bad file must not abort the batch, so record it and move on.
            Console.WriteLine($"ERROR: {ex.Message}");
            summaries.Add(new CVVerificationSummary
            {
                FileName = fileName,
                Error = ex.Message
            });
        }
    }

    // Print overall summary
    Console.WriteLine("\n" + doubleRule);
    Console.WriteLine("OVERALL SUMMARY");
    Console.WriteLine(doubleRule);
    PrintOverallSummary(summaries);

    return summaries;
}
/// <summary>
/// Parses one CV file and verifies each employer and education entry it lists.
/// </summary>
/// <param name="filePath">Path of the CV file to open.</param>
/// <returns>The employer and education findings collected for this CV.</returns>
private async Task<CVVerificationSummary> ProcessSingleCVAsync(string filePath)
{
    // A fresh DI scope per CV, disposed when this method finishes.
    using var scope = _serviceProvider.CreateScope();
    var provider = scope.ServiceProvider;
    var cvParser = provider.GetRequiredService<ICVParserService>();
    var companyVerifier = provider.GetRequiredService<ICompanyVerifierService>();
    var educationVerifier = provider.GetRequiredService<IEducationVerifierService>();

    // Parse the CV
    await using var fileStream = File.OpenRead(filePath);
    var fileName = Path.GetFileName(filePath);
    var parsedCV = await cvParser.ParseAsync(fileStream, fileName);

    var summary = new CVVerificationSummary
    {
        FileName = fileName,
        CandidateName = parsedCV.FullName ?? "Unknown"
    };

    // Verify employers
    if (parsedCV.Employment is { Count: > 0 } employmentHistory)
    {
        foreach (var job in employmentHistory)
        {
            EmployerVerificationSummary entry;
            try
            {
                var match = await companyVerifier.VerifyCompanyAsync(
                    job.CompanyName,
                    job.StartDate,
                    job.EndDate,
                    job.JobTitle);

                entry = new EmployerVerificationSummary
                {
                    ClaimedName = job.CompanyName,
                    MatchedName = match.MatchedCompanyName,
                    CompanyNumber = match.MatchedCompanyNumber,
                    IsVerified = match.IsVerified,
                    MatchScore = match.MatchScore,
                    Notes = match.VerificationNotes,
                    Status = match.CompanyStatus
                };
            }
            catch (Exception ex)
            {
                // A failed lookup is recorded as unverified rather than failing the whole CV.
                entry = new EmployerVerificationSummary
                {
                    ClaimedName = job.CompanyName,
                    IsVerified = false,
                    Notes = $"Error: {ex.Message}"
                };
            }

            summary.EmployerResults.Add(entry);
        }
    }

    // Verify education
    if (parsedCV.Education is { Count: > 0 } educationHistory)
    {
        // Copy the parsed entries into the model type the education verifier takes.
        var claimedEducation = new List<EducationEntry>();
        foreach (var item in educationHistory)
        {
            claimedEducation.Add(new EducationEntry
            {
                Institution = item.Institution,
                Qualification = item.Qualification,
                Subject = item.Subject,
                StartDate = item.StartDate,
                EndDate = item.EndDate
            });
        }

        foreach (var outcome in educationVerifier.VerifyAll(claimedEducation))
        {
            summary.EducationResults.Add(new EducationVerificationSummary
            {
                ClaimedInstitution = outcome.ClaimedInstitution,
                MatchedInstitution = outcome.MatchedInstitution,
                Qualification = outcome.ClaimedQualification,
                IsVerified = outcome.IsVerified,
                Status = outcome.Status,
                Notes = outcome.VerificationNotes
            });
        }
    }

    return summary;
}
/// <summary>
/// Writes one CV's findings to the console: the candidate name, then a tick/cross line and a
/// detail line for every employer and every education entry.
/// </summary>
/// <param name="summary">Findings for a single CV.</param>
private static void PrintSummary(CVVerificationSummary summary)
{
    Console.WriteLine($"Candidate: {summary.CandidateName}");

    var employers = summary.EmployerResults;
    Console.WriteLine($"\n EMPLOYERS ({employers.Count}):");
    foreach (var employer in employers)
    {
        string mark;
        string detail;
        if (employer.IsVerified)
        {
            mark = "✓";
            detail = $"-> {employer.MatchedName} ({employer.MatchScore}%)";
        }
        else
        {
            mark = "✗";
            detail = employer.Notes ?? "Not found";
        }

        Console.WriteLine($" {mark} {employer.ClaimedName}");
        Console.WriteLine($" {detail}");
    }

    var education = summary.EducationResults;
    Console.WriteLine($"\n EDUCATION ({education.Count}):");
    foreach (var entry in education)
    {
        var mark = entry.IsVerified ? "✓" : "✗";

        // The matched institution is shown only for a verified entry that has one;
        // every other case falls back to the notes, then to the status.
        string? detail;
        if (entry.IsVerified && entry.MatchedInstitution != null)
        {
            detail = $"-> {entry.MatchedInstitution}";
        }
        else
        {
            detail = entry.Notes ?? entry.Status;
        }

        Console.WriteLine($" {mark} {entry.ClaimedInstitution}");
        Console.WriteLine($" {entry.Qualification}");
        Console.WriteLine($" {detail}");
    }
}
/// <summary>
/// Writes batch-wide totals to the console: CVs processed without error, verified employer and
/// education counts with whole-number percentages, and the most frequent unverified names.
/// </summary>
/// <param name="results">Summaries for every CV in the batch, including failed ones.</param>
private static void PrintOverallSummary(List<CVVerificationSummary> results)
{
    // Integer percentage (truncating); 0 when there is nothing to divide by.
    static int Percent(int part, int whole) => whole > 0 ? part * 100 / whole : 0;

    // Distinct names with their occurrence counts, most frequent first.
    static List<(string Name, int Occurrences)> RankByFrequency(IEnumerable<string> claimedNames) =>
        claimedNames
            .GroupBy(name => name)
            .OrderByDescending(group => group.Count())
            .Select(group => (Name: group.Key, Occurrences: group.Count()))
            .ToList();

    // Only the first 20 entries are listed.
    static void PrintTopTwenty(List<(string Name, int Occurrences)> ranked)
    {
        foreach (var (name, occurrences) in ranked.Take(20))
        {
            Console.WriteLine($" - {name} (x{occurrences})");
        }
    }

    var employers = results.SelectMany(r => r.EmployerResults).ToList();
    var education = results.SelectMany(r => r.EducationResults).ToList();

    var processedWithoutError = results.Count(r => r.Error == null);
    var verifiedEmployers = employers.Count(e => e.IsVerified);
    var verifiedEducation = education.Count(e => e.IsVerified);

    Console.WriteLine($"CVs Processed: {processedWithoutError}/{results.Count}");
    Console.WriteLine($"Employers: {verifiedEmployers}/{employers.Count} verified ({Percent(verifiedEmployers, employers.Count)}%)");
    Console.WriteLine($"Education: {verifiedEducation}/{education.Count} verified ({Percent(verifiedEducation, education.Count)}%)");

    // List unverified employers
    var unverifiedEmployers = RankByFrequency(
        employers.Where(e => !e.IsVerified).Select(e => e.ClaimedName));
    if (unverifiedEmployers.Count > 0)
    {
        Console.WriteLine($"\nUNVERIFIED EMPLOYERS ({unverifiedEmployers.Count} unique):");
        PrintTopTwenty(unverifiedEmployers);
    }

    // List unverified institutions
    var unverifiedInstitutions = RankByFrequency(
        education.Where(e => !e.IsVerified).Select(e => e.ClaimedInstitution));
    if (unverifiedInstitutions.Count > 0)
    {
        Console.WriteLine($"\nUNVERIFIED INSTITUTIONS ({unverifiedInstitutions.Count} unique):");
        PrintTopTwenty(unverifiedInstitutions);
    }
}
// Reused by every export: creating a JsonSerializerOptions per call discards the serializer
// metadata cached on the instance (analyzer rule CA1869).
private static readonly JsonSerializerOptions s_exportJsonOptions = new()
{
    WriteIndented = true
};

/// <summary>
/// Export results to JSON for further analysis.
/// </summary>
/// <param name="results">Summaries to serialize as an indented JSON array.</param>
/// <param name="outputPath">
/// File to create or overwrite. Its parent folder is created when it does not exist yet.
/// </param>
public static void ExportToJson(List<CVVerificationSummary> results, string outputPath)
{
    var json = JsonSerializer.Serialize(results, s_exportJsonOptions);

    // File.WriteAllText does not create missing folders, so make sure the target exists.
    // GetDirectoryName returns null for a root path, in which case there is nothing to create.
    var outputFolder = Path.GetDirectoryName(Path.GetFullPath(outputPath));
    if (!string.IsNullOrEmpty(outputFolder))
    {
        Directory.CreateDirectory(outputFolder);
    }

    File.WriteAllText(outputPath, json);
    Console.WriteLine($"\nResults exported to: {outputPath}");
}
}
/// <summary>
/// Verification findings for a single CV file.
/// </summary>
public class CVVerificationSummary
{
    /// <summary>Name of the CV file, without its directory.</summary>
    public string FileName { get; set; } = string.Empty;

    /// <summary>Candidate name from the parsed CV; stays empty on entries created for a failed file.</summary>
    public string CandidateName { get; set; } = string.Empty;

    /// <summary>Exception message when the CV could not be processed; otherwise <c>null</c>.</summary>
    public string? Error { get; set; }

    /// <summary>One entry per employer listed on the CV.</summary>
    public List<EmployerVerificationSummary> EmployerResults { get; set; } = new List<EmployerVerificationSummary>();

    /// <summary>One entry per education result returned by the education verifier.</summary>
    public List<EducationVerificationSummary> EducationResults { get; set; } = new List<EducationVerificationSummary>();
}
/// <summary>
/// Outcome of verifying one employer claimed on a CV.
/// </summary>
public class EmployerVerificationSummary
{
    /// <summary>Company name exactly as it appears on the CV.</summary>
    public string ClaimedName { get; set; } = string.Empty;

    /// <summary>Name of the company the claim was matched to, if any.</summary>
    public string? MatchedName { get; set; }

    /// <summary>Company number of the matched company, if any.</summary>
    public string? CompanyNumber { get; set; }

    /// <summary>Whether the verifier accepted the claim.</summary>
    public bool IsVerified { get; set; }

    /// <summary>Match score reported by the verifier; printed as a percentage in the console summary.</summary>
    public int MatchScore { get; set; }

    /// <summary>Verifier notes, or <c>"Error: ..."</c> when the lookup threw.</summary>
    public string? Notes { get; set; }

    /// <summary>Company status reported for the matched company, if any.</summary>
    public string? Status { get; set; }
}
/// <summary>
/// Outcome of verifying one education entry claimed on a CV.
/// </summary>
public class EducationVerificationSummary
{
    /// <summary>Institution name as claimed.</summary>
    public string ClaimedInstitution { get; set; } = string.Empty;

    /// <summary>Institution the claim was matched to, if any.</summary>
    public string? MatchedInstitution { get; set; }

    /// <summary>Qualification as claimed.</summary>
    public string? Qualification { get; set; }

    /// <summary>Whether the verifier accepted the claim.</summary>
    public bool IsVerified { get; set; }

    /// <summary>Status reported by the education verifier; printed when there are no notes.</summary>
    public string? Status { get; set; }

    /// <summary>Verifier notes, if any.</summary>
    public string? Notes { get; set; }
}