2026-01-22 11:04:30 +00:00
|
|
|
using System.Text.Json;
|
|
|
|
|
using Microsoft.EntityFrameworkCore;
|
|
|
|
|
using Microsoft.Extensions.Configuration;
|
|
|
|
|
using Microsoft.Extensions.DependencyInjection;
|
|
|
|
|
using Microsoft.Extensions.Logging;
|
|
|
|
|
using RealCV.Application.Interfaces;
|
|
|
|
|
using RealCV.Application.Models;
|
|
|
|
|
using RealCV.Infrastructure.Data;
|
|
|
|
|
using RealCV.Infrastructure.ExternalApis;
|
|
|
|
|
using RealCV.Infrastructure.Services;
|
|
|
|
|
using RealCV.Infrastructure.Configuration;
|
|
|
|
|
|
|
|
|
|
namespace RealCV.Tests.Integration;
|
|
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
|
/// Test utility to batch process CVs and output verification findings.
|
|
|
|
|
/// Run with: dotnet test --filter "FullyQualifiedName~CVBatchTester" -- TestRunParameters.Parameter(name=\"CvFolder\", value=\"/path/to/cvs\")
|
|
|
|
|
/// Or call the ProcessFolderAsync method directly.
|
|
|
|
|
/// </summary>
|
|
|
|
|
public class CVBatchTester
|
|
|
|
|
{
|
|
|
|
|
private readonly IServiceProvider _serviceProvider;
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Creates the tester and builds the service provider used to resolve
/// the parser and verifier services.
/// </summary>
public CVBatchTester()
{
    var serviceCollection = new ServiceCollection();
    ConfigureServices(serviceCollection);

    // The provider is kept for the lifetime of this instance.
    _serviceProvider = serviceCollection.BuildServiceProvider();
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Registers configuration-bound settings, logging, the database context factory,
/// the Companies House HTTP client and the verification services.
/// </summary>
/// <param name="services">The collection that receives the registrations.</param>
private static void ConfigureServices(IServiceCollection services)
{
    // Configuration: both JSON files are optional and are looked up in the current
    // working directory; environment variables are added after them.
    var configurationBuilder = new ConfigurationBuilder()
        .SetBasePath(Directory.GetCurrentDirectory())
        .AddJsonFile("appsettings.json", optional: true)
        .AddJsonFile("appsettings.Development.json", optional: true)
        .AddEnvironmentVariables();
    var configuration = configurationBuilder.Build();

    // Logging: console output at Information level and above.
    services.AddLogging(logging => logging.AddConsole().SetMinimumLevel(LogLevel.Information));

    // Database.
    // NOTE(review): the fallback below embeds SQL Server credentials in source; consider
    // requiring the "DefaultConnection" connection string from configuration instead.
    var connectionString = configuration.GetConnectionString("DefaultConnection");
    if (connectionString is null)
    {
        connectionString = "Server=127.0.0.1;Database=RealCV;User Id=SA;Password=TrueCV_Sql2024!;TrustServerCertificate=True";
    }

    services.AddDbContextFactory<ApplicationDbContext>(options =>
    {
        options.UseSqlServer(connectionString);
    });

    // Companies House
    var companiesHouseSection = configuration.GetSection("CompaniesHouse");
    services.Configure<CompaniesHouseSettings>(companiesHouseSection);
    services.AddHttpClient<CompaniesHouseClient>();

    // Anthropic (for AI matching)
    var anthropicSection = configuration.GetSection("Anthropic");
    services.Configure<AnthropicSettings>(anthropicSection);
    services.AddScoped<ICompanyNameMatcherService, AICompanyNameMatcherService>();

    // Services
    services.AddScoped<ICompanyVerifierService, CompanyVerifierService>();
    services.AddScoped<IEducationVerifierService, EducationVerifierService>();
    services.AddScoped<ICVParserService, CVParserService>();
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Process all CVs in a folder and return verification results.
/// </summary>
/// <param name="folderPath">
/// Folder to scan for CV files. Only the top level is searched, and only files whose
/// names end in .pdf, .docx or .doc (case-insensitive) are processed.
/// </param>
/// <returns>
/// One summary per CV file, in case-insensitive ordinal order of the file path.
/// A CV whose processing throws is still included, with <c>Error</c> set to the
/// exception message.
/// </returns>
/// <exception cref="DirectoryNotFoundException">
/// Thrown when <paramref name="folderPath"/> does not exist.
/// </exception>
public async Task<List<CVVerificationSummary>> ProcessFolderAsync(string folderPath)
{
    if (!Directory.Exists(folderPath))
    {
        throw new DirectoryNotFoundException($"Folder not found: {folderPath}");
    }

    string[] supportedExtensions = { ".pdf", ".docx", ".doc" };

    // Directory.GetFiles makes no ordering guarantee, so sort explicitly to keep the
    // console output and the returned list reproducible between runs and platforms.
    var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
        .Where(f => supportedExtensions.Any(ext => f.EndsWith(ext, StringComparison.OrdinalIgnoreCase)))
        .OrderBy(f => f, StringComparer.OrdinalIgnoreCase)
        .ToList();

    Console.WriteLine($"Found {cvFiles.Count} CV files in {folderPath}");
    Console.WriteLine(new string('=', 80));

    var results = new List<CVVerificationSummary>();

    foreach (var cvFile in cvFiles)
    {
        Console.WriteLine($"\nProcessing: {Path.GetFileName(cvFile)}");
        Console.WriteLine(new string('-', 60));

        try
        {
            var result = await ProcessSingleCVAsync(cvFile);
            results.Add(result);
            PrintSummary(result);
        }
        catch (Exception ex)
        {
            // Best effort: one failing CV must not abort the batch, so record the
            // failure against the file and continue with the next one.
            Console.WriteLine($"ERROR: {ex.Message}");
            results.Add(new CVVerificationSummary
            {
                FileName = Path.GetFileName(cvFile),
                Error = ex.Message
            });
        }
    }

    // Print overall summary
    Console.WriteLine("\n" + new string('=', 80));
    Console.WriteLine("OVERALL SUMMARY");
    Console.WriteLine(new string('=', 80));
    PrintOverallSummary(results);

    return results;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Parses a single CV file and verifies each employment and education entry it contains.
/// </summary>
/// <param name="filePath">Path of the CV file to open and parse.</param>
/// <returns>
/// A summary with the candidate name (or "Unknown" when the parser returns none) and
/// one result per employment and education entry.
/// </returns>
private async Task<CVVerificationSummary> ProcessSingleCVAsync(string filePath)
{
    // Services are resolved from a scope that is disposed when this method exits.
    using var scope = _serviceProvider.CreateScope();
    var provider = scope.ServiceProvider;
    var cvParser = provider.GetRequiredService<ICVParserService>();
    var companyVerifier = provider.GetRequiredService<ICompanyVerifierService>();
    var educationVerifier = provider.GetRequiredService<IEducationVerifierService>();

    // Parse the CV
    await using var fileStream = File.OpenRead(filePath);
    var fileName = Path.GetFileName(filePath);
    var parsedCV = await cvParser.ParseAsync(fileStream, fileName);

    var summary = new CVVerificationSummary
    {
        FileName = fileName,
        CandidateName = parsedCV.FullName ?? "Unknown"
    };

    // Verify employers
    var employmentHistory = parsedCV.Employment;
    if (employmentHistory != null && employmentHistory.Count > 0)
    {
        foreach (var job in employmentHistory)
        {
            EmployerVerificationSummary employerSummary;
            try
            {
                var verification = await companyVerifier.VerifyCompanyAsync(
                    job.CompanyName,
                    job.StartDate,
                    job.EndDate,
                    job.JobTitle);

                employerSummary = new EmployerVerificationSummary
                {
                    ClaimedName = job.CompanyName,
                    MatchedName = verification.MatchedCompanyName,
                    CompanyNumber = verification.MatchedCompanyNumber,
                    IsVerified = verification.IsVerified,
                    MatchScore = verification.MatchScore,
                    Notes = verification.VerificationNotes,
                    Status = verification.CompanyStatus
                };
            }
            catch (Exception ex)
            {
                // A failed lookup is reported as an unverified employer, not a failed CV.
                employerSummary = new EmployerVerificationSummary
                {
                    ClaimedName = job.CompanyName,
                    IsVerified = false,
                    Notes = $"Error: {ex.Message}"
                };
            }

            summary.EmployerResults.Add(employerSummary);
        }
    }

    // Verify education
    var educationHistory = parsedCV.Education;
    if (educationHistory != null && educationHistory.Count > 0)
    {
        var entries = educationHistory
            .Select(e => new EducationEntry
            {
                Institution = e.Institution,
                Qualification = e.Qualification,
                Subject = e.Subject,
                StartDate = e.StartDate,
                EndDate = e.EndDate
            })
            .ToList();

        foreach (var verification in educationVerifier.VerifyAll(entries))
        {
            summary.EducationResults.Add(new EducationVerificationSummary
            {
                ClaimedInstitution = verification.ClaimedInstitution,
                MatchedInstitution = verification.MatchedInstitution,
                Qualification = verification.ClaimedQualification,
                IsVerified = verification.IsVerified,
                Status = verification.Status,
                Notes = verification.VerificationNotes
            });
        }
    }

    return summary;
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Writes the per-CV result to the console: the candidate name, then one entry per
/// employer and per education record with a verified or unverified marker.
/// </summary>
/// <param name="summary">The verification summary to print.</param>
private static void PrintSummary(CVVerificationSummary summary)
{
    Console.WriteLine($"Candidate: {summary.CandidateName}");

    Console.WriteLine($"\n EMPLOYERS ({summary.EmployerResults.Count}):");
    foreach (var employer in summary.EmployerResults)
    {
        string marker;
        string detail;
        if (employer.IsVerified)
        {
            marker = "✓";
            detail = $"-> {employer.MatchedName} ({employer.MatchScore}%)";
        }
        else
        {
            // Unverified employers show the verifier's notes when there are any.
            marker = "✗";
            detail = employer.Notes ?? "Not found";
        }

        Console.WriteLine($" {marker} {employer.ClaimedName}");
        Console.WriteLine($" {detail}");
    }

    Console.WriteLine($"\n EDUCATION ({summary.EducationResults.Count}):");
    foreach (var education in summary.EducationResults)
    {
        var marker = education.IsVerified ? "✓" : "✗";

        string? detail;
        if (education.IsVerified && education.MatchedInstitution != null)
        {
            detail = $"-> {education.MatchedInstitution}";
        }
        else
        {
            // Fall back to the notes, then to the raw status.
            detail = education.Notes ?? education.Status;
        }

        Console.WriteLine($" {marker} {education.ClaimedInstitution}");
        Console.WriteLine($" {education.Qualification}");
        Console.WriteLine($" {detail}");
    }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Writes batch-wide totals to the console: how many CVs were processed without error,
/// the verified ratio for employers and education, and the most frequent unverified
/// employer and institution names (at most 20 of each).
/// </summary>
/// <param name="results">All per-CV summaries produced by the batch.</param>
private static void PrintOverallSummary(List<CVVerificationSummary> results)
{
    // Integer percentage; reported as 0 when there is nothing to divide by.
    static int Percent(int part, int whole) => whole > 0 ? part * 100 / whole : 0;

    // Prints the distinct names, most frequent first; prints nothing for an empty list.
    static void PrintUnverified(string heading, IEnumerable<string> names)
    {
        var groups = names
            .GroupBy(name => name)
            .OrderByDescending(g => g.Count())
            .ToList();

        if (groups.Count == 0)
        {
            return;
        }

        Console.WriteLine($"\n{heading} ({groups.Count} unique):");
        foreach (var group in groups.Take(20))
        {
            Console.WriteLine($" - {group.Key} (x{group.Count()})");
        }
    }

    var successfulCVs = results.Count(r => r.Error == null);
    var totalEmployers = results.Sum(r => r.EmployerResults.Count);
    var verifiedEmployers = results.Sum(r => r.EmployerResults.Count(e => e.IsVerified));
    var totalEducation = results.Sum(r => r.EducationResults.Count);
    var verifiedEducation = results.Sum(r => r.EducationResults.Count(e => e.IsVerified));

    Console.WriteLine($"CVs Processed: {successfulCVs}/{results.Count}");
    Console.WriteLine($"Employers: {verifiedEmployers}/{totalEmployers} verified ({Percent(verifiedEmployers, totalEmployers)}%)");
    Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({Percent(verifiedEducation, totalEducation)}%)");

    // List unverified employers
    PrintUnverified(
        "UNVERIFIED EMPLOYERS",
        results.SelectMany(r => r.EmployerResults.Where(e => !e.IsVerified)).Select(e => e.ClaimedName));

    // List unverified institutions
    PrintUnverified(
        "UNVERIFIED INSTITUTIONS",
        results.SelectMany(r => r.EducationResults.Where(e => !e.IsVerified)).Select(e => e.ClaimedInstitution));
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Export results to JSON for further analysis.
/// </summary>
/// <param name="results">The summaries to serialize (written as indented JSON).</param>
/// <param name="outputPath">
/// Destination file. Its parent directory is created when it does not exist yet.
/// </param>
public static void ExportToJson(List<CVVerificationSummary> results, string outputPath)
{
    var serializerOptions = new JsonSerializerOptions
    {
        WriteIndented = true
    };
    var json = JsonSerializer.Serialize(results, serializerOptions);

    // File.WriteAllText does not create missing directories, so make sure the target
    // directory exists first instead of failing with DirectoryNotFoundException.
    var directory = Path.GetDirectoryName(Path.GetFullPath(outputPath));
    if (!string.IsNullOrEmpty(directory))
    {
        Directory.CreateDirectory(directory);
    }

    File.WriteAllText(outputPath, json);
    Console.WriteLine($"\nResults exported to: {outputPath}");
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Verification outcome for a single CV file: either an error message or the
/// per-employer and per-education results.
/// </summary>
public class CVVerificationSummary
{
    /// <summary>File name of the CV. Defaults to an empty string.</summary>
    public string FileName { get; set; } = string.Empty;

    /// <summary>Candidate name taken from the parsed CV. Defaults to an empty string.</summary>
    public string CandidateName { get; set; } = string.Empty;

    /// <summary>Error message when processing the CV failed; otherwise <c>null</c>.</summary>
    public string? Error { get; set; }

    /// <summary>One entry per employer that was checked. Starts empty.</summary>
    public List<EmployerVerificationSummary> EmployerResults { get; set; } = new List<EmployerVerificationSummary>();

    /// <summary>One entry per education record that was checked. Starts empty.</summary>
    public List<EducationVerificationSummary> EducationResults { get; set; } = new List<EducationVerificationSummary>();
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Verification outcome for one employer claimed on a CV.
/// </summary>
public class EmployerVerificationSummary
{
    /// <summary>Company name as written on the CV. Defaults to an empty string.</summary>
    public string ClaimedName { get; set; } = string.Empty;

    /// <summary>Name of the company the claim was matched to, if any.</summary>
    public string? MatchedName { get; set; }

    /// <summary>Company number of the matched company, if any.</summary>
    public string? CompanyNumber { get; set; }

    /// <summary>Whether the employer was verified. Defaults to <c>false</c>.</summary>
    public bool IsVerified { get; set; }

    /// <summary>Match score reported by the verifier. Defaults to 0.</summary>
    public int MatchScore { get; set; }

    /// <summary>Verifier notes, or an error description when the lookup failed.</summary>
    public string? Notes { get; set; }

    /// <summary>Company status reported by the verifier, if any.</summary>
    public string? Status { get; set; }
}
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Verification outcome for one education record claimed on a CV.
/// </summary>
public class EducationVerificationSummary
{
    /// <summary>Institution name as claimed. Defaults to an empty string.</summary>
    public string ClaimedInstitution { get; set; } = string.Empty;

    /// <summary>Institution the claim was matched to, if any.</summary>
    public string? MatchedInstitution { get; set; }

    /// <summary>Qualification as claimed, if any.</summary>
    public string? Qualification { get; set; }

    /// <summary>Whether the education record was verified. Defaults to <c>false</c>.</summary>
    public bool IsVerified { get; set; }

    /// <summary>Status reported by the verifier, if any.</summary>
    public string? Status { get; set; }

    /// <summary>Verifier notes, if any.</summary>
    public string? Notes { get; set; }
}
|