feat: Reduce false positives in company verification
Major improvements to company name matching accuracy:

- Add well-known brands dictionary with correct Companies House numbers for fast-track verification (Boots, Legal & General, EY, etc.)
- Add safe expansion words (UK, LIMITED, GROUP, PLC) that don't change company identity
- Fix core word validation to require original company's core words
- Remove overly aggressive skip words that removed meaningful identifiers (industries, technology, consulting, services, etc.)
- Add industry context hints for AI matching
- Fix CVBatchTester JSON deserialization for test files

Before: 98% verified but with false positives like:
- Boots → BOOTS AND BEARDS (wrong)
- Legal & General → LEGAL LIMITED (wrong)

After: 97% verified with correct matches:
- Boots → BOOTS UK LIMITED (correct)
- Legal & General → fast-tracked to correct company

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
using System.Text.Json;
|
||||
using System.Text.Json.Serialization;
|
||||
using Microsoft.EntityFrameworkCore;
|
||||
using Microsoft.Extensions.Configuration;
|
||||
using Microsoft.Extensions.DependencyInjection;
|
||||
@@ -12,23 +13,86 @@ using RealCV.Infrastructure.Services;
|
||||
|
||||
namespace CVBatchTester;
|
||||
|
||||
// DTOs for test JSON format (snake_case with nested personal object)
|
||||
/// <summary>
/// Root object of a test CV fixture file. Every member is optional so that a
/// fixture missing a section still deserializes; absent sections stay null.
/// </summary>
record TestCVData
{
    /// <summary>Identifier of the fixture.</summary>
    public string? CvId { get; init; }

    /// <summary>Category label attached to the fixture.</summary>
    public string? Category { get; init; }

    /// <summary>Flags the fixture is expected to raise.</summary>
    public List<string>? ExpectedFlags { get; init; }

    /// <summary>Nested personal/contact details of the candidate.</summary>
    public TestPersonalData? Personal { get; init; }

    /// <summary>Free-text profile of the candidate.</summary>
    public string? Profile { get; init; }

    /// <summary>Employment history entries.</summary>
    public List<TestEmploymentEntry>? Employment { get; init; }

    /// <summary>Education history entries.</summary>
    public List<TestEducationEntry>? Education { get; init; }

    /// <summary>Skills listed on the CV.</summary>
    public List<string>? Skills { get; init; }
}
|
||||
|
||||
/// <summary>
/// Personal/contact section of a test CV fixture (the nested "personal" object).
/// All members are optional.
/// </summary>
record TestPersonalData
{
    /// <summary>Candidate's name as written in the fixture.</summary>
    public string? Name { get; init; }

    /// <summary>Contact e-mail address.</summary>
    public string? Email { get; init; }

    /// <summary>Contact phone number.</summary>
    public string? Phone { get; init; }

    /// <summary>Postal address.</summary>
    public string? Address { get; init; }

    // NOTE(review): under a snake_case naming policy this property presumably
    // maps to "linked_in" rather than "linkedin" - confirm against the fixture files.
    /// <summary>LinkedIn reference.</summary>
    public string? LinkedIn { get; init; }
}
|
||||
|
||||
/// <summary>
/// One employment entry of a test CV fixture. Dates are kept as the raw
/// strings found in the JSON; all members are optional.
/// </summary>
record TestEmploymentEntry
{
    /// <summary>Employer name as claimed on the CV.</summary>
    public string? Company { get; init; }

    /// <summary>Job title held.</summary>
    public string? JobTitle { get; init; }

    /// <summary>Raw start date text.</summary>
    public string? StartDate { get; init; }

    /// <summary>Raw end date text.</summary>
    public string? EndDate { get; init; }

    /// <summary>Location of the role.</summary>
    public string? Location { get; init; }

    /// <summary>Free-text description of the role.</summary>
    public string? Description { get; init; }

    /// <summary>Achievements listed for the role.</summary>
    public List<string>? Achievements { get; init; }
}
|
||||
|
||||
/// <summary>
/// One education entry of a test CV fixture. Dates are kept as the raw
/// strings found in the JSON; all members are optional.
/// </summary>
record TestEducationEntry
{
    /// <summary>Institution name as claimed on the CV.</summary>
    public string? Institution { get; init; }

    /// <summary>Qualification claimed.</summary>
    public string? Qualification { get; init; }

    /// <summary>Subject studied.</summary>
    public string? Subject { get; init; }

    /// <summary>Classification awarded.</summary>
    public string? Classification { get; init; }

    /// <summary>Raw start date text.</summary>
    public string? StartDate { get; init; }

    /// <summary>Raw end date text.</summary>
    public string? EndDate { get; init; }
}
|
||||
|
||||
class Program
|
||||
{
|
||||
private static StreamWriter? _logWriter;
|
||||
|
||||
/// <summary>
/// Serializer settings used when reading test CV JSON files: snake_case
/// property names, case-insensitive matching, and enums accepted as strings.
/// </summary>
private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
{
    PropertyNamingPolicy = JsonNamingPolicy.SnakeCaseLower,
    PropertyNameCaseInsensitive = true,
    Converters = { new JsonStringEnumConverter() },
};
|
||||
|
||||
static async Task<int> Main(string[] args)
|
||||
{
|
||||
var folderPath = args.FirstOrDefault() ?? AskForFolder();
|
||||
|
||||
if (string.IsNullOrEmpty(folderPath) || !Directory.Exists(folderPath))
|
||||
{
|
||||
Console.WriteLine($"Error: Folder not found: {folderPath}");
|
||||
Console.WriteLine("Usage: CVBatchTester <folder-path>");
|
||||
Console.WriteLine(" e.g. CVBatchTester /home/user/cvs");
|
||||
Log($"Error: Folder not found: {folderPath}");
|
||||
Log("Usage: CVBatchTester <folder-path> [--output <file>]");
|
||||
Log(" e.g. CVBatchTester /home/user/cvs");
|
||||
Log(" e.g. CVBatchTester /home/user/cvs --output /tmp/results.log");
|
||||
return 1;
|
||||
}
|
||||
|
||||
Console.WriteLine($"CV Batch Verification Tester");
|
||||
Console.WriteLine($"Processing CVs from: {folderPath}");
|
||||
Console.WriteLine(new string('=', 80));
|
||||
// Check for --output flag
|
||||
var outputIndex = Array.IndexOf(args, "--output");
|
||||
var logPath = outputIndex >= 0 && outputIndex < args.Length - 1
|
||||
? args[outputIndex + 1]
|
||||
: Path.Combine(folderPath, $"batch-results-{DateTime.Now:yyyyMMdd-HHmmss}.log");
|
||||
|
||||
_logWriter = new StreamWriter(logPath, false) { AutoFlush = true };
|
||||
|
||||
Log($"CV Batch Verification Tester");
|
||||
Log($"Processing CVs from: {folderPath}");
|
||||
Log($"Output log: {logPath}");
|
||||
Log($"Started: {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
|
||||
Log(new string('=', 80));
|
||||
|
||||
// Setup DI
|
||||
var services = new ServiceCollection();
|
||||
@@ -39,15 +103,16 @@ class Program
|
||||
var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
|
||||
.Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
|
||||
f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase) ||
|
||||
f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
|
||||
f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase) ||
|
||||
f.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
|
||||
.OrderBy(f => f)
|
||||
.ToList();
|
||||
|
||||
Console.WriteLine($"Found {cvFiles.Count} CV files\n");
|
||||
Log($"Found {cvFiles.Count} CV files\n");
|
||||
|
||||
if (cvFiles.Count == 0)
|
||||
{
|
||||
Console.WriteLine("No CV files found (.pdf, .docx, .doc)");
|
||||
Log("No CV files found (.pdf, .docx, .doc, .json)");
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -63,9 +128,9 @@ class Program
|
||||
|
||||
foreach (var cvFile in cvFiles)
|
||||
{
|
||||
Console.WriteLine($"\n{new string('=', 80)}");
|
||||
Console.WriteLine($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
|
||||
Console.WriteLine(new string('=', 80));
|
||||
Log($"\n{new string('=', 80)}");
|
||||
Log($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
|
||||
Log(new string('=', 80));
|
||||
|
||||
try
|
||||
{
|
||||
@@ -74,17 +139,30 @@ class Program
|
||||
var companyVerifier = scope.ServiceProvider.GetRequiredService<ICompanyVerifierService>();
|
||||
var eduVerifier = scope.ServiceProvider.GetRequiredService<IEducationVerifierService>();
|
||||
|
||||
// Parse CV
|
||||
await using var stream = File.OpenRead(cvFile);
|
||||
var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
|
||||
// Parse CV - handle JSON files differently
|
||||
CVData cv;
|
||||
if (cvFile.EndsWith(".json", StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
var jsonContent = await File.ReadAllTextAsync(cvFile);
|
||||
var testCv = JsonSerializer.Deserialize<TestCVData>(jsonContent, JsonOptions)
|
||||
?? throw new InvalidOperationException("Failed to deserialize JSON CV");
|
||||
|
||||
Console.WriteLine($"Candidate: {cv.FullName}");
|
||||
// Convert TestCVData to CVData
|
||||
cv = ConvertTestCVData(testCv);
|
||||
Log($"Loaded JSON CV: {cv.FullName}");
|
||||
}
|
||||
else
|
||||
{
|
||||
await using var stream = File.OpenRead(cvFile);
|
||||
cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
|
||||
Log($"Parsed CV: {cv.FullName}");
|
||||
}
|
||||
|
||||
// Verify Employers
|
||||
if (cv.Employment?.Count > 0)
|
||||
{
|
||||
Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
|
||||
Console.WriteLine(new string('-', 60));
|
||||
Log($"\nEMPLOYERS ({cv.Employment.Count}):");
|
||||
Log(new string('-', 60));
|
||||
|
||||
foreach (var emp in cv.Employment)
|
||||
{
|
||||
@@ -100,18 +178,18 @@ class Program
|
||||
var icon = result.IsVerified ? "✓" : "✗";
|
||||
var period = FormatPeriod(emp.StartDate, emp.EndDate);
|
||||
|
||||
Console.WriteLine($"\n {icon} {emp.CompanyName}");
|
||||
Console.WriteLine($" Period: {period}");
|
||||
Console.WriteLine($" Role: {emp.JobTitle}");
|
||||
Log($"\n {icon} {emp.CompanyName}");
|
||||
Log($" Period: {period}");
|
||||
Log($" Role: {emp.JobTitle}");
|
||||
|
||||
if (result.IsVerified)
|
||||
{
|
||||
verifiedEmployers++;
|
||||
Console.WriteLine($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
|
||||
Log($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
|
||||
if (!string.IsNullOrEmpty(result.MatchedCompanyNumber))
|
||||
Console.WriteLine($" Company #: {result.MatchedCompanyNumber}");
|
||||
Log($" Company #: {result.MatchedCompanyNumber}");
|
||||
if (!string.IsNullOrEmpty(result.CompanyStatus))
|
||||
Console.WriteLine($" Status: {result.CompanyStatus}");
|
||||
Log($" Status: {result.CompanyStatus}");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -119,12 +197,12 @@ class Program
|
||||
}
|
||||
|
||||
if (!string.IsNullOrEmpty(result.VerificationNotes))
|
||||
Console.WriteLine($" Note: {result.VerificationNotes}");
|
||||
Log($" Note: {result.VerificationNotes}");
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
Console.WriteLine($"\n ✗ {emp.CompanyName}");
|
||||
Console.WriteLine($" ERROR: {ex.Message}");
|
||||
Log($"\n ✗ {emp.CompanyName}");
|
||||
Log($" ERROR: {ex.Message}");
|
||||
allUnverifiedEmployers.Add(emp.CompanyName);
|
||||
}
|
||||
}
|
||||
@@ -133,8 +211,8 @@ class Program
|
||||
// Verify Education
|
||||
if (cv.Education?.Count > 0)
|
||||
{
|
||||
Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");
|
||||
Console.WriteLine(new string('-', 60));
|
||||
Log($"\nEDUCATION ({cv.Education.Count}):");
|
||||
Log(new string('-', 60));
|
||||
|
||||
var eduEntries = cv.Education.Select(e => new EducationEntry
|
||||
{
|
||||
@@ -152,10 +230,10 @@ class Program
|
||||
totalEducation++;
|
||||
var icon = result.IsVerified ? "✓" : "✗";
|
||||
|
||||
Console.WriteLine($"\n {icon} {result.ClaimedInstitution}");
|
||||
Console.WriteLine($" Qualification: {result.ClaimedQualification}");
|
||||
Log($"\n {icon} {result.ClaimedInstitution}");
|
||||
Log($" Qualification: {result.ClaimedQualification}");
|
||||
if (!string.IsNullOrEmpty(result.ClaimedSubject))
|
||||
Console.WriteLine($" Subject: {result.ClaimedSubject}");
|
||||
Log($" Subject: {result.ClaimedSubject}");
|
||||
|
||||
if (result.IsVerified)
|
||||
{
|
||||
@@ -163,41 +241,41 @@ class Program
|
||||
if (result.MatchedInstitution != null &&
|
||||
!result.MatchedInstitution.Equals(result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
Console.WriteLine($" Match: {result.MatchedInstitution}");
|
||||
Log($" Match: {result.MatchedInstitution}");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
|
||||
Console.WriteLine($" Status: {result.Status}");
|
||||
Log($" Status: {result.Status}");
|
||||
}
|
||||
|
||||
if (!string.IsNullOrEmpty(result.VerificationNotes))
|
||||
Console.WriteLine($" Note: {result.VerificationNotes}");
|
||||
Log($" Note: {result.VerificationNotes}");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
errorCount++;
|
||||
Console.WriteLine($"ERROR processing file: {ex.Message}");
|
||||
Log($"ERROR processing file: {ex.Message}");
|
||||
}
|
||||
}
|
||||
|
||||
// Print Summary
|
||||
Console.WriteLine($"\n\n{new string('=', 80)}");
|
||||
Console.WriteLine("VERIFICATION SUMMARY");
|
||||
Console.WriteLine(new string('=', 80));
|
||||
Log($"\n\n{new string('=', 80)}");
|
||||
Log("VERIFICATION SUMMARY");
|
||||
Log(new string('=', 80));
|
||||
|
||||
Console.WriteLine($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}");
|
||||
Log($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}");
|
||||
if (errorCount > 0)
|
||||
Console.WriteLine($"Errors: {errorCount}");
|
||||
Log($"Errors: {errorCount}");
|
||||
|
||||
var empRate = totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0;
|
||||
var eduRate = totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0;
|
||||
|
||||
Console.WriteLine($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)");
|
||||
Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)");
|
||||
Log($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)");
|
||||
Log($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)");
|
||||
|
||||
// List unverified employers
|
||||
var uniqueUnverifiedEmployers = allUnverifiedEmployers
|
||||
@@ -208,12 +286,12 @@ class Program
|
||||
|
||||
if (uniqueUnverifiedEmployers.Count > 0)
|
||||
{
|
||||
Console.WriteLine($"\n{new string('-', 60)}");
|
||||
Console.WriteLine($"UNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count} unique):");
|
||||
Log($"\n{new string('-', 60)}");
|
||||
Log($"UNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count} unique):");
|
||||
foreach (var group in uniqueUnverifiedEmployers)
|
||||
{
|
||||
var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
|
||||
Console.WriteLine($" - {group.Key}{count}");
|
||||
Log($" - {group.Key}{count}");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -226,19 +304,30 @@ class Program
|
||||
|
||||
if (uniqueUnverifiedInstitutions.Count > 0)
|
||||
{
|
||||
Console.WriteLine($"\n{new string('-', 60)}");
|
||||
Console.WriteLine($"UNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count} unique):");
|
||||
Log($"\n{new string('-', 60)}");
|
||||
Log($"UNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count} unique):");
|
||||
foreach (var group in uniqueUnverifiedInstitutions)
|
||||
{
|
||||
var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
|
||||
Console.WriteLine($" - {group.Key}{count}");
|
||||
Log($" - {group.Key}{count}");
|
||||
}
|
||||
}
|
||||
|
||||
Console.WriteLine($"\n{new string('=', 80)}");
|
||||
Log($"\nCompleted: {DateTime.Now:yyyy-MM-dd HH:mm:ss}");
|
||||
Log($"\n{new string('=', 80)}");
|
||||
|
||||
_logWriter?.Close();
|
||||
Console.WriteLine($"\nResults written to: {logPath}");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// <summary>
/// Writes a line to the console and, when a log file writer has been opened,
/// mirrors the same line to it.
/// </summary>
/// <param name="message">Text to emit.</param>
static void Log(string message)
{
    Console.WriteLine(message);

    // The writer is null until the output file has been opened; console-only until then.
    if (_logWriter is { } fileWriter)
    {
        fileWriter.WriteLine(message);
    }
}
|
||||
|
||||
static string AskForFolder()
|
||||
{
|
||||
Console.Write("Enter CV folder path: ");
|
||||
@@ -252,6 +341,57 @@ class Program
|
||||
return $"{startStr} - {endStr}";
|
||||
}
|
||||
|
||||
/// <summary>
/// Maps a test CV fixture onto the <c>CVData</c> model used by the verifiers.
/// Missing names fall back to "Unknown" and missing sections become empty lists.
/// </summary>
/// <param name="testCv">Deserialized fixture; must not be null.</param>
/// <returns>A populated <c>CVData</c> instance.</returns>
/// <exception cref="ArgumentNullException"><paramref name="testCv"/> is null.</exception>
static CVData ConvertTestCVData(TestCVData testCv)
{
    ArgumentNullException.ThrowIfNull(testCv);

    return new CVData
    {
        FullName = testCv.Personal?.Name ?? "Unknown",
        Email = testCv.Personal?.Email,
        Phone = testCv.Personal?.Phone,
        Employment = testCv.Employment?.Select(e => new EmploymentEntry
        {
            CompanyName = e.Company ?? "Unknown",
            JobTitle = e.JobTitle ?? "Unknown",
            Location = e.Location,
            StartDate = ParseDate(e.StartDate),
            EndDate = ParseDate(e.EndDate),
            // A role is current when no end date was supplied. ParseDate returns
            // null for an empty string as well as for null, so a bare null check
            // would leave EndDate null while IsCurrent stayed false.
            IsCurrent = string.IsNullOrWhiteSpace(e.EndDate),
            Description = e.Description
        }).ToList() ?? [],
        Education = testCv.Education?.Select(e => new EducationEntry
        {
            Institution = e.Institution ?? "Unknown",
            Qualification = e.Qualification,
            Subject = e.Subject,
            StartDate = ParseDate(e.StartDate),
            EndDate = ParseDate(e.EndDate)
        }).ToList() ?? [],
        Skills = testCv.Skills ?? []
    };
}
|
||||
|
||||
/// <summary>
/// Converts a date string from a test CV fixture into a <see cref="DateOnly"/>.
/// A seven-character "YYYY-MM" value maps to the first day of that month;
/// anything else is handed to <see cref="DateOnly.TryParse(string, out DateOnly)"/>.
/// </summary>
/// <param name="dateStr">Raw date text; may be null or empty.</param>
/// <returns>The parsed date, or null when the text is missing or not a valid date.</returns>
static DateOnly? ParseDate(string? dateStr)
{
    if (string.IsNullOrEmpty(dateStr))
    {
        return null;
    }

    // Try parsing YYYY-MM format
    if (dateStr.Length == 7 && dateStr[4] == '-')
    {
        // NumberStyles.None accepts digits only, so signs or padding inside the
        // year/month fields are not silently treated as numbers.
        var digitsOnly = System.Globalization.NumberStyles.None;
        var invariant = System.Globalization.CultureInfo.InvariantCulture;

        if (int.TryParse(dateStr[..4], digitsOnly, invariant, out var year) &&
            int.TryParse(dateStr[5..], digitsOnly, invariant, out var month) &&
            // The DateOnly constructor throws for values outside these ranges
            // (e.g. "2020-13"); reject them here so bad input yields null instead.
            year is >= 1 and <= 9999 &&
            month is >= 1 and <= 12)
        {
            return new DateOnly(year, month, 1);
        }
    }

    // Try standard parsing
    return DateOnly.TryParse(dateStr, out var parsed) ? parsed : null;
}
|
||||
|
||||
static void ConfigureServices(IServiceCollection services)
|
||||
{
|
||||
// Load configuration - try multiple locations
|
||||
@@ -263,7 +403,7 @@ class Program
|
||||
};
|
||||
|
||||
var webProjectPath = configPaths.FirstOrDefault(Directory.Exists) ?? "/git/RealCV/src/RealCV.Web";
|
||||
Console.WriteLine($"Loading config from: {webProjectPath}");
|
||||
Log($"Loading config from: {webProjectPath}");
|
||||
|
||||
var configuration = new ConfigurationBuilder()
|
||||
.SetBasePath(webProjectPath)
|
||||
@@ -272,11 +412,14 @@ class Program
|
||||
.AddJsonFile("appsettings.Production.json", optional: true)
|
||||
.Build();
|
||||
|
||||
// Logging - minimal output
|
||||
// Logging - show info level for verification details
|
||||
services.AddLogging(builder =>
|
||||
{
|
||||
builder.AddConsole();
|
||||
builder.SetMinimumLevel(LogLevel.Warning);
|
||||
builder.SetMinimumLevel(LogLevel.Information);
|
||||
// Filter out noisy libraries
|
||||
builder.AddFilter("Microsoft", LogLevel.Warning);
|
||||
builder.AddFilter("System", LogLevel.Warning);
|
||||
});
|
||||
|
||||
// Database
|
||||
|
||||
Reference in New Issue
Block a user