diff --git a/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs b/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs
index 097f4e9..466ecfb 100644
--- a/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs
+++ b/src/RealCV.Application/Interfaces/ICompanyNameMatcherService.cs
@@ -12,4 +12,13 @@ public interface ICompanyNameMatcherService
         string cvCompanyName,
         List candidates,
         CancellationToken cancellationToken = default);
+
+    ///
+    /// Uses AI to detect if a company name contains multiple companies and extract them.
+    /// Returns null or single-item list if it's a single company (e.g., "Ernst & Young").
+    /// Returns multiple items if compound (e.g., "ASDA/WALMART" -> ["ASDA", "WALMART"]).
+    ///
+    Task?> ExtractCompanyNamesAsync(
+        string companyName,
+        CancellationToken cancellationToken = default);
 }
diff --git a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
index b96d87a..11fb900 100644
--- a/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
+++ b/src/RealCV.Infrastructure/Services/AICompanyNameMatcherService.cs
@@ -207,4 +207,107 @@ public sealed class AICompanyNameMatcherService : ICompanyNameMatcherService
             return null; // Fall back to fuzzy matching
         }
     }
+
+    private const string CompoundNamePrompt = """
+        Analyze this company name from a CV and determine if it refers to ONE company or MULTIPLE companies.
+
+        Company name: "{COMPANY_NAME}"
+
+        Examples:
+        - "Ernst & Young" → ONE company (it's the full name of the accounting firm)
+        - "Marks & Spencer" → ONE company (it's the full name of the retailer)
+        - "ASDA/WALMART" → TWO companies: ["ASDA", "WALMART"] (person worked at both or it's showing ownership)
+        - "Corus & Laura Ashley Hotels" → TWO companies: ["Corus", "Laura Ashley Hotels"] (different industries)
+        - "PwC" → ONE company
+        - "Deloitte and Touche" → ONE company (historical name of Deloitte)
+        - "BMW Group Ireland" → ONE company
+        - "Tesco Stores and Distribution" → ONE company (departments of same company)
+
+        Rules:
+        1. Well-known company names with "&" or "and" are SINGLE companies (Ernst & Young, Marks & Spencer, Procter & Gamble)
+        2. A "/" usually indicates multiple companies or ownership relationship
+        3. If the parts are in completely different industries, they're likely separate companies
+        4. If one part is clearly a subsidiary/department of the other, treat as ONE company
+
+        Respond with ONLY valid JSON:
+        {
+        "isSingleCompany": boolean,
+        "companies": ["company1", "company2"] or ["single company name"],
+        "reasoning": "brief explanation"
+        }
+        """;
+
+    public async Task?> ExtractCompanyNamesAsync(
+        string companyName,
+        CancellationToken cancellationToken = default)
+    {
+        if (string.IsNullOrWhiteSpace(companyName))
+        {
+            return null;
+        }
+
+        _logger.LogDebug("Using AI to check if '{CompanyName}' is a compound name", companyName);
+
+        try
+        {
+            var prompt = CompoundNamePrompt.Replace("{COMPANY_NAME}", companyName);
+
+            var messages = new List
+            {
+                new(RoleType.User, prompt)
+            };
+
+            var parameters = new MessageParameters
+            {
+                Model = "claude-3-5-haiku-20241022",
+                MaxTokens = 256,
+                Messages = messages,
+                System = [new SystemMessage("You are a company name parser. Respond only with valid JSON.")]
+            };
+
+            var response = await _anthropicClient.Messages.GetClaudeMessageAsync(parameters, cancellationToken);
+
+            var responseText = response.Content
+                .OfType()
+                .FirstOrDefault()?.Text;
+
+            if (string.IsNullOrWhiteSpace(responseText))
+            {
+                _logger.LogWarning("AI returned empty response for compound name check");
+                return null;
+            }
+
+            responseText = JsonResponseHelper.CleanJsonResponse(responseText);
+
+            var result = JsonSerializer.Deserialize(responseText, JsonDefaults.CamelCase);
+
+            if (result is null)
+            {
+                _logger.LogWarning("Failed to deserialize compound name response: {Response}", responseText);
+                return null;
+            }
+
+            _logger.LogDebug("AI compound name result: IsSingle={IsSingle}, Companies=[{Companies}], Reasoning={Reasoning}",
+                result.IsSingleCompany, string.Join(", ", result.Companies ?? []), result.Reasoning);
+
+            if (result.IsSingleCompany || result.Companies is null || result.Companies.Count(c => !string.IsNullOrWhiteSpace(c)) < 2)
+            {
+                return null; // Single company (or fewer than two usable names), no splitting needed
+            }
+
+            return result.Companies.Where(c => !string.IsNullOrWhiteSpace(c)).Select(c => c.Trim()).ToList(); // model output is untrusted: drop null/blank entries before callers dereference them
+        }
+        catch (Exception ex) when (ex is not OperationCanceledException) // cancellation must propagate; it is not an AI failure and must not read as "single company"
+        {
+            _logger.LogError(ex, "AI compound name detection failed for '{CompanyName}'", companyName);
+            return null;
+        }
+    }
+
+    private sealed class CompoundNameResponse
+    {
+        public bool IsSingleCompany { get; set; }
+        public List? Companies { get; set; }
+        public string? Reasoning { get; set; }
+    }
 }
diff --git a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
index 3df890f..8a76440 100644
--- a/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
+++ b/src/RealCV.Infrastructure/Services/CompanyVerifierService.cs
@@ -226,6 +226,14 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
             };
         }
 
+        // Check 4: Is this a compound company name (e.g., "ASDA/WALMART", "Corus & Laura Ashley Hotels")? 
+        // Try to verify each part individually
+        var compoundResult = await TryVerifyCompoundNameAsync(normalizedName, companyName, startDate, endDate, jobTitle, flags);
+        if (compoundResult is not null)
+        {
+            return compoundResult;
+        }
+
         // Try to find a cached match first (but only if it existed at claimed start date)
         var cachedMatch = await FindCachedMatchAsync(normalizedName);
         if (cachedMatch is not null)
@@ -833,6 +841,70 @@ public sealed class CompanyVerifierService : ICompanyVerifierService
         return normalized;
     }
 
+    ///
+    /// Attempts to verify compound company names by detecting if multiple companies are mentioned.
+    /// Only triggers for names with potential separators (/, &, "and") to avoid unnecessary AI calls.
+    ///
+    private async Task TryVerifyCompoundNameAsync(
+        string normalizedName,
+        string originalName,
+        DateOnly? startDate,
+        DateOnly? endDate,
+        string? jobTitle,
+        List flags)
+    {
+        // Quick check: only process names that might be compound
+        // Look for separators that could indicate multiple companies
+        var hasPotentialSeparator = normalizedName.Contains('/')
+            || normalizedName.Contains(" & ")
+            || normalizedName.Contains(" and ", StringComparison.OrdinalIgnoreCase);
+
+        if (!hasPotentialSeparator)
+        {
+            return null;
+        }
+
+        // Use AI to determine if this is a compound name and extract parts
+        var extractedParts = await _aiMatcher.ExtractCompanyNamesAsync(normalizedName);
+
+        if (extractedParts is null || extractedParts.Count < 2)
+        {
+            // AI determined this is a single company (e.g., "Ernst & Young")
+            return null;
+        }
+
+        _logger.LogDebug("AI detected compound company name '{Name}', extracted parts: {Parts}",
+            originalName, string.Join(", ", extractedParts.Select(p => $"'{p}'")));
+
+        // Try to verify each extracted part - return success on first match
+        foreach (var part in extractedParts)
+        {
+            // Skip null/blank or too-short parts, and any part that merely echoes the input (verifying it would re-enter this method with the same name and never terminate)
+            if (string.IsNullOrWhiteSpace(part) || part.Length < 3 || part.Equals(normalizedName, StringComparison.OrdinalIgnoreCase) || part.Equals(originalName, StringComparison.OrdinalIgnoreCase)) continue;
+
+            _logger.LogDebug("Trying to verify compound part: '{Part}'", part);
+
+            // Recursively verify this part (goes back through VerifyCompanyAsync, so a part that still contains a separator is split again)
+            var partResult = await VerifyCompanyAsync(part, startDate, endDate, jobTitle);
+
+            if (partResult.IsVerified)
+            {
+                _logger.LogInformation("Compound name '{Original}' verified via part '{Part}' -> {Match}",
+                    originalName, part, partResult.MatchedCompanyName);
+
+                return partResult with
+                {
+                    ClaimedCompany = originalName,
+                    VerificationNotes = $"Verified via '{part}': {partResult.VerificationNotes ?? partResult.MatchedCompanyName}"
+                };
+            }
+        }
+
+        // None of the parts could be verified
+        _logger.LogDebug("No parts of compound name '{Name}' could be verified", originalName);
+        return null;
+    }
+
     private async Task FindCachedMatchAsync(string companyName)
     {
         var cutoffDate = DateTime.UtcNow.AddDays(-CacheExpirationDays);
diff --git a/tests/RealCV.Tests/Integration/CVBatchTester.cs b/tests/RealCV.Tests/Integration/CVBatchTester.cs
new file mode 100644
index 0000000..0f8aafe
--- /dev/null
+++ b/tests/RealCV.Tests/Integration/CVBatchTester.cs
@@ -0,0 +1,329 @@
+using System.Text.Json;
+using Microsoft.EntityFrameworkCore;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using RealCV.Application.Interfaces;
+using RealCV.Application.Models;
+using RealCV.Infrastructure.Data;
+using RealCV.Infrastructure.ExternalApis;
+using RealCV.Infrastructure.Services;
+using RealCV.Infrastructure.Configuration;
+
+namespace RealCV.Tests.Integration;
+
+///
+/// Test utility to batch process CVs and output verification findings.
+/// Run with: dotnet test --filter "FullyQualifiedName~CVBatchTester" -- TestRunParameters.Parameter(name=\"CvFolder\", value=\"/path/to/cvs\")
+/// Or use the ProcessFolderAsync method directly. 
+///
+public class CVBatchTester
+{
+    private readonly IServiceProvider _serviceProvider;
+
+    public CVBatchTester()
+    {
+        var services = new ServiceCollection();
+        ConfigureServices(services);
+        _serviceProvider = services.BuildServiceProvider();
+    }
+
+    private static void ConfigureServices(IServiceCollection services)
+    {
+        // Load configuration (JSON files in the working directory, then environment variables, which override them)
+        var configuration = new ConfigurationBuilder()
+            .SetBasePath(Directory.GetCurrentDirectory())
+            .AddJsonFile("appsettings.json", optional: true)
+            .AddJsonFile("appsettings.Development.json", optional: true)
+            .AddEnvironmentVariables()
+            .Build();
+
+        // Logging
+        services.AddLogging(builder =>
+        {
+            builder.AddConsole();
+            builder.SetMinimumLevel(LogLevel.Information);
+        });
+
+        // Database - fail fast when unconfigured; credentials must never be embedded in source as a fallback
+        var connectionString = configuration.GetConnectionString("DefaultConnection")
+            ?? throw new InvalidOperationException("Connection string 'DefaultConnection' is not configured. Set it in appsettings.json or via the ConnectionStrings__DefaultConnection environment variable.");
+
+        services.AddDbContextFactory(options =>
+            options.UseSqlServer(connectionString));
+
+        // Companies House
+        services.Configure(options =>
+        {
+            options.BaseUrl = configuration["CompaniesHouse:BaseUrl"] ?? "https://api.company-information.service.gov.uk";
+            options.ApiKey = configuration["CompaniesHouse:ApiKey"] ?? "";
+        });
+
+        services.AddHttpClient();
+
+        // Anthropic (for AI matching)
+        services.Configure(options =>
+        {
+            options.ApiKey = configuration["Anthropic:ApiKey"] ?? "";
+        });
+
+        services.AddHttpClient();
+        services.AddScoped();
+
+        // Services
+        services.AddScoped();
+        services.AddScoped();
+        services.AddScoped();
+    }
+
+    ///
+    /// Process all CVs in a folder and return verification results.
+    ///
+    public async Task> ProcessFolderAsync(string folderPath)
+    {
+        if (!Directory.Exists(folderPath))
+        {
+            throw new DirectoryNotFoundException($"Folder not found: {folderPath}");
+        }
+
+        var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
+            .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
+                f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase) ||
+                f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
+            .ToList();
+
+        Console.WriteLine($"Found {cvFiles.Count} CV files in {folderPath}");
+        Console.WriteLine(new string('=', 80));
+
+        var results = new List();
+
+        foreach (var cvFile in cvFiles)
+        {
+            Console.WriteLine($"\nProcessing: {Path.GetFileName(cvFile)}");
+            Console.WriteLine(new string('-', 60));
+
+            try
+            {
+                var result = await ProcessSingleCVAsync(cvFile);
+                results.Add(result);
+                PrintSummary(result);
+            }
+            catch (Exception ex)
+            {
+                Console.WriteLine($"ERROR: {ex.Message}");
+                results.Add(new CVVerificationSummary
+                {
+                    FileName = Path.GetFileName(cvFile),
+                    Error = ex.Message
+                });
+            }
+        }
+
+        // Print overall summary
+        Console.WriteLine("\n" + new string('=', 80));
+        Console.WriteLine("OVERALL SUMMARY");
+        Console.WriteLine(new string('=', 80));
+        PrintOverallSummary(results);
+
+        return results;
+    }
+
+    private async Task ProcessSingleCVAsync(string filePath)
+    {
+        using var scope = _serviceProvider.CreateScope();
+        var cvParser = scope.ServiceProvider.GetRequiredService();
+        var companyVerifier = scope.ServiceProvider.GetRequiredService();
+        var educationVerifier = scope.ServiceProvider.GetRequiredService();
+
+        // Parse the CV
+        await using var fileStream = File.OpenRead(filePath);
+        var parsedCV = await cvParser.ParseAsync(fileStream, Path.GetFileName(filePath));
+
+        var summary = new CVVerificationSummary
+        {
+            FileName = Path.GetFileName(filePath),
+            CandidateName = parsedCV.PersonalInfo?.FullName ?? "Unknown"
+        };
+
+        // Verify employers (a failure on one employer is recorded and does not stop the rest)
+        if (parsedCV.Employment?.Count > 0)
+        {
+            foreach (var employment in parsedCV.Employment)
+            {
+                try
+                {
+                    var result = await companyVerifier.VerifyCompanyAsync(
+                        employment.CompanyName,
+                        employment.StartDate,
+                        employment.EndDate,
+                        employment.JobTitle);
+
+                    summary.EmployerResults.Add(new EmployerVerificationSummary
+                    {
+                        ClaimedName = employment.CompanyName,
+                        MatchedName = result.MatchedCompanyName,
+                        CompanyNumber = result.MatchedCompanyNumber,
+                        IsVerified = result.IsVerified,
+                        MatchScore = result.MatchScore,
+                        Notes = result.VerificationNotes,
+                        Status = result.CompanyStatus
+                    });
+                }
+                catch (Exception ex)
+                {
+                    summary.EmployerResults.Add(new EmployerVerificationSummary
+                    {
+                        ClaimedName = employment.CompanyName,
+                        IsVerified = false,
+                        Notes = $"Error: {ex.Message}"
+                    });
+                }
+            }
+        }
+
+        // Verify education
+        if (parsedCV.Education?.Count > 0)
+        {
+            var educationResults = educationVerifier.VerifyAll(
+                parsedCV.Education.Select(e => new EducationEntry
+                {
+                    Institution = e.Institution,
+                    Qualification = e.Qualification,
+                    Subject = e.Subject,
+                    StartDate = e.StartDate,
+                    EndDate = e.EndDate
+                }).ToList());
+
+            foreach (var result in educationResults)
+            {
+                summary.EducationResults.Add(new EducationVerificationSummary
+                {
+                    ClaimedInstitution = result.ClaimedInstitution,
+                    MatchedInstitution = result.MatchedInstitution,
+                    Qualification = result.ClaimedQualification,
+                    IsVerified = result.IsVerified,
+                    Status = result.Status,
+                    Notes = result.VerificationNotes
+                });
+            }
+        }
+
+        return summary;
+    }
+
+    private static void PrintSummary(CVVerificationSummary summary)
+    {
+        Console.WriteLine($"Candidate: {summary.CandidateName}");
+
+        Console.WriteLine($"\n EMPLOYERS ({summary.EmployerResults.Count}):");
+        foreach (var emp in summary.EmployerResults)
+        {
+            var status = emp.IsVerified ? "✓" : "✗";
+            var matchInfo = emp.IsVerified
+                ? $"-> {emp.MatchedName} ({emp.MatchScore}%)"
+                : emp.Notes ?? "Not found";
+            Console.WriteLine($" {status} {emp.ClaimedName}");
+            Console.WriteLine($" {matchInfo}");
+        }
+
+        Console.WriteLine($"\n EDUCATION ({summary.EducationResults.Count}):");
+        foreach (var edu in summary.EducationResults)
+        {
+            var status = edu.IsVerified ? "✓" : "✗";
+            var matchInfo = edu.IsVerified && edu.MatchedInstitution != null
+                ? $"-> {edu.MatchedInstitution}"
+                : edu.Notes ?? edu.Status;
+            Console.WriteLine($" {status} {edu.ClaimedInstitution}");
+            Console.WriteLine($" {edu.Qualification}");
+            Console.WriteLine($" {matchInfo}");
+        }
+    }
+
+    private static void PrintOverallSummary(List results)
+    {
+        var successfulCVs = results.Count(r => r.Error == null);
+        var totalEmployers = results.Sum(r => r.EmployerResults.Count);
+        var verifiedEmployers = results.Sum(r => r.EmployerResults.Count(e => e.IsVerified));
+        var totalEducation = results.Sum(r => r.EducationResults.Count);
+        var verifiedEducation = results.Sum(r => r.EducationResults.Count(e => e.IsVerified));
+
+        Console.WriteLine($"CVs Processed: {successfulCVs}/{results.Count}");
+        Console.WriteLine($"Employers: {verifiedEmployers}/{totalEmployers} verified ({(totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0)}%)");
+        Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({(totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0)}%)");
+
+        // List unverified employers (grouped case-insensitively so "Acme Ltd" and "ACME LTD" count as one)
+        var unverifiedEmployers = results
+            .SelectMany(r => r.EmployerResults.Where(e => !e.IsVerified))
+            .GroupBy(e => e.ClaimedName, StringComparer.OrdinalIgnoreCase)
+            .OrderByDescending(g => g.Count())
+            .ToList();
+
+        if (unverifiedEmployers.Count > 0)
+        {
+            Console.WriteLine($"\nUNVERIFIED EMPLOYERS ({unverifiedEmployers.Count} unique):");
+            foreach (var group in unverifiedEmployers.Take(20))
+            {
+                Console.WriteLine($" - {group.Key} (x{group.Count()})");
+            }
+        }
+
+        // List unverified institutions (same case-insensitive grouping as employers)
+        var unverifiedEducation = results
+            .SelectMany(r => r.EducationResults.Where(e => !e.IsVerified))
+            .GroupBy(e => e.ClaimedInstitution, StringComparer.OrdinalIgnoreCase)
+            .OrderByDescending(g => g.Count())
+            .ToList();
+
+        if (unverifiedEducation.Count > 0)
+        {
+            Console.WriteLine($"\nUNVERIFIED INSTITUTIONS ({unverifiedEducation.Count} unique):");
+            foreach (var group in unverifiedEducation.Take(20))
+            {
+                Console.WriteLine($" - {group.Key} (x{group.Count()})");
+            }
+        }
+    }
+
+    ///
+    /// Export results to JSON for further analysis.
+    ///
+    public static void ExportToJson(List results, string outputPath)
+    {
+        var json = JsonSerializer.Serialize(results, new JsonSerializerOptions
+        {
+            WriteIndented = true
+        });
+        File.WriteAllText(outputPath, json);
+        Console.WriteLine($"\nResults exported to: {outputPath}");
+    }
+}
+
+public class CVVerificationSummary
+{
+    public string FileName { get; set; } = "";
+    public string CandidateName { get; set; } = "";
+    public string? Error { get; set; }
+    public List EmployerResults { get; set; } = new();
+    public List EducationResults { get; set; } = new();
+}
+
+public class EmployerVerificationSummary
+{
+    public string ClaimedName { get; set; } = "";
+    public string? MatchedName { get; set; }
+    public string? CompanyNumber { get; set; }
+    public bool IsVerified { get; set; }
+    public int MatchScore { get; set; }
+    public string? Notes { get; set; }
+    public string? Status { get; set; }
+}
+
+public class EducationVerificationSummary
+{
+    public string ClaimedInstitution { get; set; } = "";
+    public string? MatchedInstitution { get; set; }
+    public string? Qualification { get; set; }
+    public bool IsVerified { get; set; }
+    public string? Status { get; set; }
+    public string? Notes { get; set; }
+}
diff --git a/tools/CVBatchTester/CVBatchTester.csproj b/tools/CVBatchTester/CVBatchTester.csproj
new file mode 100644
index 0000000..76d44ee
--- /dev/null
+++ b/tools/CVBatchTester/CVBatchTester.csproj
@@ -0,0 +1,15 @@
+
+
+ Exe
+ net8.0
+ enable
+ enable
+
+
+
+
+
+
+
+
diff --git a/tools/CVBatchTester/Program.cs b/tools/CVBatchTester/Program.cs
new file mode 100644
index 0000000..1c2f6b3
--- /dev/null
+++ b/tools/CVBatchTester/Program.cs
@@ -0,0 +1,302 @@
+using System.Text.Json;
+using Microsoft.EntityFrameworkCore;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using RealCV.Application.Interfaces;
+using RealCV.Application.Models;
+using RealCV.Infrastructure.Configuration;
+using RealCV.Infrastructure.Data;
+using RealCV.Infrastructure.ExternalApis;
+using RealCV.Infrastructure.Services;
+
+namespace CVBatchTester;
+
+class Program
+{
+    static async Task Main(string[] args)
+    {
+        var folderPath = args.FirstOrDefault() ?? AskForFolder();
+
+        if (string.IsNullOrEmpty(folderPath) || !Directory.Exists(folderPath))
+        {
+            Console.WriteLine($"Error: Folder not found: {folderPath}");
+            Console.WriteLine("Usage: CVBatchTester ");
+            Console.WriteLine(" e.g. 
CVBatchTester /home/user/cvs");
+            return 1;
+        }
+
+        Console.WriteLine($"CV Batch Verification Tester");
+        Console.WriteLine($"Processing CVs from: {folderPath}");
+        Console.WriteLine(new string('=', 80));
+
+        // Setup DI
+        var services = new ServiceCollection();
+        ConfigureServices(services);
+        await using var provider = services.BuildServiceProvider(); // dispose the container when Main exits so its disposables are released and queued console log output is flushed
+
+        // Find CV files
+        var cvFiles = Directory.GetFiles(folderPath, "*.*", SearchOption.TopDirectoryOnly)
+            .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
+                f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase) ||
+                f.EndsWith(".doc", StringComparison.OrdinalIgnoreCase))
+            .OrderBy(f => f)
+            .ToList();
+
+        Console.WriteLine($"Found {cvFiles.Count} CV files\n");
+
+        if (cvFiles.Count == 0)
+        {
+            Console.WriteLine("No CV files found (.pdf, .docx, .doc)");
+            return 1;
+        }
+
+        // Track results across all CVs for the final summary
+        var allUnverifiedEmployers = new List();
+        var allUnverifiedInstitutions = new List();
+        var totalEmployers = 0;
+        var verifiedEmployers = 0;
+        var totalEducation = 0;
+        var verifiedEducation = 0;
+        var processedCount = 0;
+        var errorCount = 0;
+
+        foreach (var cvFile in cvFiles)
+        {
+            Console.WriteLine($"\n{new string('=', 80)}");
+            Console.WriteLine($"[{++processedCount}/{cvFiles.Count}] {Path.GetFileName(cvFile)}");
+            Console.WriteLine(new string('=', 80));
+
+            try
+            {
+                using var scope = provider.CreateScope();
+                var parser = scope.ServiceProvider.GetRequiredService();
+                var companyVerifier = scope.ServiceProvider.GetRequiredService();
+                var eduVerifier = scope.ServiceProvider.GetRequiredService();
+
+                // Parse CV
+                await using var stream = File.OpenRead(cvFile);
+                var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
+
+                Console.WriteLine($"Candidate: {cv.FullName}");
+
+                // Verify Employers (each employer has its own try/catch so one failure does not skip the rest)
+                if (cv.Employment?.Count > 0)
+                {
+                    Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
+                    Console.WriteLine(new string('-', 60));
+
+                    foreach (var emp in cv.Employment)
+                    {
+                        totalEmployers++;
+                        try
+                        {
+                            var result = await companyVerifier.VerifyCompanyAsync(
+                                emp.CompanyName,
+                                emp.StartDate,
+                                emp.EndDate,
+                                emp.JobTitle);
+
+                            var icon = result.IsVerified ? "✓" : "✗";
+                            var period = FormatPeriod(emp.StartDate, emp.EndDate);
+
+                            Console.WriteLine($"\n {icon} {emp.CompanyName}");
+                            Console.WriteLine($" Period: {period}");
+                            Console.WriteLine($" Role: {emp.JobTitle}");
+
+                            if (result.IsVerified)
+                            {
+                                verifiedEmployers++;
+                                Console.WriteLine($" Match: {result.MatchedCompanyName} ({result.MatchScore}%)");
+                                if (!string.IsNullOrEmpty(result.MatchedCompanyNumber))
+                                    Console.WriteLine($" Company #: {result.MatchedCompanyNumber}");
+                                if (!string.IsNullOrEmpty(result.CompanyStatus))
+                                    Console.WriteLine($" Status: {result.CompanyStatus}");
+                            }
+                            else
+                            {
+                                allUnverifiedEmployers.Add(emp.CompanyName);
+                            }
+
+                            if (!string.IsNullOrEmpty(result.VerificationNotes))
+                                Console.WriteLine($" Note: {result.VerificationNotes}");
+                        }
+                        catch (Exception ex)
+                        {
+                            Console.WriteLine($"\n ✗ {emp.CompanyName}");
+                            Console.WriteLine($" ERROR: {ex.Message}");
+                            allUnverifiedEmployers.Add(emp.CompanyName);
+                        }
+                    }
+                }
+
+                // Verify Education
+                if (cv.Education?.Count > 0)
+                {
+                    Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");
+                    Console.WriteLine(new string('-', 60));
+
+                    var eduEntries = cv.Education.Select(e => new EducationEntry
+                    {
+                        Institution = e.Institution,
+                        Qualification = e.Qualification,
+                        Subject = e.Subject,
+                        StartDate = e.StartDate,
+                        EndDate = e.EndDate
+                    }).ToList();
+
+                    var eduResults = eduVerifier.VerifyAll(eduEntries);
+
+                    foreach (var result in eduResults)
+                    {
+                        totalEducation++;
+                        var icon = result.IsVerified ? "✓" : "✗";
+
+                        Console.WriteLine($"\n {icon} {result.ClaimedInstitution}");
+                        Console.WriteLine($" Qualification: {result.ClaimedQualification}");
+                        if (!string.IsNullOrEmpty(result.ClaimedSubject))
+                            Console.WriteLine($" Subject: {result.ClaimedSubject}");
+
+                        if (result.IsVerified)
+                        {
+                            verifiedEducation++;
+                            if (result.MatchedInstitution != null &&
+                                !result.MatchedInstitution.Equals(result.ClaimedInstitution, StringComparison.OrdinalIgnoreCase))
+                            {
+                                Console.WriteLine($" Match: {result.MatchedInstitution}");
+                            }
+                        }
+                        else
+                        {
+                            allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
+                            Console.WriteLine($" Status: {result.Status}");
+                        }
+
+                        if (!string.IsNullOrEmpty(result.VerificationNotes))
+                            Console.WriteLine($" Note: {result.VerificationNotes}");
+                    }
+                }
+            }
+            catch (Exception ex)
+            {
+                errorCount++;
+                Console.WriteLine($"ERROR processing file: {ex.Message}");
+            }
+        }
+
+        // Print Summary
+        Console.WriteLine($"\n\n{new string('=', 80)}");
+        Console.WriteLine("VERIFICATION SUMMARY");
+        Console.WriteLine(new string('=', 80));
+
+        Console.WriteLine($"\nCVs Processed: {processedCount - errorCount}/{cvFiles.Count}");
+        if (errorCount > 0)
+            Console.WriteLine($"Errors: {errorCount}");
+
+        var empRate = totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0;
+        var eduRate = totalEducation > 0 ? 
verifiedEducation * 100 / totalEducation : 0;
+
+        Console.WriteLine($"\nEmployers: {verifiedEmployers}/{totalEmployers} verified ({empRate}%)");
+        Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({eduRate}%)");
+
+        // List unverified employers (case-insensitive grouping, most frequent first)
+        var uniqueUnverifiedEmployers = allUnverifiedEmployers
+            .GroupBy(e => e, StringComparer.OrdinalIgnoreCase)
+            .OrderByDescending(g => g.Count())
+            .ThenBy(g => g.Key)
+            .ToList();
+
+        if (uniqueUnverifiedEmployers.Count > 0)
+        {
+            Console.WriteLine($"\n{new string('-', 60)}");
+            Console.WriteLine($"UNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count} unique):");
+            foreach (var group in uniqueUnverifiedEmployers)
+            {
+                var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
+                Console.WriteLine($" - {group.Key}{count}");
+            }
+        }
+
+        // List unverified institutions (same grouping as employers)
+        var uniqueUnverifiedInstitutions = allUnverifiedInstitutions
+            .GroupBy(i => i, StringComparer.OrdinalIgnoreCase)
+            .OrderByDescending(g => g.Count())
+            .ThenBy(g => g.Key)
+            .ToList();
+
+        if (uniqueUnverifiedInstitutions.Count > 0)
+        {
+            Console.WriteLine($"\n{new string('-', 60)}");
+            Console.WriteLine($"UNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count} unique):");
+            foreach (var group in uniqueUnverifiedInstitutions)
+            {
+                var count = group.Count() > 1 ? $" (x{group.Count()})" : "";
+                Console.WriteLine($" - {group.Key}{count}");
+            }
+        }
+
+        Console.WriteLine($"\n{new string('=', 80)}");
+        return 0;
+    }
+
+    static string AskForFolder()
+    {
+        Console.Write("Enter CV folder path: ");
+        return Console.ReadLine() ?? "";
+    }
+
+    static string FormatPeriod(DateOnly? start, DateOnly? end)
+    {
+        var startStr = start?.ToString("MMM yyyy") ?? "?";
+        var endStr = end?.ToString("MMM yyyy") ?? "Present";
+        return $"{startStr} - {endStr}";
+    }
+
+    static void ConfigureServices(IServiceCollection services)
+    {
+        // Load configuration - try multiple locations (first existing directory wins)
+        var configPaths = new[]
+        {
+            "/var/www/realcv",
+            "/git/RealCV/src/RealCV.Web",
+            Path.GetFullPath(Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "..", "..", "src", "RealCV.Web"))
+        };
+
+        var webProjectPath = configPaths.FirstOrDefault(Directory.Exists) ?? "/git/RealCV/src/RealCV.Web";
+        Console.WriteLine($"Loading config from: {webProjectPath}");
+
+        var configuration = new ConfigurationBuilder()
+            .SetBasePath(webProjectPath)
+            .AddJsonFile("appsettings.json", optional: true)
+            .AddJsonFile("appsettings.Development.json", optional: true)
+            .AddJsonFile("appsettings.Production.json", optional: true)
+            .Build();
+
+        // Logging - minimal output
+        services.AddLogging(builder =>
+        {
+            builder.AddConsole();
+            builder.SetMinimumLevel(LogLevel.Warning);
+        });
+
+        // Database - fail fast when unconfigured; credentials must never be embedded in source as a fallback
+        var connectionString = configuration.GetConnectionString("DefaultConnection")
+            ?? throw new InvalidOperationException($"Connection string 'DefaultConnection' is not configured in the appsettings files under '{webProjectPath}'.");
+
+        services.AddDbContextFactory(options =>
+            options.UseSqlServer(connectionString));
+
+        // Companies House - use configuration binding
+        services.Configure(configuration.GetSection(CompaniesHouseSettings.SectionName));
+        services.AddHttpClient();
+
+        // Anthropic - use configuration binding
+        services.Configure(configuration.GetSection(AnthropicSettings.SectionName));
+        services.AddScoped();
+
+        // Services
+        services.AddScoped();
+        services.AddScoped();
+        services.AddScoped();
+    }
+}
diff --git a/tools/batch-test-cvs.cs b/tools/batch-test-cvs.cs
new file mode 100644
index 0000000..179946b
--- /dev/null
+++ b/tools/batch-test-cvs.cs
@@ -0,0 +1,195 @@
+#!/usr/bin/env dotnet-script
+#r "nuget: Microsoft.EntityFrameworkCore.SqlServer, 8.0.0"
+#r "nuget: Microsoft.Extensions.Configuration.Json, 8.0.0"
+#r "nuget: Microsoft.Extensions.DependencyInjection, 8.0.0"
+#r "nuget: Microsoft.Extensions.Logging.Console, 8.0.0"
+#r "../src/RealCV.Application/bin/Debug/net8.0/RealCV.Application.dll"
+#r "../src/RealCV.Infrastructure/bin/Debug/net8.0/RealCV.Infrastructure.dll"
+#r "../src/RealCV.Domain/bin/Debug/net8.0/RealCV.Domain.dll"
+
+// This is a dotnet-script file. Run with: dotnet script batch-test-cvs.cs -- /path/to/cvs
+// Install dotnet-script: dotnet tool install -g dotnet-script
+
+using System;
+using System.IO;
+using System.Linq;
+using System.Threading.Tasks;
+using System.Collections.Generic;
+using System.Text.Json;
+using Microsoft.EntityFrameworkCore;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Logging;
+using RealCV.Application.Interfaces;
+using RealCV.Application.Models;
+using RealCV.Infrastructure.Data;
+using RealCV.Infrastructure.Services;
+using RealCV.Infrastructure.ExternalApis;
+using RealCV.Infrastructure.Configuration;
+
+var folderPath = Args.FirstOrDefault() ?? "/tmp/test-cvs";
+
+if (!Directory.Exists(folderPath))
+{
+    Console.WriteLine($"Error: Folder not found: {folderPath}");
+    Console.WriteLine("Usage: dotnet script batch-test-cvs.cs -- /path/to/cvs");
+    return 1;
+}
+
+Console.WriteLine($"Processing CVs from: {folderPath}");
+Console.WriteLine(new string('=', 80));
+
+// Setup DI
+var services = new ServiceCollection();
+
+var configuration = new ConfigurationBuilder()
+    .SetBasePath(Path.Combine(Directory.GetCurrentDirectory(), "../src/RealCV.Web"))
+    .AddJsonFile("appsettings.json", optional: true)
+    .AddJsonFile("appsettings.Development.json", optional: true)
+    .Build();
+
+services.AddLogging(b => b.AddConsole().SetMinimumLevel(LogLevel.Warning));
+
+services.AddDbContextFactory(options =>
+    options.UseSqlServer(configuration.GetConnectionString("DefaultConnection")));
+
+services.Configure(configuration.GetSection("CompaniesHouse"));
+services.Configure(configuration.GetSection("Anthropic"));
+
+services.AddHttpClient();
+services.AddHttpClient();
+services.AddScoped();
+services.AddScoped();
+services.AddScoped();
+services.AddScoped();
+
+var provider = services.BuildServiceProvider();
+
+var cvFiles = Directory.GetFiles(folderPath, "*.*")
+    .Where(f => f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase) ||
+        f.EndsWith(".docx", StringComparison.OrdinalIgnoreCase))
+    .ToList();
+
+Console.WriteLine($"Found {cvFiles.Count} CV files\n");
+
+var allUnverifiedEmployers = new List();
+var allUnverifiedInstitutions = new List();
+var totalEmployers = 0;
+var verifiedEmployers = 0;
+var totalEducation = 0;
+var verifiedEducation = 0;
+
+foreach (var cvFile in cvFiles)
+{
+    Console.WriteLine($"\n{new string('=', 80)}");
+    Console.WriteLine($"FILE: {Path.GetFileName(cvFile)}");
+    Console.WriteLine($"{new string('=', 80)}");
+
+    try
+    {
+        using var scope = provider.CreateScope();
+        var parser = scope.ServiceProvider.GetRequiredService();
+        var companyVerifier = scope.ServiceProvider.GetRequiredService();
+        var eduVerifier = scope.ServiceProvider.GetRequiredService();
+
+        await using var stream = File.OpenRead(cvFile);
+        var cv = await parser.ParseAsync(stream, Path.GetFileName(cvFile));
+
+        Console.WriteLine($"Candidate: {cv.PersonalInfo?.FullName ?? "Unknown"}");
+
+        // Employers
+        if (cv.Employment?.Count > 0)
+        {
+            Console.WriteLine($"\nEMPLOYERS ({cv.Employment.Count}):");
+            foreach (var emp in cv.Employment)
+            {
+                totalEmployers++;
+                var result = await companyVerifier.VerifyCompanyAsync(
+                    emp.CompanyName, emp.StartDate, emp.EndDate, emp.JobTitle);
+
+                var icon = result.IsVerified ? "✓" : "✗";
+                Console.WriteLine($" {icon} {emp.CompanyName}");
+
+                if (result.IsVerified)
+                {
+                    verifiedEmployers++;
+                    Console.WriteLine($" → {result.MatchedCompanyName} ({result.MatchScore}%)");
+                    if (!string.IsNullOrEmpty(result.VerificationNotes))
+                        Console.WriteLine($" Note: {result.VerificationNotes}");
+                }
+                else
+                {
+                    allUnverifiedEmployers.Add(emp.CompanyName);
+                    Console.WriteLine($" Note: {result.VerificationNotes ?? "Not found"}");
+                }
+            }
+        }
+
+        // Education
+        if (cv.Education?.Count > 0)
+        {
+            Console.WriteLine($"\nEDUCATION ({cv.Education.Count}):");
+            var eduEntries = cv.Education.Select(e => new EducationEntry
+            {
+                Institution = e.Institution,
+                Qualification = e.Qualification,
+                Subject = e.Subject,
+                StartDate = e.StartDate,
+                EndDate = e.EndDate
+            }).ToList();
+
+            var eduResults = eduVerifier.VerifyAll(eduEntries);
+            foreach (var result in eduResults)
+            {
+                totalEducation++;
+                var icon = result.IsVerified ? "✓" : "✗";
+                Console.WriteLine($" {icon} {result.ClaimedInstitution}");
+                Console.WriteLine($" {result.ClaimedQualification}");
+
+                if (result.IsVerified)
+                {
+                    verifiedEducation++;
+                    if (result.MatchedInstitution != null && result.MatchedInstitution != result.ClaimedInstitution)
+                        Console.WriteLine($" → {result.MatchedInstitution}");
+                }
+                else
+                {
+                    allUnverifiedInstitutions.Add(result.ClaimedInstitution ?? "Unknown");
+                    Console.WriteLine($" Status: {result.Status}");
+                    if (!string.IsNullOrEmpty(result.VerificationNotes))
+                        Console.WriteLine($" Note: {result.VerificationNotes}");
+                }
+            }
+        }
+    }
+    catch (Exception ex)
+    {
+        Console.WriteLine($"ERROR: {ex.Message}");
+    }
+}
+
+// Summary (banner lines use new string('=', 80); "{'=',-80}" would print a single '=' padded to 80 columns)
+Console.WriteLine($"\n\n{new string('=', 80)}");
+Console.WriteLine("SUMMARY");
+Console.WriteLine($"{new string('=', 80)}");
+Console.WriteLine($"CVs Processed: {cvFiles.Count}");
+Console.WriteLine($"Employers: {verifiedEmployers}/{totalEmployers} verified ({(totalEmployers > 0 ? verifiedEmployers * 100 / totalEmployers : 0)}%)");
+Console.WriteLine($"Education: {verifiedEducation}/{totalEducation} verified ({(totalEducation > 0 ? verifiedEducation * 100 / totalEducation : 0)}%)");
+
+var uniqueUnverifiedEmployers = allUnverifiedEmployers.Distinct().OrderBy(x => x).ToList();
+if (uniqueUnverifiedEmployers.Count > 0)
+{
+    Console.WriteLine($"\nUNVERIFIED EMPLOYERS ({uniqueUnverifiedEmployers.Count}):");
+    foreach (var emp in uniqueUnverifiedEmployers)
+        Console.WriteLine($" - {emp}");
+}
+
+var uniqueUnverifiedInstitutions = allUnverifiedInstitutions.Distinct().OrderBy(x => x).ToList();
+if (uniqueUnverifiedInstitutions.Count > 0)
+{
+    Console.WriteLine($"\nUNVERIFIED INSTITUTIONS ({uniqueUnverifiedInstitutions.Count}):");
+    foreach (var inst in uniqueUnverifiedInstitutions)
+        Console.WriteLine($" - {inst}");
+}
+
+return 0;