Files
RealCV/src/TrueCV.Infrastructure/Jobs/ProcessCVCheckJob.cs
peter be2f738e58 Deduplicate penalties for same company appearing multiple times
When a company appears multiple times in employment history (e.g.,
multiple roles at same company), penalties are now applied only once
per unique company, not per employment entry.

- Unverified company: -10 pts once per company (not per role)
- Company flags (incorporation date, etc.): once per (company, flag type)

Description now shows "(X roles)" when multiple instances exist.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-20 22:07:23 +01:00

706 lines
29 KiB
C#

using System.Text.Json;
using Microsoft.EntityFrameworkCore;
using Microsoft.Extensions.Logging;
using TrueCV.Application.Helpers;
using TrueCV.Application.Interfaces;
using TrueCV.Application.Models;
using TrueCV.Domain.Entities;
using TrueCV.Domain.Enums;
using TrueCV.Infrastructure.Data;
namespace TrueCV.Infrastructure.Jobs;
public sealed class ProcessCVCheckJob
{
// Collaborators; all are assigned once in the constructor.
private readonly ApplicationDbContext _dbContext;
private readonly IFileStorageService _fileStorageService;
private readonly ICVParserService _cvParserService;
private readonly ICompanyVerifierService _companyVerifierService;
private readonly IEducationVerifierService _educationVerifierService;
private readonly ITimelineAnalyserService _timelineAnalyserService;
private readonly IAuditService _auditService;
private readonly ILogger<ProcessCVCheckJob> _logger;
// Scoring constants. Scoring starts at BaseScore and each flag subtracts its penalty.
private const int BaseScore = 100;
// Applied once per unverified company (grouped case-insensitively by claimed name).
private const int UnverifiedCompanyPenalty = 10;
// NOTE(review): not referenced anywhere in this class - implausible job titles arrive as
// company flags carrying their own ScoreImpact. Confirm whether this constant is still needed.
private const int ImplausibleJobTitlePenalty = 15;
// NOTE(review): not referenced anywhere in this class; company flags use flag.ScoreImpact.
private const int CompanyVerificationFlagPenalty = 5; // Base penalty for company flags, actual from flag.ScoreImpact
// Applied per pair of consecutive roles at one company that jumps 3+ seniority levels in under 24 months.
private const int RapidProgressionPenalty = 10;
// Applied per role of seniority level 4+ that starts within 24 months of the latest education end date.
private const int EarlyCareerSeniorRolePenalty = 10;
// Gap penalty is GapMonthPenalty per month of gap, capped at MaxGapPenalty per gap.
private const int GapMonthPenalty = 1;
private const int MaxGapPenalty = 10;
// NOTE(review): not referenced anywhere in this class - overlaps are flagged with a ScoreImpact of 0.
private const int OverlapMonthPenalty = 2;
// Education penalties, applied per education verification result.
private const int DiplomaMillPenalty = 25;
private const int SuspiciousInstitutionPenalty = 15;
private const int UnverifiedEducationPenalty = 5;
private const int EducationDatePenalty = 10;
/// <summary>
/// Creates the job and stores every collaborator it is given in the matching private field.
/// </summary>
public ProcessCVCheckJob(
    ApplicationDbContext dbContext,
    IFileStorageService fileStorageService,
    ICVParserService cvParserService,
    ICompanyVerifierService companyVerifierService,
    IEducationVerifierService educationVerifierService,
    ITimelineAnalyserService timelineAnalyserService,
    IAuditService auditService,
    ILogger<ProcessCVCheckJob> logger)
{
    // Persistence and file access.
    (_dbContext, _fileStorageService) = (dbContext, fileStorageService);

    // CV parsing and the three verification/analysis services.
    (_cvParserService, _companyVerifierService) = (cvParserService, companyVerifierService);
    (_educationVerifierService, _timelineAnalyserService) = (educationVerifierService, timelineAnalyserService);

    // Auditing and diagnostics.
    (_auditService, _logger) = (auditService, logger);
}
/// <summary>
/// Runs the whole verification pipeline for one CV check: download, parse, employment,
/// director and education verification, timeline analysis, scoring, flag persistence and
/// report generation. The current step is written to <c>ProcessingStage</c> and saved
/// before each stage starts.
/// </summary>
/// <param name="cvCheckId">Identifier of the CV check row to process.</param>
/// <param name="cancellationToken">
/// Passed to the database calls, the CV parser and the director-claim step.
/// </param>
/// <remarks>
/// When the check cannot be found an error is logged and the method returns. Any exception
/// thrown inside the pipeline is logged, the check is marked Failed (saved with
/// <c>CancellationToken.None</c>) and the exception is rethrown. If saving the Failed status
/// raises <c>DbUpdateConcurrencyException</c>, a warning is logged and the method returns
/// without rethrowing.
/// </remarks>
public async Task ExecuteAsync(Guid cvCheckId, CancellationToken cancellationToken)
{
_logger.LogInformation("Starting CV check processing for: {CheckId}", cvCheckId);
var cvCheck = await _dbContext.CVChecks
.FirstOrDefaultAsync(c => c.Id == cvCheckId, cancellationToken);
if (cvCheck is null)
{
_logger.LogError("CV check not found: {CheckId}", cvCheckId);
return;
}
try
{
// Step 1: Update status to Processing
cvCheck.Status = CheckStatus.Processing;
cvCheck.ProcessingStage = "Downloading CV";
await _dbContext.SaveChangesAsync(cancellationToken);
_logger.LogDebug("CV check {CheckId} status updated to Processing", cvCheckId);
// Step 2: Download file from blob
// NOTE(review): the cancellation token is not forwarded to DownloadAsync - confirm whether
// the storage service offers an overload that accepts one.
await using var fileStream = await _fileStorageService.DownloadAsync(cvCheck.BlobUrl);
_logger.LogDebug("Downloaded CV file for check {CheckId}", cvCheckId);
// Step 3: Parse CV
cvCheck.ProcessingStage = "Parsing CV";
await _dbContext.SaveChangesAsync(cancellationToken);
var cvData = await _cvParserService.ParseAsync(fileStream, cvCheck.OriginalFileName, cancellationToken);
_logger.LogDebug(
"Parsed CV for check {CheckId}: {EmploymentCount} employment entries",
cvCheckId, cvData.Employment.Count);
// Step 4: Save extracted data
cvCheck.ExtractedDataJson = JsonSerializer.Serialize(cvData, JsonDefaults.CamelCaseIndented);
cvCheck.ProcessingStage = "Verifying Employment";
await _dbContext.SaveChangesAsync(cancellationToken);
// Step 5: Verify each employment entry. All calls are started together and awaited with
// Task.WhenAll.
// NOTE(review): no throttling is applied in this method; any rate limiting would have to
// happen inside the company verifier service - confirm. The cancellation token is not
// forwarded to VerifyCompanyAsync either.
// Skip freelance entries as they cannot be verified against company registries
var verificationTasks = cvData.Employment
.Where(e => !IsFreelance(e.CompanyName))
.Select(async employment =>
{
var result = await _companyVerifierService.VerifyCompanyAsync(
employment.CompanyName,
employment.StartDate,
employment.EndDate,
employment.JobTitle);
_logger.LogDebug(
"Verified {Company}: {IsVerified} (Score: {Score}%), JobTitle: {JobTitle}, Plausible: {Plausible}",
employment.CompanyName, result.IsVerified, result.MatchScore,
employment.JobTitle, result.JobTitlePlausible);
return result;
});
var verificationResults = (await Task.WhenAll(verificationTasks)).ToList();
// Add freelance entries as auto-verified (skipped).
// They are appended after the verified entries, so the result list is not in CV order.
foreach (var employment in cvData.Employment.Where(e => IsFreelance(e.CompanyName)))
{
verificationResults.Add(new CompanyVerificationResult
{
ClaimedCompany = employment.CompanyName,
IsVerified = true,
MatchScore = 100,
VerificationNotes = "Freelance/self-employed - verification skipped",
ClaimedJobTitle = employment.JobTitle,
JobTitlePlausible = true
});
_logger.LogDebug("Skipped verification for freelance entry: {Company}", employment.CompanyName);
}
// Step 5b: Verify director claims against Companies House officers
// (may replace entries of verificationResults with copies carrying an extra flag)
cvCheck.ProcessingStage = "Verifying Directors";
await _dbContext.SaveChangesAsync(cancellationToken);
await VerifyDirectorClaims(cvData.FullName, verificationResults, cancellationToken);
// Step 6: Verify education entries
cvCheck.ProcessingStage = "Verifying Education";
await _dbContext.SaveChangesAsync(cancellationToken);
var educationResults = _educationVerifierService.VerifyAll(
cvData.Education,
cvData.Employment);
_logger.LogDebug(
"Education verification for check {CheckId}: {Count} entries verified ({Recognised} recognised, {DiplomaMill} diploma mills)",
cvCheckId,
educationResults.Count,
educationResults.Count(e => e.IsVerified),
educationResults.Count(e => e.IsDiplomaMill));
// Step 7: Analyse timeline
cvCheck.ProcessingStage = "Analyzing Timeline";
await _dbContext.SaveChangesAsync(cancellationToken);
var timelineAnalysis = _timelineAnalyserService.Analyse(cvData.Employment);
_logger.LogDebug(
"Timeline analysis for check {CheckId}: {GapCount} gaps, {OverlapCount} overlaps",
cvCheckId, timelineAnalysis.Gaps.Count, timelineAnalysis.Overlaps.Count);
// Step 8: Calculate veracity score
cvCheck.ProcessingStage = "Calculating Score";
await _dbContext.SaveChangesAsync(cancellationToken);
var (score, flags) = CalculateVeracityScore(verificationResults, educationResults, timelineAnalysis, cvData);
_logger.LogDebug("Calculated veracity score for check {CheckId}: {Score}", cvCheckId, score);
// Step 9: Create CVFlag records
// Category and severity arrive as strings; unknown values are logged and replaced by defaults.
foreach (var flag in flags)
{
if (!Enum.TryParse<FlagCategory>(flag.Category, out var category))
{
_logger.LogWarning("Unknown flag category: {Category}, defaulting to Timeline", flag.Category);
category = FlagCategory.Timeline;
}
if (!Enum.TryParse<FlagSeverity>(flag.Severity, out var severity))
{
_logger.LogWarning("Unknown flag severity: {Severity}, defaulting to Info", flag.Severity);
severity = FlagSeverity.Info;
}
var cvFlag = new CVFlag
{
Id = Guid.NewGuid(),
CVCheckId = cvCheckId,
Category = category,
Severity = severity,
Title = flag.Title,
Description = flag.Description,
ScoreImpact = flag.ScoreImpact
};
_dbContext.CVFlags.Add(cvFlag);
}
// Step 10: Generate veracity report
// The save below also persists the CVFlag rows added in step 9.
cvCheck.ProcessingStage = "Generating Report";
await _dbContext.SaveChangesAsync(cancellationToken);
var report = new VeracityReport
{
OverallScore = score,
ScoreLabel = GetScoreLabel(score),
EmploymentVerifications = verificationResults,
EducationVerifications = educationResults,
TimelineAnalysis = timelineAnalysis,
Flags = flags,
GeneratedAt = DateTime.UtcNow
};
cvCheck.ReportJson = JsonSerializer.Serialize(report, JsonDefaults.CamelCaseIndented);
cvCheck.VeracityScore = score;
// Step 11: Update status to Completed
cvCheck.Status = CheckStatus.Completed;
cvCheck.ProcessingStage = null; // Clear stage on completion
cvCheck.CompletedAt = DateTime.UtcNow;
await _dbContext.SaveChangesAsync(cancellationToken);
_logger.LogInformation(
"CV check {CheckId} completed successfully with score {Score}",
cvCheckId, score);
// NOTE(review): this call is inside the try block, so an exception from the audit service
// after the Completed status has been saved lands in the catch below and flips the check
// to Failed - confirm that this is intended.
await _auditService.LogAsync(cvCheck.UserId, AuditActions.CVProcessed, "CVCheck", cvCheckId, $"Score: {score}");
}
catch (Exception ex)
{
_logger.LogError(ex, "Error processing CV check {CheckId}", cvCheckId);
try
{
// ProcessingStage is left untouched here, so it keeps the name of the stage that was
// running when the failure happened.
cvCheck.Status = CheckStatus.Failed;
// Use CancellationToken.None to ensure failure status is saved even if original token is cancelled
await _dbContext.SaveChangesAsync(CancellationToken.None);
}
catch (DbUpdateConcurrencyException)
{
// Record was deleted during processing - nothing to update
_logger.LogWarning("CV check {CheckId} was deleted during processing", cvCheckId);
return;
}
// Rethrow the original exception with its stack trace intact.
throw;
}
}
/// <summary>
/// Converts the employment, education and timeline findings into report flags and derives
/// the veracity score from those flags.
/// </summary>
/// <param name="verifications">Employment verification results, including any company flags.</param>
/// <param name="educationResults">Education verification results.</param>
/// <param name="timeline">Timeline analysis providing employment gaps and overlaps.</param>
/// <param name="cvData">Parsed CV; its employment and education lists feed the career checks.</param>
/// <returns>
/// The flags deduplicated by (Title, Description), and the score computed as
/// <c>BaseScore</c> plus the sum of their (negative or zero) impacts, never below 0.
/// </returns>
private static (int Score, List<FlagResult> Flags) CalculateVeracityScore(
    List<CompanyVerificationResult> verifications,
    List<EducationVerificationResult> educationResults,
    TimelineAnalysisResult timeline,
    CVData cvData)
{
    var flags = new List<FlagResult>();

    // Unverified companies: one penalty per company, however many roles were claimed there.
    var unverifiedByCompany = verifications
        .Where(v => !v.IsVerified)
        .GroupBy(v => v.ClaimedCompany, StringComparer.OrdinalIgnoreCase);

    foreach (var companyGroup in unverifiedByCompany)
    {
        var firstInstance = companyGroup.First();
        var instanceCount = companyGroup.Count();
        var description = instanceCount > 1
            ? $"Could not verify employment at '{firstInstance.ClaimedCompany}' ({instanceCount} roles). {firstInstance.VerificationNotes}"
            : $"Could not verify employment at '{firstInstance.ClaimedCompany}'. {firstInstance.VerificationNotes}";

        flags.Add(new FlagResult
        {
            Category = FlagCategory.Employment.ToString(),
            Severity = FlagSeverity.Warning.ToString(),
            Title = "Unverified Company",
            Description = description,
            ScoreImpact = -UnverifiedCompanyPenalty
        });
    }

    // Company verification flags (incorporation date, dissolution, dormant, etc.),
    // deduplicated by (company, flag type) so the same issue is penalised only once.
    var processedCompanyFlags = new HashSet<(string Company, string FlagType)>(
        new CompanyFlagComparer());

    foreach (var verification in verifications)
    {
        // Flags may be absent: results built without a flag list (for example the
        // auto-verified freelance entries) must not cause a NullReferenceException here.
        if (verification.Flags is not { Count: > 0 } companyFlags)
        {
            continue;
        }

        foreach (var companyFlag in companyFlags)
        {
            if (!processedCompanyFlags.Add((verification.ClaimedCompany, companyFlag.Type)))
            {
                // Already processed this flag type for this company.
                continue;
            }

            var penalty = Math.Abs(companyFlag.ScoreImpact);
            var severity = companyFlag.Severity switch
            {
                "Critical" => FlagSeverity.Critical,
                "Warning" => FlagSeverity.Warning,
                _ => FlagSeverity.Info
            };

            flags.Add(new FlagResult
            {
                Category = FlagCategory.Employment.ToString(),
                Severity = severity.ToString(),
                Title = companyFlag.Type switch
                {
                    "EmploymentBeforeIncorporation" => "Employment Before Company Existed",
                    "EmploymentAtDissolvedCompany" => "Employment at Dissolved Company",
                    "CurrentEmploymentAtDissolvedCompany" => "Current Employment at Dissolved Company",
                    "EmploymentAtDormantCompany" => "Employment at Dormant Company",
                    "SeniorRoleAtMicroCompany" => "Senior Role at Micro Company",
                    "SicCodeMismatch" => "Role/Industry Mismatch",
                    "ImplausibleJobTitle" => "Implausible Job Title",
                    "UnverifiedDirectorClaim" => "Unverified Director Claim",
                    _ => companyFlag.Type
                },
                Description = companyFlag.Message,
                ScoreImpact = -penalty
            });
        }
    }

    // The career checks report through a ref running score as well as through the flag list.
    // The final score is recomputed from the deduplicated flags below, so the running value
    // is only needed to satisfy their signatures.
    var unusedRunningScore = BaseScore;
    CheckRapidCareerProgression(cvData.Employment, flags, ref unusedRunningScore);
    CheckEarlyCareerSeniorRoles(cvData.Employment, cvData.Education, flags, ref unusedRunningScore);

    // Diploma mills (critical).
    foreach (var edu in educationResults.Where(e => e.IsDiplomaMill))
    {
        flags.Add(new FlagResult
        {
            Category = FlagCategory.Education.ToString(),
            Severity = FlagSeverity.Critical.ToString(),
            Title = "Diploma Mill Detected",
            Description = $"'{edu.ClaimedInstitution}' is a known diploma mill. {edu.VerificationNotes}",
            ScoreImpact = -DiplomaMillPenalty
        });
    }

    // Suspicious institutions that are not already reported as diploma mills.
    foreach (var edu in educationResults.Where(e => e.IsSuspicious && !e.IsDiplomaMill))
    {
        flags.Add(new FlagResult
        {
            Category = FlagCategory.Education.ToString(),
            Severity = FlagSeverity.Warning.ToString(),
            Title = "Suspicious Institution",
            Description = $"'{edu.ClaimedInstitution}' has suspicious characteristics. {edu.VerificationNotes}",
            ScoreImpact = -SuspiciousInstitutionPenalty
        });
    }

    // Unverified education (not recognised, but not flagged as fake either).
    foreach (var edu in educationResults.Where(e => !e.IsVerified && !e.IsDiplomaMill && !e.IsSuspicious && e.Status == "Unknown"))
    {
        flags.Add(new FlagResult
        {
            Category = FlagCategory.Education.ToString(),
            Severity = FlagSeverity.Info.ToString(),
            Title = "Unverified Institution",
            Description = $"Could not verify '{edu.ClaimedInstitution}'. {edu.VerificationNotes}",
            ScoreImpact = -UnverifiedEducationPenalty
        });
    }

    // Implausible education dates.
    foreach (var edu in educationResults.Where(e => !e.DatesArePlausible))
    {
        flags.Add(new FlagResult
        {
            Category = FlagCategory.Education.ToString(),
            Severity = FlagSeverity.Warning.ToString(),
            Title = "Education Date Issues",
            Description = $"Date issues for '{edu.ClaimedInstitution}': {edu.DatePlausibilityNotes}",
            ScoreImpact = -EducationDatePenalty
        });
    }

    // Employment gaps: GapMonthPenalty per month, capped at MaxGapPenalty per gap.
    foreach (var gap in timeline.Gaps)
    {
        var gapPenalty = Math.Min(gap.Months * GapMonthPenalty, MaxGapPenalty);
        var severity = gap.Months >= 6 ? FlagSeverity.Warning : FlagSeverity.Info;

        flags.Add(new FlagResult
        {
            Category = FlagCategory.Timeline.ToString(),
            Severity = severity.ToString(),
            Title = "Employment Gap",
            Description = $"{gap.Months} month gap in employment from {gap.StartDate:MMM yyyy} to {gap.EndDate:MMM yyyy}",
            ScoreImpact = -gapPenalty
        });
    }

    // Overlaps are often legitimate (part-time, consulting, transitions), so they are
    // reported as informational only and carry no score penalty.
    foreach (var overlap in timeline.Overlaps)
    {
        flags.Add(new FlagResult
        {
            Category = FlagCategory.Timeline.ToString(),
            Severity = FlagSeverity.Info.ToString(),
            Title = "Concurrent Employment",
            Description = $"Worked at both '{overlap.Company1}' and '{overlap.Company2}' simultaneously for {overlap.Months} months ({overlap.OverlapStart:MMM yyyy} to {overlap.OverlapEnd:MMM yyyy})",
            ScoreImpact = 0
        });
    }

    // Deduplicate on Title + Description, keeping the first occurrence of each.
    var uniqueFlags = flags
        .GroupBy(f => (f.Title, f.Description))
        .Select(g => g.First())
        .ToList();

    // Score only the unique flags and never report less than 0.
    var uniqueScore = Math.Max(0, BaseScore + uniqueFlags.Sum(f => f.ScoreImpact));

    return (uniqueScore, uniqueFlags);
}
/// <summary>
/// Translates a veracity score into the label shown on the report.
/// Bands are checked from the highest threshold down; anything below 40 is "Very Poor".
/// </summary>
private static string GetScoreLabel(int score)
{
    if (score >= 90)
    {
        return "Excellent";
    }

    if (score >= 75)
    {
        return "Good";
    }

    if (score >= 60)
    {
        return "Fair";
    }

    return score >= 40 ? "Poor" : "Very Poor";
}
/// <summary>
/// Decides whether an employer name denotes freelance or self-employed work.
/// </summary>
/// <param name="companyName">Employer name as written on the CV; null or blank yields false.</param>
/// <returns>
/// True when the trimmed name is exactly "freelance", "freelancer", "self-employed" or
/// "self employed", starts with "freelance ", "self-employed " or "self employed ", or
/// contains "(freelance)", "(self-employed)" or "(self employed)". Matching ignores case.
/// </returns>
private static bool IsFreelance(string companyName)
{
    if (string.IsNullOrWhiteSpace(companyName))
    {
        return false;
    }

    // Ordinal, case-insensitive matching: these are fixed keywords, so the result must not
    // depend on the current culture (StartsWith(string) without a comparison does).
    var comparison = StringComparison.OrdinalIgnoreCase;
    var name = companyName.Trim();

    var isExactKeyword =
        name.Equals("freelance", comparison) ||
        name.Equals("freelancer", comparison) ||
        name.Equals("self-employed", comparison) ||
        name.Equals("self employed", comparison);

    // The un-hyphenated "self employed" spelling is accepted everywhere the hyphenated one is.
    var hasKeywordPrefix =
        name.StartsWith("freelance ", comparison) ||
        name.StartsWith("self-employed ", comparison) ||
        name.StartsWith("self employed ", comparison);

    var hasBracketedKeyword =
        name.Contains("(freelance)", comparison) ||
        name.Contains("(self-employed)", comparison) ||
        name.Contains("(self employed)", comparison);

    return isExactKeyword || hasKeywordPrefix || hasBracketedKeyword;
}
/// <summary>
/// Checks director-style job titles at verified companies against the company verifier's
/// director lookup. When the lookup answers <c>false</c>, the corresponding entry of
/// <paramref name="verificationResults"/> is replaced by a copy carrying an additional
/// "UnverifiedDirectorClaim" flag.
/// </summary>
/// <param name="candidateName">Name of the candidate to look for among the officers.</param>
/// <param name="verificationResults">Employment results; entries may be replaced in place.</param>
/// <param name="cancellationToken">Accepted but not used by this method.</param>
private async Task VerifyDirectorClaims(
    string candidateName,
    List<CompanyVerificationResult> verificationResults,
    CancellationToken cancellationToken)
{
    // Titles that are treated as a claim to be an officer of the company.
    static bool IsDirectorTitle(string? claimedTitle)
    {
        var lowered = claimedTitle?.ToLowerInvariant() ?? "";
        return lowered.Contains("director")
            || lowered.Contains("company secretary")
            || lowered == "md"
            || lowered.Contains("managing director");
    }

    // Take a snapshot of the verified entries (with their positions) before looping,
    // because entries of the list are replaced while the snapshot is being walked.
    var verifiedEntries = new List<(CompanyVerificationResult Entry, int Position)>();
    for (var position = 0; position < verificationResults.Count; position++)
    {
        var entry = verificationResults[position];
        if (entry.IsVerified && !string.IsNullOrEmpty(entry.MatchedCompanyNumber))
        {
            verifiedEntries.Add((entry, position));
        }
    }

    foreach (var (entry, position) in verifiedEntries)
    {
        if (!IsDirectorTitle(entry.ClaimedJobTitle))
        {
            continue;
        }

        _logger.LogDebug(
            "Verifying director claim for {Candidate} at {Company}",
            candidateName, entry.MatchedCompanyName);

        var lookupOutcome = await _companyVerifierService.VerifyDirectorAsync(
            entry.MatchedCompanyNumber!,
            candidateName,
            entry.ClaimedStartDate,
            entry.ClaimedEndDate);

        if (lookupOutcome == true)
        {
            _logger.LogInformation(
                "Director claim verified for {Candidate} at {Company}",
                candidateName, entry.MatchedCompanyName);
        }
        else if (lookupOutcome == false)
        {
            // Copy the existing flags (if any) and append the unverified-claim flag.
            var updatedFlags = (entry.Flags ?? []).ToList();
            updatedFlags.Add(new CompanyVerificationFlag
            {
                Type = "UnverifiedDirectorClaim",
                Severity = "Critical",
                Message = $"Claimed director role at '{entry.MatchedCompanyName}' but candidate name not found in Companies House officers list",
                ScoreImpact = -20
            });

            // Swap the stored result for a copy that carries the extended flag list.
            verificationResults[position] = entry with { Flags = updatedFlags };

            _logger.LogWarning(
                "Director claim not verified for {Candidate} at {Company}",
                candidateName, entry.MatchedCompanyName);
        }

        // Any other outcome is left without a flag and without a log entry.
    }
}
/// <summary>
/// Flags unusually fast promotions inside a single company: consecutive roles (ordered by
/// start date) whose seniority rises by three or more levels with start dates less than
/// 24 months apart.
/// </summary>
/// <param name="employment">Employment entries; those without a company name or start date are ignored.</param>
/// <param name="flags">List that receives one "Rapid Career Progression" flag per finding.</param>
/// <param name="score">Running score, reduced by <c>RapidProgressionPenalty</c> per finding.</param>
private static void CheckRapidCareerProgression(
    List<EmploymentEntry> employment,
    List<FlagResult> flags,
    ref int score)
{
    // Only companies that appear more than once can show an internal promotion.
    var multiRoleCompanies = employment
        .Where(e => !string.IsNullOrWhiteSpace(e.CompanyName) && e.StartDate.HasValue)
        .GroupBy(e => e.CompanyName.ToLowerInvariant())
        .Where(g => g.Count() > 1);

    foreach (var companyRoles in multiRoleCompanies)
    {
        var chronological = companyRoles.OrderBy(e => e.StartDate).ToList();

        // Pair every role with the one that follows it.
        foreach (var (earlier, later) in chronological.Zip(chronological.Skip(1)))
        {
            var levelsGained = GetSeniorityLevel(later.JobTitle) - GetSeniorityLevel(earlier.JobTitle);
            if (levelsGained < 3)
            {
                continue;
            }

            // Whole months between the two start dates (day of month is ignored).
            var earlierStart = earlier.StartDate!.Value;
            var laterStart = later.StartDate!.Value;
            var monthsBetween = ((laterStart.Year - earlierStart.Year) * 12) +
                                (laterStart.Month - earlierStart.Month);
            if (monthsBetween >= 24)
            {
                continue;
            }

            score -= RapidProgressionPenalty;
            flags.Add(new FlagResult
            {
                Category = FlagCategory.Employment.ToString(),
                Severity = FlagSeverity.Warning.ToString(),
                Title = "Rapid Career Progression",
                Description = $"Promoted from '{earlier.JobTitle}' to '{later.JobTitle}' at '{companyRoles.First().CompanyName}' in {monthsBetween} months - unusually fast progression",
                ScoreImpact = -RapidProgressionPenalty
            });
        }
    }
}
/// <summary>
/// Flags senior roles (seniority level 4 or above) that start in the same month as, or
/// less than 24 months after, the latest education end date on the CV.
/// </summary>
/// <param name="employment">Employment entries; those without a start date are ignored.</param>
/// <param name="education">Education entries; the latest end date among them is the reference point.</param>
/// <param name="flags">List that receives one "Early Career Senior Role" flag per finding.</param>
/// <param name="score">Running score, reduced by <c>EarlyCareerSeniorRolePenalty</c> per finding.</param>
/// <remarks>
/// NOTE(review): the latest education end date stands in for the start of the career, so a
/// qualification finished mid-career makes senior roles that begin shortly afterwards count
/// as "early career" - confirm that this is intended.
/// </remarks>
private static void CheckEarlyCareerSeniorRoles(
    List<EmploymentEntry> employment,
    List<EducationEntry> education,
    List<FlagResult> flags,
    ref int score)
{
    // Latest known education end date; stays at MinValue when no entry has one.
    var latestEducationEnd = DateOnly.MinValue;
    foreach (var entry in education)
    {
        if (entry.EndDate is { } finished && finished > latestEducationEnd)
        {
            latestEducationEnd = finished;
        }
    }

    if (latestEducationEnd == DateOnly.MinValue)
    {
        // Nothing to measure against.
        return;
    }

    foreach (var emp in employment)
    {
        if (!emp.StartDate.HasValue)
        {
            continue;
        }

        // Whole months from the end of education to the start of the role (day ignored).
        var roleStart = emp.StartDate!.Value;
        var monthsAfterEducation = ((roleStart.Year - latestEducationEnd.Year) * 12) +
                                   (roleStart.Month - latestEducationEnd.Month);

        // Roles that began before education ended, or 24+ months later, are out of scope.
        if (monthsAfterEducation is < 0 or >= 24)
        {
            continue;
        }

        if (GetSeniorityLevel(emp.JobTitle) < 4)
        {
            continue;
        }

        score -= EarlyCareerSeniorRolePenalty;
        flags.Add(new FlagResult
        {
            Category = FlagCategory.Employment.ToString(),
            Severity = FlagSeverity.Warning.ToString(),
            Title = "Early Career Senior Role",
            Description = $"Claimed senior role '{emp.JobTitle}' at '{emp.CompanyName}' only {monthsAfterEducation} months after completing education",
            ScoreImpact = -EarlyCareerSeniorRolePenalty
        });
    }
}
/// <summary>
/// Estimates a seniority level from a job title using keyword matching.
/// </summary>
/// <param name="jobTitle">Job title as written on the CV; may be null or blank.</param>
/// <returns>
/// 0 for a missing title, 6 for C-suite, 5 for VP / executive, 4 for director / head of,
/// 3 for senior / lead / principal / manager / staff, 1 for junior or entry-level wording,
/// and 2 for everything else.
/// </returns>
/// <remarks>
/// Short acronyms (CEO, CTO, CFO, COO, CIO, VP) and "intern" are matched as whole words only.
/// Matching them as substrings classifies "Director" and "Contractor" (both contain "cto")
/// and "Coordinator" (contains "coo") as C-suite, and "Internal"/"International" as intern.
/// Vice presidents are recognised before the generic "president" check for the same reason.
/// </remarks>
private static int GetSeniorityLevel(string? jobTitle)
{
    if (string.IsNullOrWhiteSpace(jobTitle))
    {
        return 0;
    }

    var title = jobTitle.ToLowerInvariant();

    // Break the title into words on anything that is not a letter or digit, so that
    // "CTO/Co-Founder" yields "cto", "co", "founder".
    var wordList = new string(title.Select(c => char.IsLetterOrDigit(c) ? c : ' ').ToArray())
        .Split(' ', StringSplitOptions.RemoveEmptyEntries);
    var words = new HashSet<string>(wordList, StringComparer.Ordinal);

    // Re-joined with single spaces so "vice-president" and "vice  president" both match.
    var isVicePresident = string.Join(" ", wordList).Contains("vice president");

    // Level 6: C-suite
    string[] cSuiteAcronyms = { "ceo", "cto", "cfo", "coo", "cio" };
    if (words.Overlaps(cSuiteAcronyms) || title.Contains("chief") ||
        title.Contains("managing director") || title == "md" ||
        (title.Contains("president") && !isVicePresident) ||
        title.Contains("chairman") || title.Contains("chairwoman") ||
        title.Contains("chairperson"))
    {
        return 6;
    }

    // Level 5: VP / Executive
    if (isVicePresident || words.Contains("vp") || title.Contains("vp ") ||
        title.StartsWith("vp", StringComparison.Ordinal) || title.Contains("svp") ||
        title.Contains("executive director") || title.Contains("executive vice"))
    {
        return 5;
    }

    // Level 4: Director / Head
    if (title.Contains("director") || title.Contains("head of"))
    {
        return 4;
    }

    // Level 3: Senior / Lead / Principal / Manager
    if (title.Contains("senior") || title.Contains("lead") ||
        title.Contains("principal") || title.Contains("manager") ||
        title.Contains("staff"))
    {
        return 3;
    }

    // Level 1: Junior / Entry-level wording
    var isJunior = title.Contains("junior") || title.Contains("trainee") ||
                   words.Contains("intern") || title.Contains("internship") ||
                   title.Contains("graduate") || title.Contains("entry") ||
                   title.Contains("assistant");

    // Level 2: Mid-level (neither junior nor senior wording)
    return isJunior ? 1 : 2;
}
/// <summary>
/// Equality comparer for (company name, flag type) pairs, used to deduplicate company flags.
/// Both components are compared ordinally and case-insensitively; null components are
/// tolerated (null equals only null).
/// </summary>
private sealed class CompanyFlagComparer : IEqualityComparer<(string Company, string FlagType)>
{
    // One comparer drives both Equals and GetHashCode so the two can never disagree, and
    // hashing does not have to allocate upper-cased copies of the strings.
    private static readonly StringComparer Comparer = StringComparer.OrdinalIgnoreCase;

    /// <summary>True when both the company and the flag type match, ignoring case.</summary>
    public bool Equals((string Company, string FlagType) x, (string Company, string FlagType) y) =>
        Comparer.Equals(x.Company, y.Company) && Comparer.Equals(x.FlagType, y.FlagType);

    /// <summary>Case-insensitive hash of both components; null hashes like an empty string.</summary>
    public int GetHashCode((string Company, string FlagType) obj) =>
        HashCode.Combine(
            Comparer.GetHashCode(obj.Company ?? ""),
            Comparer.GetHashCode(obj.FlagType ?? ""));
}
}