Initial commit: TrueCV CV verification platform
Clean architecture solution with:
- Domain: Entities (User, CVCheck, CVFlag, CompanyCache) and Enums
- Application: Service interfaces, DTOs, and models
- Infrastructure: EF Core, Identity, Hangfire, external API clients, services
- Web: Blazor Server UI with pages and components

Features:
- CV upload and parsing (PDF/DOCX) using Claude API
- Employment verification against Companies House API
- Timeline analysis for gaps and overlaps
- Veracity scoring algorithm
- Background job processing with Hangfire
- Azure Blob Storage for file storage

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
318
src/TrueCV.Infrastructure/Services/CVParserService.cs
Normal file
318
src/TrueCV.Infrastructure/Services/CVParserService.cs
Normal file
@@ -0,0 +1,318 @@
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using Anthropic.SDK;
|
||||
using Anthropic.SDK.Messaging;
|
||||
using DocumentFormat.OpenXml.Packaging;
|
||||
using DocumentFormat.OpenXml.Wordprocessing;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using TrueCV.Application.Interfaces;
|
||||
using TrueCV.Application.Models;
|
||||
using TrueCV.Infrastructure.Configuration;
|
||||
using UglyToad.PdfPig;
|
||||
|
||||
namespace TrueCV.Infrastructure.Services;
|
||||
|
||||
public sealed class CVParserService : ICVParserService
|
||||
{
|
||||
// Serializer settings used when reading the model's JSON reply:
// camelCase naming policy, with case-insensitive property matching.
private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
{
    PropertyNameCaseInsensitive = true,
    PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};

// Client used to send the parsing request to the Anthropic Messages API.
private readonly AnthropicClient _anthropicClient;

// Diagnostic logger for this service.
private readonly ILogger<CVParserService> _logger;
|
||||
|
||||
/// <summary>
/// System message sent with every parsing request; instructs the model to reply with JSON only.
/// </summary>
private const string SystemPrompt = """
    You are a CV/Resume parser. Your task is to extract structured information from CV text.
    You must respond ONLY with valid JSON, no other text or markdown.
    """;

/// <summary>
/// User message template. The <c>{CV_TEXT}</c> placeholder is substituted with the extracted
/// CV text before sending. The JSON keys described here correspond to the properties of the
/// private response records declared in this class.
/// </summary>
private const string ExtractionPrompt = """
    Parse the following CV text and extract the information into this exact JSON structure:

    {
    "fullName": "string (required)",
    "email": "string or null",
    "phone": "string or null",
    "employment": [
    {
    "companyName": "string (required)",
    "jobTitle": "string (required)",
    "location": "string or null",
    "startDate": "YYYY-MM-DD or null",
    "endDate": "YYYY-MM-DD or null (null if current)",
    "isCurrent": "boolean",
    "description": "string or null"
    }
    ],
    "education": [
    {
    "institution": "string (required)",
    "qualification": "string or null (e.g., BSc, MSc, PhD)",
    "subject": "string or null",
    "grade": "string or null",
    "startDate": "YYYY-MM-DD or null",
    "endDate": "YYYY-MM-DD or null"
    }
    ],
    "skills": ["array of skill strings"]
    }

    Rules:
    - For dates, use the first day of the month if only month/year is given (e.g., "Jan 2020" becomes "2020-01-01")
    - For dates with only year, use January 1st (e.g., "2020" becomes "2020-01-01")
    - Set isCurrent to true if the job appears to be ongoing (e.g., "Present", "Current", no end date mentioned with recent start)
    - Extract all employment history in chronological order
    - If information is not available, use null
    - Do not invent or assume information not present in the text

    CV TEXT:
    {CV_TEXT}
    """;
|
||||
|
||||
/// <summary>
/// Creates the parser service and the Anthropic client it uses.
/// </summary>
/// <param name="settings">Options wrapper; its <c>ApiKey</c> value is passed to the <see cref="AnthropicClient"/> constructor.</param>
/// <param name="logger">Logger used for diagnostics.</param>
/// <exception cref="ArgumentNullException"><paramref name="settings"/> or <paramref name="logger"/> is <c>null</c>.</exception>
public CVParserService(
    IOptions<AnthropicSettings> settings,
    ILogger<CVParserService> logger)
{
    // Fail at construction time with a clear message instead of a
    // NullReferenceException on the first log call or options access.
    ArgumentNullException.ThrowIfNull(settings);
    ArgumentNullException.ThrowIfNull(logger);

    _logger = logger;
    _anthropicClient = new AnthropicClient(settings.Value.ApiKey);
}
|
||||
|
||||
/// <summary>
/// Extracts the text of a CV file and converts it into structured <see cref="CVData"/>.
/// </summary>
/// <param name="fileStream">Content of the CV file.</param>
/// <param name="fileName">Name of the file; its extension selects the text extractor.</param>
/// <returns>The structured CV data produced from the model's reply.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fileStream"/> or <paramref name="fileName"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException"><paramref name="fileName"/> is empty or whitespace.</exception>
/// <exception cref="InvalidOperationException">No text could be extracted from the file.</exception>
/// <remarks>Exceptions raised by text extraction or by the model call are not caught here.</remarks>
public async Task<CVData> ParseAsync(Stream fileStream, string fileName)
{
    ArgumentNullException.ThrowIfNull(fileStream);
    ArgumentException.ThrowIfNullOrWhiteSpace(fileName);

    _logger.LogDebug("Parsing CV file: {FileName}", fileName);

    var extractedText = await ExtractTextAsync(fileStream, fileName);

    if (!string.IsNullOrWhiteSpace(extractedText))
    {
        _logger.LogDebug("Extracted {CharCount} characters from {FileName}", extractedText.Length, fileName);

        var parsed = await ParseWithClaudeAsync(extractedText);

        _logger.LogInformation(
            "Successfully parsed CV for {FullName} with {EmploymentCount} employment entries and {EducationCount} education entries",
            parsed.FullName,
            parsed.Employment.Count,
            parsed.Education.Count);

        return parsed;
    }

    // Nothing usable came out of the file: report it and stop.
    _logger.LogWarning("No text content extracted from file: {FileName}", fileName);
    throw new InvalidOperationException($"Could not extract text content from file: {fileName}");
}
|
||||
|
||||
/// <summary>
/// Chooses a text extractor from the file name's extension (compared in lower case).
/// </summary>
/// <param name="fileStream">Content of the file.</param>
/// <param name="fileName">File name whose extension decides the extractor.</param>
/// <returns>The text extracted from the file.</returns>
/// <exception cref="NotSupportedException">The extension is neither <c>.pdf</c> nor <c>.docx</c>.</exception>
private async Task<string> ExtractTextAsync(Stream fileStream, string fileName)
{
    var extension = Path.GetExtension(fileName).ToLowerInvariant();

    switch (extension)
    {
        case ".pdf":
            return await ExtractTextFromPdfAsync(fileStream);

        case ".docx":
            return ExtractTextFromDocx(fileStream);

        default:
            throw new NotSupportedException($"File type '{extension}' is not supported. Only PDF and DOCX files are accepted.");
    }
}
|
||||
|
||||
/// <summary>
/// Reads a PDF and returns the text of its pages, one page per line block.
/// </summary>
/// <param name="fileStream">Content of the PDF file.</param>
/// <returns>Each page's text followed by a line terminator, in page order.</returns>
private async Task<string> ExtractTextFromPdfAsync(Stream fileStream)
{
    // Buffer the input in memory first so the PDF reader is handed a seekable stream.
    using var buffer = new MemoryStream();
    await fileStream.CopyToAsync(buffer);
    buffer.Position = 0;

    using var pdf = PdfDocument.Open(buffer);

    var allText = new StringBuilder();
    foreach (var pdfPage in pdf.GetPages())
    {
        allText.AppendLine(pdfPage.Text);
    }

    return allText.ToString();
}
|
||||
|
||||
/// <summary>
/// Reads a DOCX document and returns the text of its non-blank paragraphs, one per line.
/// </summary>
/// <param name="fileStream">Content of the DOCX file; opened read-only.</param>
/// <returns>
/// The paragraph texts in document order, or an empty string when the document has no body.
/// </returns>
private static string ExtractTextFromDocx(Stream fileStream)
{
    using var document = WordprocessingDocument.Open(fileStream, false);
    var body = document.MainDocumentPart?.Document?.Body;

    if (body is null)
    {
        return string.Empty;
    }

    var textBuilder = new StringBuilder();

    // Descendants (not Elements): paragraphs nested inside tables or content controls
    // are not direct children of the body and would otherwise be dropped. CVs are often
    // laid out with tables, so that text matters.
    foreach (var paragraph in body.Descendants<Paragraph>())
    {
        // A paragraph nested inside another paragraph (e.g. text-box content) is already
        // part of the outer paragraph's InnerText; skip it to avoid emitting it twice.
        if (paragraph.Ancestors<Paragraph>().Any())
        {
            continue;
        }

        var paragraphText = paragraph.InnerText;
        if (!string.IsNullOrWhiteSpace(paragraphText))
        {
            textBuilder.AppendLine(paragraphText);
        }
    }

    return textBuilder.ToString();
}
|
||||
|
||||
/// <summary>
/// Sends the CV text to the model and turns its JSON reply into <see cref="CVData"/>.
/// </summary>
/// <param name="cvText">Plain text extracted from the CV; substituted into the extraction prompt.</param>
/// <returns>The CV data mapped from the deserialized reply.</returns>
/// <exception cref="InvalidOperationException">
/// The reply contains no text, is not valid JSON, or deserializes to <c>null</c>.
/// </exception>
private async Task<CVData> ParseWithClaudeAsync(string cvText)
{
    var parameters = new MessageParameters
    {
        Model = "claude-sonnet-4-20250514",
        MaxTokens = 4096,
        Messages = new List<Message>
        {
            new Message(RoleType.User, ExtractionPrompt.Replace("{CV_TEXT}", cvText))
        },
        System = [new SystemMessage(SystemPrompt)]
    };

    _logger.LogDebug("Sending CV text to Claude API for parsing");

    var response = await _anthropicClient.Messages.GetClaudeMessageAsync(parameters);

    // Only the first text block of the reply is considered.
    var firstTextBlock = response.Content.OfType<TextContent>().FirstOrDefault();
    var responseText = firstTextBlock?.Text;

    if (string.IsNullOrWhiteSpace(responseText))
    {
        _logger.LogError("Claude API returned empty response");
        throw new InvalidOperationException("Failed to parse CV: AI returned empty response");
    }

    // Strip markdown code fences the model may have wrapped around the JSON.
    responseText = CleanJsonResponse(responseText);

    _logger.LogDebug("Received response from Claude API, parsing JSON");

    ClaudeCVResponse? parsedResponse;
    try
    {
        parsedResponse = JsonSerializer.Deserialize<ClaudeCVResponse>(responseText, JsonOptions);
    }
    catch (JsonException ex)
    {
        _logger.LogError(ex, "Failed to parse Claude response as JSON: {Response}", responseText);
        throw new InvalidOperationException("Failed to parse CV: AI returned invalid JSON", ex);
    }

    if (parsedResponse is null)
    {
        throw new InvalidOperationException("Failed to deserialize CV data from AI response");
    }

    return MapToCVData(parsedResponse);
}
|
||||
|
||||
/// <summary>
/// Reduces a model reply to the JSON object it contains.
/// </summary>
/// <param name="response">Raw reply text; expected to be non-null.</param>
/// <returns>
/// The reply with surrounding whitespace and a wrapping markdown code fence
/// (<c>```</c> or <c>```json</c>, any casing) removed. If the remaining text has
/// extra content before the first <c>{</c> or after the last <c>}</c>, only the
/// span between those braces is returned. Text without braces is returned trimmed.
/// </returns>
private static string CleanJsonResponse(string response)
{
    const string Fence = "```";
    const string JsonTag = "json";

    var cleaned = response.Trim();

    // Ordinal comparisons: the fence is markup, not linguistic text, and the slice
    // offsets below are only valid when the match is exactly Fence.Length characters.
    if (cleaned.StartsWith(Fence, StringComparison.Ordinal))
    {
        cleaned = cleaned[Fence.Length..];

        if (cleaned.StartsWith(JsonTag, StringComparison.OrdinalIgnoreCase))
        {
            cleaned = cleaned[JsonTag.Length..];
        }
    }

    if (cleaned.EndsWith(Fence, StringComparison.Ordinal))
    {
        cleaned = cleaned[..^Fence.Length];
    }

    cleaned = cleaned.Trim();

    // The model sometimes adds prose around the object despite instructions.
    // A well-formed object already starts at '{' and ends at '}', so this is a no-op for it.
    var firstBrace = cleaned.IndexOf('{');
    var lastBrace = cleaned.LastIndexOf('}');
    if (firstBrace >= 0 && lastBrace > firstBrace)
    {
        cleaned = cleaned[firstBrace..(lastBrace + 1)];
    }

    return cleaned;
}
|
||||
|
||||
/// <summary>
/// Converts the deserialized model reply into the application's <see cref="CVData"/> model.
/// </summary>
/// <param name="response">Deserialized reply.</param>
/// <returns>
/// CV data in which missing required strings are replaced by "Unknown…" placeholders,
/// missing lists become empty lists, and unparseable dates become <c>null</c>.
/// </returns>
private static CVData MapToCVData(ClaudeCVResponse response)
{
    var employment = response.Employment?.Select(ToEmploymentEntry).ToList();
    var education = response.Education?.Select(ToEducationEntry).ToList();

    return new CVData
    {
        FullName = response.FullName ?? "Unknown",
        Email = response.Email,
        Phone = response.Phone,
        Employment = employment ?? [],
        Education = education ?? [],
        Skills = response.Skills ?? []
    };

    // Maps one employment item of the reply.
    static EmploymentEntry ToEmploymentEntry(ClaudeEmploymentEntry job)
    {
        return new EmploymentEntry
        {
            CompanyName = job.CompanyName ?? "Unknown Company",
            JobTitle = job.JobTitle ?? "Unknown Position",
            Location = job.Location,
            StartDate = ParseDate(job.StartDate),
            EndDate = ParseDate(job.EndDate),
            IsCurrent = job.IsCurrent ?? false,
            Description = job.Description
        };
    }

    // Maps one education item of the reply.
    static EducationEntry ToEducationEntry(ClaudeEducationEntry study)
    {
        return new EducationEntry
        {
            Institution = study.Institution ?? "Unknown Institution",
            Qualification = study.Qualification,
            Subject = study.Subject,
            Grade = study.Grade,
            StartDate = ParseDate(study.StartDate),
            EndDate = ParseDate(study.EndDate)
        };
    }
}
|
||||
|
||||
/// <summary>
/// Parses a date string from the model's reply.
/// </summary>
/// <param name="dateString">Date text, expected as <c>yyyy-MM-dd</c>; may be <c>null</c> or blank.</param>
/// <returns>
/// The parsed date, or <c>null</c> when the input is null, blank, or not a recognizable date.
/// </returns>
private static DateOnly? ParseDate(string? dateString)
{
    if (string.IsNullOrWhiteSpace(dateString))
    {
        return null;
    }

    // The reply is machine-readable data, so parse with the invariant culture.
    // Parsing with the current culture can misread or reject the year on hosts whose
    // default calendar is not Gregorian (e.g. th-TH, ar-SA).
    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    const System.Globalization.DateTimeStyles NoStyles = System.Globalization.DateTimeStyles.None;
    var candidate = dateString.Trim();

    // Preferred path: the exact format requested in the prompt.
    if (DateOnly.TryParseExact(candidate, "yyyy-MM-dd", invariant, NoStyles, out var isoDate))
    {
        return isoDate;
    }

    // Lenient fallback for other date spellings the model may produce.
    if (DateOnly.TryParse(candidate, invariant, NoStyles, out var looseDate))
    {
        return looseDate;
    }

    return null;
}
|
||||
|
||||
// Internal DTOs for Claude response parsing
|
||||
/// <summary>
/// Top-level object deserialized from the model's JSON reply.
/// Every member is nullable because the reply may omit any of them.
/// </summary>
private sealed record ClaudeCVResponse
{
    /// <summary>Candidate's name; mapped to "Unknown" when absent.</summary>
    public string? FullName { get; init; }

    /// <summary>Contact e-mail address, if present.</summary>
    public string? Email { get; init; }

    /// <summary>Contact phone number, if present.</summary>
    public string? Phone { get; init; }

    /// <summary>Employment history items; an absent list is mapped to an empty one.</summary>
    public List<ClaudeEmploymentEntry>? Employment { get; init; }

    /// <summary>Education items; an absent list is mapped to an empty one.</summary>
    public List<ClaudeEducationEntry>? Education { get; init; }

    /// <summary>Skill names; an absent list is mapped to an empty one.</summary>
    public List<string>? Skills { get; init; }
}
|
||||
|
||||
/// <summary>
/// One employment item of the model's JSON reply. Dates are kept as raw strings
/// here and converted when the reply is mapped to the application model.
/// </summary>
private sealed record ClaudeEmploymentEntry
{
    /// <summary>Employer name; mapped to "Unknown Company" when absent.</summary>
    public string? CompanyName { get; init; }

    /// <summary>Job title; mapped to "Unknown Position" when absent.</summary>
    public string? JobTitle { get; init; }

    /// <summary>Work location, if present.</summary>
    public string? Location { get; init; }

    /// <summary>Start date as text (the prompt requests YYYY-MM-DD), if present.</summary>
    public string? StartDate { get; init; }

    /// <summary>End date as text (the prompt requests YYYY-MM-DD), if present.</summary>
    public string? EndDate { get; init; }

    /// <summary>Whether the job is ongoing; treated as <c>false</c> when absent.</summary>
    public bool? IsCurrent { get; init; }

    /// <summary>Free-text role description, if present.</summary>
    public string? Description { get; init; }
}
|
||||
|
||||
/// <summary>
/// One education item of the model's JSON reply. Dates are kept as raw strings
/// here and converted when the reply is mapped to the application model.
/// </summary>
private sealed record ClaudeEducationEntry
{
    /// <summary>Institution name; mapped to "Unknown Institution" when absent.</summary>
    public string? Institution { get; init; }

    /// <summary>Qualification (the prompt gives BSc, MSc, PhD as examples), if present.</summary>
    public string? Qualification { get; init; }

    /// <summary>Subject studied, if present.</summary>
    public string? Subject { get; init; }

    /// <summary>Grade obtained, if present.</summary>
    public string? Grade { get; init; }

    /// <summary>Start date as text (the prompt requests YYYY-MM-DD), if present.</summary>
    public string? StartDate { get; init; }

    /// <summary>End date as text (the prompt requests YYYY-MM-DD), if present.</summary>
    public string? EndDate { get; init; }
}
|
||||
}
|
||||
Reference in New Issue
Block a user