using System.Globalization;
using System.Text;
using System.Text.Json;
using Anthropic.SDK;
using Anthropic.SDK.Messaging;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using TrueCV.Application.Interfaces;
using TrueCV.Application.Models;
using TrueCV.Infrastructure.Configuration;
using UglyToad.PdfPig;

namespace TrueCV.Infrastructure.Services;

/// <summary>
/// Extracts the text of a CV file (PDF or DOCX), sends it to the Claude API with a
/// JSON-extraction prompt, and maps the JSON reply onto <see cref="CVData"/>.
/// </summary>
public sealed class CVParserService : ICVParserService
{
    /// <summary>Model identifier sent with every parsing request.</summary>
    private const string ModelName = "claude-sonnet-4-20250514";

    /// <summary>Upper bound on the number of tokens requested for the reply.</summary>
    private const int MaxResponseTokens = 4096;

    /// <summary>Placeholder inside <see cref="ExtractionPrompt"/> that is replaced by the CV text.</summary>
    private const string CvTextPlaceholder = "{CV_TEXT}";

    private readonly AnthropicClient _anthropicClient;
    private readonly ILogger<CVParserService> _logger;

    private static readonly JsonSerializerOptions JsonOptions = new()
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        PropertyNameCaseInsensitive = true
    };

    private const string SystemPrompt = """
        You are a CV/Resume parser. Your task is to extract structured information from CV text.
        You must respond ONLY with valid JSON, no other text or markdown.
        """;

    private const string ExtractionPrompt = """
        Parse the following CV text and extract the information into this exact JSON structure:

        {
            "fullName": "string (required)",
            "email": "string or null",
            "phone": "string or null",
            "employment": [
                {
                    "companyName": "string (required)",
                    "jobTitle": "string (required)",
                    "location": "string or null",
                    "startDate": "YYYY-MM-DD or null",
                    "endDate": "YYYY-MM-DD or null (null if current)",
                    "isCurrent": "boolean",
                    "description": "string or null"
                }
            ],
            "education": [
                {
                    "institution": "string (required)",
                    "qualification": "string or null (e.g., BSc, MSc, PhD)",
                    "subject": "string or null",
                    "grade": "string or null",
                    "startDate": "YYYY-MM-DD or null",
                    "endDate": "YYYY-MM-DD or null"
                }
            ],
            "skills": ["array of skill strings"]
        }

        Rules:
        - For dates, use the first day of the month if only month/year is given (e.g., "Jan 2020" becomes "2020-01-01")
        - For dates with only year, use January 1st (e.g., "2020" becomes "2020-01-01")
        - Set isCurrent to true if the job appears to be ongoing (e.g., "Present", "Current", no end date mentioned with recent start)
        - Extract all employment history in chronological order
        - If information is not available, use null
        - Do not invent or assume information not present in the text

        CV TEXT:
        {CV_TEXT}
        """;

    /// <summary>
    /// Creates the service and builds the Anthropic client from the configured API key.
    /// </summary>
    /// <param name="settings">Options object whose <c>Value.ApiKey</c> is passed to <see cref="AnthropicClient"/>.</param>
    /// <param name="logger">Logger used for diagnostic output.</param>
    // NOTE(review): the generic argument of IOptions was not visible in the reviewed text;
    // AnthropicSettings is assumed — confirm against TrueCV.Infrastructure.Configuration.
    public CVParserService(
        IOptions<AnthropicSettings> settings,
        ILogger<CVParserService> logger)
    {
        _logger = logger;
        _anthropicClient = new AnthropicClient(settings.Value.ApiKey);
    }

    /// <summary>
    /// Parses a CV file into structured data.
    /// </summary>
    /// <param name="fileStream">Readable stream with the file content.</param>
    /// <param name="fileName">File name; its extension selects the text extractor.</param>
    /// <returns>The structured CV data produced from the model's reply.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileStream"/> or <paramref name="fileName"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="fileName"/> is empty or whitespace.</exception>
    /// <exception cref="NotSupportedException">The extension is neither <c>.pdf</c> nor <c>.docx</c>.</exception>
    /// <exception cref="InvalidOperationException">
    /// No text could be extracted, or the model reply was empty or not valid JSON.
    /// </exception>
    public async Task<CVData> ParseAsync(Stream fileStream, string fileName)
    {
        ArgumentNullException.ThrowIfNull(fileStream);
        ArgumentException.ThrowIfNullOrWhiteSpace(fileName);

        _logger.LogDebug("Parsing CV file: {FileName}", fileName);

        var text = await ExtractTextAsync(fileStream, fileName);

        if (string.IsNullOrWhiteSpace(text))
        {
            _logger.LogWarning("No text content extracted from file: {FileName}", fileName);
            throw new InvalidOperationException($"Could not extract text content from file: {fileName}");
        }

        _logger.LogDebug("Extracted {CharCount} characters from {FileName}", text.Length, fileName);

        var cvData = await ParseWithClaudeAsync(text);

        _logger.LogInformation(
            "Successfully parsed CV for {FullName} with {EmploymentCount} employment entries and {EducationCount} education entries",
            cvData.FullName,
            cvData.Employment.Count,
            cvData.Education.Count);

        return cvData;
    }

    /// <summary>
    /// Dispatches to the PDF or DOCX extractor based on the (case-insensitive) file extension.
    /// </summary>
    /// <exception cref="NotSupportedException">The extension is not <c>.pdf</c> or <c>.docx</c>.</exception>
    private static async Task<string> ExtractTextAsync(Stream fileStream, string fileName)
    {
        var extension = Path.GetExtension(fileName).ToLowerInvariant();

        return extension switch
        {
            ".pdf" => await ExtractTextFromPdfAsync(fileStream),
            ".docx" => ExtractTextFromDocx(fileStream),
            _ => throw new NotSupportedException($"File type '{extension}' is not supported. Only PDF and DOCX files are accepted.")
        };
    }

    /// <summary>
    /// Reads every page of a PDF and returns the page texts, one per line.
    /// </summary>
    private static async Task<string> ExtractTextFromPdfAsync(Stream fileStream)
    {
        // Copy stream to memory for PdfPig (requires seekable stream)
        using var memoryStream = new MemoryStream();
        await fileStream.CopyToAsync(memoryStream);
        memoryStream.Position = 0;

        using var document = PdfDocument.Open(memoryStream);
        var textBuilder = new StringBuilder();

        foreach (var page in document.GetPages())
        {
            textBuilder.AppendLine(page.Text);
        }

        return textBuilder.ToString();
    }

    /// <summary>
    /// Returns the text of every non-blank paragraph directly under the document body, one per line.
    /// Returns an empty string when the package has no main document body.
    /// </summary>
    private static string ExtractTextFromDocx(Stream fileStream)
    {
        // Second argument false = open the package read-only.
        using var document = WordprocessingDocument.Open(fileStream, false);

        var body = document.MainDocumentPart?.Document?.Body;
        if (body is null)
        {
            return string.Empty;
        }

        var textBuilder = new StringBuilder();

        foreach (var paragraph in body.Elements<Paragraph>())
        {
            var paragraphText = paragraph.InnerText;
            if (!string.IsNullOrWhiteSpace(paragraphText))
            {
                textBuilder.AppendLine(paragraphText);
            }
        }

        return textBuilder.ToString();
    }

    /// <summary>
    /// Sends the CV text to the Claude API and converts the JSON reply into <see cref="CVData"/>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The reply contained no text, deserialized to null, or was not valid JSON.
    /// </exception>
    private async Task<CVData> ParseWithClaudeAsync(string cvText)
    {
        var prompt = ExtractionPrompt.Replace(CvTextPlaceholder, cvText);

        var messages = new List<Message>
        {
            new(RoleType.User, prompt)
        };

        var parameters = new MessageParameters
        {
            Model = ModelName,
            MaxTokens = MaxResponseTokens,
            Messages = messages,
            System = [new SystemMessage(SystemPrompt)]
        };

        _logger.LogDebug("Sending CV text to Claude API for parsing");

        var response = await _anthropicClient.Messages.GetClaudeMessageAsync(parameters);

        // Only the first text block of the reply is used.
        var responseText = response.Content
            .OfType<TextContent>()
            .FirstOrDefault()?.Text;

        if (string.IsNullOrWhiteSpace(responseText))
        {
            _logger.LogError("Claude API returned empty response");
            throw new InvalidOperationException("Failed to parse CV: AI returned empty response");
        }

        // Clean up response - remove markdown code blocks if present
        responseText = CleanJsonResponse(responseText);

        _logger.LogDebug("Received response from Claude API, parsing JSON");

        try
        {
            var parsedResponse = JsonSerializer.Deserialize<ClaudeCVResponse>(responseText, JsonOptions);

            if (parsedResponse is null)
            {
                throw new InvalidOperationException("Failed to deserialize CV data from AI response");
            }

            return MapToCVData(parsedResponse);
        }
        catch (JsonException ex)
        {
            // NOTE(review): the logged response is the model's rendering of the CV, so it can
            // contain the candidate's name, email and phone — confirm this is acceptable for the log sink.
            _logger.LogError(ex, "Failed to parse Claude response as JSON: {Response}", responseText);
            throw new InvalidOperationException("Failed to parse CV: AI returned invalid JSON", ex);
        }
    }

    /// <summary>
    /// Strips a leading <c>```json</c> / <c>```</c> fence and a trailing <c>```</c> fence, then trims whitespace.
    /// </summary>
    private static string CleanJsonResponse(string response)
    {
        const string JsonFence = "```json";
        const string Fence = "```";

        var trimmed = response.Trim();

        // Fences are ASCII markers, so compare ordinally rather than with the current culture.
        if (trimmed.StartsWith(JsonFence, StringComparison.OrdinalIgnoreCase))
        {
            trimmed = trimmed[JsonFence.Length..];
        }
        else if (trimmed.StartsWith(Fence, StringComparison.Ordinal))
        {
            trimmed = trimmed[Fence.Length..];
        }

        if (trimmed.EndsWith(Fence, StringComparison.Ordinal))
        {
            trimmed = trimmed[..^Fence.Length];
        }

        return trimmed.Trim();
    }

    /// <summary>
    /// Maps the deserialized reply onto the application model, substituting placeholder
    /// names for missing required strings and empty lists for missing collections.
    /// </summary>
    private static CVData MapToCVData(ClaudeCVResponse response)
    {
        return new CVData
        {
            FullName = response.FullName ?? "Unknown",
            Email = response.Email,
            Phone = response.Phone,
            Employment = response.Employment?.Select(e => new EmploymentEntry
            {
                CompanyName = e.CompanyName ?? "Unknown Company",
                JobTitle = e.JobTitle ?? "Unknown Position",
                Location = e.Location,
                StartDate = ParseDate(e.StartDate),
                EndDate = ParseDate(e.EndDate),
                IsCurrent = e.IsCurrent ?? false,
                Description = e.Description
            }).ToList() ?? [],
            Education = response.Education?.Select(e => new EducationEntry
            {
                Institution = e.Institution ?? "Unknown Institution",
                Qualification = e.Qualification,
                Subject = e.Subject,
                Grade = e.Grade,
                StartDate = ParseDate(e.StartDate),
                EndDate = ParseDate(e.EndDate)
            }).ToList() ?? [],
            Skills = response.Skills ?? []
        };
    }

    /// <summary>
    /// Parses a date string from the model reply; returns null for blank or unparseable input.
    /// </summary>
    private static DateOnly? ParseDate(string? dateString)
    {
        if (string.IsNullOrWhiteSpace(dateString))
        {
            return null;
        }

        // The prompt requests YYYY-MM-DD, which is machine-formatted data: parse it with the
        // invariant culture so the result does not depend on the host's culture or calendar.
        if (DateOnly.TryParse(dateString, CultureInfo.InvariantCulture, DateTimeStyles.None, out var date))
        {
            return date;
        }

        return null;
    }

    // Internal DTOs for Claude response parsing

    /// <summary>Top-level shape of the JSON reply requested by <see cref="ExtractionPrompt"/>.</summary>
    private sealed record ClaudeCVResponse
    {
        public string? FullName { get; init; }
        public string? Email { get; init; }
        public string? Phone { get; init; }
        public List<ClaudeEmploymentEntry>? Employment { get; init; }
        public List<ClaudeEducationEntry>? Education { get; init; }
        public List<string>? Skills { get; init; }
    }

    /// <summary>One element of the <c>employment</c> array; dates stay as strings until <see cref="ParseDate"/>.</summary>
    private sealed record ClaudeEmploymentEntry
    {
        public string? CompanyName { get; init; }
        public string? JobTitle { get; init; }
        public string? Location { get; init; }
        public string? StartDate { get; init; }
        public string? EndDate { get; init; }
        public bool? IsCurrent { get; init; }
        public string? Description { get; init; }
    }

    /// <summary>One element of the <c>education</c> array; dates stay as strings until <see cref="ParseDate"/>.</summary>
    private sealed record ClaudeEducationEntry
    {
        public string? Institution { get; init; }
        public string? Qualification { get; init; }
        public string? Subject { get; init; }
        public string? Grade { get; init; }
        public string? StartDate { get; init; }
        public string? EndDate { get; init; }
    }
}