Files
RealCV/src/RealCV.Infrastructure/Services/CVParserService.cs

279 lines
10 KiB
C#
Raw Normal View History

using System.Text;
using System.Text.Json;
using Anthropic.SDK;
using Anthropic.SDK.Messaging;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using RealCV.Application.Helpers;
using RealCV.Application.Interfaces;
using RealCV.Application.Models;
using RealCV.Infrastructure.Configuration;
using RealCV.Infrastructure.Helpers;
using UglyToad.PdfPig;
namespace RealCV.Infrastructure.Services;
public sealed class CVParserService : ICVParserService
{
// Client used by ParseWithClaudeAsync to send the extracted CV text to the Anthropic Messages API.
private readonly AnthropicClient _anthropicClient;
// Logger for parse progress (debug/information) and failures (warning/error).
private readonly ILogger<CVParserService> _logger;
// System prompt sent with every request: tells the model to act as a CV parser and to reply with JSON only.
private const string SystemPrompt = """
You are a CV/Resume parser. Your task is to extract structured information from CV text.
You must respond ONLY with valid JSON, no other text or markdown.
""";
// User-message template: describes the expected JSON structure and the date / isCurrent rules.
// The JSON keys listed here are the ones the private Claude* DTO records in this class are deserialized from.
// The literal "{CV_TEXT}" placeholder on the last line is substituted with the extracted CV text
// via string.Replace in ParseWithClaudeAsync.
private const string ExtractionPrompt = """
Parse the following CV text and extract the information into this exact JSON structure:
{
"fullName": "string (required)",
"email": "string or null",
"phone": "string or null",
"employment": [
{
"companyName": "string (required)",
"jobTitle": "string (required)",
"location": "string or null",
"startDate": "YYYY-MM-DD or null",
"endDate": "YYYY-MM-DD or null (null if current)",
"isCurrent": "boolean",
"description": "string or null"
}
],
"education": [
{
"institution": "string (required)",
"qualification": "string or null (e.g., BSc, MSc, PhD)",
"subject": "string or null",
"grade": "string or null",
"startDate": "YYYY-MM-DD or null",
"endDate": "YYYY-MM-DD or null"
}
],
"skills": ["array of skill strings"]
}
Rules:
- For dates, use the first day of the month if only month/year is given (e.g., "Jan 2020" becomes "2020-01-01")
- For dates with only year, use January 1st (e.g., "2020" becomes "2020-01-01")
- Set isCurrent to true if the job appears to be ongoing (e.g., "Present", "Current", no end date mentioned with recent start)
- Extract all employment history in chronological order
- If information is not available, use null
- Do not invent or assume information not present in the text
CV TEXT:
{CV_TEXT}
""";
/// <summary>
/// Creates the parser service and the Anthropic client it uses for CV extraction.
/// </summary>
/// <param name="settings">Options wrapper whose <c>ApiKey</c> is passed to the Anthropic client.</param>
/// <param name="logger">Logger used for parse diagnostics.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="settings"/> or <paramref name="logger"/> is <c>null</c>.
/// </exception>
public CVParserService(
    IOptions<AnthropicSettings> settings,
    ILogger<CVParserService> logger)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException here
    // (settings) or at the first log call (logger).
    ArgumentNullException.ThrowIfNull(settings);
    ArgumentNullException.ThrowIfNull(logger);

    _logger = logger;

    // The API key itself is intentionally not validated here; it is handed to the client as-is.
    _anthropicClient = new AnthropicClient(settings.Value.ApiKey);
}
/// <summary>
/// Extracts the text of an uploaded CV and turns it into structured <see cref="CVData"/>.
/// </summary>
/// <param name="fileStream">Stream containing the CV file.</param>
/// <param name="fileName">Original file name; its extension selects the text extractor.</param>
/// <param name="cancellationToken">Token forwarded to text extraction and the AI call.</param>
/// <returns>The parsed CV data.</returns>
/// <exception cref="ArgumentNullException"><paramref name="fileStream"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException"><paramref name="fileName"/> is <c>null</c>, empty or whitespace.</exception>
/// <exception cref="InvalidOperationException">No text could be extracted from the file.</exception>
public async Task<CVData> ParseAsync(Stream fileStream, string fileName, CancellationToken cancellationToken = default)
{
    ArgumentNullException.ThrowIfNull(fileStream);
    ArgumentException.ThrowIfNullOrWhiteSpace(fileName);

    _logger.LogDebug("Parsing CV file: {FileName}", fileName);

    // Step 1: raw text from the document.
    var extractedText = await ExtractTextAsync(fileStream, fileName, cancellationToken);
    var hasText = !string.IsNullOrWhiteSpace(extractedText);
    if (!hasText)
    {
        _logger.LogWarning("No text content extracted from file: {FileName}", fileName);
        throw new InvalidOperationException($"Could not extract text content from file: {fileName}");
    }

    _logger.LogDebug("Extracted {CharCount} characters from {FileName}", extractedText.Length, fileName);

    // Step 2: structured data from the text.
    var result = await ParseWithClaudeAsync(extractedText, cancellationToken);

    _logger.LogInformation(
        "Successfully parsed CV for {FullName} with {EmploymentCount} employment entries and {EducationCount} education entries",
        result.FullName,
        result.Employment.Count,
        result.Education.Count);

    return result;
}
/// <summary>
/// Chooses a text extractor from the file extension and returns the document's plain text.
/// </summary>
/// <param name="fileStream">Stream containing the document.</param>
/// <param name="fileName">File name whose extension (compared case-insensitively) selects the extractor.</param>
/// <param name="cancellationToken">Token observed while buffering and extracting.</param>
/// <returns>The extracted text.</returns>
/// <exception cref="NotSupportedException">The extension is neither <c>.pdf</c> nor <c>.docx</c>.</exception>
private async Task<string> ExtractTextAsync(Stream fileStream, string fileName, CancellationToken cancellationToken)
{
    // Lower-cased once because the value is also echoed in the error message below.
    var extension = Path.GetExtension(fileName).ToLowerInvariant();
    return extension switch
    {
        ".pdf" => await ExtractTextFromPdfAsync(fileStream, cancellationToken),
        ".docx" => await ExtractTextFromDocxStreamAsync(fileStream, cancellationToken),
        _ => throw new NotSupportedException($"File type '{extension}' is not supported. Only PDF and DOCX files are accepted.")
    };
}

/// <summary>
/// Runs the synchronous DOCX extractor, first buffering the input in memory when it is not seekable.
/// </summary>
/// <remarks>
/// The DOCX reader works synchronously on the stream it is given. Non-seekable inputs
/// (typical for upload/network streams, which may not permit synchronous reads) are therefore
/// copied asynchronously into a <see cref="MemoryStream"/> first, mirroring what the PDF path does.
/// Seekable streams are passed through unchanged.
/// </remarks>
private static async Task<string> ExtractTextFromDocxStreamAsync(Stream fileStream, CancellationToken cancellationToken)
{
    if (fileStream.CanSeek)
    {
        return ExtractTextFromDocx(fileStream);
    }

    using var buffer = new MemoryStream();
    await fileStream.CopyToAsync(buffer, cancellationToken);
    buffer.Position = 0;
    return ExtractTextFromDocx(buffer);
}
/// <summary>
/// Reads every page of a PDF and returns the page texts, each followed by a line break.
/// </summary>
/// <param name="fileStream">Stream containing the PDF; it is read from its current position.</param>
/// <param name="cancellationToken">Token observed while copying and between pages.</param>
/// <returns>The concatenated page text.</returns>
private async Task<string> ExtractTextFromPdfAsync(Stream fileStream, CancellationToken cancellationToken)
{
    // PdfPig needs a seekable stream, so work on an in-memory copy.
    using var buffer = new MemoryStream();
    await fileStream.CopyToAsync(buffer, cancellationToken);
    buffer.Seek(0, SeekOrigin.Begin);

    using var pdf = PdfDocument.Open(buffer);
    var allPages = new StringBuilder();
    foreach (var pdfPage in pdf.GetPages())
    {
        cancellationToken.ThrowIfCancellationRequested();
        allPages.Append(pdfPage.Text).Append(Environment.NewLine);
    }

    return allPages.ToString();
}
/// <summary>
/// Returns the text of a DOCX document, one non-blank paragraph per line.
/// </summary>
/// <remarks>
/// Paragraphs are collected from the whole body, including those inside tables and
/// content controls, which CV templates commonly use for layout. Enumerating only the
/// body's direct children would silently drop that text.
/// </remarks>
/// <param name="fileStream">Stream containing the DOCX package; opened read-only.</param>
/// <returns>The extracted text, or an empty string when the document has no body.</returns>
private static string ExtractTextFromDocx(Stream fileStream)
{
    using var document = WordprocessingDocument.Open(fileStream, false);
    var body = document.MainDocumentPart?.Document?.Body;
    if (body is null)
    {
        return string.Empty;
    }

    var textBuilder = new StringBuilder();
    foreach (var paragraph in body.Descendants<Paragraph>())
    {
        // A paragraph nested inside another paragraph (e.g. text-box content) is already
        // part of the outer paragraph's InnerText; skipping it avoids emitting the text twice.
        if (paragraph.Ancestors<Paragraph>().Any())
        {
            continue;
        }

        var paragraphText = paragraph.InnerText;
        if (!string.IsNullOrWhiteSpace(paragraphText))
        {
            textBuilder.AppendLine(paragraphText);
        }
    }

    return textBuilder.ToString();
}
/// <summary>
/// Sends the extracted CV text to the Claude API and converts the JSON reply into <see cref="CVData"/>.
/// </summary>
/// <param name="cvText">Plain text extracted from the CV; substituted into the extraction prompt.</param>
/// <param name="cancellationToken">Token forwarded to the API call.</param>
/// <returns>The mapped CV data.</returns>
/// <exception cref="InvalidOperationException">
/// The reply contained no text, was not valid JSON, or deserialized to <c>null</c>.
/// </exception>
private async Task<CVData> ParseWithClaudeAsync(string cvText, CancellationToken cancellationToken)
{
    var userMessage = new Message(RoleType.User, ExtractionPrompt.Replace("{CV_TEXT}", cvText));
    var request = new MessageParameters
    {
        Model = "claude-3-5-haiku-20241022",
        MaxTokens = 2048,
        Messages = new List<Message> { userMessage },
        System = [new SystemMessage(SystemPrompt)]
    };

    _logger.LogDebug("Sending CV text to Claude API for parsing");
    var reply = await _anthropicClient.Messages.GetClaudeMessageAsync(request, cancellationToken);

    // Only the first text block of the reply is used.
    var rawReply = reply.Content
        .OfType<TextContent>()
        .Select(block => block.Text)
        .FirstOrDefault();
    if (string.IsNullOrWhiteSpace(rawReply))
    {
        _logger.LogError("Claude API returned empty response");
        throw new InvalidOperationException("Failed to parse CV: AI returned empty response");
    }

    // Clean up response - remove markdown code blocks if present
    var json = JsonResponseHelper.CleanJsonResponse(rawReply);
    _logger.LogDebug("Received response from Claude API, parsing JSON");

    try
    {
        var dto = JsonSerializer.Deserialize<ClaudeCVResponse>(json, JsonDefaults.CamelCase)
            ?? throw new InvalidOperationException("Failed to deserialize CV data from AI response");
        return MapToCVData(dto);
    }
    catch (JsonException ex)
    {
        _logger.LogError(ex, "Failed to parse Claude response as JSON: {Response}", json);
        throw new InvalidOperationException("Failed to parse CV: AI returned invalid JSON", ex);
    }
}
/// <summary>
/// Converts the deserialized AI response into the application's <see cref="CVData"/> model,
/// substituting placeholder names for missing required strings and empty lists for missing collections.
/// </summary>
/// <param name="response">The deserialized response DTO.</param>
/// <returns>A populated <see cref="CVData"/> instance.</returns>
private static CVData MapToCVData(ClaudeCVResponse response)
{
    return new CVData
    {
        FullName = response.FullName ?? "Unknown",
        Email = response.Email,
        Phone = response.Phone,
        Employment = response.Employment?.Select(ToEmploymentEntry).ToList() ?? [],
        Education = response.Education?.Select(ToEducationEntry).ToList() ?? [],
        Skills = response.Skills ?? []
    };

    // Maps one employment item; date strings go through DateHelpers.ParseDate.
    static EmploymentEntry ToEmploymentEntry(ClaudeEmploymentEntry job) => new()
    {
        CompanyName = job.CompanyName ?? "Unknown Company",
        JobTitle = job.JobTitle ?? "Unknown Position",
        Location = job.Location,
        StartDate = DateHelpers.ParseDate(job.StartDate),
        EndDate = DateHelpers.ParseDate(job.EndDate),
        IsCurrent = job.IsCurrent ?? false,
        Description = job.Description
    };

    // Maps one education item; date strings go through DateHelpers.ParseDate.
    static EducationEntry ToEducationEntry(ClaudeEducationEntry study) => new()
    {
        Institution = study.Institution ?? "Unknown Institution",
        Qualification = study.Qualification,
        Subject = study.Subject,
        Grade = study.Grade,
        StartDate = DateHelpers.ParseDate(study.StartDate),
        EndDate = DateHelpers.ParseDate(study.EndDate)
    };
}
// Internal DTOs for Claude response parsing
/// <summary>
/// Top-level deserialization target for the JSON reply requested by <c>ExtractionPrompt</c>.
/// Every member is nullable because the reply may omit any field; <c>MapToCVData</c>
/// supplies the fallbacks ("Unknown" for the name, empty lists for the collections).
/// </summary>
private sealed record ClaudeCVResponse
{
// Candidate's full name; mapped to "Unknown" when absent.
public string? FullName { get; init; }
public string? Email { get; init; }
public string? Phone { get; init; }
// Employment history items; a missing list is mapped to an empty list.
public List<ClaudeEmploymentEntry>? Employment { get; init; }
// Education items; a missing list is mapped to an empty list.
public List<ClaudeEducationEntry>? Education { get; init; }
// Skill strings; a missing list is mapped to an empty list.
public List<string>? Skills { get; init; }
}
/// <summary>
/// One item of the "employment" array in the AI reply. Dates stay as raw strings here
/// (the prompt asks for YYYY-MM-DD or null) and are converted by <c>DateHelpers.ParseDate</c>
/// in <c>MapToCVData</c>.
/// </summary>
private sealed record ClaudeEmploymentEntry
{
// Mapped to "Unknown Company" when absent.
public string? CompanyName { get; init; }
// Mapped to "Unknown Position" when absent.
public string? JobTitle { get; init; }
public string? Location { get; init; }
public string? StartDate { get; init; }
public string? EndDate { get; init; }
// Nullable so an omitted flag can be detected; mapped to false when absent.
public bool? IsCurrent { get; init; }
public string? Description { get; init; }
}
/// <summary>
/// One item of the "education" array in the AI reply. Dates stay as raw strings here
/// (the prompt asks for YYYY-MM-DD or null) and are converted by <c>DateHelpers.ParseDate</c>
/// in <c>MapToCVData</c>.
/// </summary>
private sealed record ClaudeEducationEntry
{
// Mapped to "Unknown Institution" when absent.
public string? Institution { get; init; }
// Per the prompt, e.g. BSc, MSc, PhD.
public string? Qualification { get; init; }
public string? Subject { get; init; }
public string? Grade { get; init; }
public string? StartDate { get; init; }
public string? EndDate { get; init; }
}
}