319 lines
11 KiB
C#
319 lines
11 KiB
C#
|
|
using System.Text;
|
||
|
|
using System.Text.Json;
|
||
|
|
using Anthropic.SDK;
|
||
|
|
using Anthropic.SDK.Messaging;
|
||
|
|
using DocumentFormat.OpenXml.Packaging;
|
||
|
|
using DocumentFormat.OpenXml.Wordprocessing;
|
||
|
|
using Microsoft.Extensions.Logging;
|
||
|
|
using Microsoft.Extensions.Options;
|
||
|
|
using TrueCV.Application.Interfaces;
|
||
|
|
using TrueCV.Application.Models;
|
||
|
|
using TrueCV.Infrastructure.Configuration;
|
||
|
|
using UglyToad.PdfPig;
|
||
|
|
|
||
|
|
namespace TrueCV.Infrastructure.Services;
|
||
|
|
|
||
|
|
public sealed class CVParserService : ICVParserService
|
||
|
|
{
|
||
|
|
// Client used to send message requests to the Anthropic API; assigned once in the constructor.
private readonly AnthropicClient _anthropicClient;

// Logger for diagnostics emitted while extracting and parsing CVs.
private readonly ILogger<CVParserService> _logger;

// Shared options for deserializing the model's JSON reply:
// camelCase naming policy with case-insensitive property matching.
private static readonly JsonSerializerOptions JsonOptions = new JsonSerializerOptions
{
    PropertyNameCaseInsensitive = true,
    PropertyNamingPolicy = JsonNamingPolicy.CamelCase
};
|
||
|
|
|
||
|
|
// System prompt sent with every parsing request (see ParseWithClaudeAsync):
// tells the model to act as a CV parser and to answer with JSON only.
private const string SystemPrompt = """
|
||
|
|
You are a CV/Resume parser. Your task is to extract structured information from CV text.
|
||
|
|
You must respond ONLY with valid JSON, no other text or markdown.
|
||
|
|
""";
|
||
|
|
|
||
|
|
// User-message template for the extraction request. It describes the JSON shape that
// ClaudeCVResponse is deserialized from and the date/isCurrent rules the model should follow.
// The literal "{CV_TEXT}" placeholder is substituted with the extracted CV text via
// string.Replace in ParseWithClaudeAsync (this is not an interpolated string).
private const string ExtractionPrompt = """
|
||
|
|
Parse the following CV text and extract the information into this exact JSON structure:
|
||
|
|
|
||
|
|
{
|
||
|
|
"fullName": "string (required)",
|
||
|
|
"email": "string or null",
|
||
|
|
"phone": "string or null",
|
||
|
|
"employment": [
|
||
|
|
{
|
||
|
|
"companyName": "string (required)",
|
||
|
|
"jobTitle": "string (required)",
|
||
|
|
"location": "string or null",
|
||
|
|
"startDate": "YYYY-MM-DD or null",
|
||
|
|
"endDate": "YYYY-MM-DD or null (null if current)",
|
||
|
|
"isCurrent": "boolean",
|
||
|
|
"description": "string or null"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"education": [
|
||
|
|
{
|
||
|
|
"institution": "string (required)",
|
||
|
|
"qualification": "string or null (e.g., BSc, MSc, PhD)",
|
||
|
|
"subject": "string or null",
|
||
|
|
"grade": "string or null",
|
||
|
|
"startDate": "YYYY-MM-DD or null",
|
||
|
|
"endDate": "YYYY-MM-DD or null"
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"skills": ["array of skill strings"]
|
||
|
|
}
|
||
|
|
|
||
|
|
Rules:
|
||
|
|
- For dates, use the first day of the month if only month/year is given (e.g., "Jan 2020" becomes "2020-01-01")
|
||
|
|
- For dates with only year, use January 1st (e.g., "2020" becomes "2020-01-01")
|
||
|
|
- Set isCurrent to true if the job appears to be ongoing (e.g., "Present", "Current", no end date mentioned with recent start)
|
||
|
|
- Extract all employment history in chronological order
|
||
|
|
- If information is not available, use null
|
||
|
|
- Do not invent or assume information not present in the text
|
||
|
|
|
||
|
|
CV TEXT:
|
||
|
|
{CV_TEXT}
|
||
|
|
""";
|
||
|
|
|
||
|
|
/// <summary>
/// Creates the parser and builds the Anthropic client from the configured API key.
/// </summary>
/// <param name="settings">Options wrapper whose <c>Value.ApiKey</c> is handed to the <see cref="AnthropicClient"/>.</param>
/// <param name="logger">Logger used for diagnostics while parsing.</param>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="settings"/> or <paramref name="logger"/> is <c>null</c>.
/// </exception>
public CVParserService(
    IOptions<AnthropicSettings> settings,
    ILogger<CVParserService> logger)
{
    // Fail fast with a descriptive exception instead of a NullReferenceException here
    // (settings) or on the first log call (logger).
    ArgumentNullException.ThrowIfNull(settings);
    ArgumentNullException.ThrowIfNull(logger);

    _logger = logger;
    _anthropicClient = new AnthropicClient(settings.Value.ApiKey);
}
|
||
|
|
|
||
|
|
/// <summary>
/// Extracts the text of a CV file and converts it into structured <see cref="CVData"/>.
/// </summary>
/// <param name="fileStream">Stream holding the file content.</param>
/// <param name="fileName">File name; its extension selects the text-extraction routine.</param>
/// <returns>The structured CV data produced by the AI parsing step.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="fileStream"/> or <paramref name="fileName"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException">Thrown when <paramref name="fileName"/> is empty or whitespace.</exception>
/// <exception cref="InvalidOperationException">Thrown when no text could be extracted from the file.</exception>
public async Task<CVData> ParseAsync(Stream fileStream, string fileName)
{
    ArgumentNullException.ThrowIfNull(fileStream);
    ArgumentException.ThrowIfNullOrWhiteSpace(fileName);

    _logger.LogDebug("Parsing CV file: {FileName}", fileName);

    var extractedText = await ExtractTextAsync(fileStream, fileName);
    var hasContent = !string.IsNullOrWhiteSpace(extractedText);

    if (!hasContent)
    {
        // Nothing to send to the model; treat as a failed parse.
        _logger.LogWarning("No text content extracted from file: {FileName}", fileName);
        throw new InvalidOperationException($"Could not extract text content from file: {fileName}");
    }

    _logger.LogDebug("Extracted {CharCount} characters from {FileName}", extractedText.Length, fileName);

    var parsed = await ParseWithClaudeAsync(extractedText);

    _logger.LogInformation(
        "Successfully parsed CV for {FullName} with {EmploymentCount} employment entries and {EducationCount} education entries",
        parsed.FullName,
        parsed.Employment.Count,
        parsed.Education.Count);

    return parsed;
}
|
||
|
|
|
||
|
|
/// <summary>
/// Dispatches to the text extractor that matches the file's extension.
/// </summary>
/// <param name="fileStream">Stream holding the file content.</param>
/// <param name="fileName">File name whose (lower-cased) extension picks the extractor.</param>
/// <returns>The text extracted from the file.</returns>
/// <exception cref="NotSupportedException">Thrown for any extension other than <c>.pdf</c> or <c>.docx</c>.</exception>
private async Task<string> ExtractTextAsync(Stream fileStream, string fileName)
{
    // Lower-cased so that ".PDF" / ".Docx" are accepted as well.
    var extension = Path.GetExtension(fileName).ToLowerInvariant();

    if (extension == ".pdf")
    {
        return await ExtractTextFromPdfAsync(fileStream);
    }

    if (extension == ".docx")
    {
        return ExtractTextFromDocx(fileStream);
    }

    throw new NotSupportedException($"File type '{extension}' is not supported. Only PDF and DOCX files are accepted.");
}
|
||
|
|
|
||
|
|
/// <summary>
/// Reads a PDF from the stream and returns the text of every page, one page per line block.
/// </summary>
/// <param name="fileStream">Stream positioned at the start of the PDF content.</param>
/// <returns>The concatenated page texts, each followed by a line terminator.</returns>
private async Task<string> ExtractTextFromPdfAsync(Stream fileStream)
{
    // PdfPig is given an in-memory copy so that it always works on a seekable stream.
    using var seekableCopy = new MemoryStream();
    await fileStream.CopyToAsync(seekableCopy);
    seekableCopy.Position = 0;

    using var pdf = PdfDocument.Open(seekableCopy);

    // Fold the pages into a single builder: page text + newline, in page order.
    return pdf.GetPages()
        .Aggregate(new StringBuilder(), (builder, page) => builder.AppendLine(page.Text))
        .ToString();
}
|
||
|
|
|
||
|
|
/// <summary>
/// Reads a DOCX document from the stream and returns the text of its non-empty paragraphs,
/// one paragraph per line.
/// </summary>
/// <param name="fileStream">Stream holding the DOCX package; opened read-only.</param>
/// <returns>
/// The paragraph texts separated by line terminators, or an empty string when the
/// document has no body.
/// </returns>
private static string ExtractTextFromDocx(Stream fileStream)
{
    using var document = WordprocessingDocument.Open(fileStream, false);
    var body = document.MainDocumentPart?.Document?.Body;

    if (body is null)
    {
        return string.Empty;
    }

    // Walk all descendants rather than only the body's direct children: CVs are often
    // laid out with tables or content controls, and paragraphs inside those containers
    // are not direct children of the body. Paragraphs nested inside another paragraph
    // are skipped because the outer paragraph's InnerText already includes their text.
    var paragraphs = body
        .Descendants<Paragraph>()
        .Where(paragraph => !paragraph.Ancestors<Paragraph>().Any());

    var textBuilder = new StringBuilder();

    foreach (var paragraph in paragraphs)
    {
        var paragraphText = paragraph.InnerText;

        if (!string.IsNullOrWhiteSpace(paragraphText))
        {
            textBuilder.AppendLine(paragraphText);
        }
    }

    return textBuilder.ToString();
}
|
||
|
|
|
||
|
|
/// <summary>
/// Sends the CV text to the Claude API and maps the JSON reply to <see cref="CVData"/>.
/// </summary>
/// <param name="cvText">Plain text extracted from the CV; substituted into the extraction prompt.</param>
/// <returns>The CV data mapped from the model's JSON reply.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the reply has no text, is not valid JSON, or deserializes to <c>null</c>.
/// </exception>
private async Task<CVData> ParseWithClaudeAsync(string cvText)
{
    var userMessage = new Message(RoleType.User, ExtractionPrompt.Replace("{CV_TEXT}", cvText));

    var request = new MessageParameters
    {
        Model = "claude-sonnet-4-20250514",
        MaxTokens = 4096,
        Messages = new List<Message> { userMessage },
        System = [new SystemMessage(SystemPrompt)]
    };

    _logger.LogDebug("Sending CV text to Claude API for parsing");

    var reply = await _anthropicClient.Messages.GetClaudeMessageAsync(request);

    // Only the first text block of the reply is considered.
    var firstTextBlock = reply.Content.OfType<TextContent>().FirstOrDefault();
    var rawReply = firstTextBlock?.Text;

    if (string.IsNullOrWhiteSpace(rawReply))
    {
        _logger.LogError("Claude API returned empty response");
        throw new InvalidOperationException("Failed to parse CV: AI returned empty response");
    }

    // The model may wrap its JSON in a markdown code fence despite the instructions.
    var json = CleanJsonResponse(rawReply);

    _logger.LogDebug("Received response from Claude API, parsing JSON");

    ClaudeCVResponse? payload;

    try
    {
        payload = JsonSerializer.Deserialize<ClaudeCVResponse>(json, JsonOptions);
    }
    catch (JsonException ex)
    {
        // NOTE(review): this logs the full model reply, which presumably contains
        // personal data taken from the CV — confirm this is acceptable for the log sink.
        _logger.LogError(ex, "Failed to parse Claude response as JSON: {Response}", json);
        throw new InvalidOperationException("Failed to parse CV: AI returned invalid JSON", ex);
    }

    if (payload is null)
    {
        // A literal JSON "null" deserializes without error but carries no data.
        throw new InvalidOperationException("Failed to deserialize CV data from AI response");
    }

    return MapToCVData(payload);
}
|
||
|
|
|
||
|
|
/// <summary>
/// Strips a surrounding markdown code fence (<c>```json … ```</c> or <c>``` … ```</c>)
/// and outer whitespace from a model reply so that only the JSON payload remains.
/// </summary>
/// <param name="response">Raw reply text.</param>
/// <returns>The reply without leading/trailing fence markers and whitespace.</returns>
private static string CleanJsonResponse(string response)
{
    const string Fence = "```";
    const string JsonFence = "```json";

    var cleaned = response.Trim();

    // Fence markers are compared ordinally: they are syntax, not linguistic text, and a
    // culture-sensitive match could disagree with the fixed-length slice that follows.
    if (cleaned.StartsWith(JsonFence, StringComparison.OrdinalIgnoreCase))
    {
        cleaned = cleaned[JsonFence.Length..];
    }
    else if (cleaned.StartsWith(Fence, StringComparison.Ordinal))
    {
        cleaned = cleaned[Fence.Length..];
    }

    if (cleaned.EndsWith(Fence, StringComparison.Ordinal))
    {
        cleaned = cleaned[..^Fence.Length];
    }

    return cleaned.Trim();
}
|
||
|
|
|
||
|
|
/// <summary>
/// Converts the deserialized model reply into the application's <see cref="CVData"/> model.
/// </summary>
/// <remarks>
/// Missing required strings are replaced by "Unknown…" placeholders, missing lists become
/// empty lists, and date strings that cannot be parsed become <c>null</c>.
/// </remarks>
/// <param name="response">Deserialized reply DTO.</param>
/// <returns>The mapped CV data.</returns>
private static CVData MapToCVData(ClaudeCVResponse response)
{
    static EmploymentEntry ToEmployment(ClaudeEmploymentEntry source) => new EmploymentEntry
    {
        CompanyName = source.CompanyName ?? "Unknown Company",
        JobTitle = source.JobTitle ?? "Unknown Position",
        Location = source.Location,
        StartDate = ParseDate(source.StartDate),
        EndDate = ParseDate(source.EndDate),
        IsCurrent = source.IsCurrent ?? false,
        Description = source.Description
    };

    static EducationEntry ToEducation(ClaudeEducationEntry source) => new EducationEntry
    {
        Institution = source.Institution ?? "Unknown Institution",
        Qualification = source.Qualification,
        Subject = source.Subject,
        Grade = source.Grade,
        StartDate = ParseDate(source.StartDate),
        EndDate = ParseDate(source.EndDate)
    };

    return new CVData
    {
        FullName = response.FullName ?? "Unknown",
        Email = response.Email,
        Phone = response.Phone,
        Employment = response.Employment?.Select(ToEmployment).ToList() ?? [],
        Education = response.Education?.Select(ToEducation).ToList() ?? [],
        Skills = response.Skills ?? []
    };
}
|
||
|
|
|
||
|
|
/// <summary>
/// Parses a date string from the model reply into a <see cref="DateOnly"/>.
/// </summary>
/// <remarks>
/// The prompt asks for <c>YYYY-MM-DD</c>, so that exact format is tried first. Parsing always
/// uses the invariant culture, so the result does not depend on the server's regional
/// settings or default calendar. A lenient invariant-culture parse is kept as a fallback
/// for replies that deviate from the requested format.
/// </remarks>
/// <param name="dateString">Date text, or <c>null</c>/blank when no date is available.</param>
/// <returns>The parsed date, or <c>null</c> when the input is blank or not a recognizable date.</returns>
private static DateOnly? ParseDate(string? dateString)
{
    if (string.IsNullOrWhiteSpace(dateString))
    {
        return null;
    }

    var invariant = System.Globalization.CultureInfo.InvariantCulture;
    var candidate = dateString.Trim();

    if (DateOnly.TryParseExact(
            candidate,
            "yyyy-MM-dd",
            invariant,
            System.Globalization.DateTimeStyles.None,
            out var isoDate))
    {
        return isoDate;
    }

    if (DateOnly.TryParse(
            candidate,
            invariant,
            System.Globalization.DateTimeStyles.None,
            out var lenientDate))
    {
        return lenientDate;
    }

    return null;
}
|
||
|
|
|
||
|
|
// Internal DTOs for Claude response parsing

/// <summary>
/// Top-level shape of the JSON reply requested by the extraction prompt.
/// Every member is nullable because the reply is not guaranteed to contain it.
/// </summary>
private sealed record ClaudeCVResponse
{
    /// <summary>Candidate's full name ("fullName").</summary>
    public string? FullName { get; init; }

    /// <summary>Contact e-mail address ("email").</summary>
    public string? Email { get; init; }

    /// <summary>Contact phone number ("phone").</summary>
    public string? Phone { get; init; }

    /// <summary>Employment history entries ("employment").</summary>
    public List<ClaudeEmploymentEntry>? Employment { get; init; }

    /// <summary>Education history entries ("education").</summary>
    public List<ClaudeEducationEntry>? Education { get; init; }

    /// <summary>Skill names ("skills").</summary>
    public List<string>? Skills { get; init; }
}
|
||
|
|
|
||
|
|
/// <summary>
/// One element of the "employment" array in the model reply.
/// Dates are kept as raw strings here and parsed when mapping to the application model.
/// </summary>
private sealed record ClaudeEmploymentEntry
{
    /// <summary>Employer name ("companyName").</summary>
    public string? CompanyName { get; init; }

    /// <summary>Job title ("jobTitle").</summary>
    public string? JobTitle { get; init; }

    /// <summary>Job location ("location").</summary>
    public string? Location { get; init; }

    /// <summary>Start date text ("startDate"); requested as YYYY-MM-DD.</summary>
    public string? StartDate { get; init; }

    /// <summary>End date text ("endDate"); requested as YYYY-MM-DD, null for a current job.</summary>
    public string? EndDate { get; init; }

    /// <summary>Whether the job is ongoing ("isCurrent").</summary>
    public bool? IsCurrent { get; init; }

    /// <summary>Free-text role description ("description").</summary>
    public string? Description { get; init; }
}
|
||
|
|
|
||
|
|
/// <summary>
/// One element of the "education" array in the model reply.
/// Dates are kept as raw strings here and parsed when mapping to the application model.
/// </summary>
private sealed record ClaudeEducationEntry
{
    /// <summary>Institution name ("institution").</summary>
    public string? Institution { get; init; }

    /// <summary>Qualification, e.g. BSc, MSc, PhD ("qualification").</summary>
    public string? Qualification { get; init; }

    /// <summary>Subject studied ("subject").</summary>
    public string? Subject { get; init; }

    /// <summary>Grade achieved ("grade").</summary>
    public string? Grade { get; init; }

    /// <summary>Start date text ("startDate"); requested as YYYY-MM-DD.</summary>
    public string? StartDate { get; init; }

    /// <summary>End date text ("endDate"); requested as YYYY-MM-DD.</summary>
    public string? EndDate { get; init; }
}
|
||
|
|
}
|