diff --git a/src/OFACScraper/Application.cs b/src/OFACScraper/Application.cs
index e27bbde..46d4413 100644
--- a/src/OFACScraper/Application.cs
+++ b/src/OFACScraper/Application.cs
@@ -47,6 +47,36 @@ public class Application
         return 0;
     }
 
+    /// <summary>
+    /// Scrapes all years into the checkpoint DB without downloading PDFs or syncing to S3.
+    /// Used to produce a spreadsheet export quickly.
+    /// </summary>
+    /// <param name="ct">Checked before each year is fetched and passed on to the scraper; when cancelled, the loop stops early.</param>
+    /// <returns>Always 0, which the caller uses as the process exit code.</returns>
+    public async Task<int> RunScrapeOnlyAsync(CancellationToken ct = default)
+    {
+        var currentYear = DateTime.UtcNow.Year;
+        _logger.LogInformation("Starting scrape-only {StartYear}–{EndYear}", _options.StartYear, currentYear);
+
+        var total = 0;
+        for (var year = _options.StartYear; year <= currentYear; year++)
+        {
+            if (ct.IsCancellationRequested) break;
+            var records = await _scraper.GetYearRecordsAsync(year, ct);
+            foreach (var record in records)
+            {
+                if (_checkpoint.HasRecord(record.TextId)) continue;
+                _checkpoint.MarkProcessed(record.TextId, record.Date, record.Name,
+                    record.PenaltyTotalUsd, record.DocumentUrl, record.FileName, record.Year);
+                total++;
+            }
+        }
+
+        _logger.LogInformation("Scrape-only complete. {Total} new records. DB total: {DbTotal}",
+            total, _checkpoint.GetTotalCount());
+        return 0;
+    }
+
     /// <summary>
     /// Daily/incremental run: scrapes current year only, exports any new records.
     /// </summary>
diff --git a/src/OFACScraper/Models/OFACRecord.cs b/src/OFACScraper/Models/OFACRecord.cs
index fb035a9..9d7c4da 100644
--- a/src/OFACScraper/Models/OFACRecord.cs
+++ b/src/OFACScraper/Models/OFACRecord.cs
@@ -11,8 +11,24 @@ public record OFACRecord
     public required string YearPageUrl { get; init; }
 
     /// <summary>
-    /// Stable text identifier derived from PDF filename (without extension).
-    /// E.g. "20260317_tradestation" from "20260317_tradestation.pdf"
+    /// Stable text identifier derived from date + name (unique per row).
+    /// Early years share batch PDFs across many rows, so the PDF filename is NOT unique per row.
     /// </summary>
-    public string TextId => Path.GetFileNameWithoutExtension(FileName);
+    public string TextId => $"{Date:yyyyMMdd}_{Slugify(Name)}";
+
+    /// <summary>
+    /// Lower-cases <paramref name="text"/>, replaces every non-alphanumeric character with '_',
+    /// trims leading/trailing underscores and caps the result at 60 characters.
+    /// </summary>
+    private static string Slugify(string text)
+    {
+        var slug = new string(text.ToLowerInvariant()
+                .Select(c => char.IsLetterOrDigit(c) ? c : '_')
+                .ToArray())
+            .Trim('_');
+
+        // Clamp against the trimmed slug, not the raw input: trimming can shorten the string,
+        // and slicing past its end throws ArgumentOutOfRangeException (e.g. for "Acme, Inc.").
+        return slug[..Math.Min(60, slug.Length)];
+    }
 }
diff --git a/src/OFACScraper/Program.cs b/src/OFACScraper/Program.cs
index 82df812..c70de0a 100644
--- a/src/OFACScraper/Program.cs
+++ b/src/OFACScraper/Program.cs
@@ -49,9 +49,10 @@ try
     var app = host.Services.GetRequiredService<Application>();
     var exitCode = command switch
     {
-        "ofac-full"  => await app.RunFullAsync(),
-        "ofac-daily" => await app.RunDailyAsync(),
-        _            => throw new ArgumentException($"Unknown command: {command}. Use ofac-full or ofac-daily.")
+        "ofac-full"        => await app.RunFullAsync(),
+        "ofac-daily"       => await app.RunDailyAsync(),
+        "ofac-scrape-only" => await app.RunScrapeOnlyAsync(),
+        _                  => throw new ArgumentException($"Unknown command: {command}. Use ofac-full, ofac-daily, or ofac-scrape-only.")
     };
 
     return exitCode;
diff --git a/src/OFACScraper/Scraper.cs b/src/OFACScraper/Scraper.cs
index 2def43e..c50aac0 100644
--- a/src/OFACScraper/Scraper.cs
+++ b/src/OFACScraper/Scraper.cs
@@ -75,8 +75,7 @@ public class OFACScraper
             if (dateLink == null) continue; // "Year to date totals" summary row
 
             var dateText = HtmlEntity.DeEntitize(dateLink.InnerText).Trim();
-            if (!DateTime.TryParseExact(dateText, "MM/dd/yyyy", CultureInfo.InvariantCulture,
-                    DateTimeStyles.None, out var date))
+            if (!TryParseDate(dateText, out var date))
             {
                 _logger.LogWarning("Could not parse date '{Date}' in year {Year}", dateText, year);
                 continue;
@@ -88,7 +87,7 @@ public class OFACScraper
             var docUrl = docHref.StartsWith("http") ? docHref : _options.BaseUrl + docHref;
             var fileName = dateLink.GetAttributeValue("title", "").Trim();
             if (string.IsNullOrEmpty(fileName))
-                fileName = $"{date:yyyyMMdd}_{Slugify(HtmlEntity.DeEntitize(cells[1].InnerText).Trim())}.pdf";
+                fileName = $"{date:yyyyMMdd}.pdf";
 
             var name = HtmlEntity.DeEntitize(cells[1].InnerText).Trim();
             var penaltyText = HtmlEntity.DeEntitize(cells[3].InnerText).Trim();
@@ -114,6 +113,23 @@ public class OFACScraper
         return records;
     }
 
+    /// <summary>
+    /// Parses a date cell that may carry a trailing letter or a parenthesised note after the date.
+    /// </summary>
+    private static bool TryParseDate(string raw, out DateTime date)
+    {
+        // Strip suffixes: " a", " b", " c", " (Revised ...)", etc.
+        var clean = raw.Trim();
+        var parenIdx = clean.IndexOf('(');
+        if (parenIdx > 0) clean = clean[..parenIdx].Trim();
+        // Strip trailing letter suffix like "05/06/2014 b"
+        if (clean.Length > 0 && char.IsLetter(clean[^1]))
+            clean = clean[..^1].Trim();
+
+        string[] formats = ["MM/dd/yyyy", "M/d/yyyy", "M/dd/yyyy", "MM/d/yyyy"];
+        return DateTime.TryParseExact(clean, formats, CultureInfo.InvariantCulture, DateTimeStyles.None, out date);
+    }
+
     private static decimal? ParsePenalty(string text)
     {
         if (string.IsNullOrWhiteSpace(text)) return null;
@@ -123,9 +139,4 @@ public class OFACScraper
             : null;
     }
 
-    private static string Slugify(string text) =>
-        new string(text.ToLowerInvariant()
-            .Select(c => char.IsLetterOrDigit(c) ? c : '_')
-            .ToArray())
-        .Trim('_');
 }