diff --git a/src/OFACScraper/Application.cs b/src/OFACScraper/Application.cs
index e27bbde..46d4413 100644
--- a/src/OFACScraper/Application.cs
+++ b/src/OFACScraper/Application.cs
@@ -47,6 +47,34 @@ public class Application
return 0;
}
+    /// <summary>
+    /// Scrapes all years into the checkpoint DB without downloading PDFs or syncing to S3.
+    /// Used to produce a spreadsheet export quickly.
+    /// </summary>
+    /// <param name="ct">Token checked before each year is scraped and forwarded to the scraper.</param>
+    /// <returns>0 on completion; the command dispatcher uses this value as the exit code.</returns>
+    public async Task<int> RunScrapeOnlyAsync(CancellationToken ct = default)
+    {
+        var currentYear = DateTime.UtcNow.Year;
+        _logger.LogInformation("Starting scrape-only {StartYear}–{EndYear}", _options.StartYear, currentYear);
+
+        var total = 0;
+        for (var year = _options.StartYear; year <= currentYear; year++)
+        {
+            // Cancellation is honored between years; the loop simply stops early.
+            if (ct.IsCancellationRequested)
+            {
+                break;
+            }
+
+            var records = await _scraper.GetYearRecordsAsync(year, ct);
+            foreach (var record in records)
+            {
+                // Rows already present in the checkpoint are skipped and not counted as new.
+                if (_checkpoint.HasRecord(record.TextId))
+                {
+                    continue;
+                }
+
+                _checkpoint.MarkProcessed(record.TextId, record.Date, record.Name,
+                    record.PenaltyTotalUsd, record.DocumentUrl, record.FileName, record.Year);
+                total++;
+            }
+        }
+
+        _logger.LogInformation("Scrape-only complete. {Total} new records. DB total: {DbTotal}",
+            total, _checkpoint.GetTotalCount());
+        return 0;
+    }
+
/// <summary>
/// Daily/incremental run: scrapes current year only, exports any new records.
/// </summary>
diff --git a/src/OFACScraper/Models/OFACRecord.cs b/src/OFACScraper/Models/OFACRecord.cs
index fb035a9..9d7c4da 100644
--- a/src/OFACScraper/Models/OFACRecord.cs
+++ b/src/OFACScraper/Models/OFACRecord.cs
@@ -11,8 +11,14 @@ public record OFACRecord
public required string YearPageUrl { get; init; }
/// <summary>
- /// Stable text identifier derived from PDF filename (without extension).
- /// E.g. "20260317_tradestation" from "20260317_tradestation.pdf"
+ /// Stable text identifier derived from date + name (unique per row).
+ /// Early years share batch PDFs across many rows, so the PDF filename is NOT unique per row.
/// </summary>
- public string TextId => Path.GetFileNameWithoutExtension(FileName);
+    public string TextId =>
+        // Format with the invariant culture so the identifier does not change with the
+        // host's current culture (some cultures default to a non-Gregorian calendar).
+        System.FormattableString.Invariant($"{Date:yyyyMMdd}_{Slugify(Name)}");
+
+    /// <summary>
+    /// Lower-cases <paramref name="text"/>, replaces every character that is not a letter or
+    /// digit with '_', trims leading and trailing underscores, and keeps at most 60 characters.
+    /// </summary>
+    /// <param name="text">Raw text to convert.</param>
+    /// <returns>The slug, at most 60 characters long.</returns>
+    private static string Slugify(string text)
+    {
+        const int MaxSlugLength = 60;
+
+        var slug = new string(text.ToLowerInvariant()
+                .Select(c => char.IsLetterOrDigit(c) ? c : '_')
+                .ToArray())
+            .Trim('_');
+
+        // Bound the cut by the trimmed slug, not the raw input: trimming can make the slug
+        // shorter than the input, and slicing past its end throws ArgumentOutOfRangeException.
+        return slug[..Math.Min(MaxSlugLength, slug.Length)];
+    }
}
diff --git a/src/OFACScraper/Program.cs b/src/OFACScraper/Program.cs
index 82df812..c70de0a 100644
--- a/src/OFACScraper/Program.cs
+++ b/src/OFACScraper/Program.cs
@@ -49,9 +49,10 @@ try
var app = host.Services.GetRequiredService();
var exitCode = command switch
{
- "ofac-full" => await app.RunFullAsync(),
- "ofac-daily" => await app.RunDailyAsync(),
- _ => throw new ArgumentException($"Unknown command: {command}. Use ofac-full or ofac-daily.")
+ "ofac-full" => await app.RunFullAsync(),
+ "ofac-daily" => await app.RunDailyAsync(),
+ "ofac-scrape-only" => await app.RunScrapeOnlyAsync(),
+ _ => throw new ArgumentException($"Unknown command: {command}. Use ofac-full, ofac-daily, or ofac-scrape-only.")
};
return exitCode;
diff --git a/src/OFACScraper/Scraper.cs b/src/OFACScraper/Scraper.cs
index 2def43e..c50aac0 100644
--- a/src/OFACScraper/Scraper.cs
+++ b/src/OFACScraper/Scraper.cs
@@ -75,8 +75,7 @@ public class OFACScraper
if (dateLink == null) continue; // "Year to date totals" summary row
var dateText = HtmlEntity.DeEntitize(dateLink.InnerText).Trim();
- if (!DateTime.TryParseExact(dateText, "MM/dd/yyyy", CultureInfo.InvariantCulture,
- DateTimeStyles.None, out var date))
+ if (!TryParseDate(dateText, out var date))
{
_logger.LogWarning("Could not parse date '{Date}' in year {Year}", dateText, year);
continue;
@@ -88,7 +87,7 @@ public class OFACScraper
var docUrl = docHref.StartsWith("http") ? docHref : _options.BaseUrl + docHref;
var fileName = dateLink.GetAttributeValue("title", "").Trim();
if (string.IsNullOrEmpty(fileName))
- fileName = $"{date:yyyyMMdd}_{Slugify(HtmlEntity.DeEntitize(cells[1].InnerText).Trim())}.pdf";
+ fileName = $"{date:yyyyMMdd}.pdf";
var name = HtmlEntity.DeEntitize(cells[1].InnerText).Trim();
var penaltyText = HtmlEntity.DeEntitize(cells[3].InnerText).Trim();
@@ -114,6 +113,20 @@ public class OFACScraper
return records;
}
+    /// <summary>
+    /// Parses a date cell after removing annotations from it: anything from the first '('
+    /// onward (when the parenthesis is not the first character) and then a single trailing
+    /// letter, e.g. "05/06/2014 b".
+    /// </summary>
+    /// <param name="raw">Cell text as scraped.</param>
+    /// <param name="date">The parsed date when the method returns true.</param>
+    /// <returns>True when the cleaned text matches one of the month/day/year formats.</returns>
+    private static bool TryParseDate(string raw, out DateTime date)
+    {
+        var candidate = raw.Trim();
+
+        // Drop a parenthesized note such as " (Revised ...)".
+        var openParen = candidate.IndexOf('(');
+        if (openParen > 0)
+        {
+            candidate = candidate.Substring(0, openParen).Trim();
+        }
+
+        // Drop one trailing letter suffix (" a", " b", " c", ...).
+        if (candidate.Length != 0 && char.IsLetter(candidate, candidate.Length - 1))
+        {
+            candidate = candidate.Substring(0, candidate.Length - 1).Trim();
+        }
+
+        return DateTime.TryParseExact(
+            candidate,
+            new[] { "MM/dd/yyyy", "M/d/yyyy", "M/dd/yyyy", "MM/d/yyyy" },
+            CultureInfo.InvariantCulture,
+            DateTimeStyles.None,
+            out date);
+    }
+
private static decimal? ParsePenalty(string text)
{
if (string.IsNullOrWhiteSpace(text)) return null;
@@ -123,9 +136,4 @@ public class OFACScraper
: null;
}
- private static string Slugify(string text) =>
- new string(text.ToLowerInvariant()
- .Select(c => char.IsLetterOrDigit(c) ? c : '_')
- .ToArray())
- .Trim('_');
}