Fix text_id to use date+name instead of PDF filename

Early OFAC years use batch PDFs where one document covers many penalty
cases (e.g. 56 rows sharing the same PDF in 2003). Deriving text_id from
the PDF filename caused all rows sharing a document to overwrite each other
in the DB, reducing 1061 rows to 348.

Fix: text_id = yyyyMMdd_{slugified_name}, which is unique per table row.
Also add ofac-scrape-only command for fast table-only scraping without PDF downloads.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Peter Foster
2026-04-09 16:14:52 +01:00
parent 11b1e79348
commit 73dcf9367b
4 changed files with 57 additions and 14 deletions

View File

@@ -47,6 +47,34 @@ public class Application
return 0;
}
/// <summary>
/// Scrapes every year from the configured start year through the current UTC year into the
/// checkpoint DB without downloading PDFs or syncing to S3.
/// Used to produce a spreadsheet export quickly.
/// </summary>
/// <param name="ct">
/// Checked before each year is scraped and passed on to the scraper; once cancellation is
/// requested the loop stops and the totals gathered so far are logged.
/// </param>
/// <returns>Always 0.</returns>
public async Task<int> RunScrapeOnlyAsync(CancellationToken ct = default)
{
    var currentYear = DateTime.UtcNow.Year;

    // A separator between the two placeholders keeps the rendered message readable
    // ("2003-2026" rather than "20032026").
    _logger.LogInformation("Starting scrape-only {StartYear}-{EndYear}", _options.StartYear, currentYear);

    var total = 0;
    for (var year = _options.StartYear; year <= currentYear; year++)
    {
        // Stop between years rather than throwing, so the summary below is still logged.
        if (ct.IsCancellationRequested)
        {
            break;
        }

        var records = await _scraper.GetYearRecordsAsync(year, ct);
        foreach (var record in records)
        {
            // Rows already in the checkpoint DB are left untouched and not counted as new.
            if (_checkpoint.HasRecord(record.TextId))
            {
                continue;
            }

            _checkpoint.MarkProcessed(record.TextId, record.Date, record.Name,
                record.PenaltyTotalUsd, record.DocumentUrl, record.FileName, record.Year);
            total++;
        }
    }

    _logger.LogInformation("Scrape-only complete. {Total} new records. DB total: {DbTotal}",
        total, _checkpoint.GetTotalCount());
    return 0;
}
/// <summary>
/// Daily/incremental run: scrapes current year only, exports any new records.
/// </summary>

View File

@@ -11,8 +11,14 @@ public record OFACRecord
public required string YearPageUrl { get; init; }
/// <summary>
/// Stable text identifier derived from PDF filename (without extension).
/// E.g. "20260317_tradestation" from "20260317_tradestation.pdf"
/// Stable text identifier derived from date + name (unique per row).
/// Early years share batch PDFs across many rows, so the PDF filename is NOT unique per row.
/// </summary>
public string TextId => Path.GetFileNameWithoutExtension(FileName);
// Formatted with the invariant culture so the yyyyMMdd part does not depend on the machine's
// current culture or calendar; the identifier must be the same wherever the scraper runs.
public string TextId => FormattableString.Invariant($"{Date:yyyyMMdd}_{Slugify(Name)}");
/// <summary>
/// Builds a slug from <paramref name="text"/>: lower-cases it, replaces every character that
/// is not a letter or digit with '_', trims leading/trailing underscores, and keeps at most
/// the first 60 characters of the result.
/// </summary>
/// <param name="text">The text to convert.</param>
/// <returns>The slug; empty when the input contains no letters or digits.</returns>
private static string Slugify(string text)
{
    const int MaxSlugLength = 60;

    var slug = new string(text.ToLowerInvariant()
            .Select(c => char.IsLetterOrDigit(c) ? c : '_')
            .ToArray())
        .Trim('_');

    // Bound the cut by the trimmed slug's own length. Using the input length here would
    // throw ArgumentOutOfRangeException whenever trimming removed characters
    // (e.g. "Acme Corp." -> "acme_corp" is one character shorter than its input).
    return slug.Length <= MaxSlugLength ? slug : slug[..MaxSlugLength];
}
}

View File

@@ -51,7 +51,8 @@ try
{
"ofac-full" => await app.RunFullAsync(),
"ofac-daily" => await app.RunDailyAsync(),
_ => throw new ArgumentException($"Unknown command: {command}. Use ofac-full or ofac-daily.")
"ofac-scrape-only" => await app.RunScrapeOnlyAsync(),
_ => throw new ArgumentException($"Unknown command: {command}. Use ofac-full, ofac-daily, or ofac-scrape-only.")
};
return exitCode;

View File

@@ -75,8 +75,7 @@ public class OFACScraper
if (dateLink == null) continue; // "Year to date totals" summary row
var dateText = HtmlEntity.DeEntitize(dateLink.InnerText).Trim();
if (!DateTime.TryParseExact(dateText, "MM/dd/yyyy", CultureInfo.InvariantCulture,
DateTimeStyles.None, out var date))
if (!TryParseDate(dateText, out var date))
{
_logger.LogWarning("Could not parse date '{Date}' in year {Year}", dateText, year);
continue;
@@ -88,7 +87,7 @@ public class OFACScraper
var docUrl = docHref.StartsWith("http") ? docHref : _options.BaseUrl + docHref;
var fileName = dateLink.GetAttributeValue("title", "").Trim();
if (string.IsNullOrEmpty(fileName))
fileName = $"{date:yyyyMMdd}_{Slugify(HtmlEntity.DeEntitize(cells[1].InnerText).Trim())}.pdf";
fileName = $"{date:yyyyMMdd}.pdf";
var name = HtmlEntity.DeEntitize(cells[1].InnerText).Trim();
var penaltyText = HtmlEntity.DeEntitize(cells[3].InnerText).Trim();
@@ -114,6 +113,20 @@ public class OFACScraper
return records;
}
/// <summary>
/// Parses a date cell after removing trailing annotations: a parenthesised note such as
/// " (Revised ...)" and a single trailing letter marker such as " a", " b" or " c".
/// </summary>
/// <param name="raw">The raw cell text.</param>
/// <param name="date">The parsed date when the method returns true.</param>
/// <returns>True when the cleaned text matches one of the accepted month/day/year formats.</returns>
private static bool TryParseDate(string raw, out DateTime date)
{
    var candidate = raw.Trim();

    // Cut everything from the first '(' onwards; a '(' in the very first position is left alone.
    var openParen = candidate.IndexOf('(');
    if (openParen > 0)
    {
        candidate = candidate.Substring(0, openParen).Trim();
    }

    // Drop one trailing letter, e.g. "05/06/2014 b" -> "05/06/2014".
    if (candidate.Length > 0 && char.IsLetter(candidate[candidate.Length - 1]))
    {
        candidate = candidate.Substring(0, candidate.Length - 1).Trim();
    }

    return DateTime.TryParseExact(
        candidate,
        new[] { "MM/dd/yyyy", "M/d/yyyy", "M/dd/yyyy", "MM/d/yyyy" },
        CultureInfo.InvariantCulture,
        DateTimeStyles.None,
        out date);
}
private static decimal? ParsePenalty(string text)
{
if (string.IsNullOrWhiteSpace(text)) return null;
@@ -123,9 +136,4 @@ public class OFACScraper
: null;
}
/// <summary>
/// Builds a slug from <paramref name="text"/>: lower-cases it, replaces every character that
/// is not a letter or digit with '_', and trims leading/trailing underscores.
/// </summary>
private static string Slugify(string text)
{
    var lowered = text.ToLowerInvariant();
    var buffer = new char[lowered.Length];

    for (var i = 0; i < lowered.Length; i++)
    {
        var ch = lowered[i];
        buffer[i] = char.IsLetterOrDigit(ch) ? ch : '_';
    }

    return new string(buffer).Trim('_');
}
}