Fix text_id to use date+name instead of PDF filename
Early OFAC years use batch PDFs where one document covers many penalty
cases (e.g. 56 rows sharing the same PDF in 2003). Deriving text_id from
the PDF filename caused all rows sharing a document to overwrite each other
in the DB, reducing 1061 rows to 348.
Fix: text_id = yyyyMMdd_{slugified_name}, which is unique per table row.
Also add ofac-scrape-only command for fast table-only scraping without PDF downloads.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -47,6 +47,34 @@ public class Application
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// <summary>
/// Walks every year from the configured start year through the current UTC year, fetches that
/// year's records from the scraper, and writes each record not yet present in the checkpoint
/// store. This method performs no PDF download and no S3 sync itself; it is meant for quickly
/// filling the checkpoint DB ahead of a spreadsheet export.
/// </summary>
/// <param name="ct">
/// Checked before each year is fetched (the loop stops early when cancellation is requested)
/// and passed through to the scraper.
/// </param>
/// <returns>Always 0.</returns>
public async Task<int> RunScrapeOnlyAsync(CancellationToken ct = default)
{
    var lastYear = DateTime.UtcNow.Year;
    _logger.LogInformation("Starting scrape-only {StartYear}–{EndYear}", _options.StartYear, lastYear);

    var added = 0;
    for (var year = _options.StartYear; year <= lastYear; year++)
    {
        // Cancellation ends the loop quietly; the summary below is still logged.
        if (ct.IsCancellationRequested)
        {
            break;
        }

        foreach (var row in await _scraper.GetYearRecordsAsync(year, ct))
        {
            // Only rows the checkpoint has not seen yet are written and counted.
            if (!_checkpoint.HasRecord(row.TextId))
            {
                _checkpoint.MarkProcessed(
                    row.TextId,
                    row.Date,
                    row.Name,
                    row.PenaltyTotalUsd,
                    row.DocumentUrl,
                    row.FileName,
                    row.Year);
                added++;
            }
        }
    }

    _logger.LogInformation("Scrape-only complete. {Total} new records. DB total: {DbTotal}",
        added, _checkpoint.GetTotalCount());
    return 0;
}
|
||||
|
||||
/// <summary>
|
||||
/// Daily/incremental run: scrapes current year only, exports any new records.
|
||||
/// </summary>
|
||||
|
||||
@@ -11,8 +11,14 @@ public record OFACRecord
|
||||
public required string YearPageUrl { get; init; }
|
||||
|
||||
/// <summary>
|
||||
/// Stable text identifier derived from PDF filename (without extension).
|
||||
/// E.g. "20260317_tradestation" from "20260317_tradestation.pdf"
|
||||
/// Stable text identifier derived from date + name (unique per row).
|
||||
/// Early years share batch PDFs across many rows, so the PDF filename is NOT unique per row.
|
||||
/// </summary>
|
||||
public string TextId => Path.GetFileNameWithoutExtension(FileName);
|
||||
public string TextId => $"{Date:yyyyMMdd}_{Slugify(Name)}";
|
||||
|
||||
/// <summary>
/// Builds the name part of a text id: lower-cases <paramref name="text"/>, replaces every
/// character that is not a letter or digit with '_', trims leading and trailing underscores,
/// and caps the result at 60 characters.
/// </summary>
/// <param name="text">Raw name to slugify; must not be null.</param>
/// <returns>The slug, at most 60 characters long; empty when the input has no letters or digits.</returns>
private static string Slugify(string text)
{
    const int MaxLength = 60;

    var slug = new string(text.ToLowerInvariant()
            .Select(c => char.IsLetterOrDigit(c) ? c : '_')
            .ToArray())
        .Trim('_');

    // The cap must be measured against the trimmed slug, not the raw input: trimming can make
    // the slug shorter than the input (e.g. "Foo Inc." -> "foo_inc"), and slicing with the
    // input's length would then throw ArgumentOutOfRangeException.
    return slug.Length <= MaxLength ? slug : slug[..MaxLength];
}
|
||||
}
|
||||
|
||||
@@ -51,7 +51,8 @@ try
|
||||
{
|
||||
"ofac-full" => await app.RunFullAsync(),
|
||||
"ofac-daily" => await app.RunDailyAsync(),
|
||||
_ => throw new ArgumentException($"Unknown command: {command}. Use ofac-full or ofac-daily.")
|
||||
"ofac-scrape-only" => await app.RunScrapeOnlyAsync(),
|
||||
_ => throw new ArgumentException($"Unknown command: {command}. Use ofac-full, ofac-daily, or ofac-scrape-only.")
|
||||
};
|
||||
|
||||
return exitCode;
|
||||
|
||||
@@ -75,8 +75,7 @@ public class OFACScraper
|
||||
if (dateLink == null) continue; // "Year to date totals" summary row
|
||||
|
||||
var dateText = HtmlEntity.DeEntitize(dateLink.InnerText).Trim();
|
||||
if (!DateTime.TryParseExact(dateText, "MM/dd/yyyy", CultureInfo.InvariantCulture,
|
||||
DateTimeStyles.None, out var date))
|
||||
if (!TryParseDate(dateText, out var date))
|
||||
{
|
||||
_logger.LogWarning("Could not parse date '{Date}' in year {Year}", dateText, year);
|
||||
continue;
|
||||
@@ -88,7 +87,7 @@ public class OFACScraper
|
||||
var docUrl = docHref.StartsWith("http") ? docHref : _options.BaseUrl + docHref;
|
||||
var fileName = dateLink.GetAttributeValue("title", "").Trim();
|
||||
if (string.IsNullOrEmpty(fileName))
|
||||
fileName = $"{date:yyyyMMdd}_{Slugify(HtmlEntity.DeEntitize(cells[1].InnerText).Trim())}.pdf";
|
||||
fileName = $"{date:yyyyMMdd}.pdf";
|
||||
|
||||
var name = HtmlEntity.DeEntitize(cells[1].InnerText).Trim();
|
||||
var penaltyText = HtmlEntity.DeEntitize(cells[3].InnerText).Trim();
|
||||
@@ -114,6 +113,20 @@ public class OFACScraper
|
||||
return records;
|
||||
}
|
||||
|
||||
/// <summary>
/// Parses a date cell that may carry decoration after the date itself, such as a
/// parenthesised note ("(Revised ...)") or a single trailing letter ("05/06/2014 b").
/// </summary>
/// <param name="raw">Cell text as scraped.</param>
/// <param name="date">The parsed date on success; the default value otherwise.</param>
/// <returns><c>true</c> when the cleaned text matches one of the month/day/year formats.</returns>
private static bool TryParseDate(string raw, out DateTime date)
{
    var text = raw.Trim();

    // Cut at the first '(' unless it is the very first character.
    var openParen = text.IndexOf('(');
    if (openParen >= 1)
    {
        text = text.Substring(0, openParen).Trim();
    }

    // Remove one trailing letter (the "a"/"b"/"c" disambiguators).
    if (text.Length != 0 && char.IsLetter(text[text.Length - 1]))
    {
        text = text.Substring(0, text.Length - 1).Trim();
    }

    // Month and day may each be written with or without a leading zero.
    var acceptedFormats = new[] { "MM/dd/yyyy", "M/d/yyyy", "M/dd/yyyy", "MM/d/yyyy" };
    return DateTime.TryParseExact(
        text,
        acceptedFormats,
        CultureInfo.InvariantCulture,
        DateTimeStyles.None,
        out date);
}
|
||||
|
||||
private static decimal? ParsePenalty(string text)
|
||||
{
|
||||
if (string.IsNullOrWhiteSpace(text)) return null;
|
||||
@@ -123,9 +136,4 @@ public class OFACScraper
|
||||
: null;
|
||||
}
|
||||
|
||||
/// <summary>
/// Lower-cases <paramref name="text"/>, replaces every character that is not a letter or digit
/// with '_', and trims leading and trailing underscores.
/// </summary>
/// <param name="text">Raw text to slugify; must not be null.</param>
/// <returns>The slug; empty when the input has no letters or digits.</returns>
private static string Slugify(string text)
{
    var chars = text.ToLowerInvariant().ToCharArray();
    for (var i = 0; i < chars.Length; i++)
    {
        if (!char.IsLetterOrDigit(chars[i]))
        {
            chars[i] = '_';
        }
    }

    return new string(chars).Trim('_');
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user