Initial OFAC Civil Penalties scraper
Scrapes https://ofac.treasury.gov/civil-penalties-and-enforcement-information for all years 2003-present. Downloads PDF documents and exports metadata.json per CGSH Publication spec (v3) to S3 experimental bucket under ofac/ prefix. Commands: ofac-full (all years), ofac-daily (current year incremental). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
93
src/OFACScraper/Application.cs
Normal file
93
src/OFACScraper/Application.cs
Normal file
@@ -0,0 +1,93 @@
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using OFACScraper.Configuration;
|
||||
|
||||
namespace OFACScraper;
|
||||
|
||||
/// <summary>
/// Top-level entry point for the OFAC civil-penalties scraper. Orchestrates the
/// scraper (HTML -> records), the exporter (PDF + metadata.json -> S3), and the
/// checkpoint store (dedup across runs). Exposes one method per CLI command:
/// <see cref="RunFullAsync"/> (ofac-full) and <see cref="RunDailyAsync"/> (ofac-daily).
/// </summary>
public class Application
{
    private readonly OFACScraper _scraper;
    private readonly Exporter _exporter;
    private readonly CheckpointStore _checkpoint;
    private readonly OFACOptions _options;
    private readonly ILogger<Application> _logger;

    /// <summary>
    /// Wires up collaborators via DI. <paramref name="options"/> is unwrapped once here
    /// so the rest of the class reads plain <see cref="OFACOptions"/>.
    /// </summary>
    public Application(
        OFACScraper scraper,
        Exporter exporter,
        CheckpointStore checkpoint,
        IOptions<OFACOptions> options,
        ILogger<Application> logger)
    {
        _scraper = scraper;
        _exporter = exporter;
        _checkpoint = checkpoint;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Full historical scrape: all years from <c>StartYear</c> (inclusive) to the
    /// current UTC year. Records already present in the checkpoint are skipped.
    /// </summary>
    /// <param name="ct">Cancellation token; a cancelled run stops before starting the next year.</param>
    /// <returns>Process exit code (always 0; failures are logged, not thrown).</returns>
    public async Task<int> RunFullAsync(CancellationToken ct = default)
    {
        var currentYear = DateTime.UtcNow.Year;
        _logger.LogInformation("Starting full scrape {StartYear}–{EndYear}", _options.StartYear, currentYear);

        var total = 0;
        for (var year = _options.StartYear; year <= currentYear; year++)
        {
            // Check BEFORE processing so a cancelled token does not kick off another
            // year's worth of network work, and so we can report the run as cancelled
            // rather than falsely logging it as complete.
            if (ct.IsCancellationRequested)
            {
                _logger.LogWarning("Full scrape cancelled after {Total} new records exported", total);
                return 0;
            }

            total += await ProcessYearAsync(year, ct);
        }

        _logger.LogInformation("Full scrape complete. {Total} new records exported. DB total: {DbTotal}",
            total, _checkpoint.GetTotalCount());
        return 0;
    }

    /// <summary>
    /// Daily/incremental run: scrapes the current UTC year only and exports any
    /// records not yet in the checkpoint.
    /// </summary>
    /// <param name="ct">Cancellation token; honored between records inside <see cref="ProcessYearAsync"/>.</param>
    /// <returns>Process exit code (always 0; failures are logged, not thrown).</returns>
    public async Task<int> RunDailyAsync(CancellationToken ct = default)
    {
        var currentYear = DateTime.UtcNow.Year;
        _logger.LogInformation("Starting daily scrape for {Year}", currentYear);

        var newRecords = await ProcessYearAsync(currentYear, ct);
        _logger.LogInformation("Daily scrape complete. {New} new records exported.", newRecords);
        return 0;
    }

    /// <summary>
    /// Scrapes one year's listing, exports each record not already checkpointed, and
    /// marks successfully exported records as processed. Records whose export fails
    /// are NOT checkpointed, so they are retried on the next run.
    /// </summary>
    /// <param name="year">Calendar year to scrape.</param>
    /// <param name="ct">Cancellation token; checked before each record.</param>
    /// <returns>The number of newly exported records for this year.</returns>
    private async Task<int> ProcessYearAsync(int year, CancellationToken ct)
    {
        var records = await _scraper.GetYearRecordsAsync(year, ct);
        var newCount = 0;

        foreach (var record in records)
        {
            if (ct.IsCancellationRequested) break;

            // TextId is the dedup key across full and daily runs.
            if (_checkpoint.HasRecord(record.TextId))
            {
                _logger.LogDebug("Skipping {TextId} (already processed)", record.TextId);
                continue;
            }

            var success = await _exporter.ExportRecordAsync(record, ct);
            if (success)
            {
                // Only checkpoint on success so failed exports retry next run.
                _checkpoint.MarkProcessed(
                    record.TextId, record.Date, record.Name, record.PenaltyTotalUsd,
                    record.DocumentUrl, record.FileName, record.Year);
                newCount++;
            }
        }

        if (newCount > 0)
            _logger.LogInformation("Year {Year}: exported {Count} new records", year, newCount);

        return newCount;
    }
}
|
||||
Reference in New Issue
Block a user