Scrapes https://ofac.treasury.gov/civil-penalties-and-enforcement-information for all years 2003-present. Downloads PDF documents and exports metadata.json per CGSH Publication spec (v3) to S3 experimental bucket under ofac/ prefix. Commands: ofac-full (all years), ofac-daily (current year incremental). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
94 lines
3.0 KiB
C#
94 lines
3.0 KiB
C#
using Microsoft.Extensions.Logging;
|
||
using Microsoft.Extensions.Options;
|
||
using OFACScraper.Configuration;
|
||
|
||
namespace OFACScraper;
|
||
|
||
/// <summary>
/// Coordinates a scrape run: asks the scraper for the records of a year, exports every
/// record the checkpoint store has not seen yet, and marks each successful export in the
/// checkpoint store.
/// </summary>
public class Application
{
    private readonly OFACScraper _scraper;
    private readonly Exporter _exporter;
    private readonly CheckpointStore _checkpoint;
    private readonly OFACOptions _options;
    private readonly ILogger<Application> _logger;

    /// <summary>
    /// Creates the application from its collaborators.
    /// </summary>
    /// <param name="scraper">Source of the per-year records.</param>
    /// <param name="exporter">Exports a single record and reports success or failure.</param>
    /// <param name="checkpoint">Tracks which records have already been processed.</param>
    /// <param name="options">Scraper options; <c>StartYear</c> is read by <see cref="RunFullAsync"/>.</param>
    /// <param name="logger">Logger for progress and diagnostics.</param>
    /// <exception cref="ArgumentNullException">Any argument is <c>null</c>.</exception>
    public Application(
        OFACScraper scraper,
        Exporter exporter,
        CheckpointStore checkpoint,
        IOptions<OFACOptions> options,
        ILogger<Application> logger)
    {
        // Fail fast with a named argument instead of a bare NullReferenceException later on.
        ArgumentNullException.ThrowIfNull(scraper);
        ArgumentNullException.ThrowIfNull(exporter);
        ArgumentNullException.ThrowIfNull(checkpoint);
        ArgumentNullException.ThrowIfNull(options);
        ArgumentNullException.ThrowIfNull(logger);

        _scraper = scraper;
        _exporter = exporter;
        _checkpoint = checkpoint;
        _options = options.Value;
        _logger = logger;
    }

    /// <summary>
    /// Full historical scrape: all years from StartYear to the current UTC year.
    /// Skips records already in checkpoint.
    /// </summary>
    /// <param name="ct">Token used to stop the run early; checked after each year.</param>
    /// <returns>
    /// Always <c>0</c>. NOTE(review): presumably consumed as a process exit code — confirm with the caller.
    /// </returns>
    public async Task<int> RunFullAsync(CancellationToken ct = default)
    {
        var currentYear = DateTime.UtcNow.Year;
        _logger.LogInformation("Starting full scrape {StartYear}–{EndYear}", _options.StartYear, currentYear);

        var total = 0;
        for (var year = _options.StartYear; year <= currentYear; year++)
        {
            total += await ProcessYearAsync(year, ct);
            if (ct.IsCancellationRequested)
            {
                break;
            }
        }

        if (ct.IsCancellationRequested)
        {
            // Do not report a cancelled run as "complete": remaining years/records may be unprocessed.
            _logger.LogWarning(
                "Full scrape stopped: cancellation was requested. {Total} new records exported. DB total: {DbTotal}",
                total, _checkpoint.GetTotalCount());
        }
        else
        {
            _logger.LogInformation("Full scrape complete. {Total} new records exported. DB total: {DbTotal}",
                total, _checkpoint.GetTotalCount());
        }

        return 0;
    }

    /// <summary>
    /// Daily/incremental run: scrapes the current UTC year only, exports any new records.
    /// </summary>
    /// <param name="ct">Token used to stop the run early.</param>
    /// <returns>
    /// Always <c>0</c>. NOTE(review): presumably consumed as a process exit code — confirm with the caller.
    /// </returns>
    public async Task<int> RunDailyAsync(CancellationToken ct = default)
    {
        var currentYear = DateTime.UtcNow.Year;
        _logger.LogInformation("Starting daily scrape for {Year}", currentYear);

        var newRecords = await ProcessYearAsync(currentYear, ct);

        if (ct.IsCancellationRequested)
        {
            // The record loop exits early on cancellation, so the year may be only partly processed.
            _logger.LogWarning(
                "Daily scrape stopped: cancellation was requested. {New} new records exported.", newRecords);
        }
        else
        {
            _logger.LogInformation("Daily scrape complete. {New} new records exported.", newRecords);
        }

        return 0;
    }

    /// <summary>
    /// Exports every record of <paramref name="year"/> that is not yet in the checkpoint store
    /// and marks each successfully exported record as processed.
    /// </summary>
    /// <param name="year">Year whose records are requested from the scraper.</param>
    /// <param name="ct">Token checked before each record; the loop stops once cancellation is requested.</param>
    /// <returns>Number of records exported and marked as processed during this call.</returns>
    private async Task<int> ProcessYearAsync(int year, CancellationToken ct)
    {
        var records = await _scraper.GetYearRecordsAsync(year, ct);
        var newCount = 0;
        var failedCount = 0;

        foreach (var record in records)
        {
            if (ct.IsCancellationRequested)
            {
                break;
            }

            if (_checkpoint.HasRecord(record.TextId))
            {
                _logger.LogDebug("Skipping {TextId} (already processed)", record.TextId);
                continue;
            }

            var success = await _exporter.ExportRecordAsync(record, ct);
            if (success)
            {
                _checkpoint.MarkProcessed(
                    record.TextId, record.Date, record.Name, record.PenaltyTotalUsd,
                    record.DocumentUrl, record.FileName, record.Year);
                newCount++;
            }
            else
            {
                // A failed export is left unmarked so a later run sees it as new again;
                // surface it here so the failure is not silent.
                failedCount++;
                _logger.LogWarning("Export failed for {TextId}; record not marked as processed", record.TextId);
            }
        }

        if (newCount > 0)
        {
            _logger.LogInformation("Year {Year}: exported {Count} new records", year, newCount);
        }

        if (failedCount > 0)
        {
            _logger.LogWarning("Year {Year}: {Failed} records failed to export", year, failedCount);
        }

        return newCount;
    }
}
|