Files
cgsh-ofac/src/OFACScraper/Application.cs
Peter Foster ad7c5d55eb Initial OFAC Civil Penalties scraper
Scrapes https://ofac.treasury.gov/civil-penalties-and-enforcement-information
for all years 2003-present. Downloads PDF documents and exports metadata.json
per CGSH Publication spec (v3) to S3 experimental bucket under ofac/ prefix.

Commands: ofac-full (all years), ofac-daily (current year incremental).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 15:29:00 +01:00

94 lines
3.0 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using OFACScraper.Configuration;
namespace OFACScraper;
/// <summary>
/// Orchestrates scrape runs: fetches the records for one or more years from the scraper,
/// exports every record that is not yet in the checkpoint store, and checkpoints each
/// record whose export succeeded.
/// </summary>
public class Application
{
    private readonly OFACScraper _scraper;
    private readonly Exporter _exporter;
    private readonly CheckpointStore _checkpoint;
    private readonly OFACOptions _options;
    private readonly ILogger<Application> _logger;

    /// <summary>
    /// Creates the application with its collaborators.
    /// </summary>
    /// <param name="scraper">Source of per-year records.</param>
    /// <param name="exporter">Exports a single record and reports success or failure.</param>
    /// <param name="checkpoint">Store of already-processed record ids.</param>
    /// <param name="options">Scraper options; <c>StartYear</c> is read by <see cref="RunFullAsync"/>.</param>
    /// <param name="logger">Logger for progress and failure messages.</param>
    /// <exception cref="ArgumentNullException">Any argument (or the options value) is null.</exception>
    public Application(
        OFACScraper scraper,
        Exporter exporter,
        CheckpointStore checkpoint,
        IOptions<OFACOptions> options,
        ILogger<Application> logger)
    {
        // Fail fast with a named argument instead of a later NullReferenceException.
        _scraper = scraper ?? throw new ArgumentNullException(nameof(scraper));
        _exporter = exporter ?? throw new ArgumentNullException(nameof(exporter));
        _checkpoint = checkpoint ?? throw new ArgumentNullException(nameof(checkpoint));
        _options = options?.Value ?? throw new ArgumentNullException(nameof(options));
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Full historical scrape: all years from StartYear to the current (UTC) year.
    /// Skips records already in checkpoint. Stops between years once cancellation is requested.
    /// </summary>
    /// <param name="ct">Token used to stop the run early.</param>
    /// <returns>Always 0 (presumably consumed as a process exit code; confirm against the caller).</returns>
    public async Task<int> RunFullAsync(CancellationToken ct = default)
    {
        var currentYear = DateTime.UtcNow.Year;

        // ASCII hyphen between the placeholders so the rendered range reads e.g. "2003-2026".
        _logger.LogInformation("Starting full scrape {StartYear}-{EndYear}", _options.StartYear, currentYear);

        var total = 0;
        for (var year = _options.StartYear; year <= currentYear; year++)
        {
            // Check before starting a year so an already-cancelled token does no work.
            if (ct.IsCancellationRequested)
            {
                break;
            }

            total += await ProcessYearAsync(year, ct);
        }

        if (ct.IsCancellationRequested)
        {
            // Do not report a cut-short run as complete.
            _logger.LogWarning("Full scrape cancelled. {Total} new records exported. DB total: {DbTotal}",
                total, _checkpoint.GetTotalCount());
        }
        else
        {
            _logger.LogInformation("Full scrape complete. {Total} new records exported. DB total: {DbTotal}",
                total, _checkpoint.GetTotalCount());
        }

        return 0;
    }

    /// <summary>
    /// Daily/incremental run: scrapes the current (UTC) year only and exports any new records.
    /// </summary>
    /// <param name="ct">Token used to stop the run early.</param>
    /// <returns>Always 0 (presumably consumed as a process exit code; confirm against the caller).</returns>
    public async Task<int> RunDailyAsync(CancellationToken ct = default)
    {
        var currentYear = DateTime.UtcNow.Year;
        _logger.LogInformation("Starting daily scrape for {Year}", currentYear);

        var newRecords = await ProcessYearAsync(currentYear, ct);

        if (ct.IsCancellationRequested)
        {
            // Do not report a cut-short run as complete.
            _logger.LogWarning("Daily scrape cancelled. {New} new records exported.", newRecords);
        }
        else
        {
            _logger.LogInformation("Daily scrape complete. {New} new records exported.", newRecords);
        }

        return 0;
    }

    /// <summary>
    /// Fetches the records for <paramref name="year"/>, exports those the checkpoint does not
    /// already contain, and marks each successfully exported record as processed.
    /// </summary>
    /// <param name="year">Year whose records are requested from the scraper.</param>
    /// <param name="ct">Token checked before each record; also passed to the scraper and exporter.</param>
    /// <returns>Number of records exported and checkpointed during this call.</returns>
    private async Task<int> ProcessYearAsync(int year, CancellationToken ct)
    {
        var records = await _scraper.GetYearRecordsAsync(year, ct);

        var newCount = 0;
        var failedCount = 0;
        foreach (var record in records)
        {
            if (ct.IsCancellationRequested)
            {
                break;
            }

            if (_checkpoint.HasRecord(record.TextId))
            {
                _logger.LogDebug("Skipping {TextId} (already processed)", record.TextId);
                continue;
            }

            var success = await _exporter.ExportRecordAsync(record, ct);
            if (!success)
            {
                // A failed export is deliberately not checkpointed; surface it here so the
                // per-year summary does not hide it.
                failedCount++;
                _logger.LogWarning("Export failed for {TextId}; record not checkpointed", record.TextId);
                continue;
            }

            _checkpoint.MarkProcessed(
                record.TextId, record.Date, record.Name, record.PenaltyTotalUsd,
                record.DocumentUrl, record.FileName, record.Year);
            newCount++;
        }

        if (newCount > 0)
        {
            _logger.LogInformation("Year {Year}: exported {Count} new records", year, newCount);
        }

        if (failedCount > 0)
        {
            _logger.LogWarning("Year {Year}: {Count} records failed to export", year, failedCount);
        }

        return newCount;
    }
}