Initial OFAC Civil Penalties scraper

Scrapes https://ofac.treasury.gov/civil-penalties-and-enforcement-information
for all years 2003-present. Downloads PDF documents and exports metadata.json
per CGSH Publication spec (v3) to S3 experimental bucket under ofac/ prefix.

Commands: ofac-full (all years), ofac-daily (current year incremental).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Peter Foster
2026-04-09 15:29:00 +01:00
commit ad7c5d55eb
110 changed files with 5075 additions and 0 deletions

View File

@@ -0,0 +1,132 @@
using Amazon;
using Amazon.S3;
using Amazon.S3.Model;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Options;
using OFACScraper.Configuration;
namespace OFACScraper.Services;
/// <summary>
/// Uploads scraper output to Amazon S3.
/// When <see cref="S3Options.IsConfigured"/> is false no client is created and every
/// operation becomes a logged no-op, so the scraper can run without S3 access.
/// </summary>
public class S3UploadService : IDisposable
{
    /// <summary>Manifest file that must reach S3 only after the documents stored alongside it.</summary>
    private const string ManifestFileName = "metadata.json";

    private readonly S3Options _options;
    private readonly ILogger<S3UploadService> _logger;

    // Null when S3 is not configured; the public methods treat that as "uploads disabled".
    private readonly IAmazonS3? _s3Client;

    /// <summary>
    /// Creates the S3 client from <paramref name="options"/> when they are configured;
    /// otherwise logs a warning and leaves the service in skip-everything mode.
    /// </summary>
    /// <param name="options">Bucket, region, prefix and credentials.</param>
    /// <param name="logger">Logger for configuration, upload and sync messages.</param>
    public S3UploadService(IOptions<S3Options> options, ILogger<S3UploadService> logger)
    {
        _options = options.Value;
        _logger = logger;

        if (_options.IsConfigured)
        {
            var config = new AmazonS3Config
            {
                RegionEndpoint = RegionEndpoint.GetBySystemName(_options.Region)
            };
            _s3Client = new AmazonS3Client(_options.AccessKeyId, _options.SecretAccessKey, config);
            _logger.LogInformation("S3 configured: bucket={Bucket} region={Region} prefix={Prefix}",
                _options.BucketName, _options.Region, _options.Prefix);
        }
        else
        {
            _logger.LogWarning("S3 not configured — uploads will be skipped.");
        }
    }

    /// <summary>Releases the underlying S3 client, if one was created.</summary>
    public void Dispose() => _s3Client?.Dispose();

    /// <summary>
    /// Uploads a single local file to the configured bucket.
    /// </summary>
    /// <param name="localPath">Path of the file to upload.</param>
    /// <param name="s3Key">Full object key to write.</param>
    /// <returns>
    /// <c>true</c> when the upload succeeded; <c>false</c> when S3 is not configured or the
    /// upload threw (the exception is logged, not rethrown).
    /// </returns>
    public async Task<bool> UploadFileAsync(string localPath, string s3Key)
    {
        if (_s3Client is null)
        {
            return false;
        }

        try
        {
            await _s3Client.PutObjectAsync(new PutObjectRequest
            {
                BucketName = _options.BucketName,
                Key = s3Key,
                FilePath = localPath
            });
            _logger.LogDebug("Uploaded {Path} → s3://{Bucket}/{Key}", localPath, _options.BucketName, s3Key);
            return true;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to upload {Path}", localPath);
            return false;
        }
    }

    /// <summary>
    /// Syncs localDirectory to S3 under s3Prefix, skipping files whose MD5 matches the existing S3 ETag.
    /// Uploads metadata.json last so CGSH processing triggers only after all documents are present;
    /// a metadata.json is held back (and counted as failed) when any upload beneath its folder failed.
    /// </summary>
    /// <param name="localDirectory">Root of the local tree to upload (searched recursively).</param>
    /// <param name="s3Prefix">Key prefix to upload under; trailing slashes are ignored.</param>
    /// <returns>
    /// Counts of uploaded, skipped and failed files. <see cref="S3SyncResult.NotConfigured"/> is set
    /// when S3 is disabled, and <see cref="S3SyncResult.Error"/> when the directory does not exist.
    /// </returns>
    /// <remarks>
    /// Exceptions thrown while listing the bucket or hashing a local file are not caught here
    /// and propagate to the caller.
    /// </remarks>
    public async Task<S3SyncResult> SyncDirectoryAsync(string localDirectory, string s3Prefix)
    {
        var result = new S3SyncResult();

        if (_s3Client is null)
        {
            _logger.LogWarning("S3 not configured, skipping sync of {Path}", localDirectory);
            result.NotConfigured = true;
            return result;
        }

        if (!Directory.Exists(localDirectory))
        {
            result.Error = $"Directory not found: {localDirectory}";
            return result;
        }

        // Accept "ofac/" as well as "ofac": without the trim the keys would contain "//".
        var keyPrefix = (s3Prefix ?? string.Empty).TrimEnd('/');

        // List existing objects to skip unchanged files
        var existing = await ListExistingETagsAsync(_s3Client, _options.BucketName, keyPrefix);

        // metadata.json last — CGSH triggers on its arrival. Ordinal ordering keeps the
        // upload sequence identical regardless of the machine's culture.
        var files = Directory.GetFiles(localDirectory, "*", SearchOption.AllDirectories)
            .OrderBy(f => IsManifest(f) ? 1 : 0)
            .ThenBy(f => f, StringComparer.Ordinal)
            .ToArray();

        // Keys that did not make it to S3; consulted before each manifest upload.
        var failedKeys = new List<string>();

        foreach (var file in files)
        {
            var relativePath = Path.GetRelativePath(localDirectory, file).Replace('\\', '/');
            var s3Key = $"{keyPrefix}/{relativePath}";

            if (existing.TryGetValue(s3Key, out var etag)
                && !string.IsNullOrEmpty(etag)
                && await HasMatchingMd5Async(file, etag))
            {
                result.Skipped++;
                continue;
            }

            if (IsManifest(file))
            {
                // Key of the folder this manifest describes, e.g. "ofac/2024/".
                var folderKey = s3Key[..^ManifestFileName.Length];
                if (failedKeys.Exists(k => k.StartsWith(folderKey, StringComparison.Ordinal)))
                {
                    // Publishing the manifest now would trigger processing with documents missing.
                    _logger.LogWarning(
                        "Holding back {Key}: one or more uploads under {Folder} failed",
                        s3Key, folderKey);
                    failedKeys.Add(s3Key);
                    result.Failed++;
                    continue;
                }
            }

            if (await UploadFileAsync(file, s3Key))
            {
                result.Uploaded++;
            }
            else
            {
                failedKeys.Add(s3Key);
                result.Failed++;
            }
        }

        _logger.LogInformation("S3 sync: {Uploaded} uploaded, {Skipped} unchanged, {Failed} failed",
            result.Uploaded, result.Skipped, result.Failed);
        return result;
    }

    /// <summary>True when <paramref name="path"/> names a metadata.json manifest.</summary>
    private static bool IsManifest(string path) =>
        string.Equals(Path.GetFileName(path), ManifestFileName, StringComparison.Ordinal);

    /// <summary>Pages through the bucket listing under the prefix and maps each key to its unquoted ETag.</summary>
    private static async Task<Dictionary<string, string>> ListExistingETagsAsync(
        IAmazonS3 client, string bucketName, string keyPrefix)
    {
        var etags = new Dictionary<string, string>();
        var request = new ListObjectsV2Request { BucketName = bucketName, Prefix = keyPrefix + "/" };
        ListObjectsV2Response response;
        do
        {
            response = await client.ListObjectsV2Async(request);
            foreach (var obj in response.S3Objects ?? [])
            {
                etags[obj.Key] = obj.ETag?.Trim('"') ?? "";
            }

            request.ContinuationToken = response.NextContinuationToken;
        } while (response.IsTruncated == true);

        return etags;
    }

    /// <summary>
    /// Streams the file through MD5 (no whole-file buffer) and compares the hex digest with the ETag.
    /// An ETag that is not a plain MD5 digest simply never matches, so that file is uploaded again.
    /// </summary>
    private static async Task<bool> HasMatchingMd5Async(string path, string etag)
    {
        await using var stream = File.OpenRead(path);
        var digest = await System.Security.Cryptography.MD5.HashDataAsync(stream);
        return string.Equals(Convert.ToHexString(digest), etag, StringComparison.OrdinalIgnoreCase);
    }
}
/// <summary>
/// Outcome of a directory sync to S3: per-file counters plus status flags for
/// syncs that could not run at all.
/// </summary>
public class S3SyncResult
{
    /// <summary>Number of files uploaded successfully.</summary>
    public int Uploaded { get; set; } = 0;

    /// <summary>Number of files left untouched because the copy already in S3 matched.</summary>
    public int Skipped { get; set; } = 0;

    /// <summary>Number of files that were not uploaded successfully.</summary>
    public int Failed { get; set; } = 0;

    /// <summary>True when the sync was skipped because S3 is not configured.</summary>
    public bool NotConfigured { get; set; } = false;

    /// <summary>Reason the sync could not start (for example a missing directory); null otherwise.</summary>
    public string? Error { get; set; } = null;
}