Initial OFAC Civil Penalties scraper
Scrapes https://ofac.treasury.gov/civil-penalties-and-enforcement-information for all years 2003-present. Downloads PDF documents and exports metadata.json per CGSH Publication spec (v3) to S3 experimental bucket under ofac/ prefix. Commands: ofac-full (all years), ofac-daily (current year incremental). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
132
src/OFACScraper/Services/S3UploadService.cs
Normal file
132
src/OFACScraper/Services/S3UploadService.cs
Normal file
@@ -0,0 +1,132 @@
|
||||
using Amazon;
|
||||
using Amazon.S3;
|
||||
using Amazon.S3.Model;
|
||||
using Microsoft.Extensions.Logging;
|
||||
using Microsoft.Extensions.Options;
|
||||
using OFACScraper.Configuration;
|
||||
|
||||
namespace OFACScraper.Services;
|
||||
|
||||
/// <summary>
/// Uploads scraper output to Amazon S3.
/// When <see cref="S3Options.IsConfigured"/> is false no client is created and every
/// operation becomes a logged no-op, so the scraper can run without S3 credentials.
/// </summary>
public class S3UploadService : IDisposable
{
    /// <summary>File whose arrival in the bucket triggers downstream (CGSH) processing.</summary>
    private const string MetadataFileName = "metadata.json";

    private readonly S3Options _options;
    private readonly ILogger<S3UploadService> _logger;

    /// <summary>Null when S3 is not configured; all public methods check for this.</summary>
    private readonly IAmazonS3? _s3Client;

    /// <summary>
    /// Creates the service and, if the options are marked as configured, an S3 client
    /// for the configured region using the configured access key pair.
    /// </summary>
    /// <param name="options">S3 settings (bucket, region, prefix, credentials).</param>
    /// <param name="logger">Logger for configuration, upload and sync messages.</param>
    public S3UploadService(IOptions<S3Options> options, ILogger<S3UploadService> logger)
    {
        _options = options.Value;
        _logger = logger;

        if (!_options.IsConfigured)
        {
            _logger.LogWarning("S3 not configured — uploads will be skipped.");
            return;
        }

        var config = new AmazonS3Config
        {
            RegionEndpoint = RegionEndpoint.GetBySystemName(_options.Region)
        };
        _s3Client = new AmazonS3Client(_options.AccessKeyId, _options.SecretAccessKey, config);
        _logger.LogInformation("S3 configured: bucket={Bucket} region={Region} prefix={Prefix}",
            _options.BucketName, _options.Region, _options.Prefix);
    }

    /// <summary>Releases the underlying S3 client, if one was created.</summary>
    public void Dispose() => _s3Client?.Dispose();

    /// <summary>
    /// Uploads a single local file to the configured bucket under <paramref name="s3Key"/>.
    /// </summary>
    /// <param name="localPath">Path of the file to upload.</param>
    /// <param name="s3Key">Full object key to write.</param>
    /// <returns>
    /// <c>true</c> if the object was written; <c>false</c> if S3 is not configured or the
    /// upload threw (the exception is logged, not propagated).
    /// </returns>
    public async Task<bool> UploadFileAsync(string localPath, string s3Key)
    {
        if (_s3Client is null)
        {
            return false;
        }

        try
        {
            await _s3Client.PutObjectAsync(new PutObjectRequest
            {
                BucketName = _options.BucketName,
                Key = s3Key,
                FilePath = localPath
            });
            _logger.LogDebug("Uploaded {Path} → s3://{Bucket}/{Key}", localPath, _options.BucketName, s3Key);
            return true;
        }
        catch (Exception ex)
        {
            // Best-effort by design: the caller counts failures instead of aborting the sync.
            _logger.LogError(ex, "Failed to upload {Path}", localPath);
            return false;
        }
    }

    /// <summary>
    /// Syncs <paramref name="localDirectory"/> (recursively) to S3 under
    /// <paramref name="s3Prefix"/>, skipping files whose MD5 matches the existing S3 ETag.
    /// metadata.json files are uploaded last so CGSH processing triggers only after all
    /// documents are present; a metadata.json is withheld (and counted as failed) when any
    /// upload in its own directory tree failed during this sync.
    /// </summary>
    /// <param name="localDirectory">Root directory to upload.</param>
    /// <param name="s3Prefix">
    /// Key prefix for uploaded objects. Trailing slashes are ignored; an empty prefix
    /// places objects at the bucket root.
    /// </param>
    /// <returns>
    /// Counters for uploaded, unchanged and failed files. <see cref="S3SyncResult.NotConfigured"/>
    /// is set when S3 is not configured and <see cref="S3SyncResult.Error"/> when the
    /// directory does not exist. Errors while listing the bucket or hashing a local file
    /// are not caught here and propagate to the caller.
    /// </returns>
    public async Task<S3SyncResult> SyncDirectoryAsync(string localDirectory, string s3Prefix)
    {
        var result = new S3SyncResult();

        if (_s3Client is null)
        {
            _logger.LogWarning("S3 not configured, skipping sync of {Path}", localDirectory);
            result.NotConfigured = true;
            return result;
        }

        if (!Directory.Exists(localDirectory))
        {
            result.Error = $"Directory not found: {localDirectory}";
            return result;
        }

        // Normalize once so "ofac" and "ofac/" both yield "ofac/<file>" rather than "ofac//<file>",
        // and an empty prefix does not produce keys with a leading slash.
        var trimmedPrefix = s3Prefix?.TrimEnd('/') ?? string.Empty;
        var keyPrefix = trimmedPrefix.Length == 0 ? string.Empty : trimmedPrefix + "/";

        // List existing objects to skip unchanged files.
        var remoteETags = await ListRemoteETagsAsync(_s3Client, keyPrefix);

        // metadata.json last — CGSH triggers on its arrival. Ordinal ordering keeps the
        // upload order independent of the machine's culture settings.
        var files = Directory.GetFiles(localDirectory, "*", SearchOption.AllDirectories)
            .OrderBy(f => IsMetadataFile(f) ? 1 : 0)
            .ThenBy(f => f, StringComparer.Ordinal)
            .ToArray();

        // Relative ('/'-separated) paths of files that did not make it to S3 in this run.
        var failedRelativePaths = new List<string>();

        foreach (var file in files)
        {
            var relativePath = Path.GetRelativePath(localDirectory, file).Replace('\\', '/');
            var s3Key = keyPrefix + relativePath;

            // NOTE: an ETag equals the content MD5 only for single-part, non-KMS uploads;
            // any other ETag simply fails to match and the file is uploaded again.
            if (remoteETags.TryGetValue(s3Key, out var etag)
                && !string.IsNullOrEmpty(etag)
                && string.Equals(await ComputeMd5HexAsync(file), etag, StringComparison.OrdinalIgnoreCase))
            {
                result.Skipped++;
                continue;
            }

            if (IsMetadataFile(file))
            {
                // Publishing metadata would trigger processing of an incomplete document set.
                var scope = DirectoryPrefixOf(relativePath);
                if (failedRelativePaths.Any(p => p.StartsWith(scope, StringComparison.Ordinal)))
                {
                    _logger.LogWarning(
                        "Withholding {Path}: one or more files in its directory failed to upload",
                        file);
                    failedRelativePaths.Add(relativePath);
                    result.Failed++;
                    continue;
                }
            }

            if (await UploadFileAsync(file, s3Key))
            {
                result.Uploaded++;
            }
            else
            {
                failedRelativePaths.Add(relativePath);
                result.Failed++;
            }
        }

        _logger.LogInformation("S3 sync: {Uploaded} uploaded, {Skipped} unchanged, {Failed} failed",
            result.Uploaded, result.Skipped, result.Failed);
        return result;
    }

    /// <summary>Pages through the bucket listing and returns key → unquoted ETag for <paramref name="keyPrefix"/>.</summary>
    private async Task<Dictionary<string, string>> ListRemoteETagsAsync(IAmazonS3 client, string keyPrefix)
    {
        var etagsByKey = new Dictionary<string, string>(StringComparer.Ordinal);
        var request = new ListObjectsV2Request { BucketName = _options.BucketName, Prefix = keyPrefix };
        ListObjectsV2Response response;
        do
        {
            response = await client.ListObjectsV2Async(request);
            foreach (var obj in response.S3Objects ?? [])
            {
                // S3 returns the ETag wrapped in double quotes.
                etagsByKey[obj.Key] = obj.ETag?.Trim('"') ?? "";
            }

            request.ContinuationToken = response.NextContinuationToken;
        } while (response.IsTruncated == true);

        return etagsByKey;
    }

    /// <summary>Streams the file through MD5 and returns the digest as hex, without loading the whole file into memory.</summary>
    private static async Task<string> ComputeMd5HexAsync(string path)
    {
        await using var stream = new FileStream(
            path, FileMode.Open, FileAccess.Read, FileShare.Read, bufferSize: 81920, useAsync: true);
        var hash = await System.Security.Cryptography.MD5.HashDataAsync(stream);
        return Convert.ToHexString(hash);
    }

    /// <summary>True when the path's file name is exactly metadata.json.</summary>
    private static bool IsMetadataFile(string path) =>
        string.Equals(Path.GetFileName(path), MetadataFileName, StringComparison.Ordinal);

    /// <summary>Returns the directory part of a '/'-separated relative path including the trailing slash, or "" for a root-level file.</summary>
    private static string DirectoryPrefixOf(string relativePath)
    {
        var lastSlash = relativePath.LastIndexOf('/');
        return lastSlash < 0 ? string.Empty : relativePath[..(lastSlash + 1)];
    }
}
|
||||
|
||||
/// <summary>
/// Outcome of a directory sync to S3: per-file counters plus flags for the cases
/// where the sync could not run at all.
/// </summary>
public class S3SyncResult
{
    /// <summary>Number of files written to S3.</summary>
    public int Uploaded { get; set; }

    /// <summary>Number of files left alone because the remote copy was already identical.</summary>
    public int Skipped { get; set; }

    /// <summary>Number of files that were not uploaded.</summary>
    public int Failed { get; set; }

    /// <summary>True when the sync was skipped because S3 is not configured.</summary>
    public bool NotConfigured { get; set; }

    /// <summary>Description of why the sync could not run (for example a missing directory); null otherwise.</summary>
    public string? Error { get; set; }
}
|
||||
Reference in New Issue
Block a user