Scrapes https://ofac.treasury.gov/civil-penalties-and-enforcement-information for all years 2003-present. Downloads PDF documents and exports metadata.json per CGSH Publication spec (v3) to S3 experimental bucket under ofac/ prefix. Commands: ofac-full (all years), ofac-daily (current year incremental). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
133 lines
4.6 KiB
C#
133 lines
4.6 KiB
C#
using Amazon;
|
|
using Amazon.S3;
|
|
using Amazon.S3.Model;
|
|
using Microsoft.Extensions.Logging;
|
|
using Microsoft.Extensions.Options;
|
|
using OFACScraper.Configuration;
|
|
|
|
namespace OFACScraper.Services;
|
|
|
|
/// <summary>
/// Uploads scraper output to Amazon S3.
/// When <see cref="S3Options.IsConfigured"/> is false no S3 client is created:
/// <see cref="UploadFileAsync"/> returns <c>false</c> and <see cref="SyncDirectoryAsync"/>
/// returns a result flagged <see cref="S3SyncResult.NotConfigured"/>.
/// </summary>
public class S3UploadService : IDisposable
{
    // Uploaded after every other file; per the sync contract its arrival is what
    // triggers downstream (CGSH) processing.
    private const string MetadataFileName = "metadata.json";

    private readonly S3Options _options;
    private readonly ILogger<S3UploadService> _logger;

    // Null when S3 is not configured; every public method checks for that first.
    private readonly IAmazonS3? _s3Client;

    /// <summary>
    /// Creates the service and, if the options are configured, an S3 client for the configured region.
    /// </summary>
    /// <param name="options">S3 settings (bucket, region, credentials, prefix).</param>
    /// <param name="logger">Logger for configuration, upload and sync messages.</param>
    public S3UploadService(IOptions<S3Options> options, ILogger<S3UploadService> logger)
    {
        _options = options.Value;
        _logger = logger;

        if (_options.IsConfigured)
        {
            var config = new AmazonS3Config
            {
                RegionEndpoint = RegionEndpoint.GetBySystemName(_options.Region)
            };
            _s3Client = new AmazonS3Client(_options.AccessKeyId, _options.SecretAccessKey, config);
            _logger.LogInformation("S3 configured: bucket={Bucket} region={Region} prefix={Prefix}",
                _options.BucketName, _options.Region, _options.Prefix);
        }
        else
        {
            _logger.LogWarning("S3 not configured — uploads will be skipped.");
        }
    }

    /// <summary>Disposes the underlying S3 client, if one was created.</summary>
    public void Dispose() => _s3Client?.Dispose();

    /// <summary>
    /// Uploads a single local file to the configured bucket under <paramref name="s3Key"/>.
    /// </summary>
    /// <param name="localPath">Path of the file to upload.</param>
    /// <param name="s3Key">Destination object key.</param>
    /// <param name="cancellationToken">Token used to abort the upload.</param>
    /// <returns>
    /// <c>true</c> if the object was stored; <c>false</c> if S3 is not configured or the upload
    /// failed (the failure is logged, not thrown).
    /// </returns>
    /// <exception cref="OperationCanceledException">
    /// <paramref name="cancellationToken"/> was cancelled while uploading.
    /// </exception>
    public async Task<bool> UploadFileAsync(string localPath, string s3Key, CancellationToken cancellationToken = default)
    {
        if (_s3Client is null)
        {
            return false;
        }

        try
        {
            var request = new PutObjectRequest
            {
                BucketName = _options.BucketName,
                Key = s3Key,
                FilePath = localPath
            };
            await _s3Client.PutObjectAsync(request, cancellationToken);
            _logger.LogDebug("Uploaded {Path} → s3://{Bucket}/{Key}", localPath, _options.BucketName, s3Key);
            return true;
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // A caller-requested cancellation is not an upload failure; let it propagate
            // instead of being reported as "false" by the best-effort handler below.
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Failed to upload {Path}", localPath);
            return false;
        }
    }

    /// <summary>
    /// Syncs <paramref name="localDirectory"/> (recursively) to S3 under <paramref name="s3Prefix"/>,
    /// skipping files whose MD5 matches the ETag of the existing object.
    /// <c>metadata.json</c> files are uploaded last so CGSH processing triggers only after all
    /// documents are present. If any other file failed to upload, changed <c>metadata.json</c>
    /// files are withheld and counted as failed, so the trigger fires on a later sync once the
    /// documents are in place.
    /// </summary>
    /// <param name="localDirectory">Root directory to upload.</param>
    /// <param name="s3Prefix">
    /// Key prefix; object keys are built as <c>{s3Prefix}/{relative path}</c>.
    /// NOTE(review): a prefix that is empty or already ends in '/' yields keys with a
    /// leading or doubled slash — confirm callers never pass one.
    /// </param>
    /// <param name="cancellationToken">Token used to abort listing, hashing and uploads.</param>
    /// <returns>
    /// Counts of uploaded, unchanged and failed files; <see cref="S3SyncResult.NotConfigured"/>
    /// or <see cref="S3SyncResult.Error"/> is set when nothing was attempted.
    /// </returns>
    /// <remarks>
    /// Failures while listing the bucket or reading a local file for hashing are not caught
    /// here and propagate to the caller; only individual upload failures are counted.
    /// </remarks>
    public async Task<S3SyncResult> SyncDirectoryAsync(string localDirectory, string s3Prefix, CancellationToken cancellationToken = default)
    {
        var result = new S3SyncResult();

        if (_s3Client is null)
        {
            _logger.LogWarning("S3 not configured, skipping sync of {Path}", localDirectory);
            result.NotConfigured = true;
            return result;
        }

        if (!Directory.Exists(localDirectory))
        {
            result.Error = $"Directory not found: {localDirectory}";
            return result;
        }

        var existing = await ListExistingETagsAsync(_s3Client, s3Prefix, cancellationToken);

        // metadata.json last — CGSH triggers on its arrival. Ordinal ordering keeps the
        // upload sequence identical regardless of the machine's culture settings.
        var files = Directory.GetFiles(localDirectory, "*", SearchOption.AllDirectories)
            .OrderBy(f => IsMetadataFile(f) ? 1 : 0)
            .ThenBy(f => f, StringComparer.Ordinal)
            .ToArray();

        // Tracked separately from result.Failed so that a withheld or failed metadata.json
        // does not itself cause further metadata files to be withheld.
        var failedDocuments = 0;

        foreach (var file in files)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var relativePath = Path.GetRelativePath(localDirectory, file).Replace('\\', '/');
            var s3Key = $"{s3Prefix}/{relativePath}";
            var isMetadata = IsMetadataFile(file);

            // For single-part, non-KMS uploads the ETag is the hex MD5 of the content.
            // Other ETag shapes simply never match and the file is re-uploaded.
            if (existing.TryGetValue(s3Key, out var etag) && !string.IsNullOrEmpty(etag))
            {
                var localMd5 = await ComputeMd5HexAsync(file, cancellationToken);
                if (string.Equals(localMd5, etag, StringComparison.OrdinalIgnoreCase))
                {
                    result.Skipped++;
                    continue;
                }
            }

            if (isMetadata && failedDocuments > 0)
            {
                // Publishing metadata now would trigger processing with documents missing,
                // and the next sync would skip the (then unchanged) metadata file, so the
                // trigger would never fire again for the repaired documents.
                _logger.LogWarning(
                    "Withholding {Key}: {FailedDocuments} document upload(s) failed; it will be uploaded by a later sync",
                    s3Key, failedDocuments);
                result.Failed++;
                continue;
            }

            if (await UploadFileAsync(file, s3Key, cancellationToken))
            {
                result.Uploaded++;
            }
            else
            {
                result.Failed++;
                if (!isMetadata)
                {
                    failedDocuments++;
                }
            }
        }

        _logger.LogInformation("S3 sync: {Uploaded} uploaded, {Skipped} unchanged, {Failed} failed",
            result.Uploaded, result.Skipped, result.Failed);
        return result;
    }

    /// <summary>Returns true when <paramref name="path"/> names a <c>metadata.json</c> file (exact, case-sensitive).</summary>
    private static bool IsMetadataFile(string path) =>
        string.Equals(Path.GetFileName(path), MetadataFileName, StringComparison.Ordinal);

    /// <summary>
    /// Lists every object under <c>{s3Prefix}/</c> and maps its key to its ETag with the
    /// surrounding quotes removed (empty string when the ETag is absent).
    /// </summary>
    private async Task<Dictionary<string, string>> ListExistingETagsAsync(
        IAmazonS3 client, string s3Prefix, CancellationToken cancellationToken)
    {
        var existing = new Dictionary<string, string>();
        var listRequest = new ListObjectsV2Request { BucketName = _options.BucketName, Prefix = s3Prefix + "/" };
        ListObjectsV2Response listResponse;

        // ListObjectsV2 is paged; follow continuation tokens until the listing is complete.
        do
        {
            listResponse = await client.ListObjectsV2Async(listRequest, cancellationToken);
            foreach (var obj in listResponse.S3Objects ?? [])
            {
                existing[obj.Key] = obj.ETag?.Trim('"') ?? "";
            }

            listRequest.ContinuationToken = listResponse.NextContinuationToken;
        } while (listResponse.IsTruncated == true);

        return existing;
    }

    /// <summary>
    /// Computes the MD5 of a file as a hex string, streaming the content so large
    /// documents are never held in memory in full.
    /// </summary>
    private static async Task<string> ComputeMd5HexAsync(string path, CancellationToken cancellationToken)
    {
        await using var stream = File.OpenRead(path);
        using var md5 = System.Security.Cryptography.MD5.Create();
        var hash = await md5.ComputeHashAsync(stream, cancellationToken);
        return Convert.ToHexString(hash);
    }
}
|
|
|
|
/// <summary>
/// Outcome of a directory sync to S3: per-file counters plus flags describing
/// why a sync did not run at all.
/// </summary>
public class S3SyncResult
{
    /// <summary>Number of files that were uploaded.</summary>
    public int Uploaded { get; set; }

    /// <summary>Number of files left alone because the remote copy was already identical.</summary>
    public int Skipped { get; set; }

    /// <summary>Number of files the sync could not upload.</summary>
    public int Failed { get; set; }

    /// <summary>True when the sync was skipped because S3 is not configured.</summary>
    public bool NotConfigured { get; set; }

    /// <summary>Description of why the sync could not start; null when no such error occurred.</summary>
    public string? Error { get; set; }
}
|