Files
tenderpilot/cleanup-with-archival.mjs

195 lines
6.3 KiB
JavaScript
Raw Permalink Normal View History

#!/usr/bin/env node
/**
* IMPROVED CLEANUP WITH ARCHIVAL
*
* Instead of just marking tenders as "closed", this script:
* 1. Validates all open tender URLs
* 2. Saves full tender snapshot before archiving
* 3. Marks removed tenders as "archived" (not just closed)
* 4. Keeps tender data accessible even after source removes it
* 5. Tracks validation failures
*/
import pg from 'pg';
import dotenv from 'dotenv';
dotenv.config();
const pool = new pg.Pool({
connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
});
async function validateAndArchive() {
try {
console.log(`[${new Date().toISOString()}] Starting tender validation with archival...`);
// Get all open tenders
const result = await pool.query(
`SELECT id, source, source_id, title, description, notice_url, deadline,
value_low, value_high, currency, authority_name, location, sector,
cpv_codes, published_date, created_at, validation_failures
FROM tenders
WHERE status = 'open'
ORDER BY created_at DESC`
);
console.log(`Found ${result.rows.length} open tenders to validate\\n`);
let validated = 0;
let archived = 0;
let failed = 0;
let errors = 0;
for (const tender of result.rows) {
try {
// Validate URL
const response = await fetch(tender.notice_url, {
method: 'HEAD',
redirect: 'follow',
signal: AbortSignal.timeout(10000)
});
const status = response.status;
const finalUrl = response.url;
const isRemoved = (
status === 404 ||
finalUrl.includes('/syserror/') ||
finalUrl.includes('/notfound') ||
finalUrl.includes('/error')
);
if (isRemoved) {
// Create snapshot before archiving
const snapshot = {
source: tender.source,
source_id: tender.source_id,
title: tender.title,
description: tender.description,
notice_url: tender.notice_url,
deadline: tender.deadline,
value_low: tender.value_low,
value_high: tender.value_high,
currency: tender.currency,
authority_name: tender.authority_name,
location: tender.location,
sector: tender.sector,
cpv_codes: tender.cpv_codes,
published_date: tender.published_date,
scraped_at: tender.created_at,
archived_reason: 'URL removed from source',
archived_status: status,
archived_redirect: finalUrl !== tender.notice_url ? finalUrl : null
};
// Archive the tender
await pool.query(
`UPDATE tenders
SET status = 'closed',
archived = TRUE,
archived_at = NOW(),
archived_snapshot = $1,
last_validated = NOW(),
validation_failures = validation_failures + 1
WHERE id = $2`,
[JSON.stringify(snapshot), tender.id]
);
archived++;
console.log(`📦 Archived: [${tender.source}] ${tender.title.substring(0, 60)}`);
} else {
// URL still works - update validation timestamp
await pool.query(
`UPDATE tenders
SET last_validated = NOW(),
validation_failures = 0
WHERE id = $1`,
[tender.id]
);
validated++;
}
} catch (error) {
// Network error or timeout
const currentFailures = tender.validation_failures || 0;
if (currentFailures >= 3) {
// After 3 failures, mark as closed (probably permanently gone)
const snapshot = {
source: tender.source,
source_id: tender.source_id,
title: tender.title,
description: tender.description,
notice_url: tender.notice_url,
deadline: tender.deadline,
scraped_at: tender.created_at,
archived_reason: `Validation failed ${currentFailures + 1} times: ${error.message}`,
};
await pool.query(
`UPDATE tenders
SET status = 'closed',
archived = TRUE,
archived_at = NOW(),
archived_snapshot = $1,
validation_failures = validation_failures + 1
WHERE id = $2`,
[JSON.stringify(snapshot), tender.id]
);
archived++;
console.log(`⚠️ Archived (after ${currentFailures + 1} failures): ${tender.title.substring(0, 60)}`);
} else {
// Increment failure count but don't close yet
await pool.query(
`UPDATE tenders
SET validation_failures = validation_failures + 1,
last_validated = NOW()
WHERE id = $1`,
[tender.id]
);
failed++;
console.log(`❌ Validation failed (${currentFailures + 1}/3): ${tender.title.substring(0, 60)}`);
}
errors++;
}
}
console.log(`\\n=== VALIDATION COMPLETE ===`);
console.log(`Total checked: ${result.rows.length}`);
console.log(`✅ Still valid: ${validated}`);
console.log(`📦 Archived: ${archived}`);
console.log(`❌ Temporary failures: ${failed}`);
console.log(`⚠️ Errors: ${errors}`);
console.log(`Completion time: ${new Date().toISOString()}`);
// Print statistics
const stats = await pool.query(
`SELECT
COUNT(*) FILTER (WHERE status = 'open') as open,
COUNT(*) FILTER (WHERE status = 'closed' AND NOT archived) as closed,
COUNT(*) FILTER (WHERE archived) as archived,
COUNT(*) as total
FROM tenders`
);
console.log(`\\n=== DATABASE STATS ===`);
console.log(`Total tenders: ${stats.rows[0].total}`);
console.log(`Open: ${stats.rows[0].open}`);
console.log(`Closed: ${stats.rows[0].closed}`);
console.log(`Archived: ${stats.rows[0].archived}`);
} catch (error) {
console.error('Fatal error:', error);
} finally {
await pool.end();
}
}
validateAndArchive();