195 lines
6.3 KiB
JavaScript
195 lines
6.3 KiB
JavaScript
|
|
#!/usr/bin/env node
|
||
|
|
|
||
|
|
/**
|
||
|
|
* IMPROVED CLEANUP WITH ARCHIVAL
|
||
|
|
*
|
||
|
|
* Instead of just marking tenders as "closed", this script:
|
||
|
|
* 1. Validates all open tender URLs
|
||
|
|
* 2. Saves full tender snapshot before archiving
|
||
|
|
* 3. Marks removed tenders as "archived" (not just closed)
|
||
|
|
* 4. Keeps tender data accessible even after source removes it
|
||
|
|
* 5. Tracks validation failures
|
||
|
|
*/
|
||
|
|
|
||
|
|
import pg from 'pg';
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
|
||
|
|
dotenv.config();
|
||
|
|
|
||
|
|
const pool = new pg.Pool({
|
||
|
|
connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
|
||
|
|
});
|
||
|
|
|
||
|
|
async function validateAndArchive() {
|
||
|
|
try {
|
||
|
|
console.log(`[${new Date().toISOString()}] Starting tender validation with archival...`);
|
||
|
|
|
||
|
|
// Get all open tenders
|
||
|
|
const result = await pool.query(
|
||
|
|
`SELECT id, source, source_id, title, description, notice_url, deadline,
|
||
|
|
value_low, value_high, currency, authority_name, location, sector,
|
||
|
|
cpv_codes, published_date, created_at, validation_failures
|
||
|
|
FROM tenders
|
||
|
|
WHERE status = 'open'
|
||
|
|
ORDER BY created_at DESC`
|
||
|
|
);
|
||
|
|
|
||
|
|
console.log(`Found ${result.rows.length} open tenders to validate\\n`);
|
||
|
|
|
||
|
|
let validated = 0;
|
||
|
|
let archived = 0;
|
||
|
|
let failed = 0;
|
||
|
|
let errors = 0;
|
||
|
|
|
||
|
|
for (const tender of result.rows) {
|
||
|
|
try {
|
||
|
|
// Validate URL
|
||
|
|
const response = await fetch(tender.notice_url, {
|
||
|
|
method: 'HEAD',
|
||
|
|
redirect: 'follow',
|
||
|
|
signal: AbortSignal.timeout(10000)
|
||
|
|
});
|
||
|
|
|
||
|
|
const status = response.status;
|
||
|
|
const finalUrl = response.url;
|
||
|
|
|
||
|
|
const isRemoved = (
|
||
|
|
status === 404 ||
|
||
|
|
finalUrl.includes('/syserror/') ||
|
||
|
|
finalUrl.includes('/notfound') ||
|
||
|
|
finalUrl.includes('/error')
|
||
|
|
);
|
||
|
|
|
||
|
|
if (isRemoved) {
|
||
|
|
// Create snapshot before archiving
|
||
|
|
const snapshot = {
|
||
|
|
source: tender.source,
|
||
|
|
source_id: tender.source_id,
|
||
|
|
title: tender.title,
|
||
|
|
description: tender.description,
|
||
|
|
notice_url: tender.notice_url,
|
||
|
|
deadline: tender.deadline,
|
||
|
|
value_low: tender.value_low,
|
||
|
|
value_high: tender.value_high,
|
||
|
|
currency: tender.currency,
|
||
|
|
authority_name: tender.authority_name,
|
||
|
|
location: tender.location,
|
||
|
|
sector: tender.sector,
|
||
|
|
cpv_codes: tender.cpv_codes,
|
||
|
|
published_date: tender.published_date,
|
||
|
|
scraped_at: tender.created_at,
|
||
|
|
archived_reason: 'URL removed from source',
|
||
|
|
archived_status: status,
|
||
|
|
archived_redirect: finalUrl !== tender.notice_url ? finalUrl : null
|
||
|
|
};
|
||
|
|
|
||
|
|
// Archive the tender
|
||
|
|
await pool.query(
|
||
|
|
`UPDATE tenders
|
||
|
|
SET status = 'closed',
|
||
|
|
archived = TRUE,
|
||
|
|
archived_at = NOW(),
|
||
|
|
archived_snapshot = $1,
|
||
|
|
last_validated = NOW(),
|
||
|
|
validation_failures = validation_failures + 1
|
||
|
|
WHERE id = $2`,
|
||
|
|
[JSON.stringify(snapshot), tender.id]
|
||
|
|
);
|
||
|
|
|
||
|
|
archived++;
|
||
|
|
console.log(`📦 Archived: [${tender.source}] ${tender.title.substring(0, 60)}`);
|
||
|
|
|
||
|
|
} else {
|
||
|
|
// URL still works - update validation timestamp
|
||
|
|
await pool.query(
|
||
|
|
`UPDATE tenders
|
||
|
|
SET last_validated = NOW(),
|
||
|
|
validation_failures = 0
|
||
|
|
WHERE id = $1`,
|
||
|
|
[tender.id]
|
||
|
|
);
|
||
|
|
|
||
|
|
validated++;
|
||
|
|
}
|
||
|
|
|
||
|
|
} catch (error) {
|
||
|
|
// Network error or timeout
|
||
|
|
const currentFailures = tender.validation_failures || 0;
|
||
|
|
|
||
|
|
if (currentFailures >= 3) {
|
||
|
|
// After 3 failures, mark as closed (probably permanently gone)
|
||
|
|
const snapshot = {
|
||
|
|
source: tender.source,
|
||
|
|
source_id: tender.source_id,
|
||
|
|
title: tender.title,
|
||
|
|
description: tender.description,
|
||
|
|
notice_url: tender.notice_url,
|
||
|
|
deadline: tender.deadline,
|
||
|
|
scraped_at: tender.created_at,
|
||
|
|
archived_reason: `Validation failed ${currentFailures + 1} times: ${error.message}`,
|
||
|
|
};
|
||
|
|
|
||
|
|
await pool.query(
|
||
|
|
`UPDATE tenders
|
||
|
|
SET status = 'closed',
|
||
|
|
archived = TRUE,
|
||
|
|
archived_at = NOW(),
|
||
|
|
archived_snapshot = $1,
|
||
|
|
validation_failures = validation_failures + 1
|
||
|
|
WHERE id = $2`,
|
||
|
|
[JSON.stringify(snapshot), tender.id]
|
||
|
|
);
|
||
|
|
|
||
|
|
archived++;
|
||
|
|
console.log(`⚠️ Archived (after ${currentFailures + 1} failures): ${tender.title.substring(0, 60)}`);
|
||
|
|
} else {
|
||
|
|
// Increment failure count but don't close yet
|
||
|
|
await pool.query(
|
||
|
|
`UPDATE tenders
|
||
|
|
SET validation_failures = validation_failures + 1,
|
||
|
|
last_validated = NOW()
|
||
|
|
WHERE id = $1`,
|
||
|
|
[tender.id]
|
||
|
|
);
|
||
|
|
|
||
|
|
failed++;
|
||
|
|
console.log(`❌ Validation failed (${currentFailures + 1}/3): ${tender.title.substring(0, 60)}`);
|
||
|
|
}
|
||
|
|
|
||
|
|
errors++;
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
console.log(`\\n=== VALIDATION COMPLETE ===`);
|
||
|
|
console.log(`Total checked: ${result.rows.length}`);
|
||
|
|
console.log(`✅ Still valid: ${validated}`);
|
||
|
|
console.log(`📦 Archived: ${archived}`);
|
||
|
|
console.log(`❌ Temporary failures: ${failed}`);
|
||
|
|
console.log(`⚠️ Errors: ${errors}`);
|
||
|
|
console.log(`Completion time: ${new Date().toISOString()}`);
|
||
|
|
|
||
|
|
// Print statistics
|
||
|
|
const stats = await pool.query(
|
||
|
|
`SELECT
|
||
|
|
COUNT(*) FILTER (WHERE status = 'open') as open,
|
||
|
|
COUNT(*) FILTER (WHERE status = 'closed' AND NOT archived) as closed,
|
||
|
|
COUNT(*) FILTER (WHERE archived) as archived,
|
||
|
|
COUNT(*) as total
|
||
|
|
FROM tenders`
|
||
|
|
);
|
||
|
|
|
||
|
|
console.log(`\\n=== DATABASE STATS ===`);
|
||
|
|
console.log(`Total tenders: ${stats.rows[0].total}`);
|
||
|
|
console.log(`Open: ${stats.rows[0].open}`);
|
||
|
|
console.log(`Closed: ${stats.rows[0].closed}`);
|
||
|
|
console.log(`Archived: ${stats.rows[0].archived}`);
|
||
|
|
|
||
|
|
} catch (error) {
|
||
|
|
console.error('Fatal error:', error);
|
||
|
|
} finally {
|
||
|
|
await pool.end();
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
validateAndArchive();
|