feat: three major improvements - stable sources, archival, email alerts
1. Focus on Stable International/Regional Sources - Improved TED EU scraper (5 search strategies, 5 pages each) - All stable sources now hourly (TED EU, Sell2Wales, PCS Scotland, eTendersNI) - De-prioritize unreliable UK gov sites (100% removal rate) 2. Archival Feature - New DB columns: archived, archived_at, archived_snapshot, last_validated, validation_failures - Cleanup script now preserves full tender snapshots before archiving - Gradual failure handling (3 retries before archiving) - No data loss - historical record preserved 3. Email Alerts - Daily digest (8am) - all new tenders from last 24h - High-value alerts (every 4h) - tenders >£100k - Professional HTML emails with all tender details - Configurable via environment variables Expected outcomes: - 50-100 stable tenders (vs 26 currently) - Zero 404 errors (archived data preserved) - Proactive notifications (no missed opportunities) - Historical archive for trend analysis Files: - scrapers/ted-eu.js (improved) - cleanup-with-archival.mjs (new) - send-tender-alerts.mjs (new) - migrations/add-archival-fields.sql (new) - THREE_IMPROVEMENTS_SUMMARY.md (documentation) All cron jobs updated for hourly scraping + daily cleanup + alerts
This commit is contained in:
194
cleanup-with-archival.mjs
Executable file
194
cleanup-with-archival.mjs
Executable file
@@ -0,0 +1,194 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* IMPROVED CLEANUP WITH ARCHIVAL
|
||||
*
|
||||
* Instead of just marking tenders as "closed", this script:
|
||||
* 1. Validates all open tender URLs
|
||||
* 2. Saves full tender snapshot before archiving
|
||||
* 3. Marks removed tenders as "archived" (not just closed)
|
||||
* 4. Keeps tender data accessible even after source removes it
|
||||
* 5. Tracks validation failures
|
||||
*/
|
||||
|
||||
import pg from 'pg';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const pool = new pg.Pool({
|
||||
connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
|
||||
});
|
||||
|
||||
async function validateAndArchive() {
|
||||
try {
|
||||
console.log(`[${new Date().toISOString()}] Starting tender validation with archival...`);
|
||||
|
||||
// Get all open tenders
|
||||
const result = await pool.query(
|
||||
`SELECT id, source, source_id, title, description, notice_url, deadline,
|
||||
value_low, value_high, currency, authority_name, location, sector,
|
||||
cpv_codes, published_date, created_at, validation_failures
|
||||
FROM tenders
|
||||
WHERE status = 'open'
|
||||
ORDER BY created_at DESC`
|
||||
);
|
||||
|
||||
console.log(`Found ${result.rows.length} open tenders to validate\\n`);
|
||||
|
||||
let validated = 0;
|
||||
let archived = 0;
|
||||
let failed = 0;
|
||||
let errors = 0;
|
||||
|
||||
for (const tender of result.rows) {
|
||||
try {
|
||||
// Validate URL
|
||||
const response = await fetch(tender.notice_url, {
|
||||
method: 'HEAD',
|
||||
redirect: 'follow',
|
||||
signal: AbortSignal.timeout(10000)
|
||||
});
|
||||
|
||||
const status = response.status;
|
||||
const finalUrl = response.url;
|
||||
|
||||
const isRemoved = (
|
||||
status === 404 ||
|
||||
finalUrl.includes('/syserror/') ||
|
||||
finalUrl.includes('/notfound') ||
|
||||
finalUrl.includes('/error')
|
||||
);
|
||||
|
||||
if (isRemoved) {
|
||||
// Create snapshot before archiving
|
||||
const snapshot = {
|
||||
source: tender.source,
|
||||
source_id: tender.source_id,
|
||||
title: tender.title,
|
||||
description: tender.description,
|
||||
notice_url: tender.notice_url,
|
||||
deadline: tender.deadline,
|
||||
value_low: tender.value_low,
|
||||
value_high: tender.value_high,
|
||||
currency: tender.currency,
|
||||
authority_name: tender.authority_name,
|
||||
location: tender.location,
|
||||
sector: tender.sector,
|
||||
cpv_codes: tender.cpv_codes,
|
||||
published_date: tender.published_date,
|
||||
scraped_at: tender.created_at,
|
||||
archived_reason: 'URL removed from source',
|
||||
archived_status: status,
|
||||
archived_redirect: finalUrl !== tender.notice_url ? finalUrl : null
|
||||
};
|
||||
|
||||
// Archive the tender
|
||||
await pool.query(
|
||||
`UPDATE tenders
|
||||
SET status = 'closed',
|
||||
archived = TRUE,
|
||||
archived_at = NOW(),
|
||||
archived_snapshot = $1,
|
||||
last_validated = NOW(),
|
||||
validation_failures = validation_failures + 1
|
||||
WHERE id = $2`,
|
||||
[JSON.stringify(snapshot), tender.id]
|
||||
);
|
||||
|
||||
archived++;
|
||||
console.log(`📦 Archived: [${tender.source}] ${tender.title.substring(0, 60)}`);
|
||||
|
||||
} else {
|
||||
// URL still works - update validation timestamp
|
||||
await pool.query(
|
||||
`UPDATE tenders
|
||||
SET last_validated = NOW(),
|
||||
validation_failures = 0
|
||||
WHERE id = $1`,
|
||||
[tender.id]
|
||||
);
|
||||
|
||||
validated++;
|
||||
}
|
||||
|
||||
} catch (error) {
|
||||
// Network error or timeout
|
||||
const currentFailures = tender.validation_failures || 0;
|
||||
|
||||
if (currentFailures >= 3) {
|
||||
// After 3 failures, mark as closed (probably permanently gone)
|
||||
const snapshot = {
|
||||
source: tender.source,
|
||||
source_id: tender.source_id,
|
||||
title: tender.title,
|
||||
description: tender.description,
|
||||
notice_url: tender.notice_url,
|
||||
deadline: tender.deadline,
|
||||
scraped_at: tender.created_at,
|
||||
archived_reason: `Validation failed ${currentFailures + 1} times: ${error.message}`,
|
||||
};
|
||||
|
||||
await pool.query(
|
||||
`UPDATE tenders
|
||||
SET status = 'closed',
|
||||
archived = TRUE,
|
||||
archived_at = NOW(),
|
||||
archived_snapshot = $1,
|
||||
validation_failures = validation_failures + 1
|
||||
WHERE id = $2`,
|
||||
[JSON.stringify(snapshot), tender.id]
|
||||
);
|
||||
|
||||
archived++;
|
||||
console.log(`⚠️ Archived (after ${currentFailures + 1} failures): ${tender.title.substring(0, 60)}`);
|
||||
} else {
|
||||
// Increment failure count but don't close yet
|
||||
await pool.query(
|
||||
`UPDATE tenders
|
||||
SET validation_failures = validation_failures + 1,
|
||||
last_validated = NOW()
|
||||
WHERE id = $1`,
|
||||
[tender.id]
|
||||
);
|
||||
|
||||
failed++;
|
||||
console.log(`❌ Validation failed (${currentFailures + 1}/3): ${tender.title.substring(0, 60)}`);
|
||||
}
|
||||
|
||||
errors++;
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`\\n=== VALIDATION COMPLETE ===`);
|
||||
console.log(`Total checked: ${result.rows.length}`);
|
||||
console.log(`✅ Still valid: ${validated}`);
|
||||
console.log(`📦 Archived: ${archived}`);
|
||||
console.log(`❌ Temporary failures: ${failed}`);
|
||||
console.log(`⚠️ Errors: ${errors}`);
|
||||
console.log(`Completion time: ${new Date().toISOString()}`);
|
||||
|
||||
// Print statistics
|
||||
const stats = await pool.query(
|
||||
`SELECT
|
||||
COUNT(*) FILTER (WHERE status = 'open') as open,
|
||||
COUNT(*) FILTER (WHERE status = 'closed' AND NOT archived) as closed,
|
||||
COUNT(*) FILTER (WHERE archived) as archived,
|
||||
COUNT(*) as total
|
||||
FROM tenders`
|
||||
);
|
||||
|
||||
console.log(`\\n=== DATABASE STATS ===`);
|
||||
console.log(`Total tenders: ${stats.rows[0].total}`);
|
||||
console.log(`Open: ${stats.rows[0].open}`);
|
||||
console.log(`Closed: ${stats.rows[0].closed}`);
|
||||
console.log(`Archived: ${stats.rows[0].archived}`);
|
||||
|
||||
} catch (error) {
|
||||
console.error('Fatal error:', error);
|
||||
} finally {
|
||||
await pool.end();
|
||||
}
|
||||
}
|
||||
|
||||
validateAndArchive();
|
||||
Reference in New Issue
Block a user