Files
tenderpilot/scrapers/find-tender.js.bak
Peter Foster c6b0169f3e feat: three major improvements - stable sources, archival, email alerts
1. Focus on Stable International/Regional Sources
   - Improved TED EU scraper (5 search strategies, 5 pages each; see the sketch after this list)
   - All stable sources now scraped hourly (TED EU, Sell2Wales, PCS Scotland, eTendersNI)
   - De-prioritize unreliable UK gov sites (100% removal rate)
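
   A minimal sketch of the multi-strategy loop described above, assuming an illustrative
   query list and placeholder helpers, since scrapers/ted-eu.js itself is not reproduced here:

      // Hypothetical sketch only: the real scrapers/ted-eu.js is not shown, so the
      // strategy list and the two helpers below are placeholders, not the actual code.
      const SEARCH_STRATEGIES = ['services', 'works', 'supplies', 'consultancy', 'framework'];
      const PAGES_PER_STRATEGY = 5;

      async function fetchTedPage(query, page) { /* assumed: fetch + parse one TED results page */ return []; }
      async function saveTender(notice) { /* assumed: upsert one row into the tenders table */ }

      async function scrapeTedEu() {
        for (const query of SEARCH_STRATEGIES) {
          for (let page = 1; page <= PAGES_PER_STRATEGY; page++) {
            const notices = await fetchTedPage(query, page);
            if (notices.length === 0) break;   // this strategy has run dry; try the next one
            for (const notice of notices) {
              await saveTender(notice);
            }
          }
        }
      }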

2. Archival Feature
   - New DB columns: archived, archived_at, archived_snapshot, last_validated, validation_failures
   - Cleanup script now preserves full tender snapshots before archiving
   - Gradual failure handling (3 retries before archiving; see the sketch after this list)
   - No data loss - historical record preserved
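
   The retry-then-archive flow could look roughly like the sketch below. cleanup-with-archival.mjs
   itself is not shown, so the queries, the id column and the snapshot handling are assumptions;
   only the column names and the 3-failure threshold come from this commit:

      // Hedged sketch of the 3-strikes archival flow; built from the column names
      // above, not from the real cleanup script.
      async function recordValidationFailure(pool, tenderId) {
        // Bump the failure counter and stamp this validation attempt
        const { rows } = await pool.query(
          `UPDATE tenders
              SET validation_failures = validation_failures + 1,
                  last_validated = NOW()
            WHERE id = $1
            RETURNING validation_failures`,
          [tenderId]
        );

        // After the third consecutive failure, snapshot the full row before archiving it
        if (rows[0].validation_failures >= 3) {
          const { rows: [snapshot] } = await pool.query('SELECT * FROM tenders WHERE id = $1', [tenderId]);
          await pool.query(
            `UPDATE tenders
                SET archived = TRUE,
                    archived_at = NOW(),
                    archived_snapshot = $2
              WHERE id = $1`,
            [tenderId, JSON.stringify(snapshot)]
          );
        }
      }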

3. Email Alerts
   - Daily digest (8am) - all new tenders from the last 24 hours
   - High-value alerts (every 4h) - tenders >£100k (see the sketch after this list)
   - Professional HTML emails with all tender details
   - Configurable via environment variables
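
   A rough sketch of the high-value alert pass. send-tender-alerts.mjs is not shown, so
   nodemailer as the transport, the SMTP_*/ALERT_* variable names and the created_at
   column are all assumptions:

      // Hedged sketch: nodemailer, the SMTP_*/ALERT_* variables and the created_at
      // column are assumptions; the real send-tender-alerts.mjs is not shown here.
      import nodemailer from 'nodemailer';

      async function sendHighValueAlert(pool) {
        // Tenders over £100k first seen since the previous 4-hourly run
        const { rows } = await pool.query(
          `SELECT title, authority_name, value_low, deadline, notice_url
             FROM tenders
            WHERE value_low > 100000
              AND created_at > NOW() - INTERVAL '4 hours'
            ORDER BY value_low DESC`
        );
        if (rows.length === 0) return;

        const transporter = nodemailer.createTransport({
          host: process.env.SMTP_HOST,
          port: Number(process.env.SMTP_PORT) || 587,
          auth: { user: process.env.SMTP_USER, pass: process.env.SMTP_PASS }
        });

        const html = rows
          .map(t => `<p><a href="${t.notice_url}">${t.title}</a><br>${t.authority_name} | £${t.value_low} | closes ${t.deadline}</p>`)
          .join('\n');

        await transporter.sendMail({
          from: process.env.ALERT_FROM,
          to: process.env.ALERT_TO,
          subject: `TenderPilot: ${rows.length} high-value tenders (over £100k)`,
          html
        });
      }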

Expected outcomes:
- 50-100 stable tenders (vs 26 currently)
- Zero 404 errors (archived data preserved)
- Proactive notifications (no missed opportunities)
- Historical archive for trend analysis

Files:
- scrapers/ted-eu.js (improved)
- cleanup-with-archival.mjs (new)
- send-tender-alerts.mjs (new)
- migrations/add-archival-fields.sql (new)
- THREE_IMPROVEMENTS_SUMMARY.md (documentation)

All cron jobs updated for hourly scraping + daily cleanup + alerts (illustrative schedule below)
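
An illustrative crontab for that schedule; install paths, run times and script arguments
are assumptions, and only the 8am digest and the 4-hourly alert cadence come from the notes above:

    # Illustrative schedule only; paths and script arguments are assumptions.
    0 * * * *   cd /srv/tenderpilot && node scrapers/ted-eu.js                 # stable sources, hourly
    30 2 * * *  cd /srv/tenderpilot && node cleanup-with-archival.mjs          # daily cleanup + archival
    0 8 * * *   cd /srv/tenderpilot && node send-tender-alerts.mjs daily       # 8am daily digest
    0 */4 * * * cd /srv/tenderpilot && node send-tender-alerts.mjs high-value  # high-value alerts every 4h
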
2026-02-15 14:42:17 +00:00

131 lines
4.5 KiB
JavaScript

// Scrapes recent notices from the UK Find a Tender service and inserts them into the tenders table.
import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();

const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
});

// Rate limiting helper
const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));

async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting Find a Tender scrape...`);
    let insertedCount = 0;
    const maxPages = 5; // Limit to first 5 pages to be respectful

    for (let page = 1; page <= maxPages; page++) {
      console.log(`Fetching page ${page}...`);
      const url = `https://www.find-tender.service.gov.uk/Search/Results?page=${page}&sort=recent`;
      const response = await axios.get(url, {
        timeout: 30000,
        headers: {
          'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)'
        }
      });

      const $ = cheerio.load(response.data);
      const tenderElements = $('div.search-result');

      if (tenderElements.length === 0) {
        console.log('No more tenders found, stopping pagination');
        break;
      }
      console.log(`Found ${tenderElements.length} tenders on page ${page}`);

      for (let i = 0; i < tenderElements.length; i++) {
        try {
          const element = tenderElements.eq(i);
          const titleLink = element.find('.search-result-header a').first();
          const title = titleLink.text().trim();
          const rawHref = titleLink.attr('href') || '';
          const noticeUrl = rawHref.startsWith('http') ? rawHref : 'https://www.find-tender.service.gov.uk' + rawHref;

          // Extract source ID from the trailing segment of the notice URL
          const urlMatch = noticeUrl.match(/\/([A-Z0-9-]+)$/);
          const sourceId = urlMatch ? urlMatch[1] : noticeUrl;

          const authority = element.find('.search-result-sub-header').text().trim();
          const description = element.find('.search-result-description').text().trim();

          // Extract dates and value from the metadata block
          const metadata = element.find('.search-result-metadata').text();
          let publishedDate = null;
          let deadline = null;
          let valueLow = null;

          const publishMatch = metadata.match(/Published:\s*(\d{1,2}\s+\w+\s+\d{4})/);
          if (publishMatch) {
            publishedDate = new Date(publishMatch[1]).toISOString();
          }
          const deadlineMatch = metadata.match(/Deadline:\s*(\d{1,2}\s+\w+\s+\d{4})/);
          if (deadlineMatch) {
            deadline = new Date(deadlineMatch[1]).toISOString();
          }
          const valueMatch = metadata.match(/£([\d,]+)/);
          if (valueMatch) {
            valueLow = parseFloat(valueMatch[1].replace(/,/g, ''));
          }

          const result = await pool.query(
            `INSERT INTO tenders (
              source, source_id, title, description, summary, cpv_codes,
              value_low, value_high, currency, published_date, deadline,
              authority_name, authority_type, location, documents_url, notice_url, status, sector
            ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
            ON CONFLICT (source_id) DO NOTHING`,
            [
              'find_tender',
              sourceId,
              title.substring(0, 500),
              description,
              description.substring(0, 500),
              [],
              valueLow,
              valueLow,
              'GBP',
              publishedDate,
              deadline,
              authority,
              'government',
              'UK',
              '',
              noticeUrl,
              deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
              classifySector(title, description, authority)
            ]
          );
          // Only count rows actually inserted; ON CONFLICT DO NOTHING skips duplicates
          insertedCount += result.rowCount;
        } catch (e) {
          console.error('Error inserting tender:', e.message);
        }
      }

      // Rate limiting: wait 2 seconds between pages
      if (page < maxPages) {
        await delay(2000);
      }
    }

    console.log(`[${new Date().toISOString()}] Find a Tender scrape complete. Inserted ${insertedCount} new tenders`);
  } catch (error) {
    console.error('Error scraping Find a Tender:', error.message);
  } finally {
    await pool.end();
  }
}

scrapeTenders();