import axios from 'axios';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

// Run dotenv before process.env.DATABASE_URL is read below.
dotenv.config();

// Shared PostgreSQL connection pool used by every query in this script.
// NOTE(review): the fallback connection string embeds a username and password
// in source; prefer requiring DATABASE_URL everywhere except local development.
const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
});

/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay passed to `setTimeout`.
 * @returns {Promise<void>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * CONTRACTS FINDER SCRAPER
 *
 * Pages through the Contracts Finder OCDS search endpoint and inserts
 * releases into the `tenders` table.
 *
 * Behaviour:
 *  1. No `stage=tender` filter is sent, so every notice type is requested.
 *  2. On the first run (or when the last-scrape lookup fails) the lookback
 *     window is 14 days.
 *  3. Only releases whose tender deadline is at least 24 hours in the future
 *     are stored; releases without a usable deadline are skipped.
 *  4. Incremental mode: later runs start from the newest stored `created_at`
 *     for this source, minus a one-hour overlap.
 *  5. Errors on a single release are logged and skipped; an error fetching a
 *     page stops pagination. One second is waited between pages.
 */

/**
 * Decide the earliest publication date to request.
 *
 * @returns {Promise<Date>} Newest `created_at` of stored `contracts_finder`
 *   rows minus one hour, or 14 days ago when there are no rows or the lookup
 *   fails.
 */
async function resolvePublishedFrom() {
  const daysAgo = (days) => {
    const start = new Date();
    start.setDate(start.getDate() - days);
    return start;
  };

  try {
    const lastScrape = await pool.query(
      "SELECT MAX(created_at) as last_scrape FROM tenders WHERE source = 'contracts_finder'"
    );
    const lastScrapeAt = lastScrape.rows[0].last_scrape;

    if (lastScrapeAt) {
      // Incremental: get tenders published since last scrape
      const publishedFrom = new Date(lastScrapeAt);
      publishedFrom.setHours(publishedFrom.getHours() - 1); // 1-hour overlap for safety
      console.log(`Incremental mode: fetching since ${publishedFrom.toISOString()}`);
      return publishedFrom;
    }

    console.log(`First run: fetching last 14 days`);
    return daysAgo(14);
  } catch (e) {
    // Report why the lookup failed instead of silently falling back.
    console.error('Could not determine last scrape time:', e.message);
    console.log(`Fallback: fetching last 14 days`);
    return daysAgo(14);
  }
}

/**
 * Turn one OCDS release into the field values stored for a tender.
 *
 * @param {object} release - One entry of the response's `releases` array.
 * @param {Date} now - Reference time; deadlines before it count as expired.
 * @param {Date} minDeadline - Deadlines before it count as too soon.
 * @returns {{skipReason: string} | {record: object}} Either the reason the
 *   release is skipped ('noDeadline', 'expired', 'tooSoon', 'noUrl') or the
 *   extracted field values.
 */
function buildTenderRecord(release, now, minDeadline) {
  const tender = release.tender || {};
  const planning = release.planning || {};
  const parties = release.parties || [];

  // Find procuring entity
  const buyerRoles = ['buyer', 'procuringEntity', 'procurer'];
  const procurer =
    parties.find((p) => p.roles && buyerRoles.some((role) => p.roles.includes(role))) ||
    release.buyer ||
    null;

  const sourceId = release.ocid || release.id;
  const title = tender.title || release.title || 'Untitled';
  const description = tender.description || release.description || '';
  const publishedDate = release.date;
  const deadline = tender.tenderPeriod?.endDate;

  if (!deadline) {
    return { skipReason: 'noDeadline' };
  }

  const deadlineDate = new Date(deadline);
  // Comparisons against an Invalid Date are always false, so an unparseable
  // deadline would otherwise pass both checks below; treat it as missing.
  if (Number.isNaN(deadlineDate.getTime())) {
    return { skipReason: 'noDeadline' };
  }
  if (deadlineDate < now) {
    return { skipReason: 'expired' };
  }
  if (deadlineDate < minDeadline) {
    return { skipReason: 'tooSoon' };
  }

  const authority = procurer?.name || release.buyer?.name || 'Unknown';
  // NOTE(review): "location" is taken from the budget description or the
  // procurement method, as in the original code — confirm this is intended.
  const location = planning?.budget?.description || tender.procurementMethod || '';

  // Build notice URL
  let noticeUrl;
  if (release.url) {
    noticeUrl = release.url;
  } else if (sourceId) {
    const uuid = sourceId.replace('ocds-b5fd17-', '');
    noticeUrl = `https://www.contractsfinder.service.gov.uk/notice/${uuid}`;
  } else {
    return { skipReason: 'noUrl' }; // Skip if we can't build a URL
  }

  const documentsUrl = tender.documents?.length > 0 ? tender.documents[0].url : '';

  // Extract value: planning budget first, then tender value; both bounds get
  // the same figure.
  const amount = planning?.budget?.amount?.amount || tender.value?.amount || null;

  // Store the classification code (`id`), not the scheme name.
  const cpvCodes = tender.classification?.id ? [tender.classification.id] : [];

  return {
    record: {
      sourceId,
      title,
      description,
      cpvCodes,
      valueLow: amount,
      valueHigh: amount,
      publishedDate,
      deadline,
      authority,
      location,
      documentsUrl,
      noticeUrl,
    },
  };
}

/**
 * Run one full scrape: fetch every result page, filter releases by deadline,
 * insert new rows (existing `source_id`s are left untouched) and log a
 * summary. Always closes the connection pool when finished.
 *
 * @returns {Promise<void>}
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting IMPROVED tender scrape...`);

    const publishedFrom = await resolvePublishedFrom();
    const dateStr = publishedFrom.toISOString().split('T')[0];

    // No stage=tender filter, so ALL notice types are returned.
    const baseUrl = `https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?output=json&publishedFrom=${dateStr}`;

    console.log(`Base URL: ${baseUrl}`);
    console.log(`Getting ALL notice types (not just stage=tender)`);

    let insertedCount = 0;
    let totalProcessed = 0;
    let pageNum = 1;
    const skipped = { expired: 0, noDeadline: 0, tooSoon: 0 };

    // Filter criteria - only tenders with deadline >= 24 hours in future
    const now = new Date();
    const minDeadline = new Date(now.getTime() + 24 * 60 * 60 * 1000); // 24 hours from now

    console.log(`Filtering: deadline must be after ${minDeadline.toISOString()}`);

    let nextPageUrl = baseUrl;
    while (nextPageUrl) {
      try {
        console.log(`\nFetching page ${pageNum}...`);

        const response = await axios.get(nextPageUrl, {
          timeout: 30000,
          headers: {
            'User-Agent': 'TenderRadar/2.0 (UK Public Procurement Monitor)'
          }
        });

        const data = response.data;
        const releases = data.releases || [];

        console.log(`  Received ${releases.length} releases`);

        for (const release of releases) {
          totalProcessed++;

          try {
            const outcome = buildTenderRecord(release, now, minDeadline);

            if (outcome.skipReason) {
              // 'noUrl' skips are not counted, matching the logged categories.
              if (outcome.skipReason in skipped) {
                skipped[outcome.skipReason]++;
              }
              continue;
            }

            const { record } = outcome;
            const result = await pool.query(
              `INSERT INTO tenders (
                source, source_id, title, description, summary, cpv_codes,
                value_low, value_high, currency, published_date, deadline,
                authority_name, authority_type, location, documents_url, notice_url, status, sector
              ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
              ON CONFLICT (source_id) DO NOTHING RETURNING id`,
              [
                'contracts_finder',
                record.sourceId,
                record.title.substring(0, 500),
                record.description,
                record.description.substring(0, 500),
                record.cpvCodes,
                record.valueLow,
                record.valueHigh,
                'GBP',
                record.publishedDate,
                record.deadline,
                record.authority,
                'government',
                record.location.substring(0, 255),
                record.documentsUrl,
                record.noticeUrl,
                'open',
                classifySector(record.title, record.description, record.authority)
              ]
            );

            // rowCount is 0 when the source_id already existed.
            if (result.rowCount > 0) {
              insertedCount++;
              if (insertedCount % 10 === 0) {
                console.log(`  Inserted ${insertedCount} tenders so far...`);
              }
            }
          } catch (e) {
            console.error(`  Error processing tender ${totalProcessed}:`, e.message);
          }
        }

        console.log(`Page ${pageNum} complete: processed ${releases.length} releases`);
        console.log(`  Inserted: ${insertedCount} | Skipped (expired: ${skipped.expired}, no deadline: ${skipped.noDeadline}, too soon: ${skipped.tooSoon})`);

        // Check for next page
        nextPageUrl = (data.links && data.links.next) || null;
        if (nextPageUrl) {
          pageNum++;
          // Rate limiting: 1 second between pages
          await sleep(1000);
        }
      } catch (error) {
        console.error(`Error fetching page ${pageNum}:`, error.message);
        nextPageUrl = null; // stop paginating after a failed page
      }
    }

    console.log(`\n=== SCRAPE COMPLETE ===`);
    console.log(`Total processed: ${totalProcessed}`);
    console.log(`Inserted: ${insertedCount}`);
    console.log(`Skipped - expired: ${skipped.expired}`);
    console.log(`Skipped - no deadline: ${skipped.noDeadline}`);
    console.log(`Skipped - deadline < 24h: ${skipped.tooSoon}`);
    console.log(`Completion time: ${new Date().toISOString()}`);
  } catch (error) {
    console.error('Fatal error in scraper:', error.message);
    console.error(error.stack);
  } finally {
    await pool.end();
  }
}

// Run the scrape when this file is executed. scrapeTenders() logs its own
// errors, but closing the pool in its `finally` can still reject, so report
// that rejection instead of leaving the promise unhandled.
scrapeTenders().catch((error) => {
  console.error('Scraper terminated abnormally:', error);
  process.exitCode = 1;
});