2026-02-14 14:17:15 +00:00
|
|
|
import axios from 'axios';
|
|
|
|
|
import * as cheerio from 'cheerio';
|
2026-02-14 17:12:51 +00:00
|
|
|
import { classifySector } from './classify-sector.js';
|
2026-02-14 14:17:15 +00:00
|
|
|
import pg from 'pg';
|
|
|
|
|
import dotenv from 'dotenv';
|
|
|
|
|
|
|
|
|
|
// Run dotenv first so that DATABASE_URL can be supplied through the environment
// file before it is read below.
dotenv.config();

// NOTE(review): the fallback below embeds a username and password in source
// control. It looks like a local-development default — confirm it is never a
// valid credential outside a developer machine.
const connectionString =
  process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot';

// Single PostgreSQL pool shared by everything in this script.
const pool = new pg.Pool({ connectionString });
|
|
|
|
|
|
|
|
|
|
/**
 * Rate-limiting helper: pauses the caller for a fixed amount of time.
 *
 * @param {number} ms - How long to wait, in milliseconds.
 * @returns {Promise<undefined>} Resolves with no value once the timer fires.
 */
const delay = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
|
|
|
|
|
|
|
|
|
|
const FIND_TENDER_BASE_URL = 'https://www.find-tender.service.gov.uk';
const FIND_TENDER_MAX_PAGES = 5; // Limit to first 5 pages to be respectful
const FIND_TENDER_PAGE_DELAY_MS = 2000;

const FIND_TENDER_MONTH_NAMES = [
  'january', 'february', 'march', 'april', 'may', 'june',
  'july', 'august', 'september', 'october', 'november', 'december',
];

// Lookup of lower-cased month name (full, or three-letter abbreviation) to a
// zero-based month index as used by the Date constructor.
const FIND_TENDER_MONTH_INDEX = new Map(
  FIND_TENDER_MONTH_NAMES.flatMap((name, index) => [
    [name, index],
    [name.slice(0, 3), index],
  ]),
);
FIND_TENDER_MONTH_INDEX.set('sept', 8);

const INSERT_TENDER_SQL = `INSERT INTO tenders (
  source, source_id, title, description, summary, cpv_codes,
  value_low, value_high, currency, published_date, deadline,
  authority_name, authority_type, location, documents_url, notice_url, status, sector
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
ON CONFLICT (source_id) DO NOTHING`;

/**
 * Converts a "14 February 2026" style date into an ISO-8601 timestamp.
 *
 * The date is interpreted as local midnight. Text that does not describe a
 * real calendar date yields `null` instead of throwing, so one malformed date
 * does not cause the whole tender to be discarded.
 *
 * @param {string} text - Day, month name and four-digit year separated by whitespace.
 * @returns {string|null} ISO timestamp, or `null` when the text cannot be parsed.
 */
function parseNoticeDate(text) {
  const match = /^(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})$/.exec(text.trim());
  if (!match) {
    return null;
  }

  const day = Number(match[1]);
  const monthIndex = FIND_TENDER_MONTH_INDEX.get(match[2].toLowerCase());
  const year = Number(match[3]);
  if (monthIndex === undefined) {
    return null;
  }

  const date = new Date(year, monthIndex, day);
  // Reject overflowing dates such as "31 February", which Date silently rolls over.
  const isSameDate =
    date.getFullYear() === year && date.getMonth() === monthIndex && date.getDate() === day;
  return isSameDate ? date.toISOString() : null;
}

/**
 * Reads one search-result element and builds the row to store for it.
 *
 * @param {object} element - Cheerio selection wrapping a single `div.search-result`.
 * @returns {object|null} The tender fields, or `null` when the result carries no
 *   usable notice link (without one there is no meaningful source id).
 * @throws {TypeError} If the link's href cannot be resolved into a URL.
 */
function extractTender(element) {
  const titleLink = element.find('.search-result-header a').first();
  const title = titleLink.text().trim();
  const rawHref = (titleLink.attr('href') || '').trim();
  if (rawHref === '') {
    return null;
  }

  // Resolve relative hrefs against the site root, then drop the query string
  // and fragment to get a clean notice URL.
  const url = new URL(rawHref, FIND_TENDER_BASE_URL);
  if (url.protocol !== 'http:' && url.protocol !== 'https:') {
    return null;
  }
  url.search = '';
  url.hash = '';
  const noticeUrl = url.toString();

  // Extract source ID from URL; fall back to the URL itself when the pattern is absent.
  const urlMatch = noticeUrl.match(/\/Notice\/([A-Z0-9-]+)/);
  const sourceId = urlMatch ? urlMatch[1] : noticeUrl;

  const authority = element.find('.search-result-sub-header').text().trim();
  const description = element.find('.search-result-description').text().trim();

  // Dates and value are all scraped from the free-text metadata line.
  const metadata = element.find('.search-result-metadata').text();
  const publishMatch = metadata.match(/Published:\s*(\d{1,2}\s+\w+\s+\d{4})/);
  const deadlineMatch = metadata.match(/Deadline:\s*(\d{1,2}\s+\w+\s+\d{4})/);
  const valueMatch = metadata.match(/£([\d,]+)/);

  const publishedDate = publishMatch ? parseNoticeDate(publishMatch[1]) : null;
  const deadline = deadlineMatch ? parseNoticeDate(deadlineMatch[1]) : null;
  const valueLow = valueMatch ? parseFloat(valueMatch[1].replace(/,/g, '')) : null;

  return {
    sourceId,
    title,
    description,
    valueLow,
    publishedDate,
    deadline,
    authority,
    noticeUrl,
  };
}

/**
 * Inserts one tender row; rows whose source id already exists are left alone.
 *
 * @param {object} tender - Fields produced by `extractTender`.
 * @returns {Promise<boolean>} `true` only when a new row was actually written.
 */
async function insertTender(tender) {
  const { sourceId, title, description, valueLow, publishedDate, deadline, authority, noticeUrl } =
    tender;

  const result = await pool.query(INSERT_TENDER_SQL, [
    'find_tender',
    sourceId,
    title.substring(0, 500),
    description,
    description.substring(0, 500),
    [],
    valueLow,
    valueLow, // only a single figure is scraped, so it is stored as both low and high
    'GBP',
    publishedDate,
    deadline,
    authority,
    'government',
    'UK',
    '',
    noticeUrl,
    deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
    classifySector(title, description, authority),
  ]);

  // rowCount is 0 when ON CONFLICT DO NOTHING skipped the row.
  return (result.rowCount || 0) > 0;
}

/**
 * Scrapes the most recent Find a Tender search-result pages and stores each
 * notice in the `tenders` table.
 *
 * Fetches up to FIND_TENDER_MAX_PAGES pages, pausing between pages, and stops
 * early when a page contains no results. A failure on a single tender is
 * logged and skipped; a failure fetching a page aborts the run and is logged.
 * The shared connection pool is always closed at the end, so this function is
 * meant to be called once per process.
 *
 * @returns {Promise<void>} Resolves when the run is finished and the pool is closed.
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting Find a Tender scrape...`);

    let insertedCount = 0;

    for (let page = 1; page <= FIND_TENDER_MAX_PAGES; page++) {
      console.log(`Fetching page ${page}...`);

      const url = `${FIND_TENDER_BASE_URL}/Search/Results?page=${page}&sort=recent`;
      const response = await axios.get(url, {
        timeout: 30000,
        headers: {
          'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)',
        },
      });

      const $ = cheerio.load(response.data);
      const tenderElements = $('div.search-result');

      if (tenderElements.length === 0) {
        console.log('No more tenders found, stopping pagination');
        break;
      }

      console.log(`Found ${tenderElements.length} tenders on page ${page}`);

      for (const node of tenderElements.toArray()) {
        try {
          const tender = extractTender($(node));
          if (tender === null) {
            console.warn('Skipping search result without a notice link');
            continue;
          }
          // Count only rows that were really written, not conflicts that were skipped.
          if (await insertTender(tender)) {
            insertedCount++;
          }
        } catch (e) {
          console.error('Error inserting tender:', e.message);
        }
      }

      // Rate limiting: wait 2 seconds between pages
      if (page < FIND_TENDER_MAX_PAGES) {
        await delay(FIND_TENDER_PAGE_DELAY_MS);
      }
    }

    console.log(
      `[${new Date().toISOString()}] Find a Tender scrape complete. Inserted ${insertedCount} new tenders`,
    );
  } catch (error) {
    console.error('Error scraping Find a Tender:', error.message);
  } finally {
    await pool.end();
  }
}
|
|
|
|
|
|
|
|
|
|
// Start the scrape as soon as the module is executed. scrapeTenders() logs
// scraping errors itself, but closing the pool in its `finally` can still
// reject; report that and signal failure instead of leaving the promise floating.
scrapeTenders().catch((error) => {
  console.error('Fatal error in Find a Tender scraper:', error);
  process.exitCode = 1;
});
|