Files
tenderpilot/scrapers/find-tender.js

131 lines
4.5 KiB
JavaScript
Raw Normal View History

import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';
// Load .env before anything below reads process.env (dotenv's documented behaviour).
dotenv.config();

// Shared PostgreSQL connection pool for this script.
// DATABASE_URL wins when set; otherwise the local development connection string is used.
// NOTE(review): the fallback embeds a username/password in source — consider requiring
// DATABASE_URL outside local development.
const { Pool } = pg;
const connectionString =
  process.env.DATABASE_URL ||
  'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot';
const pool = new Pool({ connectionString });
/**
 * Rate-limiting helper: returns a promise that settles after a timer elapses.
 *
 * @param {number} ms - Milliseconds handed to `setTimeout`.
 * @returns {Promise<undefined>} Resolves (with no value) once the timer fires.
 */
const delay = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
// Site origin: used for the search URL and to absolutise relative notice links.
const FIND_TENDER_ORIGIN = 'https://www.find-tender.service.gov.uk';

// Month lookup keyed on the first three letters, so "Jan" and "January" both resolve.
const MONTH_INDEX = new Map([
  ['jan', 0], ['feb', 1], ['mar', 2], ['apr', 3], ['may', 4], ['jun', 5],
  ['jul', 6], ['aug', 7], ['sep', 8], ['oct', 9], ['nov', 10], ['dec', 11],
]);

// Parameter order matches the array built by parseSearchResult().
const INSERT_TENDER_SQL = `INSERT INTO tenders (
  source, source_id, title, description, summary, cpv_codes,
  value_low, value_high, currency, published_date, deadline,
  authority_name, authority_type, location, documents_url, notice_url, status, sector
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
ON CONFLICT (source_id) DO NOTHING`;

/**
 * Converts a "D Month YYYY" date (e.g. "5 January 2024") to an ISO-8601 string at
 * midnight UTC.
 *
 * Parsing is done by hand rather than with `new Date(text)`: that form is
 * implementation-dependent for non-ISO input, interprets the date in the server's
 * local time zone, and makes `toISOString()` throw on anything it cannot read.
 *
 * @param {string} text - Date text captured from the result metadata.
 * @returns {string|null} ISO timestamp, or null when the text is not a real calendar date.
 */
function parseNoticeDate(text) {
  const parts = /^(\d{1,2})\s+(\w+)\s+(\d{4})$/.exec(text.trim());
  if (!parts) {
    return null;
  }
  const day = Number(parts[1]);
  const month = MONTH_INDEX.get(parts[2].slice(0, 3).toLowerCase());
  const year = Number(parts[3]);
  if (month === undefined) {
    return null;
  }
  const date = new Date(Date.UTC(year, month, day));
  // Date.UTC silently rolls impossible dates over (31 February -> March) and maps
  // years 0-99 into the 1900s; reject anything that did not round-trip exactly.
  if (
    date.getUTCFullYear() !== year ||
    date.getUTCMonth() !== month ||
    date.getUTCDate() !== day
  ) {
    return null;
  }
  return date.toISOString();
}

/**
 * Extracts the INSERT parameters for one search result.
 *
 * @param {object} element - Cheerio selection wrapping a single `div.search-result`.
 * @returns {Array|null} The 18 values for INSERT_TENDER_SQL in column order, or null
 *   when the result has no notice link (and therefore no usable source_id).
 */
function parseSearchResult(element) {
  const titleLink = element.find('.search-result-header a').first();
  const rawHref = titleLink.attr('href') || '';
  if (!rawHref) {
    // Without a link the source_id would collapse to the bare site origin.
    return null;
  }
  const title = titleLink.text().trim();
  const noticeUrl = rawHref.startsWith('http') ? rawHref : `${FIND_TENDER_ORIGIN}${rawHref}`;

  // Source ID is the trailing URL segment when it looks like a notice number;
  // otherwise the whole URL is used as the key.
  const idMatch = noticeUrl.match(/\/([A-Z0-9-]+)$/);
  const sourceId = idMatch ? idMatch[1] : noticeUrl;

  const authority = element.find('.search-result-sub-header').text().trim();
  const description = element.find('.search-result-description').text().trim();

  // Dates and value all live in one free-text metadata line.
  const metadata = element.find('.search-result-metadata').text();
  const publishedMatch = metadata.match(/Published:\s*(\d{1,2}\s+\w+\s+\d{4})/);
  const publishedDate = publishedMatch ? parseNoticeDate(publishedMatch[1]) : null;
  const deadlineMatch = metadata.match(/Deadline:\s*(\d{1,2}\s+\w+\s+\d{4})/);
  const deadline = deadlineMatch ? parseNoticeDate(deadlineMatch[1]) : null;
  const valueMatch = metadata.match(/£([\d,]+)/);
  const valueLow = valueMatch ? parseFloat(valueMatch[1].replace(/,/g, '')) : null;

  // A tender with no (parseable) deadline is recorded as closed.
  const status = deadline && new Date(deadline) > new Date() ? 'open' : 'closed';

  return [
    'find_tender',
    sourceId,
    title.substring(0, 500),
    description,
    description.substring(0, 500),
    [],
    valueLow,
    valueLow, // only one figure is scraped, so low and high are the same
    'GBP',
    publishedDate,
    deadline,
    authority,
    'government',
    'UK',
    '',
    noticeUrl,
    status,
    classifySector(title, description, authority),
  ];
}

/**
 * Scrapes the most recent Find a Tender search results and stores them in `tenders`.
 *
 * Walks up to five result pages (stopping early on an empty page), pausing two
 * seconds between pages. Each result is inserted with ON CONFLICT DO NOTHING, so
 * notices already stored are left untouched and are not counted. A failure on a
 * single result is logged and skipped; any other failure (for example a page fetch)
 * is logged and ends the run. The connection pool is closed in every case.
 *
 * @returns {Promise<void>}
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting Find a Tender scrape...`);
    let insertedCount = 0;
    const maxPages = 5; // Limit to first 5 pages to be respectful

    for (let page = 1; page <= maxPages; page++) {
      console.log(`Fetching page ${page}...`);
      const url = `${FIND_TENDER_ORIGIN}/Search/Results?page=${page}&sort=recent`;
      const response = await axios.get(url, {
        timeout: 30000,
        headers: {
          'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)'
        }
      });

      const $ = cheerio.load(response.data);
      const tenderElements = $('div.search-result');
      if (tenderElements.length === 0) {
        console.log('No more tenders found, stopping pagination');
        break;
      }
      console.log(`Found ${tenderElements.length} tenders on page ${page}`);

      for (let i = 0; i < tenderElements.length; i++) {
        try {
          const params = parseSearchResult(tenderElements.eq(i));
          if (params === null) {
            console.warn(`Skipping result ${i + 1} on page ${page}: no notice link found`);
            continue;
          }
          const result = await pool.query(INSERT_TENDER_SQL, params);
          // rowCount is 0 when ON CONFLICT DO NOTHING skipped an existing notice,
          // so only rows that were actually written are counted.
          insertedCount += result.rowCount || 0;
        } catch (e) {
          console.error('Error inserting tender:', e.message);
        }
      }

      // Rate limiting: wait 2 seconds between pages
      if (page < maxPages) {
        await delay(2000);
      }
    }

    console.log(`[${new Date().toISOString()}] Find a Tender scrape complete. Inserted/updated ${insertedCount} tenders`);
  } catch (error) {
    console.error('Error scraping Find a Tender:', error.message);
  } finally {
    await pool.end();
  }
}
// Entry point: start the scrape as soon as this module is loaded.
// The returned promise is not awaited or given a rejection handler here.
scrapeTenders();