import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();

const pool = new pg.Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot',
});

// Rate limiting: resolve after `ms` milliseconds.
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

const SITE_ROOT = 'https://www.find-tender.service.gov.uk';

// Month name -> 0-indexed month number. Used for deterministic parsing of
// the site's "12 March 2024" dates: `new Date(string)` on non-ISO input is
// implementation-dependent and yields server-local midnight, which made the
// stored timestamps depend on the host's timezone.
const MONTHS = Object.freeze({
  january: 0, february: 1, march: 2, april: 3, may: 4, june: 5,
  july: 6, august: 7, september: 8, october: 9, november: 10, december: 11,
});

/**
 * Parse a UK-style "D Month YYYY" date into an ISO-8601 string (UTC midnight).
 *
 * @param {string|undefined} text - e.g. "12 March 2024".
 * @returns {string|null} ISO timestamp, or null when the text is unparseable.
 */
function parseUkDate(text) {
  const m = (text ?? '').match(/(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})/);
  if (!m) return null;
  const month = MONTHS[m[2].toLowerCase()];
  if (month === undefined) return null;
  return new Date(Date.UTC(Number(m[3]), month, Number(m[1]))).toISOString();
}

/**
 * Extract one tender record from a `div.search-result` element.
 *
 * @param {cheerio.Cheerio} element - the search-result element.
 * @returns {object} Parsed fields ready for insertion.
 */
function extractTender(element) {
  const titleLink = element.find('.search-result-header a').first();
  const title = titleLink.text().trim();
  const rawHref = titleLink.attr('href') || '';
  const noticeUrl = rawHref.startsWith('http') ? rawHref : SITE_ROOT + rawHref;

  // The notice ID is the last path segment of the notice URL; fall back to
  // the full URL so the ON CONFLICT key is still unique per notice.
  const urlMatch = noticeUrl.match(/\/([A-Z0-9-]+)$/);
  const sourceId = urlMatch ? urlMatch[1] : noticeUrl;

  const authority = element.find('.search-result-sub-header').text().trim();
  const description = element.find('.search-result-description').text().trim();

  // Dates and contract value live in a free-text metadata blob.
  const metadata = element.find('.search-result-metadata').text();

  const publishMatch = metadata.match(/Published:\s*(\d{1,2}\s+\w+\s+\d{4})/);
  const publishedDate = publishMatch ? parseUkDate(publishMatch[1]) : null;

  const deadlineMatch = metadata.match(/Deadline:\s*(\d{1,2}\s+\w+\s+\d{4})/);
  const deadline = deadlineMatch ? parseUkDate(deadlineMatch[1]) : null;

  const valueMatch = metadata.match(/£([\d,]+)/);
  const valueLow = valueMatch
    ? parseFloat(valueMatch[1].replace(/,/g, ''))
    : null;

  return {
    sourceId,
    title,
    noticeUrl,
    authority,
    description,
    publishedDate,
    deadline,
    valueLow,
  };
}

/**
 * Insert a parsed tender into the `tenders` table.
 *
 * Duplicate notices (same source_id) are skipped via ON CONFLICT DO NOTHING.
 *
 * @param {object} tender - record produced by extractTender().
 * @returns {Promise<number>} Number of rows actually inserted (0 or 1).
 */
async function insertTender(tender) {
  const {
    sourceId, title, noticeUrl, authority, description,
    publishedDate, deadline, valueLow,
  } = tender;

  const result = await pool.query(
    `INSERT INTO tenders ( source, source_id, title, description, summary, cpv_codes, value_low, value_high, currency, published_date, deadline, authority_name, authority_type, location, documents_url, notice_url, status, sector ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18) ON CONFLICT (source_id) DO NOTHING`,
    [
      'find_tender',
      sourceId,
      title.substring(0, 500),
      description,
      description.substring(0, 500),
      [],
      valueLow,
      valueLow, // no upper bound on the listing page; mirror the low value
      'GBP',
      publishedDate,
      deadline,
      authority,
      'government',
      'UK',
      '',
      noticeUrl,
      deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
      classifySector(title, description, authority),
    ]
  );
  return result.rowCount;
}

/**
 * Scrape recent notices from the UK "Find a Tender" service and store them.
 *
 * Fetches up to 5 result pages (2-second pause between pages to be polite),
 * parses each `div.search-result`, and inserts new tenders. Always closes
 * the connection pool before returning.
 *
 * @returns {Promise<void>}
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting Find a Tender scrape...`);

    let insertedCount = 0;
    const maxPages = 5; // Limit to first 5 pages to be respectful

    for (let page = 1; page <= maxPages; page++) {
      console.log(`Fetching page ${page}...`);

      const url = new URL('/Search/Results', SITE_ROOT);
      url.searchParams.set('page', String(page));
      url.searchParams.set('sort', 'recent');

      const response = await axios.get(url.toString(), {
        timeout: 30000,
        headers: {
          'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)',
        },
      });

      const $ = cheerio.load(response.data);
      const tenderElements = $('div.search-result');

      if (tenderElements.length === 0) {
        console.log('No more tenders found, stopping pagination');
        break;
      }

      console.log(`Found ${tenderElements.length} tenders on page ${page}`);

      for (let i = 0; i < tenderElements.length; i++) {
        try {
          const tender = extractTender(tenderElements.eq(i));
          // rowCount is 0 when ON CONFLICT skipped a duplicate, so the
          // count now reflects rows actually written (the old code counted
          // every attempt, duplicates included).
          insertedCount += await insertTender(tender);
        } catch (e) {
          console.error('Error inserting tender:', e.message);
        }
      }

      // Rate limiting: wait 2 seconds between pages
      if (page < maxPages) {
        await delay(2000);
      }
    }

    console.log(`[${new Date().toISOString()}] Find a Tender scrape complete. Inserted/updated ${insertedCount} tenders`);
  } catch (error) {
    console.error('Error scraping Find a Tender:', error.message);
  } finally {
    await pool.end();
  }
}

scrapeTenders();