import axios from 'axios';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();

// NOTE(review): the fallback connection string embeds a username/password.
// It is kept so existing local setups keep working, but it should be moved
// out of source control and DATABASE_URL made mandatory.
const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
});

/** Number of days to look back when there is no usable last-scrape timestamp. */
const DEFAULT_LOOKBACK_DAYS = 14;

/** Delay between page requests, in milliseconds (rate limiting). */
const PAGE_DELAY_MS = 1000;

/** Party roles that identify the procuring organisation in a release. */
const BUYER_ROLES = ['buyer', 'procuringEntity', 'procurer'];

/**
 * Resolve after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

/**
 * Build a Date that lies `days` days before the current time.
 *
 * @param {number} days - Number of days to subtract.
 * @returns {Date}
 */
function daysAgo(days) {
  const date = new Date();
  date.setDate(date.getDate() - days);
  return date;
}

/**
 * Decide the `publishedFrom` lower bound for this run.
 *
 * Uses the newest `created_at` of previously stored contracts_finder rows
 * (minus a one-hour overlap) when available; otherwise falls back to the
 * default lookback window. A failing lookup is logged and also falls back.
 *
 * @returns {Promise<Date>}
 */
async function resolvePublishedFrom() {
  try {
    const lastScrape = await pool.query(
      "SELECT MAX(created_at) as last_scrape FROM tenders WHERE source = 'contracts_finder'"
    );
    const lastScrapeAt = lastScrape.rows[0].last_scrape;

    if (lastScrapeAt) {
      // Incremental: get tenders published since last scrape
      const publishedFrom = new Date(lastScrapeAt);
      publishedFrom.setHours(publishedFrom.getHours() - 1); // 1-hour overlap for safety
      console.log(`Incremental mode: fetching since ${publishedFrom.toISOString()}`);
      return publishedFrom;
    }

    // First run: nothing stored yet
    const publishedFrom = daysAgo(DEFAULT_LOOKBACK_DAYS);
    console.log(`First run: fetching last 14 days`);
    return publishedFrom;
  } catch (e) {
    // Do not swallow the reason for the fallback silently.
    console.warn(`Could not determine last scrape time: ${e.message}`);
    const publishedFrom = daysAgo(DEFAULT_LOOKBACK_DAYS);
    console.log(`Fallback: fetching last 14 days`);
    return publishedFrom;
  }
}

/**
 * Turn one OCDS release into the parameter list for the INSERT statement,
 * or report why the release should be skipped.
 *
 * @param {object} release - One entry of the `releases` array of a page.
 * @param {Date} now - Reference time; deadlines before it count as expired.
 * @param {Date} minDeadline - Earliest acceptable deadline.
 * @returns {{skip: 'noDeadline'|'expired'|'tooSoon'|'noUrl'} | {params: Array}}
 */
function buildTenderParams(release, now, minDeadline) {
  const tender = release.tender || {};
  const planning = release.planning || {};
  const parties = release.parties || [];

  // Find procuring entity
  const procurer =
    parties.find(p => p.roles && BUYER_ROLES.some(role => p.roles.includes(role))) ||
    release.buyer ||
    null;

  const sourceId = release.ocid || release.id;
  const title = tender.title || release.title || 'Untitled';
  const description = tender.description || release.description || '';
  const publishedDate = release.date;
  const deadline = tender.tenderPeriod?.endDate;

  if (!deadline) {
    return { skip: 'noDeadline' };
  }

  const deadlineDate = new Date(deadline);
  // An unparseable deadline yields an Invalid Date, for which both comparisons
  // below are false; without this guard it would slip through the filters and
  // only fail at the INSERT. Treat it like a missing deadline.
  if (Number.isNaN(deadlineDate.getTime())) {
    return { skip: 'noDeadline' };
  }
  if (deadlineDate < now) {
    return { skip: 'expired' };
  }
  if (deadlineDate < minDeadline) {
    return { skip: 'tooSoon' };
  }

  const authority = procurer?.name || release.buyer?.name || 'Unknown';
  // NOTE(review): neither the budget description nor the procurement method
  // looks like a location; kept as-is, confirm the intended source field.
  const location = planning?.budget?.description || tender.procurementMethod || '';

  // Build notice URL
  let noticeUrl;
  if (release.url) {
    noticeUrl = release.url;
  } else if (sourceId) {
    const uuid = sourceId.replace('ocds-b5fd17-', '');
    noticeUrl = `https://www.contractsfinder.service.gov.uk/notice/${uuid}`;
  } else {
    return { skip: 'noUrl' }; // Skip if we can't build a URL
  }

  const documentsUrl = tender.documents?.length > 0 ? tender.documents[0].url : '';

  // Extract value: planning budget first, then tender value; low and high are
  // the same single amount.
  const amount = planning?.budget?.amount?.amount || tender.value?.amount || null;

  // The classification's `id` is the code itself; `scheme` only names the
  // coding scheme and must not be stored as a code.
  const classificationId = tender.classification?.id;
  const cpvCodes = classificationId ? [classificationId] : [];

  return {
    params: [
      'contracts_finder',
      sourceId,
      title.substring(0, 500),
      description,
      description.substring(0, 500),
      cpvCodes,
      amount,
      amount,
      'GBP',
      publishedDate,
      deadline,
      authority,
      'government',
      location.substring(0, 255),
      documentsUrl,
      noticeUrl,
      'open',
      classifySector(title, description, authority)
    ]
  };
}

/**
 * IMPROVED CONTRACTS FINDER SCRAPER
 *
 * Enhancements:
 * 1. No stage=tender filter - fetches ALL notice types (planning, tender, award, contract)
 * 2. Lookback window is 14 days on a first run (captures fresh tenders)
 * 3. Only stores tenders whose deadline is at least 24 hours in the future
 * 4. Incremental mode (starts from the newest stored created_at, minus 1 hour)
 * 5. Per-release and per-page error handling, 1 second pause between pages
 *
 * Fetches pages until there is no `links.next` or a page request fails,
 * inserts new rows into `tenders` (existing source_ids are left untouched),
 * logs a summary and always closes the connection pool.
 *
 * @returns {Promise<void>}
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting IMPROVED tender scrape...`);

    const publishedFrom = await resolvePublishedFrom();
    const dateStr = publishedFrom.toISOString().split('T')[0];

    // No stage filter, so every notice type is returned.
    const baseUrl = `https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?output=json&publishedFrom=${dateStr}`;

    console.log(`Base URL: ${baseUrl}`);
    console.log(`Getting ALL notice types (not just stage=tender)`);

    let insertedCount = 0;
    let totalProcessed = 0;
    let pageNum = 1;
    const skipped = { expired: 0, noDeadline: 0, tooSoon: 0, noUrl: 0 };

    // Filter criteria - only tenders with deadline >= 24 hours in future
    const now = new Date();
    const minDeadline = new Date(now.getTime() + 24 * 60 * 60 * 1000);
    console.log(`Filtering: deadline must be after ${minDeadline.toISOString()}`);

    let nextPageUrl = baseUrl;
    while (nextPageUrl) {
      try {
        console.log(`\nFetching page ${pageNum}...`);

        const response = await axios.get(nextPageUrl, {
          timeout: 30000,
          headers: {
            'User-Agent': 'TenderRadar/2.0 (UK Public Procurement Monitor)'
          }
        });

        const data = response.data;
        const releases = data.releases || [];
        console.log(`  Received ${releases.length} releases`);

        for (const release of releases) {
          totalProcessed++;

          // A failure on one release must not abort the rest of the page.
          try {
            const outcome = buildTenderParams(release, now, minDeadline);
            if (outcome.skip) {
              skipped[outcome.skip]++;
              continue;
            }

            const result = await pool.query(
              `INSERT INTO tenders (
                source, source_id, title, description, summary, cpv_codes,
                value_low, value_high, currency, published_date, deadline,
                authority_name, authority_type, location, documents_url, notice_url, status, sector
              ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
              ON CONFLICT (source_id) DO NOTHING
              RETURNING id`,
              outcome.params
            );

            if (result.rowCount > 0) {
              insertedCount++;
              if (insertedCount % 10 === 0) {
                console.log(`  Inserted ${insertedCount} tenders so far...`);
              }
            }
          } catch (e) {
            console.error(`  Error processing tender ${totalProcessed}:`, e.message);
          }
        }

        console.log(`Page ${pageNum} complete: processed ${releases.length} releases`);
        console.log(`  Inserted: ${insertedCount} | Skipped (expired: ${skipped.expired}, no deadline: ${skipped.noDeadline}, too soon: ${skipped.tooSoon})`);

        // Check for next page
        nextPageUrl = data.links?.next || null;
        if (nextPageUrl) {
          pageNum++;
          // Rate limiting: 1 second between pages
          await sleep(PAGE_DELAY_MS);
        }
      } catch (error) {
        // A failed page request ends pagination; what was inserted so far stays.
        console.error(`Error fetching page ${pageNum}:`, error.message);
        nextPageUrl = null;
      }
    }

    console.log(`\n=== SCRAPE COMPLETE ===`);
    console.log(`Total processed: ${totalProcessed}`);
    console.log(`Inserted: ${insertedCount}`);
    console.log(`Skipped - expired: ${skipped.expired}`);
    console.log(`Skipped - no deadline: ${skipped.noDeadline}`);
    console.log(`Skipped - deadline < 24h: ${skipped.tooSoon}`);
    console.log(`Skipped - no notice URL: ${skipped.noUrl}`);
    console.log(`Completion time: ${new Date().toISOString()}`);
  } catch (error) {
    console.error('Fatal error in scraper:', error.message);
    console.error(error.stack);
  } finally {
    await pool.end();
  }
}

scrapeTenders();