import axios from 'axios';
import pg from 'pg';
import dotenv from 'dotenv';
import { classifySector } from './classify-sector.js';

dotenv.config();

// Fall back to the local development database when DATABASE_URL is not set.
const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL ||
    'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
});

function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting tender scrape...`);

    // Only fetch notices published in the last 90 days.
    const fromDate = new Date();
    fromDate.setDate(fromDate.getDate() - 90);
    const dateStr = fromDate.toISOString().split('T')[0];

    const baseUrl = `https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?stage=tender&output=json&publishedFrom=${dateStr}`;
    console.log(`Base URL: ${baseUrl}`);

    let insertedCount = 0;
    let totalProcessed = 0;
    let pageNum = 1;
    let hasNextPage = true;
    let nextPageUrl = baseUrl;

    while (hasNextPage) {
      try {
        console.log(`\nFetching page ${pageNum}...`);
        const response = await axios.get(nextPageUrl, { timeout: 30000 });
        const data = response.data;
        const releases = data.releases || [];

        for (const release of releases) {
          try {
            const tender = release.tender || {};
            const planning = release.planning || {};
            const parties = release.parties || [];

            // Find the procuring entity among the release parties,
            // falling back to the top-level buyer if none is flagged.
            const procurer = parties.find(p =>
              p.roles && (
                p.roles.includes('buyer') ||
                p.roles.includes('procuringEntity') ||
                p.roles.includes('procurer')
              )
            ) || (release.buyer ? release.buyer : null);

            const sourceId = release.ocid || release.id;
            const title = tender.title || 'Untitled';
            const description = tender.description || '';
            const publishedDate = release.date;
            const deadline = tender.tenderPeriod?.endDate;

            // Skip tenders whose submission deadline has already passed.
            if (deadline && new Date(deadline) < new Date()) continue;

            const authority = procurer?.name || 'Unknown';
            const location = planning?.budget?.description || tender.procurementMethod || '';
            const noticeUrl = release.url || 'https://www.contractsfinder.service.gov.uk/Search';
            const documentsUrl = tender.documents?.length > 0 ? tender.documents[0].url : '';

            // Extract the estimated value from the planning budget,
            // falling back to the tender value.
            let valueLow = null, valueHigh = null;
            if (planning?.budget?.amount?.amount) {
              valueLow = planning.budget.amount.amount;
              valueHigh = planning.budget.amount.amount;
            } else if (tender.value?.amount) {
              valueLow = tender.value.amount;
              valueHigh = tender.value.amount;
            }

            // Use the classification id (the CPV code itself), not the scheme name.
            const cpvCodes = tender.classification?.id ? [tender.classification.id] : [];

            const result = await pool.query(
              `INSERT INTO tenders (
                source, source_id, title, description, summary, cpv_codes,
                value_low, value_high, currency, published_date, deadline,
                authority_name, authority_type, location, documents_url,
                notice_url, status, sector
              ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
              ON CONFLICT (source_id) DO NOTHING`,
              [
                'contracts_finder',
                sourceId,
                title.substring(0, 500),
                description,
                description.substring(0, 500),
                cpvCodes,
                valueLow,
                valueHigh,
                'GBP',
                publishedDate,
                deadline,
                authority,
                'government',
                location.substring(0, 255),
                documentsUrl,
                noticeUrl,
                'open',
                classifySector(title, description, authority)
              ]
            );

            if (result.rowCount > 0) {
              insertedCount++;
            }
            totalProcessed++;
          } catch (e) {
            console.error('Error inserting tender:', e.message);
          }
        }

        console.log(`Page ${pageNum}: fetched ${releases.length} tenders (total: ${totalProcessed})`);

        // Follow pagination links; pause between pages to avoid rate limiting.
        if (data.links && data.links.next) {
          nextPageUrl = data.links.next;
          pageNum++;
          await sleep(1000);
        } else {
          hasNextPage = false;
        }
      } catch (error) {
        console.error(`Error fetching page ${pageNum}:`, error.message);
        hasNextPage = false;
      }
    }

    console.log(`\n[${new Date().toISOString()}] Scrape complete. Inserted ${insertedCount} new tenders (total processed: ${totalProcessed})`);
  } catch (error) {
    console.error('Error scraping tenders:', error.message);
  } finally {
    await pool.end();
  }
}

scrapeTenders();
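/*
 * Assumed schema sketch (not defined in this script): the INSERT above expects a
 * `tenders` table with the 18 columns listed and a unique constraint on `source_id`
 * so that ON CONFLICT (source_id) DO NOTHING works. The column types below are
 * illustrative assumptions only, inferred from how the values are built here.
 *
 *   CREATE TABLE IF NOT EXISTS tenders (
 *     source          TEXT,
 *     source_id       TEXT UNIQUE,
 *     title           VARCHAR(500),
 *     description     TEXT,
 *     summary         VARCHAR(500),
 *     cpv_codes       TEXT[],
 *     value_low       NUMERIC,
 *     value_high      NUMERIC,
 *     currency        VARCHAR(3),
 *     published_date  TIMESTAMPTZ,
 *     deadline        TIMESTAMPTZ,
 *     authority_name  TEXT,
 *     authority_type  TEXT,
 *     location        VARCHAR(255),
 *     documents_url   TEXT,
 *     notice_url      TEXT,
 *     status          TEXT,
 *     sector          TEXT
 *   );
 */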