import axios from 'axios';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();

// NOTE(review): the fallback connection string embeds credentials in source.
// It is kept so existing local setups continue to work; consider requiring
// DATABASE_URL outside of development.
const pool = new pg.Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot',
});

const REQUEST_TIMEOUT_MS = 8000;
const PAGE_DELAY_MS = 300;

// Single statement shared by every record type written by this script.
const INSERT_TENDER_SQL = `INSERT INTO tenders ( source, source_id, title, description, summary, cpv_codes, value_low, value_high, currency, published_date, deadline, authority_name, authority_type, location, documents_url, notice_url, status, sector ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18) ON CONFLICT (source_id) DO NOTHING`;

// Matches a number that starts with a digit, e.g. "10,000" or "1,250.50".
const BUDGET_NUMBER_PATTERN = /\d[\d,]*(?:\.\d+)?/g;

/**
 * Coerce a value to a string and cut it to at most `maxLength` characters.
 * Coercion keeps a non-string API field from throwing on `.substring`.
 *
 * @param {*} value - Value to convert (already defaulted by the caller).
 * @param {number} maxLength - Maximum number of characters to keep.
 * @returns {string} The clipped string.
 */
function clipText(value, maxLength) {
  return String(value).substring(0, maxLength);
}

/**
 * Parse a monetary amount, ignoring thousands separators.
 *
 * @param {*} value - Number or numeric string such as "1,000.50".
 * @returns {number|null} The parsed amount, or null when it is not a finite number.
 */
function parseAmount(value) {
  const amount =
    typeof value === 'number'
      ? value
      : Number.parseFloat(String(value).replace(/,/g, ''));
  return Number.isFinite(amount) ? amount : null;
}

/**
 * Derive the low/high value bounds of an opportunity.
 * `budgetRange` takes precedence; bounds are only taken from it when it
 * contains at least two numbers (first = low, last = high). Otherwise
 * `minBudget` / `maxBudget` are used when either is truthy.
 *
 * @param {object} opp - Raw opportunity record.
 * @returns {{valueLow: (number|null), valueHigh: (number|null)}} Parsed bounds.
 */
function parseBudget(opp) {
  if (opp.budgetRange) {
    const matches = String(opp.budgetRange).match(BUDGET_NUMBER_PATTERN);
    if (matches && matches.length >= 2) {
      return {
        valueLow: parseAmount(matches[0]),
        valueHigh: parseAmount(matches[matches.length - 1]),
      };
    }
    return { valueLow: null, valueHigh: null };
  }

  return {
    valueLow: opp.minBudget ? parseAmount(opp.minBudget) : null,
    valueHigh: opp.maxBudget ? parseAmount(opp.maxBudget) : null,
  };
}

/**
 * Pull the list of records out of an API response body.
 * Accepts a bare array, or an object holding the list under `key` or `data`.
 *
 * @param {*} payload - Parsed response body.
 * @param {string} key - Property name expected to hold the list.
 * @returns {Array} The records, or an empty array when none can be found.
 */
function extractList(payload, key) {
  if (Array.isArray(payload)) {
    return payload;
  }
  const candidate = payload?.[key] || payload?.data;
  return Array.isArray(candidate) ? candidate : [];
}

/**
 * Insert one row into `tenders`, skipping rows whose source_id already exists.
 *
 * @param {object} tender - Column values for the row.
 * @returns {Promise<number>} Number of rows inserted (0 when it already existed).
 * @throws Re-throws any database error other than a unique violation (23505).
 */
async function insertTender(tender) {
  const values = [
    tender.source,
    tender.sourceId,
    tender.title,
    tender.description,
    tender.summary,
    tender.cpvCodes,
    tender.valueLow,
    tender.valueHigh,
    tender.currency,
    tender.publishedDate,
    tender.deadline,
    tender.authorityName,
    tender.authorityType,
    tender.location,
    tender.documentsUrl,
    tender.noticeUrl,
    tender.status,
    tender.sector,
  ];

  try {
    const result = await pool.query(INSERT_TENDER_SQL, values);
    return result.rowCount || 0;
  } catch (error) {
    if (error.code === '23505') {
      return 0; // Already exists
    }
    throw error;
  }
}

/**
 * Script entry point: scrape the DOS endpoint, fall back to the G-Cloud
 * endpoint when nothing was inserted, close the database pool, then exit
 * with status 0 (or 1 after a fatal error).
 *
 * @returns {Promise<void>} Does not resolve in practice; the process exits.
 */
async function scrapeTenders() {
  let exitCode = 0;

  try {
    console.log(`[${new Date().toISOString()}] Starting Digital Marketplace tender scrape...`);

    let insertedCount = 0;

    // Try to scrape from DOS endpoint
    try {
      insertedCount += await scrapeFromDOSEndpoint();
    } catch (e) {
      console.error('Error scraping DOS endpoint:', e.message);
    }

    // Try alternative endpoint (if available)
    if (insertedCount === 0) {
      try {
        insertedCount += await scrapeFromGCloudEndpoint();
      } catch (e) {
        console.error('Error scraping G-Cloud endpoint:', e.message);
      }
    }

    console.log(`[${new Date().toISOString()}] Scrape complete. Inserted ${insertedCount} tenders`);
  } catch (error) {
    console.error('Fatal error in scraper:', error.message);
    exitCode = 1;
  } finally {
    // Close the pool before exiting: calling process.exit() inside the
    // try/catch would end the process before this block could run.
    try {
      await pool.end();
    } catch (e) {
      console.warn('Error closing database pool:', e.message);
    }
  }

  process.exit(exitCode);
}

/**
 * Page through open opportunities and insert each one.
 * Stops on an empty page, a short page, a request timeout, or after
 * `maxPages` pages. Other per-page errors are logged and the loop moves on.
 *
 * @returns {Promise<number>} Number of rows inserted.
 */
async function scrapeFromDOSEndpoint() {
  const url = 'https://api.digitalmarketplace.service.gov.uk/v0.1/opportunities';
  const pageSize = 50;
  const maxPages = 20;
  let inserted = 0;

  console.log('Attempting to scrape Digital Outcomes & Specialists...');

  for (let page = 1; page <= maxPages; page++) {
    try {
      console.log(`Fetching DOS opportunities page ${page}...`);

      let response;
      try {
        response = await axios.get(url, {
          params: {
            status: 'open',
            page,
            per_page: pageSize,
          },
          timeout: REQUEST_TIMEOUT_MS,
          headers: {
            'User-Agent': 'TenderRadar-Scraper/1.0',
            'Accept': 'application/json',
          },
        });
      } catch (axiosError) {
        const timedOut =
          axiosError.code === 'ECONNABORTED' ||
          Boolean(axiosError.message?.includes('timeout'));
        if (timedOut) {
          console.warn(`Timeout on page ${page} - API may be unavailable`);
          break;
        }
        throw axiosError;
      }

      const opportunities = extractList(response.data, 'opportunities');
      if (opportunities.length === 0) {
        console.log('No more opportunities found');
        break;
      }

      console.log(`Found ${opportunities.length} opportunities on page ${page}`);

      for (const opp of opportunities) {
        try {
          inserted += await insertOpportunity(opp);
        } catch (e) {
          console.error('Error inserting opportunity:', e.message);
        }
      }

      // A short page means there are no further pages.
      if (opportunities.length < pageSize) {
        break;
      }

      // Small delay between pages
      await new Promise((resolve) => setTimeout(resolve, PAGE_DELAY_MS));
    } catch (error) {
      console.error(`Error on page ${page}:`, error.message);
      // Try next page
    }
  }

  console.log(`DOS scraping complete, inserted ${inserted} records`);
  return inserted;
}

/**
 * Fetch the first page of published services and insert each one.
 * Request failures are logged as a warning rather than thrown.
 *
 * @returns {Promise<number>} Number of rows inserted.
 */
async function scrapeFromGCloudEndpoint() {
  let inserted = 0;

  console.log('Attempting to scrape G-Cloud services...');

  try {
    const url = 'https://api.digitalmarketplace.service.gov.uk/v0.1/services';
    const response = await axios.get(url, {
      params: {
        status: 'published',
        page: 1,
        per_page: 100,
      },
      timeout: REQUEST_TIMEOUT_MS,
    });

    const services = extractList(response.data, 'services');
    if (services.length > 0) {
      console.log(`Found ${services.length} G-Cloud services`);

      for (const service of services) {
        try {
          inserted += await insertService(service);
        } catch (e) {
          console.error('Error inserting service:', e.message);
        }
      }
    }
  } catch (e) {
    console.warn('G-Cloud endpoint unavailable:', e.message);
  }

  return inserted;
}

/**
 * Map a raw opportunity record onto the `tenders` columns and insert it.
 *
 * @param {object} opp - Raw opportunity record from the API.
 * @returns {Promise<number>} 1 when inserted; 0 when the record has no id or already exists.
 * @throws Re-throws database errors other than a unique violation.
 */
async function insertOpportunity(opp) {
  const oppId = opp.id || opp.ID || opp.opportunity_id;
  if (!oppId) {
    return 0;
  }

  const title = clipText(opp.title || opp.name || 'Untitled', 500);
  const description = clipText(opp.description || opp.brief || '', 5000);
  const summary = clipText(opp.summary || description, 500);
  const authorityName = clipText(
    opp.organisation?.name || opp.buyer?.name || opp.organisationName || 'Digital Marketplace',
    255,
  );
  const location = clipText(opp.location || opp.workingArrangements || 'UK', 255);
  const { valueLow, valueHigh } = parseBudget(opp);

  return insertTender({
    source: 'digital_marketplace',
    sourceId: `dm-${oppId}`,
    title,
    description,
    summary,
    cpvCodes: opp.specialistRole ? [opp.specialistRole] : (opp.cpv_codes || []),
    valueLow,
    valueHigh,
    currency: 'GBP',
    publishedDate:
      opp.publishedAt || opp.published_at || opp.createdAt || new Date().toISOString(),
    // Normalise a missing or empty deadline to null instead of sending ''.
    deadline: opp.applicationsClosedAt || opp.closing_date || opp.deadline || null,
    authorityName,
    authorityType: 'government',
    location,
    documentsUrl: '',
    noticeUrl:
      opp.link ||
      opp.url ||
      `https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities/${oppId}`,
    status: 'open',
    sector: classifySector(title, description, authorityName),
  });
}

/**
 * Map a raw service record onto the `tenders` columns and insert it.
 *
 * @param {object} service - Raw service record from the API.
 * @returns {Promise<number>} 1 when inserted; 0 when the record has no id or already exists.
 * @throws Re-throws database errors other than a unique violation.
 */
async function insertService(service) {
  const serviceId = service.id || service.service_id;
  if (!serviceId) {
    return 0;
  }

  const title = clipText(service.serviceName || service.name || 'Untitled', 500);
  const description = clipText(service.serviceDescription || service.description || '', 5000);
  const supplierName = clipText(service.supplierName || 'Digital Marketplace', 255);

  return insertTender({
    source: 'digital_marketplace',
    sourceId: `dm-gcloud-${serviceId}`,
    title,
    description,
    summary: description.substring(0, 500),
    cpvCodes: [],
    valueLow: null,
    valueHigh: null,
    currency: 'GBP',
    publishedDate: service.createdAt || new Date().toISOString(),
    deadline: null,
    authorityName: supplierName,
    authorityType: 'supplier',
    location: 'UK',
    documentsUrl: '',
    noticeUrl: `https://www.digitalmarketplace.service.gov.uk/g-cloud/services/${serviceId}`,
    status: 'open',
    sector: classifySector(title, description, supplierName),
  });
}

scrapeTenders();