import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();

// Shared Postgres pool; falls back to local dev credentials when DATABASE_URL is unset.
const pool = new pg.Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot',
});

// Pre-configured HTTP client with a polite, identifying User-Agent.
const client = axios.create({
  timeout: 15000,
  maxRedirects: 5,
  headers: { 'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator)' },
});

// Small helper for rate-limit pauses between requests.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

// Build an absolute eTendersNI URL from a possibly-relative href.
function absoluteUrl(href) {
  if (href.startsWith('http')) return href;
  return 'https://etendersni.gov.uk' + (href.startsWith('/') ? href : '/epps/' + href);
}

/**
 * Parse a scraped date string into an ISO-8601 timestamp, or null when unparseable.
 *
 * FIX: the original passed raw strings straight to `new Date(...)`, whose
 * handling of non-ISO formats is implementation-defined. UK slash dates
 * ("DD/MM/YYYY") were read as US "MM/DD/YYYY" — "05/03/2024" became 3 May
 * instead of 5 March, and "25/12/2024" became Invalid Date (deadline lost).
 * Day-first numeric dates are now parsed explicitly and validated.
 *
 * @param {string} dateStr - raw date text scraped from the page
 * @returns {string|null} ISO timestamp, or null if the input is not a valid date
 */
function parseDate(dateStr) {
  if (!dateStr || dateStr.trim() === '') return null;
  try {
    // UK-style numeric date: day first, '/' or '-' separators, 2- or 4-digit year.
    const ukMatch = dateStr.trim().match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{2,4})$/);
    if (ukMatch) {
      const day = Number(ukMatch[1]);
      const month = Number(ukMatch[2]);
      let year = Number(ukMatch[3]);
      if (year < 100) year += 2000; // two-digit years assumed 20xx
      const date = new Date(Date.UTC(year, month - 1, day));
      // Reject impossible dates such as 31/02 — Date would silently roll them over.
      if (
        date.getUTCFullYear() !== year ||
        date.getUTCMonth() !== month - 1 ||
        date.getUTCDate() !== day
      ) {
        return null;
      }
      return date.toISOString();
    }
    // Anything else (ISO strings, long-form dates): fall back to Date parsing.
    const date = new Date(dateStr);
    if (isNaN(date.getTime())) return null;
    return date.toISOString();
  } catch (e) {
    return null;
  }
}

/**
 * Collapse runs of whitespace to single spaces and trim the ends.
 * (The original chained a regex trim and `.trim()` — the redundant
 * regex pass has been dropped; output is unchanged.)
 *
 * @param {string} text
 * @returns {string} normalized text, '' for falsy input
 */
function cleanText(text) {
  if (!text) return '';
  return text.replace(/\s+/g, ' ').trim();
}

/**
 * Scrape one paginated list page of open tenders from eTendersNI, fetch each
 * tender's detail page, and insert the results into the `tenders` table.
 * Per-tender failures are logged and skipped; a list-page failure returns
 * zero counts rather than throwing.
 *
 * @param {number} [pageNum=1] - 1-based list page number
 * @returns {Promise<{pageNum: number, insertedCount: number, tenderCount: number}>}
 */
async function scrapePage(pageNum = 1) {
  try {
    // Fetch list page with pagination
    const listUrl = `https://etendersni.gov.uk/epps/home.do?page=${pageNum}&status=open`;
    console.log(`[${new Date().toISOString()}] Fetching page ${pageNum}: ${listUrl}`);
    const listResp = await client.get(listUrl);
    const $ = cheerio.load(listResp.data);

    // Extract entryIds and titles from the list; de-dupe on entryId since the
    // same tender can be linked more than once on the page.
    const tenders = [];
    const processedIds = new Set();
    $('a[href*="entryId"]').each((i, el) => {
      const href = $(el).attr('href');
      const text = $(el).text().trim();
      if (!href || !text) return;
      const match = href.match(/entryId=(\d+)/);
      if (match) {
        const id = match[1];
        if (!processedIds.has(id)) {
          processedIds.add(id);
          tenders.push({
            id,
            titleSnippet: text.substring(0, 200),
            detailUrl: absoluteUrl(href),
          });
        }
      }
    });

    console.log(`Found ${tenders.length} tenders on page ${pageNum}`);

    let insertedCount = 0;
    // Fetch detail page for each tender
    for (const tender of tenders) {
      try {
        console.log(` Fetching tender ${tender.id}...`);
        const detailResp = await client.get(tender.detailUrl);
        const d$ = cheerio.load(detailResp.data);

        // Extract tender details from detail page
        let title = tender.titleSnippet;
        let description = '';
        let summary = '';
        let deadline = null;
        let value = null;
        let authority = 'Unknown';
        let location = 'Northern Ireland';
        let documentsUrl = '';
        let cpvCodes = [];

        const text = d$('body').text();

        // Heuristic extraction: scan text-bearing elements for deadline,
        // value and authority patterns. First deadline/value match wins;
        // the last short authority-looking line wins.
        d$('div, p, span, td, li').each((i, el) => {
          const content = d$(el).text().trim();
          // Try to find deadline
          if (!deadline && content.match(/deadline|closing\s+date|deadline\s+date/i)) {
            const dateMatch = content.match(/(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})/);
            if (dateMatch) {
              const parsed = parseDate(dateMatch[1]);
              if (parsed) deadline = parsed;
            }
          }
          // Try to find value
          if (!value && content.match(/value|budget|estimate|worth|£|GBP/i)) {
            const valueMatch = content.match(/[£\$€]?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)/);
            if (valueMatch) {
              value = parseFloat(valueMatch[1].replace(/,/g, ''));
            }
          }
          // Try to find authority/department
          if (
            content.match(/department|authority|council|agency|body|organisation/i) &&
            content.length < 200
          ) {
            const cleanContent = cleanText(content);
            if (cleanContent.length > 5 && cleanContent.length < 150) {
              authority = cleanContent;
            }
          }
        });

        // Get title from page header
        const pageTitle = d$('h1, h2, .page-title, [class*="title"]').first().text().trim();
        if (pageTitle && pageTitle.length > 0 && pageTitle.length < 500) {
          title = pageTitle;
        }
        description = cleanText(text.substring(0, 1000));
        summary = cleanText(title);

        // Find documents link if available (first hit wins; `return false`
        // stops the cheerio iteration)
        d$('a[href*="download"], a[href*="document"], a[href*="file"]').each((i, el) => {
          const href = d$(el).attr('href');
          if (href && !documentsUrl) {
            documentsUrl = absoluteUrl(href);
            return false;
          }
        });

        // Insert into database (parameterized query; duplicates are skipped
        // via ON CONFLICT on source_id)
        await pool.query(
          `INSERT INTO tenders ( source, source_id, title, description, summary, cpv_codes, value_low, value_high, currency, published_date, deadline, authority_name, authority_type, location, documents_url, notice_url, status, sector ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18) ON CONFLICT (source_id) DO NOTHING`,
          [
            'etendersni',
            `etendersni_${tender.id}`,
            title.substring(0, 500) || 'Untitled Tender',
            description,
            summary.substring(0, 500),
            cpvCodes,
            value,
            value,
            'GBP',
            new Date().toISOString(),
            deadline,
            authority.substring(0, 255),
            'government',
            location.substring(0, 255),
            documentsUrl,
            tender.detailUrl,
            // No (or past) deadline means the tender is treated as closed.
            deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
            classifySector(title, description, authority),
          ]
        );
        insertedCount++;
        console.log(` ✓ Inserted tender ${tender.id}`);

        // Rate limiting
        await sleep(500);
      } catch (e) {
        console.error(` Error processing tender ${tender.id}: ${e.message}`);
      }
    }

    return { pageNum, insertedCount, tenderCount: tenders.length };
  } catch (error) {
    console.error(`Error scraping page ${pageNum}:`, error.message);
    return { pageNum, insertedCount: 0, tenderCount: 0 };
  }
}

/**
 * Entry point: walk list pages until one comes back empty (or the 10-page
 * cap is hit), then always close the connection pool so the process exits.
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting eTendersNI scrape...`);
    let totalInserted = 0;
    let pageNum = 1;
    let lastPageHadTenders = true;
    // Scrape pages until we find one with no tenders (or max 10 pages)
    while (lastPageHadTenders && pageNum <= 10) {
      const result = await scrapePage(pageNum);
      totalInserted += result.insertedCount;
      lastPageHadTenders = result.tenderCount > 0;
      pageNum++;
      // Avoid rate limiting
      await sleep(1000);
    }
    console.log(`[${new Date().toISOString()}] eTendersNI scrape complete. Inserted ${totalInserted} tenders`);
  } catch (error) {
    console.error('Fatal error:', error.message);
  } finally {
    // Release DB connections even on fatal error so Node can exit cleanly.
    await pool.end();
  }
}

scrapeTenders();