import { pathToFileURL } from 'node:url';
import { chromium } from 'playwright';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();

// NOTE(review): the fallback connection string embeds credentials in source;
// presumably intended for local development only — confirm before deploying.
const pool = new pg.Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot',
});

/** Resolves after `ms` milliseconds. */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/** Number of result pages requested per search query. */
const MAX_PAGES = 5;

/** Tenders whose deadline is closer than this (or already past) are skipped. */
const MIN_DEADLINE_LEAD_MS = 24 * 60 * 60 * 1000;

/**
 * Search queries run against TED. `query` is already in URL query-string form
 * and is interpolated into the search URL as-is.
 */
const SEARCHES = [
  { query: 'united+kingdom', label: 'UK general' },
  { query: 'great+britain', label: 'GB' },
  { query: 'england+OR+scotland+OR+wales', label: 'Regions' },
  // High-value sectors
  { query: 'infrastructure+united+kingdom', label: 'Infrastructure' },
  { query: 'construction+united+kingdom', label: 'Construction' },
];

/** Matches a DD/MM/YYYY date anywhere in a string. */
const TED_DATE_PATTERN = /(\d{2})\/(\d{2})\/(\d{4})/;

/**
 * Parses the first DD/MM/YYYY date found in `text`.
 *
 * @param {string} text - Raw cell text scraped from the results table.
 * @returns {string|null} ISO-8601 timestamp at midnight UTC of that date, or
 *   `null` when no date is present or the digits do not form a real calendar
 *   date (e.g. month 13 or 31 February).
 */
function parseTedDate(text) {
  const match = text ? TED_DATE_PATTERN.exec(text) : null;
  if (!match) {
    return null;
  }

  const day = Number(match[1]);
  const month = Number(match[2]);
  const year = Number(match[3]);

  // Date.UTC silently rolls out-of-range values over (31 Feb -> 3 Mar), so
  // confirm the components survive the round trip before accepting the date.
  const parsed = new Date(Date.UTC(year, month - 1, day));
  const isRealDate =
    parsed.getUTCFullYear() === year &&
    parsed.getUTCMonth() === month - 1 &&
    parsed.getUTCDate() === day;

  return isRealDate ? parsed.toISOString() : null;
}

/**
 * TED EU scraper.
 *
 * For each entry in SEARCHES, loads up to MAX_PAGES result pages from
 * ted.europa.eu, extracts table rows that mention the UK, and inserts each
 * previously unseen tender into the `tenders` table
 * (`ON CONFLICT (source_id) DO NOTHING`).
 *
 * Rows are skipped when:
 *  - the same notice id was already seen in this run,
 *  - no valid DD/MM/YYYY deadline can be parsed, or
 *  - the deadline is in the past or less than 24 hours away.
 *
 * Value fields (`value_low` / `value_high`) are inserted as NULL.
 *
 * Errors are logged rather than thrown. The browser is closed and the
 * module-level pg pool is ended when the run finishes, so the pool cannot be
 * reused after this function returns.
 *
 * @returns {Promise<void>}
 */
async function scrapeTenders() {
  let browser;

  try {
    console.log(`[${new Date().toISOString()}] Starting IMPROVED TED EU scraper...`);

    browser = await chromium.launch({ headless: true });
    const page = await browser.newPage();

    let totalInserted = 0;
    // Source ids handled during this run; the searches overlap heavily.
    const seenIds = new Set();

    for (const search of SEARCHES) {
      console.log(`\n=== Searching: ${search.label} ===`);

      for (let pageNum = 1; pageNum <= MAX_PAGES; pageNum++) {
        try {
          const url = `https://ted.europa.eu/en/search/result?q=${search.query}&page=${pageNum}`;
          console.log(`Fetching page ${pageNum}/${MAX_PAGES}...`);

          await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 });
          await delay(2000);

          // Extract tender data from table rows. This callback runs inside the
          // browser page, so it must not reference anything from module scope.
          const tenders = await page.evaluate(() => {
            const ukPhrases = [
              'united kingdom',
              'great britain',
              'england',
              'scotland',
              'wales',
            ];
            // "uk" must match as a whole word; a plain substring test would
            // also accept rows that merely mention e.g. "Ukraine".
            const ukWord = /\buk\b/;

            const results = [];

            for (const row of document.querySelectorAll('tbody tr')) {
              try {
                const link = row.querySelector('a[href*="/notice/"]');
                if (!link) {
                  continue;
                }

                const cells = row.querySelectorAll('td');
                if (cells.length < 4) {
                  continue;
                }

                // Only include rows that mention the UK / GB / a home nation.
                const rowText = row.textContent.toLowerCase();
                const mentionsUk =
                  ukWord.test(rowText) ||
                  ukPhrases.some((phrase) => rowText.includes(phrase));
                if (!mentionsUk) {
                  continue;
                }

                results.push({
                  noticeId: link.textContent.trim(),
                  href: link.href,
                  title: cells[2]?.textContent.trim() || '',
                  country: cells[3]?.textContent.trim() || '',
                  publishedDate: cells[4]?.textContent.trim() || '',
                  deadline: cells[5]?.textContent.trim() || '',
                  fullText: row.textContent.substring(0, 1000),
                });
              } catch {
                // Best effort: a malformed row is skipped, not fatal.
              }
            }

            return results;
          });

          console.log(`  Found ${tenders.length} UK-related tenders`);

          if (tenders.length === 0) {
            console.log(`  No results on page ${pageNum}, stopping this search`);
            break;
          }

          let insertedThisPage = 0;

          for (const tender of tenders) {
            try {
              const sourceId = `TED-${tender.noticeId}`;

              // Skip duplicates (from multiple searches)
              if (seenIds.has(sourceId)) {
                continue;
              }
              seenIds.add(sourceId);

              const noticeUrl = tender.href;
              const title = tender.title.substring(0, 500);
              const description = tender.fullText || title;

              // Dates arrive as DD/MM/YYYY; unparseable values become null.
              const publishedDate = parseTedDate(tender.publishedDate);
              const deadline = parseTedDate(tender.deadline);

              // Skip if no deadline
              if (!deadline) {
                continue;
              }

              // Skip expired tenders and those closing within 24 hours (the
              // single comparison covers both, since the cutoff is after now).
              const earliestAcceptable = new Date(Date.now() + MIN_DEADLINE_LEAD_MS);
              if (new Date(deadline) < earliestAcceptable) {
                continue;
              }

              const sector = await classifySector(title, description);

              const result = await pool.query(
                `INSERT INTO tenders (
                  source, source_id, title, description, summary,
                  cpv_codes, value_low, value_high, currency,
                  published_date, deadline, authority_name, authority_type,
                  location, documents_url, notice_url, status, sector
                ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
                ON CONFLICT (source_id) DO NOTHING
                RETURNING id`,
                [
                  'ted_eu',
                  sourceId,
                  title,
                  description.substring(0, 2000),
                  description.substring(0, 500),
                  [],
                  null,
                  null,
                  'EUR',
                  publishedDate ?? new Date().toISOString(),
                  deadline,
                  'EU Tender Authority',
                  'Public Sector',
                  tender.country || 'United Kingdom',
                  '',
                  noticeUrl,
                  'open',
                  sector,
                ],
              );

              // RETURNING yields a row only when the insert actually happened.
              if (result.rows.length > 0) {
                insertedThisPage++;
                totalInserted++;

                if (totalInserted % 5 === 0) {
                  console.log(`  ${totalInserted} total inserted...`);
                }
              }
            } catch (itemError) {
              console.error(`  Error processing tender: ${itemError.message}`);
              continue;
            }
          }

          console.log(`  Inserted ${insertedThisPage} new tenders from this page`);
          await delay(2000);
        } catch (pageError) {
          // A failed page aborts the remaining pages of this search only.
          console.error(`  Error fetching page ${pageNum}: ${pageError.message}`);
          break;
        }
      }
    }

    console.log(`\n=== TED EU SCRAPE COMPLETE ===`);
    console.log(`Total unique tenders found: ${seenIds.size}`);
    console.log(`Inserted: ${totalInserted}`);
    console.log(`Completion time: ${new Date().toISOString()}`);
  } catch (error) {
    console.error('TED EU scraper failed:', error);
  } finally {
    // Nested so the pool is ended even if closing the browser rejects.
    try {
      if (browser) {
        await browser.close();
      }
    } finally {
      await pool.end();
    }
  }
}

// Run if called directly. pathToFileURL produces the same encoded file: URL
// form that import.meta.url uses, including for Windows paths and paths with
// characters that need percent-encoding.
const entryPath = process.argv[1];
if (entryPath && import.meta.url === pathToFileURL(entryPath).href) {
  scrapeTenders().catch((error) => {
    // Only reachable when cleanup in `finally` rejects.
    console.error('TED EU scraper failed:', error);
    process.exitCode = 1;
  });
}

export { scrapeTenders };