import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();

// NOTE(review): the fallback connection string embeds credentials; it looks
// like a local-development default — confirm it is never relied on in
// production and prefer requiring DATABASE_URL there.
const pool = new pg.Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot',
});

/** Search results page that is fetched and scanned for tender links. */
const SEARCH_URL = 'https://www.sell2wales.gov.wales/search/Search_MainPage.aspx';

/** Prefix used to build the canonical notice URL from a notice ID. */
const NOTICE_URL_PREFIX = 'https://sell2wales.gov.wales/search/search_view.aspx?ID=';

/**
 * Convert a date string into an ISO-8601 timestamp.
 *
 * Strings of the form `dd/mm/yyyy` (day first) are interpreted as midnight
 * UTC on that calendar day. Anything else is handed to the `Date`
 * constructor as a best-effort fallback.
 *
 * @param {string|null|undefined} dateStr - Raw date text.
 * @returns {string|null} ISO string, or `null` when the input is empty,
 *   not a string, or does not describe a real date.
 */
function parseDate(dateStr) {
  if (typeof dateStr !== 'string') return null;

  // Trim before matching: otherwise padded day-first dates would fall through
  // to the generic parser, which reads "05/06/2026" as month/day.
  const trimmed = dateStr.trim();
  if (trimmed === '') return null;

  const dayFirst = trimmed.match(/^(\d{1,2})\/(\d{1,2})\/(\d{4})$/);
  if (dayFirst) {
    const day = Number(dayFirst[1]);
    const month = Number(dayFirst[2]);
    const year = Number(dayFirst[3]);
    const date = new Date(Date.UTC(year, month - 1, day));

    // Date.UTC silently rolls impossible dates over (31/02 -> 03/03), so
    // confirm the components survived the round trip.
    const isRealDate =
      date.getUTCFullYear() === year &&
      date.getUTCMonth() === month - 1 &&
      date.getUTCDate() === day;
    return isRealDate ? date.toISOString() : null;
  }

  const date = new Date(trimmed);
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}

/**
 * Return the trimmed first capture group of `pattern` in `text`.
 *
 * @param {string} text - Text to search.
 * @param {RegExp} pattern - Pattern with at least one capture group.
 * @returns {string|null} Captured text, or `null` when there is no match.
 */
function matchField(text, pattern) {
  const match = text.match(pattern);
  return match ? match[1].trim() : null;
}

/**
 * Extract the integer amount following a "Value:" label.
 *
 * Tolerates an optional pound sign and thousands separators so that
 * "Value: 1,000,000" is not truncated to 1.
 *
 * @param {string} text - Text to search.
 * @returns {number|null} Parsed amount, or `null` when absent.
 */
function parseValue(text) {
  const match = text.match(/Value:\s*£?\s*(\d[\d,]*)/i);
  if (!match) return null;

  const amount = Number.parseInt(match[1].replace(/,/g, ''), 10);
  return Number.isNaN(amount) ? null : amount;
}

/**
 * Build a tender record from one result link and its surrounding container.
 *
 * @param {object} link - Cheerio selection wrapping a single anchor element.
 * @param {string} href - The anchor's `href` attribute.
 * @param {number} index - Position of the link in the result list; only used
 *   to build a fallback identifier.
 * @returns {object|null} Field values for the insert, or `null` when the
 *   link has no text to use as a title.
 */
function extractTender(link, href, index) {
  const title = link.text().trim();
  if (!title) return null;

  const idMatch = href.match(/ID=([A-Z0-9]+)/i);
  // NOTE(review): the timestamp fallback is not stable across runs, so rows
  // that rely on it cannot be de-duplicated by ON CONFLICT.
  const sourceId = idMatch ? idMatch[1] : `s2w_${Date.now()}_${index}`;
  // Without a real ID, link to the resolved href instead of building a URL
  // around the synthetic identifier (which would be a dead link).
  const noticeUrl = idMatch
    ? `${NOTICE_URL_PREFIX}${idMatch[1]}`
    : new URL(href, SEARCH_URL).href;

  // The nearest enclosing div/li/tr is treated as the tender's result row.
  const container = link.closest('div, li, tr');
  const containerText = container.text();

  const reference = matchField(containerText, /Reference no:\s*([A-Z0-9]+)/i) || sourceId;
  const authority = matchField(containerText, /Published by:\s*([^\n]+)/i) || 'Unknown';
  const noticeType = matchField(containerText, /Notice Type:\s*([^\n]+)/i) || '';
  const location = matchField(containerText, /Location:\s*([^\n#]+)/i) || 'Wales';
  const publishedDate = parseDate(
    matchField(containerText, /Publication date:\s*(\d{2}\/\d{2}\/\d{4})/i),
  );
  const deadline = parseDate(
    matchField(containerText, /Deadline date:\s*(\d{2}\/\d{2}\/\d{4})/i),
  );
  const value = parseValue(containerText);

  // Use the first paragraph in the container as the description; fall back
  // to the notice type or title when it is missing or too short.
  let description = '';
  const firstParagraph = container.find('p').first();
  if (firstParagraph.length > 0) {
    description = firstParagraph.text().trim();
  }
  if (description.length < 10) {
    description = noticeType || title;
  }

  return {
    reference,
    title,
    description,
    value,
    publishedDate,
    deadline,
    authority,
    location,
    noticeUrl,
  };
}

/**
 * Insert one tender row; rows whose `source_id` already exists are skipped.
 *
 * @param {object} tender - Record produced by `extractTender`.
 * @returns {Promise<number>} Number of rows actually inserted (0 or 1).
 */
async function insertTender(tender) {
  const isOpen = tender.deadline && new Date(tender.deadline) > new Date();

  const result = await pool.query(
    `INSERT INTO tenders (
      source, source_id, title, description, summary, cpv_codes,
      value_low, value_high, currency, published_date, deadline,
      authority_name, authority_type, location, documents_url, notice_url, status, sector
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
    ON CONFLICT (source_id) DO NOTHING`,
    [
      'sell2wales',
      tender.reference,
      tender.title.substring(0, 500),
      tender.description.substring(0, 1000),
      tender.description.substring(0, 500),
      [],
      tender.value,
      tender.value,
      'GBP',
      tender.publishedDate,
      tender.deadline,
      tender.authority.substring(0, 255),
      'government',
      tender.location.substring(0, 255),
      '',
      tender.noticeUrl,
      isOpen ? 'open' : 'closed',
      classifySector(tender.title, tender.description, tender.authority),
    ],
  );

  // rowCount is 0 when ON CONFLICT DO NOTHING skipped the row.
  return result.rowCount ?? 0;
}

/**
 * Fetch the Sell2Wales search page, extract every linked tender, and store
 * the ones not already present in the `tenders` table.
 *
 * Failures on a single tender are logged and do not stop the run. The
 * connection pool is closed when the run finishes, whether or not it
 * succeeded.
 *
 * @returns {Promise<void>}
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting Sell2Wales scrape...`);

    const response = await axios.get(SEARCH_URL, {
      timeout: 30000,
      headers: {
        'User-Agent':
          'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)',
      },
    });
    const $ = cheerio.load(response.data);

    // The attribute value is quoted because it contains '.', '?' and '=',
    // which are not valid in an unquoted CSS attribute value.
    const tenderLinks = $('a[href*="search_view.aspx?ID="]');
    console.log(`Found ${tenderLinks.length} potential tenders`);

    // The same notice can be linked more than once on the page.
    const seenHrefs = new Set();
    let insertedCount = 0;

    for (let i = 0; i < tenderLinks.length; i++) {
      try {
        const link = tenderLinks.eq(i);
        const href = link.attr('href');
        if (!href || seenHrefs.has(href)) continue;
        seenHrefs.add(href);

        const tender = extractTender(link, href, i);
        if (!tender) continue;

        insertedCount += await insertTender(tender);
      } catch (e) {
        console.error('Error inserting tender:', e.message);
      }
    }

    console.log(
      `[${new Date().toISOString()}] Sell2Wales scrape complete. ` +
        `Inserted ${insertedCount} new tenders (${seenHrefs.size} unique links processed)`,
    );
  } catch (error) {
    console.error('Error scraping Sell2Wales:', error.message);
  } finally {
    await pool.end();
  }
}

// scrapeTenders handles its own errors; this only catches a failure while
// closing the pool so the promise is never left unhandled.
scrapeTenders().catch((error) => {
  console.error('Fatal error in Sell2Wales scraper:', error.message);
  process.exitCode = 1;
});