import { createHash } from 'node:crypto';

import axios from 'axios';
import * as cheerio from 'cheerio';
import dotenv from 'dotenv';
import pg from 'pg';

import { classifySector } from './classify-sector.js';

dotenv.config();

// NOTE(review): the fallback connection string embeds local credentials;
// confirm it is only ever used for local development.
const pool = new pg.Pool({
  connectionString:
    process.env.DATABASE_URL ||
    'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot',
});

/** Search page that is scraped; also the base for resolving notice links. */
const SEARCH_URL =
  'https://www.publiccontractsscotland.gov.uk/search/Search_MainPage.aspx';

/** Full English month names, indexed 0-11 to match `Date.UTC` month indices. */
const MONTH_NAMES = [
  'january',
  'february',
  'march',
  'april',
  'may',
  'june',
  'july',
  'august',
  'september',
  'october',
  'november',
  'december',
];

/**
 * Map an English month name or abbreviation (at least three letters, any
 * case) to its zero-based month index.
 *
 * @param {string} name - e.g. "Mar", "Sept", "march".
 * @returns {number} 0-11, or -1 when the name is not recognised.
 */
function monthIndexFromName(name) {
  const token = name.toLowerCase();
  if (token.length < 3) return -1;
  return MONTH_NAMES.findIndex((full) => full.startsWith(token));
}

/**
 * Build an ISO-8601 string for midnight UTC of the given calendar date.
 *
 * @param {number} year - Full year.
 * @param {number} monthIndex - Zero-based month (0-11).
 * @param {number} day - Day of month.
 * @returns {string|null} ISO string, or null if the components do not form a
 *   real calendar date (e.g. 31 February, month index -1).
 */
function toUtcIsoString(year, monthIndex, day) {
  const date = new Date(Date.UTC(year, monthIndex, day));
  // Date.UTC silently rolls out-of-range components over into the next
  // month/year; a round-trip comparison detects and rejects that.
  const isExact =
    date.getUTCFullYear() === year &&
    date.getUTCMonth() === monthIndex &&
    date.getUTCDate() === day;
  return isExact ? date.toISOString() : null;
}

/**
 * Parse a date string from the listing into an ISO-8601 timestamp.
 *
 * Supported formats:
 *  - `dd/mm/yyyy` (e.g. `13/02/2026`)
 *  - `dd-Mon-yy`  (e.g. `16-Mar-26`; the year is taken to be 20yy)
 *  - anything else the runtime's `Date` constructor accepts (fallback)
 *
 * The two explicit formats are interpreted as midnight UTC so the result
 * does not depend on the host time zone.
 *
 * @param {string} dateStr - Raw date text.
 * @returns {string|null} ISO-8601 string, or null when the input is empty,
 *   not a string, or not a valid date.
 */
function parseDate(dateStr) {
  if (typeof dateStr !== 'string') return null;
  const text = dateStr.trim();
  if (text === '') return null;

  const numeric = text.match(/^(\d{1,2})\/(\d{1,2})\/(\d{4})$/);
  if (numeric) {
    const [, day, month, year] = numeric;
    return toUtcIsoString(Number(year), Number(month) - 1, Number(day));
  }

  const named = text.match(/^(\d{1,2})-(\w+)-(\d{2})$/);
  if (named) {
    const [, day, monthName, shortYear] = named;
    return toUtcIsoString(
      2000 + Number(shortYear),
      monthIndexFromName(monthName),
      Number(day),
    );
  }

  // Last resort: runtime-defined parsing of whatever else was supplied.
  const date = new Date(text);
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}

/**
 * Strip link-accessibility artifacts from a scraped title.
 *
 * @param {string} title - Raw link text.
 * @returns {string} Title without "(Opens in new tab/window)" and trimmed.
 */
function cleanTitle(title) {
  return title
    .replace(/\s*\(Opens in new tab\)\s*/gi, '')
    .replace(/\s*\(Opens in new window\)\s*/gi, '')
    .trim();
}

/**
 * Derive a deterministic source id for a notice that shows no reference
 * number, so repeated scrapes hit the same `ON CONFLICT (source_id)` row
 * instead of inserting a duplicate on every run.
 *
 * @param {string} noticeUrl - Absolute notice URL (may be empty).
 * @param {string} title - Cleaned notice title.
 * @returns {string} `pcs_` followed by 32 hex characters.
 */
function fallbackSourceId(noticeUrl, title) {
  const digest = createHash('sha256')
    .update(`${noticeUrl}\n${title}`)
    .digest('hex');
  return `pcs_${digest.slice(0, 32)}`;
}

/**
 * Extract one tender record from a result-table row.
 *
 * @param {object} row - Cheerio selection wrapping a single `<tr>`.
 * @returns {object|null} Field values for the insert, or null when the row
 *   has no cells or no usable title.
 */
function extractTender(row) {
  const cells = row.find('td');
  if (cells.length === 0) return null;

  const dateText = cells.eq(0).text().trim();
  const detailsCell = cells.eq(1);
  const titleLink = detailsCell.find('a').first();
  const title = cleanTitle(titleLink.text().trim());
  if (!title) return null;

  // Resolve against the page URL so relative and root-relative hrefs both
  // produce a valid absolute link.
  const href = titleLink.attr('href');
  const noticeUrl = href ? new URL(href, SEARCH_URL).href : '';

  const detailsText = detailsCell.text();
  const refMatch = detailsText.match(/Reference No:\s*([A-Z0-9]+)/);
  const authorityMatch = detailsText.match(/Published By:\s*([^\n]+)/);
  const deadlineMatch = detailsText.match(
    /Deadline Date:\s*(\d{2}-\w+-\d{2})/,
  );
  const noticeTypeMatch = detailsText.match(/Notice Type:\s*([^\n]+)/);

  const authority = authorityMatch ? authorityMatch[1].trim() : 'Unknown';
  const noticeType = noticeTypeMatch ? noticeTypeMatch[1].trim() : '';
  const deadline = deadlineMatch ? parseDate(deadlineMatch[1]) : null;

  return {
    sourceId: refMatch ? refMatch[1] : fallbackSourceId(noticeUrl, title),
    title,
    noticeType,
    publishedDate: parseDate(dateText),
    deadline,
    authority,
    noticeUrl,
    // A tender with no parseable deadline is recorded as closed.
    status: deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
  };
}

/**
 * Insert a tender, or update the existing row with the same `source_id`.
 *
 * NOTE(review): on conflict only title, description, summary and sector are
 * refreshed; deadline and status keep their first-seen values — confirm that
 * is intended.
 *
 * @param {object} tender - Record produced by `extractTender`.
 * @returns {Promise<void>}
 */
async function upsertTender(tender) {
  await pool.query(
    `INSERT INTO tenders (
      source, source_id, title, description, summary, cpv_codes,
      value_low, value_high, currency, published_date, deadline,
      authority_name, authority_type, location, documents_url, notice_url,
      status, sector
    ) VALUES (
      $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16,
      $17, $18
    )
    ON CONFLICT (source_id) DO UPDATE SET
      title = EXCLUDED.title,
      description = EXCLUDED.description,
      summary = EXCLUDED.summary,
      sector = EXCLUDED.sector`,
    [
      'pcs_scotland',
      tender.sourceId,
      tender.title.substring(0, 500),
      tender.noticeType,
      tender.noticeType.substring(0, 500),
      [],
      null,
      null,
      'GBP',
      tender.publishedDate,
      tender.deadline,
      tender.authority,
      'government',
      'Scotland',
      '',
      tender.noticeUrl,
      tender.status,
      classifySector(tender.title, tender.noticeType, tender.authority),
    ],
  );
}

/**
 * Fetch the Public Contracts Scotland search page, extract every listed
 * tender and upsert it into the `tenders` table.
 *
 * A failure on a single row is logged and skipped so the remaining rows are
 * still processed. The connection pool is closed when the run finishes,
 * whether or not it succeeded.
 *
 * @returns {Promise<void>}
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting PCS Scotland scrape...`);

    const response = await axios.get(SEARCH_URL, {
      timeout: 30000,
      headers: {
        'User-Agent':
          'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)',
      },
    });

    const $ = cheerio.load(response.data);

    // A row is a tender row when it links to a notice detail page.
    const tenderRows = $('table tr')
      .filter((_, el) => $(el).find('a[href*="search_view.aspx"]').length > 0)
      .toArray();

    console.log(`Found ${tenderRows.length} tenders`);

    let insertedCount = 0;
    for (const rowElement of tenderRows) {
      try {
        const tender = extractTender($(rowElement));
        if (tender === null) continue;

        await upsertTender(tender);
        insertedCount++;
      } catch (e) {
        console.error('Error inserting tender:', e.message);
      }
    }

    console.log(
      `[${new Date().toISOString()}] PCS Scotland scrape complete. Inserted/updated ${insertedCount} tenders`,
    );
  } catch (error) {
    console.error('Error scraping PCS Scotland:', error.message);
  } finally {
    await pool.end();
  }
}

scrapeTenders();