154 lines
4.8 KiB
JavaScript
154 lines
4.8 KiB
JavaScript
|
|
import axios from 'axios';
|
||
|
|
import * as cheerio from 'cheerio';
|
||
|
|
import pg from 'pg';
|
||
|
|
import dotenv from 'dotenv';
|
||
|
|
|
||
|
|
// dotenv.config() runs before process.env.DATABASE_URL is read below, so the
// value may be supplied through a .env file.
dotenv.config();

// Connection string used when DATABASE_URL is unset or empty.
// NOTE(review): this fallback embeds a username and password in source —
// confirm it is only ever used for local development.
const FALLBACK_DATABASE_URL = 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot';

const connectionString = process.env.DATABASE_URL || FALLBACK_DATABASE_URL;

// PostgreSQL connection pool shared by the rest of this script.
const pool = new pg.Pool({ connectionString });
|
||
|
|
|
||
|
|
// Lower-case English month names; a month token matches when it is a prefix
// (at least three letters) of one of these, e.g. "mar", "sept", "march".
const MONTH_NAMES = [
  'january', 'february', 'march', 'april', 'may', 'june',
  'july', 'august', 'september', 'october', 'november', 'december',
];

/**
 * Build the ISO-8601 string for midnight UTC on the given calendar date.
 *
 * @param {number} year - Full year (e.g. 2026).
 * @param {number} monthIndex - Zero-based month (0 = January).
 * @param {number} day - Day of the month (1-31).
 * @returns {string|null} ISO string, or null when the components do not form
 *   a real calendar date (e.g. 31 February).
 */
function utcMidnightIso(year, monthIndex, day) {
  const date = new Date(Date.UTC(year, monthIndex, day));
  // Date.UTC silently rolls impossible dates over (31 Feb -> 3 Mar); reject
  // those by checking that the components survive the round trip.
  if (
    date.getUTCFullYear() !== year ||
    date.getUTCMonth() !== monthIndex ||
    date.getUTCDate() !== day
  ) {
    return null;
  }
  return date.toISOString();
}

/**
 * Parse a date string scraped from the listing page into an ISO-8601 string.
 *
 * Recognised formats, tried in order:
 *   1. "dd/mm/yyyy"  (e.g. "13/02/2026")            -> midnight UTC of that day
 *   2. "dd-Mon-yy"   (e.g. "16-Mar-26"; one-digit days, full month names and
 *      four-digit years are also accepted)          -> midnight UTC of that day
 *   3. anything else is handed to the built-in Date parser.
 *
 * Formats 1 and 2 are parsed explicitly instead of via `new Date(string)`,
 * because non-ISO string parsing is implementation-defined and uses the local
 * timezone, which can shift the resulting UTC date by a day.
 *
 * @param {string|null|undefined} dateStr - Raw date text; surrounding
 *   whitespace is ignored.
 * @returns {string|null} ISO-8601 timestamp, or null when the input is empty,
 *   not a string, or not a valid date.
 */
function parseDate(dateStr) {
  if (typeof dateStr !== 'string') return null;

  const text = dateStr.trim();
  if (text === '') return null;

  // Format 1: "13/02/2026" (day/month/year).
  const slashed = text.match(/^(\d{2})\/(\d{2})\/(\d{4})$/);
  if (slashed) {
    const [, day, month, year] = slashed;
    return utcMidnightIso(Number(year), Number(month) - 1, Number(day));
  }

  // Format 2: "16-Mar-26" (day-monthName-year).
  const dashed = text.match(/^(\d{1,2})-(\w+)-(\d{2}|\d{4})$/);
  if (dashed) {
    const [, day, monthText, yearText] = dashed;
    const monthToken = monthText.toLowerCase();
    const monthIndex = monthToken.length >= 3
      ? MONTH_NAMES.findIndex((name) => name.startsWith(monthToken))
      : -1;
    if (monthIndex === -1) return null;

    // Two-digit years are taken to be in the 2000s, as the original code did.
    const year = yearText.length === 2 ? 2000 + Number(yearText) : Number(yearText);
    return utcMidnightIso(year, monthIndex, Number(day));
  }

  // Fallback: let the engine try (reliable for ISO-8601 input).
  const parsed = new Date(text);
  if (Number.isNaN(parsed.getTime())) return null;
  return parsed.toISOString();
}
|
||
|
|
|
||
|
|
/**
 * Strip link-accessibility artifacts from a scraped tender title.
 *
 * Removes every "(Opens in new tab)" and "(Opens in new window)" marker
 * (case-insensitive) together with the whitespace around it, then trims the
 * result.
 *
 * @param {string} title - Raw title text.
 * @returns {string} The title without the markers.
 */
function cleanTitle(title) {
  // Applied in this order, one after the other.
  const artifactPatterns = [
    /\s*\(Opens in new tab\)\s*/gi,
    /\s*\(Opens in new window\)\s*/gi,
  ];

  let cleaned = title;
  for (const pattern of artifactPatterns) {
    cleaned = cleaned.replace(pattern, '');
  }
  return cleaned.trim();
}
|
||
|
|
|
||
|
|
/**
 * Scrape the Public Contracts Scotland search page and upsert each listed
 * notice into the `tenders` table.
 *
 * Flow:
 *   1. Fetch the search page with axios (30 s timeout, identifying User-Agent).
 *   2. Select every table row that contains a link to "search_view.aspx".
 *   3. For each row, read the published date from the first cell and the
 *      title, reference number, authority, deadline and notice type from the
 *      second cell, then INSERT the record. On a `source_id` conflict only
 *      `title`, `description` and `summary` are updated.
 *
 * A failure on one row is logged and does not stop the remaining rows. A
 * failure of the scrape as a whole is logged, not rethrown. The shared `pool`
 * is always closed at the end, so this function can run only once per process.
 *
 * @returns {Promise<void>}
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting PCS Scotland scrape...`);

    let insertedCount = 0;

    const url = 'https://www.publiccontractsscotland.gov.uk/search/Search_MainPage.aspx';

    const response = await axios.get(url, {
      timeout: 30000,
      headers: {
        'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)'
      }
    });

    const $ = cheerio.load(response.data);

    // Find all tender rows: any table row holding a link to a notice page.
    const tenderRows = $('table tr').filter((i, el) => {
      return $(el).find('a[href*="search_view.aspx"]').length > 0;
    });

    console.log(`Found ${tenderRows.length} tenders`);

    for (let i = 0; i < tenderRows.length; i++) {
      try {
        const row = tenderRows.eq(i);
        const cells = row.find('td');

        if (cells.length === 0) continue;

        // First cell: published date. Second cell: title link plus details.
        const dateText = cells.eq(0).text().trim();
        const detailsCell = cells.eq(1);

        const titleLink = detailsCell.find('a').first();
        const title = cleanTitle(titleLink.text().trim());

        if (!title) continue;

        // Resolve the link against the page URL so that root-relative,
        // page-relative and absolute hrefs all yield a valid notice URL.
        // If the anchor has no href at all, fall back to the listing page
        // instead of storing a URL that ends in "undefined".
        const href = titleLink.attr('href');
        const noticeUrl = href ? new URL(href, url).href : url;

        const detailsText = detailsCell.text();

        // Without a reference number the fallback id is built from the
        // current time and row index, so it is not stable across runs.
        const refMatch = detailsText.match(/Reference No:\s*([A-Z0-9]+)/);
        const sourceId = refMatch ? refMatch[1] : `pcs_${Date.now()}_${i}`;

        const authorityMatch = detailsText.match(/Published By:\s*([^\n]+)/);
        const authority = authorityMatch ? authorityMatch[1].trim() : 'Unknown';

        const deadlineMatch = detailsText.match(/Deadline Date:\s*(\d{2}-\w+-\d{2})/);
        const deadline = deadlineMatch ? parseDate(deadlineMatch[1]) : null;

        const noticeTypeMatch = detailsText.match(/Notice Type:\s*([^\n]+)/);
        const noticeType = noticeTypeMatch ? noticeTypeMatch[1].trim() : '';

        const publishedDate = parseDate(dateText);

        // A notice is "open" only when it has a deadline that is in the future.
        const status = deadline && new Date(deadline) > new Date() ? 'open' : 'closed';

        await pool.query(
          `INSERT INTO tenders (
            source, source_id, title, description, summary, cpv_codes,
            value_low, value_high, currency, published_date, deadline,
            authority_name, authority_type, location, documents_url, notice_url, status
          ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)
          ON CONFLICT (source_id) DO UPDATE SET
            title = EXCLUDED.title,
            description = EXCLUDED.description,
            summary = EXCLUDED.summary`,
          [
            'pcs_scotland',
            sourceId,
            title.substring(0, 500),
            noticeType,                   // description: the notice type text
            noticeType.substring(0, 500), // summary: same text, capped at 500 chars
            [],                           // cpv_codes: none taken from the listing
            null,                         // value_low
            null,                         // value_high
            'GBP',
            publishedDate,
            deadline,
            authority,
            'government',
            'Scotland',
            '',                           // documents_url
            noticeUrl,
            status
          ]
        );
        insertedCount++;
      } catch (e) {
        // Best-effort: log and carry on with the next row.
        console.error('Error inserting tender:', e.message);
      }
    }

    console.log(`[${new Date().toISOString()}] PCS Scotland scrape complete. Inserted/updated ${insertedCount} tenders`);
  } catch (error) {
    console.error('Error scraping PCS Scotland:', error.message);
  } finally {
    await pool.end();
  }
}
|
||
|
|
|
||
|
|
// Script entry point. scrapeTenders() logs its own scrape errors, but closing
// the pool in its `finally` can still reject; catch that here so the promise
// is not left floating and the process reports failure via its exit code.
scrapeTenders().catch((error) => {
  console.error('Unhandled error in PCS Scotland scraper:', error);
  process.exitCode = 1;
});
|