Files
tenderpilot/scrapers/pcs-scotland.js

157 lines
5.0 KiB
JavaScript

import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';
// Load environment variables (by default from a local .env file) before DATABASE_URL is read below.
dotenv.config();
// PostgreSQL connection pool used by scrapeTenders() for its upserts; closed again in that
// function's `finally` block.
// NOTE(review): when DATABASE_URL is unset this falls back to a connection string with a
// username and password embedded in source — confirm this is a local-development-only default.
const pool = new pg.Pool({
connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
});
// Maps the first three letters (lower-cased) of an English month name to its 1-based number.
const MONTH_NUMBER_BY_PREFIX = new Map([
  ['jan', 1], ['feb', 2], ['mar', 3], ['apr', 4], ['may', 5], ['jun', 6],
  ['jul', 7], ['aug', 8], ['sep', 9], ['oct', 10], ['nov', 11], ['dec', 12]
]);

/**
 * Builds the ISO-8601 timestamp for midnight UTC on a calendar day.
 *
 * @param {number} year - Full year, e.g. 2026.
 * @param {number} month - 1-based month number.
 * @param {number} day - Day of the month.
 * @returns {string|null} ISO string, or null when the day does not exist
 *   (e.g. 31 February) or any component is not a usable number.
 */
function toUtcIsoDate(year, month, day) {
  const date = new Date(Date.UTC(year, month - 1, day));
  // Date.UTC silently rolls overflowing values into the next month/year, so
  // confirm the components survived the round trip unchanged.
  const isSameDay =
    date.getUTCFullYear() === year &&
    date.getUTCMonth() === month - 1 &&
    date.getUTCDate() === day;
  return isSameDay ? date.toISOString() : null;
}

/**
 * Converts a scraped date string into an ISO-8601 timestamp.
 *
 * Recognised formats, tried in order:
 *   1. `dd/mm/yyyy` (day first), e.g. `13/02/2026`
 *   2. `dd-Mon-yy`, e.g. `16-Mar-26` (two-digit years are read as 20yy)
 *   3. anything else the runtime's `Date` constructor can parse
 *
 * Formats 1 and 2 are interpreted as midnight UTC, so the result does not
 * depend on the timezone of the machine running the scraper. Surrounding
 * whitespace is ignored.
 *
 * @param {string} dateStr - Raw date text.
 * @returns {string|null} ISO-8601 string, or null when the input is empty,
 *   not a string, or not a valid date.
 */
function parseDate(dateStr) {
  if (typeof dateStr !== 'string') return null;
  const text = dateStr.trim();
  if (text === '') return null;

  // Handle format like 13/02/2026
  const slashMatch = text.match(/^(\d{1,2})\/(\d{1,2})\/(\d{4})$/);
  if (slashMatch) {
    const [, day, month, year] = slashMatch;
    return toUtcIsoDate(Number(year), Number(month), Number(day));
  }

  // Handle format like 16-Mar-26
  const dashMatch = text.match(/^(\d{1,2})-(\w+)-(\d{2})$/);
  if (dashMatch) {
    const [, day, monthName, shortYear] = dashMatch;
    const month = MONTH_NUMBER_BY_PREFIX.get(monthName.slice(0, 3).toLowerCase());
    if (month === undefined) return null;
    return toUtcIsoDate(2000 + Number(shortYear), month, Number(day));
  }

  // Try general parsing (results for non-ISO input are engine-dependent)
  const date = new Date(text);
  return Number.isNaN(date.getTime()) ? null : date.toISOString();
}
/**
 * Strips link-hint artifacts from a scraped title.
 *
 * Every "(Opens in new tab)" and "(Opens in new window)" marker is removed
 * (case-insensitively, together with the whitespace around it), then the
 * result is trimmed.
 *
 * @param {string} title - Raw title text.
 * @returns {string} The title without the markers.
 */
function cleanTitle(title) {
  const linkHintPatterns = [
    /\s*\(Opens in new tab\)\s*/gi,
    /\s*\(Opens in new window\)\s*/gi
  ];
  let cleaned = title;
  // Apply the patterns one after another, in this order.
  for (const pattern of linkHintPatterns) {
    cleaned = cleaned.replace(pattern, '');
  }
  return cleaned.trim();
}
/**
 * Scrapes the Public Contracts Scotland search page and upserts each tender
 * row it finds into the `tenders` table.
 *
 * Flow: fetch the search page, select every table row containing a link to
 * `search_view.aspx`, extract the fields from the row's first two cells, and
 * insert the record (updating title/description/summary/sector on a
 * `source_id` conflict). A failure on one row is logged and does not stop the
 * remaining rows. A failure of the fetch itself is logged as well. The
 * connection pool is always closed at the end, so this function is meant to
 * run once per process.
 *
 * @returns {Promise<void>} Resolves when the scrape has finished and the pool
 *   has been closed.
 */
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting PCS Scotland scrape...`);
    let insertedCount = 0;
    const url = 'https://www.publiccontractsscotland.gov.uk/search/Search_MainPage.aspx';
    // Quoted attribute value: the dot in "search_view.aspx" is not valid in an
    // unquoted CSS attribute selector.
    const noticeLinkSelector = 'a[href*="search_view.aspx"]';

    const response = await axios.get(url, {
      timeout: 30000,
      headers: {
        'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)'
      }
    });
    const $ = cheerio.load(response.data);

    // Find all tender rows
    const tenderRows = $('table tr').filter((i, el) => {
      return $(el).find(noticeLinkSelector).length > 0;
    });
    console.log(`Found ${tenderRows.length} tenders`);

    for (let i = 0; i < tenderRows.length; i++) {
      try {
        const row = tenderRows.eq(i);
        const cells = row.find('td');
        if (cells.length === 0) continue;

        // Cell 0 holds the published date, cell 1 the title link and details text.
        const dateText = cells.eq(0).text().trim();
        const detailsCell = cells.eq(1);
        const titleLink = detailsCell.find('a').first();
        const rawTitle = titleLink.text().trim();
        const title = cleanTitle(rawTitle);
        if (!title || title.length === 0) continue;

        // Resolve the link against the page URL so root-relative, page-relative
        // and absolute hrefs all yield a valid absolute URL. If the title anchor
        // has no href, use the row's notice link (the row filter guarantees one).
        const href = titleLink.attr('href') || row.find(noticeLinkSelector).first().attr('href');
        const noticeUrl = new URL(href, url).href;

        const detailsText = detailsCell.text();
        const refMatch = detailsText.match(/Reference No:\s*([A-Z0-9]+)/);
        // NOTE(review): the fallback id differs on every run, so a row without a
        // reference number can never hit ON CONFLICT and is re-inserted each scrape.
        const sourceId = refMatch ? refMatch[1] : ('pcs_' + Date.now() + '_' + i);
        const authorityMatch = detailsText.match(/Published By:\s*([^\n]+)/);
        const authority = authorityMatch ? authorityMatch[1].trim() : 'Unknown';
        const deadlineMatch = detailsText.match(/Deadline Date:\s*(\d{2}-\w+-\d{2})/);
        const deadline = deadlineMatch ? parseDate(deadlineMatch[1]) : null;
        const noticeTypeMatch = detailsText.match(/Notice Type:\s*([^\n]+)/);
        const noticeType = noticeTypeMatch ? noticeTypeMatch[1].trim() : '';
        const publishedDate = parseDate(dateText);

        await pool.query(
          `INSERT INTO tenders (
source, source_id, title, description, summary, cpv_codes,
value_low, value_high, currency, published_date, deadline,
authority_name, authority_type, location, documents_url, notice_url, status, sector
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
ON CONFLICT (source_id) DO UPDATE SET
title = EXCLUDED.title,
description = EXCLUDED.description,
summary = EXCLUDED.summary,
sector = EXCLUDED.sector`,
          [
            'pcs_scotland',
            sourceId,
            title.substring(0, 500),
            // The listing page exposes no description, so the notice type is
            // stored as both description and summary.
            noticeType,
            noticeType.substring(0, 500),
            [],
            null,
            null,
            'GBP',
            publishedDate,
            deadline,
            authority,
            'government',
            'Scotland',
            '',
            noticeUrl,
            // Rows without a parseable deadline are stored as 'closed'.
            deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
            classifySector(title, noticeType, authority)
          ]
        );
        insertedCount++;
      } catch (e) {
        // Best effort per row: log and continue with the next tender.
        console.error('Error inserting tender:', e.message);
      }
    }
    console.log(`[${new Date().toISOString()}] PCS Scotland scrape complete. Inserted/updated ${insertedCount} tenders`);
  } catch (error) {
    console.error('Error scraping PCS Scotland:', error.message);
  } finally {
    await pool.end();
  }
}
// Start the scrape as soon as this module is loaded; the returned promise is not awaited here.
scrapeTenders();