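// Scrapes open tender notices from the eTendersNI portal: walks the paginated
// list of open tenders, fetches each tender's detail page, and inserts the
// extracted fields into the `tenders` table in Postgres.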
import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();
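
// Postgres connection pool; DATABASE_URL in the environment overrides the local default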
const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
});
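
// Shared HTTP client: 15 s timeout, up to 5 redirects, and a User-Agent that
// identifies the aggregator to the target site.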
const client = axios.create({
  timeout: 15000,
  maxRedirects: 5,
  headers: {
    'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator)'
  }
});
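
// Parse a scraped date string into an ISO 8601 timestamp, or return null if it can't be parsed.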
function parseDate(dateStr) {
  if (!dateStr || dateStr.trim() === '') return null;
  try {
    // Site dates are assumed to be day-first (dd/mm/yyyy); new Date() would read
    // a slash-separated date as month-first, so build the Date explicitly.
    const dmy = dateStr.trim().match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{2,4})$/);
    if (dmy && Number(dmy[2]) >= 1 && Number(dmy[2]) <= 12) {
      const year = dmy[3].length === 2 ? `20${dmy[3]}` : dmy[3];
      const date = new Date(Date.UTC(Number(year), Number(dmy[2]) - 1, Number(dmy[1])));
      return isNaN(date.getTime()) ? null : date.toISOString();
    }
    const date = new Date(dateStr);
    if (isNaN(date.getTime())) return null;
    return date.toISOString();
  } catch (e) {
    return null;
  }
}
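
// Collapse runs of whitespace into single spaces and trim the result.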
function cleanText(text) {
  if (!text) return '';
  return text
    .replace(/\s+/g, ' ')
    .trim();
}
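
// Scrape one page of the open-tenders list: collect entry links, fetch each
// tender's detail page, heuristically extract fields, and insert the row.
// Returns counts so the caller can decide whether to keep paginating.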
async function scrapePage(pageNum = 1) {
  try {
    // Fetch list page with pagination
    const listUrl = `https://etendersni.gov.uk/epps/home.do?page=${pageNum}&status=open`;

    console.log(`[${new Date().toISOString()}] Fetching page ${pageNum}: ${listUrl}`);
    const listResp = await client.get(listUrl);
    const $ = cheerio.load(listResp.data);

    // Extract entryIds and titles from list
    const tenders = [];
    const processedIds = new Set();

    $('a[href*="entryId"]').each((i, el) => {
      const href = $(el).attr('href');
      const text = $(el).text().trim();

      if (!href || !text) return;

      const match = href.match(/entryId=(\d+)/);
      if (match) {
        const id = match[1];
        if (!processedIds.has(id)) {
          processedIds.add(id);
          tenders.push({
            id,
            titleSnippet: text.substring(0, 200),
            detailUrl: href.startsWith('http') ? href : 'https://etendersni.gov.uk' + (href.startsWith('/') ? href : '/epps/' + href)
          });
        }
      }
    });

    console.log(`Found ${tenders.length} tenders on page ${pageNum}`);

    let insertedCount = 0;

    // Fetch detail page for each tender
    for (const tender of tenders) {
      try {
        console.log(`  Fetching tender ${tender.id}...`);
        const detailResp = await client.get(tender.detailUrl);
        const d$ = cheerio.load(detailResp.data);

        // Extract tender details from detail page
        let title = tender.titleSnippet;
        let description = '';
        let summary = '';
        let deadline = null;
        let value = null;
        let authority = 'Unknown';
        let location = 'Northern Ireland';
        let documentsUrl = '';
        let cpvCodes = [];

        // Try to extract structured data
        const text = d$('body').text();

        // Look for common patterns in the page
        d$('div, p, span, td, li').each((i, el) => {
          const content = d$(el).text().trim();

          // Try to find deadline
          if (!deadline && content.match(/deadline|closing\s+date/i)) {
            const dateMatch = content.match(/(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})/);
            if (dateMatch) {
              const parsed = parseDate(dateMatch[1]);
              if (parsed) deadline = parsed;
            }
          }

          // Try to find value
          if (!value && content.match(/value|budget|estimate|worth|£|GBP/i)) {
            const valueMatch = content.match(/[£\$€]?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)/);
            if (valueMatch) {
              value = parseFloat(valueMatch[1].replace(/,/g, ''));
            }
          }

          // Try to find authority/department
          if (content.match(/department|authority|council|agency|body|organisation/i) && content.length < 200) {
            const cleanContent = cleanText(content);
            if (cleanContent.length > 5 && cleanContent.length < 150) {
              authority = cleanContent;
            }
          }
        });

        // Get title from page header
        const pageTitle = d$('h1, h2, .page-title, [class*="title"]').first().text().trim();
        if (pageTitle && pageTitle.length > 0 && pageTitle.length < 500) {
          title = pageTitle;
        }

        description = cleanText(text.substring(0, 1000));
        summary = cleanText(title);

        // Find documents link if available
        d$('a[href*="download"], a[href*="document"], a[href*="file"]').each((i, el) => {
          const href = d$(el).attr('href');
          if (href && !documentsUrl) {
            documentsUrl = href.startsWith('http') ? href : 'https://etendersni.gov.uk' + (href.startsWith('/') ? href : '/epps/' + href);
            return false;
          }
        });

        // Insert into database
        const result = await pool.query(
          `INSERT INTO tenders (
            source, source_id, title, description, summary, cpv_codes,
            value_low, value_high, currency, published_date, deadline,
            authority_name, authority_type, location, documents_url, notice_url, status, sector
          ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
          ON CONFLICT (source_id) DO NOTHING`,
          [
            'etendersni',
            `etendersni_${tender.id}`,
            title.substring(0, 500) || 'Untitled Tender',
            description,
            summary.substring(0, 500),
            cpvCodes,
            value,
            value,
            'GBP',
            new Date().toISOString(),
            deadline,
            authority.substring(0, 255),
            'government',
            location.substring(0, 255),
            documentsUrl,
            tender.detailUrl,
            deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
            classifySector(title, description, authority)
          ]
        );

        // rowCount is 0 when ON CONFLICT skipped an existing row, so only count real inserts
        if (result.rowCount > 0) {
          insertedCount++;
          console.log(`  ✓ Inserted tender ${tender.id}`);
        } else {
          console.log(`  - Tender ${tender.id} already exists, skipped`);
        }

        // Rate limiting
        await new Promise(resolve => setTimeout(resolve, 500));

      } catch (e) {
        console.error(`  Error processing tender ${tender.id}: ${e.message}`);
      }
    }

    return { pageNum, insertedCount, tenderCount: tenders.length };

  } catch (error) {
    console.error(`Error scraping page ${pageNum}:`, error.message);
    return { pageNum, insertedCount: 0, tenderCount: 0 };
  }
}
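
// Entry point: iterate over list pages until an empty page (or the 10-page cap),
// then close the connection pool.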
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting eTendersNI scrape...`);

    let totalInserted = 0;
    let pageNum = 1;
    let lastPageHadTenders = true;

    // Scrape pages until we find one with no tenders (or max 10 pages)
    while (lastPageHadTenders && pageNum <= 10) {
      const result = await scrapePage(pageNum);
      totalInserted += result.insertedCount;
      lastPageHadTenders = result.tenderCount > 0;
      pageNum++;

      // Avoid rate limiting
      await new Promise(resolve => setTimeout(resolve, 1000));
    }

    console.log(`[${new Date().toISOString()}] eTendersNI scrape complete. Inserted ${totalInserted} tenders`);

  } catch (error) {
    console.error('Fatal error:', error.message);
  } finally {
    await pool.end();
  }
}

scrapeTenders();