import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';
dotenv.config();
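// Postgres connection pool; DATABASE_URL overrides the local development default.
// The queries below assume a `tenders` table with a unique constraint on source_id
// (required by the ON CONFLICT clause used when inserting).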
const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
});
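// Shared HTTP client: 15s timeout, capped redirects, and an identifying User-Agent.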
const client = axios.create({
  timeout: 15000,
  maxRedirects: 5,
  headers: {
    'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator)'
  }
});
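// Normalise a scraped date string to ISO-8601; returns null when it can't be parsed.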
function parseDate(dateStr) {
  if (!dateStr || dateStr.trim() === '') return null;
  const trimmed = dateStr.trim();
  // UK sources such as eTendersNI normally publish day-first dates (dd/mm/yyyy);
  // new Date() would read these as month-first, so handle them explicitly.
  const ukMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{2,4})$/);
  if (ukMatch) {
    const day = Number(ukMatch[1]);
    const month = Number(ukMatch[2]);
    const year = ukMatch[3].length === 2 ? Number(ukMatch[3]) + 2000 : Number(ukMatch[3]);
    const date = new Date(Date.UTC(year, month - 1, day));
    return isNaN(date.getTime()) ? null : date.toISOString();
  }
  const date = new Date(trimmed);
  return isNaN(date.getTime()) ? null : date.toISOString();
}
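// Collapse runs of whitespace to single spaces and trim the result.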
function cleanText(text) {
  if (!text) return '';
  return text
    .replace(/\s+/g, ' ')
    .trim();
}
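// Scrape one paginated list of open tenders, fetch each tender's detail page,
// and insert the extracted fields into the tenders table.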
async function scrapePage(pageNum = 1) {
  try {
    // Fetch list page with pagination
    const listUrl = `https://etendersni.gov.uk/epps/home.do?page=${pageNum}&status=open`;
    console.log(`[${new Date().toISOString()}] Fetching page ${pageNum}: ${listUrl}`);
    const listResp = await client.get(listUrl);
    const $ = cheerio.load(listResp.data);

    // Extract entryIds and titles from list
    const tenders = [];
    const processedIds = new Set();
    $('a[href*="entryId"]').each((i, el) => {
      const href = $(el).attr('href');
      const text = $(el).text().trim();
      if (!href || !text) return;
      const match = href.match(/entryId=(\d+)/);
      if (match) {
        const id = match[1];
        if (!processedIds.has(id)) {
          processedIds.add(id);
          tenders.push({
            id,
            titleSnippet: text.substring(0, 200),
            detailUrl: href.startsWith('http') ? href : 'https://etendersni.gov.uk' + (href.startsWith('/') ? href : '/epps/' + href)
          });
        }
      }
    });
    console.log(`Found ${tenders.length} tenders on page ${pageNum}`);
    let insertedCount = 0;
    // Fetch the detail page for each tender
    for (const tender of tenders) {
      try {
        console.log(` Fetching tender ${tender.id}...`);
        const detailResp = await client.get(tender.detailUrl);
        const d$ = cheerio.load(detailResp.data);

        // Defaults; overwritten when the detail page yields better data
        let title = tender.titleSnippet;
        let description = '';
        let summary = '';
        let deadline = null;
        let value = null;
        let authority = 'Unknown';
        let location = 'Northern Ireland';
        let documentsUrl = '';
        let cpvCodes = [];

        const text = d$('body').text();

        // Heuristic extraction: scan common elements for deadline, value and authority
        d$('div, p, span, td, li').each((i, el) => {
          const content = d$(el).text().trim();
          // Deadline: a numeric date near "deadline"/"closing date" wording
          if (!deadline && content.match(/deadline|closing\s+date/i)) {
            const dateMatch = content.match(/(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2,4})/);
            if (dateMatch) {
              const parsed = parseDate(dateMatch[1]);
              if (parsed) deadline = parsed;
            }
          }
          // Value: first number near value/budget wording or a currency symbol
          if (!value && content.match(/value|budget|estimate|worth|£|GBP/i)) {
            const valueMatch = content.match(/[£\$€]?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)/);
            if (valueMatch) {
              value = parseFloat(valueMatch[1].replace(/,/g, ''));
            }
          }
          // Authority: keep the last short element that mentions a public body
          if (content.match(/department|authority|council|agency|body|organisation/i) && content.length < 200) {
            const cleanContent = cleanText(content);
            if (cleanContent.length > 5 && cleanContent.length < 150) {
              authority = cleanContent;
            }
          }
        });

        // Prefer the page header as the title when one is present
        const pageTitle = d$('h1, h2, .page-title, [class*="title"]').first().text().trim();
        if (pageTitle && pageTitle.length > 0 && pageTitle.length < 500) {
          title = pageTitle;
        }
        description = cleanText(text.substring(0, 1000));
        summary = cleanText(title);

        // Take the first link that looks like a document download, if any
        d$('a[href*="download"], a[href*="document"], a[href*="file"]').each((i, el) => {
          const href = d$(el).attr('href');
          if (href && !documentsUrl) {
            documentsUrl = href.startsWith('http') ? href : 'https://etendersni.gov.uk' + (href.startsWith('/') ? href : '/epps/' + href);
            return false; // stop iterating once a link is found
          }
        });
        // Insert into the database; ON CONFLICT skips tenders already stored under this source_id
        const result = await pool.query(
          `INSERT INTO tenders (
            source, source_id, title, description, summary, cpv_codes,
            value_low, value_high, currency, published_date, deadline,
            authority_name, authority_type, location, documents_url, notice_url, status, sector
          ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
          ON CONFLICT (source_id) DO NOTHING`,
          [
            'etendersni',
            `etendersni_${tender.id}`,
            title.substring(0, 500) || 'Untitled Tender',
            description,
            summary.substring(0, 500),
            cpvCodes,
            value, // value_low and value_high both carry the single extracted figure
            value,
            'GBP',
            new Date().toISOString(), // publication date isn't extracted, so record the scrape time
            deadline,
            authority.substring(0, 255),
            'government',
            location.substring(0, 255),
            documentsUrl,
            tender.detailUrl,
            deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
            classifySector(title, description, authority)
          ]
        );
        // rowCount is 0 when ON CONFLICT skipped a duplicate, so only count real inserts
        if (result.rowCount > 0) {
          insertedCount++;
          console.log(` ✓ Inserted tender ${tender.id}`);
        } else {
          console.log(` - Skipped duplicate tender ${tender.id}`);
        }
        // Rate limiting between detail-page requests
        await new Promise(resolve => setTimeout(resolve, 500));
      } catch (e) {
        console.error(` Error processing tender ${tender.id}: ${e.message}`);
      }
    }
    return { pageNum, insertedCount, tenderCount: tenders.length };
  } catch (error) {
    console.error(`Error scraping page ${pageNum}:`, error.message);
    return { pageNum, insertedCount: 0, tenderCount: 0 };
  }
}
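// Entry point: walk list pages until an empty one is found (capped at 10 pages),
// then close the connection pool.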
async function scrapeTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting eTendersNI scrape...`);
    let totalInserted = 0;
    let pageNum = 1;
    let lastPageHadTenders = true;
    // Scrape pages until we find one with no tenders (or max 10 pages)
    while (lastPageHadTenders && pageNum <= 10) {
      const result = await scrapePage(pageNum);
      totalInserted += result.insertedCount;
      lastPageHadTenders = result.tenderCount > 0;
      pageNum++;
      // Pause between list pages so we don't hammer the server
      await new Promise(resolve => setTimeout(resolve, 1000));
    }
    console.log(`[${new Date().toISOString()}] eTendersNI scrape complete. Inserted ${totalInserted} tenders`);
  } catch (error) {
    console.error('Fatal error:', error.message);
  } finally {
    await pool.end();
  }
}
scrapeTenders();