Add sector classification module, integrate into all 7 scrapers, fix CF pagination
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
import axios from 'axios';
|
||||
import { classifySector } from './classify-sector.js';
|
||||
import pg from 'pg';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
@@ -8,92 +9,127 @@ const pool = new pg.Pool({
|
||||
connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot'
|
||||
});
|
||||
|
||||
/**
 * Pause the calling async flow for a fixed amount of time.
 *
 * @param {number} ms - Delay handed straight to `setTimeout`, in milliseconds.
 * @returns {Promise<undefined>} Settles with no value once the timer has fired.
 */
async function sleep(ms) {
  // The timer callback is the promise's own resolver, invoked with no arguments.
  await new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||
|
||||
async function scrapeTenders() {
|
||||
try {
|
||||
console.log(`[${new Date().toISOString()}] Starting tender scrape...`);
|
||||
|
||||
// Get date from 30 days ago
|
||||
// Get date from 90 days ago
|
||||
const fromDate = new Date();
|
||||
fromDate.setDate(fromDate.getDate() - 30);
|
||||
fromDate.setDate(fromDate.getDate() - 90);
|
||||
const dateStr = fromDate.toISOString().split('T')[0];
|
||||
|
||||
const url = `https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?stage=tender&output=json&publishedFrom=${dateStr}`;
|
||||
const baseUrl = `https://www.contractsfinder.service.gov.uk/Published/Notices/OCDS/Search?stage=tender&output=json&publishedFrom=${dateStr}`;
|
||||
|
||||
console.log(`Fetching from: ${url}`);
|
||||
const response = await axios.get(url, { timeout: 30000 });
|
||||
|
||||
const data = response.data;
|
||||
const releases = data.releases || [];
|
||||
|
||||
console.log(`Found ${releases.length} tenders`);
|
||||
console.log(`Base URL: ${baseUrl}`);
|
||||
|
||||
let insertedCount = 0;
|
||||
let totalProcessed = 0;
|
||||
let pageNum = 1;
|
||||
let hasNextPage = true;
|
||||
let nextPageUrl = baseUrl;
|
||||
|
||||
for (const release of releases) {
|
||||
while (hasNextPage) {
|
||||
try {
|
||||
const tender = release.tender || {};
|
||||
const planning = release.planning || {};
|
||||
const parties = release.parties || [];
|
||||
console.log(`\nFetching page ${pageNum}...`);
|
||||
const response = await axios.get(nextPageUrl, { timeout: 30000 });
|
||||
|
||||
// Find procuring entity
|
||||
const procurer = parties.find(p => p.roles && (p.roles.includes('buyer') || p.roles.includes('procuringEntity') || p.roles.includes('procurer'))) || (release.buyer ? release.buyer : null);
|
||||
const data = response.data;
|
||||
const releases = data.releases || [];
|
||||
|
||||
const sourceId = release.ocid || release.id;
|
||||
const title = tender.title || 'Untitled';
|
||||
const description = tender.description || '';
|
||||
const publishedDate = release.date;
|
||||
const deadline = tender.tenderPeriod?.endDate;
|
||||
const authority = procurer?.name || 'Unknown';
|
||||
const location = planning?.budget?.description || tender.procurementMethod || '';
|
||||
const noticeUrl = release.url || (sourceId ? 'https://www.contractsfinder.service.gov.uk/Notice/' + sourceId.replace('ocds-b5fd17-', '') : '');
|
||||
const documentsUrl = tender.documents?.length > 0 ? tender.documents[0].url : '';
|
||||
for (const release of releases) {
|
||||
try {
|
||||
const tender = release.tender || {};
|
||||
const planning = release.planning || {};
|
||||
const parties = release.parties || [];
|
||||
|
||||
// Find procuring entity
|
||||
const procurer = parties.find(p => p.roles && (p.roles.includes('buyer') || p.roles.includes('procuringEntity') || p.roles.includes('procurer'))) || (release.buyer ? release.buyer : null);
|
||||
|
||||
const sourceId = release.ocid || release.id;
|
||||
const title = tender.title || 'Untitled';
|
||||
const description = tender.description || '';
|
||||
const publishedDate = release.date;
|
||||
const deadline = tender.tenderPeriod?.endDate;
|
||||
|
||||
// Extract value
|
||||
let valueLow = null, valueHigh = null;
|
||||
if (planning?.budget?.amount?.amount) {
|
||||
valueLow = planning.budget.amount.amount;
|
||||
valueHigh = planning.budget.amount.amount;
|
||||
} else if (tender.value?.amount) {
|
||||
valueLow = tender.value.amount;
|
||||
valueHigh = tender.value.amount;
|
||||
// Skip expired tenders
|
||||
if (deadline && new Date(deadline) < new Date()) continue;
|
||||
const authority = procurer?.name || 'Unknown';
|
||||
const location = planning?.budget?.description || tender.procurementMethod || '';
|
||||
const noticeUrl = release.url || (sourceId ? 'https://www.contractsfinder.service.gov.uk/notice/' + sourceId.replace('ocds-b5fd17-', '') : '');
|
||||
const documentsUrl = tender.documents?.length > 0 ? tender.documents[0].url : '';
|
||||
|
||||
// Extract value
|
||||
let valueLow = null, valueHigh = null;
|
||||
if (planning?.budget?.amount?.amount) {
|
||||
valueLow = planning.budget.amount.amount;
|
||||
valueHigh = planning.budget.amount.amount;
|
||||
} else if (tender.value?.amount) {
|
||||
valueLow = tender.value.amount;
|
||||
valueHigh = tender.value.amount;
|
||||
}
|
||||
|
||||
const cpvCodes = tender.classification ? [tender.classification.scheme] : [];
|
||||
|
||||
const result = await pool.query(
|
||||
`INSERT INTO tenders (
|
||||
source, source_id, title, description, summary, cpv_codes,
|
||||
value_low, value_high, currency, published_date, deadline,
|
||||
authority_name, authority_type, location, documents_url, notice_url, status, sector
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
|
||||
ON CONFLICT (source_id) DO NOTHING`,
|
||||
[
|
||||
'contracts_finder',
|
||||
sourceId,
|
||||
title.substring(0, 500),
|
||||
description,
|
||||
description.substring(0, 500),
|
||||
cpvCodes,
|
||||
valueLow,
|
||||
valueHigh,
|
||||
'GBP',
|
||||
publishedDate,
|
||||
deadline,
|
||||
authority,
|
||||
'government',
|
||||
location.substring(0, 255),
|
||||
documentsUrl,
|
||||
noticeUrl,
|
||||
'open',
|
||||
classifySector(title, description, authority)
|
||||
]
|
||||
);
|
||||
if (result.rowCount > 0) {
|
||||
insertedCount++;
|
||||
}
|
||||
totalProcessed++;
|
||||
} catch (e) {
|
||||
console.error('Error inserting tender:', e.message);
|
||||
}
|
||||
}
|
||||
|
||||
const cpvCodes = tender.classification ? [tender.classification.scheme] : [];
|
||||
console.log(`Page ${pageNum}: fetched ${releases.length} tenders (total: ${totalProcessed})`);
|
||||
|
||||
await pool.query(
|
||||
`INSERT INTO tenders (
|
||||
source, source_id, title, description, summary, cpv_codes,
|
||||
value_low, value_high, currency, published_date, deadline,
|
||||
authority_name, authority_type, location, documents_url, notice_url, status
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)
|
||||
ON CONFLICT (source_id) DO NOTHING`,
|
||||
[
|
||||
'contracts_finder',
|
||||
sourceId,
|
||||
title.substring(0, 500),
|
||||
description,
|
||||
description.substring(0, 500),
|
||||
cpvCodes,
|
||||
valueLow,
|
||||
valueHigh,
|
||||
'GBP',
|
||||
publishedDate,
|
||||
deadline,
|
||||
authority,
|
||||
'government',
|
||||
location.substring(0, 255),
|
||||
documentsUrl,
|
||||
noticeUrl,
|
||||
'open'
|
||||
]
|
||||
);
|
||||
insertedCount++;
|
||||
} catch (e) {
|
||||
console.error('Error inserting tender:', e.message);
|
||||
// Check for next page
|
||||
if (data.links && data.links.next) {
|
||||
nextPageUrl = data.links.next;
|
||||
hasNextPage = true;
|
||||
pageNum++;
|
||||
// Add 1 second delay between pages to avoid rate limiting
|
||||
await sleep(1000);
|
||||
} else {
|
||||
hasNextPage = false;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error fetching page ${pageNum}:`, error.message);
|
||||
hasNextPage = false;
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`[${new Date().toISOString()}] Scrape complete. Inserted/updated ${insertedCount} tenders`);
|
||||
console.log(`\n[${new Date().toISOString()}] Scrape complete. Inserted ${insertedCount} new tenders (total processed: ${totalProcessed})`);
|
||||
} catch (error) {
|
||||
console.error('Error scraping tenders:', error.message);
|
||||
} finally {
|
||||
|
||||
Reference in New Issue
Block a user