Add sector classification module, integrate into all 7 scrapers, fix CF pagination
This commit is contained in:
284
scrapers/digital-marketplace.js
Normal file
284
scrapers/digital-marketplace.js
Normal file
@@ -0,0 +1,284 @@
|
||||
import axios from 'axios';
|
||||
import { classifySector } from './classify-sector.js';
|
||||
import pg from 'pg';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
// Run dotenv before process.env is read below, so a local .env file can supply DATABASE_URL.
dotenv.config();

// Connection pool shared by every query in this scraper.
// DATABASE_URL wins when set; otherwise a hard-coded local connection string is used.
// NOTE(review): the fallback embeds a username/password in source — confirm it is
// only ever meant for local development.
const connectionString =
  process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot';

const pool = new pg.Pool({ connectionString });
|
||||
|
||||
/**
 * Entry point for the Digital Marketplace scrape.
 *
 * Runs the DOS scrape first and only falls back to the G-Cloud endpoint when
 * the DOS pass inserted nothing. Failures of either pass are logged and do not
 * abort the run. The function always terminates the process: exit code 0 when
 * the run completes, 1 when an unexpected error escapes the scrape logic.
 *
 * @returns {Promise<void>} Does not settle in practice because the process exits.
 */
async function scrapeTenders() {
  let exitCode = 0;

  try {
    console.log(`[${new Date().toISOString()}] Starting Digital Marketplace tender scrape...`);

    let insertedCount = 0;

    // Try to scrape from DOS endpoint
    try {
      insertedCount += await scrapeFromDOSEndpoint();
    } catch (e) {
      console.error('Error scraping DOS endpoint:', e.message);
    }

    // Try alternative endpoint (if available)
    if (insertedCount === 0) {
      try {
        insertedCount += await scrapeFromGCloudEndpoint();
      } catch (e) {
        console.error('Error scraping G-Cloud endpoint:', e.message);
      }
    }

    console.log(`[${new Date().toISOString()}] Scrape complete. Inserted ${insertedCount} tenders`);
  } catch (error) {
    console.error('Fatal error in scraper:', error.message);
    exitCode = 1;
  } finally {
    // process.exit() ends the process immediately, so calling it inside the
    // try/catch above would skip this cleanup. Close the pool first, exit after.
    try {
      await pool.end();
    } catch (e) {
      // Best effort: a failed shutdown must not change the exit code.
      console.warn('Error closing database pool:', e.message);
    }
  }

  process.exit(exitCode);
}
|
||||
|
||||
/**
 * Pages through the Digital Outcomes & Specialists opportunities API and
 * stores every returned opportunity via insertOpportunity().
 *
 * Pagination stops when a request times out, when the response carries no
 * list of opportunities (empty page or a body that is not a list), when a page
 * is shorter than the page size, or after maxPages pages. Any other per-page
 * error is logged and the next page is attempted.
 *
 * @returns {Promise<number>} Sum of the counts returned by insertOpportunity().
 */
async function scrapeFromDOSEndpoint() {
  const url = 'https://api.digitalmarketplace.service.gov.uk/v0.1/opportunities';
  const pageSize = 50;
  const maxPages = 20;
  let inserted = 0;

  console.log('Attempting to scrape Digital Outcomes & Specialists...');

  for (let page = 1; page <= maxPages; page++) {
    try {
      console.log(`Fetching DOS opportunities page ${page}...`);

      let response;
      try {
        response = await axios.get(url, {
          params: {
            status: 'open',
            page: page,
            per_page: pageSize
          },
          timeout: 8000,
          headers: {
            'User-Agent': 'TenderRadar-Scraper/1.0',
            'Accept': 'application/json'
          }
        });
      } catch (axiosError) {
        // A timeout is treated as "API unavailable": stop paging instead of retrying.
        if (axiosError.code === 'ECONNABORTED' || axiosError.message?.includes('timeout')) {
          console.warn(`Timeout on page ${page} - API may be unavailable`);
          break;
        }
        throw axiosError;
      }

      // The body may be a bare array or an envelope with `opportunities` / `data`.
      // Anything else (null, empty body, unexpected shape) counts as "no results",
      // so a malformed response ends pagination rather than re-requesting every page.
      const data = response?.data;
      const listing = Array.isArray(data) ? data : (data?.opportunities || data?.data);
      const opportunities = Array.isArray(listing) ? listing : [];

      if (opportunities.length === 0) {
        console.log('No more opportunities found');
        break;
      }

      console.log(`Found ${opportunities.length} opportunities on page ${page}`);

      for (const opp of opportunities) {
        try {
          inserted += await insertOpportunity(opp);
        } catch (e) {
          // One bad record must not abort the rest of the page.
          console.error('Error inserting opportunity:', e.message);
        }
      }

      // Check if there are more pages
      if (opportunities.length < pageSize) {
        break;
      }

      // Small delay between pages
      await new Promise(resolve => setTimeout(resolve, 300));
    } catch (error) {
      console.error(`Error on page ${page}:`, error.message);
      // Try next page
    }
  }

  console.log(`DOS scraping complete, inserted ${inserted} records`);
  return inserted;
}
|
||||
|
||||
/**
 * Fallback scrape: fetches one page (up to 100 entries) of published G-Cloud
 * services and stores each one via insertService().
 *
 * Never rejects: a failed request is logged as a warning, and a failed insert
 * is logged and skipped.
 *
 * @returns {Promise<number>} Sum of the counts returned by insertService().
 */
async function scrapeFromGCloudEndpoint() {
  let total = 0;

  console.log('Attempting to scrape G-Cloud services...');

  try {
    const endpoint = 'https://api.digitalmarketplace.service.gov.uk/v0.1/services';
    const query = { status: 'published', page: 1, per_page: 100 };

    const response = await axios.get(endpoint, { params: query, timeout: 8000 });

    // Accept either a bare array or an envelope exposing `services` / `data`.
    let services;
    if (Array.isArray(response.data)) {
      services = response.data;
    } else {
      services = response.data.services || response.data.data || [];
    }

    const hasServices = Boolean(services) && services.length > 0;
    if (hasServices) {
      console.log(`Found ${services.length} G-Cloud services`);

      for (const entry of services) {
        try {
          total += await insertService(entry);
        } catch (insertError) {
          console.error('Error inserting service:', insertError.message);
        }
      }
    }
  } catch (requestError) {
    console.warn('G-Cloud endpoint unavailable:', requestError.message);
  }

  return total;
}
|
||||
|
||||
/**
 * Maps one Digital Marketplace opportunity onto the `tenders` table and
 * inserts it, skipping rows whose source_id already exists.
 *
 * @param {object} opp - Raw opportunity record; several alternative field
 *   names are accepted for each value (e.g. `title` or `name`).
 * @returns {Promise<number>} Rows inserted: 0 when the record has no id or
 *   already exists, otherwise the rowCount reported by the query.
 * @throws Rethrows any database error other than a unique violation (23505).
 */
async function insertOpportunity(opp) {
  const oppId = opp.id || opp.ID || opp.opportunity_id;
  if (!oppId) {
    return 0;
  }

  // First truthy string/number candidate, stringified and clipped to maxLength.
  // Other value types are skipped: calling .substring on them would throw and
  // drop the whole record.
  const pickText = (candidates, fallback, maxLength) => {
    const found = candidates.find(
      (value) => Boolean(value) && (typeof value === 'string' || typeof value === 'number')
    );
    return String(found ?? fallback).substring(0, maxLength);
  };

  // Parses an amount such as "10,000.50"; anything non-numeric becomes null so
  // NaN never reaches the value columns.
  const toAmount = (raw) => {
    const amount = Number.parseFloat(String(raw).replace(/,/g, ''));
    return Number.isFinite(amount) ? amount : null;
  };

  const sourceId = `dm-${oppId}`;
  const title = pickText([opp.title, opp.name], 'Untitled', 500);
  const description = pickText([opp.description, opp.brief], '', 5000);
  const summary = pickText([opp.summary], description, 500);

  const publishedDate = opp.publishedAt || opp.published_at || opp.createdAt || new Date().toISOString();
  const deadline = opp.applicationsClosedAt || opp.closing_date || opp.deadline || null;

  const authorityName = pickText(
    [opp.organisation?.name, opp.buyer?.name, opp.organisationName],
    'Digital Marketplace',
    255
  );
  const location = pickText([opp.location, opp.workingArrangements], 'UK', 255);

  let valueLow = null;
  let valueHigh = null;

  if (opp.budgetRange) {
    // Every match must start with a digit, so punctuation in free text
    // ("Flexible, between ...") is not mistaken for an amount.
    const amounts = String(opp.budgetRange).match(/\d[\d,]*(?:\.\d+)?/g) ?? [];
    // A range needs two numbers; a single figure is ambiguous and is ignored.
    if (amounts.length >= 2) {
      valueLow = toAmount(amounts[0]);
      valueHigh = toAmount(amounts[amounts.length - 1]);
    }
  }

  // Use the explicit min/max fields when budgetRange is absent or gave nothing.
  if (valueLow === null && valueHigh === null && (opp.minBudget || opp.maxBudget)) {
    valueLow = opp.minBudget ? toAmount(opp.minBudget) : null;
    valueHigh = opp.maxBudget ? toAmount(opp.maxBudget) : null;
  }

  const noticeUrl = opp.link || opp.url ||
    `https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities/${oppId}`;

  // NOTE(review): a specialist role is stored in the cpv_codes column here —
  // confirm that is intended.
  const cpvCodes = opp.specialistRole ? [opp.specialistRole] : (opp.cpv_codes || []);

  try {
    const result = await pool.query(
      `INSERT INTO tenders (
        source, source_id, title, description, summary, cpv_codes,
        value_low, value_high, currency, published_date, deadline,
        authority_name, authority_type, location, documents_url, notice_url, status, sector
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
      ON CONFLICT (source_id) DO NOTHING`,
      [
        'digital_marketplace',
        sourceId,
        title,
        description,
        summary,
        cpvCodes,
        valueLow,
        valueHigh,
        'GBP',
        publishedDate,
        deadline,
        authorityName,
        'government',
        location,
        '',
        noticeUrl,
        'open',
        classifySector(title, description, authorityName)
      ]
    );
    return result.rowCount ?? 0;
  } catch (error) {
    if (error.code === '23505') {
      return 0; // Already exists
    }
    throw error;
  }
}
|
||||
|
||||
/**
 * Stores one G-Cloud service as a row in `tenders`, leaving existing rows with
 * the same source_id untouched.
 *
 * @param {object} service - Raw service record from the G-Cloud endpoint.
 * @returns {Promise<number>} 0 when the record has no id or already exists,
 *   otherwise the rowCount reported by the query.
 * @throws Rethrows any database error other than a unique violation (23505).
 */
async function insertService(service) {
  const id = service.id || service.service_id;
  if (!id) {
    return 0;
  }

  const rawTitle = service.serviceName || service.name || 'Untitled';
  const rawDescription = service.serviceDescription || service.description || '';
  const rawSupplier = service.supplierName || 'Digital Marketplace';

  const title = rawTitle.substring(0, 500);
  const description = rawDescription.substring(0, 5000);
  const supplierName = rawSupplier.substring(0, 255);

  const insertSql = `INSERT INTO tenders (
      source, source_id, title, description, summary, cpv_codes,
      value_low, value_high, currency, published_date, deadline,
      authority_name, authority_type, location, documents_url, notice_url, status, sector
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
    ON CONFLICT (source_id) DO NOTHING`;

  try {
    // Positional values, in the same order as the column list above.
    const values = [
      'digital_marketplace',                       // source
      `dm-gcloud-${id}`,                           // source_id
      title,
      description,
      description.substring(0, 500),               // summary
      [],                                          // cpv_codes
      null,                                        // value_low
      null,                                        // value_high
      'GBP',
      service.createdAt || new Date().toISOString(), // published_date
      null,                                        // deadline
      supplierName,                                // authority_name
      'supplier',                                  // authority_type
      'UK',                                        // location
      '',                                          // documents_url
      `https://www.digitalmarketplace.service.gov.uk/g-cloud/services/${id}`,
      'open',
      classifySector(title, description, supplierName)
    ];

    const outcome = await pool.query(insertSql, values);
    return outcome.rowCount || 0;
  } catch (dbError) {
    // 23505 = unique violation: the row is already stored.
    if (dbError.code !== '23505') {
      throw dbError;
    }
    return 0;
  }
}
|
||||
|
||||
// Script entry point. The returned promise is deliberately not awaited:
// scrapeTenders() logs its own failures and ends the process itself.
void scrapeTenders();
|
||||
Reference in New Issue
Block a user