// Files
// tenderpilot/scrapers/digital-marketplace.js
//
// 285 lines
// 8.3 KiB
// JavaScript

import axios from 'axios';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';
// Run dotenv before process.env.DATABASE_URL is read below.
dotenv.config();

// NOTE(review): this fallback embeds a username and password in source; it is
// used whenever DATABASE_URL is unset. Confirm it is only a local-dev default.
const FALLBACK_DATABASE_URL = 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot';

// Shared connection pool used by every insert in this script.
const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL || FALLBACK_DATABASE_URL
});
/**
 * Entry point for the Digital Marketplace scrape.
 *
 * Scrapes the DOS opportunities endpoint first and falls back to the G-Cloud
 * services endpoint only when the first source inserted nothing. A failure in
 * either source is logged and does not abort the run.
 *
 * The process is always terminated explicitly: exit code 0 after a completed
 * run, 1 when an unexpected error escapes the scrape logic. The exit happens
 * only after the database pool has been closed.
 *
 * @returns {Promise<void>} Normally never settles, because the process exits.
 */
async function scrapeTenders() {
  let exitCode = 0;

  try {
    console.log(`[${new Date().toISOString()}] Starting Digital Marketplace tender scrape...`);
    let insertedCount = 0;

    // Try to scrape from DOS endpoint
    try {
      insertedCount += await scrapeFromDOSEndpoint();
    } catch (e) {
      console.error('Error scraping DOS endpoint:', e.message);
    }

    // Try alternative endpoint (if available)
    if (insertedCount === 0) {
      try {
        insertedCount += await scrapeFromGCloudEndpoint();
      } catch (e) {
        console.error('Error scraping G-Cloud endpoint:', e.message);
      }
    }

    console.log(`[${new Date().toISOString()}] Scrape complete. Inserted ${insertedCount} tenders`);
  } catch (error) {
    console.error('Fatal error in scraper:', error.message);
    exitCode = 1;
  } finally {
    // process.exit() terminates the process immediately without running
    // pending `finally` blocks, so the pool has to be closed before exiting.
    try {
      await pool.end();
    } catch (e) {
      // Closing is best-effort: report it, but keep the scrape's exit code.
      console.warn('Error closing database pool:', e.message);
    }
  }

  process.exit(exitCode);
}
/**
 * Scrapes open Digital Outcomes & Specialists opportunities, page by page.
 *
 * Requests up to 20 pages of 50 records from the opportunities endpoint and
 * hands each record to insertOpportunity(). Paging stops early when a request
 * times out, when a page is empty, or when a page holds fewer records than the
 * page size. Any other failure on a page is logged and the loop moves on to
 * the next page; a failure on a single record is logged and that record is
 * skipped.
 *
 * @returns {Promise<number>} Sum of the counts returned by insertOpportunity().
 */
async function scrapeFromDOSEndpoint() {
  const endpoint = 'https://api.digitalmarketplace.service.gov.uk/v0.1/opportunities';
  const perPage = 50;
  const lastPage = 20;
  let total = 0;

  console.log('Attempting to scrape Digital Outcomes & Specialists...');

  for (let pageNo = 1; pageNo <= lastPage; pageNo += 1) {
    try {
      console.log(`Fetching DOS opportunities page ${pageNo}...`);

      let reply;
      try {
        reply = await axios.get(endpoint, {
          params: {
            status: 'open',
            page: pageNo,
            per_page: perPage
          },
          timeout: 8000,
          headers: {
            'User-Agent': 'TenderRadar-Scraper/1.0',
            'Accept': 'application/json'
          }
        });
      } catch (requestError) {
        const timedOut =
          requestError.code === 'ECONNABORTED' || requestError.message.includes('timeout');
        if (!timedOut) {
          // Not a timeout: let the per-page handler below log it.
          throw requestError;
        }
        console.warn(`Timeout on page ${pageNo} - API may be unavailable`);
        break;
      }

      // The payload may be a bare array or an object wrapping the list.
      const payload = reply.data;
      let batch;
      if (Array.isArray(payload)) {
        batch = payload;
      } else {
        batch = payload.opportunities || payload.data || [];
      }

      const hasRecords = Boolean(batch) && batch.length !== 0;
      if (!hasRecords) {
        console.log('No more opportunities found');
        break;
      }

      console.log(`Found ${batch.length} opportunities on page ${pageNo}`);

      for (const record of batch) {
        try {
          const added = await insertOpportunity(record);
          total += added;
        } catch (insertError) {
          console.error('Error inserting opportunity:', insertError.message);
        }
      }

      // A short page means this was the last one.
      if (batch.length < perPage) {
        break;
      }

      // Brief pause before requesting the next page.
      await new Promise((wake) => {
        setTimeout(wake, 300);
      });
    } catch (pageError) {
      // Logged only; the loop carries on with the next page.
      console.error(`Error on page ${pageNo}:`, pageError.message);
    }
  }

  console.log(`DOS scraping complete, inserted ${total} records`);
  return total;
}
/**
 * Fetches a single page (up to 100 records) of published G-Cloud services and
 * hands each one to insertService().
 *
 * A failed request or an unusable response is logged as a warning and the
 * function returns whatever was counted so far; a failure on one service is
 * logged and that service is skipped.
 *
 * @returns {Promise<number>} Sum of the counts returned by insertService().
 */
async function scrapeFromGCloudEndpoint() {
  let total = 0;
  console.log('Attempting to scrape G-Cloud services...');

  try {
    const endpoint = 'https://api.digitalmarketplace.service.gov.uk/v0.1/services';
    const requestOptions = {
      params: {
        status: 'published',
        page: 1,
        per_page: 100
      },
      timeout: 8000
    };
    const reply = await axios.get(endpoint, requestOptions);

    // The payload may be a bare array or an object wrapping the list.
    let listings;
    if (Array.isArray(reply.data)) {
      listings = reply.data;
    } else {
      listings = reply.data.services || reply.data.data || [];
    }

    if (!listings || !(listings.length > 0)) {
      return total;
    }

    console.log(`Found ${listings.length} G-Cloud services`);
    for (const listing of listings) {
      try {
        const added = await insertService(listing);
        total += added;
      } catch (insertError) {
        console.error('Error inserting service:', insertError.message);
      }
    }
  } catch (requestError) {
    console.warn('G-Cloud endpoint unavailable:', requestError.message);
  }

  return total;
}
/** Returns `value` when it is a finite number, otherwise null. */
function toFiniteOrNull(value) {
  return Number.isFinite(value) ? value : null;
}

/**
 * Derives the low/high contract value from an opportunity record.
 *
 * When `budgetRange` is present, the first and last numbers found in its text
 * are used, and only if at least two numbers are found. Otherwise
 * `minBudget` / `maxBudget` are parsed. Anything that does not parse to a
 * finite number becomes null, so NaN is never handed to the database.
 *
 * @param {object} opp - Raw opportunity record.
 * @returns {{valueLow: (number|null), valueHigh: (number|null)}}
 */
function parseOpportunityBudget(opp) {
  if (opp.budgetRange) {
    // Each match must start with a digit, so stray commas in the text are not
    // mistaken for amounts (a bare "," would parse to NaN).
    const amounts = String(opp.budgetRange).match(/\d[\d,]*(?:\.\d+)?/g);
    if (amounts && amounts.length >= 2) {
      const first = amounts[0].replace(/,/g, '');
      const last = amounts[amounts.length - 1].replace(/,/g, '');
      return {
        valueLow: toFiniteOrNull(parseFloat(first)),
        valueHigh: toFiniteOrNull(parseFloat(last))
      };
    }
    return { valueLow: null, valueHigh: null };
  }

  if (opp.minBudget || opp.maxBudget) {
    return {
      valueLow: opp.minBudget ? toFiniteOrNull(parseFloat(opp.minBudget)) : null,
      valueHigh: opp.maxBudget ? toFiniteOrNull(parseFloat(opp.maxBudget)) : null
    };
  }

  return { valueLow: null, valueHigh: null };
}

/**
 * Maps one opportunity record onto a `tenders` row and inserts it.
 *
 * Records without an id are skipped. Rows whose `source_id` already exists
 * are left untouched (ON CONFLICT DO NOTHING), and a unique-violation error
 * (code 23505) is likewise treated as "already exists".
 *
 * NOTE(review): the text fields are truncated with `.substring`, which assumes
 * the chosen values are strings; a non-string value throws a TypeError.
 *
 * @param {object} opp - Raw opportunity record from the API response.
 * @returns {Promise<number>} `rowCount` reported by the insert, or 0 when the
 *   record has no id or already exists.
 * @throws Any database error other than a unique violation.
 */
async function insertOpportunity(opp) {
  const oppId = opp.id || opp.ID || opp.opportunity_id;
  if (!oppId) {
    return 0;
  }

  const sourceId = `dm-${oppId}`;
  const title = (opp.title || opp.name || 'Untitled').substring(0, 500);
  const description = (opp.description || opp.brief || '').substring(0, 5000);
  const summary = (opp.summary || description).substring(0, 500);
  const publishedDate = opp.publishedAt || opp.published_at || opp.createdAt || new Date().toISOString();
  const deadline = opp.applicationsClosedAt || opp.closing_date || opp.deadline;
  const authorityName = (opp.organisation?.name || opp.buyer?.name || opp.organisationName || 'Digital Marketplace').substring(0, 255);
  const location = (opp.location || opp.workingArrangements || 'UK').substring(0, 255);

  const { valueLow, valueHigh } = parseOpportunityBudget(opp);

  const noticeUrl = opp.link || opp.url ||
    `https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities/${oppId}`;
  // The specialist role, when present, is stored in the cpv_codes column.
  const cpvCodes = opp.specialistRole ? [opp.specialistRole] : (opp.cpv_codes || []);

  try {
    const result = await pool.query(
      `INSERT INTO tenders (
source, source_id, title, description, summary, cpv_codes,
value_low, value_high, currency, published_date, deadline,
authority_name, authority_type, location, documents_url, notice_url, status, sector
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
ON CONFLICT (source_id) DO NOTHING`,
      [
        'digital_marketplace',
        sourceId,
        title,
        description,
        summary,
        cpvCodes,
        valueLow,
        valueHigh,
        'GBP',
        publishedDate,
        deadline,
        authorityName,
        'government',
        location,
        '',
        noticeUrl,
        'open',
        classifySector(title, description, authorityName)
      ]
    );
    return result.rowCount || 0;
  } catch (error) {
    if (error.code === '23505') {
      return 0; // Already exists
    }
    throw error;
  }
}
/**
 * Maps one G-Cloud service record onto a `tenders` row and inserts it.
 *
 * Records without an id are skipped. Rows whose `source_id` already exists
 * are left untouched (ON CONFLICT DO NOTHING), and a unique-violation error
 * (code 23505) is likewise treated as "already exists".
 *
 * @param {object} service - Raw service record from the API response.
 * @returns {Promise<number>} `rowCount` reported by the insert, or 0 when the
 *   record has no id or already exists.
 * @throws Any database error other than a unique violation.
 */
async function insertService(service) {
  const INSERT_SQL = `INSERT INTO tenders (
source, source_id, title, description, summary, cpv_codes,
value_low, value_high, currency, published_date, deadline,
authority_name, authority_type, location, documents_url, notice_url, status, sector
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
ON CONFLICT (source_id) DO NOTHING`;

  const rawId = service.id || service.service_id;
  if (!rawId) {
    return 0;
  }

  const name = (service.serviceName || service.name || 'Untitled').substring(0, 500);
  const details = (service.serviceDescription || service.description || '').substring(0, 5000);
  const supplier = (service.supplierName || 'Digital Marketplace').substring(0, 255);
  const listingUrl = `https://www.digitalmarketplace.service.gov.uk/g-cloud/services/${rawId}`;

  try {
    // Order matches the column list in INSERT_SQL.
    const row = [
      'digital_marketplace',                           // source
      `dm-gcloud-${rawId}`,                            // source_id
      name,                                            // title
      details,                                         // description
      details.substring(0, 500),                       // summary
      [],                                              // cpv_codes
      null,                                            // value_low
      null,                                            // value_high
      'GBP',                                           // currency
      service.createdAt || new Date().toISOString(),   // published_date
      null,                                            // deadline
      supplier,                                        // authority_name
      'supplier',                                      // authority_type
      'UK',                                            // location
      '',                                              // documents_url
      listingUrl,                                      // notice_url
      'open',                                          // status
      classifySector(name, details, supplier)          // sector
    ];
    const outcome = await pool.query(INSERT_SQL, row);
    return outcome.rowCount || 0;
  } catch (dbError) {
    if (dbError.code !== '23505') {
      throw dbError;
    }
    return 0;
  }
}
// Run the scrape as soon as this module is executed. The returned promise is
// not awaited here: scrapeTenders() logs its own errors and exits the process.
scrapeTenders();