285 lines
8.3 KiB
JavaScript
285 lines
8.3 KiB
JavaScript
import axios from 'axios';
|
|
import { classifySector } from './classify-sector.js';
|
|
import pg from 'pg';
|
|
import dotenv from 'dotenv';
|
|
|
|
// Runs before the pool is built so that DATABASE_URL can be supplied through
// dotenv (presumably from a local .env file - confirm against deployment setup).
dotenv.config();

// Fallback used only when DATABASE_URL is not set.
// NOTE(review): this embeds a database password in source control. It is kept
// so existing local setups continue to work, but deployed environments should
// always provide DATABASE_URL and this credential should be rotated/removed.
const DEFAULT_DATABASE_URL = 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot';

if (!process.env.DATABASE_URL) {
  // Make the fallback visible instead of silently connecting with built-in credentials.
  console.warn('DATABASE_URL is not set; falling back to the built-in local database connection string');
}

// Shared connection pool used by every insert in this file; closed by scrapeTenders().
const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL || DEFAULT_DATABASE_URL
});
|
|
|
|
/**
 * Runs one complete scrape and then terminates the process.
 *
 * The Digital Outcomes & Specialists endpoint is tried first; the G-Cloud
 * endpoint is only tried when the first one inserted nothing. A failure of
 * either endpoint is logged and does not abort the run.
 *
 * The database pool is closed BEFORE process.exit() is called. Calling
 * process.exit() from inside the try/catch would end the process on the spot
 * and the cleanup in `finally` would never run.
 *
 * @returns {Promise<void>} Normally never settles, because the function ends
 *   with process.exit(): code 0 after a completed run, 1 after an unexpected error.
 */
async function scrapeTenders() {
  let exitCode = 0;

  try {
    console.log(`[${new Date().toISOString()}] Starting Digital Marketplace tender scrape...`);

    let insertedCount = 0;

    // Primary source: DOS opportunities.
    try {
      insertedCount += await scrapeFromDOSEndpoint();
    } catch (e) {
      console.error('Error scraping DOS endpoint:', e.message);
    }

    // Alternative source, used only when the primary one inserted nothing.
    if (insertedCount === 0) {
      try {
        insertedCount += await scrapeFromGCloudEndpoint();
      } catch (e) {
        console.error('Error scraping G-Cloud endpoint:', e.message);
      }
    }

    console.log(`[${new Date().toISOString()}] Scrape complete. Inserted ${insertedCount} tenders`);
  } catch (error) {
    console.error('Fatal error in scraper:', error.message);
    exitCode = 1;
  } finally {
    // Best-effort cleanup: a failure to close the pool is reported but must
    // not change the exit status of the scrape itself.
    try {
      await pool.end();
    } catch (e) {
      console.warn('Error closing database pool:', e.message);
    }
  }

  process.exit(exitCode);
}
|
|
|
|
/**
 * Pages through the Digital Outcomes & Specialists opportunities endpoint and
 * passes every returned record to insertOpportunity().
 *
 * Paging stops when a request times out, when a page is empty (or the body
 * has no recognisable list), when a page holds fewer than `pageSize` records,
 * or after `maxPages` pages. Any other error on a page is logged and the next
 * page is attempted. A failing record is logged and skipped.
 *
 * @returns {Promise<number>} Sum of the counts returned by insertOpportunity().
 */
async function scrapeFromDOSEndpoint() {
  const url = 'https://api.digitalmarketplace.service.gov.uk/v0.1/opportunities';
  const pageSize = 50;
  const maxPages = 20;
  const pageDelayMs = 300;
  let inserted = 0;

  console.log('Attempting to scrape Digital Outcomes & Specialists...');

  for (let page = 1; page <= maxPages; page++) {
    // Small delay between pages. Done at the top of the iteration so it also
    // applies after a failed page and a broken API is not hit back-to-back.
    if (page > 1) {
      await new Promise(resolve => setTimeout(resolve, pageDelayMs));
    }

    try {
      console.log(`Fetching DOS opportunities page ${page}...`);

      let response;
      try {
        response = await axios.get(url, {
          params: {
            status: 'open',
            page: page,
            per_page: pageSize
          },
          timeout: 8000,
          headers: {
            'User-Agent': 'TenderRadar-Scraper/1.0',
            'Accept': 'application/json'
          }
        });
      } catch (axiosError) {
        // A timeout is treated as "API unavailable": give up instead of retrying every page.
        if (axiosError?.code === 'ECONNABORTED' || String(axiosError?.message).includes('timeout')) {
          console.warn(`Timeout on page ${page} - API may be unavailable`);
          break;
        }
        throw axiosError;
      }

      // The body may be a bare array or an object wrapping the list; anything
      // else (including a null body) is treated as an empty page.
      const data = response?.data;
      const listed = Array.isArray(data) ? data : (data?.opportunities || data?.data);
      const opportunities = Array.isArray(listed) ? listed : [];

      if (opportunities.length === 0) {
        console.log('No more opportunities found');
        break;
      }

      console.log(`Found ${opportunities.length} opportunities on page ${page}`);

      for (const opp of opportunities) {
        try {
          inserted += await insertOpportunity(opp);
        } catch (e) {
          console.error('Error inserting opportunity:', e.message);
        }
      }

      // A short page means there is nothing after it.
      if (opportunities.length < pageSize) {
        break;
      }
    } catch (error) {
      // Non-timeout failure: log it and try the next page.
      console.error(`Error on page ${page}:`, error.message);
    }
  }

  console.log(`DOS scraping complete, inserted ${inserted} records`);
  return inserted;
}
|
|
|
|
/**
 * Fetches the first page (up to 100 records) of published services from the
 * G-Cloud services endpoint and passes each record to insertService().
 *
 * Request failures are logged as a warning and yield 0; a failing record is
 * logged and skipped. A body without a recognisable list yields 0.
 *
 * @returns {Promise<number>} Sum of the counts returned by insertService().
 */
async function scrapeFromGCloudEndpoint() {
  const url = 'https://api.digitalmarketplace.service.gov.uk/v0.1/services';
  let inserted = 0;

  console.log('Attempting to scrape G-Cloud services...');

  try {
    const response = await axios.get(url, {
      params: {
        status: 'published',
        page: 1,
        per_page: 100
      },
      timeout: 8000,
      // Same identification headers as the DOS request.
      headers: {
        'User-Agent': 'TenderRadar-Scraper/1.0',
        'Accept': 'application/json'
      }
    });

    // The body may be a bare array or an object wrapping the list; anything
    // else (including a null body) is treated as "no services".
    const payload = response?.data;
    const listed = Array.isArray(payload) ? payload : (payload?.services || payload?.data);
    const services = Array.isArray(listed) ? listed : [];

    if (services.length > 0) {
      console.log(`Found ${services.length} G-Cloud services`);

      for (const service of services) {
        try {
          inserted += await insertService(service);
        } catch (e) {
          console.error('Error inserting service:', e.message);
        }
      }
    }
  } catch (e) {
    console.warn('G-Cloud endpoint unavailable:', e.message);
  }

  return inserted;
}
|
|
|
|
/**
 * Returns the first candidate that can be used as text: a non-empty string,
 * or a finite number (converted to a string). Other values (objects, arrays,
 * null, ...) are skipped so that callers can safely call string methods on
 * the result.
 *
 * @param {Array<*>} candidates - Values to try, in priority order.
 * @param {string} fallback - Returned when no candidate is usable.
 * @returns {string}
 */
function firstOpportunityText(candidates, fallback) {
  for (const candidate of candidates) {
    if (typeof candidate === 'string' && candidate !== '') {
      return candidate;
    }
    if (typeof candidate === 'number' && Number.isFinite(candidate)) {
      return String(candidate);
    }
  }
  return fallback;
}

/**
 * Derives the low/high budget bounds of an opportunity.
 *
 * When `budgetRange` is present, the first and last numbers found in it are
 * used, and only if at least two numbers are found. Otherwise `minBudget` /
 * `maxBudget` are parsed. A bound that cannot be parsed to a finite number is
 * reported as null rather than NaN.
 *
 * @param {object} opp - Raw opportunity record.
 * @returns {{ valueLow: (number|null), valueHigh: (number|null) }}
 */
function parseBudgetBounds(opp) {
  const toFiniteOrNull = (raw) => {
    const parsed = Number.parseFloat(raw);
    return Number.isFinite(parsed) ? parsed : null;
  };

  if (opp.budgetRange) {
    // Each match must start with a digit, so stray punctuation such as a lone
    // comma in the free text is never picked up as a "number".
    const tokens = String(opp.budgetRange).match(/\d[\d,]*(?:\.\d+)?/g) || [];
    const amounts = tokens
      .map((token) => Number.parseFloat(token.replace(/,/g, '')))
      .filter((amount) => Number.isFinite(amount));

    if (amounts.length >= 2) {
      return { valueLow: amounts[0], valueHigh: amounts[amounts.length - 1] };
    }
    return { valueLow: null, valueHigh: null };
  }

  if (opp.minBudget || opp.maxBudget) {
    return {
      valueLow: opp.minBudget ? toFiniteOrNull(opp.minBudget) : null,
      valueHigh: opp.maxBudget ? toFiniteOrNull(opp.maxBudget) : null
    };
  }

  return { valueLow: null, valueHigh: null };
}

/**
 * Maps one raw opportunity record onto a row of the `tenders` table and
 * inserts it. Rows whose `source_id` already exists are left untouched
 * (ON CONFLICT ... DO NOTHING).
 *
 * @param {object} opp - Raw opportunity record; several alternative field
 *   names are accepted for most values (see the lookups below).
 * @returns {Promise<number>} Row count reported for the insert, or 0 when the
 *   record has no id or a unique-violation error (23505) is raised.
 * @throws Any other error raised by the query.
 */
async function insertOpportunity(opp) {
  const oppId = opp?.id || opp?.ID || opp?.opportunity_id;
  if (!oppId) {
    return 0;
  }

  const sourceId = `dm-${oppId}`;

  // Text fields are clipped to the lengths used throughout this file.
  const title = firstOpportunityText([opp.title, opp.name], 'Untitled').substring(0, 500);
  const description = firstOpportunityText([opp.description, opp.brief], '').substring(0, 5000);
  const summary = firstOpportunityText([opp.summary], description).substring(0, 500);
  const authorityName = firstOpportunityText(
    [opp.organisation?.name, opp.buyer?.name, opp.organisationName],
    'Digital Marketplace'
  ).substring(0, 255);
  const location = firstOpportunityText([opp.location, opp.workingArrangements], 'UK').substring(0, 255);

  // Falls back to "now" when the record carries no publication timestamp.
  const publishedDate = opp.publishedAt || opp.published_at || opp.createdAt || new Date().toISOString();
  // An absent/empty deadline is stored as NULL.
  const deadline = opp.applicationsClosedAt || opp.closing_date || opp.deadline || null;

  const { valueLow, valueHigh } = parseBudgetBounds(opp);

  const noticeUrl = opp.link || opp.url ||
    `https://www.digitalmarketplace.service.gov.uk/digital-outcomes-and-specialists/opportunities/${oppId}`;

  // The specialist role, when present, is stored in the cpv_codes column.
  const cpvCodes = opp.specialistRole ? [opp.specialistRole] : (opp.cpv_codes || []);

  try {
    const result = await pool.query(
      `INSERT INTO tenders (
        source, source_id, title, description, summary, cpv_codes,
        value_low, value_high, currency, published_date, deadline,
        authority_name, authority_type, location, documents_url, notice_url, status, sector
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
      ON CONFLICT (source_id) DO NOTHING`,
      [
        'digital_marketplace',
        sourceId,
        title,
        description,
        summary,
        cpvCodes,
        valueLow,
        valueHigh,
        'GBP',
        publishedDate,
        deadline,
        authorityName,
        'government',
        location,
        '',
        noticeUrl,
        'open',
        classifySector(title, description, authorityName)
      ]
    );
    return result.rowCount || 0;
  } catch (error) {
    if (error.code === '23505') {
      return 0; // Already exists
    }
    throw error;
  }
}
|
|
|
|
/**
 * Returns the first candidate that can be used as text: a non-empty string,
 * or a finite number (converted to a string). Other values (objects, arrays,
 * null, ...) are skipped so that callers can safely call string methods on
 * the result.
 *
 * @param {Array<*>} candidates - Values to try, in priority order.
 * @param {string} fallback - Returned when no candidate is usable.
 * @returns {string}
 */
function firstServiceText(candidates, fallback) {
  for (const candidate of candidates) {
    if (typeof candidate === 'string' && candidate !== '') {
      return candidate;
    }
    if (typeof candidate === 'number' && Number.isFinite(candidate)) {
      return String(candidate);
    }
  }
  return fallback;
}

/**
 * Maps one raw G-Cloud service record onto a row of the `tenders` table and
 * inserts it. Rows whose `source_id` already exists are left untouched
 * (ON CONFLICT ... DO NOTHING).
 *
 * Services carry no budget, deadline or CPV codes here, so those columns are
 * written as NULL / an empty list, and the supplier name is stored in the
 * authority_name column with authority_type 'supplier'.
 *
 * @param {object} service - Raw service record.
 * @returns {Promise<number>} Row count reported for the insert, or 0 when the
 *   record has no id or a unique-violation error (23505) is raised.
 * @throws Any other error raised by the query.
 */
async function insertService(service) {
  const serviceId = service?.id || service?.service_id;
  if (!serviceId) {
    return 0;
  }

  const sourceId = `dm-gcloud-${serviceId}`;
  const title = firstServiceText([service.serviceName, service.name], 'Untitled').substring(0, 500);
  const description = firstServiceText([service.serviceDescription, service.description], '').substring(0, 5000);
  const supplierName = firstServiceText([service.supplierName], 'Digital Marketplace').substring(0, 255);
  // The summary column is simply the head of the description.
  const summary = description.substring(0, 500);
  // Falls back to "now" when the record carries no creation timestamp.
  const publishedDate = service.createdAt || new Date().toISOString();

  const noticeUrl = `https://www.digitalmarketplace.service.gov.uk/g-cloud/services/${serviceId}`;

  try {
    const result = await pool.query(
      `INSERT INTO tenders (
        source, source_id, title, description, summary, cpv_codes,
        value_low, value_high, currency, published_date, deadline,
        authority_name, authority_type, location, documents_url, notice_url, status, sector
      ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
      ON CONFLICT (source_id) DO NOTHING`,
      [
        'digital_marketplace',
        sourceId,
        title,
        description,
        summary,
        [],
        null,
        null,
        'GBP',
        publishedDate,
        null,
        supplierName,
        'supplier',
        'UK',
        '',
        noticeUrl,
        'open',
        classifySector(title, description, supplierName)
      ]
    );
    return result.rowCount || 0;
  } catch (error) {
    if (error.code === '23505') {
      return 0; // Already exists
    }
    throw error;
  }
}
|
|
|
|
// Start the scrape when this module is executed. scrapeTenders() handles its
// own errors and ends the process itself; the handler below is a last-resort
// guard so that an unexpected rejection is reported and yields a non-zero
// exit status instead of being left as a floating promise.
scrapeTenders().catch((error) => {
  console.error('Unhandled scraper failure:', error);
  process.exitCode = 1;
});
|