- Strip tracking query params from find_tender URLs (?origin=SearchResults)
- Disable TED EU scraper (requires browser automation, was using demo data)
- Update 220 find_tender database records with clean URLs
- Delete 4 TED demo records from database
- Add URL_FIX_SUMMARY.md documentation

All 615 tenders now have direct links to tender detail pages. Fixes the "Apply Now" button UX issue.
133 lines
4.6 KiB
JavaScript
133 lines
4.6 KiB
JavaScript
import axios from 'axios';
|
|
import * as cheerio from 'cheerio';
|
|
import { classifySector } from './classify-sector.js';
|
|
import pg from 'pg';
|
|
import dotenv from 'dotenv';
|
|
|
|
// Run dotenv first so that DATABASE_URL can be picked up below.
dotenv.config();

// Connection string for the shared pool: DATABASE_URL when it is set and
// non-empty, otherwise a local database.
// NOTE(review): the fallback embeds credentials — confirm it is only ever
// used for local development.
const databaseUrl =
  process.env.DATABASE_URL ||
  'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot';

// Single PostgreSQL pool shared by the whole script.
const pool = new pg.Pool({ connectionString: databaseUrl });
|
|
|
|
// Rate limiting
|
|
/**
 * Returns a promise that settles (with no value) after `ms` milliseconds.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
const delay = (ms) =>
  new Promise((done) => {
    setTimeout(done, ms);
  });
|
|
|
|
// Origin of the Find a Tender service; relative result links are resolved against it.
const FIND_TENDER_ORIGIN = 'https://www.find-tender.service.gov.uk';

// Lower-cased three-letter month prefixes, indexed by zero-based month number.
const MONTH_PREFIXES = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];

// Patterns applied to the free-text metadata line of each search result.
const PUBLISHED_PATTERN = /Published:\s*(\d{1,2}\s+\w+\s+\d{4})/;
const DEADLINE_PATTERN = /Deadline:\s*(\d{1,2}\s+\w+\s+\d{4})/;
// Requires a leading digit and keeps an optional decimal part (e.g. "£1,234.56").
const VALUE_PATTERN = /£(\d[\d,]*(?:\.\d+)?)/;

/**
 * Resolves a result link against the service origin and strips the query
 * string and fragment so only the direct notice URL remains.
 *
 * @param {string|undefined} rawHref - The `href` attribute of the title link.
 * @returns {string|null} Clean absolute URL, or null when the link is missing or malformed.
 */
function buildNoticeUrl(rawHref) {
  if (!rawHref) {
    return null;
  }
  try {
    const url = new URL(rawHref, FIND_TENDER_ORIGIN);
    url.search = '';
    url.hash = '';
    return url.toString();
  } catch (error) {
    // `new URL` throws on malformed input; treat that as "no usable link".
    return null;
  }
}

/**
 * Parses a "12 July 2024"-style date into an ISO-8601 timestamp at midnight UTC.
 * Parsing is done by hand because `new Date(<non-ISO string>)` is
 * implementation- and timezone-dependent, and `toISOString()` throws on an
 * invalid date.
 *
 * @param {string} text - Day, month name (full or abbreviated) and four-digit year.
 * @returns {string|null} ISO timestamp, or null when the text is not a valid date.
 */
function parseNoticeDate(text) {
  const match = /^(\d{1,2})\s+([A-Za-z]+)\s+(\d{4})$/.exec(text.trim());
  if (!match) {
    return null;
  }
  const day = Number(match[1]);
  const month = MONTH_PREFIXES.indexOf(match[2].slice(0, 3).toLowerCase());
  const year = Number(match[3]);
  if (month === -1) {
    return null;
  }
  const date = new Date(Date.UTC(year, month, day));
  // Date.UTC silently rolls over out-of-range days (e.g. 31 February); reject those.
  if (date.getUTCDate() !== day) {
    return null;
  }
  return date.toISOString();
}

/**
 * Extracts the first pound-sterling amount from the metadata text.
 *
 * @param {string} metadata - Free-text metadata line of a search result.
 * @returns {number|null} The amount, or null when none is present.
 */
function parseTenderValue(metadata) {
  const match = metadata.match(VALUE_PATTERN);
  if (!match) {
    return null;
  }
  const amount = Number.parseFloat(match[1].replace(/,/g, ''));
  return Number.isFinite(amount) ? amount : null;
}

/**
 * Reads one `div.search-result` element into a plain tender record.
 *
 * @param {object} element - Cheerio selection wrapping a single search result.
 * @returns {object|null} The tender fields, or null when the result has no usable notice link.
 */
function parseTenderElement(element) {
  const titleLink = element.find('.search-result-header a').first();
  const noticeUrl = buildNoticeUrl(titleLink.attr('href'));
  if (noticeUrl === null) {
    // Without a link there is no stable source ID; inserting would make all
    // such rows collide on the same fallback ID.
    return null;
  }

  // Prefer the notice identifier from the URL path; fall back to the whole URL.
  const idMatch = noticeUrl.match(/\/Notice\/([A-Z0-9-]+)/);
  const metadata = element.find('.search-result-metadata').text();
  const publishedMatch = metadata.match(PUBLISHED_PATTERN);
  const deadlineMatch = metadata.match(DEADLINE_PATTERN);

  return {
    sourceId: idMatch ? idMatch[1] : noticeUrl,
    title: titleLink.text().trim(),
    authority: element.find('.search-result-sub-header').text().trim(),
    description: element.find('.search-result-description').text().trim(),
    noticeUrl,
    publishedDate: publishedMatch ? parseNoticeDate(publishedMatch[1]) : null,
    deadline: deadlineMatch ? parseNoticeDate(deadlineMatch[1]) : null,
    value: parseTenderValue(metadata),
  };
}

/**
 * Inserts one tender, leaving an existing row with the same `source_id` untouched.
 *
 * @param {object} tender - Record produced by `parseTenderElement`.
 * @returns {Promise<number>} Number of rows actually inserted (0 on conflict).
 */
async function insertTender(tender) {
  const isOpen = tender.deadline !== null && new Date(tender.deadline) > new Date();
  const result = await pool.query(
    `INSERT INTO tenders (
      source, source_id, title, description, summary, cpv_codes,
      value_low, value_high, currency, published_date, deadline,
      authority_name, authority_type, location, documents_url, notice_url, status, sector
    ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
    ON CONFLICT (source_id) DO NOTHING`,
    [
      'find_tender',
      tender.sourceId,
      tender.title.substring(0, 500),
      tender.description,
      tender.description.substring(0, 500),
      [],
      tender.value,
      tender.value,
      'GBP',
      tender.publishedDate,
      tender.deadline,
      tender.authority,
      'government',
      'UK',
      '',
      tender.noticeUrl,
      isOpen ? 'open' : 'closed',
      classifySector(tender.title, tender.description, tender.authority),
    ],
  );
  return result.rowCount || 0;
}

/**
 * Scrapes the most recent Find a Tender search-result pages and stores each
 * listed tender in the `tenders` table. Errors for a single tender are logged
 * and skipped; an error fetching a page aborts the scrape and is logged. The
 * shared connection pool is always closed before the function settles.
 *
 * @returns {Promise<number>} Number of newly inserted tenders.
 */
async function scrapeTenders() {
  let insertedCount = 0;

  try {
    console.log(`[${new Date().toISOString()}] Starting Find a Tender scrape...`);

    const maxPages = 5; // Limit to first 5 pages to be respectful

    for (let page = 1; page <= maxPages; page += 1) {
      console.log(`Fetching page ${page}...`);

      const url = `${FIND_TENDER_ORIGIN}/Search/Results?page=${page}&sort=recent`;
      const response = await axios.get(url, {
        timeout: 30000,
        headers: {
          'User-Agent': 'TenderRadar/1.0 (UK Public Procurement Aggregator; contact@tenderradar.co.uk)',
        },
      });

      const $ = cheerio.load(response.data);
      const tenderElements = $('div.search-result');

      if (tenderElements.length === 0) {
        console.log('No more tenders found, stopping pagination');
        break;
      }

      console.log(`Found ${tenderElements.length} tenders on page ${page}`);

      for (let i = 0; i < tenderElements.length; i += 1) {
        try {
          const tender = parseTenderElement(tenderElements.eq(i));
          if (tender === null) {
            console.warn('Skipping search result without a notice link');
            continue;
          }
          // Count only rows the database reports as inserted, so duplicates
          // skipped by ON CONFLICT do not inflate the total.
          const inserted = await insertTender(tender);
          insertedCount += inserted;
        } catch (e) {
          console.error('Error inserting tender:', e.message);
        }
      }

      // Rate limiting: wait 2 seconds between pages
      if (page < maxPages) {
        await delay(2000);
      }
    }

    console.log(`[${new Date().toISOString()}] Find a Tender scrape complete. Inserted/updated ${insertedCount} tenders`);
  } catch (error) {
    console.error('Error scraping Find a Tender:', error.message);
  } finally {
    await pool.end();
  }

  return insertedCount;
}
|
|
|
|
// Script entry point. scrapeTenders() logs scrape failures itself, but its
// `finally` cleanup can still reject; handle that here instead of leaving the
// promise floating, and signal failure through the exit code.
scrapeTenders().catch((error) => {
  console.error('Find a Tender scraper terminated unexpectedly:', error);
  process.exitCode = 1;
});
|