Add sector classification module, integrate into all 7 scrapers, fix CF pagination

This commit is contained in:
Peter Foster
2026-02-14 17:12:51 +00:00
parent d1aa21c59f
commit 771fcf9d76
23 changed files with 2044 additions and 83 deletions

View File

@@ -1,5 +1,6 @@
import axios from 'axios';
import * as cheerio from 'cheerio';
import { classifySector } from './classify-sector.js';
import pg from 'pg';
import dotenv from 'dotenv';
@@ -13,7 +14,7 @@ function parseDate(dateStr) {
if (!dateStr || dateStr.trim() === '') return null;
try {
// Handle format like "13/02/2026"
// Handle format like 13/02/2026
if (dateStr.match(/^\d{2}\/\d{2}\/\d{4}$/)) {
const [day, month, year] = dateStr.split('/');
const date = new Date(`${year}-${month}-${day}`);
@@ -48,7 +49,7 @@ async function scrapeTenders() {
const $ = cheerio.load(response.data);
// Find all links to tender detail pages
const tenderLinks = $('a[href*="search_view.aspx?ID="]');
const tenderLinks = $('a[href*=search_view.aspx?ID=]');
console.log(`Found ${tenderLinks.length} potential tenders`);
@@ -115,8 +116,8 @@ async function scrapeTenders() {
`INSERT INTO tenders (
source, source_id, title, description, summary, cpv_codes,
value_low, value_high, currency, published_date, deadline,
authority_name, authority_type, location, documents_url, notice_url, status
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)
authority_name, authority_type, location, documents_url, notice_url, status, sector
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
ON CONFLICT (source_id) DO NOTHING`,
[
'sell2wales',
@@ -135,7 +136,8 @@ async function scrapeTenders() {
location.substring(0, 255),
'',
noticeUrl,
deadline && new Date(deadline) > new Date() ? 'open' : 'closed'
deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
classifySector(title, description, authority)
]
);
insertedCount++;