Add sector classification module, integrate into all 7 scrapers, fix CF pagination
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
import axios from 'axios';
|
||||
import * as cheerio from 'cheerio';
|
||||
+import { classifySector } from './classify-sector.js';
|
||||
import pg from 'pg';
|
||||
import dotenv from 'dotenv';
|
||||
|
||||
@@ -13,14 +14,14 @@ function parseDate(dateStr) {
|
||||
if (!dateStr || dateStr.trim() === '') return null;
|
||||
|
||||
try {
|
||||
-// Handle format like "13/02/2026"
|
||||
+// Handle format like 13/02/2026
|
||||
if (dateStr.match(/^\d{2}\/\d{2}\/\d{4}$/)) {
|
||||
const [day, month, year] = dateStr.split('/');
|
||||
const date = new Date(`${year}-${month}-${day}`);
|
||||
return date.toISOString();
|
||||
}
|
||||
|
||||
-// Handle format like "16-Mar-26"
|
||||
+// Handle format like 16-Mar-26
|
||||
if (dateStr.match(/^\d{2}-\w+-\d{2}$/)) {
|
||||
const parts = dateStr.split('-');
|
||||
const day = parts[0];
|
||||
@@ -67,7 +68,7 @@ async function scrapeTenders() {
|
||||
|
||||
// Find all tender rows
|
||||
const tenderRows = $('table tr').filter((i, el) => {
|
||||
-return $(el).find('a[href*="search_view.aspx"]').length > 0;
|
||||
+return $(el).find('a[href*=search_view.aspx]').length > 0;
|
||||
});
|
||||
|
||||
console.log(`Found ${tenderRows.length} tenders`);
|
||||
@@ -110,12 +111,13 @@ async function scrapeTenders() {
|
||||
`INSERT INTO tenders (
|
||||
source, source_id, title, description, summary, cpv_codes,
|
||||
value_low, value_high, currency, published_date, deadline,
|
||||
-authority_name, authority_type, location, documents_url, notice_url, status
|
||||
-) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17)
|
||||
+authority_name, authority_type, location, documents_url, notice_url, status, sector
|
||||
+) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18)
|
||||
ON CONFLICT (source_id) DO UPDATE SET
|
||||
title = EXCLUDED.title,
|
||||
description = EXCLUDED.description,
|
||||
-summary = EXCLUDED.summary`,
|
||||
+summary = EXCLUDED.summary,
|
||||
+sector = EXCLUDED.sector`,
|
||||
[
|
||||
'pcs_scotland',
|
||||
sourceId,
|
||||
@@ -133,7 +135,8 @@ async function scrapeTenders() {
|
||||
'Scotland',
|
||||
'',
|
||||
noticeUrl,
|
||||
-deadline && new Date(deadline) > new Date() ? 'open' : 'closed'
|
||||
+deadline && new Date(deadline) > new Date() ? 'open' : 'closed',
|
||||
+classifySector(title, noticeType, authority)
|
||||
]
|
||||
);
|
||||
insertedCount++;
|
||||
|
||||
Reference in New Issue
Block a user