Files
tenderpilot/scrapers/classify-sector.js

145 lines
4.6 KiB
JavaScript

/**
* Shared sector classification module for TenderRadar scrapers
* Exports a classifySector function that categorizes tenders into 9 sectors
*/
/**
 * Ordered sector rules: each entry is [sector name, pattern]. Order is the
 * priority order, so the first pattern that matches decides the sector.
 *
 * Patterns are tested against lowercased text, and most alternatives are
 * deliberately unanchored stems ("pharmac", "educat", ...) so they cover
 * inflected forms. Note that a leading `\b` binds only to the first
 * alternative of a pattern, not to every alternative after a `|`.
 *
 * A few stems carry explicit guards because they otherwise match inside
 * unrelated words:
 *   - `hospital(?!ity)`     -> does not match "hospitality"
 *   - `\bdental`            -> does not match "accidental" / "incidental"
 *   - `\bmental\s*health`   -> does not match "environmental health"
 *   - `traffic(?!k)`        -> does not match "trafficking"
 *
 * None of the patterns use the `g`/`y` flags, so `.test()` is stateless and
 * the shared RegExp objects are safe to reuse across calls.
 */
const SECTOR_RULES = Object.freeze([
  ['Health', /\bnhs\b|hospital(?!ity)|clinical|pharmac|medical|health\s*(?:care|service)|maternity|\bmental\s*health|\bdental|ambulance|patient|surgery|pathology|\bward\b/],
  ['Education', /\bschool|universit|college|educat|academ|learning|pupil|student|teaching/],
  ['Construction', /\bconstruct|demoliti|renovati|building\s*(?:work|maint|repair)|roofing|plumbing|electrical\s*install|painting\s*(?:and|&)\s*decorat|repair\s*(?:of|work)|refurbish|scaffolding|paving|groundwork/],
  ['IT & Technology', /\bsoftware|\b(?:it|ict)\s+(?:service|system|support|infra)|digital\s*(?:platform|service|transform)|cyber|cloud\s*(?:comput|service|hosting)|network\s*infra|data\s*(?:centre|center|analy|manage)/],
  ['Transport', /\btransport|vehicle|fleet\s*(?:manage|maint)|highway|railway|bus\s*(?:service|route)|traffic(?!k)|parking/],
  ['Defence', /\bdefence|defense|military|\bmod\b|armed\s*force|navy|royal\s*air/],
  ['Energy', /\benergy\s*(?:supply|effic|manage)|electricity|solar|renewable|power\s*generat|gas\s*supply|wind\s*(?:farm|turbin)/],
  // Government is last on purpose: it only applies if no specific sector matched.
  ['Government', /\bcouncil|government|civic|municipal|parliament|local\s*authorit/],
]);

/**
 * Classify a tender into one of 9 sectors based on title, description, and authority.
 *
 * The three inputs are joined with single spaces and lowercased, then checked
 * against the sector patterns in priority order; the first match wins. Falsy
 * inputs (null, undefined, empty string) are treated as empty text.
 *
 * @param {string} title - Tender title
 * @param {string} description - Tender description
 * @param {string} authorityName - Procuring authority name
 * @returns {string} One of: Health, Education, Construction, IT & Technology,
 * Transport, Defence, Energy, Government, Other
 */
export function classifySector(title, description, authorityName) {
  // Combine all text and normalize to lowercase for case-insensitive matching
  const combined = `${title || ''} ${description || ''} ${authorityName || ''}`.toLowerCase();

  const matchedRule = SECTOR_RULES.find(([, pattern]) => pattern.test(combined));

  // "Other" is the default fallback when no rule matches
  return matchedRule ? matchedRule[0] : 'Other';
}
// Self-test when run directly
if (import.meta.url === `file://${process.argv[1]}`) {
console.log('Running self-tests...\n');
const testCases = [
{
title: 'NHS Hospital Supplies',
description: 'Medical equipment for clinical use',
authority: 'NHS England',
expected: 'Health'
},
{
title: 'School Building Construction',
description: 'New educational facility',
authority: 'Local Education Authority',
expected: 'Education'
},
{
title: 'Roofing and Painting Services',
description: 'Building renovation and repairs',
authority: 'City Council',
expected: 'Construction'
},
{
title: 'Software Development Services',
description: 'IT system and cloud hosting',
authority: 'Government IT Department',
expected: 'IT & Technology'
},
{
title: 'Public Transport Fleet Maintenance',
description: 'Vehicle servicing and support',
authority: 'Transport Department',
expected: 'Transport'
},
{
title: 'Military Equipment Supply',
description: 'Defence and armed forces supplies',
authority: 'Ministry of Defence',
expected: 'Defence'
},
{
title: 'Renewable Energy Installation',
description: 'Solar power and wind turbine project',
authority: 'Energy Commission',
expected: 'Energy'
},
{
title: 'Council Office Supplies',
description: 'General supplies for local government',
authority: 'City Council',
expected: 'Government'
},
{
title: 'Generic Office Supplies',
description: 'Standard stationery and equipment',
authority: 'Random Organization',
expected: 'Other'
}
];
let passed = 0;
let failed = 0;
testCases.forEach((test, index) => {
const result = classifySector(test.title, test.description, test.authority);
const status = result === test.expected ? '✓' : '✗';
if (result === test.expected) {
passed++;
} else {
failed++;
}
console.log(`${status} Test ${index + 1}: "${test.title}"`);
console.log(` Expected: ${test.expected}, Got: ${result}\n`);
});
console.log(`\nResults: ${passed} passed, ${failed} failed`);
process.exit(failed > 0 ? 1 : 0);
}