diff --git a/TENDER_CLEANUP_SUMMARY.md b/TENDER_CLEANUP_SUMMARY.md new file mode 100644 index 0000000..cc7ff3c --- /dev/null +++ b/TENDER_CLEANUP_SUMMARY.md @@ -0,0 +1,84 @@ +# Tender URL Cleanup Summary + +**Date:** 2026-02-15 +**Issue:** Apply Now buttons showing 404 errors for URLs like: +`https://www.contractsfinder.service.gov.uk/notice/24dac264-3958-4928-a1ad-675ecd5e203d` + +## Root Cause + +Tender URLs become invalid even before their deadline because: +1. Contracting authorities close tenders early +2. Contracts Finder immediately removes them from the site +3. The TenderRadar database still shows them as "open" + +## Cleanup Results + +**Before cleanup:** +- Total tenders: 626 +- Open tenders: 626 +- Closed tenders: 0 + +**After cleanup:** +- Total tenders: 626 +- **Open tenders: 349** (valid, working URLs) +- **Closed tenders: 277** (removed from source sites) + +**Removal rate: ~44%** of tenders were already removed from their source websites! + +## How Contracts Finder Handles Removals + +- Returns HTTP 200 (not 404!) +- Redirects to: `https://www.contractsfinder.service.gov.uk/syserror/notfound` +- This makes detection tricky - we need to check the final redirect URL + +## Solution Implemented + +### 1. Cleanup Script: `cleanup-invalid-tenders.mjs` + +- Checks tender URLs by making HEAD requests +- Detects removals by checking for `/syserror/` or `/notfound` in the final URL +- Marks removed tenders as "closed" in the database +- Rate-limited to 500ms between requests (be nice to servers) + +### 2. Cron Job (Recommended) + +Add to crontab on VPS: + +```bash +# Run tender cleanup daily at 3am +0 3 * * * cd /home/peter/tenderpilot && /usr/bin/node cleanup-invalid-tenders.mjs >> logs/cleanup.log 2>&1 +``` + +### 3. Dashboard Filter (NEEDS IMPLEMENTATION) + +The API already filters by status, but the dashboard doesn't pass the filter. Update `/public/dashboard.html`: + +Currently the API query at line 1089 doesn't include a status filter. 
import pg from 'pg';
import dotenv from 'dotenv';

dotenv.config();

// FIXME(security): the fallback connection string embeds a database password
// in source control. It is kept only so deployments that do not yet define
// DATABASE_URL keep working; rotate the password and remove the fallback once
// DATABASE_URL is set everywhere.
const pool = new pg.Pool({
  connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:jqrmilIBr6imtT0fKS01@localhost:5432/tenderpilot'
});

/** Pause between outbound requests, in milliseconds (be nice to servers). */
const REQUEST_DELAY_MS = 500;
/** Per-request timeout, in milliseconds. */
const REQUEST_TIMEOUT_MS = 10000;
/** Maximum number of title characters printed in log lines. */
const TITLE_PREVIEW_LENGTH = 60;
/** Number of tenders checked per run when CLEANUP_LIMIT is not set. */
const DEFAULT_BATCH_LIMIT = 100;

const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms));

/**
 * Resolve how many tenders to check in one run.
 *
 * @param {string | undefined} raw - Raw CLEANUP_LIMIT value from the environment.
 * @returns {number} A positive integer; DEFAULT_BATCH_LIMIT when `raw` is
 *   missing or not a positive integer.
 */
function resolveBatchLimit(raw = process.env.CLEANUP_LIMIT) {
  const parsed = Number.parseInt(raw ?? '', 10);
  return Number.isInteger(parsed) && parsed > 0 ? parsed : DEFAULT_BATCH_LIMIT;
}

/**
 * Build a short, log-safe preview of a tender title.
 *
 * @param {unknown} title - Title column value; may be null in the database.
 * @returns {string} At most TITLE_PREVIEW_LENGTH characters.
 */
function titlePreview(title) {
  return String(title ?? '(untitled)').slice(0, TITLE_PREVIEW_LENGTH);
}

/**
 * Decide whether a response means the notice no longer exists.
 *
 * Some sources answer removed notices with HTTP 200 after redirecting to an
 * error page, so the final URL is inspected as well as the status code.
 *
 * @param {number} status - Final HTTP status code.
 * @param {string} finalUrl - URL of the response after following redirects.
 * @returns {boolean} True when the tender should be marked as closed.
 */
function isRemovedNotice(status, finalUrl) {
  return (
    status === 404 ||
    status === 410 ||
    finalUrl.includes('/syserror/') ||
    finalUrl.includes('/notfound')
  );
}

/**
 * Check the notice URL of open tenders and mark tenders whose notice has been
 * removed from the source site as "closed".
 *
 * Tenders are checked sequentially with a pause between requests. Failures on
 * a single tender are logged and counted but never abort the run. If the run
 * itself fails (for example the initial query), the process exit code is set
 * to 1 so schedulers such as cron can detect it. The connection pool is always
 * closed before returning.
 *
 * NOTE: the query always selects the newest open tenders, so with a limit
 * smaller than the number of valid open tenders the older ones are never
 * reached. Raise CLEANUP_LIMIT to cover the full backlog.
 *
 * @returns {Promise<void>}
 */
async function cleanupInvalidTenders() {
  try {
    console.log(`[${new Date().toISOString()}] Starting tender URL validation cleanup...`);

    // Get open tenders with URLs, newest first.
    const result = await pool.query(
      `
      SELECT id, title, notice_url, source
      FROM tenders
      WHERE status = 'open'
        AND notice_url IS NOT NULL
        AND notice_url != ''
      ORDER BY created_at DESC
      LIMIT $1
    `,
      [resolveBatchLimit()]
    );

    const total = result.rows.length;
    console.log(`Found ${total} tenders to check\n`);

    let checked = 0;
    let removed = 0;
    let errors = 0;

    for (const tender of result.rows) {
      checked++;
      const progress = `[${checked}/${total}]`;
      // Computed outside the try block so a NULL title can never throw from
      // inside the error handler and abort the whole run.
      const title = titlePreview(tender.title);

      try {
        const response = await fetch(tender.notice_url, {
          method: 'HEAD',
          redirect: 'follow',
          signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS)
        });

        const status = response.status;
        const finalUrl = response.url;

        if (isRemovedNotice(status, finalUrl)) {
          console.log(`  ${progress} REMOVING: ${title}`);
          console.log(`    URL: ${tender.notice_url}`);
          console.log(`    Status: ${status}, Final URL: ${finalUrl}`);

          await pool.query(
            'UPDATE tenders SET status = $1 WHERE id = $2',
            ['closed', tender.id]
          );

          removed++;
        } else if (status >= 400) {
          // Other client/server errors (403, 429, 5xx, ...) are not proof of
          // removal, so the tender stays open and is only counted.
          console.log(`  ${progress} ERROR ${status}: ${title}`);
          errors++;
        }
      } catch (error) {
        // Covers network failures, timeouts, invalid URLs and a failed UPDATE.
        console.log(`  ${progress} FETCH ERROR: ${title}`);
        console.log(`    ${error.message}`);
        errors++;
      }

      // Rate-limit after every request, including failed ones, but do not
      // wait pointlessly after the last tender.
      if (checked < total) {
        await delay(REQUEST_DELAY_MS);
      }
    }

    console.log(`\nCleanup complete:`);
    console.log(`  Checked: ${checked}`);
    console.log(`  Removed: ${removed}`);
    console.log(`  Errors: ${errors}`);
    console.log(`  Still valid: ${checked - removed - errors}`);

  } catch (error) {
    console.error('Cleanup failed:', error);
    // Signal failure to the caller (cron) without cutting the finally block short.
    process.exitCode = 1;
  } finally {
    await pool.end();
  }
}

await cleanupInvalidTenders();