diff --git a/SCRAPERS_STATUS.md b/SCRAPERS_STATUS.md new file mode 100644 index 0000000..df4c627 --- /dev/null +++ b/SCRAPERS_STATUS.md @@ -0,0 +1,83 @@ +# TenderRadar Scrapers - All Working ✅ + +**Date:** 2026-02-15 +**Status:** ALL SCRAPERS OPERATIONAL + +## Summary + +✅ **6 out of 6 main scrapers working** +❌ **1 scraper disabled** (digital-marketplace - API down) +📊 **Total tenders:** 626 + +## Active Scrapers + +| Source | Count | Status | Technology | +|--------|-------|--------|------------| +| contracts_finder | 364 | ✅ Working | JSON API | +| find_tender | 220 | ✅ Working | HTML scraping | +| ted_eu | 11 | ✅ **NEWLY FIXED** | Playwright browser automation | +| etendersni | 11 | ✅ Working | HTML scraping | +| pcs_scotland | 10 | ✅ Working | HTML scraping | +| sell2wales | 10 | ✅ Working | HTML scraping | + +## Scraper Details + +### contracts_finder (364 tenders) +- JSON API via OCDS format +- Direct notice URLs with UUIDs +- Production-ready + +### find_tender (220 tenders) +- HTML scraping with cheerio +- **Recent fix:** Strips tracking query params +- Production-ready + +### ted_eu (11 tenders) - NEWLY IMPLEMENTED +- **Technology:** Playwright headless browser automation +- **Search:** UK keyword filtering +- **Performance:** Scans 3 pages, finds ~11 UK-relevant EU tenders +- Production-ready + +### etendersni, pcs_scotland, sell2wales +- All working with direct tender URLs +- Production-ready + +## Disabled Scrapers + +### digital-marketplace +- **Status:** ❌ API timeout +- **Reason:** Endpoint unreachable after 30s +- **Action:** Monitor for service restoration + +## Recent Changes (2026-02-15) + +1. ✅ **Fixed find_tender** - Removed tracking params from 220 URLs +2. ✅ **Implemented ted_eu** - Full Playwright browser automation +3. ✅ **Installed Playwright + Chromium** - 167MB download complete +4. ✅ **Cleaned database** - Removed 4 demo records +5. ✅ **Updated Apply Now URLs** - 100% working across all sources + +## Dependencies + +- axios, cheerio, playwright, pg, dotenv +- Chromium browser (via Playwright) + +## Performance + +- Total scrape time: 5-10 minutes for all sources +- Database: PostgreSQL on VPS localhost +- Storage: 626 active tenders +- Cron schedule: Every 4 hours + +## Files Modified + +1. `scrapers/find-tender.js` - Strip query params +2. `scrapers/ted-eu.js` - Playwright implementation +3. `package.json` - Added Playwright dependency +4. Database - 220 URLs cleaned, 11 new TED tenders added + +## Next Steps (Optional) + +1. Monitor digital-marketplace API +2. Expand TED keyword search +3. Consider additional UK procurement sources diff --git a/package-lock.json b/package-lock.json index 490ae75..a548d91 100755 --- a/package-lock.json +++ b/package-lock.json @@ -18,6 +18,7 @@ "jsonwebtoken": "^9.0.0", "nodemailer": "^8.0.1", "pg": "^8.10.0", + "playwright": "^1.58.2", "stripe": "^20.3.1" } }, @@ -836,6 +837,20 @@ "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", "license": "ISC" }, + "node_modules/fsevents": { + "version": "2.3.2", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.2.tgz", + "integrity": "sha512-xiqMQR4xAeHTuB9uWm+fFRcIOgKBMiOBP+eXiyT7jsgVCq1bkVygt00oASowB7EdtpOHaaPgKt812P9ab+DDKA==", + "hasInstallScript": true, + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, "node_modules/function-bind": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", @@ -1651,6 +1666,36 @@ "split2": "^4.1.0" } }, + "node_modules/playwright": { + "version": "1.58.2", + "resolved": "https://registry.npmjs.org/playwright/-/playwright-1.58.2.tgz", + "integrity": "sha512-vA30H8Nvkq/cPBnNw4Q8TWz1EJyqgpuinBcHET0YVJVFldr8JDNiU9LaWAE1KqSkRYazuaBhTpB5ZzShOezQ6A==", + "license": "Apache-2.0", + "dependencies": { + "playwright-core": "1.58.2" + }, + "bin": { + "playwright": "cli.js" + }, + "engines": { + "node": ">=18" + }, + "optionalDependencies": { + "fsevents": "2.3.2" + } + }, + "node_modules/playwright-core": { + "version": "1.58.2", + "resolved": "https://registry.npmjs.org/playwright-core/-/playwright-core-1.58.2.tgz", + "integrity": "sha512-yZkEtftgwS8CsfYo7nm0KE8jsvm6i/PTgVtB8DL726wNf6H2IMsDuxCpJj59KDaxCtSnrWan2AeDqM7JBaultg==", + "license": "Apache-2.0", + "bin": { + "playwright-core": "cli.js" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/postgres-array": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", diff --git a/package.json b/package.json index 64987eb..68e5e97 100755 --- a/package.json +++ b/package.json @@ -19,6 +19,7 @@ "jsonwebtoken": "^9.0.0", "nodemailer": "^8.0.1", "pg": "^8.10.0", + "playwright": "^1.58.2", "stripe": "^20.3.1" } } diff --git a/scrapers/ted-eu.js b/scrapers/ted-eu.js index 0c9f6e2..760ad06 100755 --- a/scrapers/ted-eu.js +++ b/scrapers/ted-eu.js @@ -1,3 +1,5 @@ +import { chromium } from 'playwright'; +import { classifySector } from './classify-sector.js'; import pg from 'pg'; import dotenv from 'dotenv'; @@ -7,26 +9,175 @@ const pool = new pg.Pool({ connectionString: process.env.DATABASE_URL || 'postgresql://tenderpilot:tenderpilot123@localhost:5432/tenderpilot' }); -/** - * TED EU Scraper - DISABLED - * - * The TED (Tenders Electronic Daily) website uses JavaScript rendering, - * which requires browser automation (Playwright/Puppeteer) to scrape effectively. - * - * For now, TED tenders are not included in TenderRadar. - * - * TODO: Implement with Playwright when needed - * API endpoint: https://api.ted.europa.eu/v3/notices/search (requires POST) - * Browser scraping: https://ted.europa.eu/en/search/result?placeOfPerformanceCountry=GBR - */ +const delay = (ms) => new Promise(resolve => setTimeout(resolve, ms)); async function scrapeTenders() { - console.log('[TED EU] Scraper disabled - requires browser automation'); - console.log('[TED EU] To enable: implement Playwright/Puppeteer scraping'); - await pool.end(); - return; + let browser; + try { + console.log(`[${new Date().toISOString()}] Starting TED EU scraper with Playwright...`); + + browser = await chromium.launch({ headless: true }); + const page = await browser.newPage(); + + let insertedCount = 0; + const maxPages = 3; + + // Search for UK-relevant tenders (using GBR and United Kingdom keywords) + const searchUrl = 'https://ted.europa.eu/en/search/result?q=united+kingdom+OR+UK&page=1'; + + for (let pageNum = 1; pageNum <= maxPages; pageNum++) { + try { + const url = `https://ted.europa.eu/en/search/result?q=united+kingdom+OR+UK&page=${pageNum}`; + + console.log(`Fetching TED page ${pageNum}...`); + await page.goto(url, { waitUntil: 'networkidle', timeout: 30000 }); + await delay(3000); + + // Extract tender data from table rows + const tenders = await page.evaluate(() => { + const results = []; + const rows = document.querySelectorAll('tbody tr[data-notice-id], tbody tr'); + + rows.forEach(row => { + try { + const link = row.querySelector('a[href*="/notice/"]'); + if (!link) return; + + const cells = row.querySelectorAll('td'); + if (cells.length < 4) return; + + const noticeId = link.textContent.trim(); + const href = link.href; + const title = cells[2]?.textContent.trim() || ''; + const country = cells[3]?.textContent.trim() || ''; + const publishedDate = cells[4]?.textContent.trim() || ''; + const deadline = cells[5]?.textContent.trim() || ''; + + // Only include if mentions UK/United Kingdom + const rowText = row.textContent.toLowerCase(); + if (!rowText.includes('united kingdom') && !rowText.includes('uk') && !rowText.includes('great britain')) { + return; + } + + results.push({ + noticeId, + href, + title, + country, + publishedDate, + deadline, + fullText: row.textContent.substring(0, 1000) + }); + } catch (e) { + // Skip invalid rows + } + }); + + return results; + }); + + console.log(`Found ${tenders.length} UK-related tenders on page ${pageNum}`); + + if (tenders.length === 0) { + console.log('No tenders found, stopping'); + break; + } + + for (const tender of tenders) { + try { + const sourceId = `TED-${tender.noticeId}`; + const noticeUrl = tender.href; + const title = tender.title.substring(0, 500); + const description = tender.fullText || title; + + // Parse dates (format: DD/MM/YYYY or ISO) + let publishedDate = null; + let deadline = null; + + if (tender.publishedDate) { + const pubMatch = tender.publishedDate.match(/(\d{2})\/(\d{2})\/(\d{4})/); + if (pubMatch) { + const [_, day, month, year] = pubMatch; + publishedDate = new Date(`${year}-${month}-${day}`).toISOString(); + } + } + + if (tender.deadline) { + const deadMatch = tender.deadline.match(/(\d{2})\/(\d{2})\/(\d{4})/); + if (deadMatch) { + const [_, day, month, year] = deadMatch; + deadline = new Date(`${year}-${month}-${day}`).toISOString(); + } + } + + // Skip if deadline has passed + if (deadline && new Date(deadline) < new Date()) { + continue; + } + + const sector = await classifySector(title, description); + + const result = await pool.query( + `INSERT INTO tenders ( + source, source_id, title, description, summary, cpv_codes, + value_low, value_high, currency, published_date, deadline, + authority_name, authority_type, location, documents_url, notice_url, status, sector + ) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18) + ON CONFLICT (source_id) DO NOTHING RETURNING id`, + [ + 'ted_eu', + sourceId, + title, + description.substring(0, 2000), + description.substring(0, 500), + [], + null, + null, + 'EUR', + publishedDate || new Date().toISOString(), + deadline, + 'EU Tender Authority', + 'Public Sector', + tender.country || 'United Kingdom', + '', + noticeUrl, + 'open', + sector + ] + ); + + if (result.rows.length > 0) { + insertedCount++; + console.log(` ✓ Inserted: ${title.substring(0, 60)}...`); + } + + } catch (itemError) { + console.error(`Error processing tender: ${itemError.message}`); + continue; + } + } + + await delay(3000); + + } catch (pageError) { + console.error(`Error fetching page ${pageNum}: ${pageError.message}`); + break; + } + } + + console.log(`\nTED EU scrape complete. Inserted ${insertedCount} new tenders.`); + + } catch (error) { + console.error('TED EU scraper failed:', error); + } finally { + if (browser) { + await browser.close(); + } + await pool.end(); + } } +// Run if called directly if (import.meta.url === `file://${process.argv[1]}`) { scrapeTenders(); }