Files
ukaiautomation/tools/scrapeability-checker.php
root edd491d680 Fix navbar across all pages: add nav include, fonts, active state, spacing, stats, error pages
- Add nav.php include to 5 missing pages (cost-calculator, thank-you, 403, 404, 500)
- Add ErrorDocument directives to .htaccess for custom 403/404/500 pages
- Fix bogus accuracy stats (homepage, web-scraping, location pages)
- Fix invisible CTA buttons on property and financial service pages
- Add Google Fonts (Roboto Slab + Lato) to all pages missing it (tools, blog articles, error pages)
- Add active nav link highlighting (teal underline for current page)
- Improve footer contrast to WCAG AA, equal-height cards, mobile text scaling
- Consistent navbar-to-content spacing across all pages
- Bump cache version to v1.1.3
2026-02-11 07:15:11 +00:00

396 lines
15 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
// Page-level SEO metadata. Each value is HTML-escaped and echoed into the
// <title>, description, canonical and Open Graph tags in the <head> below.
$page_title = 'Free Website Scrapeability Checker | UK Data Services';
$page_description = 'Check if a website can be scraped. Our free tool analyzes technical complexity, '
    . 'JavaScript requirements, and provides expert recommendations for data extraction.';
$canonical_url = 'https://ukdataservices.co.uk/tools/scrapeability-checker';
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title><?php echo htmlspecialchars($page_title); ?></title>
<meta name="description" content="<?php echo htmlspecialchars($page_description); ?>">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<!-- Lato is a static family that only ships weights 100/300/400/700/900. The css2 API is strict about unavailable weights, so only existing Lato weights are requested here (Roboto Slab is unchanged). -->
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@100;200;300;400;500;600;700;800;900&family=Lato:wght@100;300;400;700;900&display=swap" rel="stylesheet">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($page_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($page_description); ?>">
<meta property="og:type" content="website">
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
<link rel="stylesheet" href="../assets/css/main.css">
<!-- SoftwareApplication Schema -->
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "SoftwareApplication",
"name": "Website Scrapeability Checker",
"description": "Free tool to check if a website can be scraped and assess technical complexity",
"url": "https://ukdataservices.co.uk/tools/scrapeability-checker",
"applicationCategory": "BusinessApplication",
"operatingSystem": "Web Browser",
"offers": {
"@type": "Offer",
"price": "0",
"priceCurrency": "GBP"
},
"provider": {
"@type": "Organization",
"name": "UK Data Services",
"url": "https://ukdataservices.co.uk"
}
}
</script>
<style>
.checker-container {
max-width: 800px;
margin: 0 auto;
padding: 40px 20px;
}
.checker-header {
text-align: center;
margin-bottom: 40px;
}
.checker-header h1 {
font-size: 2.2em;
color: #1a1a2e;
margin-bottom: 15px;
}
.checker-header p {
color: #666;
font-size: 1.1em;
}
.checker-card {
background: #fff;
border-radius: 12px;
box-shadow: 0 4px 20px rgba(0,0,0,0.08);
padding: 40px;
}
.url-input-group {
display: flex;
gap: 12px;
margin-bottom: 30px;
}
.url-input-group input {
flex: 1;
padding: 16px;
border: 2px solid #e0e0e0;
border-radius: 8px;
font-size: 1em;
}
.url-input-group input:focus {
border-color: #179e83;
outline: none;
}
.url-input-group button {
background: #179e83;
color: white;
border: none;
padding: 16px 32px;
border-radius: 8px;
font-weight: 600;
cursor: pointer;
transition: background 0.3s;
}
.url-input-group button:hover {
background: #148a72;
}
.url-input-group button:disabled {
background: #ccc;
cursor: not-allowed;
}
#results {
display: none;
}
.result-section {
padding: 25px;
background: #f8f9fa;
border-radius: 8px;
margin-bottom: 20px;
}
.result-section h3 {
color: #1a1a2e;
margin-bottom: 15px;
display: flex;
align-items: center;
gap: 10px;
}
.score-badge {
display: inline-block;
padding: 8px 16px;
border-radius: 20px;
font-weight: 700;
font-size: 1.1em;
}
.score-easy { background: #e8f5e9; color: #2e7d32; }
.score-medium { background: #fff3e0; color: #ef6c00; }
.score-hard { background: #ffebee; color: #c62828; }
.factor-list {
list-style: none;
padding: 0;
}
.factor-list li {
padding: 10px 0;
border-bottom: 1px solid #e0e0e0;
display: flex;
justify-content: space-between;
align-items: center;
}
.factor-list li:last-child {
border-bottom: none;
}
.factor-status {
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 600;
}
.status-good { background: #e8f5e9; color: #2e7d32; }
.status-warn { background: #fff3e0; color: #ef6c00; }
.status-bad { background: #ffebee; color: #c62828; }
.cta-section {
text-align: center;
padding: 30px;
background: linear-gradient(135deg, #144784 0%, #179e83 100%);
border-radius: 8px;
color: white;
}
.cta-section h3 {
margin-bottom: 10px;
}
.cta-section p {
opacity: 0.9;
margin-bottom: 20px;
}
.cta-section a {
display: inline-block;
background: white;
color: #144784;
padding: 14px 28px;
border-radius: 6px;
text-decoration: none;
font-weight: 600;
}
.loading {
text-align: center;
padding: 40px;
}
.loading .spinner {
width: 40px;
height: 40px;
border: 4px solid #e0e0e0;
border-top-color: #179e83;
border-radius: 50%;
animation: spin 1s linear infinite;
margin: 0 auto 15px;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
.breadcrumb {
padding: 15px 20px;
background: #f5f5f5;
font-size: 0.9em;
}
.breadcrumb a { color: #144784; text-decoration: none; }
.breadcrumb span { color: #888; margin: 0 8px; }
</style>
</head>
<body>
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?>
<!-- Breadcrumb trail. The separator glyphs are decorative, so they are hidden from assistive technology; the nav is labelled to distinguish it from the main site navigation. -->
<nav class="breadcrumb" aria-label="Breadcrumb">
<a href="/">Home</a> <span aria-hidden="true">›</span> <a href="/tools/">Tools</a> <span aria-hidden="true">›</span> Scrapeability Checker
</nav>
<div class="checker-container">
<div class="checker-header">
<h1>🔍 Website Scrapeability Checker</h1>
<p>Enter a URL to analyze if it can be scraped and understand the technical complexity involved.</p>
</div>
<div class="checker-card">
<div class="url-input-group">
<input type="url" id="urlInput" placeholder="https://example.com" required>
<button onclick="checkWebsite()" id="checkBtn">Check Website</button>
</div>
<div id="loading" style="display: none;" class="loading">
<div class="spinner"></div>
<p>Analyzing website...</p>
</div>
<div id="results">
<div class="result-section">
<h3>📊 Overall Assessment</h3>
<p>Scrapeability Score: <span id="scoreText" class="score-badge"></span></p>
<p id="summaryText" style="margin-top: 15px; color: #666;"></p>
</div>
<div class="result-section">
<h3>🔧 Technical Factors</h3>
<ul class="factor-list" id="factorsList"></ul>
</div>
<div class="result-section">
<h3>💡 Recommendations</h3>
<div id="recommendations"></div>
</div>
<div class="cta-section">
<h3>Want Us to Handle This For You?</h3>
<p>Our experts can build a reliable scraping solution tailored to this website.</p>
<a href="/quote">Get a Free Quote →</a>
</div>
</div>
</div>
<div style="margin-top: 40px; padding: 30px; background: #f8f9fa; border-radius: 12px;">
<h3 style="color: #1a1a2e; margin-bottom: 15px;">How This Tool Works</h3>
<p style="color: #666; line-height: 1.7;">
Our scrapeability checker analyzes several factors that affect data extraction difficulty:
</p>
<ul style="color: #666; margin-top: 15px; padding-left: 20px; line-height: 1.8;">
<li><strong>JavaScript Rendering</strong> — Whether the site requires a full browser to load content</li>
<li><strong>Rate Limiting</strong> — How aggressively the site blocks automated requests</li>
<li><strong>Authentication</strong> — Whether login is required to access data</li>
<li><strong>Data Structure</strong> — How consistently the data is formatted</li>
<li><strong>robots.txt</strong> — The site's crawling policies</li>
</ul>
</div>
</div>
<?php
// Anchor the footer path to this file's directory instead of the process
// working directory, so the include resolves the same way however the
// script is invoked.
include __DIR__ . '/../includes/footer.php';
?>
<script>
/**
 * Validate the URL typed into #urlInput, show the loading state, run the
 * (simulated) analysis and render the results.
 *
 * - Ignores the call if a check is already in progress (button disabled),
 *   so pressing Enter repeatedly cannot start overlapping runs.
 * - Only http: and https: URLs are accepted; `new URL()` on its own would
 *   also accept schemes such as mailto: or javascript:.
 * - The button and spinner are always restored, even if analysis or
 *   rendering throws.
 *
 * @returns {Promise<void>}
 */
async function checkWebsite() {
    const checkBtn = document.getElementById('checkBtn');
    const loading = document.getElementById('loading');
    const results = document.getElementById('results');

    // A check is already running; do not start a second one.
    if (checkBtn.disabled) {
        return;
    }

    const url = document.getElementById('urlInput').value.trim();
    if (!url) {
        alert('Please enter a valid URL');
        return;
    }

    // Validate URL format and restrict to web schemes
    let parsed;
    try {
        parsed = new URL(url);
    } catch {
        alert('Please enter a valid URL (including https://)');
        return;
    }
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
        alert('Please enter a valid URL (including https://)');
        return;
    }

    checkBtn.disabled = true;
    loading.style.display = 'block';
    results.style.display = 'none';

    try {
        // Simulate analysis (in production, this would call a backend API)
        await new Promise(r => setTimeout(r, 2000));

        // Generate analysis based on URL patterns
        const analysis = analyzeUrl(url);
        displayResults(analysis);
        results.style.display = 'block';
    } finally {
        // Always leave the form usable again, whether or not rendering succeeded.
        checkBtn.disabled = false;
        loading.style.display = 'none';
    }
}
/**
 * Produce a heuristic scrapeability assessment for a URL.
 *
 * The assessment is based purely on the hostname: it is compared against
 * two hard-coded lists of well-known sites. No network request is made.
 *
 * Matching is done on domain-label boundaries rather than by raw substring,
 * so "notfacebook.com" or "linkedin.com.evil.org" are not mistaken for the
 * real sites. A list entry ending in "." (e.g. "amazon.") means "this name
 * under any TLD" and matches a label equal to that name followed by further
 * labels; without a public-suffix list this still accepts the name as a
 * subdomain of another site (e.g. "amazon.example.com").
 *
 * @param {string} url - Absolute URL; must be parseable by `new URL()`.
 * @returns {{score: string, scoreClass: string, factors: Array<{name: string, status: string, statusClass: string}>, recommendations: string[], url: string}}
 * @throws {TypeError} If `url` cannot be parsed.
 */
function analyzeUrl(url) {
    // Drop a trailing root dot ("example.com.") so fully-qualified names match too.
    const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');

    // Known difficult sites
    const hardSites = ['linkedin.com', 'facebook.com', 'instagram.com', 'twitter.com', 'amazon.'];
    const mediumSites = ['google.com', 'ebay.', 'zillow.com', 'indeed.com'];

    // True when the hostname is the listed site or a subdomain of it.
    const matchesSite = pattern => pattern.endsWith('.')
        ? hostname.startsWith(pattern) || hostname.includes('.' + pattern)
        : hostname === pattern || hostname.endsWith('.' + pattern);

    let score = 'Easy';
    let scoreClass = 'score-easy';
    let factors = [];
    let recommendations = [];

    // Check for known patterns
    const isHard = hardSites.some(matchesSite);
    const isMedium = mediumSites.some(matchesSite);

    if (isHard) {
        score = 'Complex';
        scoreClass = 'score-hard';
        factors = [
            { name: 'JavaScript Rendering', status: 'Required', statusClass: 'status-warn' },
            { name: 'Anti-Bot Protection', status: 'Strong', statusClass: 'status-bad' },
            { name: 'Rate Limiting', status: 'Aggressive', statusClass: 'status-bad' },
            { name: 'Login Required', status: 'Likely', statusClass: 'status-warn' },
            { name: 'Data Structure', status: 'Dynamic', statusClass: 'status-warn' }
        ];
        recommendations = [
            '⚠️ This site has strong anti-bot measures and requires specialized handling.',
            '🔧 Residential proxies and browser automation are typically required.',
            '📞 We recommend discussing your specific requirements with our team.'
        ];
    } else if (isMedium) {
        score = 'Moderate';
        scoreClass = 'score-medium';
        factors = [
            { name: 'JavaScript Rendering', status: 'Partial', statusClass: 'status-warn' },
            { name: 'Anti-Bot Protection', status: 'Moderate', statusClass: 'status-warn' },
            { name: 'Rate Limiting', status: 'Standard', statusClass: 'status-good' },
            { name: 'Login Required', status: 'Optional', statusClass: 'status-good' },
            { name: 'Data Structure', status: 'Semi-structured', statusClass: 'status-warn' }
        ];
        recommendations = [
            '✓ This site can be scraped with proper techniques.',
            '🔧 May require browser automation for some pages.',
            '⏱️ Respectful rate limiting recommended to avoid blocks.'
        ];
    } else {
        factors = [
            { name: 'JavaScript Rendering', status: 'Minimal', statusClass: 'status-good' },
            { name: 'Anti-Bot Protection', status: 'Basic', statusClass: 'status-good' },
            { name: 'Rate Limiting', status: 'Standard', statusClass: 'status-good' },
            { name: 'Login Required', status: 'No', statusClass: 'status-good' },
            { name: 'Data Structure', status: 'Structured', statusClass: 'status-good' }
        ];
        recommendations = [
            '✅ This site appears straightforward to scrape.',
            '🚀 Standard HTTP requests should work well.',
            '📊 Data extraction can likely be automated efficiently.'
        ];
    }

    return { score, scoreClass, factors, recommendations, url };
}
/**
 * Render an analysis object into the results panel.
 *
 * Writes the score badge (#scoreText), the one-line summary (#summaryText),
 * the technical-factor rows (#factorsList) and the recommendation
 * paragraphs (#recommendations).
 *
 * @param {{score: string, scoreClass: string, factors: Array<{name: string, status: string, statusClass: string}>, recommendations: string[]}} analysis
 */
function displayResults(analysis) {
    const byId = id => document.getElementById(id);
    const { score, scoreClass } = analysis;

    // Summary sentence shown under the badge, keyed by score label.
    const summaryByScore = {
        'Easy': 'This website appears straightforward to scrape with standard tools and techniques.',
        'Moderate': 'This website has some complexity but can be scraped with proper handling.',
        'Complex': 'This website has significant anti-scraping measures requiring specialized expertise.'
    };

    const badge = byId('scoreText');
    badge.textContent = score;
    badge.className = 'score-badge ' + scoreClass;
    byId('summaryText').textContent = summaryByScore[score];

    // One <li> per factor: label on the left, coloured status pill on the right.
    let factorRows = '';
    for (const factor of analysis.factors) {
        factorRows += `\n<li>\n<span>${factor.name}</span>\n<span class="factor-status ${factor.statusClass}">${factor.status}</span>\n</li>\n`;
    }
    byId('factorsList').innerHTML = factorRows;

    // One paragraph per recommendation.
    let recommendationHtml = '';
    for (const text of analysis.recommendations) {
        recommendationHtml += `<p style="margin: 10px 0; color: #444;">${text}</p>`;
    }
    byId('recommendations').innerHTML = recommendationHtml;
}
// Allow Enter key to trigger check.
// Uses 'keydown' because the 'keypress' event is deprecated. Enter presses
// that only confirm an IME composition, and auto-repeat events from a held
// key, are ignored.
document.getElementById('urlInput').addEventListener('keydown', function(e) {
    if (e.key === 'Enter' && !e.isComposing && !e.repeat) checkWebsite();
});
</script>
</body>
</html>