Files
ukaiautomation/tools/scrapeability-checker.php

396 lines
15 KiB
PHP
Raw Normal View History

<?php
// Page-level SEO metadata. Each value is HTML-escaped and echoed into the
// <title>, description, canonical and Open Graph tags in the <head> below.
$page_title = 'Free Website Scrapeability Checker | UK Data Services';
$page_description = 'Check if a website can be scraped. Our free tool analyzes technical complexity, '
    . 'JavaScript requirements, and provides expert recommendations for data extraction.';
$canonical_url = 'https://ukdataservices.co.uk/tools/scrapeability-checker';
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title><?php echo htmlspecialchars($page_title); ?></title>
<meta name="description" content="<?php echo htmlspecialchars($page_description); ?>">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@100;200;300;400;500;600;700;800;900&family=Lato:wght@100;200;300;400;500;600;700;800;900&display=swap" rel="stylesheet">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($page_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($page_description); ?>">
<meta property="og:type" content="website">
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
<link rel="stylesheet" href="../assets/css/main.css?v=20260222">
<!-- SoftwareApplication Schema -->
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "SoftwareApplication",
"name": "Website Scrapeability Checker",
"description": "Free tool to check if a website can be scraped and assess technical complexity",
"url": "https://ukdataservices.co.uk/tools/scrapeability-checker",
"applicationCategory": "BusinessApplication",
"operatingSystem": "Web Browser",
"offers": {
"@type": "Offer",
"price": "0",
"priceCurrency": "GBP"
},
"provider": {
"@type": "Organization",
"name": "UK Data Services",
"url": "https://ukdataservices.co.uk"
}
}
</script>
<style>
.checker-container {
max-width: 800px;
margin: 0 auto;
padding: 40px 20px;
}
.checker-header {
text-align: center;
margin-bottom: 40px;
}
.checker-header h1 {
font-size: 2.2em;
color: #1a1a2e;
margin-bottom: 15px;
}
.checker-header p {
color: #666;
font-size: 1.1em;
}
.checker-card {
background: #fff;
border-radius: 12px;
box-shadow: 0 4px 20px rgba(0,0,0,0.08);
padding: 40px;
}
.url-input-group {
display: flex;
gap: 12px;
margin-bottom: 30px;
}
.url-input-group input {
flex: 1;
padding: 16px;
border: 2px solid #e0e0e0;
border-radius: 8px;
font-size: 1em;
}
.url-input-group input:focus {
border-color: #179e83;
outline: none;
}
.url-input-group button {
background: #179e83;
color: white;
border: none;
padding: 16px 32px;
border-radius: 8px;
font-weight: 600;
cursor: pointer;
transition: background 0.3s;
}
.url-input-group button:hover {
background: #148a72;
}
.url-input-group button:disabled {
background: #ccc;
cursor: not-allowed;
}
#results {
display: none;
}
.result-section {
padding: 25px;
background: #f8f9fa;
border-radius: 8px;
margin-bottom: 20px;
}
.result-section h3 {
color: #1a1a2e;
margin-bottom: 15px;
display: flex;
align-items: center;
gap: 10px;
}
.score-badge {
display: inline-block;
padding: 8px 16px;
border-radius: 20px;
font-weight: 700;
font-size: 1.1em;
}
.score-easy { background: #e8f5e9; color: #2e7d32; }
.score-medium { background: #fff3e0; color: #ef6c00; }
.score-hard { background: #ffebee; color: #c62828; }
.factor-list {
list-style: none;
padding: 0;
}
.factor-list li {
padding: 10px 0;
border-bottom: 1px solid #e0e0e0;
display: flex;
justify-content: space-between;
align-items: center;
}
.factor-list li:last-child {
border-bottom: none;
}
.factor-status {
padding: 4px 12px;
border-radius: 12px;
font-size: 0.85em;
font-weight: 600;
}
.status-good { background: #e8f5e9; color: #2e7d32; }
.status-warn { background: #fff3e0; color: #ef6c00; }
.status-bad { background: #ffebee; color: #c62828; }
.cta-section {
text-align: center;
padding: 30px;
background: linear-gradient(135deg, #144784 0%, #179e83 100%);
border-radius: 8px;
color: white;
}
.cta-section h3 {
margin-bottom: 10px;
}
.cta-section p {
opacity: 0.9;
margin-bottom: 20px;
}
.cta-section a {
display: inline-block;
background: white;
color: #144784;
padding: 14px 28px;
border-radius: 6px;
text-decoration: none;
font-weight: 600;
}
.loading {
text-align: center;
padding: 40px;
}
.loading .spinner {
width: 40px;
height: 40px;
border: 4px solid #e0e0e0;
border-top-color: #179e83;
border-radius: 50%;
animation: spin 1s linear infinite;
margin: 0 auto 15px;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
.breadcrumb {
padding: 15px 20px;
background: #f5f5f5;
font-size: 0.9em;
}
.breadcrumb a { color: #144784; text-decoration: none; }
.breadcrumb span { color: #888; margin: 0 8px; }
</style>
</head>
<body>
<?php /* Shared site navigation, located from the web server's document root. */ include $_SERVER['DOCUMENT_ROOT'] . '/includes/nav.php'; ?>
<nav class="breadcrumb">
<a href="/">Home</a> <span>&rsaquo;</span> <a href="/tools/">Tools</a> <span>&rsaquo;</span> Scrapeability Checker
</nav>
<div class="checker-container">
<div class="checker-header">
<h1>🔍 Website Scrapeability Checker</h1>
<p>Enter a URL to analyze if it can be scraped and understand the technical complexity involved.</p>
</div>
<div class="checker-card">
<div class="url-input-group">
<input type="url" id="urlInput" placeholder="https://example.com" required>
<button onclick="checkWebsite()" id="checkBtn">Check Website</button>
</div>
<div id="loading" style="display: none;" class="loading">
<div class="spinner"></div>
<p>Analyzing website...</p>
</div>
<div id="results">
<div class="result-section">
<h3>📊 Overall Assessment</h3>
<p>Scrapeability Score: <span id="scoreText" class="score-badge"></span></p>
<p id="summaryText" style="margin-top: 15px; color: #666;"></p>
</div>
<div class="result-section">
<h3>🔧 Technical Factors</h3>
<ul class="factor-list" id="factorsList"></ul>
</div>
<div class="result-section">
<h3>💡 Recommendations</h3>
<div id="recommendations"></div>
</div>
<div class="cta-section">
<h3>Want Us to Handle This For You?</h3>
<p>Our experts can build a reliable scraping solution tailored to this website.</p>
<a href="/quote">Get a Free Quote &rarr;</a>
</div>
</div>
</div>
<div style="margin-top: 40px; padding: 30px; background: #f8f9fa; border-radius: 12px;">
<h3 style="color: #1a1a2e; margin-bottom: 15px;">How This Tool Works</h3>
<p style="color: #666; line-height: 1.7;">
Our scrapeability checker analyzes several factors that affect data extraction difficulty:
</p>
<ul style="color: #666; margin-top: 15px; padding-left: 20px; line-height: 1.8;">
<li><strong>JavaScript Rendering</strong> &ndash; Whether the site requires a full browser to load content</li>
<li><strong>Rate Limiting</strong> &ndash; How aggressively the site blocks automated requests</li>
<li><strong>Authentication</strong> &ndash; Whether login is required to access data</li>
<li><strong>Data Structure</strong> &ndash; How consistently the data is formatted</li>
<li><strong>robots.txt</strong> &ndash; The site's crawling policies</li>
</ul>
</div>
</div>
<?php
// Shared site footer. The path is anchored to this file's own directory:
// a bare "../" path is resolved against the process's current working
// directory, which is not guaranteed to be this script's directory (e.g. when
// the page is run from the CLI or included from a script in another folder).
include __DIR__ . '/../includes/footer.php';
?>
<script>
/**
 * Handler for the "Check Website" button and the Enter key.
 *
 * Validates the URL typed into #urlInput, shows the loading spinner while the
 * (currently simulated) analysis runs, then renders the outcome through
 * displayResults(). Invalid input is reported with alert() and leaves the UI
 * untouched.
 *
 * @returns {Promise<void>} Resolves once the UI has been updated.
 */
async function checkWebsite() {
    const checkBtn = document.getElementById('checkBtn');
    const loading = document.getElementById('loading');
    const results = document.getElementById('results');

    // The button is disabled for the duration of a run. Callers other than a
    // click (e.g. the Enter-key listener) are not stopped by that, so bail out
    // here to avoid overlapping runs.
    if (checkBtn.disabled) {
        return;
    }

    const url = document.getElementById('urlInput').value.trim();
    if (!url) {
        alert('Please enter a valid URL');
        return;
    }

    // Validate URL format
    let parsed;
    try {
        parsed = new URL(url);
    } catch {
        alert('Please enter a valid URL (including https://)');
        return;
    }
    // new URL() also accepts schemes such as mailto:, javascript: or ftp:;
    // only web pages make sense for this tool.
    if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
        alert('Please enter a valid URL (including https://)');
        return;
    }

    checkBtn.disabled = true;
    loading.style.display = 'block';
    results.style.display = 'none';

    try {
        // Simulate analysis (in production, this would call a backend API)
        await new Promise(r => setTimeout(r, 2000));
        // Generate analysis based on URL patterns
        displayResults(analyzeUrl(url));
        results.style.display = 'block';
    } finally {
        // Always restore the controls, even if analysis or rendering threw,
        // so the user is never left with a dead button and an endless spinner.
        checkBtn.disabled = false;
        loading.style.display = 'none';
    }
}
/**
 * Produces a heuristic scrapeability assessment from the URL's hostname alone.
 * No network request is made: the hostname is compared against two short
 * lists of well-known sites and one of three canned profiles is returned.
 *
 * @param {string} url Absolute URL; must be parseable by `new URL()`.
 * @returns {{score: string, scoreClass: string,
 *            factors: Array<{name: string, status: string, statusClass: string}>,
 *            recommendations: string[], url: string}}
 *          `score` is 'Easy', 'Moderate' or 'Complex'; `url` is the input unchanged.
 * @throws {TypeError} If `url` cannot be parsed.
 */
function analyzeUrl(url) {
    // Lower-case and drop the optional trailing root dot ("linkedin.com.").
    const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');

    // Entries ending in '.' are brand labels that exist under many TLDs
    // (amazon.com, amazon.co.uk, ...); the others are exact registered domains.
    // Known difficult sites ('x.com' is the current home of twitter.com).
    const hardSites = ['linkedin.com', 'facebook.com', 'instagram.com', 'twitter.com', 'x.com', 'amazon.'];
    const mediumSites = ['google.com', 'ebay.', 'zillow.com', 'indeed.com'];

    // Match only on domain-label boundaries. A plain substring test would also
    // flag unrelated hosts such as "rebay.com" or "notlinkedin.com".
    const matchesSite = pattern => {
        if (pattern.endsWith('.')) {
            return hostname.startsWith(pattern) || hostname.includes('.' + pattern);
        }
        return hostname === pattern || hostname.endsWith('.' + pattern);
    };

    let score = 'Easy';
    let scoreClass = 'score-easy';
    let factors = [];
    let recommendations = [];

    // Check for known patterns; the hard list takes precedence.
    if (hardSites.some(matchesSite)) {
        score = 'Complex';
        scoreClass = 'score-hard';
        factors = [
            { name: 'JavaScript Rendering', status: 'Required', statusClass: 'status-warn' },
            { name: 'Anti-Bot Protection', status: 'Strong', statusClass: 'status-bad' },
            { name: 'Rate Limiting', status: 'Aggressive', statusClass: 'status-bad' },
            { name: 'Login Required', status: 'Likely', statusClass: 'status-warn' },
            { name: 'Data Structure', status: 'Dynamic', statusClass: 'status-warn' }
        ];
        recommendations = [
            '⚠️ This site has strong anti-bot measures and requires specialized handling.',
            '🔧 Residential proxies and browser automation are typically required.',
            '📞 We recommend discussing your specific requirements with our team.'
        ];
    } else if (mediumSites.some(matchesSite)) {
        score = 'Moderate';
        scoreClass = 'score-medium';
        factors = [
            { name: 'JavaScript Rendering', status: 'Partial', statusClass: 'status-warn' },
            { name: 'Anti-Bot Protection', status: 'Moderate', statusClass: 'status-warn' },
            { name: 'Rate Limiting', status: 'Standard', statusClass: 'status-good' },
            { name: 'Login Required', status: 'Optional', statusClass: 'status-good' },
            { name: 'Data Structure', status: 'Semi-structured', statusClass: 'status-warn' }
        ];
        recommendations = [
            '✓ This site can be scraped with proper techniques.',
            '🔧 May require browser automation for some pages.',
            '⏱️ Respectful rate limiting recommended to avoid blocks.'
        ];
    } else {
        factors = [
            { name: 'JavaScript Rendering', status: 'Minimal', statusClass: 'status-good' },
            { name: 'Anti-Bot Protection', status: 'Basic', statusClass: 'status-good' },
            { name: 'Rate Limiting', status: 'Standard', statusClass: 'status-good' },
            { name: 'Login Required', status: 'No', statusClass: 'status-good' },
            { name: 'Data Structure', status: 'Structured', statusClass: 'status-good' }
        ];
        recommendations = [
            '✅ This site appears straightforward to scrape.',
            '🚀 Standard HTTP requests should work well.',
            '📊 Data extraction can likely be automated efficiently.'
        ];
    }

    return { score, scoreClass, factors, recommendations, url };
}
/**
 * Renders an analysis object (as returned by analyzeUrl) into the results
 * panel: the score badge, the one-line summary, the technical-factor list and
 * the recommendation paragraphs.
 *
 * All values are inserted as text nodes rather than parsed as HTML, so the
 * function stays safe if the analysis ever comes from a backend instead of the
 * hard-coded profiles.
 *
 * @param {{score: string, scoreClass: string,
 *          factors: Array<{name: string, status: string, statusClass: string}>,
 *          recommendations: string[]}} analysis
 */
function displayResults(analysis) {
    const scoreBadge = document.getElementById('scoreText');
    scoreBadge.textContent = analysis.score;
    scoreBadge.className = 'score-badge ' + analysis.scoreClass;

    const summaries = {
        'Easy': 'This website appears straightforward to scrape with standard tools and techniques.',
        'Moderate': 'This website has some complexity but can be scraped with proper handling.',
        'Complex': 'This website has significant anti-scraping measures requiring specialized expertise.'
    };
    document.getElementById('summaryText').textContent = summaries[analysis.score];

    // Rebuild the factor list: <li><span>name</span><span class="factor-status ...">status</span></li>
    const factorsList = document.getElementById('factorsList');
    factorsList.textContent = ''; // remove rows from a previous run
    for (const factor of analysis.factors) {
        const label = document.createElement('span');
        label.textContent = factor.name;

        const status = document.createElement('span');
        status.className = 'factor-status ' + factor.statusClass;
        status.textContent = factor.status;

        const row = document.createElement('li');
        row.append(label, status);
        factorsList.appendChild(row);
    }

    // Rebuild the recommendations, one paragraph each.
    const recommendationsBox = document.getElementById('recommendations');
    recommendationsBox.textContent = '';
    for (const text of analysis.recommendations) {
        const paragraph = document.createElement('p');
        paragraph.style.cssText = 'margin: 10px 0; color: #444;';
        paragraph.textContent = text;
        recommendationsBox.appendChild(paragraph);
    }
}
// Allow Enter key to trigger check.
// Uses 'keydown' because the 'keypress' event is deprecated.
document.getElementById('urlInput').addEventListener('keydown', function (e) {
    // Skip other keys, auto-repeat while Enter is held down, and Enter presses
    // that only confirm an IME composition.
    if (e.key !== 'Enter' || e.repeat || e.isComposing) {
        return;
    }
    checkWebsite();
});
</script>
</body>
</html>