Security hardening + new tools deployment
- Hide Apache version (ServerTokens Prod)
- Add Permissions-Policy header
- Remove deprecated X-XSS-Protection
- Consolidate security headers to .htaccess only (remove duplicates from PHP)
- Deploy free tools: robots-analyzer, data-converter
- Deploy tools announcement blog post
- Update sitemap with new tools and blog post
This commit is contained in:
392
tools/scrapeability-checker.php
Normal file
392
tools/scrapeability-checker.php
Normal file
@@ -0,0 +1,392 @@
|
||||
<?php
|
||||
// Page metadata echoed (HTML-escaped) into the <title>, description,
// canonical and Open Graph tags of this page's <head>.
$page_title = 'Free Website Scrapeability Checker | UK Data Services';
$page_description = 'Check if a website can be scraped. '
    . 'Our free tool analyzes technical complexity, JavaScript requirements, '
    . 'and provides expert recommendations for data extraction.';
$canonical_url = 'https://ukdataservices.co.uk/tools/scrapeability-checker';
|
||||
?>
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title><?php echo htmlspecialchars($page_title); ?></title>
|
||||
<meta name="description" content="<?php echo htmlspecialchars($page_description); ?>">
|
||||
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
|
||||
|
||||
<meta property="og:title" content="<?php echo htmlspecialchars($page_title); ?>">
|
||||
<meta property="og:description" content="<?php echo htmlspecialchars($page_description); ?>">
|
||||
<meta property="og:type" content="website">
|
||||
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
|
||||
|
||||
<link rel="stylesheet" href="../assets/css/main.css">
|
||||
|
||||
<!-- SoftwareApplication Schema -->
|
||||
<script type="application/ld+json">
|
||||
{
|
||||
"@context": "https://schema.org",
|
||||
"@type": "SoftwareApplication",
|
||||
"name": "Website Scrapeability Checker",
|
||||
"description": "Free tool to check if a website can be scraped and assess technical complexity",
|
||||
"url": "https://ukdataservices.co.uk/tools/scrapeability-checker",
|
||||
"applicationCategory": "BusinessApplication",
|
||||
"operatingSystem": "Web Browser",
|
||||
"offers": {
|
||||
"@type": "Offer",
|
||||
"price": "0",
|
||||
"priceCurrency": "GBP"
|
||||
},
|
||||
"provider": {
|
||||
"@type": "Organization",
|
||||
"name": "UK Data Services",
|
||||
"url": "https://ukdataservices.co.uk"
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
<style>
|
||||
.checker-container {
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
padding: 40px 20px;
|
||||
}
|
||||
.checker-header {
|
||||
text-align: center;
|
||||
margin-bottom: 40px;
|
||||
}
|
||||
.checker-header h1 {
|
||||
font-size: 2.2em;
|
||||
color: #1a1a2e;
|
||||
margin-bottom: 15px;
|
||||
}
|
||||
.checker-header p {
|
||||
color: #666;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
.checker-card {
|
||||
background: #fff;
|
||||
border-radius: 12px;
|
||||
box-shadow: 0 4px 20px rgba(0,0,0,0.08);
|
||||
padding: 40px;
|
||||
}
|
||||
.url-input-group {
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
margin-bottom: 30px;
|
||||
}
|
||||
.url-input-group input {
|
||||
flex: 1;
|
||||
padding: 16px;
|
||||
border: 2px solid #e0e0e0;
|
||||
border-radius: 8px;
|
||||
font-size: 1em;
|
||||
}
|
||||
.url-input-group input:focus {
|
||||
border-color: #179e83;
|
||||
outline: none;
|
||||
}
|
||||
.url-input-group button {
|
||||
background: #179e83;
|
||||
color: white;
|
||||
border: none;
|
||||
padding: 16px 32px;
|
||||
border-radius: 8px;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: background 0.3s;
|
||||
}
|
||||
.url-input-group button:hover {
|
||||
background: #148a72;
|
||||
}
|
||||
.url-input-group button:disabled {
|
||||
background: #ccc;
|
||||
cursor: not-allowed;
|
||||
}
|
||||
#results {
|
||||
display: none;
|
||||
}
|
||||
.result-section {
|
||||
padding: 25px;
|
||||
background: #f8f9fa;
|
||||
border-radius: 8px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
.result-section h3 {
|
||||
color: #1a1a2e;
|
||||
margin-bottom: 15px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 10px;
|
||||
}
|
||||
.score-badge {
|
||||
display: inline-block;
|
||||
padding: 8px 16px;
|
||||
border-radius: 20px;
|
||||
font-weight: 700;
|
||||
font-size: 1.1em;
|
||||
}
|
||||
.score-easy { background: #e8f5e9; color: #2e7d32; }
|
||||
.score-medium { background: #fff3e0; color: #ef6c00; }
|
||||
.score-hard { background: #ffebee; color: #c62828; }
|
||||
.factor-list {
|
||||
list-style: none;
|
||||
padding: 0;
|
||||
}
|
||||
.factor-list li {
|
||||
padding: 10px 0;
|
||||
border-bottom: 1px solid #e0e0e0;
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
.factor-list li:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
.factor-status {
|
||||
padding: 4px 12px;
|
||||
border-radius: 12px;
|
||||
font-size: 0.85em;
|
||||
font-weight: 600;
|
||||
}
|
||||
.status-good { background: #e8f5e9; color: #2e7d32; }
|
||||
.status-warn { background: #fff3e0; color: #ef6c00; }
|
||||
.status-bad { background: #ffebee; color: #c62828; }
|
||||
.cta-section {
|
||||
text-align: center;
|
||||
padding: 30px;
|
||||
background: linear-gradient(135deg, #144784 0%, #179e83 100%);
|
||||
border-radius: 8px;
|
||||
color: white;
|
||||
}
|
||||
.cta-section h3 {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
.cta-section p {
|
||||
opacity: 0.9;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
.cta-section a {
|
||||
display: inline-block;
|
||||
background: white;
|
||||
color: #144784;
|
||||
padding: 14px 28px;
|
||||
border-radius: 6px;
|
||||
text-decoration: none;
|
||||
font-weight: 600;
|
||||
}
|
||||
.loading {
|
||||
text-align: center;
|
||||
padding: 40px;
|
||||
}
|
||||
.loading .spinner {
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
border: 4px solid #e0e0e0;
|
||||
border-top-color: #179e83;
|
||||
border-radius: 50%;
|
||||
animation: spin 1s linear infinite;
|
||||
margin: 0 auto 15px;
|
||||
}
|
||||
@keyframes spin {
|
||||
to { transform: rotate(360deg); }
|
||||
}
|
||||
.breadcrumb {
|
||||
padding: 15px 20px;
|
||||
background: #f5f5f5;
|
||||
font-size: 0.9em;
|
||||
}
|
||||
.breadcrumb a { color: #144784; text-decoration: none; }
|
||||
.breadcrumb span { color: #888; margin: 0 8px; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<?php
// Shared site navigation. The path is anchored to __DIR__ because a path
// starting with "../" is otherwise resolved against the current working
// directory, which is not guaranteed to be this file's directory.
include __DIR__ . '/../includes/navbar.php';
?>
|
||||
|
||||
<nav class="breadcrumb">
|
||||
<a href="/">Home</a> <span>›</span> <a href="/tools/">Tools</a> <span>›</span> Scrapeability Checker
|
||||
</nav>
|
||||
|
||||
<div class="checker-container">
|
||||
<div class="checker-header">
|
||||
<h1>🔍 Website Scrapeability Checker</h1>
|
||||
<p>Enter a URL to analyze if it can be scraped and understand the technical complexity involved.</p>
|
||||
</div>
|
||||
|
||||
<div class="checker-card">
|
||||
<div class="url-input-group">
    <!-- aria-label supplies the accessible name; a placeholder alone is not announced as a label. -->
    <input type="url" id="urlInput" placeholder="https://example.com" aria-label="Website URL to check" required>
    <!-- Explicit type="button" keeps this a plain action button even if the group is later wrapped in a form. -->
    <button type="button" onclick="checkWebsite()" id="checkBtn">Check Website</button>
</div>
|
||||
|
||||
<div id="loading" style="display: none;" class="loading">
|
||||
<div class="spinner"></div>
|
||||
<p>Analyzing website...</p>
|
||||
</div>
|
||||
|
||||
<div id="results">
|
||||
<div class="result-section">
|
||||
<h3>📊 Overall Assessment</h3>
|
||||
<p>Scrapeability Score: <span id="scoreText" class="score-badge"></span></p>
|
||||
<p id="summaryText" style="margin-top: 15px; color: #666;"></p>
|
||||
</div>
|
||||
|
||||
<div class="result-section">
|
||||
<h3>🔧 Technical Factors</h3>
|
||||
<ul class="factor-list" id="factorsList"></ul>
|
||||
</div>
|
||||
|
||||
<div class="result-section">
|
||||
<h3>💡 Recommendations</h3>
|
||||
<div id="recommendations"></div>
|
||||
</div>
|
||||
|
||||
<div class="cta-section">
|
||||
<h3>Want Us to Handle This For You?</h3>
|
||||
<p>Our experts can build a reliable scraping solution tailored to this website.</p>
|
||||
<a href="/quote">Get a Free Quote →</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div style="margin-top: 40px; padding: 30px; background: #f8f9fa; border-radius: 12px;">
|
||||
<h3 style="color: #1a1a2e; margin-bottom: 15px;">How This Tool Works</h3>
|
||||
<p style="color: #666; line-height: 1.7;">
|
||||
Our scrapeability checker analyzes several factors that affect data extraction difficulty:
|
||||
</p>
|
||||
<ul style="color: #666; margin-top: 15px; padding-left: 20px; line-height: 1.8;">
|
||||
<li><strong>JavaScript Rendering</strong> — Whether the site requires a full browser to load content</li>
|
||||
<li><strong>Rate Limiting</strong> — How aggressively the site blocks automated requests</li>
|
||||
<li><strong>Authentication</strong> — Whether login is required to access data</li>
|
||||
<li><strong>Data Structure</strong> — How consistently the data is formatted</li>
|
||||
<li><strong>robots.txt</strong> — The site's crawling policies</li>
|
||||
</ul>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<?php
// Shared site footer. The path is anchored to __DIR__ because a path
// starting with "../" is otherwise resolved against the current working
// directory, which is not guaranteed to be this file's directory.
include __DIR__ . '/../includes/footer.php';
?>
|
||||
|
||||
<script>
|
||||
/**
 * Validate the URL typed into #urlInput, show the loading state, run the
 * (currently simulated) analysis and render the outcome.
 *
 * Invoked by the "Check Website" button and by the Enter-key listener on the
 * input. Invalid input is reported through alert() and nothing else changes.
 * While a check is in flight (#checkBtn is disabled) further calls are ignored.
 *
 * @returns {Promise<void>} Resolves once the results are rendered; rejects if
 *     analysis or rendering throws (the UI is still reset in that case).
 */
async function checkWebsite() {
    const checkBtn = document.getElementById('checkBtn');
    const loading = document.getElementById('loading');
    const results = document.getElementById('results');

    // The Enter-key path is not blocked by the disabled button, so guard
    // explicitly against starting a second run while one is in progress.
    if (checkBtn.disabled) {
        return;
    }

    const url = document.getElementById('urlInput').value.trim();
    if (!url) {
        alert('Please enter a valid URL');
        return;
    }

    // Validate URL format. new URL() also accepts schemes such as mailto: or
    // javascript:, which have no hostname to analyse, so only http(s) URLs
    // with a non-empty host are accepted.
    let parsed = null;
    try {
        parsed = new URL(url);
    } catch {
        parsed = null;
    }
    const isWebUrl = parsed !== null
        && (parsed.protocol === 'http:' || parsed.protocol === 'https:')
        && parsed.hostname !== '';
    if (!isWebUrl) {
        alert('Please enter a valid URL (including https://)');
        return;
    }

    checkBtn.disabled = true;
    loading.style.display = 'block';
    results.style.display = 'none';

    try {
        // Simulate analysis (in production, this would call a backend API)
        await new Promise(r => setTimeout(r, 2000));

        // Generate analysis based on URL patterns
        const analysis = analyzeUrl(url);
        displayResults(analysis);

        results.style.display = 'block';
    } finally {
        // Always restore the controls, even when analysis or rendering throws,
        // so the button is never left disabled with the spinner showing.
        checkBtn.disabled = false;
        loading.style.display = 'none';
    }
}
|
||||
|
||||
/**
 * Classify a URL as Easy / Moderate / Complex to scrape using a static list
 * of known hostnames. No network request is made; only the hostname is used.
 *
 * @param {string} url  Absolute URL; must be parseable by the URL constructor
 *     (a TypeError is thrown otherwise).
 * @returns {{score: string, scoreClass: string,
 *            factors: Array<{name: string, status: string, statusClass: string}>,
 *            recommendations: string[], url: string}}
 */
function analyzeUrl(url) {
    // Drop a trailing root dot ("example.com.") so it compares like "example.com".
    const hostname = new URL(url).hostname.toLowerCase().replace(/\.$/, '');

    // Known difficult sites. Entries ending in "." are brand prefixes for
    // sites that operate under many TLDs (amazon.co.uk, ebay.de, ...).
    const hardSites = ['linkedin.com', 'facebook.com', 'instagram.com', 'twitter.com', 'amazon.'];
    const mediumSites = ['google.com', 'ebay.', 'zillow.com', 'indeed.com'];

    // Match on domain-label boundaries rather than raw substrings, so that
    // "freebay.com" is not treated as eBay and "notlinkedin.com" or
    // "linkedin.com.example.org" are not treated as LinkedIn.
    const matchesSite = (pattern) => {
        if (pattern.endsWith('.')) {
            // Brand prefix: the brand must be a complete label followed by
            // further labels. Without a public-suffix list this still accepts
            // "amazon.<anything>", which is a known limitation.
            return hostname.startsWith(pattern) || hostname.includes('.' + pattern);
        }
        // Full domain: the host itself or any subdomain of it.
        return hostname === pattern || hostname.endsWith('.' + pattern);
    };

    // Check for known patterns
    const isHard = hardSites.some(matchesSite);
    const isMedium = mediumSites.some(matchesSite);

    let score = 'Easy';
    let scoreClass = 'score-easy';
    let factors = [];
    let recommendations = [];

    if (isHard) {
        score = 'Complex';
        scoreClass = 'score-hard';
        factors = [
            { name: 'JavaScript Rendering', status: 'Required', statusClass: 'status-warn' },
            { name: 'Anti-Bot Protection', status: 'Strong', statusClass: 'status-bad' },
            { name: 'Rate Limiting', status: 'Aggressive', statusClass: 'status-bad' },
            { name: 'Login Required', status: 'Likely', statusClass: 'status-warn' },
            { name: 'Data Structure', status: 'Dynamic', statusClass: 'status-warn' }
        ];
        recommendations = [
            '⚠️ This site has strong anti-bot measures and requires specialized handling.',
            '🔧 Residential proxies and browser automation are typically required.',
            '📞 We recommend discussing your specific requirements with our team.'
        ];
    } else if (isMedium) {
        score = 'Moderate';
        scoreClass = 'score-medium';
        factors = [
            { name: 'JavaScript Rendering', status: 'Partial', statusClass: 'status-warn' },
            { name: 'Anti-Bot Protection', status: 'Moderate', statusClass: 'status-warn' },
            { name: 'Rate Limiting', status: 'Standard', statusClass: 'status-good' },
            { name: 'Login Required', status: 'Optional', statusClass: 'status-good' },
            { name: 'Data Structure', status: 'Semi-structured', statusClass: 'status-warn' }
        ];
        recommendations = [
            '✓ This site can be scraped with proper techniques.',
            '🔧 May require browser automation for some pages.',
            '⏱️ Respectful rate limiting recommended to avoid blocks.'
        ];
    } else {
        // Hosts not on either list fall through to the optimistic default.
        factors = [
            { name: 'JavaScript Rendering', status: 'Minimal', statusClass: 'status-good' },
            { name: 'Anti-Bot Protection', status: 'Basic', statusClass: 'status-good' },
            { name: 'Rate Limiting', status: 'Standard', statusClass: 'status-good' },
            { name: 'Login Required', status: 'No', statusClass: 'status-good' },
            { name: 'Data Structure', status: 'Structured', statusClass: 'status-good' }
        ];
        recommendations = [
            '✅ This site appears straightforward to scrape.',
            '🚀 Standard HTTP requests should work well.',
            '📊 Data extraction can likely be automated efficiently.'
        ];
    }

    return { score, scoreClass, factors, recommendations, url };
}
|
||||
|
||||
/**
 * Render an analysis object into the results panel.
 *
 * Writes the score badge (#scoreText), the summary sentence (#summaryText),
 * one row per technical factor (#factorsList) and one paragraph per
 * recommendation (#recommendations). It does not change the visibility of
 * the panel itself.
 *
 * @param {{score: string, scoreClass: string,
 *          factors: Array<{name: string, status: string, statusClass: string}>,
 *          recommendations: string[]}} analysis  Analysis to display.
 */
function displayResults(analysis) {
    // Every value interpolated into innerHTML below is escaped, so the markup
    // stays safe if the analysis ever comes from a backend response instead
    // of the hard-coded tables.
    const htmlEntities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
    const escapeHtml = (value) => String(value).replace(/[&<>"']/g, (ch) => htmlEntities[ch]);

    const scoreBadge = document.getElementById('scoreText');
    scoreBadge.textContent = analysis.score;
    scoreBadge.className = 'score-badge ' + analysis.scoreClass;

    const summaries = {
        'Easy': 'This website appears straightforward to scrape with standard tools and techniques.',
        'Moderate': 'This website has some complexity but can be scraped with proper handling.',
        'Complex': 'This website has significant anti-scraping measures requiring specialized expertise.'
    };
    // An unrecognised score yields an empty summary rather than undefined.
    document.getElementById('summaryText').textContent = summaries[analysis.score] || '';

    const factorsList = document.getElementById('factorsList');
    factorsList.innerHTML = analysis.factors.map(f => `
        <li>
            <span>${escapeHtml(f.name)}</span>
            <span class="factor-status ${escapeHtml(f.statusClass)}">${escapeHtml(f.status)}</span>
        </li>
    `).join('');

    document.getElementById('recommendations').innerHTML = analysis.recommendations.map(r =>
        `<p style="margin: 10px 0; color: #444;">${escapeHtml(r)}</p>`
    ).join('');
}
|
||||
|
||||
// Allow Enter key to trigger check.
// 'keydown' is used because the 'keypress' event is deprecated.
document.getElementById('urlInput').addEventListener('keydown', function (e) {
    // Ignore Enter presses that only confirm an IME composition, and the
    // auto-repeat events generated while the key is held down.
    if (e.key !== 'Enter' || e.isComposing || e.repeat) {
        return;
    }
    checkWebsite();
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
Reference in New Issue
Block a user