Files
ukaiautomation/tools/robots-analyzer.php

261 lines
12 KiB
PHP
Raw Normal View History

<?php
// Page-level SEO metadata. Each value is HTML-escaped and echoed into the
// <title>, description, canonical and Open Graph tags in the <head> below.

// Canonical address of this tool page.
$canonical_url = "https://ukdataservices.co.uk/tools/robots-analyzer";

// Text for <title> and og:title.
$page_title = "Free Robots.txt Analyzer | UK Data Services";

// Text for the meta description and og:description.
$page_description = "Analyze any website's robots.txt file instantly. See crawling rules, blocked paths, sitemaps, and get recommendations for web scraping compliance.";
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title><?php echo htmlspecialchars($page_title); ?></title>
<meta name="description" content="<?php echo htmlspecialchars($page_description); ?>">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($page_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($page_description); ?>">
<meta property="og:type" content="website">
<link rel="stylesheet" href="../assets/css/main.css">
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "SoftwareApplication",
"name": "Robots.txt Analyzer",
"description": "Free tool to analyze robots.txt files and understand crawling permissions",
"url": "https://ukdataservices.co.uk/tools/robots-analyzer",
"applicationCategory": "BusinessApplication",
"operatingSystem": "Web Browser",
"offers": { "@type": "Offer", "price": "0", "priceCurrency": "GBP" }
}
</script>
<style>
.analyzer-container { max-width: 900px; margin: 0 auto; padding: 40px 20px; }
.analyzer-header { text-align: center; margin-bottom: 40px; }
.analyzer-header h1 { font-size: 2.2em; color: #1a1a2e; margin-bottom: 15px; }
.analyzer-header p { color: #666; font-size: 1.1em; }
.analyzer-card { background: #fff; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.08); padding: 40px; }
.url-input-group { display: flex; gap: 12px; margin-bottom: 30px; }
.url-input-group input { flex: 1; padding: 16px; border: 2px solid #e0e0e0; border-radius: 8px; font-size: 1em; }
.url-input-group input:focus { border-color: #179e83; outline: none; }
.url-input-group button { background: #179e83; color: white; border: none; padding: 16px 32px; border-radius: 8px; font-weight: 600; cursor: pointer; }
.url-input-group button:hover { background: #148a72; }
.url-input-group button:disabled { background: #ccc; cursor: not-allowed; }
.results-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
@media (max-width: 768px) { .results-grid { grid-template-columns: 1fr; } }
.result-box { background: #f8f9fa; border-radius: 8px; padding: 20px; }
.result-box h3 { color: #1a1a2e; margin-bottom: 15px; font-size: 1.1em; display: flex; align-items: center; gap: 8px; }
.result-box pre { background: #1a1a2e; color: #a5d6a7; padding: 15px; border-radius: 6px; overflow-x: auto; font-size: 0.85em; max-height: 300px; }
.stat-badge { display: inline-block; padding: 6px 12px; border-radius: 15px; font-size: 0.9em; font-weight: 600; margin: 4px; }
.badge-green { background: #e8f5e9; color: #2e7d32; }
.badge-yellow { background: #fff3e0; color: #ef6c00; }
.badge-red { background: #ffebee; color: #c62828; }
.badge-blue { background: #e3f2fd; color: #1565c0; }
.loading { text-align: center; padding: 40px; display: none; }
.spinner { width: 40px; height: 40px; border: 4px solid #e0e0e0; border-top-color: #179e83; border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 15px; }
@keyframes spin { to { transform: rotate(360deg); } }
#results { display: none; }
.breadcrumb { padding: 15px 20px; background: #f5f5f5; font-size: 0.9em; }
.breadcrumb a { color: #144784; text-decoration: none; }
.breadcrumb span { color: #888; margin: 0 8px; }
.path-list { list-style: none; padding: 0; margin: 0; max-height: 200px; overflow-y: auto; }
.path-list li { padding: 8px 12px; border-bottom: 1px solid #e0e0e0; font-family: monospace; font-size: 0.9em; }
.path-list li:last-child { border-bottom: none; }
.cta-box { text-align: center; padding: 30px; background: linear-gradient(135deg, #144784 0%, #179e83 100%); border-radius: 8px; color: white; margin-top: 30px; }
.cta-box a { display: inline-block; background: white; color: #144784; padding: 14px 28px; border-radius: 6px; text-decoration: none; font-weight: 600; }
</style>
</head>
<body>
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?>
<nav class="breadcrumb">
<a href="/">Home</a> <span>&rsaquo;</span> <a href="/tools/">Tools</a> <span>&rsaquo;</span> Robots.txt Analyzer
</nav>
<div class="analyzer-container">
<div class="analyzer-header">
<h1>🤖 Robots.txt Analyzer</h1>
<p>Analyze any website's robots.txt to understand crawling rules and scraping permissions.</p>
</div>
<div class="analyzer-card">
<div class="url-input-group">
<input type="url" id="urlInput" placeholder="https://example.com" required>
<button onclick="analyzeRobots()" id="analyzeBtn">Analyze</button>
</div>
<div id="loading" class="loading">
<div class="spinner"></div>
<p>Fetching and analyzing robots.txt...</p>
</div>
<div id="results">
<div style="margin-bottom: 25px;">
<h3 style="color: #1a1a2e; margin-bottom: 15px;">📊 Quick Summary</h3>
<div id="summaryBadges"></div>
</div>
<div class="results-grid">
<div class="result-box">
<h3>🚫 Blocked Paths</h3>
<ul class="path-list" id="blockedPaths"></ul>
</div>
<div class="result-box">
<h3>✅ Allowed Paths</h3>
<ul class="path-list" id="allowedPaths"></ul>
</div>
</div>
<div class="result-box" style="margin-top: 20px;">
<h3>🗺️ Sitemaps Found</h3>
<ul class="path-list" id="sitemaps"></ul>
</div>
<div class="result-box" style="margin-top: 20px;">
<h3>📄 Raw robots.txt</h3>
<pre id="rawContent"></pre>
</div>
<div class="cta-box">
<h3>Need Help With Compliant Scraping?</h3>
<p style="opacity: 0.9; margin: 10px 0 20px;">We build scrapers that respect robots.txt and follow best practices.</p>
<a href="/quote">Get a Free Quote &rarr;</a>
</div>
</div>
</div>
</div>
<?php
// Resolve the footer relative to this file instead of the process working
// directory: a bare '../…' path is looked up against the CWD, which is not
// guaranteed to be this script's directory when the page is reached through
// a router or another include.
include __DIR__ . '/../includes/footer.php';
?>
<script>
/**
 * Click/Enter handler for the analyzer form.
 *
 * Reads the URL typed into #urlInput, asks the same-origin endpoint
 * /api/fetch-robots.php for that site's robots.txt, and renders either the
 * parsed result or the error reported by the endpoint. While the request is
 * running the Analyze button is disabled and the spinner is shown.
 *
 * @returns {Promise<void>} Resolves once the results panel has been updated.
 */
async function analyzeRobots() {
    const urlInput = document.getElementById('urlInput').value.trim();
    if (!urlInput) { alert('Please enter a URL'); return; }

    let baseUrl;
    try { baseUrl = new URL(urlInput); }
    catch { alert('Please enter a valid URL'); return; }

    // robots.txt only exists for web origins; without this check inputs such
    // as "mailto:x" or "file:///etc" would parse and be sent to the backend.
    if (baseUrl.protocol !== 'http:' && baseUrl.protocol !== 'https:') {
        alert('Please enter a valid URL');
        return;
    }

    const analyzeBtn = document.getElementById('analyzeBtn');
    const loading = document.getElementById('loading');
    const results = document.getElementById('results');

    analyzeBtn.disabled = true;
    loading.style.display = 'block';
    results.style.display = 'none';

    // `origin` keeps an explicit port (e.g. https://example.com:8443), which
    // rebuilding the address from protocol + hostname silently dropped.
    const robotsUrl = `${baseUrl.origin}/robots.txt`;

    try {
        // Use a CORS proxy or backend in production
        const response = await fetch(`/api/fetch-robots.php?url=${encodeURIComponent(robotsUrl)}`);
        const data = await response.json();
        if (data.error) {
            displayError(data.error);
        } else {
            displayResults(data.content, baseUrl.hostname);
        }
    } catch (err) {
        // Fallback: simulate analysis.
        // NOTE(review): this renders sample rules, not the site's real
        // robots.txt. Log the cause so the failure is at least diagnosable.
        console.warn('robots.txt fetch failed; showing sample analysis instead:', err);
        simulateAnalysis(baseUrl.hostname);
    } finally {
        // Always restore the UI, even if rendering itself throws; otherwise
        // the button would stay disabled and the spinner would keep running.
        analyzeBtn.disabled = false;
        loading.style.display = 'none';
        results.style.display = 'block';
    }
}
/**
 * Renders a canned robots.txt for demonstration purposes.
 *
 * The sample text is fixed apart from the two Sitemap lines, which are
 * pointed at the given host. It is passed to displayResults() like real
 * content would be.
 *
 * @param {string} hostname Host name to embed in the sample Sitemap URLs.
 */
function simulateAnalysis(hostname) {
    const siteRoot = `https://${hostname}`;

    // One entry per line of the sample file; joined with "\n" below.
    const demoLines = [
        'User-agent: *',
        'Disallow: /admin/',
        'Disallow: /private/',
        'Disallow: /api/internal/',
        'Allow: /api/public/',
        'Allow: /',
        `Sitemap: ${siteRoot}/sitemap.xml`,
        `Sitemap: ${siteRoot}/sitemap-blog.xml`,
        '# Crawl-delay: 1',
    ];

    displayResults(demoLines.join('\n'), hostname);
}
/**
 * Parses robots.txt text and renders it into the results panel: summary
 * badges, blocked paths, allowed paths, sitemaps and the raw file.
 *
 * Directive names are matched case-insensitively. Rules from every
 * User-agent group are pooled together; groups are not told apart.
 * Everything taken from the file is treated as untrusted and is escaped
 * before being written with innerHTML.
 *
 * @param {string} content  Raw robots.txt text. null/undefined is treated as
 *                          an empty file; other values are stringified.
 * @param {string} hostname Host that was analyzed. Accepted so callers keep
 *                          working; not used by this function.
 */
function displayResults(content, hostname) {
    // Tolerate a missing or non-string payload instead of throwing on split().
    const text = content == null ? '' : String(content);

    const blocked = [], allowed = [], sitemaps = [];
    let crawlDelay = null;

    text.split('\n').forEach(rawLine => {
        // '#' starts a comment in robots.txt, so cut it off before parsing;
        // otherwise "Disallow: /x # note" would list "/x # note" as the path.
        const line = rawLine.split('#')[0].trim();
        const colonAt = line.indexOf(':');
        if (colonAt === -1) return;

        const directive = line.slice(0, colonAt).trim().toLowerCase();
        // Everything after the first colon, so URLs keep their own colons.
        const value = line.slice(colonAt + 1).trim();
        if (!value) return;

        if (directive === 'disallow') blocked.push(value);
        else if (directive === 'allow') allowed.push(value);
        else if (directive === 'sitemap') sitemaps.push(value);
        else if (directive === 'crawl-delay') crawlDelay = value;
    });

    // Summary badges
    let badges = '';
    badges += `<span class="stat-badge badge-blue">${blocked.length} blocked paths</span>`;
    badges += `<span class="stat-badge badge-green">${allowed.length} allowed paths</span>`;
    badges += `<span class="stat-badge badge-blue">${sitemaps.length} sitemaps</span>`;
    // crawlDelay comes straight from the remote file: escape it before it
    // reaches innerHTML.
    if (crawlDelay) badges += `<span class="stat-badge badge-yellow">Crawl delay: ${escapeHtml(crawlDelay)}s</span>`;
    if (blocked.length === 0) badges += `<span class="stat-badge badge-green">Open to crawling</span>`;
    if (blocked.length > 10) badges += `<span class="stat-badge badge-yellow">Many restrictions</span>`;
    document.getElementById('summaryBadges').innerHTML = badges;

    // Blocked paths
    document.getElementById('blockedPaths').innerHTML = blocked.length
        ? blocked.map(p => `<li>${escapeHtml(p)}</li>`).join('')
        : '<li style="color:#888">No blocked paths</li>';

    // Allowed paths
    document.getElementById('allowedPaths').innerHTML = allowed.length
        ? allowed.map(p => `<li>${escapeHtml(p)}</li>`).join('')
        : '<li style="color:#888">No explicit allows (default: all allowed)</li>';

    // Sitemaps: only http(s) URLs become links; anything else is shown as
    // plain text and flagged.
    const sitemapItems = sitemaps.map(s => {
        const safeText = escapeHtml(s);
        if (!/^https?:\/\//i.test(s)) {
            return `<li>${safeText} <span style="color:#c62828">(invalid URL)</span></li>`;
        }
        // The value sits inside a double-quoted attribute, so a literal '"'
        // must never survive, whatever escapeHtml() does with quotes.
        const safeHref = safeText.replace(/"/g, '&quot;');
        return `<li><a href="${safeHref}" target="_blank" rel="noopener">${safeText}</a></li>`;
    });
    document.getElementById('sitemaps').innerHTML = sitemaps.length
        ? sitemapItems.join('')
        : '<li style="color:#888">No sitemaps declared</li>';

    // Raw content (textContent, so no escaping is needed)
    document.getElementById('rawContent').textContent = text;
}
/**
 * Escapes text for safe insertion into HTML, both as element content and
 * inside a quoted attribute value.
 *
 * Quotes are escaped as well as &, < and >, because callers place the
 * result inside double-quoted attributes such as href="…".
 *
 * @param {*} text Value to escape; null/undefined yield an empty string,
 *                 anything else is stringified first.
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
    if (text == null) return '';
    return String(text)
        .replace(/&/g, '&amp;') // must run first so later entities are not double-escaped
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
}
/**
 * Switches the results panel into an error state.
 *
 * Hides the spinner, shows the panel, replaces the summary with a red
 * "Error" badge, puts the escaped message where the blocked paths normally
 * go, empties the other lists and writes the message into the raw view.
 *
 * @param {string} message Human-readable description of what went wrong.
 */
function displayError(message) {
    const byId = id => document.getElementById(id);

    byId('loading').style.display = 'none';
    byId('results').style.display = 'block';

    byId('summaryBadges').innerHTML = '<span class="stat-badge badge-red">Error</span>';
    byId('blockedPaths').innerHTML = `<li style="color:#c62828">${escapeHtml(message)}</li>`;

    // Nothing meaningful to show in the remaining lists.
    byId('allowedPaths').innerHTML = '';
    byId('sitemaps').innerHTML = '';

    // textContent is used here, so the message needs no escaping.
    byId('rawContent').textContent = `Error: ${message}`;
}
// Let Enter in the URL field start an analysis, like clicking the button.
// 'keydown' replaces the deprecated 'keypress' event. The disabled check
// mirrors the button: while a request is in flight the button cannot be
// clicked, so Enter must not start a second, overlapping analysis either.
document.getElementById('urlInput').addEventListener('keydown', e => {
    if (e.key !== 'Enter') return;
    if (document.getElementById('analyzeBtn').disabled) return;
    analyzeRobots();
});
</script>
</body>
</html>