Files
ukaiautomation/tools/robots-analyzer.php
Peter Foster 15e9ba598e Cache: fix 1-year CSS cache and add version busting
- .htaccess: remove duplicate cache block that set all CSS/JS to max-age=31536000
  CSS/JS now use max-age=3600 must-revalidate (was 1 year, breaking live edits)
- index.php: bump main.min.css version v1.1.3 -> v1.1.4
- All 78 PHP files: add ?v=20260222 to main.css and cro-enhancements.css refs
  Forces browser cache bust for all pages after today's accessibility changes
2026-02-22 11:12:40 +00:00

264 lines
13 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
// Page-level metadata. The <head> markup below echoes these values into
// <title>, the meta description, the canonical link and the Open Graph tags.
$page_title       = 'Free Robots.txt Analyzer | UK Data Services';
// Double-quoted because the text itself contains an apostrophe.
$page_description = "Analyze any website's robots.txt file instantly. See crawling rules, blocked paths, sitemaps, and get recommendations for web scraping compliance.";
$canonical_url    = 'https://ukdataservices.co.uk/tools/robots-analyzer';
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title><?php echo htmlspecialchars($page_title); ?></title>
<meta name="description" content="<?php echo htmlspecialchars($page_description); ?>">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@100;200;300;400;500;600;700;800;900&family=Lato:wght@100;200;300;400;500;600;700;800;900&display=swap" rel="stylesheet">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($page_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($page_description); ?>">
<meta property="og:type" content="website">
<link rel="stylesheet" href="../assets/css/main.css?v=20260222">
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "SoftwareApplication",
"name": "Robots.txt Analyzer",
"description": "Free tool to analyze robots.txt files and understand crawling permissions",
"url": "https://ukdataservices.co.uk/tools/robots-analyzer",
"applicationCategory": "BusinessApplication",
"operatingSystem": "Web Browser",
"offers": { "@type": "Offer", "price": "0", "priceCurrency": "GBP" }
}
</script>
<style>
/* Page-specific styles for the robots.txt analyzer; shared styles come from main.css. */

/* Layout: centered column with a header and a single white card. */
.analyzer-container { max-width: 900px; margin: 0 auto; padding: 40px 20px; }
.analyzer-header { text-align: center; margin-bottom: 40px; }
.analyzer-header h1 { font-size: 2.2em; color: #1a1a2e; margin-bottom: 15px; }
.analyzer-header p { color: #666; font-size: 1.1em; }
.analyzer-card { background: #fff; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.08); padding: 40px; }

/* URL field and Analyze button, laid out on one flex row. */
.url-input-group { display: flex; gap: 12px; margin-bottom: 30px; }
.url-input-group input { flex: 1; padding: 16px; border: 2px solid #e0e0e0; border-radius: 8px; font-size: 1em; }
/* The default outline is removed; the green border is the focus indicator. */
.url-input-group input:focus { border-color: #179e83; outline: none; }
.url-input-group button { background: #179e83; color: white; border: none; padding: 16px 32px; border-radius: 8px; font-weight: 600; cursor: pointer; }
.url-input-group button:hover { background: #148a72; }
/* The script disables the button while a request is in flight. */
.url-input-group button:disabled { background: #ccc; cursor: not-allowed; }

/* Result panels: two columns, collapsing to one at 768px and below. */
.results-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
@media (max-width: 768px) { .results-grid { grid-template-columns: 1fr; } }
.result-box { background: #f8f9fa; border-radius: 8px; padding: 20px; }
.result-box h3 { color: #1a1a2e; margin-bottom: 15px; font-size: 1.1em; display: flex; align-items: center; gap: 8px; }
/* Raw robots.txt viewer: dark panel that scrolls horizontally, capped at 300px tall. */
.result-box pre { background: #1a1a2e; color: #a5d6a7; padding: 15px; border-radius: 6px; overflow-x: auto; font-size: 0.85em; max-height: 300px; }

/* Summary badges; the colour modifier classes are chosen by the script. */
.stat-badge { display: inline-block; padding: 6px 12px; border-radius: 15px; font-size: 0.9em; font-weight: 600; margin: 4px; }
.badge-green { background: #e8f5e9; color: #2e7d32; }
.badge-yellow { background: #fff3e0; color: #ef6c00; }
.badge-red { background: #ffebee; color: #c62828; }
.badge-blue { background: #e3f2fd; color: #1565c0; }

/* Loading indicator and results start hidden; the script toggles their display. */
.loading { text-align: center; padding: 40px; display: none; }
.spinner { width: 40px; height: 40px; border: 4px solid #e0e0e0; border-top-color: #179e83; border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 15px; }
@keyframes spin { to { transform: rotate(360deg); } }
#results { display: none; }

/* Breadcrumb bar; the <span> elements act as spaced separators between links. */
.breadcrumb { padding: 15px 20px; background: #f5f5f5; font-size: 0.9em; }
.breadcrumb a { color: #144784; text-decoration: none; }
.breadcrumb span { color: #888; margin: 0 8px; }

/* Path and sitemap lists: monospace rows in a box that scrolls past 200px. */
.path-list { list-style: none; padding: 0; margin: 0; max-height: 200px; overflow-y: auto; }
.path-list li { padding: 8px 12px; border-bottom: 1px solid #e0e0e0; font-family: monospace; font-size: 0.9em; }
.path-list li:last-child { border-bottom: none; }

/* Call-to-action panel shown at the bottom of the results. */
.cta-box { text-align: center; padding: 30px; background: linear-gradient(135deg, #144784 0%, #179e83 100%); border-radius: 8px; color: white; margin-top: 30px; }
.cta-box a { display: inline-block; background: white; color: #144784; padding: 14px 28px; border-radius: 6px; text-decoration: none; font-weight: 600; }
</style>
</head>
<body>
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?>
<nav class="breadcrumb">
<a href="/">Home</a> <span></span> <a href="/tools/">Tools</a> <span></span> Robots.txt Analyzer
</nav>
<div class="analyzer-container">
<div class="analyzer-header">
<h1>🤖 Robots.txt Analyzer</h1>
<p>Analyze any website's robots.txt to understand crawling rules and scraping permissions.</p>
</div>
<div class="analyzer-card">
<!-- URL entry row. The field has no visible <label>, so aria-label gives it an
     accessible name (a placeholder alone is not one). type="button" keeps the
     control from acting as a submit button if it is ever placed inside a form. -->
<div class="url-input-group">
<input type="url" id="urlInput" placeholder="https://example.com" aria-label="Website URL to analyze" required>
<button type="button" onclick="analyzeRobots()" id="analyzeBtn">Analyze</button>
</div>
<div id="loading" class="loading">
<div class="spinner"></div>
<p>Fetching and analyzing robots.txt...</p>
</div>
<div id="results">
<div style="margin-bottom: 25px;">
<h3 style="color: #1a1a2e; margin-bottom: 15px;">📊 Quick Summary</h3>
<div id="summaryBadges"></div>
</div>
<div class="results-grid">
<div class="result-box">
<h3>🚫 Blocked Paths</h3>
<ul class="path-list" id="blockedPaths"></ul>
</div>
<div class="result-box">
<h3>✅ Allowed Paths</h3>
<ul class="path-list" id="allowedPaths"></ul>
</div>
</div>
<div class="result-box" style="margin-top: 20px;">
<h3>🗺️ Sitemaps Found</h3>
<ul class="path-list" id="sitemaps"></ul>
</div>
<div class="result-box" style="margin-top: 20px;">
<h3>📄 Raw robots.txt</h3>
<pre id="rawContent"></pre>
</div>
<div class="cta-box">
<h3>Need Help With Compliant Scraping?</h3>
<p style="opacity: 0.9; margin: 10px 0 20px;">We build scrapers that respect robots.txt and follow best practices.</p>
<a href="/quote">Get a Free Quote →</a>
</div>
</div>
</div>
</div>
<?php
// Resolve the footer relative to this file rather than the process working
// directory: a path starting with "../" is looked up against the current
// working directory, which is not guaranteed to be this script's folder.
include __DIR__ . '/../includes/footer.php';
?>
<script>
/**
 * Entry point for the Analyze button and the Enter key.
 *
 * Reads the URL typed into #urlInput, asks the same-origin endpoint
 * /api/fetch-robots.php for that site's robots.txt, and hands the response to
 * displayResults() or displayError(). While the request is pending the button
 * is disabled and the #loading indicator is shown.
 *
 * @returns {Promise<void>} Resolves once the UI has been updated.
 */
async function analyzeRobots() {
    const urlInput = document.getElementById('urlInput').value.trim();
    if (!urlInput) { alert('Please enter a URL'); return; }

    let baseUrl;
    try { baseUrl = new URL(urlInput); }
    catch { alert('Please enter a valid URL'); return; }

    // robots.txt only exists for web origins. Rejecting other schemes here also
    // keeps inputs such as "javascript:" or "file:" from reaching the backend.
    if (baseUrl.protocol !== 'http:' && baseUrl.protocol !== 'https:') {
        alert('Please enter a valid URL');
        return;
    }

    const analyzeBtn = document.getElementById('analyzeBtn');
    const loading = document.getElementById('loading');
    const results = document.getElementById('results');

    analyzeBtn.disabled = true;
    loading.style.display = 'block';
    results.style.display = 'none';

    // `origin` is scheme + host + port, so a site on a non-default port
    // (e.g. https://example.com:8443) is queried on that same port.
    const robotsUrl = `${baseUrl.origin}/robots.txt`;

    try {
        // The browser cannot fetch another origin's robots.txt directly (CORS),
        // so the request goes through the backend endpoint.
        const response = await fetch(`/api/fetch-robots.php?url=${encodeURIComponent(robotsUrl)}`);
        const data = await response.json();
        if (data.error) {
            displayError(data.error);
        } else {
            displayResults(data.content, baseUrl.hostname);
        }
    } catch (err) {
        // Fallback: simulate analysis.
        // NOTE(review): this renders canned demo data as if it belonged to the
        // requested site whenever the request or rendering fails; consider
        // showing an error instead.
        simulateAnalysis(baseUrl.hostname);
    } finally {
        // Always restore the UI, even if the fallback itself throws.
        analyzeBtn.disabled = false;
        loading.style.display = 'none';
        results.style.display = 'block';
    }
}
/**
 * Demo fallback: builds a canned robots.txt whose sitemap entries point at
 * `hostname` and renders it through displayResults().
 *
 * @param {string} hostname Host name interpolated into the sample sitemap URLs.
 */
function simulateAnalysis(hostname) {
    // One array entry per robots.txt line; joined with "\n" below.
    const demoLines = [
        'User-agent: *',
        'Disallow: /admin/',
        'Disallow: /private/',
        'Disallow: /api/internal/',
        'Allow: /api/public/',
        'Allow: /',
        `Sitemap: https://${hostname}/sitemap.xml`,
        `Sitemap: https://${hostname}/sitemap-blog.xml`,
        '# Crawl-delay: 1',
    ];
    displayResults(demoLines.join('\n'), hostname);
}
/**
 * Parses robots.txt text and renders it into the results panel.
 *
 * Collects every Disallow, Allow and Sitemap value plus the last Crawl-delay,
 * regardless of which User-agent group they appear in, then fills
 * #summaryBadges, #blockedPaths, #allowedPaths, #sitemaps and #rawContent.
 * All values come from a third-party file, so everything written through
 * innerHTML is escaped first.
 *
 * @param {string} content  Raw robots.txt text.
 * @param {string} hostname Host that was analyzed (currently unused; kept so
 *                          existing callers keep working).
 */
function displayResults(content, hostname) {
    const blocked = [], allowed = [], sitemaps = [];
    let crawlDelay = null;

    content.split('\n').forEach(rawLine => {
        // "#" starts a comment anywhere on a line (RFC 9309), so drop the tail.
        const line = rawLine.split('#')[0];
        const colonAt = line.indexOf(':');
        if (colonAt === -1) return;

        // Field names are case-insensitive and may be followed by whitespace
        // before the colon; the value is everything after the first colon.
        const field = line.slice(0, colonAt).trim().toLowerCase();
        const value = line.slice(colonAt + 1).trim();

        if (field === 'disallow') {
            if (value) blocked.push(value);
        } else if (field === 'allow') {
            if (value) allowed.push(value);
        } else if (field === 'sitemap') {
            sitemaps.push(value);
        } else if (field === 'crawl-delay') {
            crawlDelay = value;
        }
    });

    // Text serialization need not escape double quotes, so escape them
    // explicitly before placing a value inside a double-quoted attribute.
    const escapeAttr = value => escapeHtml(value).replace(/"/g, '&quot;');

    // Summary badges
    let badges = '';
    badges += `<span class="stat-badge badge-blue">${blocked.length} blocked paths</span>`;
    badges += `<span class="stat-badge badge-green">${allowed.length} allowed paths</span>`;
    badges += `<span class="stat-badge badge-blue">${sitemaps.length} sitemaps</span>`;
    // crawlDelay is untrusted text from the remote file: escape before injecting.
    if (crawlDelay) badges += `<span class="stat-badge badge-yellow">Crawl delay: ${escapeHtml(crawlDelay)}s</span>`;
    if (blocked.length === 0) badges += `<span class="stat-badge badge-green">Open to crawling</span>`;
    if (blocked.length > 10) badges += `<span class="stat-badge badge-yellow">Many restrictions</span>`;
    document.getElementById('summaryBadges').innerHTML = badges;

    // Blocked paths
    document.getElementById('blockedPaths').innerHTML = blocked.length
        ? blocked.map(p => `<li>${escapeHtml(p)}</li>`).join('')
        : '<li style="color:#888">No blocked paths</li>';

    // Allowed paths
    document.getElementById('allowedPaths').innerHTML = allowed.length
        ? allowed.map(p => `<li>${escapeHtml(p)}</li>`).join('')
        : '<li style="color:#888">No explicit allows (default: all allowed)</li>';

    // Sitemaps: only http(s) URLs become links; anything else is listed as invalid.
    document.getElementById('sitemaps').innerHTML = sitemaps.length
        ? sitemaps.map(s => {
            const isValid = /^https?:\/\//i.test(s);
            return isValid
                ? `<li><a href="${escapeAttr(s)}" target="_blank" rel="noopener">${escapeHtml(s)}</a></li>`
                : `<li>${escapeHtml(s)} <span style="color:#c62828">(invalid URL)</span></li>`;
        }).join('')
        : '<li style="color:#888">No sitemaps declared</li>';

    // Raw content (textContent, so no escaping is needed)
    document.getElementById('rawContent').textContent = content;
}
/**
 * Escapes text for safe insertion into HTML, in both element content and
 * quoted attribute values.
 *
 * Replaces &, <, >, " and ' with character references. Quotes are included
 * because callers place the result inside attributes such as href="…".
 *
 * @param {*} text Value to escape; null and undefined become an empty string,
 *                 anything else is converted with String().
 * @returns {string} The escaped text.
 */
function escapeHtml(text) {
    const entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    };
    const str = text == null ? '' : String(text);
    return str.replace(/[&<>"']/g, ch => entities[ch]);
}
/**
 * Shows an error state in the results panel.
 *
 * Hides the loading indicator, reveals #results, replaces the summary with a
 * red "Error" badge, puts the escaped message in the blocked-paths list,
 * clears the other lists, and writes the message into the raw-content box.
 *
 * @param {string} message Error text to show to the user.
 */
function displayError(message) {
    const byId = id => document.getElementById(id);

    byId('loading').style.display = 'none';
    byId('results').style.display = 'block';

    byId('summaryBadges').innerHTML = '<span class="stat-badge badge-red">Error</span>';
    // Goes through innerHTML, so the message must be escaped.
    byId('blockedPaths').innerHTML = `<li style="color:#c62828">${escapeHtml(message)}</li>`;

    ['allowedPaths', 'sitemaps'].forEach(id => { byId(id).innerHTML = ''; });

    // textContent is not parsed as HTML, so the raw message is safe here.
    byId('rawContent').textContent = 'Error: ' + message;
}
// Pressing Enter in the URL field triggers the same analysis as the button.
// Uses `keydown` because `keypress` is deprecated.
document.getElementById('urlInput').addEventListener('keydown', e => {
    // Ignore the Enter that confirms an IME composition.
    if (e.key !== 'Enter' || e.isComposing) return;
    // The button is disabled while a request is in flight; honour that here
    // too so Enter cannot start a second, overlapping analysis.
    if (document.getElementById('analyzeBtn').disabled) return;
    analyzeRobots();
});
</script>
</body>
</html>