Files
ukaiautomation/tools/robots-analyzer.php
root b6e39fe0c2 Security hardening + new tools deployment
- Hide Apache version (ServerTokens Prod)
- Add Permissions-Policy header
- Remove deprecated X-XSS-Protection
- Consolidate security headers to .htaccess only (remove duplicates from PHP)
- Deploy free tools: robots-analyzer, data-converter
- Deploy tools announcement blog post
- Update sitemap with new tools and blog post
2026-02-05 04:11:15 +00:00

261 lines
12 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<?php
// Page-level metadata. Each value is passed through htmlspecialchars() and
// echoed into the <head> section further down this template.
$canonical_url = 'https://ukdataservices.co.uk/tools/robots-analyzer';
$page_title = 'Free Robots.txt Analyzer | UK Data Services';
// Kept in double quotes because the text contains an apostrophe.
$page_description = "Analyze any website's robots.txt file instantly. "
    . 'See crawling rules, blocked paths, sitemaps, and get recommendations for web scraping compliance.';
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title><?php echo htmlspecialchars($page_title); ?></title>
<meta name="description" content="<?php echo htmlspecialchars($page_description); ?>">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($page_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($page_description); ?>">
<meta property="og:type" content="website">
<link rel="stylesheet" href="../assets/css/main.css">
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "SoftwareApplication",
"name": "Robots.txt Analyzer",
"description": "Free tool to analyze robots.txt files and understand crawling permissions",
"url": "https://ukdataservices.co.uk/tools/robots-analyzer",
"applicationCategory": "BusinessApplication",
"operatingSystem": "Web Browser",
"offers": { "@type": "Offer", "price": "0", "priceCurrency": "GBP" }
}
</script>
<style>
.analyzer-container { max-width: 900px; margin: 0 auto; padding: 40px 20px; }
.analyzer-header { text-align: center; margin-bottom: 40px; }
.analyzer-header h1 { font-size: 2.2em; color: #1a1a2e; margin-bottom: 15px; }
.analyzer-header p { color: #666; font-size: 1.1em; }
.analyzer-card { background: #fff; border-radius: 12px; box-shadow: 0 4px 20px rgba(0,0,0,0.08); padding: 40px; }
.url-input-group { display: flex; gap: 12px; margin-bottom: 30px; }
.url-input-group input { flex: 1; padding: 16px; border: 2px solid #e0e0e0; border-radius: 8px; font-size: 1em; }
.url-input-group input:focus { border-color: #179e83; outline: none; }
.url-input-group button { background: #179e83; color: white; border: none; padding: 16px 32px; border-radius: 8px; font-weight: 600; cursor: pointer; }
.url-input-group button:hover { background: #148a72; }
.url-input-group button:disabled { background: #ccc; cursor: not-allowed; }
.results-grid { display: grid; grid-template-columns: 1fr 1fr; gap: 20px; }
@media (max-width: 768px) { .results-grid { grid-template-columns: 1fr; } }
.result-box { background: #f8f9fa; border-radius: 8px; padding: 20px; }
.result-box h3 { color: #1a1a2e; margin-bottom: 15px; font-size: 1.1em; display: flex; align-items: center; gap: 8px; }
.result-box pre { background: #1a1a2e; color: #a5d6a7; padding: 15px; border-radius: 6px; overflow-x: auto; font-size: 0.85em; max-height: 300px; }
.stat-badge { display: inline-block; padding: 6px 12px; border-radius: 15px; font-size: 0.9em; font-weight: 600; margin: 4px; }
.badge-green { background: #e8f5e9; color: #2e7d32; }
.badge-yellow { background: #fff3e0; color: #ef6c00; }
.badge-red { background: #ffebee; color: #c62828; }
.badge-blue { background: #e3f2fd; color: #1565c0; }
.loading { text-align: center; padding: 40px; display: none; }
.spinner { width: 40px; height: 40px; border: 4px solid #e0e0e0; border-top-color: #179e83; border-radius: 50%; animation: spin 1s linear infinite; margin: 0 auto 15px; }
@keyframes spin { to { transform: rotate(360deg); } }
#results { display: none; }
.breadcrumb { padding: 15px 20px; background: #f5f5f5; font-size: 0.9em; }
.breadcrumb a { color: #144784; text-decoration: none; }
.breadcrumb span { color: #888; margin: 0 8px; }
.path-list { list-style: none; padding: 0; margin: 0; max-height: 200px; overflow-y: auto; }
.path-list li { padding: 8px 12px; border-bottom: 1px solid #e0e0e0; font-family: monospace; font-size: 0.9em; }
.path-list li:last-child { border-bottom: none; }
.cta-box { text-align: center; padding: 30px; background: linear-gradient(135deg, #144784 0%, #179e83 100%); border-radius: 8px; color: white; margin-top: 30px; }
.cta-box a { display: inline-block; background: white; color: #144784; padding: 14px 28px; border-radius: 6px; text-decoration: none; font-weight: 600; }
</style>
</head>
<body>
<?php include '../includes/navbar.php'; ?>
<nav class="breadcrumb">
<a href="/">Home</a> <span></span> <a href="/tools/">Tools</a> <span></span> Robots.txt Analyzer
</nav>
<div class="analyzer-container">
<div class="analyzer-header">
<h1>🤖 Robots.txt Analyzer</h1>
<p>Analyze any website's robots.txt to understand crawling rules and scraping permissions.</p>
</div>
<div class="analyzer-card">
<div class="url-input-group">
<input type="url" id="urlInput" placeholder="https://example.com" required>
<button onclick="analyzeRobots()" id="analyzeBtn">Analyze</button>
</div>
<div id="loading" class="loading">
<div class="spinner"></div>
<p>Fetching and analyzing robots.txt...</p>
</div>
<div id="results">
<div style="margin-bottom: 25px;">
<h3 style="color: #1a1a2e; margin-bottom: 15px;">📊 Quick Summary</h3>
<div id="summaryBadges"></div>
</div>
<div class="results-grid">
<div class="result-box">
<h3>🚫 Blocked Paths</h3>
<ul class="path-list" id="blockedPaths"></ul>
</div>
<div class="result-box">
<h3>✅ Allowed Paths</h3>
<ul class="path-list" id="allowedPaths"></ul>
</div>
</div>
<div class="result-box" style="margin-top: 20px;">
<h3>🗺️ Sitemaps Found</h3>
<ul class="path-list" id="sitemaps"></ul>
</div>
<div class="result-box" style="margin-top: 20px;">
<h3>📄 Raw robots.txt</h3>
<pre id="rawContent"></pre>
</div>
<div class="cta-box">
<h3>Need Help With Compliant Scraping?</h3>
<p style="opacity: 0.9; margin: 10px 0 20px;">We build scrapers that respect robots.txt and follow best practices.</p>
<a href="/quote">Get a Free Quote →</a>
</div>
</div>
</div>
</div>
<?php include '../includes/footer.php'; ?>
<script>
/**
 * Click/Enter handler: reads the URL typed into #urlInput, asks the
 * same-origin endpoint /api/fetch-robots.php for that site's robots.txt and
 * renders the outcome.
 *
 * The endpoint is expected to answer with JSON carrying either an `error`
 * string or the robots.txt text in `content` (that is all this function reads).
 *
 * While the request is running the Analyze button is disabled and the
 * #loading spinner is shown; both are restored in a `finally` block so the UI
 * can never get stuck, even if rendering throws.
 *
 * @returns {Promise<void>}
 */
async function analyzeRobots() {
    const urlInput = document.getElementById('urlInput').value.trim();
    if (!urlInput) { alert('Please enter a URL'); return; }

    let baseUrl;
    try { baseUrl = new URL(urlInput); }
    catch { alert('Please enter a valid URL'); return; }

    // `new URL` happily parses ftp:, mailto:, javascript: ... but robots.txt
    // only exists for http(s) sites, so reject everything else up front.
    if (baseUrl.protocol !== 'http:' && baseUrl.protocol !== 'https:') {
        alert('Please enter an http:// or https:// URL');
        return;
    }

    const analyzeBtn = document.getElementById('analyzeBtn');
    const loading = document.getElementById('loading');
    const results = document.getElementById('results');

    analyzeBtn.disabled = true;
    loading.style.display = 'block';
    results.style.display = 'none';

    // `host` (unlike `hostname`) keeps an explicit port, and robots.txt is
    // scoped to scheme + host + port.
    const robotsUrl = `${baseUrl.protocol}//${baseUrl.host}/robots.txt`;

    try {
        // Use a CORS proxy or backend in production
        const response = await fetch(`/api/fetch-robots.php?url=${encodeURIComponent(robotsUrl)}`);
        const data = await response.json();
        if (data.error) {
            displayError(data.error);
        } else {
            displayResults(data.content, baseUrl.hostname);
        }
    } catch (err) {
        // Fallback: simulate analysis.
        // NOTE(review): this renders a made-up sample as if it were the site's
        // real robots.txt; consider calling displayError() here instead.
        console.warn('robots.txt fetch failed; showing simulated sample', err);
        simulateAnalysis(baseUrl.hostname);
    } finally {
        analyzeBtn.disabled = false;
        loading.style.display = 'none';
        results.style.display = 'block';
    }
}
/**
 * Render a canned sample robots.txt for `hostname` (demo fallback).
 *
 * The sample is not fetched from anywhere: it is a fixed list of directives
 * with the hostname substituted into the two Sitemap lines, handed straight
 * to displayResults().
 *
 * @param {string} hostname Host name interpolated into the sample sitemap URLs.
 */
function simulateAnalysis(hostname) {
    // Simulated robots.txt for demo, one entry per line.
    const demoLines = [
        'User-agent: *',
        'Disallow: /admin/',
        'Disallow: /private/',
        'Disallow: /api/internal/',
        'Allow: /api/public/',
        'Allow: /',
        `Sitemap: https://${hostname}/sitemap.xml`,
        `Sitemap: https://${hostname}/sitemap-blog.xml`,
        '# Crawl-delay: 1',
    ];
    const demoRobots = demoLines.join('\n');
    displayResults(demoRobots, hostname);
}
/**
 * Parse robots.txt text and fill the result widgets (#summaryBadges,
 * #blockedPaths, #allowedPaths, #sitemaps, #rawContent).
 *
 * Parsing is deliberately simple: rules from every User-agent group are
 * pooled together, and the last Crawl-delay seen wins.
 *
 * `content` comes from a third-party site and is untrusted, so every value
 * taken from it is either HTML-escaped or inserted through DOM APIs.
 *
 * @param {string} content  Raw robots.txt text.
 * @param {string} hostname Host being analysed (currently unused; kept for callers).
 */
function displayResults(content, hostname) {
    const blocked = [], allowed = [], sitemaps = [];
    let crawlDelay = null;

    content.split('\n').forEach(line => {
        // RFC 9309: '#' starts a comment anywhere on a line, so drop the tail.
        const rule = line.split('#')[0];
        const colon = rule.indexOf(':');
        if (colon === -1) return;

        // Directive names are case-insensitive and may be followed by
        // whitespace before the colon.
        const directive = rule.slice(0, colon).trim().toLowerCase();
        // Everything after the FIRST colon is the value (URLs contain colons).
        const value = rule.slice(colon + 1).trim();
        // An empty value carries nothing to list (e.g. "Disallow:" = allow all).
        if (!value) return;

        if (directive === 'disallow') blocked.push(value);
        else if (directive === 'allow') allowed.push(value);
        else if (directive === 'sitemap') sitemaps.push(value);
        else if (directive === 'crawl-delay') crawlDelay = value;
    });

    // Summary badges
    let badges = '';
    badges += `<span class="stat-badge badge-blue">${blocked.length} blocked paths</span>`;
    badges += `<span class="stat-badge badge-green">${allowed.length} allowed paths</span>`;
    badges += `<span class="stat-badge badge-blue">${sitemaps.length} sitemaps</span>`;
    // crawlDelay is attacker-controlled text: escape before it reaches innerHTML.
    if (crawlDelay) badges += `<span class="stat-badge badge-yellow">Crawl delay: ${escapeHtml(crawlDelay)}s</span>`;
    if (blocked.length === 0) badges += `<span class="stat-badge badge-green">Open to crawling</span>`;
    if (blocked.length > 10) badges += `<span class="stat-badge badge-yellow">Many restrictions</span>`;
    document.getElementById('summaryBadges').innerHTML = badges;

    // Blocked paths
    document.getElementById('blockedPaths').innerHTML = blocked.length
        ? blocked.map(p => `<li>${escapeHtml(p)}</li>`).join('')
        : '<li style="color:#888">No blocked paths</li>';

    // Allowed paths
    document.getElementById('allowedPaths').innerHTML = allowed.length
        ? allowed.map(p => `<li>${escapeHtml(p)}</li>`).join('')
        : '<li style="color:#888">No explicit allows (default: all allowed)</li>';

    // Sitemaps: built with DOM nodes rather than an HTML string so a quote in
    // the URL cannot break out of the href attribute.
    const sitemapList = document.getElementById('sitemaps');
    sitemapList.innerHTML = '';
    if (sitemaps.length === 0) {
        sitemapList.innerHTML = '<li style="color:#888">No sitemaps declared</li>';
    } else {
        sitemaps.forEach(s => {
            const item = document.createElement('li');
            // Only absolute http(s) URLs become links; this also keeps
            // javascript: and similar schemes out of href.
            if (/^https?:\/\//i.test(s)) {
                const link = document.createElement('a');
                link.href = s;
                link.target = '_blank';
                link.rel = 'noopener';
                link.textContent = s;
                item.appendChild(link);
            } else {
                item.textContent = s + ' ';
                const flag = document.createElement('span');
                flag.style.color = '#c62828';
                flag.textContent = '(invalid URL)';
                item.appendChild(flag);
            }
            sitemapList.appendChild(item);
        });
    }

    // Raw content (textContent, so no escaping needed)
    document.getElementById('rawContent').textContent = content;
}
/**
 * Escape a value for safe insertion into HTML, in both text and quoted
 * attribute contexts.
 *
 * Replaces & < > " and ' with entities. The previous textContent/innerHTML
 * round-trip left quotes untouched, which is unsafe inside an attribute such
 * as href="...".
 *
 * @param {*} text Value to escape; null/undefined become the empty string,
 *                 anything else is converted with String().
 * @returns {string} The escaped string.
 */
function escapeHtml(text) {
    const entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    };
    const raw = text == null ? '' : String(text);
    return raw.replace(/[&<>"']/g, ch => entities[ch]);
}
/**
 * Show a failure state in the results panel.
 *
 * Hides the spinner, reveals #results, replaces the summary with a red
 * "Error" badge, puts the (escaped) message in the blocked-paths list, clears
 * the other two lists and writes "Error: <message>" into the raw-content box.
 *
 * @param {string} message Human-readable error text.
 */
function displayError(message) {
    const byId = id => document.getElementById(id);

    byId('loading').style.display = 'none';
    byId('results').style.display = 'block';

    byId('summaryBadges').innerHTML = '<span class="stat-badge badge-red">Error</span>';
    byId('blockedPaths').innerHTML = '<li style="color:#c62828">' + escapeHtml(message) + '</li>';

    // Nothing meaningful to list for these two on failure.
    ['allowedPaths', 'sitemaps'].forEach(id => { byId(id).innerHTML = ''; });

    // textContent, so the message needs no escaping here.
    byId('rawContent').textContent = 'Error: ' + message;
}
// Pressing Enter in the URL field starts an analysis, same as clicking the button.
// `keydown` replaces the deprecated `keypress` event.
document.getElementById('urlInput').addEventListener('keydown', e => {
    if (e.key !== 'Enter') return;
    // Ignore Enter used to confirm an IME composition, and auto-repeat while
    // the key is held down.
    if (e.isComposing || e.repeat) return;
    // The button is disabled while a request is in flight; don't start a
    // second, overlapping analysis from the keyboard.
    if (document.getElementById('analyzeBtn').disabled) return;
    analyzeRobots();
});
</script>
</body>
</html>