Files
ukaiautomation/api/fetch-robots.php
root b6e39fe0c2 Security hardening + new tools deployment
- Hide Apache version (ServerTokens Prod)
- Add Permissions-Policy header
- Remove deprecated X-XSS-Protection
- Consolidate security headers to .htaccess only (remove duplicates from PHP)
- Deploy free tools: robots-analyzer, data-converter
- Deploy tools announcement blog post
- Update sitemap with new tools and blog post
2026-02-05 04:11:15 +00:00

123 lines
3.4 KiB
PHP

<?php
/**
 * API endpoint to fetch robots.txt files.
 * Handles CORS and acts as a proxy to avoid browser restrictions.
 * SECURITY: Blocks internal/private IPs to prevent SSRF.
 */
header("Content-Type: application/json");
header("Access-Control-Allow-Origin: *");
header("Access-Control-Allow-Methods: GET");
header("Cache-Control: public, max-age=300");

// Guard: a target URL must be supplied.
$url = $_GET["url"] ?? "";
if (empty($url)) {
    http_response_code(400);
    echo json_encode(["error" => "URL parameter required"]);
    exit;
}

// Guard: reject anything that is not a syntactically valid URL.
if (filter_var($url, FILTER_VALIDATE_URL) === false) {
    http_response_code(400);
    echo json_encode(["error" => "Invalid URL"]);
    exit;
}
// Split the validated URL into its components for individual checks.
$parsed = parse_url($url);
$scheme = $parsed["scheme"] ?? "";
$host = $parsed["host"] ?? "";
$path = $parsed["path"] ?? "";

// Permit plain web schemes only (no file://, gopher://, php://, etc.).
$normalizedScheme = strtolower($scheme);
if ($normalizedScheme !== "http" && $normalizedScheme !== "https") {
    http_response_code(400);
    echo json_encode(["error" => "Only http/https URLs allowed"]);
    exit;
}

// This proxy serves exactly one resource per host: /robots.txt.
if ($path !== "/robots.txt") {
    http_response_code(400);
    echo json_encode(["error" => "Only /robots.txt paths allowed"]);
    exit;
}

// Disallow query strings and fragments so callers cannot smuggle
// extra request data past the path check above.
if (!empty($parsed["query"]) || !empty($parsed["fragment"])) {
    http_response_code(400);
    echo json_encode(["error" => "Query strings not allowed"]);
    exit;
}
// Resolve the hostname ourselves so the target address can be vetted
// before any request is made.
// NOTE(review): gethostbyname() only returns A (IPv4) records, so an
// IPv6-only host will fail resolution here and be rejected — confirm
// that is the intended behavior.
$ip = gethostbyname($host);
if ($ip === $host) {
    // gethostbyname() signals failure by returning its argument unchanged.
    http_response_code(400);
    echo json_encode(["error" => "Could not resolve hostname"]);
    exit;
}

// SSRF guard: reject private (RFC 1918) and reserved address ranges.
// NOTE(review): the actual fetch later re-resolves DNS on its own, so a
// DNS-rebinding host could pass this check and then resolve to an
// internal address — consider pinning the IP resolved here.
$blockFlags = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
if (false === filter_var($ip, FILTER_VALIDATE_IP, $blockFlags)) {
    http_response_code(400);
    echo json_encode(["error" => "Internal addresses not allowed"]);
    exit;
}

// Defense in depth: IPv6 loopback, link-local and ULA prefixes
// (in practice unreachable while resolution is IPv4-only, see above).
if (preg_match("/^(::1|fe80:|fc00:|fd00:)/i", $ip)) {
    http_response_code(400);
    echo json_encode(["error" => "Internal addresses not allowed"]);
    exit;
}
// Fetch the robots.txt from the validated URL.
// NOTE(review): with follow_location enabled, the target can redirect
// this request to an address that was never vetted above (redirect-based
// SSRF bypass). Consider "follow_location" => 0 plus manual per-hop
// re-validation if that risk is unacceptable.
$context = stream_context_create([
"http" => [
"timeout" => 10,
"user_agent" => "UK Data Services Robots Analyzer (+https://ukdataservices.co.uk/tools/robots-analyzer)",
"follow_location" => true,
"max_redirects" => 3
],
"ssl" => [
"verify_peer" => true,
"verify_peer_name" => true
]
]);
$content = @file_get_contents($url, false, $context);
if ($content === false) {
// The http wrapper still populates $http_response_header on failure.
// With redirects enabled it holds one status line per hop, so scan
// them all and keep the LAST code — stopping at the first match would
// report e.g. 301 for a 301 -> 404 chain and skip the friendly
// "no robots.txt" response below.
if (isset($http_response_header)) {
$statusCode = 0;
foreach ($http_response_header as $header) {
// Match both "HTTP/1.1 404 ..." and single-digit "HTTP/2 404 ..." forms.
if (preg_match("/^HTTP\/\d+(?:\.\d+)?\s+(\d+)/", $header, $matches)) {
$statusCode = intval($matches[1]);
}
}
if ($statusCode === 404) {
// A missing robots.txt is not an error for the caller: report the
// crawl-everything default instead of a 502.
echo json_encode([
"content" => "# No robots.txt found\nUser-agent: *\nAllow: /",
"status" => 404,
"message" => "No robots.txt file found (this means the site allows all crawling by default)"
]);
exit;
}
}
// Any other failure (timeout, TLS error, 5xx, connection refused).
http_response_code(502);
echo json_encode(["error" => "Failed to fetch robots.txt - site may be unreachable"]);
exit;
}
// Success: return the file verbatim plus fetch metadata (ISO-8601 timestamp).
echo json_encode([
"content" => $content,
"status" => 200,
"url" => $url,
"fetchedAt" => date("c")
]);