"URL parameter required"]);
    exit;
}

/**
 * Validate a single fetch target and resolve its host to a safe public IP.
 *
 * Applied to the initial URL AND to every redirect hop, so that a host which
 * passes the checks cannot bounce the request to an internal address
 * (redirect-based SSRF).
 *
 * @param string $url Absolute URL to validate.
 * @return array{0: ?string, 1: ?string} [resolved IP, null] on success,
 *                                       [null, error message] on rejection.
 */
function validate_robots_target(string $url): array
{
    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        return [null, "Invalid URL"];
    }

    $parsed = parse_url($url);
    $scheme = strtolower($parsed["scheme"] ?? "");
    $host   = $parsed["host"] ?? "";
    $path   = $parsed["path"] ?? "";

    // Only plain web schemes — rejects file://, gopher://, php://, etc.
    if (!in_array($scheme, ["http", "https"], true)) {
        return [null, "Only http/https URLs allowed"];
    }

    // This endpoint is strictly a robots.txt proxy.
    if ($path !== "/robots.txt") {
        return [null, "Only /robots.txt paths allowed"];
    }

    // No query strings or fragments — keeps the request shape predictable.
    if (!empty($parsed["query"]) || !empty($parsed["fragment"])) {
        return [null, "Query strings not allowed"];
    }

    // Accept IP-literal hosts directly (they still go through the range
    // filter below); otherwise resolve via DNS. gethostbyname() is IPv4-only
    // and returns its input unchanged when resolution fails.
    if (filter_var($host, FILTER_VALIDATE_IP)) {
        $ip = $host;
    } else {
        $ip = gethostbyname($host);
        if ($ip === $host) {
            // DNS resolution failed - might be internal hostname
            return [null, "Could not resolve hostname"];
        }
    }

    // Block private and reserved ranges (SSRF protection). These filter flags
    // cover both IPv4 (10/8, 172.16/12, 192.168/16, 127/8, 169.254/16, ...)
    // and IPv6 (::1, fe80::/10, fc00::/7) private/reserved space, which makes
    // a separate IPv6-prefix regex unnecessary.
    //
    // NOTE(review): the fetch below re-resolves the hostname, so a
    // DNS-rebinding attacker could swap records between this check and the
    // actual connection. Pinning the connection to $ip (with TLS peer_name
    // set to $host) would close that gap — confirm whether the residual risk
    // is acceptable for this tool.
    $flags = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
    if (!filter_var($ip, FILTER_VALIDATE_IP, $flags)) {
        return [null, "Internal addresses not allowed"];
    }

    return [$ip, null];
}

/**
 * Extract the final HTTP status code and Location header from an
 * $http_response_header-style array.
 *
 * The array may contain several "HTTP/x y" status lines (one per hop the
 * wrapper saw); the LAST one is authoritative, and a Location header only
 * counts if it follows the status line it belongs to.
 *
 * @param array $headers Raw response header lines.
 * @return array{0: int, 1: ?string} [status code (0 if none), Location or null].
 */
function parse_robots_response(array $headers): array
{
    $status   = 0;
    $location = null;
    foreach ($headers as $header) {
        // \S+ instead of \d\.\d so version strings without a dot also match.
        if (preg_match('/^HTTP\/\S+\s+(\d+)/', $header, $m)) {
            $status   = (int) $m[1];
            $location = null; // headers after a new status line belong to it
        } elseif (preg_match('/^Location:\s*(\S+)/i', $header, $m)) {
            $location = $m[1];
        }
    }
    return [$status, $location];
}

// ---- Validate the requested URL -------------------------------------------

[$ip, $error] = validate_robots_target($url);
if ($error !== null) {
    http_response_code(400);
    echo json_encode(["error" => $error]);
    exit;
}

// ---- Fetch the robots.txt --------------------------------------------------
//
// Redirects are followed manually (max 3, matching the previous
// max_redirects) so that EVERY hop is re-validated. follow_location=1 would
// let the remote server redirect us to an internal address after the checks
// above had already passed.
$context = stream_context_create([
    "http" => [
        "timeout" => 10,
        "user_agent" => "UK AI Automation Robots Analyzer (+https://ukaiautomation.co.uk/tools/robots-analyzer)",
        "follow_location" => 0,
        "ignore_errors" => true // keep the body/status on 4xx/5xx instead of returning false
    ],
    "ssl" => [
        "verify_peer" => true,
        "verify_peer_name" => true
    ]
]);

$content    = false;
$statusCode = 0;
for ($hop = 0; $hop <= 3; $hop++) {
    $body = @file_get_contents($url, false, $context);
    if ($body === false || !isset($http_response_header)) {
        break; // connection-level failure (DNS, TCP, TLS)
    }

    [$statusCode, $location] = parse_robots_response($http_response_header);

    if ($statusCode >= 300 && $statusCode < 400 && $location !== null && $hop < 3) {
        // Resolve a relative Location against the current URL's origin.
        if (!preg_match('#^https?://#i', $location)) {
            $p        = parse_url($url);
            $location = $p["scheme"] . "://" . $p["host"] . $location;
        }
        // Re-run the full SSRF validation on the redirect target.
        [$ip, $error] = validate_robots_target($location);
        if ($error !== null) {
            http_response_code(400);
            echo json_encode(["error" => "Redirect target rejected: " . $error]);
            exit;
        }
        $url = $location;
        continue;
    }

    if ($statusCode >= 200 && $statusCode < 300) {
        $content = $body;
    }
    break;
}

// ---- Respond ----------------------------------------------------------------

if ($content === false) {
    if ($statusCode === 404) {
        // A missing robots.txt is not an error: by convention it means the
        // site permits all crawling, so report that explicitly.
        echo json_encode([
            "content" => "# No robots.txt found\nUser-agent: *\nAllow: /",
            "status" => 404,
            "message" => "No robots.txt file found (this means the site allows all crawling by default)"
        ]);
        exit;
    }
    http_response_code(502);
    echo json_encode(["error" => "Failed to fetch robots.txt - site may be unreachable"]);
    exit;
}

echo json_encode([
    "content" => $content,
    "status" => 200,
    "url" => $url, // final URL after any validated redirects
    "fetchedAt" => date("c")
]);