123 lines
3.4 KiB
PHP
123 lines
3.4 KiB
PHP
|
|
<?php
|
||
|
|
/**
 * API endpoint to fetch robots.txt files.
 * Handles CORS and acts as a proxy to avoid browser restrictions.
 * SECURITY: Blocks internal/private IPs to prevent SSRF.
 */
|
||
|
|
|
||
|
|
// Response headers sent on every reply: JSON body, CORS open to any origin
// for GET requests, and a five-minute public cache lifetime.
$responseHeaders = [
    "Content-Type: application/json",
    "Access-Control-Allow-Origin: *",
    "Access-Control-Allow-Methods: GET",
    "Cache-Control: public, max-age=300",
];

foreach ($responseHeaders as $headerLine) {
    header($headerLine);
}
|
||
|
|
|
||
|
|
// Target URL supplied by the caller; a missing parameter is treated as empty.
$url = $_GET["url"] ?? "";

// Reject requests that carry no usable URL.
if (!$url) {
    http_response_code(400);
    exit(json_encode(["error" => "URL parameter required"]));
}

// The value must be a syntactically valid URL before it is inspected further.
$isWellFormedUrl = filter_var($url, FILTER_VALIDATE_URL) !== false;
if (!$isWellFormedUrl) {
    http_response_code(400);
    exit(json_encode(["error" => "Invalid URL"]));
}
|
||
|
|
|
||
|
|
// Parse URL components. Missing components default to an empty string so the
// checks below reject them instead of raising notices.
$parsed = parse_url($url);
$scheme = $parsed["scheme"] ?? "";
$host = $parsed["host"] ?? "";
$path = $parsed["path"] ?? "";

// Only allow http/https (strict comparison: no type juggling on the scheme).
if (!in_array(strtolower($scheme), ["http", "https"], true)) {
    http_response_code(400);
    echo json_encode(["error" => "Only http/https URLs allowed"]);
    exit;
}

// Path must be exactly /robots.txt
if ($path !== "/robots.txt") {
    http_response_code(400);
    echo json_encode(["error" => "Only /robots.txt paths allowed"]);
    exit;
}

// Block query strings and fragments.
// isset() is used instead of empty() because empty("0") is true, which let
// URLs such as /robots.txt?0 or /robots.txt#0 slip past this check.
if (isset($parsed["query"]) || isset($parsed["fragment"])) {
    http_response_code(400);
    echo json_encode(["error" => "Query strings not allowed"]);
    exit;
}
|
||
|
|
|
||
|
|
// Resolve hostname to its IPv4 address(es).
// An IPv4 literal is used as-is; gethostbyname() returns such input unchanged,
// which previously made public literals look like a failed DNS lookup.
// Resolution is IPv4-only, so hosts without an A record are reported as
// unresolvable.
if (filter_var($host, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
    $addresses = [$host];
} else {
    $addresses = gethostbynamel($host);
}

if (empty($addresses)) {
    // DNS resolution failed - might be internal hostname
    http_response_code(400);
    echo json_encode(["error" => "Could not resolve hostname"]);
    exit;
}

// Block private and reserved IP ranges (SSRF protection).
// Every returned address is checked, not just the first: a hostname that
// publishes both a public and a private record must not pass on the strength
// of the public one.
$flags = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
foreach ($addresses as $address) {
    $isInternal = !filter_var($address, FILTER_VALIDATE_IP, $flags)
        // Extra guard for IPv6 loopback / link-local / unique-local prefixes.
        || preg_match("/^(::1|fe80:|fc00:|fd00:)/i", $address);

    if ($isInternal) {
        http_response_code(400);
        echo json_encode(["error" => "Internal addresses not allowed"]);
        exit;
    }
}

// Address used for the outgoing request by the code that follows.
$ip = $addresses[0];
|
||
|
|
|
||
|
|
// Fetch the robots.txt
//
// SECURITY: redirects are followed by hand rather than with "follow_location".
// The automatic mode lets the remote server bounce this proxy to any address
// (e.g. http://127.0.0.1/ or a cloud metadata endpoint) without the SSRF
// checks being applied again. Each hop here is re-validated, and the request
// is sent to the already-vetted IP (with the original Host header and TLS
// peer name) so a second DNS lookup cannot return a different address.
$maxRedirects = 3;
// Upper bound on the bytes read, so a huge or endless response cannot exhaust memory.
$maxBytes = 500 * 1024;
$userAgent = "UK Data Services Robots Analyzer (+https://ukdataservices.co.uk/tools/robots-analyzer)";
$publicOnly = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;

// Emits a JSON error with the given HTTP status and stops the script.
$abort = function (int $status, string $message) {
    http_response_code($status);
    echo json_encode(["error" => $message]);
    exit;
};

$hop = $parsed;   // URL components of the request about to be made
$hopIp = $ip;     // address that passed the SSRF checks above
$content = false;
$statusCode = 0;

for ($redirects = 0; ; $redirects++) {
    $statusCode = 0;
    $location = "";

    $hopScheme = strtolower($hop["scheme"]);
    $hopHost = $hop["host"];
    $hopPort = isset($hop["port"]) ? ":" . $hop["port"] : "";
    $hopPath = $hop["path"] ?? "/";
    $hopQuery = isset($hop["query"]) ? "?" . $hop["query"] : "";
    $hopAuth = "";
    if (isset($hop["user"])) {
        $hopAuth = $hop["user"] . (isset($hop["pass"]) ? ":" . $hop["pass"] : "") . "@";
    }

    $context = stream_context_create([
        "http" => [
            "timeout" => 10,
            "user_agent" => $userAgent,
            "follow_location" => 0,
            // Return the body for every status so the code is inspected here.
            "ignore_errors" => true,
            // The URL below names the IP, so the real host goes in the header.
            "header" => "Host: " . $hopHost . $hopPort,
        ],
        "ssl" => [
            "verify_peer" => true,
            "verify_peer_name" => true,
            // Certificate check and SNI must use the hostname, not the IP.
            "peer_name" => $hopHost,
        ],
    ]);

    // "@" keeps connection warnings out of the JSON body; failure is detected
    // through the false return value.
    $body = @file_get_contents(
        $hopScheme . "://" . $hopAuth . $hopIp . $hopPort . $hopPath . $hopQuery,
        false,
        $context,
        0,
        $maxBytes
    );
    if ($body === false) {
        break;
    }

    foreach ($http_response_header ?? [] as $headerLine) {
        if (preg_match("/^HTTP\/\d\.\d\s+(\d+)/", $headerLine, $matches)) {
            $statusCode = intval($matches[1]);
        } elseif (stripos($headerLine, "Location:") === 0) {
            $location = trim(substr($headerLine, 9));
        }
    }

    $isRedirect = $location !== ""
        && in_array($statusCode, [300, 301, 302, 303, 307, 308], true);
    if (!$isRedirect) {
        if ($statusCode >= 200 && $statusCode < 300) {
            $content = $body;
        }
        break;
    }
    if ($redirects >= $maxRedirects) {
        break; // too many redirects: reported as a fetch failure below
    }

    // Turn the Location value into an absolute URL.
    $origin = $hopScheme . "://" . $hopHost . $hopPort;
    if (preg_match("/^[a-z][a-z0-9+.\-]*:/i", $location)) {
        $nextUrl = $location;
    } elseif (strpos($location, "//") === 0) {
        $nextUrl = $hopScheme . ":" . $location;
    } elseif ($location[0] === "/") {
        $nextUrl = $origin . $location;
    } else {
        $nextUrl = $origin . substr($hopPath, 0, strrpos($hopPath, "/") + 1) . $location;
    }

    // Re-apply the scheme and address checks to the redirect target.
    $next = filter_var($nextUrl, FILTER_VALIDATE_URL) ? parse_url($nextUrl) : false;
    $nextHost = $next["host"] ?? "";
    $nextScheme = strtolower($next["scheme"] ?? "");
    if ($next === false || $nextHost === "" || !in_array($nextScheme, ["http", "https"], true)) {
        break;
    }

    $nextIps = filter_var($nextHost, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)
        ? [$nextHost]
        : gethostbynamel($nextHost);
    if (empty($nextIps)) {
        break;
    }
    foreach ($nextIps as $candidate) {
        if (!filter_var($candidate, FILTER_VALIDATE_IP, $publicOnly)) {
            $abort(400, "Internal addresses not allowed");
        }
    }

    $hop = $next;
    $hopIp = $nextIps[0];
}

// A missing robots.txt is not an error: report it as "allow everything".
if ($statusCode === 404) {
    echo json_encode([
        "content" => "# No robots.txt found\nUser-agent: *\nAllow: /",
        "status" => 404,
        "message" => "No robots.txt file found (this means the site allows all crawling by default)"
    ]);
    exit;
}

if ($content === false) {
    $abort(502, "Failed to fetch robots.txt - site may be unreachable");
}
|
||
|
|
|
||
|
|
// Success payload: the fetched body plus the URL requested and a fetch timestamp.
// robots.txt files are not guaranteed to be valid UTF-8. Without
// JSON_INVALID_UTF8_SUBSTITUTE (PHP 7.2+), json_encode() returns false for such
// content and the client receives an empty 200 response; with it, invalid
// bytes are replaced by U+FFFD and the rest of the file is still delivered.
echo json_encode([
    "content" => $content,
    "status" => 200,
    "url" => $url,
    "fetchedAt" => date("c"),
], JSON_INVALID_UTF8_SUBSTITUTE);
|