Security hardening + new tools deployment
- Hide Apache version (ServerTokens Prod)
- Add Permissions-Policy header
- Remove deprecated X-XSS-Protection
- Consolidate security headers to .htaccess only (remove duplicates from PHP)
- Deploy free tools: robots-analyzer, data-converter
- Deploy tools announcement blog post
- Update sitemap with new tools and blog post
This commit is contained in:
122
api/fetch-robots.php
Normal file
122
api/fetch-robots.php
Normal file
@@ -0,0 +1,122 @@
|
||||
<?php

/**
 * API endpoint to fetch robots.txt files.
 *
 * Acts as a CORS-friendly proxy so the browser-based robots analyzer can
 * read robots.txt files without hitting same-origin restrictions.
 *
 * SECURITY: blocks internal/private addresses to prevent SSRF. Redirects
 * are followed manually (max 3 hops) so that every hop is re-validated;
 * the previous implementation used follow_location=1, which let a
 * malicious host pass the initial IP check and then redirect the server
 * to an internal address.
 *
 * Query parameters:
 *   url — absolute http(s) URL whose path is exactly /robots.txt
 *
 * JSON responses:
 *   200 — { content, status, url, fetchedAt } (an upstream 404 gets a
 *          friendly "allows all crawling" placeholder body instead)
 *   400 — validation failure
 *   502 — upstream unreachable
 */

header("Content-Type: application/json");
header("Access-Control-Allow-Origin: *");
header("Access-Control-Allow-Methods: GET");
header("Cache-Control: public, max-age=300");

/**
 * Emit a JSON error payload with the given HTTP status and stop.
 */
function respondError(int $status, string $message): void
{
    http_response_code($status);
    echo json_encode(["error" => $message]);
    exit;
}

/**
 * Validate that $url is an http(s) URL pointing at /robots.txt on a
 * publicly routable host.
 *
 * @return string|null Error message, or null when the URL is acceptable.
 */
function validateRobotsUrl(string $url): ?string
{
    if (!filter_var($url, FILTER_VALIDATE_URL)) {
        return "Invalid URL";
    }

    $parsed = parse_url($url);
    $scheme = strtolower($parsed["scheme"] ?? "");
    $host   = $parsed["host"] ?? "";
    $path   = $parsed["path"] ?? "";

    if (!in_array($scheme, ["http", "https"], true)) {
        return "Only http/https URLs allowed";
    }

    // Path must be exactly /robots.txt — this endpoint is not a general proxy.
    if ($path !== "/robots.txt") {
        return "Only /robots.txt paths allowed";
    }

    if (!empty($parsed["query"]) || !empty($parsed["fragment"])) {
        return "Query strings not allowed";
    }

    // gethostbyname() returns its argument unchanged when resolution fails,
    // so a bare comparison detects failure (and rejects unresolvable
    // internal hostnames as a side effect).
    $ip = gethostbyname($host);
    if ($ip === $host) {
        return "Could not resolve hostname";
    }

    // Block private and reserved IP ranges (SSRF protection). NO_RES_RANGE
    // also rejects loopback (127.0.0.0/8) on current PHP versions.
    $flags = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
    if (!filter_var($ip, FILTER_VALIDATE_IP, $flags)) {
        return "Internal addresses not allowed";
    }

    // Belt-and-braces for IPv6 loopback / link-local / ULA literals.
    if (preg_match("/^(::1|fe80:|fc00:|fd00:)/i", $ip)) {
        return "Internal addresses not allowed";
    }

    return null;
}

$url = $_GET["url"] ?? "";

if (empty($url)) {
    respondError(400, "URL parameter required");
}

$error = validateRobotsUrl($url);
if ($error !== null) {
    respondError(400, $error);
}

$context = stream_context_create([
    "http" => [
        "timeout" => 10,
        "user_agent" => "UK Data Services Robots Analyzer (+https://ukdataservices.co.uk/tools/robots-analyzer)",
        // Redirects are handled manually below so each target can be
        // re-validated against the SSRF rules before it is fetched.
        "follow_location" => 0,
        // Return the body even on 4xx/5xx so the status code can be read
        // without relying on file_get_contents() returning false.
        "ignore_errors" => true
    ],
    "ssl" => [
        "verify_peer" => true,
        "verify_peer_name" => true
    ]
]);

$maxRedirects = 3;
$current = $url;
$content = false;
$statusCode = 0;

for ($hop = 0; $hop <= $maxRedirects; $hop++) {
    // Cap the response at 512 KiB — real robots.txt files are tiny, and
    // the cap stops a hostile host from exhausting server memory.
    $content = @file_get_contents($current, false, $context, 0, 524288);

    $statusCode = 0;
    $location = null;
    foreach ($http_response_header ?? [] as $header) {
        if (preg_match("/^HTTP\/\d+(?:\.\d+)?\s+(\d+)/", $header, $m)) {
            $statusCode = intval($m[1]);
        } elseif (preg_match("/^Location:\s*(\S+)/i", $header, $m)) {
            $location = $m[1];
        }
    }

    if ($statusCode < 300 || $statusCode >= 400 || $location === null) {
        break; // Not a redirect — done fetching.
    }

    // Resolve a relative Location header against the current URL.
    if (!preg_match("#^https?://#i", $location)) {
        $p = parse_url($current);
        $base = $p["scheme"] . "://" . $p["host"]
            . (isset($p["port"]) ? ":" . $p["port"] : "");
        $location = $base . ($location[0] === "/" ? $location : "/" . $location);
    }

    // Every redirect target must pass the same SSRF validation as the
    // original URL before it is fetched.
    if (validateRobotsUrl($location) !== null) {
        respondError(400, "Redirect to disallowed address");
    }
    $current = $location;
}

// $statusCode === 0 means no HTTP response was received at all
// (DNS/connect/timeout failure).
if ($content === false || $statusCode === 0) {
    respondError(502, "Failed to fetch robots.txt - site may be unreachable");
}

if ($statusCode === 404) {
    echo json_encode([
        "content" => "# No robots.txt found\nUser-agent: *\nAllow: /",
        "status" => 404,
        "message" => "No robots.txt file found (this means the site allows all crawling by default)"
    ]);
    exit;
}

// Any remaining redirect (loop exhausted) or error status is treated as
// an upstream failure, matching the original follow_location behaviour.
if ($statusCode >= 300) {
    respondError(502, "Failed to fetch robots.txt - site may be unreachable");
}

// JSON_INVALID_UTF8_SUBSTITUTE (PHP 7.2+) keeps the response valid JSON
// even when the upstream file is not valid UTF-8.
echo json_encode([
    "content" => $content,
    "status" => 200,
    "url" => $url,
    "fetchedAt" => date("c")
], JSON_INVALID_UTF8_SUBSTITUTE);
Reference in New Issue
Block a user