- Restore title to Feb 16 baseline (was ranking pos 12.8): "Web Scraping Services UK | Fast, GDPR-Compliant Data Extraction" - Restore original meta description targeting web scraping services for UK businesses - Add 301 redirects for /web-scraping-services/ and /data-scraping-services/ → /services/web-scraping (duplicate pages were cannibalizing the main page) - Fix blog article schema headline from generic "statistical validation" phrase to "Data Quality Validation for Web Scraping Pipelines" to stop attracting irrelevant academic queries in GSC
252 lines
9.7 KiB
ApacheConf
252 lines
9.7 KiB
ApacheConf
# Redirect www to non-www (the target is always https)
RewriteEngine On
# The dot after "www" is escaped so that only a literal "www." prefix matches
# (an unescaped dot matches any character), and (.+) requires a non-empty
# host name after the prefix so the redirect target is never "https:///".
RewriteCond %{HTTP_HOST} ^www\.(.+)$ [NC]
RewriteRule ^(.*)$ https://%1/$1 [L,R=301]
|
|
|
|
# Custom error pages
|
|
ErrorDocument 403 /403.php
|
|
ErrorDocument 404 /404.php
|
|
ErrorDocument 500 /500.php
|
|
|
|
# Security Rules for UK Data Services
|
|
|
|
# Protect sensitive files and configs. Denied:
#   - every dotfile (.htaccess, .htpasswd, .email-config.php, ...)
#   - files ending in .log, .sql, .conf or .bak
#   - files ending in config.php
#   - any name containing .htaccess or .htpasswd
#   - names where .ini, .sh or .inc is a complete extension segment, i.e.
#     followed by another dot or the end of the name. db.inc.php and
#     deploy.sh.old stay blocked; page.shtml or notes.include.txt are no
#     longer caught by accident, which the previous unanchored
#     alternatives did.
<FilesMatch "^\.|\.(log|sql|conf|bak)$|config\.php$|\.ht(access|passwd)|\.(ini|sh|inc)(\.|$)">
    Require all denied
</FilesMatch>
|
|
|
|
# Form handlers are POST endpoints only: every other HTTP method
# (including a plain browser GET) is refused.
<Files "quote-handler.php">
    <LimitExcept POST>
        Require all denied
    </LimitExcept>
</Files>

<Files "contact-handler.php">
    <LimitExcept POST>
        Require all denied
    </LimitExcept>
</Files>
|
|
|
|
# Security headers
<IfModule mod_headers.c>
    # "always" puts these on every response, error responses included.
    Header always set Permissions-Policy "geolocation=(), microphone=(), camera=(), payment=(), usb=()"
    Header always set Referrer-Policy "strict-origin-when-cross-origin"
    Header always set X-Frame-Options "SAMEORIGIN"
    Header always set X-Content-Type-Options "nosniff"

    # CRITICAL: form pages embed session-specific CSRF tokens, so they
    # must never be stored or reused by any cache.
    <FilesMatch "(quote|contact)\.php$">
        Header set Expires "Sat, 01 Jan 2000 00:00:00 GMT"
        Header set Pragma "no-cache"
        Header set Cache-Control "no-store, no-cache, must-revalidate, max-age=0"
    </FilesMatch>
</IfModule>
|
|
|
|
# Enhanced Gzip compression
<IfModule mod_deflate.c>
    # Text, script, structured-data, SVG and font responses are compressed.
    AddOutputFilterByType DEFLATE text/html text/plain text/xml text/css text/javascript
    AddOutputFilterByType DEFLATE application/javascript application/x-javascript
    AddOutputFilterByType DEFLATE application/xml application/xhtml+xml application/rss+xml
    AddOutputFilterByType DEFLATE application/json application/ld+json
    AddOutputFilterByType DEFLATE image/svg+xml
    AddOutputFilterByType DEFLATE font/ttf font/otf font/eot font/woff font/woff2

    # Legacy-browser workarounds. BrowserMatch is provided by mod_setenvif,
    # so it gets its own guard instead of relying on the mod_deflate check.
    <IfModule mod_setenvif.c>
        BrowserMatch ^Mozilla/4 gzip-only-text/html
        BrowserMatch ^Mozilla/4\.0[678] no-gzip
        BrowserMatch \bMSIE !no-gzip !gzip-only-text/html
    </IfModule>

    # Header is provided by mod_headers; without this guard a server that
    # has mod_deflate but not mod_headers would fail every request with a
    # configuration error.
    <IfModule mod_headers.c>
        Header append Vary User-Agent
    </IfModule>
</IfModule>
|
|
|
|
# Enable Brotli compression if available
|
|
<IfModule mod_brotli.c>
|
|
AddOutputFilterByType BROTLI_COMPRESS text/html text/plain text/xml text/css text/javascript
|
|
AddOutputFilterByType BROTLI_COMPRESS application/javascript application/x-javascript
|
|
AddOutputFilterByType BROTLI_COMPRESS application/xml application/xhtml+xml application/rss+xml
|
|
AddOutputFilterByType BROTLI_COMPRESS application/json application/ld+json
|
|
AddOutputFilterByType BROTLI_COMPRESS image/svg+xml
|
|
AddOutputFilterByType BROTLI_COMPRESS font/ttf font/otf font/woff font/woff2
|
|
</IfModule>
|
|
|
|
# Browser Caching Headers
|
|
<IfModule mod_expires.c>
|
|
ExpiresActive On
|
|
|
|
# Images - 1 year
|
|
ExpiresByType image/jpeg "access plus 1 year"
|
|
ExpiresByType image/jpg "access plus 1 year"
|
|
ExpiresByType image/gif "access plus 1 year"
|
|
ExpiresByType image/png "access plus 1 year"
|
|
ExpiresByType image/webp "access plus 1 year"
|
|
ExpiresByType image/svg+xml "access plus 1 year"
|
|
ExpiresByType image/x-icon "access plus 1 year"
|
|
ExpiresByType image/ico "access plus 1 year"
|
|
|
|
# Fonts - 1 year
|
|
ExpiresByType font/ttf "access plus 1 year"
|
|
ExpiresByType font/otf "access plus 1 year"
|
|
ExpiresByType font/woff "access plus 1 year"
|
|
ExpiresByType font/woff2 "access plus 1 year"
|
|
ExpiresByType application/font-woff "access plus 1 year"
|
|
ExpiresByType application/font-woff2 "access plus 1 year"
|
|
|
|
# CSS and JavaScript - 1 month
|
|
ExpiresByType text/css "access plus 1 month"
|
|
ExpiresByType application/javascript "access plus 1 month"
|
|
ExpiresByType text/javascript "access plus 1 month"
|
|
ExpiresByType application/x-javascript "access plus 1 month"
|
|
|
|
# HTML and PHP - 1 hour
|
|
ExpiresByType text/html "access plus 1 hour"
|
|
ExpiresByType application/xhtml+xml "access plus 1 hour"
|
|
|
|
# Data - no cache
|
|
ExpiresByType application/json "access plus 0 seconds"
|
|
ExpiresByType application/xml "access plus 0 seconds"
|
|
ExpiresByType text/xml "access plus 0 seconds"
|
|
|
|
# Default - 1 week
|
|
ExpiresDefault "access plus 1 week"
|
|
</IfModule>
|
|
|
|
# Cache-Control Headers
<IfModule mod_headers.c>
    # Static assets (images and fonts) - 1 year, immutable
    <FilesMatch "\.(jpg|jpeg|png|gif|webp|svg|ico|woff|woff2|ttf|otf|eot)$">
        Header set Cache-Control "max-age=31536000, public, immutable"
    </FilesMatch>

    # CSS and JS - 1 month
    <FilesMatch "\.(css|js)$">
        Header set Cache-Control "max-age=2592000, public"
    </FilesMatch>

    # Static HTML - 1 hour. This pattern matches .html files only; .php
    # responses are not touched by this section.
    <FilesMatch "\.(html)$">
        Header set Cache-Control "max-age=3600, public, must-revalidate"
    </FilesMatch>

    # The former "Header set Connection keep-alive" was removed on purpose:
    # Connection is a hop-by-hop header that the server generates itself from
    # its keep-alive settings, so overwriting it here does not enable
    # persistent connections and can contradict what the server actually does.
</IfModule>
|
|
|
|
# HTTP/2 Server Push: advertise the critical assets of index.php through
# Link preload headers.
<IfModule mod_http2.c>
    # Header is a mod_headers directive, so it is guarded separately; the
    # mod_http2 check alone does not guarantee that it is available.
    <IfModule mod_headers.c>
        # Anchored so that only files named exactly index.php match
        # (the unanchored form also matched names such as reindex.php).
        <FilesMatch "^index\.php$">
            Header add Link "</assets/css/main.min.css>; rel=preload; as=style"
            Header add Link "</assets/images/ukds-main-logo.webp>; rel=preload; as=image"
            Header add Link "</assets/js/main.min.js>; rel=preload; as=script"
        </FilesMatch>
    </IfModule>
</IfModule>
|
|
|
|
# ETags: do not generate them for files, and strip any that are still set.
FileETag None
# Header needs mod_headers; guarded like every other Header use in this file
# so a missing module cannot break all requests.
<IfModule mod_headers.c>
    Header unset ETag
</IfModule>
|
|
|
|
# Disable directory browsing
|
|
Options -Indexes
|
|
|
|
# URL rewriting: directory/scanner blocks, service-page routing, 301
# redirects and extensionless ("clean") URLs.
<IfModule mod_rewrite.c>
    RewriteEngine On

    # Security rules - prevent access to logs, database, VCS and docker
    # directories. These must run before everything else: the clean-URL rule
    # at the bottom finishes with [END], so when these rules were placed
    # after it, an extensionless request that mapped to an existing .php
    # file inside one of these directories was served without ever being
    # checked.
    RewriteRule ^logs(/.*)?$ - [F,L]
    RewriteRule ^database(/.*)?$ - [F,L]
    RewriteRule ^\.git(/.*)?$ - [F,L]
    RewriteRule ^docker(/.*)?$ - [F,L]

    # Block known scanner IPs
    RewriteCond %{REMOTE_ADDR} ^(20\.63\.96\.50|4\.193\.248\.52)$
    RewriteRule ^ - [F,L]

    # Block requests for PHP files that don't exist (webshell scanners)
    RewriteCond %{REQUEST_FILENAME} !-f
    RewriteRule \.php$ - [F,L]

    # Existing /services/*.php files are served as-is; this also stops the
    # rules below from acting on a path that was already rewritten to .php.
    RewriteCond %{REQUEST_FILENAME} -f
    RewriteRule ^services/.*\.php$ - [L]

    # Explicitly allow existing service pages
    RewriteRule ^services/competitive-intelligence/?$ /services/competitive-intelligence.php [L]
    RewriteRule ^services/data-cleaning/?$ /services/data-cleaning.php [L]
    RewriteRule ^services/financial-data-services/?$ /services/financial-data-services.php [L]
    RewriteRule ^services/price-monitoring/?$ /services/price-monitoring.php [L]
    RewriteRule ^services/property-data-extraction/?$ /services/property-data-extraction.php [L]
    RewriteRule ^services/web-scraping/?$ /services/web-scraping.php [L]
    RewriteRule ^services/csharp-development-services/?$ /services/csharp-development-services.php [L]
    RewriteRule ^services/data-processing-services/?$ /services/data-processing-services.php [L]

    # Redirect /services index to project-types
    RewriteRule ^services/?$ /project-types [R=301,L]

    RewriteRule ^services/data-analytics-london/?$ /services/data-analytics-london.php [L]
    RewriteRule ^services/data-analytics-consultancy-london/?$ /services/data-analytics-consultancy-london.php [L]
    RewriteRule ^services/data-validation-cleaning/?$ /services/data-validation-cleaning.php [L]
    RewriteRule ^services/data-analytics-services-uk/?$ /services/data-analytics-services-uk.php [L]

    # Alternative web-scraping URLs are permanently redirected to the main
    # web-scraping service page.
    RewriteRule ^services/web-scraping-companies/?$ /services/web-scraping [R=301,L]
    RewriteRule ^services/data-scraping/?$ /services/web-scraping [R=301,L]
    RewriteRule ^web-scraping-services/?$ /services/web-scraping [R=301,L]
    RewriteRule ^data-scraping-services/?$ /services/web-scraping [R=301,L]

    # Redirect unknown service pages to project-types.
    # Must stay after every explicit ^services/... rule above.
    RewriteRule ^services/(.+)$ /project-types [R=301,L]

    # 301 Redirects for renamed pages
    RewriteRule ^blog/articles/data-analytics-companies-london-top-providers/?$ /blog/articles/data-analytics-companies-london-top-providers-compared [R=301,L]
    RewriteRule ^blog/articles/gdpr-compliance-web-scraping-uk-guide/?$ /blog/articles/web-scraping-compliance-uk-guide [R=301,L]

    # Clean URL rewriting - remove .php extension.
    # Applies only when the request is neither a directory nor a file but
    # "<request>.php" exists; [END] stops all further rewrite processing.
    RewriteCond %{REQUEST_FILENAME} !-d
    RewriteCond %{REQUEST_FILENAME} !-f
    RewriteCond %{REQUEST_FILENAME}.php -f
    RewriteRule ^(.+?)/?$ $1.php [END]
</IfModule>
|
|
|
|
# Disable server signature
|
|
ServerSignature Off
|
|
|
|
# === Page Speed Optimizations ===
|
|
|
|
# Enable Gzip compression
|
|
<IfModule mod_deflate.c>
|
|
AddOutputFilterByType DEFLATE text/html text/plain text/css text/javascript application/javascript application/json image/svg+xml
|
|
</IfModule>
|
|
|
|
# Browser caching
|
|
<IfModule mod_expires.c>
|
|
ExpiresActive On
|
|
ExpiresByType image/jpg "access plus 1 year"
|
|
ExpiresByType image/jpeg "access plus 1 year"
|
|
ExpiresByType image/gif "access plus 1 year"
|
|
ExpiresByType image/png "access plus 1 year"
|
|
ExpiresByType image/webp "access plus 1 year"
|
|
ExpiresByType image/svg+xml "access plus 1 year"
|
|
ExpiresByType text/css "access plus 1 month"
|
|
ExpiresByType application/javascript "access plus 1 month"
|
|
ExpiresByType text/javascript "access plus 1 month"
|
|
ExpiresByType application/pdf "access plus 1 month"
|
|
ExpiresByType image/x-icon "access plus 1 year"
|
|
ExpiresDefault "access plus 2 days"
|
|
</IfModule>
|
|
|
|
# Cache-Control headers
<IfModule mod_headers.c>
    # Images, icons, PDF and FLV — long cache is safe since filenames don't change
    <FilesMatch "\.(ico|pdf|flv|jpg|jpeg|png|gif|webp|svg)$">
        Header set Cache-Control "max-age=31536000, public"
    </FilesMatch>

    # CSS and JS — short cache + must-revalidate so edits propagate within 1 hour
    <FilesMatch "\.(css|js)$">
        Header set Cache-Control "max-age=3600, public, must-revalidate"
    </FilesMatch>

    # HTML and PHP pages — 10 minutes, browser cache only
    <FilesMatch "\.(html|htm|php)$">
        Header set Cache-Control "max-age=600, private, must-revalidate"
    </FilesMatch>

    # Form pages carry session-specific CSRF tokens and must never be cached.
    # The generic .php rule directly above also matches quote.php and
    # contact.php, and with "Header set" the last matching section wins, so
    # the no-store policy has to be applied after it. Keep this section last.
    <FilesMatch "(quote|contact)\.php$">
        Header set Cache-Control "no-store, no-cache, must-revalidate, max-age=0"
        Header set Pragma "no-cache"
        Header set Expires "Sat, 01 Jan 2000 00:00:00 GMT"
    </FilesMatch>
</IfModule>
|
|
|
|
# Keep-Alive
# Intentionally no "Header set Connection keep-alive" here. Connection is a
# hop-by-hop header that the server writes itself according to its keep-alive
# settings; forcing it through mod_headers does not enable persistent
# connections and can contradict the server's own decision to close one.
# Persistent connections are configured in the main server configuration
# (KeepAlive, MaxKeepAliveRequests, KeepAliveTimeout), not in .htaccess.
|