<?php
// Send HSTS header before any output (must precede all HTML).
header('Strict-Transport-Security: max-age=31536000; includeSubDomains');

// Article-specific SEO variables, consumed by the <head> meta tags,
// the JSON-LD Article schema, and the visible article header below.
$article_title       = 'Web Scraping Rate Limiting: Professional Implementation Guide';
$article_description = 'Master rate limiting techniques for ethical web scraping. Learn to implement respectful delays, adaptive throttling, and compliance strategies.';
$article_keywords    = 'web scraping rate limiting, scraping delays, ethical web scraping, rate limiting strategies, web scraping best practices, scraping throttling';
$article_author      = 'Michael Thompson';
$canonical_url       = 'https://ukdataservices.co.uk/blog/articles/web-scraping-rate-limiting';
// ISO 8601 timestamps used by article:published_time / article:modified_time
// and the JSON-LD datePublished / dateModified fields.
$article_published   = '2025-04-28T09:00:00+00:00';
$article_modified    = '2025-04-28T09:00:00+00:00';
// NOTE(review): most social crawlers (Facebook, LinkedIn, Twitter) do not
// render SVG og:images — consider a PNG/JPEG fallback. TODO confirm.
$og_image            = 'https://ukdataservices.co.uk/assets/images/icon-speed.svg';
$read_time           = 9; // estimated read time in minutes
?>
<! DOCTYPE html >
< html lang = " en " >
< head >
< meta charset = " UTF-8 " >
< meta name = " viewport " content = " width=device-width, initial-scale=1.0 " >
< title >< ? php echo htmlspecialchars ( $article_title ); ?> | UK Data Services Blog</title>
< meta name = " description " content = " <?php echo htmlspecialchars( $article_description ); ?> " >
< meta name = " keywords " content = " <?php echo htmlspecialchars( $article_keywords ); ?> " >
< meta name = " author " content = " <?php echo htmlspecialchars( $article_author ); ?> " >
< meta name = " robots " content = " index, follow " >
< link rel = " canonical " href = " <?php echo htmlspecialchars( $canonical_url ); ?> " >
<!-- Article - specific meta tags -->
< meta name = " article:published_time " content = " <?php echo $article_published ; ?> " >
< meta name = " article:modified_time " content = " <?php echo $article_modified ; ?> " >
< meta name = " article:author " content = " <?php echo htmlspecialchars( $article_author ); ?> " >
< meta name = " article:section " content = " Web Scraping " >
< meta name = " article:tag " content = " Rate Limiting, Web Scraping, Ethics, Best Practices " >
<!-- Preload critical resources -->
< link rel = " preload " href = " ../../assets/css/main.css " as = " style " >
< link rel = " preload " href = " ../../assets/images/ukds-main-logo.png " as = " image " >
<!-- Open Graph / Social Media -->
< meta property = " og:type " content = " article " >
< meta property = " og:url " content = " <?php echo htmlspecialchars( $canonical_url ); ?> " >
< meta property = " og:title " content = " <?php echo htmlspecialchars( $article_title ); ?> " >
< meta property = " og:description " content = " <?php echo htmlspecialchars( $article_description ); ?> " >
< meta property = " og:image " content = " <?php echo htmlspecialchars( $og_image ); ?> " >
<!-- Twitter Card -->
< meta name = " twitter:card " content = " summary_large_image " >
< meta name = " twitter:title " content = " <?php echo htmlspecialchars( $article_title ); ?> " >
< meta name = " twitter:description " content = " <?php echo htmlspecialchars( $article_description ); ?> " >
< meta name = " twitter:image " content = " <?php echo htmlspecialchars( $og_image ); ?> " >
<!-- Favicon and App Icons -->
< link rel = " icon " type = " image/svg+xml " href = " ../../assets/images/favicon.svg " >
< link rel = " apple-touch-icon " sizes = " 180x180 " href = " ../../assets/images/apple-touch-icon.svg " >
<!-- Fonts -->
< link rel = " preconnect " href = " https://fonts.googleapis.com " >
< link rel = " preconnect " href = " https://fonts.gstatic.com " crossorigin >
< link href = " https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400;500;600;700&family=Lato:wght@300;400;500;600;700&display=swap " rel = " stylesheet " >
<!-- Styles -->
< link rel = " stylesheet " href = " ../../assets/css/main.css " >
<link rel="stylesheet" href="../../assets/css/cro-enhancements.css">
<!-- Article Schema -->
< script type = " application/ld+json " >
{
" @context " : " https://schema.org " ,
" @type " : " Article " ,
" mainEntityOfPage " : {
" @type " : " WebPage " ,
" @id " : " <?php echo htmlspecialchars( $canonical_url ); ?> "
},
" headline " : " <?php echo htmlspecialchars( $article_title ); ?> " ,
" description " : " <?php echo htmlspecialchars( $article_description ); ?> " ,
" image " : " <?php echo htmlspecialchars( $og_image ); ?> " ,
" author " : {
" @type " : " Organization " ,
" name " : " UK Data Services " ,
" url " : " https://ukdataservices.co.uk "
},
" publisher " : {
" @type " : " Organization " ,
" name " : " UK Data Services " ,
" logo " : {
" @type " : " ImageObject " ,
" url " : " https://ukdataservices.co.uk/assets/images/ukds-main-logo.png "
}
},
" datePublished " : " <?php echo $article_published ; ?> " ,
" dateModified " : " <?php echo $article_modified ; ?> "
}
</ script >
</ head >
< body >
<!-- Skip to content link for accessibility -->
< a href = " #main-content " class = " skip-to-content " > Skip to main content </ a >
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?>
<!-- Article Content -->
< main id = " main-content " >
< article class = " article-page " >
< div class = " container " >
<div class="article-meta">
<span class="category"><a href="/blog/categories/web-scraping.php">Web Scraping</a></span>
<time datetime="2025-04-28">28 April 2025</time>
<span class="read-time"><?php echo (int) $read_time; ?> min read</span>
</ div >
< header class = " article-header " >
<h1><?php echo htmlspecialchars($article_title); ?></h1>
< p class = " article-lead " >< ? php echo htmlspecialchars ( $article_description ); ?> </p>
< div class = " article-author " >
< div class = " author-info " >
< span > By < ? php echo htmlspecialchars ( $article_author ); ?> </span>
</ div >
< div class = " share-buttons " >
< a href = " https://www.linkedin.com/sharing/share-offsite/?url=<?php echo urlencode( $canonical_url ); ?> " class = " share-button linkedin " aria - label = " Share on LinkedIn " rel = " noopener " target = " _blank " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/icon-linkedin.svg " alt = " LinkedIn " >
2025-06-08 11:21:30 +01:00
</ a >
< a href = " https://twitter.com/intent/tweet?url=<?php echo urlencode( $canonical_url ); ?>&text=<?php echo urlencode( $article_title ); ?> " class = " share-button twitter " aria - label = " Share on Twitter " rel = " noopener " target = " _blank " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/icon-twitter.svg " alt = " Twitter " >
2025-06-08 11:21:30 +01:00
</ a >
</ div >
</ div >
</ header >
< div class = " article-content " >
< div class = " content-wrapper " >
< h2 > Why Rate Limiting Matters in Web Scraping </ h2 >
< p > Rate limiting is fundamental to ethical and sustainable web scraping . It protects websites from overload , maintains good relationships with site owners , and helps avoid IP bans and legal issues . Professional scrapers understand that respectful data collection leads to long - term success .</ p >
< p > This guide covers comprehensive rate limiting strategies , from basic delays to sophisticated adaptive throttling systems that automatically adjust to website conditions .</ p >
< h2 > Understanding Rate Limiting Principles </ h2 >
< h3 > What is Rate Limiting ? </ h3 >
< p > Rate limiting controls the frequency of requests sent to a target website . It involves :</ p >
< ul >
< li >< strong > Request Frequency :</ strong > Number of requests per time period </ li >
< li >< strong > Concurrent Connections :</ strong > Simultaneous connections to a domain </ li >
< li >< strong > Bandwidth Usage :</ strong > Data transfer rate control </ li >
< li >< strong > Resource Respect :</ strong > Consideration for server capacity </ li >
</ ul >
< h3 > Why Rate Limiting is Essential </ h3 >
< ul >
< li >< strong > Legal Compliance :</ strong > Avoid violating terms of service </ li >
< li >< strong > Server Protection :</ strong > Prevent overwhelming target systems </ li >
< li >< strong > IP Preservation :</ strong > Avoid getting blocked or banned </ li >
< li >< strong > Data Quality :</ strong > Ensure consistent , reliable data collection </ li >
< li >< strong > Ethical Standards :</ strong > Maintain professional scraping practices </ li >
</ ul >
< h2 > Basic Rate Limiting Implementation </ h2 >
< h3 > Simple Delay Mechanisms </ h3 >
< pre >< code >
import time
import random
import requests
class BasicRateLimiter :
def __init__ ( self , delay_range = ( 1 , 3 )) :
self . min_delay = delay_range [ 0 ]
self . max_delay = delay_range [ 1 ]
self . last_request_time = 0
def wait ( self ) :
" " " Implement random delay between requests " " "
current_time = time . time ()
elapsed = current_time - self . last_request_time
# Calculate required delay
delay = random . uniform ( self . min_delay , self . max_delay )
if elapsed < delay :
sleep_time = delay - elapsed
print ( f " Rate limiting: sleeping for { sleep_time:.2f} seconds " )
time . sleep ( sleep_time )
self . last_request_time = time . time ()
def request ( self , url , ** kwargs ) :
" " " Make rate-limited request " " "
self . wait ()
return requests . get ( url , ** kwargs )
# Usage example
limiter = BasicRateLimiter ( delay_range = ( 2 , 5 ))
urls = [
" https://example.com/page1 " ,
" https://example.com/page2 " ,
" https://example.com/page3 "
]
for url in urls :
response = limiter . request ( url )
print ( f " Scraped { url}: { response.status_code} " )
</ code ></ pre >
< h3 > Domain - Specific Rate Limiting </ h3 >
< pre >< code >
from urllib . parse import urlparse
from collections import defaultdict
class DomainRateLimiter :
def __init__ ( self ) :
self . domain_delays = defaultdict ( lambda : 1.0 ) # Default 1 second
self . last_request_times = defaultdict ( float )
def set_domain_delay ( self , domain , delay ) :
" " " Set specific delay for a domain " " "
self . domain_delays [ domain ] = delay
def wait_for_domain ( self , url ) :
" " " Wait appropriate time for specific domain " " "
domain = urlparse ( url ) . netloc
current_time = time . time ()
last_request = self . last_request_times [ domain ]
required_delay = self . domain_delays [ domain ]
elapsed = current_time - last_request
if elapsed < required_delay :
sleep_time = required_delay - elapsed
time . sleep ( sleep_time )
self . last_request_times [ domain ] = time . time ()
def request ( self , url , ** kwargs ) :
" " " Make domain-aware rate-limited request " " "
self . wait_for_domain ( url )
return requests . get ( url , ** kwargs )
# Usage with different domain settings
limiter = DomainRateLimiter ()
limiter . set_domain_delay ( " api.example.com " , 0.5 ) # Fast API
limiter . set_domain_delay ( " slow-site.com " , 5.0 ) # Slow site
limiter . set_domain_delay ( " ecommerce.com " , 2.0 ) # E-commerce site
# Requests will be automatically rate-limited per domain
response1 = limiter . request ( " https://api.example.com/data " )
response2 = limiter . request ( " https://slow-site.com/page " )
response3 = limiter . request ( " https://ecommerce.com/products " )
</ code ></ pre >
< h2 > Advanced Rate Limiting Strategies </ h2 >
< h3 > Exponential Backoff </ h3 >
< pre >< code >
import math
class ExponentialBackoffLimiter :
def __init__ ( self , base_delay = 1.0 , max_delay = 60.0 ) :
self . base_delay = base_delay
self . max_delay = max_delay
self . consecutive_errors = defaultdict ( int )
self . domain_delays = defaultdict ( lambda : base_delay )
def calculate_delay ( self , domain , error_occurred = False ) :
" " " Calculate delay using exponential backoff " " "
if error_occurred :
self . consecutive_errors [ domain ] += 1
else :
self . consecutive_errors [ domain ] = 0
# Exponential backoff formula
error_count = self . consecutive_errors [ domain ]
delay = min (
self . base_delay * ( 2 ** error_count ),
self . max_delay
)
self . domain_delays [ domain ] = delay
return delay
def request_with_backoff ( self , url , max_retries = 3 ) :
" " " Make request with exponential backoff on errors " " "
domain = urlparse ( url ) . netloc
for attempt in range ( max_retries + 1 ) :
try :
delay = self . calculate_delay ( domain , error_occurred = False )
time . sleep ( delay )
response = requests . get ( url , timeout = 10 )
if response . status_code == 429 : # Too Many Requests
raise requests . exceptions . RequestException ( " Rate limited " )
response . raise_for_status ()
return response
except requests . exceptions . RequestException as e :
print ( f " Request failed (attempt { attempt + 1}): { e} " )
if attempt < max_retries :
error_delay = self . calculate_delay ( domain , error_occurred = True )
print ( f " Backing off for { error_delay:.2f} seconds " )
time . sleep ( error_delay )
else :
raise
# Usage
backoff_limiter = ExponentialBackoffLimiter ()
response = backoff_limiter . request_with_backoff ( " https://api.example.com/data " )
</ code ></ pre >
< h3 > Adaptive Rate Limiting </ h3 >
< pre >< code >
class AdaptiveRateLimiter :
def __init__ ( self , initial_delay = 1.0 ) :
self . domain_stats = defaultdict ( lambda : {
'delay' : initial_delay ,
'response_times' : [],
'success_rate' : 1.0 ,
'last_adjustment' : time . time ()
})
def record_response ( self , domain , response_time , success ) :
" " " Record response statistics " " "
stats = self . domain_stats [ domain ]
# Keep only recent response times (last 10)
stats [ 'response_times' ] . append ( response_time )
if len ( stats [ 'response_times' ]) > 10 :
stats [ 'response_times' ] . pop ( 0 )
# Update success rate (exponential moving average)
alpha = 0.1
stats [ 'success_rate' ] = (
alpha * ( 1 if success else 0 ) +
( 1 - alpha ) * stats [ 'success_rate' ]
)
def adjust_delay ( self , domain ) :
" " " Dynamically adjust delay based on performance " " "
stats = self . domain_stats [ domain ]
current_time = time . time ()
# Only adjust every 30 seconds
if current_time - stats [ 'last_adjustment' ] < 30 :
return stats [ 'delay' ]
avg_response_time = (
sum ( stats [ 'response_times' ]) / len ( stats [ 'response_times' ])
if stats [ 'response_times' ] else 1.0
)
# Adjustment logic
if stats [ 'success_rate' ] < 0.8 : # Low success rate
stats [ 'delay' ] *= 1.5 # Increase delay
elif avg_response_time > 5.0 : # Slow responses
stats [ 'delay' ] *= 1.2
elif stats [ 'success_rate' ] > 0.95 and avg_response_time < 2.0 :
stats [ 'delay' ] *= 0.9 # Decrease delay for good performance
# Keep delay within reasonable bounds
stats [ 'delay' ] = max ( 0.5 , min ( stats [ 'delay' ], 30.0 ))
stats [ 'last_adjustment' ] = current_time
return stats [ 'delay' ]
def request ( self , url ) :
" " " Make adaptive rate-limited request " " "
domain = urlparse ( url ) . netloc
delay = self . adjust_delay ( domain )
time . sleep ( delay )
start_time = time . time ()
try :
response = requests . get ( url , timeout = 10 )
response_time = time . time () - start_time
success = response . status_code == 200
self . record_response ( domain , response_time , success )
return response
except Exception as e :
response_time = time . time () - start_time
self . record_response ( domain , response_time , False )
raise
# Usage
adaptive_limiter = AdaptiveRateLimiter ()
# The limiter will automatically adjust delays based on performance
for i in range ( 100 ) :
try :
response = adaptive_limiter . request ( f " https://api.example.com/data/ { i} " )
print ( f " Request { i}: { response.status_code} " )
except Exception as e :
print ( f " Request { i} failed: { e} " )
</ code ></ pre >
< h2 > Distributed Rate Limiting </ h2 >
< h3 > Redis - Based Rate Limiting </ h3 >
< pre >< code >
import redis
import json
class DistributedRateLimiter :
def __init__ ( self , redis_url = 'redis://localhost:6379' ) :
self . redis_client = redis . from_url ( redis_url )
self . default_window = 60 # 1 minute window
self . default_limit = 30 # 30 requests per minute
def is_allowed ( self , domain , limit = None , window = None ) :
" " " Check if request is allowed using sliding window " " "
limit = limit or self . default_limit
window = window or self . default_window
current_time = time . time ()
key = f " rate_limit: { domain} "
# Use Redis pipeline for atomic operations
pipe = self . redis_client . pipeline ()
# Remove old entries outside the window
pipe . zremrangebyscore ( key , 0 , current_time - window )
# Count current requests in window
pipe . zcard ( key )
# Add current request
pipe . zadd ( key , { str ( current_time ) : current_time })
# Set expiry for cleanup
pipe . expire ( key , window )
results = pipe . execute ()
current_requests = results [ 1 ]
return current_requests < limit
def wait_if_needed ( self , domain , limit = None , window = None ) :
" " " Wait until request is allowed " " "
while not self . is_allowed ( domain , limit , window ) :
print ( f " Rate limit exceeded for { domain}, waiting... " )
time . sleep ( 1 )
def request ( self , url , ** kwargs ) :
" " " Make distributed rate-limited request " " "
domain = urlparse ( url ) . netloc
self . wait_if_needed ( domain )
return requests . get ( url , ** kwargs )
# Usage across multiple scraper instances
distributed_limiter = DistributedRateLimiter ()
# This will coordinate rate limiting across all instances
response = distributed_limiter . request ( " https://api.example.com/data " )
</ code ></ pre >
< h3 > Token Bucket Algorithm </ h3 >
< pre >< code >
class TokenBucket :
def __init__ ( self , capacity , refill_rate ) :
self . capacity = capacity
self . tokens = capacity
self . refill_rate = refill_rate # tokens per second
self . last_refill = time . time ()
def consume ( self , tokens = 1 ) :
" " " Try to consume tokens from bucket " " "
self . _refill ()
if self . tokens >= tokens :
self . tokens -= tokens
return True
return False
def _refill ( self ) :
" " " Refill tokens based on elapsed time " " "
current_time = time . time ()
elapsed = current_time - self . last_refill
# Add tokens based on elapsed time
tokens_to_add = elapsed * self . refill_rate
self . tokens = min ( self . capacity , self . tokens + tokens_to_add )
self . last_refill = current_time
def wait_for_tokens ( self , tokens = 1 ) :
" " " Wait until enough tokens are available " " "
while not self . consume ( tokens ) :
time . sleep ( 0.1 )
class TokenBucketRateLimiter :
def __init__ ( self ) :
self . buckets = {}
def get_bucket ( self , domain , capacity = 10 , refill_rate = 1.0 ) :
" " " Get or create token bucket for domain " " "
if domain not in self . buckets :
self . buckets [ domain ] = TokenBucket ( capacity , refill_rate )
return self . buckets [ domain ]
def request ( self , url , ** kwargs ) :
" " " Make token bucket rate-limited request " " "
domain = urlparse ( url ) . netloc
bucket = self . get_bucket ( domain )
# Wait for token availability
bucket . wait_for_tokens ()
return requests . get ( url , ** kwargs )
# Usage
token_limiter = TokenBucketRateLimiter ()
# Allows burst requests up to bucket capacity
# then throttles to refill rate
for i in range ( 20 ) :
response = token_limiter . request ( f " https://api.example.com/data/ { i} " )
print ( f " Request { i}: { response.status_code} " )
</ code ></ pre >
< h2 > Integration with Popular Libraries </ h2 >
< h3 > Scrapy Rate Limiting </ h3 >
< pre >< code >
# Custom Scrapy middleware for advanced rate limiting
from scrapy . downloadermiddlewares . delay import DelayMiddleware
class AdaptiveDelayMiddleware :
def __init__ ( self , delay = 1.0 ) :
self . delay = delay
self . domain_stats = defaultdict ( lambda : {
'delay' : delay ,
'errors' : 0 ,
'successes' : 0
})
@ classmethod
def from_crawler ( cls , crawler ) :
return cls (
delay = crawler . settings . getfloat ( 'DOWNLOAD_DELAY' , 1.0 )
)
def process_request ( self , request , spider ) :
domain = urlparse ( request . url ) . netloc
delay = self . calculate_delay ( domain )
if delay > 0 :
time . sleep ( delay )
def process_response ( self , request , response , spider ) :
domain = urlparse ( request . url ) . netloc
stats = self . domain_stats [ domain ]
if response . status == 200 :
stats [ 'successes' ] += 1
stats [ 'errors' ] = max ( 0 , stats [ 'errors' ] - 1 )
else :
stats [ 'errors' ] += 1
self . adjust_delay ( domain )
return response
def calculate_delay ( self , domain ) :
return self . domain_stats [ domain ][ 'delay' ]
def adjust_delay ( self , domain ) :
stats = self . domain_stats [ domain ]
if stats [ 'errors' ] > 3 :
stats [ 'delay' ] *= 1.5
elif stats [ 'successes' ] > 10 and stats [ 'errors' ] == 0 :
stats [ 'delay' ] *= 0.9
stats [ 'delay' ] = max ( 0.5 , min ( stats [ 'delay' ], 10.0 ))
# settings.py
DOWNLOADER_MIDDLEWARES = {
'myproject.middlewares.AdaptiveDelayMiddleware' : 543 ,
}
DOWNLOAD_DELAY = 1.0
RANDOMIZE_DOWNLOAD_DELAY = 0.5
</ code ></ pre >
< h3 > Requests - HTML Rate Limiting </ h3 >
< pre >< code >
from requests_html import HTMLSession
class RateLimitedSession ( HTMLSession ) :
def __init__ ( self , rate_limiter = None ) :
super () . __init__ ()
self . rate_limiter = rate_limiter or BasicRateLimiter ()
def get ( self , url , ** kwargs ) :
" " " Override get method with rate limiting " " "
self . rate_limiter . wait_for_domain ( url )
return super () . get ( url , ** kwargs )
def post ( self , url , ** kwargs ) :
" " " Override post method with rate limiting " " "
self . rate_limiter . wait_for_domain ( url )
return super () . post ( url , ** kwargs )
# Usage
session = RateLimitedSession (
rate_limiter = DomainRateLimiter ()
)
response = session . get ( 'https://example.com' )
response . html . render () # JavaScript rendering with rate limiting
</ code ></ pre >
< h2 > Monitoring and Analytics </ h2 >
< h3 > Rate Limiting Metrics </ h3 >
< pre >< code >
import logging
from collections import defaultdict
class RateLimitingMonitor :
def __init__ ( self ) :
self . metrics = defaultdict ( lambda : {
'requests_made' : 0 ,
'requests_blocked' : 0 ,
'total_delay_time' : 0 ,
'errors' : 0
})
# Setup logging
logging . basicConfig (
level = logging . INFO ,
format = '%(asctime)s - %(levelname)s - %(message)s' ,
handlers = [
logging . FileHandler ( 'rate_limiting.log' ),
logging . StreamHandler ()
]
)
self . logger = logging . getLogger ( __name__ )
def log_request ( self , domain , delay_time , success = True ) :
" " " Log request metrics " " "
metrics = self . metrics [ domain ]
metrics [ 'requests_made' ] += 1
metrics [ 'total_delay_time' ] += delay_time
if not success :
metrics [ 'errors' ] += 1
self . logger . info ( f " Domain: { domain}, Delay: { delay_time:.2f}s, Success: { success} " )
def log_rate_limit_hit ( self , domain ) :
" " " Log when rate limit is encountered " " "
self . metrics [ domain ][ 'requests_blocked' ] += 1
self . logger . warning ( f " Rate limit hit for domain: { domain} " )
def get_statistics ( self ) :
" " " Get comprehensive statistics " " "
stats = {}
for domain , metrics in self . metrics . items () :
total_requests = metrics [ 'requests_made' ]
if total_requests > 0 :
stats [ domain ] = {
'total_requests' : total_requests ,
'requests_blocked' : metrics [ 'requests_blocked' ],
'error_rate' : metrics [ 'errors' ] / total_requests ,
'avg_delay' : metrics [ 'total_delay_time' ] / total_requests ,
'block_rate' : metrics [ 'requests_blocked' ] / total_requests
}
return stats
def print_report ( self ) :
" " " Print detailed statistics report " " "
stats = self . get_statistics ()
print ( " \n " + " = " * 60 )
print ( " RATE LIMITING STATISTICS REPORT " )
print ( " = " * 60 )
for domain , metrics in stats . items () :
print ( f " \n Domain: { domain} " )
print ( f " Total Requests: { metrics['total_requests']} " )
print ( f " Requests Blocked: { metrics['requests_blocked']} " )
print ( f " Error Rate: { metrics['error_rate']:.2%} " )
print ( f " Average Delay: { metrics['avg_delay']:.2f}s " )
print ( f " Block Rate: { metrics['block_rate']:.2%} " )
# Usage
monitor = RateLimitingMonitor ()
class MonitoredRateLimiter ( BasicRateLimiter ) :
def __init__ ( self , monitor , * args , ** kwargs ) :
super () . __init__ ( * args , ** kwargs )
self . monitor = monitor
def request ( self , url , ** kwargs ) :
domain = urlparse ( url ) . netloc
start_time = time . time ()
try :
response = super () . request ( url , ** kwargs )
delay_time = time . time () - start_time
success = response . status_code == 200
self . monitor . log_request ( domain , delay_time , success )
return response
except Exception as e :
delay_time = time . time () - start_time
self . monitor . log_request ( domain , delay_time , False )
raise
# Use monitored rate limiter
limiter = MonitoredRateLimiter ( monitor , delay_range = ( 1 , 3 ))
# After scraping session
monitor . print_report ()
</ code ></ pre >
< h2 > Best Practices and Recommendations </ h2 >
< h3 > General Guidelines </ h3 >
< ul >
< li >< strong > Start Conservative :</ strong > Begin with longer delays and adjust down </ li >
< li >< strong > Respect robots . txt :</ strong > Check crawl - delay directives </ li >
< li >< strong > Monitor Server Response :</ strong > Watch for 429 status codes </ li >
< li >< strong > Use Random Delays :</ strong > Avoid predictable patterns </ li >
< li >< strong > Implement Backoff :</ strong > Increase delays on errors </ li >
</ ul >
< h3 > Domain - Specific Strategies </ h3 >
< ul >
< li >< strong > E - commerce Sites :</ strong > 2 - 5 second delays during peak hours </ li >
< li >< strong > News Websites :</ strong > 1 - 3 second delays , respect peak traffic </ li >
< li >< strong > APIs :</ strong > Follow documented rate limits strictly </ li >
< li >< strong > Government Sites :</ strong > Very conservative approach ( 5 + seconds ) </ li >
< li >< strong > Social Media :</ strong > Use official APIs when possible </ li >
</ ul >
< h3 > Legal and Ethical Considerations </ h3 >
< ul >
< li > Review terms of service before scraping </ li >
< li > Identify yourself with proper User - Agent headers </ li >
< li > Consider reaching out for API access </ li >
< li > Respect copyright and data protection laws </ li >
< li > Implement circuit breakers for server protection </ li >
</ ul >
< div class = " article-cta " >
< h3 > Professional Rate Limiting Solutions </ h3 >
< p > UK Data Services implements sophisticated rate limiting strategies for ethical , compliant web scraping that respects website resources while maximizing data collection efficiency .</ p >
<a href="/quote" class="btn btn-primary">Get Rate Limiting Consultation</a>
</ div >
</ div >
</ div >
<!-- Related Articles -->
<aside class="related-articles">
<h3>Related Articles</h3>
<div class="related-grid">
<article class="related-card">
<span class="category">Web Scraping</span>
<h4><a href="handling-captchas-scraping.php">Handling CAPTCHAs in Web Scraping: Complete Guide</a></h4>
<span class="read-time">8 min read</span>
</article>
<article class="related-card">
<span class="category">Web Scraping</span>
<h4><a href="python-scrapy-enterprise-guide.php">Python Scrapy Enterprise Guide: Scaling Web Scraping Operations</a></h4>
<span class="read-time">12 min read</span>
</article>
<article class="related-card">
<span class="category">Compliance</span>
<h4><a href="web-scraping-compliance-uk-guide.php">Complete Guide to Web Scraping Compliance in the UK</a></h4>
<span class="read-time">12 min read</span>
</article>
</div>
</aside>
</ div >
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
</article>
</main>
<!-- Footer -->
< footer class = " footer " >
< div class = " container " >
< div class = " footer-content " >
< div class = " footer-section " >
< div class = " footer-logo " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/logo-white.svg " alt = " UK Data Services " loading = " lazy " >
2025-06-08 11:21:30 +01:00
</ div >
< p > Enterprise data intelligence solutions for modern British business .</ p >
</ div >
< div class = " footer-section " >
< h3 > Quick Links </ h3 >
<ul>
<li><a href="/#services">Services</a></li>
<li><a href="/blog/">Blog</a></li>
<li><a href="/case-studies/">Case Studies</a></li>
<li><a href="/about">About</a></li>
<li><a href="/#contact">Contact</a></li>
</ul>
</ div >
< div class = " footer-section " >
< h3 > Legal </ h3 >
<ul>
<li><a href="/privacy-policy">Privacy Policy</a></li>
<li><a href="/terms-of-service">Terms of Service</a></li>
<li><a href="/cookie-policy">Cookie Policy</a></li>
<li><a href="/gdpr-compliance">GDPR Compliance</a></li>
</ul>
</ div >
</ div >
< div class = " footer-bottom " >
< p >& copy ; < ? php echo date ( 'Y' ); ?> UK Data Services. All rights reserved.</p>
< div class = " social-links " >
2025-12-21 08:08:45 +00:00
< a href = " https://linkedin.com/company/uk-data-services " aria - label = " LinkedIn " rel = " noopener " target = " _blank " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/icon-linkedin.svg " alt = " LinkedIn " loading = " lazy " >
2025-06-08 11:21:30 +01:00
</ a >
< a href = " https://twitter.com/ukdataservices " aria - label = " Twitter " rel = " noopener " target = " _blank " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/icon-twitter.svg " alt = " Twitter " loading = " lazy " >
2025-06-08 11:21:30 +01:00
</ a >
</ div >
</ div >
</ div >
</ footer >
<!-- Scripts -->
< script src = " ../../assets/js/main.js " ></ script >
<script src="../../assets/js/cro-enhancements.js"></script>
</ body >
</ html >