- Remap 20 articles from generic team names (UK Data Services Legal Team, Analytics Team, Technical Team etc.) to matching named authors from the author database (Sarah Chen, David Martinez, Michael Thompson, etc.) - Add 5 new named authors to author-bio.php: Alex Kumar, David Thompson, Emily Roberts, Michael Chen, Sarah Mitchell - Eliminates author name/bio mismatch where team name showed but Editorial Team bio/role rendered instead
848 lines
34 KiB
PHP
848 lines
34 KiB
PHP
<?php

declare(strict_types=1);

// Enhanced security headers: force HTTPS for one year, including subdomains.
header('Strict-Transport-Security: max-age=31536000; includeSubDomains');

// Article-specific SEO variables.
// Consumed further down by the <head> meta tags, the Open Graph / Twitter
// cards, and the JSON-LD Article schema.
$article_title = "Web Scraping Rate Limiting: Professional Implementation Guide";
$article_description = "Master rate limiting techniques for ethical web scraping. Learn to implement respectful delays, adaptive throttling, and compliance strategies.";
$article_keywords = "web scraping rate limiting, scraping delays, ethical web scraping, rate limiting strategies, web scraping best practices, scraping throttling";
$article_author = "Michael Thompson";
$canonical_url = "https://ukdataservices.co.uk/blog/articles/web-scraping-rate-limiting";

// ISO 8601 timestamps, echoed into article:published_time / datePublished.
$article_published = "2025-04-28T09:00:00+00:00";
$article_modified = "2025-04-28T09:00:00+00:00";

// NOTE(review): several social-media crawlers (Facebook, Twitter) reject SVG
// og:image values and expect a raster format — consider a PNG/JPEG here.
$og_image = "https://ukdataservices.co.uk/assets/images/icon-speed.svg";

// Estimated reading time in minutes (also rendered in the article meta bar).
$read_time = 9;
?>
|
|
<!DOCTYPE html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
<title><?php echo htmlspecialchars($article_title); ?> | UK Data Services Blog</title>
|
|
<meta name="description" content="<?php echo htmlspecialchars($article_description); ?>">
|
|
<meta name="keywords" content="<?php echo htmlspecialchars($article_keywords); ?>">
|
|
<meta name="author" content="<?php echo htmlspecialchars($article_author); ?>">
|
|
<meta name="robots" content="index, follow">
|
|
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
|
|
|
|
<!-- Article-specific meta tags -->
|
|
<meta property="article:published_time" content="<?php echo $article_published; ?>">
<meta property="article:modified_time" content="<?php echo $article_modified; ?>">
<meta property="article:author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta property="article:section" content="Web Scraping">
<meta property="article:tag" content="Rate Limiting, Web Scraping, Ethics, Best Practices">
|
|
|
|
<!-- Preload critical resources -->
|
|
<link rel="preload" href="../../assets/css/main.css" as="style">
|
|
<link rel="preload" href="../../assets/images/ukds-main-logo.png" as="image">
|
|
|
|
<!-- Open Graph / Social Media -->
|
|
<meta property="og:type" content="article">
|
|
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
|
|
<meta property="og:title" content="<?php echo htmlspecialchars($article_title); ?>">
|
|
<meta property="og:description" content="<?php echo htmlspecialchars($article_description); ?>">
|
|
<meta property="og:image" content="<?php echo htmlspecialchars($og_image); ?>">
|
|
|
|
<!-- Twitter Card -->
|
|
<meta name="twitter:card" content="summary_large_image">
|
|
<meta name="twitter:title" content="<?php echo htmlspecialchars($article_title); ?>">
|
|
<meta name="twitter:description" content="<?php echo htmlspecialchars($article_description); ?>">
|
|
<meta name="twitter:image" content="<?php echo htmlspecialchars($og_image); ?>">
|
|
|
|
<!-- Favicon and App Icons -->
|
|
<link rel="icon" type="image/svg+xml" href="../../assets/images/favicon.svg">
|
|
<link rel="apple-touch-icon" sizes="180x180" href="../../assets/images/apple-touch-icon.svg">
|
|
|
|
<!-- Fonts -->
|
|
<link rel="preconnect" href="https://fonts.googleapis.com">
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400;500;600;700&family=Lato:wght@300;400;500;600;700&display=swap" rel="stylesheet">
|
|
|
|
<!-- Styles -->
|
|
<link rel="stylesheet" href="../../assets/css/main.css">
|
|
<link rel="stylesheet" href="../../assets/css/cro-enhancements.css">
|
|
|
|
<!-- Article Schema -->
|
|
<script type="application/ld+json">
|
|
{
|
|
"@context": "https://schema.org",
|
|
"@type": "Article",
|
|
"mainEntityOfPage": {
|
|
"@type": "WebPage",
|
|
"@id": "<?php echo htmlspecialchars($canonical_url); ?>"
|
|
},
|
|
"headline": "<?php echo htmlspecialchars($article_title); ?>",
|
|
"description": "<?php echo htmlspecialchars($article_description); ?>",
|
|
"image": "<?php echo htmlspecialchars($og_image); ?>",
|
|
"author": {
    "@type": "Person",
    "name": "<?php echo htmlspecialchars($article_author); ?>"
},
|
|
"publisher": {
|
|
"@type": "Organization",
|
|
"name": "UK Data Services",
|
|
"logo": {
|
|
"@type": "ImageObject",
|
|
"url": "https://ukdataservices.co.uk/assets/images/ukds-main-logo.png"
|
|
}
|
|
},
|
|
"datePublished": "<?php echo $article_published; ?>",
|
|
"dateModified": "<?php echo $article_modified; ?>"
|
|
}
|
|
</script>
|
|
</head>
|
|
<body>
|
|
<!-- Skip to content link for accessibility -->
|
|
<a href="#main-content" class="skip-to-content">Skip to main content</a>
|
|
|
|
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?><!-- Article Content -->
|
|
<main id="main-content">
|
|
<article class="article-page">
|
|
<div class="container">
|
|
<div class="article-meta">
|
|
<span class="category"><a href="/blog/categories/web-scraping.php">Web Scraping</a></span>
|
|
<time datetime="2025-04-28">28 April 2025</time>
|
|
<span class="read-time"><?php echo (int)$read_time; ?> min read</span>
|
|
</div>
|
|
<header class="article-header">
|
|
<h1><?php echo htmlspecialchars($article_title); ?></h1>
|
|
<p class="article-lead"><?php echo htmlspecialchars($article_description); ?></p>
|
|
|
|
<div class="article-author">
|
|
<div class="author-info">
|
|
<span>By <?php echo htmlspecialchars($article_author); ?></span>
|
|
</div>
|
|
<div class="share-buttons">
|
|
<a href="https://www.linkedin.com/sharing/share-offsite/?url=<?php echo urlencode($canonical_url); ?>" class="share-button linkedin" aria-label="Share on LinkedIn" rel="noopener" target="_blank">
|
|
<img loading="lazy" src="../../assets/images/icon-linkedin.svg" alt="LinkedIn">
|
|
</a>
|
|
<a href="https://twitter.com/intent/tweet?url=<?php echo urlencode($canonical_url); ?>&text=<?php echo urlencode($article_title); ?>" class="share-button twitter" aria-label="Share on Twitter" rel="noopener" target="_blank">
|
|
<img loading="lazy" src="../../assets/images/icon-twitter.svg" alt="Twitter">
|
|
</a>
|
|
</div>
|
|
</div>
|
|
</header>
|
|
|
|
<div class="article-content">
|
|
<div class="content-wrapper">
|
|
<h2>Why Rate Limiting Matters in Web Scraping</h2>
|
|
<p>Rate limiting is fundamental to ethical and sustainable web scraping. It protects websites from overload, maintains good relationships with site owners, and helps avoid IP bans and legal issues. Professional scrapers understand that respectful data collection leads to long-term success.</p>
|
|
|
|
<p>This guide covers comprehensive rate limiting strategies, from basic delays to sophisticated adaptive throttling systems that automatically adjust to website conditions.</p>
|
|
|
|
<h2>Understanding Rate Limiting Principles</h2>
|
|
|
|
<h3>What is Rate Limiting?</h3>
|
|
<p>Rate limiting controls the frequency of requests sent to a target website. It involves:</p>
|
|
<ul>
|
|
<li><strong>Request Frequency:</strong> Number of requests per time period</li>
|
|
<li><strong>Concurrent Connections:</strong> Simultaneous connections to a domain</li>
|
|
<li><strong>Bandwidth Usage:</strong> Data transfer rate control</li>
|
|
<li><strong>Resource Respect:</strong> Consideration for server capacity</li>
|
|
</ul>
|
|
|
|
<h3>Why Rate Limiting is Essential</h3>
|
|
<ul>
|
|
<li><strong>Legal Compliance:</strong> Avoid violating terms of service</li>
|
|
<li><strong>Server Protection:</strong> Prevent overwhelming target systems</li>
|
|
<li><strong>IP Preservation:</strong> Avoid getting blocked or banned</li>
|
|
<li><strong>Data Quality:</strong> Ensure consistent, reliable data collection</li>
|
|
<li><strong>Ethical Standards:</strong> Maintain professional scraping practices</li>
|
|
</ul>
|
|
|
|
<h2>Basic Rate Limiting Implementation</h2>
|
|
|
|
<h3>Simple Delay Mechanisms</h3>
|
|
<pre><code>
|
|
import time
|
|
import random
|
|
import requests
|
|
|
|
class BasicRateLimiter:
|
|
def __init__(self, delay_range=(1, 3)):
|
|
self.min_delay = delay_range[0]
|
|
self.max_delay = delay_range[1]
|
|
self.last_request_time = 0
|
|
|
|
def wait(self):
|
|
"""Implement random delay between requests"""
|
|
current_time = time.time()
|
|
elapsed = current_time - self.last_request_time
|
|
|
|
# Calculate required delay
|
|
delay = random.uniform(self.min_delay, self.max_delay)
|
|
|
|
if elapsed < delay:
|
|
sleep_time = delay - elapsed
|
|
print(f"Rate limiting: sleeping for {sleep_time:.2f} seconds")
|
|
time.sleep(sleep_time)
|
|
|
|
self.last_request_time = time.time()
|
|
|
|
def request(self, url, **kwargs):
|
|
"""Make rate-limited request"""
|
|
self.wait()
|
|
return requests.get(url, **kwargs)
|
|
|
|
# Usage example
|
|
limiter = BasicRateLimiter(delay_range=(2, 5))
|
|
|
|
urls = [
|
|
"https://example.com/page1",
|
|
"https://example.com/page2",
|
|
"https://example.com/page3"
|
|
]
|
|
|
|
for url in urls:
|
|
response = limiter.request(url)
|
|
print(f"Scraped {url}: {response.status_code}")
|
|
</code></pre>
|
|
|
|
<h3>Domain-Specific Rate Limiting</h3>
|
|
<pre><code>
|
|
from urllib.parse import urlparse
|
|
from collections import defaultdict
|
|
|
|
class DomainRateLimiter:
|
|
def __init__(self):
|
|
self.domain_delays = defaultdict(lambda: 1.0) # Default 1 second
|
|
self.last_request_times = defaultdict(float)
|
|
|
|
def set_domain_delay(self, domain, delay):
|
|
"""Set specific delay for a domain"""
|
|
self.domain_delays[domain] = delay
|
|
|
|
def wait_for_domain(self, url):
|
|
"""Wait appropriate time for specific domain"""
|
|
domain = urlparse(url).netloc
|
|
current_time = time.time()
|
|
last_request = self.last_request_times[domain]
|
|
required_delay = self.domain_delays[domain]
|
|
|
|
elapsed = current_time - last_request
|
|
if elapsed < required_delay:
|
|
sleep_time = required_delay - elapsed
|
|
time.sleep(sleep_time)
|
|
|
|
self.last_request_times[domain] = time.time()
|
|
|
|
def request(self, url, **kwargs):
|
|
"""Make domain-aware rate-limited request"""
|
|
self.wait_for_domain(url)
|
|
return requests.get(url, **kwargs)
|
|
|
|
# Usage with different domain settings
|
|
limiter = DomainRateLimiter()
|
|
limiter.set_domain_delay("api.example.com", 0.5) # Fast API
|
|
limiter.set_domain_delay("slow-site.com", 5.0) # Slow site
|
|
limiter.set_domain_delay("ecommerce.com", 2.0) # E-commerce site
|
|
|
|
# Requests will be automatically rate-limited per domain
|
|
response1 = limiter.request("https://api.example.com/data")
|
|
response2 = limiter.request("https://slow-site.com/page")
|
|
response3 = limiter.request("https://ecommerce.com/products")
|
|
</code></pre>
|
|
|
|
<h2>Advanced Rate Limiting Strategies</h2>
|
|
|
|
<h3>Exponential Backoff</h3>
|
|
<pre><code>
|
|
import math
|
|
|
|
class ExponentialBackoffLimiter:
|
|
def __init__(self, base_delay=1.0, max_delay=60.0):
|
|
self.base_delay = base_delay
|
|
self.max_delay = max_delay
|
|
self.consecutive_errors = defaultdict(int)
|
|
self.domain_delays = defaultdict(lambda: base_delay)
|
|
|
|
def calculate_delay(self, domain, error_occurred=False):
|
|
"""Calculate delay using exponential backoff"""
|
|
if error_occurred:
|
|
self.consecutive_errors[domain] += 1
|
|
else:
|
|
self.consecutive_errors[domain] = 0
|
|
|
|
# Exponential backoff formula
|
|
error_count = self.consecutive_errors[domain]
|
|
delay = min(
|
|
self.base_delay * (2 ** error_count),
|
|
self.max_delay
|
|
)
|
|
|
|
self.domain_delays[domain] = delay
|
|
return delay
|
|
|
|
def request_with_backoff(self, url, max_retries=3):
|
|
"""Make request with exponential backoff on errors"""
|
|
domain = urlparse(url).netloc
|
|
|
|
for attempt in range(max_retries + 1):
|
|
try:
|
|
delay = self.calculate_delay(domain, error_occurred=False)
|
|
time.sleep(delay)
|
|
|
|
response = requests.get(url, timeout=10)
|
|
|
|
if response.status_code == 429: # Too Many Requests
|
|
raise requests.exceptions.RequestException("Rate limited")
|
|
|
|
response.raise_for_status()
|
|
return response
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
print(f"Request failed (attempt {attempt + 1}): {e}")
|
|
|
|
if attempt < max_retries:
|
|
error_delay = self.calculate_delay(domain, error_occurred=True)
|
|
print(f"Backing off for {error_delay:.2f} seconds")
|
|
time.sleep(error_delay)
|
|
else:
|
|
raise
|
|
|
|
# Usage
|
|
backoff_limiter = ExponentialBackoffLimiter()
|
|
response = backoff_limiter.request_with_backoff("https://api.example.com/data")
|
|
</code></pre>
|
|
|
|
<h3>Adaptive Rate Limiting</h3>
|
|
<pre><code>
|
|
class AdaptiveRateLimiter:
|
|
def __init__(self, initial_delay=1.0):
|
|
self.domain_stats = defaultdict(lambda: {
|
|
'delay': initial_delay,
|
|
'response_times': [],
|
|
'success_rate': 1.0,
|
|
'last_adjustment': time.time()
|
|
})
|
|
|
|
def record_response(self, domain, response_time, success):
|
|
"""Record response statistics"""
|
|
stats = self.domain_stats[domain]
|
|
|
|
# Keep only recent response times (last 10)
|
|
stats['response_times'].append(response_time)
|
|
if len(stats['response_times']) > 10:
|
|
stats['response_times'].pop(0)
|
|
|
|
# Update success rate (exponential moving average)
|
|
alpha = 0.1
|
|
stats['success_rate'] = (
|
|
alpha * (1 if success else 0) +
|
|
(1 - alpha) * stats['success_rate']
|
|
)
|
|
|
|
def adjust_delay(self, domain):
|
|
"""Dynamically adjust delay based on performance"""
|
|
stats = self.domain_stats[domain]
|
|
current_time = time.time()
|
|
|
|
# Only adjust every 30 seconds
|
|
if current_time - stats['last_adjustment'] < 30:
|
|
return stats['delay']
|
|
|
|
avg_response_time = (
|
|
sum(stats['response_times']) / len(stats['response_times'])
|
|
if stats['response_times'] else 1.0
|
|
)
|
|
|
|
# Adjustment logic
|
|
if stats['success_rate'] < 0.8: # Low success rate
|
|
stats['delay'] *= 1.5 # Increase delay
|
|
elif avg_response_time > 5.0: # Slow responses
|
|
stats['delay'] *= 1.2
|
|
elif stats['success_rate'] > 0.95 and avg_response_time < 2.0:
|
|
stats['delay'] *= 0.9 # Decrease delay for good performance
|
|
|
|
# Keep delay within reasonable bounds
|
|
stats['delay'] = max(0.5, min(stats['delay'], 30.0))
|
|
stats['last_adjustment'] = current_time
|
|
|
|
return stats['delay']
|
|
|
|
def request(self, url):
|
|
"""Make adaptive rate-limited request"""
|
|
domain = urlparse(url).netloc
|
|
delay = self.adjust_delay(domain)
|
|
|
|
time.sleep(delay)
|
|
start_time = time.time()
|
|
|
|
try:
|
|
response = requests.get(url, timeout=10)
|
|
response_time = time.time() - start_time
|
|
success = response.status_code == 200
|
|
|
|
self.record_response(domain, response_time, success)
|
|
return response
|
|
|
|
except Exception as e:
|
|
response_time = time.time() - start_time
|
|
self.record_response(domain, response_time, False)
|
|
raise
|
|
|
|
# Usage
|
|
adaptive_limiter = AdaptiveRateLimiter()
|
|
|
|
# The limiter will automatically adjust delays based on performance
|
|
for i in range(100):
|
|
try:
|
|
response = adaptive_limiter.request(f"https://api.example.com/data/{i}")
|
|
print(f"Request {i}: {response.status_code}")
|
|
except Exception as e:
|
|
print(f"Request {i} failed: {e}")
|
|
</code></pre>
|
|
|
|
<h2>Distributed Rate Limiting</h2>
|
|
|
|
<h3>Redis-Based Rate Limiting</h3>
|
|
<pre><code>
|
|
import redis
|
|
import json
|
|
|
|
class DistributedRateLimiter:
|
|
def __init__(self, redis_url='redis://localhost:6379'):
|
|
self.redis_client = redis.from_url(redis_url)
|
|
self.default_window = 60 # 1 minute window
|
|
self.default_limit = 30 # 30 requests per minute
|
|
|
|
def is_allowed(self, domain, limit=None, window=None):
|
|
"""Check if request is allowed using sliding window"""
|
|
limit = limit or self.default_limit
|
|
window = window or self.default_window
|
|
|
|
current_time = time.time()
|
|
key = f"rate_limit:{domain}"
|
|
|
|
# Use Redis pipeline for atomic operations
|
|
pipe = self.redis_client.pipeline()
|
|
|
|
# Remove old entries outside the window
|
|
pipe.zremrangebyscore(key, 0, current_time - window)
|
|
|
|
# Count current requests in window
|
|
pipe.zcard(key)
|
|
|
|
# Add current request
|
|
pipe.zadd(key, {str(current_time): current_time})
|
|
|
|
# Set expiry for cleanup
|
|
pipe.expire(key, window)
|
|
|
|
results = pipe.execute()
|
|
current_requests = results[1]
|
|
|
|
return current_requests < limit
|
|
|
|
def wait_if_needed(self, domain, limit=None, window=None):
|
|
"""Wait until request is allowed"""
|
|
while not self.is_allowed(domain, limit, window):
|
|
print(f"Rate limit exceeded for {domain}, waiting...")
|
|
time.sleep(1)
|
|
|
|
def request(self, url, **kwargs):
|
|
"""Make distributed rate-limited request"""
|
|
domain = urlparse(url).netloc
|
|
self.wait_if_needed(domain)
|
|
return requests.get(url, **kwargs)
|
|
|
|
# Usage across multiple scraper instances
|
|
distributed_limiter = DistributedRateLimiter()
|
|
|
|
# This will coordinate rate limiting across all instances
|
|
response = distributed_limiter.request("https://api.example.com/data")
|
|
</code></pre>
|
|
|
|
<h3>Token Bucket Algorithm</h3>
|
|
<pre><code>
|
|
class TokenBucket:
|
|
def __init__(self, capacity, refill_rate):
|
|
self.capacity = capacity
|
|
self.tokens = capacity
|
|
self.refill_rate = refill_rate # tokens per second
|
|
self.last_refill = time.time()
|
|
|
|
def consume(self, tokens=1):
|
|
"""Try to consume tokens from bucket"""
|
|
self._refill()
|
|
|
|
if self.tokens >= tokens:
|
|
self.tokens -= tokens
|
|
return True
|
|
return False
|
|
|
|
def _refill(self):
|
|
"""Refill tokens based on elapsed time"""
|
|
current_time = time.time()
|
|
elapsed = current_time - self.last_refill
|
|
|
|
# Add tokens based on elapsed time
|
|
tokens_to_add = elapsed * self.refill_rate
|
|
self.tokens = min(self.capacity, self.tokens + tokens_to_add)
|
|
self.last_refill = current_time
|
|
|
|
def wait_for_tokens(self, tokens=1):
|
|
"""Wait until enough tokens are available"""
|
|
while not self.consume(tokens):
|
|
time.sleep(0.1)
|
|
|
|
class TokenBucketRateLimiter:
|
|
def __init__(self):
|
|
self.buckets = {}
|
|
|
|
def get_bucket(self, domain, capacity=10, refill_rate=1.0):
|
|
"""Get or create token bucket for domain"""
|
|
if domain not in self.buckets:
|
|
self.buckets[domain] = TokenBucket(capacity, refill_rate)
|
|
return self.buckets[domain]
|
|
|
|
def request(self, url, **kwargs):
|
|
"""Make token bucket rate-limited request"""
|
|
domain = urlparse(url).netloc
|
|
bucket = self.get_bucket(domain)
|
|
|
|
# Wait for token availability
|
|
bucket.wait_for_tokens()
|
|
|
|
return requests.get(url, **kwargs)
|
|
|
|
# Usage
|
|
token_limiter = TokenBucketRateLimiter()
|
|
|
|
# Allows burst requests up to bucket capacity
|
|
# then throttles to refill rate
|
|
for i in range(20):
|
|
response = token_limiter.request(f"https://api.example.com/data/{i}")
|
|
print(f"Request {i}: {response.status_code}")
|
|
</code></pre>
|
|
|
|
<h2>Integration with Popular Libraries</h2>
|
|
|
|
<h3>Scrapy Rate Limiting</h3>
|
|
<pre><code>
|
|
# Custom Scrapy middleware for advanced rate limiting
|
|
from scrapy.downloadermiddlewares.delay import DelayMiddleware
|
|
|
|
class AdaptiveDelayMiddleware:
|
|
def __init__(self, delay=1.0):
|
|
self.delay = delay
|
|
self.domain_stats = defaultdict(lambda: {
|
|
'delay': delay,
|
|
'errors': 0,
|
|
'successes': 0
|
|
})
|
|
|
|
@classmethod
|
|
def from_crawler(cls, crawler):
|
|
return cls(
|
|
delay=crawler.settings.getfloat('DOWNLOAD_DELAY', 1.0)
|
|
)
|
|
|
|
def process_request(self, request, spider):
|
|
domain = urlparse(request.url).netloc
|
|
delay = self.calculate_delay(domain)
|
|
|
|
if delay > 0:
|
|
time.sleep(delay)
|
|
|
|
def process_response(self, request, response, spider):
|
|
domain = urlparse(request.url).netloc
|
|
stats = self.domain_stats[domain]
|
|
|
|
if response.status == 200:
|
|
stats['successes'] += 1
|
|
stats['errors'] = max(0, stats['errors'] - 1)
|
|
else:
|
|
stats['errors'] += 1
|
|
|
|
self.adjust_delay(domain)
|
|
return response
|
|
|
|
def calculate_delay(self, domain):
|
|
return self.domain_stats[domain]['delay']
|
|
|
|
def adjust_delay(self, domain):
|
|
stats = self.domain_stats[domain]
|
|
|
|
if stats['errors'] > 3:
|
|
stats['delay'] *= 1.5
|
|
elif stats['successes'] > 10 and stats['errors'] == 0:
|
|
stats['delay'] *= 0.9
|
|
|
|
stats['delay'] = max(0.5, min(stats['delay'], 10.0))
|
|
|
|
# settings.py
|
|
DOWNLOADER_MIDDLEWARES = {
|
|
'myproject.middlewares.AdaptiveDelayMiddleware': 543,
|
|
}
|
|
DOWNLOAD_DELAY = 1.0
|
|
RANDOMIZE_DOWNLOAD_DELAY = 0.5
|
|
</code></pre>
|
|
|
|
<h3>Requests-HTML Rate Limiting</h3>
|
|
<pre><code>
|
|
from requests_html import HTMLSession
|
|
|
|
class RateLimitedSession(HTMLSession):
|
|
def __init__(self, rate_limiter=None):
|
|
super().__init__()
|
|
self.rate_limiter = rate_limiter or DomainRateLimiter()
|
|
|
|
def get(self, url, **kwargs):
|
|
"""Override get method with rate limiting"""
|
|
self.rate_limiter.wait_for_domain(url)
|
|
return super().get(url, **kwargs)
|
|
|
|
def post(self, url, **kwargs):
|
|
"""Override post method with rate limiting"""
|
|
self.rate_limiter.wait_for_domain(url)
|
|
return super().post(url, **kwargs)
|
|
|
|
# Usage
|
|
session = RateLimitedSession(
|
|
rate_limiter=DomainRateLimiter()
|
|
)
|
|
|
|
response = session.get('https://example.com')
|
|
response.html.render() # JavaScript rendering with rate limiting
|
|
</code></pre>
|
|
|
|
<h2>Monitoring and Analytics</h2>
|
|
|
|
<h3>Rate Limiting Metrics</h3>
|
|
<pre><code>
|
|
import logging
|
|
from collections import defaultdict
|
|
|
|
class RateLimitingMonitor:
|
|
def __init__(self):
|
|
self.metrics = defaultdict(lambda: {
|
|
'requests_made': 0,
|
|
'requests_blocked': 0,
|
|
'total_delay_time': 0,
|
|
'errors': 0
|
|
})
|
|
|
|
# Setup logging
|
|
logging.basicConfig(
|
|
level=logging.INFO,
|
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
|
handlers=[
|
|
logging.FileHandler('rate_limiting.log'),
|
|
logging.StreamHandler()
|
|
]
|
|
)
|
|
self.logger = logging.getLogger(__name__)
|
|
|
|
def log_request(self, domain, delay_time, success=True):
|
|
"""Log request metrics"""
|
|
metrics = self.metrics[domain]
|
|
metrics['requests_made'] += 1
|
|
metrics['total_delay_time'] += delay_time
|
|
|
|
if not success:
|
|
metrics['errors'] += 1
|
|
|
|
self.logger.info(f"Domain: {domain}, Delay: {delay_time:.2f}s, Success: {success}")
|
|
|
|
def log_rate_limit_hit(self, domain):
|
|
"""Log when rate limit is encountered"""
|
|
self.metrics[domain]['requests_blocked'] += 1
|
|
self.logger.warning(f"Rate limit hit for domain: {domain}")
|
|
|
|
def get_statistics(self):
|
|
"""Get comprehensive statistics"""
|
|
stats = {}
|
|
|
|
for domain, metrics in self.metrics.items():
|
|
total_requests = metrics['requests_made']
|
|
if total_requests > 0:
|
|
stats[domain] = {
|
|
'total_requests': total_requests,
|
|
'requests_blocked': metrics['requests_blocked'],
|
|
'error_rate': metrics['errors'] / total_requests,
|
|
'avg_delay': metrics['total_delay_time'] / total_requests,
|
|
'block_rate': metrics['requests_blocked'] / total_requests
|
|
}
|
|
|
|
return stats
|
|
|
|
def print_report(self):
|
|
"""Print detailed statistics report"""
|
|
stats = self.get_statistics()
|
|
|
|
print("\n" + "="*60)
|
|
print("RATE LIMITING STATISTICS REPORT")
|
|
print("="*60)
|
|
|
|
for domain, metrics in stats.items():
|
|
print(f"\nDomain: {domain}")
|
|
print(f" Total Requests: {metrics['total_requests']}")
|
|
print(f" Requests Blocked: {metrics['requests_blocked']}")
|
|
print(f" Error Rate: {metrics['error_rate']:.2%}")
|
|
print(f" Average Delay: {metrics['avg_delay']:.2f}s")
|
|
print(f" Block Rate: {metrics['block_rate']:.2%}")
|
|
|
|
# Usage
|
|
monitor = RateLimitingMonitor()
|
|
|
|
class MonitoredRateLimiter(BasicRateLimiter):
|
|
def __init__(self, monitor, *args, **kwargs):
|
|
super().__init__(*args, **kwargs)
|
|
self.monitor = monitor
|
|
|
|
def request(self, url, **kwargs):
|
|
domain = urlparse(url).netloc
|
|
start_time = time.time()
|
|
|
|
try:
|
|
response = super().request(url, **kwargs)
|
|
delay_time = time.time() - start_time
|
|
success = response.status_code == 200
|
|
|
|
self.monitor.log_request(domain, delay_time, success)
|
|
return response
|
|
|
|
except Exception as e:
|
|
delay_time = time.time() - start_time
|
|
self.monitor.log_request(domain, delay_time, False)
|
|
raise
|
|
|
|
# Use monitored rate limiter
|
|
limiter = MonitoredRateLimiter(monitor, delay_range=(1, 3))
|
|
|
|
# After scraping session
|
|
monitor.print_report()
|
|
</code></pre>
|
|
|
|
<h2>Best Practices and Recommendations</h2>
|
|
|
|
<h3>General Guidelines</h3>
|
|
<ul>
|
|
<li><strong>Start Conservative:</strong> Begin with longer delays and adjust down</li>
|
|
<li><strong>Respect robots.txt:</strong> Check crawl-delay directives</li>
|
|
<li><strong>Monitor Server Response:</strong> Watch for 429 status codes</li>
|
|
<li><strong>Use Random Delays:</strong> Avoid predictable patterns</li>
|
|
<li><strong>Implement Backoff:</strong> Increase delays on errors</li>
|
|
</ul>
|
|
|
|
<h3>Domain-Specific Strategies</h3>
|
|
<ul>
|
|
<li><strong>E-commerce Sites:</strong> 2-5 second delays during peak hours</li>
|
|
<li><strong>News Websites:</strong> 1-3 second delays, respect peak traffic</li>
|
|
<li><strong>APIs:</strong> Follow documented rate limits strictly</li>
|
|
<li><strong>Government Sites:</strong> Very conservative approach (5+ seconds)</li>
|
|
<li><strong>Social Media:</strong> Use official APIs when possible</li>
|
|
</ul>
|
|
|
|
<h3>Legal and Ethical Considerations</h3>
|
|
<ul>
|
|
<li>Review terms of service before scraping</li>
|
|
<li>Identify yourself with proper User-Agent headers</li>
|
|
<li>Consider reaching out for API access</li>
|
|
<li>Respect copyright and data protection laws</li>
|
|
<li>Implement circuit breakers for server protection</li>
|
|
</ul>
|
|
|
|
<div class="article-cta">
|
|
<h3>Professional Rate Limiting Solutions</h3>
|
|
<p>UK Data Services implements sophisticated rate limiting strategies for ethical, compliant web scraping that respects website resources while maximizing data collection efficiency.</p>
|
|
<a href="/quote" class="btn btn-primary">Get Rate Limiting Consultation</a>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<!-- Related Articles -->
|
|
<aside class="related-articles">
|
|
<h3>Related Articles</h3>
|
|
<div class="related-grid">
|
|
<article class="related-card">
|
|
<span class="category">Web Scraping</span>
|
|
<h4><a href="handling-captchas-scraping.php">Handling CAPTCHAs in Web Scraping: Complete Guide</a></h4>
|
|
<span class="read-time">8 min read</span>
|
|
</article>
|
|
<article class="related-card">
|
|
<span class="category">Web Scraping</span>
|
|
<h4><a href="python-scrapy-enterprise-guide.php">Python Scrapy Enterprise Guide: Scaling Web Scraping Operations</a></h4>
|
|
<span class="read-time">12 min read</span>
|
|
</article>
|
|
<article class="related-card">
|
|
<span class="category">Compliance</span>
|
|
<h4><a href="web-scraping-compliance-uk-guide.php">Complete Guide to Web Scraping Compliance in the UK</a></h4>
|
|
<span class="read-time">12 min read</span>
|
|
</article>
|
|
</div>
|
|
</aside>
|
|
</div>
|
|
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>
|
|
|
|
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
|
|
</div>
|
|
</article>
|
|
</main>
|
|
|
|
<!-- Footer -->
|
|
<footer class="footer">
|
|
<div class="container">
|
|
<div class="footer-content">
|
|
<div class="footer-section">
|
|
<div class="footer-logo">
|
|
<img loading="lazy" src="../../assets/images/logo-white.svg" alt="UK Data Services">
|
|
</div>
|
|
<p>Enterprise data intelligence solutions for modern British business.</p>
|
|
</div>
|
|
|
|
<div class="footer-section">
|
|
<h3>Quick Links</h3>
|
|
<ul>
|
|
<li><a href="/#services">Services</a></li>
|
|
<li><a href="/blog/">Blog</a></li>
|
|
<li><a href="/case-studies/">Case Studies</a></li>
|
|
<li><a href="/about">About</a></li>
|
|
<li><a href="/#contact">Contact</a></li>
|
|
</ul>
|
|
</div>
|
|
|
|
<div class="footer-section">
|
|
<h3>Legal</h3>
|
|
<ul>
|
|
<li><a href="/privacy-policy">Privacy Policy</a></li>
|
|
<li><a href="/terms-of-service">Terms of Service</a></li>
|
|
<li><a href="/cookie-policy">Cookie Policy</a></li>
|
|
<li><a href="/gdpr-compliance">GDPR Compliance</a></li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="footer-bottom">
|
|
<p>© <?php echo date('Y'); ?> UK Data Services. All rights reserved.</p>
|
|
<div class="social-links">
|
|
<a href="https://linkedin.com/company/uk-data-services" aria-label="LinkedIn" rel="noopener" target="_blank">
|
|
<img loading="lazy" src="../../assets/images/icon-linkedin.svg" alt="LinkedIn">
|
|
</a>
|
|
<a href="https://twitter.com/ukdataservices" aria-label="Twitter" rel="noopener" target="_blank">
|
|
<img loading="lazy" src="../../assets/images/icon-twitter.svg" alt="Twitter">
|
|
</a>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
<!-- Scripts -->
|
|
<script src="../../assets/js/main.js"></script>
|
|
<script src="../../assets/js/cro-enhancements.js"></script>
|
|
</body>
|
|
</html>
|