<?php
// Send HSTS header before any output (must precede all HTML).
header('Strict-Transport-Security: max-age=31536000; includeSubDomains');

// Article-specific SEO variables, consumed by the <head> meta tags,
// the JSON-LD Article schema, and the visible article header below.
$article_title       = 'Web Scraping Rate Limiting: Professional Implementation Guide';
$article_description = 'Master rate limiting techniques for ethical web scraping. Learn to implement respectful delays, adaptive throttling, and compliance strategies.';
$article_keywords    = 'web scraping rate limiting, scraping delays, ethical web scraping, rate limiting strategies, web scraping best practices, scraping throttling';
$article_author      = 'Michael Thompson';
$canonical_url       = 'https://ukdataservices.co.uk/blog/articles/web-scraping-rate-limiting';
// ISO 8601 timestamps used by article:published_time / article:modified_time
// and the JSON-LD datePublished / dateModified fields.
$article_published   = '2025-04-28T09:00:00+00:00';
$article_modified    = '2025-04-28T09:00:00+00:00';
// NOTE(review): most social crawlers (Facebook, LinkedIn, Twitter) do not
// render SVG og:images — consider a PNG/JPEG fallback. TODO confirm.
$og_image            = 'https://ukdataservices.co.uk/assets/images/icon-speed.svg';
$read_time           = 9; // estimated read time in minutes
?>
<! DOCTYPE html >
< html lang = " en " >
< head >
< meta charset = " UTF-8 " >
< meta name = " viewport " content = " width=device-width, initial-scale=1.0 " >
< title >< ? php echo htmlspecialchars ( $article_title ); ?> | UK Data Services Blog</title>
< meta name = " description " content = " <?php echo htmlspecialchars( $article_description ); ?> " >
< meta name = " keywords " content = " <?php echo htmlspecialchars( $article_keywords ); ?> " >
< meta name = " author " content = " <?php echo htmlspecialchars( $article_author ); ?> " >
< meta name = " robots " content = " index, follow " >
< link rel = " canonical " href = " <?php echo htmlspecialchars( $canonical_url ); ?> " >
<!-- Article - specific meta tags -->
< meta name = " article:published_time " content = " <?php echo $article_published ; ?> " >
< meta name = " article:modified_time " content = " <?php echo $article_modified ; ?> " >
< meta name = " article:author " content = " <?php echo htmlspecialchars( $article_author ); ?> " >
< meta name = " article:section " content = " Web Scraping " >
< meta name = " article:tag " content = " Rate Limiting, Web Scraping, Ethics, Best Practices " >
<!-- Preload critical resources -->
< link rel = " preload " href = " ../../assets/css/main.css " as = " style " >
< link rel = " preload " href = " ../../assets/images/ukds-main-logo.png " as = " image " >
<!-- Open Graph / Social Media -->
< meta property = " og:type " content = " article " >
< meta property = " og:url " content = " <?php echo htmlspecialchars( $canonical_url ); ?> " >
< meta property = " og:title " content = " <?php echo htmlspecialchars( $article_title ); ?> " >
< meta property = " og:description " content = " <?php echo htmlspecialchars( $article_description ); ?> " >
< meta property = " og:image " content = " <?php echo htmlspecialchars( $og_image ); ?> " >
<!-- Twitter Card -->
< meta name = " twitter:card " content = " summary_large_image " >
< meta name = " twitter:title " content = " <?php echo htmlspecialchars( $article_title ); ?> " >
< meta name = " twitter:description " content = " <?php echo htmlspecialchars( $article_description ); ?> " >
< meta name = " twitter:image " content = " <?php echo htmlspecialchars( $og_image ); ?> " >
<!-- Favicon and App Icons -->
< link rel = " icon " type = " image/svg+xml " href = " ../../assets/images/favicon.svg " >
< link rel = " apple-touch-icon " sizes = " 180x180 " href = " ../../assets/images/apple-touch-icon.svg " >
<!-- Fonts -->
< link rel = " preconnect " href = " https://fonts.googleapis.com " >
< link rel = " preconnect " href = " https://fonts.gstatic.com " crossorigin >
< link href = " https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400;500;600;700&family=Lato:wght@300;400;500;600;700&display=swap " rel = " stylesheet " >
<!-- Styles -->
< link rel = " stylesheet " href = " ../../assets/css/main.css " >
<link rel="stylesheet" href="../../assets/css/cro-enhancements.css">
<!-- Article Schema -->
< script type = " application/ld+json " >
{
" @context " : " https://schema.org " ,
" @type " : " Article " ,
" mainEntityOfPage " : {
" @type " : " WebPage " ,
" @id " : " <?php echo htmlspecialchars( $canonical_url ); ?> "
},
" headline " : " <?php echo htmlspecialchars( $article_title ); ?> " ,
" description " : " <?php echo htmlspecialchars( $article_description ); ?> " ,
" image " : " <?php echo htmlspecialchars( $og_image ); ?> " ,
" author " : {
" @type " : " Organization " ,
" name " : " UK Data Services " ,
" url " : " https://ukdataservices.co.uk "
},
" publisher " : {
" @type " : " Organization " ,
" name " : " UK Data Services " ,
" logo " : {
" @type " : " ImageObject " ,
" url " : " https://ukdataservices.co.uk/assets/images/ukds-main-logo.png "
}
},
" datePublished " : " <?php echo $article_published ; ?> " ,
" dateModified " : " <?php echo $article_modified ; ?> "
}
</ script >
</ head >
< body >
<!-- Skip to content link for accessibility -->
< a href = " #main-content " class = " skip-to-content " > Skip to main content </ a >
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?>
<!-- Article Content -->
< main id = " main-content " >
< article class = " article-page " >
< div class = " container " >
<div class="article-meta">
<span class="category"><a href="/blog/categories/web-scraping.php">Web Scraping</a></span>
<time datetime="2025-04-28">28 April 2025</time>
<span class="read-time"><?php echo (int) $read_time; ?> min read</span>
</ div >
< header class = " article-header " >
<h1><?php echo htmlspecialchars($article_title); ?></h1>
< p class = " article-lead " >< ? php echo htmlspecialchars ( $article_description ); ?> </p>
< div class = " article-author " >
< div class = " author-info " >
< span > By < ? php echo htmlspecialchars ( $article_author ); ?> </span>
</ div >
< div class = " share-buttons " >
< a href = " https://www.linkedin.com/sharing/share-offsite/?url=<?php echo urlencode( $canonical_url ); ?> " class = " share-button linkedin " aria - label = " Share on LinkedIn " rel = " noopener " target = " _blank " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/icon-linkedin.svg " alt = " LinkedIn " >
2025-06-08 11:21:30 +01:00
</ a >
< a href = " https://twitter.com/intent/tweet?url=<?php echo urlencode( $canonical_url ); ?>&text=<?php echo urlencode( $article_title ); ?> " class = " share-button twitter " aria - label = " Share on Twitter " rel = " noopener " target = " _blank " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/icon-twitter.svg " alt = " Twitter " >
2025-06-08 11:21:30 +01:00
</ a >
</ div >
</ div >
</ header >
< div class = " article-content " >
< div class = " content-wrapper " >
< h2 > Why Rate Limiting Matters in Web Scraping </ h2 >
< p > Rate limiting is fundamental to ethical and sustainable web scraping . It protects websites from overload , maintains good relationships with site owners , and helps avoid IP bans and legal issues . Professional scrapers understand that respectful data collection leads to long - term success .</ p >
< p > This guide covers comprehensive rate limiting strategies , from basic delays to sophisticated adaptive throttling systems that automatically adjust to website conditions .</ p >
< h2 > Understanding Rate Limiting Principles </ h2 >
< h3 > What is Rate Limiting ? </ h3 >
< p > Rate limiting controls the frequency of requests sent to a target website . It involves :</ p >
< ul >
< li >< strong > Request Frequency :</ strong > Number of requests per time period </ li >
< li >< strong > Concurrent Connections :</ strong > Simultaneous connections to a domain </ li >
< li >< strong > Bandwidth Usage :</ strong > Data transfer rate control </ li >
< li >< strong > Resource Respect :</ strong > Consideration for server capacity </ li >
</ ul >
< h3 > Why Rate Limiting is Essential </ h3 >
< ul >
< li >< strong > Legal Compliance :</ strong > Avoid violating terms of service </ li >
< li >< strong > Server Protection :</ strong > Prevent overwhelming target systems </ li >
< li >< strong > IP Preservation :</ strong > Avoid getting blocked or banned </ li >
< li >< strong > Data Quality :</ strong > Ensure consistent , reliable data collection </ li >
< li >< strong > Ethical Standards :</ strong > Maintain professional scraping practices </ li >
</ ul >
< h2 > Basic Rate Limiting Implementation </ h2 >
< h3 > Simple Delay Mechanisms </ h3 >
< pre >< code >
import time
import random
import requests
class BasicRateLimiter :
def __init__ ( self , delay_range = ( 1 , 3 )) :
self . min_delay = delay_range [ 0 ]
self . max_delay = delay_range [ 1 ]
self . last_request_time = 0
def wait ( self ) :
" " " Implement random delay between requests " " "
current_time = time . time ()
elapsed = current_time - self . last_request_time
# Calculate required delay
delay = random . uniform ( self . min_delay , self . max_delay )
if elapsed < delay :
sleep_time = delay - elapsed
print ( f " Rate limiting: sleeping for { sleep_time:.2f} seconds " )
time . sleep ( sleep_time )
self . last_request_time = time . time ()
def request ( self , url , ** kwargs ) :
" " " Make rate-limited request " " "
self . wait ()
return requests . get ( url , ** kwargs )
# Usage example
limiter = BasicRateLimiter ( delay_range = ( 2 , 5 ))
urls = [
" https://example.com/page1 " ,
" https://example.com/page2 " ,
" https://example.com/page3 "
]
for url in urls :
response = limiter . request ( url )
print ( f " Scraped { url}: { response.status_code} " )
</ code ></ pre >
< h3 > Domain - Specific Rate Limiting </ h3 >
< pre >< code >
from urllib . parse import urlparse
from collections import defaultdict
class DomainRateLimiter :
def __init__ ( self ) :
self . domain_delays = defaultdict ( lambda : 1.0 ) # Default 1 second
self . last_request_times = defaultdict ( float )
def set_domain_delay ( self , domain , delay ) :
" " " Set specific delay for a domain " " "
self . domain_delays [ domain ] = delay
def wait_for_domain ( self , url ) :
" " " Wait appropriate time for specific domain " " "
domain = urlparse ( url ) . netloc
current_time = time . time ()
last_request = self . last_request_times [ domain ]
required_delay = self . domain_delays [ domain ]
elapsed = current_time - last_request
if elapsed < required_delay :
sleep_time = required_delay - elapsed
time . sleep ( sleep_time )
self . last_request_times [ domain ] = time . time ()
def request ( self , url , ** kwargs ) :
" " " Make domain-aware rate-limited request " " "
self . wait_for_domain ( url )
return requests . get ( url , ** kwargs )
# Usage with different domain settings
limiter = DomainRateLimiter ()
limiter . set_domain_delay ( " api.example.com " , 0.5 ) # Fast API
limiter . set_domain_delay ( " slow-site.com " , 5.0 ) # Slow site
limiter . set_domain_delay ( " ecommerce.com " , 2.0 ) # E-commerce site
# Requests will be automatically rate-limited per domain
response1 = limiter . request ( " https://api.example.com/data " )
response2 = limiter . request ( " https://slow-site.com/page " )
response3 = limiter . request ( " https://ecommerce.com/products " )
</ code ></ pre >
< h2 > Advanced Rate Limiting Strategies </ h2 >
< h3 > Exponential Backoff </ h3 >
< pre >< code >
import math
class ExponentialBackoffLimiter :
def __init__ ( self , base_delay = 1.0 , max_delay = 60.0 ) :
self . base_delay = base_delay
self . max_delay = max_delay
self . consecutive_errors = defaultdict ( int )
self . domain_delays = defaultdict ( lambda : base_delay )
def calculate_delay ( self , domain , error_occurred = False ) :
" " " Calculate delay using exponential backoff " " "
if error_occurred :
self . consecutive_errors [ domain ] += 1
else :
self . consecutive_errors [ domain ] = 0
# Exponential backoff formula
error_count = self . consecutive_errors [ domain ]
delay = min (
self . base_delay * ( 2 ** error_count ),
self . max_delay
)
self . domain_delays [ domain ] = delay
return delay
def request_with_backoff ( self , url , max_retries = 3 ) :
" " " Make request with exponential backoff on errors " " "
domain = urlparse ( url ) . netloc
for attempt in range ( max_retries + 1 ) :
try :
delay = self . calculate_delay ( domain , error_occurred = False )
time . sleep ( delay )
response = requests . get ( url , timeout = 10 )
if response . status_code == 429 : # Too Many Requests
raise requests . exceptions . RequestException ( " Rate limited " )
response . raise_for_status ()
return response
except requests . exceptions . RequestException as e :
print ( f " Request failed (attempt { attempt + 1}): { e} " )
if attempt < max_retries :
error_delay = self . calculate_delay ( domain , error_occurred = True )
print ( f " Backing off for { error_delay:.2f} seconds " )
time . sleep ( error_delay )
else :
raise
# Usage
backoff_limiter = ExponentialBackoffLimiter ()
response = backoff_limiter . request_with_backoff ( " https://api.example.com/data " )
</ code ></ pre >
< h3 > Adaptive Rate Limiting </ h3 >
< pre >< code >
class AdaptiveRateLimiter :
def __init__ ( self , initial_delay = 1.0 ) :
self . domain_stats = defaultdict ( lambda : {
'delay' : initial_delay ,
'response_times' : [],
'success_rate' : 1.0 ,
'last_adjustment' : time . time ()
})
def record_response ( self , domain , response_time , success ) :
" " " Record response statistics " " "
stats = self . domain_stats [ domain ]
# Keep only recent response times (last 10)
stats [ 'response_times' ] . append ( response_time )
if len ( stats [ 'response_times' ]) > 10 :
stats [ 'response_times' ] . pop ( 0 )
# Update success rate (exponential moving average)
alpha = 0.1
stats [ 'success_rate' ] = (
alpha * ( 1 if success else 0 ) +
( 1 - alpha ) * stats [ 'success_rate' ]
)
def adjust_delay ( self , domain ) :
" " " Dynamically adjust delay based on performance " " "
stats = self . domain_stats [ domain ]
current_time = time . time ()
# Only adjust every 30 seconds
if current_time - stats [ 'last_adjustment' ] < 30 :
return stats [ 'delay' ]
avg_response_time = (
sum ( stats [ 'response_times' ]) / len ( stats [ 'response_times' ])
if stats [ 'response_times' ] else 1.0
)
# Adjustment logic
if stats [ 'success_rate' ] < 0.8 : # Low success rate
stats [ 'delay' ] *= 1.5 # Increase delay
elif avg_response_time > 5.0 : # Slow responses
stats [ 'delay' ] *= 1.2
elif stats [ 'success_rate' ] > 0.95 and avg_response_time < 2.0 :
stats [ 'delay' ] *= 0.9 # Decrease delay for good performance
# Keep delay within reasonable bounds
stats [ 'delay' ] = max ( 0.5 , min ( stats [ 'delay' ], 30.0 ))
stats [ 'last_adjustment' ] = current_time
return stats [ 'delay' ]
def request ( self , url ) :
" " " Make adaptive rate-limited request " " "
domain = urlparse ( url ) . netloc
delay = self . adjust_delay ( domain )
time . sleep ( delay )
start_time = time . time ()
try :
response = requests . get ( url , timeout = 10 )
response_time = time . time () - start_time
success = response . status_code == 200
self . record_response ( domain , response_time , success )
return response
except Exception as e :
response_time = time . time () - start_time
self . record_response ( domain , response_time , False )
raise
# Usage
adaptive_limiter = AdaptiveRateLimiter ()
# The limiter will automatically adjust delays based on performance
for i in range ( 100 ) :
try :
response = adaptive_limiter . request ( f " https://api.example.com/data/ { i} " )
print ( f " Request { i}: { response.status_code} " )
except Exception as e :
print ( f " Request { i} failed: { e} " )
</ code ></ pre >
< h2 > Distributed Rate Limiting </ h2 >
< h3 > Redis - Based Rate Limiting </ h3 >
< pre >< code >
import redis
import json
class DistributedRateLimiter :
def __init__ ( self , redis_url = 'redis://localhost:6379' ) :
self . redis_client = redis . from_url ( redis_url )
self . default_window = 60 # 1 minute window
self . default_limit = 30 # 30 requests per minute
def is_allowed ( self , domain , limit = None , window = None ) :
" " " Check if request is allowed using sliding window " " "
limit = limit or self . default_limit
window = window or self . default_window
current_time = time . time ()
key = f " rate_limit: { domain} "
# Use Redis pipeline for atomic operations
pipe = self . redis_client . pipeline ()
# Remove old entries outside the window
pipe . zremrangebyscore ( key , 0 , current_time - window )
# Count current requests in window
pipe . zcard ( key )
# Add current request
pipe . zadd ( key , { str ( current_time ) : current_time })
# Set expiry for cleanup
pipe . expire ( key , window )
results = pipe . execute ()
current_requests = results [ 1 ]
return current_requests < limit
def wait_if_needed ( self , domain , limit = None , window = None ) :
" " " Wait until request is allowed " " "
while not self . is_allowed ( domain , limit , window ) :
print ( f " Rate limit exceeded for { domain}, waiting... " )
time . sleep ( 1 )
def request ( self , url , ** kwargs ) :
" " " Make distributed rate-limited request " " "
domain = urlparse ( url ) . netloc
self . wait_if_needed ( domain )
return requests . get ( url , ** kwargs )
# Usage across multiple scraper instances
distributed_limiter = DistributedRateLimiter ()
# This will coordinate rate limiting across all instances
response = distributed_limiter . request ( " https://api.example.com/data " )
</ code ></ pre >
< h3 > Token Bucket Algorithm </ h3 >
< pre >< code >
class TokenBucket :
def __init__ ( self , capacity , refill_rate ) :
self . capacity = capacity
self . tokens = capacity
self . refill_rate = refill_rate # tokens per second
self . last_refill = time . time ()
def consume ( self , tokens = 1 ) :
" " " Try to consume tokens from bucket " " "
self . _refill ()
if self . tokens >= tokens :
self . tokens -= tokens
return True
return False
def _refill ( self ) :
" " " Refill tokens based on elapsed time " " "
current_time = time . time ()
elapsed = current_time - self . last_refill
# Add tokens based on elapsed time
tokens_to_add = elapsed * self . refill_rate
self . tokens = min ( self . capacity , self . tokens + tokens_to_add )
self . last_refill = current_time
def wait_for_tokens ( self , tokens = 1 ) :
" " " Wait until enough tokens are available " " "
while not self . consume ( tokens ) :
time . sleep ( 0.1 )
class TokenBucketRateLimiter :
def __init__ ( self ) :
self . buckets = {}
def get_bucket ( self , domain , capacity = 10 , refill_rate = 1.0 ) :
" " " Get or create token bucket for domain " " "
if domain not in self . buckets :
self . buckets [ domain ] = TokenBucket ( capacity , refill_rate )
return self . buckets [ domain ]
def request ( self , url , ** kwargs ) :
" " " Make token bucket rate-limited request " " "
domain = urlparse ( url ) . netloc
bucket = self . get_bucket ( domain )
# Wait for token availability
bucket . wait_for_tokens ()
return requests . get ( url , ** kwargs )
# Usage
token_limiter = TokenBucketRateLimiter ()
# Allows burst requests up to bucket capacity
# then throttles to refill rate
for i in range ( 20 ) :
response = token_limiter . request ( f " https://api.example.com/data/ { i} " )
print ( f " Request { i}: { response.status_code} " )
</ code ></ pre >
< h2 > Integration with Popular Libraries </ h2 >
< h3 > Scrapy Rate Limiting </ h3 >
< pre >< code >
# Custom Scrapy middleware for advanced rate limiting
from scrapy . downloadermiddlewares . delay import DelayMiddleware
class AdaptiveDelayMiddleware :
def __init__ ( self , delay = 1.0 ) :
self . delay = delay
self . domain_stats = defaultdict ( lambda : {
'delay' : delay ,
'errors' : 0 ,
'successes' : 0
})
@ classmethod
def from_crawler ( cls , crawler ) :
return cls (
delay = crawler . settings . getfloat ( 'DOWNLOAD_DELAY' , 1.0 )
)
def process_request ( self , request , spider ) :
domain = urlparse ( request . url ) . netloc
delay = self . calculate_delay ( domain )
if delay > 0 :
time . sleep ( delay )
def process_response ( self , request , response , spider ) :
domain = urlparse ( request . url ) . netloc
stats = self . domain_stats [ domain ]
if response . status == 200 :
stats [ 'successes' ] += 1
stats [ 'errors' ] = max ( 0 , stats [ 'errors' ] - 1 )
else :
stats [ 'errors' ] += 1
self . adjust_delay ( domain )
return response
def calculate_delay ( self , domain ) :
return self . domain_stats [ domain ][ 'delay' ]
def adjust_delay ( self , domain ) :
stats = self . domain_stats [ domain ]
if stats [ 'errors' ] > 3 :
stats [ 'delay' ] *= 1.5
elif stats [ 'successes' ] > 10 and stats [ 'errors' ] == 0 :
stats [ 'delay' ] *= 0.9
stats [ 'delay' ] = max ( 0.5 , min ( stats [ 'delay' ], 10.0 ))
# settings.py
DOWNLOADER_MIDDLEWARES = {
'myproject.middlewares.AdaptiveDelayMiddleware' : 543 ,
}
DOWNLOAD_DELAY = 1.0
RANDOMIZE_DOWNLOAD_DELAY = 0.5
</ code ></ pre >
< h3 > Requests - HTML Rate Limiting </ h3 >
< pre >< code >
from requests_html import HTMLSession
class RateLimitedSession ( HTMLSession ) :
def __init__ ( self , rate_limiter = None ) :
super () . __init__ ()
self . rate_limiter = rate_limiter or BasicRateLimiter ()
def get ( self , url , ** kwargs ) :
" " " Override get method with rate limiting " " "
self . rate_limiter . wait_for_domain ( url )
return super () . get ( url , ** kwargs )
def post ( self , url , ** kwargs ) :
" " " Override post method with rate limiting " " "
self . rate_limiter . wait_for_domain ( url )
return super () . post ( url , ** kwargs )
# Usage
session = RateLimitedSession (
rate_limiter = DomainRateLimiter ()
)
response = session . get ( 'https://example.com' )
response . html . render () # JavaScript rendering with rate limiting
</ code ></ pre >
< h2 > Monitoring and Analytics </ h2 >
< h3 > Rate Limiting Metrics </ h3 >
< pre >< code >
import logging
from collections import defaultdict
class RateLimitingMonitor :
def __init__ ( self ) :
self . metrics = defaultdict ( lambda : {
'requests_made' : 0 ,
'requests_blocked' : 0 ,
'total_delay_time' : 0 ,
'errors' : 0
})
# Setup logging
logging . basicConfig (
level = logging . INFO ,
format = '%(asctime)s - %(levelname)s - %(message)s' ,
handlers = [
logging . FileHandler ( 'rate_limiting.log' ),
logging . StreamHandler ()
]
)
self . logger = logging . getLogger ( __name__ )
def log_request ( self , domain , delay_time , success = True ) :
" " " Log request metrics " " "
metrics = self . metrics [ domain ]
metrics [ 'requests_made' ] += 1
metrics [ 'total_delay_time' ] += delay_time
if not success :
metrics [ 'errors' ] += 1
self . logger . info ( f " Domain: { domain}, Delay: { delay_time:.2f}s, Success: { success} " )
def log_rate_limit_hit ( self , domain ) :
" " " Log when rate limit is encountered " " "
self . metrics [ domain ][ 'requests_blocked' ] += 1
self . logger . warning ( f " Rate limit hit for domain: { domain} " )
def get_statistics ( self ) :
" " " Get comprehensive statistics " " "
stats = {}
for domain , metrics in self . metrics . items () :
total_requests = metrics [ 'requests_made' ]
if total_requests > 0 :
stats [ domain ] = {
'total_requests' : total_requests ,
'requests_blocked' : metrics [ 'requests_blocked' ],
'error_rate' : metrics [ 'errors' ] / total_requests ,
'avg_delay' : metrics [ 'total_delay_time' ] / total_requests ,
'block_rate' : metrics [ 'requests_blocked' ] / total_requests
}
return stats
def print_report ( self ) :
" " " Print detailed statistics report " " "
stats = self . get_statistics ()
print ( " \n " + " = " * 60 )
print ( " RATE LIMITING STATISTICS REPORT " )
print ( " = " * 60 )
for domain , metrics in stats . items () :
print ( f " \n Domain: { domain} " )
print ( f " Total Requests: { metrics['total_requests']} " )
print ( f " Requests Blocked: { metrics['requests_blocked']} " )
print ( f " Error Rate: { metrics['error_rate']:.2%} " )
print ( f " Average Delay: { metrics['avg_delay']:.2f}s " )
print ( f " Block Rate: { metrics['block_rate']:.2%} " )
# Usage
monitor = RateLimitingMonitor ()
class MonitoredRateLimiter ( BasicRateLimiter ) :
def __init__ ( self , monitor , * args , ** kwargs ) :
super () . __init__ ( * args , ** kwargs )
self . monitor = monitor
def request ( self , url , ** kwargs ) :
domain = urlparse ( url ) . netloc
start_time = time . time ()
try :
response = super () . request ( url , ** kwargs )
delay_time = time . time () - start_time
success = response . status_code == 200
self . monitor . log_request ( domain , delay_time , success )
return response
except Exception as e :
delay_time = time . time () - start_time
self . monitor . log_request ( domain , delay_time , False )
raise
# Use monitored rate limiter
limiter = MonitoredRateLimiter ( monitor , delay_range = ( 1 , 3 ))
# After scraping session
monitor . print_report ()
</ code ></ pre >
< h2 > Best Practices and Recommendations </ h2 >
< h3 > General Guidelines </ h3 >
< ul >
< li >< strong > Start Conservative :</ strong > Begin with longer delays and adjust down </ li >
< li >< strong > Respect robots . txt :</ strong > Check crawl - delay directives </ li >
< li >< strong > Monitor Server Response :</ strong > Watch for 429 status codes </ li >
< li >< strong > Use Random Delays :</ strong > Avoid predictable patterns </ li >
< li >< strong > Implement Backoff :</ strong > Increase delays on errors </ li >
</ ul >
< h3 > Domain - Specific Strategies </ h3 >
< ul >
< li >< strong > E - commerce Sites :</ strong > 2 - 5 second delays during peak hours </ li >
< li >< strong > News Websites :</ strong > 1 - 3 second delays , respect peak traffic </ li >
< li >< strong > APIs :</ strong > Follow documented rate limits strictly </ li >
< li >< strong > Government Sites :</ strong > Very conservative approach ( 5 + seconds ) </ li >
< li >< strong > Social Media :</ strong > Use official APIs when possible </ li >
</ ul >
< h3 > Legal and Ethical Considerations </ h3 >
< ul >
< li > Review terms of service before scraping </ li >
< li > Identify yourself with proper User - Agent headers </ li >
< li > Consider reaching out for API access </ li >
< li > Respect copyright and data protection laws </ li >
< li > Implement circuit breakers for server protection </ li >
</ ul >
< div class = " article-cta " >
< h3 > Professional Rate Limiting Solutions </ h3 >
< p > UK Data Services implements sophisticated rate limiting strategies for ethical , compliant web scraping that respects website resources while maximizing data collection efficiency .</ p >
<a href="/quote" class="btn btn-primary">Get Rate Limiting Consultation</a>
</ div >
</ div >
</ div >
<!-- Related Articles -->
<aside class="related-articles">
<h3>Related Articles</h3>
<div class="related-grid">
<article class="related-card">
<span class="category">Web Scraping</span>
<h4><a href="handling-captchas-scraping.php">Handling CAPTCHAs in Web Scraping: Complete Guide</a></h4>
<span class="read-time">8 min read</span>
</article>
<article class="related-card">
<span class="category">Web Scraping</span>
<h4><a href="python-scrapy-enterprise-guide.php">Python Scrapy Enterprise Guide: Scaling Web Scraping Operations</a></h4>
<span class="read-time">12 min read</span>
</article>
<article class="related-card">
<span class="category">Compliance</span>
<h4><a href="web-scraping-compliance-uk-guide.php">Complete Guide to Web Scraping Compliance in the UK</a></h4>
<span class="read-time">12 min read</span>
</article>
</div>
</aside>
</ div >
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
</article>
</main>
<!-- Footer -->
< footer class = " footer " >
< div class = " container " >
< div class = " footer-content " >
< div class = " footer-section " >
< div class = " footer-logo " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/logo-white.svg " alt = " UK Data Services " loading = " lazy " >
2025-06-08 11:21:30 +01:00
</ div >
< p > Enterprise data intelligence solutions for modern British business .</ p >
</ div >
< div class = " footer-section " >
< h3 > Quick Links </ h3 >
<ul>
<li><a href="/#services">Services</a></li>
<li><a href="/blog/">Blog</a></li>
<li><a href="/case-studies/">Case Studies</a></li>
<li><a href="/about">About</a></li>
<li><a href="/#contact">Contact</a></li>
</ul>
</ div >
< div class = " footer-section " >
< h3 > Legal </ h3 >
<ul>
<li><a href="/privacy-policy">Privacy Policy</a></li>
<li><a href="/terms-of-service">Terms of Service</a></li>
<li><a href="/cookie-policy">Cookie Policy</a></li>
<li><a href="/gdpr-compliance">GDPR Compliance</a></li>
</ul>
</ div >
</ div >
< div class = " footer-bottom " >
< p >& copy ; < ? php echo date ( 'Y' ); ?> UK Data Services. All rights reserved.</p>
< div class = " social-links " >
2025-12-21 08:08:45 +00:00
< a href = " https://linkedin.com/company/uk-data-services " aria - label = " LinkedIn " rel = " noopener " target = " _blank " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/icon-linkedin.svg " alt = " LinkedIn " loading = " lazy " >
2025-06-08 11:21:30 +01:00
</ a >
< a href = " https://twitter.com/ukdataservices " aria - label = " Twitter " rel = " noopener " target = " _blank " >
2026-02-05 04:11:15 +00:00
< img loading = " lazy " src = " ../../assets/images/icon-twitter.svg " alt = " Twitter " loading = " lazy " >
2025-06-08 11:21:30 +01:00
</ a >
</ div >
</ div >
</ div >
</ footer >
<!-- Scripts -->
< script src = " ../../assets/js/main.js " ></ script >
<script src="../../assets/js/cro-enhancements.js"></script>
</ body >
</ html >