<?php
// Enhanced security headers
header('X-Content-Type-Options: nosniff');
header('X-Frame-Options: DENY');
header('X-XSS-Protection: 1; mode=block');
header('Strict-Transport-Security: max-age=31536000; includeSubDomains');
header('Referrer-Policy: strict-origin-when-cross-origin');

// Article-specific SEO variables
$article_title = "Python Scrapy Enterprise Guide: Scaling Web Scraping Operations";
$article_description = "Master Scrapy for enterprise-scale web scraping operations. Learn advanced techniques, best practices, and optimization strategies for production deployments.";
$article_keywords = "Python Scrapy enterprise, web scraping framework, Scrapy best practices, enterprise web scraping, Python data extraction, Scrapy optimization";
$article_author = "UK Data Services Technical Team";
$canonical_url = "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide";
$article_published = "2025-05-15T09:00:00+00:00";
$article_modified = "2025-05-15T09:00:00+00:00";
$og_image = "https://ukdataservices.co.uk/assets/images/icon-web-scraping-v2.svg";
$read_time = 12;
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title><?php echo htmlspecialchars($article_title); ?> | UK Data Services Blog</title>
<meta name="description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta name="keywords" content="<?php echo htmlspecialchars($article_keywords); ?>">
<meta name="author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta name="robots" content="index, follow">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">

<!-- Article-specific meta tags -->
<meta property="article:published_time" content="<?php echo $article_published; ?>">
<meta property="article:modified_time" content="<?php echo $article_modified; ?>">
<meta property="article:author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta property="article:section" content="Web Scraping">
<meta property="article:tag" content="Python, Scrapy, Web Scraping, Enterprise, Framework">

<!-- Preload critical resources -->
<link rel="preload" href="../../assets/css/main.css" as="style">
<link rel="preload" href="../../assets/images/ukds-main-logo.png" as="image">

<!-- Open Graph / Social Media -->
<meta property="og:type" content="article">
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($article_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta property="og:image" content="<?php echo htmlspecialchars($og_image); ?>">

<!-- Twitter Card -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="<?php echo htmlspecialchars($article_title); ?>">
<meta name="twitter:description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta name="twitter:image" content="<?php echo htmlspecialchars($og_image); ?>">

<!-- Favicon and App Icons -->
<link rel="icon" type="image/svg+xml" href="../../assets/images/favicon.svg">
<link rel="apple-touch-icon" sizes="180x180" href="../../assets/images/apple-touch-icon.svg">

<!-- Fonts -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400;500;600;700&family=Lato:wght@300;400;500;600;700&display=swap" rel="stylesheet">

<!-- Styles -->
<link rel="stylesheet" href="../../assets/css/main.css">

<!-- Article Schema -->
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "Article",
  "mainEntityOfPage": {
    "@type": "WebPage",
    "@id": "<?php echo htmlspecialchars($canonical_url); ?>"
  },
  "headline": "<?php echo htmlspecialchars($article_title); ?>",
  "description": "<?php echo htmlspecialchars($article_description); ?>",
  "image": "<?php echo htmlspecialchars($og_image); ?>",
  "author": {
    "@type": "Organization",
    "name": "UK Data Services",
    "url": "https://ukdataservices.co.uk"
  },
  "publisher": {
    "@type": "Organization",
    "name": "UK Data Services",
    "logo": {
      "@type": "ImageObject",
      "url": "https://ukdataservices.co.uk/assets/images/ukds-main-logo.png"
    }
  },
  "datePublished": "<?php echo $article_published; ?>",
  "dateModified": "<?php echo $article_modified; ?>"
}
</script>

<!-- HowTo Schema for Technical Guide -->
<script type="application/ld+json">
{
  "@context": "https://schema.org",
  "@type": "HowTo",
  "name": "How to Set Up Scrapy for Enterprise Web Scraping Operations",
  "description": "Step-by-step guide to implement and scale Python Scrapy for enterprise web scraping operations with best practices and optimization techniques.",
  "image": "https://ukdataservices.co.uk/assets/images/icon-web-scraping-v2.svg",
  "estimatedCost": {
    "@type": "MonetaryAmount",
    "currency": "GBP",
    "value": "0"
  },
  "totalTime": "PT45M",
  "supply": [
    {"@type": "HowToSupply", "name": "Python 3.8+"},
    {"@type": "HowToSupply", "name": "Scrapy Framework"},
    {"@type": "HowToSupply", "name": "Development Environment"}
  ],
  "tool": [
    {"@type": "HowToTool", "name": "Python IDE"},
    {"@type": "HowToTool", "name": "Command Line Interface"}
  ],
  "step": [
    {
      "@type": "HowToStep",
      "name": "Install Scrapy Framework",
      "text": "Install Scrapy using pip and set up your development environment",
      "url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#installation"
    },
    {
      "@type": "HowToStep",
      "name": "Create Scrapy Project",
      "text": "Initialize a new Scrapy project with proper directory structure",
      "url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#project-setup"
    },
    {
      "@type": "HowToStep",
      "name": "Configure Settings",
      "text": "Set up enterprise-grade configuration for production deployment",
      "url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#configuration"
    },
    {
      "@type": "HowToStep",
      "name": "Implement Spiders",
      "text": "Build scalable spider classes with proper error handling",
      "url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#spider-development"
    },
    {
      "@type": "HowToStep",
      "name": "Deploy and Monitor",
      "text": "Deploy to production and implement monitoring systems",
      "url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#deployment"
    }
  ]
}
</script>
</head>
<body>
<!-- Skip to content link for accessibility -->
<a href="#main-content" class="skip-to-content">Skip to main content</a>

<nav class="navbar scrolled" id="navbar">
<div class="nav-container">
<div class="nav-logo">
<a href="/">
<img src="../../assets/images/ukds-main-logo.png" alt="UK Data Services" class="logo" loading="eager">
</a>
</div>
<div class="nav-menu" id="nav-menu">
<a href="/" class="nav-link">Home</a>
<a href="/#services" class="nav-link">Capabilities</a>
<a href="/project-types" class="nav-link">Project Types</a>
<a href="/about" class="nav-link">About</a>
<a href="/blog/" class="nav-link active">Blog</a>
<a href="/#contact" class="nav-link">Contact</a>
<a href="/quote" class="nav-link cta-button">Request Consultation</a>
</div>
<div class="nav-toggle" id="nav-toggle">
<span class="bar"></span>
<span class="bar"></span>
<span class="bar"></span>
</div>
</div>
</nav>

<!-- Breadcrumb Navigation -->
<div class="breadcrumb">
<nav aria-label="Breadcrumb">
<ol>
<li><a href="/">Home</a></li>
<li><a href="/blog/">Blog</a></li>
<li><a href="../categories/web-scraping.php">Web Scraping</a></li>
<li aria-current="page"><span>Python Scrapy Enterprise Guide</span></li>
</ol>
</nav>
</div>

<!-- Article Content -->
<main id="main-content">
<article class="article-page">
<div class="container">
<header class="article-header">
<div class="article-meta">
<span class="category">Web Scraping</span>
<time datetime="2025-05-15">15 May 2025</time>
<span class="read-time"><?php echo $read_time; ?> min read</span>
</div>
<h1><?php echo htmlspecialchars($article_title); ?></h1>
<p class="article-lead"><?php echo htmlspecialchars($article_description); ?></p>

<div class="article-author">
<div class="author-info">
<span>By <?php echo htmlspecialchars($article_author); ?></span>
</div>
<div class="share-buttons">
<a href="https://www.linkedin.com/sharing/share-offsite/?url=<?php echo urlencode($canonical_url); ?>" class="share-button linkedin" aria-label="Share on LinkedIn" rel="noopener" target="_blank">
<img src="../../assets/images/icon-linkedin.svg" alt="LinkedIn">
</a>
<a href="https://twitter.com/intent/tweet?url=<?php echo urlencode($canonical_url); ?>&amp;text=<?php echo urlencode($article_title); ?>" class="share-button twitter" aria-label="Share on Twitter" rel="noopener" target="_blank">
<img src="../../assets/images/icon-twitter.svg" alt="Twitter">
</a>
</div>
</div>
</header>

<div class="article-content">
<div class="content-wrapper">
<h2>Why Scrapy for Enterprise Web Scraping?</h2>
<p>Scrapy stands out as the premier Python framework for large-scale web scraping operations. Unlike simple scripts or basic tools, Scrapy provides the robust architecture, built-in features, and extensibility that enterprise applications demand.</p>

<p>This comprehensive guide covers everything you need to know to deploy Scrapy in production environments, from initial setup to advanced optimization techniques.</p>

<h2>Enterprise-Grade Scrapy Architecture</h2>

<h3>Core Components Overview</h3>
<ul>
<li><strong>Scrapy Engine:</strong> Controls data flow between components</li>
<li><strong>Scheduler:</strong> Receives requests and queues them for processing</li>
<li><strong>Downloader:</strong> Fetches web pages and returns responses</li>
<li><strong>Spiders:</strong> Custom classes that define scraping logic</li>
<li><strong>Item Pipeline:</strong> Processes extracted data</li>
<li><strong>Middlewares:</strong> Hooks for customizing request/response processing</li>
</ul>

<h3>Production Project Structure</h3>
<pre><code>
enterprise_scraper/
├── scrapy.cfg
├── requirements.txt
├── docker-compose.yml
├── enterprise_scraper/
│   ├── __init__.py
│   ├── settings/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── development.py
│   │   ├── staging.py
│   │   └── production.py
│   ├── spiders/
│   │   ├── __init__.py
│   │   ├── base_spider.py
│   │   └── ecommerce_spider.py
│   ├── items.py
│   ├── pipelines.py
│   ├── middlewares.py
│   └── utils/
│       ├── __init__.py
│       ├── database.py
│       └── monitoring.py
├── deploy/
│   ├── Dockerfile
│   └── kubernetes/
└── tests/
    ├── unit/
    └── integration/
</code></pre>

<h2>Advanced Configuration Management</h2>

<h3>Environment-Specific Settings</h3>
<pre><code>
# settings/base.py
BOT_NAME = 'enterprise_scraper'
SPIDER_MODULES = ['enterprise_scraper.spiders']
NEWSPIDER_MODULE = 'enterprise_scraper.spiders'

# Respect robots.txt for compliance
ROBOTSTXT_OBEY = True

# Configure concurrent requests
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 8

# Download delays for respectful scraping
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True  # varies the delay between 0.5x and 1.5x

# settings/production.py
import os

from .base import *

# Increase concurrency for production
CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 16

# Enable autothrottling
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0

# Logging configuration
LOG_LEVEL = 'INFO'
LOG_FILE = '/var/log/scrapy/scrapy.log'

# Database settings
DATABASE_URL = os.environ.get('DATABASE_URL')
REDIS_URL = os.environ.get('REDIS_URL')
</code></pre>

<h3>Dynamic Settings with Environment Variables</h3>
<pre><code>
import os

from scrapy.utils.project import get_project_settings


def get_scrapy_settings():
    settings = get_project_settings()

    # Environment-specific overrides
    if os.environ.get('SCRAPY_ENV') == 'production':
        settings.set('CONCURRENT_REQUESTS', 200)
        settings.set('DOWNLOAD_DELAY', 0.5)
    elif os.environ.get('SCRAPY_ENV') == 'development':
        settings.set('CONCURRENT_REQUESTS', 16)
        settings.set('DOWNLOAD_DELAY', 2)

    return settings
</code></pre>
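
<p>To tie this together, here is a brief sketch of launching a crawl programmatically with these settings, assuming it lives in the same module as get_scrapy_settings. The spider name 'ecommerce' refers to the spider defined later in this guide; CrawlerProcess is Scrapy's standard programmatic entry point.</p>
<pre><code>
from scrapy.crawler import CrawlerProcess


def run_crawl():
    # Resolve environment-aware settings, then hand them to Scrapy
    process = CrawlerProcess(get_scrapy_settings())
    process.crawl('ecommerce')  # spider name, resolved via SPIDER_MODULES
    process.start()  # blocks until the crawl finishes
</code></pre>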

<h2>Enterprise Spider Development</h2>

<h3>Base Spider Class</h3>
<pre><code>
import logging
import time
from typing import Optional

import scrapy
from scrapy.http import Request


class BaseSpider(scrapy.Spider):
    """Base spider with common enterprise functionality.

    Subclasses implement parse_content() with their scraping logic.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.setup_logging()
        self.setup_monitoring()

    def setup_logging(self):
        """Configure structured logging. Spider.logger is a read-only
        property in Scrapy, so a custom logger goes on its own attribute."""
        self.audit_logger = logging.getLogger(f'{self.name}.audit')

    def setup_monitoring(self):
        """Initialize monitoring metrics"""
        self.stats = {
            'pages_scraped': 0,
            'items_extracted': 0,
            'errors': 0,
        }

    def parse_with_error_handling(self, response):
        """Parse with comprehensive error handling"""
        try:
            yield from self.parse_content(response)
        except Exception as e:
            self.logger.error(f"Error parsing {response.url}: {e}")
            self.stats['errors'] += 1

    def make_request(self, url: str, callback=None, meta: Optional[dict] = None) -> Request:
        """Create request with standard metadata"""
        return Request(
            url=url,
            callback=callback or self.parse_with_error_handling,
            meta={
                'spider_name': self.name,
                'timestamp': time.time(),
                **(meta or {}),
            },
            dont_filter=False,
        )
</code></pre>
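
<p>The e-commerce spider below imports a ProductItem from items.py, which this guide does not otherwise show. A minimal sketch of what that file might contain, with fields mirroring those populated in parse_product:</p>
<pre><code>
# items.py -- hypothetical definition matching the fields used below
import scrapy


class ProductItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    images = scrapy.Field()
    availability = scrapy.Field()
    rating = scrapy.Field()
    reviews_count = scrapy.Field()
</code></pre>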

<h3>Advanced E-commerce Spider</h3>
<pre><code>
import re

from enterprise_scraper.spiders.base_spider import BaseSpider
from enterprise_scraper.items import ProductItem


class EcommerceSpider(BaseSpider):
    name = 'ecommerce'
    allowed_domains = ['example-store.com']

    custom_settings = {
        'ITEM_PIPELINES': {
            'enterprise_scraper.pipelines.ValidationPipeline': 300,
            'enterprise_scraper.pipelines.DatabasePipeline': 400,
        },
        'DOWNLOAD_DELAY': 2,
    }

    def start_requests(self):
        """Generate initial requests with pagination"""
        base_url = "https://example-store.com/products"

        for page in range(1, 101):  # First 100 pages
            url = f"{base_url}?page={page}"
            yield self.make_request(
                url=url,
                callback=self.parse_product_list,
                meta={'page': page}
            )

    def parse_product_list(self, response):
        """Extract product URLs from listing pages"""
        product_urls = response.css('.product-link::attr(href)').getall()

        for url in product_urls:
            yield self.make_request(
                url=response.urljoin(url),
                callback=self.parse_product,
                meta={'category': response.meta.get('category')}
            )

        # Handle pagination
        next_page = response.css('.pagination .next::attr(href)').get()
        if next_page:
            yield self.make_request(
                url=response.urljoin(next_page),
                callback=self.parse_product_list
            )

    def parse_product(self, response):
        """Extract product details"""
        item = ProductItem()

        item['url'] = response.url
        item['name'] = response.css('h1.product-title::text').get()
        item['price'] = self.extract_price(response)
        item['description'] = response.css('.product-description::text').getall()
        item['images'] = response.css('.product-images img::attr(src)').getall()
        item['availability'] = response.css('.stock-status::text').get()
        item['rating'] = self.extract_rating(response)
        item['reviews_count'] = self.extract_reviews_count(response)

        self.stats['items_extracted'] += 1
        yield item

    def extract_price(self, response):
        """Extract and normalize price data"""
        price_text = response.css('.price::text').get()
        if price_text:
            # Remove currency symbols and normalize
            price = re.sub(r'[^\d.]', '', price_text)
            return float(price) if price else None
        return None

    def extract_rating(self, response):
        """Extract the average rating (selector is illustrative)"""
        rating = response.css('.rating::attr(data-rating)').get()
        return float(rating) if rating else None

    def extract_reviews_count(self, response):
        """Extract the review count (selector is illustrative)"""
        count = response.css('.reviews-count::text').re_first(r'\d+')
        return int(count) if count else None
</code></pre>

<h2>Enterprise Pipeline System</h2>

<h3>Validation Pipeline</h3>
<pre><code>
import validators  # third-party "validators" package
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class ValidationPipeline:
    """Validate items before processing"""

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)

        # Required field validation
        if not adapter.get('name'):
            raise DropItem(f"Missing product name: {item}")

        # URL validation
        if not validators.url(adapter.get('url')):
            raise DropItem(f"Invalid URL: {adapter.get('url')}")

        # Price validation
        price = adapter.get('price')
        if price is not None:
            try:
                price = float(price)
                if price < 0:
                    raise DropItem(f"Invalid price: {price}")
                adapter['price'] = price
            except (ValueError, TypeError):
                raise DropItem(f"Invalid price format: {price}")

        spider.logger.info(f"Item validated: {adapter.get('name')}")
        return item
</code></pre>

<h3>Database Pipeline with Connection Pooling</h3>
<pre><code>
import asyncpg
from itemadapter import ItemAdapter


class DatabasePipeline:
    """Asynchronous database pipeline"""

    def __init__(self, db_url, pool_size=20):
        self.db_url = db_url
        self.pool_size = pool_size
        self.pool = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            db_url=crawler.settings.get('DATABASE_URL'),
            pool_size=crawler.settings.get('DB_POOL_SIZE', 20)
        )

    async def open_spider(self, spider):
        """Initialize database connection pool"""
        self.pool = await asyncpg.create_pool(
            self.db_url,
            min_size=5,
            max_size=self.pool_size
        )
        spider.logger.info("Database connection pool created")

    async def close_spider(self, spider):
        """Close database connection pool"""
        if self.pool:
            await self.pool.close()
            spider.logger.info("Database connection pool closed")

    async def process_item(self, item, spider):
        """Insert or update the item in the database"""
        adapter = ItemAdapter(item)

        async with self.pool.acquire() as connection:
            await connection.execute('''
                INSERT INTO products (url, name, price, description)
                VALUES ($1, $2, $3, $4)
                ON CONFLICT (url) DO UPDATE SET
                    name = EXCLUDED.name,
                    price = EXCLUDED.price,
                    description = EXCLUDED.description,
                    updated_at = NOW()
                ''',
                adapter.get('url'),
                adapter.get('name'),
                adapter.get('price'),
                '\n'.join(adapter.get('description', []))
            )

        spider.logger.info(f"Item saved: {adapter.get('name')}")
        return item
</code></pre>
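
<p>Coroutine-based pipeline methods like these require Scrapy to run on the asyncio-backed Twisted reactor (supported in Scrapy 2.x). One way to enable it:</p>
<pre><code>
# settings/base.py -- required before asyncio libraries such as asyncpg will work
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
</code></pre>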

<h2>Middleware for Enterprise Features</h2>

<h3>Rotating Proxy Middleware</h3>
<pre><code>
import random


class RotatingProxyMiddleware:
    """Rotate proxies for each request. A plain downloader middleware;
    the built-in HttpProxyMiddleware honours request.meta['proxy']."""

    def __init__(self, proxy_list):
        self.proxy_list = proxy_list

    @classmethod
    def from_crawler(cls, crawler):
        proxy_list = crawler.settings.get('PROXY_LIST', [])
        return cls(proxy_list)

    def process_request(self, request, spider):
        if self.proxy_list:
            proxy = random.choice(self.proxy_list)
            request.meta['proxy'] = proxy
            spider.logger.debug(f"Using proxy: {proxy}")

        return None
</code></pre>

<h3>Rate Limiting Middleware</h3>
<pre><code>
import time
from collections import defaultdict

from scrapy.downloadermiddlewares.retry import RetryMiddleware


class RateLimitMiddleware(RetryMiddleware):
    """Implement per-domain rate limiting"""

    def __init__(self, settings):
        super().__init__(settings)
        self.domain_delays = defaultdict(float)
        self.last_request_time = defaultdict(float)

    def process_request(self, request, spider):
        domain = request.url.split('/')[2]
        current_time = time.time()

        # Calculate required delay
        min_delay = self.domain_delays.get(domain, 1.0)
        time_since_last = current_time - self.last_request_time[domain]

        if time_since_last < min_delay:
            delay = min_delay - time_since_last
            spider.logger.debug(f"Rate limiting {domain}: {delay:.2f}s")
            time.sleep(delay)

        self.last_request_time[domain] = time.time()
        return None
</code></pre>
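
<p>Neither middleware takes effect until it is registered in settings. A sketch of the corresponding entries; the priorities and proxy endpoints are illustrative, not prescriptive:</p>
<pre><code>
# settings/base.py -- hypothetical registration of the custom middlewares
DOWNLOADER_MIDDLEWARES = {
    'enterprise_scraper.middlewares.RotatingProxyMiddleware': 350,
    'enterprise_scraper.middlewares.RateLimitMiddleware': 360,
}

# Proxy pool consumed by RotatingProxyMiddleware (placeholder endpoints)
PROXY_LIST = [
    'http://proxy1.example.com:8080',
    'http://proxy2.example.com:8080',
]
</code></pre>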

<h2>Monitoring and Observability</h2>

<h3>Custom Stats Collection</h3>
<pre><code>
import time

from scrapy.statscollectors import StatsCollector


class EnterpriseStatsCollector(StatsCollector):
    """Enhanced stats collection for monitoring"""

    def __init__(self, crawler):
        super().__init__(crawler)
        self.start_time = time.time()
        self.custom_stats = {}

    def get_stats(self, spider=None):
        """Enhanced stats with custom metrics"""
        stats = super().get_stats(spider)

        # Add runtime statistics
        runtime = time.time() - self.start_time
        stats['runtime_seconds'] = runtime

        # Add rate calculations
        pages_count = stats.get('response_received_count', 0)
        if runtime > 0:
            stats['pages_per_minute'] = (pages_count / runtime) * 60

        # Add custom metrics
        stats.update(self.custom_stats)

        return stats

    def inc_value(self, key, count=1, start=0, spider=None):
        """Increment counter and log significant milestones"""
        super().inc_value(key, count, start, spider=spider)

        current_value = self.get_value(key, 0)
        if spider and current_value % 1000 == 0:  # Every 1000 increments
            spider.logger.info(f"{key}: {current_value}")
</code></pre>
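
<p>Scrapy will only use this collector if the STATS_CLASS setting points at it. Assuming the class lives in utils/monitoring.py as in the project structure above:</p>
<pre><code>
# settings/base.py
STATS_CLASS = 'enterprise_scraper.utils.monitoring.EnterpriseStatsCollector'
</code></pre>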

<h2>Production Deployment</h2>

<h3>Docker Configuration</h3>
<pre><code>
# Dockerfile
FROM python:3.9-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    libc-dev \
    libffi-dev \
    libssl-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Create non-root user
RUN useradd -m -u 1000 scrapy && chown -R scrapy:scrapy /app
USER scrapy

# Default command
CMD ["scrapy", "crawl", "ecommerce"]
</code></pre>

<h3>Kubernetes Deployment</h3>
<pre><code>
apiVersion: apps/v1
kind: Deployment
metadata:
  name: scrapy-deployment
spec:
  replicas: 3
  selector:
    matchLabels:
      app: scrapy
  template:
    metadata:
      labels:
        app: scrapy
    spec:
      containers:
        - name: scrapy
          image: enterprise-scrapy:latest
          resources:
            requests:
              memory: "1Gi"
              cpu: "500m"
            limits:
              memory: "2Gi"
              cpu: "1000m"
          env:
            - name: SCRAPY_ENV
              value: "production"
            - name: DATABASE_URL
              valueFrom:
                secretKeyRef:
                  name: db-secret
                  key: url
---
apiVersion: v1
kind: Service
metadata:
  name: scrapy-service
spec:
  selector:
    app: scrapy
  ports:
    - port: 6800
      targetPort: 6800
</code></pre>

<h2>Performance Optimization</h2>

<h3>Memory Management</h3>
<ul>
<li><strong>Item Pipeline:</strong> Process items immediately to avoid memory buildup</li>
<li><strong>Response Caching:</strong> Disable for production unless specifically needed</li>
<li><strong>Request Filtering:</strong> Use duplicate filters efficiently</li>
<li><strong>Large Responses:</strong> Stream large files instead of loading into memory (see the settings sketch below)</li>
</ul>
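
<p>A settings sketch reflecting these recommendations; the thresholds are illustrative starting points rather than universal values:</p>
<pre><code>
# settings/production.py -- memory-conscious defaults (illustrative values)
HTTPCACHE_ENABLED = False             # do not cache responses in production
DOWNLOAD_MAXSIZE = 10 * 1024 * 1024   # abort responses larger than 10 MB
DOWNLOAD_WARNSIZE = 5 * 1024 * 1024   # warn once a response passes 5 MB
MEMUSAGE_ENABLED = True               # shut the crawl down at a memory ceiling
MEMUSAGE_LIMIT_MB = 2048
</code></pre>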

<h3>Scaling Strategies</h3>
<ul>
<li><strong>Horizontal Scaling:</strong> Multiple spider instances</li>
<li><strong>Domain Sharding:</strong> Distribute domains across instances</li>
<li><strong>Queue Management:</strong> Redis-based distributed queuing (see the scrapy-redis sketch below)</li>
<li><strong>Load Balancing:</strong> Distribute requests across proxy pools</li>
</ul>
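
<p>For Redis-based distributed queuing, the scrapy-redis package is the usual choice. A minimal configuration sketch, assuming scrapy-redis is installed and a Redis endpoint is available:</p>
<pre><code>
# settings/production.py -- distributed scheduling via scrapy-redis
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
SCHEDULER_PERSIST = True                      # keep the queue between runs
REDIS_URL = 'redis://redis.internal:6379/0'   # placeholder endpoint
</code></pre>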

<h2>Best Practices Summary</h2>

<h3>Code Organization</h3>
<ul>
<li>Use inheritance for common spider functionality</li>
<li>Separate settings by environment</li>
<li>Implement comprehensive error handling</li>
<li>Write unit tests for custom components</li>
</ul>

<h3>Operational Excellence</h3>
<ul>
<li>Monitor performance metrics continuously</li>
<li>Implement circuit breakers for external services (a minimal sketch follows this list)</li>
<li>Use structured logging for better observability</li>
<li>Plan for graceful degradation</li>
</ul>
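
<p>Scrapy has no built-in circuit breaker, so the pattern is typically hand-rolled or pulled in from a library. A minimal hand-rolled sketch of the idea, independent of any external service:</p>
<pre><code>
import time


class CircuitBreaker:
    """Stop calling a failing service; retry after a cooldown."""

    def __init__(self, max_failures=5, reset_timeout=60):
        self.max_failures = max_failures
        self.reset_timeout = reset_timeout
        self.failures = 0
        self.opened_at = None

    def allow(self):
        """True if a call may proceed (closed, or half-open after cooldown)."""
        if self.opened_at is None:
            return True
        return time.time() - self.opened_at >= self.reset_timeout

    def record_success(self):
        self.failures = 0
        self.opened_at = None

    def record_failure(self):
        self.failures += 1
        if self.failures >= self.max_failures:
            self.opened_at = time.time()
</code></pre>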

<h3>Compliance and Ethics</h3>
<ul>
<li>Respect robots.txt and rate limits</li>
<li>Implement proper user agent identification (see the one-line sketch below)</li>
<li>Handle personal data according to GDPR</li>
<li>Maintain audit trails for data collection</li>
</ul>
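
<p>Transparent user agent identification is a one-line setting; the contact URL here is a placeholder:</p>
<pre><code>
# settings/base.py -- identify the bot and give site owners a contact point
USER_AGENT = 'enterprise_scraper/1.0 (+https://ukdataservices.co.uk/bot)'
</code></pre>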

<div class="article-cta">
<h3>Scale Your Scrapy Operations</h3>
<p>UK Data Services provides enterprise Scrapy development and deployment services. Let our experts help you build robust, scalable web scraping solutions.</p>
<a href="/quote" class="btn btn-primary">Get Scrapy Consultation</a>
</div>
</div>
</div>

<!-- Related Articles -->
<aside class="related-articles">
<h3>Related Articles</h3>
<div class="related-grid">
<article class="related-card">
<span class="category">Web Scraping</span>
<h4><a href="javascript-heavy-sites-scraping.php">Scraping JavaScript-Heavy Sites: Advanced Techniques</a></h4>
<span class="read-time">6 min read</span>
</article>
<article class="related-card">
<span class="category">Technology</span>
<h4><a href="cloud-native-scraping-architecture.php">Cloud-Native Scraping Architecture for Enterprise Scale</a></h4>
<span class="read-time">11 min read</span>
</article>
<article class="related-card">
<span class="category">Compliance</span>
<h4><a href="web-scraping-compliance-uk-guide.php">Complete Guide to Web Scraping Compliance in the UK</a></h4>
<span class="read-time">12 min read</span>
</article>
</div>
</aside>
</div>
</article>
</main>

<!-- Footer -->
<footer class="footer">
<div class="container">
<div class="footer-content">
<div class="footer-section">
<div class="footer-logo">
<img src="../../assets/images/logo-white.svg" alt="UK Data Services" loading="lazy">
</div>
<p>Enterprise data intelligence solutions for modern British business.</p>
</div>

<div class="footer-section">
<h3>Quick Links</h3>
<ul>
<li><a href="/#services">Services</a></li>
<li><a href="/blog/">Blog</a></li>
<li><a href="/case-studies/">Case Studies</a></li>
<li><a href="/about">About</a></li>
<li><a href="/#contact">Contact</a></li>
</ul>
</div>

<div class="footer-section">
<h3>Legal</h3>
<ul>
<li><a href="/privacy-policy">Privacy Policy</a></li>
<li><a href="/terms-of-service">Terms of Service</a></li>
<li><a href="/cookie-policy">Cookie Policy</a></li>
<li><a href="/gdpr-compliance">GDPR Compliance</a></li>
</ul>
</div>
</div>

<div class="footer-bottom">
<p>© <?php echo date('Y'); ?> UK Data Services. All rights reserved.</p>
<div class="social-links">
<a href="https://www.linkedin.com/company/uk-data-services" aria-label="LinkedIn" rel="noopener" target="_blank">
<img src="../../assets/images/icon-linkedin.svg" alt="LinkedIn" loading="lazy">
</a>
<a href="https://twitter.com/ukdataservices" aria-label="Twitter" rel="noopener" target="_blank">
<img src="../../assets/images/icon-twitter.svg" alt="Twitter" loading="lazy">
</a>
</div>
</div>
</div>
</footer>

<!-- Scripts -->
<script src="../../assets/js/main.js"></script>
</body>
</html>