- Remap 20 articles from generic team names (UK Data Services Legal Team, Analytics Team, Technical Team etc.) to matching named authors from the author database (Sarah Chen, David Martinez, Michael Thompson, etc.) - Add 5 new named authors to author-bio.php: Alex Kumar, David Thompson, Emily Roberts, Michael Chen, Sarah Mitchell - Eliminates author name/bio mismatch where team name showed but Editorial Team bio/role rendered instead
861 lines
34 KiB
PHP
861 lines
34 KiB
PHP
<?php

declare(strict_types=1);

// Enhanced security headers — must be sent before any output.
header('Strict-Transport-Security: max-age=31536000; includeSubDomains');

// Article-specific SEO variables, consumed by the <head> meta tags,
// the JSON-LD schema and the article header further down this template.
$article_title = 'Python Scrapy Enterprise Guide: Scaling Web Scraping Operations';
$article_description = 'Master Scrapy for enterprise-scale web scraping operations. Learn advanced techniques, best practices, and optimization strategies for production deployments.';
$article_keywords = 'Python Scrapy enterprise, web scraping framework, Scrapy best practices, enterprise web scraping, Python data extraction, Scrapy optimization';
$article_author = 'Michael Thompson';
$canonical_url = 'https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide';
$article_published = '2025-05-15T09:00:00+00:00'; // ISO 8601, UTC
$article_modified = '2025-05-15T09:00:00+00:00';
// NOTE(review): most social crawlers do not render SVG og:images —
// consider a raster (PNG/JPEG) rendition; confirm with the sharing debuggers.
$og_image = 'https://ukdataservices.co.uk/assets/images/icon-web-scraping-v2.svg';
$read_time = 12; // estimated reading time in minutes
?>
|
|
<!DOCTYPE html>
|
|
<html lang="en">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
<title><?php echo htmlspecialchars($article_title); ?> | UK Data Services Blog</title>
|
|
<meta name="description" content="<?php echo htmlspecialchars($article_description); ?>">
|
|
<meta name="keywords" content="<?php echo htmlspecialchars($article_keywords); ?>">
|
|
<meta name="author" content="<?php echo htmlspecialchars($article_author); ?>">
|
|
<meta name="robots" content="index, follow">
|
|
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
|
|
|
|
<!-- Article-specific meta tags -->
|
|
<meta name="article:published_time" content="<?php echo $article_published; ?>">
|
|
<meta name="article:modified_time" content="<?php echo $article_modified; ?>">
|
|
<meta name="article:author" content="<?php echo htmlspecialchars($article_author); ?>">
|
|
<meta name="article:section" content="Web Scraping">
|
|
<meta name="article:tag" content="Python, Scrapy, Web Scraping, Enterprise, Framework">
|
|
|
|
<!-- Preload critical resources -->
|
|
<link rel="preload" href="../../assets/css/main.css" as="style">
|
|
<link rel="preload" href="../../assets/images/ukds-main-logo.png" as="image">
|
|
|
|
<!-- Open Graph / Social Media -->
|
|
<meta property="og:type" content="article">
|
|
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
|
|
<meta property="og:title" content="<?php echo htmlspecialchars($article_title); ?>">
|
|
<meta property="og:description" content="<?php echo htmlspecialchars($article_description); ?>">
|
|
<meta property="og:image" content="<?php echo htmlspecialchars($og_image); ?>">
|
|
|
|
<!-- Twitter Card -->
|
|
<meta name="twitter:card" content="summary_large_image">
|
|
<meta name="twitter:title" content="<?php echo htmlspecialchars($article_title); ?>">
|
|
<meta name="twitter:description" content="<?php echo htmlspecialchars($article_description); ?>">
|
|
<meta name="twitter:image" content="<?php echo htmlspecialchars($og_image); ?>">
|
|
|
|
<!-- Favicon and App Icons -->
|
|
<link rel="icon" type="image/svg+xml" href="../../assets/images/favicon.svg">
|
|
<link rel="apple-touch-icon" sizes="180x180" href="../../assets/images/apple-touch-icon.svg">
|
|
|
|
<!-- Fonts -->
|
|
<link rel="preconnect" href="https://fonts.googleapis.com">
|
|
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
|
|
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400;500;600;700&family=Lato:wght@300;400;500;600;700&display=swap" rel="stylesheet">
|
|
|
|
<!-- Styles -->
|
|
<link rel="stylesheet" href="../../assets/css/main.css">
|
|
<link rel="stylesheet" href="../../assets/css/cro-enhancements.css">
|
|
|
|
<!-- Article Schema -->
|
|
<script type="application/ld+json">
|
|
{
|
|
"@context": "https://schema.org",
|
|
"@type": "Article",
|
|
"mainEntityOfPage": {
|
|
"@type": "WebPage",
|
|
"@id": "<?php echo htmlspecialchars($canonical_url); ?>"
|
|
},
|
|
"headline": "<?php echo htmlspecialchars($article_title); ?>",
|
|
"description": "<?php echo htmlspecialchars($article_description); ?>",
|
|
"image": "<?php echo htmlspecialchars($og_image); ?>",
|
|
"author": {
|
|
"@type": "Organization",
|
|
"name": "UK Data Services",
|
|
"url": "https://ukdataservices.co.uk"
|
|
},
|
|
"publisher": {
|
|
"@type": "Organization",
|
|
"name": "UK Data Services",
|
|
"logo": {
|
|
"@type": "ImageObject",
|
|
"url": "https://ukdataservices.co.uk/assets/images/ukds-main-logo.png"
|
|
}
|
|
},
|
|
"datePublished": "<?php echo $article_published; ?>",
|
|
"dateModified": "<?php echo $article_modified; ?>"
|
|
}
|
|
</script>
|
|
|
|
<!-- HowTo Schema for Technical Guide -->
|
|
<script type="application/ld+json">
|
|
{
|
|
"@context": "https://schema.org",
|
|
"@type": "HowTo",
|
|
"name": "How to Set Up Scrapy for Enterprise Web Scraping Operations",
|
|
"description": "Step-by-step guide to implement and scale Python Scrapy for enterprise web scraping operations with best practices and optimization techniques.",
|
|
"image": "https://ukdataservices.co.uk/assets/images/icon-web-scraping-v2.svg",
|
|
"estimatedCost": {
|
|
"@type": "MonetaryAmount",
|
|
"currency": "GBP",
|
|
"value": "0"
|
|
},
|
|
"totalTime": "PT45M",
|
|
"supply": [
|
|
{
|
|
"@type": "HowToSupply",
|
|
"name": "Python 3.8+"
|
|
},
|
|
{
|
|
"@type": "HowToSupply",
|
|
"name": "Scrapy Framework"
|
|
},
|
|
{
|
|
"@type": "HowToSupply",
|
|
"name": "Development Environment"
|
|
}
|
|
],
|
|
"tool": [
|
|
{
|
|
"@type": "HowToTool",
|
|
"name": "Python IDE"
|
|
},
|
|
{
|
|
"@type": "HowToTool",
|
|
"name": "Command Line Interface"
|
|
}
|
|
],
|
|
"step": [
|
|
{
|
|
"@type": "HowToStep",
|
|
"name": "Install Scrapy Framework",
|
|
"text": "Install Scrapy using pip and set up your development environment",
|
|
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#installation"
|
|
},
|
|
{
|
|
"@type": "HowToStep",
|
|
"name": "Create Scrapy Project",
|
|
"text": "Initialize a new Scrapy project with proper directory structure",
|
|
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#project-setup"
|
|
},
|
|
{
|
|
"@type": "HowToStep",
|
|
"name": "Configure Settings",
|
|
"text": "Set up enterprise-grade configuration for production deployment",
|
|
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#configuration"
|
|
},
|
|
{
|
|
"@type": "HowToStep",
|
|
"name": "Implement Spiders",
|
|
"text": "Build scalable spider classes with proper error handling",
|
|
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#spider-development"
|
|
},
|
|
{
|
|
"@type": "HowToStep",
|
|
"name": "Deploy and Monitor",
|
|
"text": "Deploy to production and implement monitoring systems",
|
|
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#deployment"
|
|
}
|
|
]
|
|
}
|
|
</script>
|
|
</head>
|
|
<body>
|
|
<!-- Skip to content link for accessibility -->
|
|
<a href="#main-content" class="skip-to-content">Skip to main content</a>
|
|
|
|
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?><!-- Article Content -->
|
|
<main id="main-content">
|
|
<article class="article-page">
|
|
<div class="container">
|
|
<div class="article-meta">
|
|
<span class="category"><a href="/blog/categories/technology.php">Technology</a></span>
|
|
<time datetime="2025-05-15">15 May 2025</time>
|
|
<span class="read-time">12 min read</span>
|
|
</div>
|
|
<header class="article-header">
|
|
<h1><?php echo htmlspecialchars($article_title); ?></h1>
|
|
<p class="article-lead"><?php echo htmlspecialchars($article_description); ?></p>
|
|
|
|
<div class="article-author">
|
|
<div class="author-info">
|
|
<span>By <?php echo htmlspecialchars($article_author); ?></span>
|
|
</div>
|
|
<div class="share-buttons">
|
|
<a href="https://www.linkedin.com/sharing/share-offsite/?url=<?php echo urlencode($canonical_url); ?>" class="share-button linkedin" aria-label="Share on LinkedIn" rel="noopener" target="_blank">
|
|
<img loading="lazy" src="../../assets/images/icon-linkedin.svg" alt="LinkedIn">
|
|
</a>
|
|
<a href="https://twitter.com/intent/tweet?url=<?php echo urlencode($canonical_url); ?>&text=<?php echo urlencode($article_title); ?>" class="share-button twitter" aria-label="Share on Twitter" rel="noopener" target="_blank">
|
|
<img loading="lazy" src="../../assets/images/icon-twitter.svg" alt="Twitter">
|
|
</a>
|
|
</div>
|
|
</div>
|
|
</header>
|
|
|
|
<div class="article-content">
|
|
<div class="content-wrapper">
|
|
<h2>Why Scrapy for Enterprise Web Scraping?</h2>
|
|
<p>Scrapy stands out as the premier Python framework for large-scale web scraping operations. Unlike simple scripts or basic tools, Scrapy provides the robust architecture, built-in features, and extensibility that enterprise applications demand.</p>
|
|
|
|
<p>This comprehensive guide covers everything you need to know to deploy Scrapy in production environments, from initial setup to advanced optimization techniques.</p>
|
|
|
|
<h2>Enterprise-Grade Scrapy Architecture</h2>
|
|
|
|
<h3>Core Components Overview</h3>
|
|
<ul>
|
|
<li><strong>Scrapy Engine:</strong> Controls data flow between components</li>
|
|
<li><strong>Scheduler:</strong> Receives requests and queues them for processing</li>
|
|
<li><strong>Downloader:</strong> Fetches web pages and returns responses</li>
|
|
<li><strong>Spiders:</strong> Custom classes that define scraping logic</li>
|
|
<li><strong>Item Pipeline:</strong> Processes extracted data</li>
|
|
<li><strong>Middlewares:</strong> Hooks for customizing request/response processing</li>
|
|
</ul>
|
|
|
|
<h3>Production Project Structure</h3>
|
|
<pre><code>
|
|
enterprise_scraper/
|
|
├── scrapy.cfg
|
|
├── requirements.txt
|
|
├── docker-compose.yml
|
|
├── enterprise_scraper/
|
|
│ ├── __init__.py
|
|
│ ├── settings/
|
|
│ │ ├── __init__.py
|
|
│ │ ├── base.py
|
|
│ │ ├── development.py
|
|
│ │ ├── staging.py
|
|
│ │ └── production.py
|
|
│ ├── spiders/
|
|
│ │ ├── __init__.py
|
|
│ │ ├── base_spider.py
|
|
│ │ └── ecommerce_spider.py
|
|
│ ├── items.py
|
|
│ ├── pipelines.py
|
|
│ ├── middlewares.py
|
|
│ └── utils/
|
|
│ ├── __init__.py
|
|
│ ├── database.py
|
|
│ └── monitoring.py
|
|
├── deploy/
|
|
│ ├── Dockerfile
|
|
│ └── kubernetes/
|
|
└── tests/
|
|
├── unit/
|
|
└── integration/
|
|
</code></pre>
|
|
|
|
<h2>Advanced Configuration Management</h2>
|
|
|
|
<h3>Environment-Specific Settings</h3>
|
|
<pre><code>
|
|
# settings/base.py
|
|
BOT_NAME = 'enterprise_scraper'
|
|
SPIDER_MODULES = ['enterprise_scraper.spiders']
|
|
NEWSPIDER_MODULE = 'enterprise_scraper.spiders'
|
|
|
|
# Respect robots.txt for compliance
|
|
ROBOTSTXT_OBEY = True
|
|
|
|
# Configure concurrent requests
|
|
CONCURRENT_REQUESTS = 32
|
|
CONCURRENT_REQUESTS_PER_DOMAIN = 8
|
|
|
|
# Download delays for respectful scraping
|
|
DOWNLOAD_DELAY = 1
|
|
RANDOMIZE_DOWNLOAD_DELAY = 0.5
|
|
|
|
# Production settings/production.py
import os

from .base import *
|
|
|
|
# Increase concurrency for production
|
|
CONCURRENT_REQUESTS = 100
|
|
CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
|
|
|
# Enable autothrottling
|
|
AUTOTHROTTLE_ENABLED = True
|
|
AUTOTHROTTLE_START_DELAY = 1
|
|
AUTOTHROTTLE_MAX_DELAY = 10
|
|
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
|
|
|
|
# Logging configuration
|
|
LOG_LEVEL = 'INFO'
|
|
LOG_FILE = '/var/log/scrapy/scrapy.log'
|
|
|
|
# Database settings
|
|
DATABASE_URL = os.environ.get('DATABASE_URL')
|
|
REDIS_URL = os.environ.get('REDIS_URL')
|
|
</code></pre>
|
|
|
|
<h3>Dynamic Settings with Environment Variables</h3>
|
|
<pre><code>
|
|
import os
|
|
from scrapy.utils.project import get_project_settings
|
|
|
|
def get_scrapy_settings():
|
|
settings = get_project_settings()
|
|
|
|
# Environment-specific overrides
|
|
if os.environ.get('SCRAPY_ENV') == 'production':
|
|
settings.set('CONCURRENT_REQUESTS', 200)
|
|
settings.set('DOWNLOAD_DELAY', 0.5)
|
|
elif os.environ.get('SCRAPY_ENV') == 'development':
|
|
settings.set('CONCURRENT_REQUESTS', 16)
|
|
settings.set('DOWNLOAD_DELAY', 2)
|
|
|
|
return settings
|
|
</code></pre>
|
|
|
|
<h2>Enterprise Spider Development</h2>
|
|
|
|
<h3>Base Spider Class</h3>
|
|
<pre><code>
|
|
import scrapy
|
|
from scrapy.http import Request
|
|
from typing import Generator, Optional
|
|
import logging
import time
|
|
|
|
class BaseSpider(scrapy.Spider):
|
|
"""Base spider with common enterprise functionality"""
|
|
|
|
def __init__(self, *args, **kwargs):
|
|
super().__init__(*args, **kwargs)
|
|
self.setup_logging()
|
|
self.setup_monitoring()
|
|
|
|
def setup_logging(self):
|
|
"""Configure structured logging"""
|
|
self.logger = logging.getLogger(self.name)
|
|
|
|
def setup_monitoring(self):
|
|
"""Initialize monitoring metrics"""
|
|
self.stats = {
|
|
'pages_scraped': 0,
|
|
'items_extracted': 0,
|
|
'errors': 0
|
|
}
|
|
|
|
def parse_with_error_handling(self, response):
|
|
"""Parse with comprehensive error handling"""
|
|
try:
|
|
yield from self.parse_content(response)
|
|
except Exception as e:
|
|
self.logger.error(f"Error parsing {response.url}: {e}")
|
|
self.stats['errors'] += 1
|
|
|
|
def make_request(self, url: str, callback=None, meta: dict = None) -> Request:
|
|
"""Create request with standard metadata"""
|
|
return Request(
|
|
url=url,
|
|
callback=callback or self.parse_with_error_handling,
|
|
meta={
|
|
'spider_name': self.name,
|
|
'timestamp': time.time(),
|
|
**(meta or {})
|
|
},
|
|
dont_filter=False
|
|
)
|
|
</code></pre>
|
|
|
|
<h3>Advanced E-commerce Spider</h3>
|
|
<pre><code>
|
|
from enterprise_scraper.spiders.base_spider import BaseSpider
|
|
from enterprise_scraper.items import ProductItem
|
|
|
|
class EcommerceSpider(BaseSpider):
|
|
name = 'ecommerce'
|
|
allowed_domains = ['example-store.com']
|
|
|
|
custom_settings = {
|
|
'ITEM_PIPELINES': {
|
|
'enterprise_scraper.pipelines.ValidationPipeline': 300,
|
|
'enterprise_scraper.pipelines.DatabasePipeline': 400,
|
|
},
|
|
'DOWNLOAD_DELAY': 2,
|
|
}
|
|
|
|
def start_requests(self):
|
|
"""Generate initial requests with pagination"""
|
|
base_url = "https://example-store.com/products"
|
|
|
|
for page in range(1, 101): # First 100 pages
|
|
url = f"{base_url}?page={page}"
|
|
yield self.make_request(
|
|
url=url,
|
|
callback=self.parse_product_list,
|
|
meta={'page': page}
|
|
)
|
|
|
|
def parse_product_list(self, response):
|
|
"""Extract product URLs from listing pages"""
|
|
product_urls = response.css('.product-link::attr(href)').getall()
|
|
|
|
for url in product_urls:
|
|
yield self.make_request(
|
|
url=response.urljoin(url),
|
|
callback=self.parse_product,
|
|
meta={'category': response.meta.get('category')}
|
|
)
|
|
|
|
# Handle pagination
|
|
next_page = response.css('.pagination .next::attr(href)').get()
|
|
if next_page:
|
|
yield self.make_request(
|
|
url=response.urljoin(next_page),
|
|
callback=self.parse_product_list
|
|
)
|
|
|
|
def parse_product(self, response):
|
|
"""Extract product details"""
|
|
item = ProductItem()
|
|
|
|
item['url'] = response.url
|
|
item['name'] = response.css('h1.product-title::text').get()
|
|
item['price'] = self.extract_price(response)
|
|
item['description'] = response.css('.product-description::text').getall()
|
|
item['images'] = response.css('.product-images img::attr(src)').getall()
|
|
item['availability'] = response.css('.stock-status::text').get()
|
|
item['rating'] = self.extract_rating(response)
|
|
item['reviews_count'] = self.extract_reviews_count(response)
|
|
|
|
self.stats['items_extracted'] += 1
|
|
yield item
|
|
|
|
def extract_price(self, response):
|
|
"""Extract and normalize price data"""
|
|
price_text = response.css('.price::text').get()
|
|
if price_text:
|
|
# Remove currency symbols and normalize
|
|
import re
|
|
price = re.sub(r'[^\d.]', '', price_text)
|
|
return float(price) if price else None
|
|
return None
|
|
</code></pre>
|
|
|
|
<h2>Enterprise Pipeline System</h2>
|
|
|
|
<h3>Validation Pipeline</h3>
|
|
<pre><code>
|
|
from itemadapter import ItemAdapter
|
|
from scrapy.exceptions import DropItem
|
|
import validators
|
|
|
|
class ValidationPipeline:
|
|
"""Validate items before processing"""
|
|
|
|
def process_item(self, item, spider):
|
|
adapter = ItemAdapter(item)
|
|
|
|
# Required field validation
|
|
if not adapter.get('name'):
|
|
raise DropItem(f"Missing product name: {item}")
|
|
|
|
# URL validation
|
|
if not validators.url(adapter.get('url')):
|
|
raise DropItem(f"Invalid URL: {adapter.get('url')}")
|
|
|
|
# Price validation
|
|
price = adapter.get('price')
|
|
if price is not None:
|
|
try:
|
|
price = float(price)
|
|
if price < 0:
|
|
raise DropItem(f"Invalid price: {price}")
|
|
adapter['price'] = price
|
|
except (ValueError, TypeError):
|
|
raise DropItem(f"Invalid price format: {price}")
|
|
|
|
spider.logger.info(f"Item validated: {adapter.get('name')}")
|
|
return item
|
|
</code></pre>
|
|
|
|
<h3>Database Pipeline with Connection Pooling</h3>
|
|
<pre><code>
|
|
import asyncio
|
|
import asyncpg
|
|
from itemadapter import ItemAdapter
|
|
|
|
class DatabasePipeline:
|
|
"""Asynchronous database pipeline"""
|
|
|
|
def __init__(self, db_url, pool_size=20):
|
|
self.db_url = db_url
|
|
self.pool_size = pool_size
|
|
self.pool = None
|
|
|
|
@classmethod
|
|
def from_crawler(cls, crawler):
|
|
return cls(
|
|
db_url=crawler.settings.get('DATABASE_URL'),
|
|
pool_size=crawler.settings.get('DB_POOL_SIZE', 20)
|
|
)
|
|
|
|
async def open_spider(self, spider):
|
|
"""Initialize database connection pool"""
|
|
self.pool = await asyncpg.create_pool(
|
|
self.db_url,
|
|
min_size=5,
|
|
max_size=self.pool_size
|
|
)
|
|
spider.logger.info("Database connection pool created")
|
|
|
|
async def close_spider(self, spider):
|
|
"""Close database connection pool"""
|
|
if self.pool:
|
|
await self.pool.close()
|
|
spider.logger.info("Database connection pool closed")
|
|
|
|
async def process_item(self, item, spider):
|
|
"""Insert item into database"""
|
|
adapter = ItemAdapter(item)
|
|
|
|
async with self.pool.acquire() as connection:
|
|
await connection.execute('''
|
|
INSERT INTO products (url, name, price, description)
|
|
VALUES ($1, $2, $3, $4)
|
|
ON CONFLICT (url) DO UPDATE SET
|
|
name = EXCLUDED.name,
|
|
price = EXCLUDED.price,
|
|
description = EXCLUDED.description,
|
|
updated_at = NOW()
|
|
''',
|
|
adapter.get('url'),
|
|
adapter.get('name'),
|
|
adapter.get('price'),
|
|
'\n'.join(adapter.get('description', []))
|
|
)
|
|
|
|
spider.logger.info(f"Item saved: {adapter.get('name')}")
|
|
return item
|
|
</code></pre>
|
|
|
|
<h2>Middleware for Enterprise Features</h2>
|
|
|
|
<h3>Rotating Proxy Middleware</h3>
|
|
<pre><code>
|
|
import random
|
|
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
|
|
|
|
class RotatingProxyMiddleware(HttpProxyMiddleware):
|
|
"""Rotate proxies for each request"""
|
|
|
|
def __init__(self, proxy_list):
|
|
self.proxy_list = proxy_list
|
|
|
|
@classmethod
|
|
def from_crawler(cls, crawler):
|
|
proxy_list = crawler.settings.get('PROXY_LIST', [])
|
|
return cls(proxy_list)
|
|
|
|
def process_request(self, request, spider):
|
|
if self.proxy_list:
|
|
proxy = random.choice(self.proxy_list)
|
|
request.meta['proxy'] = proxy
|
|
spider.logger.debug(f"Using proxy: {proxy}")
|
|
|
|
return None
|
|
</code></pre>
|
|
|
|
<h3>Rate Limiting Middleware</h3>
|
|
<pre><code>
|
|
import time
|
|
from collections import defaultdict
|
|
from scrapy.downloadermiddlewares.retry import RetryMiddleware
|
|
|
|
class RateLimitMiddleware(RetryMiddleware):
|
|
"""Implement per-domain rate limiting"""
|
|
|
|
def __init__(self, settings):
|
|
super().__init__(settings)
|
|
self.domain_delays = defaultdict(float)
|
|
self.last_request_time = defaultdict(float)
|
|
|
|
def process_request(self, request, spider):
|
|
domain = request.url.split('/')[2]
|
|
current_time = time.time()
|
|
|
|
# Calculate required delay
|
|
min_delay = self.domain_delays.get(domain, 1.0)
|
|
time_since_last = current_time - self.last_request_time[domain]
|
|
|
|
if time_since_last < min_delay:
|
|
delay = min_delay - time_since_last
|
|
spider.logger.debug(f"Rate limiting {domain}: {delay:.2f}s")
|
|
time.sleep(delay)
|
|
|
|
self.last_request_time[domain] = time.time()
|
|
return None
|
|
</code></pre>
|
|
|
|
<h2>Monitoring and Observability</h2>
|
|
|
|
<h3>Custom Stats Collection</h3>
|
|
<pre><code>
|
|
from scrapy.statscollectors import StatsCollector
|
|
import time
|
|
|
|
class EnterpriseStatsCollector(StatsCollector):
|
|
"""Enhanced stats collection for monitoring"""
|
|
|
|
def __init__(self, crawler):
|
|
super().__init__(crawler)
|
|
self.start_time = time.time()
|
|
self.custom_stats = {}
|
|
|
|
def get_stats(self):
|
|
"""Enhanced stats with custom metrics"""
|
|
stats = super().get_stats()
|
|
|
|
# Add runtime statistics
|
|
runtime = time.time() - self.start_time
|
|
stats['runtime_seconds'] = runtime
|
|
|
|
# Add rate calculations
|
|
pages_count = stats.get('response_received_count', 0)
|
|
if runtime > 0:
|
|
stats['pages_per_minute'] = (pages_count / runtime) * 60
|
|
|
|
# Add custom metrics
|
|
stats.update(self.custom_stats)
|
|
|
|
return stats
|
|
|
|
def inc_value(self, key, count=1, start=0):
|
|
"""Increment custom counter"""
|
|
super().inc_value(key, count, start)
|
|
|
|
# Log significant milestones
|
|
current_value = self.get_value(key, 0)
|
|
if current_value % 1000 == 0: # Every 1000 items
|
|
self.crawler.spider.logger.info(f"{key}: {current_value}")
|
|
</code></pre>
|
|
|
|
<h2>Production Deployment</h2>
|
|
<p>Deploying Scrapy at enterprise scale requires robust infrastructure and monitoring. For comprehensive <a href="../../services/data-cleaning.php">data pipeline solutions</a>, consider our managed deployment services that handle scaling, monitoring, and compliance automatically.</p>
|
|
|
|
<h3>Docker Configuration</h3>
|
|
<pre><code>
|
|
# Dockerfile
|
|
FROM python:3.9-slim
|
|
|
|
WORKDIR /app
|
|
|
|
# Install system dependencies
|
|
RUN apt-get update && apt-get install -y \
|
|
gcc \
|
|
libc-dev \
|
|
libffi-dev \
|
|
libssl-dev \
|
|
&& rm -rf /var/lib/apt/lists/*
|
|
|
|
# Install Python dependencies
|
|
COPY requirements.txt .
|
|
RUN pip install --no-cache-dir -r requirements.txt
|
|
|
|
# Copy application code
|
|
COPY . .
|
|
|
|
# Create non-root user
|
|
RUN useradd -m -u 1000 scrapy && chown -R scrapy:scrapy /app
|
|
USER scrapy
|
|
|
|
# Default command
|
|
CMD ["scrapy", "crawl", "ecommerce"]
|
|
</code></pre>
|
|
|
|
<h3>Kubernetes Deployment</h3>
|
|
<pre><code>
|
|
apiVersion: apps/v1
|
|
kind: Deployment
|
|
metadata:
|
|
name: scrapy-deployment
|
|
spec:
|
|
replicas: 3
|
|
selector:
|
|
matchLabels:
|
|
app: scrapy
|
|
template:
|
|
metadata:
|
|
labels:
|
|
app: scrapy
|
|
spec:
|
|
containers:
|
|
- name: scrapy
|
|
image: enterprise-scrapy:latest
|
|
resources:
|
|
requests:
|
|
memory: "1Gi"
|
|
cpu: "500m"
|
|
limits:
|
|
memory: "2Gi"
|
|
cpu: "1000m"
|
|
env:
|
|
- name: SCRAPY_ENV
|
|
value: "production"
|
|
- name: DATABASE_URL
|
|
valueFrom:
|
|
secretKeyRef:
|
|
name: db-secret
|
|
key: url
|
|
---
|
|
apiVersion: v1
|
|
kind: Service
|
|
metadata:
|
|
name: scrapy-service
|
|
spec:
|
|
selector:
|
|
app: scrapy
|
|
ports:
|
|
- port: 6800
|
|
targetPort: 6800
|
|
</code></pre>
|
|
|
|
<h2>Performance Optimization</h2>
|
|
|
|
<h3>Memory Management</h3>
|
|
<ul>
|
|
<li><strong>Item Pipeline:</strong> Process items immediately to avoid memory buildup</li>
|
|
<li><strong>Response Caching:</strong> Disable for production unless specifically needed</li>
|
|
<li><strong>Request Filtering:</strong> Use duplicate filters efficiently</li>
|
|
<li><strong>Large Responses:</strong> Stream large files instead of loading into memory</li>
|
|
</ul>
|
|
|
|
<h3>Scaling Strategies</h3>
|
|
<ul>
|
|
<li><strong>Horizontal Scaling:</strong> Multiple spider instances</li>
|
|
<li><strong>Domain Sharding:</strong> Distribute domains across instances</li>
|
|
<li><strong>Queue Management:</strong> Redis-based distributed queuing</li>
|
|
<li><strong>Load Balancing:</strong> Distribute requests across proxy pools</li>
|
|
</ul>
|
|
|
|
<h2>Best Practices Summary</h2>
|
|
|
|
<h3>Code Organization</h3>
|
|
<ul>
|
|
<li>Use inheritance for common spider functionality</li>
|
|
<li>Separate settings by environment</li>
|
|
<li>Implement comprehensive error handling</li>
|
|
<li>Write unit tests for custom components</li>
|
|
</ul>
|
|
|
|
<h3>Operational Excellence</h3>
|
|
<ul>
|
|
<li>Monitor performance metrics continuously</li>
|
|
<li>Implement circuit breakers for external services</li>
|
|
<li>Use structured logging for better observability</li>
|
|
<li>Plan for graceful degradation</li>
|
|
</ul>
|
|
|
|
<h3>Compliance and Ethics</h3>
|
|
<ul>
|
|
<li>Respect robots.txt and rate limits</li>
|
|
<li>Implement proper user agent identification</li>
|
|
<li>Handle personal data according to GDPR</li>
|
|
<li>Maintain audit trails for data collection</li>
|
|
</ul>
|
|
|
|
<div class="article-cta">
|
|
<h3>Scale Your Scrapy Operations</h3>
|
|
<p>UK Data Services provides enterprise Scrapy development and deployment services. Let our experts help you build robust, scalable web scraping solutions.</p>
|
|
<a href="/quote" class="btn btn-primary">Get Scrapy Consultation</a>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<!-- Related Articles -->
|
|
<aside class="related-articles">
|
|
<h3>Related Articles</h3>
|
|
<div class="related-grid">
|
|
<article class="related-card">
|
|
<span class="category">Web Scraping</span>
|
|
<h4><a href="javascript-heavy-sites-scraping.php">Scraping JavaScript-Heavy Sites: Advanced Techniques</a></h4>
|
|
<span class="read-time">6 min read</span>
|
|
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>
|
|
|
|
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
|
|
</div>
|
|
</article>
|
|
<article class="related-card">
|
|
<span class="category">Technology</span>
|
|
<h4><a href="cloud-native-scraping-architecture.php">Cloud-Native Scraping Architecture for Enterprise Scale</a></h4>
|
|
<span class="read-time">11 min read</span>
|
|
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>
|
|
|
|
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
|
|
</div>
|
|
</article>
|
|
<article class="related-card">
|
|
<span class="category">Compliance</span>
|
|
<h4><a href="web-scraping-compliance-uk-guide.php">Complete Guide to Web Scraping Compliance in the UK</a></h4>
|
|
<span class="read-time">12 min read</span>
|
|
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>
|
|
|
|
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
|
|
</div>
|
|
</article>
|
|
</div>
|
|
</aside>
|
|
</div>
|
|
            <!-- Page-level author bio and footer for this article (included once). -->
            <?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>

            <?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
        <!-- NOTE(review): no matching opening tag for the </div> below is visible in
             this file (the .container div already closes above) — confirm it pairs
             with markup emitted by an include, otherwise it should be removed. -->
        </div>
|
|
</article>
|
|
</main>
|
|
|
|
<!-- Footer -->
|
|
<footer class="footer">
|
|
<div class="container">
|
|
<div class="footer-content">
|
|
<div class="footer-section">
|
|
<div class="footer-logo">
|
|
<img loading="lazy" src="../../assets/images/logo-white.svg" alt="UK Data Services" loading="lazy">
|
|
</div>
|
|
<p>Enterprise data intelligence solutions for modern British business.</p>
|
|
</div>
|
|
|
|
<div class="footer-section">
|
|
<h3>Quick Links</h3>
|
|
<ul>
|
|
<li><a href="/#services">Services</a></li>
|
|
<li><a href="/blog/">Blog</a></li>
|
|
<li><a href="/case-studies/">Case Studies</a></li>
|
|
<li><a href="/about">About</a></li>
|
|
<li><a href="/#contact">Contact</a></li>
|
|
</ul>
|
|
</div>
|
|
|
|
<div class="footer-section">
|
|
<h3>Legal</h3>
|
|
<ul>
|
|
<li><a href="/privacy-policy">Privacy Policy</a></li>
|
|
<li><a href="/terms-of-service">Terms of Service</a></li>
|
|
<li><a href="/cookie-policy">Cookie Policy</a></li>
|
|
<li><a href="/gdpr-compliance">GDPR Compliance</a></li>
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="footer-bottom">
|
|
<p>© <?php echo date('Y'); ?> UK Data Services. All rights reserved.</p>
|
|
<div class="social-links">
|
|
<a href="https://linkedin.com/company/uk-data-services" aria-label="LinkedIn" rel="noopener" target="_blank">
|
|
<img loading="lazy" src="../../assets/images/icon-linkedin.svg" alt="LinkedIn" loading="lazy">
|
|
</a>
|
|
<a href="https://twitter.com/ukdataservices" aria-label="Twitter" rel="noopener" target="_blank">
|
|
<img loading="lazy" src="../../assets/images/icon-twitter.svg" alt="Twitter" loading="lazy">
|
|
</a>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</footer>
|
|
|
|
<!-- Scripts -->
|
|
<script src="../../assets/js/main.js"></script>
|
|
<script src="../../assets/js/cro-enhancements.js"></script>
|
|
</body>
|
|
</html>
|