<?php
// Enhanced security headers
header('Strict-Transport-Security: max-age=31536000; includeSubDomains');
// Article-specific SEO variables
$article_title = "Python Scrapy Enterprise Guide: Scaling Web Scraping Operations";
$article_description = "Master Scrapy for enterprise-scale web scraping operations. Learn advanced techniques, best practices, and optimization strategies for production deployments.";
$article_keywords = "Python Scrapy enterprise, web scraping framework, Scrapy best practices, enterprise web scraping, Python data extraction, Scrapy optimization";
$article_author = "UK Data Services Technical Team";
$canonical_url = "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide";
$article_published = "2025-05-15T09:00:00+00:00";
$article_modified = "2025-05-15T09:00:00+00:00";
$og_image = "https://ukdataservices.co.uk/assets/images/icon-web-scraping-v2.svg";
$read_time = 12;
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title><?php echo htmlspecialchars($article_title); ?> | UK Data Services Blog</title>
<meta name="description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta name="keywords" content="<?php echo htmlspecialchars($article_keywords); ?>">
<meta name="author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta name="robots" content="index, follow">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
<!-- Article-specific meta tags -->
<meta name="article:published_time" content="<?php echo $article_published; ?>">
<meta name="article:modified_time" content="<?php echo $article_modified; ?>">
<meta name="article:author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta name="article:section" content="Web Scraping">
<meta name="article:tag" content="Python, Scrapy, Web Scraping, Enterprise, Framework">
<!-- Preload critical resources -->
<link rel="preload" href="../../assets/css/main.css" as="style">
<link rel="preload" href="../../assets/images/ukds-main-logo.png" as="image">
<!-- Open Graph / Social Media -->
<meta property="og:type" content="article">
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($article_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta property="og:image" content="<?php echo htmlspecialchars($og_image); ?>">
<!-- Twitter Card -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="<?php echo htmlspecialchars($article_title); ?>">
<meta name="twitter:description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta name="twitter:image" content="<?php echo htmlspecialchars($og_image); ?>">
<!-- Favicon and App Icons -->
<link rel="icon" type="image/svg+xml" href="../../assets/images/favicon.svg">
<link rel="apple-touch-icon" sizes="180x180" href="../../assets/images/apple-touch-icon.svg">
<!-- Fonts -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400;500;600;700&family=Lato:wght@300;400;500;600;700&display=swap" rel="stylesheet">
<!-- Styles -->
<link rel="stylesheet" href="../../assets/css/main.css">
<link rel="stylesheet" href="../../assets/css/cro-enhancements.css">
<!-- Article Schema -->
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"mainEntityOfPage": {
"@type": "WebPage",
"@id": "<?php echo htmlspecialchars($canonical_url); ?>"
},
"headline": "<?php echo htmlspecialchars($article_title); ?>",
"description": "<?php echo htmlspecialchars($article_description); ?>",
"image": "<?php echo htmlspecialchars($og_image); ?>",
"author": {
"@type": "Organization",
"name": "UK Data Services",
"url": "https://ukdataservices.co.uk"
},
"publisher": {
"@type": "Organization",
"name": "UK Data Services",
"logo": {
"@type": "ImageObject",
"url": "https://ukdataservices.co.uk/assets/images/ukds-main-logo.png"
}
},
"datePublished": "<?php echo $article_published; ?>",
"dateModified": "<?php echo $article_modified; ?>"
}
</script>
<!-- HowTo Schema for Technical Guide -->
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "HowTo",
"name": "How to Set Up Scrapy for Enterprise Web Scraping Operations",
"description": "Step-by-step guide to implement and scale Python Scrapy for enterprise web scraping operations with best practices and optimization techniques.",
"image": "https://ukdataservices.co.uk/assets/images/icon-web-scraping-v2.svg",
"estimatedCost": {
"@type": "MonetaryAmount",
"currency": "GBP",
"value": "0"
},
"totalTime": "PT45M",
"supply": [
{
"@type": "HowToSupply",
"name": "Python 3.8+"
},
{
"@type": "HowToSupply",
"name": "Scrapy Framework"
},
{
"@type": "HowToSupply",
"name": "Development Environment"
}
],
"tool": [
{
"@type": "HowToTool",
"name": "Python IDE"
},
{
"@type": "HowToTool",
"name": "Command Line Interface"
}
],
"step": [
{
"@type": "HowToStep",
"name": "Install Scrapy Framework",
"text": "Install Scrapy using pip and set up your development environment",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#installation"
},
{
"@type": "HowToStep",
"name": "Create Scrapy Project",
"text": "Initialize a new Scrapy project with proper directory structure",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#project-setup"
},
{
"@type": "HowToStep",
"name": "Configure Settings",
"text": "Set up enterprise-grade configuration for production deployment",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#configuration"
},
{
"@type": "HowToStep",
"name": "Implement Spiders",
"text": "Build scalable spider classes with proper error handling",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#spider-development"
},
{
"@type": "HowToStep",
"name": "Deploy and Monitor",
"text": "Deploy to production and implement monitoring systems",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#deployment"
}
]
}
</script>
</head>
<body>
<!-- Skip to content link for accessibility -->
<a href="#main-content" class="skip-to-content">Skip to main content</a>
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?><!-- Article Content -->
<main id="main-content">
<article class="article-page">
<div class="container">
<div class="article-meta">
<span class="category"><a href="/blog/categories/technology.php">Technology</a></span>
<time datetime="2025-05-15">15 May 2025</time>
<span class="read-time">12 min read</span>
</div>
<header class="article-header">
<h1><?php echo htmlspecialchars($article_title); ?></h1>
<p class="article-lead"><?php echo htmlspecialchars($article_description); ?></p>
<div class="article-author">
<div class="author-info">
<span>By <?php echo htmlspecialchars($article_author); ?></span>
</div>
<div class="share-buttons">
<a href="https://www.linkedin.com/sharing/share-offsite/?url=<?php echo urlencode($canonical_url); ?>" class="share-button linkedin" aria-label="Share on LinkedIn" rel="noopener" target="_blank">
<img loading="lazy" src="../../assets/images/icon-linkedin.svg" alt="LinkedIn">
</a>
<a href="https://twitter.com/intent/tweet?url=<?php echo urlencode($canonical_url); ?>&text=<?php echo urlencode($article_title); ?>" class="share-button twitter" aria-label="Share on Twitter" rel="noopener" target="_blank">
<img loading="lazy" src="../../assets/images/icon-twitter.svg" alt="Twitter">
</a>
</div>
</div>
</header>
<div class="article-content">
<div class="content-wrapper">
<h2>Why Scrapy for Enterprise Web Scraping?</h2>
<p>Scrapy stands out as the premier Python framework for large-scale web scraping operations. Unlike simple scripts or basic tools, Scrapy provides the robust architecture, built-in features, and extensibility that enterprise applications demand.</p>
<p>This comprehensive guide covers everything you need to know to deploy Scrapy in production environments, from initial setup to advanced optimization techniques.</p>
<h2>Enterprise-Grade Scrapy Architecture</h2>
<h3>Core Components Overview</h3>
<ul>
<li><strong>Scrapy Engine:</strong> Controls data flow between components</li>
<li><strong>Scheduler:</strong> Receives requests and queues them for processing</li>
<li><strong>Downloader:</strong> Fetches web pages and returns responses</li>
<li><strong>Spiders:</strong> Custom classes that define scraping logic</li>
<li><strong>Item Pipeline:</strong> Processes extracted data</li>
<li><strong>Middlewares:</strong> Hooks for customizing request/response processing</li>
</ul>
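<p>To make the data flow concrete, here is a minimal spider (a sketch against the public quotes.toscrape.com practice site; the selectors are specific to that site). Items yielded from parse() travel through the item pipeline, while yielded requests go back through the engine to the scheduler.</p>
<pre><code>
# minimal_spider.py -- illustrative sketch using the quotes.toscrape.com practice site
import scrapy


class MinimalSpider(scrapy.Spider):
    name = 'minimal'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        # Each dict yielded here is handed to the item pipeline
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.css('small.author::text').get(),
            }

        # Each request yielded here goes back to the scheduler via the engine
        next_page = response.css('li.next a::attr(href)').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)
</code></pre>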
<h3>Production Project Structure</h3>
<pre><code>
enterprise_scraper/
├── scrapy.cfg
├── requirements.txt
├── docker-compose.yml
├── enterprise_scraper/
│   ├── __init__.py
│   ├── settings/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── development.py
│   │   ├── staging.py
│   │   └── production.py
│   ├── spiders/
│   │   ├── __init__.py
│   │   ├── base_spider.py
│   │   └── ecommerce_spider.py
│   ├── items.py
│   ├── pipelines.py
│   ├── middlewares.py
│   └── utils/
│       ├── __init__.py
│       ├── database.py
│       └── monitoring.py
├── deploy/
│   ├── Dockerfile
│   └── kubernetes/
└── tests/
    ├── unit/
    └── integration/
</code></pre>
<h2>Advanced Configuration Management</h2>
<h3>Environment-Specific Settings</h3>
<pre><code>
# settings/base.py
BOT_NAME = 'enterprise_scraper'
SPIDER_MODULES = ['enterprise_scraper.spiders']
NEWSPIDER_MODULE = 'enterprise_scraper.spiders'
# Respect robots.txt for compliance
ROBOTSTXT_OBEY = True
# Configure concurrent requests
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# Download delays for respectful scraping
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = True   # vary the delay between 0.5x and 1.5x DOWNLOAD_DELAY

# settings/production.py
import os

from .base import *
# Increase concurrency for production
CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Enable autothrottling
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
# Logging configuration
LOG_LEVEL = 'INFO'
LOG_FILE = '/var/log/scrapy/scrapy.log'
# Database settings
DATABASE_URL = os.environ.get('DATABASE_URL')
REDIS_URL = os.environ.get('REDIS_URL')
</code></pre>
<h3>Dynamic Settings with Environment Variables</h3>
<pre><code>
import os

from scrapy.utils.project import get_project_settings


def get_scrapy_settings():
    settings = get_project_settings()
    # Environment-specific overrides
    if os.environ.get('SCRAPY_ENV') == 'production':
        settings.set('CONCURRENT_REQUESTS', 200)
        settings.set('DOWNLOAD_DELAY', 0.5)
    elif os.environ.get('SCRAPY_ENV') == 'development':
        settings.set('CONCURRENT_REQUESTS', 16)
        settings.set('DOWNLOAD_DELAY', 2)
    return settings
</code></pre>
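<p>When launching crawls from your own script rather than the scrapy crawl command, the helper above can feed a CrawlerProcess directly. A sketch follows; the import paths are assumptions, and EcommerceSpider is the spider defined in the next section.</p>
<pre><code>
# run_crawl.py -- illustrative launcher; module paths are assumptions
from scrapy.crawler import CrawlerProcess

from enterprise_scraper.spiders.ecommerce_spider import EcommerceSpider
from enterprise_scraper.utils.settings import get_scrapy_settings  # hypothetical location


def main():
    process = CrawlerProcess(get_scrapy_settings())
    process.crawl(EcommerceSpider)
    process.start()  # blocks until all scheduled crawls finish


if __name__ == '__main__':
    main()
</code></pre>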
<h2>Enterprise Spider Development</h2>
<h3>Base Spider Class</h3>
<pre><code>
import logging
import time
from typing import Optional

import scrapy
from scrapy.http import Request


class BaseSpider(scrapy.Spider):
    """Base spider with common enterprise functionality"""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.setup_logging()
        self.setup_monitoring()

    def setup_logging(self):
        """Configure logging (scrapy.Spider already exposes self.logger)"""
        logging.getLogger(self.name).setLevel(logging.INFO)

    def setup_monitoring(self):
        """Initialize monitoring metrics"""
        self.stats = {
            'pages_scraped': 0,
            'items_extracted': 0,
            'errors': 0
        }

    def parse_with_error_handling(self, response):
        """Parse with comprehensive error handling"""
        try:
            yield from self.parse_content(response)  # parse_content is implemented by subclasses
        except Exception as e:
            self.logger.error(f"Error parsing {response.url}: {e}")
            self.stats['errors'] += 1

    def make_request(self, url: str, callback=None, meta: Optional[dict] = None) -> Request:
        """Create request with standard metadata"""
        return Request(
            url=url,
            callback=callback or self.parse_with_error_handling,
            meta={
                'spider_name': self.name,
                'timestamp': time.time(),
                **(meta or {})
            },
            dont_filter=False
        )
</code></pre>
<h3>Advanced E-commerce Spider</h3>
<pre><code>
import re

from enterprise_scraper.spiders.base_spider import BaseSpider
from enterprise_scraper.items import ProductItem


class EcommerceSpider(BaseSpider):
    name = 'ecommerce'
    allowed_domains = ['example-store.com']

    custom_settings = {
        'ITEM_PIPELINES': {
            'enterprise_scraper.pipelines.ValidationPipeline': 300,
            'enterprise_scraper.pipelines.DatabasePipeline': 400,
        },
        'DOWNLOAD_DELAY': 2,
    }

    def start_requests(self):
        """Generate initial requests with pagination"""
        base_url = "https://example-store.com/products"
        for page in range(1, 101):  # First 100 pages
            url = f"{base_url}?page={page}"
            yield self.make_request(
                url=url,
                callback=self.parse_product_list,
                meta={'page': page}
            )

    def parse_product_list(self, response):
        """Extract product URLs from listing pages"""
        product_urls = response.css('.product-link::attr(href)').getall()
        for url in product_urls:
            yield self.make_request(
                url=response.urljoin(url),
                callback=self.parse_product,
                meta={'category': response.meta.get('category')}
            )

        # Handle pagination
        next_page = response.css('.pagination .next::attr(href)').get()
        if next_page:
            yield self.make_request(
                url=response.urljoin(next_page),
                callback=self.parse_product_list
            )

    def parse_product(self, response):
        """Extract product details"""
        item = ProductItem()
        item['url'] = response.url
        item['name'] = response.css('h1.product-title::text').get()
        item['price'] = self.extract_price(response)
        item['description'] = response.css('.product-description::text').getall()
        item['images'] = response.css('.product-images img::attr(src)').getall()
        item['availability'] = response.css('.stock-status::text').get()
        item['rating'] = self.extract_rating(response)
        item['reviews_count'] = self.extract_reviews_count(response)
        self.stats['items_extracted'] += 1
        yield item

    def extract_price(self, response):
        """Extract and normalize price data"""
        price_text = response.css('.price::text').get()
        if price_text:
            # Remove currency symbols and normalize
            price = re.sub(r'[^\d.]', '', price_text)
            return float(price) if price else None
        return None

    def extract_rating(self, response):
        """Extract the average rating (selector is illustrative)"""
        rating_text = response.css('.rating::attr(data-rating)').get()
        return float(rating_text) if rating_text else None

    def extract_reviews_count(self, response):
        """Extract the number of reviews (selector is illustrative)"""
        count_text = response.css('.reviews-count::text').get() or ''
        digits = re.sub(r'[^\d]', '', count_text)
        return int(digits) if digits else None
</code></pre>
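<p>The spider imports ProductItem from items.py, which the guide has not shown yet. A minimal definition covering the fields used above looks like this:</p>
<pre><code>
# items.py
import scrapy


class ProductItem(scrapy.Item):
    """Container for the fields populated by EcommerceSpider.parse_product"""
    url = scrapy.Field()
    name = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    images = scrapy.Field()
    availability = scrapy.Field()
    rating = scrapy.Field()
    reviews_count = scrapy.Field()
</code></pre>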
<h2>Enterprise Pipeline System</h2>
<h3>Validation Pipeline</h3>
<pre><code>
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
import validators  # third-party package: pip install validators


class ValidationPipeline:
    """Validate items before processing"""

    def process_item(self, item, spider):
        adapter = ItemAdapter(item)

        # Required field validation
        if not adapter.get('name'):
            raise DropItem(f"Missing product name: {item}")

        # URL validation
        if not validators.url(adapter.get('url')):
            raise DropItem(f"Invalid URL: {adapter.get('url')}")

        # Price validation
        price = adapter.get('price')
        if price is not None:
            try:
                price = float(price)
                if price < 0:
                    raise DropItem(f"Invalid price: {price}")
                adapter['price'] = price
            except (ValueError, TypeError):
                raise DropItem(f"Invalid price format: {price}")

        spider.logger.info(f"Item validated: {adapter.get('name')}")
        return item
</code></pre>
<h3>Database Pipeline with Connection Pooling</h3>
<pre><code>
import asyncpg  # third-party async PostgreSQL driver
from itemadapter import ItemAdapter


class DatabasePipeline:
    """Asynchronous database pipeline"""

    def __init__(self, db_url, pool_size=20):
        self.db_url = db_url
        self.pool_size = pool_size
        self.pool = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            db_url=crawler.settings.get('DATABASE_URL'),
            pool_size=crawler.settings.get('DB_POOL_SIZE', 20)
        )

    async def open_spider(self, spider):
        """Initialize database connection pool"""
        self.pool = await asyncpg.create_pool(
            self.db_url,
            min_size=5,
            max_size=self.pool_size
        )
        spider.logger.info("Database connection pool created")

    async def close_spider(self, spider):
        """Close database connection pool"""
        if self.pool:
            await self.pool.close()
        spider.logger.info("Database connection pool closed")

    async def process_item(self, item, spider):
        """Insert item into database"""
        adapter = ItemAdapter(item)
        async with self.pool.acquire() as connection:
            await connection.execute('''
                INSERT INTO products (url, name, price, description)
                VALUES ($1, $2, $3, $4)
                ON CONFLICT (url) DO UPDATE SET
                    name = EXCLUDED.name,
                    price = EXCLUDED.price,
                    description = EXCLUDED.description,
                    updated_at = NOW()
                ''',
                adapter.get('url'),
                adapter.get('name'),
                adapter.get('price'),
                '\n'.join(adapter.get('description', []))
            )
        spider.logger.info(f"Item saved: {adapter.get('name')}")
        return item
</code></pre>
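<p>Because process_item is a native coroutine that awaits asyncpg, the crawl must run on Scrapy's asyncio-compatible Twisted reactor. Enable it in settings (asyncpg itself is installed separately):</p>
<pre><code>
# settings/base.py (excerpt)
# Required so Scrapy can await asyncio-based code such as asyncpg
TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
</code></pre>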
<h2>Middleware for Enterprise Features</h2>
<h3>Rotating Proxy Middleware</h3>
<pre><code>
import random


class RotatingProxyMiddleware:
    """Rotate proxies for each request.

    A standalone downloader middleware: setting request.meta['proxy'] is enough,
    so there is no need to subclass the built-in HttpProxyMiddleware.
    """

    def __init__(self, proxy_list):
        self.proxy_list = proxy_list

    @classmethod
    def from_crawler(cls, crawler):
        proxy_list = crawler.settings.get('PROXY_LIST', [])
        return cls(proxy_list)

    def process_request(self, request, spider):
        if self.proxy_list:
            proxy = random.choice(self.proxy_list)
            request.meta['proxy'] = proxy
            spider.logger.debug(f"Using proxy: {proxy}")
        return None
</code></pre>
<h3>Rate Limiting Middleware</h3>
<pre><code>
import time
from collections import defaultdict
from urllib.parse import urlparse


class RateLimitMiddleware:
    """Implement per-domain rate limiting"""

    def __init__(self, settings):
        self.domain_delays = defaultdict(lambda: 1.0)
        self.last_request_time = defaultdict(float)

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)

    def process_request(self, request, spider):
        domain = urlparse(request.url).netloc
        current_time = time.time()

        # Calculate required delay
        min_delay = self.domain_delays[domain]
        time_since_last = current_time - self.last_request_time[domain]
        if time_since_last < min_delay:
            delay = min_delay - time_since_last
            spider.logger.debug(f"Rate limiting {domain}: {delay:.2f}s")
            # Note: time.sleep blocks the reactor; prefer DOWNLOAD_DELAY or
            # AutoThrottle for production-grade, non-blocking throttling
            time.sleep(delay)
        self.last_request_time[domain] = time.time()
        return None
</code></pre>
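<p>Neither middleware does anything until it is registered in settings. The snippet below shows one way to wire them up; the priority numbers and proxy URLs are illustrative.</p>
<pre><code>
# settings/production.py (excerpt)
DOWNLOADER_MIDDLEWARES = {
    'enterprise_scraper.middlewares.RateLimitMiddleware': 540,
    'enterprise_scraper.middlewares.RotatingProxyMiddleware': 610,
}

# Read by RotatingProxyMiddleware.from_crawler
PROXY_LIST = [
    'http://proxy-1.internal.example:8000',
    'http://proxy-2.internal.example:8000',
]
</code></pre>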
<h2>Monitoring and Observability</h2>
<h3>Custom Stats Collection</h3>
<pre><code>
import time

from scrapy.statscollectors import StatsCollector


class EnterpriseStatsCollector(StatsCollector):
    """Enhanced stats collection for monitoring"""

    def __init__(self, crawler):
        super().__init__(crawler)
        self.crawler = crawler  # keep a reference for milestone logging
        self.start_time = time.time()
        self.custom_stats = {}

    def get_stats(self):
        """Enhanced stats with custom metrics"""
        stats = super().get_stats()

        # Add runtime statistics
        runtime = time.time() - self.start_time
        stats['runtime_seconds'] = runtime

        # Add rate calculations
        pages_count = stats.get('response_received_count', 0)
        if runtime > 0:
            stats['pages_per_minute'] = (pages_count / runtime) * 60

        # Add custom metrics
        stats.update(self.custom_stats)
        return stats

    def inc_value(self, key, count=1, start=0, spider=None):
        """Increment counter and log milestones"""
        super().inc_value(key, count, start, spider)

        # Log significant milestones
        current_value = self.get_value(key, 0)
        if current_value and current_value % 1000 == 0:  # Every 1000 increments
            self.crawler.spider.logger.info(f"{key}: {current_value}")
</code></pre>
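<p>Point Scrapy at the custom collector via the STATS_CLASS setting. The module path below assumes the class lives in utils/monitoring.py, as in the project structure shown earlier.</p>
<pre><code>
# settings/base.py (excerpt)
STATS_CLASS = 'enterprise_scraper.utils.monitoring.EnterpriseStatsCollector'
</code></pre>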
<h2>Production Deployment</h2>
<p>Deploying Scrapy at enterprise scale requires robust infrastructure and monitoring. For comprehensive <a href="../../services/data-cleaning.php">data pipeline solutions</a>, consider our managed deployment services that handle scaling, monitoring, and compliance automatically.</p>
<h3>Docker Configuration</h3>
<pre><code>
# Dockerfile
FROM python:3.9-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
libc-dev \
libffi-dev \
libssl-dev \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Create non-root user
RUN useradd -m -u 1000 scrapy && chown -R scrapy:scrapy /app
USER scrapy
# Default command
CMD ["scrapy", "crawl", "ecommerce"]
</code></pre>
<h3>Kubernetes Deployment</h3>
<pre><code>
apiVersion: apps/v1
kind: Deployment
metadata:
  name: scrapy-deployment
spec:
  replicas: 3
  selector:
    matchLabels:
      app: scrapy
  template:
    metadata:
      labels:
        app: scrapy
    spec:
      containers:
      - name: scrapy
        image: enterprise-scrapy:latest
        resources:
          requests:
            memory: "1Gi"
            cpu: "500m"
          limits:
            memory: "2Gi"
            cpu: "1000m"
        env:
        - name: SCRAPY_ENV
          value: "production"
        - name: DATABASE_URL
          valueFrom:
            secretKeyRef:
              name: db-secret
              key: url
---
apiVersion: v1
kind: Service
metadata:
  name: scrapy-service
spec:
  selector:
    app: scrapy
  ports:
  - port: 6800
    targetPort: 6800
</code></pre>
<h2>Performance Optimization</h2>
<h3>Memory Management</h3>
<ul>
<li><strong>Item Pipeline:</strong> Process items immediately to avoid memory buildup</li>
<li><strong>Response Caching:</strong> Disable for production unless specifically needed</li>
<li><strong>Request Filtering:</strong> Use duplicate filters efficiently</li>
<li><strong>Large Responses:</strong> Stream large files instead of loading into memory</li>
</ul>
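<p>Several of these points map onto built-in settings. A sketch of values worth tuning follows; the thresholds are illustrative, and the memory-usage extension only works on POSIX platforms.</p>
<pre><code>
# settings/production.py (excerpt)
# Built-in memory monitoring
MEMUSAGE_ENABLED = True
MEMUSAGE_WARNING_MB = 1500
MEMUSAGE_LIMIT_MB = 1800          # stop the crawl before the container is OOM-killed

# Safety valves for long-running crawls
CLOSESPIDER_ITEMCOUNT = 500000    # stop after this many items
CLOSESPIDER_TIMEOUT = 21600       # or after six hours

# Keep the HTTP cache off in production unless you need it
HTTPCACHE_ENABLED = False
</code></pre>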
<h3>Scaling Strategies</h3>
<ul>
<li><strong>Horizontal Scaling:</strong> Multiple spider instances</li>
<li><strong>Domain Sharding:</strong> Distribute domains across instances</li>
<li><strong>Queue Management:</strong> Redis-based distributed queuing</li>
<li><strong>Load Balancing:</strong> Distribute requests across proxy pools</li>
</ul>
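<p>The Redis-based queuing mentioned above is commonly handled by the third-party scrapy-redis extension, which swaps in a shared scheduler and duplicate filter so multiple spider instances can pull from one queue. A minimal sketch, assuming pip install scrapy-redis:</p>
<pre><code>
# settings/production.py (excerpt) -- distributed queuing via scrapy-redis
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
SCHEDULER_PERSIST = True          # keep the queue and dupefilter between runs
REDIS_URL = os.environ.get('REDIS_URL', 'redis://localhost:6379/0')
</code></pre>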
<h2>Best Practices Summary</h2>
<h3>Code Organization</h3>
<ul>
<li>Use inheritance for common spider functionality</li>
<li>Separate settings by environment</li>
<li>Implement comprehensive error handling</li>
<li>Write unit tests for custom components</li>
</ul>
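<p>As an example of the unit-testing point above, the ValidationPipeline can be exercised without running a crawl. A sketch using pytest and a mocked spider object:</p>
<pre><code>
# tests/unit/test_validation_pipeline.py
from unittest.mock import MagicMock

import pytest
from scrapy.exceptions import DropItem

from enterprise_scraper.pipelines import ValidationPipeline


def test_drops_item_without_name():
    pipeline = ValidationPipeline()
    spider = MagicMock()  # stands in for a real spider; provides .logger
    with pytest.raises(DropItem):
        pipeline.process_item({'url': 'https://example.com/p/1'}, spider)


def test_normalises_price_to_float():
    pipeline = ValidationPipeline()
    item = {'name': 'Widget', 'url': 'https://example.com/p/1', 'price': '19.99'}
    result = pipeline.process_item(item, MagicMock())
    assert result['price'] == 19.99
</code></pre>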
<h3>Operational Excellence</h3>
<ul>
<li>Monitor performance metrics continuously</li>
<li>Implement circuit breakers for external services</li>
<li>Use structured logging for better observability</li>
<li>Plan for graceful degradation</li>
</ul>
<h3>Compliance and Ethics</h3>
<ul>
<li>Respect robots.txt and rate limits</li>
<li>Implement proper user agent identification</li>
<li>Handle personal data according to GDPR</li>
<li>Maintain audit trails for data collection</li>
</ul>
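<p>Several of these compliance points map directly onto settings. A sketch of polite defaults follows; the user agent string is illustrative, so substitute your own organisation and contact details.</p>
<pre><code>
# settings/base.py (excerpt) -- identifiable, polite crawling
ROBOTSTXT_OBEY = True
USER_AGENT = 'EnterpriseScraperBot/1.0 (+https://example.com/bot; data-team@example.com)'

# Keep per-domain pressure modest even when global concurrency is high
AUTOTHROTTLE_ENABLED = True
CONCURRENT_REQUESTS_PER_DOMAIN = 8
</code></pre>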
<div class="article-cta">
<h3>Scale Your Scrapy Operations</h3>
<p>UK Data Services provides enterprise Scrapy development and deployment services. Let our experts help you build robust, scalable web scraping solutions.</p>
<a href="/quote" class="btn btn-primary">Get Scrapy Consultation</a>
</div>
</div>
</div>
<!-- Related Articles -->
<aside class="related-articles">
<h3>Related Articles</h3>
<div class="related-grid">
<article class="related-card">
<span class="category">Web Scraping</span>
<h4><a href="javascript-heavy-sites-scraping.php">Scraping JavaScript-Heavy Sites: Advanced Techniques</a></h4>
<span class="read-time">6 min read</span>
</article>
<article class="related-card">
<span class="category">Technology</span>
<h4><a href="cloud-native-scraping-architecture.php">Cloud-Native Scraping Architecture for Enterprise Scale</a></h4>
<span class="read-time">11 min read</span>
</article>
<article class="related-card">
<span class="category">Compliance</span>
<h4><a href="web-scraping-compliance-uk-guide.php">Complete Guide to Web Scraping Compliance in the UK</a></h4>
<span class="read-time">12 min read</span>
</article>
</div>
</aside>
</div>
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
</article>
</main>
<!-- Footer -->
<footer class="footer">
<div class="container">
<div class="footer-content">
<div class="footer-section">
<div class="footer-logo">
<img loading="lazy" src="../../assets/images/logo-white.svg" alt="UK Data Services" loading="lazy">
</div>
<p>Enterprise data intelligence solutions for modern British business.</p>
</div>
<div class="footer-section">
<h3>Quick Links</h3>
<ul>
<li><a href="/#services">Services</a></li>
<li><a href="/blog/">Blog</a></li>
<li><a href="/case-studies/">Case Studies</a></li>
<li><a href="/about">About</a></li>
<li><a href="/#contact">Contact</a></li>
</ul>
</div>
<div class="footer-section">
<h3>Legal</h3>
<ul>
<li><a href="/privacy-policy">Privacy Policy</a></li>
<li><a href="/terms-of-service">Terms of Service</a></li>
<li><a href="/cookie-policy">Cookie Policy</a></li>
<li><a href="/gdpr-compliance">GDPR Compliance</a></li>
</ul>
</div>
</div>
<div class="footer-bottom">
<p>&copy; <?php echo date('Y'); ?> UK Data Services. All rights reserved.</p>
<div class="social-links">
<a href="https://linkedin.com/company/uk-data-services" aria-label="LinkedIn" rel="noopener" target="_blank">
<img loading="lazy" src="../../assets/images/icon-linkedin.svg" alt="LinkedIn" loading="lazy">
</a>
<a href="https://twitter.com/ukdataservices" aria-label="Twitter" rel="noopener" target="_blank">
<img loading="lazy" src="../../assets/images/icon-twitter.svg" alt="Twitter" loading="lazy">
</a>
</div>
</div>
</div>
</footer>
<!-- Scripts -->
<script src="../../assets/js/main.js"></script>
<script src="../../assets/js/cro-enhancements.js"></script>
</body>
</html>