Files
ukaiautomation/blog/articles/python-scrapy-enterprise-guide.php
root 4d44e84478 SEO/E-E-A-T: fix author attribution across all blog articles
- Remap 20 articles from generic team names (UK Data Services Legal Team,
  Analytics Team, Technical Team etc.) to matching named authors from the
  author database (Sarah Chen, David Martinez, Michael Thompson, etc.)
- Add 5 new named authors to author-bio.php: Alex Kumar, David Thompson,
  Emily Roberts, Michael Chen, Sarah Mitchell
- Eliminates author name/bio mismatch where team name showed but
  Editorial Team bio/role rendered instead
2026-02-22 09:55:13 +00:00

861 lines
34 KiB
PHP

<?php
// Enhanced security headers
// HSTS: force HTTPS for one year including subdomains. NOTE(review): header()
// only takes effect if no output has been emitted before this point.
header('Strict-Transport-Security: max-age=31536000; includeSubDomains');
// Article-specific SEO variables
// Consumed by the <head> meta tags, the JSON-LD Article schema, and the
// visible article header/byline further down in this template.
$article_title = "Python Scrapy Enterprise Guide: Scaling Web Scraping Operations";
$article_description = "Master Scrapy for enterprise-scale web scraping operations. Learn advanced techniques, best practices, and optimization strategies for production deployments.";
$article_keywords = "Python Scrapy enterprise, web scraping framework, Scrapy best practices, enterprise web scraping, Python data extraction, Scrapy optimization";
// Named author shown in the byline and author meta tags.
$article_author = "Michael Thompson";
// Canonical URL — also reused for og:url and the social share links.
$canonical_url = "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide";
// ISO 8601 timestamps for the published/modified meta tags and schema fields.
$article_published = "2025-05-15T09:00:00+00:00";
$article_modified = "2025-05-15T09:00:00+00:00";
// Social preview image (og:image / twitter:image and Article schema image).
$og_image = "https://ukdataservices.co.uk/assets/images/icon-web-scraping-v2.svg";
// Estimated read time in minutes. NOTE(review): the article meta row below
// hard-codes "12 min read" instead of echoing this variable — keep in sync.
$read_time = 12;
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title><?php echo htmlspecialchars($article_title); ?> | UK Data Services Blog</title>
<meta name="description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta name="keywords" content="<?php echo htmlspecialchars($article_keywords); ?>">
<meta name="author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta name="robots" content="index, follow">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
<!-- Article-specific meta tags: article:* are Open Graph/RDFa properties, so they use property= rather than name= -->
<meta property="article:published_time" content="<?php echo $article_published; ?>">
<meta property="article:modified_time" content="<?php echo $article_modified; ?>">
<meta property="article:author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta property="article:section" content="Web Scraping">
<meta property="article:tag" content="Python, Scrapy, Web Scraping, Enterprise, Framework">
<!-- Preload critical resources -->
<link rel="preload" href="../../assets/css/main.css" as="style">
<link rel="preload" href="../../assets/images/ukds-main-logo.png" as="image">
<!-- Open Graph / Social Media -->
<meta property="og:type" content="article">
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($article_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta property="og:image" content="<?php echo htmlspecialchars($og_image); ?>">
<!-- Twitter Card -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="<?php echo htmlspecialchars($article_title); ?>">
<meta name="twitter:description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta name="twitter:image" content="<?php echo htmlspecialchars($og_image); ?>">
<!-- Favicon and App Icons -->
<link rel="icon" type="image/svg+xml" href="../../assets/images/favicon.svg">
<link rel="apple-touch-icon" sizes="180x180" href="../../assets/images/apple-touch-icon.svg">
<!-- Fonts -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400;500;600;700&family=Lato:wght@300;400;500;600;700&display=swap" rel="stylesheet">
<!-- Styles -->
<link rel="stylesheet" href="../../assets/css/main.css">
<link rel="stylesheet" href="../../assets/css/cro-enhancements.css">
<!-- Article Schema -->
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Article",
"mainEntityOfPage": {
"@type": "WebPage",
"@id": "<?php echo htmlspecialchars($canonical_url); ?>"
},
"headline": "<?php echo htmlspecialchars($article_title); ?>",
"description": "<?php echo htmlspecialchars($article_description); ?>",
"image": "<?php echo htmlspecialchars($og_image); ?>",
"author": {
"@type": "Person",
"name": "<?php echo htmlspecialchars($article_author); ?>",
"url": "https://ukdataservices.co.uk"
},
"publisher": {
"@type": "Organization",
"name": "UK Data Services",
"logo": {
"@type": "ImageObject",
"url": "https://ukdataservices.co.uk/assets/images/ukds-main-logo.png"
}
},
"datePublished": "<?php echo $article_published; ?>",
"dateModified": "<?php echo $article_modified; ?>"
}
</script>
<!-- HowTo Schema for Technical Guide -->
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "HowTo",
"name": "How to Set Up Scrapy for Enterprise Web Scraping Operations",
"description": "Step-by-step guide to implement and scale Python Scrapy for enterprise web scraping operations with best practices and optimization techniques.",
"image": "https://ukdataservices.co.uk/assets/images/icon-web-scraping-v2.svg",
"estimatedCost": {
"@type": "MonetaryAmount",
"currency": "GBP",
"value": "0"
},
"totalTime": "PT45M",
"supply": [
{
"@type": "HowToSupply",
"name": "Python 3.8+"
},
{
"@type": "HowToSupply",
"name": "Scrapy Framework"
},
{
"@type": "HowToSupply",
"name": "Development Environment"
}
],
"tool": [
{
"@type": "HowToTool",
"name": "Python IDE"
},
{
"@type": "HowToTool",
"name": "Command Line Interface"
}
],
"step": [
{
"@type": "HowToStep",
"name": "Install Scrapy Framework",
"text": "Install Scrapy using pip and set up your development environment",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#installation"
},
{
"@type": "HowToStep",
"name": "Create Scrapy Project",
"text": "Initialize a new Scrapy project with proper directory structure",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#project-setup"
},
{
"@type": "HowToStep",
"name": "Configure Settings",
"text": "Set up enterprise-grade configuration for production deployment",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#configuration"
},
{
"@type": "HowToStep",
"name": "Implement Spiders",
"text": "Build scalable spider classes with proper error handling",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#spider-development"
},
{
"@type": "HowToStep",
"name": "Deploy and Monitor",
"text": "Deploy to production and implement monitoring systems",
"url": "https://ukdataservices.co.uk/blog/articles/python-scrapy-enterprise-guide#deployment"
}
]
}
</script>
</head>
<body>
<!-- Skip to content link for accessibility -->
<a href="#main-content" class="skip-to-content">Skip to main content</a>
<?php include($_SERVER["DOCUMENT_ROOT"] . "/includes/nav.php"); ?><!-- Article Content -->
<main id="main-content">
<article class="article-page">
<div class="container">
<div class="article-meta">
<span class="category"><a href="/blog/categories/technology.php">Technology</a></span>
<time datetime="2025-05-15">15 May 2025</time>
<span class="read-time">12 min read</span>
</div>
<header class="article-header">
<h1><?php echo htmlspecialchars($article_title); ?></h1>
<p class="article-lead"><?php echo htmlspecialchars($article_description); ?></p>
<div class="article-author">
<div class="author-info">
<span>By <?php echo htmlspecialchars($article_author); ?></span>
</div>
<div class="share-buttons">
<a href="https://www.linkedin.com/sharing/share-offsite/?url=<?php echo urlencode($canonical_url); ?>" class="share-button linkedin" aria-label="Share on LinkedIn" rel="noopener" target="_blank">
<img loading="lazy" src="../../assets/images/icon-linkedin.svg" alt="LinkedIn">
</a>
<a href="https://twitter.com/intent/tweet?url=<?php echo urlencode($canonical_url); ?>&text=<?php echo urlencode($article_title); ?>" class="share-button twitter" aria-label="Share on Twitter" rel="noopener" target="_blank">
<img loading="lazy" src="../../assets/images/icon-twitter.svg" alt="Twitter">
</a>
</div>
</div>
</header>
<div class="article-content">
<div class="content-wrapper">
<h2>Why Scrapy for Enterprise Web Scraping?</h2>
<p>Scrapy stands out as the premier Python framework for large-scale web scraping operations. Unlike simple scripts or basic tools, Scrapy provides the robust architecture, built-in features, and extensibility that enterprise applications demand.</p>
<p>This comprehensive guide covers everything you need to know to deploy Scrapy in production environments, from initial setup to advanced optimization techniques.</p>
<h2>Enterprise-Grade Scrapy Architecture</h2>
<h3>Core Components Overview</h3>
<ul>
<li><strong>Scrapy Engine:</strong> Controls data flow between components</li>
<li><strong>Scheduler:</strong> Receives requests and queues them for processing</li>
<li><strong>Downloader:</strong> Fetches web pages and returns responses</li>
<li><strong>Spiders:</strong> Custom classes that define scraping logic</li>
<li><strong>Item Pipeline:</strong> Processes extracted data</li>
<li><strong>Middlewares:</strong> Hooks for customizing request/response processing</li>
</ul>
<h3>Production Project Structure</h3>
<pre><code>
enterprise_scraper/
├── scrapy.cfg
├── requirements.txt
├── docker-compose.yml
├── enterprise_scraper/
│ ├── __init__.py
│ ├── settings/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── development.py
│ │ ├── staging.py
│ │ └── production.py
│ ├── spiders/
│ │ ├── __init__.py
│ │ ├── base_spider.py
│ │ └── ecommerce_spider.py
│ ├── items.py
│ ├── pipelines.py
│ ├── middlewares.py
│ └── utils/
│ ├── __init__.py
│ ├── database.py
│ └── monitoring.py
├── deploy/
│ ├── Dockerfile
│ └── kubernetes/
└── tests/
├── unit/
└── integration/
</code></pre>
<h2>Advanced Configuration Management</h2>
<h3>Environment-Specific Settings</h3>
<pre><code>
# settings/base.py
BOT_NAME = 'enterprise_scraper'
SPIDER_MODULES = ['enterprise_scraper.spiders']
NEWSPIDER_MODULE = 'enterprise_scraper.spiders'
# Respect robots.txt for compliance
ROBOTSTXT_OBEY = True
# Configure concurrent requests
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 8
# Download delays for respectful scraping
DOWNLOAD_DELAY = 1
RANDOMIZE_DOWNLOAD_DELAY = 0.5
# Production settings/production.py
import os
from .base import *
# Increase concurrency for production
CONCURRENT_REQUESTS = 100
CONCURRENT_REQUESTS_PER_DOMAIN = 16
# Enable autothrottling
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1
AUTOTHROTTLE_MAX_DELAY = 10
AUTOTHROTTLE_TARGET_CONCURRENCY = 2.0
# Logging configuration
LOG_LEVEL = 'INFO'
LOG_FILE = '/var/log/scrapy/scrapy.log'
# Database settings
DATABASE_URL = os.environ.get('DATABASE_URL')
REDIS_URL = os.environ.get('REDIS_URL')
</code></pre>
<h3>Dynamic Settings with Environment Variables</h3>
<pre><code>
import os
from scrapy.utils.project import get_project_settings
def get_scrapy_settings():
settings = get_project_settings()
# Environment-specific overrides
if os.environ.get('SCRAPY_ENV') == 'production':
settings.set('CONCURRENT_REQUESTS', 200)
settings.set('DOWNLOAD_DELAY', 0.5)
elif os.environ.get('SCRAPY_ENV') == 'development':
settings.set('CONCURRENT_REQUESTS', 16)
settings.set('DOWNLOAD_DELAY', 2)
return settings
</code></pre>
<h2>Enterprise Spider Development</h2>
<h3>Base Spider Class</h3>
<pre><code>
import scrapy
from scrapy.http import Request
from typing import Generator, Optional
import logging
import time
class BaseSpider(scrapy.Spider):
"""Base spider with common enterprise functionality"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.setup_logging()
self.setup_monitoring()
def setup_logging(self):
"""Configure structured logging"""
self.logger = logging.getLogger(self.name)
def setup_monitoring(self):
"""Initialize monitoring metrics"""
self.stats = {
'pages_scraped': 0,
'items_extracted': 0,
'errors': 0
}
def parse_with_error_handling(self, response):
"""Parse with comprehensive error handling"""
try:
yield from self.parse_content(response)
except Exception as e:
self.logger.error(f"Error parsing {response.url}: {e}")
self.stats['errors'] += 1
def make_request(self, url: str, callback=None, meta: dict = None) -> Request:
"""Create request with standard metadata"""
return Request(
url=url,
callback=callback or self.parse_with_error_handling,
meta={
'spider_name': self.name,
'timestamp': time.time(),
**(meta or {})
},
dont_filter=False
)
</code></pre>
<h3>Advanced E-commerce Spider</h3>
<pre><code>
from enterprise_scraper.spiders.base_spider import BaseSpider
from enterprise_scraper.items import ProductItem
class EcommerceSpider(BaseSpider):
name = 'ecommerce'
allowed_domains = ['example-store.com']
custom_settings = {
'ITEM_PIPELINES': {
'enterprise_scraper.pipelines.ValidationPipeline': 300,
'enterprise_scraper.pipelines.DatabasePipeline': 400,
},
'DOWNLOAD_DELAY': 2,
}
def start_requests(self):
"""Generate initial requests with pagination"""
base_url = "https://example-store.com/products"
for page in range(1, 101): # First 100 pages
url = f"{base_url}?page={page}"
yield self.make_request(
url=url,
callback=self.parse_product_list,
meta={'page': page}
)
def parse_product_list(self, response):
"""Extract product URLs from listing pages"""
product_urls = response.css('.product-link::attr(href)').getall()
for url in product_urls:
yield self.make_request(
url=response.urljoin(url),
callback=self.parse_product,
meta={'category': response.meta.get('category')}
)
# Handle pagination
next_page = response.css('.pagination .next::attr(href)').get()
if next_page:
yield self.make_request(
url=response.urljoin(next_page),
callback=self.parse_product_list
)
def parse_product(self, response):
"""Extract product details"""
item = ProductItem()
item['url'] = response.url
item['name'] = response.css('h1.product-title::text').get()
item['price'] = self.extract_price(response)
item['description'] = response.css('.product-description::text').getall()
item['images'] = response.css('.product-images img::attr(src)').getall()
item['availability'] = response.css('.stock-status::text').get()
item['rating'] = self.extract_rating(response)
item['reviews_count'] = self.extract_reviews_count(response)
self.stats['items_extracted'] += 1
yield item
def extract_price(self, response):
"""Extract and normalize price data"""
price_text = response.css('.price::text').get()
if price_text:
# Remove currency symbols and normalize
import re
price = re.sub(r'[^\d.]', '', price_text)
return float(price) if price else None
return None
</code></pre>
<h2>Enterprise Pipeline System</h2>
<h3>Validation Pipeline</h3>
<pre><code>
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem
import validators
class ValidationPipeline:
"""Validate items before processing"""
def process_item(self, item, spider):
adapter = ItemAdapter(item)
# Required field validation
if not adapter.get('name'):
raise DropItem(f"Missing product name: {item}")
# URL validation
if not validators.url(adapter.get('url')):
raise DropItem(f"Invalid URL: {adapter.get('url')}")
# Price validation
price = adapter.get('price')
if price is not None:
try:
price = float(price)
if price < 0:
raise DropItem(f"Invalid price: {price}")
adapter['price'] = price
except (ValueError, TypeError):
raise DropItem(f"Invalid price format: {price}")
spider.logger.info(f"Item validated: {adapter.get('name')}")
return item
</code></pre>
<h3>Database Pipeline with Connection Pooling</h3>
<pre><code>
import asyncio
import asyncpg
from itemadapter import ItemAdapter
class DatabasePipeline:
"""Asynchronous database pipeline"""
def __init__(self, db_url, pool_size=20):
self.db_url = db_url
self.pool_size = pool_size
self.pool = None
@classmethod
def from_crawler(cls, crawler):
return cls(
db_url=crawler.settings.get('DATABASE_URL'),
pool_size=crawler.settings.get('DB_POOL_SIZE', 20)
)
async def open_spider(self, spider):
"""Initialize database connection pool"""
self.pool = await asyncpg.create_pool(
self.db_url,
min_size=5,
max_size=self.pool_size
)
spider.logger.info("Database connection pool created")
async def close_spider(self, spider):
"""Close database connection pool"""
if self.pool:
await self.pool.close()
spider.logger.info("Database connection pool closed")
async def process_item(self, item, spider):
"""Insert item into database"""
adapter = ItemAdapter(item)
async with self.pool.acquire() as connection:
await connection.execute('''
INSERT INTO products (url, name, price, description)
VALUES ($1, $2, $3, $4)
ON CONFLICT (url) DO UPDATE SET
name = EXCLUDED.name,
price = EXCLUDED.price,
description = EXCLUDED.description,
updated_at = NOW()
''',
adapter.get('url'),
adapter.get('name'),
adapter.get('price'),
'\n'.join(adapter.get('description', []))
)
spider.logger.info(f"Item saved: {adapter.get('name')}")
return item
</code></pre>
<h2>Middleware for Enterprise Features</h2>
<h3>Rotating Proxy Middleware</h3>
<pre><code>
import random
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
class RotatingProxyMiddleware(HttpProxyMiddleware):
"""Rotate proxies for each request"""
def __init__(self, proxy_list):
self.proxy_list = proxy_list
@classmethod
def from_crawler(cls, crawler):
proxy_list = crawler.settings.get('PROXY_LIST', [])
return cls(proxy_list)
def process_request(self, request, spider):
if self.proxy_list:
proxy = random.choice(self.proxy_list)
request.meta['proxy'] = proxy
spider.logger.debug(f"Using proxy: {proxy}")
return None
</code></pre>
<h3>Rate Limiting Middleware</h3>
<pre><code>
import time
from collections import defaultdict
from scrapy.downloadermiddlewares.retry import RetryMiddleware
class RateLimitMiddleware(RetryMiddleware):
"""Implement per-domain rate limiting"""
def __init__(self, settings):
super().__init__(settings)
self.domain_delays = defaultdict(float)
self.last_request_time = defaultdict(float)
def process_request(self, request, spider):
domain = request.url.split('/')[2]
current_time = time.time()
# Calculate required delay
min_delay = self.domain_delays.get(domain, 1.0)
time_since_last = current_time - self.last_request_time[domain]
if time_since_last < min_delay:
delay = min_delay - time_since_last
spider.logger.debug(f"Rate limiting {domain}: {delay:.2f}s")
time.sleep(delay)
self.last_request_time[domain] = time.time()
return None
</code></pre>
<h2>Monitoring and Observability</h2>
<h3>Custom Stats Collection</h3>
<pre><code>
from scrapy.statscollectors import StatsCollector
import time
class EnterpriseStatsCollector(StatsCollector):
"""Enhanced stats collection for monitoring"""
def __init__(self, crawler):
super().__init__(crawler)
self.start_time = time.time()
self.custom_stats = {}
def get_stats(self):
"""Enhanced stats with custom metrics"""
stats = super().get_stats()
# Add runtime statistics
runtime = time.time() - self.start_time
stats['runtime_seconds'] = runtime
# Add rate calculations
pages_count = stats.get('response_received_count', 0)
if runtime > 0:
stats['pages_per_minute'] = (pages_count / runtime) * 60
# Add custom metrics
stats.update(self.custom_stats)
return stats
def inc_value(self, key, count=1, start=0):
"""Increment custom counter"""
super().inc_value(key, count, start)
# Log significant milestones
current_value = self.get_value(key, 0)
if current_value % 1000 == 0: # Every 1000 items
self.crawler.spider.logger.info(f"{key}: {current_value}")
</code></pre>
<h2>Production Deployment</h2>
<p>Deploying Scrapy at enterprise scale requires robust infrastructure and monitoring. For comprehensive <a href="../../services/data-cleaning.php">data pipeline solutions</a>, consider our managed deployment services that handle scaling, monitoring, and compliance automatically.</p>
<h3>Docker Configuration</h3>
<pre><code>
# Dockerfile
FROM python:3.9-slim
WORKDIR /app
# Install system dependencies
RUN apt-get update && apt-get install -y \
gcc \
libc-dev \
libffi-dev \
libssl-dev \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code
COPY . .
# Create non-root user
RUN useradd -m -u 1000 scrapy && chown -R scrapy:scrapy /app
USER scrapy
# Default command
CMD ["scrapy", "crawl", "ecommerce"]
</code></pre>
<h3>Kubernetes Deployment</h3>
<pre><code>
apiVersion: apps/v1
kind: Deployment
metadata:
name: scrapy-deployment
spec:
replicas: 3
selector:
matchLabels:
app: scrapy
template:
metadata:
labels:
app: scrapy
spec:
containers:
- name: scrapy
image: enterprise-scrapy:latest
resources:
requests:
memory: "1Gi"
cpu: "500m"
limits:
memory: "2Gi"
cpu: "1000m"
env:
- name: SCRAPY_ENV
value: "production"
- name: DATABASE_URL
valueFrom:
secretKeyRef:
name: db-secret
key: url
---
apiVersion: v1
kind: Service
metadata:
name: scrapy-service
spec:
selector:
app: scrapy
ports:
- port: 6800
targetPort: 6800
</code></pre>
<h2>Performance Optimization</h2>
<h3>Memory Management</h3>
<ul>
<li><strong>Item Pipeline:</strong> Process items immediately to avoid memory buildup</li>
<li><strong>Response Caching:</strong> Disable for production unless specifically needed</li>
<li><strong>Request Filtering:</strong> Use duplicate filters efficiently</li>
<li><strong>Large Responses:</strong> Stream large files instead of loading into memory</li>
</ul>
<h3>Scaling Strategies</h3>
<ul>
<li><strong>Horizontal Scaling:</strong> Multiple spider instances</li>
<li><strong>Domain Sharding:</strong> Distribute domains across instances</li>
<li><strong>Queue Management:</strong> Redis-based distributed queuing</li>
<li><strong>Load Balancing:</strong> Distribute requests across proxy pools</li>
</ul>
<h2>Best Practices Summary</h2>
<h3>Code Organization</h3>
<ul>
<li>Use inheritance for common spider functionality</li>
<li>Separate settings by environment</li>
<li>Implement comprehensive error handling</li>
<li>Write unit tests for custom components</li>
</ul>
<h3>Operational Excellence</h3>
<ul>
<li>Monitor performance metrics continuously</li>
<li>Implement circuit breakers for external services</li>
<li>Use structured logging for better observability</li>
<li>Plan for graceful degradation</li>
</ul>
<h3>Compliance and Ethics</h3>
<ul>
<li>Respect robots.txt and rate limits</li>
<li>Implement proper user agent identification</li>
<li>Handle personal data according to GDPR</li>
<li>Maintain audit trails for data collection</li>
</ul>
<div class="article-cta">
<h3>Scale Your Scrapy Operations</h3>
<p>UK Data Services provides enterprise Scrapy development and deployment services. Let our experts help you build robust, scalable web scraping solutions.</p>
<a href="/quote" class="btn btn-primary">Get Scrapy Consultation</a>
</div>
</div>
</div>
<!-- Related Articles -->
<aside class="related-articles">
<h3>Related Articles</h3>
<div class="related-grid">
<article class="related-card">
<span class="category">Web Scraping</span>
<h4><a href="javascript-heavy-sites-scraping.php">Scraping JavaScript-Heavy Sites: Advanced Techniques</a></h4>
<span class="read-time">6 min read</span>
</article>
<article class="related-card">
<span class="category">Technology</span>
<h4><a href="cloud-native-scraping-architecture.php">Cloud-Native Scraping Architecture for Enterprise Scale</a></h4>
<span class="read-time">11 min read</span>
</article>
<article class="related-card">
<span class="category">Compliance</span>
<h4><a href="web-scraping-compliance-uk-guide.php">Complete Guide to Web Scraping Compliance in the UK</a></h4>
<span class="read-time">12 min read</span>
</article>
</div>
</aside>
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/author-bio.php'); ?>
<?php include($_SERVER['DOCUMENT_ROOT'] . '/includes/article-footer.php'); ?>
</div>
</article>
</main>
<!-- Footer -->
<footer class="footer">
<div class="container">
<div class="footer-content">
<div class="footer-section">
<div class="footer-logo">
<img src="../../assets/images/logo-white.svg" alt="UK Data Services" loading="lazy">
</div>
<p>Enterprise data intelligence solutions for modern British business.</p>
</div>
<div class="footer-section">
<h3>Quick Links</h3>
<ul>
<li><a href="/#services">Services</a></li>
<li><a href="/blog/">Blog</a></li>
<li><a href="/case-studies/">Case Studies</a></li>
<li><a href="/about">About</a></li>
<li><a href="/#contact">Contact</a></li>
</ul>
</div>
<div class="footer-section">
<h3>Legal</h3>
<ul>
<li><a href="/privacy-policy">Privacy Policy</a></li>
<li><a href="/terms-of-service">Terms of Service</a></li>
<li><a href="/cookie-policy">Cookie Policy</a></li>
<li><a href="/gdpr-compliance">GDPR Compliance</a></li>
</ul>
</div>
</div>
<div class="footer-bottom">
<p>&copy; <?php echo date('Y'); ?> UK Data Services. All rights reserved.</p>
<div class="social-links">
<a href="https://linkedin.com/company/uk-data-services" aria-label="LinkedIn" rel="noopener" target="_blank">
<img loading="lazy" src="../../assets/images/icon-linkedin.svg" alt="LinkedIn">
</a>
<a href="https://twitter.com/ukdataservices" aria-label="Twitter" rel="noopener" target="_blank">
<img loading="lazy" src="../../assets/images/icon-twitter.svg" alt="Twitter">
</a>
</div>
</div>
</div>
</footer>
<!-- Scripts -->
<script src="../../assets/js/main.js"></script>
<script src="../../assets/js/cro-enhancements.js"></script>
</body>
</html>