Files
ukaiautomation/blog/articles/sql-analytics-advanced-techniques.php

1617 lines
66 KiB
PHP
Raw Normal View History

2025-06-08 11:21:30 +01:00
<?php
declare(strict_types=1);

// Security headers — these must be emitted before any template output below.
header('X-Content-Type-Options: nosniff');   // forbid MIME-type sniffing
header('X-Frame-Options: DENY');             // disallow framing (clickjacking defence)
// X-XSS-Protection is deprecated: the auditor it controlled was removed from
// modern browsers, and "1; mode=block" can introduce XS-Leaks side channels in
// older ones. Current MDN/OWASP guidance is to disable it explicitly and rely
// on a Content-Security-Policy instead.
header('X-XSS-Protection: 0');
header('Strict-Transport-Security: max-age=31536000; includeSubDomains');
header('Referrer-Policy: strict-origin-when-cross-origin');

// Article-specific SEO metadata consumed by the <head>, the JSON-LD schema
// block, and the article header further down this template.
$article_title = "Advanced SQL Analytics Techniques for Business Intelligence";
$article_description = "Master advanced SQL techniques for complex analytics including window functions, CTEs, advanced joins, and optimization strategies for large-scale business intelligence.";
$article_keywords = "advanced SQL, SQL analytics, window functions, CTE, business intelligence SQL, SQL optimization, data analytics";
$article_author = "UK Data Services Analytics Team";
$canonical_url = "https://ukdataservices.co.uk/blog/articles/sql-analytics-advanced-techniques.php";
$article_published = "2025-06-08T11:15:00+00:00"; // ISO 8601, used verbatim in meta/schema
$article_modified = "2025-06-08T16:30:00+00:00";
$og_image = "https://ukdataservices.co.uk/assets/images/chart-icon.png";
$read_time = 16; // minutes; feeds the "min read" badge and schema timeRequired
?>
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- Every PHP-sourced value below is escaped with htmlspecialchars() for
     this HTML attribute/text context. -->
<title><?php echo htmlspecialchars($article_title); ?> | UK Data Services Blog</title>
<meta name="description" content="<?php echo htmlspecialchars($article_description); ?>">
<!-- NOTE(review): the "keywords" meta tag is ignored by major search engines;
     presumably kept for legacy or internal tooling — confirm before removing. -->
<meta name="keywords" content="<?php echo htmlspecialchars($article_keywords); ?>">
<meta name="author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta name="robots" content="index, follow">
<link rel="canonical" href="<?php echo htmlspecialchars($canonical_url); ?>">
<!-- Article-specific meta tags -->
<!-- NOTE(review): article:* belongs to the Open Graph/RDFa vocabulary and is
     normally emitted with property= (as done further below); these name=
     variants duplicate those — confirm which consumer, if any, needs them. -->
<meta name="article:published_time" content="<?php echo $article_published; ?>">
<meta name="article:modified_time" content="<?php echo $article_modified; ?>">
<meta name="article:author" content="<?php echo htmlspecialchars($article_author); ?>">
<meta name="article:section" content="Business Intelligence">
<meta name="article:tag" content="SQL, Analytics, Data Science, Business Intelligence">
<!-- Preload critical resources -->
<link rel="preload" href="../../assets/css/main.css" as="style">
<link rel="preload" href="../../assets/images/ukds-main-logo.png" as="image">
<!-- Open Graph / Social Media -->
<meta property="og:type" content="article">
<meta property="og:url" content="<?php echo htmlspecialchars($canonical_url); ?>">
<meta property="og:title" content="<?php echo htmlspecialchars($article_title); ?>">
<meta property="og:description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta property="og:image" content="<?php echo htmlspecialchars($og_image); ?>">
<meta property="og:image:width" content="1200">
<meta property="og:image:height" content="630">
<meta property="article:published_time" content="<?php echo $article_published; ?>">
<meta property="article:modified_time" content="<?php echo $article_modified; ?>">
<meta property="article:author" content="<?php echo htmlspecialchars($article_author); ?>">
<!-- Twitter Card -->
<meta name="twitter:card" content="summary_large_image">
<meta name="twitter:title" content="<?php echo htmlspecialchars($article_title); ?>">
<meta name="twitter:description" content="<?php echo htmlspecialchars($article_description); ?>">
<meta name="twitter:image" content="<?php echo htmlspecialchars($og_image); ?>">
<!-- Favicon and App Icons -->
<!-- NOTE(review): apple-touch-icon is referenced as an SVG with sizes=180x180;
     iOS has historically expected a PNG here — confirm it renders on device. -->
<link rel="icon" type="image/svg+xml" href="../../assets/images/favicon.svg">
<link rel="apple-touch-icon" sizes="180x180" href="../../assets/images/apple-touch-icon.svg">
<!-- Fonts -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Roboto+Slab:wght@300;400;500;600;700&family=Lato:wght@300;400;500;600;700&display=swap" rel="stylesheet">
<!-- Styles -->
<link rel="stylesheet" href="../../assets/css/main.css">
<!-- Article Schema Markup (JSON-LD). Built with json_encode() rather than
     string interpolation: htmlspecialchars() escapes for HTML attribute/text
     context, and entities such as &quot; are never decoded inside a <script>
     element, so interpolating escaped strings into hand-written JSON corrupts
     the payload as soon as any value contains a quote or special character. -->
<script type="application/ld+json">
<?php
// Assemble the schema.org Article structure from the variables declared at
// the top of this file; constant values mirror the publisher branding used
// elsewhere on the site.
$schema = [
    '@context' => 'https://schema.org',
    '@type' => 'Article',
    'headline' => $article_title,
    'description' => $article_description,
    'url' => $canonical_url,
    'datePublished' => $article_published,
    'dateModified' => $article_modified,
    'author' => [
        '@type' => 'Organization',
        'name' => $article_author,
        'url' => 'https://ukdataservices.co.uk',
    ],
    'publisher' => [
        '@type' => 'Organization',
        'name' => 'UK Data Services',
        'logo' => [
            '@type' => 'ImageObject',
            'url' => 'https://ukdataservices.co.uk/assets/images/ukds-main-logo.png',
            'width' => 300,
            'height' => 100,
        ],
    ],
    'image' => [
        '@type' => 'ImageObject',
        'url' => $og_image,
        'width' => 1200,
        'height' => 630,
    ],
    'mainEntityOfPage' => [
        '@type' => 'WebPage',
        '@id' => $canonical_url,
    ],
    'articleSection' => 'Business Intelligence',
    'keywords' => $article_keywords,
    'wordCount' => 4200,
    'timeRequired' => 'PT' . $read_time . 'M',
    'inLanguage' => 'en-GB',
];
// JSON_HEX_TAG escapes "<" and ">" so a value containing "</script>" cannot
// terminate this script element early; JSON_UNESCAPED_SLASHES keeps the URLs
// readable; JSON_THROW_ON_ERROR surfaces encoding failures instead of
// silently emitting "false".
echo json_encode(
    $schema,
    JSON_PRETTY_PRINT | JSON_UNESCAPED_SLASHES | JSON_HEX_TAG | JSON_THROW_ON_ERROR
);
?>
</script>
</head>
<body>
<!-- Skip to content link for accessibility -->
<a href="#main-content" class="skip-to-content">Skip to main content</a>
<!-- Navigation -->
<nav class="navbar" id="navbar">
<div class="nav-container">
<div class="nav-logo">
<a href="../../">
<img src="../../assets/images/ukds-main-logo.png" alt="UK Data Services" class="logo" loading="eager">
</a>
</div>
<!-- Primary navigation; "Blog" carries the active state on this page. -->
<div class="nav-menu" id="nav-menu">
<a href="../../" class="nav-link">Home</a>
<a href="../../#services" class="nav-link">Capabilities</a>
<a href="../../project-types.php" class="nav-link">Project Types</a>
<a href="../../about.php" class="nav-link">About</a>
<a href="../" class="nav-link active">Blog</a>
<a href="../../#contact" class="nav-link">Contact</a>
<a href="../../quote.php" class="nav-link cta-button">Request Consultation</a>
</div>
<!-- Hamburger toggle for the mobile menu (wired up in site-wide JS). -->
<div class="nav-toggle" id="nav-toggle">
<span class="bar"></span>
<span class="bar"></span>
<span class="bar"></span>
</div>
</div>
</nav>
<!-- Breadcrumb Navigation -->
<div class="breadcrumb">
<nav aria-label="Breadcrumb">
<ol>
<li><a href="../../">Home</a></li>
<li><a href="../">Blog</a></li>
<li><a href="../categories/business-intelligence.php">Business Intelligence</a></li>
<li aria-current="page"><span>Advanced SQL Analytics</span></li>
</ol>
</nav>
</div>
<!-- Article Content -->
<main id="main-content">
<article class="blog-article">
<div class="container">
<!-- Article Header -->
<header class="article-header">
<div class="article-meta">
<a href="../categories/business-intelligence.php" class="category-link">Business Intelligence</a>
<!-- NOTE(review): the visible date "8 June 2025" is hard-coded while the
     datetime attribute comes from $article_published — keep the two in sync
     (or derive the display text from the variable) when the article is
     republished. -->
<time datetime="<?php echo $article_published; ?>" class="publish-date">8 June 2025</time>
<span class="read-time"><?php echo $read_time; ?> min read</span>
</div>
<h1 class="article-title"><?php echo htmlspecialchars($article_title); ?></h1>
<p class="article-subtitle"><?php echo htmlspecialchars($article_description); ?></p>
<div class="article-author">
<div class="author-info">
<strong>By <?php echo htmlspecialchars($article_author); ?></strong>
<p>SQL analytics and database optimization specialists</p>
</div>
<!-- Share intent URL: urlencode() is the correct escaper for query-string
     values (vs htmlspecialchars for markup context). -->
<div class="article-share">
<a href="https://twitter.com/intent/tweet?text=<?php echo urlencode($article_title); ?>&url=<?php echo urlencode($canonical_url); ?>" target="_blank" rel="noopener" aria-label="Share on Twitter">📤 Share</a>
</div>
</div>
</header>
<!-- Table of Contents — each href must match a section id in the article
     body below; update both together when sections are added or renamed. -->
<nav class="article-toc">
<h2>Table of Contents</h2>
<ol>
<li><a href="#window-functions">Advanced Window Functions</a></li>
<li><a href="#cte-recursive">CTEs and Recursive Queries</a></li>
<li><a href="#advanced-joins">Complex Joins and Set Operations</a></li>
<li><a href="#analytical-functions">Analytical and Statistical Functions</a></li>
<li><a href="#time-series-analysis">Time Series Analysis in SQL</a></li>
<li><a href="#performance-optimization">Query Optimization Strategies</a></li>
<li><a href="#data-quality">Data Quality and Validation</a></li>
<li><a href="#practical-examples">Real-World Business Cases</a></li>
</ol>
</nav>
<!-- Article Content -->
<div class="article-content">
<section id="window-functions">
<h2>Advanced Window Functions</h2>
<p>Window functions are among the most powerful SQL features for analytics, enabling complex calculations across row sets without grouping restrictions. These functions provide elegant solutions for ranking, moving averages, percentiles, and comparative analysis essential for business intelligence.</p>
<h3>Ranking and Row Number Functions</h3>
<p>Ranking functions help identify top performers, outliers, and relative positioning within datasets:</p>
<div class="code-example">
<h4>Customer Revenue Ranking Example</h4>
<pre><code>-- Calculate customer revenue rankings with ties handling
SELECT
customer_id,
customer_name,
total_revenue,
ROW_NUMBER() OVER (ORDER BY total_revenue DESC) as row_num,
RANK() OVER (ORDER BY total_revenue DESC) as rank_with_gaps,
DENSE_RANK() OVER (ORDER BY total_revenue DESC) as dense_rank,
NTILE(4) OVER (ORDER BY total_revenue DESC) as quartile,
PERCENT_RANK() OVER (ORDER BY total_revenue) as percentile_rank
FROM customer_revenue_summary
WHERE date_year = 2024;</code></pre>
</div>
<div class="technique-explanation">
<h4>Advanced Ranking Techniques</h4>
<div class="technique">
<h5>Conditional Ranking</h5>
<pre><code>-- Rank customers within regions, with revenue threshold filtering
SELECT
customer_id,
region,
total_revenue,
CASE
WHEN total_revenue >= 50000 THEN
RANK() OVER (PARTITION BY region ORDER BY total_revenue DESC)
END as high_value_rank_in_region
FROM customer_revenue_summary;</code></pre>
</div>
</div>
<h3>Moving Averages and Offset Functions</h3>
<p>Frame clauses and offset functions such as LAG, LEAD, FIRST_VALUE and LAST_VALUE support trend and period-over-period analysis:</p>
<div class="code-example">
<h4>Seven-Day Moving Average with Offsets</h4>
<pre><code>SELECT
customer_id,
transaction_date,
daily_revenue,
AVG(daily_revenue) OVER (
ORDER BY transaction_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as seven_day_avg,
LAG(daily_revenue, 1) OVER (ORDER BY transaction_date) as prev_day,
LEAD(daily_revenue, 1) OVER (ORDER BY transaction_date) as next_day,
FIRST_VALUE(daily_revenue) OVER (
ORDER BY transaction_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) as first_revenue,
LAST_VALUE(daily_revenue) OVER (
ORDER BY transaction_date
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
) as last_revenue
FROM daily_customer_revenue
WHERE customer_id = 12345
ORDER BY transaction_date;</code></pre>
</div>
<h3>Advanced Frame Specifications</h3>
<p>Master different frame types for precise analytical calculations:</p>
<div class="frame-types">
<div class="frame-type">
<h4>ROWS vs RANGE Frame Types</h4>
<pre><code>-- ROWS: Physical row-based frame (faster, more predictable)
SELECT
order_date,
daily_sales,
SUM(daily_sales) OVER (
ORDER BY order_date
ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
) as five_day_sum_rows,
-- RANGE: Logical value-based frame (handles ties)
SUM(daily_sales) OVER (
ORDER BY order_date
RANGE BETWEEN INTERVAL '2' DAY PRECEDING
AND INTERVAL '2' DAY FOLLOWING
) as five_day_sum_range
FROM daily_sales_summary;</code></pre>
</div>
<div class="frame-type">
<h4>Dynamic Frame Boundaries</h4>
<pre><code>-- Month-to-date and year-to-date calculations
SELECT
order_date,
daily_sales,
SUM(daily_sales) OVER (
PARTITION BY EXTRACT(YEAR FROM order_date),
EXTRACT(MONTH FROM order_date)
ORDER BY order_date
ROWS UNBOUNDED PRECEDING
) as month_to_date,
SUM(daily_sales) OVER (
PARTITION BY EXTRACT(YEAR FROM order_date)
ORDER BY order_date
ROWS UNBOUNDED PRECEDING
) as year_to_date
FROM daily_sales_summary;</code></pre>
</div>
</div>
</section>
<section id="cte-recursive">
<h2>CTEs and Recursive Queries</h2>
<p>Common Table Expressions (CTEs) provide readable, maintainable approaches to complex queries. Recursive CTEs enable hierarchical data processing essential for organizational structures, product categories, and network analysis.</p>
<h3>Basic CTE Patterns</h3>
<p>Structure complex queries with multiple CTEs for clarity and reusability:</p>
<div class="cte-example">
<h4>Multi-CTE Customer Analysis</h4>
<pre><code>-- Complex customer segmentation using multiple CTEs
WITH customer_metrics AS (
SELECT
customer_id,
COUNT(DISTINCT order_id) as order_count,
SUM(order_total) as total_revenue,
AVG(order_total) as avg_order_value,
MAX(order_date) as last_order_date,
MIN(order_date) as first_order_date
FROM orders
WHERE order_date >= '2024-01-01'
GROUP BY customer_id
),
recency_scoring AS (
SELECT
customer_id,
CASE
WHEN DATEDIFF(day, last_order_date, GETDATE()) <= 30 THEN 5
WHEN DATEDIFF(day, last_order_date, GETDATE()) <= 90 THEN 4
WHEN DATEDIFF(day, last_order_date, GETDATE()) <= 180 THEN 3
WHEN DATEDIFF(day, last_order_date, GETDATE()) <= 365 THEN 2
ELSE 1
END as recency_score
FROM customer_metrics
),
frequency_scoring AS (
SELECT
customer_id,
NTILE(5) OVER (ORDER BY order_count) as frequency_score
FROM customer_metrics
),
monetary_scoring AS (
SELECT
customer_id,
NTILE(5) OVER (ORDER BY total_revenue) as monetary_score
FROM customer_metrics
)
SELECT
cm.customer_id,
cm.total_revenue,
cm.order_count,
cm.avg_order_value,
rs.recency_score,
fs.frequency_score,
ms.monetary_score,
(rs.recency_score + fs.frequency_score + ms.monetary_score) as rfm_score,
CASE
WHEN (rs.recency_score + fs.frequency_score + ms.monetary_score) >= 13 THEN 'Champions'
WHEN (rs.recency_score + fs.frequency_score + ms.monetary_score) >= 10 THEN 'Loyal Customers'
WHEN (rs.recency_score + fs.frequency_score + ms.monetary_score) >= 7 THEN 'Potential Loyalists'
WHEN (rs.recency_score + fs.frequency_score + ms.monetary_score) >= 5 THEN 'At Risk'
ELSE 'Lost Customers'
END as customer_segment
FROM customer_metrics cm
JOIN recency_scoring rs ON cm.customer_id = rs.customer_id
JOIN frequency_scoring fs ON cm.customer_id = fs.customer_id
JOIN monetary_scoring ms ON cm.customer_id = ms.customer_id;</code></pre>
</div>
<h3>Recursive CTEs for Hierarchical Data</h3>
<p>Handle organizational structures, category trees, and network analysis with recursive queries:</p>
<div class="recursive-examples">
<div class="recursive-example">
<h4>Organizational Hierarchy Analysis</h4>
<pre><code>-- Calculate organization levels and reporting chains
WITH RECURSIVE org_hierarchy AS (
-- Anchor: Top-level executives
SELECT
employee_id,
employee_name,
manager_id,
salary,
1 as level,
CAST(employee_name as VARCHAR(1000)) as hierarchy_path,
employee_id as top_manager_id
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Recursive: Add direct reports
SELECT
e.employee_id,
e.employee_name,
e.manager_id,
e.salary,
oh.level + 1,
oh.hierarchy_path + ' -> ' + e.employee_name,
oh.top_manager_id
FROM employees e
INNER JOIN org_hierarchy oh ON e.manager_id = oh.employee_id
WHERE oh.level < 10 -- Prevent infinite recursion
)
SELECT
employee_id,
employee_name,
level,
hierarchy_path,
salary,
AVG(salary) OVER (PARTITION BY level) as avg_salary_at_level,
COUNT(*) OVER (PARTITION BY top_manager_id) as org_size
FROM org_hierarchy
ORDER BY top_manager_id, level, employee_name;</code></pre>
</div>
<div class="recursive-example">
<h4>Product Category Tree with Aggregations</h4>
<pre><code>-- Recursive category analysis with sales rollups
WITH RECURSIVE category_tree AS (
-- Anchor: Root categories
SELECT
category_id,
category_name,
parent_category_id,
1 as level,
CAST(category_id as VARCHAR(1000)) as path
FROM product_categories
WHERE parent_category_id IS NULL
UNION ALL
-- Recursive: Child categories
SELECT
pc.category_id,
pc.category_name,
pc.parent_category_id,
ct.level + 1,
ct.path + '/' + CAST(pc.category_id as VARCHAR)
FROM product_categories pc
INNER JOIN category_tree ct ON pc.parent_category_id = ct.category_id
),
category_sales AS (
SELECT
ct.category_id,
ct.category_name,
ct.level,
ct.path,
COALESCE(SUM(s.sales_amount), 0) as direct_sales,
COUNT(DISTINCT s.product_id) as product_count
FROM category_tree ct
LEFT JOIN products p ON ct.category_id = p.category_id
LEFT JOIN sales s ON p.product_id = s.product_id
AND s.sale_date >= '2024-01-01' -- keep the date filter in the ON clause: a WHERE predicate on s would discard unmatched rows and silently turn the LEFT JOIN into an INNER JOIN
GROUP BY ct.category_id, ct.category_name, ct.level, ct.path
)
SELECT
category_id,
category_name,
level,
REPLICATE(' ', level - 1) + category_name as indented_name,
direct_sales,
product_count,
-- Calculate total sales including subcategories
(SELECT SUM(cs2.direct_sales)
FROM category_sales cs2
WHERE cs2.path LIKE cs1.path + '%') as total_sales_with_children
FROM category_sales cs1
ORDER BY path;</code></pre>
</div>
</div>
</section>
<section id="advanced-joins">
<h2>Complex Joins and Set Operations</h2>
<p>Advanced join techniques and set operations enable sophisticated data analysis patterns essential for comprehensive business intelligence queries.</p>
<h3>Advanced Join Patterns</h3>
<p>Go beyond basic joins to handle complex analytical requirements:</p>
<div class="join-patterns">
<div class="join-pattern">
<h4>Self-Joins for Comparative Analysis</h4>
<pre><code>-- Compare customer performance year-over-year
SELECT
current_year.customer_id,
current_year.customer_name,
current_year.total_revenue as revenue_2024,
previous_year.total_revenue as revenue_2023,
(current_year.total_revenue - COALESCE(previous_year.total_revenue, 0)) as revenue_change,
CASE
WHEN previous_year.total_revenue > 0 THEN
((current_year.total_revenue - previous_year.total_revenue)
/ previous_year.total_revenue) * 100
ELSE NULL
END as growth_percentage
FROM (
SELECT customer_id, customer_name, SUM(order_total) as total_revenue
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE YEAR(order_date) = 2024
GROUP BY customer_id, customer_name
) current_year
LEFT JOIN (
SELECT customer_id, SUM(order_total) as total_revenue
FROM orders
WHERE YEAR(order_date) = 2023
GROUP BY customer_id
) previous_year ON current_year.customer_id = previous_year.customer_id;</code></pre>
</div>
<div class="join-pattern">
<h4>Lateral Joins for Correlated Subqueries</h4>
<pre><code>-- Get top 3 products for each customer with lateral join
SELECT
c.customer_id,
c.customer_name,
tp.product_id,
tp.product_name,
tp.total_purchased,
tp.rank_in_customer
FROM customers c
CROSS JOIN LATERAL (
SELECT
p.product_id,
p.product_name,
SUM(oi.quantity) as total_purchased,
ROW_NUMBER() OVER (ORDER BY SUM(oi.quantity) DESC) as rank_in_customer
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.customer_id = c.customer_id
GROUP BY p.product_id, p.product_name
ORDER BY total_purchased DESC
LIMIT 3
) tp
WHERE c.customer_id IN (SELECT customer_id FROM high_value_customers);</code></pre>
</div>
</div>
<h3>Set Operations for Complex Analysis</h3>
<p>Combine result sets to identify patterns, gaps, and overlaps in business data:</p>
<div class="set-operations">
<div class="set-operation">
<h4>Customer Behavior Analysis with EXCEPT</h4>
<pre><code>-- Find customers who purchased in 2023 but not in 2024
WITH customers_2023 AS (
SELECT DISTINCT customer_id
FROM orders
WHERE YEAR(order_date) = 2023
),
customers_2024 AS (
SELECT DISTINCT customer_id
FROM orders
WHERE YEAR(order_date) = 2024
),
churned_customers AS (
SELECT customer_id FROM customers_2023
EXCEPT
SELECT customer_id FROM customers_2024
)
SELECT
cc.customer_id,
c.customer_name,
c.email,
last_order.last_order_date,
last_order.last_order_total,
lifetime_stats.total_orders,
lifetime_stats.lifetime_value
FROM churned_customers cc
JOIN customers c ON cc.customer_id = c.customer_id
JOIN (
SELECT
customer_id,
MAX(order_date) as last_order_date,
MAX(order_total) as last_order_total
FROM orders
WHERE customer_id IN (SELECT customer_id FROM churned_customers)
GROUP BY customer_id
) last_order ON cc.customer_id = last_order.customer_id
JOIN (
SELECT
customer_id,
COUNT(*) as total_orders,
SUM(order_total) as lifetime_value
FROM orders
WHERE customer_id IN (SELECT customer_id FROM churned_customers)
GROUP BY customer_id
) lifetime_stats ON cc.customer_id = lifetime_stats.customer_id;</code></pre>
</div>
<div class="set-operation">
<h4>Product Affinity Analysis with INTERSECT</h4>
<pre><code>-- Find products frequently bought together
WITH product_pairs AS (
SELECT
oi1.product_id as product_a,
oi2.product_id as product_b,
COUNT(DISTINCT oi1.order_id) as co_purchase_count
FROM order_items oi1
JOIN order_items oi2 ON oi1.order_id = oi2.order_id
WHERE oi1.product_id < oi2.product_id -- Avoid duplicates and self-pairs
GROUP BY oi1.product_id, oi2.product_id
HAVING COUNT(DISTINCT oi1.order_id) >= 5 -- Minimum co-purchases
),
product_stats AS (
SELECT
product_id,
COUNT(DISTINCT order_id) as individual_purchase_count
FROM order_items
GROUP BY product_id
)
SELECT
pp.product_a,
pa.product_name as product_a_name,
pp.product_b,
pb.product_name as product_b_name,
pp.co_purchase_count,
psa.individual_purchase_count as product_a_total,
psb.individual_purchase_count as product_b_total,
ROUND(
(pp.co_purchase_count * 1.0 / LEAST(psa.individual_purchase_count, psb.individual_purchase_count)) * 100,
2
) as affinity_percentage
FROM product_pairs pp
JOIN products pa ON pp.product_a = pa.product_id
JOIN products pb ON pp.product_b = pb.product_id
JOIN product_stats psa ON pp.product_a = psa.product_id
JOIN product_stats psb ON pp.product_b = psb.product_id
ORDER BY affinity_percentage DESC, co_purchase_count DESC;</code></pre>
</div>
</div>
</section>
<section id="analytical-functions">
<h2>Analytical and Statistical Functions</h2>
<p>Modern SQL provides extensive statistical and analytical functions for advanced business intelligence without requiring external tools.</p>
<h3>Statistical Aggregates</h3>
<p>Calculate comprehensive statistics for business metrics:</p>
<div class="statistical-functions">
<h4>Comprehensive Revenue Analysis</h4>
<pre><code>-- Advanced statistical analysis of revenue by region
SELECT
region,
COUNT(*) as customer_count,
-- Central tendency measures
AVG(annual_revenue) as mean_revenue,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY annual_revenue) as median_revenue,
MODE() WITHIN GROUP (ORDER BY annual_revenue) as modal_revenue,
-- Variability measures
STDDEV(annual_revenue) as revenue_stddev,
VAR(annual_revenue) as revenue_variance,
(STDDEV(annual_revenue) / AVG(annual_revenue)) * 100 as coefficient_of_variation,
-- Distribution measures
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue) as q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) as q3,
PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY annual_revenue) as p90,
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY annual_revenue) as p95,
-- Range measures
MIN(annual_revenue) as min_revenue,
MAX(annual_revenue) as max_revenue,
MAX(annual_revenue) - MIN(annual_revenue) as revenue_range,
-- Outlier detection (IQR method)
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) -
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue) as iqr,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue) -
1.5 * (PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) -
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue)) as lower_outlier_threshold,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) +
1.5 * (PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) -
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue)) as upper_outlier_threshold
FROM customer_revenue_summary
WHERE year = 2024
GROUP BY region
ORDER BY mean_revenue DESC;</code></pre>
</div>
<h3>Correlation and Regression Analysis</h3>
<p>Identify relationships between business metrics using SQL:</p>
<div class="correlation-analysis">
<h4>Marketing Spend vs Revenue Correlation</h4>
<pre><code>-- Calculate correlation between marketing spend and revenue
WITH monthly_metrics AS (
SELECT
DATE_TRUNC('month', metric_date) as month,
SUM(marketing_spend) as total_marketing_spend,
SUM(revenue) as total_revenue,
AVG(customer_satisfaction_score) as avg_satisfaction
FROM business_metrics
WHERE metric_date >= '2024-01-01'
GROUP BY DATE_TRUNC('month', metric_date)
),
correlation_prep AS (
SELECT
month,
total_marketing_spend,
total_revenue,
avg_satisfaction,
AVG(total_marketing_spend) OVER () as mean_marketing,
AVG(total_revenue) OVER () as mean_revenue,
AVG(avg_satisfaction) OVER () as mean_satisfaction,
COUNT(*) OVER () as n
FROM monthly_metrics
)
SELECT
-- Pearson correlation coefficient for marketing spend vs revenue
SUM((total_marketing_spend - mean_marketing) * (total_revenue - mean_revenue)) /
(SQRT(SUM(POWER(total_marketing_spend - mean_marketing, 2))) *
SQRT(SUM(POWER(total_revenue - mean_revenue, 2)))) as marketing_revenue_correlation,
-- Simple linear regression: revenue = a + b * marketing_spend
(n * SUM(total_marketing_spend * total_revenue) - SUM(total_marketing_spend) * SUM(total_revenue)) /
(n * SUM(POWER(total_marketing_spend, 2)) - POWER(SUM(total_marketing_spend), 2)) as regression_slope,
(SUM(total_revenue) -
((n * SUM(total_marketing_spend * total_revenue) - SUM(total_marketing_spend) * SUM(total_revenue)) /
(n * SUM(POWER(total_marketing_spend, 2)) - POWER(SUM(total_marketing_spend), 2))) * SUM(total_marketing_spend)) / n as regression_intercept,
-- R-squared calculation (note: most engines do not allow a select-list item
-- to reference the regression_slope/regression_intercept aliases defined in
-- the same SELECT — compute them in a prior CTE, or inline the expressions,
-- and derive r_squared in an outer query)
1 - (SUM(POWER(total_revenue - (regression_intercept + regression_slope * total_marketing_spend), 2)) /
SUM(POWER(total_revenue - mean_revenue, 2))) as r_squared
FROM correlation_prep;</code></pre>
</div>
</section>
<section id="time-series-analysis">
<h2>Time Series Analysis in SQL</h2>
<p>Time series analysis capabilities in SQL enable trend analysis, seasonality detection, and forecasting essential for business planning.</p>
<h3>Trend Analysis and Decomposition</h3>
<p>Identify underlying trends and seasonal patterns in business data:</p>
<div class="time-series-example">
<h4>Sales Trend and Seasonality Analysis</h4>
<pre><code>-- Comprehensive time series decomposition
WITH daily_sales AS (
SELECT
sale_date,
SUM(sale_amount) as daily_revenue,
EXTRACT(DOW FROM sale_date) as day_of_week,
EXTRACT(MONTH FROM sale_date) as month,
EXTRACT(QUARTER FROM sale_date) as quarter
FROM sales
WHERE sale_date >= '2023-01-01' AND sale_date <= '2024-12-31'
GROUP BY sale_date
),
moving_averages AS (
SELECT
sale_date,
daily_revenue,
day_of_week,
month,
quarter,
-- Various moving averages for trend analysis
AVG(daily_revenue) OVER (
ORDER BY sale_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as ma_7_day,
AVG(daily_revenue) OVER (
ORDER BY sale_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) as ma_30_day,
AVG(daily_revenue) OVER (
ORDER BY sale_date
ROWS BETWEEN 89 PRECEDING AND CURRENT ROW
) as ma_90_day,
-- Exponential moving average (approximate)
daily_revenue * 0.1 +
LAG(daily_revenue, 1, daily_revenue) OVER (ORDER BY sale_date) * 0.9 as ema_approx
FROM daily_sales
),
seasonal_decomposition AS (
SELECT
sale_date,
daily_revenue,
ma_30_day as trend,
daily_revenue - ma_30_day as detrended,
-- Calculate seasonal component by day of week
AVG(daily_revenue - ma_30_day) OVER (
PARTITION BY day_of_week
) as seasonal_dow,
-- Calculate seasonal component by month
AVG(daily_revenue - ma_30_day) OVER (
PARTITION BY month
) as seasonal_month,
-- Residual component
daily_revenue - ma_30_day -
AVG(daily_revenue - ma_30_day) OVER (PARTITION BY day_of_week) as residual
FROM moving_averages
WHERE ma_30_day IS NOT NULL
)
SELECT
sale_date,
daily_revenue,
trend,
seasonal_dow,
seasonal_month,
residual,
-- Reconstruct the time series
trend + seasonal_dow + residual as reconstructed_value,
-- Calculate percentage components
(seasonal_dow / daily_revenue) * 100 as seasonal_dow_pct,
(residual / daily_revenue) * 100 as residual_pct,
-- Trend direction indicators
CASE
WHEN trend > LAG(trend, 7) OVER (ORDER BY sale_date) THEN 'Increasing'
WHEN trend < LAG(trend, 7) OVER (ORDER BY sale_date) THEN 'Decreasing'
ELSE 'Stable'
END as trend_direction
FROM seasonal_decomposition
ORDER BY sale_date;</code></pre>
</div>
<h3>Advanced Time Series Functions</h3>
<p>Utilize specialized time series functions for sophisticated analysis:</p>
<div class="advanced-time-series">
<h4>Change Point Detection and Forecasting</h4>
<pre><code>-- Detect significant changes in business metrics
WITH metric_changes AS (
SELECT
metric_date,
revenue,
LAG(revenue, 1) OVER (ORDER BY metric_date) as prev_revenue,
LAG(revenue, 7) OVER (ORDER BY metric_date) as prev_week_revenue,
LAG(revenue, 30) OVER (ORDER BY metric_date) as prev_month_revenue,
-- Percentage changes
CASE
WHEN LAG(revenue, 1) OVER (ORDER BY metric_date) > 0 THEN
((revenue - LAG(revenue, 1) OVER (ORDER BY metric_date)) /
LAG(revenue, 1) OVER (ORDER BY metric_date)) * 100
END as daily_change_pct,
CASE
WHEN LAG(revenue, 7) OVER (ORDER BY metric_date) > 0 THEN
((revenue - LAG(revenue, 7) OVER (ORDER BY metric_date)) /
LAG(revenue, 7) OVER (ORDER BY metric_date)) * 100
END as weekly_change_pct,
-- Rolling statistics for change point detection
AVG(revenue) OVER (
ORDER BY metric_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) as rolling_30_avg,
STDDEV(revenue) OVER (
ORDER BY metric_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) as rolling_30_stddev
FROM daily_business_metrics
),
change_points AS (
SELECT
metric_date,
revenue,
daily_change_pct,
weekly_change_pct,
rolling_30_avg,
rolling_30_stddev,
-- Z-score for anomaly detection
CASE
WHEN rolling_30_stddev > 0 THEN
(revenue - rolling_30_avg) / rolling_30_stddev
END as z_score,
-- Flag significant changes
CASE
WHEN ABS(daily_change_pct) > 20 THEN 'Significant Daily Change'
WHEN ABS(weekly_change_pct) > 30 THEN 'Significant Weekly Change'
WHEN ABS((revenue - rolling_30_avg) / rolling_30_stddev) > 2 THEN 'Statistical Anomaly'
ELSE 'Normal'
END as change_classification
FROM metric_changes
WHERE rolling_30_stddev IS NOT NULL
),
-- Simple linear trend for forecasting
trend_analysis AS (
SELECT
COUNT(*) as n,
SUM(EXTRACT(DAY FROM metric_date)) as sum_x,
SUM(revenue) as sum_y,
SUM(EXTRACT(DAY FROM metric_date) * revenue) as sum_xy,
SUM(POWER(EXTRACT(DAY FROM metric_date), 2)) as sum_x2,
-- Linear regression coefficients (note: the "n" alias defined above cannot be
-- referenced within the same select list on most engines — use COUNT(*)
-- inline here, or compute slope/intercept in an outer query)
(n * SUM(EXTRACT(DAY FROM metric_date) * revenue) -
SUM(EXTRACT(DAY FROM metric_date)) * SUM(revenue)) /
(n * SUM(POWER(EXTRACT(DAY FROM metric_date), 2)) -
POWER(SUM(EXTRACT(DAY FROM metric_date)), 2)) as slope,
(SUM(revenue) -
((n * SUM(EXTRACT(DAY FROM metric_date) * revenue) -
SUM(EXTRACT(DAY FROM metric_date)) * SUM(revenue)) /
(n * SUM(POWER(EXTRACT(DAY FROM metric_date), 2)) -
POWER(SUM(EXTRACT(DAY FROM metric_date)), 2))) * SUM(EXTRACT(DAY FROM metric_date))) / n as intercept
FROM change_points
WHERE metric_date >= CURRENT_DATE - INTERVAL '90 days'
)
SELECT
cp.metric_date,
cp.revenue,
cp.change_classification,
cp.z_score,
-- Trend line
ta.intercept + ta.slope * EXTRACT(DAY FROM cp.metric_date) as trend_value,
-- Simple forecast (next 7 days)
ta.intercept + ta.slope * (EXTRACT(DAY FROM cp.metric_date) + 7) as forecast_7_days
FROM change_points cp
CROSS JOIN trend_analysis ta
ORDER BY cp.metric_date;</code></pre>
</div>
</section>
<section id="performance-optimization">
<h2>Query Optimization Strategies</h2>
<p>Advanced SQL analytics requires optimization techniques to handle large datasets efficiently while maintaining query readability and maintainability.</p>
<h3>Index Strategy for Analytics</h3>
<p>Design indexes specifically for analytical workloads:</p>
<div class="index-strategies">
<div class="index-strategy">
<h4>Composite Indexes for Window Functions</h4>
<pre><code>-- Optimize window function queries with proper indexing
-- Index design for partition by + order by patterns
-- For queries with PARTITION BY customer_id ORDER BY order_date
CREATE INDEX idx_orders_customer_date_analytics ON orders (
customer_id, -- Partition column first
order_date, -- Order by column second
order_total -- Include frequently selected columns
);
-- For time series analysis queries
CREATE INDEX idx_sales_date_analytics ON sales (
sale_date, -- Primary ordering column
product_category, -- Common partition column
region -- Secondary partition column
) INCLUDE (
sale_amount, -- Avoid key lookups
quantity,
customer_id
);
-- For ranking queries within categories
CREATE INDEX idx_products_category_ranking ON products (
category_id, -- Partition column
total_sales DESC -- Order by column with sort direction
) INCLUDE (
product_name,
price,
stock_level
);</code></pre>
</div>
<div class="index-strategy">
<h4>Filtered Indexes for Specific Analytics</h4>
<pre><code>-- Create filtered indexes for specific analytical scenarios
-- Index for active customers only
CREATE INDEX idx_orders_active_customers ON orders (
customer_id,
order_date DESC
)
WHERE order_date >= '2023-01-01' -- filtered index predicates must be deterministic; GETDATE()/DATEADD(..., GETDATE()) is not allowed here
INCLUDE (order_total, product_count);
-- Index for high-value transactions
CREATE INDEX idx_orders_high_value ON orders (
order_date,
customer_id
)
WHERE order_total >= 1000
INCLUDE (order_total, discount_amount);
-- Index for specific time periods (quarterly analysis)
CREATE INDEX idx_sales_current_quarter ON sales (
product_id,
sale_date
)
WHERE sale_date >= '2024-10-01' -- use a literal date: non-deterministic expressions like GETDATE() are invalid in filtered index predicates
INCLUDE (sale_amount, quantity);</code></pre>
</div>
</div>
<h3>Query Optimization Techniques</h3>
<p>Apply specific optimization patterns for complex analytical queries:</p>
<div class="optimization-techniques">
<div class="technique">
<h4>Avoiding Redundant Window Function Calculations</h4>
<pre><code>-- INEFFICIENT: Multiple similar window function calls
SELECT
customer_id,
order_date,
order_total,
SUM(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_total,
AVG(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_avg,
COUNT(*) OVER (PARTITION BY customer_id ORDER BY order_date) as running_count,
MAX(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_max
FROM orders;
-- EFFICIENT: Calculate once, derive others
WITH base_calculations AS (
SELECT
customer_id,
order_date,
order_total,
SUM(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_total,
COUNT(*) OVER (PARTITION BY customer_id ORDER BY order_date) as running_count,
MAX(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_max
FROM orders
)
SELECT
customer_id,
order_date,
order_total,
running_total,
running_total / running_count as running_avg, -- Derive from existing calculations
running_count,
running_max
FROM base_calculations;</code></pre>
</div>
<div class="technique">
<h4>Optimizing Large Aggregations</h4>
<pre><code>-- Use materialized views for frequently accessed aggregations
CREATE MATERIALIZED VIEW mv_customer_monthly_stats AS
SELECT
customer_id,
DATE_TRUNC('month', order_date) as order_month,
COUNT(*) as order_count,
SUM(order_total) as total_revenue,
AVG(order_total) as avg_order_value,
MAX(order_date) as last_order_date
FROM orders
GROUP BY customer_id, DATE_TRUNC('month', order_date);
-- Create appropriate indexes on materialized view
CREATE INDEX idx_mv_customer_monthly_customer_month
ON mv_customer_monthly_stats (customer_id, order_month);
-- Use partitioning for very large fact tables
CREATE TABLE sales_partitioned (
sale_id BIGINT,
sale_date DATE,
customer_id INT,
product_id INT,
sale_amount DECIMAL(10,2),
region VARCHAR(50)
)
PARTITION BY RANGE COLUMNS (sale_date) ( -- RANGE COLUMNS is required to partition on DATE values with date literals
PARTITION p2023 VALUES LESS THAN ('2024-01-01'),
PARTITION p2024_q1 VALUES LESS THAN ('2024-04-01'),
PARTITION p2024_q2 VALUES LESS THAN ('2024-07-01'),
PARTITION p2024_q3 VALUES LESS THAN ('2024-10-01'),
PARTITION p2024_q4 VALUES LESS THAN ('2025-01-01')
);</code></pre>
</div>
</div>
</section>
<section id="data-quality">
<h2>Data Quality and Validation</h2>
<p>Robust data quality checks ensure analytical results are reliable and trustworthy. Implement comprehensive validation within your SQL analytics workflows.</p>
<h3>Comprehensive Data Quality Framework</h3>
<p>Build systematic data quality checks into analytical processes:</p>
<div class="data-quality-example">
<h4>Multi-Dimensional Data Quality Assessment</h4>
<pre><code>-- Comprehensive data quality assessment query
WITH data_quality_metrics AS (
SELECT
'orders' as table_name,
COUNT(*) as total_records,
-- Completeness checks
COUNT(*) - COUNT(customer_id) as missing_customer_id,
COUNT(*) - COUNT(order_date) as missing_order_date,
COUNT(*) - COUNT(order_total) as missing_order_total,
-- Validity checks
SUM(CASE WHEN order_total < 0 THEN 1 ELSE 0 END) as negative_amounts,
SUM(CASE WHEN order_date > CURRENT_DATE THEN 1 ELSE 0 END) as future_dates,
SUM(CASE WHEN order_date < '2020-01-01' THEN 1 ELSE 0 END) as very_old_dates,
-- Consistency checks
SUM(CASE WHEN order_total != (
SELECT SUM(oi.quantity * oi.unit_price)
FROM order_items oi
WHERE oi.order_id = o.order_id
) THEN 1 ELSE 0 END) as inconsistent_totals,
-- Uniqueness checks
COUNT(*) - COUNT(DISTINCT order_id) as duplicate_order_ids,
-- Range checks
SUM(CASE WHEN order_total > 10000 THEN 1 ELSE 0 END) as potentially_high_amounts,
-- Statistical outliers (using IQR method)
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY order_total) as q3,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY order_total) as q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY order_total) -
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY order_total) as iqr
FROM orders o
WHERE order_date >= '2024-01-01'
),
quality_summary AS (
SELECT
table_name,
total_records,
-- Calculate quality percentages
ROUND((1.0 - (missing_customer_id * 1.0 / total_records)) * 100, 2) as customer_id_completeness,
ROUND((1.0 - (missing_order_date * 1.0 / total_records)) * 100, 2) as order_date_completeness,
ROUND((1.0 - (missing_order_total * 1.0 / total_records)) * 100, 2) as order_total_completeness,
ROUND((1.0 - (negative_amounts * 1.0 / total_records)) * 100, 2) as amount_validity,
ROUND((1.0 - (future_dates * 1.0 / total_records)) * 100, 2) as date_validity,
ROUND((1.0 - (inconsistent_totals * 1.0 / total_records)) * 100, 2) as total_consistency,
ROUND((1.0 - (duplicate_order_ids * 1.0 / total_records)) * 100, 2) as id_uniqueness,
-- Outlier detection
q1 - 1.5 * iqr as lower_outlier_threshold,
q3 + 1.5 * iqr as upper_outlier_threshold,
-- Overall quality score (weighted average)
ROUND((
(1.0 - (missing_customer_id * 1.0 / total_records)) * 0.2 +
(1.0 - (missing_order_date * 1.0 / total_records)) * 0.2 +
(1.0 - (missing_order_total * 1.0 / total_records)) * 0.2 +
(1.0 - (negative_amounts * 1.0 / total_records)) * 0.15 +
(1.0 - (future_dates * 1.0 / total_records)) * 0.1 +
(1.0 - (inconsistent_totals * 1.0 / total_records)) * 0.1 +
(1.0 - (duplicate_order_ids * 1.0 / total_records)) * 0.05
) * 100, 2) as overall_quality_score
FROM data_quality_metrics
)
SELECT
table_name,
total_records,
customer_id_completeness || '%' as customer_id_completeness,
order_date_completeness || '%' as order_date_completeness,
order_total_completeness || '%' as order_total_completeness,
amount_validity || '%' as amount_validity,
date_validity || '%' as date_validity,
total_consistency || '%' as total_consistency,
id_uniqueness || '%' as id_uniqueness,
overall_quality_score || '%' as overall_quality_score,
CASE
WHEN overall_quality_score >= 95 THEN 'Excellent'
WHEN overall_quality_score >= 90 THEN 'Good'
WHEN overall_quality_score >= 80 THEN 'Acceptable'
WHEN overall_quality_score >= 70 THEN 'Poor'
ELSE 'Critical'
END as quality_rating
FROM quality_summary;</code></pre>
</div>
<h3>Automated Data Quality Monitoring</h3>
<p>Implement ongoing data quality monitoring with automated alerts:</p>
<div class="monitoring-example">
<h4>Daily Data Quality Dashboard</h4>
<pre><code>-- Create automated data quality monitoring
CREATE OR REPLACE VIEW daily_data_quality_dashboard AS
WITH daily_metrics AS (
SELECT
CURRENT_DATE as check_date,
'daily_sales' as table_name,
-- Volume checks
COUNT(*) as record_count,
COUNT(*) - LAG(COUNT(*), 1) OVER (ORDER BY DATE(created_at)) as volume_change,
-- Completeness monitoring
COUNT(CASE WHEN sale_amount IS NULL THEN 1 END) as missing_amounts,
COUNT(CASE WHEN customer_id IS NULL THEN 1 END) as missing_customers,
-- Freshness checks
MAX(created_at) as latest_record,
EXTRACT(HOUR FROM (CURRENT_TIMESTAMP - MAX(created_at))) as hours_since_latest,
-- Business rule validation
COUNT(CASE WHEN sale_amount <= 0 THEN 1 END) as invalid_amounts,
COUNT(CASE WHEN sale_date > CURRENT_DATE THEN 1 END) as future_sales,
-- Statistical monitoring
AVG(sale_amount) as avg_sale_amount,
STDDEV(sale_amount) as stddev_sale_amount
FROM sales
WHERE DATE(created_at) = CURRENT_DATE
GROUP BY DATE(created_at)
),
quality_alerts AS (
SELECT
*,
CASE
WHEN ABS(volume_change) > (record_count * 0.2) THEN 'Volume Alert: >20% change'
WHEN missing_amounts > (record_count * 0.05) THEN 'Completeness Alert: >5% missing amounts'
WHEN hours_since_latest > 2 THEN 'Freshness Alert: Data older than 2 hours'
WHEN invalid_amounts > 0 THEN 'Validity Alert: Invalid amounts detected'
WHEN future_sales > 0 THEN 'Logic Alert: Future sales detected'
ELSE 'No alerts'
END as alert_status,
CASE
WHEN hours_since_latest > 4 OR invalid_amounts > (record_count * 0.1) THEN 'Critical'
WHEN ABS(volume_change) > (record_count * 0.2) OR missing_amounts > (record_count * 0.05) THEN 'Warning'
ELSE 'Normal'
END as severity_level
FROM daily_metrics
)
SELECT
check_date,
table_name,
record_count,
volume_change,
ROUND((1.0 - missing_amounts * 1.0 / record_count) * 100, 2) as amount_completeness_pct,
hours_since_latest,
invalid_amounts,
alert_status,
severity_level,
-- Quality score calculation
CASE
WHEN severity_level = 'Critical' THEN 0
WHEN severity_level = 'Warning' THEN 70
ELSE 100
END as daily_quality_score
FROM quality_alerts;</code></pre>
</div>
</section>
<section id="practical-examples">
<h2>Real-World Business Cases</h2>
<p>Apply advanced SQL techniques to solve complex business problems across different industries and use cases.</p>
<h3>Customer Lifetime Value Analysis</h3>
<p>Calculate sophisticated CLV metrics using advanced SQL patterns:</p>
<div class="clv-example">
<h4>Predictive Customer Lifetime Value</h4>
<pre><code>-- Advanced CLV calculation with cohort analysis and predictive elements
WITH customer_cohorts AS (
SELECT
customer_id,
MIN(order_date) as first_order_date,
DATE_TRUNC('month', MIN(order_date)) as cohort_month
FROM orders
GROUP BY customer_id
),
monthly_customer_activity AS (
SELECT
c.customer_id,
c.cohort_month,
DATE_TRUNC('month', o.order_date) as activity_month,
EXTRACT(EPOCH FROM (DATE_TRUNC('month', o.order_date) - c.cohort_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month') as period_number,
COUNT(DISTINCT o.order_id) as orders_count,
SUM(o.order_total) as revenue,
AVG(o.order_total) as avg_order_value
FROM customer_cohorts c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.cohort_month, DATE_TRUNC('month', o.order_date)
),
retention_rates AS (
SELECT
cohort_month,
period_number,
COUNT(DISTINCT customer_id) as customers_active,
FIRST_VALUE(COUNT(DISTINCT customer_id)) OVER (
PARTITION BY cohort_month
ORDER BY period_number
) as cohort_size,
COUNT(DISTINCT customer_id) * 1.0 /
FIRST_VALUE(COUNT(DISTINCT customer_id)) OVER (
PARTITION BY cohort_month
ORDER BY period_number
) as retention_rate
FROM monthly_customer_activity
GROUP BY cohort_month, period_number
),
customer_metrics AS (
SELECT
c.customer_id,
c.cohort_month,
COUNT(DISTINCT mca.activity_month) as active_months,
SUM(mca.revenue) as total_revenue,
AVG(mca.revenue) as avg_monthly_revenue,
MAX(mca.activity_month) as last_active_month,
-- Calculate customer age in months
EXTRACT(EPOCH FROM (COALESCE(MAX(mca.activity_month), CURRENT_DATE) - c.cohort_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month') as customer_age_months,
-- Historical CLV (actual)
SUM(mca.revenue) as historical_clv,
-- Frequency and monetary components
COUNT(DISTINCT mca.activity_month) * 1.0 /
NULLIF(EXTRACT(EPOCH FROM (MAX(mca.activity_month) - c.cohort_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month'), 0) as purchase_frequency,
SUM(mca.revenue) / NULLIF(COUNT(DISTINCT mca.activity_month), 0) as avg_revenue_per_active_month
FROM customer_cohorts c
LEFT JOIN monthly_customer_activity mca ON c.customer_id = mca.customer_id
GROUP BY c.customer_id, c.cohort_month
),
predictive_clv AS (
SELECT
cm.*,
-- Get cohort-level retention curve
COALESCE(AVG(rr.retention_rate) OVER (
PARTITION BY cm.cohort_month
), 0.1) as avg_cohort_retention,
-- Predictive CLV calculation
-- Formula: (Average Monthly Revenue × Purchase Frequency × Gross Margin) / (1 + Discount Rate - Retention Rate)
-- Note: avg_cohort_retention is an alias defined in this same SELECT list,
-- so it cannot be referenced here; the expression is repeated instead.
CASE
WHEN COALESCE(AVG(rr.retention_rate) OVER (PARTITION BY cm.cohort_month), 0.1) > 0
AND COALESCE(AVG(rr.retention_rate) OVER (PARTITION BY cm.cohort_month), 0.1) < 1 THEN
(COALESCE(avg_revenue_per_active_month, 0) *
COALESCE(purchase_frequency, 0) *
0.3) / -- Assuming 30% gross margin
(1 + 0.01 - COALESCE(AVG(rr.retention_rate) OVER (PARTITION BY cm.cohort_month), 0.1)) -- 1% monthly discount rate
ELSE historical_clv
END as predicted_clv,
-- Risk segmentation
CASE
WHEN EXTRACT(EPOCH FROM (CURRENT_DATE - last_active_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month') > 6 THEN 'High Risk'
WHEN EXTRACT(EPOCH FROM (CURRENT_DATE - last_active_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month') > 3 THEN 'Medium Risk'
WHEN last_active_month >= CURRENT_DATE - INTERVAL '1 month' THEN 'Active'
ELSE 'Inactive'
END as customer_status,
-- Value tier classification
NTILE(5) OVER (ORDER BY historical_clv) as value_quintile
FROM customer_metrics cm
LEFT JOIN retention_rates rr ON cm.cohort_month = rr.cohort_month
AND ROUND(cm.customer_age_months) = rr.period_number
)
SELECT
customer_id,
cohort_month,
customer_status,
value_quintile,
active_months,
customer_age_months,
ROUND(total_revenue, 2) as historical_clv,
ROUND(predicted_clv, 2) as predicted_clv,
ROUND(avg_revenue_per_active_month, 2) as avg_monthly_revenue,
ROUND(purchase_frequency, 3) as purchase_frequency,
ROUND(avg_cohort_retention, 3) as cohort_retention_rate,
-- Strategic recommendations
CASE
WHEN customer_status = 'Active' AND value_quintile >= 4 THEN 'VIP Program'
WHEN customer_status = 'Active' AND value_quintile = 3 THEN 'Loyalty Program'
WHEN customer_status = 'Medium Risk' AND value_quintile >= 3 THEN 'Retention Campaign'
WHEN customer_status = 'High Risk' AND value_quintile >= 3 THEN 'Win-Back Campaign'
WHEN customer_status = 'Inactive' THEN 'Re-engagement Required'
ELSE 'Standard Marketing'
END as recommended_action
FROM predictive_clv
WHERE predicted_clv > 0
ORDER BY predicted_clv DESC;</code></pre>
</div>
<div class="expert-consultation-cta">
<h3>Need Advanced SQL Analytics Support?</h3>
<p>Our database specialists can help you implement sophisticated SQL analytics solutions that scale with your business requirements.</p>
<a href="../../quote.php?service=sql-analytics" class="btn btn-primary">Get SQL Analytics Consultation</a>
</div>
</section>
</div>
<!-- Related Articles -->
<section class="related-articles">
<h2>Related Articles</h2>
<div class="related-grid">
<article class="related-card">
<h3><a href="business-intelligence-dashboard-design.php">BI Dashboard Design Best Practices</a></h3>
<p>Transform your SQL analytics into compelling visual dashboards for business users.</p>
<span class="read-time">12 min read</span>
</article>
<article class="related-card">
<h3><a href="../categories/business-intelligence.php">More Business Intelligence Articles</a></h3>
<p>Explore our complete collection of business intelligence and data analytics resources.</p>
<span class="read-time">Browse category</span>
</article>
<article class="related-card">
<h3><a href="../../case-studies/">SQL Analytics Success Stories</a></h3>
<p>See real-world examples of advanced SQL implementations across different industries.</p>
<span class="read-time">Multiple studies</span>
</article>
</div>
</section>
</div>
</article>
<!-- CTA Section -->
<section class="cta">
<div class="container">
<div class="cta-content">
<h2>Need Expert SQL Analytics Services?</h2>
<p>Our data engineering team builds high-performance SQL solutions that unlock insights from your business data.</p>
<div class="cta-buttons">
<a href="../../quote.php" class="btn btn-primary">Get Free Consultation</a>
<a href="../../#services" class="btn btn-secondary">Explore Data Services</a>
</div>
</div>
</div>
</section>
</main>
<!-- Footer -->
<footer class="footer">
<div class="container">
<div class="footer-content">
<div class="footer-section">
<div class="footer-logo">
<img src="../../assets/images/logo-white.svg" alt="UK Data Services" loading="lazy">
</div>
<p>Enterprise data intelligence solutions for modern British business. Transform your operations with accurate, actionable insights and regulatory-compliant data services.</p>
</div>
<div class="footer-section">
<h3>Analytics Services</h3>
<ul>
<li><a href="../../#services">SQL Analytics</a></li>
<li><a href="../../#services">Database Optimization</a></li>
<li><a href="../../#services">Business Intelligence</a></li>
<li><a href="../../#services">Data Engineering</a></li>
<li><a href="../../#services">Performance Tuning</a></li>
</ul>
</div>
<div class="footer-section">
<h3>Resources</h3>
<ul>
<li><a href="../">SQL Analytics Blog</a></li>
<li><a href="../../case-studies/">Case Studies</a></li>
<li><a href="../../about.php">About UK Data Services</a></li>
<li><a href="../../project-types.php">Project Types</a></li>
<li><a href="../../quote.php">Request Consultation</a></li>
</ul>
</div>
<div class="footer-section">
<h3>Legal & Support</h3>
<ul>
<li><a href="../../privacy-policy.php">Privacy Policy</a></li>
<li><a href="../../terms-of-service.php">Terms of Service</a></li>
<li><a href="../../cookie-policy.php">Cookie Policy</a></li>
<li><a href="../../gdpr-compliance.php">GDPR Compliance</a></li>
<li><a href="../../#contact">Contact & Support</a></li>
</ul>
</div>
</div>
<div class="footer-bottom">
<p>&copy; <?php echo date('Y'); ?> UK Data Services. All rights reserved.</p>
<div class="social-links">
<a href="https://www.linkedin.com/company/uk-data-services" aria-label="LinkedIn" rel="noopener" target="_blank">
<img src="../../assets/images/icon-linkedin.svg" alt="LinkedIn" loading="lazy">
</a>
<a href="https://twitter.com/ukdataservices" aria-label="Twitter" rel="noopener" target="_blank">
<img src="../../assets/images/icon-twitter.svg" alt="Twitter" loading="lazy">
</a>
</div>
</div>
</div>
</footer>
<!-- Scripts -->
<script src="../../assets/js/main.js"></script>
<!-- Article-specific functionality -->
<script>
document.addEventListener('DOMContentLoaded', function() {
    // --- Table of contents: smooth scrolling with fixed-header offset ---
    const tocLinks = document.querySelectorAll('.article-toc a');
    tocLinks.forEach(link => {
        link.addEventListener('click', function(e) {
            e.preventDefault();
            const targetSection = document.querySelector(this.getAttribute('href'));
            if (targetSection) {
                const headerOffset = 100; // height of the fixed site header, in px
                const offsetPosition =
                    targetSection.getBoundingClientRect().top + window.pageYOffset - headerOffset;
                window.scrollTo({
                    top: offsetPosition,
                    behavior: 'smooth'
                });
            }
        });
    });
    // --- Reading progress indicator ---
    const article = document.querySelector('.article-content');
    // Guard: without the article container, getBoundingClientRect() below
    // would throw a TypeError on every scroll event.
    if (!article) {
        return;
    }
    const progressBar = document.createElement('div');
    progressBar.className = 'reading-progress';
    progressBar.style.cssText = `
        position: fixed;
        top: 70px;
        left: 0;
        width: 0%;
        height: 3px;
        background: linear-gradient(90deg, #179e83, #144784);
        z-index: 999;
        transition: width 0.3s ease;
    `;
    document.body.appendChild(progressBar);
    function updateReadingProgress() {
        const articleRect = article.getBoundingClientRect();
        // Clamp the denominator so short articles (content shorter than the
        // viewport) don't produce NaN or negative widths via division by <= 0.
        const scrollableHeight = Math.max(1, article.offsetHeight - window.innerHeight);
        const scrolled = Math.max(0, -articleRect.top);
        const progress = Math.min(100, (scrolled / scrollableHeight) * 100);
        progressBar.style.width = progress + '%';
    }
    window.addEventListener('scroll', updateReadingProgress);
    updateReadingProgress();
});
</script>
</body>
</html>