📌 Day 7: Time Series & Final Project
Prerequisite Setup
import pandas as pd import numpy as np import matplotlib.pyplot as plt # Set display options pd.set_option('display.max_rows', 10) pd.set_option('display.float_format', '{:.2f}'.format) # Create sample time series data np.random.seed(42) # Generate daily sales data for 2024 dates = pd.date_range('2024-01-01', '2024-12-31', freq='D') n = len(dates) # Create realistic sales pattern with trend, seasonality, and noise trend = np.linspace(100, 150, n) # Upward trend seasonality = 20 * np.sin(2 * np.pi * dates.dayofyear / 365.25 * 2) # Bi-weekly pattern noise = np.random.normal(0, 5, n) # Random variation weekly_pattern = -5 * (dates.dayofweek >= 5) # Lower on weekends sales = trend + seasonality + noise + weekly_pattern sales = np.maximum(sales, 50) # Minimum sales of 50 # Create temperature data (seasonal) temperature = 15 + 10 * np.sin(2 * np.pi * dates.dayofyear / 365.25 - np.pi/2) + np.random.normal(0, 2, n) df = pd.DataFrame({ 'date': dates, 'sales': sales, 'temperature': temperature, 'customers': np.random.poisson(50, n) + (sales / 10).astype(int) }) print("Original Dataset:") print(df.head(10)) print(f"\nShape: {df.shape}") print(f"Date range: {df['date'].min()} to {df['date'].max()}") print("\n" + "="*60)
1️⃣ Convert string dates to datetime and set index
Why datetime index?
Enables time-based operations (resampling, shifting, rolling)
Makes plotting easier
Allows date-based slicing
Convert string to datetime
# First, create a DataFrame with string dates df_string = df.copy() df_string['date'] = df_string['date'].astype(str) print("Date as string:") print(df_string[['date']].head()) print(f"Data type: {df_string['date'].dtype}") # Convert to datetime df_string['date'] = pd.to_datetime(df_string['date']) print("\nAfter conversion:") print(df_string[['date']].head()) print(f"Data type: {df_string['date'].dtype}")
Different date formats
# Common date formats and how to parse them date_formats = pd.DataFrame({ 'us_format': ['01/15/2024', '02/20/2024', '03/25/2024'], 'eu_format': ['15/01/2024', '20/02/2024', '25/03/2024'], 'iso_format': ['2024-01-15', '2024-02-20', '2024-03-25'], 'custom': ['2024_Jan_15', '2024_Feb_20', '2024_Mar_25'] }) print("Parsing different date formats:") print("\nUS format (month/day/year):") print(pd.to_datetime(date_formats['us_format'])) print("\nEU format (day/month/year) - need format specifier:") print(pd.to_datetime(date_formats['eu_format'], format='%d/%m/%Y')) print("\nCustom format:") print(pd.to_datetime(date_formats['custom'], format='%Y_%b_%d'))
Set date as index
# Set date column as index (creates DatetimeIndex) df_ts = df.set_index('date') print("DataFrame with datetime index:") print(df_ts.head()) print(f"\nIndex type: {type(df_ts.index)}") print(f"Index name: {df_ts.index.name}") # Benefits of datetime index print("\nAccess by year:") print(df_ts['2024-06'].head()) # All of June 2024 print("\nAccess by date range:") print(df_ts['2024-03-15':'2024-03-20']) print("\nAccess specific month/day:") print(df_ts[df_ts.index.month == 12].head()) # December data
Extract date components
# Once index is datetime, you can extract various components df_ts['year'] = df_ts.index.year df_ts['month'] = df_ts.index.month df_ts['day'] = df_ts.index.day df_ts['dayofweek'] = df_ts.index.dayofweek # Monday=0, Sunday=6 df_ts['quarter'] = df_ts.index.quarter df_ts['is_weekend'] = df_ts.index.dayofweek >= 5 print("\nExtracted date components:") print(df_ts[['sales', 'year', 'month', 'dayofweek', 'is_weekend']].head(10)) # Day of week mapping day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] df_ts['day_name'] = df_ts.index.dayofweek.map(lambda x: day_names[x])
Handling errors in date conversion
# Handling invalid dates bad_dates = pd.Series(['2024-01-01', '2024-02-30', '2024-03-01', 'invalid', '2024-04-01']) # Coerce errors to NaT (Not a Time) converted = pd.to_datetime(bad_dates, errors='coerce') print("\nHandling invalid dates:") print(pd.DataFrame({'original': bad_dates, 'converted': converted})) # Drop NaT values valid_dates = converted.dropna() print(f"Valid dates: {len(valid_dates)} out of {len(bad_dates)}")
2️⃣ Resample daily data to monthly aggregates
What is resampling?
Changing the frequency of time series data (e.g., daily → monthly, hourly → daily).
Basic resampling to monthly
# Resample daily sales to monthly (sum) monthly_sales = df_ts['sales'].resample('M').sum() print("Monthly Sales (sum):") print(monthly_sales) # Resample to monthly average monthly_avg_sales = df_ts['sales'].resample('M').mean() print("\nMonthly Average Sales:") print(monthly_avg_sales)
Different aggregation methods
# Multiple aggregations at once monthly_agg = df_ts.resample('M').agg({ 'sales': ['sum', 'mean', 'min', 'max', 'std'], 'temperature': 'mean', 'customers': 'sum' }) print("\nMonthly aggregations:") print(monthly_agg.round(2)) # Flatten column names monthly_agg.columns = ['_'.join(col).strip() for col in monthly_agg.columns.values] print("\nWith flattened columns:") print(monthly_agg.head())
Resampling to different frequencies
# Weekly aggregations (W = week end Sunday) weekly_sales = df_ts['sales'].resample('W').sum() print("\nWeekly Sales (Sun-Sat weeks):") print(weekly_sales.head(10)) # Quarterly aggregations (Q = quarter end) quarterly_sales = df_ts['sales'].resample('Q').sum() print("\nQuarterly Sales:") print(quarterly_sales) # Annual aggregations (Y = year end) annual_sales = df_ts['sales'].resample('Y').sum() print("\nAnnual Sales:") print(annual_sales)
Different resampling frequencies
# Common frequency codes frequencies = { 'D': 'Daily', 'W': 'Weekly', 'M': 'Month end', 'MS': 'Month start', 'Q': 'Quarter end', 'QS': 'Quarter start', 'Y': 'Year end', 'YS': 'Year start', 'H': 'Hourly', 'T': 'Minutely', 'S': 'Secondly' } print("\nCommon frequency codes:") for code, name in frequencies.items(): print(f" {code}: {name}")
Resampling with custom periods
# First week of each month first_week = df_ts['sales'].resample('W-MON').first() print("\nFirst day of each week (Monday):") print(first_week.head(10)) # Last business day of month last_business_day = df_ts['sales'].resample('BM').last() print("\nLast business day of month:") print(last_business_day.head())
Forward/backward fill in resampling
# Create sparse time series sparse_dates = pd.date_range('2024-01-01', periods=10, freq='2D') sparse_data = pd.Series([100, 110, 105, 120, 115, 130, 125, 140, 135, 150], index=sparse_dates) print("\nSparse time series (every 2 days):") print(sparse_data) # Resample to daily with forward fill daily_ffill = sparse_data.resample('D').ffill() print("\nDaily with forward fill:") print(daily_ffill.head(15)) # Resample to daily with interpolation daily_interp = sparse_data.resample('D').interpolate() print("\nDaily with interpolation:") print(daily_interp.head(15))
3️⃣ Calculate 7-day rolling average
What is rolling window?
Moving average that smooths out short-term fluctuations and highlights longer-term trends.
Basic 7-day rolling average
# Calculate 7-day rolling average df_ts['sales_7d_avg'] = df_ts['sales'].rolling(window=7).mean() print("Sales with 7-day rolling average:") print(df_ts[['sales', 'sales_7d_avg']].head(15))
Different rolling window sizes
# Multiple rolling windows df_ts['sales_7d_avg'] = df_ts['sales'].rolling(window=7).mean() df_ts['sales_30d_avg'] = df_ts['sales'].rolling(window=30).mean() df_ts['sales_90d_avg'] = df_ts['sales'].rolling(window=90).mean() print("\nMultiple rolling averages (first 30 days):") print(df_ts[['sales', 'sales_7d_avg', 'sales_30d_avg', 'sales_90d_avg']].head(30)) # Note: NaN for first window-1 days (e.g., first 6 days for 7-day window)
Different rolling aggregations
# Rolling sum (7-day total) df_ts['sales_7d_sum'] = df_ts['sales'].rolling(window=7).sum() # Rolling standard deviation (volatility) df_ts['sales_7d_std'] = df_ts['sales'].rolling(window=7).std() # Rolling maximum (peak in last 7 days) df_ts['sales_7d_max'] = df_ts['sales'].rolling(window=7).max() # Rolling minimum (trough in last 7 days) df_ts['sales_7d_min'] = df_ts['sales'].rolling(window=7).min() print("\nVarious rolling statistics:") print(df_ts[['sales', 'sales_7d_sum', 'sales_7d_std', 'sales_7d_max', 'sales_7d_min']].head(15))
Center-aligned rolling window
# Center-aligned window (uses past and future) df_ts['sales_7d_center'] = df_ts['sales'].rolling(window=7, center=True).mean() # Compare forward vs centered comparison = df_ts[['sales', 'sales_7d_avg', 'sales_7d_center']].head(20) print("\nForward vs Center-aligned rolling average:") print(comparison)
Rolling with minimum periods
# Require minimum number of observations df_ts['sales_7d_min3'] = df_ts['sales'].rolling(window=7, min_periods=3).mean() print("\nRolling with min_periods=3 (fewer NaNs at start):") print(df_ts[['sales', 'sales_7d_avg', 'sales_7d_min3']].head(10))
Custom rolling functions
# Rolling median (robust to outliers) df_ts['sales_7d_median'] = df_ts['sales'].rolling(window=7).median() # Rolling quantile (e.g., 90th percentile) df_ts['sales_7d_q90'] = df_ts['sales'].rolling(window=7).quantile(0.9) # Custom rolling function using apply def rolling_range(x): return x.max() - x.min() df_ts['sales_7d_range'] = df_ts['sales'].rolling(window=7).apply(rolling_range) print("\nAdvanced rolling statistics:") print(df_ts[['sales', 'sales_7d_median', 'sales_7d_q90', 'sales_7d_range']].head(15))
Expanding window (cumulative)
# Expanding window (all data from start to current) df_ts['sales_cumulative_avg'] = df_ts['sales'].expanding().mean() df_ts['sales_cumulative_max'] = df_ts['sales'].expanding().max() df_ts['sales_cumulative_min'] = df_ts['sales'].expanding().min() print("\nExpanding window statistics (cumulative):") print(df_ts[['sales', 'sales_cumulative_avg', 'sales_cumulative_max']].head(20))
4️⃣ Compute percentage change using pct_change()
What is percentage change?
Calculates the rate of change between current and previous period: (current - previous) / previous * 100
Basic percentage change
# Daily percentage change in sales df_ts['sales_pct_change'] = df_ts['sales'].pct_change() * 100 print("Daily percentage change in sales:") print(df_ts[['sales', 'sales_pct_change']].head(15)) # Note: First value is NaN (no previous period)
Percentage change over different periods
# Weekly percentage change (7 days apart) df_ts['sales_pct_change_7d'] = df_ts['sales'].pct_change(periods=7) * 100 # Monthly percentage change (30 days apart) df_ts['sales_pct_change_30d'] = df_ts['sales'].pct_change(periods=30) * 100 print("\nPercentage change over different periods:") print(df_ts[['sales', 'sales_pct_change', 'sales_pct_change_7d', 'sales_pct_change_30d']].head(30))
Understanding growth rates
# Identify big jumps or drops big_changes = df_ts[abs(df_ts['sales_pct_change']) > 20] print(f"\nDays with >20% change in sales: {len(big_changes)}") print(big_changes[['sales', 'sales_pct_change']].head(10)) # Best and worst days best_day = df_ts['sales_pct_change'].max() worst_day = df_ts['sales_pct_change'].min() print(f"\nBest day growth: {best_day:.2f}%") print(f"Worst day decline: {worst_day:.2f}%")
Cumulative returns
# Calculate cumulative growth (start from 1) df_ts['cumulative_growth'] = (1 + df_ts['sales_pct_change'] / 100).cumprod() print("\nCumulative growth (starting at 1):") print(df_ts[['sales', 'sales_pct_change', 'cumulative_growth']].head(20)) # Growth over full period total_growth = (df_ts['sales'].iloc[-1] / df_ts['sales'].iloc[0] - 1) * 100 print(f"\nTotal growth over period: {total_growth:.2f}%")
Percentage change for multiple columns
# Calculate pct_change for all numeric columns pct_changes = df_ts[['sales', 'temperature', 'customers']].pct_change() * 100 pct_changes.columns = [f'{col}_pct_change' for col in pct_changes.columns] df_ts = pd.concat([df_ts, pct_changes], axis=1) print("\nPercentage changes for all metrics:") print(df_ts[['sales', 'sales_pct_change', 'temperature', 'temperature_pct_change']].head(10))
Correlation with lagged changes
# Check if yesterday's temperature change affects today's sales df_ts['temp_change_lag1'] = df_ts['temperature_pct_change'].shift(1) df_ts['sales_change'] = df_ts['sales_pct_change'] # Correlation correlation = df_ts[['temp_change_lag1', 'sales_change']].corr() print("\nCorrelation between yesterday's temp change and today's sales change:") print(correlation)
5️⃣ Build complete EDA pipeline: load, clean, group, merge, and summarize
Final Project: Complete Sales Analytics Pipeline
print("="*80) print("FINAL PROJECT: COMPLETE EDA PIPELINE") print("="*80) # Step 1: Load multiple data sources print("\n📂 STEP 1: LOADING DATA") print("-"*40) # Sales data (already have df_ts) print("✓ Sales data loaded") # Create additional datasets for merging products = pd.DataFrame({ 'product_id': [1, 2, 3], 'product_name': ['Laptop', 'Mouse', 'Keyboard'], 'category': ['Electronics', 'Accessories', 'Accessories'], 'base_price': [1000, 25, 75] }) # Create daily product sales np.random.seed(42) product_sales = pd.DataFrame({ 'date': pd.date_range('2024-01-01', '2024-12-31', freq='D'), 'product_id': np.random.choice([1, 2, 3], 366), 'quantity': np.random.poisson(10, 366), 'discount': np.random.uniform(0, 0.2, 366) }) # Create store information stores = pd.DataFrame({ 'store_id': [1, 2, 3], 'store_name': ['Downtown', 'Uptown', 'Suburb'], 'region': ['Central', 'North', 'South'] }) # Create daily store sales store_sales = pd.DataFrame({ 'date': pd.date_range('2024-01-01', '2024-12-31', freq='D'), 'store_id': np.random.choice([1, 2, 3], 366), 'sales_amount': np.random.uniform(5000, 15000, 366) }) print("✓ Product data loaded") print("✓ Store data loaded") # Step 2: Clean data print("\n🧹 STEP 2: DATA CLEANING") print("-"*40) # Check for missing values print("Missing values in main dataset:") print(df_ts[['sales', 'temperature', 'customers']].isnull().sum()) # Remove any remaining duplicates df_clean = df_ts.drop_duplicates() print(f"✓ Removed duplicates: {len(df_ts) - len(df_clean)} rows") # Handle outliers in sales (cap at 99th percentile) sales_99 = df_clean['sales'].quantile(0.99) outliers_before = (df_clean['sales'] > sales_99).sum() df_clean['sales_capped'] = df_clean['sales'].clip(upper=sales_99) print(f"✓ Capped {outliers_before} sales outliers at {sales_99:.0f}") # Step 3: Feature engineering print("\n🔧 STEP 3: FEATURE ENGINEERING") print("-"*40) # Add time-based features df_clean['dayofweek'] = df_clean.index.dayofweek df_clean['month'] = df_clean.index.month df_clean['quarter'] = df_clean.index.quarter df_clean['is_weekend'] = df_clean.index.dayofweek >= 5 df_clean['is_month_start'] = df_clean.index.is_month_start df_clean['is_month_end'] = df_clean.index.is_month_end # Add rolling statistics df_clean['sales_7d_avg'] = df_clean['sales'].rolling(7).mean() df_clean['sales_30d_avg'] = df_clean['sales'].rolling(30).mean() df_clean['sales_7d_std'] = df_clean['sales'].rolling(7).std() # Add percentage changes df_clean['sales_daily_growth'] = df_clean['sales'].pct_change() * 100 df_clean['sales_weekly_growth'] = df_clean['sales'].pct_change(7) * 100 print("✓ Added time-based features") print("✓ Added rolling statistics") print("✓ Added growth metrics") # Step 4: Merge datasets print("\n🔗 STEP 4: MERGING DATASETS") print("-"*40) # Merge product sales with product info product_sales_full = pd.merge( product_sales, products, on='product_id', how='left' ) print(f"✓ Merged product sales ({len(product_sales_full)} rows)") # Calculate revenue product_sales_full['revenue'] = ( product_sales_full['quantity'] * product_sales_full['base_price'] * (1 - product_sales_full['discount']) ) # Merge store sales with store info store_sales_full = pd.merge( store_sales, stores, on='store_id', how='left' ) print(f"✓ Merged store sales ({len(store_sales_full)} rows)") # Step 5: Group and aggregate print("\n📊 STEP 5: GROUPING AND AGGREGATION") print("-"*40) # Monthly sales by product category monthly_category_sales = product_sales_full.copy() monthly_category_sales['month'] = pd.to_datetime(monthly_category_sales['date']).dt.to_period('M') category_monthly = monthly_category_sales.groupby(['month', 'category'])['revenue'].sum().unstack() print("\nMonthly revenue by category (first 6 months):") print(category_monthly.head(6).round(2)) # Store performance by region and month store_sales_full['month'] = pd.to_datetime(store_sales_full['date']).dt.to_period('M') store_performance = store_sales_full.groupby(['region', 'month']).agg({ 'sales_amount': ['sum', 'mean', 'count'] }).round(2) print("\nStore performance by region:") print(store_performance.head(10)) # Step 6: Time series analysis print("\n📈 STEP 6: TIME SERIES ANALYSIS") print("-"*40) # Resample main sales data to different frequencies monthly_sales = df_clean['sales'].resample('M').agg(['sum', 'mean', 'std']) quarterly_sales = df_clean['sales'].resample('Q').agg(['sum', 'mean']) print("Monthly sales summary (first 6 months):") print(monthly_sales.head(6).round(2)) print("\nQuarterly sales summary:") print(quarterly_sales.round(2)) # Step 7: Final summary statistics print("\n📋 STEP 7: FINAL SUMMARY REPORT") print("="*80) # Overall metrics total_sales = df_clean['sales'].sum() avg_daily_sales = df_clean['sales'].mean() max_sales_day = df_clean['sales'].max() best_day = df_clean[df_clean['sales'] == max_sales_day].index[0] print(f""" SALES PERFORMANCE SUMMARY {'='*40} Time Period: {df_clean.index.min().date()} to {df_clean.index.max().date()} Total Days: {len(df_clean)} Total Sales: ${total_sales:,.0f} Average Daily Sales: ${avg_daily_sales:.2f} Best Sales Day: {best_day.date()} (${max_sales_day:,.0f}) Sales Growth: {df_clean['sales_daily_growth'].mean():.2f}% avg daily Sales Volatility: ±${df_clean['sales'].std():.2f} std dev Weekend vs Weekday: Weekend Avg: ${df_clean[df_clean['is_weekend']]['sales'].mean():.2f} Weekday Avg: ${df_clean[~df_clean['is_weekend']]['sales'].mean():.2f} Top Performing Month: {monthly_sales['sum'].idxmax().strftime('%B %Y')} (${monthly_sales['sum'].max():,.0f}) """) # Step 8: Create visualizations (optional - if you have matplotlib) print("\n📊 STEP 8: VISUALIZATION READY") print("-"*40) print("Data ready for plotting:") print(" ✓ Daily sales with 7-day moving average") print(" ✓ Monthly aggregated sales") print(" ✓ Sales by product category") print(" ✓ Store performance by region") # Export prepared data print("\n💾 Exporting final datasets:") df_clean.to_csv('cleaned_sales_data.csv') monthly_sales.to_csv('monthly_sales_summary.csv') category_monthly.to_csv('category_monthly_sales.csv') print("\n✓ Exported to CSV files") print("\n🎉 FINAL PROJECT COMPLETE! 🎉")
🎯 Complete Hands-on Checklist
import pandas as pd import numpy as np # Create a complete workflow from scratch print("="*60) print("COMPLETE TIME SERIES WORKFLOW") print("="*60) # 1. Create raw data print("\n1. CREATING RAW DATA") dates = pd.date_range('2024-01-01', periods=100, freq='D') raw_data = pd.DataFrame({ 'date_str': dates.strftime('%Y-%m-%d'), 'sales': np.random.randint(100, 1000, 100), 'transactions': np.random.randint(10, 100, 100) }) print(raw_data.head()) # 2. Convert to datetime and set index print("\n2. CONVERTING TO DATETIME") raw_data['date'] = pd.to_datetime(raw_data['date_str']) ts_data = raw_data.set_index('date').drop('date_str', axis=1) print(ts_data.head()) # 3. Handle missing values (if any) print("\n3. HANDLING MISSING VALUES") print(f"Missing values: {ts_data.isnull().sum().sum()}") ts_data = ts_data.fillna(method='ffill') # 4. Resample to weekly print("\n4. RESAMPLING TO WEEKLY") weekly_data = ts_data.resample('W').agg({ 'sales': 'sum', 'transactions': 'sum' }) print(weekly_data.head()) # 5. Calculate rolling averages print("\n5. CALCULATING ROLLING AVERAGES") ts_data['sales_7d_avg'] = ts_data['sales'].rolling(7).mean() ts_data['sales_30d_avg'] = ts_data['sales'].rolling(30).mean() print(ts_data[['sales', 'sales_7d_avg']].head(15)) # 6. Calculate growth rates print("\n6. CALCULATING GROWTH RATES") ts_data['sales_growth'] = ts_data['sales'].pct_change() * 100 ts_data['transactions_growth'] = ts_data['transactions'].pct_change() * 100 print(ts_data[['sales', 'sales_growth']].head(10)) # 7. Create summary report print("\n7. GENERATING SUMMARY REPORT") summary = { 'Total Sales': ts_data['sales'].sum(), 'Average Daily Sales': ts_data['sales'].mean(), 'Max Daily Sales': ts_data['sales'].max(), 'Min Daily Sales': ts_data['sales'].min(), 'Sales Std Dev': ts_data['sales'].std(), 'Average Daily Growth': ts_data['sales_growth'].mean(), 'Best Growth Day': ts_data['sales_growth'].max(), 'Worst Decline Day': ts_data['sales_growth'].min() } summary_df = pd.DataFrame([summary]) print(summary_df.round(2)) print("\n✅ WORKFLOW COMPLETE!")
❌ Common Mistakes & Solutions
| Mistake | Why it fails | Correct way |
|---|---|---|
| Not setting datetime index | Can't use resample/rolling | df.set_index('date', inplace=True) |
| Wrong frequency code | 'M' vs 'MS' confusion | 'M' = month end, 'MS' = month start |
| Rolling window too small | Too noisy | Use 7 for weekly, 30 for monthly patterns |
| Forgetting pct_change returns decimal | Values < 1 look small | Multiply by 100 for percentages |
| Not handling NaNs in rolling/pct_change | First values are NaN | Use fillna() or dropna() |
📝 Practice Exercises
Load a CSV with dates, convert to datetime, and set as index.
Resample hourly data to daily aggregates (mean, sum, count).
Calculate 14-day rolling average and identify when sales deviated by more than 2 standard deviations.
Compute month-over-month growth rates for a business metric.
Build a complete pipeline that: loads data → cleans → adds time features → groups by month/week → exports summary.
✅ Summary of Day 7
| Operation | Method | Key Parameters |
|---|---|---|
| Date conversion | pd.to_datetime() | format, errors='coerce' |
| Set index | df.set_index('date') | inplace, drop |
| Resampling | df.resample('M').mean() | Frequency code, aggregation |
| Rolling window | df.rolling(window=7).mean() | window, min_periods, center |
| Percentage change | df.pct_change(periods=1) | periods, fill_method |
| Date extraction | df.index.month | year, month, day, dayofweek |
🎓 Pro Tips for Time Series
Always verify timezone - Use
tz_localize()andtz_convert()for timezone-aware dataHandle missing dates - Use
asfreq()to create consistent frequencyCheck for seasonality - Use
.resample()on different periods to detect patternsBe careful with lookahead - Rolling windows should use only past data (not centered)
Validate date ranges - Use
df.index.is_monotonicto check sorting
🚀 Final Challenge: Complete Data Pipeline
def complete_analysis_pipeline(filepath): """ Complete ETL + Analysis pipeline for time series data """ # Load df = pd.read_csv(filepath) # Convert dates df['date'] = pd.to_datetime(df['date']) df = df.set_index('date') # Clean df = df.drop_duplicates() df = df.fillna(method='ffill').fillna(method='bfill') # Feature engineering df['dayofweek'] = df.index.dayofweek df['month'] = df.index.month df['quarter'] = df.index.quarter # Rolling stats df['rolling_7d_mean'] = df['value'].rolling(7).mean() df['rolling_7d_std'] = df['value'].rolling(7).std() # Growth df['pct_change'] = df['value'].pct_change() * 100 # Resample to monthly monthly = df['value'].resample('M').agg(['mean', 'sum', 'count']) # Summary summary = { 'start_date': df.index.min(), 'end_date': df.index.max(), 'total_days': len(df), 'total_value': df['value'].sum(), 'avg_daily': df['value'].mean(), 'growth_rate': df['pct_change'].mean() } return df, monthly, summary # Usage # df_processed, monthly_agg, metrics = complete_analysis_pipeline('your_data.csv')
Discussion (0)