The Foundation of Successful Machine Learning
ISM6251 | Week 3
Missing Data • Filtering • Grouping • Quality Assessment
By the end of this week, you will be able to:
import pandas as pd
import numpy as np
# Load sample dataset
df = pd.read_csv('customer_data.csv')
# Basic quality assessment
print("Dataset Shape:", df.shape)
print("\nColumn Data Types:")
print(df.dtypes)
# Missing value analysis
print("\nMissing Values:")
missing_stats = pd.DataFrame({
'Column': df.columns,
'Missing_Count': df.isnull().sum(),
'Missing_Percent': (df.isnull().sum() / len(df)) * 100
})
missing_stats = missing_stats[missing_stats['Missing_Count'] > 0]
print(missing_stats.sort_values('Missing_Percent', ascending=False))
# Duplicate analysis
print(f"\nDuplicate Rows: {df.duplicated().sum()}")
# Basic statistics
print("\nNumeric Columns Summary:")
print(df.describe())
# Check for inconsistent categories
for col in df.select_dtypes(include='object').columns:
    unique_count = df[col].nunique()
    if unique_count < 20:  # Only list categories when the count is manageable
        print(f"\n{col} unique values: {sorted(df[col].dropna().unique())}")
Customer Data Issues:
ID Name Age Income City
1 Alice 25 50000 NYC
2 Bob 30 NULL new york
3 Alice 25 50000 NYC # Duplicate
4 Charlie 999 75000 N.Y. # Invalid age
5 Diana 28 -10000 NULL # Negative income
6 35 60000 Boston # Missing name
After Data Cleaning:
ID Name Age Income City
1 Alice 25 50000 New York
2 Bob 30 65000 New York # Imputed income
4 Charlie 35 75000 New York # Corrected age, standardized city
5 Diana 28 60000 Boston # Corrected income, imputed city
Common Fixes Applied: Removed duplicates, imputed missing values, standardized city names, corrected obvious data entry errors
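Below is a minimal pandas sketch of how fixes like these could be applied to the toy table above. The DataFrame literal, the city-name mapping, and the choice of median imputation are illustrative assumptions; on the slide, corrections such as Charlie's age would come from the source records rather than from this simple logic.
import pandas as pd
import numpy as np
raw = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5, 6],
    'Name': ['Alice', 'Bob', 'Alice', 'Charlie', 'Diana', np.nan],
    'Age': [25.0, 30.0, 25.0, 999.0, 28.0, 35.0],  # floats so invalid ages can become NaN
    'Income': [50000, np.nan, 50000, 75000, -10000, 60000],
    'City': ['NYC', 'new york', 'NYC', 'N.Y.', np.nan, 'Boston']
})
# 1. Remove duplicate records
clean = raw.drop_duplicates(subset=['Name', 'Age', 'Income']).copy()
# 2. Flag impossible values as missing before imputing
clean.loc[~clean['Age'].between(0, 120), 'Age'] = np.nan
clean.loc[clean['Income'] < 0, 'Income'] = np.nan
# 3. Impute missing numerics with the column median
clean['Age'] = clean['Age'].fillna(clean['Age'].median())
clean['Income'] = clean['Income'].fillna(clean['Income'].median())
# 4. Standardize city spellings with an explicit mapping
city_map = {'NYC': 'New York', 'new york': 'New York', 'N.Y.': 'New York'}
clean['City'] = clean['City'].replace(city_map)
# 5. Drop records missing a required field (Name)
clean = clean.dropna(subset=['Name'])
print(clean)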
# Demonstrate impact of data quality
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
# Create sample data with quality issues
np.random.seed(42)
X_clean = np.random.randn(100, 1)
y_clean = 2 * X_clean.ravel() + np.random.randn(100) * 0.1
# Add outliers and missing values
X_dirty = X_clean.copy()
y_dirty = y_clean.copy()
# Add outliers
X_dirty[95] = 10 # Extreme outlier
y_dirty[95] = -20
# Simulate missing data: mark 10 random observations as unavailable
missing_idx = np.random.choice(100, 10, replace=False)
# Train models
model_clean = LinearRegression()
model_clean.fit(X_clean, y_clean)
# For dirty data, remove missing values first
clean_idx = ~np.isin(range(100), missing_idx)
X_dirty_clean = X_dirty[clean_idx]
y_dirty_clean = y_dirty[clean_idx]
model_dirty = LinearRegression()
model_dirty.fit(X_dirty_clean, y_dirty_clean)
# Compare performance
y_pred_clean = model_clean.predict(X_clean)
y_pred_dirty = model_dirty.predict(X_clean)
print(f"Clean data MSE: {mean_squared_error(y_clean, y_pred_clean):.4f}")
print(f"Dirty data MSE: {mean_squared_error(y_clean, y_pred_dirty):.4f}")
print(f"Performance degradation: {(mean_squared_error(y_clean, y_pred_dirty) / mean_squared_error(y_clean, y_pred_clean) - 1) * 100:.1f}%")
# Analyze missing data patterns
import missingno as msno
# Visualize missing data patterns
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# Matrix plot - shows missing data pattern
msno.matrix(df, ax=axes[0,0])
axes[0,0].set_title('Missing Data Matrix')
# Bar plot - count of missing values per column
msno.bar(df, ax=axes[0,1])
axes[0,1].set_title('Missing Data Count')
# Heatmap - correlation of missing data between columns
msno.heatmap(df, ax=axes[1,0])
axes[1,0].set_title('Missing Data Correlation')
# Dendrogram - hierarchical clustering of missing patterns
msno.dendrogram(df, ax=axes[1,1])
axes[1,1].set_title('Missing Data Clustering')
plt.tight_layout()
plt.show()
# Summarize missingness patterns (a formal MCAR check such as Little's test is beyond this example)
def analyze_missingness(df):
    """Analyze patterns in missing data"""
    missing_by_column = df.isnull().sum()
    missing_patterns = df.isnull().value_counts()
    print("Missing values by column:")
    print(missing_by_column[missing_by_column > 0])
    print("\nMost common missing patterns:")
    print(missing_patterns.head())
    return missing_by_column, missing_patterns
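Calling the helper on the customer DataFrame loaded earlier shows the idea (this assumes that frame is still bound to df; df is reassigned in later examples):
missing_by_col, patterns = analyze_missingness(df)
print(f"Columns with missing data: {(missing_by_col > 0).sum()}")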
Clear Decision Rules:
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Create sample data with missing values
data = {
'age': [25, 30, np.nan, 35, 40, np.nan, 45],
'income': [50000, np.nan, 60000, 70000, np.nan, 80000, 90000],
'city': ['NYC', 'LA', np.nan, 'Chicago', 'NYC', 'LA', 'Chicago']
}
df = pd.DataFrame(data)
print("Original data:")
print(df)
# Simple imputation - numeric
numeric_imputer = SimpleImputer(strategy='median')
df['age_imputed'] = numeric_imputer.fit_transform(df[['age']]).ravel()
# Simple imputation - categorical
cat_imputer = SimpleImputer(strategy='most_frequent')
df['city_imputed'] = cat_imputer.fit_transform(df[['city']]).ravel()
# KNN imputation (numeric only)
# Note: KNN is a machine learning technique we'll study later
# This shows how ML can improve imputation beyond simple statistics
knn_imputer = KNNImputer(n_neighbors=3)
numeric_cols = ['age', 'income']
df_numeric = df[numeric_cols].copy()
df_knn = pd.DataFrame(
knn_imputer.fit_transform(df_numeric),
columns=[col + '_knn' for col in numeric_cols]
)
# Iterative imputation
iter_imputer = IterativeImputer(random_state=42)
df_iter = pd.DataFrame(
iter_imputer.fit_transform(df_numeric),
columns=[col + '_iter' for col in numeric_cols]
)
# Combine results
result = pd.concat([df, df_knn, df_iter], axis=1)
print("\nAfter imputation:")
print(result.round(0))
# Comprehensive imputation workflow
def smart_impute(df, numeric_strategy='median',
                 categorical_strategy='most_frequent'):
    """
    Smart imputation based on data characteristics
    """
    df_imputed = df.copy()
    for column in df.columns:
        missing_pct = df[column].isnull().sum() / len(df) * 100
        print(f"{column}: {missing_pct:.1f}% missing")
        if missing_pct == 0:
            print(" → No missing values")
            continue
        elif missing_pct > 50:
            print(" → Consider dropping column")
            continue
        if df[column].dtype in ['object', 'category']:
            # Categorical variable
            if df[column].nunique() < 10:
                imputer = SimpleImputer(strategy=categorical_strategy)
                df_imputed[column] = imputer.fit_transform(
                    df[[column]]).ravel()
                print(f" → Imputed with {categorical_strategy}")
            else:
                print(" → Too many categories, consider dropping")
        else:
            # Numeric variable
            if abs(df[column].skew()) > 1:  # Highly skewed in either direction
                imputer = SimpleImputer(strategy='median')
                print(" → Skewed data, using median")
            else:
                imputer = SimpleImputer(strategy=numeric_strategy)
                print(f" → Using {numeric_strategy}")
            df_imputed[column] = imputer.fit_transform(
                df[[column]]).ravel()
    return df_imputed
# Apply smart imputation
df_smart = smart_impute(df)
print("\nImputation complete!")
# Create sample customer dataset
np.random.seed(42)
customers = pd.DataFrame({
'customer_id': range(1, 1001),
'age': np.random.randint(18, 80, 1000),
'income': np.random.normal(50000, 20000, 1000),
'purchase_amount': np.random.exponential(100, 1000),
'city': np.random.choice(['NYC', 'LA', 'Chicago', 'Miami'], 1000),
'signup_date': pd.date_range('2020-01-01', '2023-12-31', periods=1000)
})
print(f"Original dataset: {len(customers)} customers")
# Single condition filtering
young_customers = customers[customers['age'] < 30]
print(f"Young customers (age < 30): {len(young_customers)}")
# Multiple conditions with AND
high_value = customers[
(customers['age'] >= 30) &
(customers['income'] > 60000) &
(customers['purchase_amount'] > 150)
]
print(f"High-value customers: {len(high_value)}")
# OR conditions
target_cities = customers[
(customers['city'] == 'NYC') |
(customers['city'] == 'LA')
]
print(f"NYC or LA customers: {len(target_cities)}")
# Using isin() for multiple values
major_cities = customers[customers['city'].isin(['NYC', 'LA', 'Chicago'])]
print(f"Major city customers: {len(major_cities)}")
# String filtering
customers['email'] = [f"user{i}@{'gmail' if i%2 else 'yahoo'}.com"
for i in range(1, 1001)]
gmail_users = customers[customers['email'].str.contains('gmail')]
print(f"Gmail users: {len(gmail_users)}")
# Date filtering
recent_signups = customers[customers['signup_date'] >= '2023-01-01']
print(f"Recent signups (2023+): {len(recent_signups)}")
from scipy import stats
def detect_outliers_zscore(df, column, threshold=3):
"""Detect outliers using Z-score method"""
z_scores = np.abs(stats.zscore(df[column]))
return df[z_scores > threshold]
def detect_outliers_iqr(df, column, multiplier=1.5):
"""Detect outliers using IQR method"""
Q1 = df[column].quantile(0.25)
Q3 = df[column].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - multiplier * IQR
upper_bound = Q3 + multiplier * IQR
outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
return outliers, lower_bound, upper_bound
def detect_outliers_percentile(df, column, lower=5, upper=95):
"""Detect outliers using percentile method"""
lower_bound = df[column].quantile(lower / 100)
upper_bound = df[column].quantile(upper / 100)
outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
return outliers, lower_bound, upper_bound
# Apply outlier detection to income
print("Outlier Detection for Income:")
# Z-score method
income_outliers_z = detect_outliers_zscore(customers, 'income')
print(f"Z-score outliers: {len(income_outliers_z)}")
# IQR method
income_outliers_iqr, lower_iqr, upper_iqr = detect_outliers_iqr(customers, 'income')
print(f"IQR outliers: {len(income_outliers_iqr)}")
print(f" IQR bounds: [{lower_iqr:.0f}, {upper_iqr:.0f}]")
# Percentile method
income_outliers_pct, lower_pct, upper_pct = detect_outliers_percentile(
customers, 'income', 2.5, 97.5)
print(f"Percentile outliers: {len(income_outliers_pct)}")
print(f" Percentile bounds: [{lower_pct:.0f}, {upper_pct:.0f}]")
# Visualize outliers
plt.figure(figsize=(12, 4))
plt.subplot(1, 3, 1)
plt.boxplot(customers['income'])
plt.title('Income Distribution\n(with outliers)')
plt.subplot(1, 3, 2)
customers['income'].hist(bins=50, alpha=0.7)
plt.title('Income Histogram')
plt.subplot(1, 3, 3)
# Remove IQR outliers
clean_income = customers[~customers.index.isin(income_outliers_iqr.index)]
clean_income['income'].hist(bins=50, alpha=0.7, color='green')
plt.title('Income After Outlier Removal')
plt.tight_layout()
plt.show()
# Smart filtering workflow
def smart_filter(df, target_col=None):
"""
Apply progressive filtering with validation
"""
print(f"Starting with {len(df)} records")
df_filtered = df.copy()
# Step 1: Remove obvious errors
if 'age' in df.columns:
df_filtered = df_filtered[
(df_filtered['age'] >= 0) &
(df_filtered['age'] <= 120)
]
print(f"After age validation: {len(df_filtered)} records")
# Step 2: Remove statistical outliers (income)
if 'income' in df.columns:
Q1 = df_filtered['income'].quantile(0.25)
Q3 = df_filtered['income'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 3 * IQR # More conservative
upper_bound = Q3 + 3 * IQR
df_filtered = df_filtered[
(df_filtered['income'] >= lower_bound) &
(df_filtered['income'] <= upper_bound)
]
print(f"After income outlier removal: {len(df_filtered)} records")
# Step 3: Remove low-quality records
if 'purchase_amount' in df.columns:
# Remove very small purchases (potential test data)
df_filtered = df_filtered[df_filtered['purchase_amount'] >= 10]
print(f"After minimum purchase filter: {len(df_filtered)} records")
# Step 4: Validate impact on target (if provided)
if target_col and target_col in df.columns:
original_dist = df[target_col].describe()
filtered_dist = df_filtered[target_col].describe()
print("\nTarget variable distribution:")
print("Original vs Filtered:")
comparison = pd.DataFrame({
'Original': original_dist,
'Filtered': filtered_dist
})
print(comparison.round(2))
return df_filtered
# Apply smart filtering
customers_clean = smart_filter(customers, 'purchase_amount')
# Create quality score
customers_clean['data_quality_score'] = 0
customers_clean['data_quality_score'] += (~customers_clean['age'].isnull()).astype(int)
customers_clean['data_quality_score'] += (~customers_clean['income'].isnull()).astype(int)
customers_clean['data_quality_score'] += (~customers_clean['city'].isnull()).astype(int)
print(f"\nData Quality Score Distribution:")
print(customers_clean['data_quality_score'].value_counts().sort_index())
# Basic grouping operations
print("Customer Analysis by City:")
# Single column grouping
city_stats = customers_clean.groupby('city').agg({
'age': ['mean', 'median', 'std'],
'income': ['mean', 'median', 'count'],
'purchase_amount': ['sum', 'mean', 'max']
}).round(2)
print(city_stats)
# Flatten column names
city_stats.columns = ['_'.join(col).strip() for col in city_stats.columns]
city_stats = city_stats.reset_index()
print("\nFlattened column names:")
print(city_stats.head())
# Multiple column grouping
customers_clean['age_group'] = pd.cut(
customers_clean['age'],
bins=[0, 30, 50, 100],
labels=['Young', 'Middle', 'Senior']
)
age_city_stats = customers_clean.groupby(['age_group', 'city']).agg({
'income': 'mean',
'purchase_amount': 'mean',
'customer_id': 'count'
}).round(2)
age_city_stats.columns = ['avg_income', 'avg_purchase', 'customer_count']
print("\nAge Group & City Analysis:")
print(age_city_stats)
# Percentage analysis
city_pcts = customers_clean.groupby('city').size()
city_pcts_norm = (city_pcts / city_pcts.sum() * 100).round(1)
print(f"\nCustomer Distribution by City:")
for city, pct in city_pcts_norm.items():
print(f"{city}: {pct}%")
# Advanced aggregation techniques
# Custom aggregation functions
def coefficient_of_variation(series):
"""Calculate coefficient of variation (std/mean)"""
return series.std() / series.mean() if series.mean() != 0 else 0
def percentile_95(series):
"""Calculate 95th percentile"""
return series.quantile(0.95)
def outlier_count(series):
"""Count outliers using IQR method"""
Q1, Q3 = series.quantile([0.25, 0.75])
IQR = Q3 - Q1
outliers = series[(series < Q1 - 1.5*IQR) | (series > Q3 + 1.5*IQR)]
return len(outliers)
# Named aggregations with custom functions
advanced_stats = customers_clean.groupby('city').agg(
avg_income=('income', 'mean'),
median_income=('income', 'median'),
income_cv=('income', coefficient_of_variation),
high_income_95pct=('income', percentile_95),
income_outliers=('income', outlier_count),
total_purchases=('purchase_amount', 'sum'),
avg_purchase=('purchase_amount', 'mean'),
purchase_cv=('purchase_amount', coefficient_of_variation),
customer_count=('customer_id', 'count'),
age_range=('age', lambda x: x.max() - x.min())
).round(2)
print("Advanced City Statistics:")
print(advanced_stats)
# Conditional aggregations
def high_value_customers(group):
"""Calculate metrics for high-value customers only"""
high_value = group[group['purchase_amount'] > group['purchase_amount'].median()]
return pd.Series({
'high_value_count': len(high_value),
'high_value_avg_income': high_value['income'].mean(),
'high_value_avg_purchase': high_value['purchase_amount'].mean()
})
high_value_stats = customers_clean.groupby('city').apply(high_value_customers).round(2)
print("\nHigh-Value Customer Analysis:")
print(high_value_stats)
# Time-based analysis
customers_clean['signup_year'] = customers_clean['signup_date'].dt.year
customers_clean['signup_month'] = customers_clean['signup_date'].dt.month
customers_clean['signup_quarter'] = customers_clean['signup_date'].dt.quarter
customers_clean['signup_day_name'] = customers_clean['signup_date'].dt.day_name()
# Yearly trends
yearly_trends = customers_clean.groupby('signup_year').agg({
'customer_id': 'count',
'age': 'mean',
'income': 'mean',
'purchase_amount': 'mean'
}).round(2)
yearly_trends.columns = ['new_customers', 'avg_age', 'avg_income', 'avg_purchase']
print("Yearly Signup Trends:")
print(yearly_trends)
# Monthly seasonality
monthly_patterns = customers_clean.groupby('signup_month').agg({
'customer_id': 'count',
'purchase_amount': 'mean'
}).round(2)
monthly_patterns.columns = ['signup_count', 'avg_purchase']
monthly_patterns.index.name = 'month'
print("\nMonthly Signup Patterns:")
print(monthly_patterns)
# Day of week analysis
dow_patterns = customers_clean.groupby('signup_day_name').agg({
'customer_id': 'count',
'purchase_amount': 'mean'
}).round(2)
# Reorder by day of week
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_patterns = dow_patterns.reindex(day_order)
dow_patterns.columns = ['signup_count', 'avg_purchase']
print("\nDay of Week Patterns:")
print(dow_patterns)
# Create time-based features for ML
customers_clean['days_since_signup'] = (
pd.Timestamp.now() - customers_clean['signup_date']
).dt.days
customers_clean['is_weekend_signup'] = customers_clean['signup_day_name'].isin(['Saturday', 'Sunday'])
customers_clean['is_q4_signup'] = customers_clean['signup_quarter'] == 4
print(f"\nTime-based features created:")
print(f"Average days since signup: {customers_clean['days_since_signup'].mean():.0f}")
print(f"Weekend signups: {customers_clean['is_weekend_signup'].sum()}")
print(f"Q4 signups: {customers_clean['is_q4_signup'].sum()}")
class DataPreparationPipeline:
"""
Comprehensive data preparation pipeline
"""
def __init__(self, config=None):
self.config = config or {
'missing_threshold': 0.5, # 50% threshold for dropping columns
'outlier_method': 'iqr',
'outlier_multiplier': 1.5
}
self.transformations = []
self.quality_report = {}
def assess_quality(self, df):
"""Generate comprehensive data quality report"""
report = {
'shape': df.shape,
'missing_data': {},
'duplicates': df.duplicated().sum(),
'data_types': df.dtypes.to_dict(),
'numeric_stats': {},
'categorical_stats': {}
}
# Missing data analysis
for col in df.columns:
missing_count = df[col].isnull().sum()
missing_pct = missing_count / len(df)
report['missing_data'][col] = {
'count': missing_count,
'percentage': missing_pct
}
# Numeric statistics
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
report['numeric_stats'][col] = {
'mean': df[col].mean(),
'std': df[col].std(),
'min': df[col].min(),
'max': df[col].max(),
'outliers_iqr': self._count_outliers_iqr(df[col])
}
# Categorical statistics
cat_cols = df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
report['categorical_stats'][col] = {
'unique_count': df[col].nunique(),
'top_value': df[col].mode().iloc[0] if len(df[col].mode()) > 0 else None,
'value_counts': df[col].value_counts().head().to_dict()
}
self.quality_report = report
return report
def _count_outliers_iqr(self, series):
"""Count outliers using IQR method"""
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
multiplier = self.config['outlier_multiplier']
lower_bound = Q1 - multiplier * IQR
upper_bound = Q3 + multiplier * IQR
outliers = series[(series < lower_bound) | (series > upper_bound)]
return len(outliers)
def handle_missing_data(self, df):
"""Handle missing data based on configuration"""
df_clean = df.copy()
for col in df.columns:
missing_pct = df[col].isnull().sum() / len(df)
if missing_pct > self.config['missing_threshold']:
print(f"Dropping column {col} (>{self.config['missing_threshold']*100:.0f}% missing)")
df_clean = df_clean.drop(columns=[col])
elif missing_pct > 0:
if df[col].dtype in ['object', 'category']:
# Categorical: use mode
mode_value = df[col].mode().iloc[0] if len(df[col].mode()) > 0 else 'Unknown'
df_clean[col] = df_clean[col].fillna(mode_value)
print(f"Imputed {col} with mode: {mode_value}")
else:
# Numeric: use median
median_value = df[col].median()
df_clean[col] = df_clean[col].fillna(median_value)
print(f"Imputed {col} with median: {median_value}")
return df_clean
def handle_outliers(self, df):
"""Handle outliers in numeric columns"""
df_clean = df.copy()
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
outliers_before = self._count_outliers_iqr(df_clean[col])
if outliers_before > 0:
Q1 = df_clean[col].quantile(0.25)
Q3 = df_clean[col].quantile(0.75)
IQR = Q3 - Q1
multiplier = self.config['outlier_multiplier']
lower_bound = Q1 - multiplier * IQR
upper_bound = Q3 + multiplier * IQR
# Cap outliers instead of removing
df_clean[col] = df_clean[col].clip(lower_bound, upper_bound)
outliers_after = self._count_outliers_iqr(df_clean[col])
print(f"Capped {outliers_before - outliers_after} outliers in {col}")
return df_clean
def remove_duplicates(self, df):
"""Remove duplicate records"""
initial_count = len(df)
df_clean = df.drop_duplicates()
duplicates_removed = initial_count - len(df_clean)
if duplicates_removed > 0:
print(f"Removed {duplicates_removed} duplicate records")
return df_clean
def fit_transform(self, df):
"""Apply full preparation pipeline"""
print("Starting data preparation pipeline...")
print(f"Initial dataset shape: {df.shape}")
# Step 1: Quality assessment
print("\n1. Assessing data quality...")
self.assess_quality(df)
# Step 2: Remove duplicates
print("\n2. Removing duplicates...")
df_clean = self.remove_duplicates(df)
# Step 3: Handle missing data
print("\n3. Handling missing data...")
df_clean = self.handle_missing_data(df_clean)
# Step 4: Handle outliers
print("\n4. Handling outliers...")
df_clean = self.handle_outliers(df_clean)
print(f"\nFinal dataset shape: {df_clean.shape}")
print("Data preparation pipeline completed!")
return df_clean
# Apply the pipeline
pipeline = DataPreparationPipeline()
customers_final = pipeline.fit_transform(customers)
print("\nQuality Report Summary:")
print(f"Original duplicates: {pipeline.quality_report['duplicates']}")
print(f"Columns with missing data: {sum(1 for col, stats in pipeline.quality_report['missing_data'].items() if stats['count'] > 0)}")
print(f"Numeric columns: {len(pipeline.quality_report['numeric_stats'])}")
print(f"Categorical columns: {len(pipeline.quality_report['categorical_stats'])}")
Data Preparation Decision Tree (a code sketch of the column-level rules follows after the tree):
MISSING DATA COLUMNS:
├── > 50% missing?
│ ├── YES → DROP COLUMN
│ └── NO → Continue
├── Irrelevant to prediction task?
│ ├── YES → DROP COLUMN
│ └── NO → Continue
├── 10-50% missing in important column?
│ ├── YES → IMPUTE
│ │ ├── Categorical → MODE imputation
│ │ └── Numeric →
│ │ ├── Skewed → MEDIAN imputation
│ │ ├── Normal → MEAN imputation
│ │ └── Complex → KNN/Iterative imputation
│ └── NO → Continue to row analysis
MISSING DATA ROWS:
├── < 5% of rows affected?
│ ├── YES →
│ │ ├── Large dataset (>10k) + MCAR → DROP ROWS
│ │ └── Small dataset OR MAR/MNAR → IMPUTE
│ └── NO → Check next threshold
├── 5-10% of rows affected (GRAY AREA)?
│ ├── Consider: Dataset size, column importance, pattern
│ └── Default → IMPUTE if important columns
├── > 10% of rows affected?
│ ├── YES → MUST IMPUTE (dropping loses too much data)
OUTLIERS:
├── Obviously incorrect (age=999, negative prices)?
│ ├── YES → CORRECT or REMOVE
│ └── NO → Continue
├── Valid but extreme (high earner, rare event)?
│ ├── YES → KEEP (valuable information)
│ └── NO → Continue
├── Dataset size < 1000 rows?
│ ├── YES → CAP outliers (preserve sample size)
│ └── NO → REMOVE or CAP based on impact
CATEGORICAL VARIABLES:
├── > 50 unique values?
│ ├── YES → Consider DROP or GROUP rare categories
│ └── NO → Continue
├── 10-50 unique values?
│ ├── YES → Frequency/Target encoding
│ └── NO → One-hot encoding (if < 10 categories)
FINAL VALIDATION:
✓ Target variable distribution preserved?
✓ Key statistical relationships maintained?
✓ Sufficient sample size for modeling?
✓ Business logic and domain knowledge satisfied?
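The column-level branch of this tree maps directly to code. The sketch below is one possible implementation, assuming a hypothetical helper named triage_missing_columns and the thresholds from the tree (drop above 50% missing; mode for low-cardinality categoricals; median vs. mean depending on skew). It illustrates the rules rather than prescribing the assignment's solution.
import pandas as pd
import numpy as np
def triage_missing_columns(df, drop_threshold=0.50, skew_cutoff=1.0):
    """Apply the column-level rules from the decision tree above."""
    df_out = df.copy()
    decisions = {}
    for col in df.columns:
        missing_pct = df[col].isnull().mean()
        if missing_pct == 0:
            decisions[col] = 'keep (no missing values)'
        elif missing_pct > drop_threshold:
            df_out = df_out.drop(columns=[col])
            decisions[col] = f'drop ({missing_pct:.0%} missing)'
        elif df[col].dtype in ['object', 'category']:
            df_out[col] = df_out[col].fillna(df[col].mode().iloc[0])
            decisions[col] = 'impute with mode'
        elif abs(df[col].skew()) > skew_cutoff:
            df_out[col] = df_out[col].fillna(df[col].median())
            decisions[col] = 'impute with median (skewed)'
        else:
            df_out[col] = df_out[col].fillna(df[col].mean())
            decisions[col] = 'impute with mean (roughly symmetric)'
    return df_out, decisions
# Example usage on a small, made-up frame with mixed missingness
demo = pd.DataFrame({
    'mostly_missing': [np.nan, np.nan, np.nan, 1.0],
    'income': [52000, np.nan, 61000, 1000000],  # skewed by one extreme value
    'city': ['NYC', None, 'LA', 'NYC']
})
cleaned, decisions = triage_missing_columns(demo)
for col, decision in decisions.items():
    print(f"{col}: {decision}")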
# Common pitfalls and solutions
# PITFALL 1: Data leakage
# BAD: Including future information
def bad_feature_engineering(df):
    # This creates leakage! Each row sees the customer's lifetime total,
    # which includes purchases made after that row
    df['future_purchases'] = df.groupby('customer_id')['purchase_amount'].transform('sum')
    return df
# GOOD: Only use past information
def good_feature_engineering(df):
    df_sorted = df.sort_values(['customer_id', 'purchase_date'])
    # Cumulative sum, then shift within each customer so the first purchase sees no
    # history (a plain .shift(1) after the groupby would leak totals across customers)
    df_sorted['past_purchases'] = (
        df_sorted.groupby('customer_id')['purchase_amount']
        .transform(lambda s: s.cumsum().shift(1))
    )
    return df_sorted
# PITFALL 2: Inconsistent preprocessing
# BAD: Different preprocessing for train/test
def bad_preprocessing():
# Train set
train_mean = train_data['income'].mean()
train_data['income_scaled'] = train_data['income'] / train_mean
# Test set (WRONG!)
test_mean = test_data['income'].mean() # Should use train_mean!
test_data['income_scaled'] = test_data['income'] / test_mean
# GOOD: Consistent preprocessing
from sklearn.preprocessing import StandardScaler
def good_preprocessing():
    # Fit on training data only
    scaler = StandardScaler()
    scaler.fit(train_data[['income']])
    # Apply the same fitted scaler to both train and test
    train_data['income_scaled'] = scaler.transform(train_data[['income']])
    test_data['income_scaled'] = scaler.transform(test_data[['income']])
# PITFALL 3: Over-preprocessing
# BAD: Removing too much variation
def over_processed(df):
    # Removing all outliers might remove important patterns
    for col in df.select_dtypes(include=[np.number]).columns:
        Q1, Q3 = df[col].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        # Too aggressive: these narrow bounds discard a large share of rows,
        # and the losses compound across every numeric column
        df = df[(df[col] >= Q1 - 0.5*IQR) & (df[col] <= Q3 + 0.5*IQR)]
    return df
# GOOD: Conservative outlier handling
def conservative_processing(df):
for col in df.select_dtypes(include=[np.number]).columns:
# Cap extreme outliers (99.5th percentile)
upper_cap = df[col].quantile(0.995)
lower_cap = df[col].quantile(0.005)
df[col] = df[col].clip(lower_cap, upper_cap)
return df
# PITFALL 4: Ignoring business context
# BAD: Statistical-only approach
def statistics_only(df):
    # An age of 999 is an obvious error, yet a pure z-score filter can miss such
    # values: extreme entries inflate the standard deviation and mask one another
    return df[np.abs(stats.zscore(df['age'])) < 3]  # May still keep age=999
# GOOD: Business rules + statistics
def business_informed(df):
# Apply business logic first
df = df[(df['age'] >= 0) & (df['age'] <= 120)] # Reasonable age range
df = df[df['income'] >= 0] # No negative income
# Then apply statistical methods
for col in ['income', 'purchase_amount']:
if col in df.columns:
Q1, Q3 = df[col].quantile([0.25, 0.75])
IQR = Q3 - Q1
df = df[(df[col] >= Q1 - 1.5*IQR) & (df[col] <= Q3 + 1.5*IQR)]
return df
print("Data preparation pitfalls and solutions demonstrated!")
print("Remember: Always validate your preprocessing steps!")
Week 3 Assignment: Data Wrangling and Imputation Logic
Essential Data Preparation Skills:
Remember: Quality data preparation is the foundation of successful machine learning projects!
Week 4 Preview:
Clean data is the foundation of good models!