Data Science & ML Mini-Cheat Sheets (Pandas, NumPy, Matplotlib, Scikit-learn)

Advanced Pandas Operations

Data Manipulation and Transformation

import pandas as pd  # Import pandas for data manipulation and analysis
import numpy as np   # Import numpy for numerical operations

Create sample data: demonstration DataFrame with various data types

df = pd.DataFrame({
    'category': ['A', 'B', 'A', 'C', 'B', 'A'],        # Categorical data for grouping
    'value': [10, 20, 15, 25, 30, 12],                 # Numerical values for aggregation
    'date': pd.date_range('2023-01-01', periods=6),    # Date range for time-based analysis
    'score': [0.8, 0.6, 0.9, 0.7, 0.5, 0.8]            # Float values for statistical analysis
})

Advanced groupby operations: demonstrate powerful data aggregation capabilities

def advanced_groupby_examples():
    """Advanced groupby operations with multiple aggregations and custom functions."""
    
    # Multiple aggregations: apply different functions to different columns
    grouped = df.groupby('category').agg({
        'value': ['mean', 'std', 'count'],   # Multiple statistics for value column
        'score': ['min', 'max', 'median']    # Multiple statistics for score column
    })
    print("Multiple aggregations:")
    print(grouped)
    
    # Custom aggregation functions: define your own aggregation logic
    def custom_agg(x):
        return pd.Series({
            'range': x.max() - x.min(),                  # Range (max - min)
            'iqr': x.quantile(0.75) - x.quantile(0.25)   # Interquartile range
        })
    
    custom_grouped = df.groupby('category')['value'].apply(custom_agg)  # Apply custom function per group
    print("\nCustom aggregations:")
    print(custom_grouped)
    
    # Groupby with per-column aggregations, rounded for readability
    multi_grouped = df.groupby('category').agg({
        'value': 'sum',   # Sum of values for each category
        'score': 'mean'   # Mean of scores for each category
    }).round(2)
    print("\nMulti-column groupby:")
    print(multi_grouped)

Usage: demonstrate advanced groupby functionality

advanced_groupby_examples() # Execute the groupby examples
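
The dict-of-lists syntax above produces a hierarchical column index. When flat column names are easier to work with downstream, named aggregation does the same job; a minimal sketch assuming the same df as above (the function name is illustrative):

def named_aggregation_example():
    """Named aggregation: each keyword becomes a flat output column (pandas >= 0.25)."""
    summary = df.groupby('category').agg(
        value_mean=('value', 'mean'),                         # mean of 'value'
        value_range=('value', lambda s: s.max() - s.min()),   # custom callable per group
        score_median=('score', 'median')                      # median of 'score'
    )
    print(summary)

named_aggregation_example()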

Pivot Tables and Cross-tabulation

def pivot_operations():
    """Advanced pivot table operations for data reshaping and analysis."""
    
    # Create sample data for pivoting: sales data with multiple dimensions
    sales_data = pd.DataFrame({
        'date': pd.date_range('2023-01-01', periods=20),  # Date dimension for time analysis
        'product': ['A', 'B', 'A', 'C', 'B'] * 4,        # Product dimension for product analysis
        'region': ['North', 'South', 'East', 'West'] * 5, # Region dimension for geographic analysis
        'sales': np.random.randint(100, 1000, 20),       # Sales values for aggregation
        'quantity': np.random.randint(10, 100, 20)       # Quantity values for aggregation
    })
    
    # Basic pivot table: reshape data with region as rows and product as columns
    pivot_sales = sales_data.pivot_table(
        index='region',      # Rows: geographic regions
        columns='product',   # Columns: product types
        values='sales',      # Values to aggregate: sales amounts
        aggfunc='sum',       # Aggregation function: sum sales by region/product
        fill_value=0         # Fill missing values with 0
    )
    print("Sales pivot table:")
    print(pivot_sales)
    
    # Pivot table with multiple values: aggregate multiple columns simultaneously
    pivot_multi = sales_data.pivot_table(
        index='region',      # Rows: geographic regions
        columns='product',   # Columns: product types
        values=['sales', 'quantity'],  # Multiple value columns to aggregate
        aggfunc={'sales': 'sum', 'quantity': 'mean'},  # Different functions for different values
        fill_value=0         # Fill missing values with 0
    )
    print("\nMulti-value pivot table:")
    print(pivot_multi)
    
    # Cross-tabulation: frequency analysis with normalization
    crosstab_result = pd.crosstab(
        sales_data['region'],    # Row variable: geographic regions
        sales_data['product'],   # Column variable: product types
        values=sales_data['sales'],  # Values to aggregate: sales amounts
        aggfunc='sum',          # Aggregation function: sum sales
        normalize='index'       # Normalize by row (percentage within each region)
    )
    print("\nCross-tabulation (normalized):")
    print(crosstab_result)

Usage: demonstrate pivot table operations

pivot_operations() # Execute pivot table examples
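
pivot_table can also append row and column totals through its margins arguments; a small sketch, assuming a DataFrame shaped like the sales_data built inside pivot_operations():

def pivot_with_totals(sales_data):
    """Pivot table with a grand-total row and column added."""
    totals = sales_data.pivot_table(
        index='region',
        columns='product',
        values='sales',
        aggfunc='sum',
        fill_value=0,
        margins=True,          # Append an extra row/column with overall totals
        margins_name='Total'   # Label for that row/column (default is 'All')
    )
    print(totals)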

Advanced Merging and Joining

def advanced_merging():
    """Advanced merging and joining operations for combining multiple datasets."""
    
    # Create sample dataframes: employee data with different schemas
    df1 = pd.DataFrame({
        'id': [1, 2, 3, 4],                    # Employee IDs (primary key)
        'name': ['Alice', 'Bob', 'Charlie', 'David'],  # Employee names
        'dept': ['HR', 'IT', 'HR', 'IT']       # Department assignments
    })
    
    df2 = pd.DataFrame({
        'id': [1, 2, 3, 5],                    # Employee IDs (some overlap with df1)
        'salary': [50000, 60000, 55000, 70000], # Salary information
        'bonus': [5000, 6000, 5500, 7000]      # Bonus information
    })
    
    df3 = pd.DataFrame({
        'dept': ['HR', 'IT', 'Finance'],       # Department names
        'manager': ['John', 'Jane', 'Mike']    # Department managers
    })
    
    # Inner join: only keep rows that exist in both dataframes
    inner_merged = df1.merge(df2, on='id', how='inner')  # Join on employee ID
    print("Inner join:")
    print(inner_merged)
    
    # Left join: keep all rows from left dataframe (df1)
    left_merged = df1.merge(df2, on='id', how='left')  # Keep all employees from df1
    print("\nLeft join:")
    print(left_merged)
    
    # Multiple joins: chain multiple merge operations
    final_df = df1.merge(df2, on='id', how='left').merge(df3, on='dept', how='left')  # Join employee data with department info
    print("\nMultiple joins:")
    print(final_df)
    
    # Merge with different column names: handle schema mismatches
    df2_renamed = df2.rename(columns={'id': 'employee_id'})  # Rename column to match different naming convention
    merged_diff_cols = df1.merge(df2_renamed, left_on='id', right_on='employee_id')  # Specify different column names
    print("\nMerge with different column names:")
    print(merged_diff_cols)

Usage: demonstrate advanced merging operations

advanced_merging() # Execute merging examples
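
When a join does not produce the rows you expect, merge can report where each row came from and validate key cardinality; a short sketch using DataFrames shaped like df1 and df2 above (the function name is illustrative):

def merge_diagnostics(df1, df2):
    """Outer merge with provenance tracking and key validation."""
    merged = df1.merge(
        df2,
        on='id',
        how='outer',
        indicator=True,        # Adds a '_merge' column: 'left_only', 'right_only', or 'both'
        validate='one_to_one'  # Raise MergeError if 'id' is duplicated on either side
    )
    print(merged['_merge'].value_counts())
    return merged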

Multi-index Operations

def multi_index_operations():
    """Working with multi-index DataFrames for hierarchical data analysis."""
    
    # Create multi-index DataFrame: hierarchical structure with date and category levels
    dates = pd.date_range('2023-01-01', periods=6)  # Date range for time dimension
    categories = ['A', 'B']  # Category dimension for grouping
    
    index = pd.MultiIndex.from_product([dates, categories], names=['date', 'category'])  # Create hierarchical index
    
    multi_df = pd.DataFrame({
        'value': np.random.randn(12),  # Random values for demonstration
        'count': np.random.randint(1, 100, 12)  # Random counts for demonstration
    }, index=index)  # Use hierarchical index for DataFrame
    
    print("Multi-index DataFrame:")
    print(multi_df)
    
    # Selecting data with multi-index: access specific levels of hierarchy
    print("\nSelecting specific date:")
    print(multi_df.loc['2023-01-01'])  # Select all data for specific date
    
    print("\nSelecting specific category:")
    print(multi_df.xs('A', level='category'))  # Cross-section: select all data for category 'A'
    
    # Groupby with multi-index: aggregate across hierarchical levels
    grouped_multi = multi_df.groupby(level='category').agg({
        'value': ['mean', 'std'],  # Multiple statistics for value column
        'count': 'sum'             # Sum for count column
    })
    print("\nGroupby with multi-index:")
    print(grouped_multi)
    
    # Unstacking multi-index: convert hierarchical index to columns
    unstacked = multi_df.unstack('category')  # Move category level to columns
    print("\nUnstacked DataFrame:")
    print(unstacked)
    
    # Stacking back: convert columns back to hierarchical index
    restacked = unstacked.stack('category')  # Move category columns back to index
    print("\nRestacked DataFrame:")
    print(restacked)

Usage: demonstrate multi-index operations

multi_index_operations() # Execute multi-index examples
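
To slice several levels of a MultiIndex at once, pd.IndexSlice is often clearer than chaining loc and xs; a minimal sketch assuming a DataFrame shaped like multi_df above (dates in the first level, categories in the second):

def multi_index_slicing(multi_df):
    """Slice across both MultiIndex levels with pd.IndexSlice."""
    idx = pd.IndexSlice
    sorted_df = multi_df.sort_index()  # label slicing needs a lexsorted index
    # All categories within a date range
    print(sorted_df.loc[idx[pd.Timestamp('2023-01-02'):pd.Timestamp('2023-01-04'), :], :])
    # A single (date, category) pair
    print(sorted_df.loc[(pd.Timestamp('2023-01-01'), 'B')])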

Advanced NumPy Operations

Broadcasting and Vectorization

def numpy_broadcasting():
    """Advanced NumPy broadcasting examples for efficient array operations."""
    
    # Create arrays: demonstrate different shapes for broadcasting
    a = np.array([[1, 2, 3], [4, 5, 6]])  # 2D array (2x3)
    b = np.array([10, 20, 30])            # 1D array (3,)
    c = np.array([[1], [2]])              # 2D array (2x1)
    
    print("Array a:")
    print(a)
    print("Array b:")
    print(b)
    print("Array c:")
    print(c)
    
    # Broadcasting examples: automatic shape alignment for element-wise operations
    print("\nBroadcasting a + b:")
    print(a + b)  # b is broadcasted to match a's shape (2x3)
    
    print("\nBroadcasting a + c:")
    print(a + c)  # c is broadcasted to match a's shape (2x3)
    
    # Advanced broadcasting: demonstrate broadcasting with higher dimensional arrays
    arr_3d = np.random.randn(3, 4, 5)  # 3D array (3x4x5)
    arr_2d = np.random.randn(4, 5)     # 2D array (4x5)
    arr_1d = np.random.randn(5)        # 1D array (5,)
    
    print(f"\n3D array shape: {arr_3d.shape}")
    print(f"2D array shape: {arr_2d.shape}")
    print(f"1D array shape: {arr_1d.shape}")
    
    # Broadcasting with different dimensions: automatic shape expansion
    result_3d_2d = arr_3d + arr_2d  # 2D array broadcasted to 3D
    result_3d_1d = arr_3d + arr_1d  # 1D array broadcasted to 3D
    
    print(f"Result of 3D + 2D shape: {result_3d_2d.shape}")
    print(f"Result of 3D + 1D shape: {result_3d_1d.shape}")

Usage: demonstrate NumPy broadcasting

numpy_broadcasting() # Execute broadcasting examples
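
Broadcasting also works when you insert axes explicitly with np.newaxis, which is handy for building pairwise tables from 1-D arrays; a short sketch:

def pairwise_differences():
    """Insert axes so two 1-D arrays broadcast into a 2-D grid of differences."""
    a = np.array([1.0, 2.0, 3.0])                 # shape (3,)
    b = np.array([10.0, 20.0])                    # shape (2,)
    diff = a[:, np.newaxis] - b[np.newaxis, :]    # (3, 1) - (1, 2) -> (3, 2)
    print(diff.shape)                             # (3, 2); diff[i, j] == a[i] - b[j]
    print(diff)

pairwise_differences()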

Fancy Indexing and Boolean Masks

def numpy_indexing():
    """Advanced NumPy indexing techniques."""
    
    # Create sample array
    arr = np.random.randn(10, 10)
    print("Original array:")
    print(arr)
    
    # Boolean indexing
    mask = arr > 0.5
    positive_values = arr[mask]
    print(f"\nValues > 0.5: {positive_values}")
    
    # Fancy indexing with integer arrays
    indices = np.array([0, 2, 4, 6, 8])
    selected_rows = arr[indices]
    print(f"\nSelected rows {indices}:")
    print(selected_rows)
    
    # Advanced boolean operations
    mask1 = arr > 0.5
    mask2 = arr < 1.0
    combined_mask = mask1 & mask2
    filtered_values = arr[combined_mask]
    print(f"\nValues between 0.5 and 1.0: {filtered_values}")
    
    # Indexing with multiple conditions
    row_indices = np.array([1, 3, 5])
    col_indices = np.array([2, 4, 6])
    selected_elements = arr[row_indices, col_indices]
    print(f"\nSelected elements at positions (1,2), (3,4), (5,6):")
    print(selected_elements)

Usage

numpy_indexing()
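
Note that paired fancy indexing such as arr[row_indices, col_indices] picks individual elements; to extract a whole sub-grid of rows and columns, np.ix_ builds the cross-product index instead. A small sketch:

def subgrid_selection():
    """Paired fancy indexing vs. np.ix_ cross-product indexing."""
    arr = np.arange(36).reshape(6, 6)
    rows = np.array([1, 3, 5])
    cols = np.array([0, 2])
    paired = arr[rows[:2], cols]        # elements (1, 0) and (3, 2) -> shape (2,)
    subgrid = arr[np.ix_(rows, cols)]   # every (row, col) combination -> shape (3, 2)
    print(paired)
    print(subgrid)

subgrid_selection()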

Advanced Array Operations

def numpy_advanced_ops():
    """Advanced NumPy array operations."""
    
    # Create sample arrays
    arr1 = np.random.randn(5, 5)
    arr2 = np.random.randn(5, 5)
    
    print("Array 1:")
    print(arr1)
    print("\nArray 2:")
    print(arr2)
    
    # Element-wise operations
    element_wise_sum = np.add(arr1, arr2)
    element_wise_prod = np.multiply(arr1, arr2)
    
    print("\nElement-wise sum:")
    print(element_wise_sum)
    print("\nElement-wise product:")
    print(element_wise_prod)
    
    # Matrix operations
    matrix_prod = np.dot(arr1, arr2)
    print("\nMatrix product:")
    print(matrix_prod)
    
    # Statistical operations
    print(f"\nArray 1 statistics:")
    print(f"Mean: {np.mean(arr1):.4f}")
    print(f"Std: {np.std(arr1):.4f}")
    print(f"Min: {np.min(arr1):.4f}")
    print(f"Max: {np.max(arr1):.4f}")
    print(f"Median: {np.median(arr1):.4f}")
    
    # Axis-wise operations
    row_means = np.mean(arr1, axis=1)
    col_means = np.mean(arr1, axis=0)
    
    print(f"\nRow means: {row_means}")
    print(f"Column means: {col_means}")

Usage

numpy_advanced_ops()
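
For 2-D arrays, np.dot, the @ operator, and np.einsum all compute the same matrix product, with einsum making the summation axes explicit; a brief sketch:

def matmul_variants():
    """Three equivalent ways to write a 2-D matrix product."""
    a = np.random.randn(5, 5)
    b = np.random.randn(5, 5)
    p1 = np.dot(a, b)                    # classic dot product
    p2 = a @ b                           # matmul operator (PEP 465)
    p3 = np.einsum('ij,jk->ik', a, b)    # explicit index notation
    print(np.allclose(p1, p2), np.allclose(p1, p3))  # True True

matmul_variants()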

Advanced Matplotlib and Seaborn

Custom Plotting Styles

import matplotlib.pyplot as plt
import seaborn as sns

def advanced_plotting():
    """Advanced plotting with custom styles."""
    
    # Set style
    plt.style.use('seaborn-v0_8')
    sns.set_palette("husl")
    
    # Create sample data
    x = np.linspace(0, 10, 100)
    y1 = np.sin(x)
    y2 = np.cos(x)
    y3 = np.sin(x + np.pi/4)
    
    # Create subplots with custom layout
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    fig.suptitle('Advanced Plotting Examples', fontsize=16, fontweight='bold')
    
    # Plot 1: Multiple lines with custom styling
    axes[0, 0].plot(x, y1, label='sin(x)', linewidth=2, color='blue', alpha=0.7)
    axes[0, 0].plot(x, y2, label='cos(x)', linewidth=2, color='red', alpha=0.7)
    axes[0, 0].plot(x, y3, label='sin(x+π/4)', linewidth=2, color='green', alpha=0.7)
    axes[0, 0].set_title('Trigonometric Functions')
    axes[0, 0].set_xlabel('x')
    axes[0, 0].set_ylabel('y')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)
    
    # Plot 2: Scatter plot with color mapping
    scatter_x = np.random.randn(100)
    scatter_y = np.random.randn(100)
    colors = np.random.rand(100)
    scatter = axes[0, 1].scatter(scatter_x, scatter_y, c=colors, cmap='viridis', alpha=0.6, s=50)
    axes[0, 1].set_title('Scatter Plot with Color Mapping')
    axes[0, 1].set_xlabel('X')
    axes[0, 1].set_ylabel('Y')
    plt.colorbar(scatter, ax=axes[0, 1])
    
    # Plot 3: Histogram with multiple datasets
    data1 = np.random.normal(0, 1, 1000)
    data2 = np.random.normal(2, 1.5, 1000)
    axes[1, 0].hist(data1, bins=30, alpha=0.7, label='Dataset 1', color='blue')
    axes[1, 0].hist(data2, bins=30, alpha=0.7, label='Dataset 2', color='red')
    axes[1, 0].set_title('Histogram Comparison')
    axes[1, 0].set_xlabel('Value')
    axes[1, 0].set_ylabel('Frequency')
    axes[1, 0].legend()
    
    # Plot 4: Box plot
    box_data = [np.random.normal(0, 1, 100),
                np.random.normal(1, 1.2, 100),
                np.random.normal(2, 0.8, 100)]
    axes[1, 1].boxplot(box_data, labels=['Group A', 'Group B', 'Group C'])
    axes[1, 1].set_title('Box Plot Comparison')
    axes[1, 1].set_ylabel('Value')
    
    plt.tight_layout()
    plt.show()

Usage

advanced_plotting()

Seaborn Advanced Visualizations

def seaborn_advanced():
    """Advanced Seaborn visualizations."""
    
    # Create sample data
    np.random.seed(42)
    n_samples = 1000
    
    data = pd.DataFrame({
        'x': np.random.normal(0, 1, n_samples),
        'y': np.random.normal(0, 1, n_samples),
        'category': np.random.choice(['A', 'B', 'C'], n_samples),
        'value': np.random.exponential(1, n_samples),
        'group': np.random.choice(['Group1', 'Group2'], n_samples)
    })
    
    # Create figure with multiple subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Advanced Seaborn Visualizations', fontsize=16, fontweight='bold')
    
    # Plot 1: Scatter by category (sns.jointplot is figure-level and cannot target a subplot axis)
    sns.scatterplot(data=data, x='x', y='y', hue='category',
                    alpha=0.6, ax=axes[0, 0])
    axes[0, 0].set_title('Scatter by Category')
    
    # Plot 2: Violin plot
    sns.violinplot(data=data, x='category', y='value', hue='group', ax=axes[0, 1])
    axes[0, 1].set_title('Violin Plot by Category and Group')
    
    # Plot 3: Heatmap
    correlation_matrix = data[['x', 'y', 'value']].corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, 
                square=True, ax=axes[1, 0])
    axes[1, 0].set_title('Correlation Heatmap')
    
    # Plot 4: Faceted histogram
    sns.histplot(data=data, x='value', hue='category', multiple='stack', 
                 bins=30, ax=axes[1, 1])
    axes[1, 1].set_title('Stacked Histogram by Category')
    
    plt.tight_layout()
    plt.show()

Usage

seaborn_advanced()

Scikit-learn Machine Learning

Data Preprocessing Pipeline

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

def ml_preprocessing_pipeline():
    """Complete ML preprocessing pipeline."""
    
    # Create sample dataset
    np.random.seed(42)
    n_samples = 1000
    
    data = pd.DataFrame({
        'age': np.random.normal(35, 10, n_samples),
        'income': np.random.normal(50000, 20000, n_samples),
        'education': np.random.choice(['High School', 'Bachelor', 'Master', 'PhD'], n_samples),
        'city': np.random.choice(['NYC', 'LA', 'Chicago', 'Houston'], n_samples),
        'target': np.random.choice([0, 1], n_samples, p=[0.7, 0.3])
    })
    
    # Add some missing values
    data.loc[np.random.choice(data.index, 50), 'age'] = np.nan
    data.loc[np.random.choice(data.index, 30), 'income'] = np.nan
    
    print("Original data shape:", data.shape)
    print("Missing values:")
    print(data.isnull().sum())
    
    # Separate features and target
    X = data.drop('target', axis=1)
    y = data['target']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Define preprocessing steps
    numeric_features = ['age', 'income']
    categorical_features = ['education', 'city']
    
    # Numeric preprocessing
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    # Categorical preprocessing
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False))  # sparse_output replaces the old sparse argument (scikit-learn >= 1.2)
    ])
    
    # Combine transformers
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )
    
    # Create full pipeline
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
    ])
    
    # Fit and predict
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Evaluate
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    return model, X_test, y_test

Usage

model, X_test, y_test = ml_preprocessing_pipeline()
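
After fitting, the transformed feature names (including the generated one-hot columns) can usually be recovered from the ColumnTransformer; a short sketch assuming a reasonably recent scikit-learn (>= 1.1) and the model returned above:

def show_transformed_features(model):
    """List the columns produced by the preprocessing step of the pipeline."""
    preprocessor = model.named_steps['preprocessor']      # fitted ColumnTransformer
    feature_names = preprocessor.get_feature_names_out()  # e.g. 'num__age', 'cat__city_NYC'
    print(len(feature_names), "features after preprocessing")
    print(list(feature_names))

show_transformed_features(model)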

Feature Engineering and Selection

from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

def feature_engineering():
    """Feature engineering and selection techniques."""
    
    # Create sample data
    np.random.seed(42)
    X = np.random.randn(1000, 20)
    y = np.random.choice([0, 1], 1000, p=[0.7, 0.3])
    
    # Add some informative features
    X[:, 0] = y + np.random.normal(0, 0.1, 1000)      # Feature 0 is informative
    X[:, 1] = y * 2 + np.random.normal(0, 0.1, 1000)  # Feature 1 is informative
    
    print("Original data shape:", X.shape)
    
    # 1. Statistical feature selection
    selector_kbest = SelectKBest(score_func=f_classif, k=10)
    X_selected_kbest = selector_kbest.fit_transform(X, y)
    print(f"\nAfter SelectKBest: {X_selected_kbest.shape}")
    print("Selected feature indices:", selector_kbest.get_support(indices=True))
    
    # 2. Recursive Feature Elimination
    estimator = LogisticRegression(random_state=42)
    selector_rfe = RFE(estimator=estimator, n_features_to_select=10)
    X_selected_rfe = selector_rfe.fit_transform(X, y)
    print(f"\nAfter RFE: {X_selected_rfe.shape}")
    print("Selected feature indices:", selector_rfe.get_support(indices=True))
    
    # 3. Principal Component Analysis
    pca = PCA(n_components=0.95)  # Keep 95% of the variance
    X_pca = pca.fit_transform(X)
    print(f"\nAfter PCA: {X_pca.shape}")
    print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
    print(f"Total explained variance: {sum(pca.explained_variance_ratio_):.3f}")
    
    return X_selected_kbest, X_selected_rfe, X_pca

Usage

X_kbest, X_rfe, X_pca = feature_engineering()

Model Evaluation and Cross-Validation

from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

def model_evaluation():
    """Comprehensive model evaluation."""
    
    # Create sample data
    np.random.seed(42)
    X = np.random.randn(1000, 10)
    y = np.random.choice([0, 1], 1000, p=[0.7, 0.3])
    
    # Define models
    models = {
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
    }
    
    # Cross-validation
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    for name, model in models.items():
        print(f"\n{name}:")
        
        # Cross-validation scores
        cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
        print(f"CV Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
        
        # ROC-AUC
        cv_auc = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
        print(f"CV ROC-AUC: {cv_auc.mean():.3f} (+/- {cv_auc.std() * 2:.3f})")
    
    # Hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10]
    }
    
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X, y)
    
    print(f"\nBest parameters: {grid_search.best_params_}")
    print(f"Best CV score: {grid_search.best_score_:.3f}")
    
    return grid_search.best_estimator_

Usage

best_model = model_evaluation()
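
When the grid grows large, RandomizedSearchCV samples a fixed number of parameter combinations instead of exhaustively trying them all; a hedged sketch mirroring the grid above (pass in the same X, y, and cv used in model_evaluation()):

from sklearn.model_selection import RandomizedSearchCV

def randomized_tuning(X, y, cv):
    """Randomized hyperparameter search: n_iter random draws from the parameter space."""
    param_distributions = {
        'n_estimators': [50, 100, 200, 400],
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10]
    }
    rf = RandomForestClassifier(random_state=42)
    search = RandomizedSearchCV(rf, param_distributions, n_iter=10, cv=cv,
                                scoring='roc_auc', random_state=42, n_jobs=-1)
    search.fit(X, y)
    print(f"Best parameters: {search.best_params_}")
    print(f"Best CV score: {search.best_score_:.3f}")
    return search.best_estimator_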

Advanced Visualization for ML

def ml_visualizations():
    """Advanced visualizations for machine learning."""
    
    # Create sample data
    np.random.seed(42)
    X = np.random.randn(1000, 2)
    y = np.random.choice([0, 1], 1000, p=[0.7, 0.3])
    
    # Train a model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    # Create figure
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Machine Learning Visualizations', fontsize=16, fontweight='bold')
    
    # Plot 1: Feature importance
    feature_importance = model.feature_importances_
    axes[0, 0].bar(range(len(feature_importance)), feature_importance)
    axes[0, 0].set_title('Feature Importance')
    axes[0, 0].set_xlabel('Feature Index')
    axes[0, 0].set_ylabel('Importance')
    
    # Plot 2: ROC Curve
    y_pred_proba = model.predict_proba(X)[:, 1]
    fpr, tpr, _ = roc_curve(y, y_pred_proba)
    auc_score = roc_auc_score(y, y_pred_proba)
    
    axes[0, 1].plot(fpr, tpr, label=f'ROC Curve (AUC = {auc_score:.3f})')
    axes[0, 1].plot([0, 1], [0, 1], 'k--', label='Random')
    axes[0, 1].set_title('ROC Curve')
    axes[0, 1].set_xlabel('False Positive Rate')
    axes[0, 1].set_ylabel('True Positive Rate')
    axes[0, 1].legend()
    
    # Plot 3: Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y, y_pred_proba)
    axes[1, 0].plot(recall, precision)
    axes[1, 0].set_title('Precision-Recall Curve')
    axes[1, 0].set_xlabel('Recall')
    axes[1, 0].set_ylabel('Precision')
    
    # Plot 4: Decision boundary
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))
    
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    axes[1, 1].contourf(xx, yy, Z, alpha=0.4)
    scatter = axes[1, 1].scatter(X[:, 0], X[:, 1], c=y, alpha=0.8)
    axes[1, 1].set_title('Decision Boundary')
    axes[1, 1].set_xlabel('Feature 1')
    axes[1, 1].set_ylabel('Feature 2')
    
    plt.tight_layout()
    plt.show()

Usage

ml_visualizations()
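
scikit-learn also ships display helpers that draw the confusion matrix and ROC curve directly from a fitted estimator; a short sketch assuming scikit-learn >= 1.0, reusing the pipeline and hold-out split from the preprocessing example above:

from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay

def builtin_ml_plots(model, X, y):
    """Confusion matrix and ROC curve via scikit-learn's display helpers."""
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    ConfusionMatrixDisplay.from_estimator(model, X, y, ax=axes[0])  # fitted estimator + data
    RocCurveDisplay.from_estimator(model, X, y, ax=axes[1])
    plt.tight_layout()
    plt.show()

builtin_ml_plots(model, X_test, y_test)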
