5 Reasons for Feature Selection in Machine Learning

Slide 1: Dimensionality Reduction and Computational Efficiency

Feature selection significantly reduces computational complexity and training time by eliminating irrelevant or redundant features. This process becomes crucial when dealing with high-dimensional datasets where the curse of dimensionality can severely impact model performance and resource utilization.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from time import time

# Generate synthetic dataset with redundant features
X, y = make_classification(n_samples=1000, n_features=100, 
                         n_informative=10, random_state=42)

# Time the feature selection step
t0 = time()
selected_features = SelectKBest(score_func=f_classif, k=10)
X_selected = selected_features.fit_transform(X, y)

print(f"Original features: {X.shape[1]}")
print(f"Selected features: {X_selected.shape[1]}")
print(f"Feature selection time: {time() - t0:.3f} seconds")

Slide 2: Variance Thresholding Implementation

Understanding feature variance helps identify constant or near-constant features that provide minimal discriminative power. This implementation demonstrates how to remove low-variance features using a custom threshold approach with numpy.

import numpy as np

def variance_threshold_selector(X, threshold=0.01):
    # Calculate variance of each feature
    variances = np.var(X, axis=0)
    
    # Create boolean mask for features above threshold
    mask = variances > threshold
    
    # Return selected features and their indices
    return X[:, mask], np.where(mask)[0]

# Example usage
X = np.random.randn(100, 20)
X[:, 5] = 0.1  # Make feature 5 constant (zero variance)

X_selected, selected_indices = variance_threshold_selector(X)
print(f"Original features: {X.shape[1]}")
print(f"Features after variance threshold: {X_selected.shape[1]}")
print(f"Removed feature indices: {np.where(~np.in1d(range(X.shape[1]), selected_indices))[0]}")

Slide 3: Correlation-Based Feature Selection

High correlation between features indicates redundancy in the dataset. This implementation uses correlation matrix analysis to identify and remove highly correlated features while retaining the most informative ones based on their correlation with the target variable.

import pandas as pd
import numpy as np
from scipy.stats import spearmanr
from sklearn.datasets import make_classification

def correlation_selector(X, y, threshold=0.7):
    # Calculate correlation matrix
    corr_matrix = pd.DataFrame(X).corr().abs()
    
    # Calculate correlation with target
    target_corr = np.array([abs(spearmanr(X[:, i], y)[0]) for i in range(X.shape[1])])
    
    # Find features to remove
    features_to_remove = set()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] > threshold:
                # Remove feature with lower correlation to target
                if target_corr[i] < target_corr[j]:
                    features_to_remove.add(i)
                else:
                    features_to_remove.add(j)
    
    # Select features
    selected_features = [i for i in range(X.shape[1]) if i not in features_to_remove]
    return X[:, selected_features], selected_features

# Example usage
X, y = make_classification(n_samples=1000, n_features=20, 
                         n_informative=10, random_state=42)
X_selected, selected_features = correlation_selector(X, y)
print(f"Selected features: {selected_features}")
print(f"Reduced feature shape: {X_selected.shape}")

Slide 4: Recursive Feature Elimination

A sophisticated approach that iteratively fits the model, ranks features by importance, and removes the least important ones. This implementation showcases RFE with cross-validation to determine the optimal number of features.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def recursive_feature_elimination(X, y, step=1):
    # Initialize model
    model = LogisticRegression(random_state=42)
    
    # Initialize RFE
    rfe = RFE(estimator=model, n_features_to_select=1, step=step)
    
    # Fit RFE
    rfe.fit(X, y)
    
    # Get ranking and support mask
    ranking = rfe.ranking_
    support = rfe.support_
    
    # Calculate cross-validation scores for different feature counts
    scores = []
    features_counts = range(1, X.shape[1] + 1)
    
    for n_features in features_counts:
        rfe = RFE(estimator=model, n_features_to_select=n_features, step=1)
        X_rfe = rfe.fit_transform(X, y)
        score = cross_val_score(estimator=model, X=X_rfe, y=y, cv=5)
        scores.append(np.mean(score))
    
    # Get optimal number of features
    optimal_n_features = features_counts[np.argmax(scores)]
    
    return optimal_n_features, ranking, support, scores

# Example usage
X, y = make_classification(n_samples=1000, n_features=20, 
                         n_informative=10, random_state=42)
opt_features, ranking, support, scores = recursive_feature_elimination(X, y)
print(f"Optimal number of features: {opt_features}")
print(f"Feature ranking: {ranking}")

Slide 5: Information Gain and Mutual Information

Information theory metrics provide powerful insights into feature relevance by measuring the mutual information between features and target variables. This implementation demonstrates both entropy-based and mutual information-based feature selection.

from sklearn.datasets import make_classification
from sklearn.feature_selection import mutual_info_classif
import numpy as np
from scipy.stats import entropy

def label_entropy(labels):
    # Entropy of a discrete label distribution (scipy's entropy expects probabilities)
    _, counts = np.unique(labels, return_counts=True)
    return entropy(counts / counts.sum())

def information_based_selection(X, y, method='mutual_info', k=5):
    if method == 'mutual_info':
        # Calculate mutual information scores
        mi_scores = mutual_info_classif(X, y)
    else:
        # Information gain = H(Y) - H(Y|X); assumes discrete-valued features
        mi_scores = np.array([
            label_entropy(y) - conditional_entropy(X[:, i], y)
            for i in range(X.shape[1])
        ])
    
    # Select top k features
    top_features = np.argsort(mi_scores)[-k:]
    
    return X[:, top_features], top_features, mi_scores

def conditional_entropy(x, y):
    # Calculate conditional entropy H(Y|X) for a discrete-valued feature x
    x_unique = np.unique(x)
    
    cond_entropy = 0
    for x_val in x_unique:
        p_x = np.mean(x == x_val)
        y_given_x = y[x == x_val]
        if len(y_given_x) > 0:
            cond_entropy += p_x * label_entropy(y_given_x)
    
    return cond_entropy

# Example usage
X, y = make_classification(n_samples=1000, n_features=20,
                         n_informative=10, random_state=42)
X_selected, selected_features, scores = information_based_selection(X, y)
print(f"Selected features: {selected_features}")
print(f"Feature scores: {scores}")

Slide 6: Lasso Regularization for Feature Selection

Lasso (L1) regularization inherently performs feature selection by driving coefficients of less important features to exactly zero. This implementation demonstrates how to use Lasso regression to identify and select the most relevant features while handling multicollinearity.

from sklearn.datasets import make_classification
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import numpy as np

def lasso_feature_selection(X, y, cv=5):
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Fit LassoCV
    lasso = LassoCV(cv=cv, random_state=42, max_iter=10000)
    lasso.fit(X_scaled, y)
    
    # Get selected features (non-zero coefficients)
    selected_features = np.where(lasso.coef_ != 0)[0]
    feature_importance = np.abs(lasso.coef_)
    
    return X[:, selected_features], selected_features, feature_importance, lasso

# Example usage with synthetic data
X, y = make_classification(n_samples=1000, n_features=20,
                         n_informative=10, random_state=42)
X_selected, selected_features, importance, lasso = lasso_feature_selection(X, y)

print(f"Selected features: {selected_features}")
print(f"Feature importance: {importance}")
print(f"Optimal alpha: {lasso.alpha_:.4f}")

Slide 7: Statistical Testing for Feature Selection

Statistical tests provide a rigorous framework for evaluating feature significance. This implementation uses multiple statistical methods including Chi-square, ANOVA, and Mann-Whitney U tests for different types of features.

from scipy.stats import chi2_contingency, f_oneway, mannwhitneyu
from sklearn.datasets import make_classification
import numpy as np
import pandas as pd

def statistical_feature_selection(X, y, method='anova', alpha=0.05):
    p_values = []
    selected_features = []
    
    for i in range(X.shape[1]):
        if method == 'anova':
            # ANOVA test for numerical features
            _, p_value = f_oneway(*[X[y == label, i] 
                                  for label in np.unique(y)])
        elif method == 'chi2':
            # Chi-square test for categorical features
            contingency = pd.crosstab(X[:, i], y)
            _, p_value, _, _ = chi2_contingency(contingency)
        elif method == 'mannwhitney':
            # Mann-Whitney U test for binary classification
            _, p_value = mannwhitneyu(X[y == 0, i], X[y == 1, i])
            
        p_values.append(p_value)
        if p_value < alpha:
            selected_features.append(i)
    
    return X[:, selected_features], selected_features, p_values

# Example usage
X, y = make_classification(n_samples=1000, n_features=20,
                         n_informative=10, random_state=42)
X_selected, selected_feats, p_vals = statistical_feature_selection(X, y)

print(f"Selected features: {selected_feats}")
print(f"P-values: {np.round(p_vals, 4)}")

Slide 8: Random Forest Feature Importance

Random Forests provide built-in feature importance measures through mean decrease in impurity or permutation importance. This implementation demonstrates both approaches and their differences in feature selection.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
import numpy as np

def random_forest_feature_selection(X, y, importance_type='mdi', 
                                  threshold=0.05):
    # Initialize and fit Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    
    if importance_type == 'mdi':
        # Mean Decrease Impurity importance
        importance = rf.feature_importances_
    else:
        # Permutation importance
        result = permutation_importance(rf, X, y, n_repeats=10,
                                      random_state=42)
        importance = result.importances_mean
    
    # Select features above threshold
    selected_features = np.where(importance > threshold)[0]
    
    return (X[:, selected_features], selected_features, 
            importance[selected_features])

# Example usage
X, y = make_classification(n_samples=1000, n_features=20,
                         n_informative=10, random_state=42)
X_selected, selected_feats, importance = random_forest_feature_selection(X, y)

print(f"Selected features: {selected_feats}")
print(f"Feature importance: {np.round(importance, 4)}")

Slide 9: Recursive Feature Addition

Instead of eliminating features, this approach starts with an empty feature set and recursively adds the most important features until a stopping criterion is met. This implementation includes cross-validation for optimal feature subset selection.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import numpy as np

def recursive_feature_addition(X, y, estimator=None, cv=5):
    if estimator is None:
        estimator = LogisticRegression(random_state=42)
    
    n_features = X.shape[1]
    selected_features = []
    current_score = 0
    
    # Score tracking
    scores_history = []
    
    while len(selected_features) < n_features:
        best_score = 0
        best_feature = None
        
        # Try adding each feature
        for feature in range(n_features):
            if feature not in selected_features:
                features_to_try = selected_features + [feature]
                X_subset = X[:, features_to_try]
                
                # Calculate cross-validation score
                scores = cross_val_score(estimator, X_subset, y, cv=cv)
                score = np.mean(scores)
                
                if score > best_score:
                    best_score = score
                    best_feature = feature
        
        # Add best feature if it improves score
        if best_score > current_score:
            selected_features.append(best_feature)
            current_score = best_score
            scores_history.append(current_score)
        else:
            break
    
    return (X[:, selected_features], selected_features, 
            scores_history)

# Example usage
X, y = make_classification(n_samples=1000, n_features=20,
                         n_informative=10, random_state=42)
X_selected, selected_feats, scores = recursive_feature_addition(X, y)

print(f"Selected features: {selected_feats}")
print(f"Score history: {np.round(scores, 4)}")

Slide 10: Real-world Example - Credit Card Fraud Detection

This comprehensive example demonstrates feature selection in a credit card fraud detection scenario, implementing multiple selection methods and comparing their effectiveness in identifying fraudulent transactions.

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def credit_fraud_feature_selection(X, y):
    # Initialize results dictionary
    results = {}
    
    # 1. Variance Threshold
    def variance_selector(X, threshold=0.01):
        variances = np.var(X, axis=0)
        return variances > threshold
    
    var_mask = variance_selector(X)
    X_var = X[:, var_mask]
    results['variance'] = {'mask': var_mask, 'X': X_var}
    
    # 2. Correlation Analysis
    corr_matrix = np.corrcoef(X.T)
    high_corr_features = set()
    for i in range(len(corr_matrix)):
        for j in range(i):
            if abs(corr_matrix[i, j]) > 0.95:
                high_corr_features.add(i)
    
    corr_mask = [i not in high_corr_features for i in range(X.shape[1])]
    X_corr = X[:, corr_mask]
    results['correlation'] = {'mask': corr_mask, 'X': X_corr}
    
    # 3. Random Forest Importance
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    importance_mask = rf.feature_importances_ > np.mean(rf.feature_importances_)
    X_rf = X[:, importance_mask]
    results['rf_importance'] = {'mask': importance_mask, 'X': X_rf}
    
    return results

# Example usage with synthetic fraud data
np.random.seed(42)
n_samples = 10000
n_features = 30

# Generate synthetic credit card transaction data
X = np.random.randn(n_samples, n_features)
# Add some correlated features
X[:, 5] = X[:, 0] * 0.9 + np.random.randn(n_samples) * 0.1
X[:, 6] = X[:, 1] * 0.95 + np.random.randn(n_samples) * 0.05
# Generate fraud labels (1% fraud rate)
y = np.random.choice([0, 1], size=n_samples, p=[0.99, 0.01])

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Apply feature selection
results = credit_fraud_feature_selection(X_train, y_train)

# Evaluate each method
for method, data in results.items():
    clf = RandomForestClassifier(random_state=42)
    clf.fit(data['X'], y_train)
    y_pred = clf.predict(X_test[:, data['mask']])
    print(f"\nResults for {method}:")
    print(classification_report(y_test, y_pred))

Slide 11: Real-world Example - Gene Expression Analysis

This implementation showcases feature selection in genomics, where high-dimensional gene expression data requires efficient feature selection to identify relevant genes for disease classification.

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFdr, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def gene_expression_feature_selection(X, y, method='combined'):
    """
    Feature selection for gene expression data using multiple methods
    """
    results = {}
    
    # 1. Statistical Testing with FDR control
    fdr_selector = SelectFdr(f_classif, alpha=0.05)
    X_fdr = fdr_selector.fit_transform(X, y)
    results['fdr'] = {
        'mask': fdr_selector.get_support(),
        'X': X_fdr,
        'pvalues': fdr_selector.pvalues_
    }
    
    # 2. Stability Selection
    def stability_selection(X, y, n_iterations=50):
        feature_counts = np.zeros(X.shape[1])
        for _ in range(n_iterations):
            # Bootstrap sample
            indices = np.random.choice(len(X), size=len(X), replace=True)
            X_boot, y_boot = X[indices], y[indices]
            
            # Random Forest on bootstrap sample
            rf = RandomForestClassifier(n_estimators=100, random_state=42)
            rf.fit(X_boot, y_boot)
            
            # Count important features
            important_features = rf.feature_importances_ > np.mean(rf.feature_importances_)
            feature_counts += important_features
        
        return feature_counts / n_iterations > 0.5
    
    stability_mask = stability_selection(X, y)
    X_stability = X[:, stability_mask]
    results['stability'] = {'mask': stability_mask, 'X': X_stability}
    
    # 3. Combined approach
    if method == 'combined':
        combined_mask = results['fdr']['mask'] & results['stability']['mask']
        X_combined = X[:, combined_mask]
        results['combined'] = {'mask': combined_mask, 'X': X_combined}
    
    return results

# Example usage with synthetic gene expression data
np.random.seed(42)
n_samples = 200
n_genes = 1000

# Generate synthetic gene expression data
X = np.random.randn(n_samples, n_genes)
# Add some informative genes
informative_genes = np.random.choice(n_genes, 50, replace=False)
X[:100, informative_genes] += 2
y = np.array([1] * 100 + [0] * 100)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Apply feature selection
results = gene_expression_feature_selection(X_train, y_train)

# Evaluate each method
for method, data in results.items():
    clf = RandomForestClassifier(random_state=42)
    clf.fit(data['X'], y_train)
    y_pred_proba = clf.predict_proba(X_test[:, data['mask']])[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)
    print(f"\n{method} AUC-ROC: {auc:.4f}")
    print(f"Selected features: {sum(data['mask'])}")

Slide 12: Feature Selection Performance Metrics

This implementation provides a comprehensive suite of metrics to evaluate feature selection methods, including stability, relevance, and redundancy measures across different selection techniques.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import cross_val_score

def evaluate_feature_selection(X, y, selected_features, 
                             original_features=None):
    """
    Comprehensive evaluation of feature selection results
    """
    metrics = {}
    
    # 1. Stability (Jaccard similarity between feature subsets)
    def calculate_stability(feature_sets):
        n_sets = len(feature_sets)
        stability = 0
        for i in range(n_sets):
            for j in range(i + 1, n_sets):
                intersection = len(set(feature_sets[i]) & 
                                 set(feature_sets[j]))
                union = len(set(feature_sets[i]) | 
                           set(feature_sets[j]))
                stability += intersection / union
        return 2 * stability / (n_sets * (n_sets - 1))
    
    # 2. Relevance (Mutual Information with target)
    def calculate_relevance(X, y, features):
        relevance = np.mean([
            mutual_info_score(X[:, f], y) for f in features
        ])
        return relevance
    
    # 3. Redundancy (Average mutual information between features)
    def calculate_redundancy(X, features):
        redundancy = 0
        n_features = len(features)
        if n_features < 2:
            return 0
        
        for i in range(n_features):
            for j in range(i + 1, n_features):
                redundancy += mutual_info_score(
                    X[:, features[i]], X[:, features[j]])
        return 2 * redundancy / (n_features * (n_features - 1))
    
    # Calculate metrics
    metrics['n_selected'] = len(selected_features)
    metrics['relevance'] = calculate_relevance(X, y, selected_features)
    metrics['redundancy'] = calculate_redundancy(X, selected_features)
    
    # Calculate prediction performance
    clf = RandomForestClassifier(random_state=42)
    scores = cross_val_score(clf, X[:, selected_features], y, cv=5)
    metrics['cv_score_mean'] = np.mean(scores)
    metrics['cv_score_std'] = np.std(scores)
    
    # Compare with original features if provided
    if original_features is not None:
        original_scores = cross_val_score(
            clf, X[:, original_features], y, cv=5)
        metrics['original_cv_score'] = np.mean(original_scores)
        metrics['feature_reduction'] = (
            1 - len(selected_features) / len(original_features))
    
    return metrics

# Example usage
X, y = make_classification(n_samples=1000, n_features=20,
                         n_informative=10, random_state=42)

# Apply the feature selection methods defined on earlier slides
# (each has a different signature and return layout)
selections = {
    'variance': variance_threshold_selector(X)[1],
    'correlation': correlation_selector(X, y)[1],
    'mutual_info': information_based_selection(X, y)[1]
}

results = {}
for name, selected_features in selections.items():
    results[name] = evaluate_feature_selection(
        X, y, list(selected_features), list(range(X.shape[1])))
    
    print(f"\nMetrics for {name}:")
    for metric, value in results[name].items():
        print(f"{metric}: {value:.4f}")

Slide 13: Ensemble Feature Selection

This implementation combines multiple feature selection methods through a voting mechanism to create a more robust and reliable feature selection process, reducing the bias of individual methods.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif, f_classif
from sklearn.linear_model import LassoCV

class EnsembleFeatureSelector:
    def __init__(self, methods=None, voting_threshold=0.5):
        self.methods = methods if methods else {
            'rf': self._rf_importance,
            'mutual_info': self._mutual_info,
            'lasso': self._lasso_selection,
            'f_score': self._f_score
        }
        self.voting_threshold = voting_threshold
        self.feature_votes = None
        
    def _rf_importance(self, X, y):
        rf = RandomForestClassifier(n_estimators=100, random_state=42)
        rf.fit(X, y)
        return rf.feature_importances_ > np.mean(rf.feature_importances_)
    
    def _mutual_info(self, X, y):
        mi_scores = mutual_info_classif(X, y)
        return mi_scores > np.mean(mi_scores)
    
    def _lasso_selection(self, X, y):
        lasso = LassoCV(cv=5, random_state=42)
        lasso.fit(X, y)
        return np.abs(lasso.coef_) > 0
    
    def _f_score(self, X, y):
        f_scores, _ = f_classif(X, y)
        return f_scores > np.mean(f_scores)
    
    def fit(self, X, y):
        self.feature_votes = np.zeros(X.shape[1])
        
        # Apply each method and collect votes
        for method in self.methods.values():
            self.feature_votes += method(X, y)
        
        # Normalize votes
        self.feature_votes /= len(self.methods)
        
        return self
    
    def transform(self, X):
        selected = self.feature_votes >= self.voting_threshold
        return X[:, selected]
    
    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)

# Example usage
X, y = make_classification(n_samples=1000, n_features=20,
                         n_informative=10, random_state=42)

# Apply ensemble feature selection
selector = EnsembleFeatureSelector()
X_selected = selector.fit_transform(X, y)

print(f"Original features: {X.shape[1]}")
print(f"Selected features: {X_selected.shape[1]}")
print(f"Feature votes:\n{selector.feature_votes}")

Slide 14: Feature Selection Visualization

This implementation provides comprehensive visualization tools for analyzing feature selection results, including feature importance distributions, stability plots, and performance comparisons.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_curve, auc

def visualize_feature_selection(X, y, results_dict, 
                              output_format='png'):
    """
    Comprehensive visualization of feature selection results
    """
    sns.set_theme()
    fig = plt.figure(figsize=(20, 15))
    
    # 1. Feature Importance Distribution
    plt.subplot(2, 2, 1)
    for method, result in results_dict.items():
        if result.get('importance') is not None:
            sns.kdeplot(result['importance'], label=method)
    plt.title('Feature Importance Distribution')
    plt.xlabel('Importance Score')
    plt.ylabel('Density')
    plt.legend()
    
    # 2. Selected Features Heatmap
    plt.subplot(2, 2, 2)
    selection_matrix = np.array([result['mask'] 
                               for result in results_dict.values()])
    sns.heatmap(selection_matrix, 
                yticklabels=list(results_dict.keys()),
                cmap='YlOrRd')
    plt.title('Feature Selection Comparison')
    plt.xlabel('Feature Index')
    
    # 3. Performance Comparison
    plt.subplot(2, 2, 3)
    performance_data = []
    for method, result in results_dict.items():
        if 'cv_scores' in result:
            performance_data.append({
                'method': method,
                'score': result['cv_scores'].mean(),
                'std': result['cv_scores'].std()
            })
    
    performance_df = pd.DataFrame(performance_data)
    plt.bar(performance_df['method'], performance_df['score'],
            yerr=performance_df['std'])
    plt.title('Performance Comparison')
    plt.ylabel('Cross-validation Score')
    
    # 4. ROC Curves
    plt.subplot(2, 2, 4)
    for method, result in results_dict.items():
        if 'predictions' in result:
            fpr, tpr, _ = roc_curve(y, result['predictions'])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, label=f'{method} (AUC = {roc_auc:.2f})')
    
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curves')
    plt.legend(loc="lower right")
    
    plt.tight_layout()
    return fig

# Example usage
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, mutual_info_classif
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_predict, cross_val_score

X, y = make_classification(n_samples=1000, n_features=20,
                           n_informative=10, random_state=42)

# Wrap estimators in SelectFromModel so every method exposes
# fit_transform and get_support
methods = {
    'RF': SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42)),
    'Lasso': SelectFromModel(LassoCV(cv=5, random_state=42)),
    'MutualInfo': SelectKBest(mutual_info_classif, k=10)
}

results = {}
for name, selector in methods.items():
    X_selected = selector.fit_transform(X, y)
    fitted = getattr(selector, 'estimator_', None)
    
    results[name] = {
        'mask': selector.get_support(),
        'importance': getattr(fitted, 'feature_importances_', None),
        'cv_scores': cross_val_score(
            RandomForestClassifier(random_state=42), X_selected, y, cv=5),
        'predictions': cross_val_predict(
            RandomForestClassifier(random_state=42), X_selected, y, cv=5,
            method='predict_proba')[:, 1]
    }

# Generate visualization
fig = visualize_feature_selection(X, y, results)
plt.show()

Slide 15: Additional Resources

  • "An Introduction to Variable and Feature Selection" - https://www.jmlr.org/papers/volume3/guyon03a/guyon03a.pdf
  • "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution" - Search on Google Scholar
  • "Stability Selection" by Nicolai Meinshausen and Peter Bühlmann - https://arxiv.org/abs/0809.2932
  • "Feature Selection with Ensemble Methods" - Search on Google Scholar for recent publications
  • "A Review of Feature Selection Methods for Machine Learning" - https://arxiv.org/abs/1905.13525
  • "Deep Feature Selection: Theory and Application to Identifying Compounds" - Search on Google Scholar