# spectroprep/modeling/ridge.py
import numpy as np
from sklearn.linear_model import RidgeCV as SklearnRidgeCV
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error, r2_score
class OptimizedRidgeCV(BaseEstimator, RegressorMixin):
"""
    Ridge regression with built-in cross-validation and optimization capabilities.

    Parameters
    ----------
    alphas : array-like, default=None
        Grid of alpha values to try. If None, ``np.logspace(-3, 3, 10)`` is
        used. A large grid of values will slow down the computation.
    cv : int, cross-validation generator or an iterable, default=5
        Determines the cross-validation splitting strategy. An integer is
        interpreted as the number of folds (shuffled ``KFold``, or
        ``GroupKFold`` when ``groups`` is given).
scoring : str, callable, default='neg_mean_squared_error'
A string or a scorer callable object / function with signature
``scorer(estimator, X, y)``.
fit_intercept : bool, default=True
Whether to calculate the intercept for this model.
    normalize : bool, default=False
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm. Ignored when
        ``fit_intercept`` is False. Only forwarded to scikit-learn's
        ``RidgeCV`` when set, since the parameter was removed in
        scikit-learn 1.2.
    gcv_mode : {None, 'auto', 'svd', 'eigen'}, default=None
        Strategy to use when performing leave-one-out generalized
        cross-validation; only used when ``cv`` is None.
    store_cv_values : bool, default=False
        Flag indicating if the cross-validation values corresponding to
        each alpha should be stored in the ``cv_values_`` attribute. Only
        available when ``cv`` is None (leave-one-out generalized
        cross-validation).
    groups : array-like of shape (n_samples,), default=None
        Group labels for the samples. When given, group-aware
        cross-validation splits are generated (``GroupKFold`` if ``cv``
        is an integer, otherwise the splits of the provided splitter).
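
    Examples
    --------
    A minimal usage sketch on synthetic data (the alpha grid and fold
    count below are illustrative, not recommendations)::

        >>> import numpy as np
        >>> from spectroprep.modeling.ridge import OptimizedRidgeCV
        >>> rng = np.random.default_rng(0)
        >>> X = rng.normal(size=(60, 8))
        >>> y = X @ rng.normal(size=8)
        >>> model = OptimizedRidgeCV(alphas=np.logspace(-2, 2, 5), cv=3).fit(X, y)
        >>> y_pred = model.predict(X)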
"""
    def __init__(self, alphas=None, cv=5, scoring='neg_mean_squared_error',
                 fit_intercept=True, normalize=False, gcv_mode=None,
                 store_cv_values=False, groups=None):
        # Per scikit-learn convention, __init__ only stores the parameters
        # unmodified (so that get_params/clone work); the default alpha
        # grid is resolved in fit().
        self.alphas = alphas
        self.cv = cv
        self.scoring = scoring
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.gcv_mode = gcv_mode
        self.store_cv_values = store_cv_values
        self.groups = groups
def fit(self, X, y, sample_weight=None):
"""
        Fit Ridge regression model with cross-validation.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.
        sample_weight : float or array-like of shape (n_samples,), default=None
            Individual weights for each sample.

        Returns
-------
self : object
Returns self.
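
        Examples
        --------
        A sketch of group-aware fitting (the group labels below are
        synthetic and purely illustrative)::

            >>> groups = np.repeat(np.arange(12), 5)  # 12 groups of 5 samples
            >>> model = OptimizedRidgeCV(cv=4, groups=groups)
            >>> model = model.fit(X, y)  # X, y with 60 samples  # doctest: +SKIP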
"""
        X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

        # Resolve the alpha grid here rather than in __init__.
        alphas = self.alphas if self.alphas is not None else np.logspace(-3, 3, 10)

        # Build the cross-validation splits. RidgeCV does not forward group
        # labels to the splitter, so group-aware splits are precomputed and
        # passed as an explicit list of (train, test) index pairs.
        if self.groups is not None:
            groups = np.asarray(self.groups)
            if groups.shape[0] != X.shape[0]:
                raise ValueError("groups must have the same length as X")
            splitter = GroupKFold(n_splits=self.cv) if isinstance(self.cv, int) else self.cv
            cv = list(splitter.split(X, y, groups))
        elif isinstance(self.cv, int):
            cv = KFold(n_splits=self.cv, shuffle=True, random_state=42)
        else:
            cv = self.cv

        # Configure the underlying scikit-learn RidgeCV estimator. The
        # deprecated ``normalize`` flag is only forwarded when explicitly
        # requested, since it was removed from RidgeCV in scikit-learn 1.2.
        ridge_params = dict(
            alphas=alphas,
            fit_intercept=self.fit_intercept,
            scoring=self.scoring,
            cv=cv,
            gcv_mode=self.gcv_mode,
            store_cv_values=self.store_cv_values,
        )
        if self.normalize:
            ridge_params['normalize'] = self.normalize
        self.ridge_cv_ = SklearnRidgeCV(**ridge_params)

        # Fit the model and expose the fitted attributes on this estimator.
        self.ridge_cv_.fit(X, y, sample_weight=sample_weight)
        self.alpha_ = self.ridge_cv_.alpha_
        self.coef_ = self.ridge_cv_.coef_
        self.intercept_ = self.ridge_cv_.intercept_
        if hasattr(self.ridge_cv_, 'cv_values_'):
            self.cv_values_ = self.ridge_cv_.cv_values_
        return self
def predict(self, X):
"""
        Predict using the Ridge model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples.

        Returns
-------
y_pred : array-like of shape (n_samples,) or (n_samples, n_targets)
Returns predicted values.
"""
check_is_fitted(self, ["ridge_cv_", "alpha_", "coef_", "intercept_"])
X = check_array(X)
return self.ridge_cv_.predict(X)
def score(self, X, y, sample_weight=None):
"""
        Return the coefficient of determination R^2 of the prediction.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            True values for X.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
-------
score : float
R^2 of self.predict(X) wrt. y.
"""
check_is_fitted(self, ["ridge_cv_", "alpha_", "coef_", "intercept_"])
return self.ridge_cv_.score(X, y, sample_weight=sample_weight)
def get_cv_results(self):
"""
        Return cross-validation results.

        Returns
-------
cv_results : dict
Results from cross-validation.
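
        Examples
        --------
        Inspecting the selected regularization strength after fitting
        (``model`` here stands for a fitted ``OptimizedRidgeCV``)::

            >>> results = model.get_cv_results()  # doctest: +SKIP
            >>> sorted(results)  # doctest: +SKIP
            ['alpha', 'alphas_tested', 'coef', 'cv_values', 'intercept']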
"""
check_is_fitted(self, ["ridge_cv_"])
return {
'alpha': self.alpha_,
            'alphas_tested': np.asarray(self.ridge_cv_.alphas),
'cv_values': getattr(self, 'cv_values_', None),
'coef': self.coef_,
'intercept': self.intercept_
}