Source code for spectoprep.modelling.ridge

# spectroprep/modeling/ridge.py

import numpy as np
from sklearn.linear_model import RidgeCV as SklearnRidgeCV
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.model_selection import GroupKFold, KFold
from sklearn.metrics import mean_squared_error, r2_score

class OptimizedRidgeCV(BaseEstimator, RegressorMixin):
    """
    Ridge regression with built-in cross-validation and optimization capabilities.

    Thin wrapper around scikit-learn's ``RidgeCV`` that builds the
    cross-validation splits itself (optionally group-aware) and mirrors the
    fitted attributes of the wrapped estimator.

    Parameters
    ----------
    alphas : array-like, default=np.logspace(-3, 3, 10)
        Array of alpha values to try. A large array of values will slow
        down the computation.
    cv : int, cross-validation generator or an iterable, default=5
        Determines the cross-validation splitting strategy. An int is
        turned into a shuffled ``KFold`` (``random_state=42``), or into a
        ``GroupKFold`` when ``groups`` is given. Any other value is handed
        to ``RidgeCV`` as is, except that a splitter object is first
        expanded into explicit splits when ``groups`` is given.
    scoring : str, callable, default='neg_mean_squared_error'
        A string or a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model.
    normalize : bool, default=False
        This parameter is ignored when fit_intercept is set to False.
        If True, the regressors X will be normalized before regression by
        subtracting the mean and dividing by the l2-norm. Only forwarded
        when True; scikit-learn removed this option from ``RidgeCV`` in
        version 1.2, in which case ``fit`` raises ``ValueError``.
    gcv_mode : {None, 'auto', 'svd', 'eigen'}, default=None
        Flag indicating which strategy to use when performing Generalized
        Cross-Validation.
    store_cv_values : bool, default=False
        Flag indicating if the cross-validation values corresponding to
        each alpha should be stored in the cv_values_ attribute.
        scikit-learn documents this flag as compatible only with
        ``cv=None``.
    groups : array-like, default=None
        Group labels for the samples, one per row of the training data.
        Used to generate the splits when cv is an int or a splitter object.

    Attributes
    ----------
    ridge_cv_ : sklearn.linear_model.RidgeCV
        The fitted underlying estimator.
    alpha_, coef_, intercept_
        Copied from ``ridge_cv_`` after fitting.
    cv_values_
        Per-alpha cross-validation values, set only when the underlying
        estimator exposes them.
    """

    def __init__(self, alphas=None, cv=5, scoring='neg_mean_squared_error',
                 fit_intercept=True, normalize=False, gcv_mode=None,
                 store_cv_values=False, groups=None):
        # The default grid is resolved here (not in fit) so that ``alphas``
        # is always readable right after construction.
        self.alphas = alphas if alphas is not None else np.logspace(-3, 3, 10)
        self.cv = cv
        self.scoring = scoring
        self.fit_intercept = fit_intercept
        self.normalize = normalize
        self.gcv_mode = gcv_mode
        self.store_cv_values = store_cv_values
        self.groups = groups

    def _resolve_cv(self, X, y):
        """Return the ``cv`` argument to hand to scikit-learn's ``RidgeCV``."""
        if self.groups is None:
            if isinstance(self.cv, int):
                return KFold(n_splits=self.cv, shuffle=True, random_state=42)
            return self.cv

        if isinstance(self.cv, int):
            splitter = GroupKFold(n_splits=self.cv)
        else:
            splitter = self.cv

        # RidgeCV.fit is called without group labels, so a group-aware
        # splitter passed directly would never see them. Generate the
        # (train, test) index pairs here and pass them as an iterable.
        if hasattr(splitter, 'split'):
            return list(splitter.split(X, y, groups=self.groups))

        # None or an explicit iterable of splits: nothing to expand.
        return splitter

    def _ridge_cv_kwargs(self, cv):
        """Build constructor kwargs accepted by the installed ``RidgeCV``."""
        # Parameter names differ between scikit-learn releases; ask a
        # default instance which ones exist instead of hard-coding a version.
        supported = SklearnRidgeCV().get_params(deep=False)

        kwargs = {
            'alphas': self.alphas,
            'fit_intercept': self.fit_intercept,
            'scoring': self.scoring,
            'cv': cv,
            'gcv_mode': self.gcv_mode,
        }

        # Only forward ``normalize`` when it was requested: leaving it out
        # is equivalent to False and keeps working on releases without it.
        if self.normalize:
            if 'normalize' not in supported:
                raise ValueError(
                    "normalize=True is not supported by the installed "
                    "scikit-learn version; scale X before fitting instead "
                    "(e.g. with StandardScaler)."
                )
            kwargs['normalize'] = self.normalize

        # ``store_cv_values`` was renamed to ``store_cv_results``; prefer
        # the new name whenever the installed version offers it.
        if 'store_cv_results' in supported:
            kwargs['store_cv_results'] = self.store_cv_values
        else:
            kwargs['store_cv_values'] = self.store_cv_values

        return kwargs

    def fit(self, X, y, sample_weight=None):
        """
        Fit Ridge regression model with cross-validation.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.
        sample_weight : float or array-like of shape (n_samples,), default=None
            Individual weights for each sample.

        Returns
        -------
        self : object
            Returns self.

        Raises
        ------
        ValueError
            If ``groups`` does not have one entry per sample, or if
            ``normalize=True`` is requested but unsupported by the
            installed scikit-learn version.
        """
        X, y = check_X_y(X, y, y_numeric=True, multi_output=True)

        if self.groups is not None and len(self.groups) != X.shape[0]:
            raise ValueError("groups must have the same length as X")

        cv = self._resolve_cv(X, y)

        # Create and fit the RidgeCV estimator from scikit-learn
        self.ridge_cv_ = SklearnRidgeCV(**self._ridge_cv_kwargs(cv))
        self.ridge_cv_.fit(X, y, sample_weight=sample_weight)

        # Store fitted attributes
        self.alpha_ = self.ridge_cv_.alpha_
        self.coef_ = self.ridge_cv_.coef_
        self.intercept_ = self.ridge_cv_.intercept_

        # Drop values left over from a previous fit so get_cv_results()
        # never reports data that does not belong to the current model.
        self.__dict__.pop('cv_values_', None)
        # Newer scikit-learn exposes ``cv_results_``; older ``cv_values_``.
        for attr_name in ('cv_results_', 'cv_values_'):
            if hasattr(self.ridge_cv_, attr_name):
                self.cv_values_ = getattr(self.ridge_cv_, attr_name)
                break

        return self

    def predict(self, X):
        """
        Predict using the Ridge model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples.

        Returns
        -------
        y_pred : array-like of shape (n_samples,) or (n_samples, n_targets)
            Returns predicted values.
        """
        check_is_fitted(self, ["ridge_cv_", "alpha_", "coef_", "intercept_"])
        X = check_array(X)
        return self.ridge_cv_.predict(X)

    def score(self, X, y, sample_weight=None):
        """
        Return the coefficient of determination R^2 of the prediction.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.
        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            True values for X.
        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            R^2 of self.predict(X) wrt. y.
        """
        check_is_fitted(self, ["ridge_cv_", "alpha_", "coef_", "intercept_"])
        return self.ridge_cv_.score(X, y, sample_weight=sample_weight)

    def get_cv_results(self):
        """
        Return cross-validation results.

        Returns
        -------
        cv_results : dict
            Dictionary with the selected ``alpha``, the ``alphas_tested``
            grid, the stored ``cv_values`` (``None`` when not available),
            and the fitted ``coef`` and ``intercept``.
        """
        check_is_fitted(self, ["ridge_cv_"])
        return {
            'alpha': self.alpha_,
            'alphas_tested': self.alphas,
            'cv_values': getattr(self, 'cv_values_', None),
            'coef': self.coef_,
            'intercept': self.intercept_,
        }