Source code for spectoprep.visualization.plots

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from typing import List, Dict, Tuple, Union, Optional
import pandas as pd
from spectoprep.pipeline.optimizer import PipelineOptimizer


[docs]
class SpectroPrepPlotter:
    """
    A class for creating high-quality plots for spectroscopy data.
    
    This class provides various plotting functions specifically designed
    for spectroscopy data and pipeline optimization results.
    """
    

[docs]
    @staticmethod
    def set_style(style='whitegrid', context='paper', font_scale=1.2):
        """
        Configure the global seaborn/matplotlib appearance used for plotting.

        The seaborn style and context are applied first, then the default
        figure size and resolution are written into ``plt.rcParams``. These
        are process-wide settings and affect every figure created afterwards.

        Parameters
        ----------
        style : str, default='whitegrid'
            Name forwarded to ``seaborn.set_style``.
        context : str, default='paper'
            Name forwarded to ``seaborn.set_context``.
        font_scale : float, default=1.2
            Font scaling factor forwarded to ``seaborn.set_context``.
        """
        # Seaborn theme first, so the explicit rcParams below are applied last.
        sns.set_style(style)
        sns.set_context(context, font_scale=font_scale)

        figure_defaults = {
            'figure.figsize': (10, 6),
            'figure.dpi': 100,
        }
        plt.rcParams.update(figure_defaults)

        

[docs]
    @staticmethod
    def plot_spectra(wavenumbers: np.ndarray,
                     spectra: np.ndarray,
                     labels: Optional[List[str]] = None,
                     title: str = 'Spectral Data',
                     xlabel: str = 'Wavenumber (cm$^{-1}$)',
                     ylabel: str = 'Absorbance',
                     alpha: float = 0.7,
                     figsize: Tuple[int, int] = (12, 6),
                     color_map: str = 'viridis',
                     legend_loc: str = 'best',
                     grid: bool = True,
                     save_path: Optional[str] = None):
        """
        Plot one or more spectra on a single axes.

        Parameters
        ----------
        wavenumbers : array-like
            The x-axis values (wavenumbers).
        spectra : array-like
            The spectra data of shape (n_samples, n_features). A 1-D input is
            treated as a single spectrum.
        labels : list of str, optional
            Labels for each spectrum. If None, spectra are numbered
            ``'Spectrum 1'``, ``'Spectrum 2'``, ...
        title : str, default='Spectral Data'
            Plot title.
        xlabel : str, default='Wavenumber (cm$^{-1}$)'
            X-axis label.
        ylabel : str, default='Absorbance'
            Y-axis label.
        alpha : float, default=0.7
            Transparency of the lines.
        figsize : tuple, default=(12, 6)
            Figure size.
        color_map : str, default='viridis'
            Colormap the line colours are sampled from (evenly over [0, 1]).
        legend_loc : str, default='best'
            Location of the legend.
        grid : bool, default=True
            Whether to show grid.
        save_path : str, optional
            If provided, save the figure to this path.

        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        ax : matplotlib.axes.Axes
            The axes object.

        Raises
        ------
        ValueError
            If ``labels`` is given but has fewer entries than there are spectra.
        """
        # Accept any array-like (lists, pandas objects), as the docstring
        # promises; ``.ndim`` / ``.reshape`` below require an ndarray.
        wavenumbers = np.asarray(wavenumbers)
        spectra = np.asarray(spectra)
        if spectra.ndim == 1:
            spectra = spectra.reshape(1, -1)

        n_spectra = spectra.shape[0]

        # Validate before a figure exists so a bad call does not leave a
        # half-drawn figure open in pyplot's registry.
        if labels is not None and len(labels) < n_spectra:
            raise ValueError(
                f"labels has {len(labels)} entries but {n_spectra} spectra were given"
            )

        fig, ax = plt.subplots(figsize=figsize)

        # Sample the colormap evenly; max(1, ...) avoids 0/0 for one spectrum.
        cmap = plt.get_cmap(color_map)
        denom = max(1, n_spectra - 1)
        colors = [cmap(i / denom) for i in range(n_spectra)]

        # Plot each spectrum
        for i, spectrum in enumerate(spectra):
            label = f'Spectrum {i+1}' if labels is None else labels[i]
            ax.plot(wavenumbers, spectrum, label=label, color=colors[i], alpha=alpha)

        # Set labels and title
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')

        # Show grid if requested
        if grid:
            ax.grid(True, linestyle='--', alpha=0.7)

        # A legend is only useful when there is more than one line
        if n_spectra > 1:
            ax.legend(loc=legend_loc, frameon=True, framealpha=0.8)

        # Operate on this figure explicitly rather than on pyplot's
        # "current figure", which another thread/callback could have changed.
        fig.tight_layout()

        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')

        return fig, ax

    

[docs]
    @staticmethod
    def plot_preprocessing_comparison(wavenumbers: np.ndarray,
                                    original_spectra: np.ndarray,
                                    processed_spectra: Dict[str, np.ndarray],
                                    sample_indices: Optional[List[int]] = None,
                                    figsize: Tuple[int, int] = (15, 10),
                                    title: str = 'Preprocessing Comparison',
                                    color_map: str = 'tab10',
                                    save_path: Optional[str] = None):
        """
        Plot comparison of original and processed spectra.
        
        Parameters
        ----------
        wavenumbers : array-like
            The x-axis values (wavenumbers).
        original_spectra : array-like
            The original spectra data of shape (n_samples, n_features).
        processed_spectra : dict
            Dictionary mapping preprocessing method names to processed spectra.
        sample_indices : list of int, optional
            Indices of samples to plot. If None, all samples are plotted.
        figsize : tuple, default=(15, 10)
            Figure size.
        title : str, default='Preprocessing Comparison'
            Main title for the figure.
        color_map : str, default='tab10'
            Colormap for differentiating samples.
        save_path : str, optional
            If provided, save the figure to this path.
            
        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        """
        # Get the number of preprocessing methods
        n_methods = len(processed_spectra) + 1  # +1 for original spectra
        
        # Determine the samples to plot
        if sample_indices is None:
            sample_indices = list(range(original_spectra.shape[0]))
        n_samples = len(sample_indices)
        
        # Create a figure with subplots
        fig = plt.figure(figsize=figsize)
        gs = gridspec.GridSpec(n_methods, 1, height_ratios=[1] * n_methods)
        
        # Get colors for samples
        cmap = plt.get_cmap(color_map)
        colors = [cmap(i % 10) for i in range(n_samples)]
        
        # Plot original spectra
        ax_orig = fig.add_subplot(gs[0])
        for i, idx in enumerate(sample_indices):
            ax_orig.plot(wavenumbers, original_spectra[idx], 
                        color=colors[i], label=f'Sample {idx+1}')
        ax_orig.set_title('Original Spectra', fontsize=12)
        ax_orig.set_xlabel('')
        ax_orig.set_ylabel('Absorbance')
        ax_orig.legend(loc='best', frameon=True)
        ax_orig.grid(True, linestyle='--', alpha=0.3)
        
        # Plot processed spectra
        for i, (method_name, processed) in enumerate(processed_spectra.items(), 1):
            ax = fig.add_subplot(gs[i], sharex=ax_orig)
            for j, idx in enumerate(sample_indices):
                ax.plot(wavenumbers, processed[idx], color=colors[j], label=f'Sample {idx+1}')
            ax.set_title(f'{method_name}', fontsize=12)
            if i == n_methods - 1:
                ax.set_xlabel('Wavenumber (cm$^{-1}$)', fontsize=12)
            else:
                ax.set_xlabel('')
            ax.set_ylabel('Absorbance')
            ax.grid(True, linestyle='--', alpha=0.3)
        
        plt.suptitle(title, fontsize=16, fontweight='bold', y=0.98)
        plt.tight_layout(rect=[0, 0, 1, 0.97])
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        
        return fig

    

[docs]
    @staticmethod
    def plot_optimization_results(optimizer: PipelineOptimizer,
                                top_n: int = 5,
                                figsize: Tuple[int, int] = (12, 8),
                                title: str = 'Pipeline Optimization Results',
                                save_path: Optional[str] = None):
        """
        Plot optimization results from PipelineOptimizer.
        
        Parameters
        ----------
        optimizer : PipelineOptimizer
            The fitted pipeline optimizer.
        top_n : int, default=5
            Number of top pipelines to display.
        figsize : tuple, default=(12, 8)
            Figure size.
        title : str, default='Pipeline Optimization Results'
            Plot title.
        save_path : str, optional
            If provided, save the figure to this path.
            
        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        """
        if optimizer.optimizer is None:
            raise ValueError("No optimization results found. Run bayesian_optimize() first.")
        
        # Get all results
        results = optimizer.get_all_tested_pipelines()
        
        # Sort by RMSE (ascending)
        sorted_results = sorted(results, key=lambda x: x['rmse'])
        
        # Select top N results
        top_results = sorted_results[:top_n]
        
        # Extract data for plotting
        pipelines = [' → '.join(res['pipeline_config']) for res in top_results]
        rmses = [res['rmse'] for res in top_results]
        r2s = [res['r2'] for res in top_results]
        
        # Create figure with two subplots
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize, sharex=True)
        
        # Plot RMSE values
        ax1.barh(pipelines, rmses, color='skyblue', alpha=0.8)
        ax1.set_title('RMSE (lower is better)', fontsize=12)
        ax1.set_ylabel('Pipeline Configuration')
        ax1.grid(True, linestyle='--', alpha=0.3, axis='x')
        
        # Add RMSE values as text
        for i, rmse in enumerate(rmses):
            ax1.text(rmse + max(rmses) * 0.01, i, f'{rmse:.4f}', 
                    va='center', fontsize=10)
        
        # Plot R² values
        ax2.barh(pipelines, r2s, color='lightgreen', alpha=0.8)
        ax2.set_title('R² (higher is better)', fontsize=12)
        ax2.set_ylabel('Pipeline Configuration')
        ax2.set_xlabel('Score')
        ax2.grid(True, linestyle='--', alpha=0.3, axis='x')
        
        # Add R² values as text
        for i, r2 in enumerate(r2s):
            ax2.text(r2 + max(r2s) * 0.01, i, f'{r2:.4f}', 
                    va='center', fontsize=10)
        
        plt.suptitle(title, fontsize=16, fontweight='bold')
        plt.tight_layout(rect=[0, 0, 1, 0.96])
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        
        return fig

    

[docs]
    @staticmethod
    def plot_prediction_scatter(y_true: np.ndarray,
                               y_pred: np.ndarray,
                               title: str = 'Prediction Performance',
                               xlabel: str = 'Measured',
                               ylabel: str = 'Predicted',
                               figsize: Tuple[int, int] = (10, 8),
                               alpha: float = 0.7,
                               color: str = 'blue',
                               add_metrics: bool = True,
                               save_path: Optional[str] = None):
        """
        Create a scatter plot of predicted vs true values.
        
        Parameters
        ----------
        y_true : array-like
            True target values.
        y_pred : array-like
            Predicted target values.
        title : str, default='Prediction Performance'
            Plot title.
        xlabel : str, default='Measured'
            X-axis label.
        ylabel : str, default='Predicted'
            Y-axis label.
        figsize : tuple, default=(10, 8)
            Figure size.
        alpha : float, default=0.7
            Transparency of the points.
        color : str, default='blue'
            Color of the scatter points.
        add_metrics : bool, default=True
            Whether to add RMSE and R² metrics to the plot.
        save_path : str, optional
            If provided, save the figure to this path.
            
        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        ax : matplotlib.axes.Axes
            The axes object.
        """
        # Calculate metrics
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)
        
        # Create figure
        fig, ax = plt.subplots(figsize=figsize)
        
        # Plot scatter points
        ax.scatter(y_true, y_pred, alpha=alpha, color=color, edgecolor='k', linewidth=0.5)
        
        # Calculate and plot identity line
        min_val = min(np.min(y_true), np.min(y_pred))
        max_val = max(np.max(y_true), np.max(y_pred))
        padding = (max_val - min_val) * 0.05
        line_x = np.array([min_val - padding, max_val + padding])
        ax.plot(line_x, line_x, 'k--', alpha=0.7, label='Identity Line')
        
        # Set axis limits
        ax.set_xlim(min_val - padding, max_val + padding)
        ax.set_ylim(min_val - padding, max_val + padding)
        
        # Add metrics text if requested
        if add_metrics:
            metrics_text = f'RMSE = {rmse:.4f}\nR² = {r2:.4f}'
            ax.text(0.05, 0.95, metrics_text, transform=ax.transAxes,
                   bbox=dict(facecolor='white', alpha=0.8, boxstyle='round,pad=0.5'),
                   verticalalignment='top', fontsize=12)
        
        # Set labels and title
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        
        # Add grid
        ax.grid(True, linestyle='--', alpha=0.3)
        
        # Adjust layout
        plt.tight_layout()
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        
        return fig, ax

    

[docs]
    @staticmethod
    def plot_optimization_progress(optimizer: PipelineOptimizer,
                                 figsize: Tuple[int, int] = (12, 6),
                                 title: str = 'Optimization Progress',
                                 save_path: Optional[str] = None):
        """
        Plot optimization progress over iterations.
        
        Parameters
        ----------
        optimizer : PipelineOptimizer
            The fitted pipeline optimizer.
        figsize : tuple, default=(12, 6)
            Figure size.
        title : str, default='Optimization Progress'
            Plot title.
        save_path : str, optional
            If provided, save the figure to this path.
            
        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        ax : matplotlib.axes.Axes
            The axes object.
        """
        if optimizer.optimizer is None:
            raise ValueError("No optimization results found. Run bayesian_optimize() first.")
        
        # Get all results
        results = optimizer.get_all_tested_pipelines()
        
        # Extract iteration and RMSE data
        iterations = [res['trial'] for res in results if res['trial'] is not None]
        rmses = [res['rmse'] for res in results if res['trial'] is not None]
        
        # Create DataFrame for easier plotting
        df = pd.DataFrame({'Iteration': iterations, 'RMSE': rmses})
        df = df.sort_values('Iteration')
        
        # Create figure
        fig, ax = plt.subplots(figsize=figsize)
        
        # Plot RMSE values
        ax.plot(df['Iteration'], df['RMSE'], marker='o', linestyle='-', color='blue', alpha=0.7)
        
        # Calculate and plot running minimum RMSE
        running_min = df['RMSE'].cummin()
        ax.plot(df['Iteration'], running_min, marker='', linestyle='--', color='red', 
               alpha=0.8, label='Best RMSE')
        
        # Add labels and title
        ax.set_xlabel('Iteration', fontsize=12)
        ax.set_ylabel('RMSE', fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        
        # Add legend
        ax.legend(loc='upper right')
        
        # Add grid
        ax.grid(True, linestyle='--', alpha=0.3)
        
        # Adjust layout
        plt.tight_layout()
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        
        return fig, ax

    

[docs]
    @staticmethod
    def plot_feature_importance(wavenumbers: np.ndarray,
                              coefficients: np.ndarray,
                              title: str = 'Feature Importance',
                              xlabel: str = 'Wavenumber (cm$^{-1}$)',
                              ylabel: str = 'Coefficient Value',
                              figsize: Tuple[int, int] = (12, 6),
                              color: str = 'purple',
                              highlight_threshold: Optional[float] = None,
                              highlight_color: str = 'red',
                              save_path: Optional[str] = None):
        """
        Plot feature importance from model coefficients.
        
        Parameters
        ----------
        wavenumbers : array-like
            The x-axis values (wavenumbers).
        coefficients : array-like
            Model coefficients corresponding to each wavenumber.
        title : str, default='Feature Importance'
            Plot title.
        xlabel : str, default='Wavenumber (cm$^{-1}$)'
            X-axis label.
        ylabel : str, default='Coefficient Value'
            Y-axis label.
        figsize : tuple, default=(12, 6)
            Figure size.
        color : str, default='purple'
            Color of the line.
        highlight_threshold : float, optional
            If provided, highlights coefficients with absolute values above this threshold.
        highlight_color : str, default='red'
            Color for highlighted coefficients.
        save_path : str, optional
            If provided, save the figure to this path.
            
        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        ax : matplotlib.axes.Axes
            The axes object.
        """
        fig, ax = plt.subplots(figsize=figsize)
        
        # Plot coefficients
        ax.plot(wavenumbers, coefficients, color=color, alpha=0.7)
        
        # Highlight important features if threshold is provided
        if highlight_threshold is not None:
            important_mask = np.abs(coefficients) > highlight_threshold
            if np.any(important_mask):
                ax.scatter(wavenumbers[important_mask], coefficients[important_mask], 
                         color=highlight_color, s=50, zorder=3, 
                         label=f'|Coef| > {highlight_threshold}')
                ax.legend(loc='best')
        
        # Add zero line
        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
        
        # Set labels and title
        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        
        # Add grid
        ax.grid(True, linestyle='--', alpha=0.3)
        
        # Adjust layout
        plt.tight_layout()
        
        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        
        return fig, ax