# Source code for spectoprep.visualization.plots

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from sklearn.metrics import mean_squared_error, r2_score
from typing import List, Dict, Tuple, Union, Optional
import pandas as pd
from spectoprep.pipeline.optimizer import PipelineOptimizer

class SpectroPrepPlotter:
    """
    A class for creating high-quality plots for spectroscopy data.

    This class provides various plotting functions specifically designed for
    spectroscopy data and pipeline optimization results. Every plotting
    function is a static method, so no instance is required.
    """

    @staticmethod
    def _finalize(fig, save_path: Optional[str] = None,
                  rect: Optional[List[float]] = None) -> None:
        """
        Tighten the layout of ``fig`` and optionally write it to disk.

        Operates on the figure object itself instead of pyplot's implicit
        "current figure", so the result does not depend on global state.

        Parameters
        ----------
        fig : matplotlib.figure.Figure
            The figure to lay out and save.
        save_path : str, optional
            If truthy, the figure is saved there at 300 dpi.
        rect : list of float, optional
            Forwarded to ``Figure.tight_layout`` to reserve room (e.g. for a
            suptitle).
        """
        if rect is None:
            fig.tight_layout()
        else:
            fig.tight_layout(rect=rect)
        if save_path:
            fig.savefig(save_path, dpi=300, bbox_inches='tight')

    @staticmethod
    def _annotate_bars(ax, values, positions) -> None:
        """
        Write each bar's value (four decimals) just beyond the end of the bar.

        The offset is 1% of the largest value; negative bars are labelled on
        their left side so the text never lands inside the bar.
        """
        if not values:
            return
        # Loop-invariant, so computed once; abs() keeps the offset positive
        # even when every value is negative.
        offset = abs(max(values)) * 0.01
        for pos, value in zip(positions, values):
            if value >= 0:
                ax.text(value + offset, pos, f'{value:.4f}',
                        va='center', ha='left', fontsize=10)
            else:
                ax.text(value - offset, pos, f'{value:.4f}',
                        va='center', ha='right', fontsize=10)

    @staticmethod
    def set_style(style='whitegrid', context='paper', font_scale=1.2) -> None:
        """
        Set the visual style for the plots.

        Changes global seaborn/matplotlib settings, so it affects every
        figure created afterwards.

        Parameters
        ----------
        style : str, default='whitegrid'
            The seaborn style.
        context : str, default='paper'
            The seaborn context.
        font_scale : float, default=1.2
            The font scale.
        """
        sns.set_style(style)
        sns.set_context(context, font_scale=font_scale)
        plt.rcParams['figure.figsize'] = (10, 6)
        plt.rcParams['figure.dpi'] = 100

    @staticmethod
    def plot_spectra(wavenumbers: np.ndarray,
                     spectra: np.ndarray,
                     labels: Optional[List[str]] = None,
                     title: str = 'Spectral Data',
                     xlabel: str = 'Wavenumber (cm$^{-1}$)',
                     ylabel: str = 'Absorbance',
                     alpha: float = 0.7,
                     figsize: Tuple[int, int] = (12, 6),
                     color_map: str = 'viridis',
                     legend_loc: str = 'best',
                     grid: bool = True,
                     save_path: Optional[str] = None) -> Tuple[plt.Figure, plt.Axes]:
        """
        Plot spectral data.

        Parameters
        ----------
        wavenumbers : array-like
            The x-axis values (wavenumbers).
        spectra : array-like
            The spectra data of shape (n_samples, n_features). A 1-D input is
            treated as a single spectrum.
        labels : list of str, optional
            Labels for each spectrum. If None, spectra are numbered.
        title : str, default='Spectral Data'
            Plot title.
        xlabel : str, default='Wavenumber (cm$^{-1}$)'
            X-axis label.
        ylabel : str, default='Absorbance'
            Y-axis label.
        alpha : float, default=0.7
            Transparency of the lines.
        figsize : tuple, default=(12, 6)
            Figure size.
        color_map : str, default='viridis'
            Colormap for the spectra; colors are sampled evenly across it.
        legend_loc : str, default='best'
            Location of the legend.
        grid : bool, default=True
            Whether to show grid.
        save_path : str, optional
            If provided, save the figure to this path.

        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        ax : matplotlib.axes.Axes
            The axes object.

        Raises
        ------
        ValueError
            If ``labels`` is given but has fewer entries than there are
            spectra.
        """
        # Accept lists/tuples as documented, not only ndarrays.
        wavenumbers = np.asarray(wavenumbers)
        spectra = np.atleast_2d(np.asarray(spectra))
        n_spectra = spectra.shape[0]

        # Validate before creating the figure so a bad call leaves nothing
        # half-drawn behind.
        if labels is not None and len(labels) < n_spectra:
            raise ValueError(
                f"Got {len(labels)} labels for {n_spectra} spectra."
            )

        fig, ax = plt.subplots(figsize=figsize)

        # Spread the spectra evenly over the colormap; max(1, ...) avoids a
        # division by zero for a single spectrum.
        cmap = plt.get_cmap(color_map)
        colors = [cmap(i / max(1, n_spectra - 1)) for i in range(n_spectra)]

        for i, spectrum in enumerate(spectra):
            label = f'Spectrum {i+1}' if labels is None else labels[i]
            ax.plot(wavenumbers, spectrum, label=label, color=colors[i],
                    alpha=alpha)

        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')

        if grid:
            ax.grid(True, linestyle='--', alpha=0.7)

        # A legend is only useful when there is something to tell apart.
        if n_spectra > 1:
            ax.legend(loc=legend_loc, frameon=True, framealpha=0.8)

        SpectroPrepPlotter._finalize(fig, save_path)
        return fig, ax

    @staticmethod
    def plot_preprocessing_comparison(wavenumbers: np.ndarray,
                                      original_spectra: np.ndarray,
                                      processed_spectra: Dict[str, np.ndarray],
                                      sample_indices: Optional[List[int]] = None,
                                      figsize: Tuple[int, int] = (15, 10),
                                      title: str = 'Preprocessing Comparison',
                                      color_map: str = 'tab10',
                                      save_path: Optional[str] = None) -> plt.Figure:
        """
        Plot comparison of original and processed spectra.

        One subplot is drawn for the original spectra and one for each entry
        of ``processed_spectra``, stacked vertically with a shared x-axis.

        Parameters
        ----------
        wavenumbers : array-like
            The x-axis values (wavenumbers).
        original_spectra : array-like
            The original spectra data of shape (n_samples, n_features). A 1-D
            input is treated as a single sample.
        processed_spectra : dict
            Dictionary mapping preprocessing method names to processed
            spectra, indexed by sample in the same way as
            ``original_spectra``.
        sample_indices : list of int, optional
            Indices of samples to plot. If None, all samples are plotted.
        figsize : tuple, default=(15, 10)
            Figure size.
        title : str, default='Preprocessing Comparison'
            Main title for the figure.
        color_map : str, default='tab10'
            Colormap for differentiating samples. Discrete maps are cycled
            through; continuous maps are sampled evenly.
        save_path : str, optional
            If provided, save the figure to this path.

        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        """
        wavenumbers = np.asarray(wavenumbers)
        original_spectra = np.atleast_2d(np.asarray(original_spectra))

        # +1 row for the original spectra.
        n_methods = len(processed_spectra) + 1

        if sample_indices is None:
            sample_indices = list(range(original_spectra.shape[0]))
        n_samples = len(sample_indices)

        fig = plt.figure(figsize=figsize)
        gs = gridspec.GridSpec(n_methods, 1, height_ratios=[1] * n_methods)

        cmap = plt.get_cmap(color_map)
        if cmap.N < 256:
            # Discrete/qualitative map (tab10, tab20, Set3, ...): integer
            # indices pick distinct entries, wrapping at the map's own size.
            colors = [cmap(i % cmap.N) for i in range(n_samples)]
        else:
            # Continuous map: neighbouring integer indices are almost the
            # same colour, so sample evenly over the full range instead.
            colors = [cmap(i / max(1, n_samples - 1)) for i in range(n_samples)]

        ax_orig = fig.add_subplot(gs[0])
        for i, idx in enumerate(sample_indices):
            ax_orig.plot(wavenumbers, original_spectra[idx], color=colors[i],
                         label=f'Sample {idx+1}')
        ax_orig.set_title('Original Spectra', fontsize=12)
        ax_orig.set_xlabel('')
        ax_orig.set_ylabel('Absorbance')
        ax_orig.legend(loc='best', frameon=True)
        ax_orig.grid(True, linestyle='--', alpha=0.3)

        # Track the bottom-most axes so the x-label is set even when there
        # are no processed spectra at all.
        bottom_ax = ax_orig
        for row, (method_name, processed) in enumerate(processed_spectra.items(), 1):
            processed = np.atleast_2d(np.asarray(processed))
            ax = fig.add_subplot(gs[row], sharex=ax_orig)
            for j, idx in enumerate(sample_indices):
                ax.plot(wavenumbers, processed[idx], color=colors[j],
                        label=f'Sample {idx+1}')
            ax.set_title(f'{method_name}', fontsize=12)
            ax.set_xlabel('')
            ax.set_ylabel('Absorbance')
            ax.grid(True, linestyle='--', alpha=0.3)
            bottom_ax = ax

        bottom_ax.set_xlabel('Wavenumber (cm$^{-1}$)', fontsize=12)

        fig.suptitle(title, fontsize=16, fontweight='bold', y=0.98)
        # Leave the top 3% free for the suptitle.
        SpectroPrepPlotter._finalize(fig, save_path, rect=[0, 0, 1, 0.97])
        return fig

    @staticmethod
    def plot_optimization_results(optimizer: PipelineOptimizer,
                                  top_n: int = 5,
                                  figsize: Tuple[int, int] = (12, 8),
                                  title: str = 'Pipeline Optimization Results',
                                  save_path: Optional[str] = None) -> plt.Figure:
        """
        Plot optimization results from PipelineOptimizer.

        Draws two horizontal bar charts (RMSE and R²) for the ``top_n``
        pipelines with the lowest RMSE.

        Parameters
        ----------
        optimizer : PipelineOptimizer
            The fitted pipeline optimizer. Its ``get_all_tested_pipelines()``
            results are read through the keys ``'rmse'``, ``'r2'`` and
            ``'pipeline_config'``.
        top_n : int, default=5
            Number of top pipelines to display.
        figsize : tuple, default=(12, 8)
            Figure size.
        title : str, default='Pipeline Optimization Results'
            Plot title.
        save_path : str, optional
            If provided, save the figure to this path.

        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.

        Raises
        ------
        ValueError
            If ``optimizer.optimizer`` is None (no optimization has run).
        """
        if optimizer.optimizer is None:
            raise ValueError("No optimization results found. Run bayesian_optimize() first.")

        results = optimizer.get_all_tested_pipelines()

        # Lowest RMSE first, then keep the best top_n.
        top_results = sorted(results, key=lambda res: res['rmse'])[:top_n]

        pipelines = [' → '.join(res['pipeline_config']) for res in top_results]
        rmses = [res['rmse'] for res in top_results]
        r2s = [res['r2'] for res in top_results]

        # Numeric bar positions: the same pipeline configuration can appear
        # several times (different hyper-parameters), and string categories
        # would collapse those onto a single bar.
        positions = np.arange(len(top_results))

        # RMSE and R² live on different scales, so the x-axes are independent.
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=figsize)

        ax1.barh(positions, rmses, color='skyblue', alpha=0.8)
        ax1.set_yticks(positions)
        ax1.set_yticklabels(pipelines)
        ax1.set_title('RMSE (lower is better)', fontsize=12)
        ax1.set_ylabel('Pipeline Configuration')
        ax1.grid(True, linestyle='--', alpha=0.3, axis='x')
        SpectroPrepPlotter._annotate_bars(ax1, rmses, positions)

        ax2.barh(positions, r2s, color='lightgreen', alpha=0.8)
        ax2.set_yticks(positions)
        ax2.set_yticklabels(pipelines)
        ax2.set_title('R² (higher is better)', fontsize=12)
        ax2.set_ylabel('Pipeline Configuration')
        ax2.set_xlabel('Score')
        ax2.grid(True, linestyle='--', alpha=0.3, axis='x')
        SpectroPrepPlotter._annotate_bars(ax2, r2s, positions)

        fig.suptitle(title, fontsize=16, fontweight='bold')
        # Leave the top 4% free for the suptitle.
        SpectroPrepPlotter._finalize(fig, save_path, rect=[0, 0, 1, 0.96])
        return fig

    @staticmethod
    def plot_prediction_scatter(y_true: np.ndarray,
                                y_pred: np.ndarray,
                                title: str = 'Prediction Performance',
                                xlabel: str = 'Measured',
                                ylabel: str = 'Predicted',
                                figsize: Tuple[int, int] = (10, 8),
                                alpha: float = 0.7,
                                color: str = 'blue',
                                add_metrics: bool = True,
                                save_path: Optional[str] = None) -> Tuple[plt.Figure, plt.Axes]:
        """
        Create a scatter plot of predicted vs true values.

        Parameters
        ----------
        y_true : array-like
            True target values.
        y_pred : array-like
            Predicted target values.
        title : str, default='Prediction Performance'
            Plot title.
        xlabel : str, default='Measured'
            X-axis label.
        ylabel : str, default='Predicted'
            Y-axis label.
        figsize : tuple, default=(10, 8)
            Figure size.
        alpha : float, default=0.7
            Transparency of the points.
        color : str, default='blue'
            Color of the scatter points.
        add_metrics : bool, default=True
            Whether to add RMSE and R² metrics to the plot.
        save_path : str, optional
            If provided, save the figure to this path.

        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        ax : matplotlib.axes.Axes
            The axes object.
        """
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        r2 = r2_score(y_true, y_pred)

        fig, ax = plt.subplots(figsize=figsize)

        ax.scatter(y_true, y_pred, alpha=alpha, color=color, edgecolor='k',
                   linewidth=0.5)

        # Both axes share one range so the identity line is the diagonal.
        min_val = min(np.min(y_true), np.min(y_pred))
        max_val = max(np.max(y_true), np.max(y_pred))
        span = max_val - min_val
        if span > 0:
            padding = span * 0.05
        else:
            # All values identical: a zero padding would give identical
            # lower/upper axis limits, so fall back to a small margin.
            padding = max(abs(max_val), 1.0) * 0.05
        lower, upper = min_val - padding, max_val + padding

        line_x = np.array([lower, upper])
        ax.plot(line_x, line_x, 'k--', alpha=0.7, label='Identity Line')

        ax.set_xlim(lower, upper)
        ax.set_ylim(lower, upper)

        if add_metrics:
            metrics_text = f'RMSE = {rmse:.4f}\nR² = {r2:.4f}'
            ax.text(0.05, 0.95, metrics_text, transform=ax.transAxes,
                    bbox=dict(facecolor='white', alpha=0.8,
                              boxstyle='round,pad=0.5'),
                    verticalalignment='top', fontsize=12)

        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')

        # Show the identity-line label; lower right keeps it clear of the
        # metrics box in the upper-left corner.
        ax.legend(loc='lower right')

        ax.grid(True, linestyle='--', alpha=0.3)

        SpectroPrepPlotter._finalize(fig, save_path)
        return fig, ax

    @staticmethod
    def plot_optimization_progress(optimizer: PipelineOptimizer,
                                   figsize: Tuple[int, int] = (12, 6),
                                   title: str = 'Optimization Progress',
                                   save_path: Optional[str] = None) -> Tuple[plt.Figure, plt.Axes]:
        """
        Plot optimization progress over iterations.

        Shows the RMSE of every recorded trial together with the running
        minimum ("Best RMSE").

        Parameters
        ----------
        optimizer : PipelineOptimizer
            The fitted pipeline optimizer. Its ``get_all_tested_pipelines()``
            results are read through the keys ``'trial'`` and ``'rmse'``;
            entries whose ``'trial'`` is None are skipped.
        figsize : tuple, default=(12, 6)
            Figure size.
        title : str, default='Optimization Progress'
            Plot title.
        save_path : str, optional
            If provided, save the figure to this path.

        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        ax : matplotlib.axes.Axes
            The axes object.

        Raises
        ------
        ValueError
            If ``optimizer.optimizer`` is None (no optimization has run).
        """
        if optimizer.optimizer is None:
            raise ValueError("No optimization results found. Run bayesian_optimize() first.")

        results = optimizer.get_all_tested_pipelines()

        # Only results that carry a trial number can be placed on the x-axis.
        recorded = [res for res in results if res['trial'] is not None]
        df = pd.DataFrame({
            'Iteration': [res['trial'] for res in recorded],
            'RMSE': [res['rmse'] for res in recorded],
        }).sort_values('Iteration')

        fig, ax = plt.subplots(figsize=figsize)

        ax.plot(df['Iteration'], df['RMSE'], marker='o', linestyle='-',
                color='blue', alpha=0.7)

        # Best RMSE seen so far at each iteration.
        running_min = df['RMSE'].cummin()
        ax.plot(df['Iteration'], running_min, marker='', linestyle='--',
                color='red', alpha=0.8, label='Best RMSE')

        ax.set_xlabel('Iteration', fontsize=12)
        ax.set_ylabel('RMSE', fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.legend(loc='upper right')
        ax.grid(True, linestyle='--', alpha=0.3)

        SpectroPrepPlotter._finalize(fig, save_path)
        return fig, ax

    @staticmethod
    def plot_feature_importance(wavenumbers: np.ndarray,
                                coefficients: np.ndarray,
                                title: str = 'Feature Importance',
                                xlabel: str = 'Wavenumber (cm$^{-1}$)',
                                ylabel: str = 'Coefficient Value',
                                figsize: Tuple[int, int] = (12, 6),
                                color: str = 'purple',
                                highlight_threshold: Optional[float] = None,
                                highlight_color: str = 'red',
                                save_path: Optional[str] = None) -> Tuple[plt.Figure, plt.Axes]:
        """
        Plot feature importance from model coefficients.

        Parameters
        ----------
        wavenumbers : array-like
            The x-axis values (wavenumbers).
        coefficients : array-like
            Model coefficients corresponding to each wavenumber. Length-1
            axes are dropped, so shapes such as (n_features, 1) or
            (1, n_features) are accepted.
        title : str, default='Feature Importance'
            Plot title.
        xlabel : str, default='Wavenumber (cm$^{-1}$)'
            X-axis label.
        ylabel : str, default='Coefficient Value'
            Y-axis label.
        figsize : tuple, default=(12, 6)
            Figure size.
        color : str, default='purple'
            Color of the line.
        highlight_threshold : float, optional
            If provided, highlights coefficients with absolute values above
            this threshold.
        highlight_color : str, default='red'
            Color for highlighted coefficients.
        save_path : str, optional
            If provided, save the figure to this path.

        Returns
        -------
        fig : matplotlib.figure.Figure
            The figure object.
        ax : matplotlib.axes.Axes
            The axes object.
        """
        # Coerce to arrays so boolean-mask indexing below works for lists,
        # and squeeze so column/row vectors line up with the wavenumbers.
        wavenumbers = np.atleast_1d(np.squeeze(np.asarray(wavenumbers)))
        coefficients = np.atleast_1d(np.squeeze(np.asarray(coefficients)))

        fig, ax = plt.subplots(figsize=figsize)

        ax.plot(wavenumbers, coefficients, color=color, alpha=0.7)

        if highlight_threshold is not None:
            important_mask = np.abs(coefficients) > highlight_threshold
            # Only add markers (and a legend) when something exceeds the
            # threshold.
            if np.any(important_mask):
                ax.scatter(wavenumbers[important_mask],
                           coefficients[important_mask],
                           color=highlight_color, s=50, zorder=3,
                           label=f'|Coef| > {highlight_threshold}')
                ax.legend(loc='best')

        # Zero reference line to separate positive from negative weights.
        ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)

        ax.set_xlabel(xlabel, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.grid(True, linestyle='--', alpha=0.3)

        SpectroPrepPlotter._finalize(fig, save_path)
        return fig, ax