
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Single-sample clustering and analysis of X-ray spectra.

This module loads configurations and acquired X-ray spectra for a single sample,
performs clustering/statistical analysis, and prints results. It is suitable for
both interactive use and integration into larger workflows.

Import this module in your own code and call the
`analyze_sample()` function, passing the sample ID (and optional arguments)
to perform analysis programmatically.

Workflow:
    - Loads sample configuration from `Spectra_collection_info.json`
    - Loads acquired spectral data from `Data.csv`
    - Performs clustering/statistical analysis
    - Prints summary results

Notes
-----
- Requires `sample_ID` (and optionally `results_path` if not using the default directory).
- Designed to be robust and flexible for both batch and single-sample workflows.

Typical usage:
    - Edit the `sample_ID` and options in the script, or
    - Import and call `analyze_sample()` with your own arguments.
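
Example (illustrative sketch; "MySample_01" is a placeholder sample ID, not a dataset
shipped with the package)::

    from autoemxsp.runners.analyze_sample import analyze_sample

    comp_analyzer = analyze_sample(
        "MySample_01",                     # placeholder: use a sample present in your Results directory
        k_finding_method="silhouette",
        max_analytical_error_percent=5,
    )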
    
Parameters
----------
sample_ID : str
    Sample identifier.
results_path : str, optional
    Directory where results are stored. If None, defaults to autoemxsp/Results.
output_filename_suffix : str, optional
    Suffix for output files.
ref_formulae : list of str, optional
    Reference formulae for clustering. If the first entry is "" or None, the rest are appended to the 
    list loaded from Comp_analysis_configs.json; otherwise, the provided list replaces it.
els_excluded_clust_plot : list of str, optional
    Elements to exclude from the cluster plot.
clustering_features : list of str, optional
    Features to use for clustering.
k_finding_method : str, optional
    Method for determining the optimal number of clusters. Set to "forced" if a value of 'k'
    is specified manually. Allowed methods are "silhouette", "calinski_harabasz", and "elbow".
k_forced : int, optional
    Forced number of clusters.
max_analytical_error_percent : float, optional
    Maximum analytical error allowed for clustering.
quant_flags_accepted : list of int, optional
    Accepted quantification flags.
plot_custom_plots : bool, optional
    Whether to use custom plots.
show_unused_compositions_cluster_plot : bool, optional
    Whether to show unused compositions in the cluster plot.

Created on Tue Jul 29 13:18:16 2025

@author: Andrea
"""

import os
import time
import logging
from typing import Optional, List

from autoemxsp.utils import (
    print_single_separator,
    print_double_separator,
    get_sample_dir,
    load_configurations_from_json,
    extract_spectral_data,
)
import autoemxsp.utils.constants as cnst
from autoemxsp.config import config_classes_dict
from autoemxsp.core.EMXSp_composition_analyser import EMXSp_Composition_Analyzer

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

def analyze_sample(
    sample_ID: str,
    results_path: Optional[str] = None,
    output_filename_suffix: str = "",
    ref_formulae: Optional[List[str]] = None,
    els_excluded_clust_plot: Optional[List[str]] = None,
    clustering_features: Optional[List[str]] = None,
    k_finding_method: Optional[str] = None,
    k_forced: Optional[int] = None,
    max_analytical_error_percent: float = 5,
    quant_flags_accepted: Optional[List[int]] = None,
    plot_custom_plots: bool = False,
    show_unused_compositions_cluster_plot: bool = True,
) -> Optional[EMXSp_Composition_Analyzer]:
    """
    Run clustering and analysis for a single sample.

    Parameters
    ----------
    sample_ID : str
        Sample identifier.
    results_path : str, optional
        Directory where results are loaded and stored. If None, defaults to autoemxsp/Results.
    output_filename_suffix : str, optional
        Suffix for output files.
    ref_formulae : list of str, optional
        Reference formulae for clustering. If the first entry is "" or None, the rest are appended
        to the list loaded from Comp_analysis_configs.json; otherwise, the provided list replaces it.
    els_excluded_clust_plot : list of str, optional
        Elements to exclude from the cluster plot.
    clustering_features : list of str, optional
        Features to use for clustering.
    k_finding_method : str, optional
        Method for determining the optimal number of clusters. Set to "forced" if a value of 'k'
        is specified manually. Allowed methods are "silhouette", "calinski_harabasz", and "elbow".
    k_forced : int, optional
        Forced number of clusters.
    max_analytical_error_percent : float, optional
        Maximum analytical error allowed for clustering.
    quant_flags_accepted : list of int, optional
        Accepted quantification flags.
    plot_custom_plots : bool, optional
        Whether to use custom plots.
    show_unused_compositions_cluster_plot : bool, optional
        Whether to show unused compositions in the cluster plot.

    Returns
    -------
    comp_analyzer : EMXSp_Composition_Analyzer or None
        The composition analysis object containing the results and methods for further analysis,
        or None if the sample could not be loaded or analysed.
    """
    if results_path is None:
        parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        results_path = os.path.join(parent_dir, cnst.RESULTS_DIR)

    print_double_separator()
    logging.info(f"Sample '{sample_ID}'")

    sample_dir = get_sample_dir(results_path, sample_ID)
    spectral_info_f_path = os.path.join(sample_dir, f'{cnst.ACQUISITION_INFO_FILENAME}.json')

    try:
        configs, metadata = load_configurations_from_json(spectral_info_f_path, config_classes_dict)
    except FileNotFoundError:
        logging.error(f"Could not find {spectral_info_f_path}. Skipping sample '{sample_ID}'.")
        return
    except Exception as e:
        logging.error(f"Error loading {spectral_info_f_path}. Skipping sample '{sample_ID}': {e}")
        return

    sample_processing_time_start = time.time()

    # Retrieve configuration objects for this sample
    try:
        microscope_cfg = configs[cnst.MICROSCOPE_CFG_KEY]
        sample_cfg = configs[cnst.SAMPLE_CFG_KEY]
        measurement_cfg = configs[cnst.MEASUREMENT_CFG_KEY]
        sample_substrate_cfg = configs[cnst.SAMPLESUBSTRATE_CFG_KEY]
        quant_cfg = configs[cnst.QUANTIFICATION_CFG_KEY]
        clustering_cfg = configs[cnst.CLUSTERING_CFG_KEY]
        plot_cfg = configs[cnst.PLOT_CFG_KEY]
        powder_meas_cfg = configs.get(cnst.POWDER_MEASUREMENT_CFG_KEY, None)  # Optional
        bulk_meas_cfg = configs.get(cnst.BULK_MEASUREMENT_CFG_KEY, None)  # Optional
    except KeyError as e:
        logging.error(
            f"Missing configuration '{e.args[0]}' in {spectral_info_f_path}. "
            f"Skipping sample '{sample_ID}'."
        )
        return

    # --- Modify clustering configuration
    forced_key = clustering_cfg.FORCED_K_METHOD_KEY

    if quant_flags_accepted is not None:
        clustering_cfg.quant_flags_accepted = quant_flags_accepted

    clustering_cfg.max_analytical_error_percent = max_analytical_error_percent

    if ref_formulae is not None:
        if ref_formulae and (ref_formulae[0] == "" or ref_formulae[0] is None):
            # Append mode: skip the first empty entry
            clustering_cfg.ref_formulae.extend(ref_formulae[1:])
        else:
            # Replace mode
            clustering_cfg.ref_formulae = ref_formulae

    if clustering_features is not None:
        clustering_cfg.features = clustering_features

    if isinstance(k_forced, int):
        # Force k to the provided number of clusters
        clustering_cfg.k = k_forced
        clustering_cfg.k_finding_method = forced_key
    elif k_finding_method == forced_key:
        raise ValueError(
            f"'k_finding_method' must be one of {clustering_cfg.ALLOWED_K_FINDING_METHODS}, "
            f"but not {forced_key}, if 'k_forced' is set to None"
        )
    elif k_finding_method is not None:
        # If k_forced is None and a k_finding_method is given, force recomputation of k,
        # overriding the value loaded from clustering_cfg
        clustering_cfg.k = k_forced
        clustering_cfg.k_finding_method = k_finding_method
    else:
        # If no finding method is specified and k_forced is None, keep the defaults from clustering_cfg
        pass

    # --- Modify plot configuration
    plot_cfg.show_unused_comps_clust = show_unused_compositions_cluster_plot
    plot_cfg.use_custom_plots = plot_custom_plots
    if els_excluded_clust_plot is not None:
        plot_cfg.els_excluded_clust_plot = els_excluded_clust_plot

    # Load 'Data.csv' into a DataFrame
    data_path = os.path.join(sample_dir, f'{cnst.DATA_FILENAME}.csv')
    try:
        spectra_quant, spectral_data, sp_coords, _ = extract_spectral_data(data_path)
    except Exception as e:
        logging.error(f"Could not load spectral data for '{sample_ID}': {e}")
        return
    if spectra_quant is None:
        logging.error(f"No quantification data found in {data_path}")
        return

    # --- Run composition analysis
    comp_analyzer = EMXSp_Composition_Analyzer(
        microscope_cfg=microscope_cfg,
        sample_cfg=sample_cfg,
        measurement_cfg=measurement_cfg,
        sample_substrate_cfg=sample_substrate_cfg,
        quant_cfg=quant_cfg,
        clustering_cfg=clustering_cfg,
        powder_meas_cfg=powder_meas_cfg,
        bulk_meas_cfg=bulk_meas_cfg,
        plot_cfg=plot_cfg,
        is_acquisition=False,
        development_mode=False,
        output_filename_suffix=output_filename_suffix,
        verbose=True,
        results_dir=sample_dir,
    )
    comp_analyzer.spectra_quant = spectra_quant
    comp_analyzer.sp_coords = sp_coords
    comp_analyzer.spectral_data = spectral_data

    # Perform analysis and print results
    try:
        analysis_successful, _, _ = comp_analyzer.analyse_data(
            max_analytical_error_percent, k=clustering_cfg.k
        )
    except Exception as e:
        logging.exception(f"Error during clustering analysis for {sample_ID}: {e}")
        return

    total_process_time = time.time() - sample_processing_time_start

    if analysis_successful:
        comp_analyzer.print_results()
        print_single_separator()
        logging.info(f"Sample '{sample_ID}' successfully analysed in {total_process_time:.1f} sec.")
    else:
        print_single_separator()
        logging.info(f"Analysis was not successful for '{sample_ID}'.")

    return comp_analyzer
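

# Minimal command-line entry sketch (not part of the documented API): the module docstring
# suggests editing `sample_ID` and options directly in the script, and a guard like the one
# below is one way to run this file standalone. The sample ID is a placeholder.
if __name__ == "__main__":
    analyze_sample(
        sample_ID="MySample_01",           # placeholder: replace with a sample present in Results/
        k_finding_method="silhouette",
        max_analytical_error_percent=5,
    )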