
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Single-sample clustering and analysis of X-ray spectra.

This module loads configurations and acquired X-ray spectra for a single sample,
performs clustering/statistical analysis, and prints results. It is suitable for
both interactive use and integration into larger workflows.

Import this module in your own code and call the
`analyze_sample()` function, passing the sample ID (and optional arguments)
to perform analysis programmatically.

Workflow:
    - Loads sample configuration from `Spectra_collection_info.json`
    - Loads acquired spectral data from `Data.csv`
    - Performs clustering/statistical analysis
    - Prints summary results

Notes
-----
- Requires `sample_ID` (and optionally `results_path` if not using the default directory).
- Designed to be robust and flexible for both batch and single-sample workflows.

Typical usage:
    - Edit the `sample_ID` and options in the script, or
    - Import and call `analyze_sample()` with your own arguments.
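
Example (illustrative sketch; "MySample_01" is a placeholder sample ID, not a dataset
shipped with the package)::

    from autoemxsp.runners.analyze_sample import analyze_sample

    comp_analyzer = analyze_sample(
        "MySample_01",                     # placeholder: use a sample present in your Results directory
        k_finding_method="silhouette",
        max_analytical_error_percent=5,
    )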
    
Parameters
----------
sample_ID : str
    Sample identifier.
results_path : str, optional
    Directory where results are stored. If None, defaults to autoemxsp/Results.
output_filename_suffix : str, optional
    Suffix for output files.
ref_formulae : list of str, optional
    Reference formulae for clustering. If the first entry is "" or None, the rest are appended to the 
    list loaded from Comp_analysis_configs.json; otherwise, the provided list replaces it.
els_excluded_clust_plot : list of str, optional
    Elements to exclude from the cluster plot.
clustering_features : list of str, optional
    Features to use for clustering.
k_finding_method : str, optional
    Method for determining the optimal number of clusters. Set to "forced" if a value of 'k'
    is specified manually. Allowed methods are "silhouette", "calinski_harabasz", and "elbow".
k_forced : int, optional
    Forced number of clusters.
max_analytical_error_percent : float, optional
    Maximum analytical error allowed for clustering.
quant_flags_accepted : list of int, optional
    Accepted quantification flags.
plot_custom_plots : bool, optional
    Whether to use custom plots.
show_unused_compositions_cluster_plot : bool, optional
    Whether to show unused compositions in the cluster plot.

Created on Tue Jul 29 13:18:16 2025

@author: Andrea
"""

import os
import time
import logging
from typing import Optional, List

from autoemxsp.utils import (
    print_single_separator,
    print_double_separator,
    get_sample_dir,
    load_configurations_from_json,
    extract_spectral_data,
)
import autoemxsp.utils.constants as cnst
from autoemxsp.config import config_classes_dict
from autoemxsp.core.EMXSp_composition_analyser import EMXSp_Composition_Analyzer

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

def analyze_sample(
    sample_ID: str,
    results_path: Optional[str] = None,
    output_filename_suffix: str = "",
    ref_formulae: Optional[List[str]] = None,
    els_excluded_clust_plot: Optional[List[str]] = None,
    clustering_features: Optional[List[str]] = None,
    k_finding_method: Optional[str] = None,
    k_forced: Optional[int] = None,
    max_analytical_error_percent: float = 5,
    quant_flags_accepted: Optional[List[int]] = None,
    plot_custom_plots: bool = False,
    show_unused_compositions_cluster_plot: bool = True,
) -> Optional[EMXSp_Composition_Analyzer]:
    """
    Run clustering and analysis for a single sample.

    Parameters
    ----------
    sample_ID : str
        Sample identifier.
    results_path : str, optional
        Directory where results are loaded and stored. If None, defaults to autoemxsp/Results.
    output_filename_suffix : str, optional
        Suffix for output files.
    ref_formulae : list of str, optional
        Reference formulae for clustering. If the first entry is "" or None, the rest are appended
        to the list loaded from Comp_analysis_configs.json; otherwise, the provided list replaces it.
    els_excluded_clust_plot : list of str, optional
        Elements to exclude from the cluster plot.
    clustering_features : list of str, optional
        Features to use for clustering.
    k_finding_method : str, optional
        Method for determining the optimal number of clusters. Set to "forced" if a value of 'k'
        is specified manually. Allowed methods are "silhouette", "calinski_harabasz", and "elbow".
    k_forced : int, optional
        Forced number of clusters.
    max_analytical_error_percent : float, optional
        Maximum analytical error allowed for clustering.
    quant_flags_accepted : list of int, optional
        Accepted quantification flags.
    plot_custom_plots : bool, optional
        Whether to use custom plots.
    show_unused_compositions_cluster_plot : bool, optional
        Whether to show unused compositions in the cluster plot.

    Returns
    -------
    comp_analyzer : EMXSp_Composition_Analyzer or None
        The composition analysis object containing the results and methods for further analysis,
        or None if the sample could not be loaded or analysed.
    """
    if results_path is None:
        parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        results_path = os.path.join(parent_dir, cnst.RESULTS_DIR)

    print_double_separator()
    logging.info(f"Sample '{sample_ID}'")

    sample_dir = get_sample_dir(results_path, sample_ID)
    spectral_info_f_path = os.path.join(sample_dir, f'{cnst.ACQUISITION_INFO_FILENAME}.json')

    try:
        configs, metadata = load_configurations_from_json(spectral_info_f_path, config_classes_dict)
    except FileNotFoundError:
        logging.error(f"Could not find {spectral_info_f_path}. Skipping sample '{sample_ID}'.")
        return
    except Exception as e:
        logging.error(f"Error loading {spectral_info_f_path}. Skipping sample '{sample_ID}': {e}")
        return

    sample_processing_time_start = time.time()

    # Retrieve configuration objects for this sample
    try:
        microscope_cfg = configs[cnst.MICROSCOPE_CFG_KEY]
        sample_cfg = configs[cnst.SAMPLE_CFG_KEY]
        measurement_cfg = configs[cnst.MEASUREMENT_CFG_KEY]
        sample_substrate_cfg = configs[cnst.SAMPLESUBSTRATE_CFG_KEY]
        quant_cfg = configs[cnst.QUANTIFICATION_CFG_KEY]
        clustering_cfg = configs[cnst.CLUSTERING_CFG_KEY]
        plot_cfg = configs[cnst.PLOT_CFG_KEY]
        powder_meas_cfg = configs.get(cnst.POWDER_MEASUREMENT_CFG_KEY, None)  # Optional
        bulk_meas_cfg = configs.get(cnst.BULK_MEASUREMENT_CFG_KEY, None)  # Optional
    except KeyError as e:
        logging.error(
            f"Missing configuration '{e.args[0]}' in {spectral_info_f_path}. "
            f"Skipping sample '{sample_ID}'."
        )
        return

    # --- Modify clustering configuration
    forced_key = clustering_cfg.FORCED_K_METHOD_KEY

    if quant_flags_accepted is not None:
        clustering_cfg.quant_flags_accepted = quant_flags_accepted

    clustering_cfg.max_analytical_error_percent = max_analytical_error_percent

    if ref_formulae is not None:
        if ref_formulae and (ref_formulae[0] == "" or ref_formulae[0] is None):
            # Append mode: skip the first empty entry
            clustering_cfg.ref_formulae.extend(ref_formulae[1:])
        else:
            # Replace mode
            clustering_cfg.ref_formulae = ref_formulae

    if clustering_features is not None:
        clustering_cfg.features = clustering_features

    if isinstance(k_forced, int):
        # Force k to the provided number of clusters
        clustering_cfg.k = k_forced
        clustering_cfg.k_finding_method = forced_key
    elif k_finding_method == forced_key:
        raise ValueError(
            f"'k_finding_method' must be one of {clustering_cfg.ALLOWED_K_FINDING_METHODS}, "
            f"but not {forced_key}, if 'k_forced' is set to None"
        )
    elif k_finding_method is not None:
        # If k_forced is None and a k_finding_method is given, force recomputation of k,
        # overriding the value loaded from clustering_cfg
        clustering_cfg.k = k_forced
        clustering_cfg.k_finding_method = k_finding_method
    else:
        # If no finding method is specified and k_forced is None, keep the defaults from clustering_cfg
        pass

    # --- Modify plot configuration
    plot_cfg.show_unused_comps_clust = show_unused_compositions_cluster_plot
    plot_cfg.use_custom_plots = plot_custom_plots
    if els_excluded_clust_plot is not None:
        plot_cfg.els_excluded_clust_plot = els_excluded_clust_plot

    # Load 'Data.csv' into a DataFrame
    data_path = os.path.join(sample_dir, f'{cnst.DATA_FILENAME}.csv')
    try:
        spectra_quant, spectral_data, sp_coords, _ = extract_spectral_data(data_path)
    except Exception as e:
        logging.error(f"Could not load spectral data for '{sample_ID}': {e}")
        return
    if spectra_quant is None:
        logging.error(f"No quantification data found in {data_path}")
        return

    # --- Run composition analysis
    comp_analyzer = EMXSp_Composition_Analyzer(
        microscope_cfg=microscope_cfg,
        sample_cfg=sample_cfg,
        measurement_cfg=measurement_cfg,
        sample_substrate_cfg=sample_substrate_cfg,
        quant_cfg=quant_cfg,
        clustering_cfg=clustering_cfg,
        powder_meas_cfg=powder_meas_cfg,
        bulk_meas_cfg=bulk_meas_cfg,
        plot_cfg=plot_cfg,
        is_acquisition=False,
        development_mode=False,
        output_filename_suffix=output_filename_suffix,
        verbose=True,
        results_dir=sample_dir,
    )
    comp_analyzer.spectra_quant = spectra_quant
    comp_analyzer.sp_coords = sp_coords
    comp_analyzer.spectral_data = spectral_data

    # Perform analysis and print results
    try:
        analysis_successful, _, _ = comp_analyzer.analyse_data(
            max_analytical_error_percent, k=clustering_cfg.k
        )
    except Exception as e:
        logging.exception(f"Error during clustering analysis for {sample_ID}: {e}")
        return

    total_process_time = time.time() - sample_processing_time_start

    if analysis_successful:
        comp_analyzer.print_results()
        print_single_separator()
        logging.info(f"Sample '{sample_ID}' successfully analysed in {total_process_time:.1f} sec.")
    else:
        print_single_separator()
        logging.info(f"Analysis was not successful for '{sample_ID}'.")

    return comp_analyzer
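

# Minimal command-line entry sketch (not part of the documented API): the module docstring
# suggests editing `sample_ID` and options directly in the script, and a guard like the one
# below is one way to run this file standalone. The sample ID is a placeholder.
if __name__ == "__main__":
    analyze_sample(
        sample_ID="MySample_01",           # placeholder: replace with a sample present in Results/
        k_finding_method="silhouette",
        max_analytical_error_percent=5,
    )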