# Source code for autoemxsp.runners.batch_quantify_and_analyze

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Batch quantification and analysis of X-ray spectra for a list of samples.

This module provides automated batch quantification and (optionally) clustering/statistical
analysis of acquired X-ray spectra for multiple samples. It is robust to missing files or
errors in individual samples, making it suitable for unattended batch processing.

Import this module in your own code and call the
`batch_quantify_and_analyze()` function, passing your desired sample IDs and
options as arguments. This enables integration into larger workflows or pipelines.

Workflow:
    - Loads sample configurations from `Spectra_collection_info.json`
    - Loads acquired spectral data from `Data.csv`
    - Performs quantification (optionally only on unquantified spectra)
    - Optionally performs clustering/statistical analysis and saves results

Notes
-----
- Only the `sample_ID` is required if acquisition output is saved in the default directory;
  otherwise, specify `results_path`.
- Designed to continue processing even if some samples are missing or have errors.

Typical usage:
    - Edit the `sample_IDs` list and parameter options in the script, or
    - Import and call `batch_quantify_and_analyze()` with your own arguments.
    
Parameters
----------
sample_IDs : List[str]
    List of sample identifiers.
quantification_method : str, optional
    Method to use for quantification. Uses quant_cfg.method if unspecified. Currently only supports 'PB'.
results_path : str, optional
    Base directory where results are stored. Default: autoemxsp/Results
min_bckgrnd_cnts : float, optional
    Minimum number of background counts underneath reference peaks below which spectra are flagged.
    If None, leaves it unchanged. Default: None
output_filename_suffix : str, optional
    Suffix to append to output filenames.
use_instrument_background : bool, optional
    Whether to use instrument background if present (Default: False).
max_analytical_error : float, optional
    Maximum allowed analytical error for analysis.
run_analysis : bool, optional
    Whether to run clustering/statistical analysis after quantification.
quantify_only_unquantified_spectra : bool, optional
    If True, only quantify spectra that lack analytical error.
interrupt_fits_bad_spectra : bool, optional
    If True, interrupt fitting if bad spectra are encountered, which speeds up computations.
is_known_precursor_mixture : bool, optional
    Whether sample is a mixture of two known powders. Used to characterize extent of intermixing in powders.
    See example at:
        L. N. Walters et al., Synthetic Accessibility and Sodium Ion Conductivity of the Na8–xAxP2O9 (NAP)
        High-Temperature Sodium Superionic Conductor Framework, Chem. Mater. 37, 6807 (2025).
    
Returns
-------
quant_results : list of EMXSp_Composition_Analyzer
    One composition analysis object per successfully processed sample, containing the results and methods for further analysis.

Created on Tue Jul 29 13:18:16 2025

@author: Andrea
"""

import os
import pandas as pd
import warnings
import time
import logging
import traceback
from typing import List, Optional

from autoemxsp.utils import (
    print_double_separator,
    get_sample_dir,
    load_configurations_from_json,
    extract_spectral_data,
)
import autoemxsp.utils.constants as cnst
from autoemxsp.config import config_classes_dict
from autoemxsp.core.EMXSp_composition_analyser import EMXSp_Composition_Analyzer

# Root-logger setup for the progress messages emitted by this module:
# INFO level and above, each record prefixed with a timestamp and its level.
logging.basicConfig(
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
)


# [docs]  (link marker left over from the HTML documentation page; not part of the module)
def batch_quantify_and_analyze(
    sample_IDs: List[str],
    quantification_method: Optional[str] = None,
    results_path: Optional[str] = None,
    min_bckgrnd_cnts: Optional[float] = None,
    output_filename_suffix: str = "",
    use_instrument_background: bool = False,
    max_analytical_error: float = 5,
    run_analysis: bool = True,
    num_CPU_cores: Optional[int] = None,
    quantify_only_unquantified_spectra: bool = False,
    interrupt_fits_bad_spectra: bool = False,
    is_known_precursor_mixture: Optional[bool] = None,
    standards_dict: Optional[dict] = None,
) -> List["EMXSp_Composition_Analyzer"]:
    """
    Batch quantification and analysis for a list of samples.

    Each sample is processed independently: when a sample's files are missing or
    a processing step fails, the problem is logged and the loop moves on to the
    next sample.

    Parameters
    ----------
    sample_IDs : List[str]
        List of sample identifiers.
    quantification_method : str, optional
        Method to use for quantification. Uses quant_cfg.method if unspecified. Currently only supports 'PB'.
    results_path : str, optional
        Base directory where results are stored. If None, it is built from the directory
        two levels above this file joined with `cnst.RESULTS_DIR` (documented default: autoemxsp/Results).
    min_bckgrnd_cnts : float, optional
        Minimum number of background counts underneath reference peaks below which spectra are flagged.
        If None, leaves it unchanged. Default: None
    output_filename_suffix : str, optional
        Suffix to append to output filenames.
    use_instrument_background : bool, optional
        Whether to use instrument background if present (Default: False).
        NOTE(review): within this function the flag only triggers a warning when the loaded
        spectral data has no background entry; it is not forwarded to the analyzer here.
        Confirm where it is actually consumed.
    max_analytical_error : float, optional
        Maximum allowed analytical error for analysis. Passed to `analyse_data()`. Default: 5
    run_analysis : bool, optional
        Whether to run clustering/statistical analysis after quantification.
    num_CPU_cores : int, optional
        Number of CPU cores to use during fitting and quantification. If None, the value
        already stored in the sample's quantification configuration is left unchanged
        (previously documented as "half of the available cores" - TODO confirm that default).
    quantify_only_unquantified_spectra : bool, optional
        If True, only quantify spectra that lack analytical error. The newly quantified rows
        are merged into the previously saved data and written to a new data file; an existing
        file is never overwritten (a numeric counter is appended to the filename instead).
    interrupt_fits_bad_spectra : bool, optional
        If True, interrupt fitting if bad spectra are encountered. Speeds up computations.
    is_known_precursor_mixture : bool, optional
        Whether sample is a mixture of two known powders. Used to characterize extent of intermixing in powders.
        If None, the value stored in the powder measurement configuration is left unchanged.
        An explicit True or False overrides it. Ignored (with a warning) for samples that
        have no powder measurement configuration.
        See example at:
            L. N. Walters et al., Synthetic Accessibility and Sodium Ion Conductivity of the Na8–xAxP2O9 (NAP)
            High-Temperature Sodium Superionic Conductor Framework, Chem. Mater. 37, 6807 (2025).
    standards_dict : dict, optional
        Dictionary of reference PB values from experimental standards. Default : None.
        If None, dictionary of standards is loaded from the XSp_calibs/Your_Microscope_ID directory.
        Provide standards_dict only when providing different standards from those normally used for quantification.

    Returns
    -------
    quant_results : List[EMXSp_Composition_Analyzer]
        One composition analysis object for every sample that completed processing.
        Samples that were skipped (missing files, errors, or nothing left to quantify)
        are not included, so the list may be shorter than `sample_IDs`.
    """
    if results_path is None:
        parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        results_path = os.path.join(parent_dir, cnst.RESULTS_DIR)

    quant_results = []
    for sample_ID in sample_IDs:
        try:
            sample_dir = get_sample_dir(results_path, sample_ID)
        except Exception as e:
            logging.warning("Failed to get sample directory for %s: %s", sample_ID, e)
            continue

        spectral_info_f_path = os.path.join(sample_dir, f"{cnst.ACQUISITION_INFO_FILENAME}.json")
        data_path = os.path.join(sample_dir, f"{cnst.DATA_FILENAME}{cnst.DATA_FILEEXT}")

        print_double_separator()
        logging.info(f"Sample '{sample_ID}'")

        try:
            configs, metadata = load_configurations_from_json(spectral_info_f_path, config_classes_dict)
        except FileNotFoundError:
            logging.warning(f"Could not find {spectral_info_f_path}. Skipping sample '{sample_ID}'.")
            continue
        except Exception as e:
            logging.warning(f"Error loading {spectral_info_f_path}. Skipping sample '{sample_ID}': {e}")
            continue

        sample_processing_time_start = time.time()

        # Retrieve configuration objects for this sample
        try:
            microscope_cfg = configs[cnst.MICROSCOPE_CFG_KEY]
            sample_cfg = configs[cnst.SAMPLE_CFG_KEY]
            measurement_cfg = configs[cnst.MEASUREMENT_CFG_KEY]
            sample_substrate_cfg = configs[cnst.SAMPLESUBSTRATE_CFG_KEY]
            quant_cfg = configs[cnst.QUANTIFICATION_CFG_KEY]
            clustering_cfg = configs[cnst.CLUSTERING_CFG_KEY]
            plot_cfg = configs[cnst.PLOT_CFG_KEY]
            powder_meas_cfg = configs.get(cnst.POWDER_MEASUREMENT_CFG_KEY, None)  # Optional
            bulk_meas_cfg = configs.get(cnst.BULK_MEASUREMENT_CFG_KEY, None)  # Optional
        except KeyError as e:
            logging.warning(f"Missing configuration '{e.args[0]}' in {spectral_info_f_path}. Skipping sample '{sample_ID}'.")
            continue

        # Apply caller overrides on top of the stored quantification configuration
        if min_bckgrnd_cnts is not None:
            quant_cfg.min_bckgrnd_cnts = min_bckgrnd_cnts
        if quantification_method is not None:
            quant_cfg.method = quantification_method
        if num_CPU_cores is not None:
            quant_cfg.num_CPU_cores = num_CPU_cores
        quant_cfg.interrupt_fits_bad_spectra = interrupt_fits_bad_spectra

        # Load 'Data.csv' into a DataFrame
        try:
            _, spectral_data, sp_coords, original_df = extract_spectral_data(data_path)
        except Exception as e:
            logging.warning(f"Could not load spectral data for '{sample_ID}': {e}")
            continue

        if use_instrument_background:
            if getattr(spectral_data, 'get', None) and spectral_data.get(cnst.BACKGROUND_DF_KEY, []) == []:
                warnings.warn(
                    "Background column not found in input data. "
                    "Spectral background will be computed instead."
                )

        # Override is_known_precursor_mixture only when explicitly provided.
        # Testing against None (not truthiness) lets an explicit False take effect,
        # and the powder configuration is optional, so it may be absent.
        if is_known_precursor_mixture is not None:
            if powder_meas_cfg is None:
                logging.warning(
                    f"'is_known_precursor_mixture' was provided but sample '{sample_ID}' has no "
                    "powder measurement configuration. Ignoring it."
                )
            else:
                powder_meas_cfg.is_known_precursor_mixture = is_known_precursor_mixture

        # Decide which spectra to quantify
        if quantify_only_unquantified_spectra and cnst.AN_ER_DF_KEY in original_df.columns:
            # A missing analytical error marks a spectrum as not yet quantified
            to_quantify_mask = original_df[cnst.AN_ER_DF_KEY].isna()
            if not to_quantify_mask.any():
                logging.info("All spectra already quantified. Skipping.")
                continue
            indices_to_quantify = list(original_df.index[to_quantify_mask])
            logging.info(f"Quantifying {len(indices_to_quantify)} unquantified spectra out of {len(original_df)}.")
        else:
            indices_to_quantify = list(original_df.index)
            logging.info(f"Quantifying all {len(original_df)} spectra.")

        # Subset the data for quantification
        spectral_data_sub = {
            key: [spectral_data[key][i] for i in indices_to_quantify]
            for key in cnst.LIST_SPECTRAL_DATA_QUANT_KEYS
        }
        sp_coords_sub = [sp_coords[i] for i in indices_to_quantify]

        # In partial mode the analyzer writes to a temporary file, which is merged
        # into the full data table further below.
        if quantify_only_unquantified_spectra:
            output_suffix = output_filename_suffix + '_temp'
        else:
            output_suffix = output_filename_suffix

        # --- Run Composition Analysis or Spectral Acquisition
        # Guarded so that a sample whose analyzer cannot be set up is skipped
        # instead of aborting the whole batch.
        try:
            comp_analyzer = EMXSp_Composition_Analyzer(
                microscope_cfg=microscope_cfg,
                sample_cfg=sample_cfg,
                measurement_cfg=measurement_cfg,
                sample_substrate_cfg=sample_substrate_cfg,
                quant_cfg=quant_cfg,
                clustering_cfg=clustering_cfg,
                powder_meas_cfg=powder_meas_cfg,
                bulk_meas_cfg=bulk_meas_cfg,
                plot_cfg=plot_cfg,
                is_acquisition=False,
                development_mode=False,
                standards_dict=standards_dict,
                output_filename_suffix=output_suffix,
                verbose=True,
                results_dir=sample_dir
            )

            comp_analyzer.sp_coords = sp_coords_sub
            for key in cnst.LIST_SPECTRAL_DATA_QUANT_KEYS:
                comp_analyzer.spectral_data[key] = spectral_data_sub[key]
        except Exception:
            tb_str = traceback.format_exc()  # get full traceback as a string
            logging.warning(
                f"Could not set up the composition analyzer for '{sample_ID}'. Skipping sample.\nFull traceback:\n{tb_str}"
            )
            continue

        try:
            comp_analyzer.run_quantification()
        except Exception:
            tb_str = traceback.format_exc()  # get full traceback as a string
            logging.warning(
                f"Error during spectral quantification for '{sample_ID}'. Skipping sample.\nFull traceback:\n{tb_str}"
            )
            continue

        if quantify_only_unquantified_spectra:
            # Read the temporary file produced by the analyzer
            temp_data_path = os.path.join(sample_dir, f"{cnst.DATA_FILENAME}{output_suffix}{cnst.DATA_FILEEXT}")
            try:
                temp_df = pd.read_csv(temp_data_path)
            except Exception as e:
                logging.error(f"Could not read temporary quantification file '{temp_data_path}' for '{sample_ID}': {e}")
                continue

            # Update only the quantified rows in the original DataFrame.
            # Re-labelling the temporary rows with the original indices raises when the
            # row counts differ, so it is part of the guarded update.
            try:
                temp_df.index = indices_to_quantify
                original_df.loc[indices_to_quantify, temp_df.columns] = temp_df.values
            except Exception as e:
                logging.error(f"Failed to update quantified rows in DataFrame for '{sample_ID}': {e}")
                continue

            # Convert quantification flag column to int if present.
            # The cast fails when a flag is missing (NaN); in that case the column
            # is kept as it is so the merged data can still be saved.
            if cnst.QUANT_FLAG_DF_KEY in original_df.columns:
                try:
                    original_df[cnst.QUANT_FLAG_DF_KEY] = original_df[cnst.QUANT_FLAG_DF_KEY].astype(int)
                except (ValueError, TypeError) as e:
                    logging.warning(
                        f"Could not convert column '{cnst.QUANT_FLAG_DF_KEY}' to int for '{sample_ID}': {e}"
                    )

            # Save the updated Data.csv with counter logic to avoid overwriting
            # (the first alternative name tried carries the counter value 2)
            new_data_path = os.path.join(sample_dir, f'{cnst.DATA_FILENAME}{output_filename_suffix}{cnst.DATA_FILEEXT}')
            cntr = 1
            while os.path.exists(new_data_path):
                cntr += 1
                new_data_path = os.path.join(sample_dir, f'{cnst.DATA_FILENAME}{output_filename_suffix}_{cntr}{cnst.DATA_FILEEXT}')

            try:
                original_df.to_csv(new_data_path, index=False)
            except Exception as e:
                logging.error(f"Failed to save updated {cnst.DATA_FILENAME}{cnst.DATA_FILEEXT} for '{sample_ID}': {e}")
                continue

            # Remove the temporary file (best effort: a leftover file is only reported)
            try:
                os.remove(temp_data_path)
            except Exception as e:
                logging.warning(f"Could not remove temporary file '{temp_data_path}': {e}")

            # Reload data for analysis if needed, so that the analysis sees all
            # spectra and not only the subset quantified in this run
            if run_analysis:
                try:
                    spectra_quant, spectral_data, sp_coords, _ = extract_spectral_data(new_data_path)
                    comp_analyzer.spectra_quant = spectra_quant
                    comp_analyzer.sp_coords = sp_coords
                    comp_analyzer.spectral_data = spectral_data
                except Exception as e:
                    logging.warning(f"Could not reload data for analysis for '{sample_ID}': {e}")
                    continue

        # Perform analysis and print results
        if run_analysis:
            try:
                analysis_successful, _, _ = comp_analyzer.analyse_data(max_analytical_error)
            except Exception as e:
                logging.exception(f"Error during clustering analysis for '{sample_ID}'. Rerun separately if needed: {e}")
                continue
            if analysis_successful:
                comp_analyzer.print_results()
            else:
                logging.info(f"Analysis was not successful for '{sample_ID}'.")

        total_process_time = (time.time() - sample_processing_time_start) / 60
        print_double_separator()
        logging.info(f"Sample '{sample_ID}' successfully quantified in {total_process_time:.1f} min.")
        logging.info(f"{len(indices_to_quantify)} spectra have been quantified and saved for '{sample_ID}'.")

        quant_results.append(comp_analyzer)

    return quant_results