#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Batch quantification and analysis of X-ray spectra for a list of samples.
This module provides automated batch quantification and (optionally) clustering/statistical
analysis of acquired X-ray spectra for multiple samples. It is robust to missing files or
errors in individual samples, making it suitable for unattended batch processing.
Import this module in your own code and call the
`batch_quantify_and_analyze()` function, passing your desired sample IDs and
options as arguments. This enables integration into larger workflows or pipelines.
Workflow:
- Loads sample configurations from `Spectra_collection_info.json`
- Loads acquired spectral data from `Data.csv`
- Performs quantification (optionally only on unquantified spectra)
- Optionally performs clustering/statistical analysis and saves results
Notes
-----
- Only the `sample_ID` is required if acquisition output is saved in the default directory;
otherwise, specify `results_path`.
- Designed to continue processing even if some samples are missing or have errors.
Typical usage:
- Edit the `sample_IDs` list and parameter options in the script, or
- Import and call `batch_quantify_and_analyze()` with your own arguments.
Parameters
----------
sample_IDs : List[str]
List of sample identifiers.
quantification_method : str, optional
Method to use for quantification. Uses quant_cfg.method if unspecified. Currently only supports 'PB'.
results_path : str, optional
Base directory where results are stored. Default: autoemxsp/Results
min_bckgrnd_cnts : float, optional
Minimum number of background counts underneath reference peaks below which spectra are flagged.
If None, leaves it unchanged. Default: None
output_filename_suffix : str, optional
Suffix to append to output filenames.
use_instrument_background : bool, optional
Whether to use instrument background if present (Default: False).
max_analytical_error : float, optional
Maximum allowed analytical error for analysis.
run_analysis : bool, optional
Whether to run clustering/statistical analysis after quantification.
quantify_only_unquantified_spectra : bool, optional
If True, only quantify spectra that lack analytical error.
interrupt_fits_bad_spectra : bool, optional
If True, interrupt fitting when bad spectra are encountered, which speeds up computation.
is_known_precursor_mixture : bool, optional
Whether sample is a mixture of two known powders. Used to characterize extent of intermixing in powders.
See example at:
L. N. Walters et al., Synthetic Accessibility and Sodium Ion Conductivity of the Na_{8-x}A_{x}P_{2}O_{9} (NAP)
High-Temperature Sodium Superionic Conductor Framework, Chem. Mater. 37, 6807 (2025).
Returns
-------
quant_results : list
List of EMXSp_Composition_Analyzer, the composition analysis object containing the results and methods for further analysis.
Created on Tue Jul 29 13:18:16 2025
@author: Andrea
"""
import os
import pandas as pd
import warnings
import time
import logging
import traceback
from typing import List, Optional
from autoemxsp.utils import (
print_double_separator,
get_sample_dir,
load_configurations_from_json,
extract_spectral_data,
)
import autoemxsp.utils.constants as cnst
from autoemxsp.config import config_classes_dict
from autoemxsp.core.EMXSp_composition_analyser import EMXSp_Composition_Analyzer
# Configure logging
# NOTE: this call runs at import time, so importing this module configures the
# root logger (INFO level, timestamped "<time> <LEVEL>: <message>" records).
# Per the standard-library documentation, basicConfig() does nothing if the
# root logger already has handlers, so an application that configures logging
# before importing this module keeps its own setup.
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s %(levelname)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
[docs]
def batch_quantify_and_analyze(
    sample_IDs: List[str],
    quantification_method: Optional[str] = None,
    results_path: Optional[str] = None,
    min_bckgrnd_cnts: Optional[float] = None,
    output_filename_suffix: str = "",
    use_instrument_background: bool = False,
    max_analytical_error: float = 5,
    run_analysis: bool = True,
    num_CPU_cores: Optional[int] = None,
    quantify_only_unquantified_spectra: bool = False,
    interrupt_fits_bad_spectra: bool = False,
    is_known_precursor_mixture: Optional[bool] = None,
    standards_dict: Optional[dict] = None,
) -> List["EMXSp_Composition_Analyzer"]:
    """
    Batch quantification and analysis for a list of samples.

    Each sample is processed independently. Most per-sample failures (missing
    sample directory, missing/invalid configuration file, unreadable data file,
    quantification or analysis errors) are logged and the sample is skipped, so
    that one bad sample does not stop the batch.

    Parameters
    ----------
    sample_IDs : List[str]
        List of sample identifiers.
    quantification_method : str, optional
        Method to use for quantification. Uses quant_cfg.method if unspecified.
        Currently only supports 'PB'.
    results_path : str, optional
        Base directory where results are stored. If None, defaults to the
        ``cnst.RESULTS_DIR`` folder located in the parent of the directory
        containing this module.
    min_bckgrnd_cnts : float, optional
        Minimum number of background counts underneath reference peaks below
        which spectra are flagged. If None, leaves it unchanged. Default: None
    output_filename_suffix : str, optional
        Suffix to append to output filenames.
    use_instrument_background : bool, optional
        Whether to use instrument background if present (Default: False).
    max_analytical_error : float, optional
        Maximum allowed analytical error for analysis.
    run_analysis : bool, optional
        Whether to run clustering/statistical analysis after quantification.
    num_CPU_cores : int | None, optional
        Number of CPU cores to use during fitting and quantification. If None,
        the value stored in the sample's quantification configuration is left
        unchanged.
    quantify_only_unquantified_spectra : bool, optional
        If True, only quantify spectra that lack analytical error. The merged
        table is then written next to the original data file; if the target
        file name already exists a ``_2``, ``_3``, ... counter is appended so
        nothing is overwritten.
    interrupt_fits_bad_spectra : bool, optional
        If True, interrupt fitting when bad spectra are encountered, which
        speeds up computation.
    is_known_precursor_mixture : bool, optional
        Whether sample is a mixture of two known powders. Used to characterize
        extent of intermixing in powders. If None (default), the value stored
        in the sample's powder measurement configuration is left unchanged;
        an explicit True or False overrides it.
        See example at:
            L. N. Walters et al., Synthetic Accessibility and Sodium Ion
            Conductivity of the Na_{8-x}A_{x}P_{2}O_{9} (NAP) High-Temperature
            Sodium Superionic Conductor Framework, Chem. Mater. 37, 6807 (2025).
    standards_dict : dict, optional
        Dictionary of reference PB values from experimental standards, passed
        through to the analyzer. Default : None, in which case (per the
        original documentation) the standards are loaded from the
        XSp_calibs/Your_Microscope_ID directory. Provide standards_dict only
        when using different standards from those normally used.

    Returns
    -------
    quant_results : list
        List of EMXSp_Composition_Analyzer objects, one per sample that was
        processed to completion. Samples that were skipped are not included.
    """
    if results_path is None:
        parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        results_path = os.path.join(parent_dir, cnst.RESULTS_DIR)

    quant_results = []

    for sample_ID in sample_IDs:
        try:
            sample_dir = get_sample_dir(results_path, sample_ID)
        except Exception as e:
            logging.warning("Failed to get sample directory for %s: %s", sample_ID, e)
            continue

        spectral_info_f_path = os.path.join(sample_dir, f"{cnst.ACQUISITION_INFO_FILENAME}.json")
        data_path = os.path.join(sample_dir, f"{cnst.DATA_FILENAME}{cnst.DATA_FILEEXT}")

        print_double_separator()
        logging.info(f"Sample '{sample_ID}'")

        try:
            configs, _ = load_configurations_from_json(spectral_info_f_path, config_classes_dict)
        except FileNotFoundError:
            logging.warning(f"Could not find {spectral_info_f_path}. Skipping sample '{sample_ID}'.")
            continue
        except Exception as e:
            logging.warning(f"Error loading {spectral_info_f_path}. Skipping sample '{sample_ID}': {e}")
            continue

        sample_processing_time_start = time.time()

        # Retrieve configuration objects for this sample
        try:
            microscope_cfg = configs[cnst.MICROSCOPE_CFG_KEY]
            sample_cfg = configs[cnst.SAMPLE_CFG_KEY]
            measurement_cfg = configs[cnst.MEASUREMENT_CFG_KEY]
            sample_substrate_cfg = configs[cnst.SAMPLESUBSTRATE_CFG_KEY]
            quant_cfg = configs[cnst.QUANTIFICATION_CFG_KEY]
            clustering_cfg = configs[cnst.CLUSTERING_CFG_KEY]
            plot_cfg = configs[cnst.PLOT_CFG_KEY]
            powder_meas_cfg = configs.get(cnst.POWDER_MEASUREMENT_CFG_KEY, None)  # Optional
            bulk_meas_cfg = configs.get(cnst.BULK_MEASUREMENT_CFG_KEY, None)  # Optional
        except KeyError as e:
            logging.warning(f"Missing configuration '{e.args[0]}' in {spectral_info_f_path}. Skipping sample '{sample_ID}'.")
            continue

        # Apply caller overrides on top of the stored quantification settings
        if min_bckgrnd_cnts is not None:
            quant_cfg.min_bckgrnd_cnts = min_bckgrnd_cnts
        if quantification_method is not None:
            quant_cfg.method = quantification_method
        if num_CPU_cores is not None:
            quant_cfg.num_CPU_cores = num_CPU_cores
        quant_cfg.interrupt_fits_bad_spectra = interrupt_fits_bad_spectra

        # Load 'Data.csv' into a DataFrame
        try:
            _, spectral_data, sp_coords, original_df = extract_spectral_data(data_path)
        except Exception as e:
            logging.warning(f"Could not load spectral data for '{sample_ID}': {e}")
            continue

        # NOTE(review): within this function `use_instrument_background` is only
        # used to emit the warning below; it is not forwarded to the analyzer
        # here. Confirm that the analyzer picks it up elsewhere.
        if use_instrument_background:
            if getattr(spectral_data, 'get', None) and spectral_data.get(cnst.BACKGROUND_DF_KEY, []) == []:
                warnings.warn(
                    "Background column not found in input data. "
                    "Spectral background will be computed instead."
                )

        # Change is_known_precursor_mixture if provided. `is not None` (rather
        # than truthiness) lets an explicit False override a stored True, and
        # the powder configuration is optional, so it may be absent.
        if is_known_precursor_mixture is not None:
            if powder_meas_cfg is None:
                logging.warning(
                    f"'is_known_precursor_mixture' was provided but sample '{sample_ID}' "
                    "has no powder measurement configuration. Ignoring it."
                )
            else:
                powder_meas_cfg.is_known_precursor_mixture = is_known_precursor_mixture

        # Decide which spectra to quantify
        if quantify_only_unquantified_spectra and cnst.AN_ER_DF_KEY in original_df.columns:
            to_quantify_mask = original_df[cnst.AN_ER_DF_KEY].isna()
            if not to_quantify_mask.any():
                logging.info("All spectra already quantified. Skipping.")
                continue
            indices_to_quantify = list(original_df.index[to_quantify_mask])
            logging.info(f"Quantifying {len(indices_to_quantify)} unquantified spectra out of {len(original_df)}.")
        else:
            indices_to_quantify = list(original_df.index)
            logging.info(f"Quantifying all {len(original_df)} spectra.")

        # Subset the data for quantification
        spectral_data_sub = {
            key: [spectral_data[key][i] for i in indices_to_quantify]
            for key in cnst.LIST_SPECTRAL_DATA_QUANT_KEYS
        }
        sp_coords_sub = [sp_coords[i] for i in indices_to_quantify]

        # In partial mode the analyzer output goes to a temporary file that is
        # merged back into the full table further below.
        if quantify_only_unquantified_spectra:
            output_suffix = output_filename_suffix + '_temp'
        else:
            output_suffix = output_filename_suffix

        # --- Run Composition Analysis or Spectral Acquisition
        comp_analyzer = EMXSp_Composition_Analyzer(
            microscope_cfg=microscope_cfg,
            sample_cfg=sample_cfg,
            measurement_cfg=measurement_cfg,
            sample_substrate_cfg=sample_substrate_cfg,
            quant_cfg=quant_cfg,
            clustering_cfg=clustering_cfg,
            powder_meas_cfg=powder_meas_cfg,
            bulk_meas_cfg=bulk_meas_cfg,
            plot_cfg=plot_cfg,
            is_acquisition=False,
            development_mode=False,
            standards_dict=standards_dict,
            output_filename_suffix=output_suffix,
            verbose=True,
            results_dir=sample_dir,
        )
        comp_analyzer.sp_coords = sp_coords_sub
        for key in cnst.LIST_SPECTRAL_DATA_QUANT_KEYS:
            comp_analyzer.spectral_data[key] = spectral_data_sub[key]

        try:
            comp_analyzer.run_quantification()
        except Exception:
            tb_str = traceback.format_exc()  # get full traceback as a string
            logging.warning(
                f"Error during spectral quantification for '{sample_ID}'. Skipping sample.\nFull traceback:\n{tb_str}"
            )
            continue

        if quantify_only_unquantified_spectra:
            # Read the temporary file produced by the analyzer
            temp_data_path = os.path.join(sample_dir, f"{cnst.DATA_FILENAME}{output_suffix}{cnst.DATA_FILEEXT}")
            try:
                temp_df = pd.read_csv(temp_data_path)
            except Exception as e:
                logging.error(f"Could not read temporary quantification file '{temp_data_path}' for '{sample_ID}': {e}")
                continue

            # Re-label the temporary rows with the original row labels so they
            # line up with the rows they replace.
            temp_df.index = indices_to_quantify

            # Update only the quantified rows in the original DataFrame
            try:
                original_df.loc[indices_to_quantify, temp_df.columns] = temp_df.values
            except Exception as e:
                logging.error(f"Failed to update quantified rows in DataFrame for '{sample_ID}': {e}")
                continue

            # Convert quantification flag column to int if present
            if cnst.QUANT_FLAG_DF_KEY in original_df.columns:
                original_df[cnst.QUANT_FLAG_DF_KEY] = original_df[cnst.QUANT_FLAG_DF_KEY].astype(int)

            # Save the updated Data.csv with counter logic to avoid overwriting:
            # the first free name among "<name><suffix>", "<name><suffix>_2",
            # "<name><suffix>_3", ... is used.
            new_data_path = os.path.join(sample_dir, f'{cnst.DATA_FILENAME}{output_filename_suffix}{cnst.DATA_FILEEXT}')
            cntr = 1
            while os.path.exists(new_data_path):
                cntr += 1
                new_data_path = os.path.join(sample_dir, f'{cnst.DATA_FILENAME}{output_filename_suffix}_{cntr}{cnst.DATA_FILEEXT}')

            try:
                original_df.to_csv(new_data_path, index=False)
            except Exception as e:
                logging.error(f"Failed to save updated {cnst.DATA_FILENAME}{cnst.DATA_FILEEXT} for '{sample_ID}': {e}")
                continue

            # Remove the temporary file (best effort: a leftover file is not fatal)
            try:
                os.remove(temp_data_path)
            except Exception as e:
                logging.warning(f"Could not remove temporary file '{temp_data_path}': {e}")

            # Reload data for analysis if needed, so the analysis sees the full
            # merged table rather than only the subset that was just quantified.
            if run_analysis:
                try:
                    spectra_quant, spectral_data, sp_coords, _ = extract_spectral_data(new_data_path)
                    comp_analyzer.spectra_quant = spectra_quant
                    comp_analyzer.sp_coords = sp_coords
                    comp_analyzer.spectral_data = spectral_data
                except Exception as e:
                    logging.warning(f"Could not reload data for analysis for '{sample_ID}': {e}")
                    continue

        # Perform analysis and print results
        if run_analysis:
            try:
                analysis_successful, _, _ = comp_analyzer.analyse_data(max_analytical_error)
            except Exception as e:
                logging.exception(f"Error during clustering analysis for '{sample_ID}'. Rerun separately if needed: {e}")
                continue

            if analysis_successful:
                comp_analyzer.print_results()
            else:
                logging.info(f"Analysis was not successful for '{sample_ID}'.")

        total_process_time = (time.time() - sample_processing_time_start) / 60
        print_double_separator()
        logging.info(f"Sample '{sample_ID}' successfully quantified in {total_process_time:.1f} min.")
        logging.info(f"{len(indices_to_quantify)} spectra have been quantified and saved for '{sample_ID}'.")

        quant_results.append(comp_analyzer)

    return quant_results