Source code for smol.cofe.wrangling.tools

"""Functions to filter data in a StructureWrangler and calculate fitting weights.

For example, weights can be calculated by energy above hull or energy by composition.
"""

from collections import defaultdict

import numpy as np
from pymatgen.analysis.phase_diagram import PDEntry, PhaseDiagram
from pymatgen.core import Composition

from smol.cofe.extern import EwaldTerm

__author__ = "Luis Barroso-Luque, William Davidson Richard"

from smol.constants import kB



[docs]
def unique_corr_vector_indices(
    wrangler, property_key, filter_by="min", cutoffs=None, return_compliment=False
):
    """Return set of indices of structures with unique correlation vectors.

    Keep structures with the min or max value of the given property.
    Note correlation vectors exclude external terms such that even if the
    external term is different for the structure but all correlations are
    the same, the structures will be considered duplicates.

    Args:
        wrangler (StructureWrangler):
            a StructureWrangler containing data to be filtered.
        property_key (str):
            name of property to consider when returning structure indices
            for the feature matrix with duplicate corr vectors removed.
        filter_by (str)
            the criteria for the property value to keep. Options are min
            or max.
        cutoffs (dict): optional
            Dictionary with cluster diameter cutoffs for correlation
            functions to consider in correlation vectors.
        return_compliment (bool): optional
            if True, will return the complement of the unique indices
    Returns:
        indices, complement
    """
    if filter_by not in ("max", "min"):
        raise ValueError(f"Filtering by {filter_by} is not an option.")

    choose_val = np.argmin if filter_by == "min" else np.argmax

    property_vector = wrangler.get_property_vector(property_key)
    dupe_inds = wrangler.get_duplicate_corr_indices(cutoffs=cutoffs)
    indices = {inds[choose_val(property_vector[inds])] for inds in dupe_inds}
    duplicates = {i for inds in dupe_inds for i in inds} - indices
    indices = list(set(range(wrangler.num_structures)) - duplicates)

    if return_compliment:
        return indices, list(duplicates)
    return indices




[docs]
def max_ewald_energy_indices(wrangler, max_relative_energy, return_compliment=False):
    """Return set of indices of structures by electrostatic interaction energy.

    Filter the input structures to remove those with low electrostatic
    energies (no large charge separation in cell). This energy is
    referenced to the lowest value at that composition. Note that this is
    before the division by the relative dielectric constant and is per
    primitive cell in the cluster expansion -- 1.5 eV/atom seems to be a
    reasonable value for dielectric constants around 10.

    Args:
        wrangler (StructureWrangler):
            a StructureWrangler containing data to be filtered.
        max_relative_energy (float):
            Ewald threshold. The maximum Ewald energy relative to the
            minimum energy at a structure's composition (normalized per prim).
        return_compliment (bool): optional
            If True will return the compliment of the unique indices.
    Returns:
        indices, compliment
    """
    ewald_energy = None
    for term in wrangler.cluster_subspace.external_terms:
        if isinstance(term, EwaldTerm):
            ewald_energy = [
                entry.data["correlations"][-1] for entry in wrangler.entries
            ]
    if ewald_energy is None:
        ewald_energy = []
        for entry in wrangler.entries:
            struct = entry.structure
            matrix = entry.data["supercell_matrix"]
            mapping = entry.data["site_mapping"]
            occu = wrangler.cluster_subspace.occupancy_from_structure(
                struct, encode=True, scmatrix=matrix, site_mapping=mapping
            )

            supercell = wrangler.cluster_subspace.structure.copy()
            supercell.make_supercell(matrix)
            size = wrangler.cluster_subspace.num_prims_from_matrix(matrix)
            term = EwaldTerm().value_from_occupancy(occu, supercell) / size
            ewald_energy.append(term)

    min_energy_by_comp = defaultdict(lambda: np.inf)
    for energy, entry in zip(ewald_energy, wrangler.entries):
        comp = entry.structure.composition.reduced_composition
        if energy < min_energy_by_comp[comp]:
            min_energy_by_comp[comp] = energy

    indices, compliment = [], []
    for i, (energy, entry) in enumerate(zip(ewald_energy, wrangler.entries)):
        min_energy = min_energy_by_comp[entry.structure.composition.reduced_composition]
        rel_energy = energy - min_energy
        if rel_energy < max_relative_energy:
            indices.append(i)
        else:
            compliment.append(i)

    if return_compliment:
        return indices, compliment
    return indices




[docs]
def weights_energy_above_composition(structures, energies, temperature=2000):
    """Compute weights for energy above the minimum reduced composition energy.

    Args:
        structures (list):
            list of pymatgen Structures
        energies (ndarray):
            energies of corresponding Structures
        temperature (float):
            temperature to use in Boltzmann weight

    Returns: weights for each structure.
        array
    """
    e_above_comp = _energies_above_composition(structures, energies)
    return np.exp(-e_above_comp / (kB * temperature))




[docs]
def weights_energy_above_hull(structures, energies, cs_structure, temperature=2000):
    """Compute weights for structure energy above the hull of given structures.

    Args:
        structures (list):
            list of pymatgen Structures
        energies (ndarray):
            energies of corresponding Structures.
        cs_structure (Structure):
            The pymatgen Structure used to define the ClusterSubspace
        temperature (float):
            temperature to use in Boltzmann weight.

    Returns: weights for each structure.
        array
    """
    e_above_hull = _energies_above_hull(structures, energies, cs_structure)
    return np.exp(-e_above_hull / (kB * temperature))



def _energies_above_hull(structures, energies, ce_structure):
    """Compute energies above hull.

    The hull is constructed from the phase diagram of the given structures.
    """
    phase_diagram = _pd(structures, energies, ce_structure)
    e_above_hull = []
    for structure, energy in zip(structures, energies):
        entry = PDEntry(structure.composition.element_composition, energy)
        e_above_hull.append(phase_diagram.get_e_above_hull(entry))
    return np.array(e_above_hull)


def _pd(structures, energies, cs_structure):
    """Generate a phase diagram with the structures and energies."""
    entries = []

    for structure, energy in zip(structures, energies):
        entries.append(PDEntry(structure.composition.element_composition, energy))

    max_e = max(entries, key=lambda e: e.energy_per_atom).energy_per_atom
    max_e += 1000
    for element in cs_structure.composition.keys():
        entry = PDEntry(Composition({element: 1}).element_composition, max_e)
        entries.append(entry)

    return PhaseDiagram(entries)


def _energies_above_composition(structures, energies):
    """Compute structure energies above reduced composition."""
    min_e = defaultdict(lambda: np.inf)
    for structure, energy in zip(structures, energies):
        comp = structure.composition.reduced_composition
        if energy / len(structure) < min_e[comp]:
            min_e[comp] = energy / len(structure)
    e_above = []
    for structure, energy in zip(structures, energies):
        comp = structure.composition.reduced_composition
        e_above.append(energy / len(structure) - min_e[comp])
    return np.array(e_above)