add matbench_discovery/structure/prototype.py module for generating A…

…FLOW-style protostructure labels - replace aviary's get_protostructure_label_from_spglib with matbench_discovery.structure.prototype.get_protostructure_label in data/mp/get_mp_energies.py and data/wbm/compile_wbm_test_set.py - add test cases for prototype structure labeling - update 2023-01-10 mp-energies.csv.gz with moyo-based prototype labels (~50k out of 154k still missing where label generation failed so also keeping the original spglib labels for now)
janosh · Feb 2, 2025 · 3057919 · 3057919
1 parent b7d1435
commit 3057919
Show file tree

Hide file tree

Showing 10 changed files with 356 additions and 48 deletions.
diff --git a/data/mp/2023-01-10-mp-energies.csv.gz b/data/mp/2023-01-10-mp-energies.csv.gz
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -1,5 +1,7 @@
 """Download all MP formation and above hull energies on 2023-01-10.
 
+The main purpose of this script is produce the file at DataFiles.mp_energies.path.
+
 Related EDA of MP formation energies:
 https://github.com/janosh/pymatviz/blob/-/examples/mp_bimodal_e_form.ipynb
 """
@@ -9,14 +11,14 @@
 
 import pandas as pd
 import pymatviz as pmv
-from aviary.wren.utils import get_protostructure_label_from_spglib
 from mp_api.client import MPRester
 from pymatgen.core import Structure
 from pymatviz.enums import Key
 from tqdm import tqdm
 
 from matbench_discovery import STABILITY_THRESHOLD, today
 from matbench_discovery.data import DataFiles
+from matbench_discovery.structure import prototype
 
 __author__ = "Janosh Riebesell"
 __date__ = "2023-01-10"
@@ -63,17 +65,18 @@
 )
 
 df_cse[Key.structure] = [
-    Structure.from_dict(cse[Key.structure]) for cse in tqdm(df_cse.entry)
+    Structure.from_dict(cse[Key.structure])
+    for cse in tqdm(df_cse.entry, desc="Hydrating structures")
 ]
-df_cse[Key.wyckoff] = [
-    get_protostructure_label_from_spglib(struct, errors="ignore")
-    for struct in tqdm(df_cse.structure)
+df_cse[f"{Key.protostructure}_moyo"] = [
+    prototype.get_protostructure_label(struct)
+    for struct in tqdm(df_cse.structure, desc="Calculating proto-structure labels")
 ]
 # make sure symmetry detection succeeded for all structures
-assert df_cse[Key.wyckoff].str.startswith("invalid").sum() == 0
-df_mp[Key.wyckoff] = df_cse[Key.wyckoff]
+assert df_cse[f"{Key.protostructure}_moyo"].str.startswith("invalid").sum() == 0
+df_mp[f"{Key.protostructure}_moyo"] = df_cse[f"{Key.protostructure}_moyo"]
 
-spg_nums = df_mp[Key.wyckoff].str.split("_").str[2].astype(int)
+spg_nums = df_mp[f"{Key.protostructure}_moyo"].str.split("_").str[2].astype(int)
 # make sure all our spacegroup numbers match MP's
 assert (spg_nums.sort_index() == df_spg["number"].sort_index()).all()
 

diff --git a/data/wbm/compile_wbm_test_set.py b/data/wbm/compile_wbm_test_set.py
@@ -29,6 +29,7 @@
 from matbench_discovery.data import DataFiles
 from matbench_discovery.energy import calc_energy_from_e_refs, mp_elemental_ref_energies
 from matbench_discovery.enums import MbdKey
+from matbench_discovery.structure import prototype
 
 try:
     import gdown
@@ -629,41 +630,34 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 
 
 # %%
-try:
-    from aviary.wren.utils import get_protostructure_label_from_spglib
-
-    # from initial structures
-    for idx in tqdm(df_wbm.index):
-        if not pd.isna(df_summary.loc[idx].get(MbdKey.init_wyckoff)):
-            continue  # Aflow label already computed
-        try:
-            struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
-            df_summary.loc[idx, MbdKey.init_wyckoff] = (
-                get_protostructure_label_from_spglib(struct)
-            )
-        except Exception as exc:
-            print(f"{idx=} {exc=}")
-
-    # from relaxed structures
-    for idx in tqdm(df_wbm.index):
-        if not pd.isna(df_summary.loc[idx].get(Key.wyckoff)):
-            continue
-
-        try:
-            cse = df_wbm.loc[idx, Key.computed_structure_entry]
-            struct = Structure.from_dict(cse["structure"])
-            df_summary.loc[idx, Key.wyckoff] = get_protostructure_label_from_spglib(
-                struct
-            )
-        except Exception as exc:
-            print(f"{idx=} {exc=}")
-
-    assert df_summary[MbdKey.init_wyckoff].isna().sum() == 0
-    assert df_summary[Key.wyckoff].isna().sum() == 0
-except ImportError:
-    print("aviary not installed, skipping Wyckoff label generation")
-except Exception as exception:
-    print(f"Generating Aflow labels raised {exception=}")
+# from initial structures
+for idx in tqdm(df_wbm.index):
+    if not pd.isna(df_summary.loc[idx].get(MbdKey.init_wyckoff)):
+        continue  # Aflow label already computed
+    try:
+        struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
+        df_summary.loc[idx, f"{Key.protostructure}_moyo_init"] = (
+            prototype.get_protostructure_label(struct)
+        )
+    except Exception as exc:
+        print(f"{idx=} {exc=}")
+
+# from relaxed structures
+for idx in tqdm(df_wbm.index):
+    if not pd.isna(df_summary.loc[idx].get(Key.wyckoff)):
+        continue
+
+    try:
+        cse = df_wbm.loc[idx, Key.computed_structure_entry]
+        struct = Structure.from_dict(cse["structure"])
+        df_summary.loc[idx, f"{Key.protostructure}_moyo_relaxed"] = (
+            prototype.get_protostructure_label(struct)
+        )
+    except Exception as exc:
+        print(f"{idx=} {exc=}")
+
+assert df_summary[MbdKey.init_wyckoff].isna().sum() == 0
+assert df_summary[Key.wyckoff].isna().sum() == 0
 
 
 # %%

diff --git a/matbench_discovery/data-files.yml b/matbench_discovery/data-files.yml
@@ -22,10 +22,10 @@ mp_elemental_ref_entries:
   md5: 6e93b6f38d6e27d6c811d3cafb23a070
 
 mp_energies:
-  url: https://figshare.com/ndownloader/files/49083124
-  path: mp/2023-01-10-mp-energies.csv.gz
-  description: Materials Project formation energies and energies above convex hull in eV/atom as a fast-to-load CSV file
-  md5: 888579e287c8417e2202330c40e1367f
+  url: https://figshare.com/ndownloader/files/52080797
+  path: mp/2025-02-01-mp-energies.csv.gz
+  description: Materials Project formation energies and energies above convex hull in eV/atom as a fast-to-load CSV file.
+  md5: 7eb0c49fc169ba92783f2f6d0d19d741
 
 mp_patched_phase_diagram:
   url: https://figshare.com/ndownloader/files/48241624

diff --git a/matbench_discovery/data.py b/matbench_discovery/data.py
@@ -310,7 +310,7 @@ class DataFiles(Files):
         "mp/2023-02-07-mp-computed-structure-entries.json.gz"
     )
     mp_elemental_ref_entries = "mp/2023-02-07-mp-elemental-reference-entries.json.gz"
-    mp_energies = "mp/2023-01-10-mp-energies.csv.gz"
+    mp_energies = "mp/2025-02-01-mp-energies.csv.gz"
     mp_patched_phase_diagram = "mp/2023-02-07-ppd-mp.pkl.gz"
     mp_trj_json_gz = "mp/2022-09-16-mp-trj.json.gz"
     mp_trj_extxyz = "mp/2024-09-03-mp-trj.extxyz.zip"

diff --git a/matbench_discovery/structure/prototype.py b/matbench_discovery/structure/prototype.py
@@ -0,0 +1,211 @@
+"""Get AFLOW prototype labels for crystal structures using Moyopy for symmetry
+detection.
+"""
+
+import gzip
+import itertools
+import math
+import os
+import string
+from typing import Final
+
+import moyopy
+import yaml
+from pymatgen.core import Composition, Structure
+
+__author__ = "Janosh Riebesell"
+__date__ = "2022-12-02"
+
+module_dir = os.path.dirname(__file__)
+
+with gzip.open(
+    f"{module_dir}/wyckoff-position-multiplicities.yaml.gz", "rb"
+) as gz_file:
+    wyckoff_multiplicity_dict = yaml.safe_load(gz_file)
+
+with gzip.open(f"{module_dir}/wyckoff-position-relabelings.yaml.gz") as gz_file:
+    wyckoff_relabelings = yaml.safe_load(gz_file)
+
+crys_sys_letters: Final[dict[str, str]] = {
+    "Triclinic": "a",
+    "Monoclinic": "m",
+    "Orthorhombic": "o",
+    "Tetragonal": "t",
+    "Trigonal": "h",
+    "Hexagonal": "h",
+    "Cubic": "c",
+}
+
+
+def get_pearson_symbol(moyo_data: moyopy.MoyoDataset) -> str:
+    """Get the Pearson symbol for the structure from a MoyoDataset."""
+    hall_entry = moyopy.HallSymbolEntry(hall_number=moyo_data.hall_number)
+    spg_sym = hall_entry.hm_short
+    # Get centering from first letter of space group symbol, handle special case for
+    # C-centered
+    centering = "C" if spg_sym[0] in ("A", "B", "C", "S") else spg_sym[0]
+    n_sites = len(moyo_data.std_cell.numbers)
+    crys_sys = str(moyo_data.crystal_system)
+    return f"{crys_sys_letters[crys_sys]}{centering}{n_sites}"
+
+
+def get_prototype_formula(composition: Composition, amt_tol: float = 1e-8) -> str:
+    """Get anonymized formula for a Composition where species are in alphabetical
+    order and assigned ascending letters. This format is used in the Aflow
+    structure prototype labelling scheme.
+
+    Args:
+        composition (Composition): Pymatgen Composition to process
+        amt_tol (float): Tolerance for amount of species. Defaults to 1e-8.
+
+    Returns:
+        str: anonymized formula where the species are in alphabetical order
+    """
+    reduced = composition.element_composition
+    if all(x == int(x) for x in composition.values()):
+        gcd = math.gcd(*map(int, composition.values()))
+        reduced /= gcd
+
+    return "".join(
+        f"{elem}{int(amt) if abs(amt % 1) < amt_tol else amt}" if amt != 1 else elem
+        for elem, amt in zip(
+            string.ascii_uppercase,
+            [reduced[key] for key in sorted(reduced, key=str)],
+        )
+    )
+
+
+def canonicalize_wyckoffs(element_wyckoffs: str, spg_num: int) -> str:
+    """Given an element ordering, canonicalize the associated Wyckoff positions
+    based on the alphabetical weight of equivalent choices of origin.
+
+    Args:
+        element_wyckoffs (str): wyckoff substring section from aflow_label with the
+            wyckoff letters for different elements separated by underscores.
+        spg_num (int | str): International space group number.
+
+    Returns:
+        str: element_wyckoff string with canonical ordering of the wyckoff letters.
+    """
+    scored_wyckoffs: list[tuple[str, int]] = []
+
+    for trans in wyckoff_relabelings[spg_num]:
+        wyckoffs = element_wyckoffs.translate(str.maketrans(trans))
+        score: int = 0
+        sorted_wyckoffs: list[str] = []
+
+        for el_wyckoff_letter in wyckoffs.split("_"):
+            # Split into alpha and numeric groups
+            groups = [
+                "".join(grp)
+                for _, grp in itertools.groupby(el_wyckoff_letter, str.isalpha)
+            ]
+            letters = [char for char in groups if char.isalpha()]
+            counts = [char for char in groups if char.isnumeric()]
+
+            # Sort by Wyckoff letter and build string
+            sorted_pairs = sorted(zip(counts, letters), key=lambda x: x[1])
+            sorted_wyckoffs.append(
+                "".join(
+                    f"{count}{letter}" if count != "1" else letter
+                    for count, letter in sorted_pairs
+                )
+            )
+            score += sum(0 if letter == "A" else ord(letter) - 96 for letter in letters)
+
+        scored_wyckoffs += [("_".join(sorted_wyckoffs), score)]
+
+    return min(scored_wyckoffs, key=lambda x: (x[1], x[0]))[0]
+
+
+def get_protostructure_label(
+    struct: Structure, *, symprec: float = 0.1, raise_errors: bool = False
+) -> str | None:
+    """Get AFLOW-style proto-structure label using Moyopy for symmetry detection.
+
+    Args:
+        struct (Structure): pymatgen Structure object.
+        symprec (float): Initial symmetry precision for Moyopy. Defaults to 0.1.
+        raise_errors (bool): Whether to raise ValueError for failing structures or
+            return the error message as string instead of the prototype label. Defaults
+            to False.
+
+    Returns:
+        str: protostructure_label which is constructed as `aflow_label:chemsys` or
+            explanation of failure if symmetry detection failed and `raise_errors`
+            is False.
+    """
+    import moyopy.interface
+
+    moyo_cell = moyopy.interface.MoyoAdapter.from_structure(struct)
+    symmetry_data = moyopy.MoyoDataset(moyo_cell, symprec=symprec)
+    spg_num = symmetry_data.number
+
+    # Group sites by orbit
+    orbit_groups: list[list[int]] = []
+    current_orbit: list[int] = []
+
+    for idx, orbit_id in enumerate(symmetry_data.orbits):
+        if not current_orbit or orbit_id == symmetry_data.orbits[current_orbit[0]]:
+            current_orbit += [idx]
+        else:
+            orbit_groups += [current_orbit]
+            current_orbit = [idx]
+    if current_orbit:
+        orbit_groups += [current_orbit]
+
+    # Create equivalent_wyckoff_labels from orbit groups
+    element_dict: dict[str, int] = {}
+    element_wyckoffs: list[str] = []
+
+    equivalent_wyckoff_labels = [
+        (
+            struct.species[orbit[0]].symbol,
+            len(orbit),
+            symmetry_data.wyckoffs[orbit[0]].translate(
+                str.maketrans("", "", string.digits)
+            ),
+        )
+        for orbit in orbit_groups
+    ]
+    equivalent_wyckoff_labels = sorted(
+        equivalent_wyckoff_labels, key=lambda x: (x[0], x[2])
+    )
+
+    for el, group in itertools.groupby(equivalent_wyckoff_labels, key=lambda x: x[0]):
+        # NOTE create a list from the iterator so that we can use it without exhausting
+        label_group = list(group)
+        element_dict[el] = sum(
+            wyckoff_multiplicity_dict[spg_num][itm[2]] for itm in label_group
+        )
+        wyckoff_counts_str = "".join(
+            f"{len(list(occurrences))}{wyk_letter}"
+            for wyk_letter, occurrences in itertools.groupby(
+                label_group, key=lambda x: x[2]
+            )
+        )
+
+        element_wyckoffs += [wyckoff_counts_str]
+
+    all_wyckoffs = canonicalize_wyckoffs("_".join(element_wyckoffs), spg_num)
+
+    # Build prototype label
+    prototype_label = (
+        f"{get_prototype_formula(struct.composition)}_"
+        f"{get_pearson_symbol(symmetry_data)}_"
+        f"{spg_num}_{all_wyckoffs}:{struct.chemical_system}"
+    )
+
+    # Verify multiplicities match composition
+    observed_formula = Composition(element_dict).reduced_formula
+    expected_formula = struct.composition.reduced_formula
+    if observed_formula != expected_formula:
+        err_msg = (
+            f"Invalid WP multiplicities - {prototype_label}, expected "
+            f"{observed_formula} to be {expected_formula}"
+        )
+        if raise_errors:
+            raise ValueError(err_msg)
+        return err_msg
+
+    return prototype_label
diff --git a/matbench_discovery/structure/wyckoff-position-multiplicities.yaml.gz b/matbench_discovery/structure/wyckoff-position-multiplicities.yaml.gz
diff --git a/matbench_discovery/structure/wyckoff-position-relabelings.yaml.gz b/matbench_discovery/structure/wyckoff-position-relabelings.yaml.gz
diff --git a/tests/files/structures/esseneite.cif.gz b/tests/files/structures/esseneite.cif.gz