janosh · janosh · Feb 9, 2025 · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025
diff --git a/data/mp/2023-01-10-mp-energies.csv.gz b/data/mp/2023-01-10-mp-energies.csv.gz
diff --git a/data/mp/get_mp_energies.py b/data/mp/get_mp_energies.py
@@ -1,5 +1,7 @@
 """Download all MP formation and above hull energies on 2023-01-10.
 
+The main purpose of this script is produce the file at DataFiles.mp_energies.path.
+
 Related EDA of MP formation energies:
 https://github.com/janosh/pymatviz/blob/-/examples/mp_bimodal_e_form.ipynb
 """
@@ -9,14 +11,14 @@
 
 import pandas as pd
 import pymatviz as pmv
-from aviary.wren.utils import get_protostructure_label_from_spglib
 from mp_api.client import MPRester
 from pymatgen.core import Structure
 from pymatviz.enums import Key
 from tqdm import tqdm
 
 from matbench_discovery import STABILITY_THRESHOLD, today
-from matbench_discovery.enums import DataFiles, MbdKey
+from matbench_discovery.data import DataFiles
+from matbench_discovery.structure import prototype
 
 __author__ = "Janosh Riebesell"
 __date__ = "2023-01-10"
@@ -63,16 +65,18 @@
 )
 
 df_cse[Key.structure] = [
-    Structure.from_dict(cse[Key.structure]) for cse in tqdm(df_cse.entry)
+    Structure.from_dict(cse[Key.structure])
+    for cse in tqdm(df_cse.entry, desc="Hydrating structures")
 ]
-df_cse[MbdKey.wyckoff_spglib] = [
-    get_protostructure_label_from_spglib(struct) for struct in tqdm(df_cse.structure)
+df_cse[f"{Key.protostructure}_moyo"] = [
+    prototype.get_protostructure_label(struct)
+    for struct in tqdm(df_cse.structure, desc="Calculating proto-structure labels")
 ]
 # make sure symmetry detection succeeded for all structures
-assert df_cse[MbdKey.wyckoff_spglib].str.startswith("invalid").sum() == 0
-df_mp[MbdKey.wyckoff_spglib] = df_cse[MbdKey.wyckoff_spglib]
+assert df_cse[f"{Key.protostructure}_moyo"].str.startswith("invalid").sum() == 0
+df_mp[f"{Key.protostructure}_moyo"] = df_cse[f"{Key.protostructure}_moyo"]
 
-spg_nums = df_mp[MbdKey.wyckoff_spglib].str.split("_").str[2].astype(int)
+spg_nums = df_mp[f"{Key.protostructure}_moyo"].str.split("_").str[2].astype(int)
 # make sure all our spacegroup numbers match MP's
 assert (spg_nums.sort_index() == df_spg["number"].sort_index()).all()
 

diff --git a/data/wbm/compile_wbm_test_set.py b/data/wbm/compile_wbm_test_set.py
@@ -26,8 +26,10 @@
 from tqdm import tqdm
 
 from matbench_discovery import PDF_FIGS, SITE_FIGS, WBM_DIR, today
+from matbench_discovery.data import DataFiles
 from matbench_discovery.energy import calc_energy_from_e_refs, mp_elemental_ref_energies
-from matbench_discovery.enums import DataFiles, MbdKey
+from matbench_discovery.enums import MbdKey
+from matbench_discovery.structure import prototype
 
 try:
     import gdown
@@ -628,41 +630,34 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:
 
 
 # %%
-try:
-    from aviary.wren.utils import get_protostructure_label_from_spglib
-
-    # from initial structures
-    for idx in tqdm(df_wbm.index):
-        if not pd.isna(df_summary.loc[idx].get(MbdKey.init_wyckoff_spglib)):
-            continue  # Aflow label already computed
-        try:
-            struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
-            df_summary.loc[idx, MbdKey.init_wyckoff_spglib] = (
-                get_protostructure_label_from_spglib(struct)
-            )
-        except Exception as exc:
-            print(f"{idx=} {exc=}")
-
-    # from relaxed structures
-    for idx in tqdm(df_wbm.index):
-        if not pd.isna(df_summary.loc[idx].get(MbdKey.wyckoff_spglib)):
-            continue
-
-        try:
-            cse = df_wbm.loc[idx, Key.computed_structure_entry]
-            struct = Structure.from_dict(cse["structure"])
-            df_summary.loc[idx, Key.wyckoff] = get_protostructure_label_from_spglib(
-                struct
-            )
-        except Exception as exc:
-            print(f"{idx=} {exc=}")
-
-    assert df_summary[MbdKey.init_wyckoff_spglib].isna().sum() == 0
-    assert df_summary[MbdKey.wyckoff_spglib].isna().sum() == 0
-except ImportError:
-    print("aviary not installed, skipping Wyckoff label generation")
-except Exception as exception:
-    print(f"Generating Aflow labels raised {exception=}")
+# from initial structures
+for idx in tqdm(df_wbm.index):
+    if not pd.isna(df_summary.loc[idx].get(MbdKey.init_wyckoff_spglib)):
+        continue  # Aflow label already computed
+    try:
+        struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
+        df_summary.loc[idx, f"{Key.protostructure}_moyo_init"] = (
+            prototype.get_protostructure_label(struct)
+        )
+    except Exception as exc:
+        print(f"{idx=} {exc=}")
+
+# from relaxed structures
+for idx in tqdm(df_wbm.index):
+    if not pd.isna(df_summary.loc[idx].get(Key.wyckoff)):
+        continue
+
+    try:
+        cse = df_wbm.loc[idx, Key.computed_structure_entry]
+        struct = Structure.from_dict(cse["structure"])
+        df_summary.loc[idx, f"{Key.protostructure}_moyo_relaxed"] = (
+            prototype.get_protostructure_label(struct)
+        )
+    except Exception as exc:
+        print(f"{idx=} {exc=}")
+
+assert df_summary[MbdKey.init_wyckoff_spglib].isna().sum() == 0
+assert df_summary[Key.wyckoff].isna().sum() == 0
 
 
 # %%

diff --git a/matbench_discovery/data-files.yml b/matbench_discovery/data-files.yml
@@ -22,10 +22,10 @@ mp_elemental_ref_entries:
   md5: 6e93b6f38d6e27d6c811d3cafb23a070
 
 mp_energies:
-  url: https://figshare.com/ndownloader/files/49083124
-  path: mp/2023-01-10-mp-energies.csv.gz
-  description: Materials Project formation energies and energies above convex hull in eV/atom as a fast-to-load CSV file
-  md5: 888579e287c8417e2202330c40e1367f
+  url: https://figshare.com/ndownloader/files/52080797
+  path: mp/2025-02-01-mp-energies.csv.gz
+  description: Materials Project formation energies and energies above convex hull in eV/atom as a fast-to-load CSV file.
+  md5: 7eb0c49fc169ba92783f2f6d0d19d741
 
 mp_patched_phase_diagram:
   url: https://figshare.com/ndownloader/files/48241624

diff --git a/matbench_discovery/enums.py b/matbench_discovery/enums.py
@@ -337,7 +337,7 @@ def label(self) -> str:
         return self.metadata["model_name"]
 
     @property
-    def url(self) -> str:
+    def pr_url(self) -> str:
         """Pull request URL in which the model was originally added to the repo."""
         return self.metadata["pr_url"]
 

diff --git a/matbench_discovery/structure/__init__.py b/matbench_discovery/structure/__init__.py
@@ -0,0 +1,31 @@
+"""Perturb atomic coordinates of a pymatgen structure."""
+
+import numpy as np
+from pymatgen.core import Structure
+
+rng = np.random.default_rng(seed=0)  # ensure reproducible structure perturbations
+
+
+def perturb_structure(struct: Structure, gamma: float = 1.5) -> Structure:
+    """Perturb the atomic coordinates of a pymatgen structure. Used for CGCNN+P
+    training set augmentation.
+
+    Not identical but very similar to the perturbation method used in
+    https://nature.com/articles/s41524-022-00891-8#Fig5.
+
+    Args:
+        struct (Structure): pymatgen structure to be perturbed
+        gamma (float, optional): Weibull distribution parameter. Defaults to 1.5.
+
+    Returns:
+        Structure: Perturbed structure
+    """
+    perturbed = struct.copy()
+    for site in perturbed:
+        magnitude = rng.weibull(gamma)
+        vec = rng.normal(3)  # TODO maybe make func recursive to deal with 0-vector
+        vec /= np.linalg.norm(vec)  # unit vector
+        site.coords += vec * magnitude
+        site.to_unit_cell(in_place=True)
+
+    return perturbed
diff --git a/matbench_discovery/structure/prototype.py b/matbench_discovery/structure/prototype.py
@@ -0,0 +1,195 @@
+"""Get AFLOW prototype labels for crystal structures using Moyopy for symmetry
+detection.
+"""
+
+import gzip
+import itertools
+import math
+import os
+import string
+from typing import Final
+
+import ase
+import yaml
+from pymatgen.core import Composition, Structure
+
+module_dir = os.path.dirname(__file__)
+
+with gzip.open(
+    f"{module_dir}/wyckoff-position-multiplicities.yaml.gz", "rb"
+) as gz_file:
+    wyckoff_multiplicity_dict = yaml.safe_load(gz_file)
+
+with gzip.open(f"{module_dir}/wyckoff-position-relabelings.yaml.gz") as gz_file:
+    wyckoff_relabelings = yaml.safe_load(gz_file)
+
+crys_sys_letters: Final[dict[str, str]] = {
+    "Triclinic": "a",
+    "Monoclinic": "m",
+    "Orthorhombic": "o",
+    "Tetragonal": "t",
+    "Trigonal": "h",
+    "Hexagonal": "h",
+    "Cubic": "c",
+}
+
+
+def get_prototype_formula(composition: Composition, amt_tol: float = 1e-8) -> str:
+    """Get anonymized formula for a Composition where species are in alphabetical
+    order and assigned ascending letters. This format is used in the Aflow
+    structure prototype labelling scheme.
+
+    Args:
+        composition (Composition): Pymatgen Composition to process
+        amt_tol (float): Tolerance for amount of species. Defaults to 1e-8.
+
+    Returns:
+        str: anonymized formula where the species are in alphabetical order
+    """
+    reduced = composition.element_composition
+    if all(x == int(x) for x in composition.values()):
+        gcd = math.gcd(*map(int, composition.values()))
+        reduced /= gcd
+
+    return "".join(
+        f"{elem}{int(amt) if abs(amt % 1) < amt_tol else amt}" if amt != 1 else elem
+        for elem, amt in zip(
+            string.ascii_uppercase,
+            [reduced[key] for key in sorted(reduced, key=str)],
+        )
+    )
+
+
+def canonicalize_wyckoffs(element_wyckoffs: str, spg_num: int) -> str:
+    """Given an element ordering, canonicalize the associated Wyckoff positions
+    based on the alphabetical weight of equivalent choices of origin.
+
+    Args:
+        element_wyckoffs (str): wyckoff substring section from aflow_label with the
+            wyckoff letters for different elements separated by underscores.
+        spg_num (int | str): International space group number.
+
+    Returns:
+        str: element_wyckoff string with canonical ordering of the wyckoff letters.
+    """
+    scored_wyckoffs: list[tuple[str, int]] = []
+
+    for trans in wyckoff_relabelings[spg_num]:
+        wyckoffs = element_wyckoffs.translate(str.maketrans(trans))
+        score: int = 0
+        sorted_wyckoffs: list[str] = []
+
+        for el_wyckoff_letter in wyckoffs.split("_"):
+            # Split into alpha and numeric groups
+            groups = [
+                "".join(grp)
+                for _, grp in itertools.groupby(el_wyckoff_letter, str.isalpha)
+            ]
+            letters = [char for char in groups if char.isalpha()]
+            counts = [char for char in groups if char.isnumeric()]
+
+            # Sort by Wyckoff letter and build string
+            sorted_pairs = sorted(zip(counts, letters), key=lambda x: x[1])
+            sorted_wyckoffs.append(
+                "".join(
+                    f"{count}{letter}" if count != "1" else letter
+                    for count, letter in sorted_pairs
+                )
+            )
+            score += sum(0 if letter == "A" else ord(letter) - 96 for letter in letters)
+
+        scored_wyckoffs += [("_".join(sorted_wyckoffs), score)]
+
+    return min(scored_wyckoffs, key=lambda x: (x[1], x[0]))[0]
+
+
+def get_protostructure_label(
+    struct: Structure | ase.Atoms, *, symprec: float = 0.1, raise_errors: bool = False
+) -> str | None:
+    """Get AFLOW-style proto-structure label using Moyopy for symmetry detection.
+
+    Args:
+        struct (Structure | ase.Atoms): pymatgen Structure object or ase.Atoms object.
+        symprec (float): Initial symmetry precision for Moyopy. Defaults to 0.1.
+        raise_errors (bool): Whether to raise ValueError for failing structures or
+            return the error message as string instead of the prototype label. Defaults
+            to False.
+
+    Returns:
+        str: protostructure_label which is constructed as `aflow_label:chemsys` or
+            explanation of failure if symmetry detection failed and `raise_errors`
+            is False.
+    """
+    import moyopy.interface
+
+    if isinstance(struct, ase.Atoms):
+        from pymatgen.io.ase import AseAtomsAdaptor
+
+        struct = AseAtomsAdaptor.get_structure(struct)
+
+    moyo_cell = moyopy.interface.MoyoAdapter.from_py_obj(struct)
+    symmetry_data = moyopy.MoyoDataset(moyo_cell, symprec=symprec)
+    spg_num = symmetry_data.number
+
+    # Group sites by orbit
+    orbit_groups: dict[int, list[int]] = {}
+
+    for idx, orbit_id in enumerate(symmetry_data.orbits):
+        if orbit_id not in orbit_groups:
+            orbit_groups[orbit_id] = []
+        orbit_groups[orbit_id].append(idx)
+
+    # Create equivalent_wyckoff_labels from orbit groups
+    element_dict: dict[str, int] = {}
+    element_wyckoffs: list[str] = []
+
+    equivalent_wyckoff_labels = [
+        (
+            struct.species[orbit[0]].symbol,
+            len(orbit),
+            symmetry_data.wyckoffs[orbit[0]].translate(
+                str.maketrans("", "", string.digits)
+            ),
+        )
+        for orbit in orbit_groups.values()
+    ]
+    equivalent_wyckoff_labels = sorted(
+        equivalent_wyckoff_labels, key=lambda x: (x[0], x[2])
+    )
+
+    for el, group in itertools.groupby(equivalent_wyckoff_labels, key=lambda x: x[0]):
+        # NOTE create a list from the iterator so that we can use it without exhausting
+        label_group = list(group)
+        element_dict[el] = sum(
+            wyckoff_multiplicity_dict[spg_num][itm[2]] for itm in label_group
+        )
+        wyckoff_counts_str = "".join(
+            f"{len(list(occurrences))}{wyk_letter}"
+            for wyk_letter, occurrences in itertools.groupby(
+                label_group, key=lambda x: x[2]
+            )
+        )
+
+        element_wyckoffs += [wyckoff_counts_str]
+
+    all_wyckoffs = canonicalize_wyckoffs("_".join(element_wyckoffs), spg_num)
+
+    # Build prototype label
+    prototype_label = (
+        f"{get_prototype_formula(struct.composition)}_"
+        f"{symmetry_data.pearson_symbol}_"
+        f"{spg_num}_{all_wyckoffs}:{struct.chemical_system}"
+    )
+
+    # Verify multiplicities match composition
+    observed_formula = Composition(element_dict).reduced_formula
+    if observed_formula != struct.reduced_formula:
+        err_msg = (
+            f"Invalid WP multiplicities - {prototype_label}, expected "
+            f"{observed_formula} to be {struct.reduced_formula}"
+        )
+        if raise_errors:
+            raise ValueError(err_msg)
+        return err_msg
+
+    return prototype_label