Skip to content

Commit

Permalink
add matbench_discovery/structure/prototype.py module for generating A…
Browse files Browse the repository at this point in the history
…FLOW-style protostructure labels

- replace aviary's get_protostructure_label_from_spglib with matbench_discovery.structure.prototype.get_protostructure_label in data/mp/get_mp_energies.py and data/wbm/compile_wbm_test_set.py
- add test cases for prototype structure labeling
- update 2023-01-10 mp-energies.csv.gz with moyo-based prototype labels (~50k out of 154k still missing where label generation failed so also keeping the original spglib labels for now)
  • Loading branch information
janosh committed Feb 2, 2025
1 parent b7d1435 commit 3057919
Show file tree
Hide file tree
Showing 10 changed files with 356 additions and 48 deletions.
Binary file removed data/mp/2023-01-10-mp-energies.csv.gz
Binary file not shown.
19 changes: 11 additions & 8 deletions data/mp/get_mp_energies.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Download all MP formation and above hull energies on 2023-01-10.
The main purpose of this script is produce the file at DataFiles.mp_energies.path.
Related EDA of MP formation energies:
https://github.com/janosh/pymatviz/blob/-/examples/mp_bimodal_e_form.ipynb
"""
Expand All @@ -9,14 +11,14 @@

import pandas as pd
import pymatviz as pmv
from aviary.wren.utils import get_protostructure_label_from_spglib
from mp_api.client import MPRester
from pymatgen.core import Structure
from pymatviz.enums import Key
from tqdm import tqdm

from matbench_discovery import STABILITY_THRESHOLD, today
from matbench_discovery.data import DataFiles
from matbench_discovery.structure import prototype

__author__ = "Janosh Riebesell"
__date__ = "2023-01-10"
Expand Down Expand Up @@ -63,17 +65,18 @@
)

df_cse[Key.structure] = [
Structure.from_dict(cse[Key.structure]) for cse in tqdm(df_cse.entry)
Structure.from_dict(cse[Key.structure])
for cse in tqdm(df_cse.entry, desc="Hydrating structures")
]
df_cse[Key.wyckoff] = [
get_protostructure_label_from_spglib(struct, errors="ignore")
for struct in tqdm(df_cse.structure)
df_cse[f"{Key.protostructure}_moyo"] = [
prototype.get_protostructure_label(struct)
for struct in tqdm(df_cse.structure, desc="Calculating proto-structure labels")
]
# make sure symmetry detection succeeded for all structures
assert df_cse[Key.wyckoff].str.startswith("invalid").sum() == 0
df_mp[Key.wyckoff] = df_cse[Key.wyckoff]
assert df_cse[f"{Key.protostructure}_moyo"].str.startswith("invalid").sum() == 0
df_mp[f"{Key.protostructure}_moyo"] = df_cse[f"{Key.protostructure}_moyo"]

spg_nums = df_mp[Key.wyckoff].str.split("_").str[2].astype(int)
spg_nums = df_mp[f"{Key.protostructure}_moyo"].str.split("_").str[2].astype(int)
# make sure all our spacegroup numbers match MP's
assert (spg_nums.sort_index() == df_spg["number"].sort_index()).all()

Expand Down
64 changes: 29 additions & 35 deletions data/wbm/compile_wbm_test_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from matbench_discovery.data import DataFiles
from matbench_discovery.energy import calc_energy_from_e_refs, mp_elemental_ref_energies
from matbench_discovery.enums import MbdKey
from matbench_discovery.structure import prototype

try:
import gdown
Expand Down Expand Up @@ -629,41 +630,34 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:


# %%
try:
from aviary.wren.utils import get_protostructure_label_from_spglib

# from initial structures
for idx in tqdm(df_wbm.index):
if not pd.isna(df_summary.loc[idx].get(MbdKey.init_wyckoff)):
continue # Aflow label already computed
try:
struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
df_summary.loc[idx, MbdKey.init_wyckoff] = (
get_protostructure_label_from_spglib(struct)
)
except Exception as exc:
print(f"{idx=} {exc=}")

# from relaxed structures
for idx in tqdm(df_wbm.index):
if not pd.isna(df_summary.loc[idx].get(Key.wyckoff)):
continue

try:
cse = df_wbm.loc[idx, Key.computed_structure_entry]
struct = Structure.from_dict(cse["structure"])
df_summary.loc[idx, Key.wyckoff] = get_protostructure_label_from_spglib(
struct
)
except Exception as exc:
print(f"{idx=} {exc=}")

assert df_summary[MbdKey.init_wyckoff].isna().sum() == 0
assert df_summary[Key.wyckoff].isna().sum() == 0
except ImportError:
print("aviary not installed, skipping Wyckoff label generation")
except Exception as exception:
print(f"Generating Aflow labels raised {exception=}")
# from initial structures
for idx in tqdm(df_wbm.index):
if not pd.isna(df_summary.loc[idx].get(MbdKey.init_wyckoff)):
continue # Aflow label already computed
try:
struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
df_summary.loc[idx, f"{Key.protostructure}_moyo_init"] = (
prototype.get_protostructure_label(struct)
)
except Exception as exc:
print(f"{idx=} {exc=}")

# from relaxed structures
for idx in tqdm(df_wbm.index):
if not pd.isna(df_summary.loc[idx].get(Key.wyckoff)):
continue

try:
cse = df_wbm.loc[idx, Key.computed_structure_entry]
struct = Structure.from_dict(cse["structure"])
df_summary.loc[idx, f"{Key.protostructure}_moyo_relaxed"] = (
prototype.get_protostructure_label(struct)
)
except Exception as exc:
print(f"{idx=} {exc=}")

assert df_summary[MbdKey.init_wyckoff].isna().sum() == 0
assert df_summary[Key.wyckoff].isna().sum() == 0


# %%
Expand Down
8 changes: 4 additions & 4 deletions matbench_discovery/data-files.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ mp_elemental_ref_entries:
md5: 6e93b6f38d6e27d6c811d3cafb23a070

mp_energies:
url: https://figshare.com/ndownloader/files/49083124
path: mp/2023-01-10-mp-energies.csv.gz
description: Materials Project formation energies and energies above convex hull in eV/atom as a fast-to-load CSV file
md5: 888579e287c8417e2202330c40e1367f
url: https://figshare.com/ndownloader/files/52080797
path: mp/2025-02-01-mp-energies.csv.gz
description: Materials Project formation energies and energies above convex hull in eV/atom as a fast-to-load CSV file.
md5: 7eb0c49fc169ba92783f2f6d0d19d741

mp_patched_phase_diagram:
url: https://figshare.com/ndownloader/files/48241624
Expand Down
2 changes: 1 addition & 1 deletion matbench_discovery/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ class DataFiles(Files):
"mp/2023-02-07-mp-computed-structure-entries.json.gz"
)
mp_elemental_ref_entries = "mp/2023-02-07-mp-elemental-reference-entries.json.gz"
mp_energies = "mp/2023-01-10-mp-energies.csv.gz"
mp_energies = "mp/2025-02-01-mp-energies.csv.gz"
mp_patched_phase_diagram = "mp/2023-02-07-ppd-mp.pkl.gz"
mp_trj_json_gz = "mp/2022-09-16-mp-trj.json.gz"
mp_trj_extxyz = "mp/2024-09-03-mp-trj.extxyz.zip"
Expand Down
211 changes: 211 additions & 0 deletions matbench_discovery/structure/prototype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
"""Get AFLOW prototype labels for crystal structures using Moyopy for symmetry
detection.
"""

import gzip
import itertools
import math
import os
import string
from typing import Final

import moyopy
import yaml
from pymatgen.core import Composition, Structure

__author__ = "Janosh Riebesell"
__date__ = "2022-12-02"

module_dir = os.path.dirname(__file__)

with gzip.open(
f"{module_dir}/wyckoff-position-multiplicities.yaml.gz", "rb"
) as gz_file:
wyckoff_multiplicity_dict = yaml.safe_load(gz_file)

with gzip.open(f"{module_dir}/wyckoff-position-relabelings.yaml.gz") as gz_file:
wyckoff_relabelings = yaml.safe_load(gz_file)

crys_sys_letters: Final[dict[str, str]] = {
"Triclinic": "a",
"Monoclinic": "m",
"Orthorhombic": "o",
"Tetragonal": "t",
"Trigonal": "h",
"Hexagonal": "h",
"Cubic": "c",
}


def get_pearson_symbol(moyo_data: moyopy.MoyoDataset) -> str:
"""Get the Pearson symbol for the structure from a MoyoDataset."""
hall_entry = moyopy.HallSymbolEntry(hall_number=moyo_data.hall_number)
spg_sym = hall_entry.hm_short
# Get centering from first letter of space group symbol, handle special case for
# C-centered
centering = "C" if spg_sym[0] in ("A", "B", "C", "S") else spg_sym[0]
n_sites = len(moyo_data.std_cell.numbers)
crys_sys = str(moyo_data.crystal_system)
return f"{crys_sys_letters[crys_sys]}{centering}{n_sites}"


def get_prototype_formula(composition: Composition, amt_tol: float = 1e-8) -> str:
"""Get anonymized formula for a Composition where species are in alphabetical
order and assigned ascending letters. This format is used in the Aflow
structure prototype labelling scheme.
Args:
composition (Composition): Pymatgen Composition to process
amt_tol (float): Tolerance for amount of species. Defaults to 1e-8.
Returns:
str: anonymized formula where the species are in alphabetical order
"""
reduced = composition.element_composition
if all(x == int(x) for x in composition.values()):
gcd = math.gcd(*map(int, composition.values()))
reduced /= gcd

return "".join(
f"{elem}{int(amt) if abs(amt % 1) < amt_tol else amt}" if amt != 1 else elem
for elem, amt in zip(
string.ascii_uppercase,
[reduced[key] for key in sorted(reduced, key=str)],
)
)


def canonicalize_wyckoffs(element_wyckoffs: str, spg_num: int) -> str:
"""Given an element ordering, canonicalize the associated Wyckoff positions
based on the alphabetical weight of equivalent choices of origin.
Args:
element_wyckoffs (str): wyckoff substring section from aflow_label with the
wyckoff letters for different elements separated by underscores.
spg_num (int | str): International space group number.
Returns:
str: element_wyckoff string with canonical ordering of the wyckoff letters.
"""
scored_wyckoffs: list[tuple[str, int]] = []

for trans in wyckoff_relabelings[spg_num]:
wyckoffs = element_wyckoffs.translate(str.maketrans(trans))
score: int = 0
sorted_wyckoffs: list[str] = []

for el_wyckoff_letter in wyckoffs.split("_"):
# Split into alpha and numeric groups
groups = [
"".join(grp)
for _, grp in itertools.groupby(el_wyckoff_letter, str.isalpha)
]
letters = [char for char in groups if char.isalpha()]
counts = [char for char in groups if char.isnumeric()]

# Sort by Wyckoff letter and build string
sorted_pairs = sorted(zip(counts, letters), key=lambda x: x[1])
sorted_wyckoffs.append(
"".join(
f"{count}{letter}" if count != "1" else letter
for count, letter in sorted_pairs
)
)
score += sum(0 if letter == "A" else ord(letter) - 96 for letter in letters)

scored_wyckoffs += [("_".join(sorted_wyckoffs), score)]

return min(scored_wyckoffs, key=lambda x: (x[1], x[0]))[0]


def get_protostructure_label(
struct: Structure, *, symprec: float = 0.1, raise_errors: bool = False
) -> str | None:
"""Get AFLOW-style proto-structure label using Moyopy for symmetry detection.
Args:
struct (Structure): pymatgen Structure object.
symprec (float): Initial symmetry precision for Moyopy. Defaults to 0.1.
raise_errors (bool): Whether to raise ValueError for failing structures or
return the error message as string instead of the prototype label. Defaults
to False.
Returns:
str: protostructure_label which is constructed as `aflow_label:chemsys` or
explanation of failure if symmetry detection failed and `raise_errors`
is False.
"""
import moyopy.interface

moyo_cell = moyopy.interface.MoyoAdapter.from_structure(struct)
symmetry_data = moyopy.MoyoDataset(moyo_cell, symprec=symprec)
spg_num = symmetry_data.number

# Group sites by orbit
orbit_groups: list[list[int]] = []
current_orbit: list[int] = []

for idx, orbit_id in enumerate(symmetry_data.orbits):
if not current_orbit or orbit_id == symmetry_data.orbits[current_orbit[0]]:
current_orbit += [idx]
else:
orbit_groups += [current_orbit]
current_orbit = [idx]
if current_orbit:
orbit_groups += [current_orbit]

# Create equivalent_wyckoff_labels from orbit groups
element_dict: dict[str, int] = {}
element_wyckoffs: list[str] = []

equivalent_wyckoff_labels = [
(
struct.species[orbit[0]].symbol,
len(orbit),
symmetry_data.wyckoffs[orbit[0]].translate(
str.maketrans("", "", string.digits)
),
)
for orbit in orbit_groups
]
equivalent_wyckoff_labels = sorted(
equivalent_wyckoff_labels, key=lambda x: (x[0], x[2])
)

for el, group in itertools.groupby(equivalent_wyckoff_labels, key=lambda x: x[0]):
# NOTE create a list from the iterator so that we can use it without exhausting
label_group = list(group)
element_dict[el] = sum(
wyckoff_multiplicity_dict[spg_num][itm[2]] for itm in label_group
)
wyckoff_counts_str = "".join(
f"{len(list(occurrences))}{wyk_letter}"
for wyk_letter, occurrences in itertools.groupby(
label_group, key=lambda x: x[2]
)
)

element_wyckoffs += [wyckoff_counts_str]

all_wyckoffs = canonicalize_wyckoffs("_".join(element_wyckoffs), spg_num)

# Build prototype label
prototype_label = (
f"{get_prototype_formula(struct.composition)}_"
f"{get_pearson_symbol(symmetry_data)}_"
f"{spg_num}_{all_wyckoffs}:{struct.chemical_system}"
)

# Verify multiplicities match composition
observed_formula = Composition(element_dict).reduced_formula
expected_formula = struct.composition.reduced_formula
if observed_formula != expected_formula:
err_msg = (
f"Invalid WP multiplicities - {prototype_label}, expected "
f"{observed_formula} to be {expected_formula}"
)
if raise_errors:
raise ValueError(err_msg)
return err_msg

return prototype_label
Binary file not shown.
Binary file not shown.
Binary file added tests/files/structures/esseneite.cif.gz
Binary file not shown.
Loading

0 comments on commit 3057919

Please sign in to comment.