Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New prototype.py module with moyopy-powered AFLOW-style proto-structure labeling #198

Merged
merged 18 commits into from
Feb 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed data/mp/2023-01-10-mp-energies.csv.gz
Binary file not shown.
20 changes: 12 additions & 8 deletions data/mp/get_mp_energies.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Download all MP formation and above hull energies on 2023-01-10.

The main purpose of this script is produce the file at DataFiles.mp_energies.path.

Related EDA of MP formation energies:
https://github.com/janosh/pymatviz/blob/-/examples/mp_bimodal_e_form.ipynb
"""
Expand All @@ -9,14 +11,14 @@

import pandas as pd
import pymatviz as pmv
from aviary.wren.utils import get_protostructure_label_from_spglib
from mp_api.client import MPRester
from pymatgen.core import Structure
from pymatviz.enums import Key
from tqdm import tqdm

from matbench_discovery import STABILITY_THRESHOLD, today
from matbench_discovery.enums import DataFiles, MbdKey
from matbench_discovery.data import DataFiles
from matbench_discovery.structure import prototype

__author__ = "Janosh Riebesell"
__date__ = "2023-01-10"
Expand Down Expand Up @@ -63,16 +65,18 @@
)

df_cse[Key.structure] = [
Structure.from_dict(cse[Key.structure]) for cse in tqdm(df_cse.entry)
Structure.from_dict(cse[Key.structure])
for cse in tqdm(df_cse.entry, desc="Hydrating structures")
]
df_cse[MbdKey.wyckoff_spglib] = [
get_protostructure_label_from_spglib(struct) for struct in tqdm(df_cse.structure)
df_cse[f"{Key.protostructure}_moyo"] = [
prototype.get_protostructure_label(struct)
for struct in tqdm(df_cse.structure, desc="Calculating proto-structure labels")
]
# make sure symmetry detection succeeded for all structures
assert df_cse[MbdKey.wyckoff_spglib].str.startswith("invalid").sum() == 0
df_mp[MbdKey.wyckoff_spglib] = df_cse[MbdKey.wyckoff_spglib]
assert df_cse[f"{Key.protostructure}_moyo"].str.startswith("invalid").sum() == 0
df_mp[f"{Key.protostructure}_moyo"] = df_cse[f"{Key.protostructure}_moyo"]

spg_nums = df_mp[MbdKey.wyckoff_spglib].str.split("_").str[2].astype(int)
spg_nums = df_mp[f"{Key.protostructure}_moyo"].str.split("_").str[2].astype(int)
# make sure all our spacegroup numbers match MP's
assert (spg_nums.sort_index() == df_spg["number"].sort_index()).all()

Expand Down
67 changes: 31 additions & 36 deletions data/wbm/compile_wbm_test_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@
from tqdm import tqdm

from matbench_discovery import PDF_FIGS, SITE_FIGS, WBM_DIR, today
from matbench_discovery.data import DataFiles
from matbench_discovery.energy import calc_energy_from_e_refs, mp_elemental_ref_energies
from matbench_discovery.enums import DataFiles, MbdKey
from matbench_discovery.enums import MbdKey
from matbench_discovery.structure import prototype

try:
import gdown
Expand Down Expand Up @@ -628,41 +630,34 @@ def fix_bad_struct_index_mismatch(material_id: str) -> str:


# %%
try:
from aviary.wren.utils import get_protostructure_label_from_spglib

# from initial structures
for idx in tqdm(df_wbm.index):
if not pd.isna(df_summary.loc[idx].get(MbdKey.init_wyckoff_spglib)):
continue # Aflow label already computed
try:
struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
df_summary.loc[idx, MbdKey.init_wyckoff_spglib] = (
get_protostructure_label_from_spglib(struct)
)
except Exception as exc:
print(f"{idx=} {exc=}")

# from relaxed structures
for idx in tqdm(df_wbm.index):
if not pd.isna(df_summary.loc[idx].get(MbdKey.wyckoff_spglib)):
continue

try:
cse = df_wbm.loc[idx, Key.computed_structure_entry]
struct = Structure.from_dict(cse["structure"])
df_summary.loc[idx, Key.wyckoff] = get_protostructure_label_from_spglib(
struct
)
except Exception as exc:
print(f"{idx=} {exc=}")

assert df_summary[MbdKey.init_wyckoff_spglib].isna().sum() == 0
assert df_summary[MbdKey.wyckoff_spglib].isna().sum() == 0
except ImportError:
print("aviary not installed, skipping Wyckoff label generation")
except Exception as exception:
print(f"Generating Aflow labels raised {exception=}")
# from initial structures
for idx in tqdm(df_wbm.index):
if not pd.isna(df_summary.loc[idx].get(MbdKey.init_wyckoff_spglib)):
continue # Aflow label already computed
try:
struct = Structure.from_dict(df_wbm.loc[idx, Key.init_struct])
df_summary.loc[idx, f"{Key.protostructure}_moyo_init"] = (
prototype.get_protostructure_label(struct)
)
except Exception as exc:
print(f"{idx=} {exc=}")

# from relaxed structures
for idx in tqdm(df_wbm.index):
if not pd.isna(df_summary.loc[idx].get(Key.wyckoff)):
continue

try:
cse = df_wbm.loc[idx, Key.computed_structure_entry]
struct = Structure.from_dict(cse["structure"])
df_summary.loc[idx, f"{Key.protostructure}_moyo_relaxed"] = (
prototype.get_protostructure_label(struct)
)
except Exception as exc:
print(f"{idx=} {exc=}")

assert df_summary[MbdKey.init_wyckoff_spglib].isna().sum() == 0
assert df_summary[Key.wyckoff].isna().sum() == 0


# %%
Expand Down
8 changes: 4 additions & 4 deletions matbench_discovery/data-files.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ mp_elemental_ref_entries:
md5: 6e93b6f38d6e27d6c811d3cafb23a070

mp_energies:
url: https://figshare.com/ndownloader/files/49083124
path: mp/2023-01-10-mp-energies.csv.gz
description: Materials Project formation energies and energies above convex hull in eV/atom as a fast-to-load CSV file
md5: 888579e287c8417e2202330c40e1367f
url: https://figshare.com/ndownloader/files/52080797
path: mp/2025-02-01-mp-energies.csv.gz
description: Materials Project formation energies and energies above convex hull in eV/atom as a fast-to-load CSV file.
md5: 7eb0c49fc169ba92783f2f6d0d19d741

mp_patched_phase_diagram:
url: https://figshare.com/ndownloader/files/48241624
Expand Down
2 changes: 1 addition & 1 deletion matbench_discovery/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def label(self) -> str:
return self.metadata["model_name"]

@property
def url(self) -> str:
def pr_url(self) -> str:
"""Pull request URL in which the model was originally added to the repo."""
return self.metadata["pr_url"]

Expand Down
31 changes: 31 additions & 0 deletions matbench_discovery/structure/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Perturb atomic coordinates of a pymatgen structure."""

import numpy as np
from pymatgen.core import Structure

rng = np.random.default_rng(seed=0) # ensure reproducible structure perturbations


def perturb_structure(struct: Structure, gamma: float = 1.5) -> Structure:
"""Perturb the atomic coordinates of a pymatgen structure. Used for CGCNN+P
training set augmentation.

Not identical but very similar to the perturbation method used in
https://nature.com/articles/s41524-022-00891-8#Fig5.

Args:
struct (Structure): pymatgen structure to be perturbed
gamma (float, optional): Weibull distribution parameter. Defaults to 1.5.

Returns:
Structure: Perturbed structure
"""
perturbed = struct.copy()
for site in perturbed:
magnitude = rng.weibull(gamma)
vec = rng.normal(3) # TODO maybe make func recursive to deal with 0-vector
vec /= np.linalg.norm(vec) # unit vector
site.coords += vec * magnitude
site.to_unit_cell(in_place=True)

return perturbed
195 changes: 195 additions & 0 deletions matbench_discovery/structure/prototype.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
"""Get AFLOW prototype labels for crystal structures using Moyopy for symmetry
detection.
"""

import gzip
import itertools
import math
import os
import string
from typing import Final

import ase
import yaml
from pymatgen.core import Composition, Structure

module_dir = os.path.dirname(__file__)

with gzip.open(
f"{module_dir}/wyckoff-position-multiplicities.yaml.gz", "rb"
) as gz_file:
wyckoff_multiplicity_dict = yaml.safe_load(gz_file)

with gzip.open(f"{module_dir}/wyckoff-position-relabelings.yaml.gz") as gz_file:
wyckoff_relabelings = yaml.safe_load(gz_file)

crys_sys_letters: Final[dict[str, str]] = {
"Triclinic": "a",
"Monoclinic": "m",
"Orthorhombic": "o",
"Tetragonal": "t",
"Trigonal": "h",
"Hexagonal": "h",
"Cubic": "c",
}


def get_prototype_formula(composition: Composition, amt_tol: float = 1e-8) -> str:
"""Get anonymized formula for a Composition where species are in alphabetical
order and assigned ascending letters. This format is used in the Aflow
structure prototype labelling scheme.

Args:
composition (Composition): Pymatgen Composition to process
amt_tol (float): Tolerance for amount of species. Defaults to 1e-8.

Returns:
str: anonymized formula where the species are in alphabetical order
"""
reduced = composition.element_composition
if all(x == int(x) for x in composition.values()):
gcd = math.gcd(*map(int, composition.values()))
reduced /= gcd

return "".join(
f"{elem}{int(amt) if abs(amt % 1) < amt_tol else amt}" if amt != 1 else elem
for elem, amt in zip(
string.ascii_uppercase,
[reduced[key] for key in sorted(reduced, key=str)],
)
)


def canonicalize_wyckoffs(element_wyckoffs: str, spg_num: int) -> str:
"""Given an element ordering, canonicalize the associated Wyckoff positions
based on the alphabetical weight of equivalent choices of origin.

Args:
element_wyckoffs (str): wyckoff substring section from aflow_label with the
wyckoff letters for different elements separated by underscores.
spg_num (int | str): International space group number.

Returns:
str: element_wyckoff string with canonical ordering of the wyckoff letters.
"""
scored_wyckoffs: list[tuple[str, int]] = []

for trans in wyckoff_relabelings[spg_num]:
wyckoffs = element_wyckoffs.translate(str.maketrans(trans))
score: int = 0
sorted_wyckoffs: list[str] = []

for el_wyckoff_letter in wyckoffs.split("_"):
# Split into alpha and numeric groups
groups = [
"".join(grp)
for _, grp in itertools.groupby(el_wyckoff_letter, str.isalpha)
]
letters = [char for char in groups if char.isalpha()]
counts = [char for char in groups if char.isnumeric()]

# Sort by Wyckoff letter and build string
sorted_pairs = sorted(zip(counts, letters), key=lambda x: x[1])
sorted_wyckoffs.append(
"".join(
f"{count}{letter}" if count != "1" else letter
for count, letter in sorted_pairs
)
)
score += sum(0 if letter == "A" else ord(letter) - 96 for letter in letters)

scored_wyckoffs += [("_".join(sorted_wyckoffs), score)]

return min(scored_wyckoffs, key=lambda x: (x[1], x[0]))[0]


def get_protostructure_label(
struct: Structure | ase.Atoms, *, symprec: float = 0.1, raise_errors: bool = False
) -> str | None:
"""Get AFLOW-style proto-structure label using Moyopy for symmetry detection.

Args:
struct (Structure | ase.Atoms): pymatgen Structure object or ase.Atoms object.
symprec (float): Initial symmetry precision for Moyopy. Defaults to 0.1.
raise_errors (bool): Whether to raise ValueError for failing structures or
return the error message as string instead of the prototype label. Defaults
to False.

Returns:
str: protostructure_label which is constructed as `aflow_label:chemsys` or
explanation of failure if symmetry detection failed and `raise_errors`
is False.
"""
import moyopy.interface

if isinstance(struct, ase.Atoms):
from pymatgen.io.ase import AseAtomsAdaptor

struct = AseAtomsAdaptor.get_structure(struct)

moyo_cell = moyopy.interface.MoyoAdapter.from_py_obj(struct)
symmetry_data = moyopy.MoyoDataset(moyo_cell, symprec=symprec)
spg_num = symmetry_data.number

# Group sites by orbit
orbit_groups: dict[int, list[int]] = {}

for idx, orbit_id in enumerate(symmetry_data.orbits):
if orbit_id not in orbit_groups:
orbit_groups[orbit_id] = []
orbit_groups[orbit_id].append(idx)

# Create equivalent_wyckoff_labels from orbit groups
element_dict: dict[str, int] = {}
element_wyckoffs: list[str] = []

equivalent_wyckoff_labels = [
(
struct.species[orbit[0]].symbol,
len(orbit),
symmetry_data.wyckoffs[orbit[0]].translate(
str.maketrans("", "", string.digits)
),
)
for orbit in orbit_groups.values()
]
equivalent_wyckoff_labels = sorted(
equivalent_wyckoff_labels, key=lambda x: (x[0], x[2])
)

for el, group in itertools.groupby(equivalent_wyckoff_labels, key=lambda x: x[0]):
# NOTE create a list from the iterator so that we can use it without exhausting
label_group = list(group)
element_dict[el] = sum(
wyckoff_multiplicity_dict[spg_num][itm[2]] for itm in label_group
)
wyckoff_counts_str = "".join(
f"{len(list(occurrences))}{wyk_letter}"
for wyk_letter, occurrences in itertools.groupby(
label_group, key=lambda x: x[2]
)
)

element_wyckoffs += [wyckoff_counts_str]

all_wyckoffs = canonicalize_wyckoffs("_".join(element_wyckoffs), spg_num)

# Build prototype label
prototype_label = (
f"{get_prototype_formula(struct.composition)}_"
f"{symmetry_data.pearson_symbol}_"
f"{spg_num}_{all_wyckoffs}:{struct.chemical_system}"
)

# Verify multiplicities match composition
observed_formula = Composition(element_dict).reduced_formula
if observed_formula != struct.reduced_formula:
err_msg = (
f"Invalid WP multiplicities - {prototype_label}, expected "
f"{observed_formula} to be {struct.reduced_formula}"
)
if raise_errors:
raise ValueError(err_msg)
return err_msg

return prototype_label
Loading
Loading