Skip to content

Commit

Permalink
Merge pull request #2 from glotzerlab/feature/read-table
Browse files Browse the repository at this point in the history
Add CIF table reader
  • Loading branch information
janbridley authored Apr 10, 2024
2 parents 0ead0cc + 5d89a96 commit a2c3b58
Show file tree
Hide file tree
Showing 16 changed files with 1,511 additions and 3 deletions.
14 changes: 14 additions & 0 deletions parsnip/_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
class ParseWarning(Warning):
    """Warning category for recoverable problems found while parsing.

    The text passed to the constructor is kept on ``message``; converting the
    warning to a string yields the ``repr`` of that text (i.e. it is quoted).
    """

    def __init__(self, message):
        # Keep the raw message available to callers that inspect the warning.
        self.message = message

    def __str__(self):
        return f"{self.message!r}"


class ParseError(RuntimeError):
    """Exception raised when the input cannot be parsed.

    The text passed to the constructor is kept on ``message``; converting the
    error to a string yields the ``repr`` of that text (i.e. it is quoted).
    """

    def __init__(self, message):
        # Keep the raw message available to callers that catch the error.
        self.message = message

    def __str__(self):
        return f"{self.message!r}"
175 changes: 175 additions & 0 deletions parsnip/parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
"""CIF parsing tools."""

import warnings

import numpy as np

from ._utils import ParseError, ParseWarning
from .patterns import LineCleaner, cast_array_to_float


def _remove_comments_from_line(line):
    """Return ``line`` without its ``#`` comment and surrounding whitespace.

    Everything from the first ``#`` character onward is discarded, then the
    remainder is stripped. A line that is only a comment yields ``""``.
    """
    content, _, _ = line.partition("#")
    return content.strip()


def read_table(
    filename: str,
    keys: tuple[str, ...],
    filter_line: tuple[tuple[str, str]] = ((r",\s+", ",")),
    keep_original_key_order=False,
) -> np.ndarray[str]:
    r"""Extract data from a CIF file loop_ table.

    CIF files store tabular data as whitespace-delimited blocks that start with `loop_`.
    Keys are kept at the top of the table, and the vertical position of keys corresponds
    to the horizontal position of the column storing the data for that key. The end of
    the table is not necessarily marked: instead, the script detects when the table
    format is exited.

    For example:

    ```
    loop_
    _space_group_symop_id
    _space_group_symop_operation_xyz
    1 x,y,z
    2 -x,y,-z+1/2
    3 -x,-y,-z
    4 x,-y,z+1/2
    5 x+1/2,y+1/2,z
    6 -x+1/2,y+1/2,-z+1/2
    7 -x+1/2,-y+1/2,-z
    8 x+1/2,-y+1/2,z+1/2
    ```

    Only data columns corresponding to a key in the input keys list will be returned.
    Note that this function will ONLY return data from a single table. If keys are
    provided that correspond to data from multiple tables, only the first table will
    be read.

    The ``filter_line`` argument allows for dynamic input creation of regex filters to
    apply to each line that contains data to be saved. The default value is
    ``((",\s+",","))``, which helps differentiate between individual data fragments
    separated by commas and whitespace characters, and other sections of the line that
    are also whitespace separated. Adding another tuple to remove single quotes can
    also be helpful: try ``((",\s+",","),("'",""))`` to achieve this. To disable the
    feature entirely, pass in a tuple of empty strings: ``("","")``. Note that doing so
    will cause errors if the table contains non-delimiting whitespaces.

    Args:
        filename (str): The name of the .cif file to be parsed.
        keys (tuple[str]): The names of the keys to be parsed.
        filter_line (tuple[tuple[str]], optional):
            A tuple of strings that are compiled to a regex filter and applied to each
            data line. (Default value: ((r",\s+",",")) )
        keep_original_key_order (bool, optional):
            When True, preserve the order of keys in the table from the cif file.
            When False, return columns of data in order of the input ``keys`` arg.
            (Default value: False)

    Returns:
        np.ndarray[str]: A numpy array of the data as strings. If none of the keys
        are found, the array is empty with shape ``(1, 0)``.

    Raises:
        ParseError: If a blank line appears in a table header after one of the
            requested keys has been found.

    Warns:
        ParseWarning: If a data line has fewer values than the number of requested
            keys found (the line is skipped), or if some of the requested keys were
            not found in the table that was read.
    """
    with open(filename) as f:
        tables = f.read().split("loop_")

    line_cleaner = LineCleaner(filter_line)
    nontable_line_prefixes = ("_", "#")

    for table in tables:
        lines = table.strip().split("\n")
        in_header = True
        # ``found_keys[i]`` is the key whose column index is
        # ``data_column_indices[i]``. It is tracked regardless of
        # ``keep_original_key_order`` so missing keys can always be reported.
        data_column_indices, data, found_keys = [], [], []

        for line_number, line in enumerate(lines):
            # Check for invalid blank lines in the table header
            if in_header and data_column_indices and line == "":
                raise ParseError(
                    "Whitespace may not be used in between keys in the table header. "
                    "See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax#general"
                    ", section 7 for more details."
                )

            # We will get errors if there is a comment after the loop_ block that
            # contains our data. This is questionably legal, but very uncommon

            line = _remove_comments_from_line(line)

            # Save current key position if it is one of the keys we want. The
            # position of the key line within the block is used as its column index.
            if in_header and (line in keys):
                data_column_indices.append(line_number)
                found_keys.append(line)
                continue

            # If we exit the header and enter the table body
            if data_column_indices and (line[:1] not in nontable_line_prefixes):
                in_header = False  # Exit the header and start writing data
                clean_line = line_cleaner(line)
                split_line = clean_line.split()

                # Only add data if the line has at least as many columns as required.
                n_cols_found, n_cols_expected = (
                    len(split_line),
                    len(data_column_indices),
                )
                if n_cols_found >= n_cols_expected:
                    data.append(split_line)
                elif split_line != [] and n_cols_found < n_cols_expected:
                    warnings.warn(
                        f"Data line is a fragment and will be skipped: (expected line "
                        f"with {n_cols_expected} values, got {split_line}).",
                        ParseWarning,
                        stacklevel=2,
                    )
                continue
            elif (not in_header) and (line[:1] == "_"):
                # A new key after the data rows means the table has ended.
                break
        if data_column_indices:
            # Only the first table containing any requested key is read.
            break

    if not keep_original_key_order:
        # Reorder the column indices to match the order of the input keys.
        # ``dtype=int`` keeps the result usable as an index even when no key was
        # found (an empty array would otherwise default to float64).
        column_order = [keys.index(key) for key in found_keys]
        data_column_indices = np.array(data_column_indices, dtype=int)[
            np.argsort(column_order)
        ]

    missing_keys = {key for key in keys if key not in found_keys}
    if missing_keys:
        warnings.warn(
            f"Keys {missing_keys} were not found in the table.",
            ParseWarning,
            stacklevel=2,
        )
    return np.atleast_2d(data)[:, data_column_indices]


def read_fractional_positions(
    filename: str,
    filter_line: tuple[tuple[str, str]] = ((r",\s+", ",")),
):
    r"""Extract the fractional X,Y,Z coordinates from a CIF file.

    The ``_atom_site_fract_x``, ``_atom_site_fract_y`` and ``_atom_site_fract_z``
    columns are read with ``read_table`` and converted to ``np.float32``.

    Args:
        filename (str): The name of the .cif file to be parsed.
        filter_line (tuple[tuple[str]], optional):
            A tuple of strings that are compiled to a regex filter and applied to each
            data line. (Default value: ((r",\s+",",")) )

    Returns:
        np.array[np.float32]: Fractional X,Y,Z coordinates of the unit cell.

    Raises:
        AssertionError: If the parsed data does not have exactly three columns or
            is not of dtype ``np.float32``.
    """
    xyz_keys = ("_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z")
    # Once #6 is added, we should warnings.catch_warnings(action="error")
    xyz_data = read_table(
        filename=filename,
        keys=xyz_keys,
        # Forward the caller's filter; previously this argument was ignored.
        filter_line=filter_line,
    )

    xyz_data = cast_array_to_float(arr=xyz_data, dtype=np.float32)

    # Validate results
    assert xyz_data.shape[1] == 3
    assert xyz_data.dtype == np.float32

    return xyz_data
6 changes: 6 additions & 0 deletions parsnip/parsemm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""mmCIF parsing tools."""
# Placeholder module: importing it is a no-op, while running it as a script
# fails immediately with a pointer to the tracking issue.
if __name__ == "__main__":
    raise NotImplementedError(
        "mmCIF functionality has not been implemented.\n"
        "See https://github.com/glotzerlab/parsnip/issues/1 for more details."
    )
78 changes: 78 additions & 0 deletions parsnip/patterns.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Functions and classes to process string data."""
import re

import numpy as np

# Compile in common patterns for cif parsing. These are reused throughout the package.
_multiple_whitespace_pattern = re.compile(r"\s+")
_comma_prune_spaces = re.compile(r",\s+")


def compile_pattern_from_strings(filter_patterns: tuple[str]):
    """Compile several regex fragments into one alternation pattern.

    The fragments are joined with ``|``, so the resulting pattern matches
    wherever any one of them matches.

    Args:
        filter_patterns (tuple[str]): Regex source strings to combine.

    Returns:
        re.Pattern: Compiled pattern matching any of the input fragments.
    """
    alternation = "|".join(filter_patterns)
    return re.compile(alternation)


def cast_array_to_float(arr: np.ndarray, dtype: type = np.float32):
    """Convert an array of numeric strings to ``dtype``, dropping ``(...)`` suffixes.

    Each element is split at its first ``"("`` and only the text before it is
    kept, so a value such as ``"0.1234(5)"`` is converted as ``"0.1234"``.

    Args:
        arr (np.array): Array of string data to convert.
        dtype (type, optional): dtype to cast array to (Default value: np.float32).

    Returns:
        np.array[float]: Array with the new dtype and the same shape as ``arr``.
    """
    # partition() appends a trailing axis holding (before, separator, after).
    pieces = np.char.partition(arr, "(")
    leading_text = pieces[..., 0]
    return leading_text.astype(dtype)


class LineCleaner:
    """Simple object to apply a series of regex patterns to a string.

    To initialize a line cleaner, pass in a tuple of strings of the form
    ``(pattern, replacement)``, or a tuple of such tuples. Patterns are compiled on
    initialization to accelerate future processing. An empty tuple produces a
    cleaner that returns its input unchanged.

    Args:
        patterns (tuple[tuple[str,str]]): Tuple of tuples of strings.
            The first item in each tuple is the pattern to match, and the second item is
            what that pattern will be replaced with.
    """

    def __init__(self, patterns: tuple[tuple[str, str]]):
        # A bare ``(pattern, replacement)`` pair is recognised by its first item
        # being a string; wrap it so both input forms share one code path. The
        # emptiness check avoids indexing into an empty ``patterns``.
        if patterns and isinstance(patterns[0], str):
            patterns = (patterns,)

        self.patterns, self.replacements = [], []
        for pattern, replacement in patterns:
            self.patterns.append(re.compile(pattern))
            self.replacements.append(replacement)

    def __call__(self, line: str):
        """Apply patterns defined on initialization of the object to the string.

        Each compiled pattern in ``self.patterns`` is substituted (in order) with
        the replacement at the same position in ``self.replacements``.

        Args:
            line (str): String to apply patterns to.

        Returns:
            str: The substituted line.
        """
        for pattern, replacement in zip(self.patterns, self.replacements):
            line = pattern.sub(replacement, line)
        return line
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"

[project]
name = "parsnip"
version = "0.1.0"
requires-python = ">=3.9" # This could be updated to a minimum of 3.6
version = "0.0.2"
requires-python = ">=3.9"
description = "Minimal library for parsing CIF/mmCIF files in Python."
readme = "README.md"
license = { file = "LICENSE" }
Expand Down Expand Up @@ -61,7 +61,7 @@ select = [
"PIE794", # enable c-style single definition of variables
]
ignore = [
"S101", # Assertions are agood thing
"S101", # Assertions are a good thing
"D105", # Magic methods don't require documentation.
"D107", # __init__ methods don't require documentation.
"SIM116", # Allow certain branched if statements
Expand Down
82 changes: 82 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import dataclasses
import os

import pytest

# ruff: noqa: N816. Allow mixed-case global variables


@dataclasses.dataclass
class CifData:
    """Class to hold the filename and stored keys for a CIF file."""

    # Path to the CIF file.
    filename: str
    # Keys naming the symmetry-operation columns to request from the file.
    symop_keys: tuple[str]
    # Keys naming the atom-site columns to request from the file.
    atom_site_keys: tuple[str]


# Cell angle and cell length keys. Not referenced by any fixture defined below.
box_keys = (
    "_cell_angle_alpha",
    "_cell_angle_beta",
    "_cell_angle_gamma",
    "_cell_length_a",
    "_cell_length_b",
    "_cell_length_c",
)
# Default atom-site keys, reused (whole or in part) by the fixtures below.
atom_site_keys = (
    "_atom_site_label",
    "_atom_site_type_symbol",
    "_atom_site_fract_x",
    "_atom_site_fract_y",
    "_atom_site_fract_z",
    "_atom_site_occupancy",
)


# Directory holding the sample files, located next to this file. The trailing
# slash lets file names be appended by plain string concatenation.
data_file_path = os.path.dirname(__file__) + "/sample_data/"


aflow_mC24 = CifData(
    filename=data_file_path + "AFLOW_mC24.cif",
    symop_keys=("_space_group_symop_id", "_space_group_symop_operation_xyz"),
    atom_site_keys=atom_site_keys,
)

# Same symop keys as aflow_mC24 but listed in the opposite order.
bisd_Ccmm = CifData(
    filename=data_file_path + "B-IncStrDb_Ccmm.cif",
    symop_keys=("_space_group_symop_operation_xyz", "_space_group_symop_id"),
    # Our code works with extra keys, but gemmi does not!
    # (The second default key, "_atom_site_type_symbol", is left out here.)
    atom_site_keys=(atom_site_keys[0], *atom_site_keys[2:]),
)

# Atom-site keys are passed as an alphabetically sorted list rather than in
# the default tuple order.
ccdc_Pm3m = CifData(
    filename=data_file_path + "CCDC_1446529_Pm-3m.cif",
    symop_keys=("_space_group_symop_operation_xyz",),
    atom_site_keys=sorted(atom_site_keys),
)

cod_aP16 = CifData(
    filename=data_file_path + "COD_1540955_aP16.cif",
    symop_keys=("_symmetry_equiv_pos_as_xyz",),
    atom_site_keys=atom_site_keys,
)

# NOTE(review): judging by the file name this CIF is deliberately malformed;
# the key list includes a key containing a space and a key named as
# nonexistent. It is not included in ``cif_data_array`` below.
bad_cif = CifData(
    filename=data_file_path + "INTENTIONALLY_BAD_CIF.cif",
    symop_keys=("_space_group_symop_id", "_space_group_symop_operation_xyz"),
    atom_site_keys=(
        "_atom_site",
        "_atom_site_type_symbol",
        "_atom_site_symmetry_multiplicity",
        "_atom_si te",
        "_atom_site_fract_z",
        "_this_key_does_not_exist",
    ),
)

# Fixtures covered by the shared parametrize mark; each test id is the file's
# base name (the text after the last "/").
cif_data_array = [aflow_mC24, bisd_Ccmm, ccdc_Pm3m, cod_aP16]
cif_files_mark = pytest.mark.parametrize(
    argnames="cif_data",
    argvalues=cif_data_array,
    ids=[cif.filename.split("/")[-1] for cif in cif_data_array],
)
1 change: 1 addition & 0 deletions tests/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
gemmi
Loading

0 comments on commit a2c3b58

Please sign in to comment.