Merge pull request #2 from glotzerlab/feature/read-table

Add CIF table reader
glotzerlab · Apr 10, 2024 · a2c3b58 · a2c3b58
2 parents 0ead0cc + 5d89a96
commit a2c3b58
Show file tree

Hide file tree

Showing 16 changed files with 1,511 additions and 3 deletions.
diff --git a/parsnip/_utils.py b/parsnip/_utils.py
@@ -0,0 +1,14 @@
+class ParseWarning(Warning):
+    """Warning category used by the parser for non-fatal problems.
+
+    Args:
+        message: Description of the problem; stored on ``self.message``.
+    """
+
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        # The repr of the message is shown, so string messages appear quoted.
+        return f"{self.message!r}"
+
+
+class ParseError(RuntimeError):
+    """Exception raised by the parser when the input cannot be processed.
+
+    Args:
+        message: Description of the problem; stored on ``self.message``.
+    """
+
+    def __init__(self, message):
+        self.message = message
+
+    def __str__(self):
+        # The repr of the message is shown, so string messages appear quoted.
+        return f"{self.message!r}"
diff --git a/parsnip/parse.py b/parsnip/parse.py
@@ -0,0 +1,175 @@
+"""CIF parsing tools."""
+
+import warnings
+
+import numpy as np
+
+from ._utils import ParseError, ParseWarning
+from .patterns import LineCleaner, cast_array_to_float
+
+
+def _remove_comments_from_line(line):
+    """Return ``line`` cut at its first ``#`` and stripped of outer whitespace.
+
+    Every ``#`` is treated as the start of a comment, including one that appears
+    inside a quoted value.
+    """
+    content, _, _ = line.partition("#")
+    return content.strip()
+
+
+def read_table(
+    filename: str,
+    keys: tuple[str, ...],
+    filter_line: tuple[tuple[str, str]] = ((r",\s+", ",")),
+    keep_original_key_order=False,
+) -> np.ndarray[str]:
+    r"""Extract data from a CIF file loop_ table.
+
+    CIF files store tabular data as whitespace-delimited blocks that start with `loop_`.
+    Keys are kept at the top of the table, and the vertical position of keys corresponds
+    to the horizontal position of the column storing the data for that key. The end of
+    the table is not necessarily marked: instead, the script detects when the table
+    format is exited.
+
+    For example:
+
+    ```
+    loop_
+    _space_group_symop_id
+    _space_group_symop_operation_xyz
+    1 x,y,z
+    2 -x,y,-z+1/2
+    3 -x,-y,-z
+    4 x,-y,z+1/2
+    5 x+1/2,y+1/2,z
+    6 -x+1/2,y+1/2,-z+1/2
+    7 -x+1/2,-y+1/2,-z
+    8 x+1/2,-y+1/2,z+1/2
+
+    ```
+
+    Only data columns corresponding to a key in the input keys list will be returned.
+
+    Note that this function will ONLY return data from a single table. If keys are
+    provided that correspond to data from multiple tables, only the first table will
+    be read.
+
+    The ``filter_line`` argument allows for dynamic input creation of regex filters to
+    apply to each line that contains data to be saved. The default value is
+    ``((",\s+",","))``, which helps differentiate between individual data fragments
+    separated by commas and whitespace characters, and other sections of the line that
+    are also whitespace separated. Adding another tuple to remove single quotes can
+    also be helpful: try ``((",\s+",","),("'",""))`` to achieve this. To disable the
+    feature entirely, pass in a tuple of empty strings: ``("","")``. Note that doing so
+    will cause errors if the table contains non-delimiting whitespaces.
+
+    Args:
+        filename (str): The name of the .cif file to be parsed.
+        keys (tuple[str]): The names of the keys to be parsed.
+        filter_line (tuple[tuple[str]], optional):
+            A single ``(pattern, replacement)`` tuple, or a tuple of such tuples, that
+            is compiled to regex filters and applied to each data line.
+            (Default value: ((r",\s+",",")) )
+        keep_original_key_order (bool, optional):
+            When True, preserve the order of keys in the table from the cif file.
+            When False, return columns of data in order of the input ``keys`` arg.
+            (Default value: False)
+
+    Returns:
+        np.ndarray[str]: A numpy array of the data as strings, with one row per data
+        line and one column per key that was found.
+
+    Raises:
+        ParseError: If a blank line is found in a table header after at least one of
+            the requested keys has been matched.
+
+    Warns:
+        ParseWarning: If a data line has fewer values than the number of matched keys
+            (the line is skipped), or if some of ``keys`` were not found in the table.
+    """
+    with open(filename) as f:
+        tables = f.read().split("loop_")
+
+    line_cleaner = LineCleaner(filter_line)
+    nontable_line_prefixes = ("_", "#")
+
+    for table in tables:
+        lines = table.strip().split("\n")
+        in_header = True
+        data_column_indices, data, column_order = [], [], []
+
+        for line_number, line in enumerate(lines):
+            # Check for invalid blank lines in the table header
+            if in_header and data_column_indices and line == "":
+                raise ParseError(
+                    "Whitespace may not be used in between keys in the table header. "
+                    "See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax#general"
+                    ", section 7 for more details."
+                )
+
+            # We will get errors if there is a comment after the loop_ block that
+            # contains our data. This is questionably legal, but very uncommon
+
+            line = _remove_comments_from_line(line)
+
+            # Save current key position if it is one of the keys we want.
+            if in_header and (line in keys):
+                data_column_indices.append(line_number)
+                # Always record which requested key was matched: the missing-key
+                # check after the loop relies on it in both ordering modes.
+                column_order.append(keys.index(line))
+                continue
+
+            # If we exit the header and enter the table body
+            if data_column_indices and (line[:1] not in nontable_line_prefixes):
+                in_header = False  # Exit the header and start writing data
+                clean_line = line_cleaner(line)
+                split_line = clean_line.split()
+
+                # Only add data if the line has at least as many columns as required.
+                n_cols_found, n_cols_expected = (
+                    len(split_line),
+                    len(data_column_indices),
+                )
+                if n_cols_found >= n_cols_expected:
+                    data.append(split_line)
+                elif split_line != [] and n_cols_found < n_cols_expected:
+                    warnings.warn(
+                        f"Data line is a fragment and will be skipped: (expected line "
+                        f"with {n_cols_expected} values, got {split_line}).",
+                        ParseWarning,
+                        stacklevel=2,
+                    )
+                continue
+            elif (not in_header) and (line[:1] == "_"):
+                # A new key after the table body means the table has ended.
+                break
+        if data_column_indices:
+            # Only the first table containing any requested key is read.
+            break
+
+    if not keep_original_key_order:
+        # Reorder the column indices to match the order of the input keys. The
+        # explicit integer dtype keeps an empty result usable as an index array.
+        data_column_indices = np.array(data_column_indices, dtype=int)[
+            np.argsort(column_order)
+        ]
+
+    if len(column_order) != len(keys):
+        missing_keys = {key for i, key in enumerate(keys) if i not in column_order}
+        warnings.warn(
+            f"Keys {missing_keys} were not found in the table.",
+            ParseWarning,
+            stacklevel=2,
+        )
+    return np.atleast_2d(data)[:, data_column_indices]
+
+
+def read_fractional_positions(
+    filename: str,
+    filter_line: tuple[tuple[str, str]] = ((r",\s+", ",")),
+):
+    r"""Extract the fractional X,Y,Z coordinates from a CIF file.
+
+    The ``_atom_site_fract_x``, ``_atom_site_fract_y`` and ``_atom_site_fract_z``
+    columns are read with ``read_table`` and converted to ``np.float32``.
+
+    Args:
+        filename (str): The name of the .cif file to be parsed.
+        filter_line (tuple[tuple[str]], optional):
+            A tuple of strings that are compiled to a regex filter and applied to each
+            data line. (Default value: ((r",\s+",",")) )
+
+    Returns:
+        np.array[np.float32]: Fractional X,Y,Z coordinates of the unit cell.
+
+    Raises:
+        AssertionError: If the parsed data does not have exactly three columns.
+    """
+    xyz_keys = ("_atom_site_fract_x", "_atom_site_fract_y", "_atom_site_fract_z")
+    # Once #6 is added, we should warnings.catch_warnings(action="error")
+    xyz_data = read_table(
+        filename=filename,
+        keys=xyz_keys,
+        # Forward the caller's filters; otherwise the argument is silently ignored.
+        filter_line=filter_line,
+    )
+
+    xyz_data = cast_array_to_float(arr=xyz_data, dtype=np.float32)
+
+    # Validate results
+    assert xyz_data.shape[1] == 3
+    assert xyz_data.dtype == np.float32
+
+    return xyz_data
diff --git a/parsnip/parsemm.py b/parsnip/parsemm.py
@@ -0,0 +1,6 @@
+"""mmCIF parsing tools."""
+if __name__ == "__main__":
+    # Running this module as a script fails immediately; importing it does nothing.
+    raise NotImplementedError(
+        "mmCIF functionality has not been implemented.\n"
+        "See https://github.com/glotzerlab/parsnip/issues/1 for more details."
+    )
diff --git a/parsnip/patterns.py b/parsnip/patterns.py
@@ -0,0 +1,78 @@
+"""Functions and classes to process string data."""
+import re
+
+import numpy as np
+
+# Compile in common patterns for cif parsing. These are reused throughout the package.
+_multiple_whitespace_pattern = re.compile(r"\s+")
+_comma_prune_spaces = re.compile(r",\s+")
+
+
+def compile_pattern_from_strings(filter_patterns: tuple[str]):
+    """Compile a single regex that matches any one of the given patterns.
+
+    The input strings are joined with ``|`` without escaping, so each one is
+    interpreted as a regular expression.
+
+    Args:
+        filter_patterns (tuple[str]): Regex strings to combine.
+
+    Returns:
+        re.Pattern: Compiled alternation of the input patterns.
+    """
+    alternation = "|".join(filter_patterns)
+    return re.compile(alternation)
+
+
+def cast_array_to_float(arr: np.ndarray, dtype: type = np.float32):
+    """Convert an array of numeric strings to ``dtype``, dropping ``(...)`` suffixes.
+
+    Each element is cut at its first ``(`` before conversion, so a value written as
+    ``0.1234(5)`` is read as ``0.1234``.
+
+    Args:
+        arr (np.array): Array of strings to convert.
+        dtype (type, optional): dtype to cast array to (Default value: np.float32).
+
+    Returns:
+        np.array[float]: Array with the new dtype and the same shape as ``arr``.
+    """
+    # partition() appends an axis of (before, separator, after); keep "before".
+    before_paren = np.char.partition(arr, "(")[..., 0]
+    return before_paren.astype(dtype)
+
+
+class LineCleaner:
+    """Simple object to apply a series of regex patterns to a string.
+
+    To initialize a line cleaner, pass in either a single tuple of strings of the form
+    ``(pattern, replacement)`` or a tuple of such tuples. Patterns are compiled on
+    initialization to accelerate future processing. An empty tuple creates a cleaner
+    that returns every line unchanged.
+
+    Args:
+        patterns (tuple[tuple[str,str]]): Tuple of tuples of strings.
+            The first item in each tuple is the pattern to match, and the second item is
+            what that pattern will be replaced with.
+    """
+
+    def __init__(self, patterns: tuple[tuple[str, str]]):
+        # A bare (pattern, replacement) pair is wrapped so both input forms share one
+        # code path. The length check keeps ``patterns[0]`` from raising on empty input.
+        if len(patterns) > 0 and isinstance(patterns[0], str):
+            patterns = (patterns,)
+
+        self.patterns, self.replacements = [], []
+        for pattern, replacement in patterns:
+            self.patterns.append(re.compile(pattern))
+            self.replacements.append(replacement)
+
+    def __call__(self, line: str):
+        """Apply patterns defined on initialization of the object to the string.
+
+        ``pattern.sub(replacement, line)`` is run for each pattern (in order) in
+        self.patterns, which is defined on initialization.
+
+        Args:
+            line (str): String to apply patterns to.
+
+        Returns:
+            str: The line after all substitutions.
+        """
+        for pattern, replacement in zip(self.patterns, self.replacements):
+            line = pattern.sub(replacement, line)
+        return line
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "parsnip"
-version = "0.1.0"
-requires-python = ">=3.9" # This could be updated to a minimum of 3.6
+version = "0.0.2"
+requires-python = ">=3.9"
 description = "Minimal library for parsing CIF/mmCIF files in Python."
 readme = "README.md"
 license = { file = "LICENSE" }
@@ -61,7 +61,7 @@ select = [
     "PIE794", # enable c-style single definition of variables
 ]
 ignore = [
-  "S101", # Assertions are agood thing
+  "S101", # Assertions are a good thing
   "D105", # Magic methods don't require documentation.
   "D107", # __init__ methods don't require documentation.
   "SIM116", # Allow certain branched if statements

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -0,0 +1,82 @@
+import dataclasses
+import os
+
+import pytest
+
+# ruff: noqa: N816. Allow mixed-case global variables
+
+
+@dataclasses.dataclass
+class CifData:
+    """Class to hold the filename and stored keys for a CIF file."""
+
+    # Path to the CIF file.
+    filename: str
+    # Keys to request from the file's symmetry-operation table. The ``...`` marks a
+    # variable-length tuple; a bare ``tuple[str]`` would mean exactly one element.
+    symop_keys: tuple[str, ...]
+    # Keys to request from the file's atom-site table.
+    atom_site_keys: tuple[str, ...]
+
+
+# CIF data names for the unit-cell angles and edge lengths.
+box_keys = (
+    "_cell_angle_alpha",
+    "_cell_angle_beta",
+    "_cell_angle_gamma",
+    "_cell_length_a",
+    "_cell_length_b",
+    "_cell_length_c",
+)
+# Default atom-site keys shared by the sample-file definitions in this module.
+atom_site_keys = (
+    "_atom_site_label",
+    "_atom_site_type_symbol",
+    "_atom_site_fract_x",
+    "_atom_site_fract_y",
+    "_atom_site_fract_z",
+    "_atom_site_occupancy",
+)
+
+
+# Directory of sample CIF files, located next to this conftest module.
+data_file_path = os.path.dirname(__file__) + "/sample_data/"
+
+
+# Uses the full default atom-site key set.
+aflow_mC24 = CifData(
+    filename=data_file_path + "AFLOW_mC24.cif",
+    symop_keys=("_space_group_symop_id", "_space_group_symop_operation_xyz"),
+    atom_site_keys=atom_site_keys,
+)
+
+# Symop keys are listed in the opposite order to aflow_mC24, and the atom-site keys
+# omit "_atom_site_type_symbol" (index 1 of the default tuple).
+bisd_Ccmm = CifData(
+    filename=data_file_path + "B-IncStrDb_Ccmm.cif",
+    symop_keys=("_space_group_symop_operation_xyz", "_space_group_symop_id"),
+    # Our code works with extra keys, but gemmi does not!
+    atom_site_keys=(atom_site_keys[0], *atom_site_keys[2:]),
+)
+
+# Atom-site keys are passed in sorted order; note that sorted() returns a list.
+ccdc_Pm3m = CifData(
+    filename=data_file_path + "CCDC_1446529_Pm-3m.cif",
+    symop_keys=("_space_group_symop_operation_xyz",),
+    atom_site_keys=sorted(atom_site_keys),
+)
+
+cod_aP16 = CifData(
+    filename=data_file_path + "COD_1540955_aP16.cif",
+    symop_keys=("_symmetry_equiv_pos_as_xyz",),
+    atom_site_keys=atom_site_keys,
+)
+
+# The key list includes a truncated name, a name containing a space, and a name that
+# says it does not exist. This entry is not part of cif_data_array below.
+bad_cif = CifData(
+    filename=data_file_path + "INTENTIONALLY_BAD_CIF.cif",
+    symop_keys=("_space_group_symop_id", "_space_group_symop_operation_xyz"),
+    atom_site_keys=(
+        "_atom_site",
+        "_atom_site_type_symbol",
+        "_atom_site_symmetry_multiplicity",
+        "_atom_si te",
+        "_atom_site_fract_z",
+        "_this_key_does_not_exist",
+    ),
+)
+
+# Parametrize mark that supplies each entry as ``cif_data``; the test ids are the
+# file names without their directory.
+cif_data_array = [aflow_mC24, bisd_Ccmm, ccdc_Pm3m, cod_aP16]
+cif_files_mark = pytest.mark.parametrize(
+    argnames="cif_data",
+    argvalues=cif_data_array,
+    ids=[cif.filename.split("/")[-1] for cif in cif_data_array],
+)
diff --git a/tests/requirements.txt b/tests/requirements.txt
@@ -0,0 +1 @@
+gemmi