From 8c359b8de6f0866ed31a5c2ad3e7372ab495f5e8 Mon Sep 17 00:00:00 2001
From: Kevin Dalton <kmdalton@pop-os.localdomain>
Date: Wed, 28 Jun 2023 23:22:56 -0400
Subject: [PATCH 1/2] faster stream file parser

---
 reciprocalspaceship/io/crystfel.py | 577 +++++++++++++++++------------
 1 file changed, 341 insertions(+), 236 deletions(-)

diff --git a/reciprocalspaceship/io/crystfel.py b/reciprocalspaceship/io/crystfel.py
index 71cb0a04..85fe346a 100644
--- a/reciprocalspaceship/io/crystfel.py
+++ b/reciprocalspaceship/io/crystfel.py
@@ -1,218 +1,352 @@
 import numpy as np
 import pandas as pd
+from typing import Union,Optional
 
+import reciprocalspaceship as rs
 from reciprocalspaceship import DataSet
+import gemmi
 from reciprocalspaceship.utils import angle_between
-
-
-def _parse_stream(filename: str) -> dict:
-    """
-    Parses stream and returns all indexed peak positions
-
-    Parameters
-    ----------
-    filename : stream filename
-        name of a .stream file
-
-    Returns
-    --------
-    (dict, np.ndarray)
-    """
-
-    answ_crystals = {}
-
-    def contains_filename(s):
-        return s.startswith("Image filename")
-
-    def contains_event(s):
-        return s.startswith("Event")
-
-    def contains_serial_number(s):
-        return s.startswith("Image serial number")
-
-    def starts_chunk_peaks(s):
-        return s.startswith("  fs/px   ss/px (1/d)/nm^-1   Intensity  Panel")
-
-    def ends_chunk_peaks(s):
-        return s.startswith("End of peak list")
-
-    def starts_crystal_peaks(s):
-        return s.startswith(
-            "   h    k    l          I   sigma(I)       peak background  fs/px  ss/px panel"
-        )
-
-    def is_photon_energy(s):
-        return s.startswith("photon_energy_eV")
-
-    def is_astar(s):
-        return s.startswith("astar")
-
-    def is_bstar(s):
-        return s.startswith("bstar")
-
-    def is_cstar(s):
-        return s.startswith("cstar")
-
-    def ends_crystal_peaks(s):
-        return s.startswith("End of reflections")
-
-    def eV2Angstrom(e_eV):
-        return 12398.0 / e_eV
-
-    # add unit cell parameters parsing
-    with open(filename, "r") as stream:
-        is_unit_cell = False
-        get_cellparam = lambda s: float(s.split()[2])
-        rv_cell_param = None
-        a, b, c, al, be, ga = [
-            None
-        ] * 6  # None's are needed since stream not always has all 6 parameters
-        for line in stream:
-            if "Begin unit cell" in line:
-                is_unit_cell = True
-                continue
-            elif is_unit_cell:
-                if line.startswith("a ="):
-                    a = get_cellparam(line)
-                if line.startswith("b ="):
-                    b = get_cellparam(line)
-                if line.startswith("c ="):
-                    c = get_cellparam(line)
-                if line.startswith("al ="):
-                    al = get_cellparam(line)
-                if line.startswith("be ="):
-                    be = get_cellparam(line)
-                if line.startswith("ga ="):
-                    ga = get_cellparam(line)
-                    is_unit_cell = False  # gamma is the last parameters
-            elif "End unit cell" in line:
-                rv_cell_param = np.array([a, b, c, al, be, ga])
-                break
-
-    with open(filename, "r") as stream:
-        is_chunk = False
-        is_crystal = False
-        current_filename = None
-        current_event = None  # to handle non-event streams
-        current_serial_number = None
-        corrupted_chunk = False
-        crystal_peak_number = 0
-        crystal_idx = 0
-
-        for line in stream:
-            # analyzing what we have
-            if ends_chunk_peaks(line):
-                is_chunk = False
-                chunk_peak_number = 0
-            elif ends_crystal_peaks(line):
-                is_crystal = False
-                crystal_peak_number = 0
-
-            elif is_photon_energy(line):
-                photon_energy = float(line.split()[2])
-            elif is_astar(line):
+from reciprocalspaceship.decorators import cellify,spacegroupify
+
+#See Rupp Table 5-2
+_cell_constraints = {
+    'triclinic' : lambda x: x,
+    'orthorhombic' : lambda x: [x[0], x[1], x[2], 90., 90., 90.],
+    'monoclinic' : lambda x: [x[0], x[1], x[2], 90., x[4], 90.],
+    'hexagonal' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 120.],
+    'rhombohedral' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 120.],
+    'cubic' : lambda x: [np.mean(x[:3]), np.mean(x[:3]), np.mean(x[:3]), 90., 90., 90.],
+    'tetragonal' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 90.],
+}
+
+class StreamLoader():
+    @cellify
+    @spacegroupify
+    def __init__(
+            self, 
+            stream_file : str, 
+            cell : Optional[gemmi.UnitCell]=None, 
+            spacegroup : Optional[gemmi.SpaceGroup] = None, 
+            wavelength : Optional[float] = None,
+        ):
+        """
+        This class can be used to convert CrystFEL `.stream` files into `rs.DataSet` objects.
+
+        Parameters
+        ----------
+        stream_file : str
+            The path of the `.stream` file. Must end in `.stream`.
+        cell : gemmi.UnitCell (optional)
+            The cell to assign to the DataSet. If None is supplied, it will be determined from the file.
+        spacegroup : gemmi.SpaceGroup (optional)
+            The spacegroup to assign to the DataSet. 
+        wavelength : float (optional)
+            The wavelength to use for geometry calculations. If None is supplied, it will be determined from the file.
+        """
+        super().__init__()
+        self.cell = cell
+        if self.cell is None:
+            self.cell = self.get_target_cell(stream_file)
+
+        self.spacegroup = spacegroup
+        self.wavelength = wavelength
+        if self.wavelength is None:
+            self.wavelength = self.get_wavelength(stream_file)
+        self.inverse_wavelength = 1. / self.wavelength
+        if not stream_file.endswith('.stream'):
+            raise ValueError("Stream file should end with .stream")
+        self.stream_file = stream_file
+
+    @staticmethod
+    def to_crystals(stream_file : str) -> iter:
+        """ Convert the stream file into an iterator of crystal blocks. """
+        return StreamLoader.to_blocks(stream_file, 'crystal')
+
+    @staticmethod
+    def get_lattice_type(stream_file : str) -> str: 
+        """ Extract the crystal system string from a stream file. """
+        lines = next(StreamLoader.to_blocks(stream_file, 'cell'))
+        for line in lines:
+            if line.startswith("lattice_type ="):
+                lattice_type = line.split()[2]
+                return lattice_type
+        raise ValueError("No lattice_type entry!")
+
+    @staticmethod
+    def get_target_cell(stream_file : str) -> gemmi.UnitCell:
+        """ Extract the target unit cell constants from a stream file. """
+        lines = next(StreamLoader.to_blocks(stream_file, 'cell'))
+        cell = [0.]*6
+        for line in lines:
+            if line.startswith("a ="):
+                cell[0] = float(line.split()[2])
+            elif line.startswith("b ="):
+                cell[1] = float(line.split()[2])
+            elif line.startswith("c ="):
+                cell[2] = float(line.split()[2])
+            elif line.startswith("al ="):
+                cell[3] = float(line.split()[2])
+            elif line.startswith("be ="):
+                cell[4] = float(line.split()[2])
+            elif line.startswith("ga ="):
+                cell[5] = float(line.split()[2])
+
+        return gemmi.UnitCell(*cell)
+
+    @staticmethod
+    def get_wavelength(stream_file : str) -> float:
+        """ Extract the wavelength from a stream file. """
+        geo = next(StreamLoader.to_blocks(stream_file, 'geometry'))
+        for line in geo:
+            if line.startswith("photon_energy"):
+                eV = float(line.split()[2])
+                lam = rs.utils.ev2angstroms(eV)
+                return lam
+
+    @staticmethod
+    def online_mean_variance(iterator) -> (Union[float, np.ndarray], Union[float, np.ndarray]):
+        """ Compute the mean and variance of an iterator of floats or arrays online. """
+        def update(count, mean, m2, value):
+            count = count + 1
+            delta = value - mean
+            mean += delta / count
+            delta2 = value - mean
+            m2 += delta * delta2
+            return count, mean, m2
+
+        count, mean, m2 = 0, 0, 0
+        for value in iterator:
+            count, mean, m2 = update(count, mean, m2, value)
+
+        variance = m2 / count
+        return mean, variance
+
+    @staticmethod
+    def get_average_cell(stream_file, constrain_by_crystal_system=True):
+        """
+        Gets the average cell across all the indexed
+        crystals in a stream file.
+
+        Parameters
+        ----------
+        stream_file : str
+            The path to a CrystFEL stream file.
+        constrain_by_crystal_system : bool (optional)
+            Whether to apply the appropriate constraints
+            for the crystal system described in the stream.
+            The default value is True.
+
+        Returns
+        -------
+        cell : gemmi.UnitCell
+        """
+        def cell_iter(stream_file):
+            for crystal in StreamLoader.to_crystals(stream_file):
+                for line in crystal:
+                    if line.startswith("Cell parameters"):
+                        cell = line.split()
+                        cell = np.array([
+                            cell[2],
+                            cell[3],
+                            cell[4],
+                            cell[6],
+                            cell[7],
+                            cell[8],
+                        ], dtype='float32')
+                        cell[:3] = 10.*cell[:3]
+                        yield cell
+                        break
+
+        mean, variance = StreamLoader.online_mean_variance(cell_iter(stream_file))
+        cell = gemmi.UnitCell(*mean)
+        if constrain:
+            lattice_type = StreamLoader.get_lattice_type(stream_file)
+            cell = _cell_constraints[lattice_type](cell)
+        return cell
+
+    @staticmethod
+    def to_blocks(stream_file : str, block_name : str) -> iter:
+        """
+        Parameters
+        ----------
+        stream_file : str
+            The path to a CrystFEL stream file.
+        block_name : str
+            One of the following types of blocks
+             - 'geometry'
+             - 'chunk' 
+             - 'cell' 
+             - 'peaks' 
+             - 'crystal' 
+             - 'reflections'
+
+        Returns
+        -------
+        blocks : iter
+            An interable containing lists of lines for each block. 
+        """
+        # See crystFEL API reference here: https://www.desy.de/~twhite/crystfel/reference/stream_8h.html
+        block_markers = {
+            "geometry" : ("----- Begin geometry file -----", "----- End geometry file -----"),
+            "chunk" : ("----- Begin chunk -----", "----- End chunk -----"),
+            "cell" : ("----- Begin unit cell -----", "----- End unit cell -----"),
+            "peaks" : ("Peaks from peak search", "End of peak list"),
+            "crystal" : ("--- Begin crystal", "--- End crystal"),
+            "reflections" : ("Reflections measured after indexing", "End of reflections"),
+        }
+        block_begin_marker, block_end_marker = block_markers[block_name]
+
+        block = []
+        in_block = False
+        for line in open(stream_file):
+            if line.startswith(block_end_marker):
+                in_block = False
+                yield block
+                block = []
+            if in_block:
+                block.append(line)
+            if line.startswith(block_begin_marker):
+                in_block = True
+
+    @staticmethod
+    def crystal_to_data(crystal,  wavelength, cell=None, dmin=None, batch=None):
+        """
+        Convert a crystal block (list of strings) to a numpy array. 
+
+        Parameters
+        ----------
+        crystal : list or similar
+            A list of strings corresponding to a single crystal 
+            block in a stream file. 
+        wavelength : float
+            The wavelength of the dataset. Used for calculating ewald
+            offsets. 
+        cell : gemmi.UnitCell (optional)
+            The unit cell to use to calculate reflection resolution. 
+            This is only used for the dmin resolution cutoff.
+        dmin : float (optional)
+            An optional resolution cutoff. Requires a cell. 
+        batch : int (optional)
+            Optionally supply a batch number which will be appended to
+            the output array as another column
+
+        Returns
+        -------
+        data : array (float32)
+            An array of columns which correspond to the following data
+             0) H
+             1) K
+             2) L
+             3) I
+             4) SigI
+             5) s1x
+             6) s1y
+             7) s1z
+             8) ewald_offset
+             9) angular_ewald_offset
+             10) XDET
+             11) YDET
+             12) BATCH
+            
+        """
+        block_name : str
+        inverse_wavelength = 1. / wavelength
+        astar = bstar = cstar = None
+        in_refls = False
+        crystal_iter = iter(crystal)
+
+        refls = []
+        for line in crystal_iter:
+            if line.startswith('astar ='):
                 astar = (
                     np.array(line.split()[2:5], dtype="float32") / 10.0
                 )  # crystfel's notation uses nm-1
-            elif is_bstar(line):
+            if line.startswith('bstar ='):
                 bstar = (
                     np.array(line.split()[2:5], dtype="float32") / 10.0
                 )  # crystfel's notation uses nm-1
-            elif is_cstar(line):
+            if line.startswith('cstar ='):
                 cstar = (
                     np.array(line.split()[2:5], dtype="float32") / 10.0
                 )  # crystfel's notation uses nm-1
-
-                # since it's the last line needed to construct Ewald offset,
-                # we'll pre-compute the matrices here
-                A = np.array([astar, bstar, cstar]).T
-                lambda_inv = 1 / eV2Angstrom(photon_energy)
-                s0 = np.array([0, 0, lambda_inv]).T
-
-            elif is_crystal:
-                # example line:
-                #    h    k    l          I   sigma(I)       peak background  fs/px  ss/px panel
-                #  -63   41    9     -41.31      57.45     195.00     170.86  731.0 1350.4 p0
-                crystal_peak_number += 1
-                h, k, l, I, sigmaI, peak, background, xdet, ydet, panel = [
-                    i for i in line.split()
-                ]
-                h, k, l = map(int, [h, k, l])
-
-                # calculate ewald offset and s1
-                hkl = np.array([h, k, l])
-                q = A @ hkl
-                s1 = q + s0
-                s1x, s1y, s1z = s1
-                s1_norm = np.linalg.norm(s1)
-                ewald_offset = s1_norm - lambda_inv
-
-                # project calculated s1 onto the ewald sphere
-                s1_obs = lambda_inv * s1 / s1_norm
-
-                # Compute the angular ewald offset
-                q_obs = s1_obs - s0
-                qangle = np.sign(ewald_offset) * angle_between(q, q_obs)
-
-                record = {
-                    "H": h,
-                    "K": k,
-                    "L": l,
-                    "I": float(I),
-                    "SigI": float(sigmaI),
-                    "BATCH": crystal_idx,
-                    "s1x": s1x,
-                    "s1y": s1y,
-                    "s1z": s1z,
-                    "ewald_offset": ewald_offset,
-                    "angular_ewald_offset": qangle,
-                    "XDET": float(xdet),
-                    "YDET": float(ydet),
-                }
-                if current_event is not None:
-                    name = (
-                        current_filename,
-                        current_event,
-                        current_serial_number,
-                        crystal_idx,
-                        crystal_peak_number,
-                    )
-                else:
-                    name = (
-                        current_filename,
-                        current_serial_number,
-                        crystal_idx,
-                        crystal_peak_number,
-                    )
-                answ_crystals[name] = record
-
-            # start analyzing where we are now
-            if corrupted_chunk:
-                if "Begin chunk" not in line:
-                    continue
-                else:
-                    is_crystal, is_chunk = False, False
-                    corrupted_chunk = False
-                    continue
-
-            if contains_filename(line):
-                current_filename = line.split()[-1]
-            elif contains_event(line):
-                current_event = line.split()[-1][2:]
-            elif contains_serial_number(line):
-                current_serial_number = line.split()[-1]
-
-            elif starts_chunk_peaks(line):
-                is_chunk = True
-                continue
-
-            elif starts_crystal_peaks(line):
-                crystal_idx += 1
-                is_crystal = True
-                continue
-
-    return answ_crystals, rv_cell_param
-
+            if line == "End of reflections\n":
+                in_refls = False
+            if in_refls:
+                refls.append(line.split()[:-1])
+            if line == "Reflections measured after indexing\n":
+                in_refls = True
+                crystal_iter = next(crystal_iter) #skip header
+
+        refls = np.array(refls, dtype='float32')
+        hkl = refls[:,:3]
+
+        #Apply dmin
+        if dmin is not None:
+            if cell is None:
+                raise ValueError("dmin supplied without a cell")
+            d = cell.calculate_d_array(hkl).astype('float32')
+            idx = d >= dmin
+            refls = refls[idx]
+            d = d[idx]
+            hkl = hkl[idx]
+
+        A = np.array([astar, bstar, cstar]).T
+        # calculate ewald offset and s1
+
+        s0 = np.array([0, 0, inverse_wavelength]).T
+        q = hkl @ A.T # == (A @ hkl.T).T
+        s1 = q + s0
+        s1x, s1y, s1z = s1.T
+        s1_norm = np.sqrt(s1x * s1x + s1y * s1y + s1z * s1z)
+
+        # project calculated s1 onto the ewald sphere
+        s1_obs = inverse_wavelength * s1 / s1_norm[:,None]
+
+        # Compute the ewald offset vector
+        eov = s1_obs - s1
+
+        # Compute scalar ewald offset
+        eo = s1_norm - inverse_wavelength
+
+        # Compute angular ewald offset
+        eo_sign = np.sign(eo)
+        q_obs = s1_obs - s0
+        ao = eo_sign * rs.utils.angle_between(q, q_obs)
+
+        I = refls[:,3]
+        SigI = refls[:,4]
+        bg = refls[:,5]
+
+        if batch is not None:
+            batch = batch * np.ones_like(I, dtype='int32')
+            return np.concatenate((hkl, I[:,None], SigI[:,None], s1, eo[:,None], ao[:,None], refls[:,7,None], refls[:,8,None], batch[:,None]), axis=1)
+        return np.concatenate((hkl, I[:,None], SigI[:,None], s1, eo[:,None], ao[:,None], refls[:,7,None], refls[:,8,None]), axis=1)
+
+    def to_dataset(self, spacegroup : Optional[gemmi.SpaceGroup] = None) -> rs.DataSet:
+        """ Convert self.stream_file to an rs DataSet. Optionally set the spacegroup """
+        def data_gen():
+            for i,crystal in enumerate(StreamLoader.to_crystals(self.stream_file)):
+                ds = StreamLoader.crystal_to_data(crystal, self.wavelength, self.cell,  batch=i+1)
+                yield ds
+        data = np.concatenate(list(data_gen()), axis=0)
+
+        names = [
+            "H",
+            "K",
+            "L",
+            "I",
+            "SigI",
+            "s1x",
+            "s1y",
+            "s1z",
+            "ewald_offset",
+            "angular_ewald_offset",
+            "XDET",
+            "YDET",
+            "BATCH",
+        ]
+        ds = rs.DataSet(data, columns=names, cell=self.cell, spacegroup=spacegroup, merged=False).infer_mtz_dtypes()
+        return ds
 
 def read_crystfel(streamfile: str, spacegroup=None) -> DataSet:
     """
@@ -233,38 +367,9 @@ def read_crystfel(streamfile: str, spacegroup=None) -> DataSet:
 
     if not streamfile.endswith(".stream"):
         raise ValueError("Stream file should end with .stream")
-    # read data from stream file
-    d, cell = _parse_stream(streamfile)
-    df = pd.DataFrame.from_records(list(d.values()))
-
-    # set mtztypes as in precognition.py
-    # hkl -- H
-    # I, sigmaI -- J, Q
-    # BATCH -- B
-    # s1{x,y,z} -- R
-    # ewald_offset -- R
-    mtzdtypes = {
-        "H": "H",
-        "K": "H",
-        "L": "H",
-        "I": "J",
-        "SigI": "Q",
-        "BATCH": "B",
-        "s1x": "R",
-        "s1y": "R",
-        "s1z": "R",
-        "ewald_offset": "R",
-        "angular_ewald_offset": "R",
-        "XDET": "R",
-        "YDET": "R",
-    }
-    dataset = DataSet(
-        spacegroup=spacegroup,
-        cell=cell,
-        merged=False,  # CrystFEL stream is always unmerged
-    )
-    for k, v in df.items():
-        dataset[k] = v.astype(mtzdtypes[k])
-    dataset.set_index(["H", "K", "L"], inplace=True)
-
-    return dataset
+    loader = StreamLoader(streamfile)
+    ds = loader.to_dataset(spacegroup=spacegroup)
+    ds.set_index(["H", "K", "L"], inplace=True)
+    return ds
+
+

From d0ee0d655137ae0fb73d876ef35da71a003d27d4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 29 Jun 2023 03:24:22 +0000
Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 reciprocalspaceship/io/crystfel.py | 253 +++++++++++++++++++----------
 1 file changed, 163 insertions(+), 90 deletions(-)

diff --git a/reciprocalspaceship/io/crystfel.py b/reciprocalspaceship/io/crystfel.py
index 85fe346a..de0ef990 100644
--- a/reciprocalspaceship/io/crystfel.py
+++ b/reciprocalspaceship/io/crystfel.py
@@ -1,34 +1,64 @@
+from typing import Optional, Union
+
+import gemmi
 import numpy as np
 import pandas as pd
-from typing import Union,Optional
 
 import reciprocalspaceship as rs
 from reciprocalspaceship import DataSet
-import gemmi
+from reciprocalspaceship.decorators import cellify, spacegroupify
 from reciprocalspaceship.utils import angle_between
-from reciprocalspaceship.decorators import cellify,spacegroupify
 
-#See Rupp Table 5-2
+# See Rupp Table 5-2
 _cell_constraints = {
-    'triclinic' : lambda x: x,
-    'orthorhombic' : lambda x: [x[0], x[1], x[2], 90., 90., 90.],
-    'monoclinic' : lambda x: [x[0], x[1], x[2], 90., x[4], 90.],
-    'hexagonal' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 120.],
-    'rhombohedral' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 120.],
-    'cubic' : lambda x: [np.mean(x[:3]), np.mean(x[:3]), np.mean(x[:3]), 90., 90., 90.],
-    'tetragonal' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 90.],
+    "triclinic": lambda x: x,
+    "orthorhombic": lambda x: [x[0], x[1], x[2], 90.0, 90.0, 90.0],
+    "monoclinic": lambda x: [x[0], x[1], x[2], 90.0, x[4], 90.0],
+    "hexagonal": lambda x: [
+        0.5 * (x[0] + x[1]),
+        0.5 * (x[0] + x[1]),
+        x[2],
+        90.0,
+        90.0,
+        120.0,
+    ],
+    "rhombohedral": lambda x: [
+        0.5 * (x[0] + x[1]),
+        0.5 * (x[0] + x[1]),
+        x[2],
+        90.0,
+        90.0,
+        120.0,
+    ],
+    "cubic": lambda x: [
+        np.mean(x[:3]),
+        np.mean(x[:3]),
+        np.mean(x[:3]),
+        90.0,
+        90.0,
+        90.0,
+    ],
+    "tetragonal": lambda x: [
+        0.5 * (x[0] + x[1]),
+        0.5 * (x[0] + x[1]),
+        x[2],
+        90.0,
+        90.0,
+        90.0,
+    ],
 }
 
-class StreamLoader():
+
+class StreamLoader:
     @cellify
     @spacegroupify
     def __init__(
-            self, 
-            stream_file : str, 
-            cell : Optional[gemmi.UnitCell]=None, 
-            spacegroup : Optional[gemmi.SpaceGroup] = None, 
-            wavelength : Optional[float] = None,
-        ):
+        self,
+        stream_file: str,
+        cell: Optional[gemmi.UnitCell] = None,
+        spacegroup: Optional[gemmi.SpaceGroup] = None,
+        wavelength: Optional[float] = None,
+    ):
         """
         This class can be used to convert CrystFEL `.stream` files into `rs.DataSet` objects.
 
@@ -39,7 +69,7 @@ def __init__(
         cell : gemmi.UnitCell (optional)
             The cell to assign to the DataSet. If None is supplied, it will be determined from the file.
         spacegroup : gemmi.SpaceGroup (optional)
-            The spacegroup to assign to the DataSet. 
+            The spacegroup to assign to the DataSet.
         wavelength : float (optional)
             The wavelength to use for geometry calculations. If None is supplied, it will be determined from the file.
         """
@@ -52,20 +82,20 @@ def __init__(
         self.wavelength = wavelength
         if self.wavelength is None:
             self.wavelength = self.get_wavelength(stream_file)
-        self.inverse_wavelength = 1. / self.wavelength
-        if not stream_file.endswith('.stream'):
+        self.inverse_wavelength = 1.0 / self.wavelength
+        if not stream_file.endswith(".stream"):
             raise ValueError("Stream file should end with .stream")
         self.stream_file = stream_file
 
     @staticmethod
-    def to_crystals(stream_file : str) -> iter:
-        """ Convert the stream file into an iterator of crystal blocks. """
-        return StreamLoader.to_blocks(stream_file, 'crystal')
+    def to_crystals(stream_file: str) -> iter:
+        """Convert the stream file into an iterator of crystal blocks."""
+        return StreamLoader.to_blocks(stream_file, "crystal")
 
     @staticmethod
-    def get_lattice_type(stream_file : str) -> str: 
-        """ Extract the crystal system string from a stream file. """
-        lines = next(StreamLoader.to_blocks(stream_file, 'cell'))
+    def get_lattice_type(stream_file: str) -> str:
+        """Extract the crystal system string from a stream file."""
+        lines = next(StreamLoader.to_blocks(stream_file, "cell"))
         for line in lines:
             if line.startswith("lattice_type ="):
                 lattice_type = line.split()[2]
@@ -73,10 +103,10 @@ def get_lattice_type(stream_file : str) -> str:
         raise ValueError("No lattice_type entry!")
 
     @staticmethod
-    def get_target_cell(stream_file : str) -> gemmi.UnitCell:
-        """ Extract the target unit cell constants from a stream file. """
-        lines = next(StreamLoader.to_blocks(stream_file, 'cell'))
-        cell = [0.]*6
+    def get_target_cell(stream_file: str) -> gemmi.UnitCell:
+        """Extract the target unit cell constants from a stream file."""
+        lines = next(StreamLoader.to_blocks(stream_file, "cell"))
+        cell = [0.0] * 6
         for line in lines:
             if line.startswith("a ="):
                 cell[0] = float(line.split()[2])
@@ -94,9 +124,9 @@ def get_target_cell(stream_file : str) -> gemmi.UnitCell:
         return gemmi.UnitCell(*cell)
 
     @staticmethod
-    def get_wavelength(stream_file : str) -> float:
-        """ Extract the wavelength from a stream file. """
-        geo = next(StreamLoader.to_blocks(stream_file, 'geometry'))
+    def get_wavelength(stream_file: str) -> float:
+        """Extract the wavelength from a stream file."""
+        geo = next(StreamLoader.to_blocks(stream_file, "geometry"))
         for line in geo:
             if line.startswith("photon_energy"):
                 eV = float(line.split()[2])
@@ -104,8 +134,11 @@ def get_wavelength(stream_file : str) -> float:
                 return lam
 
     @staticmethod
-    def online_mean_variance(iterator) -> (Union[float, np.ndarray], Union[float, np.ndarray]):
-        """ Compute the mean and variance of an iterator of floats or arrays online. """
+    def online_mean_variance(
+        iterator,
+    ) -> (Union[float, np.ndarray], Union[float, np.ndarray]):
+        """Compute the mean and variance of an iterator of floats or arrays online."""
+
         def update(count, mean, m2, value):
             count = count + 1
             delta = value - mean
@@ -140,20 +173,24 @@ def get_average_cell(stream_file, constrain_by_crystal_system=True):
         -------
         cell : gemmi.UnitCell
         """
+
         def cell_iter(stream_file):
             for crystal in StreamLoader.to_crystals(stream_file):
                 for line in crystal:
                     if line.startswith("Cell parameters"):
                         cell = line.split()
-                        cell = np.array([
-                            cell[2],
-                            cell[3],
-                            cell[4],
-                            cell[6],
-                            cell[7],
-                            cell[8],
-                        ], dtype='float32')
-                        cell[:3] = 10.*cell[:3]
+                        cell = np.array(
+                            [
+                                cell[2],
+                                cell[3],
+                                cell[4],
+                                cell[6],
+                                cell[7],
+                                cell[8],
+                            ],
+                            dtype="float32",
+                        )
+                        cell[:3] = 10.0 * cell[:3]
                         yield cell
                         break
 
@@ -165,7 +202,7 @@ def cell_iter(stream_file):
         return cell
 
     @staticmethod
-    def to_blocks(stream_file : str, block_name : str) -> iter:
+    def to_blocks(stream_file: str, block_name: str) -> iter:
         """
         Parameters
         ----------
@@ -174,25 +211,31 @@ def to_blocks(stream_file : str, block_name : str) -> iter:
         block_name : str
             One of the following types of blocks
              - 'geometry'
-             - 'chunk' 
-             - 'cell' 
-             - 'peaks' 
-             - 'crystal' 
+             - 'chunk'
+             - 'cell'
+             - 'peaks'
+             - 'crystal'
              - 'reflections'
 
         Returns
         -------
         blocks : iter
-            An interable containing lists of lines for each block. 
+            An interable containing lists of lines for each block.
         """
         # See crystFEL API reference here: https://www.desy.de/~twhite/crystfel/reference/stream_8h.html
         block_markers = {
-            "geometry" : ("----- Begin geometry file -----", "----- End geometry file -----"),
-            "chunk" : ("----- Begin chunk -----", "----- End chunk -----"),
-            "cell" : ("----- Begin unit cell -----", "----- End unit cell -----"),
-            "peaks" : ("Peaks from peak search", "End of peak list"),
-            "crystal" : ("--- Begin crystal", "--- End crystal"),
-            "reflections" : ("Reflections measured after indexing", "End of reflections"),
+            "geometry": (
+                "----- Begin geometry file -----",
+                "----- End geometry file -----",
+            ),
+            "chunk": ("----- Begin chunk -----", "----- End chunk -----"),
+            "cell": ("----- Begin unit cell -----", "----- End unit cell -----"),
+            "peaks": ("Peaks from peak search", "End of peak list"),
+            "crystal": ("--- Begin crystal", "--- End crystal"),
+            "reflections": (
+                "Reflections measured after indexing",
+                "End of reflections",
+            ),
         }
         block_begin_marker, block_end_marker = block_markers[block_name]
 
@@ -209,23 +252,23 @@ def to_blocks(stream_file : str, block_name : str) -> iter:
                 in_block = True
 
     @staticmethod
-    def crystal_to_data(crystal,  wavelength, cell=None, dmin=None, batch=None):
+    def crystal_to_data(crystal, wavelength, cell=None, dmin=None, batch=None):
         """
-        Convert a crystal block (list of strings) to a numpy array. 
+        Convert a crystal block (list of strings) to a numpy array.
 
         Parameters
         ----------
         crystal : list or similar
-            A list of strings corresponding to a single crystal 
-            block in a stream file. 
+            A list of strings corresponding to a single crystal
+            block in a stream file.
         wavelength : float
             The wavelength of the dataset. Used for calculating ewald
-            offsets. 
+            offsets.
         cell : gemmi.UnitCell (optional)
-            The unit cell to use to calculate reflection resolution. 
+            The unit cell to use to calculate reflection resolution.
             This is only used for the dmin resolution cutoff.
         dmin : float (optional)
-            An optional resolution cutoff. Requires a cell. 
+            An optional resolution cutoff. Requires a cell.
         batch : int (optional)
             Optionally supply a batch number which will be appended to
             the output array as another column
@@ -247,25 +290,25 @@ def crystal_to_data(crystal,  wavelength, cell=None, dmin=None, batch=None):
              10) XDET
              11) YDET
              12) BATCH
-            
+
         """
-        block_name : str
-        inverse_wavelength = 1. / wavelength
+        block_name: str
+        inverse_wavelength = 1.0 / wavelength
         astar = bstar = cstar = None
         in_refls = False
         crystal_iter = iter(crystal)
 
         refls = []
         for line in crystal_iter:
-            if line.startswith('astar ='):
+            if line.startswith("astar ="):
                 astar = (
                     np.array(line.split()[2:5], dtype="float32") / 10.0
                 )  # crystfel's notation uses nm-1
-            if line.startswith('bstar ='):
+            if line.startswith("bstar ="):
                 bstar = (
                     np.array(line.split()[2:5], dtype="float32") / 10.0
                 )  # crystfel's notation uses nm-1
-            if line.startswith('cstar ='):
+            if line.startswith("cstar ="):
                 cstar = (
                     np.array(line.split()[2:5], dtype="float32") / 10.0
                 )  # crystfel's notation uses nm-1
@@ -275,16 +318,16 @@ def crystal_to_data(crystal,  wavelength, cell=None, dmin=None, batch=None):
                 refls.append(line.split()[:-1])
             if line == "Reflections measured after indexing\n":
                 in_refls = True
-                crystal_iter = next(crystal_iter) #skip header
+                crystal_iter = next(crystal_iter)  # skip header
 
-        refls = np.array(refls, dtype='float32')
-        hkl = refls[:,:3]
+        refls = np.array(refls, dtype="float32")
+        hkl = refls[:, :3]
 
-        #Apply dmin
+        # Apply dmin
         if dmin is not None:
             if cell is None:
                 raise ValueError("dmin supplied without a cell")
-            d = cell.calculate_d_array(hkl).astype('float32')
+            d = cell.calculate_d_array(hkl).astype("float32")
             idx = d >= dmin
             refls = refls[idx]
             d = d[idx]
@@ -294,13 +337,13 @@ def crystal_to_data(crystal,  wavelength, cell=None, dmin=None, batch=None):
         # calculate ewald offset and s1
 
         s0 = np.array([0, 0, inverse_wavelength]).T
-        q = hkl @ A.T # == (A @ hkl.T).T
+        q = hkl @ A.T  # == (A @ hkl.T).T
         s1 = q + s0
         s1x, s1y, s1z = s1.T
         s1_norm = np.sqrt(s1x * s1x + s1y * s1y + s1z * s1z)
 
         # project calculated s1 onto the ewald sphere
-        s1_obs = inverse_wavelength * s1 / s1_norm[:,None]
+        s1_obs = inverse_wavelength * s1 / s1_norm[:, None]
 
         # Compute the ewald offset vector
         eov = s1_obs - s1
@@ -313,21 +356,50 @@ def crystal_to_data(crystal,  wavelength, cell=None, dmin=None, batch=None):
         q_obs = s1_obs - s0
         ao = eo_sign * rs.utils.angle_between(q, q_obs)
 
-        I = refls[:,3]
-        SigI = refls[:,4]
-        bg = refls[:,5]
+        I = refls[:, 3]
+        SigI = refls[:, 4]
+        bg = refls[:, 5]
 
         if batch is not None:
-            batch = batch * np.ones_like(I, dtype='int32')
-            return np.concatenate((hkl, I[:,None], SigI[:,None], s1, eo[:,None], ao[:,None], refls[:,7,None], refls[:,8,None], batch[:,None]), axis=1)
-        return np.concatenate((hkl, I[:,None], SigI[:,None], s1, eo[:,None], ao[:,None], refls[:,7,None], refls[:,8,None]), axis=1)
+            batch = batch * np.ones_like(I, dtype="int32")
+            return np.concatenate(
+                (
+                    hkl,
+                    I[:, None],
+                    SigI[:, None],
+                    s1,
+                    eo[:, None],
+                    ao[:, None],
+                    refls[:, 7, None],
+                    refls[:, 8, None],
+                    batch[:, None],
+                ),
+                axis=1,
+            )
+        return np.concatenate(
+            (
+                hkl,
+                I[:, None],
+                SigI[:, None],
+                s1,
+                eo[:, None],
+                ao[:, None],
+                refls[:, 7, None],
+                refls[:, 8, None],
+            ),
+            axis=1,
+        )
+
+    def to_dataset(self, spacegroup: Optional[gemmi.SpaceGroup] = None) -> rs.DataSet:
+        """Convert self.stream_file to an rs DataSet. Optionally set the spacegroup"""
 
-    def to_dataset(self, spacegroup : Optional[gemmi.SpaceGroup] = None) -> rs.DataSet:
-        """ Convert self.stream_file to an rs DataSet. Optionally set the spacegroup """
         def data_gen():
-            for i,crystal in enumerate(StreamLoader.to_crystals(self.stream_file)):
-                ds = StreamLoader.crystal_to_data(crystal, self.wavelength, self.cell,  batch=i+1)
+            for i, crystal in enumerate(StreamLoader.to_crystals(self.stream_file)):
+                ds = StreamLoader.crystal_to_data(
+                    crystal, self.wavelength, self.cell, batch=i + 1
+                )
                 yield ds
+
         data = np.concatenate(list(data_gen()), axis=0)
 
         names = [
@@ -345,9 +417,12 @@ def data_gen():
             "YDET",
             "BATCH",
         ]
-        ds = rs.DataSet(data, columns=names, cell=self.cell, spacegroup=spacegroup, merged=False).infer_mtz_dtypes()
+        ds = rs.DataSet(
+            data, columns=names, cell=self.cell, spacegroup=spacegroup, merged=False
+        ).infer_mtz_dtypes()
         return ds
 
+
 def read_crystfel(streamfile: str, spacegroup=None) -> DataSet:
     """
     Initialize attributes and populate the DataSet object with data from a CrystFEL stream with indexed reflections.
@@ -371,5 +446,3 @@ def read_crystfel(streamfile: str, spacegroup=None) -> DataSet:
     ds = loader.to_dataset(spacegroup=spacegroup)
     ds.set_index(["H", "K", "L"], inplace=True)
     return ds
-
-