From 8c359b8de6f0866ed31a5c2ad3e7372ab495f5e8 Mon Sep 17 00:00:00 2001 From: Kevin Dalton Date: Wed, 28 Jun 2023 23:22:56 -0400 Subject: [PATCH 1/2] faster stream file parser --- reciprocalspaceship/io/crystfel.py | 577 +++++++++++++++++------------ 1 file changed, 341 insertions(+), 236 deletions(-) diff --git a/reciprocalspaceship/io/crystfel.py b/reciprocalspaceship/io/crystfel.py index 71cb0a04..85fe346a 100644 --- a/reciprocalspaceship/io/crystfel.py +++ b/reciprocalspaceship/io/crystfel.py @@ -1,218 +1,352 @@ import numpy as np import pandas as pd +from typing import Union,Optional +import reciprocalspaceship as rs from reciprocalspaceship import DataSet +import gemmi from reciprocalspaceship.utils import angle_between - - -def _parse_stream(filename: str) -> dict: - """ - Parses stream and returns all indexed peak positions - - Parameters - ---------- - filename : stream filename - name of a .stream file - - Returns - -------- - (dict, np.ndarray) - """ - - answ_crystals = {} - - def contains_filename(s): - return s.startswith("Image filename") - - def contains_event(s): - return s.startswith("Event") - - def contains_serial_number(s): - return s.startswith("Image serial number") - - def starts_chunk_peaks(s): - return s.startswith(" fs/px ss/px (1/d)/nm^-1 Intensity Panel") - - def ends_chunk_peaks(s): - return s.startswith("End of peak list") - - def starts_crystal_peaks(s): - return s.startswith( - " h k l I sigma(I) peak background fs/px ss/px panel" - ) - - def is_photon_energy(s): - return s.startswith("photon_energy_eV") - - def is_astar(s): - return s.startswith("astar") - - def is_bstar(s): - return s.startswith("bstar") - - def is_cstar(s): - return s.startswith("cstar") - - def ends_crystal_peaks(s): - return s.startswith("End of reflections") - - def eV2Angstrom(e_eV): - return 12398.0 / e_eV - - # add unit cell parameters parsing - with open(filename, "r") as stream: - is_unit_cell = False - get_cellparam = lambda s: float(s.split()[2]) - rv_cell_param = None - a, b, c, al, be, ga = [ - None - ] * 6 # None's are needed since stream not always has all 6 parameters - for line in stream: - if "Begin unit cell" in line: - is_unit_cell = True - continue - elif is_unit_cell: - if line.startswith("a ="): - a = get_cellparam(line) - if line.startswith("b ="): - b = get_cellparam(line) - if line.startswith("c ="): - c = get_cellparam(line) - if line.startswith("al ="): - al = get_cellparam(line) - if line.startswith("be ="): - be = get_cellparam(line) - if line.startswith("ga ="): - ga = get_cellparam(line) - is_unit_cell = False # gamma is the last parameters - elif "End unit cell" in line: - rv_cell_param = np.array([a, b, c, al, be, ga]) - break - - with open(filename, "r") as stream: - is_chunk = False - is_crystal = False - current_filename = None - current_event = None # to handle non-event streams - current_serial_number = None - corrupted_chunk = False - crystal_peak_number = 0 - crystal_idx = 0 - - for line in stream: - # analyzing what we have - if ends_chunk_peaks(line): - is_chunk = False - chunk_peak_number = 0 - elif ends_crystal_peaks(line): - is_crystal = False - crystal_peak_number = 0 - - elif is_photon_energy(line): - photon_energy = float(line.split()[2]) - elif is_astar(line): +from reciprocalspaceship.decorators import cellify,spacegroupify + +#See Rupp Table 5-2 +_cell_constraints = { + 'triclinic' : lambda x: x, + 'orthorhombic' : lambda x: [x[0], x[1], x[2], 90., 90., 90.], + 'monoclinic' : lambda x: [x[0], x[1], x[2], 90., x[4], 90.], + 'hexagonal' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 120.], + 'rhombohedral' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 120.], + 'cubic' : lambda x: [np.mean(x[:3]), np.mean(x[:3]), np.mean(x[:3]), 90., 90., 90.], + 'tetragonal' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 90.], +} + +class StreamLoader(): + @cellify + @spacegroupify + def __init__( + self, + stream_file : str, + cell : Optional[gemmi.UnitCell]=None, + spacegroup : Optional[gemmi.SpaceGroup] = None, + wavelength : Optional[float] = None, + ): + """ + This class can be used to convert CrystFEL `.stream` files into `rs.DataSet` objects. + + Parameters + ---------- + stream_file : str + The path of the `.stream` file. Must end in `.stream`. + cell : gemmi.UnitCell (optional) + The cell to assign to the DataSet. If None is supplied, it will be determined from the file. + spacegroup : gemmi.SpaceGroup (optional) + The spacegroup to assign to the DataSet. + wavelength : float (optional) + The wavelength to use for geometry calculations. If None is supplied, it will be determined from the file. + """ + super().__init__() + self.cell = cell + if self.cell is None: + self.cell = self.get_target_cell(stream_file) + + self.spacegroup = spacegroup + self.wavelength = wavelength + if self.wavelength is None: + self.wavelength = self.get_wavelength(stream_file) + self.inverse_wavelength = 1. / self.wavelength + if not stream_file.endswith('.stream'): + raise ValueError("Stream file should end with .stream") + self.stream_file = stream_file + + @staticmethod + def to_crystals(stream_file : str) -> iter: + """ Convert the stream file into an iterator of crystal blocks. """ + return StreamLoader.to_blocks(stream_file, 'crystal') + + @staticmethod + def get_lattice_type(stream_file : str) -> str: + """ Extract the crystal system string from a stream file. """ + lines = next(StreamLoader.to_blocks(stream_file, 'cell')) + for line in lines: + if line.startswith("lattice_type ="): + lattice_type = line.split()[2] + return lattice_type + raise ValueError("No lattice_type entry!") + + @staticmethod + def get_target_cell(stream_file : str) -> gemmi.UnitCell: + """ Extract the target unit cell constants from a stream file. """ + lines = next(StreamLoader.to_blocks(stream_file, 'cell')) + cell = [0.]*6 + for line in lines: + if line.startswith("a ="): + cell[0] = float(line.split()[2]) + elif line.startswith("b ="): + cell[1] = float(line.split()[2]) + elif line.startswith("c ="): + cell[2] = float(line.split()[2]) + elif line.startswith("al ="): + cell[3] = float(line.split()[2]) + elif line.startswith("be ="): + cell[4] = float(line.split()[2]) + elif line.startswith("ga ="): + cell[5] = float(line.split()[2]) + + return gemmi.UnitCell(*cell) + + @staticmethod + def get_wavelength(stream_file : str) -> float: + """ Extract the wavelength from a stream file. """ + geo = next(StreamLoader.to_blocks(stream_file, 'geometry')) + for line in geo: + if line.startswith("photon_energy"): + eV = float(line.split()[2]) + lam = rs.utils.ev2angstroms(eV) + return lam + + @staticmethod + def online_mean_variance(iterator) -> (Union[float, np.ndarray], Union[float, np.ndarray]): + """ Compute the mean and variance of an iterator of floats or arrays online. """ + def update(count, mean, m2, value): + count = count + 1 + delta = value - mean + mean += delta / count + delta2 = value - mean + m2 += delta * delta2 + return count, mean, m2 + + count, mean, m2 = 0, 0, 0 + for value in iterator: + count, mean, m2 = update(count, mean, m2, value) + + variance = m2 / count + return mean, variance + + @staticmethod + def get_average_cell(stream_file, constrain_by_crystal_system=True): + """ + Gets the average cell across all the indexed + crystals in a stream file. + + Parameters + ---------- + stream_file : str + The path to a CrystFEL stream file. + constrain_by_crystal_system : bool (optional) + Whether to apply the appropriate constraints + for the crystal system described in the stream. + The default value is True. + + Returns + ------- + cell : gemmi.UnitCell + """ + def cell_iter(stream_file): + for crystal in StreamLoader.to_crystals(stream_file): + for line in crystal: + if line.startswith("Cell parameters"): + cell = line.split() + cell = np.array([ + cell[2], + cell[3], + cell[4], + cell[6], + cell[7], + cell[8], + ], dtype='float32') + cell[:3] = 10.*cell[:3] + yield cell + break + + mean, variance = StreamLoader.online_mean_variance(cell_iter(stream_file)) + cell = gemmi.UnitCell(*mean) + if constrain: + lattice_type = StreamLoader.get_lattice_type(stream_file) + cell = _cell_constraints[lattice_type](cell) + return cell + + @staticmethod + def to_blocks(stream_file : str, block_name : str) -> iter: + """ + Parameters + ---------- + stream_file : str + The path to a CrystFEL stream file. + block_name : str + One of the following types of blocks + - 'geometry' + - 'chunk' + - 'cell' + - 'peaks' + - 'crystal' + - 'reflections' + + Returns + ------- + blocks : iter + An interable containing lists of lines for each block. + """ + # See crystFEL API reference here: https://www.desy.de/~twhite/crystfel/reference/stream_8h.html + block_markers = { + "geometry" : ("----- Begin geometry file -----", "----- End geometry file -----"), + "chunk" : ("----- Begin chunk -----", "----- End chunk -----"), + "cell" : ("----- Begin unit cell -----", "----- End unit cell -----"), + "peaks" : ("Peaks from peak search", "End of peak list"), + "crystal" : ("--- Begin crystal", "--- End crystal"), + "reflections" : ("Reflections measured after indexing", "End of reflections"), + } + block_begin_marker, block_end_marker = block_markers[block_name] + + block = [] + in_block = False + for line in open(stream_file): + if line.startswith(block_end_marker): + in_block = False + yield block + block = [] + if in_block: + block.append(line) + if line.startswith(block_begin_marker): + in_block = True + + @staticmethod + def crystal_to_data(crystal, wavelength, cell=None, dmin=None, batch=None): + """ + Convert a crystal block (list of strings) to a numpy array. + + Parameters + ---------- + crystal : list or similar + A list of strings corresponding to a single crystal + block in a stream file. + wavelength : float + The wavelength of the dataset. Used for calculating ewald + offsets. + cell : gemmi.UnitCell (optional) + The unit cell to use to calculate reflection resolution. + This is only used for the dmin resolution cutoff. + dmin : float (optional) + An optional resolution cutoff. Requires a cell. + batch : int (optional) + Optionally supply a batch number which will be appended to + the output array as another column + + Returns + ------- + data : array (float32) + An array of columns which correspond to the following data + 0) H + 1) K + 2) L + 3) I + 4) SigI + 5) s1x + 6) s1y + 7) s1z + 8) ewald_offset + 9) angular_ewald_offset + 10) XDET + 11) YDET + 12) BATCH + + """ + block_name : str + inverse_wavelength = 1. / wavelength + astar = bstar = cstar = None + in_refls = False + crystal_iter = iter(crystal) + + refls = [] + for line in crystal_iter: + if line.startswith('astar ='): astar = ( np.array(line.split()[2:5], dtype="float32") / 10.0 ) # crystfel's notation uses nm-1 - elif is_bstar(line): + if line.startswith('bstar ='): bstar = ( np.array(line.split()[2:5], dtype="float32") / 10.0 ) # crystfel's notation uses nm-1 - elif is_cstar(line): + if line.startswith('cstar ='): cstar = ( np.array(line.split()[2:5], dtype="float32") / 10.0 ) # crystfel's notation uses nm-1 - - # since it's the last line needed to construct Ewald offset, - # we'll pre-compute the matrices here - A = np.array([astar, bstar, cstar]).T - lambda_inv = 1 / eV2Angstrom(photon_energy) - s0 = np.array([0, 0, lambda_inv]).T - - elif is_crystal: - # example line: - # h k l I sigma(I) peak background fs/px ss/px panel - # -63 41 9 -41.31 57.45 195.00 170.86 731.0 1350.4 p0 - crystal_peak_number += 1 - h, k, l, I, sigmaI, peak, background, xdet, ydet, panel = [ - i for i in line.split() - ] - h, k, l = map(int, [h, k, l]) - - # calculate ewald offset and s1 - hkl = np.array([h, k, l]) - q = A @ hkl - s1 = q + s0 - s1x, s1y, s1z = s1 - s1_norm = np.linalg.norm(s1) - ewald_offset = s1_norm - lambda_inv - - # project calculated s1 onto the ewald sphere - s1_obs = lambda_inv * s1 / s1_norm - - # Compute the angular ewald offset - q_obs = s1_obs - s0 - qangle = np.sign(ewald_offset) * angle_between(q, q_obs) - - record = { - "H": h, - "K": k, - "L": l, - "I": float(I), - "SigI": float(sigmaI), - "BATCH": crystal_idx, - "s1x": s1x, - "s1y": s1y, - "s1z": s1z, - "ewald_offset": ewald_offset, - "angular_ewald_offset": qangle, - "XDET": float(xdet), - "YDET": float(ydet), - } - if current_event is not None: - name = ( - current_filename, - current_event, - current_serial_number, - crystal_idx, - crystal_peak_number, - ) - else: - name = ( - current_filename, - current_serial_number, - crystal_idx, - crystal_peak_number, - ) - answ_crystals[name] = record - - # start analyzing where we are now - if corrupted_chunk: - if "Begin chunk" not in line: - continue - else: - is_crystal, is_chunk = False, False - corrupted_chunk = False - continue - - if contains_filename(line): - current_filename = line.split()[-1] - elif contains_event(line): - current_event = line.split()[-1][2:] - elif contains_serial_number(line): - current_serial_number = line.split()[-1] - - elif starts_chunk_peaks(line): - is_chunk = True - continue - - elif starts_crystal_peaks(line): - crystal_idx += 1 - is_crystal = True - continue - - return answ_crystals, rv_cell_param - + if line == "End of reflections\n": + in_refls = False + if in_refls: + refls.append(line.split()[:-1]) + if line == "Reflections measured after indexing\n": + in_refls = True + crystal_iter = next(crystal_iter) #skip header + + refls = np.array(refls, dtype='float32') + hkl = refls[:,:3] + + #Apply dmin + if dmin is not None: + if cell is None: + raise ValueError("dmin supplied without a cell") + d = cell.calculate_d_array(hkl).astype('float32') + idx = d >= dmin + refls = refls[idx] + d = d[idx] + hkl = hkl[idx] + + A = np.array([astar, bstar, cstar]).T + # calculate ewald offset and s1 + + s0 = np.array([0, 0, inverse_wavelength]).T + q = hkl @ A.T # == (A @ hkl.T).T + s1 = q + s0 + s1x, s1y, s1z = s1.T + s1_norm = np.sqrt(s1x * s1x + s1y * s1y + s1z * s1z) + + # project calculated s1 onto the ewald sphere + s1_obs = inverse_wavelength * s1 / s1_norm[:,None] + + # Compute the ewald offset vector + eov = s1_obs - s1 + + # Compute scalar ewald offset + eo = s1_norm - inverse_wavelength + + # Compute angular ewald offset + eo_sign = np.sign(eo) + q_obs = s1_obs - s0 + ao = eo_sign * rs.utils.angle_between(q, q_obs) + + I = refls[:,3] + SigI = refls[:,4] + bg = refls[:,5] + + if batch is not None: + batch = batch * np.ones_like(I, dtype='int32') + return np.concatenate((hkl, I[:,None], SigI[:,None], s1, eo[:,None], ao[:,None], refls[:,7,None], refls[:,8,None], batch[:,None]), axis=1) + return np.concatenate((hkl, I[:,None], SigI[:,None], s1, eo[:,None], ao[:,None], refls[:,7,None], refls[:,8,None]), axis=1) + + def to_dataset(self, spacegroup : Optional[gemmi.SpaceGroup] = None) -> rs.DataSet: + """ Convert self.stream_file to an rs DataSet. Optionally set the spacegroup """ + def data_gen(): + for i,crystal in enumerate(StreamLoader.to_crystals(self.stream_file)): + ds = StreamLoader.crystal_to_data(crystal, self.wavelength, self.cell, batch=i+1) + yield ds + data = np.concatenate(list(data_gen()), axis=0) + + names = [ + "H", + "K", + "L", + "I", + "SigI", + "s1x", + "s1y", + "s1z", + "ewald_offset", + "angular_ewald_offset", + "XDET", + "YDET", + "BATCH", + ] + ds = rs.DataSet(data, columns=names, cell=self.cell, spacegroup=spacegroup, merged=False).infer_mtz_dtypes() + return ds def read_crystfel(streamfile: str, spacegroup=None) -> DataSet: """ @@ -233,38 +367,9 @@ def read_crystfel(streamfile: str, spacegroup=None) -> DataSet: if not streamfile.endswith(".stream"): raise ValueError("Stream file should end with .stream") - # read data from stream file - d, cell = _parse_stream(streamfile) - df = pd.DataFrame.from_records(list(d.values())) - - # set mtztypes as in precognition.py - # hkl -- H - # I, sigmaI -- J, Q - # BATCH -- B - # s1{x,y,z} -- R - # ewald_offset -- R - mtzdtypes = { - "H": "H", - "K": "H", - "L": "H", - "I": "J", - "SigI": "Q", - "BATCH": "B", - "s1x": "R", - "s1y": "R", - "s1z": "R", - "ewald_offset": "R", - "angular_ewald_offset": "R", - "XDET": "R", - "YDET": "R", - } - dataset = DataSet( - spacegroup=spacegroup, - cell=cell, - merged=False, # CrystFEL stream is always unmerged - ) - for k, v in df.items(): - dataset[k] = v.astype(mtzdtypes[k]) - dataset.set_index(["H", "K", "L"], inplace=True) - - return dataset + loader = StreamLoader(streamfile) + ds = loader.to_dataset(spacegroup=spacegroup) + ds.set_index(["H", "K", "L"], inplace=True) + return ds + + From d0ee0d655137ae0fb73d876ef35da71a003d27d4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Jun 2023 03:24:22 +0000 Subject: [PATCH 2/2] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- reciprocalspaceship/io/crystfel.py | 253 +++++++++++++++++++---------- 1 file changed, 163 insertions(+), 90 deletions(-) diff --git a/reciprocalspaceship/io/crystfel.py b/reciprocalspaceship/io/crystfel.py index 85fe346a..de0ef990 100644 --- a/reciprocalspaceship/io/crystfel.py +++ b/reciprocalspaceship/io/crystfel.py @@ -1,34 +1,64 @@ +from typing import Optional, Union + +import gemmi import numpy as np import pandas as pd -from typing import Union,Optional import reciprocalspaceship as rs from reciprocalspaceship import DataSet -import gemmi +from reciprocalspaceship.decorators import cellify, spacegroupify from reciprocalspaceship.utils import angle_between -from reciprocalspaceship.decorators import cellify,spacegroupify -#See Rupp Table 5-2 +# See Rupp Table 5-2 _cell_constraints = { - 'triclinic' : lambda x: x, - 'orthorhombic' : lambda x: [x[0], x[1], x[2], 90., 90., 90.], - 'monoclinic' : lambda x: [x[0], x[1], x[2], 90., x[4], 90.], - 'hexagonal' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 120.], - 'rhombohedral' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 120.], - 'cubic' : lambda x: [np.mean(x[:3]), np.mean(x[:3]), np.mean(x[:3]), 90., 90., 90.], - 'tetragonal' : lambda x: [0.5*(x[0] + x[1]), 0.5*(x[0] + x[1]), x[2], 90., 90., 90.], + "triclinic": lambda x: x, + "orthorhombic": lambda x: [x[0], x[1], x[2], 90.0, 90.0, 90.0], + "monoclinic": lambda x: [x[0], x[1], x[2], 90.0, x[4], 90.0], + "hexagonal": lambda x: [ + 0.5 * (x[0] + x[1]), + 0.5 * (x[0] + x[1]), + x[2], + 90.0, + 90.0, + 120.0, + ], + "rhombohedral": lambda x: [ + 0.5 * (x[0] + x[1]), + 0.5 * (x[0] + x[1]), + x[2], + 90.0, + 90.0, + 120.0, + ], + "cubic": lambda x: [ + np.mean(x[:3]), + np.mean(x[:3]), + np.mean(x[:3]), + 90.0, + 90.0, + 90.0, + ], + "tetragonal": lambda x: [ + 0.5 * (x[0] + x[1]), + 0.5 * (x[0] + x[1]), + x[2], + 90.0, + 90.0, + 90.0, + ], } -class StreamLoader(): + +class StreamLoader: @cellify @spacegroupify def __init__( - self, - stream_file : str, - cell : Optional[gemmi.UnitCell]=None, - spacegroup : Optional[gemmi.SpaceGroup] = None, - wavelength : Optional[float] = None, - ): + self, + stream_file: str, + cell: Optional[gemmi.UnitCell] = None, + spacegroup: Optional[gemmi.SpaceGroup] = None, + wavelength: Optional[float] = None, + ): """ This class can be used to convert CrystFEL `.stream` files into `rs.DataSet` objects. @@ -39,7 +69,7 @@ def __init__( cell : gemmi.UnitCell (optional) The cell to assign to the DataSet. If None is supplied, it will be determined from the file. spacegroup : gemmi.SpaceGroup (optional) - The spacegroup to assign to the DataSet. + The spacegroup to assign to the DataSet. wavelength : float (optional) The wavelength to use for geometry calculations. If None is supplied, it will be determined from the file. """ @@ -52,20 +82,20 @@ def __init__( self.wavelength = wavelength if self.wavelength is None: self.wavelength = self.get_wavelength(stream_file) - self.inverse_wavelength = 1. / self.wavelength - if not stream_file.endswith('.stream'): + self.inverse_wavelength = 1.0 / self.wavelength + if not stream_file.endswith(".stream"): raise ValueError("Stream file should end with .stream") self.stream_file = stream_file @staticmethod - def to_crystals(stream_file : str) -> iter: - """ Convert the stream file into an iterator of crystal blocks. """ - return StreamLoader.to_blocks(stream_file, 'crystal') + def to_crystals(stream_file: str) -> iter: + """Convert the stream file into an iterator of crystal blocks.""" + return StreamLoader.to_blocks(stream_file, "crystal") @staticmethod - def get_lattice_type(stream_file : str) -> str: - """ Extract the crystal system string from a stream file. """ - lines = next(StreamLoader.to_blocks(stream_file, 'cell')) + def get_lattice_type(stream_file: str) -> str: + """Extract the crystal system string from a stream file.""" + lines = next(StreamLoader.to_blocks(stream_file, "cell")) for line in lines: if line.startswith("lattice_type ="): lattice_type = line.split()[2] @@ -73,10 +103,10 @@ def get_lattice_type(stream_file : str) -> str: raise ValueError("No lattice_type entry!") @staticmethod - def get_target_cell(stream_file : str) -> gemmi.UnitCell: - """ Extract the target unit cell constants from a stream file. """ - lines = next(StreamLoader.to_blocks(stream_file, 'cell')) - cell = [0.]*6 + def get_target_cell(stream_file: str) -> gemmi.UnitCell: + """Extract the target unit cell constants from a stream file.""" + lines = next(StreamLoader.to_blocks(stream_file, "cell")) + cell = [0.0] * 6 for line in lines: if line.startswith("a ="): cell[0] = float(line.split()[2]) @@ -94,9 +124,9 @@ def get_target_cell(stream_file : str) -> gemmi.UnitCell: return gemmi.UnitCell(*cell) @staticmethod - def get_wavelength(stream_file : str) -> float: - """ Extract the wavelength from a stream file. """ - geo = next(StreamLoader.to_blocks(stream_file, 'geometry')) + def get_wavelength(stream_file: str) -> float: + """Extract the wavelength from a stream file.""" + geo = next(StreamLoader.to_blocks(stream_file, "geometry")) for line in geo: if line.startswith("photon_energy"): eV = float(line.split()[2]) @@ -104,8 +134,11 @@ def get_wavelength(stream_file : str) -> float: return lam @staticmethod - def online_mean_variance(iterator) -> (Union[float, np.ndarray], Union[float, np.ndarray]): - """ Compute the mean and variance of an iterator of floats or arrays online. """ + def online_mean_variance( + iterator, + ) -> (Union[float, np.ndarray], Union[float, np.ndarray]): + """Compute the mean and variance of an iterator of floats or arrays online.""" + def update(count, mean, m2, value): count = count + 1 delta = value - mean @@ -140,20 +173,24 @@ def get_average_cell(stream_file, constrain_by_crystal_system=True): ------- cell : gemmi.UnitCell """ + def cell_iter(stream_file): for crystal in StreamLoader.to_crystals(stream_file): for line in crystal: if line.startswith("Cell parameters"): cell = line.split() - cell = np.array([ - cell[2], - cell[3], - cell[4], - cell[6], - cell[7], - cell[8], - ], dtype='float32') - cell[:3] = 10.*cell[:3] + cell = np.array( + [ + cell[2], + cell[3], + cell[4], + cell[6], + cell[7], + cell[8], + ], + dtype="float32", + ) + cell[:3] = 10.0 * cell[:3] yield cell break @@ -165,7 +202,7 @@ def cell_iter(stream_file): return cell @staticmethod - def to_blocks(stream_file : str, block_name : str) -> iter: + def to_blocks(stream_file: str, block_name: str) -> iter: """ Parameters ---------- @@ -174,25 +211,31 @@ def to_blocks(stream_file : str, block_name : str) -> iter: block_name : str One of the following types of blocks - 'geometry' - - 'chunk' - - 'cell' - - 'peaks' - - 'crystal' + - 'chunk' + - 'cell' + - 'peaks' + - 'crystal' - 'reflections' Returns ------- blocks : iter - An interable containing lists of lines for each block. + An interable containing lists of lines for each block. """ # See crystFEL API reference here: https://www.desy.de/~twhite/crystfel/reference/stream_8h.html block_markers = { - "geometry" : ("----- Begin geometry file -----", "----- End geometry file -----"), - "chunk" : ("----- Begin chunk -----", "----- End chunk -----"), - "cell" : ("----- Begin unit cell -----", "----- End unit cell -----"), - "peaks" : ("Peaks from peak search", "End of peak list"), - "crystal" : ("--- Begin crystal", "--- End crystal"), - "reflections" : ("Reflections measured after indexing", "End of reflections"), + "geometry": ( + "----- Begin geometry file -----", + "----- End geometry file -----", + ), + "chunk": ("----- Begin chunk -----", "----- End chunk -----"), + "cell": ("----- Begin unit cell -----", "----- End unit cell -----"), + "peaks": ("Peaks from peak search", "End of peak list"), + "crystal": ("--- Begin crystal", "--- End crystal"), + "reflections": ( + "Reflections measured after indexing", + "End of reflections", + ), } block_begin_marker, block_end_marker = block_markers[block_name] @@ -209,23 +252,23 @@ def to_blocks(stream_file : str, block_name : str) -> iter: in_block = True @staticmethod - def crystal_to_data(crystal, wavelength, cell=None, dmin=None, batch=None): + def crystal_to_data(crystal, wavelength, cell=None, dmin=None, batch=None): """ - Convert a crystal block (list of strings) to a numpy array. + Convert a crystal block (list of strings) to a numpy array. Parameters ---------- crystal : list or similar - A list of strings corresponding to a single crystal - block in a stream file. + A list of strings corresponding to a single crystal + block in a stream file. wavelength : float The wavelength of the dataset. Used for calculating ewald - offsets. + offsets. cell : gemmi.UnitCell (optional) - The unit cell to use to calculate reflection resolution. + The unit cell to use to calculate reflection resolution. This is only used for the dmin resolution cutoff. dmin : float (optional) - An optional resolution cutoff. Requires a cell. + An optional resolution cutoff. Requires a cell. batch : int (optional) Optionally supply a batch number which will be appended to the output array as another column @@ -247,25 +290,25 @@ def crystal_to_data(crystal, wavelength, cell=None, dmin=None, batch=None): 10) XDET 11) YDET 12) BATCH - + """ - block_name : str - inverse_wavelength = 1. / wavelength + block_name: str + inverse_wavelength = 1.0 / wavelength astar = bstar = cstar = None in_refls = False crystal_iter = iter(crystal) refls = [] for line in crystal_iter: - if line.startswith('astar ='): + if line.startswith("astar ="): astar = ( np.array(line.split()[2:5], dtype="float32") / 10.0 ) # crystfel's notation uses nm-1 - if line.startswith('bstar ='): + if line.startswith("bstar ="): bstar = ( np.array(line.split()[2:5], dtype="float32") / 10.0 ) # crystfel's notation uses nm-1 - if line.startswith('cstar ='): + if line.startswith("cstar ="): cstar = ( np.array(line.split()[2:5], dtype="float32") / 10.0 ) # crystfel's notation uses nm-1 @@ -275,16 +318,16 @@ def crystal_to_data(crystal, wavelength, cell=None, dmin=None, batch=None): refls.append(line.split()[:-1]) if line == "Reflections measured after indexing\n": in_refls = True - crystal_iter = next(crystal_iter) #skip header + crystal_iter = next(crystal_iter) # skip header - refls = np.array(refls, dtype='float32') - hkl = refls[:,:3] + refls = np.array(refls, dtype="float32") + hkl = refls[:, :3] - #Apply dmin + # Apply dmin if dmin is not None: if cell is None: raise ValueError("dmin supplied without a cell") - d = cell.calculate_d_array(hkl).astype('float32') + d = cell.calculate_d_array(hkl).astype("float32") idx = d >= dmin refls = refls[idx] d = d[idx] @@ -294,13 +337,13 @@ def crystal_to_data(crystal, wavelength, cell=None, dmin=None, batch=None): # calculate ewald offset and s1 s0 = np.array([0, 0, inverse_wavelength]).T - q = hkl @ A.T # == (A @ hkl.T).T + q = hkl @ A.T # == (A @ hkl.T).T s1 = q + s0 s1x, s1y, s1z = s1.T s1_norm = np.sqrt(s1x * s1x + s1y * s1y + s1z * s1z) # project calculated s1 onto the ewald sphere - s1_obs = inverse_wavelength * s1 / s1_norm[:,None] + s1_obs = inverse_wavelength * s1 / s1_norm[:, None] # Compute the ewald offset vector eov = s1_obs - s1 @@ -313,21 +356,50 @@ def crystal_to_data(crystal, wavelength, cell=None, dmin=None, batch=None): q_obs = s1_obs - s0 ao = eo_sign * rs.utils.angle_between(q, q_obs) - I = refls[:,3] - SigI = refls[:,4] - bg = refls[:,5] + I = refls[:, 3] + SigI = refls[:, 4] + bg = refls[:, 5] if batch is not None: - batch = batch * np.ones_like(I, dtype='int32') - return np.concatenate((hkl, I[:,None], SigI[:,None], s1, eo[:,None], ao[:,None], refls[:,7,None], refls[:,8,None], batch[:,None]), axis=1) - return np.concatenate((hkl, I[:,None], SigI[:,None], s1, eo[:,None], ao[:,None], refls[:,7,None], refls[:,8,None]), axis=1) + batch = batch * np.ones_like(I, dtype="int32") + return np.concatenate( + ( + hkl, + I[:, None], + SigI[:, None], + s1, + eo[:, None], + ao[:, None], + refls[:, 7, None], + refls[:, 8, None], + batch[:, None], + ), + axis=1, + ) + return np.concatenate( + ( + hkl, + I[:, None], + SigI[:, None], + s1, + eo[:, None], + ao[:, None], + refls[:, 7, None], + refls[:, 8, None], + ), + axis=1, + ) + + def to_dataset(self, spacegroup: Optional[gemmi.SpaceGroup] = None) -> rs.DataSet: + """Convert self.stream_file to an rs DataSet. Optionally set the spacegroup""" - def to_dataset(self, spacegroup : Optional[gemmi.SpaceGroup] = None) -> rs.DataSet: - """ Convert self.stream_file to an rs DataSet. Optionally set the spacegroup """ def data_gen(): - for i,crystal in enumerate(StreamLoader.to_crystals(self.stream_file)): - ds = StreamLoader.crystal_to_data(crystal, self.wavelength, self.cell, batch=i+1) + for i, crystal in enumerate(StreamLoader.to_crystals(self.stream_file)): + ds = StreamLoader.crystal_to_data( + crystal, self.wavelength, self.cell, batch=i + 1 + ) yield ds + data = np.concatenate(list(data_gen()), axis=0) names = [ @@ -345,9 +417,12 @@ def data_gen(): "YDET", "BATCH", ] - ds = rs.DataSet(data, columns=names, cell=self.cell, spacegroup=spacegroup, merged=False).infer_mtz_dtypes() + ds = rs.DataSet( + data, columns=names, cell=self.cell, spacegroup=spacegroup, merged=False + ).infer_mtz_dtypes() return ds + def read_crystfel(streamfile: str, spacegroup=None) -> DataSet: """ Initialize attributes and populate the DataSet object with data from a CrystFEL stream with indexed reflections. @@ -371,5 +446,3 @@ def read_crystfel(streamfile: str, spacegroup=None) -> DataSet: ds = loader.to_dataset(spacegroup=spacegroup) ds.set_index(["H", "K", "L"], inplace=True) return ds - -