Merge pull request #137 from renecotyfanboy/data-upgrade
Data upgrade
renecotyfanboy authored Mar 19, 2024
2 parents 03b713d + 2b5d00b commit 0bcf86f
Showing 23 changed files with 457 additions and 131 deletions.
1 change: 1 addition & 0 deletions .dockerignore
@@ -2,6 +2,7 @@
*.npz
*.arf
*.rmf
*.lock

data
data_to_test
2 changes: 2 additions & 0 deletions .github/workflows/test-and-coverage.yml
@@ -25,6 +25,8 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
submodules: 'recursive'
- name: Set up Python 3.10
uses: actions/setup-python@v4
with:
3 changes: 3 additions & 0 deletions .gitmodules
@@ -0,0 +1,3 @@
[submodule "tests/data"]
path = tests/data
url = https://github.com/HEACIT/curated-test-data
Binary file modified docs/examples/statics/subtract_background_with_errors.png
8 changes: 8 additions & 0 deletions docs/references/data.md
@@ -34,3 +34,11 @@
show_root_heading: false
show_root_toc_entry: false
heading_level: 3

## OGIP data parsers

::: jaxspec.data.ogip
options:
show_root_heading: false
show_root_toc_entry: false
heading_level: 3
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "jaxspec"
version = "0.0.2"
version = "0.0.3"
description = "jaxspec is a bayesian spectral fitting library for X-ray astronomy."
authors = ["sdupourque <sdupourque@irap.omp.eu>"]
license = "MIT"
@@ -26,6 +26,7 @@ jaxopt = "^0.8.1"
tinygp = "^0.3.0"
seaborn = "^0.13.1"
mkdocstrings = "^0.24.0"
sparse = "^0.15.1"


[tool.poetry.group.docs.dependencies]
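
For context (not part of the diff): the new sparse dependency is the pydata/sparse package that backs the COO-formatted response matrices used in the changes below. A minimal sketch of the COO/CSR round trip those changes rely on, with toy values:

import numpy as np
import sparse

# A toy redistribution matrix: mostly zeros, as a real RMF is.
dense = np.zeros((4, 3))
dense[0, 0] = 0.7
dense[1, 0] = 0.3
dense[2, 2] = 1.0

coo = sparse.COO.from_numpy(dense)            # pydata/sparse COO array
csr = coo.to_scipy_sparse().tocsr()           # scipy CSR gives fast row/column slicing
reduced = csr[np.array([0, 1])][:, np.array([0, 2])]  # keep selected rows, then columns
back = sparse.COO.from_scipy_sparse(reduced)  # back to COO for storage inside xarray

print(back.todense())
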
2 changes: 1 addition & 1 deletion src/jaxspec/analysis/results.py
@@ -343,7 +343,7 @@ def plot_ppc(self, percentile: Tuple[int, int] = (14, 86)) -> plt.Figure:
folding_model.out_energies,
y_observed=folding_model.folded_background.data,
y_samples=bkg_count,
denominator=denominator / folding_model.backratio.data,
denominator=denominator * folding_model.folded_backratio.data,
color=(0.26787604, 0.60085972, 0.63302651),
percentile=percentile,
)
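
One way to read this change (an illustration with made-up numbers, not part of the diff): the posterior-predictive background is now rescaled with the per-bin folded backratio, i.e. counts measured in the background region are scaled to the source extraction region by BACKSCAL_src / BACKSCAL_bkg.

import numpy as np

# Hypothetical per-bin BACKSCAL values, for illustration only.
src_backscal = np.array([0.02, 0.02, 0.04])
bkg_backscal = np.array([0.10, 0.10, 0.10])
backratio = src_backscal / bkg_backscal       # [0.2, 0.2, 0.4]

bkg_region_counts = np.array([50.0, 40.0, 30.0])
expected_in_src_region = bkg_region_counts * backratio
print(expected_in_src_region)                 # [10.  8. 12.]
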
3 changes: 3 additions & 0 deletions src/jaxspec/data/__init__.py
@@ -5,3 +5,6 @@
import astropy.units as u

u.add_enabled_aliases({"counts": u.count})
u.add_enabled_aliases({"channel": u.dimensionless_unscaled})
# Arbitrary units are found in .rsp files; let's hope they are compatible with what we would expect as the rmf x arf
# u.add_enabled_aliases({"au": u.dimensionless_unscaled})
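
For context (not part of the diff): add_enabled_aliases is a standard astropy hook; with the aliases above, unit strings commonly found in OGIP headers parse cleanly. A minimal sketch:

import astropy.units as u

u.add_enabled_aliases({"counts": u.count})
u.add_enabled_aliases({"channel": u.dimensionless_unscaled})

print(u.Unit("counts"))   # resolves to astropy's 'count' unit
print(u.Unit("channel"))  # resolves to a dimensionless unit
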
23 changes: 14 additions & 9 deletions src/jaxspec/data/instrument.py
@@ -1,6 +1,7 @@
import os
import numpy as np
import xarray as xr
from matplotlib import colors
from .ogip import DataARF, DataRMF


@@ -64,21 +65,26 @@ def from_matrix(cls, redistribution_matrix, spectral_response, e_min_unfolded, e
)

@classmethod
def from_ogip_file(cls, arf_file: str | os.PathLike, rmf_file: str | os.PathLike, **kwargs):
def from_ogip_file(cls, rmf_path: str | os.PathLike, arf_path: str | os.PathLike = None):
"""
Load the data from OGIP files.
Parameters:
arf_file: The ARF file path.
rmf_file: The RMF file path.
rmf_path: The RMF file path.
arf_path: The ARF file path.
exposure: The exposure time in second.
grouping: The grouping matrix.
"""

arf = DataARF.from_file(arf_file)
rmf = DataRMF.from_file(rmf_file)
rmf = DataRMF.from_file(rmf_path)

return cls.from_matrix(rmf.matrix, arf.specresp, rmf.energ_lo, rmf.energ_hi, rmf.e_min, rmf.e_max)
if arf_path is not None:
specresp = DataARF.from_file(arf_path).specresp

else:
specresp = np.ones(rmf.energ_lo.shape)

return cls.from_matrix(rmf.sparse_matrix, specresp, rmf.energ_lo, rmf.energ_hi, rmf.e_min, rmf.e_max)

def plot_redistribution(self, **kwargs):
"""
@@ -95,9 +101,8 @@ def plot_redistribution(self, **kwargs):
y="e_max_channel",
xscale="log",
yscale="log",
cmap=cmr.cosmic,
vmin=0,
vmax=0.075,
cmap=cmr.ember_r,
norm=colors.LogNorm(vmin=1e-6, vmax=1),
add_labels=True,
**kwargs,
)
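
A usage sketch of the revised constructor (file paths are placeholders, not from the diff): the ARF is now optional, and when it is omitted the effective area defaults to ones over the RMF energy grid.

from jaxspec.data.instrument import Instrument

# RMF and ARF
instrument = Instrument.from_ogip_file("obs.rmf", arf_path="obs.arf")

# RMF only: specresp falls back to np.ones over the RMF energy bins
instrument_rmf_only = Instrument.from_ogip_file("obs.rmf")
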
64 changes: 46 additions & 18 deletions src/jaxspec/data/obsconf.py
@@ -1,9 +1,14 @@
import numpy as np
import xarray as xr
import sparse
from .instrument import Instrument
from .observation import Observation


def densify_xarray(xarray):
return xr.DataArray(xarray.data.todense(), dims=xarray.dims, coords=xarray.coords, attrs=xarray.attrs, name=xarray.name)


class ObsConfiguration(xr.Dataset):
"""
Class to store the data of a folding model, which is the link between the unfolded and folded spectra.
@@ -51,20 +56,34 @@ def out_energies(self):

out_energies = np.stack(
(
np.asarray(self.coords["e_min_folded"], dtype=np.float64),
np.asarray(self.coords["e_max_folded"], dtype=np.float64),
np.asarray(self.coords["e_min_folded"].data.todense(), dtype=np.float64),
np.asarray(self.coords["e_max_folded"].data.todense(), dtype=np.float64),
)
)

return out_energies

@classmethod
def from_pha_file(cls, pha_file, low_energy: float = 1e-20, high_energy: float = 1e20):
def from_pha_file(
cls, pha_path, rmf_path=None, arf_path=None, bkg_path=None, low_energy: float = 1e-20, high_energy: float = 1e20
):
from .util import data_loader

pha, arf, rmf, bkg, metadata = data_loader(pha_file)
pha, arf, rmf, bkg, metadata = data_loader(pha_path, arf_path=arf_path, rmf_path=rmf_path, bkg_path=bkg_path)

instrument = Instrument.from_matrix(rmf.matrix, arf.specresp, rmf.energ_lo, rmf.energ_hi, rmf.e_min, rmf.e_max)
instrument = Instrument.from_matrix(
rmf.sparse_matrix,
arf.specresp if arf is not None else np.ones_like(rmf.energ_lo),
rmf.energ_lo,
rmf.energ_hi,
rmf.e_min,
rmf.e_max,
)

if bkg is not None:
backratio = np.where(bkg.backscal > 0.0, pha.backscal / np.where(bkg.backscal > 0, bkg.backscal, 1.0), 0.0)
else:
backratio = np.ones_like(pha.counts)

observation = Observation.from_matrix(
pha.counts,
@@ -73,7 +92,7 @@ def from_pha_file(cls, pha_file, low_energy: float = 1e-20, high_energy: float =
pha.quality,
pha.exposure,
background=bkg.counts if bkg is not None else None,
backratio=pha.backscal / bkg.backscal if bkg is not None else 1.0,
backratio=backratio,
attributes=metadata,
)

@@ -83,33 +102,42 @@ def from_pha_file(cls, pha_file, low_energy: float = 1e-20, high_energy: float =
def from_instrument(
cls, instrument: Instrument, observation: Observation, low_energy: float = 1e-20, high_energy: float = 1e20
):
grouping = observation.grouping.copy()
# Exclude the bins flagged with bad quality
quality_filter = observation.quality == 0
grouping = observation.grouping * quality_filter

# Computing the lower and upper energies of the bins after grouping
# This is just a trick to compute it without 10 lines of code
e_min = (xr.where(grouping > 0, grouping, np.nan) * instrument.coords["e_min_channel"]).min(
skipna=True, dim="instrument_channel"
)

e_max = (xr.where(grouping > 0, grouping, np.nan) * instrument.coords["e_max_channel"]).max(
skipna=True, dim="instrument_channel"
)

transfer_matrix = grouping @ (instrument.redistribution * instrument.area * observation.exposure)
transfer_matrix = transfer_matrix.assign_coords({"e_min_folded": e_min, "e_max_folded": e_max})

# Exclude the bins flagged with bad quality
quality_filter = observation.quality == 0
grouping[:, ~quality_filter] = 0
# Exclude bins out of the considered energy range, and bins without contribution from the RMF
row_idx = densify_xarray(((e_min > low_energy) & (e_max < high_energy)) * (grouping.sum(dim="instrument_channel") > 0))

col_idx = densify_xarray(
(instrument.coords["e_min_unfolded"] > 0) * (instrument.redistribution.sum(dim="instrument_channel") > 0)
)

# The transfer matrix is converted locally to csr format to allow FAST slicing
transfer_matrix_scipy = transfer_matrix.data.to_scipy_sparse().tocsr()
transfer_matrix_reduced = transfer_matrix_scipy[row_idx.data][:, col_idx.data]
transfer_matrix_reduced = sparse.COO.from_scipy_sparse(transfer_matrix_reduced)

row_idx = xr.ones_like(e_min, dtype=bool)
row_idx *= (e_min > low_energy) & (e_max < high_energy) # Strict exclusion as in XSPEC
row_idx *= grouping.sum(dim="instrument_channel") > 0 # Exclude channels with no contribution
# A dummy zero matrix is put so that the slicing in xarray is fast
transfer_matrix.data = sparse.zeros_like(transfer_matrix.data)
transfer_matrix = transfer_matrix[row_idx][:, col_idx]

col_idx = xr.ones_like(instrument.area, dtype=bool)
col_idx *= instrument.coords["e_min_unfolded"] > 0.0 # Exclude channels with 0. as lower energy bound
col_idx *= instrument.redistribution.sum(dim="instrument_channel") > 0 # Exclude channels with no contribution
# The reduced transfer matrix is put back in the xarray
transfer_matrix.data = transfer_matrix_reduced

transfer_matrix = transfer_matrix.where(row_idx & col_idx, drop=True)
folded_counts = observation.folded_counts.copy().where(row_idx, drop=True)

if observation.folded_background is not None:
Expand All @@ -123,7 +151,7 @@ def from_instrument(
"transfer_matrix": transfer_matrix,
"area": instrument.area.copy().where(col_idx, drop=True),
"exposure": observation.exposure,
"backratio": observation.backratio,
"folded_backratio": observation.folded_backratio.copy().where(row_idx, drop=True),
"folded_counts": folded_counts,
"folded_background": folded_background,
}
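
A usage sketch of the extended signature (paths are placeholders, not from the diff): the RMF, ARF and background files can now be passed explicitly, presumably falling back to whatever data_loader resolves from the PHA header when they are omitted.

from jaxspec.data.obsconf import ObsConfiguration

obsconf = ObsConfiguration.from_pha_file(
    "obs.pha",
    rmf_path="obs.rmf",
    arf_path="obs.arf",
    bkg_path="obs_bkg.pha",
    low_energy=0.5,
    high_energy=10.0,
)
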
33 changes: 21 additions & 12 deletions src/jaxspec/data/observation.py
@@ -43,39 +43,48 @@ def from_matrix(
attributes = {}

if background is None:
background = np.zeros_like(counts, dtype=int)
background = np.zeros_like(counts, dtype=np.int64)

data_dict = {
"counts": (["instrument_channel"], np.array(counts, dtype=np.int64), {"description": "Counts", "unit": "photons"}),
"counts": (["instrument_channel"], np.asarray(counts, dtype=np.int64), {"description": "Counts", "unit": "photons"}),
"folded_counts": (
["folded_channel"],
np.array(grouping @ counts, dtype=int),
np.asarray(np.ma.filled(grouping @ counts), dtype=np.int64),
{"description": "Folded counts, after grouping", "unit": "photons"},
),
"grouping": (
["folded_channel", "instrument_channel"],
np.array(grouping, dtype=bool),
grouping,
{"description": "Grouping matrix."},
),
"quality": (["instrument_channel"], np.array(quality, dtype=int), {"description": "Quality flag."}),
"quality": (["instrument_channel"], np.asarray(quality, dtype=np.int64), {"description": "Quality flag."}),
"exposure": ([], float(exposure), {"description": "Total exposure", "unit": "s"}),
"backratio": ([], float(backratio), {"description": "Background scaling (SRC_BACKSCAL/BKG_BACKSCAL)"}),
"backratio": (
["instrument_channel"],
np.asarray(backratio, dtype=float),
{"description": "Background scaling (SRC_BACKSCAL/BKG_BACKSCAL)"},
),
"folded_backratio": (
["folded_channel"],
np.asarray(np.ma.filled(grouping @ backratio), dtype=float),
{"description": "Background scaling after grouping"},
),
"background": (
["instrument_channel"],
np.array(background, dtype=int),
np.asarray(background, dtype=np.int64),
{"description": "Background counts", "unit": "photons"},
),
"folded_background": (
["folded_channel"],
np.array(grouping @ background, dtype=int),
np.asarray(np.ma.filled(grouping @ background), dtype=np.int64),
{"description": "Background counts", "unit": "photons"},
),
}

return cls(
data_dict,
coords={
"channel": (["instrument_channel"], np.array(channel, dtype=np.int64), {"description": "Channel number"}),
"channel": (["instrument_channel"], np.asarray(channel, dtype=np.int64), {"description": "Channel number"}),
"grouped_channel": (
["folded_channel"],
np.arange(len(grouping @ counts), dtype=np.int64),
@@ -92,9 +101,9 @@ def from_pha_file(cls, pha_file: str | os.PathLike, **kwargs):
pha, arf, rmf, bkg, metadata = data_loader(pha_file)

if bkg is not None:
backratio = (pha.backscal * pha.exposure * pha.areascal) / (bkg.backscal * bkg.exposure * bkg.areascal)
backratio = np.nan_to_num((pha.backscal * pha.exposure * pha.areascal) / (bkg.backscal * bkg.exposure * bkg.areascal))
else:
backratio = 1.0
backratio = np.ones_like(pha.counts)

return cls.from_matrix(
pha.counts,
@@ -133,7 +142,7 @@ def plot_grouping(self):
ax = fig.add_subplot(gs[1, 0])
ax_histx = fig.add_subplot(gs[0, 0], sharex=ax)
ax_histy = fig.add_subplot(gs[1, 1], sharey=ax)
sns.heatmap(self.grouping.T, ax=ax, cbar=False)
sns.heatmap(self.grouping.data.todense().T, ax=ax, cbar=False)
ax_histx.step(np.arange(len(self.folded_counts)), self.folded_counts, where="post")
ax_histy.step(self.counts, np.arange(len(self.counts)), where="post")

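
For context (toy numbers, not part of the diff): the background scaling is now stored per instrument channel and folded through the grouping matrix, just like the counts.

import numpy as np

# Toy grouping: 4 instrument channels folded into 2 output bins.
grouping = np.array([[1, 1, 0, 0],
                     [0, 0, 1, 1]])

backratio = np.array([0.2, 0.2, 0.4, 0.4])   # one value per instrument channel
background = np.array([5, 3, 2, 6])

print(grouping @ backratio)    # [0.4 0.8]  -> folded_backratio
print(grouping @ background)   # [8 8]      -> folded_background
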