Merge pull request #1147 from lsst/tickets/DM-35396
DM-35396: Add provenance hooks for datastore to use

timj authored Feb 4, 2025
2 parents 19513f1 + dd3996e commit 84e7e1b
Showing 25 changed files with 289 additions and 30 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 24.10.0
    rev: 25.1.0
    hooks:
      - id: black
        # It is recommended to specify the latest version of Python
@@ -17,13 +17,13 @@ repos:
        # https://pre-commit.com/#top_level-default_language_version
        language_version: python3.11
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    rev: 6.0.0
    hooks:
      - id: isort
        name: isort (python)
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.9.1
    rev: v0.9.3
    hooks:
      - id: ruff
  - repo: https://github.com/numpy/numpydoc
4 changes: 4 additions & 0 deletions doc/changes/DM-35396.feature.rst
@@ -0,0 +1,4 @@
* Added new class ``DatasetProvenance`` for tracking the provenance of an individual dataset.
* Modified ``Butler.put()`` to accept an optional ``DatasetProvenance``.
* Added ``add_provenance`` methods to ``FormatterV2`` and ``StorageClassDelegate``.
  These methods will now be called with the provenance object during ``Butler.put()`` to allow the in-memory dataset to be updated prior to writing.
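Taken together, a minimal sketch of how these hooks are exercised at call time. The repository path, dataset type name, data ID values, and the exposure/input_refs variables are hypothetical, and the exact Butler construction idiom may differ by version:

import uuid

from lsst.daf.butler import Butler, DatasetProvenance

butler = Butler("my_repo", writeable=True)  # "my_repo" is a hypothetical repository
provenance = DatasetProvenance(quantum_id=uuid.uuid4())
for input_ref in input_refs:  # input_refs: resolved DatasetRefs obtained earlier (hypothetical)
    provenance.add_input(input_ref)

# Formatters and storage class delegates that implement add_provenance()
# may embed this information in the serialized output.
butler.put(exposure, "calexp", visit=42, instrument="HSC", provenance=provenance)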
1 change: 1 addition & 0 deletions python/lsst/daf/butler/__init__.py
@@ -47,6 +47,7 @@
from ._config_support import LookupKey
from ._dataset_association import *
from ._dataset_existence import *
from ._dataset_provenance import *
from ._dataset_ref import *
from ._dataset_type import *
from ._deferredDatasetHandle import *
5 changes: 5 additions & 0 deletions python/lsst/daf/butler/_butler.py
@@ -59,6 +59,7 @@

if TYPE_CHECKING:
    from ._dataset_existence import DatasetExistence
    from ._dataset_provenance import DatasetProvenance
    from ._dataset_ref import DatasetId, DatasetRef
    from ._dataset_type import DatasetType
    from ._deferredDatasetHandle import DeferredDatasetHandle
@@ -664,6 +665,7 @@ def put(
        dataId: DataId | None = None,
        *,
        run: str | None = None,
        provenance: DatasetProvenance | None = None,
        **kwargs: Any,
    ) -> DatasetRef:
        """Store and register a dataset.
@@ -683,6 +685,9 @@
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``. Not used if a resolved `DatasetRef` is provided.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Not supported by all serialization mechanisms.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
84 changes: 84 additions & 0 deletions python/lsst/daf/butler/_dataset_provenance.py
@@ -0,0 +1,84 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetProvenance",)

import typing
import uuid

import pydantic

from ._dataset_ref import DatasetRef, SerializedDatasetRef


class DatasetProvenance(pydantic.BaseModel):
    """Provenance of a single `DatasetRef`."""

    inputs: list[SerializedDatasetRef] = pydantic.Field(default_factory=list)
    """The input datasets."""
    quantum_id: uuid.UUID | None = None
    """Identifier of the Quantum that was executed."""
    extras: dict[uuid.UUID, dict[str, int | float | str | bool]] = pydantic.Field(default_factory=dict)
    """Extra provenance information associated with a particular dataset."""
    _uuids: set[uuid.UUID] = pydantic.PrivateAttr(default_factory=set)

    @pydantic.model_validator(mode="after")
    def populate_cache(self) -> typing.Self:
        for ref in self.inputs:
            self._uuids.add(ref.id)
        return self

    def add_input(self, ref: DatasetRef) -> None:
        """Add an input dataset to the provenance.

        Parameters
        ----------
        ref : `DatasetRef`
            A dataset to register as an input.
        """
        if ref.id in self._uuids:
            # Already registered.
            return
        self._uuids.add(ref.id)
        self.inputs.append(ref.to_simple())

    def add_extra_provenance(self, dataset_id: uuid.UUID, extra: dict[str, int | float | str | bool]) -> None:
        """Attach extra provenance to a specific dataset.

        Parameters
        ----------
        dataset_id : `uuid.UUID`
            The ID of the dataset to receive this provenance.
        extra : `dict` [ `str`, `typing.Any` ]
            The extra provenance information as a dictionary. The values
            must be simple Python scalars.
        """
        if dataset_id not in self._uuids:
            raise ValueError(f"The given dataset ID {dataset_id} is not known to this provenance instance.")
        self.extras.setdefault(dataset_id, {}).update(extra)
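A short usage sketch for the class above; input_ref stands in for a resolved DatasetRef obtained elsewhere:

import uuid

from lsst.daf.butler import DatasetProvenance

prov = DatasetProvenance(quantum_id=uuid.uuid4())
prov.add_input(input_ref)  # input_ref: a resolved DatasetRef (hypothetical)
prov.add_input(input_ref)  # a second call with the same ID is a no-op

# Extra provenance is keyed by a known dataset ID and must hold simple scalars.
prov.add_extra_provenance(input_ref.id, {"exposure_time": 30.0, "detector": 42})

# An ID that was never registered raises ValueError:
# prov.add_extra_provenance(uuid.uuid4(), {"x": 1})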
35 changes: 35 additions & 0 deletions python/lsst/daf/butler/_formatter.py
@@ -60,6 +60,7 @@
log = logging.getLogger(__name__)

if TYPE_CHECKING:
    from ._dataset_provenance import DatasetProvenance
    from ._dataset_ref import DatasetRef
    from ._dataset_type import DatasetType
    from ._storage_class import StorageClass
@@ -98,6 +99,7 @@ class FormatterV2:
        Parameters to control how the dataset is serialized.
    write_recipes : `dict`, optional
        Detailed write recipes indexed by recipe name.
    **kwargs
        Additional arguments that will be ignored but allow for
        `Formatter` V1 parameters to be given.
@@ -863,11 +865,38 @@ def read_from_local_file(self, path: str, component: str | None = None, expected
"""
return NotImplemented

    def add_provenance(
        self, in_memory_dataset: Any, /, *, provenance: DatasetProvenance | None = None
    ) -> Any:
        """Add provenance to the dataset.

        Parameters
        ----------
        in_memory_dataset : `object`
            The dataset to serialize.
        provenance : `DatasetProvenance` or `None`, optional
            Provenance to attach to dataset.

        Returns
        -------
        dataset_to_write : `object`
            The dataset to use for serialization. Can be the same object as
            given.

        Notes
        -----
        The base class implementation returns the given object unchanged.
        """
        return in_memory_dataset

    @final
    def write(
        self,
        in_memory_dataset: Any,
        /,
        *,
        cache_manager: AbstractDatastoreCacheManager | None = None,
        provenance: DatasetProvenance | None = None,
    ) -> None:
        """Write a Dataset.
@@ -878,6 +907,8 @@ def write(
        cache_manager : `AbstractDatastoreCacheManager`
            A cache manager to use to allow a formatter to cache the written
            file.
        provenance : `DatasetProvenance` | `None`, optional
            Provenance to attach to the file being written.

        Raises
        ------
@@ -895,6 +926,10 @@ def write(
        # Ensure we are using the correct file extension.
        uri = self.file_descriptor.location.uri.updatedExtension(self.get_write_extension())

        # Attach any provenance to the dataset. This could involve returning
        # a different object.
        in_memory_dataset = self.add_provenance(in_memory_dataset, provenance=provenance)

        written = self.write_direct(in_memory_dataset, uri, cache_manager)
        if not written:
            self.write_locally_then_move(in_memory_dataset, uri, cache_manager)
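To illustrate the new formatter hook, a sketch of a FormatterV2 subclass that embeds provenance in the object before it is written. The class name and the assumption that the in-memory dataset exposes a metadata mapping are illustrative, not part of this change:

from typing import Any

from lsst.daf.butler import DatasetProvenance, FormatterV2


class MetadataEmbeddingFormatter(FormatterV2):
    """Hypothetical formatter for objects exposing a ``metadata`` mapping."""

    def add_provenance(
        self, in_memory_dataset: Any, /, *, provenance: DatasetProvenance | None = None
    ) -> Any:
        if provenance is None:
            return in_memory_dataset
        # Record input dataset IDs so they are serialized with the object.
        in_memory_dataset.metadata["INPUT_IDS"] = [str(ref.id) for ref in provenance.inputs]
        if provenance.quantum_id is not None:
            in_memory_dataset.metadata["QUANTUM_ID"] = str(provenance.quantum_id)
        return in_memory_dataset

Because write() calls add_provenance() immediately before serialization, a subclass may mutate the object in place or return a copy; the hunk above notes that a different object may be returned.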
9 changes: 6 additions & 3 deletions python/lsst/daf/butler/_limited_butler.py
@@ -36,6 +36,7 @@

from lsst.resources import ResourcePath

from ._dataset_provenance import DatasetProvenance
from ._dataset_ref import DatasetRef
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._storage_class import StorageClass, StorageClassFactory
Expand Down Expand Up @@ -64,7 +65,7 @@ def isWriteable(self) -> bool:
        raise NotImplementedError()

    @abstractmethod
    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
    def put(self, obj: Any, ref: DatasetRef, /, *, provenance: DatasetProvenance | None = None) -> DatasetRef:
        """Store a dataset that already has a UUID and ``RUN`` collection.

        Parameters
@@ -73,6 +74,9 @@ def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
            The dataset.
        ref : `DatasetRef`
            Resolved reference for a not-yet-stored dataset.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Not supported by all serialization mechanisms.

        Returns
        -------
@@ -261,8 +265,7 @@ def getURI(

        if primary is None or components:
            raise RuntimeError(
                f"Dataset ({ref}) includes distinct URIs for components. "
                "Use LimitedButler.getURIs() instead."
                f"Dataset ({ref}) includes distinct URIs for components. Use LimitedButler.getURIs() instead."
            )
        return primary

5 changes: 3 additions & 2 deletions python/lsst/daf/butler/_quantum_backed.py
@@ -43,6 +43,7 @@

from ._butler_config import ButlerConfig
from ._config import Config
from ._dataset_provenance import DatasetProvenance
from ._dataset_ref import DatasetId, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
@@ -453,11 +454,11 @@ def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
    def put(self, obj: Any, ref: DatasetRef, /, *, provenance: DatasetProvenance | None = None) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self._datastore.put(obj, ref)
        self._datastore.put(obj, ref, provenance=provenance)
        self._actual_output_refs.add(ref)
        return ref

29 changes: 29 additions & 0 deletions python/lsst/daf/butler/_storage_class_delegate.py
@@ -40,6 +40,8 @@
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetProvenance, DatasetRef

    from ._storage_class import StorageClass

log = logging.getLogger(__name__)
@@ -333,6 +335,33 @@ def disassemble(

        return components

    def add_provenance(
        self, inMemoryDataset: Any, ref: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> Any:
        """Add provenance to the composite dataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The composite dataset to serialize.
        ref : `DatasetRef`
            The dataset associated with this in-memory dataset.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Can be ignored by a delegate.

        Returns
        -------
        dataset_to_disassemble : `object`
            The dataset to use for serialization and disassembly.
            Can be the same object as given.

        Notes
        -----
        The base class implementation returns the given object unchanged.
        """
        return inMemoryDataset

    def handleParameters(self, inMemoryDataset: Any, parameters: Mapping[str, Any] | None = None) -> Any:
        """Modify the in-memory dataset using the supplied parameters.
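Similarly, a sketch of a StorageClassDelegate subclass using the new hook; the class name and the reserved "_provenance" key are assumptions for illustration:

from typing import Any

from lsst.daf.butler import DatasetProvenance, DatasetRef, StorageClassDelegate


class DictEmbeddingDelegate(StorageClassDelegate):
    """Hypothetical delegate for composites stored as plain dicts."""

    def add_provenance(
        self, inMemoryDataset: Any, ref: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> Any:
        if provenance is None:
            return inMemoryDataset
        # Stash provenance under a reserved key so it survives
        # disassembly and serialization of the components.
        inMemoryDataset["_provenance"] = {
            "dataset_id": str(ref.id),
            "inputs": [str(i.id) for i in provenance.inputs],
        }
        return inMemoryDataset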
12 changes: 10 additions & 2 deletions python/lsst/daf/butler/datastore/_datastore.py
@@ -61,6 +61,7 @@

    from .. import ddl
    from .._config_support import LookupKey
    from .._dataset_provenance import DatasetProvenance
    from .._dataset_ref import DatasetRef
    from .._dataset_type import DatasetType
    from .._storage_class import StorageClass
@@ -626,7 +627,9 @@ def prepare_get_for_external_client(self, ref: DatasetRef) -> object | None:
        raise NotImplementedError()

    @abstractmethod
    def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
    def put(
        self, inMemoryDataset: Any, datasetRef: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> None:
        """Write an `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
@@ -635,6 +638,9 @@ def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Not supported by all serialization mechanisms.
        """
        raise NotImplementedError("Must be implemented by subclass")

@@ -1449,7 +1455,9 @@ def get(
    ) -> Any:
        raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")

    def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
    def put(
        self, inMemoryDataset: Any, datasetRef: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> None:
        raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

    def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
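For context, how a concrete datastore implementation might thread the new argument through to its formatter; the class and the _formatter_for helper are hypothetical stand-ins for real datastore internals:

from typing import Any

from lsst.daf.butler import DatasetProvenance, DatasetRef


class MiniFileDatastore:
    """Hypothetical fragment of a file-backed datastore."""

    def put(
        self, inMemoryDataset: Any, datasetRef: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> None:
        # _formatter_for is a hypothetical lookup of the configured
        # FormatterV2 instance for this dataset type.
        formatter = self._formatter_for(datasetRef)
        # FormatterV2.write() calls add_provenance() before serializing,
        # so a datastore only needs to pass the object along.
        formatter.write(inMemoryDataset, provenance=provenance)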