Merge pull request #1147 from lsst/tickets/DM-35396
DM-35396: Add provenance hooks for datastore to use

timj authored Feb 4, 2025
2 parents 19513f1 + dd3996e commit 84e7e1b
Showing 25 changed files with 289 additions and 30 deletions.
6 changes: 3 additions & 3 deletions .pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
      - id: end-of-file-fixer
      - id: trailing-whitespace
  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 24.10.0
    rev: 25.1.0
    hooks:
      - id: black
        # It is recommended to specify the latest version of Python
@@ -17,13 +17,13 @@ repos:
        # https://pre-commit.com/#top_level-default_language_version
        language_version: python3.11
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    rev: 6.0.0
    hooks:
      - id: isort
        name: isort (python)
  - repo: https://github.com/astral-sh/ruff-pre-commit
    # Ruff version.
    rev: v0.9.1
    rev: v0.9.3
    hooks:
      - id: ruff
  - repo: https://github.com/numpy/numpydoc
4 changes: 4 additions & 0 deletions doc/changes/DM-35396.feature.rst
@@ -0,0 +1,4 @@
* Added new class ``DatasetProvenance`` for tracking the provenance of an individual dataset.
* Modified ``Butler.put()`` to accept an optional ``DatasetProvenance``.
* Added ``add_provenance`` methods to ``FormatterV2`` and ``StorageClassDelegate``.
  These methods will now be called with the provenance object during ``Butler.put()`` to allow the in-memory dataset to be updated prior to writing.
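Taken together, a minimal sketch of how these hooks are exercised at call time. The repository path, dataset type name, data ID values, and the exposure/input_refs variables are hypothetical, and the exact Butler construction idiom may differ by version:

import uuid

from lsst.daf.butler import Butler, DatasetProvenance

butler = Butler("my_repo", writeable=True)  # "my_repo" is a hypothetical repository
provenance = DatasetProvenance(quantum_id=uuid.uuid4())
for input_ref in input_refs:  # input_refs: resolved DatasetRefs obtained earlier (hypothetical)
    provenance.add_input(input_ref)

# Formatters and storage class delegates that implement add_provenance()
# may embed this information in the serialized output.
butler.put(exposure, "calexp", visit=42, instrument="HSC", provenance=provenance)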
1 change: 1 addition & 0 deletions python/lsst/daf/butler/__init__.py
@@ -47,6 +47,7 @@
from ._config_support import LookupKey
from ._dataset_association import *
from ._dataset_existence import *
from ._dataset_provenance import *
from ._dataset_ref import *
from ._dataset_type import *
from ._deferredDatasetHandle import *
5 changes: 5 additions & 0 deletions python/lsst/daf/butler/_butler.py
@@ -59,6 +59,7 @@

if TYPE_CHECKING:
    from ._dataset_existence import DatasetExistence
    from ._dataset_provenance import DatasetProvenance
    from ._dataset_ref import DatasetId, DatasetRef
    from ._dataset_type import DatasetType
    from ._deferredDatasetHandle import DeferredDatasetHandle
@@ -664,6 +665,7 @@ def put(
        dataId: DataId | None = None,
        *,
        run: str | None = None,
        provenance: DatasetProvenance | None = None,
        **kwargs: Any,
    ) -> DatasetRef:
        """Store and register a dataset.
@@ -683,6 +685,9 @@
        run : `str`, optional
            The name of the run the dataset should be added to, overriding
            ``self.run``. Not used if a resolved `DatasetRef` is provided.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Not supported by all serialization mechanisms.
        **kwargs
            Additional keyword arguments used to augment or construct a
            `DataCoordinate`. See `DataCoordinate.standardize`
84 changes: 84 additions & 0 deletions python/lsst/daf/butler/_dataset_provenance.py
@@ -0,0 +1,84 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("DatasetProvenance",)

import typing
import uuid

import pydantic

from ._dataset_ref import DatasetRef, SerializedDatasetRef


class DatasetProvenance(pydantic.BaseModel):
    """Provenance of a single `DatasetRef`."""

    inputs: list[SerializedDatasetRef] = pydantic.Field(default_factory=list)
    """The input datasets."""
    quantum_id: uuid.UUID | None = None
    """Identifier of the Quantum that was executed."""
    extras: dict[uuid.UUID, dict[str, int | float | str | bool]] = pydantic.Field(default_factory=dict)
    """Extra provenance information associated with a particular dataset."""
    _uuids: set[uuid.UUID] = pydantic.PrivateAttr(default_factory=set)

    @pydantic.model_validator(mode="after")
    def populate_cache(self) -> typing.Self:
        for ref in self.inputs:
            self._uuids.add(ref.id)
        return self

    def add_input(self, ref: DatasetRef) -> None:
        """Add an input dataset to the provenance.

        Parameters
        ----------
        ref : `DatasetRef`
            A dataset to register as an input.
        """
        if ref.id in self._uuids:
            # Already registered.
            return
        self._uuids.add(ref.id)
        self.inputs.append(ref.to_simple())

    def add_extra_provenance(self, dataset_id: uuid.UUID, extra: dict[str, int | float | str | bool]) -> None:
        """Attach extra provenance to a specific dataset.

        Parameters
        ----------
        dataset_id : `uuid.UUID`
            The ID of the dataset to receive this provenance.
        extra : `dict` [ `str`, `typing.Any` ]
            The extra provenance information as a dictionary. The values
            must be simple Python scalars.
        """
        if dataset_id not in self._uuids:
            raise ValueError(f"The given dataset ID {dataset_id} is not known to this provenance instance.")
        self.extras.setdefault(dataset_id, {}).update(extra)
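A short usage sketch for the class above; input_ref stands in for a resolved DatasetRef obtained elsewhere:

import uuid

from lsst.daf.butler import DatasetProvenance

prov = DatasetProvenance(quantum_id=uuid.uuid4())
prov.add_input(input_ref)  # input_ref: a resolved DatasetRef (hypothetical)
prov.add_input(input_ref)  # a second call with the same ID is a no-op

# Extra provenance is keyed by a known dataset ID and must hold simple scalars.
prov.add_extra_provenance(input_ref.id, {"exposure_time": 30.0, "detector": 42})

# An ID that was never registered raises ValueError:
# prov.add_extra_provenance(uuid.uuid4(), {"x": 1})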
35 changes: 35 additions & 0 deletions python/lsst/daf/butler/_formatter.py
@@ -60,6 +60,7 @@
log = logging.getLogger(__name__)

if TYPE_CHECKING:
    from ._dataset_provenance import DatasetProvenance
    from ._dataset_ref import DatasetRef
    from ._dataset_type import DatasetType
    from ._storage_class import StorageClass
@@ -98,6 +99,7 @@ class FormatterV2:
        Parameters to control how the dataset is serialized.
    write_recipes : `dict`, optional
        Detailed write recipes indexed by recipe name.
    **kwargs
        Additional arguments that will be ignored but allow for
        `Formatter` V1 parameters to be given.
@@ -863,11 +865,38 @@ def read_from_local_file(self, path: str, component: str | None = None, expected
"""
return NotImplemented

    def add_provenance(
        self, in_memory_dataset: Any, /, *, provenance: DatasetProvenance | None = None
    ) -> Any:
        """Add provenance to the dataset.

        Parameters
        ----------
        in_memory_dataset : `object`
            The dataset to serialize.
        provenance : `DatasetProvenance` or `None`, optional
            Provenance to attach to dataset.

        Returns
        -------
        dataset_to_write : `object`
            The dataset to use for serialization. Can be the same object as
            given.

        Notes
        -----
        The base class implementation returns the given object unchanged.
        """
        return in_memory_dataset

    @final
    def write(
        self,
        in_memory_dataset: Any,
        /,
        *,
        cache_manager: AbstractDatastoreCacheManager | None = None,
        provenance: DatasetProvenance | None = None,
    ) -> None:
        """Write a Dataset.
@@ -878,6 +907,8 @@ def write(
        cache_manager : `AbstractDatastoreCacheManager`
            A cache manager to use to allow a formatter to cache the written
            file.
        provenance : `DatasetProvenance` | `None`, optional
            Provenance to attach to the file being written.

        Raises
        ------
@@ -895,6 +926,10 @@ def write(
        # Ensure we are using the correct file extension.
        uri = self.file_descriptor.location.uri.updatedExtension(self.get_write_extension())

        # Attach any provenance to the dataset. This could involve returning
        # a different object.
        in_memory_dataset = self.add_provenance(in_memory_dataset, provenance=provenance)

        written = self.write_direct(in_memory_dataset, uri, cache_manager)
        if not written:
            self.write_locally_then_move(in_memory_dataset, uri, cache_manager)
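To illustrate the new formatter hook, a sketch of a FormatterV2 subclass that embeds provenance in the object before it is written. The class name and the assumption that the in-memory dataset exposes a metadata mapping are illustrative, not part of this change:

from typing import Any

from lsst.daf.butler import DatasetProvenance, FormatterV2


class MetadataEmbeddingFormatter(FormatterV2):
    """Hypothetical formatter for objects exposing a ``metadata`` mapping."""

    def add_provenance(
        self, in_memory_dataset: Any, /, *, provenance: DatasetProvenance | None = None
    ) -> Any:
        if provenance is None:
            return in_memory_dataset
        # Record input dataset IDs so they are serialized with the object.
        in_memory_dataset.metadata["INPUT_IDS"] = [str(ref.id) for ref in provenance.inputs]
        if provenance.quantum_id is not None:
            in_memory_dataset.metadata["QUANTUM_ID"] = str(provenance.quantum_id)
        return in_memory_dataset

Because write() calls add_provenance() immediately before serialization, a subclass may mutate the object in place or return a copy; the hunk above notes that a different object may be returned.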
9 changes: 6 additions & 3 deletions python/lsst/daf/butler/_limited_butler.py
@@ -36,6 +36,7 @@

from lsst.resources import ResourcePath

from ._dataset_provenance import DatasetProvenance
from ._dataset_ref import DatasetRef
from ._deferredDatasetHandle import DeferredDatasetHandle
from ._storage_class import StorageClass, StorageClassFactory
Expand Down Expand Up @@ -64,7 +65,7 @@ def isWriteable(self) -> bool:
        raise NotImplementedError()

    @abstractmethod
    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
    def put(self, obj: Any, ref: DatasetRef, /, *, provenance: DatasetProvenance | None = None) -> DatasetRef:
        """Store a dataset that already has a UUID and ``RUN`` collection.

        Parameters
@@ -73,6 +74,9 @@ def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
            The dataset.
        ref : `DatasetRef`
            Resolved reference for a not-yet-stored dataset.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Not supported by all serialization mechanisms.

        Returns
        -------
@@ -261,8 +265,7 @@ def getURI(

        if primary is None or components:
            raise RuntimeError(
                f"Dataset ({ref}) includes distinct URIs for components. "
                "Use LimitedButler.getURIs() instead."
                f"Dataset ({ref}) includes distinct URIs for components. Use LimitedButler.getURIs() instead."
            )
        return primary

5 changes: 3 additions & 2 deletions python/lsst/daf/butler/_quantum_backed.py
@@ -43,6 +43,7 @@

from ._butler_config import ButlerConfig
from ._config import Config
from ._dataset_provenance import DatasetProvenance
from ._dataset_ref import DatasetId, DatasetRef
from ._dataset_type import DatasetType
from ._deferredDatasetHandle import DeferredDatasetHandle
@@ -453,11 +454,11 @@ def dimensions(self) -> DimensionUniverse:
        # Docstring inherited.
        return self._dimensions

    def put(self, obj: Any, ref: DatasetRef, /) -> DatasetRef:
    def put(self, obj: Any, ref: DatasetRef, /, *, provenance: DatasetProvenance | None = None) -> DatasetRef:
        # Docstring inherited.
        if ref.id not in self._predicted_outputs:
            raise RuntimeError("Cannot `put` dataset that was not predicted as an output.")
        self._datastore.put(obj, ref)
        self._datastore.put(obj, ref, provenance=provenance)
        self._actual_output_refs.add(ref)
        return ref

29 changes: 29 additions & 0 deletions python/lsst/daf/butler/_storage_class_delegate.py
@@ -40,6 +40,8 @@
from lsst.utils.introspection import get_full_type_name

if TYPE_CHECKING:
    from lsst.daf.butler import DatasetProvenance, DatasetRef

    from ._storage_class import StorageClass

log = logging.getLogger(__name__)
@@ -333,6 +335,33 @@ def disassemble(

        return components

    def add_provenance(
        self, inMemoryDataset: Any, ref: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> Any:
        """Add provenance to the composite dataset.

        Parameters
        ----------
        inMemoryDataset : `object`
            The composite dataset to serialize.
        ref : `DatasetRef`
            The dataset associated with this in-memory dataset.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Can be ignored by a delegate.

        Returns
        -------
        dataset_to_disassemble : `object`
            The dataset to use for serialization and disassembly.
            Can be the same object as given.

        Notes
        -----
        The base class implementation returns the given object unchanged.
        """
        return inMemoryDataset

    def handleParameters(self, inMemoryDataset: Any, parameters: Mapping[str, Any] | None = None) -> Any:
        """Modify the in-memory dataset using the supplied parameters.
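Similarly, a sketch of a StorageClassDelegate subclass using the new hook; the class name and the reserved "_provenance" key are assumptions for illustration:

from typing import Any

from lsst.daf.butler import DatasetProvenance, DatasetRef, StorageClassDelegate


class DictEmbeddingDelegate(StorageClassDelegate):
    """Hypothetical delegate for composites stored as plain dicts."""

    def add_provenance(
        self, inMemoryDataset: Any, ref: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> Any:
        if provenance is None:
            return inMemoryDataset
        # Stash provenance under a reserved key so it survives
        # disassembly and serialization of the components.
        inMemoryDataset["_provenance"] = {
            "dataset_id": str(ref.id),
            "inputs": [str(i.id) for i in provenance.inputs],
        }
        return inMemoryDataset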
12 changes: 10 additions & 2 deletions python/lsst/daf/butler/datastore/_datastore.py
@@ -61,6 +61,7 @@

    from .. import ddl
    from .._config_support import LookupKey
    from .._dataset_provenance import DatasetProvenance
    from .._dataset_ref import DatasetRef
    from .._dataset_type import DatasetType
    from .._storage_class import StorageClass
@@ -626,7 +627,9 @@ def prepare_get_for_external_client(self, ref: DatasetRef) -> object | None:
        raise NotImplementedError()

    @abstractmethod
    def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
    def put(
        self, inMemoryDataset: Any, datasetRef: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> None:
        """Write an `InMemoryDataset` with a given `DatasetRef` to the store.

        Parameters
@@ -635,6 +638,9 @@ def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
            The Dataset to store.
        datasetRef : `DatasetRef`
            Reference to the associated Dataset.
        provenance : `DatasetProvenance` or `None`, optional
            Any provenance that should be attached to the serialized dataset.
            Not supported by all serialization mechanisms.
        """
        raise NotImplementedError("Must be implemented by subclass")

@@ -1449,7 +1455,9 @@ def get(
    ) -> Any:
        raise FileNotFoundError("This is a no-op datastore that can not access a real datastore")

    def put(self, inMemoryDataset: Any, datasetRef: DatasetRef) -> None:
    def put(
        self, inMemoryDataset: Any, datasetRef: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> None:
        raise NotImplementedError("This is a no-op datastore that can not access a real datastore")

    def put_new(self, in_memory_dataset: Any, ref: DatasetRef) -> Mapping[str, DatasetRef]:
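For context, how a concrete datastore implementation might thread the new argument through to its formatter; the class and the _formatter_for helper are hypothetical stand-ins for real datastore internals:

from typing import Any

from lsst.daf.butler import DatasetProvenance, DatasetRef


class MiniFileDatastore:
    """Hypothetical fragment of a file-backed datastore."""

    def put(
        self, inMemoryDataset: Any, datasetRef: DatasetRef, provenance: DatasetProvenance | None = None
    ) -> None:
        # _formatter_for is a hypothetical lookup of the configured
        # FormatterV2 instance for this dataset type.
        formatter = self._formatter_for(datasetRef)
        # FormatterV2.write() calls add_provenance() before serializing,
        # so a datastore only needs to pass the object along.
        formatter.write(inMemoryDataset, provenance=provenance)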