Skip to content

Commit

Permalink
Escape prefix/sep values as needed and restrict sep values
Browse files Browse the repository at this point in the history
Can now use "*" as a separator. Previously this broke the
regex for finding the keys. Sep can no longer be alphanumeric
or an underscore.
  • Loading branch information
timj committed Feb 19, 2025
1 parent 6fec412 commit 15f7a77
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 24 deletions.
18 changes: 13 additions & 5 deletions python/lsst/daf/butler/_dataset_provenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,8 @@ def to_flat_dict(
A prefix to use for each key in the provenance dictionary.
sep : `str`, optional
Separator to use to represent hierarchy. Must be a single
character.
character. Can not be a number, letter, or underscore (to avoid
confusion with provenance keys themselves).
simple_types : `bool`, optional
If `True` only simple Python types will be used in the returned
dictionary, specifically UUIDs will be returned as `str`. If
Expand Down Expand Up @@ -169,6 +170,12 @@ def to_flat_dict(
ValueError
Raised if the separator is not a single character.
"""
if len(sep) != 1:
raise ValueError(f"Separator for provenance keys must be a single character. Got {sep!r}.")
if re.match(r"[_\w\d]$", sep):
raise ValueError(
f"Separator for provenance keys can not be word character or underscore. Got {sep!r}."
)

def _make_key(*keys: str | int) -> str:
"""Make the key in the correct form with simpler API."""
Expand Down Expand Up @@ -221,8 +228,6 @@ def _make_provenance_key(prefix: str, sep: str, *keys: str | int) -> str:
prefix (defaulting to lower case if the first character of
prefix has no case).
"""
if len(sep) != 1:
raise ValueError(f"Separator for provenance keys must be a single character. Got {sep!r}.")
use_upper = prefix[0].isupper() if prefix else False
if prefix:
prefix += sep
Expand Down Expand Up @@ -348,16 +353,19 @@ def _find_provenance_keys_in_flat_dict(cls, prov_dict: Mapping) -> dict[str, str

core_provenance = tuple(f"{prefix}{k}".lower() for k in ("run", "id", "datasettype", "quantum"))

# Need to escape the prefix and separator for regex usage.
esc_sep = re.escape(sep)
esc_prefix = re.escape(prefix)
prov_keys: dict[str, str] = {}
for k in list(prov_dict):
# the input provenance can include extra keys that we cannot
# know so have to match solely on INPUT N.
found_key = False
if re.match(rf"{prefix}input{sep}(\d+){sep}(.*)$", k, flags=re.IGNORECASE):
if re.match(rf"{esc_prefix}input{esc_sep}(\d+){esc_sep}(.*)$", k, flags=re.IGNORECASE):
found_key = True
elif k.lower() in core_provenance:
found_key = True
elif re.match(f"{prefix}dataid{sep}[a-z_]+$", k, flags=re.IGNORECASE):
elif re.match(f"{esc_prefix}dataid{esc_sep}[a-z_]+$", k, flags=re.IGNORECASE):
found_key = True

if found_key:
Expand Down
52 changes: 33 additions & 19 deletions tests/test_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -815,17 +815,23 @@ def test_dataset_provenance(self) -> None:
DatasetProvenance.strip_provenance_from_flat_dict(prov_dict)
self.assertEqual(prov_dict, {})

# Use a prefix and different separator.
prov_dict = prov.to_flat_dict(ref1, prefix="LSST BUTLER 🔭", sep=" ")
self.assertIn("LSST BUTLER 🔭 RUN", prov_dict)
self.assertIn("LSST BUTLER 🔭 INPUT 0 EXTRA_NUMBER", prov_dict)
self.assertEqual(prov_dict["LSST BUTLER 🔭 RUN"], "somerun")
self.assertEqual(prov_dict["LSST BUTLER 🔭 INPUT 0 EXTRA_NUMBER"], 42)
DatasetProvenance.strip_provenance_from_flat_dict(prov_dict)
self.assertEqual(prov_dict, {})

for prefix, sep in (
("LSST BUTLER 🔭", " "), # Unicode in prefix.
("LSST*BUTLER 🔭", " "), # regex character.
("LSST*BUTLER", "+"), # two regex characters.
("LSST_BUTLER", "\\"), # backslash for extra difficulty.
("LSST BUTLER 🔭", "→"), # Unicode separator.
):
prov_dict = prov.to_flat_dict(ref1, prefix=prefix, sep=sep)
self.assertIn(f"{prefix}{sep}RUN", prov_dict)
self.assertIn(f"{prefix}{sep}INPUT{sep}0{sep}EXTRA_NUMBER", prov_dict)
self.assertEqual(prov_dict[f"{prefix}{sep}RUN"], "somerun")
self.assertEqual(prov_dict[f"{prefix}{sep}INPUT{sep}0{sep}EXTRA_NUMBER"], 42)
DatasetProvenance.strip_provenance_from_flat_dict(prov_dict)
self.assertEqual(prov_dict, {})

# Prefix has no case so lower case assumed.
prov_dict = prov.to_flat_dict(ref1, prefix="🔭 LSST BUTLER", sep="→")

self.assertIn("🔭 LSST BUTLER→run", prov_dict)
self.assertIn("🔭 LSST BUTLER→input→0→extra_number", prov_dict)
self.assertEqual(prov_dict["🔭 LSST BUTLER→run"], "somerun")
Expand All @@ -847,13 +853,13 @@ def test_dataset_provenance(self) -> None:
self.assertEqual(prov_dict, {})

# Check that an empty provenance with a ref returns info just for
# that ref.
prov_dict = prov2.to_flat_dict(ref1, prefix="", sep=".")
# that ref. Use separator that needs escaping in a regex.
prov_dict = prov2.to_flat_dict(ref1, prefix="", sep="*")
expected = {
"id": ref1.id,
"datasettype": "test",
"dataid.instrument": "DummyCam",
"dataid.visit": 42,
"dataid*instrument": "DummyCam",
"dataid*visit": 42,
"run": "somerun",
}
self.assertEqual(prov_dict, expected)
Expand All @@ -874,18 +880,26 @@ def test_dataset_provenance(self) -> None:
DatasetProvenance.strip_provenance_from_flat_dict(prov_dict)
self.assertEqual(prov_dict, {})

prov_dict = prov3.to_flat_dict(empty_ref, prefix="xyz", sep="-")
prov_dict = prov3.to_flat_dict(empty_ref, prefix="x-yz", sep="-")
expected = {
"xyz-id": empty_ref.id,
"xyz-datasettype": "empty",
"xyz-run": "empty_run",
"x-yz-id": empty_ref.id,
"x-yz-datasettype": "empty",
"x-yz-run": "empty_run",
}
self.assertEqual(prov_dict, expected)
DatasetProvenance.strip_provenance_from_flat_dict(prov_dict)
self.assertEqual(prov_dict, {})

with self.assertRaises(ValueError):
prov3.to_flat_dict(empty_ref, sep="abc")
prov3.to_flat_dict(empty_ref, sep="##")
with self.assertRaises(ValueError):
prov3.to_flat_dict(empty_ref, sep="a")
with self.assertRaises(ValueError):
prov3.to_flat_dict(empty_ref, sep="1")
with self.assertRaises(ValueError):
prov3.to_flat_dict(empty_ref, sep="_")
with self.assertRaises(ValueError):
prov3.to_flat_dict(empty_ref, sep="Σ")

# Dictionary with inconsistent prefixes and separators.
test_dicts = (
Expand Down

0 comments on commit 15f7a77

Please sign in to comment.