Add show_dataset_types option to queryCollections
leeskelvin committed Feb 17, 2025
1 parent 8544ada commit e1eea3c
Showing 2 changed files with 143 additions and 29 deletions.
8 changes: 7 additions & 1 deletion python/lsst/daf/butler/cli/cmd/commands.py
@@ -424,6 +424,12 @@ def prune_datasets(**kwargs: Any) -> None:
case_sensitive=False,
),
)
@click.option(
"-t",
"--show-dataset-types",
is_flag=True,
help="Also show the dataset types registered within each collection.",
)
@options_file_option()
def query_collections(*args: Any, **kwargs: Any) -> None:
"""Get the collections whose names match an expression."""
@@ -454,7 +460,7 @@ def query_dataset_types(*args: Any, **kwargs: Any) -> None:
"""Get the dataset types in a repository."""
table = script.queryDatasetTypes(*args, **kwargs)
if table:
- table.pprint_all()
+ table.pprint_all(align="<")
else:
print("No results. Try --help for more information.")

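For reference, the new flag is exposed on the existing butler query-collections subcommand. A sketch of how it might be invoked once this commit is installed; the repository path and collection glob are placeholders, and --chains is the command's pre-existing output-mode option (values such as TABLE or TREE, matched case-insensitively):

    # Placeholder repo and glob; -t is the short form of the new flag.
    butler query-collections REPO "HSC/runs/*" --chains table --show-dataset-types
    butler query-collections REPO "HSC/runs/*" -t
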
164 changes: 136 additions & 28 deletions python/lsst/daf/butler/script/queryCollections.py
@@ -30,18 +30,25 @@
from collections.abc import Iterable
from typing import Literal

- from astropy.table import Table
+ from astropy.table import Column, Table, hstack, vstack

from .._butler import Butler
from .._butler_collections import CollectionInfo
from .._collection_type import CollectionType


def _isAutomaticConnection(dataset_type: str) -> bool:
"""Return True if the dataset type is an automatically generated type."""
return dataset_type.endswith(("_config", "_log", "_metadata", "packages"))
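# Illustration (not part of the diff): what the helper above treats as automatic connections.
# The dataset type names here are purely illustrative.
assert _isAutomaticConnection("isr_log")        # ends with "_log"
assert _isAutomaticConnection("isr_metadata")   # ends with "_metadata"
assert _isAutomaticConnection("packages")       # ends with "packages"
assert not _isAutomaticConnection("calexp")     # ordinary dataset type, kept by default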


def _getTable(
repo: str,
glob: Iterable[str],
collection_type: Iterable[CollectionType],
inverse: bool,
show_dataset_types: bool = False,
include_automatic_connections: bool = False,
) -> Table:
"""Run queryCollections and return the results in Table form.
@@ -60,6 +67,11 @@ def _getTable(
True if parent CHAINED datasets of each dataset should be listed in the
description column, False if children of CHAINED datasets should be
listed.
show_dataset_types : `bool`
If True, also show the dataset types present within each collection.
include_automatic_connections : `bool`
If True, include automatic connections in the output dataset types.
Only has an effect if `show_dataset_types` is True.
Returns
-------
@@ -72,35 +84,51 @@
names=("Name", typeCol, descriptionCol),
dtype=(str, str, str),
)
if show_dataset_types:
table.add_column(Column(name="Dataset Types", dtype=str))
butler = Butler.from_config(repo)

def addCollection(info: CollectionInfo, relative: str) -> None:
info_relatives = getattr(info, relative)
if info_relatives:
collection_table = Table([[info.name], [info.type.name]], names=("Name", typeCol))
description_table = Table(names=(descriptionCol,), dtype=(str,))
for info_relative in info_relatives:
relative_table = Table([[info_relative]], names=(descriptionCol,))
if show_dataset_types:
cinfo = butler.collections.get_info(info_relative, include_summary=True)
dataset_types = [""] if not cinfo.dataset_types else cinfo.dataset_types
if not include_automatic_connections:
dataset_types = [dt for dt in dataset_types if not _isAutomaticConnection(dt)]
dataset_types = [""] if not dataset_types else dataset_types
types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
relative_table = hstack([relative_table, types_table]).filled("")
description_table = vstack([description_table, relative_table])
collection_table = hstack([collection_table, description_table]).filled("")
for row in collection_table:
table.add_row(row)
else:
table.add_row((info.name, info.type.name, ""))

collections = sorted(
butler.collections.query_info(
glob or "*", collection_types=frozenset(collection_type), include_parents=inverse
glob or "*",
collection_types=frozenset(collection_type),
include_parents=inverse,
include_summary=show_dataset_types,
)
)
if inverse:
for info in collections:
- if info.parents:
- first = True
- for parentName in sorted(info.parents):
- table.add_row((info.name if first else "", info.type.name if first else "", parentName))
- first = False
- else:
- table.add_row((info.name, info.type.name, ""))
+ addCollection(info, "parents")
# If none of the datasets has a parent dataset then remove the
# description column.
if not any(c for c in table[descriptionCol]):
del table[descriptionCol]
else:
for info in collections:
if info.type == CollectionType.CHAINED:
- if info.children:
- first = True
- for child in info.children:
- table.add_row((info.name if first else "", info.type.name if first else "", child))
- first = False
- else:
- table.add_row((info.name, info.type.name, ""))
+ addCollection(info, "children")
else:
table.add_row((info.name, info.type.name, ""))
# If there aren't any CHAINED datasets in the results then remove the
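The row padding used in addCollection above comes from astropy's table stacking: hstack with its default outer join pads the shorter table with masked cells, and filled("") turns those into empty strings, so a one-row (Name, Type) table can sit beside a multi-row description or dataset-type column. A minimal sketch of the same pattern, with made-up collection names:

    from astropy.table import Table, hstack

    # One row of collection metadata...
    left = Table([["HSC/runs/RC2"], ["CHAINED"]], names=("Name", "Type"))
    # ...and several child rows to sit alongside it.
    right = Table({"Children": ["run1", "run2", "run3"]})

    # The outer join pads "left" to three rows with masked cells; filled("") blanks them,
    # so only the first row carries the collection name and type.
    combined = hstack([left, right]).filled("")
    print(combined)
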
@@ -116,6 +144,8 @@ def _getTree(
glob: Iterable[str],
collection_type: Iterable[CollectionType],
inverse: bool,
show_dataset_types: bool = False,
include_automatic_connections: bool = False,
) -> Table:
"""Run queryCollections and return the results in a table representing tree
form.
@@ -134,6 +164,11 @@
True if parent CHAINED datasets of each dataset should be listed in the
description column, False if children of CHAINED datasets should be
listed.
show_dataset_types : `bool`
If True, also show the dataset types present within each collection.
include_automatic_connections : `bool`
If True, include automatic connections in the output dataset types.
Only has an effect if `show_dataset_types` is True.
Returns
-------
@@ -144,51 +179,119 @@
names=("Name", "Type"),
dtype=(str, str),
)
if show_dataset_types:
table.add_column(Column(name="Dataset Types", dtype=str))
butler = Butler.from_config(repo, without_datastore=True)

def addCollection(info: CollectionInfo, level: int = 0) -> None:
table.add_row((" " * level + info.name, info.type.name))
collection_table = Table([[" " * level + info.name], [info.type.name]], names=["Name", "Type"])
if show_dataset_types:
dataset_types = [""] if not info.dataset_types else info.dataset_types
if not include_automatic_connections:
dataset_types = [dt for dt in dataset_types if not _isAutomaticConnection(dt)]
dataset_types = [""] if not dataset_types else dataset_types
dataset_types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
collection_table = hstack([collection_table, dataset_types_table]).filled("")
for row in collection_table:
table.add_row(row)

if inverse:
assert info.parents is not None # For mypy.
for pname in sorted(info.parents):
- pinfo = butler.collections.get_info(pname, include_parents=inverse)
+ pinfo = butler.collections.get_info(
+ pname, include_parents=inverse, include_summary=show_dataset_types
+ )
addCollection(pinfo, level + 1)
else:
if info.type == CollectionType.CHAINED:
for name in info.children:
- cinfo = butler.collections.get_info(name)
+ cinfo = butler.collections.get_info(name, include_summary=show_dataset_types)
addCollection(cinfo, level + 1)

collections = butler.collections.query_info(
glob or "*", collection_types=frozenset(collection_type), include_parents=inverse
glob or "*",
collection_types=frozenset(collection_type),
include_parents=inverse,
include_summary=show_dataset_types,
)
for collection in sorted(collections):
addCollection(collection)
return table
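
With show_dataset_types enabled, the tree view indents each child one level below its parent and spreads a collection over several rows when it carries more than one dataset type, leaving Name and Type blank on the continuation rows. Roughly, with invented collection and dataset type names (exact spacing and any summary shown for the parent row depend on the repository):

    Name                  Type     Dataset Types
    HSC/runs/RC2          CHAINED
      HSC/runs/RC2/step1  RUN      calexp
                                   src
      HSC/runs/RC2/step2  RUN      objectTable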


def _getList(
- repo: str, glob: Iterable[str], collection_type: Iterable[CollectionType], flatten_chains: bool
+ repo: str,
+ glob: Iterable[str],
+ collection_type: Iterable[CollectionType],
+ flatten_chains: bool,
+ show_dataset_types: bool = False,
+ include_automatic_connections: bool = False,
) -> Table:
"""Return collection results as a table representing a flat list of
collections.
Parameters
----------
repo : `str`
Butler repository location.
glob : `collections.abc.Iterable` of `str`
Wildcards to pass to ``queryCollections``.
collection_type
Same as `queryCollections`
flatten_chains : `bool`
If True, flatten the tree of CHAINED datasets.
show_dataset_types : `bool`
If True, also show the dataset types present within each collection.
include_automatic_connections : `bool`
If True, include automatic connections in the output dataset types.
Only has an effect if `show_dataset_types` is True.
Returns
-------
collections : `astropy.table.Table`
Same as `queryCollections`
"""
table = Table(
names=("Name", "Type"),
dtype=(str, str),
)
if show_dataset_types:
table.add_column(Column(name="Dataset Types", dtype=str))
butler = Butler.from_config(repo)

def addCollection(info: CollectionInfo) -> None:
collection_table = Table([[info.name], [info.type.name]], names=["Name", "Type"])
if show_dataset_types:
dataset_types = [""] if not info.dataset_types else info.dataset_types
if not include_automatic_connections:
dataset_types = [dt for dt in dataset_types if not _isAutomaticConnection(dt)]
dataset_types = [""] if not dataset_types else dataset_types
dataset_types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
collection_table = hstack([collection_table, dataset_types_table]).filled("")
for row in collection_table:
table.add_row(row)

collections = list(
butler.collections.query_info(
glob or "*", collection_types=frozenset(collection_type), flatten_chains=flatten_chains
glob or "*",
collection_types=frozenset(collection_type),
flatten_chains=flatten_chains,
include_summary=show_dataset_types,
)
)
- names = [c.name for c in collections]
- types = [c.type.name for c in collections]
- return Table((names, types), names=("Name", "Type"))
+ for collection in collections:
+ addCollection(collection)
+
+ return table


def queryCollections(
repo: str,
glob: Iterable[str],
collection_type: Iterable[CollectionType],
chains: Literal["INVERSE-TABLE", "TABLE", "TREE", "INVERSE-TREE", "FLATTEN", "NO-CHILDREN"],
show_dataset_types: bool = False,
include_automatic_connections: bool = False,
) -> Table:
"""Get the collections whose names match an expression.
@@ -206,17 +309,22 @@ def queryCollections(
chains : `str`
Affects contents and formatting of results, see
``cli.commands.query_collections``.
show_dataset_types : `bool`, optional
If True, include the dataset types present within each collection.
include_automatic_connections : `bool`, optional
If True, include automatic connections in the output dataset types.
Only has an effect if `show_dataset_types` is True.
Returns
-------
collections : `astropy.table.Table`
A table containing information about collections.
"""
if (inverse := chains == "INVERSE-TABLE") or chains == "TABLE":
- return _getTable(repo, glob, collection_type, inverse)
+ return _getTable(repo, glob, collection_type, inverse, show_dataset_types)
elif (inverse := chains == "INVERSE-TREE") or chains == "TREE":
- return _getTree(repo, glob, collection_type, inverse)
+ return _getTree(repo, glob, collection_type, inverse, show_dataset_types)
elif chains == "FLATTEN" or chains == "NO-CHILDREN":
flatten = chains == "FLATTEN"
- return _getList(repo, glob, collection_type, flatten)
+ return _getList(repo, glob, collection_type, flatten, show_dataset_types)
raise RuntimeError(f"Value for --chains not recognized: {chains}")
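
Putting it together, a sketch of driving the script-level entry point directly; the repository path and glob are placeholders, and only two collection types are queried for brevity:

    from lsst.daf.butler import CollectionType
    from lsst.daf.butler.script.queryCollections import queryCollections

    # Placeholder repository and glob.
    table = queryCollections(
        repo="REPO",
        glob=["HSC/runs/*"],
        collection_type=[CollectionType.RUN, CollectionType.CHAINED],
        chains="TABLE",
        show_dataset_types=True,
    )
    table.pprint_all(align="<")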
