Skip to content

Commit

Permalink
Add show_dataset_types option to queryCollections
Browse files Browse the repository at this point in the history
  • Loading branch information
leeskelvin committed Feb 18, 2025
1 parent d8501d5 commit eaadbb1
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 29 deletions.
15 changes: 15 additions & 0 deletions python/lsst/daf/butler/cli/cmd/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,21 @@ def prune_datasets(**kwargs: Any) -> None:
case_sensitive=False,
),
)
@click.option(
"-t",
"--show-dataset-types",
is_flag=True,
help="Also show the dataset types registered within each collection.",
)
@click.option(
"--exclude-dataset-types",
type=click.STRING,
multiple=True,
default=["*_config,*_log,*_metadata,packages"],
callback=split_commas,
show_default=True,
help="Dataset types (comma-separated) to exclude. Only valid with --show-dataset-types.",
)
@options_file_option()
def query_collections(*args: Any, **kwargs: Any) -> None:
"""Get the collections whose names match an expression."""
Expand Down
182 changes: 153 additions & 29 deletions python/lsst/daf/butler/script/queryCollections.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@
from __future__ import annotations

from collections.abc import Iterable
from fnmatch import fnmatch
from typing import Literal

from astropy.table import Table
from astropy.table import Column, Table, hstack, vstack

from .._butler import Butler
from .._butler_collections import CollectionInfo
Expand All @@ -42,6 +43,8 @@ def _getTable(
glob: Iterable[str],
collection_type: Iterable[CollectionType],
inverse: bool,
show_dataset_types: bool = False,
exclude_dataset_types: Iterable[str] = [],
) -> Table:
"""Run queryCollections and return the results in Table form.
Expand All @@ -60,6 +63,11 @@ def _getTable(
True if parent CHAINED datasets of each dataset should be listed in the
description column, False if children of CHAINED datasets should be
listed.
show_dataset_types : `bool`
If True, also show the dataset types present within each collection.
exclude_dataset_types : `~collections.abc.Iterable` [ `str` ]
A glob-style comma-separated list of dataset types to exclude.
Only has an effect if `show_dataset_types` is True.
Returns
-------
Expand All @@ -72,37 +80,65 @@ def _getTable(
names=("Name", typeCol, descriptionCol),
dtype=(str, str, str),
)
if show_dataset_types:
table.add_column(Column(name="Dataset Types", dtype=str))
butler = Butler.from_config(repo)

def addCollection(info: CollectionInfo, relative: str) -> None:
info_relatives = getattr(info, relative)
# Parent results can be returned in a non-deterministic order, so sort
# them to make the output deterministic.
if relative == "parents":
info_relatives = sorted(info_relatives)
if info_relatives:
collection_table = Table([[info.name], [info.type.name]], names=("Name", typeCol))
description_table = Table(names=(descriptionCol,), dtype=(str,))
for info_relative in info_relatives:
relative_table = Table([[info_relative]], names=(descriptionCol,))
if show_dataset_types:
cinfo = butler.collections.get_info(info_relative, include_summary=True)
dataset_types = [""] if not cinfo.dataset_types else cinfo.dataset_types
if exclude_dataset_types:
dataset_types = [
dt
for dt in dataset_types
if not any(fnmatch(dt, pattern) for pattern in exclude_dataset_types)
]
dataset_types = [""] if not dataset_types else dataset_types
types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
relative_table = hstack([relative_table, types_table]).filled("")
description_table = vstack([description_table, relative_table])
collection_table = hstack([collection_table, description_table]).filled("")
for row in collection_table:
table.add_row(row)
else:
new_row = [info.name, info.type.name]
new_row.extend([""] * (len(table.colnames) - len(new_row)))
table.add_row(new_row)

collections = sorted(
butler.collections.query_info(
glob or "*", collection_types=frozenset(collection_type), include_parents=inverse
glob or "*",
collection_types=frozenset(collection_type),
include_parents=inverse,
include_summary=show_dataset_types,
)
)
if inverse:
for info in collections:
if info.parents:
first = True
for parentName in sorted(info.parents):
table.add_row((info.name if first else "", info.type.name if first else "", parentName))
first = False
else:
table.add_row((info.name, info.type.name, ""))
addCollection(info, "parents")
# If none of the datasets has a parent dataset then remove the
# description column.
if not any(c for c in table[descriptionCol]):
del table[descriptionCol]
else:
for info in collections:
if info.type == CollectionType.CHAINED:
if info.children:
first = True
for child in info.children:
table.add_row((info.name if first else "", info.type.name if first else "", child))
first = False
else:
table.add_row((info.name, info.type.name, ""))
addCollection(info, "children")
else:
table.add_row((info.name, info.type.name, ""))
new_row = [info.name, info.type.name]
new_row.extend([""] * (len(table.colnames) - len(new_row)))
table.add_row(new_row)
# If there aren't any CHAINED datasets in the results then remove the
# description column.
if not any(columnVal == CollectionType.CHAINED.name for columnVal in table[typeCol]):
Expand All @@ -116,6 +152,8 @@ def _getTree(
glob: Iterable[str],
collection_type: Iterable[CollectionType],
inverse: bool,
show_dataset_types: bool = False,
exclude_dataset_types: Iterable[str] = [],
) -> Table:
"""Run queryCollections and return the results in a table representing tree
form.
Expand All @@ -134,6 +172,11 @@ def _getTree(
True if parent CHAINED datasets of each dataset should be listed in the
description column, False if children of CHAINED datasets should be
listed.
show_dataset_types : `bool`
If True, also show the dataset types present within each collection.
exclude_dataset_types : `~collections.abc.Iterable` [ `str` ]
A glob-style comma-separated list of dataset types to exclude.
Only has an effect if `show_dataset_types` is True.
Returns
-------
Expand All @@ -144,51 +187,127 @@ def _getTree(
names=("Name", "Type"),
dtype=(str, str),
)
if show_dataset_types:
table.add_column(Column(name="Dataset Types", dtype=str))
butler = Butler.from_config(repo, without_datastore=True)

def addCollection(info: CollectionInfo, level: int = 0) -> None:
table.add_row((" " * level + info.name, info.type.name))
collection_table = Table([[" " * level + info.name], [info.type.name]], names=["Name", "Type"])
if show_dataset_types:
dataset_types = [""] if not info.dataset_types else info.dataset_types
if exclude_dataset_types:
dataset_types = [
dt
for dt in dataset_types
if not any(fnmatch(dt, pattern) for pattern in exclude_dataset_types)
]
dataset_types = [""] if not dataset_types else dataset_types
dataset_types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
collection_table = hstack([collection_table, dataset_types_table]).filled("")
for row in collection_table:
table.add_row(row)

if inverse:
assert info.parents is not None # For mypy.
for pname in sorted(info.parents):
pinfo = butler.collections.get_info(pname, include_parents=inverse)
pinfo = butler.collections.get_info(
pname, include_parents=inverse, include_summary=show_dataset_types
)
addCollection(pinfo, level + 1)
else:
if info.type == CollectionType.CHAINED:
for name in info.children:
cinfo = butler.collections.get_info(name)
cinfo = butler.collections.get_info(name, include_summary=show_dataset_types)
addCollection(cinfo, level + 1)

collections = butler.collections.query_info(
glob or "*", collection_types=frozenset(collection_type), include_parents=inverse
glob or "*",
collection_types=frozenset(collection_type),
include_parents=inverse,
include_summary=show_dataset_types,
)
for collection in sorted(collections):
addCollection(collection)
return table


def _getList(
    repo: str,
    glob: Iterable[str],
    collection_type: Iterable[CollectionType],
    flatten_chains: bool,
    show_dataset_types: bool = False,
    exclude_dataset_types: Iterable[str] = (),
) -> Table:
    """Return collection results as a table representing a flat list of
    collections.

    Parameters
    ----------
    repo : `str`
        Butler repository location.
    glob : `collections.abc.Iterable` of `str`
        Wildcards to pass to ``queryCollections``.
    collection_type
        Same as `queryCollections`
    flatten_chains : `bool`
        If True, flatten the tree of CHAINED datasets.
    show_dataset_types : `bool`
        If True, also show the dataset types present within each collection.
    exclude_dataset_types : `~collections.abc.Iterable` [ `str` ]
        A glob-style comma-separated list of dataset types to exclude.
        Only has an effect if `show_dataset_types` is True.

    Returns
    -------
    collections : `astropy.table.Table`
        Same as `queryCollections`
    """
    table = Table(
        names=("Name", "Type"),
        dtype=(str, str),
    )
    if show_dataset_types:
        # Extra column is only present when dataset types were requested, so
        # existing consumers of the two-column layout are unaffected.
        table.add_column(Column(name="Dataset Types", dtype=str))
    butler = Butler.from_config(repo)

    def addCollection(info: CollectionInfo) -> None:
        # Build a small per-collection table and append its rows to the
        # output; hstack aligns the (possibly multi-row) dataset-type column
        # against the single name/type row.
        collection_table = Table([[info.name], [info.type.name]], names=["Name", "Type"])
        if show_dataset_types:
            # Use a single empty string as a placeholder so the collection
            # still produces one row when it has no dataset types.
            dataset_types = [""] if not info.dataset_types else info.dataset_types
            if exclude_dataset_types:
                # Drop any dataset type matching one of the glob-style
                # exclusion patterns.
                dataset_types = [
                    dt
                    for dt in dataset_types
                    if not any(fnmatch(dt, pattern) for pattern in exclude_dataset_types)
                ]
                # Filtering may have removed everything; restore the
                # placeholder so the row is not lost.
                dataset_types = [""] if not dataset_types else dataset_types
            dataset_types_table = Table({"Dataset Types": sorted(dataset_types)}, dtype=(str,))
            # filled("") replaces masked cells created by hstack'ing tables
            # of different lengths.
            collection_table = hstack([collection_table, dataset_types_table]).filled("")
        for row in collection_table:
            table.add_row(row)

    collections = list(
        butler.collections.query_info(
            glob or "*",
            collection_types=frozenset(collection_type),
            flatten_chains=flatten_chains,
            # Summaries are only needed (and only fetched) when dataset types
            # will be displayed.
            include_summary=show_dataset_types,
        )
    )
    for collection in collections:
        addCollection(collection)

    return table


def queryCollections(
repo: str,
glob: Iterable[str],
collection_type: Iterable[CollectionType],
chains: Literal["INVERSE-TABLE", "TABLE", "TREE", "INVERSE-TREE", "FLATTEN", "NO-CHILDREN"],
show_dataset_types: bool = False,
exclude_dataset_types: Iterable[str] = [],
) -> Table:
"""Get the collections whose names match an expression.
Expand All @@ -206,17 +325,22 @@ def queryCollections(
chains : `str`
Affects contents and formatting of results, see
``cli.commands.query_collections``.
show_dataset_types : `bool`, optional
If True, include the dataset types present within each collection.
exclude_dataset_types : `~collections.abc.Iterable` [ `str` ], optional
A glob-style comma-separated list of dataset types to exclude.
Only has an effect if `show_dataset_types` is True.
Returns
-------
collections : `astropy.table.Table`
A table containing information about collections.
"""
if (inverse := chains == "INVERSE-TABLE") or chains == "TABLE":
return _getTable(repo, glob, collection_type, inverse)
return _getTable(repo, glob, collection_type, inverse, show_dataset_types, exclude_dataset_types)
elif (inverse := chains == "INVERSE-TREE") or chains == "TREE":
return _getTree(repo, glob, collection_type, inverse)
return _getTree(repo, glob, collection_type, inverse, show_dataset_types, exclude_dataset_types)
elif chains == "FLATTEN" or chains == "NO-CHILDREN":
flatten = chains == "FLATTEN"
return _getList(repo, glob, collection_type, flatten)
return _getList(repo, glob, collection_type, flatten, show_dataset_types, exclude_dataset_types)
raise RuntimeError(f"Value for --chains not recognized: {chains}")

0 comments on commit eaadbb1

Please sign in to comment.