Commit
WIP: decouple public query types from the old ones, rework interfaces.
TallJimbo committed Jan 12, 2024
1 parent 560646d commit bb5a22a
Showing 14 changed files with 841 additions and 801 deletions.
43 changes: 23 additions & 20 deletions python/lsst/daf/butler/direct_query_driver/_driver.py
@@ -39,11 +39,22 @@
from .._dataset_type import DatasetType
from ..dimensions import DataIdValue, DimensionElement, DimensionGroup, DimensionUniverse
from ..queries import tree as qt
from ..queries.data_coordinate_results import DataCoordinateResultPage, DataCoordinateResultSpec
from ..queries.dataset_results import DatasetRefResultPage, DatasetRefResultSpec
from ..queries.dimension_record_results import DimensionRecordResultPage, DimensionRecordResultSpec
from ..queries.driver import PageKey, QueryDriver, ResultPage, ResultSpec
from ..queries.general_results import GeneralResultPage, GeneralResultSpec
from ..queries.driver import (
DataCoordinateResultPage,
DatasetRefResultPage,
DimensionRecordResultPage,
GeneralResultPage,
PageKey,
QueryDriver,
ResultPage,
)
from ..queries.result_specs import (
DataCoordinateResultSpec,
DatasetRefResultSpec,
DimensionRecordResultSpec,
GeneralResultSpec,
ResultSpec,
)
from ..registry.managers import RegistryManagerInstances

if TYPE_CHECKING:
@@ -147,7 +158,7 @@ def count(
tree: qt.QueryTree,
*,
dimensions: DimensionGroup,
datasets: frozenset[str],
datasets: Set[str],
exact: bool,
discard: bool,
) -> int:
@@ -194,14 +205,8 @@ def _build_sql_select(
else:
sql_builder, postprocessing = self._make_vanilla_sql_builder(tree, columns_to_select)
# TODO: make results unique over columns_to_select, while taking into
# account postprocessing columns
return sql_builder.sql_select(
columns_to_select,
postprocessing,
order_by=[self._build_sql_order_by_expression(sql_builder, term) for term in tree.order_terms],
limit=tree.limit,
offset=tree.offset,
)
# account postprocessing columns.
return sql_builder.sql_select(columns_to_select, postprocessing)

def _make_vanilla_sql_builder(
self,
@@ -324,15 +329,13 @@ def _make_find_first_sql_builder(
#

# We'll start with the Common Table Expression (CTE) at the top, which
# we mostly get from _build_sql_selet. Note that we need to use
# 'columns_required' to populate the SELECT clause list, because this
# isn't the outermost query, and hence we need to propagate columns
# we'll use but not return to the user through it.
# we mostly get from _make_vanilla_sql_builder. Note that we need to
# use 'columns_required' to populate the SELECT clause list, because
# this isn't the outermost query, and hence we need to propagate
# columns we'll use but not return to the user through it.
rank = qt.DatasetFieldReference.model_construct(dataset_type=dataset_type, field="rank")
internal_columns = set(columns_to_select)
internal_columns.add(rank)
for term in tree.order_terms:
internal_columns.update(term.gather_required_columns())
base_sql_builder, postprocessing = self._make_vanilla_sql_builder(tree, internal_columns)
base_select, postprocessing = base_sql_builder.sql_select(internal_columns, postprocessing)
internal_columns.update(postprocessing.gather_columns_required())
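The find-first comment above describes a window-function pattern that is easier to see in isolation. The sketch below is illustrative only and uses hypothetical table and column names (the real builder assembles this from the query tree and the collection search path): a CTE attaches a per-data-ID "rank" that follows the collection search order, and the outer query keeps only rank 1.

# Illustrative sketch only: hypothetical table and column names, not butler's
# real schema or query builder.
import sqlalchemy

metadata = sqlalchemy.MetaData()
dataset_search = sqlalchemy.Table(
    "dataset_search",
    metadata,
    sqlalchemy.Column("data_id", sqlalchemy.BigInteger),
    sqlalchemy.Column("dataset_id", sqlalchemy.BigInteger),
    sqlalchemy.Column("collection_position", sqlalchemy.Integer),
)

# CTE: attach a window-function 'rank' that orders each data ID's matches by
# their collection's position in the search path.
ranked = sqlalchemy.select(
    dataset_search.c.data_id,
    dataset_search.c.dataset_id,
    sqlalchemy.func.row_number()
    .over(
        partition_by=dataset_search.c.data_id,
        order_by=dataset_search.c.collection_position,
    )
    .label("rank"),
).cte("ranked")

# Outer SELECT keeps only the first match per data ID.
find_first = sqlalchemy.select(ranked.c.data_id, ranked.c.dataset_id).where(ranked.c.rank == 1)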
6 changes: 6 additions & 0 deletions python/lsst/daf/butler/queries/__init__.py
@@ -24,3 +24,9 @@
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from ._base import *
from ._data_coordinate_query_results import *
from ._dataset_query_results import *
from ._dimension_record_query_results import *
from ._query import *
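These wildcard imports re-export each submodule's public names (its __all__) at the package level, so downstream code can import the new query interfaces directly from lsst.daf.butler.queries. The names in the example below are illustrative assumptions about what those submodules export; this diff does not show them.

# Hypothetical import, assuming the submodules export these names via __all__.
from lsst.daf.butler.queries import Query, DataCoordinateQueryResults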
217 changes: 217 additions & 0 deletions python/lsst/daf/butler/queries/_base.py
@@ -0,0 +1,217 @@
# This file is part of daf_butler.
#
# Developed for the LSST Data Management System.
# This product includes software developed by the LSST Project
# (http://www.lsst.org).
# See the COPYRIGHT file at the top-level directory of this distribution
# for details of code ownership.
#
# This software is dual licensed under the GNU General Public License and also
# under a 3-clause BSD license. Recipients may choose which of these licenses
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
# respectively. If you choose the GPL option then the following text applies
# (but note that there is still no warranty even if you opt for BSD instead):
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from __future__ import annotations

__all__ = ("ResultSpecBase", "QueryBase", "HomogeneousQueryBase", "CountableQueryBase", "QueryResultsBase")

from abc import ABC, abstractmethod
from collections.abc import Iterable, Set
from typing import Any, Self

import pydantic

from ..dimensions import DimensionGroup
from .convert_args import convert_order_by_args
from .driver import QueryDriver
from .expression_factory import ExpressionProxy
from .tree import OrderExpression, QueryTree


class ResultSpecBase(pydantic.BaseModel):
"""Base class for all query-result specification objects."""

order_by: tuple[OrderExpression, ...] = ()
"""Expressions to sort the rows by."""

offset: int = 0
"""Index of the first row to return."""

limit: int | None = None
"""Maximum number of rows to return, or `None` for no bound."""


class QueryBase(ABC):
@abstractmethod
def any(self, *, execute: bool = True, exact: bool = True) -> bool:
"""Test whether the query would return any rows.
Parameters
----------
execute : `bool`, optional
If `True`, execute at least a ``LIMIT 1`` query if it cannot be
determined prior to execution that the query would return no rows.
exact : `bool`, optional
If `True`, run the full query and perform post-query filtering if
needed, until at least one result row is found. If `False`, the
returned result does not account for post-query filtering, and
hence may be `True` even when all result rows would be filtered
out.
Returns
-------
any : `bool`
`True` if the query would (or might, depending on arguments) yield
result rows. `False` if it definitely would not.
"""
raise NotImplementedError()

@abstractmethod
def explain_no_results(self, execute: bool = True) -> Iterable[str]:
"""Return human-readable messages that may help explain why the query
yields no results.
Parameters
----------
execute : `bool`, optional
If `True` (default) execute simplified versions (e.g. ``LIMIT 1``)
of aspects of the tree to more precisely determine where rows were
filtered out.
Returns
-------
messages : `~collections.abc.Iterable` [ `str` ]
String messages that describe reasons the query might not yield any
results.
"""
raise NotImplementedError()
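
# Illustrative usage sketch, not part of this commit ('results' stands for any
# concrete QueryBase object): the usual diagnostic idiom is a cheap emptiness
# check followed by explain_no_results() when nothing comes back.
#
#     if not results.any(execute=True, exact=False):
#         for message in results.explain_no_results():
#             print(message)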


class HomogeneousQueryBase(QueryBase):
def __init__(self, driver: QueryDriver, tree: QueryTree):
self._driver = driver
self._tree = tree

@property
def dimensions(self) -> DimensionGroup:
"""All dimensions included in the query's columns."""
return self._tree.dimensions

def any(self, *, execute: bool = True, exact: bool = True) -> bool:
# Docstring inherited.
return self._driver.any(self._tree, execute=execute, exact=exact)

def explain_no_results(self, execute: bool = True) -> Iterable[str]:
# Docstring inherited.
return self._driver.explain_no_results(self._tree, execute=execute)


class CountableQueryBase(QueryBase):
@abstractmethod
def count(self, *, exact: bool = True, discard: bool = False) -> int:
"""Count the number of rows this query would return.
Parameters
----------
exact : `bool`, optional
If `True`, run the full query and perform post-query filtering if
needed to account for that filtering in the count. If `False`, the
result may be an upper bound.
discard : `bool`, optional
If `True`, compute the exact count even if it would require running
the full query and then throwing away the result rows after
counting them. If `False`, this is an error, as the user would
usually be better off executing the query first to fetch its rows
into a new query (or passing ``exact=False``). Ignored if
``exact=False``.
Returns
-------
count : `int`
The number of rows the query would return, or an upper bound if
``exact=False``.
"""
raise NotImplementedError()
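
# Illustrative usage sketch, not part of this commit ('results' stands for any
# concrete CountableQueryBase object): an inexact count is cheap but may be an
# upper bound, while an exact count that needs post-query filtering must opt
# in to running the full query with discard=True.
#
#     upper_bound = results.count(exact=False)
#     exact_count = results.count(exact=True, discard=True)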


class QueryResultsBase(HomogeneousQueryBase, CountableQueryBase):
def count(self, *, exact: bool = True, discard: bool = False) -> int:
# Docstring inherited.
return self._driver.count(
self._tree,
dimensions=self.dimensions,
datasets=self._get_datasets(),
exact=exact,
discard=discard,
)

def order_by(self, *args: str | OrderExpression | ExpressionProxy) -> Self:
"""Return a new query that yields ordered results.
Parameters
----------
*args : `str`
Names of the columns/dimensions to use for ordering. Column name
can be prefixed with minus (``-``) to use descending ordering.
Returns
-------
result : `QueryResultsBase`
An ordered version of this query results object.
Notes
-----
If this method is called multiple times, the new sort terms replace
the old ones.
"""
return self._copy(
self._tree, order_by=convert_order_by_args(self.dimensions, self._get_datasets(), *args)
)

def limit(self, limit: int | None = None, offset: int = 0) -> Self:
"""Return a new query that slices its result rows positionally.
Parameters
----------
limit : `int` or `None`, optional
Upper limit on the number of returned records.
offset : `int`, optional
The number of records to skip before returning at most ``limit``
records.
Returns
-------
result : `QueryResultsBase`
A sliced version of this query results object.
Notes
-----
If this method is called multiple times, the new slice parameters
replace the old ones. Slicing always occurs after sorting, even if
`limit` is called before `order_by`.
"""
return self._copy(self._tree, limit=limit, offset=offset)
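
# Illustrative usage sketch, not part of this commit ('results' stands for a
# concrete QueryResultsBase object; 'band' and 'visit' are example dimension
# names): each call returns a new object, later calls replace earlier sort or
# slice settings, and slicing is always applied after sorting.
#
#     page = results.order_by("band", "-visit").limit(100, offset=200)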

@abstractmethod
def _get_datasets(self) -> Set[str]:
"""Return all dataset types included in the query's result rows."""
raise NotImplementedError()

@abstractmethod
def _copy(self, tree: QueryTree, **kwargs: Any) -> Self:
"""Return a modified copy of ``self``."""
raise NotImplementedError()
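
To make the two abstract hooks concrete, here is a minimal sketch of what a subclass could look like. It is illustrative only and assumes the subclass carries a ResultSpecBase-derived spec; it reuses the names imported at the top of _base.py. The real result classes added elsewhere in this commit (for data coordinates, dataset refs, and dimension records) are more elaborate.

# Illustrative only; not the implementation added by this commit.
class ExampleQueryResults(QueryResultsBase):
    def __init__(self, driver: QueryDriver, tree: QueryTree, spec: ResultSpecBase):
        super().__init__(driver, tree)
        self._spec = spec

    def _get_datasets(self) -> Set[str]:
        # This example result type carries no dataset columns.
        return frozenset()

    def _copy(self, tree: QueryTree, **kwargs: Any) -> Self:
        # order_by() and limit() funnel their settings here via **kwargs;
        # fold them into a copy of the pydantic spec.
        return type(self)(self._driver, tree, self._spec.model_copy(update=kwargs))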