From df0fb9752beb8df98a18d175f278e1134978932f Mon Sep 17 00:00:00 2001 From: "David H. Irving" Date: Fri, 26 Jan 2024 11:17:58 -0700 Subject: [PATCH] Allow direct configuration of repository index Instead of requiring the repository index to be loaded from a file, allow direct configuration as a string in a new environment variable DAF_BUTLER_REPOSITORIES. The file was problematic for services for a few reasons: 1. We don't have anywhere to host it -- currently it is stored in S3, but services using Butler server will not have the credentials to access it. 2. There are cache invalidation issues. If it's configured directly, Phalanx can automatically restart services using it when the configuration changes. 3. Manually editing live configuration files is error-prone. If this configuration lives in Phalanx, we are able to validate the configuration before deploying it. --- doc/changes/DM-42660.feature.md | 1 + python/lsst/daf/butler/_butler_repo_index.py | 90 +++++++++++--------- tests/test_butler.py | 23 ++++- 3 files changed, 72 insertions(+), 42 deletions(-) create mode 100644 doc/changes/DM-42660.feature.md diff --git a/doc/changes/DM-42660.feature.md b/doc/changes/DM-42660.feature.md new file mode 100644 index 0000000000..6ea5dee2dd --- /dev/null +++ b/doc/changes/DM-42660.feature.md @@ -0,0 +1 @@ +The Butler repository index can now be configured by a new environment variable `DAF_BUTLER_REPOSITORIES`, which contains the configuration directly instead of requiring lookup via a URI. diff --git a/python/lsst/daf/butler/_butler_repo_index.py b/python/lsst/daf/butler/_butler_repo_index.py index 7eda8905e1..0bff390a8c 100644 --- a/python/lsst/daf/butler/_butler_repo_index.py +++ b/python/lsst/daf/butler/_butler_repo_index.py @@ -30,9 +30,11 @@ __all__ = ("ButlerRepoIndex",) import os -from typing import ClassVar +from typing import Any, ClassVar +import yaml from lsst.resources import ResourcePath +from pydantic import TypeAdapter, ValidationError from ._config import Config from ._utilities.thread_safe_cache import ThreadSafeCache @@ -41,10 +43,14 @@ class ButlerRepoIndex: """Index of all known butler repositories. - The index of butler repositories is found by looking for a - configuration file at the URI pointed at by the environment - variable ``$DAF_BUTLER_REPOSITORY_INDEX``. The configuration file - is a simple dictionary lookup of the form: + The index of butler repositories can be configured in two ways: + 1. By setting the environment variable ``DAF_BUTLER_REPOSITORY_INDEX`` to + the URI of a configuration file. + 2. By setting the environment variable ``DAF_BUTLER_REPOSITORIES`` to the + contents of the configuration file as a string. + + In either case, the configuration is a simple dictionary lookup of the + form: .. code-block:: yaml @@ -56,9 +62,15 @@ class ButlerRepoIndex: """ index_env_var: ClassVar[str] = "DAF_BUTLER_REPOSITORY_INDEX" - """The name of the environment variable to read to locate the index.""" + """The name of the environment variable containing the URI of the index + configuration file. + """ + repositories_env_var: ClassVar[str] = "DAF_BUTLER_REPOSITORIES" + """The name of the environment variable containing the configuration + directly as a string. + """ - _cache: ClassVar[ThreadSafeCache[ResourcePath, Config]] = ThreadSafeCache() + _cache: ClassVar[ThreadSafeCache[str, Config]] = ThreadSafeCache() """Cache of indexes. In most scenarios only one index will be found and the environment will not change. In tests this may not be true.""" @@ -67,7 +79,7 @@ class ButlerRepoIndex: every read.""" @classmethod - def _read_repository_index(cls, index_uri: ResourcePath) -> Config: + def _read_repository_index(cls, index_uri: str) -> Config: """Read the repository index from the supplied URI. Parameters @@ -107,26 +119,7 @@ def _read_repository_index(cls, index_uri: ResourcePath) -> Config: return repo_index @classmethod - def _get_index_uri(cls) -> ResourcePath: - """Find the URI to the repository index. - - Returns - ------- - index_uri : `lsst.resources.ResourcePath` - URI to the repository index. - - Raises - ------ - KeyError - Raised if the location of the index could not be determined. - """ - index_uri = os.environ.get(cls.index_env_var) - if index_uri is None: - raise KeyError(f"No repository index defined in environment variable {cls.index_env_var}") - return ResourcePath(index_uri) - - @classmethod - def _read_repository_index_from_environment(cls) -> Config: + def _read_repository_index_from_environment(cls) -> dict[str, str]: """Look in environment for index location and read it. Returns @@ -136,16 +129,29 @@ def _read_repository_index_from_environment(cls) -> Config: """ cls._most_recent_failure = "" try: - index_uri = cls._get_index_uri() - except KeyError as e: - cls._most_recent_failure = str(e) - raise - try: - repo_index = cls._read_repository_index(index_uri) + index_uri = os.getenv(cls.index_env_var) + direct_configuration = os.getenv(cls.repositories_env_var) + + if index_uri and direct_configuration: + raise RuntimeError( + f"Only one of the environment variables {cls.repositories_env_var} and" + f" {cls.index_env_var} should be set." + ) + + if direct_configuration: + return cls._validate_configuration(yaml.safe_load(direct_configuration)) + + if index_uri: + repo_index = cls._read_repository_index(index_uri) + return cls._validate_configuration(repo_index) + + raise RuntimeError( + "No repository index defined. Neither of the environment variables" + f" {cls.repositories_env_var} or {cls.index_env_var} was set." + ) except Exception as e: cls._most_recent_failure = str(e) raise - return repo_index @classmethod def get_known_repos(cls) -> set[str]: @@ -222,10 +228,12 @@ def get_repo_uri(cls, label: str, return_label: bool = False) -> ResourcePath: if repo_uri is None: if return_label: return ResourcePath(label, forceAbsolute=False) - # This should not raise since it worked earlier. - try: - index_uri = str(cls._get_index_uri()) - except KeyError: - index_uri = "" - raise KeyError(f"Label '{label}' not known to repository index at {index_uri}") + raise KeyError(f"Label '{label}' not known to repository index") return ResourcePath(repo_uri) + + @classmethod + def _validate_configuration(cls, obj: Any) -> dict[str, str]: + try: + return TypeAdapter(dict[str, str]).validate_python(obj) + except ValidationError as e: + raise ValueError("Repository index not in expected format") from e diff --git a/tests/test_butler.py b/tests/test_butler.py index 59a5f7b2c6..aab2a68889 100644 --- a/tests/test_butler.py +++ b/tests/test_butler.py @@ -667,7 +667,7 @@ def testConstructor(self) -> None: # Check that we can create Butler when the alias file is not found. butler = Butler.from_config(self.tmpConfigFile, writeable=False) self.assertIsInstance(butler, Butler) - with self.assertRaises(KeyError) as cm: + with self.assertRaises(RuntimeError) as cm: # No environment variable set. Butler.get_repo_uri("label") self.assertEqual(Butler.get_repo_uri("label", True), ResourcePath("label", forceAbsolute=False)) @@ -677,6 +677,27 @@ def testConstructor(self) -> None: Butler.from_config("not_there") self.assertEqual(Butler.get_known_repos(), set()) + def testDafButlerRepositories(self): + with unittest.mock.patch.dict(os.environ, {"DAF_BUTLER_REPOSITORIES": "label: https://someuri.com"}): + self.assertEqual(str(Butler.get_repo_uri("label")), "https://someuri.com") + + with unittest.mock.patch.dict( + os.environ, + { + "DAF_BUTLER_REPOSITORIES": "label: https://someuri.com", + "DAF_BUTLER_REPOSITORY_INDEX": "https://someuri.com", + }, + ): + with self.assertRaisesRegex(RuntimeError, "Only one of the environment variables"): + Butler.get_repo_uri("label") + + with unittest.mock.patch.dict( + os.environ, + {"DAF_BUTLER_REPOSITORIES": "invalid"}, + ): + with self.assertRaisesRegex(ValueError, "Repository index not in expected format"): + Butler.get_repo_uri("label") + def testBasicPutGet(self) -> None: storageClass = self.storageClassFactory.getStorageClass("StructuredDataNoComponents") self.runPutGetTest(storageClass, "test_metric")