From 770c6f1dc0176c7a276f0f24199247b1e01f18a1 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Thu, 12 Oct 2023 15:29:36 +0200 Subject: [PATCH 1/2] Stop substitution `{python}` at `containers-run` runtime This causes worse portability issues than the previous approach of storing `python(.exe)`. Instead pass the placeholder on to `datalad run`. This would now error with ``` command has an unrecognized placeholder: 'python' ``` and requires a further intervention. Three options: - have datalad-core define `datalad.run.substitutions.python=sys.executable` - wait for https://github.com/datalad/datalad-container/pull/244 and use `register_config()` to have datalad-container define it - have datalad-container define it at `containers-run` runtime by patching the configuration for the execution time (would have `rerun` fail still) - add the configuration item to the committed dataset config Closes #249 --- datalad_container/containers_add.py | 5 +++-- datalad_container/containers_run.py | 4 ---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/datalad_container/containers_add.py b/datalad_container/containers_add.py index bf40ad9..b781bbe 100644 --- a/datalad_container/containers_add.py +++ b/datalad_container/containers_add.py @@ -75,8 +75,9 @@ def _guess_call_fmt(ds, name, url): elif url.startswith('shub://') or url.startswith('docker://'): return 'singularity exec {img} {cmd}' elif url.startswith('dhub://'): - # {python} is replaced with sys.executable on *execute* - return '{python} -m datalad_container.adapters.docker run {img} {cmd}' + # {{python}} is eventually substituted with something like + # `sys.executable` on *execute* by datalad-run + return '{{python}} -m datalad_container.adapters.docker run {img} {cmd}' def _ensure_datalad_remote(repo): diff --git a/datalad_container/containers_run.py b/datalad_container/containers_run.py index 749e764..aeede63 100644 --- a/datalad_container/containers_run.py +++ b/datalad_container/containers_run.py @@ 
-129,10 +129,6 @@ def __call__(cmd, container_name=None, dataset=None, 'Convert it to a plain string.'.format(callspec)) try: cmd_kwargs = dict( - # point to the python installation that runs *this* code - # we know that it would have things like the docker - # adaptor installed with this extension package - python=sys.executable, img=image_path, cmd=cmd, img_dspath=image_dspath, From fe704e4079fcba5899a31b4a3bb3b4f83e254732 Mon Sep 17 00:00:00 2001 From: Michael Hanke Date: Thu, 12 Oct 2023 15:35:42 +0200 Subject: [PATCH 2/2] Expand the documentation of `containers-add` Mostly to avoid the misconceptions that led to #249 --- datalad_container/containers_add.py | 85 ++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 8 deletions(-) diff --git a/datalad_container/containers_add.py b/datalad_container/containers_add.py index b781bbe..3f34638 100644 --- a/datalad_container/containers_add.py +++ b/datalad_container/containers_add.py @@ -108,7 +108,82 @@ class ContainersAdd(Interface): # first docstring line is used a short description in the cmdline help # the rest is put in the verbose help and manpage """Add a container to a dataset + + Adding a container is primarily placing container-related configuration in + the committed dataset configuration at ``.datalad/config``. At minimum, + only two bits of information are required: + + - location of the container image + - specification of how the image is to be executed + + The command assists with assembling this configuration for a range of use + cases, for example: + + - build and configure Singularity images + - retrieve and configure Docker images from Docker hub + - register arbitrary images with custom call specifications + + Moreover, there is limited support for updating registered container + images from remote sources. 
+ + Call format specification + ------------------------- + + While this command handles the container call specification for standard + use cases automatically, fully custom configurations are supported too. + This is done via the [PY: ``call_fmt`` PY][CMD: ``--call-fmt`` CMD] + parameter. + + Like command specification for ``datalad run``, placeholder substitution + is supported. More precisely, call specifications with placeholders are + configured with ``datalad containers-add``, but they are only substituted by + ``datalad containers-run``, i.e., when the container is executed with a + particular command. The following placeholders are supported: + + - ``{cmd}``: command given to ``containers-run`` + - ``{img}``: the path to the container image + - ``{img_dspath}``: the path to the container image, relative to the dataset + containing it (the container may be in a subdataset) + - ``{img_dirpath}``: path to the directory that contains the container image + + In addition to these built-in placeholders, it is possible to pass + additional placeholders on to ``datalad run``, which is responsible for + performing the actual container execution. In order to do this, such + placeholders have to be "double-braced". For example, in order to pass the + ``tmpdir`` placeholder on to ``datalad run``, it must be declared as + ``{{tmpdir}}``. It will pass through ``datalad containers-run`` and reach + ``datalad run`` as ``{tmpdir}``, where it will be substituted with the path + to a temporary directory. + + In this fashion, it is also possible to define custom placeholders that can + also be (re)defined when (re-)executing a run-record. For example, using a + ``{{python}}`` placeholder will require a definition for a ``{python}`` + placeholder to exist at runtime (even with ``datalad rerun``). 
A + (re)definition is possible by (temporarily) setting a matching + configuration item:: + + datalad -c datalad.run.substitutions.python=python3.12 rerun submitted-rev2 + + This can be particularly useful when certain aspects of a (re)execution + shall remain configurable, for example to aid portability. Substitutions + for placeholders are read from configuration. This means that a default + value can be added to the committed dataset configuration, even for + placeholders that neither ``datalad containers-run`` nor ``datalad run`` + have built-in support for. + """ + _examples_ = [ + dict( + text="Register a 'busybox' container from Docker-Hub under the name 'busy'", + code_cmd="datalad containers-add --url dhub://busybox:latest busy", + ), + dict( + text="Register a custom container for executing with custom Python code", + code_cmd=\ + 'datalad containers-add\n\t-i container/image\n' + '\t--call-fmt "{{python}} -m mypkg.container_handler {img} {cmd}"', + ) + ] # parameters of the command, must be exhaustive _params_ = dict( @@ -151,14 +226,8 @@ class ContainersAdd(Interface): call_fmt=Parameter( args=("--call-fmt",), doc="""Command format string indicating how to execute a command in - this container, e.g. "singularity exec {img} {cmd}". Where '{img}' - is a placeholder for the path to the container image and '{cmd}' is - replaced with the desired command. Additional placeholders: - '{img_dspath}' is relative path to the dataset containing the image, - '{img_dirpath}' is the directory containing the '{img}'. - '{python}' expands to the path of the Python executable that is - running the respective DataLad session, for example a - 'datalad containers-run' command. + this container, e.g. "singularity exec {img} {cmd}". For details + see the "Call format specification" in the command documentation. """, metavar="FORMAT", constraints=EnsureStr() | EnsureNone(),