Grouping feature #133

Merged — 28 commits, Nov 22, 2023

Commits (all changes):
21d8d95 wip: implementing node grouping classes and config changes draft (Lasica, Nov 16, 2023)
c6a481d feature: grouping moved to other file, adjusted config and added test… (Lasica, Nov 17, 2023)
796501b wip: making progress to finish grouping reorganization and testing (Lasica, Nov 17, 2023)
582d77f fix: fixed errors introduced in generator class adding grouping (Lasica, Nov 17, 2023)
7af75a5 tests: fixed tests for new version (Lasica, Nov 17, 2023)
7499e8f fixup! feature: grouping moved to other file, adjusted config and add… (Lasica, Nov 17, 2023)
796bed5 tests: implementing grouping tests and fixing minor issues in other t… (Lasica, Nov 17, 2023)
43d56a2 docs: added comment on dependencies blockers (Lasica, Nov 17, 2023)
b4fba77 updated changelog (Lasica, Nov 17, 2023)
8ae0c49 docs: wrote docs about grouping feature, reenabled pyspelling check (Lasica, Nov 17, 2023)
aa60e1f gitignore - added dictionary (Lasica, Nov 17, 2023)
d2bb258 deps: fresh dependency update (Lasica, Nov 17, 2023)
40c8a4b fix: restored python 3.8 compatibility (Lasica, Nov 17, 2023)
46f815e docs: clarified docs and some comments about grouping and some typing… (Lasica, Nov 20, 2023)
cad68fb deps: bump spellcheck action version, fixed config (Lasica, Nov 20, 2023)
d8fb7f6 test: debugging spellcheck GH actions issue (Lasica, Nov 20, 2023)
45e43c2 fix: dictionary is necessary for tox spellcheck hook to work (Lasica, Nov 20, 2023)
126e4a4 cicd: disabled spellcheck config for cicd, as it misbehaves (Lasica, Nov 20, 2023)
857a0f9 Revert "fix: dictionary is necessary for tox spellcheck hook to work" (Lasica, Nov 20, 2023)
650514c refactor: added run_config and context as optional arguments of nodeg… (Lasica, Nov 20, 2023)
ec50f51 refactor: removed run_config from Grouper class, improved grp config … (Lasica, Nov 21, 2023)
59c8696 docs: added disclaimer about grouping feature (Lasica, Nov 21, 2023)
6bbb464 docs: added makefile and fixed disclaimer (Lasica, Nov 21, 2023)
ded9fc7 tests: cicd: reworked e2e tests to support multiple cases and added c… (Lasica, Nov 21, 2023)
bacc20f fix: tests: fixed double error calling (Lasica, Nov 21, 2023)
223fc37 cicd: reverted python version 3.10-3.8, as it failed. debugging can b… (Lasica, Nov 21, 2023)
9973cd8 tests: cicd: extended timeout for standard pipeline e2e test (Lasica, Nov 21, 2023)
71ff26d docs: added pictures to grouping docs and changed note to warning (Lasica, Nov 22, 2023)
15 changes: 15 additions & 0 deletions .github/pre-commit-config.yaml
@@ -0,0 +1,15 @@
repos:
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args: ["--profile", "black", "--line-length=79"]
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.3.0
hooks:
- id: flake8
args: ['--ignore=E203,W503', '--max-line-length=120'] # see https://github.com/psf/black/issues/315 https://github.com/psf/black/issues/52
2 changes: 1 addition & 1 deletion .github/workflows/spellcheck.yml
Expand Up @@ -16,5 +16,5 @@ jobs:
steps:
# Spellcheck
- uses: actions/checkout@v4
- uses: rojopolis/spellcheck-github-actions@0.25.0
- uses: rojopolis/spellcheck-github-actions@0.35.0
name: Spellcheck
25 changes: 18 additions & 7 deletions .github/workflows/test_and_publish.yml
Expand Up @@ -36,7 +36,7 @@ jobs:
- name: Check pre-commit status
run: |
poetry install -v
poetry run pre-commit run --all-files
poetry run pre-commit run --all-files -c .github/pre-commit-config.yaml

- name: Test with tox
run: |
Expand Down Expand Up @@ -98,7 +98,11 @@ jobs:

e2e_tests:
runs-on: ubuntu-latest
timeout-minutes: 60
needs: [unit_tests, sonarcloud]
strategy:
matrix:
e2e_case: ["standard", "grouping"]
steps:
- uses: actions/checkout@v4

Expand All @@ -120,7 +124,7 @@ jobs:
# kedro 0.18.1 is on purpose here, due to https://github.com/kedro-org/kedro-starters/issues/99
run: |
pip install $(find "./dist" -name "*.tar.gz")
kedro new --starter spaceflights --config tests/e2e/starter-config.yml --verbose
kedro new --starter spaceflights --config tests/e2e/${{ matrix.e2e_case }}/starter-config.yml --verbose

- name: Install project dependencies
working-directory: ./spaceflights
Expand All @@ -139,8 +143,13 @@ jobs:
sed -i 's/\(COPY src\/requirements.txt.*\)$/\1\nCOPY kedro-vertexai.tar.gz ./g' Dockerfile
echo "!data/01_raw" >> .dockerignore
kedro vertexai init gid-ml-ops-sandbox europe-west4
mv ../tests/e2e/catalog.yml conf/base/catalog.yml
mv ../tests/e2e/vertexai.yml conf/base/vertexai.yml
cp ../tests/e2e/${{ matrix.e2e_case }}/catalog.yml conf/base/catalog.yml
cp ../tests/e2e/${{ matrix.e2e_case }}/vertexai.yml conf/base/vertexai.yml
# Introducing tagging to pipelines
if [[ "${{ matrix.e2e_case }}" == "grouping" ]]; then
mv ../tests/e2e/${{ matrix.e2e_case }}/pipeline_data_processing.py src/spaceflights/pipelines/data_processing/pipeline.py
mv ../tests/e2e/${{ matrix.e2e_case }}/pipeline_data_science.py src/spaceflights/pipelines/data_science/pipeline.py
fi

- name: Prepare docker env
uses: docker/setup-buildx-action@v3
Expand All @@ -151,14 +160,15 @@ jobs:
- name: Build pipeline docker image
run: |
cd ./spaceflights
docker build --build-arg BASE_IMAGE=python:3.8-buster --tag kedro-vertexai-e2e:latest --load .
docker pull gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:${{ matrix.e2e_case }} || true
docker build --build-arg BASE_IMAGE=python:3.8-buster --tag kedro-vertexai-e2e:${{ matrix.e2e_case }} --load --cache-from=gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:${{ matrix.e2e_case }} .

- name: Publish docker image to GCR
uses: mattes/gce-docker-push-action@v1
with:
creds: ${{ secrets.GOOGLE_APPLICATION_CREDENTIALS }}
src: kedro-vertexai-e2e:latest
dst: gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:latest
src: kedro-vertexai-e2e:${{ matrix.e2e_case }}
dst: gcr.io/gid-ml-ops-sandbox/kedro-vertexai-e2e:${{ matrix.e2e_case }}

- name: Set up GCP Credentials
uses: google-github-actions/auth@v1.1.1
Expand All @@ -172,6 +182,7 @@ jobs:
cd ./spaceflights
export KEDRO_CONFIG_COMMIT_ID=$GITHUB_SHA
kedro vertexai run-once --wait-for-completion

publish:
if: github.event.pull_request == null && github.ref == 'refs/heads/master'
needs: [ e2e_tests, codeql ]
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Expand Up @@ -174,3 +174,8 @@ terraform/terraform.tfstate
.idea
conf/azure/credentials.yml

# pyspelling
dictionary.dic

# vs code
.vscode
6 changes: 6 additions & 0 deletions .pre-commit-config.yaml
@@ -1,3 +1,5 @@
# THIS IS LOCAL PRE-COMMIT CONFIG
# CICD uses separate config in .github until spellcheck hook gets improved
repos:
- repo: https://github.com/pycqa/isort
rev: 5.12.0
Expand All @@ -13,3 +15,7 @@ repos:
hooks:
- id: flake8
args: ['--ignore=E203,W503', '--max-line-length=120'] # see https://github.com/psf/black/issues/315 https://github.com/psf/black/issues/52
- repo: https://github.com/getindata/py-pre-commit-hooks
rev: v0.2.1
hooks:
- id: pyspelling-docker
42 changes: 21 additions & 21 deletions .spellcheck.yml
Expand Up @@ -10,26 +10,26 @@ matrix:
- "README.md"
default_encoding: utf-8
pipeline:
- pyspelling.filters.context:
context_visible_first: true
escapes: \\[\\`~]
delimiters:
# Ignore multiline content between fences (fences can have 3 or more back ticks)
# ```
# content
# ```
- open: '^(?s)(?P<open>`{1,3})[^`]'
close: '(?P=open)'
# Ignore text between inline back ticks
- open: '(?P<open>`)[^`]'
close: '(?P=open)'
# Ignore text in brackets [] and ()
- open: '\['
close: '\]'
- open: '\('
close: '\)'
- open: '\{'
close: '\}'
- pyspelling.filters.context:
context_visible_first: true
escapes: \\[\\`~]
delimiters:
# Ignore multiline content between fences (fences can have 3 or more back ticks)
# ```
# content
# ```
- open: '(?s)^(?P<open>`{1,3})[^`]'
close: '(?P=open)'
# Ignore text between inline back ticks
- open: '(?P<open>`)[^`]'
close: '(?P=open)'
# Ignore text in brackets [] and ()
- open: '\['
close: '\]'
- open: '\('
close: '\)'
- open: '\{'
close: '\}'
dictionary:
wordlists:
- docs/spellcheck_exceptions.txt
- docs/spellcheck_exceptions.txt
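The regex change in this file (moving `(?s)` before `^`) is more than cosmetic: newer Python versions reject global inline flags that are not at the very start of the pattern. A small sketch of the difference (the fence-matching pattern is taken from the config above; behavior notes assume CPython's `re` module):

```python
import re

# Corrected pattern from .spellcheck.yml: the global flag (?s) comes first.
fence = re.compile(r'(?s)^(?P<open>`{1,3})[^`]')

# It still matches an opening code fence as intended.
assert fence.match("```python") is not None
assert fence.match("plain text") is None

# The old form '^(?s)...' places the global flag mid-pattern.
# On Python 3.11+ this raises re.error ("global flags not at the start
# of the expression"); on older versions it only warns.
try:
    re.compile(r'^(?s)(?P<open>`{1,3})[^`]')
except re.error as exc:
    print("old pattern rejected:", exc)
```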
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -4,6 +4,7 @@

- Added explicit pyarrow dependency to avoid a critical vulnerability
- Updated dependencies and tested for kedro `0.18.14`
- [Feature 🚀] Node grouping: added an option to group multiple Kedro nodes for execution in a single Vertex AI process, allowing better optimization: fewer steps, shorter delays when running Vertex AI nodes, and less time wasted on data serialization thanks to the possibility of using `MemoryDataset`

## [0.9.1] - 2023-08-16

Expand Down
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
58 changes: 56 additions & 2 deletions docs/source/02_installation/02_configuration.md
Expand Up @@ -26,6 +26,15 @@ run_config:
# Optional pipeline description
#description: "Very Important Pipeline"

# Optional config for node execution grouping. Two classes are provided:
# - IdentityNodeGrouper: the default, no-grouping option
# - TagNodeGrouper: tag-based grouping
grouping:
cls: kedro_vertexai.grouping.IdentityNodeGrouper
# cls: kedro_vertexai.grouping.TagNodeGrouper
# params:
# tag_prefix: "group:"

# How long to keep underlying Argo workflow (together with pods and data
# volume after pipeline finishes) [in seconds]. Default: 1 week
ttl: 604800
Expand All @@ -34,8 +43,10 @@ run_config:
# pipeline status. Used to send notifications or raise the alerts
# on_exit_pipeline: notify_via_slack

# Optional section allowing adjustment of the resources
# reservations and limits for the nodes
# Optional section allowing adjustment of the resources, reservations and limits
# for the nodes. You can specify node names or tags to select which nodes the
# requirements apply to (also in node selectors). When not provided, they default
# to 500m CPU and 1024Mi memory.
# If you don't want to specify pipeline resources, set both to None in __default__.
resources:

# For nodes that require more RAM you can increase the "memory"
Expand Down Expand Up @@ -170,6 +181,49 @@ def generate_config(self) -> dict:
The first one, `target_config_file`, should return the name of the configuration file to be generated (e.g. `credentials.yml`), and `generate_config` should return a dictionary, which will then be serialized into the target file as YAML. If the target file already exists during the invocation, it will be merged with the existing one (see the method `kedro_vertexai.dynamic_config.DynamicConfigProvider.merge_with_existing`) and then saved again.
Note that `generate_config` has access to the initialized plugin config via the `self.config` property, so any values from the `vertexai.yml` configuration are accessible.
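A minimal sketch of what such a provider might look like. A stand-in base class keeps the example self-contained and runnable; in a real project you would subclass `kedro_vertexai.dynamic_config.DynamicConfigProvider` as described above, and all config keys used here are hypothetical:

```python
class DynamicConfigProvider:
    """Stand-in for kedro_vertexai.dynamic_config.DynamicConfigProvider,
    used only so this sketch runs without the plugin installed."""

    def __init__(self, config: dict):
        self.config = config  # initialized plugin config (from vertexai.yml)


class EnvCredentialsProvider(DynamicConfigProvider):
    @property
    def target_config_file(self) -> str:
        # Name of the configuration file to be generated.
        return "credentials.yml"

    def generate_config(self) -> dict:
        # Values from vertexai.yml are reachable via self.config.
        project = self.config.get("project_id", "unknown-project")
        return {"gcp": {"project_id": project, "token": "fetched-at-runtime"}}


provider = EnvCredentialsProvider({"project_id": "demo"})
print(provider.generate_config())
```

The returned dictionary would then be serialized to `credentials.yml` (merged with any existing file) by the plugin.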


## Grouping feature

The optional `grouping` section enables the grouping feature, which aggregates the execution of many Kedro nodes into a single VertexAI node (or several). It allows you to subdivide Kedro pipelines into as many steps as logically make sense while keeping the advantages of in-memory data transmission. It also saves a lot of time by avoiding Docker container start-up delays on Vertex nodes, which can amount to about 2 minutes per VertexAI node.

The API allows you to implement your own aggregation method: you provide the aggregating class and its additional init params as a `kwargs` dictionary. The default class is `IdentityNodeGrouper`, which does not actually group the nodes (the plugin behaves as in versions before `0.9.1`). The class that implements grouping based on a configured tag prefix is `TagNodeGrouper`; the default prefix is `"group:"`. Whatever follows the tag prefix is used as the name of the group of nodes. Only one tag with the grouping prefix is allowed per node; more than one results in a `GroupingException`. Example configuration:
```yaml
grouping:
cls: kedro_vertexai.grouping.TagNodeGrouper
params:
tag_prefix: "group:"
```

The above configuration applied to this sample pipeline:
```python
Pipeline([
node(some_operation, "A", "B", name="node1", tags=["foo", "group:nodegroup"]),
node(some_operation, "B", "C", name="node2", tags=["bar", "group:nodegroup"]),
node(some_operation, "C", "D", name="node3", tags=["baz"]),
])
```
results in 2 VertexAI nodes: the first, named `nodegroup`, runs the `node1` and `node2` Kedro nodes inside and provides output `C`; the second is `node3`. An additional MLflow node may be present if `kedro-mlflow` is used; right now it is not possible to group it. If you need that functionality, search for or create an issue on the [GitHub page of the plugin](https://github.com/getindata/kedro-vertexai/issues).
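The tag-to-group mapping described above can be sketched in plain Python. This standalone snippet mimics the documented `TagNodeGrouper` semantics without importing kedro; the function name and error type are illustrative, not the plugin's actual API:

```python
TAG_PREFIX = "group:"

# Node name -> tags, mirroring the sample pipeline above.
nodes = {
    "node1": ["foo", "group:nodegroup"],
    "node2": ["bar", "group:nodegroup"],
    "node3": ["baz"],
}


def group_by_tag(nodes: dict, prefix: str = TAG_PREFIX) -> dict:
    """Map each group name to the set of Kedro nodes it contains."""
    mapping: dict = {}
    for name, tags in nodes.items():
        group_tags = [t for t in tags if t.startswith(prefix)]
        if len(group_tags) > 1:
            # The plugin raises GroupingException in this case.
            raise ValueError(f"only one {prefix!r} tag allowed per node: {name}")
        # Untagged nodes form single-node groups named after the node itself.
        group = group_tags[0][len(prefix):] if group_tags else name
        mapping.setdefault(group, set()).add(name)
    return mapping


assert group_by_tag(nodes) == {
    "nodegroup": {"node1", "node2"},
    "node3": {"node3"},
}
```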
Review comment (Contributor):
> I think that it would be awesome to show the following in the documentation, below the description:
> 1. Screenshot of Kedro-Viz of the regular spaceflights pipeline
> 2. Screenshot of code from e2e tests with tags (groups)
> 3. Screenshot from Vertex AI
>
> All side by side, with arrows from one to another. A visualization always speaks better than a wall of text, and people will know at a glance what to expect.

Reply (Contributor, Author):
> Ok, I'll add the pictures.
The grouping class is used during pipeline translation by the plugin's pipeline generator. It implements the interface of the `NodeGrouper` class with a `group` function that accepts `pipeline.node_dependencies` and returns a `Grouping`. `Grouping` is a `dataclass` with two dictionaries:
- `node_mapping` - defines the group names and which set of nodes belongs to each group
- `dependencies` - defines the child-parent relations of all groups in `node_mapping`

The `Grouping` class also validates the dependencies upon creation to check that the grouping is valid, i.e. that it does not introduce a cycle into the dependency graph.
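A minimal sketch of the `Grouping` contract just described. The field names (`node_mapping`, `dependencies`) come from the text above, but this cycle-check implementation is an assumption, not the plugin's actual code:

```python
from dataclasses import dataclass


@dataclass
class Grouping:
    node_mapping: dict  # group name -> set of Kedro node names
    dependencies: dict  # group name -> set of parent group names

    def __post_init__(self) -> None:
        self._check_acyclic()

    def _check_acyclic(self) -> None:
        # Depth-first search over the group graph; reaching a "grey"
        # (in-progress) group again means a cycle, i.e. an invalid grouping.
        WHITE, GREY, BLACK = 0, 1, 2
        colour = {g: WHITE for g in self.node_mapping}

        def visit(group: str) -> None:
            colour[group] = GREY
            for parent in self.dependencies.get(group, ()):
                if colour.get(parent) == GREY:
                    raise ValueError(f"cycle detected at group {parent!r}")
                if colour.get(parent) == WHITE:
                    visit(parent)
            colour[group] = BLACK

        for group in list(colour):
            if colour[group] == WHITE:
                visit(group)


# Valid grouping for the sample pipeline: node3 depends on nodegroup.
ok = Grouping(
    node_mapping={"nodegroup": {"node1", "node2"}, "node3": {"node3"}},
    dependencies={"node3": {"nodegroup"}},
)
```

Constructing a `Grouping` whose `dependencies` contain a cycle (e.g. two groups depending on each other) would raise during `__post_init__`.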

````{warning}
When using this feature, make sure that all nodes in the pipeline have names and that those names are unique within the pipeline, as the grouping class and VertexAI node naming depend on them.
````

### Example

Here you can see how the standard spaceflights pipeline changes after enabling the grouping feature configured with `TagNodeGrouper`, using the following tagging (view from Kedro-Viz):

![Vertex AI Pipeline](grouped_kedro_viz.png)

We get the following result:

![Vertex AI Pipeline](grouping_visualisation.png)


## Resources configuration

Optional `resources` and `node_selectors` sections enable adjustment of the resources reservations and limits for the
Expand Down