diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml
new file mode 100644
index 00000000..c9cb054d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,69 @@
+name: 🐛 Bug Report
+description: File a bug report to help us improve
+labels: [bug]
+body:
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: |
+ Thanks for reporting a bug! Please describe what you were trying to get done.
+ Tell us what happened and what went wrong.
+ validations:
+ required: true
+
+ - type: textarea
+ id: what-did-you-expect-to-happen
+ attributes:
+ label: What did you expect to happen?
+ description: |
+ Describe what you expected to happen.
+ validations:
+ required: false
+
+ - type: textarea
+ id: sample-code
+ attributes:
+ label: Minimal Complete Verifiable Example
+ description: |
+ Minimal, self-contained copy-pastable example that demonstrates the issue. This will be automatically formatted into code, so no need for markdown backticks.
+ render: Python
+
+ - type: checkboxes
+ id: mvce-checkboxes
+ attributes:
+ label: MVCE confirmation
+ description: |
+ Please confirm that the bug report is in an excellent state, so we can understand & fix it quickly & efficiently. For more details, check out:
+
+ - [Minimal Complete Verifiable Examples](https://stackoverflow.com/help/mcve)
+ - [Craft Minimal Bug Reports](https://matthewrocklin.com/minimal-bug-reports)
+
+ options:
+ - label: Minimal example — the example is as focused as reasonably possible to demonstrate the underlying issue in xarray.
+ - label: Complete example — the example is self-contained, including all data and the text of any traceback.
+ - label: Verifiable example — the example runs when copied & pasted into a fresh Python environment.
+ - label: New issue — a search of GitHub Issues suggests this is not a duplicate.
+
+ - type: textarea
+ id: log-output
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant output. This will be automatically formatted into code, so no need for markdown backticks.
+ render: Python
+
+ - type: textarea
+ id: extra
+ attributes:
+ label: Anything else we need to know?
+ description: |
+ Please describe any other information you want to share.
+
+ - type: textarea
+ id: show-versions
+ attributes:
+ label: Environment
+ description: |
+ Paste the output of `icechunk.print_debug_info()`
+ validations:
+ required: true
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 00000000..3eece15e
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1 @@
+blank_issues_enabled: True
diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md
new file mode 100644
index 00000000..12aac94d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/documentation.md
@@ -0,0 +1,29 @@
+---
+name: 📚 Documentation Issue/Suggestion
+about: Report problems with the docs or suggest improvements
+labels: documentation
+---
+
+
+
+
+### Problem
+
+
+
+
+### Suggested Improvement
+
+
diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md
new file mode 100644
index 00000000..2446cfca
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/enhancement.md
@@ -0,0 +1,35 @@
+---
+name: Enhancement/Feature Request
+about: Suggest something that could be improved or a new feature to add
+labels: enhancement
+---
+
+
+
+### Problem
+
+
+
+### Proposed Solution
+
+
+
+### Additional context
+
+
diff --git a/.github/workflows/python-check.yaml b/.github/workflows/python-check.yaml
index 32b9eabb..aaa033f6 100644
--- a/.github/workflows/python-check.yaml
+++ b/.github/workflows/python-check.yaml
@@ -6,16 +6,6 @@ on:
- main
pull_request:
types: [opened, reopened, synchronize, labeled]
- paths:
- - 'icechunk/**'
- - 'icechunk-python/**'
- - '.github/workflows/python-check.yaml'
- - 'Cargo.toml'
- - 'Cargo.lock'
- - 'compose.yaml'
- - 'deny.toml'
- - 'Justfile'
- - 'rustfmt.toml'
workflow_dispatch:
concurrency:
@@ -173,5 +163,6 @@ jobs:
python3 -m venv .venv
source .venv/bin/activate
pip install icechunk['test'] --find-links dist --force-reinstall
+ pip install pytest-mypy-plugins
# pass xarray's pyproject.toml so that pytest can find the `flaky` fixture
pytest -c=../../xarray/pyproject.toml -W ignore tests/run_xarray_backends_tests.py
diff --git a/.github/workflows/rust-ci.yaml b/.github/workflows/rust-ci.yaml
index 8e272df9..d6cccc7c 100644
--- a/.github/workflows/rust-ci.yaml
+++ b/.github/workflows/rust-ci.yaml
@@ -5,16 +5,6 @@ name: Rust CI
on:
pull_request:
types: [opened, reopened, synchronize, labeled]
- paths:
- - 'icechunk/**'
- - 'icechunk-python/**'
- - '.github/workflows/rust-ci.yaml'
- - 'Cargo.toml'
- - 'Cargo.lock'
- - 'compose.yaml'
- - 'deny.toml'
- - 'Justfile'
- - 'rustfmt.toml'
push:
branches:
- main
diff --git a/Cargo.lock b/Cargo.lock
index 317e251c..069cc8e5 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -60,12 +60,6 @@ dependencies = [
"libc",
]
-[[package]]
-name = "anyhow"
-version = "1.0.94"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7"
-
[[package]]
name = "async-recursion"
version = "1.1.1"
@@ -532,6 +526,15 @@ dependencies = [
"rustc-demangle",
]
+[[package]]
+name = "backtrace-ext"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "537beee3be4a18fb023b570f80e3ae28003db9167a751266b259926e25539d50"
+dependencies = [
+ "backtrace",
+]
+
[[package]]
name = "base16ct"
version = "0.1.1"
@@ -589,9 +592,9 @@ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7"
[[package]]
name = "bitflags"
-version = "2.6.0"
+version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
+checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
[[package]]
name = "block-buffer"
@@ -882,6 +885,12 @@ dependencies = [
"typeid",
]
+[[package]]
+name = "err-into"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7f003b437a8029298beb1a849ea8c5c8229f0c1225e3e854c4523dbe8d90b02d"
+
[[package]]
name = "errno"
version = "0.3.9"
@@ -908,6 +917,16 @@ dependencies = [
"subtle",
]
+[[package]]
+name = "flatbuffers"
+version = "25.2.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1"
+dependencies = [
+ "bitflags",
+ "rustc_version",
+]
+
[[package]]
name = "fnv"
version = "1.0.7"
@@ -1339,7 +1358,7 @@ dependencies = [
[[package]]
name = "icechunk"
-version = "0.1.0"
+version = "0.2.3"
dependencies = [
"async-recursion",
"async-stream",
@@ -1352,6 +1371,8 @@ dependencies = [
"base64 0.22.1",
"bytes",
"chrono",
+ "err-into",
+ "flatbuffers",
"futures",
"itertools 0.14.0",
"object_store",
@@ -1367,12 +1388,15 @@ dependencies = [
"serde_bytes",
"serde_json",
"serde_with",
- "serde_yml",
+ "serde_yaml_ng",
"tempfile",
"test-strategy",
"thiserror 2.0.11",
"tokio",
"tokio-util",
+ "tracing",
+ "tracing-error",
+ "tracing-subscriber",
"typed-path",
"typetag",
"url",
@@ -1381,7 +1405,7 @@ dependencies = [
[[package]]
name = "icechunk-python"
-version = "0.1.0"
+version = "0.2.3"
dependencies = [
"async-stream",
"async-trait",
@@ -1390,6 +1414,7 @@ dependencies = [
"futures",
"icechunk",
"itertools 0.14.0",
+ "miette",
"pyo3",
"pyo3-async-runtimes",
"serde",
@@ -1584,6 +1609,12 @@ version = "2.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708"
+[[package]]
+name = "is_ci"
+version = "1.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7655c9839580ee829dfacba1d1278c2b7883e50a277ff7541299489d6bdfdc45"
+
[[package]]
name = "itertools"
version = "0.13.0"
@@ -1638,16 +1669,6 @@ version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
-[[package]]
-name = "libyml"
-version = "0.0.5"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3302702afa434ffa30847a83305f0a69d6abd74293b6554c18ec85c7ef30c980"
-dependencies = [
- "anyhow",
- "version_check",
-]
-
[[package]]
name = "linux-raw-sys"
version = "0.4.14"
@@ -1685,6 +1706,15 @@ dependencies = [
"hashbrown 0.14.5",
]
+[[package]]
+name = "matchers"
+version = "0.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
+dependencies = [
+ "regex-automata 0.1.10",
+]
+
[[package]]
name = "md-5"
version = "0.10.6"
@@ -1710,6 +1740,37 @@ dependencies = [
"autocfg",
]
+[[package]]
+name = "miette"
+version = "7.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1a955165f87b37fd1862df2a59547ac542c77ef6d17c666f619d1ad22dd89484"
+dependencies = [
+ "backtrace",
+ "backtrace-ext",
+ "cfg-if",
+ "miette-derive",
+ "owo-colors",
+ "supports-color",
+ "supports-hyperlinks",
+ "supports-unicode",
+ "terminal_size",
+ "textwrap",
+ "thiserror 1.0.69",
+ "unicode-width",
+]
+
+[[package]]
+name = "miette-derive"
+version = "7.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bf45bf44ab49be92fd1227a3be6fc6f617f1a337c06af54981048574d8783147"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
[[package]]
name = "mime"
version = "0.3.17"
@@ -1737,6 +1798,16 @@ dependencies = [
"windows-sys 0.52.0",
]
+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
[[package]]
name = "num-conv"
version = "0.1.0"
@@ -1820,6 +1891,18 @@ version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"
+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
+[[package]]
+name = "owo-colors"
+version = "4.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fb37767f6569cd834a413442455e0f066d0d522de8630436e2a1761d9726ba56"
+
[[package]]
name = "p256"
version = "0.11.1"
@@ -1948,7 +2031,7 @@ dependencies = [
"rand 0.8.5",
"rand_chacha 0.3.1",
"rand_xorshift",
- "regex-syntax",
+ "regex-syntax 0.8.5",
"rusty-fork",
"tempfile",
"unarray",
@@ -2212,8 +2295,17 @@ checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
dependencies = [
"aho-corasick",
"memchr",
- "regex-automata",
- "regex-syntax",
+ "regex-automata 0.4.9",
+ "regex-syntax 0.8.5",
+]
+
+[[package]]
+name = "regex-automata"
+version = "0.1.10"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
+dependencies = [
+ "regex-syntax 0.6.29",
]
[[package]]
@@ -2224,7 +2316,7 @@ checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
dependencies = [
"aho-corasick",
"memchr",
- "regex-syntax",
+ "regex-syntax 0.8.5",
]
[[package]]
@@ -2233,6 +2325,12 @@ version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a"
+[[package]]
+name = "regex-syntax"
+version = "0.6.29"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
+
[[package]]
name = "regex-syntax"
version = "0.8.5"
@@ -2653,18 +2751,16 @@ dependencies = [
]
[[package]]
-name = "serde_yml"
-version = "0.0.12"
+name = "serde_yaml_ng"
+version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59e2dd588bf1597a252c3b920e0143eb99b0f76e4e082f4c92ce34fbc9e71ddd"
+checksum = "7b4db627b98b36d4203a7b458cf3573730f2bb591b28871d916dfa9efabfd41f"
dependencies = [
"indexmap 2.2.6",
"itoa",
- "libyml",
- "memchr",
"ryu",
"serde",
- "version_check",
+ "unsafe-libyaml",
]
[[package]]
@@ -2689,6 +2785,15 @@ dependencies = [
"digest",
]
+[[package]]
+name = "sharded-slab"
+version = "0.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
+dependencies = [
+ "lazy_static",
+]
+
[[package]]
name = "signal-hook-registry"
version = "1.4.2"
@@ -2811,6 +2916,27 @@ version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
+[[package]]
+name = "supports-color"
+version = "3.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c64fc7232dd8d2e4ac5ce4ef302b1d81e0b80d055b9d77c7c4f51f6aa4c867d6"
+dependencies = [
+ "is_ci",
+]
+
+[[package]]
+name = "supports-hyperlinks"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "804f44ed3c63152de6a9f90acbea1a110441de43006ea51bcce8f436196a288b"
+
+[[package]]
+name = "supports-unicode"
+version = "3.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b7401a30af6cb5818bb64852270bb722533397edcfc7344954a38f420819ece2"
+
[[package]]
name = "syn"
version = "2.0.89"
@@ -2862,6 +2988,16 @@ dependencies = [
"windows-sys 0.59.0",
]
+[[package]]
+name = "terminal_size"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5352447f921fda68cf61b4101566c0bdb5104eff6804d0678e5227580ab6a4e9"
+dependencies = [
+ "rustix",
+ "windows-sys 0.59.0",
+]
+
[[package]]
name = "test-strategy"
version = "0.4.0"
@@ -2874,6 +3010,16 @@ dependencies = [
"syn",
]
+[[package]]
+name = "textwrap"
+version = "0.16.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9"
+dependencies = [
+ "unicode-linebreak",
+ "unicode-width",
+]
+
[[package]]
name = "thiserror"
version = "1.0.69"
@@ -2914,6 +3060,16 @@ dependencies = [
"syn",
]
+[[package]]
+name = "thread_local"
+version = "1.1.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
+dependencies = [
+ "cfg-if",
+ "once_cell",
+]
+
[[package]]
name = "time"
version = "0.3.36"
@@ -3040,9 +3196,9 @@ checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
[[package]]
name = "tracing"
-version = "0.1.40"
+version = "0.1.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef"
+checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
dependencies = [
"pin-project-lite",
"tracing-attributes",
@@ -3051,9 +3207,9 @@ dependencies = [
[[package]]
name = "tracing-attributes"
-version = "0.1.27"
+version = "0.1.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7"
+checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
dependencies = [
"proc-macro2",
"quote",
@@ -3062,11 +3218,51 @@ dependencies = [
[[package]]
name = "tracing-core"
-version = "0.1.32"
+version = "0.1.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54"
+checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
dependencies = [
"once_cell",
+ "valuable",
+]
+
+[[package]]
+name = "tracing-error"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b1581020d7a273442f5b45074a6a57d5757ad0a47dac0e9f0bd57b81936f3db"
+dependencies = [
+ "tracing",
+ "tracing-subscriber",
+]
+
+[[package]]
+name = "tracing-log"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
+dependencies = [
+ "log",
+ "once_cell",
+ "tracing-core",
+]
+
+[[package]]
+name = "tracing-subscriber"
+version = "0.3.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
+dependencies = [
+ "matchers",
+ "nu-ansi-term",
+ "once_cell",
+ "regex",
+ "sharded-slab",
+ "smallvec",
+ "thread_local",
+ "tracing",
+ "tracing-core",
+ "tracing-log",
]
[[package]]
@@ -3129,12 +3325,30 @@ version = "1.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b"
+[[package]]
+name = "unicode-linebreak"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f"
+
+[[package]]
+name = "unicode-width"
+version = "0.1.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af"
+
[[package]]
name = "unindent"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce"
+[[package]]
+name = "unsafe-libyaml"
+version = "0.2.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861"
+
[[package]]
name = "untrusted"
version = "0.9.0"
@@ -3176,6 +3390,12 @@ version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314"
+[[package]]
+name = "valuable"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
+
[[package]]
name = "version_check"
version = "0.9.5"
@@ -3320,6 +3540,22 @@ dependencies = [
"wasm-bindgen",
]
+[[package]]
+name = "winapi"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
+dependencies = [
+ "winapi-i686-pc-windows-gnu",
+ "winapi-x86_64-pc-windows-gnu",
+]
+
+[[package]]
+name = "winapi-i686-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
+
[[package]]
name = "winapi-util"
version = "0.1.9"
@@ -3329,6 +3565,12 @@ dependencies = [
"windows-sys 0.59.0",
]
+[[package]]
+name = "winapi-x86_64-pc-windows-gnu"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
+
[[package]]
name = "windows-core"
version = "0.52.0"
diff --git a/Cargo.toml b/Cargo.toml
index 2d0c0136..7c8f6958 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,6 +9,7 @@ unwrap_used = "warn"
panic = "warn"
todo = "warn"
unimplemented = "warn"
+dbg_macro = "warn"
[workspace.metadata.release]
allow-branch = ["main"]
diff --git a/Changelog.python.md b/Changelog.python.md
index d4c2e700..1211c3ba 100644
--- a/Changelog.python.md
+++ b/Changelog.python.md
@@ -1,14 +1,159 @@
# Changelog
+## Python Icechunk Library 0.2.3
+
+### Features
+
+- `Repository` can now be pickled (see the sketch below).
+- `icechunk.print_debug_info()` now prints out relevant information about the installed version of icechunk and its dependencies.
+- `icechunk.Storage` now supports `__repr__`. Only configuration values are printed, never credentials.
+
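+A minimal sketch of the new pickling support (the local path below is only illustrative):
+
+```python
+import pickle
+
+import icechunk
+
+repo = icechunk.Repository.create(icechunk.local_filesystem_storage("/tmp/pickle-demo"))
+# round-trip the Repository through pickle, e.g. to ship it to a worker process
+restored = pickle.loads(pickle.dumps(repo))
+assert isinstance(restored, icechunk.Repository)
+```
+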
+### Fixes
+
+- Fixes a missing export for Google Cloud Storage credentials.
+
+## Python Icechunk Library 0.2.2
+
+### Features
+
+- Added the ability to check out a session `as_of` a specific time. This is useful for replaying what the repo looked like at a specific point in time (see the sketch below).
+- Support for refreshable Google Cloud Storage credentials.
+
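+A hypothetical sketch of the `as_of` checkout, assuming `readonly_session` accepts it as a keyword argument alongside the branch name (the storage path is illustrative):
+
+```python
+import datetime
+
+import icechunk
+
+storage = icechunk.local_filesystem_storage("/tmp/asof-demo")
+repo = icechunk.Repository.open(storage=storage)
+# read the repo as it looked at the start of 2025 (UTC)
+session = repo.readonly_session(
+    branch="main",
+    as_of=datetime.datetime(2025, 1, 1, tzinfo=datetime.timezone.utc),
+)
+```
+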
+### Fixes
+
+- Fix a bug where the clean prefix detection was hiding other errors when creating repositories.
+- API now correctly uses `snapshot_id` instead of `snapshot` consistently.
+- Only write `content-type` to metadata files if the target object store supports it.
+
+## Python Icechunk Library 0.2.1
+
+### Features
+
+- Users can now override consistency defaults. With this, Icechunk is usable in a larger set of object stores,
+including those without support for conditional updates. In this setting, Icechunk loses some of its consistency guarantees.
+These configuration variables are for advanced users only, and should only be changed if necessary for compatibility.
+
+ ```python
+ class StorageSettings:
+ ...
+
+ @property
+ def unsafe_use_conditional_update(self) -> bool | None:
+ ...
+ @property
+ def unsafe_use_conditional_create(self) -> bool | None:
+ ...
+ @property
+ def unsafe_use_metadata(self) -> bool | None:
+ ...
+ ```
+
+## Python Icechunk Library 0.2.0
+
+This release is focused on stabilizing Icechunk's on-disk serialization format. It is a backwards-incompatible
+change, hopefully the last one. Data written with previous versions must be reingested to be read with
+Icechunk 0.2.0.
+
+### Features
+
+- `Repository.ancestry` now returns an iterator, allowing interrupting the traversal of the version tree at any point.
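+
+For example, given an open `repo`, the traversal can now be stopped early (a short sketch; the branch name and stopping condition are illustrative):
+
+```python
+for snapshot in repo.ancestry(branch="main"):
+    print(snapshot.id, snapshot.message)
+    if snapshot.message.startswith("initialize"):
+        break  # stop walking the version tree early
+```
+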
+- New on-disk format using [flatbuffers](https://flatbuffers.dev/) makes it easier to document and implement
+(de-)serialization. This enables the creation of alternative readers and writers for the Icechunk format.
+- `Repository.readonly_session` interprets its first positional argument as a branch name:
+
+```python
+# before:
+repo.readonly_session(branch="dev")
+
+# after:
+repo.readonly_session("dev")
+
+# still possible:
+repo.readonly_session(tag="v0.1")
+repo.readonly_session(branch="foo")
+repo.readonly_session(snapshot_id="NXH3M0HJ7EEJ0699DPP0")
+```
+
+- Icechunk is now more resilient to changes in the Zarr metadata spec, and can handle Zarr extensions.
+- More documentation.
+
+### Performance
+
+- We have improved our benchmarks, making them more flexible and effective at finding possible regressions.
+- New `Store.set_virtual_refs` method allows setting multiple virtual chunks for the same array. This
+significantly speeds up the creation of virtual datasets.
+
+### Fixes
+
+- Fix a bug in clean prefix detection
+
+## Python Icechunk Library 0.1.3
+
+### Features
+
+- Repositories can now evaluate the `diff` between two snapshots.
+- Sessions can show the current `status` of the working copy.
+- Adds the ability to specify bearer tokens for authenticating with Google Cloud Storage.
+
+### Fixes
+
+- Don't write `dimension_names` to the zarr metadata if no dimension names are set. Previously, `null` was written.
+
+## Python Icechunk Library 0.1.2
+
+### Features
+
+- Improved error messages. Exceptions raised by Icechunk now include a lot more information
+about what happened and what Icechunk was doing when the exception was raised. Example error message:
+ 
+- Icechunk generates logs now. Set the environment variable `ICECHUNK_LOG=icechunk=debug` to print debug logs to stdout. Available "levels" in order of increasing verbosity are `error`, `warn`, `info`, `debug`, `trace`. The default level is `error`. Example log:
+ 
+- Icechunk can now be installed using `conda`:
+
+ ```shell
+ conda install -c conda-forge icechunk
+ ```
+
+- Optionally delete branches and tags that point to expired snapshots:
+
+ ```python
+ def expire_snapshots(
+ self,
+ older_than: datetime.datetime,
+ *,
+ delete_expired_branches: bool = False,
+ delete_expired_tags: bool = False,
+ ) -> set[str]: ...
+ ```
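+
+ A hedged usage sketch, assuming `expire_snapshots` is called on a `Repository` (the 30-day cutoff is illustrative):
+
+ ```python
+ import datetime
+
+ cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=30)
+ expired_ids = repo.expire_snapshots(cutoff, delete_expired_branches=True)
+ print(f"expired {len(expired_ids)} snapshots")
+ ```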
+
+- More documentation. See [the Icechunk website](https://icechunk.io/)
+
+### Performance
+
+- Faster `exists` zarr `Store` method.
+- Implement `Store.getsize_prefix` method. This significantly speeds up `info_complete`.
+
+### Fixes
+
+- Default regular expression to preload manifests.
+
+## Python Icechunk Library 0.1.1
+
+### Fixes
+
+- Session deserialization error when using distributed writes
+
## Python Icechunk Library 0.1.0
### Features
- Expiration and garbage collection. It's now possible to maintain only recent versions of the repository, reclaiming the storage used exclusively by expired versions.
- Allow an arbitrary map of properties to commits. Example:
+
```
session.commit("some message", metadata={"author": "icechunk-team"})
```
+
+These properties can be retrieved via `ancestry`.
- New `chunk_coordinates` function to list all initialized chunks in an array.
- It's now possible to delete tags. New tags with the same name won't be allowed, to preserve the immutability of snapshots pointed to by a tag.
@@ -33,7 +178,6 @@
- Bad manifest split in unmodified arrays
- Documentation was updated to the latest API.
-
## Python Icechunk Library 0.1.0a15
### Fixes
@@ -48,6 +192,7 @@
- The snapshot now keeps track of the chunk space bounding box for each manifest
- Configuration settings can now be overridden in a field-by-field basis
Example:
+
```python
config = icechunk.RepositoryConfig(inline_chunk_threshold_byte=0)
storage = ...
@@ -57,6 +202,7 @@
config=config,
)
```
+
will use 0 for `inline_chunk_threshold_byte` but all other configuration fields will come from
the repository persistent config. If persistent config is not set, configuration defaults will
take its place.
@@ -91,6 +237,7 @@
config=config,
)
- `ancestry` function can now receive a branch/tag name or a snapshot id
+
- `set_virtual_ref` can now validate the virtual chunk container exists
```
diff --git a/README.md b/README.md
index 6e4f5e0d..2834b952 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,7 @@

+
@@ -17,12 +18,12 @@ that enhance performance, collaboration, and safety in a cloud-computing context
- This page: a general overview of the project's goals and components.
- [Icechunk Launch Blog Post](https://earthmover.io/blog/icechunk)
-- [Frequently Asked Questions](https://icechunk.io/faq)
-- Documentation for [Icechunk Python](https://icechunk.io/icechunk-python), the main user-facing
+- [Frequently Asked Questions](https://icechunk.io/en/latest/faq/)
+- Documentation for [Icechunk Python](https://icechunk.io/en/latest/icechunk-python), the main user-facing
library
-- Documentation for the [Icechunk Rust Crate](https://icechunk.io/icechunk-rust)
-- The [Contributor Guide](https://icechunk.io/contributing)
-- The [Icechunk Spec](https://icechunk.io/spec)
+- Documentation for the [Icechunk Rust Crate](https://icechunk.io/en/latest/icechunk-rust)
+- The [Contributor Guide](https://icechunk.io/en/latest/contributing)
+- The [Icechunk Spec](https://icechunk.io/en/latest/spec)
## Icechunk Overview
@@ -87,6 +88,7 @@ Arbitrary JSON-style key-value metadata can be attached to both arrays and group
Every update to an Icechunk store creates a new **snapshot** with a unique ID.
Icechunk users must organize their updates into groups of related operations called **transactions**.
For example, appending a new time slice to multiple arrays should be done as a single transaction, comprising the following steps
+
1. Update the array metadata to resize the array to accommodate the new elements.
2. Write new chunks for each array in the group.
diff --git a/design-docs/008-no-copy-serialization-formats.md b/design-docs/008-no-copy-serialization-formats.md
new file mode 100644
index 00000000..f83f3a15
--- /dev/null
+++ b/design-docs/008-no-copy-serialization-formats.md
@@ -0,0 +1,127 @@
+# Evaluation of different serialization formats
+
+We want to move away from msgpack serialization for Icechunk metadata files.
+
+## Why
+
+* Msgpack requires an expensive parsing process upfront. If the user only wants
+to pull a few chunk refs from a manifest, they still need to parse the whole manifest.
+* Msgpack deserializes to Rust data structures. This is good for simplicity of code, but
+probably not good for memory consumption (more pointers everywhere).
+* Msgpack gives too many options for how to serialize things; there is no canonical way,
+so it's not easy to predict how `serde` is going to serialize our data structures, and
+the encoding could even change from version to version.
+* It's hard to explain in the spec what goes into the metadata files; we would need to go
+into the `rmp_serde` implementation, see what it does, and document that in the spec.
+
+## Other options
+
+There is a never-ending menu, from a custom binary format to Parquet and everything in between.
+We focused mostly on no-copy formats, because of some of the issues enumerated above. We also
+prefer formats that have a tight schema and can be documented with
+some form of IDL.
+
+## Performance evaluation
+
+We evaluated the performance of msgpack, flatbuffers, and capnproto. The evaluation looks at:
+
+* Manifest file size, for a big manifest with 1M native chunk refs.
+* Speed of writing.
+* Speed of reading.
+
+We wrote an example program in `examples/multithreaded_get_chunk_refs.rs`.
+This program writes a big repo to local file storage; it doesn't actually write the chunks,
+since we are not interested in benchmarking that. It executes purely in Rust, without using the Python interface.
+
+It writes a manifest with 1M native chunk refs, using zstd compression level 3. The writes are done
+from 1M concurrent async tasks.
+
+It then executes 1M chunk ref reads (note that only the refs are read; the chunks themselves were never written).
+Reads are executed from 4 threads, each running 250k concurrent async tasks.
+
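+To make the read pattern concrete, here is a schematic Python illustration of the same concurrency shape. The real benchmark is the Rust example referenced above; `read_one_ref` is only a placeholder for a chunk-ref lookup:
+
+```python
+import asyncio
+import threading
+
+NUM_THREADS = 4
+TASKS_PER_THREAD = 250_000
+
+async def read_one_ref(i: int) -> None:
+    ...  # placeholder: fetch one chunk ref from the manifest
+
+async def read_many(n: int) -> None:
+    # n concurrent async tasks per thread
+    await asyncio.gather(*(read_one_ref(i) for i in range(n)))
+
+def worker() -> None:
+    asyncio.run(read_many(TASKS_PER_THREAD))
+
+threads = [threading.Thread(target=worker) for _ in range(NUM_THREADS)]
+for t in threads:
+    t.start()
+for t in threads:
+    t.join()
+```
+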
+Notice:
+
+* We compare on the local file system on purpose, so that network times don't affect the results.
+* We are comparing pulling refs only, not chunks, which is a worst case. In the real
+ world, read operations are dominated by the time taken to fetch the chunks.
+* The evaluation was done in an early state of the code, where many parts were unsafe,
+ but we have verified there are no huge differences.
+
+### Results for writes
+
+```sh
+nix run nixpkgs#hyperfine -- \
+ --prepare 'rm -rf /tmp/test-perf' \
+ --warmup 1 \
+ 'cargo run --release --example multithreaded_get_chunk_refs -- --write /tmp/test-perf'
+```
+
+#### Flatbuffers
+
+Compressed manifest size: 27_527_680 bytes
+
+```
+Time (mean ± σ): 5.698 s ± 0.163 s [User: 4.764 s, System: 0.910 s]
+Range (min … max): 5.562 s … 6.103 s 10 runs
+```
+
+#### Capnproto
+
+Compressed manifest size: 26_630_927 bytes
+
+```
+Time (mean ± σ): 6.276 s ± 0.163 s [User: 5.225 s, System: 1.017 s]
+Range (min … max): 6.126 s … 6.630 s 10 runs
+```
+
+#### Msgpack
+
+Compressed manifest size: 22_250_152 bytes
+
+```
+Time (mean ± σ): 6.224 s ± 0.155 s [User: 5.488 s, System: 0.712 s]
+Range (min … max): 6.033 s … 6.532 s 10 runs
+```
+
+### Results for reads
+
+```sh
+nix run nixpkgs#hyperfine -- \
+ --warmup 1 \
+ 'cargo run --release --example multithreaded_get_chunk_refs -- --read /tmp/test-perf'
+```
+
+#### Flatbuffers
+
+```
+Time (mean ± σ): 3.676 s ± 0.257 s [User: 7.385 s, System: 1.819 s]
+Range (min … max): 3.171 s … 4.038 s 10 runs
+```
+
+#### Capnproto
+
+```
+Time (mean ± σ): 5.254 s ± 0.234 s [User: 11.370 s, System: 1.962 s]
+Range (min … max): 4.992 s … 5.799 s 10 runs
+```
+
+#### Msgpack
+
+```
+Time (mean ± σ): 3.310 s ± 0.606 s [User: 5.975 s, System: 1.762 s]
+Range (min … max): 2.392 s … 4.102 s 10 runs
+```
+
+## Conclusions
+
+* The compressed manifest is about 25% larger with flatbuffers than with msgpack.
+* Flatbuffers is slightly faster for commits.
+* Flatbuffers is slightly slower for reads.
+* Timing differences are not significant for real-world scenarios, where performance
+is dominated by the time taken to download or upload chunks.
+* Manifest fetch time differences could be somewhat significant for workloads where
+latency to first byte is important. This is not the use case Icechunk optimizes for.
+
+## Decision
+
+We are going to use flatbuffers for our metadata on-disk format.
diff --git a/docs/docs/assets/storage/tigris-region-set.png b/docs/docs/assets/storage/tigris-region-set.png
new file mode 100644
index 00000000..72420c01
Binary files /dev/null and b/docs/docs/assets/storage/tigris-region-set.png differ
diff --git a/docs/docs/contributing.md b/docs/docs/contributing.md
index 8559a2fd..69608ed4 100644
--- a/docs/docs/contributing.md
+++ b/docs/docs/contributing.md
@@ -16,6 +16,11 @@ Icechunk is an open source (Apache 2.0) project and welcomes contributions in th
## Development
### Python Development Workflow
+The Python code is developed in the `icechunk-python` subdirectory. To make changes, first enter that directory:
+
+```bash
+cd icechunk-python
+```
Create / activate a virtual environment:
@@ -43,6 +48,9 @@ Build the project in dev mode:
```bash
maturin develop
+
+# or with the optional dependencies
+maturin develop --extras=test,benchmark
```
or build the project in editable mode:
diff --git a/docs/docs/icechunk-python/cheatsheets/git-users.md b/docs/docs/icechunk-python/cheatsheets/git-users.md
index 9af96a48..52008f45 100644
--- a/docs/docs/icechunk-python/cheatsheets/git-users.md
+++ b/docs/docs/icechunk-python/cheatsheets/git-users.md
@@ -54,6 +54,8 @@ We can either check out a branch for [read-only access](../reference/#icechunk.R
```python
# check out a branch for read-only access
session = repo.readonly_session(branch="my-new-branch")
+# readonly_session interprets its first positional argument as a branch name
+session = repo.readonly_session("my-new-branch")
# check out a branch for read-write access
session = repo.writable_session("my-new-branch")
```
@@ -79,7 +81,7 @@ At this point, the tip of the branch is now the snapshot `198273178639187` and a
In Icechunk, you can view the history of a branch by using the [`repo.ancestry()`](../reference/#icechunk.Repository.ancestry) command, similar to the `git log` command.
```python
-repo.ancestry(branch="my-new-branch")
+[ancestor for ancestor in repo.ancestry(branch="my-new-branch")]
#[Snapshot(id='198273178639187', ...), ...]
```
@@ -154,7 +156,7 @@ We can also view the history of a tag by using the [`repo.ancestry()`](../refere
repo.ancestry(tag="my-new-tag")
```
-This will return a list of snapshots that are ancestors of the tag. Similar to branches we can lookup the snapshot that a tag is based on by using the [`repo.lookup_tag()`](../reference/#icechunk.Repository.lookup_tag) command.
+This will return an iterator of snapshots that are ancestors of the tag. Similar to branches, we can look up the snapshot that a tag is based on by using the [`repo.lookup_tag()`](../reference/#icechunk.Repository.lookup_tag) command.
```python
repo.lookup_tag("my-new-tag")
diff --git a/docs/docs/icechunk-python/configuration.md b/docs/docs/icechunk-python/configuration.md
index b6404e4d..d037aa8f 100644
--- a/docs/docs/icechunk-python/configuration.md
+++ b/docs/docs/icechunk-python/configuration.md
@@ -1,85 +1,137 @@
# Configuration
-When creating and opening Icechunk repositories, there are a two different sets of configuration to be aware of:
+When creating and opening Icechunk repositories, there are many configuration options available to control the behavior of the repository and the storage backend. This page will guide you through the available options and how to use them.
-- [`Storage`](./reference.md#icechunk.Storage) - for configuring access to the object store or filesystem
-- [`RepositoryConfig`](./reference.md#icechunk.RepositoryConfig) - for configuring the behavior of the Icechunk Repository itself
+## [`RepositoryConfig`](./reference.md#icechunk.RepositoryConfig)
-## Storage
+The `RepositoryConfig` object is used to configure the repository. For convenience, this can be constructed using some sane defaults:
-Icechunk can be configured to work with both object storage and filesystem backends. The storage configuration defines the location of an Icechunk store, along with any options or information needed to access data from a given storage type.
+```python
+config = icechunk.RepositoryConfig.default()
+```
-### S3 Storage
+or it can be optionally loaded from an existing repository:
-When using Icechunk with s3 compatible storage systems, credentials must be provided to allow access to the data on the given endpoint. Icechunk allows for creating the storage config for s3 in three ways:
+```python
+config = icechunk.Repository.fetch_config(storage)
+```
-=== "From environment"
+It allows you to configure the following parameters:
- With this option, the credentials for connecting to S3 are detected automatically from your environment.
- This is usually the best choice if you are connecting from within an AWS environment (e.g. from EC2). [See the API](./reference.md#icechunk.s3_storage)
+### [`inline_chunk_threshold_bytes`](./reference.md#icechunk.RepositoryConfig.inline_chunk_threshold_bytes)
- ```python
- icechunk.s3_storage(
- bucket="icechunk-test",
- prefix="quickstart-demo-1",
- from_env=True
- )
- ```
+The threshold for when to inline a chunk into a manifest instead of storing it as a separate object in the storage backend.
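+
+For example, a minimal sketch (512 bytes is an arbitrary illustrative value, and assumes the attribute is settable on the config object):
+
+```python
+# chunks smaller than this many bytes are stored inline in the manifest
+config.inline_chunk_threshold_bytes = 512
+```
+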
-=== "Provide credentials"
+### [`get_partial_values_concurrency`](./reference.md#icechunk.RepositoryConfig.get_partial_values_concurrency)
- With this option, you provide your credentials and other details explicitly. [See the API](./reference.md#icechunk.s3_storage)
+The number of concurrent requests to make when getting partial values from storage.
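+
+For example (a sketch with an arbitrary value):
+
+```python
+config.get_partial_values_concurrency = 10
+```
+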
- ```python
- icechunk.s3_storage(
- bucket="icechunk-test",
- prefix="quickstart-demo-1",
- region='us-east-1',
- access_key_id='my-access-key',
- secret_access_key='my-secret-key',
- # session token is optional
- session_token='my-token',
- endpoint_url=None, # if using a custom endpoint
- allow_http=False, # allow http connections (default is False)
- )
- ```
+### [`compression`](./reference.md#icechunk.RepositoryConfig.compression)
-=== "Anonymous"
+Icechunk uses Zstd compression to compress its metadata files. [`CompressionConfig`](./reference.md#icechunk.CompressionConfig) allows you to configure the [compression level](./reference.md#icechunk.CompressionConfig.level) and [algorithm](./reference.md#icechunk.CompressionConfig.algorithm). Currently, the only algorithm available is [`Zstd`](https://facebook.github.io/zstd/).
- With this option, you connect to S3 anonymously (without credentials).
- This is suitable for public data. [See the API](./reference.md#icechunk.StorageConfig.s3_anonymous)
+```python
+config.compression = icechunk.CompressionConfig(
+ level=3,
+ algorithm=icechunk.CompressionAlgorithm.Zstd,
+)
+```
- ```python
- icechunk.s3_storage(
- bucket="icechunk-test",
- prefix="quickstart-demo-1",
- region='us-east-1,
- anonymous=True,
- )
- ```
+### [`caching`](./reference.md#icechunk.RepositoryConfig.caching)
-### Filesystem Storage
+Icechunk caches metadata files to speed up common operations. [`CachingConfig`](./reference.md#icechunk.CachingConfig) allows you to configure the caching behavior for the repository.
-Icechunk can also be used on a [local filesystem](./reference.md#icechunk.local_filesystem_storage) by providing a path to the location of the store
+```python
+config.caching = icechunk.CachingConfig(
+ num_snapshot_nodes=100,
+ num_chunk_refs=100,
+ num_transaction_changes=100,
+ num_bytes_attributes=1e4,
+ num_bytes_chunks=1e6,
+)
+```
-=== "Local filesystem"
+### [`storage`](./reference.md#icechunk.RepositoryConfig.storage)
- ```python
- icechunk.local_filesystem_storage("/path/to/my/dataset")
- ```
+This configures how Icechunk loads data from the storage backend. [`StorageSettings`](./reference.md#icechunk.StorageSettings) allows you to configure the storage settings. Currently, the only setting available is concurrency, configured with [`StorageConcurrencySettings`](./reference.md#icechunk.StorageConcurrencySettings).
+
+```python
+config.storage = icechunk.StorageSettings(
+ concurrency=icechunk.StorageConcurrencySettings(
+ max_concurrent_requests_for_object=10,
+ ideal_concurrent_request_size=1e6,
+ ),
+)
+```
-## Repository Config
+### [`virtual_chunk_containers`](./reference.md#icechunk.RepositoryConfig.virtual_chunk_containers)
-Separate from the storage config, the Repository can also be configured with options which control its runtime behavior.
+Icechunk allows repos to contain [virtual chunks](./virtual.md). To allow for referencing these virtual chunks, you can configure the `virtual_chunk_containers` parameter to specify the storage locations and configurations for any virtual chunks. Each virtual chunk container is specified by a [`VirtualChunkContainer`](./reference.md#icechunk.VirtualChunkContainer) object which contains a name, a URL prefix, and a storage configuration. When a container is added to the settings, any virtual chunks with a URL that starts with the configured prefix will use the storage configuration of the matching container.
!!! note
- This section is under construction and coming soon.
-## Creating and Opening Repos
+ Currently only `s3` compatible storage and `local_filesystem` storage are supported for virtual chunk containers. Other storage backends such as `gcs`, `azure`, and `https` are on the roadmap.
+
+#### Example
+
+For example, if we wanted to configure an icechunk repo to be able to contain virtual chunks from an `s3` bucket called `my-s3-bucket` in `us-east-1`, we would do the following:
+
+```python
+config.virtual_chunk_containers = [
+ icechunk.VirtualChunkContainer(
+ name="my-s3-bucket",
+ url_prefix="s3://my-s3-bucket/",
+ storage=icechunk.StorageSettings(
+ storage=icechunk.s3_storage(bucket="my-s3-bucket", region="us-east-1"),
+ ),
+ ),
+]
+```
+
+If we also wanted to configure the repo to be able to contain virtual chunks from another `s3` bucket called `my-other-s3-bucket` in `us-west-2`, we would do the following:
+
+```python
+config.set_virtual_chunk_container(
+ icechunk.VirtualChunkContainer(
+ name="my-other-s3-bucket",
+ url_prefix="s3://my-other-s3-bucket/",
+ storage=icechunk.StorageSettings(
+ storage=icechunk.s3_storage(bucket="my-other-s3-bucket", region="us-west-2"),
+ ),
+ ),
+)
+```
+
+Now at read time, if Icechunk encounters a virtual chunk URL that starts with `s3://my-other-s3-bucket/`, it will use the storage configuration for the `my-other-s3-bucket` container.
+
+!!! note
+
+ While virtual chunk containers specify the storage configuration for any virtual chunks, they do not contain any authentication information. The credentials must also be specified when opening the repository using the [`virtual_chunk_credentials`](./reference.md#icechunk.Repository.open) parameter. See the [Virtual Chunk Credentials](#virtual-chunk-credentials) section for more information.
+
+### [`manifest`](./reference.md#icechunk.RepositoryConfig.manifest)
+
+The manifest configuration for the repository. [`ManifestConfig`](./reference.md#icechunk.ManifestConfig) allows you to configure behavior for how manifests are loaded. In particular, the `preload` parameter allows you to configure the preload behavior of the manifest using a [`ManifestPreloadConfig`](./reference.md#icechunk.ManifestPreloadConfig). This allows you to control the number of references that are loaded into memory when a session is created, along with which manifests are available to be preloaded.
+
+#### Example
+
+For example, if we have a repo which contains data that we plan to open as an [`Xarray`](./xarray.md) dataset, we may want to configure the manifest preload to only preload manifests that contain arrays that are coordinates, in our case `time`, `latitude`, and `longitude`.
+
+```python
+config.manifest = icechunk.ManifestConfig(
+ preload=icechunk.ManifestPreloadConfig(
+ max_total_refs=1e8,
+ preload_if=icechunk.ManifestPreloadCondition.name_matches(".*time|.*latitude|.*longitude"),
+ ),
+)
+```
+
+### Applying Configuration
Now we can create or open an Icechunk repo using our config.
-### Creating a new repo
+#### Creating a new repo
+
+If no config is provided, the repo will be created with the [default configuration](./reference.md#icechunk.RepositoryConfig.default).
!!! note
@@ -97,6 +149,7 @@ Now we can now create or open an Icechunk repo using our config.
repo = icechunk.Repository.create(
storage=storage,
+ config=config,
)
```
@@ -111,6 +164,7 @@ Now we can now create or open an Icechunk repo using our config.
repo = icechunk.Repository.create(
storage=storage,
+ config=config,
)
```
@@ -125,6 +179,7 @@ Now we can now create or open an Icechunk repo using our config.
repo = icechunk.Repository.create(
storage=storage,
+ config=config,
)
```
@@ -133,63 +188,15 @@ Now we can now create or open an Icechunk repo using our config.
```python
repo = icechunk.Repository.create(
storage=icechunk.local_filesystem_storage("/path/to/my/dataset"),
+ config=config
)
```
-If you are not sure if the repo exists yet, an `icechunk Repository` can created or opened if it already exists:
-
-=== "Open or creating with S3 storage"
-
- ```python
- storage = icechunk.s3_storage(
- bucket='earthmover-sample-data',
- prefix='icechunk/oisst.2020-2024/',
- region='us-east-1',
- from_env=True,
- )
+#### Opening an existing repo
- repo = icechunk.Repository.open_or_create(
- storage=storage,
- )
- ```
+When opening an existing repo, the config will be loaded from the repo if it exists. If no config exists and no config was specified, the repo will be opened with the [default configuration](./reference.md#icechunk.RepositoryConfig.default).
-=== "Open or creating with Google Cloud Storage"
-
- ```python
- storage = icechunk.gcs_storage(
- bucket='earthmover-sample-data',
- prefix='icechunk/oisst.2020-2024/',
- from_env=True,
- )
-
- repo = icechunk.Repository.open_or_create(
- storage=storage,
- )
- ```
-
-=== "Open or creating with Azure Blob Storage"
-
- ```python
- storage = icechunk.azure_storage(
- container='earthmover-sample-data',
- prefix='icechunk/oisst.2020-2024/',
- from_env=True,
- )
-
- repo = icechunk.Repository.open_or_create(
- storage=storage,
- )
- ```
-
-=== "Open or creating with local filesystem"
-
- ```python
- repo = icechunk.Repository.open_or_create(
- storage=icechunk.local_filesystem_storage("/path/to/my/dataset"),
- )
- ```
-
-### Opening an existing repo
+However, if a config was specified when opening the repo AND a config was previously persisted in the repo, the two configurations will be merged. The config specified when opening the repo will take precedence over the persisted config.
=== "Opening from S3 Storage"
@@ -203,6 +210,7 @@ If you are not sure if the repo exists yet, an `icechunk Repository` can created
repo = icechunk.Repository.open(
storage=storage,
+ config=config,
)
```
@@ -217,6 +225,7 @@ If you are not sure if the repo exists yet, an `icechunk Repository` can created
repo = icechunk.Repository.open(
storage=storage,
+ config=config,
)
```
@@ -231,6 +240,7 @@ If you are not sure if the repo exists yet, an `icechunk Repository` can created
repo = icechunk.Repository.open(
storage=storage,
+ config=config,
)
```
@@ -240,5 +250,37 @@ If you are not sure if the repo exists yet, an `icechunk Repository` can created
storage = icechunk.local_filesystem_storage("/path/to/my/dataset")
store = icechunk.IcechunkStore.open(
storage=storage,
+ config=config,
)
```
+
+### Persisting Configuration
+
+Once the repo is opened, the current config can be persisted to the repo by calling [`save_config`](./reference.md#icechunk.Repository.save_config).
+
+```python
+repo.save_config()
+```
+
+The next time this repo is opened, the persisted config will be loaded by default.
+
+## Virtual Chunk Credentials
+
+When using virtual chunk containers, the credentials for the storage backend must also be specified. This is done using the [`virtual_chunk_credentials`](./reference.md#icechunk.Repository.open) parameter when creating or opening the repo. Credentials are specified as a dictionary of container names mapping to credential objects. A helper function, [`containers_credentials`](./reference.md#icechunk.containers_credentials), is provided to make it easier to specify credentials for multiple containers.
+
+### Example
+
+Expanding on the example from the [Virtual Chunk Containers](#virtual-chunk-containers) section, we can configure the repo to use the credentials for the `my-s3-bucket` and `my-other-s3-bucket` containers.
+
+```python
+credentials = icechunk.containers_credentials(
+ my_s3_bucket=icechunk.s3_credentials(bucket="my-s3-bucket", region="us-east-1"),
+ my_other_s3_bucket=icechunk.s3_credentials(bucket="my-other-s3-bucket", region="us-west-2"),
+)
+
+repo = icechunk.Repository.open(
+ storage=storage,
+ config=config,
+ virtual_chunk_credentials=credentials,
+)
+```
diff --git a/docs/docs/icechunk-python/dask.md b/docs/docs/icechunk-python/dask.md
index 7c4dd67d..fe3f52f1 100644
--- a/docs/docs/icechunk-python/dask.md
+++ b/docs/docs/icechunk-python/dask.md
@@ -21,8 +21,8 @@ client = Client()
# initialize the icechunk store
import icechunk
-storage = icechunk.local_filesystem_storage("./icechunk-xarray")
-icechunk_repo = icechunk.Repository.create(storage_config)
+storage = icechunk.local_filesystem_storage("./icechunk-dask")
+icechunk_repo = icechunk.Repository.create(storage)
icechunk_session = icechunk_repo.writable_session("main")
```
@@ -34,6 +34,7 @@ support for the `compute` kwarg.
First create a dask array to write:
```python
+import dask.array as da
shape = (100, 100)
dask_chunks = (20, 20)
dask_array = dask.array.random.random(shape, chunks=dask_chunks)
@@ -41,8 +42,10 @@ dask_array = dask.array.random.random(shape, chunks=dask_chunks)
Now create the Zarr array you will write to.
```python
+import zarr
+
zarr_chunks = (10, 10)
-group = zarr.group(store=icechunk_sesion.store, overwrite=True)
+group = zarr.group(store=icechunk_session.store, overwrite=True)
zarray = group.create_array(
"array",
diff --git a/docs/docs/icechunk-python/faq.md b/docs/docs/icechunk-python/faq.md
index 41c0f56e..c5852e67 100644
--- a/docs/docs/icechunk-python/faq.md
+++ b/docs/docs/icechunk-python/faq.md
@@ -3,3 +3,7 @@
**Why do I have to opt-in to pickling an IcechunkStore or a Session?**
Icechunk is different from normal Zarr stores because it is stateful. In a distributed setting, you have to be careful to communicate back the Session objects from remote write tasks, merge them and commit them. The opt-in to pickle is a way for us to hint to the user that they need to be sure about what they are doing. We use pickling because these operations are only tricky once you cross a process boundary. More pragmatically, to_zarr(session.store) fails spectacularly in distributed contexts (e.g. [this issue](https://github.com/earth-mover/icechunk/issues/383)), and we do not want the user to be surprised.
+
+**Does `icechunk-python` include logging?**
+
+Yes! Set the environment variable `ICECHUNK_LOG=icechunk=debug` to print debug logs to stdout. Available "levels" in order of increasing verbosity are `error`, `warn`, `info`, `debug`, `trace`. The default level is `error`. The Rust library uses the `tracing-subscriber` crate. The `ICECHUNK_LOG` variable can be used to filter logging following that crate's [documentation](https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html#directives). For example, `ICECHUNK_LOG=trace` will set both icechunk and its dependencies' log levels to `trace`, while `ICECHUNK_LOG=icechunk=trace` will enable the `trace` level for icechunk only.
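+
+A minimal sketch, assuming `ICECHUNK_LOG` is read when Icechunk first initializes logging, so it must be set before Icechunk is used (exporting the variable in the shell before launching Python behaves the same way):
+
+```python
+import os
+
+# assumption: set before icechunk is imported/used so the filter is picked up
+os.environ["ICECHUNK_LOG"] = "icechunk=debug"
+
+import icechunk
+
+icechunk.print_debug_info()  # icechunk operations now log at debug level to stdout
+```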
diff --git a/docs/docs/icechunk-python/index.md b/docs/docs/icechunk-python/index.md
index bfc62e07..b4fd0e99 100644
--- a/docs/docs/icechunk-python/index.md
+++ b/docs/docs/icechunk-python/index.md
@@ -2,6 +2,7 @@
- [quickstart](/icechunk-python/quickstart/)
- [configuration](/icechunk-python/configuration/)
+- [storage](/icechunk-python/storage/)
- [version control](/icechunk-python/version-control/)
- [xarray](/icechunk-python/xarray/)
- [concurrency](/icechunk-python/concurrency/)
diff --git a/docs/docs/icechunk-python/parallel.md b/docs/docs/icechunk-python/parallel.md
index 26ae6370..8a23a713 100644
--- a/docs/docs/icechunk-python/parallel.md
+++ b/docs/docs/icechunk-python/parallel.md
@@ -5,7 +5,6 @@ with all appropriate metadata, and any coordinate variables. Following this a la
is kicked off in a distributed setting, where each worker is responsible for an independent
"region" of the output.
-
## Why is Icechunk different from any other Zarr store?
The reason is that unlike Zarr, Icechunk is a "stateful" store. The Session object keeps a record of all writes, that is then
@@ -13,8 +12,10 @@ bundled together in a commit. Thus `Session.commit` must be executed on a Sessio
including those executed remotely in a multi-processing or any other remote execution context.
## Example
+
Here is how you can execute such writes with Icechunk, illustrate with a `ThreadPoolExecutor`.
First read some example data, and create an Icechunk Repository.
+
```python
import xarray as xr
import tempfile
@@ -24,8 +25,10 @@ ds = xr.tutorial.open_dataset("rasm").isel(time=slice(24))
repo = Repository.create(local_filesystem_storage(tempfile.mkdtemp()))
session = repo.writable_session("main")
```
+
We will orchestrate so that each task writes one timestep.
This is an arbitrary choice but determines what we set for the Zarr chunk size.
+
```python
chunks = tuple(1 if dim == "time" else ds.sizes[dim] for dim in ds.Tair.dims)
```
@@ -33,6 +36,7 @@ chunks = {1 if dim == "time" else ds.sizes[dim] for dim in ds.Tair.dims}
Initialize the dataset using [`Dataset.to_zarr`](https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_zarr.html)
and `compute=False`, this will NOT write any chunked array data, but will write all array metadata, and any
in-memory arrays (only `time` in this case).
+
```python
ds.to_zarr(session.store, compute=False, encoding={"Tair": {"chunks": chunks}}, mode="w")
# this commit is optional, but may be useful in your workflow
@@ -42,6 +46,7 @@ session.commit("initialize store")
## Multi-threading
First define a function that constitutes one "write task".
+
```python
from icechunk import Session
@@ -53,6 +58,7 @@ def write_timestamp(*, itime: int, session: Session) -> None:
```
Now execute the writes.
+
```python
from concurrent.futures import ThreadPoolExecutor, wait
from icechunk.distributed import merge_sessions
@@ -67,18 +73,26 @@ session.commit("finished writes")
```
Verify that the writes worked as expected:
+
```python
-ondisk = xr.open_zarr(repo.readonly_session(branch="main").store, consolidated=False)
+ondisk = xr.open_zarr(repo.readonly_session("main").store, consolidated=False)
xr.testing.assert_identical(ds, ondisk)
```
## Distributed writes
+!!! info
+
+ This code will not execute with a `ProcessPoolExecutor` without [some changes](https://docs.python.org/3/library/multiprocessing.html#programming-guidelines).
+ Specifically, it requires wrapping the code in an `if __name__ == "__main__":` block.
+ See a full executable example [here](https://github.com/earth-mover/icechunk/blob/main/icechunk-python/examples/mpwrite.py).
+
Any task execution framework (e.g. `ProcessPoolExecutor`, Joblib, Lithops, Dask Distributed, Ray, etc.)
can be used instead of the `ThreadPoolExecutor`. However, such workloads should account for
Icechunk being a "stateful" store that records changes executed in a write session.
There are three key points to keep in mind:
+
1. The `write_task` function *must* return the `Session`. It contains a record of the changes executed by this task.
These changes *must* be manually communicated back to the coordinating process, since each of the distributed processes
is working with its own independent `Session` instance.
@@ -87,6 +101,7 @@ There are three key points to keep in mind:
3. The user *must* manually merge the Session objects to create a meaningful commit.
First we modify `write_task` to return the `Session`:
+
```python
from icechunk import Session
@@ -114,8 +129,8 @@ with ProcessPoolExecutor() as executor:
executor.submit(write_timestamp, itime=i, session=session)
for i in range(ds.sizes["time"])
]
- # grab the Session objects from each individual write task
- sessions = [f.result() for f in futures]
+ # grab the Session objects from each individual write task
+ sessions = [f.result() for f in futures]
# manually merge the remote sessions in to the local session
session = merge_sessions(session, *sessions)
@@ -123,7 +138,8 @@ session.commit("finished writes")
```
Verify that the writes worked as expected:
+
```python
-ondisk = xr.open_zarr(repo.readonly_session(branch="main").store, consolidated=False)
+ondisk = xr.open_zarr(repo.readonly_session("main").store, consolidated=False)
xr.testing.assert_identical(ds, ondisk)
```
diff --git a/docs/docs/icechunk-python/quickstart.md b/docs/docs/icechunk-python/quickstart.md
index 04752baa..3bc1303a 100644
--- a/docs/docs/icechunk-python/quickstart.md
+++ b/docs/docs/icechunk-python/quickstart.md
@@ -6,18 +6,25 @@ If you're not familiar with Zarr, you may want to start with the [Zarr Tutorial]
## Installation
-Install Icechunk with pip
+Icechunk can be installed using pip or conda:
-```python
-pip install icechunk
-```
+=== "pip"
+
+ ```bash
+ python -m pip install icechunk
+ ```
+
+=== "conda"
+
+ ```bash
+ conda install -c conda-forge icechunk
+ ```
!!! note
Icechunk is currently designed to support the [Zarr V3 Specification](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html).
Using it today requires installing Zarr Python 3.
-
## Create a new Icechunk repository
To get started, let's create a new Icechunk repository.
@@ -27,6 +34,7 @@ However, you can also create a repo on your local filesystem.
=== "S3 Storage"
```python
+ import icechunk
storage = icechunk.s3_storage(bucket="my-bucket", prefix="my-prefix", from_env=True)
repo = icechunk.Repository.create(storage)
```
@@ -34,6 +42,7 @@ However, you can also create a repo on your local filesystem.
=== "Google Cloud Storage"
```python
+ import icechunk
storage = icechunk.gcs_storage(bucket="my-bucket", prefix="my-prefix", from_env=True)
repo = icechunk.Repository.create(storage)
```
@@ -41,6 +50,7 @@ However, you can also create a repo on your local filesystem.
=== "Azure Blob Storage"
```python
+ import icechunk
storage = icechunk.azure_storage(container="my-container", prefix="my-prefix", from_env=True)
repo = icechunk.Repository.create(storage)
```
@@ -48,6 +58,7 @@ However, you can also create a repo on your local filesystem.
=== "Local Storage"
```python
+ import icechunk
storage = icechunk.local_filesystem_storage("./icechunk-local")
repo = icechunk.Repository.create(storage)
```
@@ -73,6 +84,7 @@ We can now use our Icechunk `store` with Zarr.
Let's first create a group and an array within it.
```python
+import zarr
group = zarr.group(store)
array = group.create("my_array", shape=10, dtype='int32', chunks=(5,))
```
@@ -95,7 +107,6 @@ session.commit("first commit")
Once a writable `Session` has been successfully committed to, it becomes read only to ensure that all writing is done explicitly.
-
## Make a second commit
At this point, we have already committed using our session, so we need to get a new session and store to make more changes.
@@ -124,7 +135,7 @@ snapshot_id_2 = session_2.commit("overwrite some values")
We can see the full version history of our repo:
```python
-hist = repo.ancestry(snapshot=snapshot_id_2)
+hist = list(repo.ancestry(snapshot_id=snapshot_id_2))
for ancestor in hist:
print(ancestor.id, ancestor.message, ancestor.written_at)
@@ -140,12 +151,12 @@ for ancestor in hist:
# latest version
assert array[0] == 2
# check out earlier snapshot
-earlier_session = repo.readonly_session(snapshot=snapshot_id=hist[1].id)
+earlier_session = repo.readonly_session(snapshot_id=hist[1].id)
store = earlier_session.store
# get the array
group = zarr.open_group(store, mode="r")
-array = group["my_array]
+array = group["my_array"]
# verify data matches first version
assert array[0] == 1
diff --git a/docs/docs/icechunk-python/storage.md b/docs/docs/icechunk-python/storage.md
new file mode 100644
index 00000000..9f73d0b4
--- /dev/null
+++ b/docs/docs/icechunk-python/storage.md
@@ -0,0 +1,262 @@
+# Storage
+
+Icechunk can be configured to work with both object storage and filesystem backends. The storage configuration defines the location of an Icechunk store, along with any options or information needed to access data from a given storage type.
+
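+Whichever backend you use, the resulting storage object is passed to `Repository.create` or `Repository.open`. As a minimal sketch (the local path below is hypothetical):
+
+```python
+import icechunk
+
+# hypothetical path, used only for illustration
+storage = icechunk.local_filesystem_storage("/tmp/icechunk-storage-demo")
+repo = icechunk.Repository.create(storage)
+```
+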
+### S3 Storage
+
+When using Icechunk with S3-compatible storage systems, credentials must be provided to allow access to the data on the given endpoint. Icechunk allows for creating the storage config for S3 in four ways:
+
+=== "From environment"
+
+ With this option, the credentials for connecting to S3 are detected automatically from your environment.
+ This is usually the best choice if you are connecting from within an AWS environment (e.g. from EC2). [See the API](./reference.md#icechunk.s3_storage)
+
+ ```python
+ icechunk.s3_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ from_env=True
+ )
+ ```
+
+=== "Provide credentials"
+
+ With this option, you provide your credentials and other details explicitly. [See the API](./reference.md#icechunk.s3_storage)
+
+ ```python
+ icechunk.s3_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ region='us-east-1',
+ access_key_id='my-access-key',
+ secret_access_key='my-secret-key',
+ # session token is optional
+ session_token='my-token',
+ endpoint_url=None, # if using a custom endpoint
+ allow_http=False, # allow http connections (default is False)
+ )
+ ```
+
+=== "Anonymous"
+
+ With this option, you connect to S3 anonymously (without credentials).
+ This is suitable for public data. [See the API](./reference.md#icechunk.s3_storage)
+
+ ```python
+ icechunk.s3_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ region='us-east-1',
+ anonymous=True,
+ )
+ ```
+
+=== "Refreshable Credentials"
+
+ With this option, you provide a callback function that will be called to obtain S3 credentials when needed. This is useful for workloads that depend on retrieving short-lived credentials from AWS or similar authority, allowing for credentials to be refreshed as needed without interrupting any workflows. [See the API](./reference.md#icechunk.s3_storage)
+
+ ```python
+ def get_credentials() -> S3StaticCredentials:
+ # In practice, you would use a function that actually fetches the credentials and returns them
+ # along with an optional expiration time which will trigger this callback to run again
+ return icechunk.S3StaticCredentials(
+ access_key_id="xyz",
+ secret_access_key="abc",
+ expires_after=datetime.now(UTC) + timedelta(days=1)
+ )
+
+ icechunk.s3_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ region='us-east-1',
+ get_credentials=get_credentials,
+ )
+ ```
+
+#### Tigris
+
+[Tigris](https://www.tigrisdata.com/) is available as a storage backend for Icechunk. Functionally this storage backend is the same as S3 storage, but with a different endpoint. Icechunk provides a helper function specifically for [creating Tigris storage configurations](./reference.md#icechunk.tigris_storage).
+```python
+icechunk.tigris_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ access_key_id='my-access-key',
+ secret_access_key='my-secret-key',
+)
+```
+
+There are a few things to be aware of when using Tigris:
+- Tigris is a globally distributed object store by default. The caveat is that Tigris does not currently support the full consistency guarantees when the store is distributed across multiple regions. For now, to get all the consistency guarantees Icechunk offers, you will need to set up your Tigris bucket as restricted to a single region. This can be done by setting the region in the Tigris bucket settings:
+
+
+#### Minio
+
+[Minio](https://min.io/) is available as a storage backend for Icechunk. Functionally this storage backend is the same as S3 storage, but with a different endpoint.
+
+For example, if we have a Minio server running at `http://localhost:9000` with access key `minio` and
+secret key `minio123` we can create a storage configuration as follows:
+
+```python
+icechunk.s3_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ region='us-east-1',
+ access_key_id='minio',
+ secret_access_key='minio123',
+ endpoint_url='http://localhost:9000',
+ allow_http=True,
+)
+```
+
+A few things to note:
+
+1. The `endpoint_url` parameter is set to the URL of the Minio server.
+2. If the Minio server is running over HTTP and not HTTPS, the `allow_http` parameter must be set to `True`.
+3. Even though this is running on a local server, the `region` parameter must still be set to a valid region. [By default use `us-east-1`](https://github.com/minio/minio/discussions/15063).
+
+### Google Cloud Storage
+
+Icechunk can be used with [Google Cloud Storage](https://cloud.google.com/storage?hl=en).
+
+=== "From environment"
+
+ With this option, the credentials for connecting to GCS are detected automatically from your environment. [See the API](./reference.md#icechunk.gcs_storage)
+
+ ```python
+ icechunk.gcs_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ from_env=True
+ )
+ ```
+
+=== "Service Account File"
+
+ With this option, you provide the path to a [service account file](https://cloud.google.com/iam/docs/service-account-creds#key-types). [See the API](./reference.md#icechunk.gcs_storage)
+
+ ```python
+ icechunk.gcs_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ service_account_file="/path/to/service-account.json"
+ )
+ ```
+
+=== "Service Account Key"
+
+ With this option, you provide the service account key as a string. [See the API](./reference.md#icechunk.gcs_storage)
+
+ ```python
+ icechunk.gcs_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ service_account_key={
+ "type": "service_account",
+ "project_id": "my-project",
+ "private_key_id": "my-private-key-id",
+ "private_key": "-----BEGIN PRIVATE KEY-----\nmy-private-key\n-----END PRIVATE KEY-----\n",
+ "client_email": "my-client-email",
+ },
+ )
+ ```
+
+=== "Application Default Credentials"
+
+ With this option, you use the [application default credentials (ADC)](https://cloud.google.com/docs/authentication/provide-credentials-adc) to authenticate with GCS. Provide the path to the credentials. [See the API](./reference.md#icechunk.gcs_storage)
+
+ ```python
+ icechunk.gcs_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ application_credentials="/path/to/application-credentials.json"
+ )
+ ```
+
+=== "Bearer Token"
+
+ With this option, you provide a bearer token to use for the object store. This is useful for short-lived workflows where expiration is not relevant or when the bearer token will not expire. [See the API](./reference.md#icechunk.gcs_storage)
+
+ ```python
+ icechunk.gcs_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ bearer_token="my-bearer-token"
+ )
+ ```
+
+=== "Refreshable Credentials"
+
+ With this option, you provide a callback function that will be called to obtain GCS credentials when needed. This is useful for workloads that depend on retrieving short-lived credentials from GCS or similar authority, allowing for credentials to be refreshed as needed without interrupting any workflows. This works at a lower level than the other methods, and accepts a bearer token and expiration time. These are the same credentials that are created for you when specifying the service account file, key, or ADC. [See the API](./reference.md#icechunk.gcs_storage)
+
+ ```python
+ def get_credentials() -> GcsBearerCredential:
+ # In practice, you would use a function that actually fetches the credentials and returns them
+ # along with an optional expiration time which will trigger this callback to run again
+ return icechunk.GcsBearerCredential(bearer="my-bearer-token", expires_after=datetime.now(UTC) + timedelta(days=1))
+
+ icechunk.gcs_storage(
+ bucket="icechunk-test",
+ prefix="quickstart-demo-1",
+ get_credentials=get_credentials,
+ )
+ ```
+
+#### Limitations
+
+- The consistency guarantees for GCS function differently than S3. Specifically, GCS uses the [generation](https://cloud.google.com/storage/docs/request-preconditions#compose-preconditions) instead of etag for `if-match` `put` requests. Icechunk has not wired this through yet and thus [configuration updating](https://github.com/earth-mover/icechunk/issues/533) is potentially unsafe. This is not a problem for most use cases that are not frequently updating the configuration.
+- GCS does not yet support [`bearer` tokens and auth refreshing](https://github.com/earth-mover/icechunk/issues/637). This means currently auth is limited to service account files.
+- The GCS storage config does not yet support anonymous access.
+
+### Azure Blob Storage
+
+Icechunk can be used with [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/).
+
+=== "From environment"
+
+ With this option, the credentials for connecting to Azure Blob Storage are detected automatically from your environment. [See the API](./reference.md#icechunk.azure_storage)
+
+ ```python
+ icechunk.azure_storage(
+ account="my-account-name",
+ container="icechunk-test",
+ prefix="quickstart-demo-1",
+ from_env=True
+ )
+ ```
+
+=== "Provide credentials"
+
+ With this option, you provide your credentials and other details explicitly. [See the API](./reference.md#icechunk.azure_storage)
+
+ ```python
+ icechunk.azure_storage(
+ account_name='my-account-name',
+ container="icechunk-test",
+ prefix="quickstart-demo-1",
+ account_key='my-account-key',
+ access_token=None, # optional
+ sas_token=None, # optional
+ bearer_token=None, # optional
+ )
+ ```
+
+### Filesystem Storage
+
+Icechunk can also be used on a [local filesystem](./reference.md#icechunk.local_filesystem_storage) by providing a path to the location of the store.
+
+=== "Local filesystem"
+
+ ```python
+ icechunk.local_filesystem_storage("/path/to/my/dataset")
+ ```
+
+#### Limitations
+
+- Icechunk currently does not work with a local filesystem storage backend on Windows. See [this issue](https://github.com/earth-mover/icechunk/issues/665) for more discussion. To work around this, try using [WSL](https://learn.microsoft.com/en-us/windows/wsl/about) or a cloud storage backend.
+
+### In Memory Storage
+
+While it should never be used for production data, Icechunk can also be used with an in-memory storage backend. This is useful for testing and development purposes. This storage is volatile: when the Python process ends, all data is lost.
+
+```python
+icechunk.in_memory_storage()
+```
diff --git a/docs/docs/icechunk-python/version-control.md b/docs/docs/icechunk-python/version-control.md
index d61462cc..8acd4fa2 100644
--- a/docs/docs/icechunk-python/version-control.md
+++ b/docs/docs/icechunk-python/version-control.md
@@ -27,7 +27,7 @@ repo = icechunk.Repository.create(icechunk.in_memory_storage())
On creating a new [`Repository`](../reference/#icechunk.Repository), it will automatically create a `main` branch with an initial snapshot. We can take a look at the ancestry of the `main` branch to confirm this.
```python
-repo.ancestry(branch="main")
+[ancestor for ancestor in repo.ancestry(branch="main")]
# [SnapshotInfo(id="A840RMN5CF807CM66RY0", parent_id=None, written_at=datetime.datetime(2025,1,30,19,52,41,592998, tzinfo=datetime.timezone.utc), message="Repository...")]
```
@@ -36,8 +36,7 @@ repo.ancestry(branch="main")
The [`ancestry`](./reference/#icechunk.Repository.ancestry) method can be used to inspect the ancestry of any branch, snapshot, or tag.
-We get back a list of [`SnapshotInfo`](../reference/#icechunk.SnapshotInfo) objects, which contain information about the snapshot, including its ID, the ID of its parent snapshot, and the time it was written.
-
+We get back an iterator of [`SnapshotInfo`](../reference/#icechunk.SnapshotInfo) objects, which contain information about the snapshot, including its ID, the ID of its parent snapshot, and the time it was written.
## Creating a snapshot
@@ -48,7 +47,7 @@ Now that we have a `Repository` with a `main` branch, we can modify the data in
Writable `Session` objects are required to create new snapshots, and can only be created from the tip of a branch. Checking out tags or other snapshots is read-only.
```python
-session = repo.writable_session(branch="main")
+session = repo.writable_session("main")
```
We can now access the `zarr.Store` from the `Session` and create a new root group. Then we can modify the attributes of the root group and create a new snapshot.
@@ -68,7 +67,7 @@ Success! We've created a new snapshot with a new attribute on the root group.
Once we've committed the snapshot, the `Session` will become read-only, and we can no longer modify the data using our existing `Session`. If we want to modify the data again, we need to create a new writable `Session` from the branch. Notice that we don't have to refresh the `Repository` to get the updates from the `main` branch. Instead, the `Repository` will automatically fetch the latest snapshot from the branch when we create a new writable `Session` from it.
```python
-session = repo.writable_session(branch="main")
+session = repo.writable_session("main")
root = zarr.group(session.store)
root.attrs["foo"] = "baz"
session.commit(message="Update foo attribute on root group")
@@ -123,7 +122,7 @@ repo.create_branch("dev", snapshot_id=main_branch_snapshot_id)
We can now create a new writable `Session` from the `dev` branch and modify the data.
```python
-session = repo.writable_session(branch="dev")
+session = repo.writable_session("dev")
root = zarr.group(session.store)
root.attrs["foo"] = "balogna"
session.commit(message="Update foo attribute on root group")
@@ -137,7 +136,7 @@ We can also create a new branch from the tip of the `main` branch if we want to
main_branch_snapshot_id = repo.lookup_branch("main")
repo.create_branch("feature", snapshot_id=main_branch_snapshot_id)
-session = repo.writable_session(branch="feature")
+session = repo.writable_session("feature")
root = zarr.group(session.store)
root.attrs["foo"] = "cherry"
session.commit(message="Update foo attribute on root group")
@@ -254,7 +253,7 @@ import numpy as np
import zarr
repo = icechunk.Repository.create(icechunk.in_memory_storage())
-session = repo.writable_session(branch="main")
+session = repo.writable_session("main")
root = zarr.group(session.store)
root.attrs["foo"] = "bar"
root.create_dataset("data", shape=(10, 10), chunks=(1, 1), dtype=np.int32)
@@ -266,25 +265,21 @@ session.commit(message="Add foo attribute and data array")
Let's try to modify the `data` array in two different sessions, created from the `main` branch.
```python
-session1 = repo.writable_session(branch="main")
-session2 = repo.writable_session(branch="main")
+session1 = repo.writable_session("main")
+session2 = repo.writable_session("main")
root1 = zarr.group(session1.store)
root2 = zarr.group(session2.store)
-```
-First, we'll modify the attributes of the root group from both sessions.
-
-```python
-root1.attrs["foo"] = "bar"
-root2.attrs["foo"] = "baz"
+root1["data"][0,0] = 1
+root2["data"][0,:] = 2
```
and then try to commit the changes.
```python
-session1.commit(message="Update foo attribute on root group")
-session2.commit(message="Update foo attribute on root group")
+session1.commit(message="Update first element of data array")
+session2.commit(message="Update first row of data array")
# AE9XS2ZWXT861KD2JGHG
# ---------------------------------------------------------------------------
@@ -328,66 +323,7 @@ session2.rebase(icechunk.ConflictDetector())
# RebaseFailedError: Rebase failed on snapshot AE9XS2ZWXT861KD2JGHG: 1 conflicts found
```
-This however fails because both sessions modified the `foo` attribute on the root group. We can use the `ConflictError` to get more information about the conflict.
-
-```python
-try:
- session2.rebase(icechunk.ConflictDetector())
-except icechunk.RebaseFailedError as e:
- print(e.conflicts)
-
-# [Conflict(UserAttributesDoubleUpdate, path=/)]
-```
-
-This tells us that the conflict is caused by the two sessions modifying the user attributes of the root group (`/`). In this casewe have decided that second session set the `foo` attribute to the correct value, so we can now try to rebase by instructing the `rebase` method to use the second session's changes with the [`BasicConflictSolver`](../reference/#icechunk.BasicConflictSolver).
-
-```python
-session2.rebase(icechunk.BasicConflictSolver(on_user_attributes_conflict=icechunk.VersionSelection.UseOurs))
-```
-
-Success! We can now try and commit the changes again.
-
-```python
-session2.commit(message="Update foo attribute on root group")
-
-# 'SY4WRE8A9TVYMTJPEAHG'
-```
-
-This same process can be used to resolve conflicts with arrays. Let's try to modify the `data` array from both sessions.
-
-```python
-session1 = repo.writable_session(branch="main")
-session2 = repo.writable_session(branch="main")
-
-root1 = zarr.group(session1.store)
-root2 = zarr.group(session2.store)
-
-root1["data"][0,0] = 1
-root2["data"][0,:] = 2
-```
-
-We have now created a conflict, because the first session modified the first element of the `data` array, and the second session modified the first row of the `data` array. Let's commit the changes from the second session first, then see what conflicts are reported when we try to commit the changes from the first session.
-
-
-```python
-print(session2.commit(message="Update first row of data array"))
-print(session1.commit(message="Update first element of data array"))
-
-# ---------------------------------------------------------------------------
-# ConflictError Traceback (most recent call last)
-# Cell In[15], line 2
-# 1 print(session2.commit(message="Update first row of data array"))
-# ----> 2 print(session1.commit(message="Update first element of data array"))
-
-# File ~/Developer/icechunk/icechunk-python/python/icechunk/session.py:224, in Session.commit(self, message, metadata)
-# 222 return self._session.commit(message, metadata)
-# 223 except PyConflictError as e:
-# --> 224 raise ConflictError(e) from None
-
-# ConflictError: Failed to commit, expected parent: Some("SY4WRE8A9TVYMTJPEAHG"), actual parent: Some("5XRDGZPSG747AMMRTWT0")
-```
-
-Okay! We have a conflict. Lets see what conflicts are reported.
+This, however, fails because both sessions modified the same chunk of the `data` array. We can use the `RebaseFailedError` to get more information about the conflict.
```python
try:
@@ -413,18 +349,24 @@ Success! We have now resolved the conflict and committed the changes.
Let's look at the value of the `data` array to confirm that the conflict was resolved correctly.
```python
-session = repo.readonly_session(branch="main")
+session = repo.readonly_session("main")
root = zarr.open_group(session.store, mode="r")
root["data"][0,:]
# array([1, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int32)
```
+As you can see, `readonly_session` accepts a plain string as the branch name. Equivalently, you can be explicit and write:
+
+```python
+session = repo.readonly_session(branch="main")
+```
+
Lastly, if you make changes to non-conflicting chunks or attributes, you can rebase without having to resolve any conflicts.
```python
-session1 = repo.writable_session(branch="main")
-session2 = repo.writable_session(branch="main")
+session1 = repo.writable_session("main")
+session2 = repo.writable_session("main")
root1 = zarr.group(session1.store)
root2 = zarr.group(session2.store)
@@ -466,4 +408,4 @@ root["data"][:,:]
#### Limitations
-At the moment, the rebase functionality is limited to resolving conflicts with attributes on arrays and groups, and conflicts with chunks in arrays. Other types of conflicts are not able to be resolved by icechunk yet and must be resolved manually.
+At the moment, the rebase functionality is limited to resolving conflicts with chunks in arrays. Other types of conflicts are not able to be resolved by icechunk yet and must be resolved manually.
diff --git a/docs/docs/icechunk-python/virtual.md b/docs/docs/icechunk-python/virtual.md
index 1e859ea9..459e3962 100644
--- a/docs/docs/icechunk-python/virtual.md
+++ b/docs/docs/icechunk-python/virtual.md
@@ -2,29 +2,24 @@
While Icechunk works wonderfully with native chunks managed by Zarr, there is lots of archival data out there in other formats already. To interoperate with such data, Icechunk supports "Virtual" chunks, where any number of chunks in a given dataset may reference external data in existing archival formats, such as netCDF, HDF, GRIB, or TIFF. Virtual chunks are loaded directly from the original source without copying or modifying the original archival data files. This enables Icechunk to manage large datasets from existing data without needing that data to be in Zarr format already.
-!!! warning
+!!! note
- While virtual references are fully supported in Icechunk, creating virtual datasets currently relies on using experimental or pre-release versions of open source tools. For full instructions on how to install the required tools and their current statuses [see the tracking issue on Github](https://github.com/earth-mover/icechunk/issues/197).
- With time, these experimental features will make their way into the released packages.
+ The concept of a "virtual Zarr dataset" originates from the [Kerchunk](https://fsspec.github.io/kerchunk/) project, which preceded and inspired [VirtualiZarr](https://virtualizarr.readthedocs.io/en/latest/). Like `VirtualiZarr`, the `kerchunk` package provides functionality to scan metadata of existing data files and combine these references into larger virtual datasets, but unlike `VirtualiZarr` the `Kerchunk` package currently has no facility for writing to `Icechunk` stores. If you previously were interested in "Kerchunking" your data, you can now achieve a similar result by using `VirtualiZarr` to create virtual datasets and write them to `icechunk`.
-To create virtual Icechunk datasets with Python, the community utilizes the [kerchunk](https://fsspec.github.io/kerchunk/) and [VirtualiZarr](https://virtualizarr.readthedocs.io/en/latest/) packages.
+`VirtualiZarr` lets users ingest existing data files into virtual datasets using various different tools under the hood, including `kerchunk`, `xarray`, `zarr`, and now `icechunk`. It does so by creating virtual references to existing data that can be combined and manipulated to create larger virtual datasets using `xarray`. These datasets can then be exported to `kerchunk` reference format or to an `Icechunk` repository, without ever copying or moving the existing data files.
-`kerchunk` allows scanning the metadata of existing data files to extract virtual references. It also provides methods to combine these references into [larger virtual datasets](https://fsspec.github.io/kerchunk/tutorial.html#combine-multiple-kerchunked-datasets-into-a-single-logical-aggregate-dataset), which can be exported to it's [reference format](https://fsspec.github.io/kerchunk/spec.html).
+!!! note
-`VirtualiZarr` lets users ingest existing data files into virtual datasets using various different tools under the hood, including `kerchunk`, `xarray`, `zarr`, and now `icechunk`. It does so by creating virtual references to existing data that can be combined and manipulated to create larger virtual datasets using `xarray`. These datasets can then be exported to `kerchunk` reference format or to an `Icechunk` store, without ever copying or moving the existing data files.
+ [Currently only `s3` compatible storage and `local` storage are supported for virtual references](#virtual-reference-storage-support). Support for other storage types like [`gcs`](https://github.com/earth-mover/icechunk/issues/524), [`azure`](https://github.com/earth-mover/icechunk/issues/602), and [`https`](https://github.com/earth-mover/icechunk/issues/526) is on the roadmap.
## Creating a virtual dataset with VirtualiZarr
We are going to create a virtual dataset pointing to all of the [OISST](https://www.ncei.noaa.gov/products/optimum-interpolation-sst) data for August 2024. This data is distributed publicly as netCDF files on AWS S3, with one netCDF file containing the Sea Surface Temperature (SST) data for each day of the month. We are going to use `VirtualiZarr` to combine all of these files into a single virtual dataset spanning the entire month, then write that dataset to Icechunk for use in analysis.
-!!! note
-
- At this point you should have followed the instructions [here](https://github.com/earth-mover/icechunk/issues/197) to install the necessary experimental dependencies.
-
-Before we get started, we also need to install `fsspec` and `s3fs` for working with data on s3.
+Before we get started, we need to install `virtualizarr` and `icechunk`. We also need to install `fsspec` and `s3fs` for working with data on S3.
```shell
-pip install fsspec s3fs
+pip install virtualizarr icechunk fsspec s3fs
```
First, we need to find all of the files we are interested in. We will do this with fsspec, using a `glob` expression to find every netCDF file in the August 2024 folder in the bucket:
@@ -83,43 +78,30 @@ virtual_ds = xr.concat(
# err (time, zlev, lat, lon) int16 64MB ManifestArray Size: 17MB
# Dimensions: (time: 36, y: 205, x: 275)
# Coordinates:
@@ -153,7 +154,7 @@ xr.open_zarr(store, consolidated=False)
We can also read data from previous snapshots by checking out prior versions:
```python
-session = repo.readable_session(snapshot_id='ME4VKFPA5QAY0B2YSG8G')
+session = repo.readonly_session(snapshot_id=first_snapshot)
xr.open_zarr(session.store, consolidated=False)
# Size: 9MB
diff --git a/docs/docs/sample-datasets.md b/docs/docs/sample-datasets.md
index e3ceb7a8..7216eb4f 100644
--- a/docs/docs/sample-datasets.md
+++ b/docs/docs/sample-datasets.md
@@ -6,9 +6,67 @@ title: Sample Datasets
!!! warning
This page is under construction. The listed datasets are outdated and will not work until the icechunk format is more stable.
-
## Native Datasets
+### Weatherbench2 ERA5
+
+=== "AWS"
+
+    ```python
+    import icechunk as ic
+    import xarray as xr
+
+    storage = ic.s3_storage(
+        bucket="icechunk-public-data",
+        prefix="v01/era5_weatherbench2",
+        region="us-east-1",
+        anonymous=True,
+    )
+
+    repo = ic.Repository.open(storage=storage)
+    session = repo.readonly_session("main")
+    ds = xr.open_dataset(
+        session.store, group="1x721x1440", engine="zarr", chunks=None, consolidated=False
+    )
+    ```
+
+=== "Google Cloud"
+
+    ```python
+    import icechunk as ic
+    import xarray as xr
+
+    storage = ic.gcs_storage(
+        bucket="icechunk-public-data-gcs",
+        prefix="v01/era5_weatherbench2",
+    )
+
+    repo = ic.Repository.open(storage=storage)
+    session = repo.readonly_session("main")
+    ds = xr.open_dataset(
+        session.store, group="1x721x1440", engine="zarr", chunks=None, consolidated=False
+    )
+    ```
+
## Virtual Datasets
### NOAA [OISST](https://www.ncei.noaa.gov/products/optimum-interpolation-sst) Data
diff --git a/docs/docs/spec.md b/docs/docs/spec.md
index 3a582341..58292dbb 100644
--- a/docs/docs/spec.md
+++ b/docs/docs/spec.md
@@ -72,7 +72,6 @@ Finally, in an atomic put-if-not-exists operation, to commit the transaction, it
This operation may fail if a different client has already committed the next snapshot.
In this case, the client may attempt to resolve the conflicts and retry the commit.
-
```mermaid
flowchart TD
subgraph metadata[Metadata]
@@ -121,6 +120,7 @@ All data and metadata files are stored within a root directory (typically a pref
- `$ROOT/snapshots/` snapshot files
- `$ROOT/attributes/` attribute files
- `$ROOT/manifests/` chunk manifests
+- `$ROOT/transactions/` transaction log files
- `$ROOT/chunks/` chunks
### File Formats
@@ -128,7 +128,6 @@ All data and metadata files are stored within a root directory (typically a pref
!!! warning
The actual file formats used for each type of metadata file are in flux. The spec currently describes the data structures encoded in these files, rather than a specific file format.
-
### Reference Files
Similar to Git, Icechunk supports the concept of _branches_ and _tags_.
@@ -149,9 +148,8 @@ Different client sessions may simultaneously create two inconsistent snapshots;
References (both branches and tags) are stored as JSON files, the content is a JSON object with:
-* keys: a single key `"snapshot"`,
-* value: a string representation of the snapshot id, using [Base 32 Crockford](https://www.crockford.com/base32.html) encoding. The snapshot id is 12 byte random binary, so the encoded string has 20 characters.
-
+- keys: a single key `"snapshot"`,
+- value: a string representation of the snapshot id, using [Base 32 Crockford](https://www.crockford.com/base32.html) encoding. The snapshot id is 12 random bytes, so the encoded string has 20 characters.
Here is an example of a JSON file corresponding to a tag or branch:
@@ -186,6 +184,7 @@ Branch references are stored in the `refs/` directory within a subdirectory corr
Branch names may not contain the `/` character.
To facilitate easy lookups of the latest branch reference, we use the following encoding for the sequence number:
+
- subtract the sequence number from the integer `1099511627775`
- encode the resulting integer as a string using [Base 32 Crockford](https://www.crockford.com/base32.html)
- left-pad the string with 0s to a length of 8 characters (see the sketch below)
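+
+A non-normative Python sketch of this encoding (the Crockford digit alphabet is written out by hand here; the Icechunk implementation is authoritative):
+
+```python
+CROCKFORD = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"  # Base 32 Crockford digit set
+
+def encode_branch_sequence(seq: int) -> str:
+    # Invert so that higher sequence numbers produce lexicographically smaller strings,
+    # which makes the latest branch reference easy to find in a sorted listing.
+    n = 1099511627775 - seq
+    digits = ""
+    while n > 0:
+        n, rem = divmod(n, 32)
+        digits = CROCKFORD[rem] + digits
+    # left-pad with 0s to a fixed width of 8 characters
+    return digits.rjust(8, "0")
+
+print(encode_branch_sequence(0))  # "ZZZZZZZZ"
+print(encode_branch_sequence(1))  # "ZZZZZZZY"
+```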
@@ -216,30 +215,8 @@ Tags cannot be deleted once created.
The snapshot file fully describes the schema of the repository, including all arrays and groups.
-The snapshot file is currently encoded using [MessagePack](https://msgpack.org/), but this may change before Icechunk version 1.0. Given the alpha status of this spec, the best way to understand the information stored
-in the snapshot file is through the data structure used internally by the Icechunk library for serialization. This data structure will most certainly change before the spec stabilization:
-
-```rust
-pub struct Snapshot {
- pub icechunk_snapshot_format_version: IcechunkFormatVersion,
- pub icechunk_snapshot_format_flags: BTreeMap,
-
- pub manifest_files: Vec,
- pub attribute_files: Vec,
-
- pub total_parents: u32,
- pub short_term_parents: u16,
- pub short_term_history: VecDeque,
-
- pub metadata: SnapshotMetadata,
- pub started_at: DateTime,
- pub properties: SnapshotProperties,
- nodes: BTreeMap,
-}
-```
-
-To get full details on what each field contains, please refer to the [Icechunk library code](https://github.com/earth-mover/icechunk/blob/f460a56577ec560c4debfd89e401a98153cd3560/icechunk/src/format/snapshot.rs#L97).
-
+The snapshot file is encoded using [flatbuffers](https://github.com/google/flatbuffers). The IDL for the
+on-disk format can be found in [the repository file](https://github.com/earth-mover/icechunk/tree/main/icechunk/flatbuffers/snapshot.fbs).
### Attributes Files
@@ -248,8 +225,7 @@ Attribute files hold user-defined attributes separately from the snapshot file.
!!! warning
Attribute files have not been implemented.
-The on-disk format for attribute files has not been defined yet, but it will probably be a
-MessagePack serialization of the attributes map.
+The on-disk format for attribute files has not been defined in full yet.
### Chunk Manifest Files
@@ -257,28 +233,14 @@ A chunk manifest file stores chunk references.
Chunk references from multiple arrays can be stored in the same chunk manifest.
The chunks from a single array can also be spread across multiple manifests.
-Manifest files are currently encoded using [MessagePack](https://msgpack.org/), but this may change before Icechunk version 1.0. Given the alpha status of this spec, the best way to understand the information stored
-in the snapshot file is through the data structure used internally by the Icechunk library. This data structure will most certainly change before the spec stabilization:
-
-```rust
-pub struct Manifest {
- pub icechunk_manifest_format_version: IcechunkFormatVersion,
- pub icechunk_manifest_format_flags: BTreeMap,
- chunks: BTreeMap<(NodeId, ChunkIndices), ChunkPayload>,
-}
-
-pub enum ChunkPayload {
- Inline(Bytes),
- Virtual(VirtualChunkRef),
- Ref(ChunkRef),
-}
-```
+Manifest files are encoded using [flatbuffers](https://github.com/google/flatbuffers). The IDL for the
+on-disk format can be found in [the repository file](https://github.com/earth-mover/icechunk/tree/main/icechunk/flatbuffers/manifest.fbs).
The most important part to understand from the data structure is the fact that manifests can hold three types of references:
-* Native (`Ref`), pointing to the id of a chunk within the Icechunk repository.
-* Inline (`Inline`), an optimization for very small chunks that can be embedded directly in the manifest. Mostly used for coordinate arrays.
-* Virtual (`Virtual`), pointing to a region of a file outside of the Icechunk repository, for example,
+- Native (`Ref`), pointing to the id of a chunk within the Icechunk repository.
+- Inline (`Inline`), an optimization for very small chunks that can be embedded directly in the manifest. Mostly used for coordinate arrays.
+- Virtual (`Virtual`), pointing to a region of a file outside of the Icechunk repository, for example,
a chunk that is inside a NetCDF file in object store
To get full details on what each field contains, please refer to the [Icechunk library code](https://github.com/earth-mover/icechunk/blob/f460a56577ec560c4debfd89e401a98153cd3560/icechunk/src/format/manifest.rs#L106).
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
index 3b8fc81c..41fa880d 100644
--- a/docs/mkdocs.yml
+++ b/docs/mkdocs.yml
@@ -173,6 +173,7 @@ nav:
- Icechunk Python:
- Quickstart: icechunk-python/quickstart.md
- Configuration: icechunk-python/configuration.md
+ - Storage: icechunk-python/storage.md
- FAQ: icechunk-python/faq.md
- Xarray: icechunk-python/xarray.md
- Parallel Writes: icechunk-python/parallel.md
diff --git a/icechunk-python/Cargo.toml b/icechunk-python/Cargo.toml
index 11016229..3d5c7fa7 100644
--- a/icechunk-python/Cargo.toml
+++ b/icechunk-python/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "icechunk-python"
-version = "0.1.0"
+version = "0.2.3"
description = "Transactional storage engine for Zarr designed for use on cloud object storage"
readme = "../README.md"
repository = "https://github.com/earth-mover/icechunk"
@@ -21,7 +21,7 @@ crate-type = ["cdylib"]
bytes = "1.9.0"
chrono = { version = "0.4.39" }
futures = "0.3.31"
-icechunk = { path = "../icechunk", version = "0.1.0" }
+icechunk = { path = "../icechunk", version = "0.2.3", features = ["logs"] }
itertools = "0.14.0"
pyo3 = { version = "0.23", features = [
"chrono",
@@ -37,6 +37,7 @@ serde_json = "1.0.137"
async-trait = "0.1.85"
typetag = "0.2.19"
serde = { version = "1.0.217", features = ["derive", "rc"] }
+miette = { version = "7.5.0", features = ["fancy"] }
[lints]
workspace = true
diff --git a/icechunk-python/benchmarks/README.md b/icechunk-python/benchmarks/README.md
index 0043ce19..0a7e7ab3 100644
--- a/icechunk-python/benchmarks/README.md
+++ b/icechunk-python/benchmarks/README.md
@@ -25,7 +25,6 @@ pytest -nauto -m setup_benchmarks --force-setup=False benchmarks/
```
Use `---icechunk-prefix` to add an extra prefix during both setup and running of benchmarks.
-
### ERA5
`benchmarks/create_era5.py` creates an ERA5 dataset.
@@ -88,6 +87,28 @@ test_time_getsize_prefix[era5-single] (NOW) 2.2133 (1.0)
--------------------------------------------------------------------------
```
+### Notes
+
+#### Where to run the benchmarks?
+
+Pass the `--where [local|s3|gcs|tigris]` flag to control where benchmarks are run.
+```sh
+python benchmarks/runner.py --where gcs v0.1.2
+```
+
+By default all benchmarks are run locally:
+1. A temporary directory is used as a staging area.
+2. A new virtual env is created there and the dev version is installed using `pip` and a github URI. *This means that you can only benchmark commits that have been pushed to Github.*
+
+It is possible to run the benchmarks in the cloud using Coiled. You will need to be a member of the Coiled workspaces: `earthmover-devs` (AWS), `earthmover-devs-gcp` (GCS) and `earthmover-devs-azure` (Azure).
+1. We create a new "coiled software environment" with a specific name.
+2. We use `coiled run` targeting a specific machine type, with a specific software env.
+3. The VM stays alive for 10 minutes to allow for quick iteration.
+4. Coiled does not sync stdout until the pytest command is done, for some reason. See the logs on the Coiled platform for quick feedback.
+5. We use the `--sync` flag, so you will need [`mutagen`](https://mutagen.io/documentation/synchronization/) installed on your system. This will sync the benchmark JSON outputs between the VM and your machine.
+
+Downsides:
+
+1. At the moment, we can only benchmark released versions of icechunk. We may need a more complicated Docker container strategy in the future to support dev branch benchmarks.
+2. When a new env is created, the first run always fails :/. The second run works though, so just re-run.
+
### `runner.py`
`runner.py` abstracts the painful task of setting up envs with different versions (with potential format changes), and recreating datasets where needed.
diff --git a/icechunk-python/benchmarks/coiled_runner.py b/icechunk-python/benchmarks/coiled_runner.py
new file mode 100644
index 00000000..f5f4af68
--- /dev/null
+++ b/icechunk-python/benchmarks/coiled_runner.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+#
+# This is just a scratch script for testing purposes
+# coiled notebook start --sync --software icechunk-alpha-12 --vm-type m5.4xlarge
+import subprocess
+
+# software = "icechunk-alpha-12"
+vm_type = {
+ "s3": "m5.4xlarge",
+ "gcs": None,
+ "tigris": None,
+}
+ref = "icechunk-v0.1.0-alpha.12"
+
+COILED_SOFTWARE = {
+ "icechunk-v0.1.0-alpha.1": "icechunk-alpha-release",
+ "icechunk-v0.1.0-alpha.12": "icechunk-alpha-12",
+}
+software = COILED_SOFTWARE[ref]
+
+cmd = f'python benchmarks/runner.py --where coiled --pytest "-k zarr_open" {ref}'
+subprocess.run(
+ [
+ "coiled",
+ "run",
+ "--name",
+ "icebench-712f1eb2",
+ "--sync",
+ "--sync-ignore='python/ reports/ profiling/'",
+ "--keepalive",
+ "5m",
+ "--workspace=earthmover-devs",
+ "--vm-type=m5.4xlarge",
+ "--software=icechunk-bench-712f1eb2",
+ "--region=us-east-1",
+ "pytest -v benchmarks/",
+ ]
+)
diff --git a/icechunk-python/benchmarks/conftest.py b/icechunk-python/benchmarks/conftest.py
index 20bd3ce1..f8b02bec 100644
--- a/icechunk-python/benchmarks/conftest.py
+++ b/icechunk-python/benchmarks/conftest.py
@@ -1,6 +1,13 @@
import pytest
-from benchmarks.datasets import ERA5, ERA5_SINGLE, GB_8MB_CHUNKS, GB_128MB_CHUNKS
+from benchmarks.datasets import (
+ ERA5,
+ ERA5_ARCO,
+ ERA5_SINGLE,
+ GB_8MB_CHUNKS,
+ GB_128MB_CHUNKS,
+ TEST_BUCKETS,
+)
from icechunk import Repository, local_filesystem_storage
from zarr.abc.store import Store
@@ -12,21 +19,32 @@ def repo(tmpdir: str) -> Repository:
@pytest.fixture(
params=[
- pytest.param(ERA5, id="era5-weatherbench"),
- pytest.param(ERA5_SINGLE, id="era5-single"),
- pytest.param(GB_128MB_CHUNKS, id="gb-128mb"),
pytest.param(GB_8MB_CHUNKS, id="gb-8mb"),
+ pytest.param(GB_128MB_CHUNKS, id="gb-128mb"),
+ pytest.param(ERA5_SINGLE, id="era5-single"),
+ pytest.param(ERA5, id="era5-weatherbench"),
+ pytest.param(ERA5_ARCO, id="era5-arco"),
],
)
def synth_dataset(request) -> Store:
"""For now, these are synthetic datasets stored in the cloud."""
extra_prefix = request.config.getoption("--icechunk-prefix")
+ where = request.config.getoption("--where")
ds = request.param
+ if where == "local" and ds.skip_local:
+ pytest.skip()
# for some reason, this gets run multiple times so we apply the prefix repeatedly
# if we don't catch that :(
- ds.storage_config = ds.storage_config.with_extra(
- prefix=extra_prefix, force_idempotent=True
- )
+ ds.storage_config = ds.storage_config.with_overwrite(
+ **TEST_BUCKETS[where]
+ ).with_extra(prefix=extra_prefix, force_idempotent=True)
+ if ds.setupfn is None:
+ # these datasets aren't automatically set up
+ # so skip if the data haven't been written yet.
+ try:
+ ds.store()
+ except ValueError as e:
+ pytest.skip(reason=str(e))
return ds
@@ -61,3 +79,10 @@ def pytest_addoption(parser):
for this icechunk version at that URI. True by default.
""",
)
+
+ parser.addoption(
+ "--where",
+ action="store",
+ help="Where to run icechunk benchmarks? [local|s3|gcs|tigris].",
+ default="local",
+ )
diff --git a/icechunk-python/benchmarks/create_era5.py b/icechunk-python/benchmarks/create_era5.py
index 5f9d3d8b..3ff5fe45 100644
--- a/icechunk-python/benchmarks/create_era5.py
+++ b/icechunk-python/benchmarks/create_era5.py
@@ -3,113 +3,257 @@
# 1. just create-deepak-env v0.1.0a12
# 2. conda activate icechunk-v0.1.0a12
# 3. python benchmarks/create-era5.py
+
import argparse
import datetime
-import logging
-import warnings
+import math
+import random
+from enum import StrEnum, auto
+from typing import Any
-import helpers
-from datasets import ERA5, Dataset
+import humanize
+import pandas as pd
from packaging.version import Version
+import dask
import icechunk as ic
import xarray as xr
+import zarr
+from benchmarks import helpers
+from benchmarks.datasets import Dataset, IngestDataset
+from dask.diagnostics import ProgressBar
from icechunk.xarray import to_icechunk
-logger = logging.getLogger("icechunk-bench")
-logger.setLevel(logging.INFO)
-console_handler = logging.StreamHandler()
-logger.addHandler(console_handler)
+logger = helpers.setup_logger()
+
+ICECHUNK_FORMAT = f"v{ic.spec_version():02d}"
+ZARR_KWARGS = dict(zarr_format=3, consolidated=False)
+
+
+class Mode(StrEnum):
+ APPEND = auto()
+ CREATE = auto()
+ OVERWRITE = auto()
+ VERIFY = auto()
+
+
+ERA5_WB = IngestDataset(
+ name="ERA5-WB",
+ prefix="era5_weatherbench2",
+ source_uri="gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr",
+ engine="zarr",
+ read_chunks={"time": 24 * 3, "level": 1},
+ write_chunks={"time": 1, "level": 1, "latitude": 721, "longitude": 1440},
+ group="1x721x1440",
+ arrays=["2m_temperature", "10m_u_component_of_wind", "10m_v_component_of_wind"],
+)
+
+
+def verify(dataset: Dataset, *, ingest: IngestDataset, seed: int | None = None):
+ random.seed(seed)
+
+ format = ICECHUNK_FORMAT
+ prefix = f"{format}/"
+ dataset.storage_config = dataset.storage_config.with_extra(prefix=prefix)
+ repo = ic.Repository.open(dataset.storage)
+ session = repo.readonly_session(branch="main")
+ instore = xr.open_dataset(
+ session.store, group=dataset.group, engine="zarr", chunks=None, consolidated=False
+ )
+ time = pd.Timestamp(random.choice(instore.time.data.tolist()))
+ logger.info(f"Verifying {ingest.name} for {seed=!r}, {time=!r}")
+ actual = instore.sel(time=time)
+
+ ds = ingest.open_dataset(chunks=None)
+ expected = ds[list(instore.data_vars)].sel(time=time)
+
+ # I add global attrs, don't compare those
+ expected.attrs.clear()
+ actual.attrs.clear()
+ # TODO: Parallelize the compare in `assert_identical` upstream
+ with ProgressBar():
+ actual, expected = dask.compute(actual, expected)
+ # assert (expected == actual).all().to_array().all()
+ xr.testing.assert_identical(expected, actual)
+ logger.info("Successfully verified!")
-# @coiled.function
-def write_era5(dataset: Dataset, *, ref, arrays_to_write):
+
+def write(
+ dataset: Dataset,
+ *,
+ ingest: IngestDataset,
+ mode: Mode,
+ nyears: int | None = None,
+ arrays_to_write: list[str] | None = None,
+ extra_attrs: dict[str, str] | None = None,
+ initialize_all_vars: bool = False,
+ dry_run: bool = False,
+) -> None:
"""
- 1. We write all the metadata and coordinate arrays to make a "big" snapshot.
- 2. We only write a few arrays to save time.
+ dataset: Dataset to write
+ arrays_to_write: list of array names to write
+ extra_attrs: any attributes to add
+ initialize_all_vars: whether to write all coordinate arrays, and metadata for ALL data_vars.
+
+ Usually, initialize_all_vars=True for benchmarks, but not for the "public dataset".
+ For benchmarks,
+ 1. We write all the metadata and coordinate arrays to make a "big" snapshot.
+ 2. We only write a few arrays to save time.
"""
import coiled
import distributed
- SELECTOR = {"time": slice(5 * 365 * 24)}
- chunk_shape = {"time": 1, "level": 1, "latitude": 721, "longitude": 1440}
- zarr_kwargs = dict(group=dataset.group, zarr_format=3, consolidated=False)
-
- with warnings.catch_warnings():
- warnings.simplefilter("ignore", category=UserWarning)
- ds = xr.open_zarr(
- "gs://weatherbench2/datasets/era5/1959-2023_01_10-full_37-1h-0p25deg-chunk-1.zarr",
- chunks={"time": 24 * 3, "level": 1},
- ).drop_encoding()
- for v in ds:
- ds[v].encoding["chunks"] = tuple(chunk_shape[dim] for dim in ds[v].dims)
-
- towrite = ds[arrays_to_write].isel(SELECTOR)
- towrite.attrs["written_arrays"] = " ".join(towrite.data_vars)
- towrite.attrs["icechunk_commit"] = helpers.get_commit(ref)
- towrite.attrs["icechunk_ref"] = ref
+ SELECTOR = {"time": slice(nyears * 365 * 24) if nyears is not None else slice(None)}
+ if mode in [Mode.CREATE, Mode.OVERWRITE]:
+ write_mode = "w"
+ elif mode is Mode.APPEND:
+ write_mode = "a"
+
+ ic_kwargs = dict(group=dataset.group, mode=write_mode)
+
+ ds = ingest.open_dataset()
+ if arrays_to_write is not None:
+ towrite = ds[arrays_to_write].isel(SELECTOR)
+ else:
+ towrite = ds.isel(SELECTOR)
towrite.attrs["selector"] = str(SELECTOR)
+ towrite.attrs.update(extra_attrs or {})
+ for v in towrite:
+ towrite[v].encoding["chunks"] = tuple(
+ ingest.write_chunks[dim] for dim in ds[v].dims
+ )
+
+ nchunks = tuple(
+ math.prod((var.sizes[dim] // ingest.write_chunks[dim] + 1) for dim in var.dims)
+ for _, var in towrite.data_vars.items()
+ )
+ logger.info(
+ f"Size: {humanize.naturalsize(towrite.nbytes)}, "
+ f"Total nchunks= {humanize.intcomma(sum(nchunks))}, "
+ f"per array: {[humanize.intcomma(i) for i in nchunks]}"
+ )
repo = ic.Repository.open(dataset.storage)
- logger.info("Initializing dataset.")
- session = repo.writable_session("main")
- ds.to_zarr(session.store, compute=False, **zarr_kwargs)
- session.commit("initialized dataset")
- logger.info("Finished initializing dataset.")
+ if dry_run:
+ print("Dry run. Exiting")
+ return
+ ckwargs = dataset.storage_config.get_coiled_kwargs()
session = repo.writable_session("main")
- # FIXME: use name
- # # name=f"earthmover/{ref}",
- with coiled.Cluster(n_workers=(4, 200), worker_cpu=2) as cluster:
- client = distributed.Client(cluster)
+ with coiled.Cluster(
+ name=f"icechunk-ingest-{ICECHUNK_FORMAT}-{ingest.name}",
+ shutdown_on_close=False,
+ n_workers=(4, 200),
+ worker_cpu=2,
+ workspace=ckwargs["workspace"],
+ region=ckwargs["region"],
+ ) as cluster:
+ # https://docs.coiled.io/user_guide/clusters/environ.html
+ cluster.send_private_envs(dataset.storage_config.env_vars)
+ client = distributed.Client(cluster) # type: ignore[no-untyped-call]
print(client)
- with distributed.performance_report(
- f"reports/era5-ingest-{ref}-{datetime.datetime.now()}.html"
+ with distributed.performance_report( # type: ignore[no-untyped-call]
+ f"reports/{ingest.name}-ingest-{dataset.storage_config.store}-{ICECHUNK_FORMAT}-{datetime.datetime.now()}.html"
):
- logger.info(f"Started writing {arrays_to_write=}.")
- to_icechunk(
- towrite, session=session, region="auto", **zarr_kwargs, split_every=32
- )
+ logger.info(f"Started writing {tuple(towrite.data_vars)}.")
+ with zarr.config.set({"async.concurrency": 24}):
+ to_icechunk(towrite, session=session, **ic_kwargs, split_every=32)
session.commit("ingest!")
- logger.info(f"Finished writing {arrays_to_write=}.")
+ logger.info(f"Finished writing {tuple(towrite.data_vars)}.")
-def setup_era5_weatherbench2(
- dataset: Dataset, *, ref: str, arrays_to_write: list[str]
+def setup_dataset(
+ dataset: Dataset,
+ *,
+ ingest: IngestDataset,
+ mode: Mode,
+ dry_run: bool = False,
+ **kwargs: Any,
) -> None:
- commit = helpers.get_commit(ref)
- logger.info(f"Writing ERA5 for {ref=}, {commit=}, {arrays_to_write=}")
- prefix = f"benchmarks/{ref}_{commit}/"
+ # commit = helpers.get_commit(ref)
+ format = ICECHUNK_FORMAT
+ logger.info(f"Writing {ingest.name} for {format}, {kwargs=}")
+ prefix = f"{format}/"
dataset.storage_config = dataset.storage_config.with_extra(prefix=prefix)
- dataset.create()
- write_era5(dataset, ref=ref, arrays_to_write=arrays_to_write)
+ if mode is Mode.CREATE:
+ logger.info("Creating new repository")
+ repo = dataset.create(clear=True)
+ logger.info("Initializing root group")
+ session = repo.writable_session("main")
+ zarr.open_group(session.store, mode="w-")
+ session.commit("initialized root group")
+ logger.info("Initialized root group")
+
+ write(
+ dataset,
+ ingest=ingest,
+ mode=mode,
+ initialize_all_vars=False,
+ dry_run=dry_run,
+ **kwargs,
+ )
def get_version() -> str:
version = Version(ic.__version__)
- if "a" in version.pre:
+ if version.pre is not None and "a" in version.pre:
return f"icechunk-v{version.base_version}-alpha.{version.pre[1]}"
else:
- raise NotImplementedError
+ return f"icechunk-v{version.base_version}"
if __name__ == "__main__":
helpers.assert_cwd_is_icechunk_python()
parser = argparse.ArgumentParser()
- # parser.add_argument("ref", help="ref to run ingest for")
+ parser.add_argument("store", help="object store to write to")
+ parser.add_argument(
+ "--mode", help="'create'/'overwrite'/'append'/'verify'", default="append"
+ )
+ parser.add_argument(
+ "--nyears", help="number of years to write (from start)", default=None, type=int
+ )
+ parser.add_argument("--dry-run", action="store_true", help="dry run/?", default=False)
+ parser.add_argument(
+ "--append", action="store_true", help="append or create?", default=False
+ )
+ parser.add_argument("--arrays", help="arrays to write", nargs="+", default=[])
+ parser.add_argument("--seed", help="random seed for verify", default=None, type=int)
parser.add_argument(
- "--arrays",
- help="arrays to write",
- nargs="+",
- default=[
- "2m_temperature",
- "10m_u_component_of_wind",
- "10m_v_component_of_wind",
- "boundary_layer_height",
- ],
+ "--debug", help="write to debug bucket?", default=False, action="store_true"
)
+
args = parser.parse_args()
- setup_era5_weatherbench2(ERA5, ref=get_version(), arrays_to_write=args.arrays)
+ if args.mode == "create":
+ mode = Mode.CREATE
+ elif args.mode == "append":
+ mode = Mode.APPEND
+ elif args.mode == "overwrite":
+ mode = Mode.OVERWRITE
+ elif args.mode == "verify":
+ mode = Mode.VERIFY
+ else:
+ raise ValueError(
+ f"mode must be one of ['create', 'overwrite', 'append', 'verify']. Received {args.mode=!r}"
+ )
+
+ ingest = ERA5_WB
+ dataset = ingest.make_dataset(store=args.store, debug=args.debug)
+ logger.info(ingest)
+ logger.info(dataset)
+ logger.info(args)
+ ds = ingest.open_dataset()
+ if mode is Mode.VERIFY:
+ verify(dataset, ingest=ingest, seed=args.seed)
+ else:
+ setup_dataset(
+ dataset,
+ ingest=ingest,
+ nyears=args.nyears,
+ mode=mode,
+ arrays_to_write=args.arrays or ingest.arrays,
+ dry_run=args.dry_run,
+ )
diff --git a/icechunk-python/benchmarks/datasets.py b/icechunk-python/benchmarks/datasets.py
index 897d231e..4e6a24af 100644
--- a/icechunk-python/benchmarks/datasets.py
+++ b/icechunk-python/benchmarks/datasets.py
@@ -1,42 +1,111 @@
import datetime
import time
+import warnings
from collections.abc import Callable
from dataclasses import dataclass, field
from functools import partial
-from typing import Any, Self
+from typing import Any, Literal, Self, TypeAlias
import fsspec
import numpy as np
+import platformdirs
import icechunk as ic
import xarray as xr
import zarr
+from benchmarks.helpers import get_coiled_kwargs, rdms, setup_logger
rng = np.random.default_rng(seed=123)
+Store: TypeAlias = Literal["s3", "gcs", "az", "tigris"]
+PUBLIC_DATA_BUCKET = "icechunk-public-data"
+ZARR_KWARGS = dict(zarr_format=3, consolidated=False)
+
+CONSTRUCTORS = {
+ "s3": ic.s3_storage,
+ "gcs": ic.gcs_storage,
+ "tigris": ic.tigris_storage,
+ "local": ic.local_filesystem_storage,
+}
+TEST_BUCKETS = {
+ "s3": dict(store="s3", bucket="icechunk-test", region="us-east-1"),
+ "gcs": dict(store="gcs", bucket="icechunk-test-gcp", region="us-east1"),
+ # "tigris": dict(
+ # store="tigris", bucket="deepak-private-bucket" + "-test", region="iad"
+ # ),
+ "tigris": dict(store="tigris", bucket="icechunk-test", region="iad"),
+ "local": dict(store="local", bucket=platformdirs.site_cache_dir()),
+}
+BUCKETS = {
+ "s3": dict(store="s3", bucket=PUBLIC_DATA_BUCKET, region="us-east-1"),
+ "gcs": dict(store="gcs", bucket=PUBLIC_DATA_BUCKET + "-gcs", region="us-east1"),
+ "tigris": dict(store="tigris", bucket=PUBLIC_DATA_BUCKET + "-tigris", region="iad"),
+}
+
+logger = setup_logger()
+
+
+def tigris_credentials() -> tuple[str, str]:
+ import boto3
+
+ session = boto3.Session()
+ creds = session.get_credentials()
+ return {"access_key_id": creds.access_key, "secret_access_key": creds.secret_key}
+
@dataclass
class StorageConfig:
"""wrapper that allows us to config the prefix for a ref."""
- constructor: Callable
- config: Any
+ store: str | None = None
+ config: dict[str, Any] = field(default_factory=dict)
bucket: str | None = None
prefix: str | None = None
- path: str | None = None
+ region: str | None = None
+
+ @property
+ def path(self) -> str:
+ if self.store != "local":
+ raise ValueError(f"can't grab path for {self.store=!r}")
+ return f"{self.bucket}/{self.prefix}"
def create(self) -> ic.Storage:
+ if self.store is None:
+ raise ValueError("StorageConfig.store is None!")
kwargs = {}
- if self.bucket is not None:
- kwargs["bucket"] = self.bucket
- if self.prefix is not None:
- kwargs["prefix"] = self.prefix
- if self.path is not None:
+ if self.store == "local":
kwargs["path"] = self.path
- return self.constructor(config=self.config, **kwargs)
+ else:
+ if self.bucket is not None:
+ kwargs["bucket"] = self.bucket
+ if self.prefix is not None:
+ kwargs["prefix"] = self.prefix
+ if self.region is not None and self.store not in ["gcs"]:
+ kwargs["region"] = self.region
+ if self.store == "tigris":
+ kwargs.update(tigris_credentials())
+ return CONSTRUCTORS[self.store](**self.config, **kwargs)
+
+ def with_overwrite(
+ self,
+ *,
+ store: str | None = None,
+ bucket: str | None = None,
+ region: str | None = None,
+ ) -> Self:
+ return type(self)(
+ store=store if store is not None else self.store,
+ bucket=bucket if bucket is not None else self.bucket,
+ region=region if region is not None else self.region,
+ prefix=self.prefix,
+ config=self.config,
+ )
def with_extra(
- self, *, prefix: str | None = None, force_idempotent: bool = False
+ self,
+ *,
+ prefix: str | None = None,
+ force_idempotent: bool = False,
) -> Self:
if self.prefix is not None:
if force_idempotent and self.prefix.startswith(prefix):
@@ -45,33 +114,40 @@ def with_extra(
else:
new_prefix = None
- if self.path is not None:
- if force_idempotent and self.path.startswith(prefix):
- return self
- new_path = (prefix or "") + self.path
- else:
- new_path = None
return type(self)(
- constructor=self.constructor,
+ store=self.store,
bucket=self.bucket,
prefix=new_prefix,
- path=new_path,
+ region=self.region,
config=self.config,
)
+ @property
+ def env_vars(self) -> dict[str, str]:
+ # if self.store == "tigris":
+ # # https://www.tigrisdata.com/docs/iam/#create-an-access-key
+ # return {"AWS_ENDPOINT_URL_IAM": "https://fly.iam.storage.tigris.dev"}
+ return {}
+
+ @property
+ def protocol(self) -> str:
+ if self.store in ("s3", "tigris"):
+ protocol = "s3"
+ elif self.store == "gcs":
+ protocol = "gcs"
+ else:
+ protocol = "file"
+ return protocol
+
def clear_uri(self) -> str:
"""URI to clear when re-creating data from scratch."""
- if self.constructor == ic.Storage.new_s3:
- protocol = "s3://"
+ if self.store == "local":
+ return f"{self.protocol}://{self.path}"
else:
- protocol = ""
+ return f"{self.protocol}://{self.bucket}/{self.prefix}"
- if self.bucket is not None:
- return f"{protocol}{self.bucket}/{self.prefix}"
- elif self.path is not None:
- return self.path
- else:
- raise NotImplementedError("I don't know what to do here.")
+    def get_coiled_kwargs(self) -> dict[str, str]:
+ return get_coiled_kwargs(store=self.store, region=self.region)
@dataclass
@@ -81,17 +157,9 @@ class Dataset:
"""
storage_config: StorageConfig
- # data variable to load in `time_xarray_read_chunks`
- load_variables: list[str]
- # Passed to .isel for `time_xarray_read_chunks`
- chunk_selector: dict[str, Any]
- # name of (coordinate) variable used for testing "time to first byte"
- first_byte_variable: str | None
# core useful group path used to open an Xarray Dataset
group: str | None = None
- # function used to construct the dataset prior to read benchmarks
- setupfn: Callable | None = None
- _storage: ic.Storage | None = field(default=None, init=False)
+ _storage: ic.Storage | None = field(default=None, init=False, repr=False)
@property
def storage(self) -> ic.Storage:
@@ -99,19 +167,24 @@ def storage(self) -> ic.Storage:
self._storage = self.storage_config.create()
return self._storage
- def create(self) -> ic.Repository:
- clear_uri = self.storage_config.clear_uri()
- if clear_uri is None:
- raise NotImplementedError
- if not clear_uri.startswith("s3://"):
- raise NotImplementedError(
- f"Only S3 URIs supported at the moment. Received {clear_uri}"
- )
- fs = fsspec.filesystem("s3")
- try:
- fs.rm(f"{clear_uri}", recursive=True)
- except FileNotFoundError:
- pass
+ def create(self, clear: bool = False) -> ic.Repository:
+ if clear:
+ clear_uri = self.storage_config.clear_uri()
+ if clear_uri is None:
+ raise NotImplementedError
+ if self.storage_config.protocol not in ["file", "s3", "gcs"]:
+ warnings.warn(
+ f"Only clearing of GCS, S3-compatible URIs supported at the moment. Received {clear_uri!r}",
+ RuntimeWarning,
+ stacklevel=2,
+ )
+ else:
+ fs = fsspec.filesystem(self.storage_config.protocol)
+ try:
+ logger.info(f"Clearing prefix: {clear_uri!r}")
+ fs.rm(clear_uri, recursive=True)
+ except FileNotFoundError:
+ pass
return ic.Repository.create(self.storage)
@property
@@ -119,6 +192,25 @@ def store(self) -> ic.IcechunkStore:
repo = ic.Repository.open(self.storage)
return repo.readonly_session(branch="main").store
+
+@dataclass(kw_only=True)
+class BenchmarkDataset(Dataset):
+ # data variable to load in `time_xarray_read_chunks`
+ load_variables: list[str] | None = None
+ # Passed to .isel for `time_xarray_read_chunks`
+ chunk_selector: dict[str, Any] | None = None
+ # name of (coordinate) variable used for testing "time to first byte"
+ first_byte_variable: str | None
+ # function used to construct the dataset prior to read benchmarks
+ setupfn: Callable | None = None
+ # whether to skip this one on local runs
+ skip_local: bool = False
+
+ def create(self, clear: bool = True):
+ if clear is not True:
+ raise ValueError("clear *must* be true for benchmark datasets.")
+ return super().create(clear=True)
+
def setup(self, force: bool = False) -> None:
"""
force: if True, recreate from scratch. If False, try opening the store,
@@ -140,6 +232,36 @@ def setup(self, force: bool = False) -> None:
self.setupfn(self)
+@dataclass(kw_only=True)
+class IngestDataset:
+ name: str
+ source_uri: str
+ group: str
+ prefix: str
+ write_chunks: dict[str, int]
+ arrays: list[str]
+ engine: str | None = None
+ read_chunks: dict[str, int] | None = None
+
+ def open_dataset(self, chunks=None, **kwargs: Any) -> xr.Dataset:
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore", category=UserWarning)
+ return xr.open_dataset(
+ self.source_uri,
+ chunks=chunks or self.read_chunks,
+ engine=self.engine,
+ **kwargs,
+ ).drop_encoding()
+
+ def make_dataset(self, *, store: str, debug: bool) -> Dataset:
+ buckets = BUCKETS if not debug else TEST_BUCKETS
+ extra_prefix = f"_{rdms()}" if debug else ""
+ storage_config = StorageConfig(
+ prefix=self.prefix + extra_prefix, **buckets[store]
+ )
+ return Dataset(storage_config=storage_config, group=self.group)
+
+
def setup_synthetic_gb_dataset(
dataset: Dataset,
chunk_shape: tuple[int, ...],
@@ -167,7 +289,7 @@ def setup_era5_single(dataset: Dataset):
# FIXME: move to earthmover-sample-data
url = "https://nsf-ncar-era5.s3.amazonaws.com/e5.oper.an.pl/194106/e5.oper.an.pl.128_060_pv.ll025sc.1941060100_1941060123.nc"
- print(f"Reading {url}")
+ logger.info(f"Reading {url}")
tic = time.time()
ds = xr.open_dataset(
# using pooch means we download only once on a local machine
@@ -178,7 +300,7 @@ def setup_era5_single(dataset: Dataset):
engine="h5netcdf",
)
ds = ds.drop_encoding().load()
- print(f"Loaded data in {time.time() - tic} seconds")
+ logger.info(f"Loaded data in {time.time() - tic} seconds")
repo = dataset.create()
session = repo.writable_session("main")
@@ -186,63 +308,117 @@ def setup_era5_single(dataset: Dataset):
encoding = {
"PV": {"compressors": [zarr.codecs.ZstdCodec()], "chunks": (1, 1, 721, 1440)}
}
- print("Writing data...")
+ logger.info("Writing data...")
ds.to_zarr(
session.store, mode="w", zarr_format=3, consolidated=False, encoding=encoding
)
- print(f"Wrote data in {time.time() - tic} seconds")
+ logger.info(f"Wrote data in {time.time() - tic} seconds")
session.commit(f"wrote data at {datetime.datetime.now(datetime.UTC)}")
-# TODO: passing Storage directly is nice, but doesn't let us add an extra prefix.
-ERA5 = Dataset(
- storage_config=StorageConfig(
- constructor=ic.Storage.new_s3,
- bucket="icechunk-test",
- prefix="era5-weatherbench",
- config=ic.S3Options(),
- ),
+def setup_ingest_for_benchmarks(dataset: Dataset, *, ingest: IngestDataset) -> None:
+ """
+ For benchmarks, we
+ 1. add a specific prefix.
+ 2. always write the metadata for the WHOLE dataset
+ 3. then append a small subset of data for a few arrays
+ """
+ from benchmarks.create_era5 import Mode, write
+
+ repo = dataset.create()
+ ds = ingest.open_dataset()
+ logger.info("Initializing dataset for benchmarks..")
+ session = repo.writable_session("main")
+ ds.to_zarr(
+ session.store, compute=False, mode="w-", group=dataset.group, **ZARR_KWARGS
+ )
+ session.commit("initialized dataset")
+ logger.info("Finished initializing dataset.")
+
+ if ingest.arrays:
+ attrs = {
+ "written_arrays": " ".join(ingest.arrays),
+ }
+ write(
+ dataset,
+ ingest=ingest,
+ mode=Mode.APPEND,
+ extra_attrs=attrs,
+ arrays_to_write=ingest.arrays,
+ initialize_all_vars=False,
+ )
+
+
+def setup_era5(*args, **kwargs):
+ from benchmarks.create_era5 import setup_for_benchmarks
+
+ return setup_for_benchmarks(*args, **kwargs, arrays_to_write=[])
+
+
+ERA5_ARCO_INGEST = IngestDataset(
+ name="ERA5-ARCO",
+ prefix="era5_arco",
+ source_uri="gs://gcp-public-data-arco-era5/ar/full_37-1h-0p25deg-chunk-1.zarr-v3",
+ engine="zarr",
+ read_chunks={"time": 72 * 24, "level": 1},
+ write_chunks={"time": 1, "level": 1, "latitude": 721, "longitude": 1440},
+ group="1x721x1440",
+ arrays=[],
+)
+
+ERA5 = BenchmarkDataset(
+ # weatherbench2 data - 5 years
+ skip_local=False,
+ storage_config=StorageConfig(prefix="era5-weatherbench"),
load_variables=["2m_temperature"],
chunk_selector={"time": 1},
first_byte_variable="latitude",
group="1x721x1440",
# don't set setupfn here so we don't run a really expensive job
# by mistake
+ # setupfn=partial(setup_ingest_for_benchmarks, ingest=ERA5_WB),
)
-ERA5_SINGLE = Dataset(
- storage_config=StorageConfig(
- constructor=ic.Storage.new_s3,
- bucket="icechunk-test",
- prefix="perf-era5-single",
- config=ic.S3Options(),
- ),
+ERA5_ARCO = BenchmarkDataset(
+ skip_local=False,
+ storage_config=StorageConfig(prefix="era5-arco"),
+ first_byte_variable="latitude",
+ group="1x721x1440",
+ setupfn=partial(setup_ingest_for_benchmarks, ingest=ERA5_ARCO_INGEST),
+)
+
+# ERA5_LARGE = BenchmarkDataset(
+# skip_local=True,
+# storage_config=StorageConfig(
+# bucket="icechunk-public-data", prefix="era5-weatherbench2"
+# ),
+# load_variables=["2m_temperature"],
+# chunk_selector={"time": 1},
+# first_byte_variable="latitude",
+# group="1x721x1440",
+# # don't set setupfn here so we don't run a really expensive job
+# # by mistake
+# )
+
+ERA5_SINGLE = BenchmarkDataset(
+ # Single NCAR AWS PDS ERA5 netCDF
+ storage_config=StorageConfig(prefix="perf-era5-single"),
load_variables=["PV"],
chunk_selector={"time": 1},
first_byte_variable="latitude",
setupfn=setup_era5_single,
)
-GB_128MB_CHUNKS = Dataset(
- storage_config=StorageConfig(
- constructor=ic.Storage.new_s3,
- bucket="icechunk-test",
- prefix="gb-128mb-chunks",
- config=ic.S3Options(),
- ),
+GB_128MB_CHUNKS = BenchmarkDataset(
+ storage_config=StorageConfig(prefix="gb-128mb-chunks"),
load_variables=["array"],
chunk_selector={},
first_byte_variable=None,
setupfn=partial(setup_synthetic_gb_dataset, chunk_shape=(64, 512, 512)),
)
-GB_8MB_CHUNKS = Dataset(
- storage_config=StorageConfig(
- constructor=ic.Storage.new_s3,
- bucket="icechunk-test",
- prefix="gb-8mb-chunks",
- config=ic.S3Options(),
- ),
+GB_8MB_CHUNKS = BenchmarkDataset(
+ storage_config=StorageConfig(prefix="gb-8mb-chunks"),
load_variables=["array"],
chunk_selector={},
first_byte_variable=None,
@@ -250,12 +426,12 @@ def setup_era5_single(dataset: Dataset):
)
# TODO
-GPM_IMERG_VIRTUAL = Dataset(
+GPM_IMERG_VIRTUAL = BenchmarkDataset(
storage_config=StorageConfig(
- constructor=ic.Storage.new_s3,
+ store="s3",
bucket="earthmover-icechunk-us-west-2",
prefix="nasa-impact/GPM_3IMERGHH.07-virtual-1998",
- config=ic.S3Options(),
+ region="us-west-2",
# access_key_id=access_key_id,
# secret_access_key=secret,
# session_token=session_token,
diff --git a/icechunk-python/benchmarks/helpers.py b/icechunk-python/benchmarks/helpers.py
index 9737af67..35c7141a 100644
--- a/icechunk-python/benchmarks/helpers.py
+++ b/icechunk-python/benchmarks/helpers.py
@@ -1,7 +1,49 @@
+import logging
import os
import subprocess
+def setup_logger():
+ logger = logging.getLogger("icechunk-bench")
+ logger.setLevel(logging.INFO)
+ console_handler = logging.StreamHandler()
+ logger.addHandler(console_handler)
+ logger.handlers = logger.handlers[:1] # make idempotent
+ return logger
+
+
+def get_coiled_kwargs(*, store: str, region: str | None = None) -> dict[str, str]:
+ COILED_VM_TYPES = {
+ # TODO: think about these
+ "s3": "m5.4xlarge",
+ "gcs": "n2-standard-16",
+ "tigris": "m5.4xlarge",
+ }
+ DEFAULT_REGIONS = {
+ "s3": "us-east-1",
+ "gcs": "us-east1",
+ "tigris": "us-east-1",
+ "az": "eastus",
+ }
+ WORKSPACES = {
+ "s3": "earthmover-devs",
+ "tigris": "earthmover-devs",
+ "gcs": "earthmover-devs-gcp",
+ "az": "earthmover-devs-azure",
+ }
+ TIGRIS_REGIONS = {"iad": "us-east-1"}
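+    # Coiled expects AWS-style region names, so map Tigris region codes (e.g. "iad") to the closest AWS region.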
+
+ if region is None:
+ region = DEFAULT_REGIONS[store]
+ else:
+ region = TIGRIS_REGIONS[region] if store == "tigris" else region
+ return {
+ "workspace": WORKSPACES[store],
+ "region": region,
+ "vm_type": COILED_VM_TYPES[store],
+ }
+
+
def assert_cwd_is_icechunk_python():
CURRENTDIR = os.getcwd()
if not CURRENTDIR.endswith("icechunk-python"):
@@ -10,7 +52,14 @@ def assert_cwd_is_icechunk_python():
)
-def get_commit(ref: str) -> str:
+def get_full_commit(ref: str) -> str:
return subprocess.run(
["git", "rev-parse", ref], capture_output=True, text=True, check=True
- ).stdout.strip()[:8]
+ ).stdout.strip()
+
+
+def rdms() -> str:
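+    """Return a random 8-character lowercase string, used to namespace debug runs."""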
+ import random
+ import string
+
+ return "".join(random.sample(string.ascii_lowercase, k=8))
diff --git a/icechunk-python/benchmarks/most_recent.sh b/icechunk-python/benchmarks/most_recent.sh
new file mode 100644
index 00000000..bfb30fbc
--- /dev/null
+++ b/icechunk-python/benchmarks/most_recent.sh
@@ -0,0 +1,4 @@
+#!/usr/bin/env sh
+
+echo $(ls -t ./.benchmarks/**/* | head -n 1)
+pytest-benchmark compare --group=group,func,param --sort=fullname --columns=median --name=normal `ls -t ./.benchmarks/**/* | head -n 1`
diff --git a/icechunk-python/benchmarks/runner.py b/icechunk-python/benchmarks/runner.py
index 3a0c369c..1980ea02 100644
--- a/icechunk-python/benchmarks/runner.py
+++ b/icechunk-python/benchmarks/runner.py
@@ -1,6 +1,8 @@
#!/usr/bin/env python3
# helper script to run and save benchmarks against named refs.
# AKA a shitty version of asv's env management
+# FIXME:
+# 1. The Icechunk Spec Version is taken from the running env. This is wrong :(
import argparse
import glob
@@ -12,162 +14,276 @@
import tqdm
import tqdm.contrib.concurrent
-from helpers import assert_cwd_is_icechunk_python, get_commit
+from helpers import (
+ assert_cwd_is_icechunk_python,
+ get_coiled_kwargs,
+ get_full_commit,
+ setup_logger,
+)
+
+logger = setup_logger()
PIP_OPTIONS = "--disable-pip-version-check -q"
+PYTEST_OPTIONS = "-v --durations 10 --rootdir=benchmarks --tb=line"
TMP = tempfile.gettempdir()
CURRENTDIR = os.getcwd()
+
assert_cwd_is_icechunk_python()
def get_benchmark_deps(filepath: str) -> str:
+ """needed since
+ 1. benchmark deps may have changed in the meantime.
+ 2. we can't specify optional extras when installing from a subdirectory
+ https://pip.pypa.io/en/stable/topics/vcs-support/#url-fragments
+ """
with open(filepath, mode="rb") as f:
data = tomllib.load(f)
- return " ".join(data["project"]["optional-dependencies"].get("benchmark", ""))
+ return (
+ " ".join(data["project"]["optional-dependencies"].get("benchmark", ""))
+ + " "
+ + " ".join(data["project"]["optional-dependencies"].get("test", ""))
+ )
class Runner:
- activate: str = "source .venv/bin/activate"
+ bench_store_dir = None
- def __init__(self, ref: str):
+ def __init__(self, *, ref: str, where: str) -> None:
self.ref = ref
- self.commit = get_commit(ref)
- suffix = f"{self.ref}_{self.commit}"
- self.base = f"{TMP}/icechunk-bench-{suffix}"
- self.cwd = f"{TMP}/icechunk-bench-{suffix}/icechunk"
- self.pycwd = f"{TMP}/icechunk-bench-{suffix}/icechunk/icechunk-python"
+ self.full_commit = get_full_commit(ref)
+ self.commit = self.full_commit[:8]
+ self.where = where
- def initialize(self) -> None:
- ref = self.ref
+ @property
+ def pip_github_url(self) -> str:
+ # optional extras cannot be specified here, "not guaranteed to work"
+ # https://pip.pypa.io/en/stable/topics/vcs-support/#url-fragments
+ return f"git+https://github.com/earth-mover/icechunk.git@{self.full_commit}#subdirectory=icechunk-python"
- deps = get_benchmark_deps(f"{CURRENTDIR}/pyproject.toml")
- kwargs = dict(cwd=self.cwd, check=True)
- pykwargs = dict(cwd=self.pycwd, check=True)
+ @property
+ def prefix(self) -> str:
+ # try:
+ # return f"v{ic.spec_version():02d}"
+ # except AttributeError:
+ return f"{self.ref}_{self.commit}"
- print(f"checking out {ref} to {self.base}")
- subprocess.run(["mkdir", self.base], check=False)
- # TODO: copy the local one instead to save time?
- subprocess.run(
- ["git", "clone", "-q", "git@github.com:earth-mover/icechunk"],
- cwd=self.base,
- check=False,
- )
- subprocess.run(["git", "checkout", "-q", ref], **kwargs)
- subprocess.run(["python3", "-m", "venv", ".venv"], cwd=self.pycwd, check=True)
- subprocess.run(
- [
- "maturin",
- "build",
- "-q",
- "--release",
- "--out",
- "dist",
- "--find-interpreter",
- ],
- **pykwargs,
- )
- # This is quite ugly but is the only way I can figure out to force pip
- # to install the wheel we just built
- subprocess.run(
- f"{self.activate} "
- f"&& pip install {PIP_OPTIONS} icechunk[test]"
- f"&& pip install {PIP_OPTIONS} {deps}"
- f"&& pip uninstall -y icechunk"
- f"&& pip install -v icechunk --no-index --find-links=dist",
- shell=True,
- **pykwargs,
- )
+ @property
+ def ref_commit(self) -> str:
+ return f"{self.ref}_{self.commit}"
- def setup(self, force: bool):
- print(f"setup_benchmarks for {self.ref} / {self.commit}")
- subprocess.run(["cp", "-r", "benchmarks", f"{self.pycwd}"], check=True)
+ def sync_benchmarks_folder(self) -> None:
+ """Sync the benchmarks folder over to the cwd."""
+ raise NotImplementedError
+
+    def execute(self, cmd: str, **kwargs) -> None:
+ """Execute a command"""
+ raise NotImplementedError
+
+ def initialize(self) -> None:
+ """Builds virtual envs etc."""
+ self.sync_benchmarks_folder()
+
+ def setup(self, *, force: bool):
+ """Creates datasets for read benchmarks."""
+ logger.info(f"setup_benchmarks for {self.ref} / {self.commit}")
cmd = (
- f"pytest -q --durations 10 -nauto "
- "-m setup_benchmarks --force-setup={force} "
- f"--icechunk-prefix=benchmarks/{self.ref}_{self.commit}/ "
+ f"pytest {PYTEST_OPTIONS} -nauto "
+ f"-m setup_benchmarks --force-setup={force} "
+ f"--where={self.where} "
+ f"--icechunk-prefix=benchmarks/{self.prefix}/ "
"benchmarks/"
)
- subprocess.run(
- f"{self.activate} && {cmd}", cwd=self.pycwd, check=True, shell=True
- )
+ logger.info(cmd)
+ self.execute(cmd, check=True)
def run(self, *, pytest_extra: str = "") -> None:
- print(f"running benchmarks for {self.ref} / {self.commit}")
-
- subprocess.run(["cp", "-r", "benchmarks", f"{self.pycwd}"], check=True)
+ """Actually runs the benchmarks."""
+ logger.info(f"running benchmarks for {self.ref} / {self.commit}")
# shorten the name so `pytest-benchmark compare` is readable
- clean_ref = ref.removeprefix("icechunk-v0.1.0-alph")
+ clean_ref = self.ref.removeprefix("icechunk-v0.1.0-alph")
+ assert self.bench_store_dir is not None
# Note: .benchmarks is the default location for pytest-benchmark
cmd = (
- f"pytest -q --durations 10 "
- f"--benchmark-storage={CURRENTDIR}/.benchmarks "
- f"--benchmark-save={clean_ref}_{self.commit} "
- f"--icechunk-prefix=benchmarks/{ref}_{self.commit}/ "
- f"{pytest_extra} "
+ f"pytest {pytest_extra} "
+ f"--benchmark-storage={self.bench_store_dir}/.benchmarks "
+ f"--benchmark-save={clean_ref}_{self.commit}_{self.where} "
+ f"--where={self.where} "
+ f"--icechunk-prefix=benchmarks/{self.prefix}/ "
+ f"{PYTEST_OPTIONS} "
"benchmarks/"
)
print(cmd)
+ self.execute(cmd, check=False)
+
+
+class LocalRunner(Runner):
+ activate: str = "source .venv/bin/activate"
+ bench_store_dir = CURRENTDIR
+
+ def __init__(self, *, ref: str, where: str):
+ super().__init__(ref=ref, where=where)
+ suffix = self.ref_commit
+ self.base = f"{TMP}/icechunk-bench-{suffix}"
+ self.cwd = f"{TMP}/icechunk-bench-{suffix}/icechunk"
+ self.pycwd = f"{TMP}/icechunk-bench-{suffix}/icechunk/icechunk-python"
+
+ def sync_benchmarks_folder(self):
+ subprocess.run(["cp", "-r", "benchmarks", f"{self.pycwd}"], check=True)
+
+ def execute(self, cmd: str, **kwargs) -> None:
# don't stop if benchmarks fail
+ subprocess.run(f"{self.activate} && {cmd}", cwd=self.pycwd, shell=True, **kwargs)
+
+ def initialize(self) -> None:
+ logger.info(f"Running initialize for {self.ref} in {self.base}")
+
+ deps = get_benchmark_deps(f"{CURRENTDIR}/pyproject.toml")
+ subprocess.run(["mkdir", "-p", self.pycwd], check=False)
+ subprocess.run(["python3", "-m", "venv", ".venv"], cwd=self.pycwd, check=True)
+ cmd = f"pip install {PIP_OPTIONS} {self.pip_github_url} {deps}"
+ self.execute(cmd, check=True)
+ super().initialize()
+
+ def run(self, *, pytest_extra: str = "") -> None:
+ super().run(pytest_extra=pytest_extra)
+ if len(refs) > 1:
+ files = sorted(
+ glob.glob("./.benchmarks/**/*.json", recursive=True),
+ key=os.path.getmtime,
+ reverse=True,
+ )[-len(refs) :]
+ # TODO: Use `just` here when we figure that out.
+ subprocess.run(
+ [
+ "pytest-benchmark",
+ "compare",
+ "--group=group,func,param",
+ "--sort=fullname",
+ "--columns=median",
+ "--name=normal",
+ *files,
+ ]
+ )
+
+
+class CoiledRunner(Runner):
+ bench_store_dir = "."
+
+    def get_coiled_run_args(self) -> tuple[str, ...]:
+ ckwargs = self.get_coiled_kwargs()
+ return (
+ "coiled",
+ "run",
+ "--interactive",
+ "--name",
+ f"icebench-{self.commit}", # cluster name
+ "--keepalive",
+ "10m",
+ f"--workspace={ckwargs['workspace']}", # cloud
+ f"--vm-type={ckwargs['vm_type']}",
+ f"--software={ckwargs['software']}",
+ f"--region={ckwargs['region']}",
+ )
+
+ def get_coiled_kwargs(self):
+ COILED_SOFTWARE = {
+ "icechunk-v0.1.0-alpha.1": "icechunk-alpha-release",
+ "icechunk-v0.1.0-alpha.12": "icechunk-alpha-12",
+ }
+
+ # using the default region here
+ kwargs = get_coiled_kwargs(store=self.where)
+ kwargs["software"] = COILED_SOFTWARE.get(
+ self.ref, f"icechunk-bench-{self.commit}"
+ )
+ return kwargs
+
+ def initialize(self) -> None:
+ import coiled
+
+ deps = get_benchmark_deps(f"{CURRENTDIR}/pyproject.toml").split(" ")
+
+ ckwargs = self.get_coiled_kwargs()
+ # repeated calls are a no-op!
+ coiled.create_software_environment(
+ name=ckwargs["software"],
+ workspace=ckwargs["workspace"],
+ conda={
+ "channels": ["conda-forge"],
+ "dependencies": ["rust", "python=3.12", "pip"],
+ },
+ pip=[self.pip_github_url, "coiled", *deps],
+ )
+ super().initialize()
+
+ def execute(self, cmd, **kwargs) -> None:
+ subprocess.run([*self.get_coiled_run_args(), cmd], **kwargs)
+
+ def sync_benchmarks_folder(self) -> None:
subprocess.run(
- f"{self.activate} && {cmd}", shell=True, cwd=self.pycwd, check=False
+ [
+ *self.get_coiled_run_args(),
+ "--file",
+ "benchmarks/",
+ "ls -alh ./.benchmarks/",
+ ],
+ check=True,
)
+ def run(self, *, pytest_extra: str = "") -> None:
+ super().run(pytest_extra=pytest_extra)
+ # This prints to screen but we could upload to a bucket in here.
+ self.execute("sh benchmarks/most_recent.sh")
+
-def init_for_ref(ref: str, force_setup: bool):
- runner = Runner(ref)
+def init_for_ref(runner: Runner):
runner.initialize()
- runner.setup(force=force_setup)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("refs", help="refs to run benchmarks for", nargs="+")
parser.add_argument("--pytest", help="passed to pytest", default="")
+ parser.add_argument("--where", help="where to run? [local|s3|gcs]", default="local")
+ parser.add_argument(
+ "--skip-setup",
+ help="skip setup step, useful for benchmarks that don't need data",
+ action="store_true",
+ default=False,
+ )
parser.add_argument(
"--force-setup", help="forced recreation of datasets?", type=bool, default=False
)
args = parser.parse_args()
refs = args.refs
- # refs = [
- # # "0.1.0-alpha.2-python", # first release
- # "icechunk-v0.1.0-alpha.8",
- # # concurrent chunk fetch
- # # list_dir reimplemented
- # # "icechunk-v0.1.0-alpha.10",
- # # metadata file download performance
- # # "icechunk-v0.1.0-alpha.11",
- # # concurrently download bytes
- # "icechunk-v0.1.0-alpha.12",
- # # "main",
- # ]
-
- tqdm.contrib.concurrent.process_map(
- partial(init_for_ref, force_setup=args.force_setup), refs
- )
- # For debugging
- # for ref in refs:
- # init_for_ref(ref, force_setup=args.force_setup)
- for ref in tqdm.tqdm(refs):
- runner = Runner(ref)
+ if args.where == "local":
+ runner_cls = LocalRunner
+ else:
+ runner_cls = CoiledRunner
+
+ runners = tuple(runner_cls(ref=ref, where=args.where) for ref in refs)
+
+    # Only the initialize step is safe to run in parallel, since two refs may share the same spec version.
+ tqdm.contrib.concurrent.process_map(partial(init_for_ref), runners)
+
+ if not args.skip_setup:
+ for runner in runners:
+ runner.setup(force=args.force_setup)
+
+ for runner in tqdm.tqdm(runners):
runner.run(pytest_extra=args.pytest)
- if len(refs) > 1:
- files = sorted(glob.glob("./.benchmarks/**/*.json", recursive=True))[-len(refs) :]
- # TODO: Use `just` here when we figure that out.
- subprocess.run(
- [
- "pytest-benchmark",
- "compare",
- "--group=group,func,param",
- "--sort=fullname",
- "--columns=median",
- "--name=normal",
- *files,
- ]
- )
+
+# Compare wish-list:
+# 1. skip differences < X%
+# 2. groupby
+# 3. better names in summary table
+# 4. Compare across object stores; same object store & compare across versions
+# 5. Compare icechunk vs plain Zarr
diff --git a/icechunk-python/benchmarks/test_benchmark_reads.py b/icechunk-python/benchmarks/test_benchmark_reads.py
index 2499f2b6..b1ba9e6e 100644
--- a/icechunk-python/benchmarks/test_benchmark_reads.py
+++ b/icechunk-python/benchmarks/test_benchmark_reads.py
@@ -40,12 +40,19 @@ def test_time_create_store(synth_dataset: Dataset, benchmark) -> None:
def test_time_getsize_key(synth_dataset: Dataset, benchmark) -> None:
from zarr.core.sync import sync
+ if synth_dataset.load_variables is None:
+ pytest.skip()
+
store = synth_dataset.store
@benchmark
def fn():
for array in synth_dataset.load_variables:
- key = f"{synth_dataset.group or ''}/{array}/zarr.json"
+            if (group := synth_dataset.group) is not None:
+ prefix = f"{group}/"
+ else:
+ prefix = ""
+ key = f"{prefix}{array}/zarr.json"
sync(store.getsize(key))
@@ -98,6 +105,8 @@ def fn():
@pytest.mark.benchmark(group="xarray-read", min_rounds=2)
def test_time_xarray_read_chunks(synth_dataset: Dataset, benchmark) -> None:
"""128MB vs 8MB chunks. should see a difference."""
+ if synth_dataset.load_variables is None:
+ pytest.skip()
# TODO: switch out concurrency "ideal_request_size"
ds = xr.open_zarr(
synth_dataset.store, group=synth_dataset.group, chunks=None, consolidated=False
diff --git a/icechunk-python/benchmarks/test_benchmark_writes.py b/icechunk-python/benchmarks/test_benchmark_writes.py
index 5954b1e3..c2881448 100644
--- a/icechunk-python/benchmarks/test_benchmark_writes.py
+++ b/icechunk-python/benchmarks/test_benchmark_writes.py
@@ -8,7 +8,7 @@
from benchmarks.tasks import Executor, write
from icechunk import Repository, RepositoryConfig, local_filesystem_storage
-NUM_CHUNK_REFS = 20_000
+NUM_CHUNK_REFS = 10_000
NUM_VIRTUAL_CHUNK_REFS = 100_000
diff --git a/icechunk-python/examples/mpwrite.py b/icechunk-python/examples/mpwrite.py
new file mode 100644
index 00000000..a86f8a3a
--- /dev/null
+++ b/icechunk-python/examples/mpwrite.py
@@ -0,0 +1,48 @@
+# An example of using multiprocessing to write to an Icechunk dataset
+
+import tempfile
+from concurrent.futures import ProcessPoolExecutor
+
+import xarray as xr
+from icechunk import Repository, Session, local_filesystem_storage
+from icechunk.distributed import merge_sessions
+
+
+def write_timestamp(*, itime: int, session: Session) -> Session:
+ # pass a list to isel to preserve the time dimension
+ ds = xr.tutorial.open_dataset("rasm").isel(time=[itime])
+ # region="auto" tells Xarray to infer which "region" of the output arrays to write to.
+ ds.to_zarr(session.store, region="auto", consolidated=False)
+ return session
+
+
+if __name__ == "__main__":
+ ds = xr.tutorial.open_dataset("rasm").isel(time=slice(24))
+ repo = Repository.create(local_filesystem_storage(tempfile.mkdtemp()))
+ session = repo.writable_session("main")
+
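+    # chunk size 1 along "time" so each worker process writes whole chunks for its
+    # own timestep and parallel writes never touch the same chunk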
+    chunks = tuple(1 if dim == "time" else ds.sizes[dim] for dim in ds.Tair.dims)
+ ds.to_zarr(
+ session.store, compute=False, encoding={"Tair": {"chunks": chunks}}, mode="w"
+ )
+ # this commit is optional, but may be useful in your workflow
+ session.commit("initialize store")
+
+ session = repo.writable_session("main")
+ with ProcessPoolExecutor() as executor:
+ # opt-in to successful pickling of a writable session
+ with session.allow_pickling():
+ # submit the writes
+ futures = [
+ executor.submit(write_timestamp, itime=i, session=session)
+ for i in range(ds.sizes["time"])
+ ]
+ # grab the Session objects from each individual write task
+ sessions = [f.result() for f in futures]
+
+ # manually merge the remote sessions in to the local session
+ session = merge_sessions(session, *sessions)
+ session.commit("finished writes")
+
+ ondisk = xr.open_zarr(repo.readonly_session("main").store, consolidated=False)
+ xr.testing.assert_identical(ds, ondisk)
diff --git a/icechunk-python/notebooks/demo-dummy-data.ipynb b/icechunk-python/notebooks/demo-dummy-data.ipynb
index 9ed89e57..444847dd 100644
--- a/icechunk-python/notebooks/demo-dummy-data.ipynb
+++ b/icechunk-python/notebooks/demo-dummy-data.ipynb
@@ -335,7 +335,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": null,
"id": "d904f719-98cf-4f51-8e9a-1631dcb3fcba",
"metadata": {},
"outputs": [
@@ -348,7 +348,7 @@
}
],
"source": [
- "session = repo.readonly_session(snapshot=first_commit)\n",
+ "session = repo.readonly_session(snapshot_id=first_commit)\n",
"root_group = zarr.open_group(session.store, mode=\"r\")\n",
"\n",
"try:\n",
diff --git a/icechunk-python/notebooks/version-control.ipynb b/icechunk-python/notebooks/version-control.ipynb
index 74c44cf0..abde76db 100644
--- a/icechunk-python/notebooks/version-control.ipynb
+++ b/icechunk-python/notebooks/version-control.ipynb
@@ -242,7 +242,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": null,
"id": "e785d9a1-36ec-4207-b334-20e0a68e3ac8",
"metadata": {},
"outputs": [
@@ -258,7 +258,7 @@
}
],
"source": [
- "session = repo.readonly_session(snapshot=first_commit)\n",
+ "session = repo.readonly_session(snapshot_id=first_commit)\n",
"root_group = zarr.open_group(store=session.store, mode=\"r\")\n",
"dict(root_group.attrs)"
]
diff --git a/icechunk-python/pyproject.toml b/icechunk-python/pyproject.toml
index 38740a5d..2ffdeb9f 100644
--- a/icechunk-python/pyproject.toml
+++ b/icechunk-python/pyproject.toml
@@ -4,6 +4,7 @@ build-backend = "maturin"
[project]
name = "icechunk"
+description = "Icechunk Python"
requires-python = ">=3.11"
classifiers = [
"Programming Language :: Rust",
@@ -13,18 +14,12 @@ classifiers = [
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
-license = { text = "Apache-2.0" }
+readme = "../README.md"
+license = "Apache-2.0"
dynamic = ["version"]
+authors = [{ name = "Earthmover", email = "info@earthmover.io" }]
-dependencies = ["zarr>=3"]
-
-[tool.poetry]
-name = "icechunk"
-version = "0.1.0"
-description = "Icechunk Python"
-authors = ["Earthmover "]
-readme = "../README.md"
-packages = [{ include = "icechunk", from = "python" }]
+dependencies = ["zarr>=3,!=3.0.3"]
[project.optional-dependencies]
test = [
@@ -43,14 +38,18 @@ test = [
"hypothesis",
"pandas-stubs",
"boto3-stubs[s3]",
+ "termcolor",
]
benchmark = [
"pytest-benchmark[histogram]",
"pytest-xdist",
"s3fs",
+ "gcsfs",
"h5netcdf",
"pooch",
"tqdm",
+ "humanize",
+ "platformdirs",
]
[tool.maturin]
@@ -78,10 +77,6 @@ filterwarnings = [
"ignore:Unused async fixture loop scope:pytest.PytestWarning",
]
-[tool.pyright]
-venvPath = "."
-venv = ".venv"
-
[tool.mypy]
python_version = "3.11"
strict = true
diff --git a/icechunk-python/python/icechunk/__init__.py b/icechunk-python/python/icechunk/__init__.py
index 8a1b288e..449bf768 100644
--- a/icechunk-python/python/icechunk/__init__.py
+++ b/icechunk-python/python/icechunk/__init__.py
@@ -13,6 +13,8 @@
ConflictSolver,
ConflictType,
Credentials,
+ Diff,
+ GcsBearerCredential,
GcsCredentials,
GcsStaticCredentials,
GCSummary,
@@ -32,7 +34,10 @@
StorageSettings,
VersionSelection,
VirtualChunkContainer,
+ VirtualChunkSpec,
__version__,
+ initialize_logs,
+ spec_version,
)
from icechunk.credentials import (
AnyAzureCredential,
@@ -47,6 +52,7 @@
containers_credentials,
gcs_credentials,
gcs_from_env_credentials,
+ gcs_refreshable_credentials,
gcs_static_credentials,
s3_anonymous_credentials,
s3_credentials,
@@ -89,7 +95,9 @@
"ConflictSolver",
"ConflictType",
"Credentials",
+ "Diff",
"GCSummary",
+ "GcsBearerCredential",
"GcsCredentials",
"GcsStaticCredentials",
"IcechunkError",
@@ -112,6 +120,7 @@
"StorageSettings",
"VersionSelection",
"VirtualChunkContainer",
+ "VirtualChunkSpec",
"__version__",
"azure_credentials",
"azure_from_env_credentials",
@@ -120,17 +129,37 @@
"containers_credentials",
"gcs_credentials",
"gcs_from_env_credentials",
+ "gcs_refreshable_credentials",
"gcs_static_credentials",
"gcs_storage",
"in_memory_storage",
+ "initialize_logs",
"local_filesystem_storage",
+ "print_debug_info",
"s3_anonymous_credentials",
"s3_credentials",
- "s3_credentials",
"s3_from_env_credentials",
"s3_refreshable_credentials",
"s3_static_credentials",
"s3_storage",
"s3_store",
+ "spec_version",
"tigris_storage",
]
+
+
+def print_debug_info() -> None:
+ import platform
+ from importlib import import_module
+
+ print(f"platform: {platform.platform()}")
+ print(f"python: {platform.python_version()}")
+ print(f"icechunk: {__version__}")
+ for package in ["zarr", "numcodecs", "xarray", "virtualizarr"]:
+ try:
+ print(f"{package}: {import_module(package).__version__}")
+ except ModuleNotFoundError:
+ continue
+
+
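+# Initialize icechunk's logging as soon as the package is imported.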
+initialize_logs()
diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi
index 794d7059..0b926e31 100644
--- a/icechunk-python/python/icechunk/_icechunk_python.pyi
+++ b/icechunk-python/python/icechunk/_icechunk_python.pyi
@@ -5,13 +5,28 @@ from enum import Enum
from typing import Any
class S3Options:
+ """Options for accessing an S3-compatible storage backend"""
def __init__(
self,
region: str | None = None,
endpoint_url: str | None = None,
allow_http: bool = False,
anonymous: bool = False,
- ) -> None: ...
+ ) -> None:
+ """
+ Create a new `S3Options` object
+
+ Parameters
+ ----------
+ region: str | None
+ Optional, the region to use for the storage backend.
+ endpoint_url: str | None
+ Optional, the endpoint URL to use for the storage backend.
+ allow_http: bool
+ Whether to allow HTTP requests to the storage backend.
+ anonymous: bool
+        Whether to use anonymous credentials for the storage backend. When `True`, S3 requests will not be signed.
+ """
class ObjectStoreConfig:
class InMemory:
@@ -46,37 +61,173 @@ AnyObjectStoreConfig = (
)
class VirtualChunkContainer:
+ """A virtual chunk container is a configuration that allows Icechunk to read virtual references from a storage backend.
+
+ Attributes
+ ----------
+ name: str
+ The name of the virtual chunk container.
+ url_prefix: str
+        The prefix of URLs that will use this container's configuration for reading virtual references.
+ store: ObjectStoreConfig
+ The storage backend to use for the virtual chunk container.
+ """
+
name: str
url_prefix: str
store: ObjectStoreConfig
- def __init__(self, name: str, url_prefix: str, store: AnyObjectStoreConfig): ...
+ def __init__(self, name: str, url_prefix: str, store: AnyObjectStoreConfig):
+ """
+ Create a new `VirtualChunkContainer` object
+
+ Parameters
+ ----------
+ name: str
+ The name of the virtual chunk container.
+ url_prefix: str
+            The prefix of URLs that will use this container's configuration for reading virtual references.
+ store: ObjectStoreConfig
+ The storage backend to use for the virtual chunk container.
+ """
+
+class VirtualChunkSpec:
+ """The specification for a virtual chunk reference."""
+ @property
+ def index(self) -> list[int]:
+ """The chunk index, in chunk coordinates space"""
+ ...
+ @property
+ def location(self) -> str:
+ """The URL to the virtual chunk data, something like 's3://bucket/foo.nc'"""
+ ...
+ @property
+ def offset(self) -> int:
+ """The chunk offset within the pointed object, in bytes"""
+ ...
+ @property
+ def length(self) -> int:
+ """The length of the chunk in bytes"""
+ ...
+ @property
+ def etag_checksum(self) -> str | None:
+ """Optional object store e-tag for the containing object.
+
+ Icechunk will refuse to serve data from this chunk if the etag has changed.
+ """
+ ...
+ @property
+ def last_updated_at_checksum(self) -> datetime.datetime | None:
+ """Optional timestamp for the containing object.
+
+    Icechunk will refuse to serve data from this chunk if it has been modified in the object store after this time.
+ """
+ ...
+
+ def __init__(
+ self,
+ index: list[int],
+ location: str,
+ offset: int,
+ length: int,
+ etag_checksum: str | None = None,
+ last_updated_at_checksum: datetime.datetime | None = None,
+ ) -> None: ...
class CompressionAlgorithm(Enum):
- """Enum for selecting the compression algorithm used by Icechunk to write its metadata files"""
+ """Enum for selecting the compression algorithm used by Icechunk to write its metadata files
+
+ Attributes
+ ----------
+ Zstd: int
+ The Zstd compression algorithm.
+ """
Zstd = 0
def __init__(self) -> None: ...
@staticmethod
- def default() -> CompressionAlgorithm: ...
+ def default() -> CompressionAlgorithm:
+ """
+ The default compression algorithm used by Icechunk to write its metadata files.
+
+ Returns
+ -------
+ CompressionAlgorithm
+ The default compression algorithm.
+ """
+ ...
class CompressionConfig:
"""Configuration for how Icechunk compresses its metadata files"""
def __init__(
self, algorithm: CompressionAlgorithm | None = None, level: int | None = None
- ) -> None: ...
+ ) -> None:
+ """
+ Create a new `CompressionConfig` object
+
+ Parameters
+ ----------
+ algorithm: CompressionAlgorithm | None
+ The compression algorithm to use.
+ level: int | None
+ The compression level to use.
+ """
+ ...
@property
- def algorithm(self) -> CompressionAlgorithm | None: ...
+ def algorithm(self) -> CompressionAlgorithm | None:
+ """
+ The compression algorithm used by Icechunk to write its metadata files.
+
+ Returns
+ -------
+ CompressionAlgorithm | None
+ The compression algorithm used by Icechunk to write its metadata files.
+ """
+ ...
@algorithm.setter
- def algorithm(self, value: CompressionAlgorithm | None) -> None: ...
+ def algorithm(self, value: CompressionAlgorithm | None) -> None:
+ """
+ Set the compression algorithm used by Icechunk to write its metadata files.
+
+ Parameters
+ ----------
+ value: CompressionAlgorithm | None
+ The compression algorithm to use.
+ """
+ ...
@property
- def level(self) -> int | None: ...
+ def level(self) -> int | None:
+ """
+ The compression level used by Icechunk to write its metadata files.
+
+ Returns
+ -------
+ int | None
+ The compression level used by Icechunk to write its metadata files.
+ """
+ ...
@level.setter
- def level(self, value: int | None) -> None: ...
+ def level(self, value: int | None) -> None:
+ """
+ Set the compression level used by Icechunk to write its metadata files.
+
+ Parameters
+ ----------
+ value: int | None
+ The compression level to use.
+ """
+ ...
@staticmethod
- def default() -> CompressionConfig: ...
+ def default() -> CompressionConfig:
+ """
+ The default compression configuration used by Icechunk to write its metadata files.
+
+ Returns
+ -------
+ CompressionConfig
+ """
class CachingConfig:
"""Configuration for how Icechunk caches its metadata files"""
@@ -88,29 +239,133 @@ class CachingConfig:
num_transaction_changes: int | None = None,
num_bytes_attributes: int | None = None,
num_bytes_chunks: int | None = None,
- ) -> None: ...
+ ) -> None:
+ """
+ Create a new `CachingConfig` object
+
+ Parameters
+ ----------
+ num_snapshot_nodes: int | None
+ The number of snapshot nodes to cache.
+ num_chunk_refs: int | None
+ The number of chunk references to cache.
+ num_transaction_changes: int | None
+ The number of transaction changes to cache.
+ num_bytes_attributes: int | None
+ The number of bytes of attributes to cache.
+ num_bytes_chunks: int | None
+ The number of bytes of chunks to cache.
+ """
@property
- def num_snapshot_nodes(self) -> int | None: ...
+ def num_snapshot_nodes(self) -> int | None:
+ """
+ The number of snapshot nodes to cache.
+
+ Returns
+ -------
+ int | None
+ The number of snapshot nodes to cache.
+ """
+ ...
@num_snapshot_nodes.setter
- def num_snapshot_nodes(self, value: int | None) -> None: ...
+ def num_snapshot_nodes(self, value: int | None) -> None:
+ """
+ Set the number of snapshot nodes to cache.
+
+ Parameters
+ ----------
+ value: int | None
+ The number of snapshot nodes to cache.
+ """
+ ...
@property
- def num_chunk_refs(self) -> int | None: ...
+ def num_chunk_refs(self) -> int | None:
+ """
+ The number of chunk references to cache.
+
+ Returns
+ -------
+ int | None
+ The number of chunk references to cache.
+ """
+ ...
@num_chunk_refs.setter
- def num_chunk_refs(self, value: int | None) -> None: ...
+ def num_chunk_refs(self, value: int | None) -> None:
+ """
+ Set the number of chunk references to cache.
+
+ Parameters
+ ----------
+ value: int | None
+ The number of chunk references to cache.
+ """
+ ...
@property
- def num_transaction_changes(self) -> int | None: ...
+ def num_transaction_changes(self) -> int | None:
+ """
+ The number of transaction changes to cache.
+
+ Returns
+ -------
+ int | None
+ The number of transaction changes to cache.
+ """
+ ...
@num_transaction_changes.setter
- def num_transaction_changes(self, value: int | None) -> None: ...
+ def num_transaction_changes(self, value: int | None) -> None:
+ """
+ Set the number of transaction changes to cache.
+
+ Parameters
+ ----------
+ value: int | None
+ The number of transaction changes to cache.
+ """
+ ...
@property
- def num_bytes_attributes(self) -> int | None: ...
+ def num_bytes_attributes(self) -> int | None:
+ """
+ The number of bytes of attributes to cache.
+
+ Returns
+ -------
+ int | None
+ The number of bytes of attributes to cache.
+ """
+ ...
@num_bytes_attributes.setter
- def num_bytes_attributes(self, value: int | None) -> None: ...
+ def num_bytes_attributes(self, value: int | None) -> None:
+ """
+ Set the number of bytes of attributes to cache.
+
+ Parameters
+ ----------
+ value: int | None
+ The number of bytes of attributes to cache.
+ """
+ ...
@property
- def num_bytes_chunks(self) -> int | None: ...
+ def num_bytes_chunks(self) -> int | None:
+ """
+ The number of bytes of chunks to cache.
+
+ Returns
+ -------
+ int | None
+ The number of bytes of chunks to cache.
+ """
+ ...
@num_bytes_chunks.setter
- def num_bytes_chunks(self, value: int | None) -> None: ...
- @staticmethod
- def default() -> CachingConfig: ...
+ def num_bytes_chunks(self, value: int | None) -> None:
+ """
+ Set the number of bytes of chunks to cache.
+
+ Parameters
+ ----------
+ value: int | None
+ The number of bytes of chunks to cache.
+ """
+ ...
class ManifestPreloadCondition:
"""Configuration for conditions under which manifests will preload on session creation"""
@@ -167,15 +422,62 @@ class ManifestPreloadConfig:
self,
max_total_refs: int | None = None,
preload_if: ManifestPreloadCondition | None = None,
- ) -> None: ...
+ ) -> None:
+ """
+ Create a new `ManifestPreloadConfig` object
+
+ Parameters
+ ----------
+ max_total_refs: int | None
+ The maximum number of references to preload.
+ preload_if: ManifestPreloadCondition | None
+ The condition under which manifests will be preloaded.
+ """
+ ...
@property
- def max_total_refs(self) -> int | None: ...
+ def max_total_refs(self) -> int | None:
+ """
+ The maximum number of references to preload.
+
+ Returns
+ -------
+ int | None
+ The maximum number of references to preload.
+ """
+ ...
@max_total_refs.setter
- def max_total_refs(self, value: int | None) -> None: ...
+ def max_total_refs(self, value: int | None) -> None:
+ """
+ Set the maximum number of references to preload.
+
+ Parameters
+ ----------
+ value: int | None
+ The maximum number of references to preload.
+ """
+ ...
@property
- def preload_if(self) -> ManifestPreloadCondition | None: ...
+ def preload_if(self) -> ManifestPreloadCondition | None:
+ """
+ The condition under which manifests will be preloaded.
+
+ Returns
+ -------
+ ManifestPreloadCondition | None
+ The condition under which manifests will be preloaded.
+ """
+ ...
@preload_if.setter
- def preload_if(self, value: ManifestPreloadCondition | None) -> None: ...
+ def preload_if(self, value: ManifestPreloadCondition | None) -> None:
+ """
+ Set the condition under which manifests will be preloaded.
+
+ Parameters
+ ----------
+ value: ManifestPreloadCondition | None
+ The condition under which manifests will be preloaded.
+ """
+ ...
class ManifestConfig:
"""Configuration for how Icechunk manifests"""
@@ -183,11 +485,38 @@ class ManifestConfig:
def __init__(
self,
preload: ManifestPreloadConfig | None = None,
- ) -> None: ...
+ ) -> None:
+ """
+ Create a new `ManifestConfig` object
+
+ Parameters
+ ----------
+ preload: ManifestPreloadConfig | None
+ The configuration for how Icechunk manifests will be preloaded.
+ """
+ ...
@property
- def preload(self) -> ManifestPreloadConfig | None: ...
+ def preload(self) -> ManifestPreloadConfig | None:
+ """
+ The configuration for how Icechunk manifests will be preloaded.
+
+ Returns
+ -------
+ ManifestPreloadConfig | None
+ The configuration for how Icechunk manifests will be preloaded.
+ """
+ ...
@preload.setter
- def preload(self, value: ManifestPreloadConfig | None) -> None: ...
+ def preload(self, value: ManifestPreloadConfig | None) -> None:
+ """
+ Set the configuration for how Icechunk manifests will be preloaded.
+
+ Parameters
+ ----------
+ value: ManifestPreloadConfig | None
+ The configuration for how Icechunk manifests will be preloaded.
+ """
+ ...
class StorageConcurrencySettings:
"""Configuration for how Icechunk uses its Storage instance"""
@@ -196,24 +525,120 @@ class StorageConcurrencySettings:
self,
max_concurrent_requests_for_object: int | None = None,
ideal_concurrent_request_size: int | None = None,
- ) -> None: ...
+ ) -> None:
+ """
+ Create a new `StorageConcurrencySettings` object
+
+ Parameters
+ ----------
+ max_concurrent_requests_for_object: int | None
+ The maximum number of concurrent requests for an object.
+ ideal_concurrent_request_size: int | None
+ The ideal concurrent request size.
+ """
+ ...
@property
- def max_concurrent_requests_for_object(self) -> int | None: ...
+ def max_concurrent_requests_for_object(self) -> int | None:
+ """
+ The maximum number of concurrent requests for an object.
+
+ Returns
+ -------
+ int | None
+ The maximum number of concurrent requests for an object.
+ """
+ ...
@max_concurrent_requests_for_object.setter
- def max_concurrent_requests_for_object(self, value: int | None) -> None: ...
+ def max_concurrent_requests_for_object(self, value: int | None) -> None:
+ """
+ Set the maximum number of concurrent requests for an object.
+
+ Parameters
+ ----------
+ value: int | None
+ The maximum number of concurrent requests for an object.
+ """
+ ...
@property
- def ideal_concurrent_request_size(self) -> int | None: ...
+ def ideal_concurrent_request_size(self) -> int | None:
+ """
+ The ideal concurrent request size.
+
+ Returns
+ -------
+ int | None
+ The ideal concurrent request size.
+ """
+ ...
@ideal_concurrent_request_size.setter
- def ideal_concurrent_request_size(self, value: int | None) -> None: ...
+ def ideal_concurrent_request_size(self, value: int | None) -> None:
+ """
+ Set the ideal concurrent request size.
+
+ Parameters
+ ----------
+ value: int | None
+ The ideal concurrent request size.
+ """
+ ...
class StorageSettings:
"""Configuration for how Icechunk uses its Storage instance"""
- def __init__(self, concurrency: StorageConcurrencySettings | None = None) -> None: ...
+ def __init__(
+ self,
+ concurrency: StorageConcurrencySettings | None = None,
+ unsafe_use_conditional_create: bool | None = None,
+ unsafe_use_conditional_update: bool | None = None,
+ unsafe_use_metadata: bool | None = None,
+ ) -> None:
+ """
+ Create a new `StorageSettings` object
+
+ Parameters
+ ----------
+ concurrency: StorageConcurrencySettings | None
+ The configuration for how Icechunk uses its Storage instance.
+
+ unsafe_use_conditional_update: bool | None
+ If set to False, Icechunk loses some of its consistency guarantees.
+ This is only useful in object stores that don't support the feature.
+ Use it at your own risk.
+
+ unsafe_use_conditional_create: bool | None
+ If set to False, Icechunk loses some of its consistency guarantees.
+ This is only useful in object stores that don't support the feature.
+ Use at your own risk.
+
+ unsafe_use_metadata: bool | None
+            If set to False, Icechunk will not write metadata fields in Icechunk files.
+ This is only useful in object stores that don't support the feature.
+ Use at your own risk.
+ """
+ ...
+ @property
+ def concurrency(self) -> StorageConcurrencySettings | None:
+ """
+        The configuration for how much concurrency the Icechunk store uses.
+
+ Returns
+ -------
+ StorageConcurrencySettings | None
+ The configuration for how Icechunk uses its Storage instance.
+ """
+
+ @property
+ def unsafe_use_conditional_update(self) -> bool | None:
+ """True if Icechunk will use conditional PUT operations for updates in the object store"""
+ ...
@property
- def concurrency(self) -> StorageConcurrencySettings | None: ...
- @concurrency.setter
- def concurrency(self, value: StorageConcurrencySettings | None) -> None: ...
+ def unsafe_use_conditional_create(self) -> bool | None:
+ """True if Icechunk will use conditional PUT operations for creation in the object store"""
+ ...
+ @property
+ def unsafe_use_metadata(self) -> bool | None:
+ """True if Icechunk will write object metadata in the object store"""
+ ...
class RepositoryConfig:
"""Configuration for an Icechunk repository"""
@@ -221,61 +646,279 @@ class RepositoryConfig:
def __init__(
self,
inline_chunk_threshold_bytes: int | None = None,
- unsafe_overwrite_refs: bool | None = None,
get_partial_values_concurrency: int | None = None,
compression: CompressionConfig | None = None,
caching: CachingConfig | None = None,
storage: StorageSettings | None = None,
virtual_chunk_containers: dict[str, VirtualChunkContainer] | None = None,
manifest: ManifestConfig | None = None,
- ) -> None: ...
+ ) -> None:
+ """
+ Create a new `RepositoryConfig` object
+
+ Parameters
+ ----------
+ inline_chunk_threshold_bytes: int | None
+ The maximum size of a chunk that will be stored inline in the repository.
+ get_partial_values_concurrency: int | None
+ The number of concurrent requests to make when getting partial values from storage.
+ compression: CompressionConfig | None
+ The compression configuration for the repository.
+ caching: CachingConfig | None
+ The caching configuration for the repository.
+ storage: StorageSettings | None
+ The storage configuration for the repository.
+ virtual_chunk_containers: dict[str, VirtualChunkContainer] | None
+ The virtual chunk containers for the repository.
+ manifest: ManifestConfig | None
+ The manifest configuration for the repository.
+ """
+ ...
@staticmethod
- def default() -> RepositoryConfig: ...
+ def default() -> RepositoryConfig:
+ """Create a default repository config instance"""
+ ...
@property
- def inline_chunk_threshold_bytes(self) -> int | None: ...
+ def inline_chunk_threshold_bytes(self) -> int | None:
+ """
+ The maximum size of a chunk that will be stored inline in the repository. Chunks larger than this size will be written to storage.
+ """
+ ...
@inline_chunk_threshold_bytes.setter
- def inline_chunk_threshold_bytes(self, value: int | None) -> None: ...
- @property
- def unsafe_overwrite_refs(self) -> bool | None: ...
- @unsafe_overwrite_refs.setter
- def unsafe_overwrite_refs(self, value: bool | None) -> None: ...
+ def inline_chunk_threshold_bytes(self, value: int | None) -> None:
+ """
+ Set the maximum size of a chunk that will be stored inline in the repository. Chunks larger than this size will be written to storage.
+ """
+ ...
@property
- def get_partial_values_concurrency(self) -> int | None: ...
+ def get_partial_values_concurrency(self) -> int | None:
+ """
+ The number of concurrent requests to make when getting partial values from storage.
+
+ Returns
+ -------
+ int | None
+ The number of concurrent requests to make when getting partial values from storage.
+ """
+ ...
@get_partial_values_concurrency.setter
- def get_partial_values_concurrency(self, value: int | None) -> None: ...
+ def get_partial_values_concurrency(self, value: int | None) -> None:
+ """
+ Set the number of concurrent requests to make when getting partial values from storage.
+
+ Parameters
+ ----------
+ value: int | None
+ The number of concurrent requests to make when getting partial values from storage.
+ """
+ ...
@property
- def compression(self) -> CompressionConfig | None: ...
+ def compression(self) -> CompressionConfig | None:
+ """
+ The compression configuration for the repository.
+
+ Returns
+ -------
+ CompressionConfig | None
+ The compression configuration for the repository.
+ """
+ ...
@compression.setter
- def compression(self, value: CompressionConfig | None) -> None: ...
+ def compression(self, value: CompressionConfig | None) -> None:
+ """
+ Set the compression configuration for the repository.
+
+ Parameters
+ ----------
+ value: CompressionConfig | None
+ The compression configuration for the repository.
+ """
+ ...
@property
- def caching(self) -> CachingConfig | None: ...
+ def caching(self) -> CachingConfig | None:
+ """
+ The caching configuration for the repository.
+
+ Returns
+ -------
+ CachingConfig | None
+ The caching configuration for the repository.
+ """
+ ...
@caching.setter
- def caching(self, value: CachingConfig | None) -> None: ...
+ def caching(self, value: CachingConfig | None) -> None:
+ """
+ Set the caching configuration for the repository.
+
+ Parameters
+ ----------
+ value: CachingConfig | None
+ The caching configuration for the repository.
+ """
+ ...
@property
- def storage(self) -> StorageSettings | None: ...
+ def storage(self) -> StorageSettings | None:
+ """
+ The storage configuration for the repository.
+
+ Returns
+ -------
+ StorageSettings | None
+ The storage configuration for the repository.
+ """
+ ...
@storage.setter
- def storage(self, value: StorageSettings | None) -> None: ...
+ def storage(self, value: StorageSettings | None) -> None:
+ """
+ Set the storage configuration for the repository.
+
+ Parameters
+ ----------
+ value: StorageSettings | None
+ The storage configuration for the repository.
+ """
+ ...
@property
- def manifest(self) -> ManifestConfig | None: ...
+ def manifest(self) -> ManifestConfig | None:
+ """
+ The manifest configuration for the repository.
+
+ Returns
+ -------
+ ManifestConfig | None
+ The manifest configuration for the repository.
+ """
+ ...
@manifest.setter
- def manifest(self, value: ManifestConfig | None) -> None: ...
+ def manifest(self, value: ManifestConfig | None) -> None:
+ """
+ Set the manifest configuration for the repository.
+
+ Parameters
+ ----------
+ value: ManifestConfig | None
+ The manifest configuration for the repository.
+ """
+ ...
+ @property
+ def virtual_chunk_containers(self) -> dict[str, VirtualChunkContainer] | None:
+ """
+ The virtual chunk containers for the repository.
+
+ Returns
+ -------
+ dict[str, VirtualChunkContainer] | None
+ The virtual chunk containers for the repository.
+ """
+ ...
+ def get_virtual_chunk_container(self, name: str) -> VirtualChunkContainer | None:
+ """
+ Get the virtual chunk container for the repository associated with the given name.
+
+ Parameters
+ ----------
+ name: str
+ The name of the virtual chunk container to get.
+
+ Returns
+ -------
+ VirtualChunkContainer | None
+ The virtual chunk container for the repository associated with the given name.
+ """
+ ...
+ def set_virtual_chunk_container(self, cont: VirtualChunkContainer) -> None:
+ """
+ Set the virtual chunk container for the repository.
+
+ Parameters
+ ----------
+ cont: VirtualChunkContainer
+ The virtual chunk container to set.
+ """
+ ...
+ def clear_virtual_chunk_containers(self) -> None:
+ """
+ Clear all virtual chunk containers from the repository.
+ """
+ ...
+
+class Diff:
+ """The result of comparing two snapshots"""
+ @property
+ def new_groups(self) -> set[str]:
+ """
+ The groups that were added to the target ref.
+ """
+ ...
+ @property
+ def new_arrays(self) -> set[str]:
+ """
+ The arrays that were added to the target ref.
+ """
+ ...
+ @property
+ def deleted_groups(self) -> set[str]:
+ """
+ The groups that were deleted in the target ref.
+ """
+ ...
+ @property
+ def deleted_arrays(self) -> set[str]:
+ """
+ The arrays that were deleted in the target ref.
+ """
+ ...
+ @property
+ def updated_user_attributes(self) -> set[str]:
+ """
+ The nodes that had user attributes updated in the target ref.
+ """
+ ...
+ @property
+ def updated_zarr_metadata(self) -> set[str]:
+ """
+ The nodes that had zarr metadata updated in the target ref.
+ """
+ ...
@property
- def virtual_chunk_containers(self) -> dict[str, VirtualChunkContainer] | None: ...
- def get_virtual_chunk_container(self, name: str) -> VirtualChunkContainer | None: ...
- def set_virtual_chunk_container(self, cont: VirtualChunkContainer) -> None: ...
- def clear_virtual_chunk_containers(self) -> None: ...
+ def updated_chunks(self) -> dict[str, list[list[int]]]:
+ """
+ The chunks that had data updated in the target ref, keyed by array path.
+ """
+ ...
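
A minimal sketch of reading a Diff, assuming a hypothetical `repo` handle and a "main" branch; only the properties declared above are used.

    session = repo.writable_session("main")
    # ... write groups/arrays through session.store ...
    diff = session.status()
    print("arrays added:", diff.new_arrays)
    print("groups deleted:", diff.deleted_groups)
    for path, chunks in diff.updated_chunks.items():
        print(path, chunks)  # chunk writes recorded per array path
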
class GCSummary:
+ """Summarizes the results of a garbage collection operation on an icechunk repo"""
@property
- def chunks_deleted(self) -> int: ...
+ def chunks_deleted(self) -> int:
+ """
+ How many chunks were deleted.
+ """
+ ...
@property
- def manifests_deleted(self) -> int: ...
+ def manifests_deleted(self) -> int:
+ """
+ How many manifests were deleted.
+ """
+ ...
@property
- def snapshots_deleted(self) -> int: ...
+ def snapshots_deleted(self) -> int:
+ """
+ How many snapshots were deleted.
+ """
+ ...
@property
- def attributes_deleted(self) -> int: ...
+ def attributes_deleted(self) -> int:
+ """
+ How many attributes were deleted.
+ """
+ ...
@property
- def transaction_logs_deleted(self) -> int: ...
+ def transaction_logs_deleted(self) -> int:
+ """
+ How many transaction logs were deleted.
+ """
+ ...
class PyRepository:
@classmethod
@@ -304,24 +947,20 @@ class PyRepository:
) -> PyRepository: ...
@staticmethod
def exists(storage: Storage) -> bool: ...
+ @classmethod
+ def from_bytes(cls, data: bytes) -> PyRepository: ...
+ def as_bytes(self) -> bytes: ...
@staticmethod
def fetch_config(storage: Storage) -> RepositoryConfig | None: ...
def save_config(self) -> None: ...
def config(self) -> RepositoryConfig: ...
def storage(self) -> Storage: ...
- def ancestry(
- self,
- *,
- branch: str | None = None,
- tag: str | None = None,
- snapshot: str | None = None,
- ) -> list[SnapshotInfo]: ...
def async_ancestry(
self,
*,
branch: str | None = None,
tag: str | None = None,
- snapshot: str | None = None,
+ snapshot_id: str | None = None,
) -> AsyncIterator[SnapshotInfo]: ...
def create_branch(self, branch: str, snapshot_id: str) -> None: ...
def list_branches(self) -> set[str]: ...
@@ -332,15 +971,31 @@ class PyRepository:
def create_tag(self, tag: str, snapshot_id: str) -> None: ...
def list_tags(self) -> set[str]: ...
def lookup_tag(self, tag: str) -> str: ...
+ def diff(
+ self,
+ from_branch: str | None = None,
+ from_tag: str | None = None,
+ from_snapshot_id: str | None = None,
+ to_branch: str | None = None,
+ to_tag: str | None = None,
+ to_snapshot_id: str | None = None,
+ ) -> Diff: ...
def readonly_session(
self,
- *,
branch: str | None = None,
+ *,
tag: str | None = None,
- snapshot: str | None = None,
+ snapshot_id: str | None = None,
+ as_of: datetime.datetime | None = None,
) -> PySession: ...
def writable_session(self, branch: str) -> PySession: ...
- def expire_snapshots(self, older_than: datetime.datetime) -> set[str]: ...
+ def expire_snapshots(
+ self,
+ older_than: datetime.datetime,
+ *,
+ delete_expired_branches: bool = False,
+ delete_expired_tags: bool = False,
+ ) -> set[str]: ...
def garbage_collect(
self, delete_object_older_than: datetime.datetime
) -> GCSummary: ...
@@ -358,6 +1013,7 @@ class PySession:
def branch(self) -> str | None: ...
@property
def has_uncommitted_changes(self) -> bool: ...
+ def status(self) -> Diff: ...
def discard_changes(self) -> None: ...
def all_virtual_chunk_locations(self) -> list[str]: ...
def chunk_coordinates(
@@ -403,6 +1059,12 @@ class PyStore:
checksum: str | datetime.datetime | None = None,
validate_container: bool = False,
) -> None: ...
+ def set_virtual_refs(
+ self,
+ array_path: str,
+ chunks: list[VirtualChunkSpec],
+ validate_containers: bool,
+ ) -> list[tuple[int, ...]] | None: ...
async def delete(self, key: str) -> None: ...
async def delete_dir(self, prefix: str) -> None: ...
@property
@@ -416,6 +1078,7 @@ class PyStore:
def list_prefix(self, prefix: str) -> PyAsyncStringGenerator: ...
def list_dir(self, prefix: str) -> PyAsyncStringGenerator: ...
async def getsize(self, key: str) -> int: ...
+ async def getsize_prefix(self, prefix: str) -> int: ...
class PyAsyncStringGenerator(AsyncGenerator[str, None], metaclass=abc.ABCMeta):
def __aiter__(self) -> PyAsyncStringGenerator: ...
@@ -455,6 +1118,19 @@ class PyAsyncSnapshotGenerator(AsyncGenerator[SnapshotInfo, None], metaclass=abc
async def __anext__(self) -> SnapshotInfo: ...
class S3StaticCredentials:
+ """Credentials for an S3 storage backend
+
+ Attributes:
+ access_key_id: str
+ The access key ID to use for authentication.
+ secret_access_key: str
+ The secret access key to use for authentication.
+ session_token: str | None
+ The session token to use for authentication.
+ expires_after: datetime.datetime | None
+ Optional, the expiration time of the credentials.
+ """
+
access_key_id: str
secret_access_key: str
session_token: str | None
@@ -466,19 +1142,53 @@ class S3StaticCredentials:
secret_access_key: str,
session_token: str | None = None,
expires_after: datetime.datetime | None = None,
- ): ...
+ ):
+ """
+ Create a new `S3StaticCredentials` object
+
+ Parameters
+ ----------
+ access_key_id: str
+ The access key ID to use for authentication.
+ secret_access_key: str
+ The secret access key to use for authentication.
+ session_token: str | None
+ Optional, the session token to use for authentication.
+ expires_after: datetime.datetime | None
+ Optional, the expiration time of the credentials.
+ """
+ ...
class S3Credentials:
+ """Credentials for an S3 storage backend"""
class FromEnv:
+ """Uses credentials from environment variables"""
def __init__(self) -> None: ...
class Anonymous:
+ """Does not sign requests, useful for public buckets"""
def __init__(self) -> None: ...
class Static:
+ """Uses s3 credentials without expiration
+
+ Parameters
+ ----------
+ credentials: S3StaticCredentials
+ The credentials to use for authentication.
+ """
def __init__(self, credentials: S3StaticCredentials) -> None: ...
class Refreshable:
+ """Allows for an outside authority to pass in a function that can be used to provide credentials.
+
+ This is useful for credentials that have an expiration time, or are otherwise not known ahead of time.
+
+ Parameters
+ ----------
+ pickled_function: bytes
+ The pickled function to use to provide credentials.
+ """
def __init__(self, pickled_function: bytes) -> None: ...
AnyS3Credential = (
@@ -488,39 +1198,131 @@ AnyS3Credential = (
| S3Credentials.Refreshable
)
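
A hedged sketch of the refreshable S3 credentials flow; `s3_storage` and its `get_credentials` parameter appear later in this diff, while the bucket, prefix, and token source below are assumptions.

    import datetime
    import icechunk

    def fetch_s3_creds() -> icechunk.S3StaticCredentials:
        # Hypothetical refresh hook returning short-lived credentials.
        return icechunk.S3StaticCredentials(
            access_key_id="AKIA...",
            secret_access_key="...",
            session_token="...",
            expires_after=datetime.datetime.now(datetime.timezone.utc)
            + datetime.timedelta(hours=1),
        )

    storage = icechunk.s3_storage(
        bucket="my-bucket",
        prefix="repo",
        region="us-east-1",
        get_credentials=fetch_s3_creds,
    )
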
+class GcsBearerCredential:
+ """Credentials for a google cloud storage backend
+
+ This is a bearer token that may have an expiration time.
+ """
+
+ bearer: str
+ expires_after: datetime.datetime | None
+
+ def __init__(
+ self, bearer: str, *, expires_after: datetime.datetime | None = None
+ ) -> None:
+ """Create a GcsBearerCredential object
+
+ Parameters
+ ----------
+ bearer: str
+ The bearer token to use for authentication.
+ expires_after: datetime.datetime | None
+ The expiration time of the bearer token.
+ """
+
class GcsStaticCredentials:
+ """Credentials for a google cloud storage backend"""
class ServiceAccount:
+ """Credentials for a google cloud storage backend using a service account json file
+
+ Parameters
+ ----------
+ path: str
+ The path to the service account json file.
+ """
def __init__(self, path: str) -> None: ...
class ServiceAccountKey:
+ """Credentials for a google cloud storage backend using a a serialized service account key
+
+ Parameters
+ ----------
+ key: str
+ The serialized service account key.
+ """
def __init__(self, key: str) -> None: ...
class ApplicationCredentials:
+ """Credentials for a google cloud storage backend using application default credentials
+
+ Parameters
+ ----------
+ path: str
+ The path to the application default credentials (ADC) file.
+ """
def __init__(self, path: str) -> None: ...
+ class BearerToken:
+ """Credentials for a google cloud storage backend using a bearer token
+
+ Parameters
+ ----------
+ token: str
+ The bearer token to use for authentication.
+ """
+ def __init__(self, token: str) -> None: ...
+
AnyGcsStaticCredential = (
GcsStaticCredentials.ServiceAccount
| GcsStaticCredentials.ServiceAccountKey
| GcsStaticCredentials.ApplicationCredentials
+ | GcsStaticCredentials.BearerToken
)
class GcsCredentials:
+ """Credentials for a google cloud storage backend
+
+ This can be used to authenticate with a google cloud storage backend.
+ """
class FromEnv:
+ """Uses credentials from environment variables"""
def __init__(self) -> None: ...
class Static:
+ """Uses gcs credentials without expiration"""
def __init__(self, credentials: AnyGcsStaticCredential) -> None: ...
-AnyGcsCredential = GcsCredentials.FromEnv | GcsCredentials.Static
+ class Refreshable:
+ """Allows for an outside authority to pass in a function that can be used to provide credentials.
+
+ This is useful for credentials that have an expiration time, or are otherwise not known ahead of time.
+ """
+ def __init__(self, pickled_function: bytes) -> None: ...
+
+AnyGcsCredential = (
+ GcsCredentials.FromEnv | GcsCredentials.Static | GcsCredentials.Refreshable
+)
class AzureStaticCredentials:
+ """Credentials for an azure storage backend"""
class AccessKey:
+ """Credentials for an azure storage backend using an access key
+
+ Parameters
+ ----------
+ key: str
+ The access key to use for authentication.
+ """
def __init__(self, key: str) -> None: ...
class SasToken:
+ """Credentials for an azure storage backend using a shared access signature token
+
+ Parameters
+ ----------
+ token: str
+ The shared access signature token to use for authentication.
+ """
def __init__(self, token: str) -> None: ...
class BearerToken:
+ """Credentials for an azure storage backend using a bearer token
+
+ Parameters
+ ----------
+ token: str
+ The bearer token to use for authentication.
+ """
def __init__(self, token: str) -> None: ...
AnyAzureStaticCredential = (
@@ -530,10 +1332,16 @@ AnyAzureStaticCredential = (
)
class AzureCredentials:
+ """Credentials for an azure storage backend
+
+ This can be used to authenticate with an azure storage backend.
+ """
class FromEnv:
+ """Uses credentials from environment variables"""
def __init__(self) -> None: ...
class Static:
+ """Uses azure credentials without expiration"""
def __init__(self, credentials: AnyAzureStaticCredential) -> None: ...
AnyAzureCredential = AzureCredentials.FromEnv | AzureCredentials.Static
@@ -575,6 +1383,14 @@ class Storage:
credentials: AnyS3Credential | None = None,
) -> Storage: ...
@classmethod
+ def new_s3_object_store(
+ cls,
+ config: S3Options,
+ bucket: str,
+ prefix: str | None,
+ credentials: AnyS3Credential | None = None,
+ ) -> Storage: ...
+ @classmethod
def new_tigris(
cls,
config: S3Options,
@@ -598,16 +1414,28 @@ class Storage:
@classmethod
def new_azure_blob(
cls,
+ account: str,
container: str,
prefix: str,
credentials: AnyAzureCredential | None = None,
*,
config: dict[str, str] | None = None,
) -> Storage: ...
+ def __repr__(self) -> str: ...
def default_settings(self) -> StorageSettings: ...
class VersionSelection(Enum):
- """Enum for selecting the which version of a conflict"""
+ """Enum for selecting the which version of a conflict
+
+ Attributes
+ ----------
+ Fail: int
+ Fail the rebase operation
+ UseOurs: int
+ Use the version from the source store
+ UseTheirs: int
+ Use the version from the target store
+ """
Fail = 0
UseOurs = 1
@@ -627,7 +1455,6 @@ class BasicConflictSolver(ConflictSolver):
This conflict solver allows for simple configuration of resolution behavior for conflicts that may occur during a rebase operation.
It will attempt to resolve a limited set of conflicts based on the configuration options provided.
- - When a user attribute conflict is encountered, the behavior is determined by the `on_user_attributes_conflict` option
- When a chunk conflict is encountered, the behavior is determined by the `on_chunk_conflict` option
- When an array is deleted that has been updated, `fail_on_delete_of_updated_array` will determine whether to fail the rebase operation
- When a group is deleted that has been updated, `fail_on_delete_of_updated_group` will determine whether to fail the rebase operation
@@ -636,15 +1463,14 @@ class BasicConflictSolver(ConflictSolver):
def __init__(
self,
*,
- on_user_attributes_conflict: VersionSelection = VersionSelection.UseOurs,
on_chunk_conflict: VersionSelection = VersionSelection.UseOurs,
fail_on_delete_of_updated_array: bool = False,
fail_on_delete_of_updated_group: bool = False,
) -> None:
"""Create a BasicConflictSolver object with the given configuration options
- Parameters:
- on_user_attributes_conflict: VersionSelection
- The behavior to use when a user attribute conflict is encountered, by default VersionSelection.use_ours()
+
+ Parameters
+ ----------
on_chunk_conflict: VersionSelection
The behavior to use when a chunk conflict is encountered, by default VersionSelection.use_theirs()
fail_on_delete_of_updated_array: bool
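
A short sketch of the solver after this change, with the removed `on_user_attributes_conflict` option gone; the commented `rebase` call is an assumption about how the solver is typically consumed.

    import icechunk

    solver = icechunk.BasicConflictSolver(
        on_chunk_conflict=icechunk.VersionSelection.UseOurs,
        fail_on_delete_of_updated_array=True,
    )
    # session.rebase(solver)  # assumed usage after a conflicting commit
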
@@ -703,36 +1529,70 @@ class PyConflictError(IcechunkError):
__version__: str
class ConflictType(Enum):
- """Type of conflict detected"""
-
- NewNodeConflictsWithExistingNode = 1
- NewNodeInInvalidGroup = 2
- ZarrMetadataDoubleUpdate = 3
- ZarrMetadataUpdateOfDeletedArray = 4
- UserAttributesDoubleUpdate = 5
- UserAttributesUpdateOfDeletedNode = 6
- ChunkDoubleUpdate = 7
- ChunksUpdatedInDeletedArray = 8
- ChunksUpdatedInUpdatedArray = 9
- DeleteOfUpdatedArray = 10
- DeleteOfUpdatedGroup = 11
+ """Type of conflict detected
+
+ Attributes:
+ NewNodeConflictsWithExistingNode: int
+ A new node conflicts with an existing node
+ NewNodeInInvalidGroup: tuple[int]
+ A new node is in an invalid group
+ ZarrMetadataDoubleUpdate: tuple[int]
+ A zarr metadata update conflicts with an existing zarr metadata update
+ ZarrMetadataUpdateOfDeletedArray: tuple[int]
+ A zarr metadata update is attempted on a deleted array
+ ZarrMetadataUpdateOfDeletedGroup: tuple[int]
+ A zarr metadata update is attempted on a deleted group
+ ChunkDoubleUpdate: tuple[int]
+ A chunk update conflicts with an existing chunk update
+ ChunksUpdatedInDeletedArray: tuple[int]
+ Chunks are updated in a deleted array
+ ChunksUpdatedInUpdatedArray: tuple[int]
+ Chunks are updated in an updated array
+ DeleteOfUpdatedArray: tuple[int]
+ A delete is attempted on an updated array
+ DeleteOfUpdatedGroup: tuple[int]
+ A delete is attempted on an updated group
+ """
+
+ NewNodeConflictsWithExistingNode = (1,)
+ NewNodeInInvalidGroup = (2,)
+ ZarrMetadataDoubleUpdate = (3,)
+ ZarrMetadataUpdateOfDeletedArray = (4,)
+ ZarrMetadataUpdateOfDeletedGroup = (5,)
+ ChunkDoubleUpdate = (6,)
+ ChunksUpdatedInDeletedArray = (7,)
+ ChunksUpdatedInUpdatedArray = (8,)
+ DeleteOfUpdatedArray = (9,)
+ DeleteOfUpdatedGroup = (10,)
class Conflict:
"""A conflict detected between snapshots"""
@property
def conflict_type(self) -> ConflictType:
- """The type of conflict detected"""
+ """The type of conflict detected
+
+ Returns:
+ ConflictType: The type of conflict detected
+ """
...
@property
def path(self) -> str:
- """The path of the node that caused the conflict"""
+ """The path of the node that caused the conflict
+
+ Returns:
+ str: The path of the node that caused the conflict
+ """
...
@property
def conflicted_chunks(self) -> list[list[int]] | None:
- """If the conflict is a chunk conflict, this will return the list of chunk indices that are in conflict"""
+ """If the conflict is a chunk conflict, this will return the list of chunk indices that are in conflict
+
+ Returns:
+ list[list[int]] | None: The list of chunk indices that are in conflict
+ """
...
class RebaseFailedData:
@@ -745,7 +1605,11 @@ class RebaseFailedData:
@property
def conflicts(self) -> list[Conflict]:
- """The conflicts that occurred during the rebase operation"""
+ """The conflicts that occurred during the rebase operation
+
+ Returns:
+ list[Conflict]: The conflicts that occurred during the rebase operation
+ """
...
class PyRebaseFailedError(IcechunkError):
@@ -753,3 +1617,20 @@ class PyRebaseFailedError(IcechunkError):
args: tuple[RebaseFailedData]
...
+
+def initialize_logs() -> None:
+ """
+ Initialize the logging system for the library.
+
+ This should be called before any other Icechunk functions are called.
+ """
+ ...
+
+def spec_version() -> int:
+ """
+ The version of the Icechunk specification that the library is compatible with.
+
+ Returns:
+ int: The version of the Icechunk specification that the library is compatible with
+ """
+ ...
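
A small usage sketch, assuming both new functions are re-exported from the top-level `icechunk` package the same way the classes in this stub are.

    import icechunk

    icechunk.initialize_logs()      # respects the ICECHUNK_NO_LOGS environment variable (see lib.rs below)
    print(icechunk.spec_version())  # integer spec version the library writes
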
diff --git a/icechunk-python/python/icechunk/credentials.py b/icechunk-python/python/icechunk/credentials.py
index 7d647265..165bff2a 100644
--- a/icechunk-python/python/icechunk/credentials.py
+++ b/icechunk-python/python/icechunk/credentials.py
@@ -6,6 +6,7 @@
AzureCredentials,
AzureStaticCredentials,
Credentials,
+ GcsBearerCredential,
GcsCredentials,
GcsStaticCredentials,
S3Credentials,
@@ -23,9 +24,12 @@
GcsStaticCredentials.ServiceAccount
| GcsStaticCredentials.ServiceAccountKey
| GcsStaticCredentials.ApplicationCredentials
+ | GcsStaticCredentials.BearerToken
)
-AnyGcsCredential = GcsCredentials.FromEnv | GcsCredentials.Static
+AnyGcsCredential = (
+ GcsCredentials.FromEnv | GcsCredentials.Static | GcsCredentials.Refreshable
+)
AnyAzureStaticCredential = (
AzureStaticCredentials.AccessKey
@@ -35,6 +39,7 @@
AnyAzureCredential = AzureCredentials.FromEnv | AzureCredentials.Static
+
AnyCredential = Credentials.S3 | Credentials.Gcs | Credentials.Azure
@@ -178,6 +183,7 @@ def gcs_static_credentials(
service_account_file: str | None = None,
service_account_key: str | None = None,
application_credentials: str | None = None,
+ bearer_token: str | None = None,
) -> AnyGcsStaticCredential:
"""Create static credentials Google Cloud Storage object store."""
if service_account_file is not None:
@@ -186,9 +192,18 @@ def gcs_static_credentials(
return GcsStaticCredentials.ServiceAccountKey(service_account_key)
if application_credentials is not None:
return GcsStaticCredentials.ApplicationCredentials(application_credentials)
+ if bearer_token is not None:
+ return GcsStaticCredentials.BearerToken(bearer_token)
raise ValueError("Conflicting arguments to gcs_static_credentials function")
+def gcs_refreshable_credentials(
+ get_credentials: Callable[[], GcsBearerCredential],
+) -> GcsCredentials.Refreshable:
+ """Create refreshable credentials for Google Cloud Storage object store."""
+ return GcsCredentials.Refreshable(pickle.dumps(get_credentials))
+
+
def gcs_from_env_credentials() -> GcsCredentials.FromEnv:
"""Instruct Google Cloud Storage object store to fetch credentials from the operative system environment."""
return GcsCredentials.FromEnv()
@@ -199,7 +214,9 @@ def gcs_credentials(
service_account_file: str | None = None,
service_account_key: str | None = None,
application_credentials: str | None = None,
+ bearer_token: str | None = None,
from_env: bool | None = None,
+ get_credentials: Callable[[], GcsBearerCredential] | None = None,
) -> AnyGcsCredential:
"""Create credentials Google Cloud Storage object store.
@@ -209,6 +226,7 @@ def gcs_credentials(
service_account_file is None
and service_account_key is None
and application_credentials is None
+ and bearer_token is None
):
return gcs_from_env_credentials()
@@ -216,15 +234,20 @@ def gcs_credentials(
service_account_file is not None
or service_account_key is not None
or application_credentials is not None
+ or bearer_token is not None
) and (from_env is None or not from_env):
return GcsCredentials.Static(
gcs_static_credentials(
service_account_file=service_account_file,
service_account_key=service_account_key,
application_credentials=application_credentials,
+ bearer_token=bearer_token,
)
)
+ if get_credentials is not None:
+ return gcs_refreshable_credentials(get_credentials)
+
raise ValueError("Conflicting arguments to gcs_credentials function")
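
A hedged sketch of the new GCS refreshable-credentials path; `gcs_storage` gains a matching `get_credentials` parameter later in this diff, and the bucket and token source here are assumptions.

    import datetime
    import icechunk

    def fetch_gcs_token() -> icechunk.GcsBearerCredential:
        # Hypothetical: mint a short-lived bearer token, e.g. via google-auth.
        return icechunk.GcsBearerCredential(
            bearer="ya29.example-token",
            expires_after=datetime.datetime.now(datetime.timezone.utc)
            + datetime.timedelta(minutes=50),
        )

    storage = icechunk.gcs_storage(
        bucket="my-bucket",
        prefix="repo",
        get_credentials=fetch_gcs_token,
    )
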
diff --git a/icechunk-python/python/icechunk/repository.py b/icechunk-python/python/icechunk/repository.py
index 13cc6b23..2dae5926 100644
--- a/icechunk-python/python/icechunk/repository.py
+++ b/icechunk-python/python/icechunk/repository.py
@@ -1,8 +1,9 @@
import datetime
-from collections.abc import AsyncIterator
-from typing import Self
+from collections.abc import AsyncIterator, Iterator
+from typing import Self, cast
from icechunk._icechunk_python import (
+ Diff,
GCSummary,
PyRepository,
RepositoryConfig,
@@ -139,6 +140,16 @@ def exists(storage: Storage) -> bool:
"""
return PyRepository.exists(storage)
+ def __getstate__(self) -> object:
+ return {
+ "_repository": self._repository.as_bytes(),
+ }
+
+ def __setstate__(self, state: object) -> None:
+ if not isinstance(state, dict):
+ raise ValueError("Invalid repository state")
+ self._repository = PyRepository.from_bytes(state["_repository"])
+
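
Because `__getstate__`/`__setstate__` delegate to `as_bytes`/`from_bytes`, a Repository can now round-trip through pickle, which is what distributed workers need; a minimal sketch with an assumed `repo` handle:

    import pickle

    payload = pickle.dumps(repo)      # serialized via PyRepository.as_bytes()
    restored = pickle.loads(payload)  # rebuilt via PyRepository.from_bytes()
    assert type(restored) is type(repo)
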
@staticmethod
def fetch_config(storage: Storage) -> RepositoryConfig | None:
"""
@@ -195,8 +206,8 @@ def ancestry(
*,
branch: str | None = None,
tag: str | None = None,
- snapshot: str | None = None,
- ) -> list[SnapshotInfo]:
+ snapshot_id: str | None = None,
+ ) -> Iterator[SnapshotInfo]:
"""
Get the ancestry of a snapshot.
@@ -206,7 +217,7 @@ def ancestry(
The branch to get the ancestry of.
tag : str, optional
The tag to get the ancestry of.
- snapshot : str, optional
+ snapshot_id : str, optional
The snapshot ID to get the ancestry of.
Returns
@@ -218,14 +229,22 @@ def ancestry(
-----
Only one of the arguments can be specified.
"""
- return self._repository.ancestry(branch=branch, tag=tag, snapshot=snapshot)
+
+ # the returned object is both an Async and Sync iterator
+ res = cast(
+ Iterator[SnapshotInfo],
+ self._repository.async_ancestry(
+ branch=branch, tag=tag, snapshot_id=snapshot_id
+ ),
+ )
+ return res
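
Since `ancestry` now returns a lazy iterator rather than a list, callers can stream history; a sketch with an assumed `repo` handle and "main" branch:

    for snapshot in repo.ancestry(branch="main"):
        print(snapshot.id, snapshot.message)

    history = list(repo.ancestry(branch="main"))  # materialize only if needed
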
def async_ancestry(
self,
*,
branch: str | None = None,
tag: str | None = None,
- snapshot: str | None = None,
+ snapshot_id: str | None = None,
) -> AsyncIterator[SnapshotInfo]:
"""
Get the ancestry of a snapshot.
@@ -236,7 +255,7 @@ def async_ancestry(
The branch to get the ancestry of.
tag : str, optional
The tag to get the ancestry of.
- snapshot : str, optional
+ snapshot_id : str, optional
The snapshot ID to get the ancestry of.
Returns
@@ -248,7 +267,9 @@ def async_ancestry(
-----
Only one of the arguments can be specified.
"""
- return self._repository.async_ancestry(branch=branch, tag=tag, snapshot=snapshot)
+ return self._repository.async_ancestry(
+ branch=branch, tag=tag, snapshot_id=snapshot_id
+ )
def create_branch(self, branch: str, snapshot_id: str) -> None:
"""
@@ -329,7 +350,7 @@ def delete_branch(self, branch: str) -> None:
"""
self._repository.delete_branch(branch)
- def delete_tag(self, branch: str) -> None:
+ def delete_tag(self, tag: str) -> None:
"""
Delete a tag.
@@ -342,7 +363,7 @@ def delete_tag(self, branch: str) -> None:
-------
None
"""
- self._repository.delete_tag(branch)
+ self._repository.delete_tag(tag)
def create_tag(self, tag: str, snapshot_id: str) -> None:
"""
@@ -388,12 +409,45 @@ def lookup_tag(self, tag: str) -> str:
"""
return self._repository.lookup_tag(tag)
- def readonly_session(
+ def diff(
self,
*,
+ from_branch: str | None = None,
+ from_tag: str | None = None,
+ from_snapshot_id: str | None = None,
+ to_branch: str | None = None,
+ to_tag: str | None = None,
+ to_snapshot_id: str | None = None,
+ ) -> Diff:
+ """
+ Compute an overview of the operations executed from version `from` to version `to`.
+
+ Both versions, `from` and `to`, must be identified. Identification can be done using a branch, tag or snapshot id.
+ The styles used to identify the `from` and `to` versions can be different.
+
+ The `from` version must be a member of the `ancestry` of `to`.
+
+ Returns
+ -------
+ Diff
+ The operations executed between the two versions
+ """
+ return self._repository.diff(
+ from_branch=from_branch,
+ from_tag=from_tag,
+ from_snapshot_id=from_snapshot_id,
+ to_branch=to_branch,
+ to_tag=to_tag,
+ to_snapshot_id=to_snapshot_id,
+ )
+
+ def readonly_session(
+ self,
branch: str | None = None,
+ *,
tag: str | None = None,
- snapshot: str | None = None,
+ snapshot_id: str | None = None,
+ as_of: datetime.datetime | None = None,
) -> Session:
"""
Create a read-only session.
@@ -408,8 +462,11 @@ def readonly_session(
If provided, the branch to create the session on.
tag : str, optional
If provided, the tag to create the session on.
- snapshot : str, optional
+ snapshot_id : str, optional
If provided, the snapshot ID to create the session on.
+ as_of: datetime.datetime, optional
+ When combined with the branch argument, it will open the session at the last
+ snapshot that is at or before this datetime
Returns
-------
@@ -421,7 +478,9 @@ def readonly_session(
Only one of the arguments can be specified.
"""
return Session(
- self._repository.readonly_session(branch=branch, tag=tag, snapshot=snapshot)
+ self._repository.readonly_session(
+ branch=branch, tag=tag, snapshot_id=snapshot_id, as_of=as_of
+ )
)
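
A sketch of the new `as_of` behaviour: pin a read-only session to the repository state at or before a past instant (the handle and timestamp are assumptions).

    import datetime

    cutoff = datetime.datetime(2024, 12, 31, tzinfo=datetime.timezone.utc)
    session = repo.readonly_session("main", as_of=cutoff)
    # opens the last snapshot on "main" written at or before `cutoff`
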
def writable_session(self, branch: str) -> Session:
@@ -445,7 +504,13 @@ def writable_session(self, branch: str) -> Session:
"""
return Session(self._repository.writable_session(branch))
- def expire_snapshots(self, older_than: datetime.datetime) -> set[str]:
+ def expire_snapshots(
+ self,
+ older_than: datetime.datetime,
+ *,
+ delete_expired_branches: bool = False,
+ delete_expired_tags: bool = False,
+ ) -> set[str]:
"""Expire all snapshots older than a threshold.
This processes snapshots found by navigating all references in
@@ -456,6 +521,10 @@ def expire_snapshots(self, older_than: datetime.datetime) -> set[str]:
available for garbage collection, they could still be pointed by
ether refs.
+ If `delete_expired_branches` or `delete_expired_tags` is set to True, branches
+ or tags that point directly to expired snapshots after the expiration process
+ will be deleted.
+
Warning: this is an administrative operation, it should be run
carefully. The repository can still operate concurrently while
`expire_snapshots` runs, but other readers can get inconsistent
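
A sketch of the administrative flow described above, with an assumed 30-day cutoff: expire old snapshots, optionally drop refs that now point at expired snapshots, then garbage-collect.

    import datetime

    cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=30)
    expired = repo.expire_snapshots(
        cutoff,
        delete_expired_branches=False,
        delete_expired_tags=True,
    )
    summary = repo.garbage_collect(cutoff)
    print(len(expired), "snapshots expired;", summary.chunks_deleted, "chunks deleted")
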
diff --git a/icechunk-python/python/icechunk/session.py b/icechunk-python/python/icechunk/session.py
index a158a236..64ad3202 100644
--- a/icechunk-python/python/icechunk/session.py
+++ b/icechunk-python/python/icechunk/session.py
@@ -6,6 +6,7 @@
Conflict,
ConflictErrorData,
ConflictSolver,
+ Diff,
RebaseFailedData,
)
from icechunk._icechunk_python import PyConflictError, PyRebaseFailedError, PySession
@@ -179,6 +180,17 @@ def has_uncommitted_changes(self) -> bool:
"""
return self._session.has_uncommitted_changes
+ def status(self) -> Diff:
+ """
+ Compute an overview of the current session changes
+
+ Returns
+ -------
+ Diff
+ The operations executed in the current session but still not committed.
+ """
+ return self._session.status()
+
def discard_changes(self) -> None:
"""
When the session is writable, discard any uncommitted changes.
diff --git a/icechunk-python/python/icechunk/storage.py b/icechunk-python/python/icechunk/storage.py
index 3ac3d2b2..70e87ee3 100644
--- a/icechunk-python/python/icechunk/storage.py
+++ b/icechunk-python/python/icechunk/storage.py
@@ -2,6 +2,7 @@
from datetime import datetime
from icechunk._icechunk_python import (
+ GcsBearerCredential,
ObjectStoreConfig,
S3Options,
S3StaticCredentials,
@@ -99,6 +100,7 @@ def s3_storage(
get_credentials: Callable[[], S3StaticCredentials] | None
Use this function to get and refresh object store credentials
"""
+
credentials = s3_credentials(
access_key_id=access_key_id,
secret_access_key=secret_access_key,
@@ -117,6 +119,38 @@ def s3_storage(
)
+def s3_object_store_storage(
+ *,
+ bucket: str,
+ prefix: str | None,
+ region: str | None = None,
+ endpoint_url: str | None = None,
+ allow_http: bool = False,
+ access_key_id: str | None = None,
+ secret_access_key: str | None = None,
+ session_token: str | None = None,
+ expires_after: datetime | None = None,
+ anonymous: bool | None = None,
+ from_env: bool | None = None,
+) -> Storage:
+ credentials = s3_credentials(
+ access_key_id=access_key_id,
+ secret_access_key=secret_access_key,
+ session_token=session_token,
+ expires_after=expires_after,
+ anonymous=anonymous,
+ from_env=from_env,
+ get_credentials=None,
+ )
+ options = S3Options(region=region, endpoint_url=endpoint_url, allow_http=allow_http)
+ return Storage.new_s3_object_store(
+ config=options,
+ bucket=bucket,
+ prefix=prefix,
+ credentials=credentials,
+ )
+
+
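
A usage sketch for the new object_store-backed S3 constructor defined above, assuming it is re-exported at the package level like the other storage helpers; bucket and prefix are placeholders.

    import icechunk

    storage = icechunk.s3_object_store_storage(
        bucket="my-bucket",
        prefix="repo",
        region="us-east-1",
        from_env=True,
    )
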
def tigris_storage(
*,
bucket: str,
@@ -186,8 +220,10 @@ def gcs_storage(
service_account_file: str | None = None,
service_account_key: str | None = None,
application_credentials: str | None = None,
+ bearer_token: str | None = None,
from_env: bool | None = None,
config: dict[str, str] | None = None,
+ get_credentials: Callable[[], GcsBearerCredential] | None = None,
) -> Storage:
"""Create a Storage instance that saves data in Google Cloud Storage object store.
@@ -199,12 +235,18 @@ def gcs_storage(
The prefix within the bucket that is the root directory of the repository
from_env: bool | None
Fetch credentials from the operative system environment
+ bearer_token: str | None
+ The bearer token to use for the object store
+ get_credentials: Callable[[], GcsBearerCredential] | None
+ Use this function to get and refresh object store credentials
"""
credentials = gcs_credentials(
service_account_file=service_account_file,
service_account_key=service_account_key,
application_credentials=application_credentials,
+ bearer_token=bearer_token,
from_env=from_env,
+ get_credentials=get_credentials,
)
return Storage.new_gcs(
bucket=bucket,
@@ -216,6 +258,7 @@ def gcs_storage(
def azure_storage(
*,
+ account: str,
container: str,
prefix: str,
access_key: str | None = None,
@@ -228,6 +271,8 @@ def azure_storage(
Parameters
----------
+ account: str
+ The Azure storage account to which the caller must have access privileges
container: str
The container where the repository will store its data
prefix: str
@@ -248,6 +293,7 @@ def azure_storage(
from_env=from_env,
)
return Storage.new_azure_blob(
+ account=account,
container=container,
prefix=prefix,
credentials=credentials,
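
`azure_storage` now requires the storage account name; a sketch using environment credentials, with placeholder account and container names.

    import icechunk

    storage = icechunk.azure_storage(
        account="mystorageaccount",
        container="icechunk",
        prefix="repo",
        from_env=True,
    )
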
diff --git a/icechunk-python/python/icechunk/store.py b/icechunk-python/python/icechunk/store.py
index 8bb758c3..538fd464 100644
--- a/icechunk-python/python/icechunk/store.py
+++ b/icechunk-python/python/icechunk/store.py
@@ -2,7 +2,7 @@
from datetime import datetime
from typing import TYPE_CHECKING, Any
-from icechunk._icechunk_python import PyStore
+from icechunk._icechunk_python import PyStore, VirtualChunkSpec
from zarr.abc.store import (
ByteRequest,
OffsetByteRequest,
@@ -247,6 +247,34 @@ def set_virtual_ref(
key, location, offset, length, checksum, validate_container
)
+ def set_virtual_refs(
+ self,
+ array_path: str,
+ chunks: list[VirtualChunkSpec],
+ *,
+ validate_containers: bool = False,
+ ) -> list[tuple[int, ...]] | None:
+ """Store multiple virtual references for the same array.
+
+ Parameters
+ ----------
+ array_path : str
+ The path to the array inside the Zarr store. Example: "/groupA/groupB/outputs/my-array"
+ chunks : list[VirtualChunkSpec]
+ The list of virtual chunks to add
+ validate_containers: bool
+ If set to True, virtual references whose locations don't match any existing virtual chunk container are ignored
+
+
+ Returns
+ -------
+ list[tuple[int, ...]] | None
+
+ If all virtual references were successfully updated, it returns None.
+ If there were validation errors, it returns the chunk indices of all failed references.
+ """
+ return self._store.set_virtual_refs(array_path, chunks, validate_containers)
+
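
A hedged sketch of batching virtual references; the exact `VirtualChunkSpec` constructor fields used here (index, location, offset, length) are assumptions modeled on the single-reference `set_virtual_ref` above, so check the class signature before relying on them.

    from icechunk import VirtualChunkSpec

    chunks = [
        # Hypothetical field names; verify against VirtualChunkSpec's stub.
        VirtualChunkSpec(index=[0, 0], location="s3://bucket/data-0.nc", offset=0, length=1024),
        VirtualChunkSpec(index=[0, 1], location="s3://bucket/data-1.nc", offset=0, length=1024),
    ]
    failed = store.set_virtual_refs("/group/my-array", chunks, validate_containers=True)
    if failed is not None:
        print("rejected chunk indices:", failed)
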
async def delete(self, key: str) -> None:
"""Remove a key from the store
@@ -261,7 +289,7 @@ async def delete_dir(self, prefix: str) -> None:
Parameters
----------
- key : str
+ prefix : str
"""
return await self._store.delete_dir(prefix)
@@ -348,3 +376,6 @@ def list_dir(self, prefix: str) -> AsyncIterator[str]:
async def getsize(self, key: str) -> int:
return await self._store.getsize(key)
+
+ async def getsize_prefix(self, prefix: str) -> int:
+ return await self._store.getsize_prefix(prefix)
diff --git a/icechunk-python/src/config.rs b/icechunk-python/src/config.rs
index dfd542db..d1e392a7 100644
--- a/icechunk-python/src/config.rs
+++ b/icechunk-python/src/config.rs
@@ -12,9 +12,10 @@ use std::{
use icechunk::{
config::{
AzureCredentials, AzureStaticCredentials, CachingConfig, CompressionAlgorithm,
- CompressionConfig, Credentials, CredentialsFetcher, GcsCredentials,
- GcsStaticCredentials, ManifestConfig, ManifestPreloadCondition,
- ManifestPreloadConfig, S3Credentials, S3Options, S3StaticCredentials,
+ CompressionConfig, Credentials, GcsBearerCredential, GcsCredentials,
+ GcsCredentialsFetcher, GcsStaticCredentials, ManifestConfig,
+ ManifestPreloadCondition, ManifestPreloadConfig, S3Credentials,
+ S3CredentialsFetcher, S3Options, S3StaticCredentials,
},
storage::{self, ConcurrencySettings},
virtual_chunks::VirtualChunkContainer,
@@ -87,13 +88,13 @@ impl PyS3StaticCredentials {
r#"S3StaticCredentials(access_key_id="{ak}", secret_access_key="{sk}", session_token={st}, expires_after={ea})"#,
ak = self.access_key_id.as_str(),
sk = self.secret_access_key.as_str(),
- st = format_option_string(self.session_token.as_ref()),
+ st = format_option(self.session_token.as_ref()),
ea = format_option(self.expires_after.as_ref().map(datetime_repr))
)
}
}
-fn format_option_to_string(o: Option) -> String {
+pub(crate) fn format_option_to_string(o: Option) -> String {
match o.as_ref() {
None => "None".to_string(),
Some(s) => s.to_string(),
@@ -107,13 +108,6 @@ fn format_option<'a, T: AsRef + 'a>(o: Option) -> String {
}
}
-pub(crate) fn format_option_string<'a, T: AsRef + 'a>(o: Option) -> String {
- match o.as_ref() {
- None => "None".to_string(),
- Some(s) => format!(r#""{}""#, s.as_ref()),
- }
-}
-
fn format_bool(b: bool) -> &'static str {
match b {
true => "True",
@@ -155,7 +149,7 @@ impl PythonCredentialsFetcher {
}
#[async_trait]
#[typetag::serde]
-impl CredentialsFetcher for PythonCredentialsFetcher {
+impl S3CredentialsFetcher for PythonCredentialsFetcher {
async fn get(&self) -> Result {
Python::with_gil(|py| {
let pickle_module = PyModule::import(py, "pickle")?;
@@ -168,6 +162,21 @@ impl CredentialsFetcher for PythonCredentialsFetcher {
}
}
+#[async_trait]
+#[typetag::serde]
+impl GcsCredentialsFetcher for PythonCredentialsFetcher {
+ async fn get(&self) -> Result<GcsBearerCredential, String> {
+ Python::with_gil(|py| {
+ let pickle_module = PyModule::import(py, "pickle")?;
+ let loads_function = pickle_module.getattr("loads")?;
+ let fetcher = loads_function.call1((self.pickled_function.clone(),))?;
+ let creds: PyGcsBearerCredential = fetcher.call0()?.extract()?;
+ Ok(creds.into())
+ })
+ .map_err(|e: PyErr| e.to_string())
+ }
+}
+
#[pyclass(name = "S3Credentials")]
#[derive(Clone, Debug)]
pub enum PyS3Credentials {
@@ -198,6 +207,7 @@ pub enum PyGcsStaticCredentials {
ServiceAccount(String),
ServiceAccountKey(String),
ApplicationCredentials(String),
+ BearerToken(String),
}
impl From for GcsStaticCredentials {
@@ -212,15 +222,50 @@ impl From for GcsStaticCredentials {
PyGcsStaticCredentials::ApplicationCredentials(path) => {
GcsStaticCredentials::ApplicationCredentials(path.into())
}
+ PyGcsStaticCredentials::BearerToken(token) => {
+ GcsStaticCredentials::BearerToken(GcsBearerCredential {
+ bearer: token,
+ expires_after: None,
+ })
+ }
}
}
}
+#[pyclass(name = "GcsBearerCredential")]
+#[derive(Clone, Debug)]
+pub struct PyGcsBearerCredential {
+ pub bearer: String,
+ pub expires_after: Option<DateTime<Utc>>,
+}
+
+#[pymethods]
+impl PyGcsBearerCredential {
+ #[new]
+ #[pyo3(signature = (bearer, *, expires_after = None))]
+ pub fn new(bearer: String, expires_after: Option<DateTime<Utc>>) -> Self {
+ PyGcsBearerCredential { bearer, expires_after }
+ }
+}
+
+impl From<PyGcsBearerCredential> for GcsBearerCredential {
+ fn from(value: PyGcsBearerCredential) -> Self {
+ GcsBearerCredential { bearer: value.bearer, expires_after: value.expires_after }
+ }
+}
+
+impl From<GcsBearerCredential> for PyGcsBearerCredential {
+ fn from(value: GcsBearerCredential) -> Self {
+ PyGcsBearerCredential { bearer: value.bearer, expires_after: value.expires_after }
+ }
+}
+
#[pyclass(name = "GcsCredentials")]
#[derive(Clone, Debug)]
pub enum PyGcsCredentials {
FromEnv(),
Static(PyGcsStaticCredentials),
+ Refreshable(Vec<u8>),
}
impl From for GcsCredentials {
@@ -228,6 +273,11 @@ impl From for GcsCredentials {
match value {
PyGcsCredentials::FromEnv() => GcsCredentials::FromEnv,
PyGcsCredentials::Static(creds) => GcsCredentials::Static(creds.into()),
+ PyGcsCredentials::Refreshable(pickled_function) => {
+ GcsCredentials::Refreshable(Arc::new(PythonCredentialsFetcher {
+ pickled_function,
+ }))
+ }
}
}
}
@@ -320,8 +370,8 @@ impl PyS3Options {
// TODO: escape
format!(
r#"S3Options(region={region}, endpoint_url={url}, allow_http={http}, anonymous={anon})"#,
- region = format_option_string(self.region.as_ref()),
- url = format_option_string(self.endpoint_url.as_ref()),
+ region = format_option(self.region.as_ref()),
+ url = format_option(self.endpoint_url.as_ref()),
http = format_bool(self.allow_http),
anon = format_bool(self.anonymous),
)
@@ -650,6 +700,12 @@ fn storage_concurrency_settings_repr(s: &PyStorageConcurrencySettings) -> String
pub struct PyStorageSettings {
#[pyo3(get, set)]
pub concurrency: Option>,
+ #[pyo3(get, set)]
+ pub unsafe_use_conditional_update: Option<bool>,
+ #[pyo3(get, set)]
+ pub unsafe_use_conditional_create: Option<bool>,
+ #[pyo3(get, set)]
+ pub unsafe_use_metadata: Option<bool>,
}
impl From for PyStorageSettings {
@@ -660,6 +716,9 @@ impl From for PyStorageSettings {
Py::new(py, Into::::into(c))
.expect("Cannot create instance of StorageConcurrencySettings")
}),
+ unsafe_use_conditional_create: value.unsafe_use_conditional_create,
+ unsafe_use_conditional_update: value.unsafe_use_conditional_update,
+ unsafe_use_metadata: value.unsafe_use_metadata,
})
}
}
@@ -668,6 +727,9 @@ impl From<&PyStorageSettings> for storage::Settings {
fn from(value: &PyStorageSettings) -> Self {
Python::with_gil(|py| Self {
concurrency: value.concurrency.as_ref().map(|c| (&*c.borrow(py)).into()),
+ unsafe_use_conditional_create: value.unsafe_use_conditional_create,
+ unsafe_use_conditional_update: value.unsafe_use_conditional_update,
+ unsafe_use_metadata: value.unsafe_use_metadata,
})
}
}
@@ -684,10 +746,20 @@ impl Eq for PyStorageSettings {}
#[pymethods]
impl PyStorageSettings {
- #[pyo3(signature = ( concurrency=None))]
+ #[pyo3(signature = ( concurrency=None, unsafe_use_conditional_create=None, unsafe_use_conditional_update=None, unsafe_use_metadata=None))]
#[new]
- pub fn new(concurrency: Option>) -> Self {
- Self { concurrency }
+ pub fn new(
+ concurrency: Option<Py<PyStorageConcurrencySettings>>,
+ unsafe_use_conditional_create: Option<bool>,
+ unsafe_use_conditional_update: Option<bool>,
+ unsafe_use_metadata: Option<bool>,
+ ) -> Self {
+ Self {
+ concurrency,
+ unsafe_use_conditional_create,
+ unsafe_use_metadata,
+ unsafe_use_conditional_update,
+ }
}
pub fn __repr__(&self) -> String {
@@ -699,7 +771,13 @@ impl PyStorageSettings {
}),
};
- format!(r#"StorageSettings(concurrency={conc})"#, conc = inner)
+ format!(
+ r#"StorageSettings(concurrency={conc}, unsafe_use_conditional_create={cr}, unsafe_use_conditional_update={up}, unsafe_use_metadata={me})"#,
+ conc = inner,
+ cr = format_option(self.unsafe_use_conditional_create.map(format_bool)),
+ up = format_option(self.unsafe_use_conditional_update.map(format_bool)),
+ me = format_option(self.unsafe_use_metadata.map(format_bool))
+ )
}
}
@@ -912,8 +990,6 @@ pub struct PyRepositoryConfig {
#[pyo3(get, set)]
pub inline_chunk_threshold_bytes: Option,
#[pyo3(get, set)]
- pub unsafe_overwrite_refs: Option,
- #[pyo3(get, set)]
pub get_partial_values_concurrency: Option,
#[pyo3(get, set)]
pub compression: Option>,
@@ -939,7 +1015,6 @@ impl From<&PyRepositoryConfig> for RepositoryConfig {
fn from(value: &PyRepositoryConfig) -> Self {
Python::with_gil(|py| Self {
inline_chunk_threshold_bytes: value.inline_chunk_threshold_bytes,
- unsafe_overwrite_refs: value.unsafe_overwrite_refs,
get_partial_values_concurrency: value.get_partial_values_concurrency,
compression: value.compression.as_ref().map(|c| (&*c.borrow(py)).into()),
caching: value.caching.as_ref().map(|c| (&*c.borrow(py)).into()),
@@ -957,7 +1032,6 @@ impl From for PyRepositoryConfig {
#[allow(clippy::expect_used)]
Python::with_gil(|py| Self {
inline_chunk_threshold_bytes: value.inline_chunk_threshold_bytes,
- unsafe_overwrite_refs: value.unsafe_overwrite_refs,
get_partial_values_concurrency: value.get_partial_values_concurrency,
compression: value.compression.map(|c| {
Py::new(py, Into::::into(c))
@@ -992,11 +1066,10 @@ impl PyRepositoryConfig {
}
#[new]
- #[pyo3(signature = (inline_chunk_threshold_bytes = None, unsafe_overwrite_refs = None, get_partial_values_concurrency = None, compression = None, caching = None, storage = None, virtual_chunk_containers = None, manifest = None))]
+ #[pyo3(signature = (inline_chunk_threshold_bytes = None, get_partial_values_concurrency = None, compression = None, caching = None, storage = None, virtual_chunk_containers = None, manifest = None))]
#[allow(clippy::too_many_arguments)]
pub fn new(
inline_chunk_threshold_bytes: Option,
- unsafe_overwrite_refs: Option,
get_partial_values_concurrency: Option,
compression: Option>,
caching: Option>,
@@ -1006,7 +1079,6 @@ impl PyRepositoryConfig {
) -> Self {
Self {
inline_chunk_threshold_bytes,
- unsafe_overwrite_refs,
get_partial_values_concurrency,
compression,
caching,
@@ -1072,9 +1144,8 @@ impl PyRepositoryConfig {
}));
// TODO: virtual chunk containers
format!(
- r#"RepositoryConfig(inline_chunk_threshold_bytes={inl}, unsafe_overwrite_refs={uns}, get_partial_values_concurrency={partial}, compression={comp}, caching={caching}, storage={storage}, manifest={manifest})"#,
+ r#"RepositoryConfig(inline_chunk_threshold_bytes={inl}, get_partial_values_concurrency={partial}, compression={comp}, caching={caching}, storage={storage}, manifest={manifest})"#,
inl = format_option_to_string(self.inline_chunk_threshold_bytes),
- uns = format_option(self.unsafe_overwrite_refs.map(format_bool)),
partial = format_option_to_string(self.get_partial_values_concurrency),
comp = comp,
caching = caching,
@@ -1111,6 +1182,32 @@ impl PyStorage {
Ok(PyStorage(storage))
}
+ #[pyo3(signature = ( config, bucket, prefix, credentials=None))]
+ #[classmethod]
+ pub fn new_s3_object_store(
+ _cls: &Bound<'_, PyType>,
+ py: Python<'_>,
+ config: &PyS3Options,
+ bucket: String,
+ prefix: Option<String>,
+ credentials: Option<PyS3Credentials>,
+ ) -> PyResult<Self> {
+ py.allow_threads(move || {
+ pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
+ let storage = icechunk::storage::new_s3_object_store_storage(
+ config.into(),
+ bucket,
+ prefix,
+ credentials.map(|cred| cred.into()),
+ )
+ .await
+ .map_err(PyIcechunkStoreError::StorageError)?;
+
+ Ok(PyStorage(storage))
+ })
+ })
+ }
+
#[pyo3(signature = ( config, bucket, prefix, credentials=None))]
#[classmethod]
pub fn new_tigris(
@@ -1132,60 +1229,91 @@ impl PyStorage {
}
#[classmethod]
- pub fn new_in_memory(_cls: &Bound<'_, PyType>) -> PyResult {
- let storage = icechunk::storage::new_in_memory_storage()
- .map_err(PyIcechunkStoreError::StorageError)?;
-
- Ok(PyStorage(storage))
+ pub fn new_in_memory(_cls: &Bound<'_, PyType>, py: Python<'_>) -> PyResult<Self> {
+ py.allow_threads(move || {
+ pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
+ let storage = icechunk::storage::new_in_memory_storage()
+ .await
+ .map_err(PyIcechunkStoreError::StorageError)?;
+
+ Ok(PyStorage(storage))
+ })
+ })
}
#[classmethod]
pub fn new_local_filesystem(
_cls: &Bound<'_, PyType>,
+ py: Python<'_>,
path: PathBuf,
) -> PyResult {
- let storage = icechunk::storage::new_local_filesystem_storage(&path)
- .map_err(PyIcechunkStoreError::StorageError)?;
-
- Ok(PyStorage(storage))
+ py.allow_threads(move || {
+ pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
+ let storage = icechunk::storage::new_local_filesystem_storage(&path)
+ .await
+ .map_err(PyIcechunkStoreError::StorageError)?;
+
+ Ok(PyStorage(storage))
+ })
+ })
}
- #[staticmethod]
+ #[classmethod]
#[pyo3(signature = (bucket, prefix, credentials=None, *, config=None))]
pub fn new_gcs(
+ _cls: &Bound<'_, PyType>,
+ py: Python<'_>,
bucket: String,
prefix: Option,
credentials: Option,
config: Option>,
) -> PyResult {
- let storage = icechunk::storage::new_gcs_storage(
- bucket,
- prefix,
- credentials.map(|cred| cred.into()),
- config,
- )
- .map_err(PyIcechunkStoreError::StorageError)?;
-
- Ok(PyStorage(storage))
+ py.allow_threads(move || {
+ pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
+ let storage = icechunk::storage::new_gcs_storage(
+ bucket,
+ prefix,
+ credentials.map(|cred| cred.into()),
+ config,
+ )
+ .await
+ .map_err(PyIcechunkStoreError::StorageError)?;
+
+ Ok(PyStorage(storage))
+ })
+ })
}
- #[staticmethod]
- #[pyo3(signature = (container, prefix, credentials=None, *, config=None))]
+ #[classmethod]
+ #[pyo3(signature = (account, container, prefix, credentials=None, *, config=None))]
pub fn new_azure_blob(
+ _cls: &Bound<'_, PyType>,
+ py: Python<'_>,
+ account: String,
container: String,
prefix: String,
credentials: Option,
config: Option>,
) -> PyResult {
- let storage = icechunk::storage::new_azure_blob_storage(
- container,
- prefix,
- credentials.map(|cred| cred.into()),
- config,
- )
- .map_err(PyIcechunkStoreError::StorageError)?;
+ py.allow_threads(move || {
+ pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
+ let storage = icechunk::storage::new_azure_blob_storage(
+ account,
+ container,
+ Some(prefix),
+ credentials.map(|cred| cred.into()),
+ config,
+ )
+ .await
+ .map_err(PyIcechunkStoreError::StorageError)?;
+
+ Ok(PyStorage(storage))
+ })
+ })
+ }
- Ok(PyStorage(storage))
+ pub fn __repr__(&self) -> String {
+ format!("{}", self.0)
}
pub fn default_settings(&self) -> PyStorageSettings {
diff --git a/icechunk-python/src/conflicts.rs b/icechunk-python/src/conflicts.rs
index 13d3702e..af169384 100644
--- a/icechunk-python/src/conflicts.rs
+++ b/icechunk-python/src/conflicts.rs
@@ -14,13 +14,12 @@ pub enum PyConflictType {
NewNodeInInvalidGroup = 2,
ZarrMetadataDoubleUpdate = 3,
ZarrMetadataUpdateOfDeletedArray = 4,
- UserAttributesDoubleUpdate = 5,
- UserAttributesUpdateOfDeletedNode = 6,
- ChunkDoubleUpdate = 7,
- ChunksUpdatedInDeletedArray = 8,
- ChunksUpdatedInUpdatedArray = 9,
- DeleteOfUpdatedArray = 10,
- DeleteOfUpdatedGroup = 11,
+ ZarrMetadataUpdateOfDeletedGroup = 5,
+ ChunkDoubleUpdate = 6,
+ ChunksUpdatedInDeletedArray = 7,
+ ChunksUpdatedInUpdatedArray = 8,
+ DeleteOfUpdatedArray = 9,
+ DeleteOfUpdatedGroup = 10,
}
impl Display for PyConflictType {
@@ -31,13 +30,12 @@ impl Display for PyConflictType {
}
PyConflictType::NewNodeInInvalidGroup => "New node in invalid group",
PyConflictType::ZarrMetadataDoubleUpdate => "Zarr metadata double update",
+ PyConflictType::ZarrMetadataUpdateOfDeletedGroup => {
+ "Zarr metadata update of deleted group"
+ }
PyConflictType::ZarrMetadataUpdateOfDeletedArray => {
"Zarr metadata update of deleted array"
}
- PyConflictType::UserAttributesDoubleUpdate => "User attributes double update",
- PyConflictType::UserAttributesUpdateOfDeletedNode => {
- "User attributes update of deleted node"
- }
PyConflictType::ChunkDoubleUpdate => "Chunk double update",
PyConflictType::ChunksUpdatedInDeletedArray => {
"Chunks updated in deleted array"
@@ -108,13 +106,8 @@ impl From<&Conflict> for PyConflict {
path: path.to_string(),
conflicted_chunks: None,
},
- Conflict::UserAttributesDoubleUpdate { path, node_id: _ } => PyConflict {
- conflict_type: PyConflictType::UserAttributesDoubleUpdate,
- path: path.to_string(),
- conflicted_chunks: None,
- },
- Conflict::UserAttributesUpdateOfDeletedNode(path) => PyConflict {
- conflict_type: PyConflictType::UserAttributesUpdateOfDeletedNode,
+ Conflict::ZarrMetadataUpdateOfDeletedGroup(path) => PyConflict {
+ conflict_type: PyConflictType::ZarrMetadataUpdateOfDeletedGroup,
path: path.to_string(),
conflicted_chunks: None,
},
@@ -188,9 +181,8 @@ pub struct PyBasicConflictSolver;
#[pymethods]
impl PyBasicConflictSolver {
#[new]
- #[pyo3(signature = (*, on_user_attributes_conflict=PyVersionSelection::UseOurs, on_chunk_conflict=PyVersionSelection::UseOurs, fail_on_delete_of_updated_array = false, fail_on_delete_of_updated_group = false))]
+ #[pyo3(signature = (*, on_chunk_conflict=PyVersionSelection::UseOurs, fail_on_delete_of_updated_array = false, fail_on_delete_of_updated_group = false))]
fn new(
- on_user_attributes_conflict: PyVersionSelection,
on_chunk_conflict: PyVersionSelection,
fail_on_delete_of_updated_array: bool,
fail_on_delete_of_updated_group: bool,
@@ -198,7 +190,6 @@ impl PyBasicConflictSolver {
(
Self,
PyConflictSolver(Arc::new(BasicConflictSolver {
- on_user_attributes_conflict: on_user_attributes_conflict.into(),
on_chunk_conflict: on_chunk_conflict.into(),
fail_on_delete_of_updated_array,
fail_on_delete_of_updated_group,
diff --git a/icechunk-python/src/errors.rs b/icechunk-python/src/errors.rs
index 83a1cd71..d04fa06c 100644
--- a/icechunk-python/src/errors.rs
+++ b/icechunk-python/src/errors.rs
@@ -1,9 +1,12 @@
-use std::convert::Infallible;
-
use icechunk::{
- format::IcechunkFormatError, ops::gc::GCError, repository::RepositoryError,
- session::SessionError, store::StoreError, StorageError,
+ format::IcechunkFormatError,
+ ops::gc::GCError,
+ repository::RepositoryError,
+ session::{SessionError, SessionErrorKind},
+ store::{StoreError, StoreErrorKind},
+ StorageError,
};
+use miette::{Diagnostic, GraphicalReportHandler};
use pyo3::{
create_exception,
exceptions::{PyKeyError, PyValueError},
@@ -20,44 +23,45 @@ use crate::conflicts::PyConflict;
/// So for now we just use the extra operation to get the coercion instead of manually mapping
/// the errors where this is returned from a python class
#[allow(clippy::enum_variant_names)]
-#[derive(Debug, Error)]
+#[derive(Debug, Error, Diagnostic)]
#[allow(dead_code)]
pub(crate) enum PyIcechunkStoreError {
- #[error("storage error: {0}")]
+ #[error(transparent)]
StorageError(StorageError),
- #[error("store error: {0}")]
+ #[error(transparent)]
StoreError(StoreError),
- #[error("repository error: {0}")]
+ #[error(transparent)]
RepositoryError(#[from] RepositoryError),
#[error("session error: {0}")]
SessionError(SessionError),
- #[error("icechunk format error: {0}")]
+ #[error(transparent)]
IcechunkFormatError(#[from] IcechunkFormatError),
- #[error("Expiration or garbage collection error: {0}")]
+ #[error(transparent)]
GCError(#[from] GCError),
#[error("{0}")]
PyKeyError(String),
#[error("{0}")]
PyValueError(String),
- #[error("{0}")]
+ #[error(transparent)]
PyError(#[from] PyErr),
#[error("{0}")]
UnkownError(String),
}
-impl From for PyIcechunkStoreError {
- fn from(_: Infallible) -> Self {
- PyIcechunkStoreError::UnkownError("Infallible".to_string())
- }
-}
-
impl From for PyIcechunkStoreError {
fn from(error: StoreError) -> Self {
match error {
- StoreError::NotFound(e) => PyIcechunkStoreError::PyKeyError(e.to_string()),
- StoreError::SessionError(SessionError::NodeNotFound { path, message: _ }) => {
- PyIcechunkStoreError::PyKeyError(format!("{}", path))
+ StoreError { kind: StoreErrorKind::NotFound(e), .. } => {
+ PyIcechunkStoreError::PyKeyError(e.to_string())
}
+ StoreError {
+ kind:
+ StoreErrorKind::SessionError(SessionErrorKind::NodeNotFound {
+ path,
+ message: _,
+ }),
+ ..
+ } => PyIcechunkStoreError::PyKeyError(format!("{}", path)),
_ => PyIcechunkStoreError::StoreError(error),
}
}
@@ -66,9 +70,10 @@ impl From for PyIcechunkStoreError {
impl From for PyIcechunkStoreError {
fn from(error: SessionError) -> Self {
match error {
- SessionError::NodeNotFound { path, message: _ } => {
- PyIcechunkStoreError::PyKeyError(format!("{}", path))
- }
+ SessionError {
+ kind: SessionErrorKind::NodeNotFound { path, message: _ },
+ ..
+ } => PyIcechunkStoreError::PyKeyError(format!("{}", path)),
_ => PyIcechunkStoreError::SessionError(error),
}
}
@@ -77,16 +82,16 @@ impl From for PyIcechunkStoreError {
impl From for PyErr {
fn from(error: PyIcechunkStoreError) -> Self {
match error {
- PyIcechunkStoreError::SessionError(SessionError::Conflict {
- expected_parent,
- actual_parent,
+ PyIcechunkStoreError::SessionError(SessionError {
+ kind: SessionErrorKind::Conflict { expected_parent, actual_parent },
+ ..
}) => PyConflictError::new_err(PyConflictErrorData {
expected_parent: expected_parent.map(|s| s.to_string()),
actual_parent: actual_parent.map(|s| s.to_string()),
}),
- PyIcechunkStoreError::SessionError(SessionError::RebaseFailed {
- snapshot,
- conflicts,
+ PyIcechunkStoreError::SessionError(SessionError {
+ kind: SessionErrorKind::RebaseFailed { snapshot, conflicts },
+ ..
}) => PyRebaseFailedError::new_err(PyRebaseFailedData {
snapshot: snapshot.to_string(),
conflicts: conflicts.iter().map(PyConflict::from).collect(),
@@ -94,7 +99,15 @@ impl From for PyErr {
PyIcechunkStoreError::PyKeyError(e) => PyKeyError::new_err(e),
PyIcechunkStoreError::PyValueError(e) => PyValueError::new_err(e),
PyIcechunkStoreError::PyError(err) => err,
- _ => IcechunkError::new_err(error.to_string()),
+ error => {
+ let mut buf = String::new();
+ let message =
+ match GraphicalReportHandler::new().render_report(&mut buf, &error) {
+ Ok(_) => buf,
+ Err(_) => error.to_string(),
+ };
+ IcechunkError::new_err(message)
+ }
}
}
}
diff --git a/icechunk-python/src/lib.rs b/icechunk-python/src/lib.rs
index 6def166f..151b02b5 100644
--- a/icechunk-python/src/lib.rs
+++ b/icechunk-python/src/lib.rs
@@ -6,13 +6,16 @@ mod session;
mod store;
mod streams;
+use std::env;
+
use config::{
PyAzureCredentials, PyAzureStaticCredentials, PyCachingConfig,
- PyCompressionAlgorithm, PyCompressionConfig, PyCredentials, PyGcsCredentials,
- PyGcsStaticCredentials, PyManifestConfig, PyManifestPreloadCondition,
- PyManifestPreloadConfig, PyObjectStoreConfig, PyRepositoryConfig, PyS3Credentials,
- PyS3Options, PyS3StaticCredentials, PyStorage, PyStorageConcurrencySettings,
- PyStorageSettings, PyVirtualChunkContainer, PythonCredentialsFetcher,
+ PyCompressionAlgorithm, PyCompressionConfig, PyCredentials, PyGcsBearerCredential,
+ PyGcsCredentials, PyGcsStaticCredentials, PyManifestConfig,
+ PyManifestPreloadCondition, PyManifestPreloadConfig, PyObjectStoreConfig,
+ PyRepositoryConfig, PyS3Credentials, PyS3Options, PyS3StaticCredentials, PyStorage,
+ PyStorageConcurrencySettings, PyStorageSettings, PyVirtualChunkContainer,
+ PythonCredentialsFetcher,
};
use conflicts::{
PyBasicConflictSolver, PyConflict, PyConflictDetector, PyConflictSolver,
@@ -22,10 +25,26 @@ use errors::{
IcechunkError, PyConflictError, PyConflictErrorData, PyRebaseFailedData,
PyRebaseFailedError,
};
+use icechunk::{format::format_constants::SpecVersionBin, initialize_tracing};
use pyo3::prelude::*;
-use repository::{PyGCSummary, PyRepository, PySnapshotInfo};
+use repository::{PyDiff, PyGCSummary, PyRepository, PySnapshotInfo};
use session::PySession;
-use store::PyStore;
+use store::{PyStore, VirtualChunkSpec};
+
+#[pyfunction]
+fn initialize_logs() -> PyResult<()> {
+ if env::var("ICECHUNK_NO_LOGS").is_err() {
+ initialize_tracing()
+ }
+ Ok(())
+}
+
+#[pyfunction]
+/// The spec version that this version of the Icechunk library
+/// uses to write metadata files
+fn spec_version() -> u8 {
+ SpecVersionBin::current() as u8
+}
/// The icechunk Python module implemented in Rust.
#[pymodule]
@@ -44,6 +63,7 @@ fn _icechunk_python(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::()?;
m.add_class::()?;
m.add_class::()?;
+ m.add_class::()?;
m.add_class::()?;
m.add_class::()?;
m.add_class::()?;
@@ -61,6 +81,10 @@ fn _icechunk_python(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::()?;
m.add_class::()?;
m.add_class::()?;
+ m.add_class::()?;
+ m.add_class::()?;
+ m.add_function(wrap_pyfunction!(initialize_logs, m)?)?;
+ m.add_function(wrap_pyfunction!(spec_version, m)?)?;
// Exceptions
m.add("IcechunkError", py.get_type::())?;
diff --git a/icechunk-python/src/repository.rs b/icechunk-python/src/repository.rs
index f98af50d..8321d2c6 100644
--- a/icechunk-python/src/repository.rs
+++ b/icechunk-python/src/repository.rs
@@ -1,5 +1,6 @@
use std::{
- collections::{BTreeSet, HashMap, HashSet},
+ borrow::Cow,
+ collections::{BTreeMap, BTreeSet, HashMap, HashSet},
sync::Arc,
};
@@ -11,10 +12,11 @@ use icechunk::{
config::Credentials,
format::{
snapshot::{SnapshotInfo, SnapshotProperties},
+ transaction_log::Diff,
SnapshotId,
},
- ops::gc::{expire, garbage_collect, GCConfig, GCSummary},
- repository::{RepositoryError, VersionInfo},
+ ops::gc::{expire, garbage_collect, ExpiredRefAction, GCConfig, GCSummary},
+ repository::{RepositoryErrorKind, VersionInfo},
Repository,
};
use pyo3::{
@@ -27,7 +29,7 @@ use tokio::sync::{Mutex, RwLock};
use crate::{
config::{
- datetime_repr, format_option_string, PyCredentials, PyRepositoryConfig,
+ datetime_repr, format_option_to_string, PyCredentials, PyRepositoryConfig,
PyStorage, PyStorageSettings,
},
errors::PyIcechunkStoreError,
@@ -175,13 +177,142 @@ impl PySnapshotInfo {
format!(
r#"SnapshotInfo(id="{id}", parent_id={parent}, written_at={at}, message="{message}")"#,
id = self.id,
- parent = format_option_string(self.parent_id.as_ref()),
+ parent = format_option_to_string(self.parent_id.as_ref()),
at = datetime_repr(&self.written_at),
message = self.message.chars().take(10).collect::<String>() + "...",
)
}
}
+#[pyclass(name = "Diff", eq)]
+#[derive(Debug, PartialEq, Eq, Default)]
+pub struct PyDiff {
+ #[pyo3(get)]
+ pub new_groups: BTreeSet<String>,
+ #[pyo3(get)]
+ pub new_arrays: BTreeSet<String>,
+ #[pyo3(get)]
+ pub deleted_groups: BTreeSet<String>,
+ #[pyo3(get)]
+ pub deleted_arrays: BTreeSet<String>,
+ #[pyo3(get)]
+ pub updated_groups: BTreeSet<String>,
+ #[pyo3(get)]
+ pub updated_arrays: BTreeSet<String>,
+ #[pyo3(get)]
+ // A Vec instead of a set, to avoid issues with lists not being hashable in Python
+ pub updated_chunks: BTreeMap<String, Vec<Vec<u32>>>,
+}
+
+impl From<Diff> for PyDiff {
+ fn from(value: Diff) -> Self {
+ let new_groups =
+ value.new_groups.into_iter().map(|path| path.to_string()).collect();
+ let new_arrays =
+ value.new_arrays.into_iter().map(|path| path.to_string()).collect();
+ let deleted_groups =
+ value.deleted_groups.into_iter().map(|path| path.to_string()).collect();
+ let deleted_arrays =
+ value.deleted_arrays.into_iter().map(|path| path.to_string()).collect();
+ let updated_groups =
+ value.updated_groups.into_iter().map(|path| path.to_string()).collect();
+ let updated_arrays =
+ value.updated_arrays.into_iter().map(|path| path.to_string()).collect();
+ let updated_chunks = value
+ .updated_chunks
+ .into_iter()
+ .map(|(k, v)| {
+ let path = k.to_string();
+ let map = v.into_iter().map(|idx| idx.0).collect();
+ (path, map)
+ })
+ .collect();
+
+ PyDiff {
+ new_groups,
+ new_arrays,
+ deleted_groups,
+ deleted_arrays,
+ updated_groups,
+ updated_arrays,
+ updated_chunks,
+ }
+ }
+}
+
+#[pymethods]
+impl PyDiff {
+ pub fn __repr__(&self) -> String {
+ let mut res = String::new();
+ use std::fmt::Write;
+
+ if !self.new_groups.is_empty() {
+ res.push_str("Groups created:\n");
+ for g in self.new_groups.iter() {
+ writeln!(res, " {}", g).unwrap();
+ }
+ res.push('\n');
+ }
+ if !self.new_arrays.is_empty() {
+ res.push_str("Arrays created:\n");
+ for g in self.new_arrays.iter() {
+ writeln!(res, " {}", g).unwrap();
+ }
+ res.push('\n');
+ }
+
+ if !self.updated_groups.is_empty() {
+ res.push_str("Group definitions updated:\n");
+ for g in self.updated_groups.iter() {
+ writeln!(res, " {}", g).unwrap();
+ }
+ res.push('\n');
+ }
+
+ if !self.updated_arrays.is_empty() {
+ res.push_str("Array definitions updated:\n");
+ for g in self.updated_arrays.iter() {
+ writeln!(res, " {}", g).unwrap();
+ }
+ res.push('\n');
+ }
+
+ if !self.deleted_groups.is_empty() {
+ res.push_str("Groups deleted:\n");
+ for g in self.deleted_groups.iter() {
+ writeln!(res, " {}", g).unwrap();
+ }
+ res.push('\n');
+ }
+
+ if !self.deleted_arrays.is_empty() {
+ res.push_str("Arrays deleted:\n");
+ for g in self.deleted_arrays.iter() {
+ writeln!(res, " {}", g).unwrap();
+ }
+ res.push('\n');
+ }
+
+ if !self.updated_chunks.is_empty() {
+ res.push_str("Chunks updated:\n");
+ for (path, chunks) in self.updated_chunks.iter() {
+ writeln!(res, " {}:", path).unwrap();
+ let coords = chunks
+ .iter()
+ .map(|idx| format!(" [{}]", idx.iter().join(", ")))
+ .take(10)
+ .join("\n");
+ res.push_str(coords.as_str());
+ res.push('\n');
+ if chunks.len() > 10 {
+ writeln!(res, " ... {} more", chunks.len() - 10).unwrap();
+ }
+ }
+ }
+ res
+ }
+}
+
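# Editor's sketch (not part of the patch): as exposed through pyo3, the Diff getters
# arrive in Python as plain sets and a dict keyed by node path. Here `diff` is assumed
# to come from Repository.diff() or Session.status() introduced in this change:
for path in sorted(diff.new_arrays):
    print("created array:", path)
for path, chunk_indices in diff.updated_chunks.items():
    print(f"{path}: {len(chunk_indices)} chunks written")
print(diff)  # grouped, human-readable summary from __repr__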
#[pyclass(name = "GCSummary", eq)]
#[derive(Debug, PartialEq, Eq, Default)]
pub struct PyGCSummary {
@@ -333,6 +464,29 @@ impl PyRepository {
})
}
+ #[classmethod]
+ fn from_bytes(
+ _cls: Bound<'_, PyType>,
+ py: Python<'_>,
+ bytes: Vec<u8>,
+ ) -> PyResult<Self> {
+ // This is a compute-intensive task, so we release the GIL
+ py.allow_threads(move || {
+ let repository = Repository::from_bytes(bytes)
+ .map_err(PyIcechunkStoreError::RepositoryError)?;
+ Ok(Self(Arc::new(repository)))
+ })
+ }
+
+ fn as_bytes(&self, py: Python<'_>) -> PyResult<Cow<[u8]>> {
+ // This is a compute-intensive task, so we release the GIL
+ py.allow_threads(move || {
+ let bytes =
+ self.0.as_bytes().map_err(PyIcechunkStoreError::RepositoryError)?;
+ Ok(Cow::Owned(bytes))
+ })
+ }
+
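# Editor's sketch (not part of the patch): from_bytes/as_bytes give the repository a
# byte-level round trip, which the Python layer can use for serialization, e.g. pickling
# (whether pickling is wired up through these methods is an assumption; `repo` is an
# already-opened Repository):
import pickle

blob = pickle.dumps(repo)
restored = pickle.loads(blob)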
#[staticmethod]
fn fetch_config(
py: Python<'_>,
@@ -375,46 +529,19 @@ impl PyRepository {
PyStorage(Arc::clone(self.0.storage()))
}
- #[pyo3(signature = (*, branch = None, tag = None, snapshot = None))]
- pub fn ancestry(
- &self,
- py: Python<'_>,
- branch: Option<String>,
- tag: Option<String>,
- snapshot: Option<String>,
- ) -> PyResult<Vec<PySnapshotInfo>> {
- // This function calls block_on, so we need to allow other thread python to make progress
- py.allow_threads(move || {
- let version = args_to_version_info(branch, tag, snapshot)?;
-
- // TODO: this holds everything in memory
- pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
- let ancestry = self
- .0
- .ancestry(&version)
- .await
- .map_err(PyIcechunkStoreError::RepositoryError)?
- .map_ok(Into::<PySnapshotInfo>::into)
- .try_collect::<Vec<_>>()
- .await
- .map_err(PyIcechunkStoreError::RepositoryError)?;
- Ok(ancestry)
- })
- })
- }
-
- #[pyo3(signature = (*, branch = None, tag = None, snapshot = None))]
+ /// Returns an object that is both a sync and an async iterator
+ #[pyo3(signature = (*, branch = None, tag = None, snapshot_id = None))]
pub fn async_ancestry(
&self,
py: Python<'_>,
branch: Option<String>,
tag: Option<String>,
- snapshot: Option<String>,
+ snapshot_id: Option<String>,
) -> PyResult<PyAsyncGenerator> {
let repo = Arc::clone(&self.0);
// This function calls block_on, so we need to allow other thread python to make progress
py.allow_threads(move || {
- let version = args_to_version_info(branch, tag, snapshot)?;
+ let version = args_to_version_info(branch, tag, snapshot_id, None)?;
let ancestry = pyo3_async_runtimes::tokio::get_runtime()
.block_on(async move { repo.ancestry_arc(&version).await })
.map_err(PyIcechunkStoreError::RepositoryError)?
@@ -441,9 +568,9 @@ impl PyRepository {
// This function calls block_on, so we need to allow other thread python to make progress
py.allow_threads(move || {
let snapshot_id = SnapshotId::try_from(snapshot_id).map_err(|_| {
- PyIcechunkStoreError::RepositoryError(RepositoryError::InvalidSnapshotId(
- snapshot_id.to_owned(),
- ))
+ PyIcechunkStoreError::RepositoryError(
+ RepositoryErrorKind::InvalidSnapshotId(snapshot_id.to_owned()).into(),
+ )
})?;
pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
@@ -493,9 +620,9 @@ impl PyRepository {
// This function calls block_on, so we need to allow other thread python to make progress
py.allow_threads(move || {
let snapshot_id = SnapshotId::try_from(snapshot_id).map_err(|_| {
- PyIcechunkStoreError::RepositoryError(RepositoryError::InvalidSnapshotId(
- snapshot_id.to_owned(),
- ))
+ PyIcechunkStoreError::RepositoryError(
+ RepositoryErrorKind::InvalidSnapshotId(snapshot_id.to_owned()).into(),
+ )
})?;
pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
@@ -543,9 +670,9 @@ impl PyRepository {
// This function calls block_on, so we need to allow other thread python to make progress
py.allow_threads(move || {
let snapshot_id = SnapshotId::try_from(snapshot_id).map_err(|_| {
- PyIcechunkStoreError::RepositoryError(RepositoryError::InvalidSnapshotId(
- snapshot_id.to_owned(),
- ))
+ PyIcechunkStoreError::RepositoryError(
+ RepositoryErrorKind::InvalidSnapshotId(snapshot_id.to_owned()).into(),
+ )
})?;
pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
@@ -586,17 +713,46 @@ impl PyRepository {
})
}
- #[pyo3(signature = (*, branch = None, tag = None, snapshot = None))]
+ #[pyo3(signature = (*, from_branch=None, from_tag=None, from_snapshot_id=None, to_branch=None, to_tag=None, to_snapshot_id=None))]
+ #[allow(clippy::too_many_arguments)]
+ pub fn diff(
+ &self,
+ py: Python<'_>,
+ from_branch: Option<String>,
+ from_tag: Option<String>,
+ from_snapshot_id: Option<String>,
+ to_branch: Option<String>,
+ to_tag: Option<String>,
+ to_snapshot_id: Option<String>,
+ ) -> PyResult<PyDiff> {
+ let from = args_to_version_info(from_branch, from_tag, from_snapshot_id, None)?;
+ let to = args_to_version_info(to_branch, to_tag, to_snapshot_id, None)?;
+
+ // This function calls block_on, so we release the GIL to let other Python threads make progress
+ py.allow_threads(move || {
+ pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
+ let diff = self
+ .0
+ .diff(&from, &to)
+ .await
+ .map_err(PyIcechunkStoreError::SessionError)?;
+ Ok(diff.into())
+ })
+ })
+ }
+
+ #[pyo3(signature = (*, branch = None, tag = None, snapshot_id = None, as_of = None))]
pub fn readonly_session(
&self,
py: Python<'_>,
branch: Option<String>,
tag: Option<String>,
- snapshot: Option<String>,
+ snapshot_id: Option<String>,
+ as_of: Option<DateTime<Utc>>,
) -> PyResult<PySession> {
// This function calls block_on, so we need to allow other thread python to make progress
py.allow_threads(move || {
- let version = args_to_version_info(branch, tag, snapshot)?;
+ let version = args_to_version_info(branch, tag, snapshot_id, as_of)?;
let session =
pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
self.0
@@ -624,10 +780,13 @@ impl PyRepository {
})
}
+ #[pyo3(signature = (older_than, *, delete_expired_branches = false, delete_expired_tags = false))]
pub fn expire_snapshots(
&self,
py: Python<'_>,
older_than: DateTime<Utc>,
+ delete_expired_branches: bool,
+ delete_expired_tags: bool,
) -> PyResult<Vec<String>> {
// This function calls block_on, so we need to allow other thread python to make progress
py.allow_threads(move || {
@@ -638,11 +797,25 @@ impl PyRepository {
self.0.storage_settings(),
self.0.asset_manager().clone(),
older_than,
+ if delete_expired_branches {
+ ExpiredRefAction::Delete
+ } else {
+ ExpiredRefAction::Ignore
+ },
+ if delete_expired_tags {
+ ExpiredRefAction::Delete
+ } else {
+ ExpiredRefAction::Ignore
+ },
)
.await
.map_err(PyIcechunkStoreError::GCError)?;
Ok::<_, PyIcechunkStoreError>(
- result.iter().map(|id| id.to_string()).collect(),
+ result
+ .released_snapshots
+ .iter()
+ .map(|id| id.to_string())
+ .collect(),
)
})?;
@@ -693,6 +866,7 @@ fn args_to_version_info(
branch: Option<String>,
tag: Option<String>,
snapshot: Option<String>,
+ as_of: Option<DateTime<Utc>>,
) -> PyResult<VersionInfo> {
let n = [&branch, &tag, &snapshot].iter().filter(|r| !r.is_none()).count();
if n > 1 {
@@ -701,15 +875,25 @@ fn args_to_version_info(
));
}
- if let Some(branch_name) = branch {
- Ok(VersionInfo::BranchTipRef(branch_name))
+ if as_of.is_some() && branch.is_none() {
+ return Err(PyValueError::new_err(
+ "as_of argument must be provided together with a branch name",
+ ));
+ }
+
+ if let Some(branch) = branch {
+ if let Some(at) = as_of {
+ Ok(VersionInfo::AsOf { branch, at })
+ } else {
+ Ok(VersionInfo::BranchTipRef(branch))
+ }
} else if let Some(tag_name) = tag {
Ok(VersionInfo::TagRef(tag_name))
} else if let Some(snapshot_id) = snapshot {
let snapshot_id = SnapshotId::try_from(snapshot_id.as_str()).map_err(|_| {
- PyIcechunkStoreError::RepositoryError(RepositoryError::InvalidSnapshotId(
- snapshot_id.to_owned(),
- ))
+ PyIcechunkStoreError::RepositoryError(
+ RepositoryErrorKind::InvalidSnapshotId(snapshot_id.to_owned()).into(),
+ )
})?;
Ok(VersionInfo::SnapshotId(snapshot_id))
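# Editor's sketch (not part of the patch): rough Python-side usage of the repository
# features added above. The high-level wrappers are assumed to mirror these keyword
# arguments, `repo` is an already-opened Repository, and `some_snapshot_id` is a
# placeholder taken from the repository's ancestry:
from datetime import datetime, timedelta, timezone

changes = repo.diff(from_branch="main", to_snapshot_id=some_snapshot_id)
print(changes)

# Read the branch as it was a week ago (as_of must be combined with a branch name).
week_ago = datetime.now(timezone.utc) - timedelta(days=7)
session = repo.readonly_session(branch="main", as_of=week_ago)

# Expire old snapshots, optionally deleting refs that end up fully expired.
released = repo.expire_snapshots(
    datetime.now(timezone.utc) - timedelta(days=30),
    delete_expired_branches=False,
    delete_expired_tags=True,
)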
diff --git a/icechunk-python/src/session.rs b/icechunk-python/src/session.rs
index 13d17c10..7f8c0bf6 100644
--- a/icechunk-python/src/session.rs
+++ b/icechunk-python/src/session.rs
@@ -9,7 +9,7 @@ use tokio::sync::{Mutex, RwLock};
use crate::{
conflicts::PyConflictSolver,
errors::{PyIcechunkStoreError, PyIcechunkStoreResult},
- repository::PySnapshotProperties,
+ repository::{PyDiff, PySnapshotProperties},
store::PyStore,
streams::PyAsyncGenerator,
};
@@ -73,6 +73,19 @@ impl PySession {
py.allow_threads(move || self.0.blocking_read().has_uncommitted_changes())
}
+ pub fn status(&self, py: Python<'_>) -> PyResult<PyDiff> {
+ // This is a blocking function, so we release the GIL
+ py.allow_threads(move || {
+ let session = self.0.blocking_read();
+
+ pyo3_async_runtimes::tokio::get_runtime().block_on(async move {
+ let res =
+ session.status().await.map_err(PyIcechunkStoreError::SessionError)?;
+ Ok(res.into())
+ })
+ })
+ }
+
pub fn discard_changes(&self, py: Python<'_>) {
// This is blocking function, we need to release the Gil
py.allow_threads(move || {
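# Editor's sketch (not part of the patch): the new status() binding returns the same Diff
# object as Repository.diff, so pending changes can be inspected before committing
# (`session` is an existing writable session from earlier setup):
pending = session.status()
if pending.new_arrays or pending.updated_chunks:
    print(pending)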
diff --git a/icechunk-python/src/store.rs b/icechunk-python/src/store.rs
index 5cf67ea7..f3b1832a 100644
--- a/icechunk-python/src/store.rs
+++ b/icechunk-python/src/store.rs
@@ -6,15 +6,17 @@ use futures::{StreamExt, TryStreamExt};
use icechunk::{
format::{
manifest::{Checksum, SecondsSinceEpoch, VirtualChunkLocation, VirtualChunkRef},
- ChunkLength, ChunkOffset,
+ ChunkIndices, ChunkLength, ChunkOffset, Path,
},
- store::StoreError,
+ storage::ETag,
+ store::{SetVirtualRefsResult, StoreError, StoreErrorKind},
Store,
};
+use itertools::Itertools as _;
use pyo3::{
exceptions::{PyKeyError, PyValueError},
prelude::*,
- types::PyType,
+ types::{PyTuple, PyType},
};
use tokio::sync::Mutex;
@@ -37,7 +39,7 @@ enum ChecksumArgument {
impl From<ChecksumArgument> for Checksum {
fn from(value: ChecksumArgument) -> Self {
match value {
- ChecksumArgument::String(etag) => Checksum::ETag(etag),
+ ChecksumArgument::String(etag) => Checksum::ETag(ETag(etag)),
ChecksumArgument::Datetime(date_time) => {
Checksum::LastModified(SecondsSinceEpoch(date_time.timestamp() as u32))
}
@@ -45,6 +47,50 @@ impl From<ChecksumArgument> for Checksum {
}
}
+#[pyclass]
+#[derive(Clone, Debug)]
+pub struct VirtualChunkSpec {
+ #[pyo3(get)]
+ index: Vec<u32>,
+ #[pyo3(get)]
+ location: String,
+ #[pyo3(get)]
+ offset: ChunkOffset,
+ #[pyo3(get)]
+ length: ChunkLength,
+ #[pyo3(get)]
+ etag_checksum: Option<String>,
+ #[pyo3(get)]
+ last_updated_at_checksum: Option<DateTime<Utc>>,
+}
+
+impl VirtualChunkSpec {
+ fn checksum(&self) -> Option<Checksum> {
+ self.etag_checksum
+ .as_ref()
+ .map(|etag| Checksum::ETag(ETag(etag.clone())))
+ .or(self
+ .last_updated_at_checksum
+ .map(|t| Checksum::LastModified(SecondsSinceEpoch(t.timestamp() as u32))))
+ }
+}
+
+#[pymethods]
+impl VirtualChunkSpec {
+ #[new]
+ #[pyo3(signature = (index, location, offset, length, etag_checksum = None, last_updated_at_checksum = None))]
+ fn new(
+ index: Vec<u32>,
+ location: String,
+ offset: ChunkOffset,
+ length: ChunkLength,
+ etag_checksum: Option<String>,
+ last_updated_at_checksum: Option<DateTime<Utc>>,
+ ) -> Self {
+ Self { index, location, offset, length, etag_checksum, last_updated_at_checksum }
+ }
+}
+
#[pyclass(name = "PyStore")]
#[derive(Clone)]
pub struct PyStore(pub Arc<Store>);
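# Editor's sketch (not part of the patch): each VirtualChunkSpec describes one virtual
# chunk reference to be passed to set_virtual_refs; a rough construction mirroring the
# #[new] signature above (whether the class is re-exported at the package top level is
# an assumption):
from datetime import datetime, timezone

from icechunk import VirtualChunkSpec  # re-export location is an assumption

spec = VirtualChunkSpec(
    index=[0, 0],                       # chunk coordinates within the target array
    location="s3://some-bucket/data.nc",
    offset=1024,
    length=4096,
    last_updated_at_checksum=datetime(2024, 1, 1, tzinfo=timezone.utc),
)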
@@ -151,7 +197,9 @@ impl PyStore {
// from other types of errors, we use PyKeyError exception for that
match data {
Ok(data) => Ok(Vec::from(data)),
- Err(StoreError::NotFound(_)) => Err(PyKeyError::new_err(key)),
+ Err(StoreError { kind: StoreErrorKind::NotFound(_), .. }) => {
+ Err(PyKeyError::new_err(key))
+ }
Err(err) => Err(PyIcechunkStoreError::StoreError(err).into()),
}
})
@@ -278,6 +326,60 @@ impl PyStore {
})
}
+ fn set_virtual_refs(
+ &self,
+ py: Python<'_>,
+ array_path: String,
+ chunks: Vec<VirtualChunkSpec>,
+ validate_containers: bool,
+ ) -> PyIcechunkStoreResult