From 2609f78bd125954ea161ac9e4e9c16906755549a Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 13 Feb 2025 14:09:05 +0000 Subject: [PATCH 01/21] . --- Cargo.lock | 138 +++++++++++---------- Cargo.toml | 12 +- vortex-datafusion/src/persistent/format.rs | 17 ++- vortex-datafusion/src/persistent/mod.rs | 2 + vortex-datafusion/src/persistent/source.rs | 57 +++++++++ 5 files changed, 150 insertions(+), 76 deletions(-) create mode 100644 vortex-datafusion/src/persistent/source.rs diff --git a/Cargo.lock b/Cargo.lock index 53750c867e..724f835bcb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1354,17 +1354,15 @@ dependencies = [ [[package]] name = "datafusion" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eae420e7a5b0b7f1c39364cc76cbcd0f5fdc416b2514ae3847c2676bbd60702a" dependencies = [ "arrow", - "arrow-array", "arrow-ipc", "arrow-schema", "async-trait", "bytes", "chrono", "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", "datafusion-execution", @@ -1373,6 +1371,7 @@ dependencies = [ "datafusion-functions-aggregate", "datafusion-functions-table", "datafusion-functions-window", + "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -1380,7 +1379,6 @@ dependencies = [ "datafusion-physical-plan", "datafusion-sql", "futures", - "glob", "itertools 0.14.0", "log", "object_store", @@ -1398,8 +1396,6 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f27987bc22b810939e8dfecc55571e9d50355d6ea8ec1c47af8383a76a6d0e1" dependencies = [ "arrow", "async-trait", @@ -1413,21 +1409,41 @@ dependencies = [ "itertools 0.14.0", "log", "parking_lot", - "sqlparser", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "45.0.0" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "chrono", + 
"datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "rand", + "tokio", + "url", ] [[package]] name = "datafusion-common" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3f6d5b8c9408cc692f7c194b8aa0c0f9b253e065a8d960ad9cdc2a13e697602" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ipc", - "arrow-schema", "base64", "half", "hashbrown 0.14.5", @@ -1445,8 +1461,6 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d4603c8e8a4baf77660ab7074cc66fc15cc8a18f2ce9dfadb755fc6ee294e48" dependencies = [ "log", "tokio", @@ -1455,14 +1469,10 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5bf4bc68623a5cf231eed601ed6eb41f46a37c4d15d11a0bff24cbc8396cd66" [[package]] name = "datafusion-execution" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "88b491c012cdf8e051053426013429a76f74ee3c2db68496c79c323ca1084d27" dependencies = [ "arrow", "dashmap", @@ -1480,8 +1490,6 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a181408d4fc5dc22f9252781a8f39f2d0e5d1b33ec9bde242844980a2689c1" dependencies = [ "arrow", "chrono", @@ -1500,8 +1508,6 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1129b48e8534d8c03c6543bcdccef0b55c8ac0c1272a15a56c67068b6eb1885" dependencies = [ "arrow", 
"datafusion-common", @@ -1512,8 +1518,6 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6125874e4856dfb09b59886784fcb74cde5cfc5930b3a80a1a728ef7a010df6b" dependencies = [ "arrow", "arrow-buffer", @@ -1525,7 +1529,6 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", - "hashbrown 0.14.5", "hex", "itertools 0.14.0", "log", @@ -1538,13 +1541,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3add7b1d3888e05e7c95f2b281af900ca69ebdcb21069ba679b33bde8b3b9d6" dependencies = [ "ahash", "arrow", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1561,8 +1560,6 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e18baa4cfc3d2f144f74148ed68a1f92337f5072b6dde204a0dbbdf3324989c" dependencies = [ "ahash", "arrow", @@ -1574,8 +1571,6 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c403ddd473bbb0952ba880008428b3c7febf0ed3ce1eec35a205db20efb2a36" dependencies = [ "arrow", "async-trait", @@ -1590,8 +1585,6 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ab18c2fb835614d06a75f24a9e09136d3a8c12a92d97c95a6af316a1787a9c5" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1607,8 +1600,6 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a77b73bc15e7d1967121fdc7a55d819bfb9d6c03766a6c322247dce9094a53a4" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1617,8 +1608,6 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09369b8d962291e808977cf94d495fd8b5b38647232d7ef562c27ac0f495b0af" dependencies = [ "datafusion-expr", "quote", @@ -1628,8 +1617,6 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2403a7e4a84637f3de7d8d4d7a9ccc0cc4be92d89b0161ba3ee5be82f0531c54" dependencies = [ "arrow", "chrono", @@ -1646,14 +1633,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86ff72ac702b62dbf2650c4e1d715ebd3e4aab14e3885e72e8549e250307347c" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1671,12 +1653,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60982b7d684e25579ee29754b4333057ed62e2cc925383c5f0bd8cab7962f435" dependencies = [ "ahash", "arrow", - "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", @@ -1686,11 +1665,8 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac5e85c189d5238a5cf181a624e450c4cd4c66ac77ca551d6f3ff9080bac90bb" dependencies = [ "arrow", - "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -1698,22 +1674,16 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "futures", 
"itertools 0.14.0", "log", - "url", ] [[package]] name = "datafusion-physical-plan" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c36bf163956d7e2542657c78b3383fdc78f791317ef358a359feffcdb968106f" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", "arrow-schema", "async-trait", @@ -1739,12 +1709,8 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "45.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13caa4daede211ecec53c78b13c503b592794d125f9a3cc3afe992edf9e7f43" dependencies = [ "arrow", - "arrow-array", - "arrow-schema", "bigdecimal", "datafusion-common", "datafusion-expr", @@ -3628,6 +3594,15 @@ dependencies = [ "prost", ] +[[package]] +name = "psm" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200b9ff220857e53e184257720a14553b2f4aa02577d2ed9842d45d4b9654810" +dependencies = [ + "cc", +] + [[package]] name = "ptr_meta" version = "0.3.0" @@ -3923,6 +3898,26 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.98", +] + [[package]] name = "redox_syscall" version = "0.5.8" @@ -4512,11 +4507,12 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.53.0" +version = "0.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +checksum = 
"c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" dependencies = [ "log", + "recursive", "sqlparser_derive", ] @@ -4537,6 +4533,19 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stacker" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d08feb8f695b465baed819b03c128dc23f57a694510ab1f06c77f763975685e" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -5117,6 +5126,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" dependencies = [ "getrandom 0.2.15", + "wasm-bindgen", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index dcf61f6814..6f3ece6855 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,12 +74,12 @@ criterion = { package = "codspeed-criterion-compat", features = [ "html_reports", ], version = "2.7.2" } crossterm = "0.28" -datafusion = { version = "45", default-features = false } -datafusion-common = "45" -datafusion-execution = "45" -datafusion-expr = "45" -datafusion-physical-expr = "45" -datafusion-physical-plan = "45" +datafusion = { version = "45", default-features = false, path = "../datafusion/datafusion/core" } +datafusion-common = {version = "45", path = "../datafusion/datafusion/common"} +datafusion-execution = {version = "45", path = "../datafusion/datafusion/execution"} +datafusion-expr = {version = "45", path = "../datafusion/datafusion/expr"} +datafusion-physical-expr = {version = "45", path = "../datafusion/datafusion/physical-expr"} +datafusion-physical-plan = {version = "45", path = "../datafusion/datafusion/physical-plan"} divan = "0.1.14" dyn-hash = "0.2.0" enum-iterator = "2.0.0" diff --git a/vortex-datafusion/src/persistent/format.rs 
b/vortex-datafusion/src/persistent/format.rs index 28a5bf5e23..44a45064ab 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -3,10 +3,11 @@ use std::sync::Arc; use arrow_schema::{Schema, SchemaRef}; use async_trait::async_trait; +use datafusion::catalog::Session; +use datafusion::datasource::data_source::FileSource; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::file_format::{FileFormat, FileFormatFactory, FilePushdownSupport}; use datafusion::datasource::physical_plan::{FileScanConfig, FileSinkConfig}; -use datafusion::execution::SessionState; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision; use datafusion_common::{ @@ -86,7 +87,7 @@ impl FileFormatFactory for VortexFormatFactory { #[allow(clippy::disallowed_types)] fn create( &self, - _state: &SessionState, + _state: &dyn Session, format_options: &std::collections::HashMap, ) -> DFResult> { if !format_options.is_empty() { @@ -152,9 +153,13 @@ impl FileFormat for VortexFormat { } } + fn file_source(&self) -> Arc { + todo!() + } + async fn infer_schema( &self, - state: &SessionState, + state: &dyn Session, store: &Arc, objects: &[ObjectMeta], ) -> DFResult { @@ -187,7 +192,7 @@ impl FileFormat for VortexFormat { #[cfg_attr(feature = "tracing", tracing::instrument(skip_all, fields(location = object.location.as_ref())))] async fn infer_stats( &self, - _state: &SessionState, + _state: &dyn Session, store: &Arc, table_schema: SchemaRef, object: &ObjectMeta, @@ -273,7 +278,7 @@ impl FileFormat for VortexFormat { async fn create_physical_plan( &self, - _state: &SessionState, + _state: &dyn Session, file_scan_config: FileScanConfig, filters: Option<&Arc>, ) -> DFResult> { @@ -311,7 +316,7 @@ impl FileFormat for VortexFormat { async fn create_writer_physical_plan( &self, input: Arc, - _state: &SessionState, + _state: &dyn Session, conf: FileSinkConfig, 
order_requirements: Option, ) -> DFResult> { diff --git a/vortex-datafusion/src/persistent/mod.rs b/vortex-datafusion/src/persistent/mod.rs index 8b7c7adf76..0f78b3d1f3 100644 --- a/vortex-datafusion/src/persistent/mod.rs +++ b/vortex-datafusion/src/persistent/mod.rs @@ -5,8 +5,10 @@ mod execution; mod format; mod opener; mod sink; +mod source; pub use format::{VortexFormat, VortexFormatFactory, VortexFormatOptions}; +pub use source::VortexSource; #[cfg(test)] /// Utility function to register Vortex with a [`SessionStateBuilder`] diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs new file mode 100644 index 0000000000..9c60231e67 --- /dev/null +++ b/vortex-datafusion/src/persistent/source.rs @@ -0,0 +1,57 @@ +use std::any::Any; +use std::sync::Arc; + +use datafusion::datasource::data_source::FileSource; +use datafusion::datasource::physical_plan::{FileOpener, FileScanConfig}; +use datafusion_common::{Result, Statistics}; +use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use object_store::ObjectStore; + +pub struct VortexSource {} + +impl FileSource for VortexSource { + fn create_file_opener( + &self, + object_store: Result>, + base_config: &FileScanConfig, + partition: usize, + ) -> Result> { + todo!() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn with_batch_size(&self, batch_size: usize) -> Arc { + todo!() + } + + fn with_schema(&self, schema: arrow_schema::SchemaRef) -> Arc { + todo!() + } + + fn with_projection(&self, config: &FileScanConfig) -> Arc { + todo!() + } + + fn with_statistics(&self, statistics: Statistics) -> Arc { + todo!() + } + + fn metrics(&self) -> &ExecutionPlanMetricsSet { + todo!() + } + + fn statistics(&self) -> Result { + todo!() + } + + fn file_type(&self) -> &str { + todo!() + } + + fn supports_repartition(&self, config: &FileScanConfig) -> bool { + todo!() + } +} From 1399e1db64bf0a0cc3bbde72fcdac367a3e83da1 Mon Sep 17 00:00:00 2001 From: Alexander Droste Date: 
Thu, 13 Feb 2025 18:38:52 +0000 Subject: [PATCH 02/21] bench: enable divan codspeed compatibility layer (#2351) --- .github/workflows/bench-pr.yml | 13 ++----------- .github/workflows/bench.yml | 14 +++----------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index bed3b43b94..72cada3879 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -25,7 +25,7 @@ jobs: codspeed_bench: name: Benchmark with Codspeed needs: label_trigger - runs-on: ubuntu-latest-large + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: ./.github/actions/setup-rust @@ -39,16 +39,7 @@ jobs: RUSTFLAGS: "-C target-cpu=native" # We want to run micro-benchmarks with release profile. # We run with all features since we feature gate bench utils. - run: | - cargo codspeed build --features test-harness \ - --exclude bench-vortex \ - --exclude vortex-datafusion \ - --exclude vortex-tui \ - --exclude vortex-fuzz \ - --exclude pyvortex \ - --exclude xtask \ - --workspace \ - --profile release + run: cargo codspeed build --exclude bench-vortex --workspace --profile release - name: Run benchmarks uses: CodSpeedHQ/action@v3 diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index d27d65f6f1..e505393229 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -31,8 +31,8 @@ jobs: codspeed_bench: - name: Benchmark with Codspeed - runs-on: ubuntu-latest-large + name: Run Criterion benchmarks with Codspeed + runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: ./.github/actions/setup-rust @@ -46,15 +46,7 @@ jobs: RUSTFLAGS: "-C target-cpu=native" # We want to run micro-benchmarks with release profile. # We run with all features since we feature gate bench utils. 
- run: | - cargo codspeed build --features test-harness \ - --exclude bench-vortex \ - --exclude vortex-datafusion \ - --exclude vortex-tui \ - --exclude vortex-fuzz \ - --exclude pyvortex \ - --exclude xtask \ - --workspace --profile release + run: cargo codspeed build --exclude bench-vortex --workspace --profile release - name: Run benchmarks uses: CodSpeedHQ/action@v3 From e9df5e7afbbba7934ab9117d08c65d4b61d31b33 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 18 Feb 2025 17:53:38 +0000 Subject: [PATCH 03/21] . --- Cargo.lock | 173 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 123 insertions(+), 50 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index fa26bffc5b..4cc90961e5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -173,9 +173,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6422e12ac345a0678d7a17e316238e3a40547ae7f92052b77bd86d5e0239f3fc" +checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" dependencies = [ "arrow-arith", "arrow-array", @@ -195,9 +195,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cf34bb1f48c41d3475927bcc7be498665b8e80b379b88f62a840337f8b8248" +checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" dependencies = [ "arrow-array", "arrow-buffer", @@ -209,9 +209,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb4a06d507f54b70a277be22a127c8ffe0cec6cd98c0ad8a48e77779bbda8223" +checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a" dependencies = [ "ahash", "arrow-buffer", @@ -226,9 +226,9 @@ dependencies = [ 
[[package]] name = "arrow-buffer" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d69d326d5ad1cb82dcefa9ede3fee8fdca98f9982756b16f9cb142f4aa6edc89" +checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a" dependencies = [ "bytes", "half", @@ -237,9 +237,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "626e65bd42636a84a238bed49d09c8777e3d825bf81f5087a70111c2831d9870" +checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -258,9 +258,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "71c8f959f7a1389b1dbd883cdcd37c3ed12475329c111912f7f69dad8195d8c6" +checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" dependencies = [ "arrow-array", "arrow-cast", @@ -274,9 +274,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1858e7c7d01c44cf71c21a85534fd1a54501e8d60d1195d0d6fbcc00f4b10754" +checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83" dependencies = [ "arrow-buffer", "arrow-schema", @@ -286,9 +286,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6bb3f727f049884c7603f0364bc9315363f356b59e9f605ea76541847e06a1e" +checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6" dependencies = [ "arrow-array", "arrow-buffer", @@ -300,9 +300,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.1.0" +version = "54.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "35de94f165ed8830aede72c35f238763794f0d49c69d30c44d49c9834267ff8c" +checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" dependencies = [ "arrow-array", "arrow-buffer", @@ -320,9 +320,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8aa06e5f267dc53efbacb933485c79b6fc1685d3ffbe870a16ce4e696fb429da" +checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" dependencies = [ "arrow-array", "arrow-buffer", @@ -333,9 +333,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66f1144bb456a2f9d82677bd3abcea019217e572fc8f07de5a7bac4b2c56eb2c" +checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" dependencies = [ "arrow-array", "arrow-buffer", @@ -346,18 +346,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "105f01ec0090259e9a33a9263ec18ff223ab91a0ea9fbc18042f7e38005142f6" +checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f690752fdbd2dee278b5f1636fefad8f2f7134c85e20fd59c4199e15a39a6807" +checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" dependencies = [ "ahash", "arrow-array", @@ -369,9 +369,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fff9cd745a7039b66c47ecaf5954460f9fa12eed628f65170117ea93e64ee0" +checksum = 
"d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" dependencies = [ "arrow-array", "arrow-buffer", @@ -1357,21 +1357,24 @@ name = "datafusion" version = "45.0.0" dependencies = [ "arrow", - "arrow-array", "arrow-ipc", "arrow-schema", "async-trait", "bytes", "chrono", "datafusion-catalog", + "datafusion-catalog-listing", "datafusion-common", "datafusion-common-runtime", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", + "datafusion-expr-common", "datafusion-functions", "datafusion-functions-aggregate", "datafusion-functions-table", "datafusion-functions-window", + "datafusion-macros", "datafusion-optimizer", "datafusion-physical-expr", "datafusion-physical-expr-common", @@ -1379,7 +1382,6 @@ dependencies = [ "datafusion-physical-plan", "datafusion-sql", "futures", - "glob", "itertools 0.14.0", "log", "object_store", @@ -1410,7 +1412,26 @@ dependencies = [ "itertools 0.14.0", "log", "parking_lot", - "sqlparser", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "45.0.0" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "log", + "object_store", + "tokio", ] [[package]] @@ -1419,10 +1440,7 @@ version = "45.0.0" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ipc", - "arrow-schema", "base64", "half", "hashbrown 0.14.5", @@ -1445,6 +1463,30 @@ dependencies = [ "tokio", ] +[[package]] +name = "datafusion-datasource" +version = "45.0.0" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "chrono", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "rand", + "tokio", + "url", +] + 
[[package]] name = "datafusion-doc" version = "45.0.0" @@ -1490,6 +1532,7 @@ version = "45.0.0" dependencies = [ "arrow", "datafusion-common", + "indexmap", "itertools 0.14.0", "paste", ] @@ -1508,7 +1551,6 @@ dependencies = [ "datafusion-expr", "datafusion-expr-common", "datafusion-macros", - "hashbrown 0.14.5", "hex", "itertools 0.14.0", "log", @@ -1524,8 +1566,6 @@ version = "45.0.0" dependencies = [ "ahash", "arrow", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-doc", "datafusion-execution", @@ -1618,9 +1658,6 @@ version = "45.0.0" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", - "arrow-schema", "datafusion-common", "datafusion-expr", "datafusion-expr-common", @@ -1641,7 +1678,6 @@ version = "45.0.0" dependencies = [ "ahash", "arrow", - "arrow-buffer", "datafusion-common", "datafusion-expr-common", "hashbrown 0.14.5", @@ -1653,7 +1689,6 @@ name = "datafusion-physical-optimizer" version = "45.0.0" dependencies = [ "arrow", - "arrow-schema", "datafusion-common", "datafusion-execution", "datafusion-expr", @@ -1661,10 +1696,8 @@ dependencies = [ "datafusion-physical-expr", "datafusion-physical-expr-common", "datafusion-physical-plan", - "futures", "itertools 0.14.0", "log", - "url", ] [[package]] @@ -1673,8 +1706,6 @@ version = "45.0.0" dependencies = [ "ahash", "arrow", - "arrow-array", - "arrow-buffer", "arrow-ord", "arrow-schema", "async-trait", @@ -1702,8 +1733,6 @@ name = "datafusion-sql" version = "45.0.0" dependencies = [ "arrow", - "arrow-array", - "arrow-schema", "bigdecimal", "datafusion-common", "datafusion-expr", @@ -3587,6 +3616,15 @@ dependencies = [ "prost", ] +[[package]] +name = "psm" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f58e5423e24c18cc840e1c98370b3993c6649cd1678b4d24318bcf0a083cbe88" +dependencies = [ + "cc", +] + [[package]] name = "ptr_meta" version = "0.3.0" @@ -3882,6 +3920,26 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name 
= "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = "recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.98", +] + [[package]] name = "redox_syscall" version = "0.5.8" @@ -4471,11 +4529,12 @@ dependencies = [ [[package]] name = "sqlparser" -version = "0.53.0" +version = "0.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05a528114c392209b3264855ad491fcce534b94a38771b0a0b97a79379275ce8" +checksum = "c66e3b7374ad4a6af849b08b3e7a6eda0edbd82f0fd59b57e22671bf16979899" dependencies = [ "log", + "recursive", "sqlparser_derive", ] @@ -4496,6 +4555,19 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "stacker" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d08feb8f695b465baed819b03c128dc23f57a694510ab1f06c77f763975685e" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static_assertions" version = "1.1.0" @@ -5076,6 +5148,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" dependencies = [ "getrandom 0.2.15", + "wasm-bindgen", ] [[package]] From bfa27906d5c27c75e28f99e0749a569233313ca6 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 18 Feb 2025 17:56:21 +0000 Subject: [PATCH 04/21] . 
--- .github/workflows/bench-pr.yml | 13 +++++++++++-- .github/workflows/bench.yml | 14 +++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.github/workflows/bench-pr.yml b/.github/workflows/bench-pr.yml index 17ea167c86..d206bebce0 100644 --- a/.github/workflows/bench-pr.yml +++ b/.github/workflows/bench-pr.yml @@ -25,7 +25,7 @@ jobs: codspeed_bench: name: Benchmark with Codspeed needs: label_trigger - runs-on: ubuntu-latest + runs-on: ubuntu-latest-large steps: - uses: actions/checkout@v4 - uses: ./.github/actions/setup-rust @@ -39,7 +39,16 @@ jobs: RUSTFLAGS: "-C target-cpu=native" # We want to run micro-benchmarks with release profile. # We run with all features since we feature gate bench utils. - run: cargo codspeed build --exclude bench-vortex --workspace --profile release + run: | + cargo codspeed build --features test-harness \ + --exclude bench-vortex \ + --exclude vortex-datafusion \ + --exclude vortex-tui \ + --exclude vortex-fuzz \ + --exclude pyvortex \ + --exclude xtask \ + --workspace \ + --profile release - name: Run benchmarks uses: CodSpeedHQ/action@v3 diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index b2e19533d0..9f200268bc 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -31,8 +31,8 @@ jobs: codspeed_bench: - name: Run Criterion benchmarks with Codspeed - runs-on: ubuntu-latest + name: Benchmark with Codspeed + runs-on: ubuntu-latest-large steps: - uses: actions/checkout@v4 - uses: ./.github/actions/setup-rust @@ -46,7 +46,15 @@ jobs: RUSTFLAGS: "-C target-cpu=native" # We want to run micro-benchmarks with release profile. # We run with all features since we feature gate bench utils. 
- run: cargo codspeed build --exclude bench-vortex --workspace --profile release + run: | + cargo codspeed build --features test-harness \ + --exclude bench-vortex \ + --exclude vortex-datafusion \ + --exclude vortex-tui \ + --exclude vortex-fuzz \ + --exclude pyvortex \ + --exclude xtask \ + --workspace --profile release - name: Run benchmarks uses: CodSpeedHQ/action@v3 From ab669fd7f03ab24face6b6402e47eb5dc4120b1b Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 18 Feb 2025 17:57:01 +0000 Subject: [PATCH 05/21] . --- Cargo.lock | 76 +++++++++++++++++++++++++++++++++++++----------------- Cargo.toml | 2 +- 2 files changed, 53 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4cc90961e5..8d6044b1a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -799,20 +799,22 @@ checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" [[package]] name = "codspeed" -version = "2.7.2" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "450a0e9df9df1c154156f4344f99d8f6f6e69d0fc4de96ef6e2e68b2ec3bce97" +checksum = "25d2f5a6570db487f5258e0bded6352fa2034c2aeb46bb5cc3ff060a0fcfba2f" dependencies = [ "colored", "libc", + "serde", "serde_json", + "uuid", ] [[package]] name = "codspeed-criterion-compat" -version = "2.7.2" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8eb1a6cb9c20e177fde58cdef97c1c7c9264eb1424fe45c4fccedc2fb078a569" +checksum = "f53a55558dedec742b14aae3c5fec389361b8b5ca28c1aadf09dd91faf710074" dependencies = [ "codspeed", "colored", @@ -821,6 +823,46 @@ dependencies = [ "tokio", ] +[[package]] +name = "codspeed-divan-compat" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae9da137ea5f911fec2e88f03d6040b36eeebcd66385ebaa041c4d2965778fdb" +dependencies = [ + "codspeed", + "codspeed-divan-compat-macros", + "codspeed-divan-compat-walltime", +] + +[[package]] +name = 
"codspeed-divan-compat-macros" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "566adb03248996f1a6d22ddd0fcbf7fb4c9f1b6cc111268e8a0de59e68c238f8" +dependencies = [ + "divan-macros", + "itertools 0.14.0", + "proc-macro-crate 3.2.0", + "proc-macro2", + "quote", + "syn 2.0.98", +] + +[[package]] +name = "codspeed-divan-compat-walltime" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c75d912d1b2eebbe9be806eaf8baddc6004e2b0483ba0f2047c8b04d8e6c0ae" +dependencies = [ + "cfg-if", + "clap", + "codspeed", + "condtype", + "divan-macros", + "libc", + "regex-lite", +] + [[package]] name = "colorchoice" version = "1.0.3" @@ -1763,20 +1805,6 @@ dependencies = [ "syn 2.0.98", ] -[[package]] -name = "divan" -version = "0.1.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0583193020b29b03682d8d33bb53a5b0f50df6daacece12ca99b904cfdcb8c4" -dependencies = [ - "cfg-if", - "clap", - "condtype", - "divan-macros", - "libc", - "regex-lite", -] - [[package]] name = "divan-macros" version = "0.1.17" @@ -5197,7 +5225,7 @@ dependencies = [ name = "vortex-alp" version = "0.24.0" dependencies = [ - "divan", + "codspeed-divan-compat", "itertools 0.14.0", "num-traits", "rand", @@ -5227,7 +5255,7 @@ dependencies = [ "arrow-string", "backtrace", "codspeed-criterion-compat", - "divan", + "codspeed-divan-compat", "enum-iterator", "flatbuffers 25.2.10", "flexbuffers", @@ -5263,8 +5291,8 @@ version = "0.24.0" dependencies = [ "arrow-buffer", "bytes", + "codspeed-divan-compat", "compio", - "divan", "log", "rkyv", "vortex-error", @@ -5355,7 +5383,7 @@ version = "0.24.0" dependencies = [ "arrow-buffer", "codspeed-criterion-compat", - "divan", + "codspeed-divan-compat", "num-traits", "rand", "rkyv", @@ -5435,7 +5463,7 @@ dependencies = [ "arrayref", "arrow-buffer", "codspeed-criterion-compat", - "divan", + "codspeed-divan-compat", "fastlanes", "itertools 0.14.0", 
"num-traits", @@ -5492,7 +5520,7 @@ name = "vortex-fsst" version = "0.24.0" dependencies = [ "arrow-array", - "divan", + "codspeed-divan-compat", "fsst-rs", "rand", "serde", diff --git a/Cargo.toml b/Cargo.toml index d81722f372..21cd91cd21 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,7 +80,7 @@ datafusion-execution = {version = "45", path = "../datafusion/datafusion/executi datafusion-expr = {version = "45", path = "../datafusion/datafusion/expr"} datafusion-physical-expr = {version = "45", path = "../datafusion/datafusion/physical-expr"} datafusion-physical-plan = {version = "45", path = "../datafusion/datafusion/physical-plan"} -divan = "0.1.14" +divan = { package = "codspeed-divan-compat", version = "2.8.0" } dyn-hash = "0.2.0" enum-iterator = "2.0.0" exponential-decay-histogram = "=0.1.11" From e3632edcc87fbb9b59b062ce9df7a615e80f0751 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Tue, 18 Feb 2025 18:59:16 +0000 Subject: [PATCH 06/21] some work --- vortex-datafusion/src/persistent/execution.rs | 4 +- vortex-datafusion/src/persistent/format.rs | 3 +- vortex-datafusion/src/persistent/source.rs | 68 ++++++++++++++++--- 3 files changed, 60 insertions(+), 15 deletions(-) diff --git a/vortex-datafusion/src/persistent/execution.rs b/vortex-datafusion/src/persistent/execution.rs index a8d7b3e0cc..f6eb2855f6 100644 --- a/vortex-datafusion/src/persistent/execution.rs +++ b/vortex-datafusion/src/persistent/execution.rs @@ -157,9 +157,7 @@ impl ExecutionPlan for VortexExec { _config: &ConfigOptions, ) -> DFResult>> { let file_groups = self.file_scan_config.file_groups.clone(); - let repartitioned_file_groups = repartition_by_size(file_groups, target_partitions); - let mut new_plan = self.clone(); let num_partitions = repartitioned_file_groups.len(); @@ -187,7 +185,7 @@ fn make_vortex_predicate(predicate: Option>) -> Option>, desired_partitions: usize, ) -> Vec> { diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs 
index a1391b275e..976ac13494 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -34,6 +34,7 @@ use vortex_io::ObjectStoreReadAt; use super::cache::FileLayoutCache; use super::execution::VortexExec; use super::sink::VortexSink; +use super::VortexSource; use crate::can_be_pushed_down; use crate::converter::{bound_to_datafusion, directional_bound_to_df_precision}; @@ -154,7 +155,7 @@ impl FileFormat for VortexFormat { } fn file_source(&self) -> Arc { - todo!() + Arc::new(VortexSource::default()) } async fn infer_schema( diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index 9c60231e67..8f679c4e6a 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -3,20 +3,44 @@ use std::sync::Arc; use datafusion::datasource::data_source::FileSource; use datafusion::datasource::physical_plan::{FileOpener, FileScanConfig}; -use datafusion_common::{Result, Statistics}; +use datafusion_common::{Result as DFResult, Statistics}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; -use object_store::ObjectStore; +use object_store::{ObjectStore, ObjectStoreScheme}; +use vortex_array::ContextRef; -pub struct VortexSource {} +use super::opener::VortexFileOpener; +use crate::persistent::execution::repartition_by_size; + +#[derive(Default, Clone)] +pub struct VortexSource { + batch_size: Option, + projected_statistics: Option, + context: ContextRef, + metrics: ExecutionPlanMetricsSet, +} impl FileSource for VortexSource { fn create_file_opener( &self, - object_store: Result>, + object_store: DFResult>, base_config: &FileScanConfig, partition: usize, - ) -> Result> { - todo!() + ) -> DFResult> { + let (scheme, _) = ObjectStoreScheme::parse(self.file_scan_config.object_store_url.as_ref()) + .map_err(object_store::Error::from)?; + + let opener = VortexFileOpener::new( + self.ctx.clone(), + scheme, + object_store, + 
self.projection.clone(), + self.predicate.clone(), + self.initial_read_cache.clone(), + self.projected_arrow_schema.clone(), + context.session_config().batch_size(), + )?; + + Ok(Arc::new(opener)) } fn as_any(&self) -> &dyn Any { @@ -24,7 +48,9 @@ impl FileSource for VortexSource { } fn with_batch_size(&self, batch_size: usize) -> Arc { - todo!() + let mut source = self.clone(); + source.batch_size = Some(batch_size); + Arc::new(source) } fn with_schema(&self, schema: arrow_schema::SchemaRef) -> Arc { @@ -40,18 +66,38 @@ impl FileSource for VortexSource { } fn metrics(&self) -> &ExecutionPlanMetricsSet { - todo!() + &self.metrics } - fn statistics(&self) -> Result { + fn statistics(&self) -> DFResult { todo!() } fn file_type(&self) -> &str { - todo!() + "vortex" } fn supports_repartition(&self, config: &FileScanConfig) -> bool { - todo!() + let total_file_count = config + .file_groups + .iter() + .map(|group| group.len()) + .sum::(); + // Vortex doesn't support repartitioning if there's only one file + total_file_count > 1 + } + + fn repartitioned( + &self, + target_partitions: usize, + _repartition_file_min_size: usize, + _output_ordering: Option, + config: &FileScanConfig, + ) -> DFResult> { + let mut new_config = config.clone(); + let file_groups = std::mem::take(&mut new_config.file_groups); + new_config.file_groups = repartition_by_size(file_groups, target_partitions); + + Ok(Some(new_config)) } } From 819298164bc2c98841eb63aca3ee713d77f76a31 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 19 Feb 2025 11:35:06 +0000 Subject: [PATCH 07/21] this works? 
--- Cargo.lock | 23 ++ Cargo.toml | 12 +- vortex-datafusion/Cargo.toml | 2 +- vortex-datafusion/src/lib.rs | 3 +- vortex-datafusion/src/memory/provider.rs | 3 +- vortex-datafusion/src/persistent/config.rs | 1 + vortex-datafusion/src/persistent/execution.rs | 255 ------------------ vortex-datafusion/src/persistent/format.rs | 40 ++- vortex-datafusion/src/persistent/mod.rs | 1 - vortex-datafusion/src/persistent/opener.rs | 6 +- vortex-datafusion/src/persistent/sink.rs | 6 +- vortex-datafusion/src/persistent/source.rs | 176 ++++++++++-- 12 files changed, 226 insertions(+), 302 deletions(-) delete mode 100644 vortex-datafusion/src/persistent/execution.rs diff --git a/Cargo.lock b/Cargo.lock index 8d6044b1a8..aad4972bb9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1397,6 +1397,7 @@ dependencies = [ [[package]] name = "datafusion" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "arrow-ipc", @@ -1441,6 +1442,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "async-trait", @@ -1459,6 +1461,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "async-trait", @@ -1479,6 +1482,7 @@ dependencies = [ [[package]] name = "datafusion-common" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "ahash", "arrow", @@ -1500,6 +1504,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "45.0.0" +source = 
"git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "log", "tokio", @@ -1508,6 +1513,7 @@ dependencies = [ [[package]] name = "datafusion-datasource" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "async-trait", @@ -1532,10 +1538,12 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" [[package]] name = "datafusion-execution" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "dashmap", @@ -1553,6 +1561,7 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "chrono", @@ -1571,6 +1580,7 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "datafusion-common", @@ -1582,6 +1592,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "arrow-buffer", @@ -1605,6 +1616,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "ahash", "arrow", @@ -1624,6 +1636,7 @@ dependencies = [ 
[[package]] name = "datafusion-functions-aggregate-common" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "ahash", "arrow", @@ -1635,6 +1648,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "async-trait", @@ -1649,6 +1663,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1664,6 +1679,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1672,6 +1688,7 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "datafusion-expr", "quote", @@ -1681,6 +1698,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "chrono", @@ -1697,6 +1715,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "ahash", "arrow", @@ -1717,6 +1736,7 @@ dependencies = [ [[package]] name = 
"datafusion-physical-expr-common" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "ahash", "arrow", @@ -1729,6 +1749,7 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "datafusion-common", @@ -1745,6 +1766,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "ahash", "arrow", @@ -1773,6 +1795,7 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "45.0.0" +source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" dependencies = [ "arrow", "bigdecimal", diff --git a/Cargo.toml b/Cargo.toml index 21cd91cd21..b3552ad562 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,12 +74,12 @@ criterion = { package = "codspeed-criterion-compat", features = [ "html_reports", ], version = "2.7.2" } crossterm = "0.28" -datafusion = { version = "45", default-features = false, path = "../datafusion/datafusion/core" } -datafusion-common = {version = "45", path = "../datafusion/datafusion/common"} -datafusion-execution = {version = "45", path = "../datafusion/datafusion/execution"} -datafusion-expr = {version = "45", path = "../datafusion/datafusion/expr"} -datafusion-physical-expr = {version = "45", path = "../datafusion/datafusion/physical-expr"} -datafusion-physical-plan = {version = "45", path = "../datafusion/datafusion/physical-plan"} +datafusion = { version = "45", default-features = false, git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } +datafusion-common = {version = "45", git = 
"https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } +datafusion-execution = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } +datafusion-expr = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } +datafusion-physical-expr = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } +datafusion-physical-plan = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } divan = { package = "codspeed-divan-compat", version = "2.8.0" } dyn-hash = "0.2.0" enum-iterator = "2.0.0" diff --git a/vortex-datafusion/Cargo.toml b/vortex-datafusion/Cargo.toml index a7dcac4328..7c1ec24cb6 100644 --- a/vortex-datafusion/Cargo.toml +++ b/vortex-datafusion/Cargo.toml @@ -58,4 +58,4 @@ tokio = { workspace = true, features = ["test-util", "rt-multi-thread", "fs"] } url = { workspace = true } [lints] -workspace = true +# workspace = true diff --git a/vortex-datafusion/src/lib.rs b/vortex-datafusion/src/lib.rs index bd50cfd070..b072ef09ad 100644 --- a/vortex-datafusion/src/lib.rs +++ b/vortex-datafusion/src/lib.rs @@ -1,6 +1,5 @@ //! Connectors to enable DataFusion to read Vortex data. 
-#![deny(missing_docs)] -#![allow(clippy::nonminimal_bool)] +// #![deny(missing_docs)] #![allow(clippy::cast_possible_truncation)] use std::sync::Arc; diff --git a/vortex-datafusion/src/memory/provider.rs b/vortex-datafusion/src/memory/provider.rs index 51f0fe2f43..fcdb66fb98 100644 --- a/vortex-datafusion/src/memory/provider.rs +++ b/vortex-datafusion/src/memory/provider.rs @@ -180,7 +180,7 @@ mod test { use arrow_schema::{DataType, Field, Schema}; use datafusion::functions_aggregate::count::count_distinct; use datafusion::prelude::SessionContext; - use datafusion_common::{Column, TableReference}; + use datafusion_common::{Column, Spans, TableReference}; use datafusion_expr::{and, col, lit, BinaryExpr, Expr, Operator}; use vortex_array::array::{PrimitiveArray, StructArray, VarBinViewArray}; use vortex_array::{Array, IntoArray}; @@ -244,6 +244,7 @@ mod test { table: "orders".into(), }), name: "o_orderstatus".to_string(), + spans: Spans::new(), } .into(), ), diff --git a/vortex-datafusion/src/persistent/config.rs b/vortex-datafusion/src/persistent/config.rs index 730619e77e..247ec60ba3 100644 --- a/vortex-datafusion/src/persistent/config.rs +++ b/vortex-datafusion/src/persistent/config.rs @@ -35,6 +35,7 @@ pub struct ConfigProjection { pub arrow_schema: SchemaRef, pub constraints: Constraints, pub statistics: Statistics, + #[allow(dead_code)] pub orderings: Vec, pub projection_expr: Arc, } diff --git a/vortex-datafusion/src/persistent/execution.rs b/vortex-datafusion/src/persistent/execution.rs deleted file mode 100644 index f6eb2855f6..0000000000 --- a/vortex-datafusion/src/persistent/execution.rs +++ /dev/null @@ -1,255 +0,0 @@ -use std::fmt; -use std::sync::Arc; - -use arrow_schema::SchemaRef; -use datafusion::config::ConfigOptions; -use datafusion::datasource::listing::PartitionedFile; -use datafusion::datasource::physical_plan::{FileScanConfig, FileStream}; -use datafusion_common::{Result as DFResult, Statistics}; -use 
datafusion_execution::{SendableRecordBatchStream, TaskContext}; -use datafusion_physical_expr::{EquivalenceProperties, Partitioning, PhysicalExpr}; -use datafusion_physical_plan::execution_plan::{Boundedness, EmissionType}; -use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, MetricsSet}; -use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; -use itertools::Itertools; -use object_store::ObjectStoreScheme; -use vortex_array::ContextRef; -use vortex_expr::datafusion::convert_expr_to_vortex; -use vortex_expr::{and, VortexExpr}; - -use super::cache::FileLayoutCache; -use super::config::{ConfigProjection, FileScanConfigExt}; -use crate::persistent::opener::VortexFileOpener; - -#[derive(Debug, Clone)] -pub(crate) struct VortexExec { - file_scan_config: FileScanConfig, - metrics: ExecutionPlanMetricsSet, - predicate: Option>, - plan_properties: PlanProperties, - projected_statistics: Statistics, - ctx: ContextRef, - initial_read_cache: FileLayoutCache, - projected_arrow_schema: SchemaRef, - projection: Arc, -} - -impl VortexExec { - pub fn try_new( - file_scan_config: FileScanConfig, - metrics: ExecutionPlanMetricsSet, - predicate: Option>, - ctx: ContextRef, - initial_read_cache: FileLayoutCache, - ) -> DFResult { - let ConfigProjection { - arrow_schema, - constraints: _constraints, - mut statistics, - orderings, - projection_expr, - } = file_scan_config.project_for_vortex(); - - let predicate = make_vortex_predicate(predicate); - - // We must take care to report in-exact statistics if we have any form of filter - // push-down. 
- if predicate.is_some() { - statistics = statistics.to_inexact(); - } - - let plan_properties = PlanProperties::new( - EquivalenceProperties::new_with_orderings(arrow_schema.clone(), &orderings), - Partitioning::UnknownPartitioning(file_scan_config.file_groups.len()), - EmissionType::Incremental, - Boundedness::Bounded, - ); - - Ok(Self { - file_scan_config, - metrics, - predicate, - plan_properties, - ctx, - initial_read_cache, - projected_statistics: statistics, - projected_arrow_schema: arrow_schema, - projection: projection_expr, - }) - } - - pub(crate) fn into_arc(self) -> Arc { - Arc::new(self) as _ - } -} - -impl DisplayAs for VortexExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "VortexExec: ")?; - self.file_scan_config.fmt_as(t, f)?; - - Ok(()) - } -} - -impl ExecutionPlan for VortexExec { - fn name(&self) -> &str { - "VortexExec" - } - - fn as_any(&self) -> &dyn std::any::Any { - self - } - - fn properties(&self) -> &PlanProperties { - &self.plan_properties - } - - fn children(&self) -> Vec<&Arc> { - vec![] - } - - fn with_new_children( - self: Arc, - _children: Vec>, - ) -> DFResult> { - Ok(self) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - log::debug!("Executing partition {partition}"); - let object_store = context - .runtime_env() - .object_store(&self.file_scan_config.object_store_url)?; - let (scheme, _) = ObjectStoreScheme::parse(self.file_scan_config.object_store_url.as_ref()) - .map_err(object_store::Error::from)?; - - let opener = VortexFileOpener::new( - self.ctx.clone(), - scheme, - object_store, - self.projection.clone(), - self.predicate.clone(), - self.initial_read_cache.clone(), - self.projected_arrow_schema.clone(), - context.session_config().batch_size(), - )?; - let stream = FileStream::new(&self.file_scan_config, partition, opener, &self.metrics)?; - - Ok(Box::pin(stream)) - } - - fn statistics(&self) -> DFResult { - 
Ok(self.projected_statistics.clone()) - } - - fn metrics(&self) -> Option { - Some(self.metrics.clone_inner()) - } - - fn repartitioned( - &self, - target_partitions: usize, - _config: &ConfigOptions, - ) -> DFResult>> { - let file_groups = self.file_scan_config.file_groups.clone(); - let repartitioned_file_groups = repartition_by_size(file_groups, target_partitions); - let mut new_plan = self.clone(); - - let num_partitions = repartitioned_file_groups.len(); - - log::debug!("VortexExec repartitioned to {num_partitions} partitions"); - new_plan.file_scan_config.file_groups = repartitioned_file_groups; - new_plan.plan_properties.partitioning = Partitioning::UnknownPartitioning(num_partitions); - - Ok(Some(Arc::new(new_plan))) - } -} - -fn make_vortex_predicate(predicate: Option>) -> Option> { - predicate - .as_ref() - // If we cannot convert an expr to a vortex expr, we run no filter, since datafusion - // will rerun the filter expression anyway. - .and_then(|expr| { - // This splits expressions into conjunctions and converts them to vortex expressions. - // Any inconvertible expressions are dropped since true /\ a == a. 
- datafusion_physical_expr::split_conjunction(expr) - .into_iter() - .filter_map(|e| convert_expr_to_vortex(e.clone()).ok()) - .reduce(and) - }) -} - -pub(crate) fn repartition_by_size( - file_groups: Vec>, - desired_partitions: usize, -) -> Vec> { - let all_files = file_groups.into_iter().concat(); - let total_file_count = all_files.len(); - let total_size = all_files.iter().map(|f| f.object_meta.size).sum::(); - let target_partition_size = total_size / (desired_partitions + 1); - - let mut partitions = Vec::with_capacity(desired_partitions); - - let mut curr_partition_size = 0; - let mut curr_partition = Vec::default(); - - for file in all_files.into_iter() { - curr_partition_size += file.object_meta.size; - curr_partition.push(file); - - if curr_partition_size >= target_partition_size { - curr_partition_size = 0; - partitions.push(std::mem::take(&mut curr_partition)); - } - } - - // If we we're still missing the last partition - if !curr_partition.is_empty() && partitions.len() != desired_partitions { - partitions.push(std::mem::take(&mut curr_partition)); - // If we already have enough partitions - } else if !curr_partition.is_empty() { - for (idx, file) in curr_partition.into_iter().enumerate() { - let new_part_idx = idx % partitions.len(); - partitions[new_part_idx].push(file); - } - } - - // Assert that we have the correct number of partitions and that the total number of files is right - assert_eq!( - partitions.len(), - usize::min(desired_partitions, total_file_count) - ); - assert_eq!(total_file_count, partitions.iter().flatten().count()); - - partitions -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn basic_repartition_test() { - let file_groups = vec![vec![ - PartitionedFile::new("a", 100), - PartitionedFile::new("b", 25), - PartitionedFile::new("c", 25), - PartitionedFile::new("d", 25), - PartitionedFile::new("e", 50), - ]]; - - repartition_by_size(file_groups, 2); - - let file_groups = vec![(0..100) - .map(|idx| 
PartitionedFile::new(format!("{idx}"), idx)) - .collect()]; - - repartition_by_size(file_groups, 16); - } -} diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index 976ac13494..127a1823a1 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -18,7 +18,6 @@ use datafusion_expr::dml::InsertOp; use datafusion_expr::Expr; use datafusion_physical_expr::{LexRequirement, PhysicalExpr}; use datafusion_physical_plan::insert::DataSinkExec; -use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion_physical_plan::ExecutionPlan; use futures::{stream, StreamExt as _, TryStreamExt as _}; use itertools::Itertools; @@ -28,11 +27,12 @@ use vortex_array::stats::{Stat, StatsSet}; use vortex_array::{stats, ContextRef}; use vortex_dtype::DType; use vortex_error::{vortex_err, VortexExpect, VortexResult}; +use vortex_expr::datafusion::convert_expr_to_vortex; +use vortex_expr::{and, Identity, VortexExpr}; use vortex_file::{VortexOpenOptions, VORTEX_FILE_EXTENSION}; use vortex_io::ObjectStoreReadAt; use super::cache::FileLayoutCache; -use super::execution::VortexExec; use super::sink::VortexSink; use super::VortexSource; use crate::can_be_pushed_down; @@ -155,7 +155,10 @@ impl FileFormat for VortexFormat { } fn file_source(&self) -> Arc { - Arc::new(VortexSource::default()) + Arc::new(VortexSource::new( + self.context.clone(), + self.file_layout_cache.clone(), + )) } async fn infer_schema( @@ -283,8 +286,6 @@ impl FileFormat for VortexFormat { file_scan_config: FileScanConfig, filters: Option<&Arc>, ) -> DFResult> { - let metrics = ExecutionPlanMetricsSet::new(); - if file_scan_config .file_groups .iter() @@ -302,16 +303,11 @@ impl FileFormat for VortexFormat { return not_impl_err!("Hive style partitioning isn't implemented yet for Vortex"); } - let exec = VortexExec::try_new( - file_scan_config, - metrics, - filters.cloned(), - self.context.clone(), - 
self.file_layout_cache.clone(), - )? - .into_arc(); + let predicate = make_vortex_predicate(filters).unwrap_or(Identity::new_expr()); + let mut source = VortexSource::new(self.context.clone(), self.file_layout_cache.clone()); + source = source.with_predicate(predicate); - Ok(exec) + Ok(file_scan_config.with_source(Arc::new(source)).build()) } async fn create_writer_physical_plan( @@ -353,6 +349,22 @@ impl FileFormat for VortexFormat { } } +pub(crate) fn make_vortex_predicate( + predicate: Option<&Arc>, +) -> Option> { + predicate + // If we cannot convert an expr to a vortex expr, we run no filter, since datafusion + // will rerun the filter expression anyway. + .and_then(|expr| { + // This splits expressions into conjunctions and converts them to vortex expressions. + // Any inconvertible expressions are dropped since true /\ a == a. + datafusion_physical_expr::split_conjunction(expr) + .into_iter() + .filter_map(|e| convert_expr_to_vortex(e.clone()).ok()) + .reduce(and) + }) +} + #[cfg(test)] mod tests { use datafusion::execution::SessionStateBuilder; diff --git a/vortex-datafusion/src/persistent/mod.rs b/vortex-datafusion/src/persistent/mod.rs index 0f78b3d1f3..4509b1b2b8 100644 --- a/vortex-datafusion/src/persistent/mod.rs +++ b/vortex-datafusion/src/persistent/mod.rs @@ -1,7 +1,6 @@ //! Persistent implementation of a Vortex table provider. 
mod cache; mod config; -mod execution; mod format; mod opener; mod sink; diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 82336b3f83..9c431cd610 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -71,7 +71,7 @@ impl FileOpener for VortexFileOpener { Ok(async move { let vxf = VortexOpenOptions::file(read_at) - .with_ctx(ctx.clone()) + .with_ctx(ctx) .with_file_layout( file_layout_cache .try_get(&file_meta.object_meta, object_store) @@ -82,8 +82,8 @@ impl FileOpener for VortexFileOpener { Ok(vxf .scan() - .with_projection(projection.clone()) - .with_some_filter(filter.clone()) + .with_projection(projection) + .with_some_filter(filter) .with_canonicalize(true) // DataFusion likes ~8k row batches. Ideally we would respect the config, // but at the moment our scanner has too much overhead to process small diff --git a/vortex-datafusion/src/persistent/sink.rs b/vortex-datafusion/src/persistent/sink.rs index 3aa607657f..20cc21c1de 100644 --- a/vortex-datafusion/src/persistent/sink.rs +++ b/vortex-datafusion/src/persistent/sink.rs @@ -115,6 +115,7 @@ impl DataSink for VortexSink { mod tests { use std::sync::Arc; + use datafusion::datasource::DefaultTableSource; use datafusion::execution::SessionStateBuilder; use datafusion::prelude::SessionContext; use datafusion_expr::{Expr, LogicalPlan, LogicalPlanBuilder, Values}; @@ -122,6 +123,7 @@ mod tests { use crate::persistent::{register_vortex_format_factory, VortexFormatFactory}; + // TODO(adam): Seems like this now panics due to a Vortex issue #[tokio::test] #[should_panic] // This test is not working due to async fn test_insert_into() { @@ -155,10 +157,12 @@ mod tests { ]], }; + let tbl_provider = session.table_provider("my_tbl").await.unwrap(); + let logical_plan = LogicalPlanBuilder::insert_into( LogicalPlan::Values(values.clone()), "my_tbl", - my_tbl.schema().as_arrow(), + 
Arc::new(DefaultTableSource::new(tbl_provider)), datafusion_expr::dml::InsertOp::Append, ) .unwrap() diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index 8f679c4e6a..a67d748af8 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -1,22 +1,53 @@ use std::any::Any; use std::sync::Arc; +use arrow_schema::SchemaRef; use datafusion::datasource::data_source::FileSource; +use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::{FileOpener, FileScanConfig}; -use datafusion_common::{Result as DFResult, Statistics}; +use datafusion_common::{internal_datafusion_err, Result as DFResult, Statistics}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; +use itertools::Itertools as _; use object_store::{ObjectStore, ObjectStoreScheme}; use vortex_array::ContextRef; +use vortex_expr::{Identity, VortexExpr}; +use vortex_file::VORTEX_FILE_EXTENSION; +use super::cache::FileLayoutCache; +use super::config::{ConfigProjection, FileScanConfigExt}; use super::opener::VortexFileOpener; -use crate::persistent::execution::repartition_by_size; -#[derive(Default, Clone)] +#[derive(Clone)] pub struct VortexSource { - batch_size: Option, - projected_statistics: Option, - context: ContextRef, - metrics: ExecutionPlanMetricsSet, + pub(crate) ctx: ContextRef, + pub(crate) initial_read_cache: FileLayoutCache, + pub(crate) predicate: Option>, + pub(crate) projection: Option>, + pub(crate) batch_size: Option, + pub(crate) projected_statistics: Option, + pub(crate) arrow_schema: Option, + pub(crate) metrics: ExecutionPlanMetricsSet, +} + +impl VortexSource { + pub(crate) fn new(ctx: ContextRef, initial_read_cache: FileLayoutCache) -> Self { + Self { + ctx, + initial_read_cache, + projection: None, + batch_size: None, + projected_statistics: None, + arrow_schema: None, + predicate: None, + metrics: ExecutionPlanMetricsSet::default(), + } + } + 
+ pub fn with_predicate(&self, predicate: Arc) -> Self { + let mut source = self.clone(); + source.predicate = Some(predicate); + source + } } impl FileSource for VortexSource { @@ -24,20 +55,27 @@ impl FileSource for VortexSource { &self, object_store: DFResult>, base_config: &FileScanConfig, - partition: usize, + _partition: usize, ) -> DFResult> { - let (scheme, _) = ObjectStoreScheme::parse(self.file_scan_config.object_store_url.as_ref()) + let object_store = object_store?; + let (scheme, _) = ObjectStoreScheme::parse(base_config.object_store_url.as_ref()) .map_err(object_store::Error::from)?; + let Some(batch_size) = self.batch_size else { + return Err(internal_datafusion_err!( + "batch_size must be supplied to VortexSource" + )); + }; + let opener = VortexFileOpener::new( self.ctx.clone(), scheme, object_store, - self.projection.clone(), + self.projection.clone().unwrap_or(Identity::new_expr()), self.predicate.clone(), self.initial_read_cache.clone(), - self.projected_arrow_schema.clone(), - context.session_config().batch_size(), + self.arrow_schema.clone().unwrap(), + batch_size, )?; Ok(Arc::new(opener)) @@ -53,16 +91,40 @@ impl FileSource for VortexSource { Arc::new(source) } - fn with_schema(&self, schema: arrow_schema::SchemaRef) -> Arc { - todo!() + fn with_schema(&self, schema: SchemaRef) -> Arc { + // todo(adam): does this need to the same as `with_projection`? 
+ let mut source = self.clone(); + source.arrow_schema = Some(schema); + Arc::new(source) } fn with_projection(&self, config: &FileScanConfig) -> Arc { - todo!() + let ConfigProjection { + arrow_schema, + constraints: _constraints, + statistics, + orderings: _, + projection_expr, + } = config.project_for_vortex(); + + let statistics = if self.predicate.is_some() { + statistics.to_inexact() + } else { + statistics + }; + + let mut source = self.clone(); + source.projection = Some(projection_expr); + source.arrow_schema = Some(arrow_schema); + source.projected_statistics = Some(statistics); + + Arc::new(source) } fn with_statistics(&self, statistics: Statistics) -> Arc { - todo!() + let mut source = self.clone(); + source.projected_statistics = Some(statistics); + Arc::new(source) } fn metrics(&self) -> &ExecutionPlanMetricsSet { @@ -70,11 +132,20 @@ impl FileSource for VortexSource { } fn statistics(&self) -> DFResult { - todo!() + let statistics = self + .projected_statistics + .clone() + .expect("projected_statistics must be set"); + + if self.predicate.is_some() { + Ok(statistics.to_inexact()) + } else { + Ok(statistics) + } } fn file_type(&self) -> &str { - "vortex" + VORTEX_FILE_EXTENSION } fn supports_repartition(&self, config: &FileScanConfig) -> bool { @@ -101,3 +172,72 @@ impl FileSource for VortexSource { Ok(Some(new_config)) } } + +pub(crate) fn repartition_by_size( + file_groups: Vec>, + desired_partitions: usize, +) -> Vec> { + let all_files = file_groups.into_iter().concat(); + let total_file_count = all_files.len(); + let total_size = all_files.iter().map(|f| f.object_meta.size).sum::(); + let target_partition_size = total_size / (desired_partitions + 1); + + let mut partitions = Vec::with_capacity(desired_partitions); + + let mut curr_partition_size = 0; + let mut curr_partition = Vec::default(); + + for file in all_files.into_iter() { + curr_partition_size += file.object_meta.size; + curr_partition.push(file); + + if curr_partition_size >= 
target_partition_size { + curr_partition_size = 0; + partitions.push(std::mem::take(&mut curr_partition)); + } + } + + // If we we're still missing the last partition + if !curr_partition.is_empty() && partitions.len() != desired_partitions { + partitions.push(std::mem::take(&mut curr_partition)); + // If we already have enough partitions + } else if !curr_partition.is_empty() { + for (idx, file) in curr_partition.into_iter().enumerate() { + let new_part_idx = idx % partitions.len(); + partitions[new_part_idx].push(file); + } + } + + // Assert that we have the correct number of partitions and that the total number of files is right + assert_eq!( + partitions.len(), + usize::min(desired_partitions, total_file_count) + ); + assert_eq!(total_file_count, partitions.iter().flatten().count()); + + partitions +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn basic_repartition_test() { + let file_groups = vec![vec![ + PartitionedFile::new("a", 100), + PartitionedFile::new("b", 25), + PartitionedFile::new("c", 25), + PartitionedFile::new("d", 25), + PartitionedFile::new("e", 50), + ]]; + + repartition_by_size(file_groups, 2); + + let file_groups = vec![(0..100) + .map(|idx| PartitionedFile::new(format!("{idx}"), idx)) + .collect()]; + + repartition_by_size(file_groups, 16); + } +} From 9820224e134f5beb4c534c31139956eec455bc6f Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 19 Feb 2025 12:02:18 +0000 Subject: [PATCH 08/21] . 
--- vortex-datafusion/Cargo.toml | 2 +- vortex-datafusion/src/lib.rs | 2 +- vortex-datafusion/src/persistent/format.rs | 2 +- vortex-datafusion/src/persistent/sink.rs | 27 ++++++++++++++++++++-- vortex-datafusion/src/persistent/source.rs | 13 ++++++++--- 5 files changed, 38 insertions(+), 8 deletions(-) diff --git a/vortex-datafusion/Cargo.toml b/vortex-datafusion/Cargo.toml index 7c1ec24cb6..a7dcac4328 100644 --- a/vortex-datafusion/Cargo.toml +++ b/vortex-datafusion/Cargo.toml @@ -58,4 +58,4 @@ tokio = { workspace = true, features = ["test-util", "rt-multi-thread", "fs"] } url = { workspace = true } [lints] -# workspace = true +workspace = true diff --git a/vortex-datafusion/src/lib.rs b/vortex-datafusion/src/lib.rs index b072ef09ad..8695a27dab 100644 --- a/vortex-datafusion/src/lib.rs +++ b/vortex-datafusion/src/lib.rs @@ -1,5 +1,5 @@ //! Connectors to enable DataFusion to read Vortex data. -// #![deny(missing_docs)] +#![deny(missing_docs)] #![allow(clippy::cast_possible_truncation)] use std::sync::Arc; diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index 127a1823a1..963d586a54 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -303,7 +303,7 @@ impl FileFormat for VortexFormat { return not_impl_err!("Hive style partitioning isn't implemented yet for Vortex"); } - let predicate = make_vortex_predicate(filters).unwrap_or(Identity::new_expr()); + let predicate = make_vortex_predicate(filters).unwrap_or_else(Identity::new_expr); let mut source = VortexSource::new(self.context.clone(), self.file_layout_cache.clone()); source = source.with_predicate(predicate); diff --git a/vortex-datafusion/src/persistent/sink.rs b/vortex-datafusion/src/persistent/sink.rs index 20cc21c1de..42024accaa 100644 --- a/vortex-datafusion/src/persistent/sink.rs +++ b/vortex-datafusion/src/persistent/sink.rs @@ -4,12 +4,15 @@ use std::sync::Arc; use arrow_schema::SchemaRef; use 
async_trait::async_trait; -use datafusion::datasource::physical_plan::FileSinkConfig; +use datafusion::common::runtime::SpawnedTask; +use datafusion::datasource::file_format::write::demux::DemuxedStreamReceiver; +use datafusion::datasource::physical_plan::{FileSink, FileSinkConfig}; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_plan::insert::DataSink; use datafusion_physical_plan::metrics::MetricsSet; use datafusion_physical_plan::{DisplayAs, DisplayFormatType}; use futures::{StreamExt, TryStreamExt}; +use object_store::ObjectStore; use rand::distributions::{Alphanumeric, DistString}; use vortex_array::arrow::FromArrowType; use vortex_array::stream::ArrayStreamAdapter; @@ -65,7 +68,10 @@ impl DataSink for VortexSink { &self, data: SendableRecordBatchStream, context: &Arc, - ) -> datafusion_common::error::Result { + ) -> datafusion_common::Result { + // TODO: Once `FileSink` is fully implemented, this function should just be: + // FileSink::write_all(self, data, context).await + let object_store = context .runtime_env() .object_store(&self.config.object_store_url)?; @@ -111,6 +117,23 @@ impl DataSink for VortexSink { } } +#[async_trait] +impl FileSink for VortexSink { + fn config(&self) -> &FileSinkConfig { + &self.config + } + + async fn spawn_writer_tasks_and_join( + &self, + _context: &Arc, + _demux_task: SpawnedTask>, + _file_stream_rx: DemuxedStreamReceiver, + _object_store: Arc, + ) -> datafusion_common::Result { + unimplemented!() + } +} + #[cfg(test)] mod tests { use std::sync::Arc; diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index a67d748af8..ecad6315ae 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -10,6 +10,7 @@ use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use itertools::Itertools as _; use object_store::{ObjectStore, ObjectStoreScheme}; use vortex_array::ContextRef; +use 
vortex_error::VortexExpect as _; use vortex_expr::{Identity, VortexExpr}; use vortex_file::VORTEX_FILE_EXTENSION; @@ -17,6 +18,9 @@ use super::cache::FileLayoutCache; use super::config::{ConfigProjection, FileScanConfigExt}; use super::opener::VortexFileOpener; +/// A config for [`VortexFileOpener`]. Used to create [`DataSourceExec`] based physical plans. +/// +/// [`DataSourceExec`]: datafusion_physical_plan::source::DataSourceExec #[derive(Clone)] pub struct VortexSource { pub(crate) ctx: ContextRef, @@ -43,6 +47,7 @@ impl VortexSource { } } + /// Sets a [`VortexExpr`] as a predicate pub fn with_predicate(&self, predicate: Arc) -> Self { let mut source = self.clone(); source.predicate = Some(predicate); @@ -71,10 +76,12 @@ impl FileSource for VortexSource { self.ctx.clone(), scheme, object_store, - self.projection.clone().unwrap_or(Identity::new_expr()), + self.projection.clone().unwrap_or_else(Identity::new_expr), self.predicate.clone(), self.initial_read_cache.clone(), - self.arrow_schema.clone().unwrap(), + self.arrow_schema + .clone() + .vortex_expect("We should have a schema here"), batch_size, )?; @@ -135,7 +142,7 @@ impl FileSource for VortexSource { let statistics = self .projected_statistics .clone() - .expect("projected_statistics must be set"); + .vortex_expect("projected_statistics must be set"); if self.predicate.is_some() { Ok(statistics.to_inexact()) From 22079d3a6aea8b225085e02129553de5ab9a0c53 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 19 Feb 2025 12:20:15 +0000 Subject: [PATCH 09/21] thing --- vortex-datafusion/src/persistent/format.rs | 2 +- vortex-datafusion/src/persistent/mod.rs | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index 963d586a54..29f32beb5f 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -34,7 +34,7 @@ use vortex_io::ObjectStoreReadAt; use 
super::cache::FileLayoutCache; use super::sink::VortexSink; -use super::VortexSource; +use super::source::VortexSource; use crate::can_be_pushed_down; use crate::converter::{bound_to_datafusion, directional_bound_to_df_precision}; diff --git a/vortex-datafusion/src/persistent/mod.rs b/vortex-datafusion/src/persistent/mod.rs index 4509b1b2b8..1582b25ab7 100644 --- a/vortex-datafusion/src/persistent/mod.rs +++ b/vortex-datafusion/src/persistent/mod.rs @@ -7,7 +7,6 @@ mod sink; mod source; pub use format::{VortexFormat, VortexFormatFactory, VortexFormatOptions}; -pub use source::VortexSource; #[cfg(test)] /// Utility function to register Vortex with a [`SessionStateBuilder`] From b68d85d0c5d4942f112e3e9007c2d4d4839573f2 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 19 Feb 2025 12:45:27 +0000 Subject: [PATCH 10/21] datafusion bumped some dependenceis, and it seems like we should too --- Cargo.lock | 12 ++++++------ Cargo.toml | 24 ++++++++++++------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aad4972bb9..8eb2f21ae4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2242,9 +2242,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.7" +version = "0.4.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccae279728d634d083c00f6099cb58f01cc99c145b84b8be2f6c74618d79922e" +checksum = "5017294ff4bb30944501348f6f8e42e6ad28f42c8bbef7a74029aff064a4e3c2" dependencies = [ "atomic-waker", "bytes", @@ -3368,9 +3368,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.1.0" +version = "54.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a01a0efa30bbd601ae85b375c728efdb211ade54390281628a7b16708beb235" +checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb" dependencies = [ "ahash", "arrow-array", @@ -3851,9 +3851,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.9" +version = "0.5.10" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c40286217b4ba3a71d644d752e6a0b71f13f1b6a2c5311acfcbe0c2418ed904" +checksum = "e46f3055866785f6b92bc6164b76be02ca8f2eb4b002c0354b28cf4c119e5944" dependencies = [ "cfg_aliases", "libc", diff --git a/Cargo.toml b/Cargo.toml index b3552ad562..419aadfc54 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,20 +51,20 @@ categories = ["database-implementations", "data-structures", "compression"] anyhow = "1.0.95" arbitrary = "1.3.2" arrayref = "0.3.7" -arrow = "54.1" -arrow-arith = "54.1" -arrow-array = "54.1" -arrow-buffer = "54.1" -arrow-cast = "54.1" -arrow-ord = "54.1" -arrow-schema = "54.1" -arrow-select = "54.1" -arrow-string = "54.1" +arrow = "54.2" +arrow-arith = "54.2" +arrow-array = "54.2" +arrow-buffer = "54.2" +arrow-cast = "54.2" +arrow-ord = "54.2" +arrow-schema = "54.2" +arrow-select = "54.2" +arrow-string = "54.2" async-once-cell = "0.5.4" async-trait = "0.1.86" backtrace = "0.3.74" bit-vec = "0.8.0" -bytes = "1.9" +bytes = "1.10" bzip2 = "0.5.0" cfg-if = "1" chrono = "0.4.38" @@ -110,7 +110,7 @@ num-traits = "0.2.19" num_enum = "0.7.2" object_store = "0.11.0" oneshot = "0.1.10" -parquet = "54.1" +parquet = "54.2" paste = "1.0.15" pin-project = "1.1.5" pin-project-lite = "0.2.15" @@ -150,7 +150,7 @@ tar = "0.4" tempfile = "3" tikv-jemallocator = "0.6" thiserror = "2.0.3" -tokio = "1.36" +tokio = "1.43" tracing = { version = "0.1.41" } tracing-chrome = "0.7.2" tracing-futures = "0.2.5" From 2b1bd319b6c60d06e0cbf8f6784f2fe861169c0a Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 19 Feb 2025 13:09:18 +0000 Subject: [PATCH 11/21] demuxed write --- Cargo.lock | 12 ++++ Cargo.toml | 1 + vortex-datafusion/Cargo.toml | 6 +- vortex-datafusion/src/persistent/sink.rs | 85 ++++++++++-------------- 4 files changed, 53 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8eb2f21ae4..e5d6238e81 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4942,6 +4942,17 @@ dependencies = [ 
"tokio", ] +[[package]] +name = "tokio-stream" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.13" @@ -5360,6 +5371,7 @@ dependencies = [ "rand", "tempfile", "tokio", + "tokio-stream", "tracing", "tracing-futures", "url", diff --git a/Cargo.toml b/Cargo.toml index 419aadfc54..2e45e9f8eb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -151,6 +151,7 @@ tempfile = "3" tikv-jemallocator = "0.6" thiserror = "2.0.3" tokio = "1.43" +tokio-stream = "0.1.17" tracing = { version = "0.1.41" } tracing-chrome = "0.7.2" tracing-futures = "0.2.5" diff --git a/vortex-datafusion/Cargo.toml b/vortex-datafusion/Cargo.toml index a7dcac4328..c60b6b46df 100644 --- a/vortex-datafusion/Cargo.toml +++ b/vortex-datafusion/Cargo.toml @@ -36,6 +36,7 @@ object_store = { workspace = true } pin-project = { workspace = true } rand = { workspace = true } tokio = { workspace = true, features = ["rt-multi-thread", "fs"] } +tokio-stream = { workspace = true } tracing = { workspace = true, optional = true } tracing-futures = { workspace = true, features = [ "futures-03", @@ -48,8 +49,6 @@ vortex-expr = { workspace = true, features = ["datafusion"] } vortex-file = { workspace = true, features = ["object_store", "tokio"] } vortex-io = { workspace = true, features = ["object_store", "tokio"] } vortex-layout = { workspace = true, features = ["tokio"] } -[features] -tracing = ["dep:tracing", "dep:tracing-futures", "vortex-io/tracing"] [dev-dependencies] anyhow = { workspace = true } @@ -57,5 +56,8 @@ tempfile = { workspace = true } tokio = { workspace = true, features = ["test-util", "rt-multi-thread", "fs"] } url = { workspace = true } +[features] +tracing = ["dep:tracing", "dep:tracing-futures", "vortex-io/tracing"] + [lints] workspace = true diff --git 
a/vortex-datafusion/src/persistent/sink.rs b/vortex-datafusion/src/persistent/sink.rs index 42024accaa..bb0059aa93 100644 --- a/vortex-datafusion/src/persistent/sink.rs +++ b/vortex-datafusion/src/persistent/sink.rs @@ -7,6 +7,7 @@ use async_trait::async_trait; use datafusion::common::runtime::SpawnedTask; use datafusion::datasource::file_format::write::demux::DemuxedStreamReceiver; use datafusion::datasource::physical_plan::{FileSink, FileSinkConfig}; +use datafusion_common::DataFusionError; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_plan::insert::DataSink; use datafusion_physical_plan::metrics::MetricsSet; @@ -14,6 +15,7 @@ use datafusion_physical_plan::{DisplayAs, DisplayFormatType}; use futures::{StreamExt, TryStreamExt}; use object_store::ObjectStore; use rand::distributions::{Alphanumeric, DistString}; +use tokio_stream::wrappers::ReceiverStream; use vortex_array::arrow::FromArrowType; use vortex_array::stream::ArrayStreamAdapter; use vortex_array::Array; @@ -69,51 +71,7 @@ impl DataSink for VortexSink { data: SendableRecordBatchStream, context: &Arc, ) -> datafusion_common::Result { - // TODO: Once `FileSink` is fully implemented, this function should just be: - // FileSink::write_all(self, data, context).await - - let object_store = context - .runtime_env() - .object_store(&self.config.object_store_url)?; - - let base_output_path = &self.config.table_paths[0]; - - let single_file_output = - !base_output_path.is_collection() && base_output_path.file_extension().is_some(); - - let path = if single_file_output { - base_output_path.prefix().to_owned() - } else { - let filename = Alphanumeric.sample_string(&mut rand::thread_rng(), 16); - base_output_path - .prefix() - .child(format!("{filename}.{}", VORTEX_FILE_EXTENSION)) - }; - - let vortex_writer = ObjectStoreWriter::new(object_store, path).await?; - - // TODO(adam): This is a temporary hack - let row_counter = Arc::new(AtomicU64::new(0)); - - let dtype = 
DType::from_arrow(data.schema()); - let stream = data - .map_err(VortexError::from) - .map(|rb| rb.and_then(Array::try_from)) - .map_ok(|rb| { - row_counter.fetch_add(rb.len() as u64, Ordering::SeqCst); - rb - }); - - let stream = ArrayStreamAdapter::new(dtype, stream); - - let mut writer = VortexWriteOptions::default() - .write(vortex_writer, stream) - .await?; - - writer.flush().await?; - writer.shutdown().await?; - - Ok(row_counter.load(Ordering::SeqCst)) + FileSink::write_all(self, data, context).await } } @@ -126,11 +84,40 @@ impl FileSink for VortexSink { async fn spawn_writer_tasks_and_join( &self, _context: &Arc, - _demux_task: SpawnedTask>, - _file_stream_rx: DemuxedStreamReceiver, - _object_store: Arc, + demux_task: SpawnedTask>, + mut file_stream_rx: DemuxedStreamReceiver, + object_store: Arc, ) -> datafusion_common::Result { - unimplemented!() + // This is a hack + let row_counter = Arc::new(AtomicU64::new(0)); + + // TODO(adamg): + // 1. We only write only file at a time + // 2. We can probably be better at signaling how much memory we're consuming (potentially when reading too), see ParquetSink::spawn_writer_tasks_and_join. 
+ while let Some((path, rx)) = file_stream_rx.recv().await { + let writer = ObjectStoreWriter::new(object_store.clone(), path).await?; + + let stream = ReceiverStream::new(rx).map(|rb| { + row_counter.fetch_add(rb.num_rows() as u64, Ordering::Relaxed); + Array::try_from(rb) + }); + let dtype = DType::from_arrow(self.config.output_schema.as_ref()); + let stream_adapter = ArrayStreamAdapter::new(dtype, stream); + + let mut writer = VortexWriteOptions::default() + .write(writer, stream_adapter) + .await?; + + writer.flush().await?; + writer.shutdown().await?; + } + + demux_task + .join_unwind() + .await + .map_err(DataFusionError::ExecutionJoin)??; + + Ok(row_counter.load(Ordering::SeqCst)) } } From 58a7437d80865b215d8ea268bd143af0998928cc Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 19 Feb 2025 13:15:55 +0000 Subject: [PATCH 12/21] . --- vortex-datafusion/src/persistent/sink.rs | 44 ++++++++++++++++++++---- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/vortex-datafusion/src/persistent/sink.rs b/vortex-datafusion/src/persistent/sink.rs index bb0059aa93..737c779f2e 100644 --- a/vortex-datafusion/src/persistent/sink.rs +++ b/vortex-datafusion/src/persistent/sink.rs @@ -12,16 +12,14 @@ use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_physical_plan::insert::DataSink; use datafusion_physical_plan::metrics::MetricsSet; use datafusion_physical_plan::{DisplayAs, DisplayFormatType}; -use futures::{StreamExt, TryStreamExt}; +use futures::StreamExt; use object_store::ObjectStore; -use rand::distributions::{Alphanumeric, DistString}; use tokio_stream::wrappers::ReceiverStream; use vortex_array::arrow::FromArrowType; use vortex_array::stream::ArrayStreamAdapter; use vortex_array::Array; use vortex_dtype::DType; -use vortex_error::VortexError; -use vortex_file::{VortexWriteOptions, VORTEX_FILE_EXTENSION}; +use vortex_file::VortexWriteOptions; use vortex_io::{ObjectStoreWriter, VortexWrite}; pub struct VortexSink { @@ 
-133,6 +131,41 @@ mod tests { use crate::persistent::{register_vortex_format_factory, VortexFormatFactory}; + #[tokio::test] + async fn insert_into() { + let dir = TempDir::new().unwrap(); + + let factory = VortexFormatFactory::default_config(); + let mut session_state_builder = SessionStateBuilder::new().with_default_features(); + register_vortex_format_factory(factory, &mut session_state_builder); + let session = SessionContext::new_with_state(session_state_builder.build()); + + session + .sql(&format!( + "CREATE EXTERNAL TABLE my_tbl \ + (c1 VARCHAR NOT NULL, c2 INT NOT NULL) \ + STORED AS vortex + LOCATION '{}/*';", + dir.path().to_str().unwrap() + )) + .await + .unwrap(); + + session + .sql("INSERT INTO my_tbl VALUES ('hello', 42::INT);") + .await + .unwrap() + .collect() + .await + .unwrap(); + + let my_tbl = session.table("my_tbl").await.unwrap(); + + my_tbl.clone().show().await.unwrap(); + + assert_eq!(my_tbl.count().await.unwrap(), 1); + } + // TODO(adam): Seems like this now panics due to a Vortex issue #[tokio::test] #[should_panic] // This test is not working due to @@ -144,7 +177,7 @@ mod tests { register_vortex_format_factory(factory, &mut session_state_builder); let session = SessionContext::new_with_state(session_state_builder.build()); - let df = session + session .sql(&format!( "CREATE EXTERNAL TABLE my_tbl \ (c1 VARCHAR NOT NULL, c2 INT NOT NULL) \ @@ -155,7 +188,6 @@ mod tests { .await .unwrap(); - assert_eq!(df.clone().count().await.unwrap(), 0); let my_tbl = session.table("my_tbl").await.unwrap(); // Its valuable to have two insert code paths because they actually behave slightly differently From 487656d6b4919859fb9c59666389457b46ce4fae Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 19 Feb 2025 13:37:02 +0000 Subject: [PATCH 13/21] . 
--- vortex-datafusion/src/persistent/format.rs | 8 +++-- vortex-datafusion/src/persistent/sink.rs | 39 +--------------------- vortex-layout/src/scan/filter.rs | 1 + 3 files changed, 7 insertions(+), 41 deletions(-) diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index 29f32beb5f..f2ed5345ff 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -28,7 +28,7 @@ use vortex_array::{stats, ContextRef}; use vortex_dtype::DType; use vortex_error::{vortex_err, VortexExpect, VortexResult}; use vortex_expr::datafusion::convert_expr_to_vortex; -use vortex_expr::{and, Identity, VortexExpr}; +use vortex_expr::{and, VortexExpr}; use vortex_file::{VortexOpenOptions, VORTEX_FILE_EXTENSION}; use vortex_io::ObjectStoreReadAt; @@ -303,9 +303,11 @@ impl FileFormat for VortexFormat { return not_impl_err!("Hive style partitioning isn't implemented yet for Vortex"); } - let predicate = make_vortex_predicate(filters).unwrap_or_else(Identity::new_expr); let mut source = VortexSource::new(self.context.clone(), self.file_layout_cache.clone()); - source = source.with_predicate(predicate); + + if let Some(predicate) = make_vortex_predicate(filters) { + source = source.with_predicate(predicate); + } Ok(file_scan_config.with_source(Arc::new(source)).build()) } diff --git a/vortex-datafusion/src/persistent/sink.rs b/vortex-datafusion/src/persistent/sink.rs index 737c779f2e..94251f759c 100644 --- a/vortex-datafusion/src/persistent/sink.rs +++ b/vortex-datafusion/src/persistent/sink.rs @@ -132,43 +132,6 @@ mod tests { use crate::persistent::{register_vortex_format_factory, VortexFormatFactory}; #[tokio::test] - async fn insert_into() { - let dir = TempDir::new().unwrap(); - - let factory = VortexFormatFactory::default_config(); - let mut session_state_builder = SessionStateBuilder::new().with_default_features(); - register_vortex_format_factory(factory, &mut session_state_builder); - let 
session = SessionContext::new_with_state(session_state_builder.build()); - - session - .sql(&format!( - "CREATE EXTERNAL TABLE my_tbl \ - (c1 VARCHAR NOT NULL, c2 INT NOT NULL) \ - STORED AS vortex - LOCATION '{}/*';", - dir.path().to_str().unwrap() - )) - .await - .unwrap(); - - session - .sql("INSERT INTO my_tbl VALUES ('hello', 42::INT);") - .await - .unwrap() - .collect() - .await - .unwrap(); - - let my_tbl = session.table("my_tbl").await.unwrap(); - - my_tbl.clone().show().await.unwrap(); - - assert_eq!(my_tbl.count().await.unwrap(), 1); - } - - // TODO(adam): Seems like this now panics due to a Vortex issue - #[tokio::test] - #[should_panic] // This test is not working due to async fn test_insert_into() { let dir = TempDir::new().unwrap(); @@ -220,7 +183,7 @@ mod tests { .unwrap(); session - .sql("INSERT INTO my_tbl VALUES ('hello', 42::INT);") + .sql("INSERT INTO my_tbl VALUES ('world', 24);") .await .unwrap() .collect() diff --git a/vortex-layout/src/scan/filter.rs b/vortex-layout/src/scan/filter.rs index efb42ec6bb..f9b46644db 100644 --- a/vortex-layout/src/scan/filter.rs +++ b/vortex-layout/src/scan/filter.rs @@ -200,6 +200,7 @@ pub struct FilterEvaluation { impl FilterEvaluation { pub async fn evaluate(&mut self, evaluator: E) -> VortexResult { + dbg!(self.filter_expr.conjuncts.as_slice()); // First, we run all conjuncts through the evaluators pruning function. This helps trim // down the mask based on cheap statistics. 
let pruning_masks = try_join_all(self.filter_expr.conjuncts.iter().map(|expr| { From bf1dcc914a7683670a05c4485631535063258072 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Wed, 19 Feb 2025 13:55:15 +0000 Subject: [PATCH 14/21] cleanup debug --- vortex-layout/src/scan/filter.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/vortex-layout/src/scan/filter.rs b/vortex-layout/src/scan/filter.rs index f9b46644db..efb42ec6bb 100644 --- a/vortex-layout/src/scan/filter.rs +++ b/vortex-layout/src/scan/filter.rs @@ -200,7 +200,6 @@ pub struct FilterEvaluation { impl FilterEvaluation { pub async fn evaluate(&mut self, evaluator: E) -> VortexResult { - dbg!(self.filter_expr.conjuncts.as_slice()); // First, we run all conjuncts through the evaluators pruning function. This helps trim // down the mask based on cheap statistics. let pruning_masks = try_join_all(self.filter_expr.conjuncts.iter().map(|expr| { From cd421861c88b0e3086224364aa6c99fb2ac5f388 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Thu, 20 Feb 2025 19:10:07 +0000 Subject: [PATCH 15/21] metrics conflict --- vortex-datafusion/src/persistent/format.rs | 8 +++++++- vortex-datafusion/src/persistent/metrics.rs | 8 ++++---- vortex-datafusion/src/persistent/source.rs | 20 +++++++++++++++----- 3 files changed, 26 insertions(+), 10 deletions(-) diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index 3c5a3da7b0..ddb8f69697 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -33,6 +33,7 @@ use vortex_file::{VortexOpenOptions, VORTEX_FILE_EXTENSION}; use vortex_io::ObjectStoreReadAt; use super::cache::FileLayoutCache; +use super::metrics::VortexExecMetrics; use super::sink::VortexSink; use super::source::VortexSource; use crate::can_be_pushed_down; @@ -158,6 +159,7 @@ impl FileFormat for VortexFormat { Arc::new(VortexSource::new( self.context.clone(), self.file_layout_cache.clone(), + 
VortexExecMetrics::default(), )) } @@ -307,7 +309,11 @@ impl FileFormat for VortexFormat { return not_impl_err!("Vortex doesn't support output ordering"); } - let mut source = VortexSource::new(self.context.clone(), self.file_layout_cache.clone()); + let mut source = VortexSource::new( + self.context.clone(), + self.file_layout_cache.clone(), + VortexExecMetrics::default(), + ); if let Some(predicate) = make_vortex_predicate(filters) { source = source.with_predicate(predicate); diff --git a/vortex-datafusion/src/persistent/metrics.rs b/vortex-datafusion/src/persistent/metrics.rs index a4739a3123..ec5f7a983f 100644 --- a/vortex-datafusion/src/persistent/metrics.rs +++ b/vortex-datafusion/src/persistent/metrics.rs @@ -3,7 +3,7 @@ use std::time::Duration; use datafusion_physical_plan::metrics::{ Count, ExecutionPlanMetricsSet, Gauge, Label as DatafusionLabel, - MetricValue as DatafusionMetricValue, MetricsSet, Time, + MetricValue as DatafusionMetricValue, Time, }; use datafusion_physical_plan::Metric as DatafusionMetric; use vortex_metrics::{DefaultTags, Metric, MetricId, Tags, VortexMetrics}; @@ -21,15 +21,15 @@ impl VortexExecMetrics { self.vortex.child_with_tags(additional_tags) } - pub fn metrics_set(&self) -> MetricsSet { - let mut base = self.execution_plan.clone_inner(); + pub fn report_to_datafusion(&self) -> &ExecutionPlanMetricsSet { + let base = &self.execution_plan; for metric in self .vortex .snapshot() .iter() .flat_map(|(id, metric)| metric_to_datafusion(id, metric)) { - base.push(Arc::new(metric)); + base.register(Arc::new(metric)); } base } diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index ecad6315ae..e005afe973 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -16,6 +16,7 @@ use vortex_file::VORTEX_FILE_EXTENSION; use super::cache::FileLayoutCache; use super::config::{ConfigProjection, FileScanConfigExt}; +use 
super::metrics::{VortexExecMetrics, PARTITION_LABEL}; use super::opener::VortexFileOpener; /// A config for [`VortexFileOpener`]. Used to create [`DataSourceExec`] based physical plans. @@ -30,20 +31,24 @@ pub struct VortexSource { pub(crate) batch_size: Option, pub(crate) projected_statistics: Option, pub(crate) arrow_schema: Option, - pub(crate) metrics: ExecutionPlanMetricsSet, + pub(crate) metrics: VortexExecMetrics, } impl VortexSource { - pub(crate) fn new(ctx: ContextRef, initial_read_cache: FileLayoutCache) -> Self { + pub(crate) fn new( + ctx: ContextRef, + initial_read_cache: FileLayoutCache, + metrics: VortexExecMetrics, + ) -> Self { Self { ctx, initial_read_cache, + metrics, projection: None, batch_size: None, projected_statistics: None, arrow_schema: None, predicate: None, - metrics: ExecutionPlanMetricsSet::default(), } } @@ -60,12 +65,16 @@ impl FileSource for VortexSource { &self, object_store: DFResult>, base_config: &FileScanConfig, - _partition: usize, + partition: usize, ) -> DFResult> { let object_store = object_store?; let (scheme, _) = ObjectStoreScheme::parse(base_config.object_store_url.as_ref()) .map_err(object_store::Error::from)?; + let partition_metrics = self + .metrics + .child_with_tags([(PARTITION_LABEL, partition.to_string())].into_iter()); + let Some(batch_size) = self.batch_size else { return Err(internal_datafusion_err!( "batch_size must be supplied to VortexSource" @@ -83,6 +92,7 @@ impl FileSource for VortexSource { .clone() .vortex_expect("We should have a schema here"), batch_size, + partition_metrics, )?; Ok(Arc::new(opener)) @@ -135,7 +145,7 @@ impl FileSource for VortexSource { } fn metrics(&self) -> &ExecutionPlanMetricsSet { - &self.metrics + self.metrics.report_to_datafusion() } fn statistics(&self) -> DFResult { From 57aedeba370618e501b312a18e45ec5e1b5919ff Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Fri, 21 Feb 2025 13:02:25 +0000 Subject: [PATCH 16/21] . 
--- Cargo.lock | 56 ++++++++++++---------- Cargo.toml | 14 +++--- vortex-datafusion/src/persistent/source.rs | 10 ---- 3 files changed, 37 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 51f85e9579..049d4c3882 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1306,7 +1306,7 @@ dependencies = [ [[package]] name = "datafusion" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "arrow-ipc", @@ -1351,7 +1351,7 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "async-trait", @@ -1370,7 +1370,7 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "async-trait", @@ -1391,7 +1391,7 @@ dependencies = [ [[package]] name = "datafusion-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "ahash", "arrow", @@ -1413,7 +1413,7 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "45.0.0" -source = 
"git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "log", "tokio", @@ -1422,7 +1422,7 @@ dependencies = [ [[package]] name = "datafusion-datasource" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "async-trait", @@ -1433,6 +1433,8 @@ dependencies = [ "datafusion-common-runtime", "datafusion-execution", "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-common", "datafusion-physical-plan", "futures", "glob", @@ -1447,12 +1449,12 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" [[package]] name = "datafusion-execution" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "dashmap", @@ -1470,7 +1472,7 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "chrono", @@ -1489,7 +1491,7 @@ dependencies = [ [[package]] name = "datafusion-expr-common" 
version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "datafusion-common", @@ -1501,7 +1503,7 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "arrow-buffer", @@ -1525,7 +1527,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "ahash", "arrow", @@ -1545,7 +1547,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "ahash", "arrow", @@ -1557,7 +1559,7 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "async-trait", @@ -1572,7 +1574,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window" version = "45.0.0" -source = 
"git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1588,7 +1590,7 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1597,7 +1599,7 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "datafusion-expr", "quote", @@ -1607,7 +1609,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "chrono", @@ -1624,7 +1626,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "ahash", "arrow", @@ -1645,7 +1647,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "45.0.0" -source = 
"git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "ahash", "arrow", @@ -1658,10 +1660,11 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "datafusion-common", + "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -1675,7 +1678,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "ahash", "arrow", @@ -1704,7 +1707,7 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?rev=refs%2Fpull%2F14754%2Fhead#a40a636f8b756f1121904458c72f1b33ee7a3f81" +source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" dependencies = [ "arrow", "bigdecimal", @@ -5083,11 +5086,12 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.12.1" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" +checksum = "93d59ca99a559661b96bf898d8fce28ed87935fd2bea9f05983c1464dd6c71b1" dependencies = [ - "getrandom 0.2.15", + "getrandom 0.3.1", + "js-sys", "wasm-bindgen", ] diff --git 
a/Cargo.toml b/Cargo.toml index 35a9bffd93..640db74ac7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,12 +73,12 @@ chrono = "0.4.38" clap = "4" compio = { version = "0.13", features = ["io-uring"], default-features = false } crossterm = "0.28" -datafusion = { version = "45", default-features = false, git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } -datafusion-common = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } -datafusion-execution = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } -datafusion-expr = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } -datafusion-physical-expr = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } -datafusion-physical-plan = {version = "45", git = "https://github.com/apache/datafusion.git", rev = "refs/pull/14754/head" } +datafusion = { version = "45", default-features = false, git = "https://github.com/apache/datafusion.git", branch = "main" } +datafusion-common = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } +datafusion-execution = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } +datafusion-expr = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } +datafusion-physical-expr = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } +datafusion-physical-plan = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } divan = { package = "codspeed-divan-compat", version = "2.8.0" } dyn-hash = "0.2.0" enum-iterator = "2.0.0" @@ -156,7 +156,7 @@ tracing-chrome = "0.7.2" tracing-futures = "0.2.5" tracing-subscriber = "0.3.19" url = "2.5.4" -uuid = "1.8.0" +uuid = "1.13" wasm-bindgen-futures = "0.4.39" witchcraft-metrics = "1.0.1" diff --git 
a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index e005afe973..c7ec40bf39 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -165,16 +165,6 @@ impl FileSource for VortexSource { VORTEX_FILE_EXTENSION } - fn supports_repartition(&self, config: &FileScanConfig) -> bool { - let total_file_count = config - .file_groups - .iter() - .map(|group| group.len()) - .sum::(); - // Vortex doesn't support repartitioning if there's only one file - total_file_count > 1 - } - fn repartitioned( &self, target_partitions: usize, From 34c925e9c04b2bf603c41606982c02dbce24178d Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 3 Mar 2025 12:52:40 +0000 Subject: [PATCH 17/21] . --- Cargo.lock | 163 ++++++++++----------- Cargo.toml | 32 ++-- vortex-datafusion/src/persistent/format.rs | 3 +- vortex-datafusion/src/persistent/opener.rs | 7 +- vortex-datafusion/src/persistent/source.rs | 25 ++-- 5 files changed, 112 insertions(+), 118 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a983fd30f9..2e6b84122a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -167,9 +167,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "755b6da235ac356a869393c23668c663720b8749dd6f15e52b6c214b4b964cc7" +checksum = "dc208515aa0151028e464cc94a692156e945ce5126abd3537bb7fd6ba2143ed1" dependencies = [ "arrow-arith", "arrow-array", @@ -189,9 +189,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64656a1e0b13ca766f8440752e9a93e11014eec7b67909986f83ed0ab1fe37b8" +checksum = "e07e726e2b3f7816a85c6a45b6ec118eeeabf0b2a8c208122ad949437181f49a" dependencies = [ "arrow-array", "arrow-buffer", @@ -203,9 +203,9 @@ dependencies = [ 
[[package]] name = "arrow-array" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a" +checksum = "a2262eba4f16c78496adfd559a29fe4b24df6088efc9985a873d58e92be022d5" dependencies = [ "ahash", "arrow-buffer", @@ -220,9 +220,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a" +checksum = "4e899dade2c3b7f5642eb8366cfd898958bcca099cde6dfea543c7e8d3ad88d4" dependencies = [ "bytes", "half", @@ -231,9 +231,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee" +checksum = "4103d88c5b441525ed4ac23153be7458494c2b0c9a11115848fdb9b81f6f886a" dependencies = [ "arrow-array", "arrow-buffer", @@ -252,9 +252,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "90f12542b8164398fc9ec595ff783c4cf6044daa89622c5a7201be920e4c0d4c" +checksum = "43d3cb0914486a3cae19a5cad2598e44e225d53157926d0ada03c20521191a65" dependencies = [ "arrow-array", "arrow-cast", @@ -268,9 +268,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83" +checksum = "0a329fb064477c9ec5f0870d2f5130966f91055c7c5bce2b3a084f116bc28c3b" dependencies = [ "arrow-buffer", "arrow-schema", @@ -280,9 +280,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "54.2.0" +version = "54.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6" +checksum = "ddecdeab02491b1ce88885986e25002a3da34dd349f682c7cfe67bab7cc17b86" dependencies = [ "arrow-array", "arrow-buffer", @@ -294,9 +294,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9551d9400532f23a370cabbea1dc5a53c49230397d41f96c4c8eedf306199305" +checksum = "d03b9340013413eb84868682ace00a1098c81a5ebc96d279f7ebf9a4cac3c0fd" dependencies = [ "arrow-array", "arrow-buffer", @@ -314,9 +314,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c07223476f8219d1ace8cd8d85fa18c4ebd8d945013f25ef5c72e85085ca4ee" +checksum = "f841bfcc1997ef6ac48ee0305c4dfceb1f7c786fe31e67c1186edf775e1f1160" dependencies = [ "arrow-array", "arrow-buffer", @@ -327,9 +327,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91b194b38bfd89feabc23e798238989c6648b2506ad639be42ec8eb1658d82c4" +checksum = "1eeb55b0a0a83851aa01f2ca5ee5648f607e8506ba6802577afdda9d75cdedcd" dependencies = [ "arrow-array", "arrow-buffer", @@ -340,18 +340,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735" +checksum = "85934a9d0261e0fa5d4e2a5295107d743b543a6e0484a835d4b8db2da15306f9" dependencies = [ "bitflags 2.8.0", ] [[package]] name = "arrow-select" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539" 
+checksum = "7e2932aece2d0c869dd2125feb9bd1709ef5c445daa3838ac4112dcfa0fda52c" dependencies = [ "ahash", "arrow-array", @@ -363,9 +363,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "54.2.0" +version = "54.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d44c8eed43be4ead49128370f7131f054839d3d6003e52aebf64322470b8fbd0" +checksum = "912e38bd6a7a7714c1d9b61df80315685553b7455e8a6045c27531d8ecd5b458" dependencies = [ "arrow-array", "arrow-buffer", @@ -1309,8 +1309,8 @@ dependencies = [ [[package]] name = "datafusion" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "arrow-ipc", @@ -1354,8 +1354,8 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "async-trait", @@ -1373,8 +1373,8 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "async-trait", @@ -1394,8 +1394,8 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" 
dependencies = [ "ahash", "arrow", @@ -1416,8 +1416,8 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "log", "tokio", @@ -1425,8 +1425,8 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "async-trait", @@ -1452,13 +1452,13 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" [[package]] name = "datafusion-execution" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "dashmap", @@ -1475,8 +1475,8 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "chrono", @@ -1494,8 +1494,8 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "45.0.0" -source = 
"git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "datafusion-common", @@ -1506,8 +1506,8 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "arrow-buffer", @@ -1530,8 +1530,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "ahash", "arrow", @@ -1550,8 +1550,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "ahash", "arrow", @@ -1562,8 +1562,8 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "async-trait", @@ -1577,8 +1577,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "45.0.0" -source = 
"git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1593,8 +1593,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1602,8 +1602,8 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "datafusion-expr", "quote", @@ -1612,8 +1612,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "chrono", @@ -1629,8 +1629,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "ahash", "arrow", @@ -1650,8 +1650,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "45.0.0" -source 
= "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "ahash", "arrow", @@ -1663,12 +1663,11 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "datafusion-common", - "datafusion-datasource", "datafusion-execution", "datafusion-expr", "datafusion-expr-common", @@ -1681,8 +1680,8 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "ahash", "arrow", @@ -1710,8 +1709,8 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "45.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=main#9ca09cf8f769a3f0a64dbc87ec84eb6fe08b36f6" +version = "46.0.0" +source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" dependencies = [ "arrow", "bigdecimal", @@ -3138,7 +3137,7 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" dependencies = [ - "proc-macro-crate 1.3.1", + "proc-macro-crate 3.2.0", "proc-macro2", "quote", "syn 2.0.98", @@ -3363,9 +3362,9 @@ dependencies = [ [[package]] name = "parquet" -version = "54.2.0" +version = "54.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb" +checksum = "f88838dca3b84d41444a0341b19f347e8098a3898b0f21536654b8b799e11abd" dependencies = [ "ahash", "arrow-array", @@ -3599,7 +3598,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" dependencies = [ "heck", - "itertools 0.13.0", + "itertools 0.14.0", "log", "multimap", "once_cell", @@ -3619,7 +3618,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" dependencies = [ "anyhow", - "itertools 0.13.0", + "itertools 0.14.0", "proc-macro2", "quote", "syn 2.0.98", @@ -5223,9 +5222,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.14.0" +version = "1.15.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93d59ca99a559661b96bf898d8fce28ed87935fd2bea9f05983c1464dd6c71b1" +checksum = "e0f540e3240398cce6128b64ba83fdbdd86129c16a3aa1a3a252efd66eb3d587" dependencies = [ "getrandom 0.3.1", "js-sys", diff --git a/Cargo.toml b/Cargo.toml index 11582037ad..23ee09a754 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,15 +53,15 @@ categories = ["database-implementations", "data-structures", "compression"] anyhow = "1.0.95" arbitrary = "1.3.2" arrayref = "0.3.7" -arrow = "54.2" -arrow-arith = "54.2" -arrow-array = "54.2" -arrow-buffer = "54.2" -arrow-cast = "54.2" -arrow-ord = "54.2" -arrow-schema = "54.2" -arrow-select = "54.2" -arrow-string = "54.2" +arrow = "54.2.1" +arrow-arith = "54.2.1" +arrow-array = "54.2.1" +arrow-buffer = "54.2.1" +arrow-cast = "54.2.1" +arrow-ord = "54.2.1" +arrow-schema = "54.2.1" +arrow-select = "54.2.1" +arrow-string = "54.2.1" async-once-cell = "0.5.4" async-trait = "0.1.86" backtrace = "0.3.74" @@ -73,12 +73,12 @@ chrono = 
"0.4.38" clap = "4" compio = { version = "0.13", features = ["io-uring"], default-features = false } crossterm = "0.28" -datafusion = { version = "45", default-features = false, git = "https://github.com/apache/datafusion.git", branch = "main" } -datafusion-common = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } -datafusion-execution = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } -datafusion-expr = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } -datafusion-physical-expr = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } -datafusion-physical-plan = {version = "45", git = "https://github.com/apache/datafusion.git", branch = "main" } +datafusion = { version = "46", default-features = false, git = "https://github.com/apache/datafusion.git", branch = "branch-46" } +datafusion-common = {version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } +datafusion-execution = {version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } +datafusion-expr = {version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } +datafusion-physical-expr = {version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } +datafusion-physical-plan = {version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } divan = { package = "codspeed-divan-compat", version = "2.8.0" } dyn-hash = "0.2.0" enum-iterator = "2.0.0" @@ -112,7 +112,7 @@ oneshot = "0.1.10" opentelemetry = "0.28.0" opentelemetry-otlp = "0.28.0" opentelemetry_sdk = "0.28.0" -parquet = "54.2" +parquet = "54.2.1" paste = "1.0.15" pin-project = "1.1.5" pin-project-lite = "0.2.15" diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index bdd03ea17d..31654fe30d 100644 --- 
a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -4,10 +4,9 @@ use std::sync::Arc; use arrow_schema::{Schema, SchemaRef}; use async_trait::async_trait; use datafusion::catalog::Session; -use datafusion::datasource::data_source::FileSource; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::file_format::{FileFormat, FileFormatFactory, FilePushdownSupport}; -use datafusion::datasource::physical_plan::{FileScanConfig, FileSinkConfig}; +use datafusion::datasource::physical_plan::{FileScanConfig, FileSinkConfig, FileSource}; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision; use datafusion_common::{ diff --git a/vortex-datafusion/src/persistent/opener.rs b/vortex-datafusion/src/persistent/opener.rs index 4b16974d6f..8809e6406a 100644 --- a/vortex-datafusion/src/persistent/opener.rs +++ b/vortex-datafusion/src/persistent/opener.rs @@ -7,7 +7,6 @@ use futures::{FutureExt as _, StreamExt}; use object_store::{ObjectStore, ObjectStoreScheme}; use tokio::runtime::Handle; use vortex_array::{ContextRef, ToCanonical}; -use vortex_error::VortexResult; use vortex_expr::{ExprRef, VortexExpr}; use vortex_file::executor::{TaskExecutor, TokioExecutor}; use vortex_file::{SplitBy, VortexOpenOptions}; @@ -41,8 +40,8 @@ impl VortexFileOpener { projected_arrow_schema: SchemaRef, batch_size: usize, metrics: VortexMetrics, - ) -> VortexResult { - Ok(Self { + ) -> Self { + Self { ctx, scheme, object_store, @@ -52,7 +51,7 @@ impl VortexFileOpener { projected_arrow_schema, batch_size, metrics, - }) + } } } diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index c8bb0f5d1c..c1b6ff7b6c 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -2,10 +2,9 @@ use std::any::Any; use std::sync::Arc; use arrow_schema::SchemaRef; -use 
datafusion::datasource::data_source::FileSource; use datafusion::datasource::listing::PartitionedFile; -use datafusion::datasource::physical_plan::{FileOpener, FileScanConfig}; -use datafusion_common::{Result as DFResult, Statistics, internal_datafusion_err}; +use datafusion::datasource::physical_plan::{FileOpener, FileScanConfig, FileSource}; +use datafusion_common::{Result as DFResult, Statistics}; use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; use itertools::Itertools as _; use object_store::{ObjectStore, ObjectStoreScheme}; @@ -63,23 +62,21 @@ impl VortexSource { impl FileSource for VortexSource { fn create_file_opener( &self, - object_store: DFResult>, + object_store: Arc, base_config: &FileScanConfig, partition: usize, - ) -> DFResult> { - let object_store = object_store?; + ) -> Arc { let (scheme, _) = ObjectStoreScheme::parse(base_config.object_store_url.as_ref()) - .map_err(object_store::Error::from)?; + .ok() + .vortex_expect("Couldn't parse object store URL"); let partition_metrics = self .metrics .child_with_tags([(PARTITION_LABEL, partition.to_string())].into_iter()); - let Some(batch_size) = self.batch_size else { - return Err(internal_datafusion_err!( - "batch_size must be supplied to VortexSource" - )); - }; + let batch_size = self + .batch_size + .vortex_expect("batch_size must be supplied to VortexSource"); let opener = VortexFileOpener::new( self.ctx.clone(), @@ -93,9 +90,9 @@ impl FileSource for VortexSource { .vortex_expect("We should have a schema here"), batch_size, partition_metrics, - )?; + ); - Ok(Arc::new(opener)) + Arc::new(opener) } fn as_any(&self) -> &dyn Any { From 3fd5043df0c09c7b68943865f37a051e47b84631 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 3 Mar 2025 18:05:01 +0000 Subject: [PATCH 18/21] rename --- vortex-datafusion/src/persistent/format.rs | 7 ++++--- vortex-datafusion/src/persistent/metrics.rs | 4 ++-- vortex-datafusion/src/persistent/source.rs | 6 +++--- 3 files changed, 9 insertions(+), 8 
deletions(-) diff --git a/vortex-datafusion/src/persistent/format.rs b/vortex-datafusion/src/persistent/format.rs index 823261d398..bfa9e1ad8f 100644 --- a/vortex-datafusion/src/persistent/format.rs +++ b/vortex-datafusion/src/persistent/format.rs @@ -33,7 +33,7 @@ use vortex_io::ObjectStoreReadAt; use vortex_layout::{LayoutRegistry, LayoutRegistryExt}; use super::cache::FooterCache; -use super::metrics::VortexExecMetrics; +use super::metrics::VortexSourceMetrics; use super::sink::VortexSink; use super::source::VortexSource; use crate::{PrecisionExt as _, can_be_pushed_down}; @@ -165,7 +165,7 @@ impl FileFormat for VortexFormat { fn file_source(&self) -> Arc { Arc::new(VortexSource::new( self.footer_cache.clone(), - VortexExecMetrics::default(), + VortexSourceMetrics::default(), )) } @@ -329,7 +329,8 @@ impl FileFormat for VortexFormat { return not_impl_err!("Vortex doesn't support output ordering"); } - let mut source = VortexSource::new(self.footer_cache.clone(), VortexExecMetrics::default()); + let mut source = + VortexSource::new(self.footer_cache.clone(), VortexSourceMetrics::default()); if let Some(predicate) = make_vortex_predicate(filters) { source = source.with_predicate(predicate); diff --git a/vortex-datafusion/src/persistent/metrics.rs b/vortex-datafusion/src/persistent/metrics.rs index c09ee32f91..86cda3d3e9 100644 --- a/vortex-datafusion/src/persistent/metrics.rs +++ b/vortex-datafusion/src/persistent/metrics.rs @@ -44,12 +44,12 @@ impl ExecutionPlanVisitor for VortexMetricsFinder { } #[derive(Clone, Debug, Default)] -pub(crate) struct VortexExecMetrics { +pub(crate) struct VortexSourceMetrics { pub vortex: VortexMetrics, pub execution_plan: ExecutionPlanMetricsSet, } -impl VortexExecMetrics { +impl VortexSourceMetrics { pub fn child_with_tags(&self, additional_tags: impl Into) -> VortexMetrics { self.vortex.child_with_tags(additional_tags) } diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index 
48e87598cb..867ab9cde0 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -14,7 +14,7 @@ use vortex_file::VORTEX_FILE_EXTENSION; use super::cache::FooterCache; use super::config::{ConfigProjection, FileScanConfigExt}; -use super::metrics::{PARTITION_LABEL, VortexExecMetrics}; +use super::metrics::{PARTITION_LABEL, VortexSourceMetrics}; use super::opener::VortexFileOpener; /// A config for [`VortexFileOpener`]. Used to create [`DataSourceExec`] based physical plans. @@ -28,11 +28,11 @@ pub struct VortexSource { pub(crate) batch_size: Option, pub(crate) projected_statistics: Option, pub(crate) arrow_schema: Option, - pub(crate) metrics: VortexExecMetrics, + pub(crate) metrics: VortexSourceMetrics, } impl VortexSource { - pub(crate) fn new(footer_cache: FooterCache, metrics: VortexExecMetrics) -> Self { + pub(crate) fn new(footer_cache: FooterCache, metrics: VortexSourceMetrics) -> Self { Self { footer_cache, metrics, From 9e130b2ee2f563e742f032af3a826b0d41f39717 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Mon, 3 Mar 2025 18:22:42 +0000 Subject: [PATCH 19/21] bump minimal uuid version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index c6f1869acc..f2adae5532 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -160,7 +160,7 @@ tracing-chrome = "0.7.2" tracing-futures = "0.2.5" tracing-subscriber = "0.3.19" url = "2.5.4" -uuid = { version = "1.14", features = ["js"] } +uuid = { version = "1.15", features = ["js"] } wasm-bindgen-futures = "0.4.39" witchcraft-metrics = "1.0.1" From b41770db5ff84a1472aeda88c37f0931af3e5172 Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Sun, 9 Mar 2025 18:14:37 +0200 Subject: [PATCH 20/21] . 
--- Cargo.lock | 69 ++++++++++++++++++++++++++++++++++++------------------ Cargo.toml | 12 +++++----- 2 files changed, 52 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2d1847b9b4..fe226709fc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1308,7 +1308,8 @@ dependencies = [ [[package]] name = "datafusion" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46b879c1aa3a85ecbfa376704f0fe4bfebae1a44a5d35faa4466bf85469b6a0e" dependencies = [ "arrow", "arrow-ipc", @@ -1353,7 +1354,8 @@ dependencies = [ [[package]] name = "datafusion-catalog" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e42f516243fe30137f2b7d5712611286baf8d1d758a46157bada7c35fdf38df" dependencies = [ "arrow", "async-trait", @@ -1372,7 +1374,8 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e495290c231d617f0a940860a885cb2f4c3efe46c1983c30d3fa12faf1ccb208" dependencies = [ "arrow", "async-trait", @@ -1393,7 +1396,8 @@ dependencies = [ [[package]] name = "datafusion-common" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af67ddc82e1c8e6843c326ca13aa20e5420cce9f886b4e1ee39ea43defae3145" dependencies = [ "ahash", "arrow", @@ -1415,7 +1419,8 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" version = "46.0.0" -source = 
"git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36ee9403a2ec39183437825d232f556a5dee89f13f6fd78f8c7f8f999489e4ca" dependencies = [ "log", "tokio", @@ -1424,7 +1429,8 @@ dependencies = [ [[package]] name = "datafusion-datasource" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8c8b7568b638dd309bcc1cdeb66776f233b110d44bdc6fd67ef1919f9ec9803" dependencies = [ "arrow", "async-trait", @@ -1451,12 +1457,14 @@ dependencies = [ [[package]] name = "datafusion-doc" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8612c81304578a2e2b82d31caf8173312cb086a7a23a23556b9fff3ac7c18221" [[package]] name = "datafusion-execution" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3591e6d4900e57bad7f861f14f5c763f716da76553b0d037ec91c192c876f09c" dependencies = [ "arrow", "dashmap", @@ -1474,7 +1482,8 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033d0f6198d177f50a7721d80db141af15dd12f45ad6dce34e2cdbb6538e39d" dependencies = [ "arrow", "chrono", @@ -1493,7 +1502,8 @@ dependencies = [ [[package]] name = "datafusion-expr-common" version = "46.0.0" -source = 
"git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56def48a7dfb9f92aa18e18dfdffaca79b5383f03c59bb0107959c1698634557" dependencies = [ "arrow", "datafusion-common", @@ -1505,7 +1515,8 @@ dependencies = [ [[package]] name = "datafusion-functions" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a79b703b42b0aac97485b84c6810c78114b0974a75a33514840ba0bbe0de38f" dependencies = [ "arrow", "arrow-buffer", @@ -1529,7 +1540,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdad20375e85365ed262b5583955c308840efc6ff9271ff463cf86789adfb686" dependencies = [ "ahash", "arrow", @@ -1549,7 +1561,8 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff73249ee3cdc81ad04317d3b4231fc02a8c03a3a1b4b13953244e6443f6b498" dependencies = [ "ahash", "arrow", @@ -1561,7 +1574,8 @@ dependencies = [ [[package]] name = "datafusion-functions-table" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac12628c3e43461118e95d5772f729e1cc39db883d8ee52e4b80038b0f614bbf" dependencies = [ "arrow", "async-trait", @@ -1576,7 +1590,8 @@ dependencies = [ [[package]] name = 
"datafusion-functions-window" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03eb449555c7cc03bb61d43d90edef70d070d34bc4a0d8f7e358d157232f3220" dependencies = [ "datafusion-common", "datafusion-doc", @@ -1592,7 +1607,8 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a0c7606e568ee6a15d33a2532eb0d18e7769bb88af55f6b70be4db9fd937d18" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1601,7 +1617,8 @@ dependencies = [ [[package]] name = "datafusion-macros" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64030e805d3d257e3012e4378500d4ac90b1ebacd03f1110e8ec927b77f09486" dependencies = [ "datafusion-expr", "quote", @@ -1611,7 +1628,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae6af7bdae7565aa7a4cb1deb7fe18d89c63c5d93b5203b473ca1dbe02a1cd3d" dependencies = [ "arrow", "chrono", @@ -1628,7 +1646,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f68601feda90c255c9023a881e833efca9d7539bab0565ac1355b0249326e91" dependencies = [ "ahash", 
"arrow", @@ -1649,7 +1668,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00c1a08b00d340ca3bc1cd2f094ecaeaf6f099a2980e11255976660fa0409182" dependencies = [ "ahash", "arrow", @@ -1662,7 +1682,8 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd34f3438cf9629ea0e3425027582334fb6671a05ee43671ca3c47896b75dda" dependencies = [ "arrow", "datafusion-common", @@ -1679,7 +1700,8 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7624484ada341d30ef465eae61f760e779f080c621bbc3dc0335a75fa78e8dec" dependencies = [ "ahash", "arrow", @@ -1708,7 +1730,8 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "46.0.0" -source = "git+https://github.com/apache/datafusion.git?branch=branch-46#ec4862fa2d870fcab973fd1589ef99b6bf8d560f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e717736a394ed92d9dcf2d74439c655474dd39aa65a064a6bae697b6d20e5fe" dependencies = [ "arrow", "bigdecimal", diff --git a/Cargo.toml b/Cargo.toml index 98064a8718..d0f695caa0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -73,12 +73,12 @@ chrono = "0.4.38" clap = "4" compio = { version = "0.13", features = ["io-uring"], default-features = false } crossterm = "0.28" -datafusion = { version = "46", default-features = false, git = "https://github.com/apache/datafusion.git", branch = 
"branch-46" } -datafusion-common = { version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } -datafusion-execution = { version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } -datafusion-expr = { version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } -datafusion-physical-expr = { version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } -datafusion-physical-plan = { version = "46", git = "https://github.com/apache/datafusion.git", branch = "branch-46" } +datafusion = { version = "46", default-features = false } +datafusion-common = { version = "46" } +datafusion-execution = { version = "46" } +datafusion-expr = { version = "46" } +datafusion-physical-expr = { version = "46" } +datafusion-physical-plan = { version = "46" } divan = { package = "codspeed-divan-compat", version = "2.8.0" } dyn-hash = "0.2.0" enum-iterator = "2.0.0" From aab23a3a4557b24918027b321353fec1a4c8e31b Mon Sep 17 00:00:00 2001 From: Adam Gutglick Date: Sun, 9 Mar 2025 18:48:57 +0200 Subject: [PATCH 21/21] . 
--- vortex-datafusion/src/persistent/source.rs | 63 +++++++++++++++++----- 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/vortex-datafusion/src/persistent/source.rs b/vortex-datafusion/src/persistent/source.rs index 867ab9cde0..6cd93feb5e 100644 --- a/vortex-datafusion/src/persistent/source.rs +++ b/vortex-datafusion/src/persistent/source.rs @@ -1,4 +1,5 @@ use std::any::Any; +use std::collections::VecDeque; use std::sync::Arc; use arrow_schema::SchemaRef; @@ -173,21 +174,59 @@ pub(crate) fn repartition_by_size( file_groups: Vec>, desired_partitions: usize, ) -> Vec> { - let all_files = file_groups.into_iter().concat(); + let all_files = file_groups.iter().flatten().collect::>(); let total_file_count = all_files.len(); let total_size = all_files.iter().map(|f| f.object_meta.size).sum::(); - let target_partition_size = total_size / (desired_partitions + 1); + let target_partition_size = total_size / desired_partitions; let mut partitions = Vec::with_capacity(desired_partitions); let mut curr_partition_size = 0; let mut curr_partition = Vec::default(); - for file in all_files.into_iter() { - curr_partition_size += file.object_meta.size; - curr_partition.push(file); + let mut all_files = VecDeque::from_iter( + all_files + .into_iter() + .sorted_unstable_by_key(|f| f.object_meta.size), + ); - if curr_partition_size >= target_partition_size { + while !all_files.is_empty() && partitions.len() < desired_partitions { + // If the current partition is empty, we want to bootstrap it with the biggest file we have leftover. + let file = if curr_partition.is_empty() { + all_files.pop_back() + // If we already have files in the partition, we try and fill it up. 
+ } else { + // Peek at the biggest file left + let biggest_file_size = all_files + .back() + .vortex_expect("We must have at least one item") + .object_meta + .size; + + let smallest_file_size = all_files + .front() + .vortex_expect("We must have at least one item") + .object_meta + .size; + + // We try and find a file on either end that fits in the partition + if curr_partition_size + biggest_file_size >= target_partition_size { + all_files.pop_front() + } else if curr_partition_size + smallest_file_size >= target_partition_size { + all_files.pop_back() + } else { + None + } + }; + + // Add a file to the partition + if let Some(file) = file { + curr_partition_size += file.object_meta.size; + curr_partition.push(file.clone()); + } + + // If the partition is full, move on to the next one + if curr_partition_size >= target_partition_size || file.is_none() { curr_partition_size = 0; partitions.push(std::mem::take(&mut curr_partition)); } @@ -196,19 +235,19 @@ pub(crate) fn repartition_by_size( // If we we're still missing the last partition if !curr_partition.is_empty() && partitions.len() != desired_partitions { partitions.push(std::mem::take(&mut curr_partition)); - // If we already have enough partitions } else if !curr_partition.is_empty() { for (idx, file) in curr_partition.into_iter().enumerate() { let new_part_idx = idx % partitions.len(); - partitions[new_part_idx].push(file); + partitions[new_part_idx].push(file.clone()); } } + for (idx, file) in all_files.into_iter().enumerate() { + let new_part_idx = idx % partitions.len(); + partitions[new_part_idx].push(file.clone()); + } + // Assert that we have the correct number of partitions and that the total number of files is right - assert_eq!( - partitions.len(), - usize::min(desired_partitions, total_file_count) - ); assert_eq!(total_file_count, partitions.iter().flatten().count()); partitions