From 3a1f2a7578c4110ed6f63294cbb10bb8633ffbe8 Mon Sep 17 00:00:00 2001
From: Yongting You <2010youy01@gmail.com>
Date: Sun, 16 Feb 2025 11:43:18 +0800
Subject: [PATCH] fix extended test

---
 .github/workflows/extended.yml                | 49 ++++++++++---------
 .../sort_mem_validation.rs                    | 31 +++++++++++-
 2 files changed, 54 insertions(+), 26 deletions(-)

diff --git a/.github/workflows/extended.yml b/.github/workflows/extended.yml
index 19910957a85b9..c3c8393c5d0a6 100644
--- a/.github/workflows/extended.yml
+++ b/.github/workflows/extended.yml
@@ -52,30 +52,31 @@ jobs:
           cargo check --profile ci --all-targets
           cargo clean
 
-#  # Run extended tests (with feature 'extended_tests')
-#  # Disabling as it is running out of disk space
-#  # see https://github.com/apache/datafusion/issues/14576
-#  linux-test-extended:
-#    name: cargo test 'extended_tests' (amd64)
-#    needs: linux-build-lib
-#    runs-on: ubuntu-latest
-#    container:
-#      image: amd64/rust
-#    steps:
-#      - uses: actions/checkout@v4
-#        with:
-#          submodules: true
-#          fetch-depth: 1
-#      - name: Setup Rust toolchain
-#        uses: ./.github/actions/setup-builder
-#        with:
-#          rust-version: stable
-#      - name: Run tests (excluding doctests)
-#        run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests
-#      - name: Verify Working Directory Clean
-#        run: git diff --exit-code
-#      - name: Cleanup
-#        run: cargo clean
+  # Run extended tests (with feature 'extended_tests')
+  linux-test-extended:
+    name: cargo test 'extended_tests' (amd64)
+    needs: linux-build-lib
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+      options: --user root
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+      - name: Setup Rust toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: stable
+      - name: Check free disk space
+        run: df -h  # reports available space per filesystem (du would report usage)
+      - name: Run tests (excluding doctests)
+        run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests
+      - name: Verify Working Directory Clean
+        run: git diff --exit-code
+      - name: Cleanup
+        run: cargo clean
 
   # Check answers are correct when hash values collide
   hash-collisions:
diff --git a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
index 1789f37535a94..871dbe611facc 100644
--- a/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
+++ b/datafusion/core/tests/memory_limit/memory_limit_validation/sort_mem_validation.rs
@@ -21,12 +21,14 @@
 //! This file is organized as:
 //! - Test runners that spawn individual test processes
 //! - Test cases that contain the actual validation logic
-use std::{process::Command, str};
-
 use log::info;
+use std::sync::Once;
+use std::{process::Command, str};
 
 use crate::memory_limit::memory_limit_validation::utils;
 
+static INIT: Once = Once::new();
+
 // ===========================================================================
 // Test runners:
 // Runners are splitted into multiple tests to run in parallel
@@ -67,10 +69,35 @@ fn sort_with_mem_limit_2_cols_2_runner() {
     spawn_test_process("sort_with_mem_limit_2_cols_2");
 }
 
+/// Pre-builds the `core_integration` test binary at most once per process.
+///
+/// `spawn_test_process` might otherwise trigger several recompilations, and the
+/// test binary size might grow indefinitely. Guarding a single
+/// `cargo test --no-run` behind `INIT` ensures the recompilation happens only
+/// once and keeps the target size bounded.
+///
+/// TODO: This is a hack; it can be cleaned up once there is a better way to run
+/// test cases in different processes (instead of different threads, the default).
+fn init_once() {
+    INIT.call_once(|| {
+        let mut prebuild = Command::new("cargo");
+        prebuild
+            .args(["test", "--no-run"])
+            .args(["--package", "datafusion"])
+            .args(["--test", "core_integration"])
+            .args(["--features", "extended_tests"])
+            .env("DATAFUSION_TEST_MEM_LIMIT_VALIDATION", "1");
+        // The exit status is not inspected; only a failure to launch cargo panics.
+        let _ = prebuild.output().expect("Failed to execute test command");
+    });
+}
+
 /// Helper function that executes a test in a separate process with the required environment
 /// variable set. Memory limit validation tasks need to measure memory resident set
 /// size (RSS), so they must run in a separate process.
 fn spawn_test_process(test: &str) {
+    init_once();
+
     let test_path = format!(
         "memory_limit::memory_limit_validation::sort_mem_validation::{}",
         test