From 9eae124d5952e90de71a0ebe9c3e6179aa83bf88 Mon Sep 17 00:00:00 2001
From: William Hobbs
Date: Tue, 5 Mar 2024 16:00:38 -0800
Subject: [PATCH] mpi: do node-exclusive scheduling for cray-pals PMI

Problem: as documented in the "CORAL2: Flux on Cray Shasta" page of
the flux docs, two flux subinstances sharing the same nodes can fail
due to overlapping port numbers. Since the vcpu test was added, this
failure has been occurring more often, for reasons that are not yet
clear.

The solution is to schedule the cray-pals batch jobs node-exclusively
at the top level so that they run sequentially instead of sharing
nodes.
---
 mpi/outer_script.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mpi/outer_script.sh b/mpi/outer_script.sh
index 6ae29e4..78d5425 100755
--- a/mpi/outer_script.sh
+++ b/mpi/outer_script.sh
@@ -35,7 +35,7 @@ COMPILERS="${LCSCHEDCLUSTER}_COMPILERS"
 for mpi in ${!MPIS}; do
     for compiler in ${!COMPILERS}; do
         if [[ $mpi == "cray-mpich" ]]; then
-            EXTRA_FLUX_SUBMIT_OPTIONS="-o pmi=cray-pals" flux batch -N2 -n4 --flags=waitable --output=kvs $MPI_TESTS_DIRECTORY/inner_script.sh $compiler $mpi
+            EXTRA_FLUX_SUBMIT_OPTIONS="-o pmi=cray-pals" flux batch --exclusive -N2 --flags=waitable --output=kvs $MPI_TESTS_DIRECTORY/inner_script.sh $compiler $mpi
         elif [[ $mpi == "openmpi"* ]]; then
             EXTRA_FLUX_SUBMIT_OPTIONS="-o pmi=pmix" flux batch -N2 -n4 --flags=waitable --output=kvs $MPI_TESTS_DIRECTORY/inner_script.sh $compiler $mpi
         else
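
For reference, one way to confirm the sequential behavior that
--exclusive buys us (a sketch, not part of the patch; it assumes a
two-node allocation and uses only documented flux-core commands):

    # Submit two exclusive 2-node jobs; the scheduler cannot co-locate
    # them, so the second starts only after the first frees its nodes.
    flux batch --exclusive -N2 --flags=waitable --wrap sleep 5
    flux batch --exclusive -N2 --flags=waitable --wrap sleep 5
    flux job wait --all

With the previous non-exclusive -N2 -n4 submissions, both batch jobs
could be placed on the same nodes at the same time, which is exactly
the condition that triggers the cray-pals port collision.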