diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index f27b8cb23..5bac3bc61 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -16,8 +16,9 @@ env:
   CAMB_TORCH_BASE_DIR: '/mnt/lustre/share/parrotsci/github/cibuild/pytorchbase'
   CUDA_CI_PATH: '/mnt/cache/share/parrotsci/github/cibuild/${{ github.repository }}'
   CUDA_PARTATION: ${{ vars.SH1988_SLURM_PAR != '' && vars.SH1988_SLURM_PAR || 'pat_rd' }}
-  CUDA_CLUSTER: SH1988
+  CUDA_CLUSTER: SCO
   DEEPLINK_PATH: '/mnt/cache/share/deeplinkci/github/${{ github.repository }}'
+  ENV_PATH: '/mnt/cache/share/deeplinkci/github'
   ASCEND_CLUSTER: ASCEND
   CLUSTER_ASCEND_910B: ASCEND-910B
   CLUSTER_KLX: KUNLUNXIN
@@ -71,17 +72,17 @@ jobs:
       - name: Rsync to Server
         run: |
           ssh ${CAMB_CLUSTER} "mkdir -p ${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source ${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main" \
-          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}/ ${CAMB_CLUSTER}:${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source/ \
-          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CAMB_CLUSTER}:${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to camb"
-          ssh ${CUDA_CLUSTER} "mkdir -p ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/source ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main" \
-          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}/ ${CUDA_CLUSTER}:${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/source/ \
-          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CUDA_CLUSTER}:${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to cuda"
+          && rsync -a --delete --timeout=60 ${GITHUB_WORKSPACE}/${REPO}/ ${CAMB_CLUSTER}:${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source/ \
+          && rsync -a --delete --timeout=60 ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CAMB_CLUSTER}:${CAMB_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to camb"
+          ssh ${CUDA_CLUSTER} "mkdir -p ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main" \
+          && rsync -a --delete --timeout=60 ${GITHUB_WORKSPACE}/${REPO}/ ${CUDA_CLUSTER}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source/ \
+          && rsync -a --delete --timeout=60 ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CUDA_CLUSTER}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to cuda"
           ssh ${CLUSTER_ASCEND_910B} "mkdir -p ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main" \
-          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}/ ${CLUSTER_ASCEND_910B}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source/ \
-          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CLUSTER_ASCEND_910B}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to ascend"
+          && rsync -a --delete --timeout=60 ${GITHUB_WORKSPACE}/${REPO}/ ${CLUSTER_ASCEND_910B}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source/ \
+          && rsync -a --delete --timeout=60 ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CLUSTER_ASCEND_910B}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to ascend"
           ssh ${CLUSTER_KLX} "mkdir -p ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main" \
-          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}/ ${CLUSTER_KLX}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source/ \
-          && rsync -a --delete ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CLUSTER_KLX}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to kunlunxin"
+          && rsync -a --delete --timeout=60 ${GITHUB_WORKSPACE}/${REPO}/ ${CLUSTER_KLX}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source/ \
+          && rsync -a --delete --timeout=60 ${GITHUB_WORKSPACE}/${REPO}_DIOPI/ ${CLUSTER_KLX}:${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main/ || echo "failure to connect to kunlunxin"
 
   Build-Camb:
     name: Build-dipu-camb
@@ -244,123 +245,85 @@ jobs:
   Build-Cuda:
     name: Build-dipu-cuda
     needs: [Rsync]
-    runs-on: github-poc-ci
-    env:
-      GPU_REQUESTS: 1
+    runs-on: tps-sco-ci
     steps:
       - name: Build dipu
         run: |
-          ssh ${CUDA_CLUSTER} """
           set -e
-          export USE_COVERAGE=ON
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER} && rm -rf ${GITHUB_JOB} && cp -R source ${GITHUB_JOB} && cd ${GITHUB_JOB}/dipu
-          source scripts/ci/nv/ci_nv_env.sh
-          rsync -a /mnt/lustre/share_data/PAT/datasets/huggingface mmlab_pack/
-          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --time=30 bash scripts/ci/nv/ci_nv_script.sh build_dipu \
-          || ( cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${GITHUB_JOB} && exit 1 )
-          """
+# cd ${DEEPLINK_PATH}/ && ls -al && find ${DEEPLINK_PATH}/ -maxdepth 1 -mmin +300 -type d |xargs rm -rf
+          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && rm -rf ${GITHUB_JOB} && cp -R source ${GITHUB_JOB}
+          srun --job-name=${GITHUB_JOB} bash -c "export USE_COVERAGE=ON && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/${GITHUB_JOB}/dipu \
+          && source ${ENV_PATH}/dipu_env \
+          && rsync -a /mnt/lustre/share_data/PAT/datasets/huggingface mmlab_pack/ \
+          && bash scripts/ci/nv/ci_nv_script.sh build_dipu" || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${GITHUB_JOB} && exit 1 )
 
   Tidy-Cuda:
     name: Run tidy (cuda)
     needs: [Build-Cuda]
-    runs-on: github-poc-ci
+    runs-on: tps-sco-ci
     steps:
       - name: clang-tidy
         run: |
-          ssh $CUDA_CLUSTER """
           set -eo pipefail
-          source ~/.bashrc
-          (bash $CUDA_CI_PATH/$GITHUB_RUN_NUMBER/Build-Cuda/dipu/scripts/ci/nv/ci_nv_tidy.sh srun --job-name=${GITHUB_RUN_NUMBER}_$GITHUB_JOB --partition=$CUDA_PARTATION --time=20) ||\
-          (rm -rf "$CUDA_CI_PATH/$GITHUB_RUN_NUMBER/Build-Cuda" && exit 1)
-          """
+          srun --job-name=${GITHUB_JOB} bash -c "bash ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/scripts/ci/nv/ci_nv_tidy.sh "
 
   Test-Cuda:
     name: Test-dipu-cuda
     needs: [Build-Cuda, Tidy-Cuda]
-    runs-on: github-poc-ci
-    env:
-      GPU_REQUESTS: 1
+    runs-on: tps-sco-ci
    steps:
       - name: Run-test
         run: |
-          ssh ${CUDA_CLUSTER} """
-          set -ex
-          export USE_COVERAGE=ON
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
-          source scripts/ci/nv/ci_nv_env.sh
-          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --cpus-per-task=5 --mem=16G --time=70 sh tests/run_nv_tests.sh
+          set -e
+          srun --job-name=${GITHUB_JOB} bash -c "export USE_COVERAGE=ON && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu \
+          && source ${ENV_PATH}/dipu_env \
+          && bash tests/run_nv_tests.sh"
           if [ "${ALL_COVERAGE}" = "ON" ]; then
-            bash /mnt/cache/share/platform/dep/sonar/coverage_DIPU_nv.sh ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda ${GITHUB_RUN_NUMBER} || echo "get coverage fail"
+            bash /mnt/cache/share/platform/dep/sonar/coverage_DIPU_nv.sh ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/${BUILD_TEST1} ${GITHUB_RUN_NUMBER} || echo "get coverage fail"
           fi
-          """
       - name: increment coverage check
         if: ${{ contains( github.event_name, 'pull_request' ) && contains( github.base_ref, 'main' ) }}
         run: |
-          ssh ${CUDA_CLUSTER} """
           set -e
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/
-          rm -rf scripts
-          ln -s ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/source-main/dipu/third_party/DIOPI/scripts scripts
-          source /mnt/cache/share/platform/env/pt2.0_diopi
+          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda
+          ln -s ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/source-main/dipu/third_party/DIOPI/scripts scripts
+          source ${ENV_PATH}/dipu_env
           bash scripts/increment_coverage.sh ${REQUIRE_COVERAGE}
-          """
 
   Test-One-Iter_Cuda:
     name: Test-one-iter-cuda
     needs: [Build-Cuda, Tidy-Cuda]
-    runs-on: github-poc-ci
-    env:
-      GPU_REQUESTS: 1
+    runs-on: tps-sco-ci
     steps:
       - name: build some env
         run: |
-          ssh ${CUDA_CLUSTER} """
-          set -ex
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
-          source scripts/ci/nv/ci_nv_env.sh
-          basic_path=${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
-          export PYTHONPATH=\${basic_path}/mmengine:\$PYTHONPATH
-          export PYTHONPATH=\${basic_path}/mmcv:\$PYTHONPATH
-          export PYTHONPATH=\$(pwd):\$PYTHONPATH
-          cd mmlab_pack
-          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --time=20 bash ../scripts/ci/ci_one_iter.sh build_cuda
-          """
+          set -e
+          export basic_path=${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/
+          srun --job-name=${GITHUB_JOB} bash -c "cd ${basic_path} \
+          && export PYTHONPATH=${basic_path}/mmlab_pack:${basic_path}/mmlab_pack/mmengine:${basic_path}/mmlab_pack/mmcv:$PYTHONPATH \
+          && source ${ENV_PATH}/dipu_env && cd mmlab_pack \
+          && bash ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/scripts/ci/ci_one_iter.sh build_cuda"
       - name: run-one-iter-for-tradition
         run: |
-          ssh ${CUDA_CLUSTER} """
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
-          source scripts/ci/nv/ci_nv_env.sh
-          basic_path=${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
-          source scripts/ci/ci_one_iter.sh export_pythonpath_cuda \${basic_path}
-          export PYTHONPATH=\$(pwd):\$PYTHONPATH
-          cd mmlab_pack
-          rm -rf one_iter_data
-          python ../scripts/ci/ci_run_one_iter.py cuda ${GITHUB_RUN_NUMBER}_${GITHUB_JOB} "gpu:${GPU_REQUESTS}" \"${CUDA_PARTATION}\" && rm -rf one_iter_data || (rm -rf one_iter_data && exit 1)
-          """
+          set -e
+          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/ \
+          && source ${ENV_PATH}/dipu_env && cd mmlab_pack \
+          && rm -rf one_iter_data \
+          && python ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/scripts/ci/ci_run_one_iter.py sco ${GITHUB_JOB} gpu sco && rm -rf one_iter_data || (rm -rf one_iter_data && exit 1)
       - name: run-one-iter-for-llm
         run: |
-          ssh ${CUDA_CLUSTER} """
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda/dipu
-          source scripts/ci/nv/ci_nv_env.sh
-          basic_path=${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
-          source scripts/ci/ci_one_iter.sh export_pythonpath_cuda \${basic_path}
-          export PYTHONPATH=\$(pwd):\$PYTHONPATH
-          cd mmlab_pack
-          rm -rf one_iter_data
-          python ../scripts/ci/ci_run_one_iter.py cuda ${GITHUB_RUN_NUMBER}_${GITHUB_JOB} "gpu:${GPU_REQUESTS}" \"${CUDA_PARTATION}\" "llm" && rm -rf one_iter_data || (rm -rf one_iter_data && exit 1)
-          """
+          set -e
+          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/ \
+          && source ${ENV_PATH}/dipu_env && cd mmlab_pack \
+          && rm -rf one_iter_data \
+          && python ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/scripts/ci/ci_run_one_iter.py sco ${GITHUB_JOB} gpu sco "llm" && rm -rf one_iter_data || (rm -rf one_iter_data && exit 1)
       - name: Perform cleanup one iter data
         if: always()
         run: |
-          ssh ${CUDA_CLUSTER} """
-          set -ex
-          echo "${GITHUB_RUN_NUMBER}_${GITHUB_JOB}"
-          scancel -n "${GITHUB_RUN_NUMBER}_${GITHUB_JOB}"
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
-          rm -rf one_iter_data
-          touch one_iter_data # 用于占位,防止创建新的 one_iter_data 文件夹
-          """
-
+          set -e
+          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda/dipu/mmlab_pack
+          rm -rf one_iter_data
+          touch one_iter_data # placeholder file to prevent a new one_iter_data directory from being created
       - name: Check for failure
         if: ${{ failure() }}
         run: exit 1
@@ -368,36 +331,28 @@ jobs:
   Build-Cuda-Latest-Target:
     name: Build-dipu-cuda-latest-target
     needs: [Tidy-Cuda]
-    runs-on: github-poc-ci
-    env:
-      GPU_REQUESTS: 1
+    runs-on: tps-sco-ci
     steps:
       - name: Build dipu diopi-latest-target
         run: |
-          ssh ${CUDA_CLUSTER} """
-          set -ex
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER} && rm -rf ${GITHUB_JOB} && cp -R source-main ${GITHUB_JOB} && cd ${GITHUB_JOB}/dipu
-          source scripts/ci/nv/ci_nv_env.sh
-          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --cpus-per-task=5 --mem=16G --time=30 bash scripts/ci/nv/ci_nv_script.sh build_dipu \
-          || ( cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${GITHUB_JOB} && exit 1 )
-          """
+          set -e
+          cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER} && rm -rf ${GITHUB_JOB} && cp -R source-main ${GITHUB_JOB}
+          srun --job-name=${GITHUB_JOB} bash -c "export USE_COVERAGE=ON && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/${GITHUB_JOB}/dipu \
+          && source ${ENV_PATH}/dipu_env \
+          && bash scripts/ci/nv/ci_nv_script.sh build_dipu" || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${GITHUB_JOB} && exit 1 )
 
   Test-Cuda-Latest-Target:
     name: Test-dipu-cuda-latest-target
     needs: [Build-Cuda-Latest-Target]
-    runs-on: github-poc-ci
-    env:
-      GPU_REQUESTS: 1
+    runs-on: tps-sco-ci
     steps:
       - name: Run-test
         run: |
-          ssh ${CUDA_CLUSTER} """
-          set -ex
-          cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && cd Build-Cuda-Latest-Target/dipu
-          source scripts/ci/nv/ci_nv_env.sh
-          srun --job-name=${GITHUB_RUN_NUMBER}_${GITHUB_JOB} --partition=${CUDA_PARTATION} --gres=gpu:${GPU_REQUESTS} --cpus-per-task=5 --mem=16G --time=60 sh tests/run_nv_tests.sh && cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target \
-          || ( cd ${CUDA_CI_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${GITHUB_JOB} && exit 1 )
-          """
+          set -e
+          srun --job-name=${GITHUB_JOB} bash -c "cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Cuda-Latest-Target/dipu \
+          && source ${ENV_PATH}/dipu_env \
+          && bash tests/run_nv_tests.sh" && cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf Build-Cuda-Latest-Target \
+          || ( cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/ && rm -rf ${GITHUB_JOB} && exit 1 )
 
   Build-PyTorch-For-Ascend-910b:
     name: Build-dipu-pytorch-for-ascend-910b
@@ -441,7 +396,7 @@ jobs:
           cd ${DEEPLINK_PATH}/${GITHUB_RUN_NUMBER}/Build-Ascend-910b/dipu
           source scripts/ci/ascend/ci_ascend_env.sh
           bash tests/run_ascend_tests.sh
-          
+
   Test-One-Iter-Ascend-910b:
     name: Test-one-iter-ascend-910b
     needs: [Build-Ascend-910b]
diff --git a/dipu/scripts/ci/ci_run_one_iter.py b/dipu/scripts/ci/ci_run_one_iter.py
index eeb5a3925..3fad912de 100644
--- a/dipu/scripts/ci/ci_run_one_iter.py
+++ b/dipu/scripts/ci/ci_run_one_iter.py
@@ -11,14 +11,12 @@
 log_format = '%(asctime)s - %(levelname)s: %(message)s'
 logging.basicConfig(level=logging.INFO, format=log_format, datefmt='%Y-%m-%d %H:%M:%S')
 
-
 def run_cmd(cmd: str) -> None:
     cp = sp.run(cmd, shell=True, encoding="utf-8")
     if cp.returncode != 0:
         error = f"Some thing wrong has happened when running command [{cmd}]:{cp.stderr}"
         raise Exception(error)
 
-
 def process_one_iter(log_file, clear_log, model_info: dict) -> None:
     begin_time = time.time()
 
@@ -72,14 +70,7 @@ def process_one_iter(log_file, clear_log, model_info: dict) -> None:
     if not os.path.exists(storage_path):
         os.makedirs(storage_path)
 
-    if device == 'camb':
-        base_data_src = '/mnt/lustre/share/parrotsci/github/model_baseline_data'
-    elif device == 'cuda':
-        base_data_src = '/mnt/cache/share/parrotsci/github/model_baseline_data'
-    elif device == "ascend":
-        base_data_src = "/mnt/cache/share/deeplinkci/github/model_baseline_data"
-    elif device == "kunlunxin":
-        base_data_src = "/mnt/cache/share/deeplinkci/github/model_baseline_data"
+    base_data_src = '/mnt/cache/share/parrotsci/github/model_baseline_data'
     src = f'{base_data_src}/{p3}/baseline'
     if not os.path.exists(src):
         os.makedirs(src)
@@ -94,7 +85,6 @@ def process_one_iter(log_file, clear_log, model_info: dict) -> None:
     rtol = precision.get('rtol', 1e-4)
     metric = precision.get('metric', 1e-2)
     logging.info(f'Using pricision: atol-{atol}, rtol-{rtol}, metric-{metric}')
-
     if device == 'cuda':
         if (p2 == "stable_diffusion/stable-diffusion_ddim_denoisingunet_infer.py"):
             cmd_run_one_iter = f"srun --job-name={job_name} --partition={partition} --gres={gpu_requests} --cpus-per-task=5 --mem=16G --time=40 sh mmagic/configs/stable_diffusion/stable-diffusion_ddim_denoisingunet_one_iter.sh"
@@ -106,6 +96,18 @@ def process_one_iter(log_file, clear_log, model_info: dict) -> None:
         else:
             cmd_run_one_iter = f"srun --job-name={job_name} --partition={partition} --gres={gpu_requests} --cpus-per-task=5 --mem=16G --time=40 sh SMART/tools/one_iter_tool/run_one_iter.sh {train_path} {config_path} {work_dir} {opt_arg}"
             cmd_cp_one_iter = f"srun --job-name={job_name} --partition={partition} --gres={gpu_requests} --cpus-per-task=5 --mem=16G --time=30 sh SMART/tools/one_iter_tool/compare_one_iter.sh {package_name}"
+    elif device == 'sco':
+        current_path = os.getcwd()
+        parent_directory = os.path.dirname(current_path)
+        if (p2 == "stable_diffusion/stable-diffusion_ddim_denoisingunet_infer.py"):
+            cmd_run_one_iter = f"""srun --job-name={job_name} bash -c "cd {parent_directory} && source scripts/ci/ci_one_iter.sh export_pythonpath_cuda {current_path} && source /mnt/cache/share/deeplinkci/github/dipu_env && cd mmlab_pack && source environment_exported && export ONE_ITER_TOOL_STORAGE_PATH={storage_path} && bash {current_path}/mmagic/configs/stable_diffusion/stable-diffusion_ddim_denoisingunet_one_iter.sh" """
+            cmd_cp_one_iter = ""
+        elif ('infer' in p2 and 'infer' in p3):
+            cmd_run_one_iter = f"""srun --job-name={job_name} bash -c "cd {parent_directory} && source scripts/ci/ci_one_iter.sh export_pythonpath_cuda {current_path} && source /mnt/cache/share/deeplinkci/github/dipu_env && cd mmlab_pack && source environment_exported && export ONE_ITER_TOOL_STORAGE_PATH={storage_path} && python {current_path}/{train_path}" """
+            cmd_cp_one_iter = ""
+        else:
+            cmd_run_one_iter = f"""srun --job-name={job_name} bash -c "cd {parent_directory} && source scripts/ci/ci_one_iter.sh export_pythonpath_cuda {current_path} && source /mnt/cache/share/deeplinkci/github/dipu_env && cd mmlab_pack && source environment_exported && export ONE_ITER_TOOL_STORAGE_PATH={storage_path} && bash {current_path}/SMART/tools/one_iter_tool/run_one_iter.sh {train_path} {config_path} {work_dir} {opt_arg}" """
+            cmd_cp_one_iter = f"""srun --job-name={job_name} bash -c "cd {parent_directory} && source scripts/ci/ci_one_iter.sh export_pythonpath_cuda {current_path} && source /mnt/cache/share/deeplinkci/github/dipu_env && cd mmlab_pack && source environment_exported && export ONE_ITER_TOOL_STORAGE_PATH={storage_path} && bash {current_path}/SMART/tools/one_iter_tool/compare_one_iter.sh {package_name}" """
     elif device == "camb" :
         # For the inference of large language models, simply compare the inference results on the current device directly with the results generated on the GPU
         if ('infer' in p2 and 'infer' in p3):
@@ -163,6 +165,8 @@ def print_file(file_name):
 
     args = parser.parse_args()
     device = args.device
+    if device == 'sco':
+        max_parall = 3
     job_name = args.job_name
     gpu_requests = args.gpu_requests
     partition = args.partition_arg
@@ -199,8 +203,22 @@ def print_file(file_name):
     # os.environ['ONE_ITER_TOOL_IOSAVE_RATIO'] = "1.0" # 0.2 by default
     curPath = os.path.dirname(os.path.realpath(__file__))
     yamlPath = os.path.join(curPath, selected_model_list)
+    file_path = os.path.join(curPath, "environment_exported")
+    env_variables = os.environ
+    keywords_to_filter = ['DIPU', 'ONE_ITER']
+    if os.path.exists(file_path):
+        os.remove(file_path)
+    with open("environment_exported", "w") as file:
+        file.write("pwd\n")
+        for key, value in env_variables.items():
+            if any(keyword in key for keyword in keywords_to_filter):
+                file.write(f'export {key}="{value}"\n')
     with open(yamlPath, 'r', encoding='utf-8') as f:
-        original_list = yaml.safe_load(f.read()).get(device, None)
+        if device == 'sco':
+            original_list = yaml.safe_load(f.read()).get("cuda", None)
+        else:
+            original_list = yaml.safe_load(f.read()).get(device, None)
+
     if not original_list:
         logging.warning(f"Device type: {device} is not supported!")
         exit(0)
diff --git a/dipu/scripts/ci/nv/ci_nv_tidy.sh b/dipu/scripts/ci/nv/ci_nv_tidy.sh
index dd9bd0228..3401d819d 100644
--- a/dipu/scripts/ci/nv/ci_nv_tidy.sh
+++ b/dipu/scripts/ci/nv/ci_nv_tidy.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 set -euo pipefail
-
+source /mnt/cache/share/deeplinkci/github/proxy_on
 # Require Git.
 [ -x "$(command -v git)" ] || (echo "missing git tool" && exit 1)
 
@@ -14,13 +14,13 @@ repo=$(cd $self && git rev-parse --show-toplevel)
 
 # Try finding clangd and libstdc++.so.6 on 1988.
 # Note: ":+:" is used to handle unbound variable.
-[ -d /mnt/lustre/share/platform/dep/clang-16/bin ] &&
-    export PATH=/mnt/lustre/share/platform/dep/clang-16/bin${PATH:+:$PATH}
+[ -d /mnt/cache/share/platform/dep/clang-16/bin ] &&
+    export PATH=/mnt/cache/share/platform/dep/clang-16/bin${PATH:+:$PATH}
 [ -d /mnt/cache/share/platform/env/miniconda3.10/envs/pt2.0_diopi/lib ] &&
     export LD_LIBRARY_PATH=/mnt/cache/share/platform/env/miniconda3.10/envs/pt2.0_diopi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
 
 # Forward srun commands.
 # e.g. you can use: bash scripts/ci/nv/ci_nv_tidy.sh srun -p pat_rd
 (cd "$repo/dipu" &&
-    ($@ find torch_dipu ! -path '*/vendor/*' ! -name AutoGenedKernels.cpp \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' \) |
+    (find torch_dipu ! -path '*/vendor/*' ! -name AutoGenedKernels.cpp \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' \) |
         xargs $self/clangd-tidy/clangd-tidy -j4))
diff --git a/dipu/scripts/ci/test_one_iter_large_language_model_list.yaml b/dipu/scripts/ci/test_one_iter_large_language_model_list.yaml
index 33c12be37..f0cbccc9d 100644
--- a/dipu/scripts/ci/test_one_iter_large_language_model_list.yaml
+++ b/dipu/scripts/ci/test_one_iter_large_language_model_list.yaml
@@ -5,7 +5,7 @@ cuda:
   - model_cfg: "transformers examples/pytorch/language-modeling/llama_7b_infer.py workdirs_transformers_llama_infer"
   - model_cfg: "transformers examples/pytorch/language-modeling/internlm_7b_infer.py workdirs_transformers_internlm_infer"
   # lightllm
-  - model_cfg: "lightllm llama_7b_via_lightllm_infer.py workdirs_lightllm_llama_infer"
+  # - model_cfg: "lightllm llama_7b_via_lightllm_infer.py workdirs_lightllm_llama_infer"
 camb: