From 24377b8ba4512a5ea8675f5d2754cf6e54e6b615 Mon Sep 17 00:00:00 2001 From: Ali Abdelkader Date: Sat, 24 Feb 2024 04:38:24 +0200 Subject: [PATCH 1/6] add test configurations for quantization with onnxruntime, awq, bnb - add test configurations for onnxruntime backend - add no_weights sweep to quantization tests - add test configuration for pytorch awq quantization - add test configuration for bnb quantization - add autoawq and bitsandbytes libraries to optional install requirements - update makefile and github workflow so that the autoawq & bnb get installed and their related tests run successfully locally and on CI --- .github/workflows/test_cli_cuda_pytorch.yaml | 2 +- Makefile | 2 +- setup.py | 2 ++ tests/configs/_no_weights_sweep_.yaml | 4 +++ ...nference_onnxruntime_static_quant_vit.yaml | 20 +++++++++++++++ ...nference_onnxruntime_static_quant_vit.yaml | 20 +++++++++++++++ .../cuda_inference_pytorch_awq_exllama.yaml | 25 +++++++++++++++++++ .../cuda_inference_pytorch_gpt_bnb_4bit.yaml | 17 +++++++++++++ .../cuda_inference_pytorch_gpt_bnb_8bit.yaml | 16 ++++++++++++ 9 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 tests/configs/_no_weights_sweep_.yaml create mode 100644 tests/configs/cpu_inference_onnxruntime_static_quant_vit.yaml create mode 100644 tests/configs/cuda_inference_onnxruntime_static_quant_vit.yaml create mode 100644 tests/configs/cuda_inference_pytorch_awq_exllama.yaml create mode 100644 tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml create mode 100644 tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml index 204722db..3d8a8b0b 100644 --- a/.github/workflows/test_cli_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: --workdir /workspace/optimum-benchmark --entrypoint /bin/bash opt-bench-cuda:${{ matrix.image.cuda_version }} - -c "pip install -e 
.[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x" + -c "pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,bitsandbytes] && pytest -k 'cli and cuda and pytorch' -x" diff --git a/Makefile b/Makefile index 468cccd0..e511be9a 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ CLI_MISC_REQS := testing CLI_CUDA_ONNXRUNTIME_REQS := testing,timm,diffusers CLI_ROCM_ONNXRUNTIME_REQS := testing,timm,diffusers -CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft +CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,autoawq,bitsandbytes CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft CLI_CPU_OPENVINO_REQS := testing,openvino,timm,diffusers diff --git a/setup.py b/setup.py index 50dc0528..1fcff05b 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ "diffusers": ["diffusers"], "timm": ["timm"], "peft": ["peft"], + "autoawq": ["autoawq"], + "bitsandbytes": ["bitsandbytes"], } diff --git a/tests/configs/_no_weights_sweep_.yaml b/tests/configs/_no_weights_sweep_.yaml new file mode 100644 index 00000000..b982009c --- /dev/null +++ b/tests/configs/_no_weights_sweep_.yaml @@ -0,0 +1,4 @@ +hydra: + sweeper: + params: + backend.no_weights: true,false diff --git a/tests/configs/cpu_inference_onnxruntime_static_quant_vit.yaml b/tests/configs/cpu_inference_onnxruntime_static_quant_vit.yaml new file mode 100644 index 00000000..b3bf7c63 --- /dev/null +++ b/tests/configs/cpu_inference_onnxruntime_static_quant_vit.yaml @@ -0,0 +1,20 @@ +defaults: + - backend: onnxruntime + # order of inheritance, last one overrides previous ones + - _base_ # inherits from base config + - _inference_ # inherits from inference config + - _cpu_ # inherits from cpu config + - _no_weights_sweep_ # sweep over no_weights: true,false + - _self_ # hydra 1.1 compatibility + +experiment_name: cpu_inference_onnxruntime_static_quant_vit + +backend: + model: google/vit-base-patch16-224 + quantization: true + quantization_config: + 
is_static: true + per_channel: false + + calibration: true + diff --git a/tests/configs/cuda_inference_onnxruntime_static_quant_vit.yaml b/tests/configs/cuda_inference_onnxruntime_static_quant_vit.yaml new file mode 100644 index 00000000..382fabcb --- /dev/null +++ b/tests/configs/cuda_inference_onnxruntime_static_quant_vit.yaml @@ -0,0 +1,20 @@ +defaults: + - backend: onnxruntime + # order of inheritance, last one overrides previous ones + - _base_ # inherits from base config + - _inference_ # inherits from inference config + - _cuda_ # inherits from cuda config + - _no_weights_sweep_ # sweep over no_weights: true,false + - _self_ # hydra 1.1 compatibility + +experiment_name: cuda_inference_onnxruntime_static_quant_vit + +backend: + model: google/vit-base-patch16-224 + quantization: true + quantization_config: + is_static: true + per_channel: false + + calibration: true + diff --git a/tests/configs/cuda_inference_pytorch_awq_exllama.yaml b/tests/configs/cuda_inference_pytorch_awq_exllama.yaml new file mode 100644 index 00000000..8dfc0064 --- /dev/null +++ b/tests/configs/cuda_inference_pytorch_awq_exllama.yaml @@ -0,0 +1,25 @@ +defaults: + - backend: pytorch + - _base_ # inherits from base config + - _inference_ # inherits from inference config + - _cuda_ # inherits from cuda config + - _no_weights_sweep_ # sweep over no_weights: true,false + - _self_ # hydra 1.1 compatibility + +experiment_name: cuda_inference_pytorch_awq_exllama + +benchmark: + input_shapes: + batch_size: 4 + sequence_length: 128 + + generate_kwargs: + max_new_tokens: 128 + min_new_tokens: 128 + +backend: + model: TheBloke/Mistral-7B-Instruct-v0.1-AWQ + quantization_scheme: "awq" + quantization_config: + exllama_config: + version: 2 diff --git a/tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml b/tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml new file mode 100644 index 00000000..a1db9e09 --- /dev/null +++ b/tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml @@ -0,0 +1,17 @@ 
+defaults: + - backend: pytorch + - _base_ # inherits from base config + - _inference_ # inherits from inference config + - _cuda_ # inherits from cuda config + - _gpt_ # use gpt2 model + - _no_weights_sweep_ # sweep no_weights: true, false + - _self_ # hydra 1.1 compatibility + +experiment_name: cuda_inference_pytorch_gpt_bnb_4bit + +backend: + quantization_scheme: "bnb" + quantization_config: + load_in_4bit: true + llm_int8_threshold: 6.0 + bnb_4bit_compute_dtype: float16 diff --git a/tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml b/tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml new file mode 100644 index 00000000..c6686580 --- /dev/null +++ b/tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml @@ -0,0 +1,16 @@ +defaults: + - backend: pytorch + - _base_ # inherits from base config + - _inference_ # inherits from inference config + - _cuda_ # inherits from cuda config + - _gpt_ # use gpt2 model + - _no_weights_sweep_ # sweep no_weights: true, false + - _self_ # hydra 1.1 compatibility + +experiment_name: cuda_inference_pytorch_gpt_bnb_8bit + +backend: + quantization_scheme: "bnb" + quantization_config: + load_in_8bit: true + llm_int8_threshold: 6.0 From 746532f42fcfe61fb1fd89989edc6a15040405de Mon Sep 17 00:00:00 2001 From: Ali Abdelkader Date: Tue, 5 Mar 2024 15:55:25 +0200 Subject: [PATCH 2/6] set bnb llm_int8_threshold to 0 for reproducibility in tests --- tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml | 1 - tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml b/tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml index a1db9e09..0769a008 100644 --- a/tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml +++ b/tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml @@ -13,5 +13,4 @@ backend: quantization_scheme: "bnb" quantization_config: load_in_4bit: true - llm_int8_threshold: 6.0 bnb_4bit_compute_dtype: float16 diff --git 
a/tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml b/tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml index c6686580..21e528e6 100644 --- a/tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml +++ b/tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml @@ -13,4 +13,3 @@ backend: quantization_scheme: "bnb" quantization_config: load_in_8bit: true - llm_int8_threshold: 6.0 From 13cd173214cddd920c852b43a36d4c037ce30f58 Mon Sep 17 00:00:00 2001 From: Ali Abdelkader Date: Tue, 5 Mar 2024 22:54:18 +0200 Subject: [PATCH 3/6] handle installation of autoawq bitsandbytes without setup.py - remove autoawq bitsandbytes from setup.py - add autoawq bitsandbytes to github workflows and makefile --- .github/workflows/test_cli_cuda_pytorch.yaml | 2 +- .github/workflows/test_cli_rocm_pytorch.yaml | 2 +- Makefile | 10 +++++----- setup.py | 2 -- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml index 3d8a8b0b..02fe8140 100644 --- a/.github/workflows/test_cli_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: --workdir /workspace/optimum-benchmark --entrypoint /bin/bash opt-bench-cuda:${{ matrix.image.cuda_version }} - -c "pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq,bitsandbytes] && pytest -k 'cli and cuda and pytorch' -x" + -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pip install bitsandbytes git+https://github.com/casper-hansen/AutoAWQ.git && pytest -k 'cli and cuda and pytorch' -x" diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml index c4ae7139..1cc27f83 100644 --- a/.github/workflows/test_cli_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -51,4 +51,4 @@ jobs: --device /dev/dri/renderD129 --entrypoint /bin/bash opt-bench-rocm:${{ matrix.image.rocm_version }} - -c "pip install -e 
.[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x" + -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pip install git+https://github.com/casper-hansen/AutoAWQ.git && pytest -k 'cli and cuda and pytorch not bnb' -x" diff --git a/Makefile b/Makefile index e511be9a..5dff0f9b 100644 --- a/Makefile +++ b/Makefile @@ -15,9 +15,9 @@ CLI_MISC_REQS := testing CLI_CUDA_ONNXRUNTIME_REQS := testing,timm,diffusers CLI_ROCM_ONNXRUNTIME_REQS := testing,timm,diffusers -CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,autoawq,bitsandbytes +CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft - +CLI_CUDA_PYTORCH_QUANTIZATION_REGS := bitsandbytes git+https://github.com/casper-hansen/AutoAWQ.git CLI_CPU_OPENVINO_REQS := testing,openvino,timm,diffusers CLI_CPU_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft CLI_CPU_ONNXRUNTIME_REQS := testing,onnxruntime,timm,diffusers @@ -108,7 +108,7 @@ define test_nvidia --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-$(1):local -c "pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x" + opt-bench-$(1):local -c "pip install -e .[$(2)] && pip install $(CLI_CUDA_PYTORCH_QUANTIZATION_REGS) && pytest tests/ -k '$(3)' -x" endef define test_amdgpu @@ -122,7 +122,7 @@ define test_amdgpu --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-$(1):local -c "pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x" + opt-bench-$(1):local -c "pip install -e .[$(2)] && pip install git+https://github.com/casper-hansen/AutoAWQ.git && pytest tests/ -k '$(3)' -x" endef # group the extra @@ -144,7 +144,7 @@ test_cli_cuda_pytorch: $(call test_nvidia,cuda,$(CLI_CUDA_PYTORCH_REQS),cli and cuda and pytorch) test_cli_rocm_pytorch: - $(call test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft) + $(call 
test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft and not bnb) test_cli_cuda_onnxruntime: $(call test_nvidia,cuda,$(CLI_CUDA_ONNXRUNTIME_REQS),cli and cuda and onnxruntime) diff --git a/setup.py b/setup.py index 1fcff05b..50dc0528 100644 --- a/setup.py +++ b/setup.py @@ -55,8 +55,6 @@ "diffusers": ["diffusers"], "timm": ["timm"], "peft": ["peft"], - "autoawq": ["autoawq"], - "bitsandbytes": ["bitsandbytes"], } From 5d256d6ff667cdb40287b1ec216b14357f42ca87 Mon Sep 17 00:00:00 2001 From: Ali Abdelkader Date: Wed, 6 Mar 2024 13:00:50 +0200 Subject: [PATCH 4/6] minor fixes resolving PR review comments --- .github/workflows/test_cli_rocm_pytorch.yaml | 2 +- Makefile | 2 +- tests/configs/cuda_inference_pytorch_awq_exllama.yaml | 11 +---------- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml index 1cc27f83..c991375c 100644 --- a/.github/workflows/test_cli_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -51,4 +51,4 @@ jobs: --device /dev/dri/renderD129 --entrypoint /bin/bash opt-bench-rocm:${{ matrix.image.rocm_version }} - -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pip install git+https://github.com/casper-hansen/AutoAWQ.git && pytest -k 'cli and cuda and pytorch not bnb' -x" + -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pip install git+https://github.com/casper-hansen/AutoAWQ.git && pytest -k 'cli and cuda and pytorch and not bnb' -x" diff --git a/Makefile b/Makefile index 5dff0f9b..5d27105b 100644 --- a/Makefile +++ b/Makefile @@ -17,7 +17,7 @@ CLI_CUDA_ONNXRUNTIME_REQS := testing,timm,diffusers CLI_ROCM_ONNXRUNTIME_REQS := testing,timm,diffusers CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft -CLI_CUDA_PYTORCH_QUANTIZATION_REGS := bitsandbytes git+https://github.com/casper-hansen/AutoAWQ.git 
+CLI_CUDA_PYTORCH_QUANTIZATION_REQS := bitsandbytes git+https://github.com/casper-hansen/AutoAWQ.git CLI_CPU_OPENVINO_REQS := testing,openvino,timm,diffusers CLI_CPU_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft CLI_CPU_ONNXRUNTIME_REQS := testing,onnxruntime,timm,diffusers diff --git a/tests/configs/cuda_inference_pytorch_awq_exllama.yaml b/tests/configs/cuda_inference_pytorch_awq_exllama.yaml index 8dfc0064..4059222c 100644 --- a/tests/configs/cuda_inference_pytorch_awq_exllama.yaml +++ b/tests/configs/cuda_inference_pytorch_awq_exllama.yaml @@ -8,17 +8,8 @@ defaults: experiment_name: cuda_inference_pytorch_awq_exllama -benchmark: - input_shapes: - batch_size: 4 - sequence_length: 128 - - generate_kwargs: - max_new_tokens: 128 - min_new_tokens: 128 - backend: - model: TheBloke/Mistral-7B-Instruct-v0.1-AWQ + model: TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ quantization_scheme: "awq" quantization_config: exllama_config: From 6d323e4fb82a58b2132acb80f07b64949adfea6f Mon Sep 17 00:00:00 2001 From: Ali Abdelkader Date: Wed, 6 Mar 2024 21:48:12 +0200 Subject: [PATCH 5/6] add autoawq and bnb to setup.py - update github workflow files to install autoawq and bnb using setup.py - "requests" is installed independently because "autoawq@git+https..." requires it to proceed with its installation. 
--- .github/workflows/test_cli_cuda_pytorch.yaml | 2 +- .github/workflows/test_cli_rocm_pytorch.yaml | 2 +- Makefile | 9 ++++----- setup.py | 2 ++ 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test_cli_cuda_pytorch.yaml b/.github/workflows/test_cli_cuda_pytorch.yaml index 02fe8140..15445f1f 100644 --- a/.github/workflows/test_cli_cuda_pytorch.yaml +++ b/.github/workflows/test_cli_cuda_pytorch.yaml @@ -53,4 +53,4 @@ jobs: --workdir /workspace/optimum-benchmark --entrypoint /bin/bash opt-bench-cuda:${{ matrix.image.cuda_version }} - -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pip install bitsandbytes git+https://github.com/casper-hansen/AutoAWQ.git && pytest -k 'cli and cuda and pytorch' -x" + -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,bitsandbytes,autoawq] && pytest -k 'cli and cuda and pytorch' -x" diff --git a/.github/workflows/test_cli_rocm_pytorch.yaml b/.github/workflows/test_cli_rocm_pytorch.yaml index c991375c..1006b2e1 100644 --- a/.github/workflows/test_cli_rocm_pytorch.yaml +++ b/.github/workflows/test_cli_rocm_pytorch.yaml @@ -51,4 +51,4 @@ jobs: --device /dev/dri/renderD129 --entrypoint /bin/bash opt-bench-rocm:${{ matrix.image.rocm_version }} - -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pip install git+https://github.com/casper-hansen/AutoAWQ.git && pytest -k 'cli and cuda and pytorch and not bnb' -x" + -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq] && pytest -k 'cli and cuda and pytorch and not bnb' -x" diff --git a/Makefile b/Makefile index 5d27105b..622588ae 100644 --- a/Makefile +++ b/Makefile @@ -15,9 +15,8 @@ CLI_MISC_REQS := testing CLI_CUDA_ONNXRUNTIME_REQS := testing,timm,diffusers CLI_ROCM_ONNXRUNTIME_REQS := testing,timm,diffusers -CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft -CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft 
-CLI_CUDA_PYTORCH_QUANTIZATION_REQS := bitsandbytes git+https://github.com/casper-hansen/AutoAWQ.git +CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,bitsandbytes,autoawq +CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,autoawq CLI_CPU_OPENVINO_REQS := testing,openvino,timm,diffusers CLI_CPU_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft CLI_CPU_ONNXRUNTIME_REQS := testing,onnxruntime,timm,diffusers @@ -108,7 +107,7 @@ define test_nvidia --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-$(1):local -c "pip install -e .[$(2)] && pip install $(CLI_CUDA_PYTORCH_QUANTIZATION_REGS) && pytest tests/ -k '$(3)' -x" + opt-bench-$(1):local -c "pip install requests && pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x" endef define test_amdgpu @@ -122,7 +121,7 @@ define test_amdgpu --entrypoint /bin/bash \ --volume $(PWD):/workspace \ --workdir /workspace \ - opt-bench-$(1):local -c "pip install -e .[$(2)] && pip install git+https://github.com/casper-hansen/AutoAWQ.git && pytest tests/ -k '$(3)' -x" + opt-bench-$(1):local -c "pip install requests && pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x" endef # group the extra diff --git a/setup.py b/setup.py index 50dc0528..6a10f78b 100644 --- a/setup.py +++ b/setup.py @@ -55,6 +55,8 @@ "diffusers": ["diffusers"], "timm": ["timm"], "peft": ["peft"], + "autoawq": ["autoawq@git+https://github.com/casper-hansen/AutoAWQ.git"], + "bitsandbytes": ["bitsandbytes"] } From dcae350152135ac488b96675ba63016cc60b8aeb Mon Sep 17 00:00:00 2001 From: Ali Abdelkader Date: Fri, 8 Mar 2024 21:29:58 +0200 Subject: [PATCH 6/6] minor fix to setup.py style --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6a10f78b..6212b4e1 100644 --- a/setup.py +++ b/setup.py @@ -56,7 +56,7 @@ "timm": ["timm"], "peft": ["peft"], "autoawq": ["autoawq@git+https://github.com/casper-hansen/AutoAWQ.git"], - "bitsandbytes": ["bitsandbytes"] 
+ "bitsandbytes": ["bitsandbytes"], }