add test configurations for quantization with onnxruntime, awq, bnb (#95) #144

Merged
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_cuda_pytorch.yaml
@@ -53,4 +53,4 @@ jobs:
          --workdir /workspace/optimum-benchmark
          --entrypoint /bin/bash
          opt-bench-cuda:${{ matrix.image.cuda_version }}
-         -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x"
+         -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,bitsandbytes,autoawq] && pytest -k 'cli and cuda and pytorch' -x"
2 changes: 1 addition & 1 deletion .github/workflows/test_cli_rocm_pytorch.yaml
@@ -51,4 +51,4 @@ jobs:
          --device /dev/dri/renderD129
          --entrypoint /bin/bash
          opt-bench-rocm:${{ matrix.image.rocm_version }}
-         -c "pip install -e .[testing,diffusers,timm,deepspeed,peft] && pytest -k 'cli and cuda and pytorch' -x"
+         -c "pip install requests && pip install -e .[testing,diffusers,timm,deepspeed,peft,autoawq] && pytest -k 'cli and cuda and pytorch and not bnb' -x"
11 changes: 5 additions & 6 deletions Makefile
@@ -15,9 +15,8 @@ CLI_MISC_REQS := testing

CLI_CUDA_ONNXRUNTIME_REQS := testing,timm,diffusers
CLI_ROCM_ONNXRUNTIME_REQS := testing,timm,diffusers
-CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft
-CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft
+CLI_CUDA_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,bitsandbytes,autoawq
+CLI_ROCM_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft,autoawq
CLI_CPU_OPENVINO_REQS := testing,openvino,timm,diffusers
CLI_CPU_PYTORCH_REQS := testing,timm,diffusers,deepspeed,peft
CLI_CPU_ONNXRUNTIME_REQS := testing,onnxruntime,timm,diffusers
@@ -108,7 +107,7 @@ define test_nvidia
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
-	opt-bench-$(1):local -c "pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x"
+	opt-bench-$(1):local -c "pip install requests && pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x"
endef

define test_amdgpu
@@ -122,7 +121,7 @@ define test_amdgpu
--entrypoint /bin/bash \
--volume $(PWD):/workspace \
--workdir /workspace \
-	opt-bench-$(1):local -c "pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x"
+	opt-bench-$(1):local -c "pip install requests && pip install -e .[$(2)] && pytest tests/ -k '$(3)' -x"
endef

# group the extra
@@ -144,7 +143,7 @@ test_cli_cuda_pytorch:
$(call test_nvidia,cuda,$(CLI_CUDA_PYTORCH_REQS),cli and cuda and pytorch)

test_cli_rocm_pytorch:
-	$(call test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft)
+	$(call test_amdgpu,rocm,$(CLI_ROCM_PYTORCH_REQS),cli and cuda and pytorch and peft and not bnb)

test_cli_cuda_onnxruntime:
$(call test_nvidia,cuda,$(CLI_CUDA_ONNXRUNTIME_REQS),cli and cuda and onnxruntime)
2 changes: 2 additions & 0 deletions setup.py
@@ -55,6 +55,8 @@
"diffusers": ["diffusers"],
"timm": ["timm"],
"peft": ["peft"],
"autoawq": ["autoawq@git+https://github.com/casper-hansen/AutoAWQ.git"],
"bitsandbytes": ["bitsandbytes"],
}


4 changes: 4 additions & 0 deletions tests/configs/_no_weights_sweep_.yaml
@@ -0,0 +1,4 @@
hydra:
sweeper:
params:
backend.no_weights: true,false
20 changes: 20 additions & 0 deletions tests/configs/cpu_inference_onnxruntime_static_quant_vit.yaml
@@ -0,0 +1,20 @@
defaults:
- backend: onnxruntime
# order of inheritance, last one overrides previous ones
- _base_ # inherits from base config
- _inference_ # inherits from inference config
- _cpu_ # inherits from cpu config
- _no_weights_sweep_ # sweep over no_weights: true,false
- _self_ # hydra 1.1 compatibility

experiment_name: cpu_inference_onnxruntime_static_quant_vit

backend:
model: google/vit-base-patch16-224
quantization: true
quantization_config:
is_static: true
per_channel: false

calibration: true

20 changes: 20 additions & 0 deletions tests/configs/cuda_inference_onnxruntime_static_quant_vit.yaml
@@ -0,0 +1,20 @@
defaults:
- backend: onnxruntime
# order of inheritance, last one overrides previous ones
- _base_ # inherits from base config
- _inference_ # inherits from inference config
- _cuda_ # inherits from cuda config
- _no_weights_sweep_ # sweep over no_weights: true,false
- _self_ # hydra 1.1 compatibility

experiment_name: cuda_inference_onnxruntime_static_quant_vit

backend:
model: google/vit-base-patch16-224
quantization: true
quantization_config:
is_static: true
per_channel: false

calibration: true
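Both vit static-quantization configs (CPU above is mirrored by the CUDA variant below) exercise Optimum's ONNX Runtime quantization path: `is_static: true` plus `calibration: true` means activation ranges must be computed on a calibration set before quantizing. A minimal sketch of that flow using the public `optimum.onnxruntime` API — the calibration dataset, sample count, save directory, and the CPU-oriented `avx512` preset are illustrative assumptions, not taken from the benchmark's internals:

```python
from optimum.onnxruntime import ORTModelForImageClassification, ORTQuantizer
from optimum.onnxruntime.configuration import AutoCalibrationConfig, AutoQuantizationConfig
from transformers import AutoImageProcessor

model_id = "google/vit-base-patch16-224"
model = ORTModelForImageClassification.from_pretrained(model_id, export=True)
quantizer = ORTQuantizer.from_pretrained(model)
processor = AutoImageProcessor.from_pretrained(model_id)

# is_static / per_channel mirror the YAML's quantization_config;
# avx512 is one of several presets (the CUDA variant would differ)
qconfig = AutoQuantizationConfig.avx512(is_static=True, per_channel=False)

# calibration: true -> compute activation ranges from a small dataset
def preprocess(examples):
    return processor(images=examples["image"])

calibration_dataset = quantizer.get_calibration_dataset(
    "beans", dataset_split="train", preprocess_function=preprocess, num_samples=64
)
calibration_config = AutoCalibrationConfig.minmax(calibration_dataset)
ranges = quantizer.fit(dataset=calibration_dataset, calibration_config=calibration_config)

quantizer.quantize(
    save_dir="vit_static_quant",  # illustrative output directory
    quantization_config=qconfig,
    calibration_tensors_range=ranges,
)
```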

16 changes: 16 additions & 0 deletions tests/configs/cuda_inference_pytorch_awq_exllama.yaml
@@ -0,0 +1,16 @@
defaults:
- backend: pytorch
- _base_ # inherits from base config
- _inference_ # inherits from inference config
- _cuda_ # inherits from cuda config
- _no_weights_sweep_ # sweep over no_weights: true,false
- _self_ # hydra 1.1 compatibility

experiment_name: cuda_inference_pytorch_awq_exllama

backend:
model: TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ
quantization_scheme: "awq"
quantization_config:
exllama_config:
version: 2
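The `quantization_scheme: "awq"` options presumably end up in a `transformers.AwqConfig` when the backend loads the model. A minimal sketch of the equivalent direct usage, assuming that mapping (the `device_map` choice is illustrative):

```python
from transformers import AutoModelForCausalLM, AwqConfig

# version="exllama" selects the ExLlama kernels; the nested
# {"version": 2} matches the YAML's exllama_config and picks ExLlama-v2
quantization_config = AwqConfig(
    version="exllama",
    exllama_config={"version": 2},
)

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ",
    quantization_config=quantization_config,
    device_map="cuda",  # illustrative; the test targets a CUDA device
)
```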
16 changes: 16 additions & 0 deletions tests/configs/cuda_inference_pytorch_gpt_bnb_4bit.yaml
@@ -0,0 +1,16 @@
defaults:
- backend: pytorch
- _base_ # inherits from base config
- _inference_ # inherits from inference config
- _cuda_ # inherits from cuda config
- _gpt_ # use gpt2 model
- _no_weights_sweep_ # sweep no_weights: true, false
- _self_ # hydra 1.1 compatibility

experiment_name: cuda_inference_pytorch_gpt_bnb_4bit

backend:
quantization_scheme: "bnb"
quantization_config:
load_in_4bit: true
bnb_4bit_compute_dtype: float16
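Assuming the `"bnb"` scheme forwards `quantization_config` to `transformers.BitsAndBytesConfig`, the test roughly corresponds to the following (gpt2 comes from the `_gpt_` default; `device_map` is illustrative):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirrors the YAML: load_in_4bit with bnb_4bit_compute_dtype: float16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",  # from the _gpt_ default
    quantization_config=quantization_config,
    device_map="cuda",  # illustrative
)
```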
15 changes: 15 additions & 0 deletions tests/configs/cuda_inference_pytorch_gpt_bnb_8bit.yaml
@@ -0,0 +1,15 @@
defaults:
- backend: pytorch
- _base_ # inherits from base config
- _inference_ # inherits from inference config
- _cuda_ # inherits from cuda config
- _gpt_ # use gpt2 model
- _no_weights_sweep_ # sweep no_weights: true, false
- _self_ # hydra 1.1 compatibility

experiment_name: cuda_inference_pytorch_gpt_bnb_8bit

backend:
quantization_scheme: "bnb"
quantization_config:
load_in_8bit: true
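And the 8-bit counterpart, under the same assumed mapping:

```python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# load_in_8bit: true -> LLM.int8() quantization via bitsandbytes
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="cuda",  # illustrative
)
```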