From 24222c674eaaac42f4c02b0f0aa597db69de1af2 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Mon, 6 Jan 2025 13:17:12 +0100
Subject: [PATCH 1/3] fix(library): only compile CUDA extension on Linux

---
 optimum/quanto/library/extensions/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/optimum/quanto/library/extensions/__init__.py b/optimum/quanto/library/extensions/__init__.py
index 44979e5d..f57c1057 100644
--- a/optimum/quanto/library/extensions/__init__.py
+++ b/optimum/quanto/library/extensions/__init__.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import platform
+
 import torch
 
 from .cpp import *
 from .extension import *
 
 
-if torch.cuda.is_available():
+if torch.cuda.is_available() and platform.system() == "Linux":
     if torch.version.cuda:
         from .cuda import *
     elif torch.version.hip:

From 1a7075902b2d103a9f267ea33e1d52b213446d31 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 10 Jan 2025 11:11:36 +0100
Subject: [PATCH 2/3] fix(weights): only use Marlin kernels when extension is
 available

---
 optimum/quanto/tensor/weights/qbits.py  | 2 ++
 optimum/quanto/tensor/weights/qbytes.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/optimum/quanto/tensor/weights/qbits.py b/optimum/quanto/tensor/weights/qbits.py
index f9ca965c..ece5dba9 100644
--- a/optimum/quanto/tensor/weights/qbits.py
+++ b/optimum/quanto/tensor/weights/qbits.py
@@ -19,6 +19,7 @@
 from packaging import version
 from torch.autograd import Function
 
+from ...library import is_extension_available
 from ..function import QuantizedLinearFunction
 from ..grouped import grouped_shape
 from ..packed import PackedTensor
@@ -102,6 +103,7 @@ def create(qtype, axis, group_size, size, stride, data, scale, shift, requires_g
             and len(size) == 2
             and (data.device.type == "cuda" and torch.version.cuda)
             and torch.cuda.get_device_capability(data.device)[0] >= 8
+            and is_extension_available("quanto_cuda")
         ):
             if type(data) is PackedTensor:
                 data = data.unpack()
diff --git a/optimum/quanto/tensor/weights/qbytes.py b/optimum/quanto/tensor/weights/qbytes.py
index d2dd50a7..f3eb0dca 100644
--- a/optimum/quanto/tensor/weights/qbytes.py
+++ b/optimum/quanto/tensor/weights/qbytes.py
@@ -18,6 +18,7 @@
 import torch
 from torch.autograd import Function
 
+from ...library import is_extension_available
 from ..function import QuantizedLinearFunction
 from ..qbytes import QBytesTensor
 from ..qtensor import qfallback
@@ -126,6 +127,7 @@ def create(
             and (data.device.type == "cuda" and torch.version.cuda)
             and axis == 0
             and torch.cuda.get_device_capability(data.device)[0] >= 8
+            and is_extension_available("quanto_cuda")
         ):
             out_features, in_features = size
             if (

From 03990a558819568415ec7f37fae732765dc9f82c Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 10 Jan 2025 11:16:11 +0100
Subject: [PATCH 3/3] style: apply ruff formatting

---
 bench/generation/metrics/latency.py   | 4 ++--
 bench/generation/setup/quanto.py      | 2 +-
 test/tensor/weights/weight_helpers.py | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/bench/generation/metrics/latency.py b/bench/generation/metrics/latency.py
index d08be26d..b6d167e5 100644
--- a/bench/generation/metrics/latency.py
+++ b/bench/generation/metrics/latency.py
@@ -68,7 +68,7 @@ def elapsed_time(self, other):
 
     memory = get_device_memory(device)
     if memory is not None:
-        print(f"Device memory: {memory / (2 ** 30):.4f} GB")
+        print(f"Device memory: {memory / (2**30):.4f} GB")
 
     latencies = []
     input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device)
@@ -89,7 +89,7 @@ def elapsed_time(self, other):
 
     if device.type == "cuda":
         peak_memory = torch.cuda.max_memory_allocated()
-        print(f"Peak memory during benchmark: {peak_memory / (2 ** 30):.4f} GB")
+        print(f"Peak memory during benchmark: {peak_memory / (2**30):.4f} GB")
 
     mean_latency = np.mean(latencies) / generation_config.min_new_tokens
     print(f"Average latency per token: {mean_latency} ms")
diff --git a/bench/generation/setup/quanto.py b/bench/generation/setup/quanto.py
index 4dd07fdb..810b7bb8 100644
--- a/bench/generation/setup/quanto.py
+++ b/bench/generation/setup/quanto.py
@@ -64,7 +64,7 @@ def setup(
         calibrate(model, tokenizer, batch_size, batches=4)
     print("Freezing")
     freeze(model)
-    print(f"Finished: {time.time()-start:.2f}")
+    print(f"Finished: {time.time() - start:.2f}")
     return model, tokenizer
 
 
diff --git a/test/tensor/weights/weight_helpers.py b/test/tensor/weights/weight_helpers.py
index 762836e7..cedb681d 100644
--- a/test/tensor/weights/weight_helpers.py
+++ b/test/tensor/weights/weight_helpers.py
@@ -32,6 +32,6 @@ def check_weight_qtensor_linear(qweight, batch_size, tokens, use_bias, rel_max_e
     rel_max_err = max_err / mean_val
     # These values were evaluated empirically without any optimized kernels.
     rtol = {"cpu": 1e-2, "cuda": 2e-2, "mps": 1e-2, "xpu": 2e-2}[device.type]
-    assert (
-        rel_max_err < rtol
-    ), f"Maximum error {max_err:.2f} is too high for input of mean value {mean_val:.2f} ({rel_max_err*100:.2f} %)"
+    assert rel_max_err < rtol, (
+        f"Maximum error {max_err:.2f} is too high for input of mean value {mean_val:.2f} ({rel_max_err * 100:.2f} %)"
+    )
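
Note: the following is a minimal, self-contained sketch (not part of the
patches above) of how the guards from patches 1 and 2 compose. The helper
names quanto_cuda_built and can_use_marlin_kernels are illustrative, not
optimum-quanto API; in the library the extension check is the real
is_extension_available("quanto_cuda"), which is stubbed here with the
patch 1 platform condition so the sketch runs standalone.

    import platform

    import torch


    def quanto_cuda_built() -> bool:
        # Patch 1: the CUDA extension is only compiled on Linux, so on any
        # other OS (or without a CUDA device) it can never be available.
        return torch.cuda.is_available() and platform.system() == "Linux"


    def can_use_marlin_kernels(data: torch.Tensor) -> bool:
        # Patch 2: beyond the device checks, also require that the extension
        # actually built; otherwise fall back to the generic quantized path.
        return (
            data.device.type == "cuda"
            and bool(torch.version.cuda)  # NVIDIA CUDA, not ROCm/HIP
            and torch.cuda.get_device_capability(data.device)[0] >= 8
            and quanto_cuda_built()
        )

Before this series, the first three conditions alone selected the Marlin
path, so a CUDA-capable machine where the extension failed to build (e.g.
Windows) would crash instead of falling back.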