From 24222c674eaaac42f4c02b0f0aa597db69de1af2 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Mon, 6 Jan 2025 13:17:12 +0100
Subject: [PATCH 1/3] fix(library): only compile CUDA extension on Linux

---
 optimum/quanto/library/extensions/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/optimum/quanto/library/extensions/__init__.py b/optimum/quanto/library/extensions/__init__.py
index 44979e5d..f57c1057 100644
--- a/optimum/quanto/library/extensions/__init__.py
+++ b/optimum/quanto/library/extensions/__init__.py
@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import platform
+
 import torch
 
 from .cpp import *
 from .extension import *
 
 
-if torch.cuda.is_available():
+if torch.cuda.is_available() and platform.system() == "Linux":
     if torch.version.cuda:
         from .cuda import *
     elif torch.version.hip:

From 1a7075902b2d103a9f267ea33e1d52b213446d31 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 10 Jan 2025 11:11:36 +0100
Subject: [PATCH 2/3] fix(weights): only use Marlin kernels when extension is
 available

---
 optimum/quanto/tensor/weights/qbits.py  | 2 ++
 optimum/quanto/tensor/weights/qbytes.py | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/optimum/quanto/tensor/weights/qbits.py b/optimum/quanto/tensor/weights/qbits.py
index f9ca965c..ece5dba9 100644
--- a/optimum/quanto/tensor/weights/qbits.py
+++ b/optimum/quanto/tensor/weights/qbits.py
@@ -19,6 +19,7 @@
 from packaging import version
 from torch.autograd import Function
 
+from ...library import is_extension_available
 from ..function import QuantizedLinearFunction
 from ..grouped import grouped_shape
 from ..packed import PackedTensor
@@ -102,6 +103,7 @@ def create(qtype, axis, group_size, size, stride, data, scale, shift, requires_g
             and len(size) == 2
             and (data.device.type == "cuda" and torch.version.cuda)
             and torch.cuda.get_device_capability(data.device)[0] >= 8
+            and is_extension_available("quanto_cuda")
         ):
             if type(data) is PackedTensor:
                 data = data.unpack()
diff --git a/optimum/quanto/tensor/weights/qbytes.py b/optimum/quanto/tensor/weights/qbytes.py
index d2dd50a7..f3eb0dca 100644
--- a/optimum/quanto/tensor/weights/qbytes.py
+++ b/optimum/quanto/tensor/weights/qbytes.py
@@ -18,6 +18,7 @@
 import torch
 from torch.autograd import Function
 
+from ...library import is_extension_available
 from ..function import QuantizedLinearFunction
 from ..qbytes import QBytesTensor
 from ..qtensor import qfallback
@@ -126,6 +127,7 @@ def create(
             and (data.device.type == "cuda" and torch.version.cuda)
             and axis == 0
             and torch.cuda.get_device_capability(data.device)[0] >= 8
+            and is_extension_available("quanto_cuda")
         ):
             out_features, in_features = size
             if (

From 03990a558819568415ec7f37fae732765dc9f82c Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Fri, 10 Jan 2025 11:16:11 +0100
Subject: [PATCH 3/3] style: apply ruff formatting

---
 bench/generation/metrics/latency.py   | 4 ++--
 bench/generation/setup/quanto.py      | 2 +-
 test/tensor/weights/weight_helpers.py | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/bench/generation/metrics/latency.py b/bench/generation/metrics/latency.py
index d08be26d..b6d167e5 100644
--- a/bench/generation/metrics/latency.py
+++ b/bench/generation/metrics/latency.py
@@ -68,7 +68,7 @@ def elapsed_time(self, other):
 
     memory = get_device_memory(device)
     if memory is not None:
-        print(f"Device memory: {memory / (2 ** 30):.4f} GB")
+        print(f"Device memory: {memory / (2**30):.4f} GB")
 
     latencies = []
     input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device)
@@ -89,7 +89,7 @@ def elapsed_time(self, other):
 
     if device.type == "cuda":
         peak_memory = torch.cuda.max_memory_allocated()
-        print(f"Peak memory during benchmark: {peak_memory / (2 ** 30):.4f} GB")
+        print(f"Peak memory during benchmark: {peak_memory / (2**30):.4f} GB")
 
     mean_latency = np.mean(latencies) / generation_config.min_new_tokens
     print(f"Average latency per token: {mean_latency} ms")
diff --git a/bench/generation/setup/quanto.py b/bench/generation/setup/quanto.py
index 4dd07fdb..810b7bb8 100644
--- a/bench/generation/setup/quanto.py
+++ b/bench/generation/setup/quanto.py
@@ -64,7 +64,7 @@ def setup(
         calibrate(model, tokenizer, batch_size, batches=4)
     print("Freezing")
     freeze(model)
-    print(f"Finished: {time.time()-start:.2f}")
+    print(f"Finished: {time.time() - start:.2f}")
     return model, tokenizer
 
 
diff --git a/test/tensor/weights/weight_helpers.py b/test/tensor/weights/weight_helpers.py
index 762836e7..cedb681d 100644
--- a/test/tensor/weights/weight_helpers.py
+++ b/test/tensor/weights/weight_helpers.py
@@ -32,6 +32,6 @@ def check_weight_qtensor_linear(qweight, batch_size, tokens, use_bias, rel_max_e
     rel_max_err = max_err / mean_val
     # These values were evaluated empirically without any optimized kernels.
     rtol = {"cpu": 1e-2, "cuda": 2e-2, "mps": 1e-2, "xpu": 2e-2}[device.type]
-    assert (
-        rel_max_err < rtol
-    ), f"Maximum error {max_err:.2f} is too high for input of mean value {mean_val:.2f} ({rel_max_err*100:.2f} %)"
+    assert rel_max_err < rtol, (
+        f"Maximum error {max_err:.2f} is too high for input of mean value {mean_val:.2f} ({rel_max_err * 100:.2f} %)"
+    )
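
Note: the following is a minimal, self-contained sketch (not part of the
patches above) of how the guards from patches 1 and 2 compose. The helper
names quanto_cuda_built and can_use_marlin_kernels are illustrative, not
optimum-quanto API; in the library the extension check is the real
is_extension_available("quanto_cuda"), which is stubbed here with the
patch 1 platform condition so the sketch runs standalone.

    import platform

    import torch


    def quanto_cuda_built() -> bool:
        # Patch 1: the CUDA extension is only compiled on Linux, so on any
        # other OS (or without a CUDA device) it can never be available.
        return torch.cuda.is_available() and platform.system() == "Linux"


    def can_use_marlin_kernels(data: torch.Tensor) -> bool:
        # Patch 2: beyond the device checks, also require that the extension
        # actually built; otherwise fall back to the generic quantized path.
        return (
            data.device.type == "cuda"
            and bool(torch.version.cuda)  # NVIDIA CUDA, not ROCm/HIP
            and torch.cuda.get_device_capability(data.device)[0] >= 8
            and quanto_cuda_built()
        )

Before this series, the first three conditions alone selected the Marlin
path, so a CUDA-capable machine where the extension failed to build (e.g.
Windows) would crash instead of falling back.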