huggingface · dacorvo · Jan 10, 2025 · Jan 6, 2025 · Jan 10, 2025 · Jan 10, 2025
@@ -68,7 +68,7 @@ def elapsed_time(self, other):
 
     memory = get_device_memory(device)
     if memory is not None:
-        print(f"Device memory: {memory / (2 ** 30):.4f} GB")
+        print(f"Device memory: {memory / (2**30):.4f} GB")
 
     latencies = []
     input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device)
@@ -89,7 +89,7 @@ def elapsed_time(self, other):
 
     if device.type == "cuda":
         peak_memory = torch.cuda.max_memory_allocated()
-        print(f"Peak memory during benchmark: {peak_memory / (2 ** 30):.4f} GB")
+        print(f"Peak memory during benchmark: {peak_memory / (2**30):.4f} GB")
 
     mean_latency = np.mean(latencies) / generation_config.min_new_tokens
     print(f"Average latency per token: {mean_latency} ms")

@@ -64,7 +64,7 @@ def setup(
                 calibrate(model, tokenizer, batch_size, batches=4)
         print("Freezing")
         freeze(model)
-        print(f"Finished: {time.time()-start:.2f}")
+        print(f"Finished: {time.time() - start:.2f}")
     return model, tokenizer
 
 

@@ -12,13 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import platform
+
 import torch
 
 from .cpp import *
 from .extension import *
 
 
-if torch.cuda.is_available():
+if torch.cuda.is_available() and platform.system() == "Linux":
     if torch.version.cuda:
         from .cuda import *
     elif torch.version.hip:

@@ -19,6 +19,7 @@
 from packaging import version
 from torch.autograd import Function
 
+from ...library import is_extension_available
 from ..function import QuantizedLinearFunction
 from ..grouped import grouped_shape
 from ..packed import PackedTensor
@@ -102,6 +103,7 @@ def create(qtype, axis, group_size, size, stride, data, scale, shift, requires_g
             and len(size) == 2
             and (data.device.type == "cuda" and torch.version.cuda)
             and torch.cuda.get_device_capability(data.device)[0] >= 8
+            and is_extension_available("quanto_cuda")
         ):
             if type(data) is PackedTensor:
                 data = data.unpack()

@@ -18,6 +18,7 @@
 import torch
 from torch.autograd import Function
 
+from ...library import is_extension_available
 from ..function import QuantizedLinearFunction
 from ..qbytes import QBytesTensor
 from ..qtensor import qfallback
@@ -126,6 +127,7 @@ def create(
             and (data.device.type == "cuda" and torch.version.cuda)
             and axis == 0
             and torch.cuda.get_device_capability(data.device)[0] >= 8
+            and is_extension_available("quanto_cuda")
         ):
             out_features, in_features = size
             if (

@@ -32,6 +32,6 @@ def check_weight_qtensor_linear(qweight, batch_size, tokens, use_bias, rel_max_e
     rel_max_err = max_err / mean_val
     # These values were evaluated empirically without any optimized kernels.
     rtol = {"cpu": 1e-2, "cuda": 2e-2, "mps": 1e-2, "xpu": 2e-2}[device.type]
-    assert (
-        rel_max_err < rtol
-    ), f"Maximum error {max_err:.2f} is too high for input of mean value {mean_val:.2f} ({rel_max_err*100:.2f} %)"
+    assert rel_max_err < rtol, (
+        f"Maximum error {max_err:.2f} is too high for input of mean value {mean_val:.2f} ({rel_max_err * 100:.2f} %)"
+    )