diff --git a/neural_compressor/torch/algorithms/fp8_quant/model_configs.py b/neural_compressor/torch/algorithms/fp8_quant/model_configs.py
index 4884510165e..fe7838df873 100644
--- a/neural_compressor/torch/algorithms/fp8_quant/model_configs.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/model_configs.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Dict, Optional, Tuple, Any
 
 from neural_compressor.torch.utils.auto_accelerator import auto_detect_accelerator
diff --git a/neural_compressor/torch/algorithms/fp8_quant/observer.py b/neural_compressor/torch/algorithms/fp8_quant/observer.py
index 9617fbb304c..e9bad0594fc 100644
--- a/neural_compressor/torch/algorithms/fp8_quant/observer.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/observer.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Base class and helper functions for registering observers."""
 
 from typing import Dict, Optional, Any
diff --git a/neural_compressor/torch/algorithms/fp8_quant/patched_module_base.py b/neural_compressor/torch/algorithms/fp8_quant/patched_module_base.py
index 79e94ba3528..a40d11109a3 100644
--- a/neural_compressor/torch/algorithms/fp8_quant/patched_module_base.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/patched_module_base.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Base class for patched modules and helper functions for registering patched modules."""
 
 from typing import Union, List, Type, Optional
diff --git a/neural_compressor/torch/algorithms/fp8_quant/scaling_method_base.py b/neural_compressor/torch/algorithms/fp8_quant/scaling_method_base.py
index f2bac47f428..35d1f69a879 100644
--- a/neural_compressor/torch/algorithms/fp8_quant/scaling_method_base.py
+++ b/neural_compressor/torch/algorithms/fp8_quant/scaling_method_base.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """Base class and helper functions for registering scaling methods."""
 
 from typing import Dict, Optional, Any
diff --git a/neural_compressor/torch/algorithms/weight_only/save_load.py b/neural_compressor/torch/algorithms/weight_only/save_load.py
index a3c8e4240f2..b8c720b5c2c 100644
--- a/neural_compressor/torch/algorithms/weight_only/save_load.py
+++ b/neural_compressor/torch/algorithms/weight_only/save_load.py
@@ -22,7 +22,7 @@
 
 import torch
 
-from neural_compressor.common.utils import save_config_mapping, AWQ, TEQ
+from neural_compressor.common.utils import AWQ, TEQ, save_config_mapping
 from neural_compressor.torch.utils import (
     HPU_SAFE_WEIGHTS_NAME,
     HPU_WEIGHT_NAME,
@@ -36,7 +36,10 @@
 from .modules import HPUWeightOnlyLinear, INCWeightOnlyLinear, MulLinear
 from .utility import convert_dtype_str2torch
 
-format_woqlinear_mapping = {SaveLoadFormat.HUGGINGFACE: INCWeightOnlyLinear, SaveLoadFormat.DEFAULT: INCWeightOnlyLinear}
+format_woqlinear_mapping = {
+    SaveLoadFormat.HUGGINGFACE: INCWeightOnlyLinear,
+    SaveLoadFormat.DEFAULT: INCWeightOnlyLinear,
+}
 device_woqlinear_mapping = {"cpu": INCWeightOnlyLinear, "hpu": HPUWeightOnlyLinear}
 
 
@@ -199,7 +202,7 @@ def load_inc_format_woq_model(self):
         model = self._build_woq_model()
 
         # load remaining pretrained weight to weight-only quantization model
-        is_meta_device = hasattr(self.original_model, "device") and self.original_model.device.type == 'meta'
+        is_meta_device = hasattr(self.original_model, "device") and self.original_model.device.type == "meta"
         algo_name = next(iter(self.quantization_config[next(iter(self.quantization_config))].keys()))
         if is_meta_device or algo_name in [AWQ, TEQ]:
             # AWQ and TEQ will update some weight except WOQLinear to handle additional input_scale
@@ -297,7 +300,7 @@ def _load_data_to_new_module_hqq(self, new_module, module_name):
         new_module_state_dict = {}
         for key in self.loaded_state_dict:
             if key.startswith(module_name):
-                new_key = key[len(module_name) + 1:]  # Remove module_name and the following dot
+                new_key = key[len(module_name) + 1 :]  # Remove module_name and the following dot
                 new_module_state_dict[new_key] = self.loaded_state_dict[key]
                 self.loaded_state_dict_keys.remove(key)
         new_module.load_state_dict(new_module_state_dict, strict=False)
@@ -863,7 +866,7 @@ def _load_remaining_pretrained_weight(self, model):
         for shard_file in resolved_archive_file:
             state_dict = load_state_dict(shard_file)
 
-            params_dict={
+            params_dict = {
                 "model": model,
                 "state_dict": state_dict,
                 "start_prefix": "",
@@ -877,6 +880,7 @@
             }
 
             import transformers
+
             if transformers.__version__ < "4.45.0":
                 params_dict["loaded_state_dict_keys"] = self.loaded_state_dict_keys
 
diff --git a/neural_compressor/torch/quantization/quantize.py b/neural_compressor/torch/quantization/quantize.py
index 3c40ec068c9..b387a7f8ab7 100644
--- a/neural_compressor/torch/quantization/quantize.py
+++ b/neural_compressor/torch/quantization/quantize.py
@@ -20,7 +20,7 @@
 
 from neural_compressor.common.base_config import BaseConfig, ComposableConfig, config_registry
 from neural_compressor.common.utils import Mode, call_counter, log_process
-from neural_compressor.torch.quantization.config import SmoothQuantConfig, INT8StaticQuantConfig
+from neural_compressor.torch.quantization.config import INT8StaticQuantConfig, SmoothQuantConfig
 from neural_compressor.torch.utils import is_ipex_available, logger
 from neural_compressor.torch.utils.utility import WHITE_MODULE_LIST, algos_mapping, get_model_info
 
diff --git a/neural_compressor/torch/quantization/save_load_entry.py b/neural_compressor/torch/quantization/save_load_entry.py
index 9ee5b3aafea..535fc129685 100644
--- a/neural_compressor/torch/quantization/save_load_entry.py
+++ b/neural_compressor/torch/quantization/save_load_entry.py
@@ -32,6 +32,7 @@
     FP8_QUANT: FP8Config,
 }
 
+
 def save(model, checkpoint_dir="saved_results", format="default"):
     """Save quantized model.
 
@@ -46,6 +47,7 @@ def save(model, checkpoint_dir="saved_results", format="default"):
     # fp8_quant
     if isinstance(config_object, FP8Config):
         from neural_compressor.torch.algorithms import fp8_quant
+
         format = SaveLoadFormat.HUGGINGFACE.value  # TODO: support default format for FP8 algorithm
         fp8_quant.save(model, checkpoint_dir, format)
     else:
@@ -120,21 +122,26 @@ def load(model_name_or_path, original_model=None, format="default", device="cpu"
         ):  # WOQ
             from neural_compressor.torch.algorithms import weight_only
 
-            qmodel = weight_only.load(model_name_or_path, original_model, format=SaveLoadFormat.DEFAULT, device=device)
+            qmodel = weight_only.load(
+                model_name_or_path, original_model, format=SaveLoadFormat.DEFAULT, device=device
+            )
             return qmodel.to(device)
     elif format == SaveLoadFormat.HUGGINGFACE.value:
         import transformers
+
         config = transformers.AutoConfig.from_pretrained(model_name_or_path, **kwargs)
         # use config to check which algorithm is used.
         if (
-            "fp8_config" in config.quantization_config or
+            "fp8_config" in config.quantization_config
+            or
             # for FP8 LLMs for vLLM (https://huggingface.co/neuralmagic).
             (
-                "quant_method" in config.quantization_config and
-                config.quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
+                "quant_method" in config.quantization_config
+                and config.quantization_config["quant_method"] in ["fp8", "compressed-tensors"]
             )
         ):
             from neural_compressor.torch.algorithms import fp8_quant
+
             return fp8_quant.load(model_name_or_path, format=format, device=device, **kwargs)
         else:
             from neural_compressor.torch.algorithms import weight_only
diff --git a/neural_compressor/torch/utils/auto_accelerator.py b/neural_compressor/torch/utils/auto_accelerator.py
index ac33905c2fd..ccbfc6c726d 100644
--- a/neural_compressor/torch/utils/auto_accelerator.py
+++ b/neural_compressor/torch/utils/auto_accelerator.py
@@ -26,12 +26,12 @@
 
 import os
 from abc import ABC, abstractmethod
+from functools import lru_cache
 from typing import Any, Callable, List
 
 import torch
 
 from neural_compressor.common.utils import LazyImport, logger
-from functools import lru_cache
 
 htcore = LazyImport("habana_frameworks.torch.core")
 
@@ -151,7 +151,6 @@ def synchronize(self):
         pass
 
 
-
 @register_accelerator(name="cpu", priority=PRIORITY_CPU)
 class CPU_Accelerator(Auto_Accelerator):
     """CPU Accelerator."""
diff --git a/neural_compressor/torch/utils/block_wise.py b/neural_compressor/torch/utils/block_wise.py
index e028d97c48a..13bf72c1ac1 100644
--- a/neural_compressor/torch/utils/block_wise.py
+++ b/neural_compressor/torch/utils/block_wise.py
@@ -11,23 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""This utility is for block-wise calibration of LLMs"""
+"""This utility is for block-wise calibration of LLMs."""
 
-import torch
 import gc
 from functools import partial
 
+import torch
+
 from neural_compressor.torch.utils import (
     fetch_module,
-    logger,
-    set_module,
-    get_accelerator,
     forward_wrapper,
+    get_accelerator,
     get_non_persistent_buffers,
     load_non_persistent_buffers,
+    logger,
+    set_module,
 )
 
-
 cur_accelerator = get_accelerator()
 
 
diff --git a/neural_compressor/torch/utils/environ.py b/neural_compressor/torch/utils/environ.py
index 0cf02de93bd..2ffd94a1c93 100644
--- a/neural_compressor/torch/utils/environ.py
+++ b/neural_compressor/torch/utils/environ.py
@@ -229,8 +229,9 @@ def is_tbb_available():  # pragma: no cover
 
 def get_used_hpu_mem_MB():
     """Get HPU used memory: MiB."""
-    from habana_frameworks.torch.hpu import memory_stats
     import numpy as np
+    from habana_frameworks.torch.hpu import memory_stats
+
     torch.hpu.synchronize()
     mem_stats = memory_stats()
     used_hpu_mem = np.round(mem_stats["InUse"] / 1024**2, 3)
@@ -240,6 +241,7 @@
 def get_used_cpu_mem_MB():
     """Get the amount of CPU memory used by the current process in MiB (Mebibytes)."""
     import psutil
+
     process = psutil.Process()
     mem_info = process.memory_info()
     used_cpu_mem = round(mem_info.rss / 1024**2, 3)
diff --git a/neural_compressor/torch/utils/utility.py b/neural_compressor/torch/utils/utility.py
index 122029a7a30..9465994bbe4 100644
--- a/neural_compressor/torch/utils/utility.py
+++ b/neural_compressor/torch/utils/utility.py
@@ -627,12 +627,11 @@ def find_matching_blocks(model, all_blocks, to_quant_block_names=None):
 
 
 def get_non_persistent_buffers(model):
-    """
-    Get all non-persistent buffers in the model.
-
+    """Get all non-persistent buffers in the model.
+
     Args:
         model (torch.nn.Module): PyTorch model
-
+
     Returns:
         dict: A dictionary containing all non-persistent buffers, {buffer_names: buffer_tensors}
     """
@@ -646,13 +645,11 @@
 
 
 def load_non_persistent_buffers(model, non_persistent_buffers):
-    """
-    Load all non-persistent buffers into the model.
-
+    """Load all non-persistent buffers into the model.
+
     Args:
         model (torch.nn.Module): PyTorch model
         non_persistent_buffers (dict): A dictionary containing all non-persistent buffers, {buffer_names: buffer_tensors}
-
     """
     for full_name, buffer in non_persistent_buffers.items():
         module_name, buffer_name = full_name
diff --git a/test/3x/torch/quantization/fp8_quant/conftest.py b/test/3x/torch/quantization/fp8_quant/conftest.py
index 746d9cdbc4f..a9b94011831 100644
--- a/test/3x/torch/quantization/fp8_quant/conftest.py
+++ b/test/3x/torch/quantization/fp8_quant/conftest.py
@@ -1,4 +1,4 @@
 # Called once at the beginning of the test session
 def pytest_sessionstart():
     import os
-    os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")
\ No newline at end of file
+    os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE")