This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit 99e0c73: "deploy: 408e5f1"
kevinintel committed Aug 13, 2024 (1 parent: 786ee07)
Showing 466 changed files with 9,445 additions and 2,619 deletions.
@@ -5,7 +5,7 @@ intel_extension_for_transformers.neural_chat.pipeline.plugins.image2image.instru
.. autoapi-nested-parse::

-    Pipeline Modificaiton based from the diffusers 0.12.1 StableDiffusionInstructPix2PixPipeline.
+    Pipeline Modification based from the diffusers 0.12.1 StableDiffusionInstructPix2PixPipeline.



@@ -16,13 +16,6 @@ Classes

intel_extension_for_transformers.transformers.config.Provider
intel_extension_for_transformers.transformers.config.DynamicLengthConfig
intel_extension_for_transformers.transformers.config.QuantizationConfig
intel_extension_for_transformers.transformers.config.PruningConfig
intel_extension_for_transformers.transformers.config.DistillationConfig
intel_extension_for_transformers.transformers.config.TFDistillationConfig
intel_extension_for_transformers.transformers.config.FlashDistillationConfig
intel_extension_for_transformers.transformers.config.AutoDistillationConfig
intel_extension_for_transformers.transformers.config.NASConfig
intel_extension_for_transformers.transformers.config.BenchmarkConfig
intel_extension_for_transformers.transformers.config.PrunerV2
intel_extension_for_transformers.transformers.config.WeightPruningConfig
@@ -83,120 +76,6 @@ Module Contents
:param evo_eval_metric: The metric name used in evolution search


.. py:class:: QuantizationConfig(framework: str = 'pytorch', approach: str = 'PostTrainingStatic', strategy: str = 'basic', timeout: int = 0, max_trials: int = 100, metrics: Union[intel_extension_for_transformers.transformers.utils.metrics.Metric, List] = None, objectives: Union[intel_extension_for_transformers.transformers.utils.objectives.Objective, List] = performance, config_file: str = None, sampling_size: int = 100, use_bf16: bool = False, recipes: dict = None)
Configure the quantization process.

:param framework: Which framework you used
:param approach: Which quantization approach to use
:param strategy: Which quantization tuning strategy to use
:param timeout: Tuning timeout (seconds); 0 means early stop. Combined with the max_trials field to decide when to exit
:param max_trials: Maximum number of tuning trials
:param metrics: Used to evaluate the accuracy of the tuned model; not needed for NoTrainerOptimizer
:param objectives: Objective with accuracy constraint guaranteed
:param config_file: Path to the config file
:param sampling_size: How many samples to use
:param use_bf16: Whether to use bf16
:param recipes: Recipes to apply for quantization; neural_compressor supports the recipes below:
    'smooth_quant': whether to do smooth quant
    'smooth_quant_args': parameters for smooth_quant
    'fast_bias_correction': whether to do fast bias correction
    'weight_correction': whether to do weight correction
    'gemm_to_matmul': whether to convert gemm to matmul and add, only valid for onnx models
    'graph_optimization_level': supports 'DISABLE_ALL', 'ENABLE_BASIC', 'ENABLE_EXTENDED', 'ENABLE_ALL',
        only valid for onnx models
    'first_conv_or_matmul_quantization': whether to quantize the first conv or matmul
    'last_conv_or_matmul_quantization': whether to quantize the last conv or matmul
    'pre_post_process_quantization': whether to quantize the ops in preprocess and postprocess
    'add_qdq_pair_to_weight': whether to add a QDQ pair for weights, only valid for onnxrt_trt_ep
    'optypes_to_exclude_output_quant': don't quantize the output of specified optypes
    'dedicated_qdq_pair': whether to dedicate a QDQ pair, only valid for onnxrt_trt_ep.

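As an illustration only, a minimal sketch of constructing this config using the module paths listed above; the metric name and recipe values are placeholders, not defaults of this package:

.. code-block:: python

    from intel_extension_for_transformers.transformers.config import QuantizationConfig
    from intel_extension_for_transformers.transformers.utils.metrics import Metric
    from intel_extension_for_transformers.transformers.utils import objectives

    # Post-training static quantization tuned against an accuracy metric.
    q_config = QuantizationConfig(
        framework="pytorch",
        approach="PostTrainingStatic",
        max_trials=100,
        metrics=[Metric(name="eval_accuracy")],   # metric name is illustrative
        objectives=[objectives.performance],
        sampling_size=100,
        recipes={"smooth_quant": True, "smooth_quant_args": {"alpha": 0.5}},  # illustrative values
    )
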

.. py:class:: PruningConfig(framework: str = 'pytorch', epochs: int = 1, epoch_range: List = [0, 4], initial_sparsity_ratio: float = 0.0, target_sparsity_ratio: float = 0.97, metrics: intel_extension_for_transformers.transformers.utils.metrics.Metric = None, pruner_config: Union[List, neural_compressor.conf.config.Pruner] = None, config_file: str = None)
Configure the pruning process.

:param framework: Which framework you used
:param epochs: How many epochs to prune
:param epoch_range: Epoch range list
:param initial_sparsity_ratio: Initial sparsity goal; not needed if the pruner_config argument is defined
:param target_sparsity_ratio: Target sparsity goal; not needed if the pruner_config argument is defined
:param metrics: Used to evaluate the accuracy of the tuned model; not needed for NoTrainerOptimizer
:param pruner_config: Defines the pruning behavior; if it is None, NLP will create a default pruner with the
    'BasicMagnitude' pruning type
:param config_file: Path to the config file


.. py:method:: init_prune_config()
Init the pruning config.


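A minimal usage sketch (module paths taken from the summary above; the metric name is a placeholder):

.. code-block:: python

    from intel_extension_for_transformers.transformers.config import PruningConfig
    from intel_extension_for_transformers.transformers.utils.metrics import Metric

    # Prune for 3 epochs toward 90% sparsity with the default ('BasicMagnitude') pruner.
    p_config = PruningConfig(
        framework="pytorch",
        epochs=3,
        target_sparsity_ratio=0.9,
        metrics=Metric(name="eval_accuracy"),  # illustrative metric name
    )
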

.. py:class:: DistillationConfig(framework: str = 'pytorch', criterion: intel_extension_for_transformers.transformers.distillation.Criterion = None, metrics: intel_extension_for_transformers.transformers.utils.metrics.Metric = None, inc_config=None)
Configure the distillation process.

:param framework: Which framework you used
:param criterion: Criterion of training, example: "KnowledgeLoss"
:param metrics: Metrics for distillation
:param inc_config: Distillation config

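A minimal usage sketch; the criterion is left at its default here to avoid guessing the Criterion constructor, and the metric name is a placeholder:

.. code-block:: python

    from intel_extension_for_transformers.transformers.config import DistillationConfig
    from intel_extension_for_transformers.transformers.utils.metrics import Metric

    # Distillation config with the default criterion and an accuracy metric.
    d_config = DistillationConfig(
        framework="pytorch",
        metrics=Metric(name="eval_accuracy"),  # illustrative metric name
    )
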

.. py:class:: TFDistillationConfig(loss_types: list = [], loss_weights: list = [], train_steps: list = [], temperature: float = 1.0)
Configure the distillation process for TensorFlow.

:param loss_types: Type of loss
:param loss_weights: Weight ratio of loss
:param train_steps: Steps of training
:param temperature: Parameter for KnowledgeDistillationLoss

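A minimal sketch using only parameters from the signature above; the loss names and values are arbitrary examples:

.. code-block:: python

    from intel_extension_for_transformers.transformers.config import TFDistillationConfig

    tf_d_config = TFDistillationConfig(
        loss_types=["CE", "KL"],   # illustrative loss types
        loss_weights=[0.5, 0.5],
        train_steps=[500],
        temperature=2.0,
    )
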

.. py:class:: FlashDistillationConfig(block_names: list = [], layer_mappings_for_knowledge_transfer: list = [], loss_types: list = [], loss_weights: list = [], add_origin_loss: list = [], train_steps: list = [])
The flash distillation configuration used by AutoDistillationConfig.


.. py:class:: AutoDistillationConfig(framework: str = 'pytorch', search_space: dict = {}, search_algorithm: str = 'BO', metrics: Union[List, intel_extension_for_transformers.transformers.utils.metrics.Metric] = None, max_trials: int = None, seed: int = None, knowledge_transfer: FlashDistillationConfig = None, regular_distillation: FlashDistillationConfig = None)
Configure the auto distillation process.

:param framework: Which framework you used
:param search_space: Search space of NAS
:param search_algorithm: Search algorithm used in NAS, e.g. Bayesian Optimization
:param metrics: Metrics used to evaluate the performance of the model architecture candidate
:param max_trials: Maximum trials in NAS process
:param seed: Seed of random process
:param knowledge_transfer: Configuration controlling the behavior of knowledge transfer stage
in the autodistillation
:param regular_distillation: Configuration controlling the behavior of regular distillation stage
in the autodistillation

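A hedged sketch combining this class with FlashDistillationConfig from above; the search space, block names, and loss settings are illustrative placeholders, not values required by this package:

.. code-block:: python

    from intel_extension_for_transformers.transformers.config import (
        AutoDistillationConfig,
        FlashDistillationConfig,
    )
    from intel_extension_for_transformers.transformers.utils.metrics import Metric

    auto_config = AutoDistillationConfig(
        framework="pytorch",
        search_space={"hidden_size": [128, 256]},   # illustrative search space
        search_algorithm="BO",                      # Bayesian Optimization
        max_trials=10,
        metrics=[Metric(name="eval_accuracy")],     # illustrative metric name
        knowledge_transfer=FlashDistillationConfig(
            block_names=["encoder.layer.0"],        # illustrative block name
            loss_types=[["KL"]],                    # illustrative loss setting
            train_steps=[500],
        ),
    )
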

.. py:class:: NASConfig(framework: str = 'pytorch', approach: str = 'basic', search_space: dict = {}, search_algorithm: str = 'BO', metrics: Union[List, intel_extension_for_transformers.transformers.utils.metrics.Metric] = None, max_trials: int = None, seed: int = None)
Config parser.

:param approach: The approach of the NAS.
:param search_algorithm: The search algorithm for NAS procedure.

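A minimal sketch using only parameters from the signature above; the search space is an illustrative placeholder:

.. code-block:: python

    from intel_extension_for_transformers.transformers.config import NASConfig

    nas_config = NASConfig(
        framework="pytorch",
        approach="basic",
        search_space={"hidden_size": [128, 256]},  # illustrative search space
        search_algorithm="BO",                     # Bayesian Optimization
        max_trials=10,
    )
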

.. py:class:: BenchmarkConfig(backend: str = 'torch', batch_size: int = 1, warmup: int = 5, iteration: int = 20, cores_per_instance: int = 4, num_of_instance: int = -1, torchscript: bool = False, generate: bool = False, **kwargs)
Config Class for Benchmark.
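All parameters below come from the signature above; the values are arbitrary examples:

.. code-block:: python

    from intel_extension_for_transformers.transformers.config import BenchmarkConfig

    # Benchmark with 8-sample batches, 5 warmup runs, and 20 measured iterations.
    b_config = BenchmarkConfig(
        backend="torch",
        batch_size=8,
        warmup=5,
        iteration=20,
        cores_per_instance=4,
        num_of_instance=-1,   # default from the signature above
    )
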
@@ -0,0 +1,86 @@
intel_extension_for_transformers.transformers.kv_cache_compression.models.modeling_llama
========================================================================================

.. py:module:: intel_extension_for_transformers.transformers.kv_cache_compression.models.modeling_llama
.. autoapi-nested-parse::

PyTorch llama model.



Classes
-------

.. autoapisummary::

intel_extension_for_transformers.transformers.kv_cache_compression.models.modeling_llama.LlamaAttention
intel_extension_for_transformers.transformers.kv_cache_compression.models.modeling_llama.LlamaFlashAttention2
intel_extension_for_transformers.transformers.kv_cache_compression.models.modeling_llama.LlamaSdpaAttention


Functions
---------

.. autoapisummary::

intel_extension_for_transformers.transformers.kv_cache_compression.models.modeling_llama.apply_rotary_pos_emb


Module Contents
---------------

.. py:function:: apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1)
Applies Rotary Position Embedding to the query and key tensors.

:param q: The query tensor.
:type q: `torch.Tensor`
:param k: The key tensor.
:type k: `torch.Tensor`
:param cos: The cosine part of the rotary embedding.
:type cos: `torch.Tensor`
:param sin: The sine part of the rotary embedding.
:type sin: `torch.Tensor`
:param position_ids: Deprecated and unused.
:type position_ids: `torch.Tensor`, *optional*
:param unsqueeze_dim: The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
:type unsqueeze_dim: `int`, *optional*, defaults to 1

:returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.

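A shape-only usage sketch; random tensors stand in for the real cos/sin produced by a rotary embedding module, so only the broadcasting behavior is illustrated:

.. code-block:: python

    import torch

    from intel_extension_for_transformers.transformers.kv_cache_compression.models.modeling_llama import (
        apply_rotary_pos_emb,
    )

    batch, heads, seq_len, head_dim = 1, 8, 16, 64
    q = torch.randn(batch, heads, seq_len, head_dim)   # [batch, heads, seq_len, head_dim]
    k = torch.randn(batch, heads, seq_len, head_dim)
    cos = torch.randn(batch, seq_len, head_dim)        # [batch, seq_len, head_dim]
    sin = torch.randn(batch, seq_len, head_dim)

    # unsqueeze_dim=1 inserts a heads axis into cos/sin so they broadcast against q and k.
    q_rot, k_rot = apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1)
    print(q_rot.shape, k_rot.shape)  # both keep the original [batch, heads, seq_len, head_dim]
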

.. py:class:: LlamaAttention(config: transformers.models.llama.configuration_llama.LlamaConfig, layer_idx: Optional[int] = None)
Multi-headed attention from 'Attention Is All You Need' paper.


.. py:class:: LlamaFlashAttention2(*args, **kwargs)
Llama flash attention module.

This module inherits from `LlamaAttention` as the weights of the module stay
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.


.. py:class:: LlamaSdpaAttention(config: transformers.models.llama.configuration_llama.LlamaConfig, layer_idx: Optional[int] = None)
Llama attention module using torch.nn.functional.scaled_dot_product_attention.

This module inherits from
`LlamaAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
SDPA API.

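For reference, a standalone call to the PyTorch SDPA kernel that this module's forward pass dispatches to (toy tensors, not the module's actual code path):

.. code-block:: python

    import torch
    import torch.nn.functional as F

    # [batch, heads, seq_len, head_dim]
    q = torch.randn(1, 8, 16, 64)
    k = torch.randn(1, 8, 16, 64)
    v = torch.randn(1, 8, 16, 64)

    # Causal self-attention; an explicit attn_mask could be passed instead of is_causal.
    out = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=True)
    print(out.shape)  # torch.Size([1, 8, 16, 64])
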

@@ -0,0 +1,48 @@
intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.bart.modeling_bart
===============================================================================================

.. py:module:: intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.bart.modeling_bart
.. autoapi-nested-parse::

PyTorch BART model.



Classes
-------

.. autoapisummary::

intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.bart.modeling_bart.gaudi_BartLearnedPositionalEmbedding


Functions
---------

.. autoapisummary::

intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.bart.modeling_bart.gaudi_BartAttention_forward


Module Contents
---------------

.. py:class:: gaudi_BartLearnedPositionalEmbedding(num_embeddings: int, embedding_dim: int)
This module learns positional embeddings up to a fixed maximum size.


.. py:method:: forward(input_ids: torch.Tensor, past_key_values_length: torch.Tensor = torch.tensor(0))
`input_ids` shape is expected to be [bsz x seqlen].


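A sketch of how position indices are typically derived when past key/values are present (illustrative only; not this module's exact code):

.. code-block:: python

    import torch

    bsz, seq_len = 2, 5
    past_key_values_length = 7  # number of tokens already cached

    # Positions continue from the cached length, one index per new token.
    positions = torch.arange(
        past_key_values_length, past_key_values_length + seq_len, dtype=torch.long
    ).unsqueeze(0).expand(bsz, seq_len)
    print(positions)  # each row is [7, 8, 9, 10, 11]
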

.. py:function:: gaudi_BartAttention_forward(self, hidden_states: torch.Tensor, key_value_states: Optional[torch.Tensor] = None, past_key_value: Optional[Tuple[torch.Tensor]] = None, attention_mask: Optional[torch.Tensor] = None, layer_head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False, token_idx: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]

Input shape: Batch x Time x Channel


@@ -0,0 +1,15 @@
intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.llama.pos_shift_llama
==================================================================================================

.. py:module:: intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.llama.pos_shift_llama
.. autoapi-nested-parse::

Adapted from https://github.com/tomaarsen/attention_sinks
Note (accelerate inference with hpu graphs in V1.15.1):
1. avoid using data dependent dynamic flow
2. avoid updating tensor by in-place view (a[:, idx] = c)
3. make all shapes static

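An illustrative sketch of the second note: instead of writing through an in-place view such as a[:, idx] = c, a static-shape-friendly alternative is index_copy_ (variable names are placeholders, not code from this module):

.. code-block:: python

    import torch

    cache = torch.zeros(4, 10, 8)       # e.g. [batch, max_seq_len, head_dim]
    new_values = torch.randn(4, 1, 8)   # values for a single position
    idx = torch.tensor([3])             # position being written

    # Writes new_values into cache[:, 3, :] without an in-place view assignment.
    cache.index_copy_(1, idx, new_values)
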


@@ -0,0 +1,41 @@
intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.mistral.modeling_mistral
=====================================================================================================

.. py:module:: intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.mistral.modeling_mistral
.. autoapi-nested-parse::

PyTorch Mistral model.



Functions
---------

.. autoapisummary::

intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.mistral.modeling_mistral.gaudi_mistral_rmsnorm_forward
intel_extension_for_transformers.transformers.modeling.modeling_gaudi.models.mistral.modeling_mistral.gaudi_mistral_repeat_kv


Module Contents
---------------

.. py:function:: gaudi_mistral_rmsnorm_forward(self, hidden_states)
The only differences are:
- override RMSNorm with Habana fused RMSNorm


.. py:function:: gaudi_mistral_repeat_kv(query_states: torch.Tensor, key_states: torch.Tensor, value_states: torch.Tensor, attention_mask: torch.Tensor, n_rep: int)
The only differences are:
- Append num_key_value_heads == 1 check as kv states can be broadcasted during
matmuls so need to expand and reshape them.
- Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion.
The query states go from (batch, num_heads, seqlen, head_dim) to
(batch, num_key_value_heads, n_rep, seqlen, head_dim)
The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to
(batch, num_key_value_heads, 1, seqlen, head_dim)

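A shape-only walk-through of the expansion described above (toy sizes; the real function also consumes value_states and attention_mask and handles the num_key_value_heads == 1 case):

.. code-block:: python

    import torch

    batch, num_heads, num_kv_heads, seqlen, head_dim = 1, 8, 2, 16, 64
    n_rep = num_heads // num_kv_heads  # 4

    query_states = torch.randn(batch, num_heads, seqlen, head_dim)
    key_states = torch.randn(batch, num_kv_heads, seqlen, head_dim)

    # Query: (batch, num_heads, seqlen, head_dim) -> (batch, num_key_value_heads, n_rep, seqlen, head_dim)
    query_states = query_states.reshape(batch, num_kv_heads, n_rep, seqlen, head_dim)
    # Key: (batch, num_key_value_heads, seqlen, head_dim) -> (batch, num_key_value_heads, 1, seqlen, head_dim)
    key_states = key_states.unsqueeze(2)

    print(query_states.shape, key_states.shape)
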
