diff --git a/_modules/onmt/decoders/transformer.html b/_modules/onmt/decoders/transformer.html
index d3d74074bf..2bcf6d6c13 100644
--- a/_modules/onmt/decoders/transformer.html
+++ b/_modules/onmt/decoders/transformer.html
@@ -212,12 +212,9 @@
from onmt.modules import MultiHeadedAttention, AverageAttention
from onmt.modules.position_ffn import PositionwiseFeedForward
from onmt.modules.position_ffn import ActivationFunction
+from onmt.modules.moe import MoE
from onmt.utils.misc import sequence_mask
-
-try:
- from apex.normalization import FusedRMSNorm as RMSNorm
-except ImportError:
- from onmt.modules.rmsnorm import RMSNorm
+from onmt.modules.rmsnorm import RMSNorm
class TransformerDecoderLayerBase(nn.Module):
@@ -228,7 +225,7 @@ Source code for onmt.decoders.transformer
d_ff,
dropout,
attention_dropout,
- self_attn_type="scaled-dot",
+ self_attn_type="scaled_dot",
max_relative_positions=0,
relative_positions_buckets=0,
aan_useffn=False,
@@ -245,6 +242,11 @@ Source code for onmt.decoders.transformer
use_ckpting=[],
parallel_gpu=1,
sliding_window=0,
+ rotary_interleave=True,
+ rotary_theta=1e4,
+ rotary_dim=0,
+ num_experts=0,
+ num_experts_per_tok=2,
):
"""
Args:
@@ -259,10 +261,13 @@ Source code for onmt.decoders.transformer
attention_dropout (float): dropout in context_attn (and
self-attn(avg))
self_attn_type (string): type of self-attention scaled-dot,
- average
+ scaled-dot-flash, average
max_relative_positions (int):
Max distance between inputs in relative positions
representations
+ relative_positions_buckets (int):
+ Number of buckets for relative position bias; see
+ https://github.com/google-research/text-to-text-transfer-transformer
aan_useffn (bool): Turn on the FFN layer in the AAN decoder
full_context_alignment (bool):
whether enable an extra full context decoder forward for
@@ -272,21 +277,38 @@ Source code for onmt.decoders.transformer
pos_ffn_activation_fn (ActivationFunction):
activation function choice for PositionwiseFeedForward layer
add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+ num_kv (int): number of KV heads when different from the number of Q heads (multiquery)
+ add_ffnbias (bool): whether to add bias to the FF nn.Linear
+ parallel_residual (bool): Use parallel residual connections in each layer block, as used
+ by the GPT-J and GPT-NeoX models
+ shared_layer_norm (bool): When using parallel residual, share the input and post
+ attention layer norms.
layer_norm (string): type of layer normalization standard/rms
norm_eps (float): layer norm epsilon
-
+ use_ckpting (List): layers for which activation checkpointing is applied in the backward pass
+ parallel_gpu (int): Number of GPUs used for tensor parallelism
+ sliding_window (int): Width of the band mask and KV cache (cf. Mistral model)
+ rotary_interleave (bool): Interleave the head dimensions when rotary
+ embeddings are applied
+ rotary_theta (int): rotary base theta
+ rotary_dim (int): rotary dimension when smaller than the head dimension
+ num_experts (int): Number of experts for MoE
+ num_experts_per_tok (int): Number of experts selected per token
"""
super(TransformerDecoderLayerBase, self).__init__()
- self.self_attn_type = self_attn_type
- if self_attn_type == "scaled-dot":
+ if self_attn_type in ["scaled-dot", "scaled-dot-flash"]:
self.self_attn = MultiHeadedAttention(
heads,
d_model,
dropout=attention_dropout,
max_relative_positions=max_relative_positions,
relative_positions_buckets=relative_positions_buckets,
+ rotary_interleave=rotary_interleave,
+ rotary_theta=rotary_theta,
+ rotary_dim=rotary_dim,
attn_type="self",
+ self_attn_type=self_attn_type,
add_qkvbias=add_qkvbias,
num_kv=num_kv,
use_ckpting=use_ckpting,
@@ -297,18 +319,34 @@ Source code for onmt.decoders.transformer
d_model, dropout=attention_dropout, aan_useffn=aan_useffn
)
- self.feed_forward = PositionwiseFeedForward(
- d_model,
- d_ff,
- dropout,
- pos_ffn_activation_fn,
- add_ffnbias,
- parallel_residual,
- layer_norm,
- norm_eps,
- use_ckpting=use_ckpting,
- parallel_gpu=parallel_gpu,
- )
+ if num_experts > 0:
+ self.feed_forward = MoE(
+ num_experts,
+ num_experts_per_tok,
+ d_model,
+ d_ff,
+ dropout,
+ pos_ffn_activation_fn,
+ add_ffnbias,
+ parallel_residual,
+ layer_norm,
+ norm_eps,
+ use_ckpting=use_ckpting,
+ parallel_gpu=parallel_gpu,
+ )
+ else:
+ self.feed_forward = PositionwiseFeedForward(
+ d_model,
+ d_ff,
+ dropout,
+ pos_ffn_activation_fn,
+ add_ffnbias,
+ parallel_residual,
+ layer_norm,
+ norm_eps,
+ use_ckpting=use_ckpting,
+ parallel_gpu=parallel_gpu,
+ )
self.parallel_residual = parallel_residual
self.shared_layer_norm = shared_layer_norm
if layer_norm == "standard":
@@ -327,6 +365,7 @@ Source code for onmt.decoders.transformer
self.full_context_alignment = full_context_alignment
self.alignment_heads = alignment_heads
self.sliding_window = sliding_window
+ self.self_attn_type = self_attn_type
def forward(self, *args, **kwargs):
"""Extend `_forward` for (possibly) multiple decoder pass:
@@ -374,7 +413,8 @@ Source code for onmt.decoders.transformer
def _compute_dec_mask(self, tgt_pad_mask, future):
tgt_len = tgt_pad_mask.size(-1)
- if not future: # apply future_mask, result mask in (B, T, T)
+ if not future:
+ # Add triangular future_mask and pad_mask, result mask in (B, T, T).
future_mask = torch.ones(
[tgt_len, tgt_len],
device=tgt_pad_mask.device,
@@ -385,14 +425,19 @@ Source code for onmt.decoders.transformer
future_mask = future_mask.triu_(-self.sliding_window)
future_mask = future_mask.bool()
future_mask = ~future_mask.view(1, tgt_len, tgt_len)
-
+ # Patch for scaled dot product attention: rows that are fully masked
+ # (pure padding) are unmasked to avoid NaN in the softmax.
+ patch_mask = ~torch.all(
+ tgt_pad_mask + future_mask, dim=2, keepdim=True
+ ).expand_as(tgt_pad_mask + future_mask)
dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
- else: # only mask padding, result mask in (B, 1, T)
+ dec_mask = torch.logical_and(dec_mask, patch_mask)
+ else:
+ # Only mask padding, result mask in (B, 1, T).
dec_mask = tgt_pad_mask
return dec_mask
def _forward_self_attn(self, norm_layer_in, dec_mask, step, return_attn=False):
- if self.self_attn_type == "scaled-dot":
+ if self.self_attn_type in ["scaled-dot", "scaled-dot-flash"]:
return self.self_attn(
norm_layer_in,
norm_layer_in,
@@ -441,6 +486,11 @@ Source code for onmt.decoders.transformer
use_ckpting=[],
parallel_gpu=1,
sliding_window=0,
+ rotary_interleave=True,
+ rotary_theta=1e4,
+ rotary_dim=0,
+ num_experts=0,
+ num_experts_per_tok=2,
):
"""
Args:
@@ -469,12 +519,18 @@ Source code for onmt.decoders.transformer
use_ckpting=use_ckpting,
parallel_gpu=parallel_gpu,
sliding_window=sliding_window,
+ rotary_interleave=rotary_interleave,
+ rotary_theta=rotary_theta,
+ rotary_dim=rotary_dim,
+ num_experts=num_experts,
+ num_experts_per_tok=num_experts_per_tok,
)
self.context_attn = MultiHeadedAttention(
heads,
d_model,
dropout=attention_dropout,
attn_type="context",
+ self_attn_type=self.self_attn_type,
add_qkvbias=add_qkvbias,
num_kv=num_kv,
use_ckpting=use_ckpting,
@@ -627,6 +683,11 @@ Source code for onmt.decoders.transformer
if opt.parallel_mode == "tensor_parallel"
else 1,
sliding_window=opt.sliding_window,
+ rotary_interleave=opt.rotary_interleave,
+ rotary_theta=opt.rotary_theta,
+ rotary_dim=opt.rotary_dim,
+ num_experts=opt.num_experts,
+ num_experts_per_tok=opt.num_experts_per_tok,
)
def init_state(self, src, enc_out, enc_final_hs):
@@ -650,7 +711,18 @@ Source code for onmt.decoders.transformer
if layer.self_attn.layer_cache[1]["keys"].numel() != 0:
x = fn(layer.self_attn.layer_cache[1]["keys"], 0)
y = fn(layer.self_attn.layer_cache[1]["values"], 0)
- layer.self_attn.layer_cache = True, {"keys": x, "values": y}
+ if (
+ layer.self_attn.layer_cache[1].get("key_pad_mask", None)
+ is not None
+ ):
+ z = fn(layer.self_attn.layer_cache[1]["key_pad_mask"], 0)
+ else:
+ z = None
+ layer.self_attn.layer_cache = True, {
+ "keys": x,
+ "values": y,
+ "key_pad_mask": z,
+ }
def detach_state(self):
raise NotImplementedError
@@ -674,7 +746,7 @@ Source code for onmt.decoders.transformer
heads (int): number of heads
d_ff (int): size of the inner FF layer
copy_attn (bool): if using a separate copy attention
- self_attn_type (str): type of self-attention scaled-dot, average
+ self_attn_type (str): type of self-attention scaled-dot, scaled-dot-flash, average
dropout (float): dropout in residual, self-attn(dot) and feed-forward
attention_dropout (float): dropout in context_attn (and self-attn(avg))
embeddings (onmt.modules.Embeddings):
@@ -689,8 +761,25 @@ Source code for onmt.decoders.transformer
alignment_layer (int): N° Layer to supervise with for alignment guiding
alignment_heads (int):
N. of cross attention heads to use for alignment guiding
+ pos_ffn_activation_fn (ActivationFunction):
+ activation function choice for PositionwiseFeedForward layer
add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+ num_kv (int): number of KV heads when different from the number of Q heads (multiquery)
+ add_ffnbias (bool): whether to add bias to the FF nn.Linear
+ parallel_residual (bool): Use parallel residual connections in each layer block, as used
+ by the GPT-J and GPT-NeoX models
+ shared_layer_norm (bool): When using parallel residual, share the input and post
+ attention layer norms.
layer_norm (string): type of layer normalization standard/rms
+ norm_eps (float): layer norm epsilon
+ use_ckpting (List): layers for which activation checkpointing is applied in the backward pass
+ parallel_gpu (int): Number of GPUs used for tensor parallelism
+ sliding_window (int): Width of the band mask and KV cache (cf. Mistral model)
+ rotary_interleave (bool): Interleave the head dimensions when rotary embeddings are applied
+ rotary_theta (int): rotary base theta
+ rotary_dim (int): rotary dimension when smaller than the head dimension
+ num_experts (int): Number of experts for MoE
+ num_experts_per_tok (int): Number of experts selected per token
"""
def __init__(
@@ -721,6 +810,11 @@ Source code for onmt.decoders.transformer
use_ckpting=[],
parallel_gpu=1,
sliding_window=0,
+ rotary_interleave=True,
+ rotary_theta=1e4,
+ rotary_dim=0,
+ num_experts=0,
+ num_experts_per_tok=2,
):
super(TransformerDecoder, self).__init__(
d_model, copy_attn, embeddings, alignment_layer, layer_norm, norm_eps
@@ -751,6 +845,11 @@ Source code for onmt.decoders.transformer
use_ckpting=use_ckpting,
parallel_gpu=parallel_gpu,
sliding_window=sliding_window,
+ rotary_interleave=rotary_interleave,
+ rotary_theta=rotary_theta,
+ rotary_dim=rotary_dim,
+ num_experts=num_experts,
+ num_experts_per_tok=num_experts_per_tok,
)
for i in range(num_layers)
]
@@ -853,7 +952,9 @@ Source code for onmt.decoders.transformer
},
)
if hasattr(layer.self_attn, "rope"):
- layer.self_attn.rope = layer.self_attn.rope.to(enc_out.device)
+ layer.self_attn.rope = layer.self_attn.rope.to(enc_out.device)
+ layer.self_attn.cos = layer.self_attn.cos.to(enc_out.device)
+ layer.self_attn.sin = layer.self_attn.sin.to(enc_out.device)
class TransformerLMDecoderLayer(TransformerDecoderLayerBase):
@@ -887,7 +988,9 @@ Source code for onmt.decoders.transformer
dec_mask = None
if layer_in.size(1) > 1:
- # masking is necessary when sequence length is greater than one
+ # Masking is necessary when the sequence length is greater than one:
+ # decoding has not started yet, so we compute the scores
+ # on the source tokens in one shot.
dec_mask = self._compute_dec_mask(tgt_pad_mask, future)
dec_mask = dec_mask.unsqueeze(1)
dec_mask = dec_mask.expand(-1, -1, dec_mask.size(3), -1)
@@ -919,22 +1022,45 @@ Source code for onmt.decoders.transformer
class TransformerLMDecoder(TransformerDecoderBase):
"""The Transformer decoder from GPT-2
Args:
- num_layers (int): number of decoder layers.
- d_model (int): size of the model
- heads (int): number of heads
- d_ff (int): size of the inner FF layer
- copy_attn (bool): if using a separate copy attention
- self_attn_type (str): type of self-attention scaled-dot, average
- dropout (float): dropout in residual, self-attn(dot) and feed-forward
- attention_dropout (float): dropout in context_attn (and self-attn(avg))
- embeddings (onmt.modules.Embeddings):
- embeddings to use, should have positional encodings
- max_relative_positions (int):
- Max distance between inputs in relative positions representations
- relative_positions_buckets (int):
- Number of buckets when using Relative positions bias
- aan_useffn (bool): Turn on the FFN layer in the AAN decoder
- add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+ num_layers (int): number of decoder layers.
+ d_model (int): size of the model
+ heads (int): number of heads
+ d_ff (int): size of the inner FF layer
+ copy_attn (bool): if using a separate copy attention
+ self_attn_type (str): type of self-attention scaled-dot, scaled-dot-flash, average
+ dropout (float): dropout in residual, self-attn(dot) and feed-forward
+ attention_dropout (float): dropout in context_attn (and self-attn(avg))
+ embeddings (onmt.modules.Embeddings):
+ embeddings to use, should have positional encodings
+ max_relative_positions (int):
+ Max distance between inputs in relative positions representations
+ relative_positions_buckets (int):
+ Number of buckets when using Relative positions bias
+ aan_useffn (bool): Turn on the FFN layer in the AAN decoder
+ full_context_alignment (bool):
+ whether enable an extra full context decoder forward for alignment
+ alignment_layer (int): N° Layer to supervise with for alignment guiding
+ alignment_heads (int):
+ N. of cross attention heads to use for alignment guiding
+ pos_ffn_activation_fn (ActivationFunction):
+ activation function choice for PositionwiseFeedForward layer
+ add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+ num_kv (int): number of KV heads when different from the number of Q heads (multiquery)
+ add_ffnbias (bool): whether to add bias to the FF nn.Linear
+ parallel_residual (bool): Use parallel residual connections in each layer block, as used
+ by the GPT-J and GPT-NeoX models
+ shared_layer_norm (bool): When using parallel residual, share the input and post
+ attention layer norms.
+ layer_norm (string): type of layer normalization standard/rms
+ norm_eps (float): layer norm epsilon
+ use_ckpting (List): layers for which activation checkpointing is applied in the backward pass
+ parallel_gpu (int): Number of GPUs used for tensor parallelism
+ sliding_window (int): Width of the band mask and KV cache (cf. Mistral model)
+ rotary_interleave (bool): Interleave the head dimensions when rotary embeddings are applied
+ rotary_theta (int): rotary base theta
+ rotary_dim (int): rotary dimension when smaller than the head dimension
+ num_experts (int): Number of experts for MoE
+ num_experts_per_tok (int): Number of experts selected per token
"""
def __init__(
@@ -965,6 +1091,11 @@ Source code for onmt.decoders.transformer
use_ckpting=[],
parallel_gpu=1,
sliding_window=0,
+ rotary_interleave=True,
+ rotary_theta=1e4,
+ rotary_dim=0,
+ num_experts=0,
+ num_experts_per_tok=2,
):
super(TransformerLMDecoder, self).__init__(
d_model, copy_attn, embeddings, alignment_layer, layer_norm, norm_eps
@@ -994,6 +1125,11 @@ Source code for onmt.decoders.transformer
use_ckpting=use_ckpting,
parallel_gpu=parallel_gpu,
sliding_window=sliding_window,
+ rotary_interleave=rotary_interleave,
+ rotary_theta=rotary_theta,
+ rotary_dim=rotary_dim,
+ num_experts=num_experts,
+ num_experts_per_tok=num_experts_per_tok,
)
for i in range(num_layers)
]
@@ -1007,15 +1143,22 @@ Source code for onmt.decoders.transformer
def forward(self, tgt, enc_out=None, step=None, **kwargs):
"""Decode, possibly stepwise."""
+
if step == 0:
+ # decoding mode.
+ # Initialize KV and key_pad_mask cache.
self._init_cache(tgt)
elif step is None:
+ # training mode.
for layer in self.transformer_layers:
layer.self_attn.layer_cache = (
False,
- {"keys": torch.tensor([]), "values": torch.tensor([])},
+ {
+ "keys": torch.tensor([]),
+ "values": torch.tensor([]),
+ "key_pad_mask": None,
+ },
)
-
dec_out = self.embeddings(tgt, step=step)
assert dec_out.dim() == 3 # batch x len x embedding_dim
@@ -1048,18 +1191,24 @@ Source code for onmt.decoders.transformer
def _init_cache(self, tgt=None):
for layer in self.transformer_layers:
- if isinstance(layer.self_attn, AverageAttention):
- raise NotImplementedError
- else:
- layer.self_attn.layer_cache = (
- True,
- {
- "keys": torch.tensor([], device=tgt.device),
- "values": torch.tensor([], device=tgt.device),
- },
- )
- if hasattr(layer.self_attn, "rope"):
- layer.self_attn.rope = layer.self_attn.rope.to(tgt.device)
+ if hasattr(layer, "self_attn"):
+ if isinstance(layer.self_attn, AverageAttention):
+ raise NotImplementedError
+ else:
+ layer.self_attn.layer_cache = (
+ True,
+ {
+ "keys": torch.tensor([], device=tgt.device),
+ "values": torch.tensor([], device=tgt.device),
+ "key_pad_mask": tgt[:, :, 0]
+ .eq(self.embeddings.word_padding_idx)
+ .unsqueeze(1),
+ },
+ )
+ if hasattr(layer.self_attn, "rope"):
+ layer.self_attn.rope = layer.self_attn.rope.to(tgt.device)
+ layer.self_attn.cos = layer.self_attn.cos.to(tgt.device)
+ layer.self_attn.sin = layer.self_attn.sin.to(tgt.device)
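When num_experts > 0 the decoder layer above builds its feed-forward block from onmt.modules.moe.MoE instead of PositionwiseFeedForward. The MoE module itself is not part of this diff, so the following is only a minimal sketch of what top-k expert routing with num_experts and num_experts_per_tok typically looks like (Mixtral-style gating); the class and layer names are illustrative, not the OpenNMT-py implementation.

# Minimal sketch of top-k expert routing; the real onmt.modules.moe.MoE is not
# shown in this diff, so every name and detail here is an assumption.
import torch
import torch.nn as nn


class TinyMoE(nn.Module):
    def __init__(self, num_experts, num_experts_per_tok, d_model, d_ff):
        super().__init__()
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
            for _ in range(num_experts)
        )
        self.gate = nn.Linear(d_model, num_experts, bias=False)
        self.num_experts_per_tok = num_experts_per_tok

    def forward(self, x):
        # x: (batch, len, d_model), flattened to a list of tokens for routing
        shape = x.shape
        x = x.view(-1, shape[-1])
        scores = self.gate(x)                                   # (tokens, num_experts)
        weights, selected = torch.topk(scores, self.num_experts_per_tok, dim=-1)
        weights = torch.softmax(weights, dim=-1, dtype=torch.float).to(x.dtype)
        out = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            token_idx, kth = torch.where(selected == i)         # tokens routed to expert i
            if token_idx.numel() > 0:
                out[token_idx] += weights[token_idx, kth, None] * expert(x[token_idx])
        return out.view(shape)


if __name__ == "__main__":
    moe = TinyMoE(num_experts=4, num_experts_per_tok=2, d_model=8, d_ff=16)
    print(moe(torch.randn(2, 5, 8)).shape)  # torch.Size([2, 5, 8])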
diff --git a/_modules/onmt/encoders/transformer.html b/_modules/onmt/encoders/transformer.html
index 813a809d8d..ba016fbf1f 100644
--- a/_modules/onmt/encoders/transformer.html
+++ b/_modules/onmt/encoders/transformer.html
@@ -232,6 +232,19 @@ Source code for onmt.encoders.transformer
dropout (float): dropout probability (0-1.0).
pos_ffn_activation_fn (ActivationFunction):
activation function choice for PositionwiseFeedForward layer
+ add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+ num_kv (int): number of KV heads when different from the number of Q heads (multiquery)
+ add_ffnbias (bool): whether to add bias to the FF nn.Linear
+ parallel_residual (bool): Use parallel residual connections in each layer block, as used
+ by the GPT-J and GPT-NeoX models
+ layer_norm (string): type of layer normalization standard/rms
+ norm_eps (float): layer norm epsilon
+ use_ckpting (List): layers for which activation checkpointing is applied in the backward pass
+ parallel_gpu (int): Number of GPUs used for tensor parallelism
+ rotary_interleave (bool): Interleave the head dimensions when rotary
+ embeddings are applied
+ rotary_theta (int): rotary base theta
+ rotary_dim (int): rotary dimension when different from the per-head dimension
"""
def __init__(
@@ -252,6 +265,9 @@ Source code for onmt.encoders.transformer
norm_eps=1e-6,
use_ckpting=[],
parallel_gpu=1,
+ rotary_interleave=True,
+ rotary_theta=1e4,
+ rotary_dim=0,
):
super(TransformerEncoderLayer, self).__init__()
@@ -262,6 +278,9 @@ Source code for onmt.encoders.transformer
is_decoder=False,
max_relative_positions=max_relative_positions,
relative_positions_buckets=relative_positions_buckets,
+ rotary_interleave=rotary_interleave,
+ rotary_theta=rotary_theta,
+ rotary_dim=rotary_dim,
attn_type="self",
add_qkvbias=add_qkvbias,
num_kv=num_kv,
@@ -366,6 +385,9 @@ Source code for onmt.encoders.transformer
norm_eps=1e-6,
use_ckpting=[],
parallel_gpu=1,
+ rotary_interleave=True,
+ rotary_theta=1e4,
+ rotary_dim=0,
):
super(TransformerEncoder, self).__init__()
@@ -389,6 +411,9 @@ Source code for onmt.encoders.transformer
norm_eps=norm_eps,
use_ckpting=use_ckpting,
parallel_gpu=parallel_gpu,
+ rotary_interleave=rotary_interleave,
+ rotary_theta=rotary_theta,
+ rotary_dim=rotary_dim,
)
for i in range(num_layers)
]
@@ -426,6 +451,9 @@ Source code for onmt.encoders.transformer
parallel_gpu=opt.world_size
if opt.parallel_mode == "tensor_parallel"
else 1,
+ rotary_interleave=opt.rotary_interleave,
+ rotary_theta=opt.rotary_theta,
+ rotary_dim=opt.rotary_dim,
)
def forward(self, src, src_len=None):
diff --git a/_modules/onmt/inputters/dynamic_iterator.html b/_modules/onmt/inputters/dynamic_iterator.html
index 9ab6a6a541..3daf76fdb6 100644
--- a/_modules/onmt/inputters/dynamic_iterator.html
+++ b/_modules/onmt/inputters/dynamic_iterator.html
@@ -204,7 +204,7 @@ Source code for onmt.inputters.dynamic_iterator
"""Module that contain iterator used for dynamic data."""
import torch
from itertools import cycle
-from onmt.constants import CorpusTask
+from onmt.constants import CorpusTask, ModelTask
from onmt.inputters.text_corpus import get_corpora, build_corpora_iters
from onmt.inputters.text_utils import (
text_sort_key,
@@ -367,6 +367,10 @@ Source code for onmt.inputters.dynamic_iterator
self.skip_empty_level = skip_empty_level
self.random_shuffler = RandomShuffler()
self.bucket_idx = 0
+ if task != CorpusTask.TRAIN and vocabs["data_task"] == ModelTask.LANGUAGE_MODEL:
+ self.left_pad = True
+ else:
+ self.left_pad = False
@classmethod
def from_opt(
@@ -557,7 +561,9 @@ Source code for onmt.inputters.dynamic_iterator
# within the batch
if self.task == CorpusTask.TRAIN:
minibatch.sort(key=lambda x: self.sort_key(x[0]), reverse=True)
- tensor_batch = tensorify(self.vocabs, minibatch, self.device)
+ tensor_batch = tensorify(
+ self.vocabs, minibatch, self.device, self.left_pad
+ )
yield (tensor_batch, bucket_idx)
@@ -569,7 +575,12 @@ Source code for onmt.inputters.dynamic_iterator
def __iter__(self):
for (tensor_batch, bucket_idx) in self.data_iter:
for key in tensor_batch.keys():
- if key not in ["src_ex_vocab", "cid"]:
+ if key not in [
+ "src_ex_vocab",
+ "cid",
+ "ind_in_bucket",
+ "cid_line_number",
+ ]:
tensor_batch[key] = tensor_batch[key].to(self.device)
yield (tensor_batch, bucket_idx)
diff --git a/_modules/onmt/inputters/text_corpus.html b/_modules/onmt/inputters/text_corpus.html
index ba3528100b..9836efae48 100644
--- a/_modules/onmt/inputters/text_corpus.html
+++ b/_modules/onmt/inputters/text_corpus.html
@@ -241,6 +241,63 @@ Source code for onmt.inputters.text_corpus
_file.close()
+class BlockwiseCorpus(object):
+ """A corpus class for reading a single file block by block."""
+
+ def __init__(self, name, file_path, block_size=4096):
+ """Initialize file path and block size."""
+ self.id = name
+ self.file_path = file_path
+ self.block_size = block_size
+
+ def load(self, offset=0, stride=1):
+ """
+ Load file and iterate by blocks.
+ `offset` and `stride` allow iterating only on every
+ `stride` block, starting from `offset`.
+ """
+
+ def make_ex(block_content):
+ example = {
+ "src": block_content,
+ "tgt": block_content,
+ "src_original": block_content,
+ "tgt_original": block_content,
+ }
+ return example
+
+ with open(self.file_path, mode="r", encoding="utf-8") as file:
+ block_content = ""
+ block_index = 0
+
+ while True:
+ chunk = file.read(self.block_size)
+ if not chunk:
+ break
+
+ if (block_index // stride) % stride == offset:
+ block_content += chunk
+
+ if len(chunk) < self.block_size:
+ # Reached end of file
+ yield make_ex(block_content)
+ break
+
+ if len(block_content) >= self.block_size:
+ yield make_ex(block_content)
+ block_content = ""
+ block_index += 1
+
+ def __str__(self):
+ cls_name = type(self).__name__
+ return (
+ f"{cls_name}({self.id}, {self.file_path}, {self.file_path}"
+ f"align={None}, "
+ f"n_src_feats={0}, "
+ f'src_feats_defaults="{None}")'
+ )
+
+
class ParallelCorpus(object):
"""A parallel corpus file pair that can be loaded to iterate."""
@@ -320,20 +377,27 @@ Source code for onmt.inputters.text_corpus
if task == CorpusTask.TRAIN:
for corpus_id, corpus_dict in opts.data.items():
if corpus_id != CorpusName.VALID:
- corpora_dict[corpus_id] = ParallelCorpus(
- corpus_id,
- corpus_dict["path_src"],
- corpus_dict["path_tgt"],
- corpus_dict["path_align"],
- n_src_feats=opts.n_src_feats,
- src_feats_defaults=opts.src_feats_defaults,
- )
+ if corpus_dict.get("path_txt", None) is None:
+ corpora_dict[corpus_id] = ParallelCorpus(
+ corpus_id,
+ corpus_dict["path_src"],
+ corpus_dict["path_tgt"],
+ corpus_dict["path_align"],
+ n_src_feats=opts.n_src_feats,
+ src_feats_defaults=opts.src_feats_defaults,
+ )
+ else:
+ corpora_dict[corpus_id] = BlockwiseCorpus(
+ corpus_id,
+ corpus_dict["path_txt"],
+ block_size=8192, # number of characters
+ )
elif task == CorpusTask.VALID:
if CorpusName.VALID in opts.data.keys():
corpora_dict[CorpusName.VALID] = ParallelCorpus(
CorpusName.VALID,
opts.data[CorpusName.VALID]["path_src"],
- opts.data[CorpusName.VALID]["path_tgt"],
+ opts.data[CorpusName.VALID]["path_tgt"] if tgt is None else None,
opts.data[CorpusName.VALID]["path_align"],
n_src_feats=opts.n_src_feats,
src_feats_defaults=opts.src_feats_defaults,
@@ -377,20 +441,20 @@ Source code for onmt.inputters.text_corpus
def _process(self, stream):
for i, example in enumerate(stream):
- example["src"] = example["src"].strip("\n").split()
- example["src_original"] = example["src_original"].strip("\n").split()
+ example["src"] = example["src"].strip().split(" ")
+ example["src_original"] = example["src_original"].strip().split(" ")
if "src_feats" in example:
example["src_feats"] = [
- feat.strip("\n").split() for feat in example["src_feats"]
+ feat.strip().split(" ") for feat in example["src_feats"]
]
line_number = i * self.stride + self.offset
example["cid_line_number"] = line_number
example["cid"] = self.cid
if "align" in example:
- example["align"] = example["align"].strip("\n").split()
+ example["align"] = example["align"].strip().split(" ")
if example["tgt"] is not None:
- example["tgt"] = example["tgt"].strip("\n").split()
- example["tgt_original"] = example["tgt_original"].strip("\n").split()
+ example["tgt"] = example["tgt"].strip().split(" ")
+ example["tgt_original"] = example["tgt_original"].strip().split(" ")
if (
len(example["src"]) == 0
or len(example["tgt"]) == 0
diff --git a/_modules/onmt/models/model.html b/_modules/onmt/models/model.html
index acca4e7d4c..062004f655 100644
--- a/_modules/onmt/models/model.html
+++ b/_modules/onmt/models/model.html
@@ -204,7 +204,7 @@ Source code for onmt.models.model
""" Onmt NMT Model base class definition """
import torch
import torch.nn as nn
-import glob
+from glob import glob
class BaseModel(nn.Module):
@@ -248,6 +248,70 @@ Source code for onmt.models.model
def count_parameters(self, log=print):
raise NotImplementedError
+ def _load_param(self, name, module, param_name, param, buf_list, ckpt_t, offset):
+ if module.__class__.__name__ == "WQLinear_GEMM":
+ # ugly patch because in_feat and out_feat are reversed in WQLinear_GEMM
+ param.data = param.data.transpose(0, 1)
+ ckpt_t = ckpt_t.transpose(0, 1)
+ if name.split(".")[-1] in [
+ "linear_keys",
+ "linear_values",
+ "linear_query",
+ "w_1",
+ "w_3",
+ ]:
+ col_slice_start = param.data.size(0) * offset
+ col_slice_end = param.data.size(0) * (offset + 1)
+ else:
+ col_slice_start = 0
+ col_slice_end = param.data.size(0)
+ if param.data.dim() == 2:
+ if name.split(".")[-1] in ["final_linear", "w_2"]:
+ row_slice_start = param.data.size(1) * offset
+ row_slice_end = param.data.size(1) * (offset + 1)
+ else:
+ row_slice_start = 0
+ row_slice_end = param.data.size(1)
+ assert (
+ param.data.size()
+ == ckpt_t[
+ col_slice_start:col_slice_end,
+ row_slice_start:row_slice_end,
+ ].size()
+ ), "An error in model's partition and checkpoint's slice was detected"
+ if name + "." + param_name in buf_list:
+ if module.__class__.__name__ == "WQLinear_GEMM":
+ module.register_buffer(
+ param_name,
+ ckpt_t[
+ col_slice_start:col_slice_end,
+ row_slice_start:row_slice_end,
+ ].transpose(0, 1),
+ )
+ else:
+ module.register_buffer(
+ param_name,
+ ckpt_t[
+ col_slice_start:col_slice_end,
+ row_slice_start:row_slice_end,
+ ],
+ )
+ else:
+ param.data = ckpt_t[
+ col_slice_start:col_slice_end,
+ row_slice_start:row_slice_end,
+ ]
+ else:
+ assert (
+ param.data.size() == ckpt_t[col_slice_start:col_slice_end].size()
+ ), "An error in model's partition and checkpoint's slice was detected"
+ if name + "." + param_name in buf_list:
+ module.register_buffer(
+ param_name, ckpt_t[col_slice_start:col_slice_end]
+ )
+ else:
+ param.data = ckpt_t[col_slice_start:col_slice_end]
+
def load_state_dict(
self,
checkpoint,
@@ -271,64 +335,31 @@ Source code for onmt.models.model
if device == torch.device("cpu"):
offset = 0
buf_list = []
+ for buf_name, buf in self.named_buffers():
+ buf_list.append(buf_name)
for name, module in self.named_modules():
- for buf_name, buf in module.named_buffers():
- buf_list.append(buf_name)
- if len(buf_name.split(".")) == 1: # only last key
- if precision != torch.int8:
- module.to(precision)
- module.to(device)
- for param_name, param in module.named_parameters():
+ named_buf_and_param = list(module.named_buffers()) + list(
+ module.named_parameters()
+ )
+ for param_name, param in named_buf_and_param:
if len(param_name.split(".")) == 1: # only last key
if name + "." + param_name in checkpoint["model"].keys():
ckpt_t = checkpoint["model"][name + "." + param_name]
-
- if name.split(".")[-1] in [
- "linear_keys",
- "linear_values",
- "linear_query",
- "w_1",
- "w_3",
- ]:
- col_slice_start = param.data.size(0) * offset
- col_slice_end = param.data.size(0) * (offset + 1)
- else:
- col_slice_start = 0
- col_slice_end = param.data.size(0)
- if param.data.dim() == 2:
- if name.split(".")[-1] in ["final_linear", "w_2"]:
- row_slice_start = param.data.size(1) * offset
- row_slice_end = param.data.size(1) * (offset + 1)
- else:
- row_slice_start = 0
- row_slice_end = param.data.size(1)
- assert (
- param.data.size()
- == ckpt_t[
- col_slice_start:col_slice_end,
- row_slice_start:row_slice_end,
- ].size()
- ), "An error in model's partition and checkpoint's slice was detected"
- param.data = ckpt_t[
- col_slice_start:col_slice_end,
- row_slice_start:row_slice_end,
- ]
- else:
- assert (
- param.data.size()
- == ckpt_t[col_slice_start:col_slice_end].size()
- ), "An error in model's partition and checkpoint's slice was detected"
- param.data = ckpt_t[col_slice_start:col_slice_end]
-
+ self._load_param(
+ name, module, param_name, param, buf_list, ckpt_t, offset
+ )
del checkpoint["model"][name + "." + param_name]
elif (
"generator" in checkpoint.keys()
- and name == "generator"
+ and "generator" in name
and checkpoint["generator"] is not None
and param_name in checkpoint["generator"].keys()
):
- param.data = checkpoint["generator"][param_name]
- del checkpoint["generator"][param_name]
+ keyname = (
+ name + "." + param_name if "linear" in name else param_name
+ )
+ param.data = checkpoint["generator"][keyname]
+ del checkpoint["generator"][keyname]
elif strict and "lora" not in param_name:
raise ValueError(
"Missing key in checkpoint: %s" % name + "." + param_name
@@ -336,6 +367,7 @@ Source code for onmt.models.model
if precision != torch.int8:
module.to(precision)
module.to(device)
+
for key in checkpoint[
"model"
].keys(): # if some keys are left in checkpoint after deletion
@@ -376,7 +408,7 @@ Source code for onmt.models.model
except ImportError:
raise ImportError("run: pip install safetensors, to use safetensors")
keyfound = {}
- shards = glob.glob(model_path + ".*.safetensors")
+ shards = glob(model_path + ".*.safetensors")
if len(shards) == 0:
raise ValueError("No safetensors file found")
f = []
@@ -385,62 +417,25 @@ Source code for onmt.models.model
f.append(safetensors.safe_open(shard, framework="pt", device="cpu"))
for key in f[i].keys():
keys_shard[key] = i
+ if device == torch.device("cpu"):
+ offset = 0
buf_list = []
+ for buf_name, buf in self.named_buffers():
+ buf_list.append(buf_name)
for name, module in self.named_modules():
- for buf_name, buf in module.named_buffers():
- buf_list.append(buf_name)
- if len(buf_name.split(".")) == 1: # only last key
- if precision == torch.int8:
- torch.quantization.quantize_dynamic(module, inplace=True)
- else:
- module.to(precision)
- module.to(device)
- for param_name, param in module.named_parameters():
+ named_buf_and_param = list(module.named_buffers()) + list(
+ module.named_parameters()
+ )
+ for param_name, param in named_buf_and_param:
if len(param_name.split(".")) == 1: # only last key
if name + "." + param_name in keys_shard.keys():
ckpt_t = f[keys_shard[name + "." + param_name]].get_tensor(
name + "." + param_name
)
- if name.split(".")[-1] in [
- "linear_keys",
- "linear_values",
- "linear_query",
- "w_1",
- "w_3",
- ]:
- col_slice_start = param.data.size(0) * offset
- col_slice_end = param.data.size(0) * (offset + 1)
- else:
- col_slice_start = 0
- col_slice_end = param.data.size(0)
- if param.data.dim() == 2:
- if name.split(".")[-1] in ["final_linear", "w_2"]:
- row_slice_start = param.data.size(1) * offset
- row_slice_end = param.data.size(1) * (offset + 1)
- else:
- row_slice_start = 0
- row_slice_end = param.data.size(1)
- assert (
- param.data.size()
- == ckpt_t[
- col_slice_start:col_slice_end,
- row_slice_start:row_slice_end,
- ].size()
- ), "An error in model's partition and checkpoint's slice was detected"
-
- param.data = ckpt_t[
- col_slice_start:col_slice_end,
- row_slice_start:row_slice_end,
- ]
- else:
- assert (
- param.data.size()
- == ckpt_t[col_slice_start:col_slice_end].size()
- ), "An error in model's partition and checkpoint's slice was detected"
-
- param.data = ckpt_t[col_slice_start:col_slice_end]
-
+ self._load_param(
+ name, module, param_name, param, buf_list, ckpt_t, offset
+ )
keyfound[name + "." + param_name] = True
elif strict and "lora" not in param_name:
raise ValueError(
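The new _load_param helper centralizes the tensor-parallel slicing that was previously duplicated in load_state_dict and the safetensors loader: weights named linear_keys/linear_values/linear_query/w_1/w_3 are sharded along dim 0 (output features), while final_linear/w_2 are sharded along dim 1 (input features), using the rank offset. A minimal standalone sketch of that slicing, with made-up sizes:

# Illustration of per-rank weight slicing under tensor parallelism.
import torch

world_size = 2
d_model, d_ff = 4, 8
full_w1 = torch.arange(d_ff * d_model).view(d_ff, d_model)  # plays the role of w_1 / linear_*
full_w2 = torch.arange(d_model * d_ff).view(d_model, d_ff)  # plays the role of w_2 / final_linear

for offset in range(world_size):
    rows = d_ff // world_size
    w1_shard = full_w1[rows * offset : rows * (offset + 1), :]   # slice along dim 0 (columns parallel)
    cols = d_ff // world_size
    w2_shard = full_w2[:, cols * offset : cols * (offset + 1)]   # slice along dim 1 (rows parallel)
    print(offset, w1_shard.shape, w2_shard.shape)
# 0 torch.Size([4, 4]) torch.Size([4, 4])
# 1 torch.Size([4, 4]) torch.Size([4, 4])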
diff --git a/_modules/onmt/modules/copy_generator.html b/_modules/onmt/modules/copy_generator.html
index 4c5a7c9b65..2fc546e341 100644
--- a/_modules/onmt/modules/copy_generator.html
+++ b/_modules/onmt/modules/copy_generator.html
@@ -205,9 +205,7 @@ Source code for onmt.modules.copy_generator
import torch.nn as nn
-def collapse_copy_scores(
- scores, batch, tgt_vocab, src_vocabs=None, batch_dim=1, batch_offset=None
-):
+def collapse_copy_scores(scores, batch, tgt_vocab, batch_dim=1):
"""
Given scores from an expanded dictionary
corresponding to a batch, sums together copies,
@@ -218,12 +216,7 @@ Source code for onmt.modules.copy_generator
blank = []
fill = []
- if src_vocabs is None:
- src_vocab = batch["src_ex_vocab"][b]
- else:
- batch_id = batch_offset[b] if batch_offset is not None else b
- index = batch["ind_in_bucket"].data[batch_id]
- src_vocab = src_vocabs[index]
+ src_vocab = batch["src_ex_vocab"][b]
for i in range(1, len(src_vocab)):
sw = src_vocab.ids_to_tokens[i]
@@ -232,8 +225,8 @@ Source code for onmt.modules.copy_generator
blank.append(offset + i)
fill.append(ti)
if blank:
- blank = torch.Tensor(blank).type_as(batch["ind_in_bucket"].data)
- fill = torch.Tensor(fill).type_as(batch["ind_in_bucket"].data)
+ blank = torch.Tensor(blank).to(torch.int64)
+ fill = torch.Tensor(fill).to(torch.int64)
score = scores[:, b] if batch_dim == 1 else scores[b]
score.index_add_(1, fill, score.index_select(1, blank))
score.index_fill_(1, blank, 1e-10)
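For reference, the index_add_/index_fill_ pair at the end of collapse_copy_scores is what actually merges the copy scores: the extended-vocab columns listed in blank are added onto the matching target-vocab columns listed in fill, then blanked out (set to 1e-10). A tiny self-contained example with made-up numbers:

# Illustration of collapsing extended-vocab copy scores onto target-vocab ids.
import torch

scores = torch.tensor([[0.10, 0.20, 0.30, 0.25, 0.15]])  # last two columns: extended vocab
blank = torch.tensor([3, 4])   # extended-vocab columns to collapse
fill = torch.tensor([1, 2])    # matching target-vocab ids

scores.index_add_(1, fill, scores.index_select(1, blank))
scores.index_fill_(1, blank, 1e-10)
print(scores)  # columns 1 and 2 now hold 0.45 each; columns 3 and 4 are ~0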
diff --git a/_modules/onmt/modules/embeddings.html b/_modules/onmt/modules/embeddings.html
index 1043921612..8eb3995447 100644
--- a/_modules/onmt/modules/embeddings.html
+++ b/_modules/onmt/modules/embeddings.html
@@ -207,6 +207,7 @@ Source code for onmt.modules.embeddings
import torch
import torch.nn as nn
+from torch.nn.utils import skip_init
from onmt.modules.util_class import Elementwise
from onmt.utils.logging import logger
@@ -374,7 +375,13 @@ Source code for onmt.modules.embeddings
# is for words. Subsequent ones are for features, if any exist.
emb_params = zip(vocab_sizes, emb_dims, pad_indices)
embeddings = [
- nn.Embedding(vocab, dim, padding_idx=pad, sparse=sparse)
+ skip_init(
+ nn.Embedding,
+ num_embeddings=vocab,
+ embedding_dim=dim,
+ padding_idx=pad,
+ sparse=sparse,
+ )
for vocab, dim, pad in emb_params
]
emb_luts = Elementwise(feat_merge, embeddings)
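skip_init constructs the module without running its default parameter initialization, which is wasted work when the embedding matrix is about to be overwritten by a checkpoint. A short sketch of the behaviour (the copy_ call stands in for checkpoint loading):

# torch.nn.utils.skip_init builds the module on the meta device, then
# materializes its storage without running the default init.
import torch
import torch.nn as nn
from torch.nn.utils import skip_init

emb = skip_init(nn.Embedding, num_embeddings=1000, embedding_dim=64, padding_idx=1)
print(emb.weight.shape)                        # torch.Size([1000, 64]) -- allocated, not initialized
emb.weight.data.copy_(torch.zeros(1000, 64))   # e.g. weights loaded from a checkpoint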
diff --git a/_modules/onmt/modules/multi_headed_attn.html b/_modules/onmt/modules/multi_headed_attn.html
index 13779daf46..771fc3d690 100644
--- a/_modules/onmt/modules/multi_headed_attn.html
+++ b/_modules/onmt/modules/multi_headed_attn.html
@@ -202,18 +202,17 @@ Source code for onmt.modules.multi_headed_attn
""" Multi-Head Attention module """
-import math
import torch
+import torch.nn as nn
+from math import log, sqrt
from torch import Tensor
from typing import Optional, Tuple
-from torch.nn import functional as F
-import torch.nn as nn
+from torch.nn.functional import scaled_dot_product_attention
from torch.utils.checkpoint import checkpoint
from torch.nn.utils import skip_init
from .alibi_position_bias import AlibiPositionalBias
-import torch.distributed as dist
-import importlib
-
+from torch.distributed import all_reduce
+from importlib import import_module
# Help functions for Rotary Embeddings
# https://arxiv.org/pdf/2104.09864.pdf
@@ -222,28 +221,58 @@ Source code for onmt.modules.multi_headed_attn
# are both < 2048 tokens.
-def rotaryembeddings(dim: int, maxseqlen=8192, base=10000):
+def rotaryembeddings(dim: int, maxseqlen=2048, base=10000, device=None):
inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
tmax = torch.arange(maxseqlen, device=inv_freq.device)
rope = torch.outer(tmax, inv_freq).float()
# rope is now matrix [maxseqlen, dim/2]
rope = torch.polar(torch.ones_like(rope), rope)
- return rope
-
-
-def apply_rotary_emb(query, key, rope):
- query = query.transpose(1, 2)
- key = key.transpose(1, 2)
- query_ = query.float().reshape(*query.shape[:-1], -1, 2)
- query_ = torch.view_as_complex(query_)
- key_ = key.float().reshape(*key.shape[:-1], -1, 2)
- key_ = torch.view_as_complex(key_)
- rope = rope.view(1, query_.size(1), 1, query_.size(3))
- query_out = torch.view_as_real(query_ * rope).flatten(3)
- key_out = torch.view_as_real(key_ * rope).flatten(3)
- return query_out.transpose(1, 2).type_as(query), key_out.transpose(1, 2).type_as(
- key
- )
+ rope = torch.cat((rope, rope), dim=1)
+ if device is not None:
+ rope = rope.to(device)
+ cos = rope[:, : rope.size(1) // 2].real.contiguous().half()
+ sin = rope[:, : rope.size(1) // 2].imag.contiguous().half()
+ return rope, cos, sin
+
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_emb(query, key, rope, interleave):
+ if interleave:
+ query = query.transpose(1, 2)
+ key = key.transpose(1, 2)
+ query_ = query.float().reshape(*query.shape[:-1], -1, 2)
+ query_ = torch.view_as_complex(query_)
+ key_ = key.float().reshape(*key.shape[:-1], -1, 2)
+ key_ = torch.view_as_complex(key_)
+ rope = rope[:, : rope.size(1) // 2].view(1, query_.size(1), 1, query_.size(3))
+ query_out = torch.view_as_real(query_ * rope).flatten(3)
+ key_out = torch.view_as_real(key_ * rope).flatten(3)
+ return query_out.transpose(1, 2).type_as(query), key_out.transpose(
+ 1, 2
+ ).type_as(key)
+ else:
+ cos, sin = rope.real, rope.imag
+ rotary_dim = cos.size(1)
+ head_dim = query.size(3)
+ if rotary_dim < head_dim:
+ q_embed = (query[:, :, :, :rotary_dim] * cos) + (
+ rotate_half(query[:, :, :, :rotary_dim]) * sin
+ )
+ k_embed = (key[:, :, :, :rotary_dim] * cos) + (
+ rotate_half(key[:, :, :, :rotary_dim]) * sin
+ )
+ q_embed = torch.cat([q_embed, query[:, :, :, rotary_dim:]], dim=-1)
+ k_embed = torch.cat([k_embed, key[:, :, :, rotary_dim:]], dim=-1)
+ else:
+ q_embed = (query * cos) + (rotate_half(query) * sin)
+ k_embed = (key * cos) + (rotate_half(key) * sin)
+ return q_embed.type_as(query), k_embed.type_as(key)
# Help functions for max_relative positions
@@ -334,7 +363,7 @@ Source code for onmt.modules.multi_headed_attn
# up to max_distance
relative_position_if_large = max_exact + (
torch.log(relative_position.float() / max_exact)
- / math.log(max_distance / max_exact)
+ / log(max_distance / max_exact)
* (num_buckets - max_exact)
).to(torch.long)
relative_position_if_large = torch.min(
@@ -446,7 +475,11 @@ Source code for onmt.modules.multi_headed_attn
is_decoder: bool = True,
max_relative_positions: int = 0,
relative_positions_buckets: int = 0,
+ rotary_interleave: bool = True,
+ rotary_theta: int = 1e4,
+ rotary_dim: int = 0,
attn_type: str = None,
+ self_attn_type: str = None,
add_qkvbias=False,
num_kv=0,
use_ckpting=[],
@@ -513,6 +546,7 @@ Source code for onmt.modules.multi_headed_attn
self.max_relative_positions = max_relative_positions
self.relative_positions_buckets = relative_positions_buckets
self.attn_type = attn_type
+ self.self_attn_type = self_attn_type
self.layer_cache = (
False,
{"keys": torch.tensor([]), "values": torch.tensor([])},
@@ -538,21 +572,37 @@ Source code for onmt.modules.multi_headed_attn
self.relative_attention_bias = None
if max_relative_positions == -1: # rotary embeddings
- self.rope = rotaryembeddings(self.dim_per_head)
-
+ if rotary_dim == 0:
+ self.rotary_dim = self.dim_per_head
+ else:
+ self.rotary_dim = rotary_dim
+ self.rope, self.cos, self.sin = rotaryembeddings(
+ self.rotary_dim, base=rotary_theta
+ )
+ self.rotary_interleave = rotary_interleave
+ self.rotary_theta = rotary_theta
+ else:
+ self.cos = None
+ self.sin = None
+ self.rotary_interleave = None
if max_relative_positions == -2: # alibi positional bias
self.alibi = AlibiPositionalBias(head_count)
self.maybe_ckpt = checkpoint if "mha" in use_ckpting else lambda f, x: f(x)
try:
- flash_pack = importlib.import_module("flash_attn")
+ flash_pack = import_module("flash_attn")
if (
hasattr(flash_pack, "flash_attn_func")
and torch.cuda.get_device_capability()[0] >= 8
):
self.flash_attn_func = getattr(flash_pack, "flash_attn_func")
+ self.flash_attn_with_kvcache = getattr(
+ flash_pack, "flash_attn_with_kvcache"
+ )
self.flash2 = True
+ else:
+ self.flash2 = False
except ImportError:
self.flash2 = False
@@ -569,6 +619,7 @@ Source code for onmt.modules.multi_headed_attn
sliding_window: Optional[int] = 0,
step: Optional[int] = 0,
return_attn: Optional[bool] = False,
+ self_attn_type: str = None,
) -> Tuple[Tensor, Tensor]:
"""
Compute the context vector and the attention vectors.
@@ -591,32 +642,117 @@ Source code for onmt.modules.multi_headed_attn
"""
# 1) Project key, value, and query.
# as a reminder at training layer_cache[0] remains False
+ key_pad_mask = self.layer_cache[1].get("key_pad_mask", None)
if self.layer_cache[0]:
+ # Retrieve keys and values from the KV cache (decoding mode only).
if self.attn_type == "self":
query, key, value = (
self.linear_query(query),
self.linear_keys(query),
self.linear_values(query),
)
+
query = shape(query, self.dim_per_head)
key = shape(key, self.dim_per_head)
value = shape(value, self.dim_per_head)
+ start_pos = step
+ seqlen = query.size(2)
- if self.max_relative_positions == -1: # Rotary Embeddings
- start_pos = step
- seqlen = query.size(2)
- rope = self.rope[start_pos : start_pos + seqlen]
- query, key = apply_rotary_emb(query, key, rope=rope)
+ if (
+ step == 0
+ or not self.flash2
+ or self.self_attn_type != "scaled-dot-flash"
+ or self.max_relative_positions not in [0, -1]
+ or query.size(0) > 128
+ or query.dtype != torch.float16
+ ):
+ if self.max_relative_positions == -1: # Rotary Embeddings
+ if seqlen + start_pos > self.rope.size(0):
+ # Resize rotary embeddings.
+ self.rope, _, _ = rotaryembeddings(
+ self.rotary_dim,
+ maxseqlen=(seqlen + start_pos + 2048),
+ base=self.rotary_theta,
+ device=self.rope.device,
+ )
+ rope = self.rope[start_pos : start_pos + seqlen]
+ query, key = apply_rotary_emb(
+ query, key, rope, interleave=self.rotary_interleave
+ )
+
+ if self.layer_cache[1]["keys"].numel() != 0:
+ key = torch.cat((self.layer_cache[1]["keys"], key), dim=2)
+ value = torch.cat((self.layer_cache[1]["values"], value), dim=2)
+ if sliding_window > 0 and key.size(2) > sliding_window:
+ key = key[:, :, 1:, :]
+ value = value[:, :, 1:, :]
+
+ self.layer_cache[1]["keys"] = key
+ self.layer_cache[1]["values"] = value
+
+ else:
+ if start_pos >= self.layer_cache[1]["keys"].size(2):
+ self.layer_cache[1]["keys"] = torch.cat(
+ [
+ self.layer_cache[1]["keys"],
+ torch.zeros(
+ self.layer_cache[1]["keys"].shape[:-2]
+ + (32,)
+ + self.layer_cache[1]["keys"].shape[-1:],
+ device=query.device,
+ ).half(),
+ ],
+ dim=-2,
+ )
+ self.layer_cache[1]["values"] = torch.cat(
+ [
+ self.layer_cache[1]["values"],
+ torch.zeros(
+ self.layer_cache[1]["values"].shape[:-2]
+ + (32,)
+ + self.layer_cache[1]["values"].shape[-1:],
+ device=query.device,
+ ).half(),
+ ],
+ dim=-2,
+ )
+ if (
+ self.max_relative_positions == -1
+ and start_pos + 32 >= self.rope.size(0)
+ ):
+ # Resize rotary embeddings.
+ # We take a margin of 32 tokens as the kv_cache
+ # is incremented by 32 tokens every 32 tokens.
+ self.rope, self.cos, self.sin = rotaryembeddings(
+ self.rotary_dim,
+ maxseqlen=(start_pos + 2048),
+ base=self.rotary_theta,
+ device=self.rope.device,
+ )
- if self.layer_cache[1]["keys"].numel() != 0:
- key = torch.cat((self.layer_cache[1]["keys"], key), dim=2)
- value = torch.cat((self.layer_cache[1]["values"], value), dim=2)
if sliding_window > 0 and key.size(2) > sliding_window:
- key = key[:, :, 1:, :]
- value = value[:, :, 1:, :]
+ self.layer_cache[1]["keys"] = self.layer_cache[1]["keys"][
+ :, :, 1:, :
+ ]
+ self.layer_cache[1]["values"] = self.layer_cache[1]["values"][
+ :, :, 1:, :
+ ]
+ context = self.flash_attn_with_kvcache(
+ query.transpose(1, 2),
+ self.layer_cache[1]["keys"].transpose(1, 2),
+ self.layer_cache[1]["values"].transpose(1, 2),
+ key.transpose(1, 2),
+ value.transpose(1, 2),
+ rotary_cos=self.cos,
+ rotary_sin=self.sin,
+ cache_seqlens=step,
+ rotary_interleaved=self.rotary_interleave,
+ ).transpose(1, 2)
+ attn_output = self.final_linear(unshape(context))
+ if self.parallel_gpu > 1:
+ all_reduce(attn_output)
+ return attn_output, None
- self.layer_cache[1]["keys"] = key
- self.layer_cache[1]["values"] = value
elif self.attn_type == "context":
query = self.linear_query(query)
query = shape(query, self.dim_per_head)
@@ -631,10 +767,26 @@ Source code for onmt.modules.multi_headed_attn
)
self.layer_cache[1]["keys"] = key
self.layer_cache[1]["values"] = value
+
+ if key_pad_mask is not None:
+ # Increase the cached key pad mask by concatenation.
+ # For decoding only.
+ if step > 0:
+ y = torch.zeros(
+ (key_pad_mask.size(0), key_pad_mask.size(1), 1),
+ dtype=torch.bool,
+ device=key_pad_mask.device,
+ )
+ self.layer_cache[1]["key_pad_mask"] = torch.cat(
+ (key_pad_mask, y), 2
+ )
+ key_pad_mask = self.layer_cache[1]["key_pad_mask"]
else:
+ # Retrieve keys and values from linear layers (training mode).
key = self.maybe_ckpt(self.linear_keys, key)
value = self.maybe_ckpt(self.linear_values, value)
query = self.maybe_ckpt(self.linear_query, query)
+
key = shape(key, self.dim_per_head)
value = shape(value, self.dim_per_head)
query = shape(query, self.dim_per_head)
@@ -642,8 +794,18 @@ Source code for onmt.modules.multi_headed_attn
if self.max_relative_positions == -1: # Rotary Embeddings
start_pos = 0
seqlen = query.size(2)
- rope = self.rope[start_pos : start_pos + seqlen].to(query.device)
- query, key = apply_rotary_emb(query, key, rope=rope)
+ if seqlen > self.rope.size(0):
+ # Resize rotary embeddings.
+ self.rope, self.cos, self.sin = rotaryembeddings(
+ self.rotary_dim,
+ maxseqlen=(seqlen + 2048),
+ base=self.rotary_theta,
+ device=query.device,
+ )
+ rope = self.rope[start_pos : start_pos + seqlen]
+ query, key = apply_rotary_emb(
+ query, key, rope, interleave=self.rotary_interleave
+ )
b, h, l, d = key.size()
if self.num_kv > 0:
@@ -661,7 +823,6 @@ Source code for onmt.modules.multi_headed_attn
# Ultimately flashv2 will be part of pytorch https://github.com/pytorch/pytorch/pull/105602
# In the meantime: if vanilla transformer or Rotary embeddings (not rel_pos, not alibi)
# then use flash2 if seq len > 256 otherwise use xtransformer from pt2 upstream
-
flash2 = (
self.flash2
and l > 256 # https://github.com/Dao-AILab/flash-attention/issues/591
@@ -671,7 +832,9 @@ Source code for onmt.modules.multi_headed_attn
self.max_relative_positions in [-1, 0]
and not return_attn
and query.device != torch.device("cpu")
+ and self.self_attn_type == "scaled-dot-flash"
):
+ # Apply flash2 attention.
causal = self.is_decoder and self.attn_type == "self" and mask is not None
if self.is_decoder and self.attn_type == "self" and flash2:
if causal:
@@ -689,10 +852,11 @@ Source code for onmt.modules.multi_headed_attn
window_size=window_size,
).transpose(1, 2)
else:
+ # Apply scaled dot product attention.
with torch.backends.cuda.sdp_kernel(
enable_flash=False, enable_math=True, enable_mem_efficient=True
):
- attn_output = F.scaled_dot_product_attention(
+ attn_output = scaled_dot_product_attention(
query,
key,
value,
@@ -700,18 +864,10 @@ Source code for onmt.modules.multi_headed_attn
self.dropout_p,
is_causal=causal,
)
-
- x = unshape(attn_output)
-
- attn_output = self.maybe_ckpt(self.final_linear, x)
-
- if self.parallel_gpu > 1:
- dist.all_reduce(attn_output)
-
- return attn_output, None
+ attn = None
else:
- query /= math.sqrt(self.dim_per_head)
+ query /= sqrt(self.dim_per_head)
# batch x num_heads x query_len x key_len
scores = torch.matmul(query, key.transpose(2, 3))
@@ -753,6 +909,8 @@ Source code for onmt.modules.multi_headed_attn
scores = self.alibi(scores)
scores = scores.float()
+ if key_pad_mask is not None and mask is None:
+ mask = key_pad_mask.unsqueeze(1)
if mask is not None:
# not 100% necessary but expand to nb of heads
@@ -764,23 +922,28 @@ Source code for onmt.modules.multi_headed_attn
attn = self.softmax(scores).to(query.dtype)
drop_attn = self.dropout(attn) if self.dropout_p > 0 else attn
- context_original = torch.matmul(drop_attn, value)
+ attn_output = torch.matmul(drop_attn, value)
if self.relative_positions_embeddings is not None:
# We use the same embeddings for key and value
relations_values = relations_keys
- context_original.add_(
- relative_matmul(drop_attn, relations_values, False)
- )
+ attn_output.add_(relative_matmul(drop_attn, relations_values, False))
- context = unshape(context_original)
+ context = unshape(attn_output)
+ if key_pad_mask is not None:
+ if key_pad_mask.size(0) > 1 and context.size(1) > 1:
+ x = key_pad_mask.squeeze(1).unsqueeze(2).expand(-1, -1, context.size(2))
+ context = context.masked_fill(x, 0)
+ if self.layer_cache[0]:
+ attn_output = self.final_linear(context)
+ else:
attn_output = self.maybe_ckpt(self.final_linear, context)
- if self.parallel_gpu > 1:
- dist.all_reduce(attn_output)
+ if self.parallel_gpu > 1:
+ all_reduce(attn_output)
- return attn_output, attn
+ return attn_output, attn
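The new rotary_interleave flag selects between two equivalent rotary layouts: the interleaved form (consecutive pairs of head dimensions treated as complex numbers, as in apply_rotary_emb with interleave=True) and the GPT-NeoX style "half" form based on rotate_half. The following standalone check is written against the same formulas as the code above, but is only an illustration: it shows the two layouts produce the same rotation up to a permutation of the head dimension.

# Verifying the relationship between the interleaved and rotate_half rotary layouts.
import torch

dim, pos = 8, 5
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))  # (dim/2,)
theta = pos * inv_freq                                               # angles for position `pos`

x = torch.randn(dim)

# Interleaved: pairs (x0, x1), (x2, x3), ... rotated as complex numbers.
xc = torch.view_as_complex(x.view(dim // 2, 2))
rot_inter = torch.view_as_real(xc * torch.polar(torch.ones_like(theta), theta)).flatten()

# Half form: even dims first, odd dims second, then x*cos + rotate_half(x)*sin.
xh = torch.cat([x[0::2], x[1::2]])
cos = torch.cat([theta.cos(), theta.cos()])
sin = torch.cat([theta.sin(), theta.sin()])
rotate_half = lambda t: torch.cat([-t[dim // 2:], t[:dim // 2]])
rot_half = xh * cos + rotate_half(xh) * sin

# Same numbers, different layout of the head dimension.
print(torch.allclose(rot_inter[0::2], rot_half[:dim // 2], atol=1e-6))  # True
print(torch.allclose(rot_inter[1::2], rot_half[dim // 2:], atol=1e-6))  # True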
diff --git a/_modules/onmt/modules/position_ffn.html b/_modules/onmt/modules/position_ffn.html
index 2225645171..052c0286e3 100644
--- a/_modules/onmt/modules/position_ffn.html
+++ b/_modules/onmt/modules/position_ffn.html
@@ -203,17 +203,12 @@ Source code for onmt.modules.position_ffn
"""Position feed-forward network from "Attention is All You Need"."""
-
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
-
-try:
- from apex.normalization import FusedRMSNorm as RMSNorm
-except ImportError:
- from onmt.modules.rmsnorm import RMSNorm
+from onmt.modules.rmsnorm import RMSNorm
from torch.nn.utils import skip_init
-import torch.distributed as dist
+from torch.distributed import all_reduce
class ActivationFunction(object):
@@ -320,7 +315,7 @@ Source code for onmt.modules.position_ffn
inter = self.dropout_2(inter)
if self.parallel_gpu > 1:
- dist.all_reduce(inter)
+ all_reduce(inter)
return inter + x
diff --git a/_modules/onmt/trainer.html b/_modules/onmt/trainer.html
index d1acd9e1d1..5649d19a71 100644
--- a/_modules/onmt/trainer.html
+++ b/_modules/onmt/trainer.html
@@ -531,9 +531,10 @@ Source code for onmt.trainer
)
if valid_iter is not None and step % valid_steps == 0:
- valid_stats = self.validate(
- valid_iter, moving_average=self.moving_average
- )
+ if self.parallel_mode == "tensor_parallel" or self.gpu_rank <= 0:
+ valid_stats = self.validate(
+ valid_iter, moving_average=self.moving_average
+ )
if step % valid_steps == 0 and self.gpu_rank <= 0:
self._report_step(
diff --git a/_modules/onmt/translate/beam_search.html b/_modules/onmt/translate/beam_search.html
index 9ce38f7114..27e3d94266 100644
--- a/_modules/onmt/translate/beam_search.html
+++ b/_modules/onmt/translate/beam_search.html
@@ -384,8 +384,8 @@ Source code for onmt.translate.beam_search
return topk_scores, topk_ids
- def beams_non_finished(self, i, predictions, attention, step):
-
+ def beams_non_finished(self, i, topk_scores_list, predictions, attention, step):
+ # Using lists instead of tensors for topk_scores and is_finished makes things faster.
if any(self.is_finished_list[i]):
b = self._batch_offset[i]
# Store finished hypotheses for this example in the batch.
@@ -393,34 +393,33 @@ Source code for onmt.translate.beam_search
k for k, fin in enumerate(self.is_finished_list[i]) if fin
]: # Beam level: finished beam j in example i of batch
if self.ratio > 0:
- s = self.topk_scores[i, j] / (step + 1)
+ s = topk_scores_list[i][j] / (step + 1)
self.best_scores[b] = max(s, self.best_scores[b])
self.hypotheses[b].append(
(
- self.topk_scores[i, j],
+ topk_scores_list[i][j],
predictions[i, j, 1:], # Ignore start_token.
attention[i, j, :, : self.src_len[i]]
if attention is not None
else None,
)
)
- if len(self.hypotheses[b]) >= 2:
- self.hypotheses[b] = sorted(
- self.hypotheses[b], key=lambda x: x[0], reverse=True
- )
# End condition is the top beam finished and we can return
# n_best hypotheses.
if self.ratio > 0:
pred_len = self.src_len[i] * self.ratio
finish_flag = (
- (self.topk_scores[i, 0] / pred_len) <= self.best_scores[b]
+ (topk_scores_list[i][0] / pred_len) <= self.best_scores[b]
) or all(self.is_finished_list[i])
else:
# early stop when top beam is finished
finish_flag = self.is_finished_list[i][0]
if finish_flag and len(self.hypotheses[b]) >= self.n_best:
+ self.hypotheses[b] = sorted(
+ self.hypotheses[b], key=lambda x: x[0], reverse=True
+ )
for score, pred, attn in self.hypotheses[b][: self.n_best]:
self.scores[b].append(score)
self.predictions[b].append(pred) # ``(batch, n_best,)``
@@ -438,7 +437,7 @@ Source code for onmt.translate.beam_search
# this is required to pursue finished beams in non finished batches
self.topk_log_probs.masked_fill_(
torch.tensor(self.is_finished_list, device=self.topk_log_probs.device),
- -1e10,
+ -65504,
)
predictions = self.alive_seq.view(_B_old, self.beam_size, step)
attention = (
@@ -449,10 +448,13 @@ Source code for onmt.translate.beam_search
else None
)
+ topk_scores_list = self.topk_scores.tolist()
non_finished_batch = [
i
for i in range(len(self.is_finished_list))
- if self.beams_non_finished(i, predictions, attention, step)
+ if self.beams_non_finished(
+ i, topk_scores_list, predictions, attention, step
+ )
]
non_finished = torch.tensor(non_finished_batch)
@@ -468,12 +470,6 @@ Source code for onmt.translate.beam_search
# reset the selection for the next step
self.select_indices = self._batch_index.view(_B_new * self.beam_size)
- # assert torch.equal(
- # self.src_len[self.select_indices],
- # self.src_len.view(_B_old, self.beam_size)[non_finished].view(
- # _B_new * self.beam_size
- # ),
- # )
self.src_len = self.src_len[self.select_indices]
self.maybe_update_target_prefix(self.select_indices)
@@ -481,15 +477,10 @@ Source code for onmt.translate.beam_search
self, _B_new, _B_old, non_finished, predictions, attention, step
):
# Remove finished batches for the next step.
- self._batch_offset = self._batch_offset[non_finished]
- # here we combine two slections in one
- # self.topk_log_probs = self.topk_log_probs[non_finished]
- # self._batch_index = self._batch_index.index_select(0, non_finished)
- self.topk_log_probs, self._batch_index = torch.unbind(
- torch.stack([self.topk_log_probs, self._batch_index], dim=2)[non_finished],
- dim=2,
- )
- self._batch_index = self._batch_index.to(torch.long)
+ self._batch_offset = self._batch_offset[non_finished] # CPU
+ non_finished = non_finished.to(self.topk_log_probs.device)
+ self.topk_log_probs = self.topk_log_probs[non_finished]
+ self._batch_index = self._batch_index[non_finished]
self.alive_seq = predictions[non_finished].view(-1, self.alive_seq.size(-1))
if self.alive_attn is not None:
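The beam-search change replaces per-element tensor reads (self.topk_scores[i, j]) with one tolist() call followed by plain Python indexing, as noted in the comment. A rough, machine-dependent micro-benchmark of why that helps (each scalar read from a tensor boxes a 0-dim tensor, and on GPU also forces a device sync):

# Indicative timing only; absolute numbers depend on the machine.
import time
import torch

topk_scores = torch.randn(512, 5)

t0 = time.perf_counter()
s1 = sum(float(topk_scores[i, j]) for i in range(512) for j in range(5))
t1 = time.perf_counter()

scores_list = topk_scores.tolist()
s2 = sum(scores_list[i][j] for i in range(512) for j in range(5))
t2 = time.perf_counter()

print(f"tensor indexing: {t1 - t0:.4f}s, tolist + list indexing: {t2 - t1:.4f}s")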
diff --git a/_modules/onmt/translate/decode_strategy.html b/_modules/onmt/translate/decode_strategy.html
index dd63884b4c..35ed635914 100644
--- a/_modules/onmt/translate/decode_strategy.html
+++ b/_modules/onmt/translate/decode_strategy.html
@@ -386,18 +386,16 @@ Source code for onmt.translate.decode_strategy
def ensure_min_length(self, log_probs):
if len(self) <= self.min_length:
- log_probs[:, self.eos] = -1e20
+ log_probs[:, self.eos] = -65504 # -1e20
def ensure_unk_removed(self, log_probs):
if self.ban_unk_token:
- log_probs[:, self.unk] = -1e20
+ log_probs[:, self.unk] = -65504 # -1e20
def ensure_max_length(self):
# add one to account for BOS. Don't account for EOS because hitting
# this implies it hasn't been found.
if len(self) == self.max_length + 1:
- if hasattr(self, "is_finished"):
- self.is_finished.fill_(1)
self.is_finished_list = [
[True for _ in range(self.parallel_paths)]
for _ in range(len(self.is_finished_list))
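Both this file and beam_search.py replace the -1e20/-1e10 masking constants with -65504, which is the most negative finite float16 value; the old constants overflow to -inf when scores are kept in half precision, which can turn a fully-masked softmax row into NaN. A quick check:

# Why -65504 is used as the "minus infinity" stand-in.
import torch

print(torch.finfo(torch.float16).min)            # -65504.0
print(torch.tensor(-1e20).to(torch.float16))     # tensor(-inf, dtype=torch.float16)
print(torch.tensor(-65504.0).to(torch.float16))  # tensor(-65504., dtype=torch.float16)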
diff --git a/_modules/onmt/translate/greedy_search.html b/_modules/onmt/translate/greedy_search.html
index 57b3679b42..2bc8af22c1 100644
--- a/_modules/onmt/translate/greedy_search.html
+++ b/_modules/onmt/translate/greedy_search.html
@@ -202,15 +202,14 @@ Source code for onmt.translate.greedy_search
import torch
-import torch.nn.functional as F
-
+from torch.nn.functional import softmax
from onmt.translate.decode_strategy import DecodeStrategy
def sample_topp(logits, keep_topp):
sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=1)
- cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+ cumulative_probs = torch.cumsum(softmax(sorted_logits, dim=-1), dim=-1)
sorted_indices_to_keep = cumulative_probs.lt(keep_topp)
# keep indices until overflowing p
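For readers skimming the hunk above: `sample_topp` implements nucleus (top-p) filtering — sort the logits, keep the smallest prefix whose cumulative probability reaches `keep_topp`, and sample from the renormalized remainder. A self-contained sketch of the same idea (the helper name and the safety line that always keeps the best token are illustrative additions, not part of the diff):

```python
import torch
from torch.nn.functional import softmax

def nucleus_filter(logits, keep_topp):
    """Mask out every token outside the top-p nucleus (illustrative sketch)."""
    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=1)
    cumulative_probs = torch.cumsum(softmax(sorted_logits, dim=-1), dim=-1)
    keep = cumulative_probs.lt(keep_topp)   # keep indices until overflowing p
    keep[:, 0] = True                       # always keep at least the best token
    sorted_logits = sorted_logits.masked_fill(~keep, float("-inf"))
    # undo the sort so the kept logits land back on their original vocab ids
    return torch.full_like(logits, float("-inf")).scatter(1, sorted_indices, sorted_logits)

filtered = nucleus_filter(torch.randn(2, 10), keep_topp=0.9)
sample = torch.multinomial(softmax(filtered, dim=-1), 1)   # one token id per row
```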
@@ -301,6 +300,8 @@ Source code for onmt.translate.greedy_search
eos (int): See base.
unk (int): See base.
start (int): See base.
+ n_best (int): Don't stop until at least this many beams have
+ reached EOS.
batch_size (int): See base.
global_scorer (onmt.translate.GNMTGlobalScorer): Scorer instance.
min_length (int): See base.
@@ -326,6 +327,7 @@ Source code for onmt.translate.greedy_search
eos,
unk,
start,
+ n_best,
batch_size,
global_scorer,
min_length,
@@ -360,6 +362,7 @@ Source code for onmt.translate.greedy_search
self.keep_topp = keep_topp
self.topk_scores = None
self.beam_size = beam_size
+ self.n_best = n_best
def initialize(

self, enc_out, src_len, src_map=None, device=None, target_prefix=None
@@ -406,9 +409,7 @@ Source code for onmt.translate.greedy_search
return topk_ids, topk_scores
def align_select_indices(self):
- nb_finished_beams = self.is_finished.view(-1).size(
- 0
- ) - self.select_indices.size(0)
+ nb_finished_beams = len(self.is_finished_list) - self.select_indices.size(0)
if nb_finished_beams:
self.select_indices = torch.arange(
self.select_indices.size(0),
@@ -428,8 +429,7 @@ Source code for onmt.translate.greedy_search
to 1.)
attn (FloatTensor): Shaped ``(1, B, inp_seq_len)``.
"""
- if hasattr(self, "is_finished"):
- self.align_select_indices()
+ self.align_select_indices()
self.ensure_min_length(log_probs)
self.ensure_unk_removed(log_probs)
@@ -438,8 +438,7 @@ Source code for onmt.translate.greedy_search
topk_ids, self.topk_scores = self._pick(log_probs)
self.beams_scores += self.topk_scores
- self.is_finished = topk_ids.eq(self.eos)
- self.is_finished_list = self.is_finished.tolist()
+ self.is_finished_list = topk_ids.eq(self.eos).tolist()
self.alive_seq = torch.cat([self.alive_seq, topk_ids], -1)
if self.return_attention:
@@ -452,39 +451,47 @@ Source code for onmt.translate.greedy_search
def update_finished(self):
"""Finalize scores and predictions."""
# shape: (sum(~ self.is_finished), 1)
- finished_batches = self.is_finished.view(-1).nonzero()
step = len(self)
+ non_finished_batch = [
+ b for b, fin in enumerate(self.is_finished_list) if not fin[0]
+ ]
length_penalty = self.global_scorer.length_penalty(
step, alpha=self.global_scorer.alpha
)
-
- for b in finished_batches.view(-1):
+ for b in [i for i, fin in enumerate(self.is_finished_list) if fin[0]]:
b_orig = self.original_batch_idx[b]
score = self.beams_scores[b, 0] / length_penalty
pred = self.alive_seq[b, 1:]
attention = (
- self.alive_attn[b, :, : self.src_len[b]]
+ self.alive_attn[
+ b,
+ :,
+ : self.src_len[b],
+ ]
if self.alive_attn is not None
else []
)
self.hypotheses[b_orig].append((score, pred, attention))
- self.done = self.is_finished.all()
+ self.done = len(non_finished_batch) == 0
if self.done:
for b in range(self.batch_size):
- best_hyp = sorted(self.hypotheses[b], key=lambda x: x[0], reverse=True)
+ best_hyp = sorted(self.hypotheses[b], key=lambda x: x[0], reverse=True)[
+ : self.n_best
+ ]
for score, pred, attn in best_hyp:
self.scores[b].append(score)
self.predictions[b].append(pred)
self.attention[b].append(attn)
return
- is_alive = ~self.is_finished.view(-1)
- self.alive_seq = self.alive_seq[is_alive]
- self.beams_scores = self.beams_scores[is_alive]
- self.src_len = self.src_len[is_alive]
+ self.select_indices = torch.tensor(
+ non_finished_batch, device=self.alive_seq.device
+ )
+ self.alive_seq = self.alive_seq[self.select_indices]
+ self.beams_scores = self.beams_scores[self.select_indices]
+ self.src_len = self.src_len[self.select_indices]
if self.alive_attn is not None:
- self.alive_attn = self.alive_attn[is_alive]
- self.select_indices = is_alive.nonzero(as_tuple=False).view(-1)
- self.original_batch_idx = self.original_batch_idx[is_alive]
+ self.alive_attn = self.alive_attn[self.select_indices]
+ self.original_batch_idx = self.original_batch_idx[self.select_indices]
self.maybe_update_target_prefix(self.select_indices)
diff --git a/_modules/onmt/translate/translation_server.html b/_modules/onmt/translate/translation_server.html
index 0578bff947..bd00440b7e 100644
--- a/_modules/onmt/translate/translation_server.html
+++ b/_modules/onmt/translate/translation_server.html
@@ -1140,7 +1140,7 @@ Source code for onmt.translate.translation_server
"""De-tokenize the sequence (or not)
Same args/returns as :func:``tokenize()``"""
- if self.tokenizers_opt is not None and "".join(sequence.split()) != "":
+ if self.tokenizers_opt is not None and "".join(sequence.split(" ")) != "":
return self.detokenize(sequence, side)
return sequence
@@ -1153,9 +1153,9 @@ Source code for onmt.translate.translation_server
raise ValueError("No tokenizer loaded")
if self.tokenizers_opt[side]["type"] == "sentencepiece":
- detok = self.tokenizers[side].DecodePieces(sequence.split())
+ detok = self.tokenizers[side].DecodePieces(sequence.split(" "))
elif self.tokenizers_opt[side]["type"] == "pyonmttok":
- detok = self.tokenizers[side].detokenize(sequence.split())
+ detok = self.tokenizers[side].detokenize(sequence.split(" "))
return detok
@@ -1179,7 +1179,7 @@ Source code for onmt.translate.translation_server
"To get decoded alignment, joiner/spacer "
"should be used in both side's tokenizer."
)
- elif "".join(tgt.split()) != "":
+ elif "".join(tgt.split(" ")) != "":
align = to_word_align(
src, tgt, align, align_scores, src_marker, tgt_marker
)
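The `split()` → `split(" ")` changes above are not cosmetic: the two calls treat whitespace differently, and `split(" ")` preserves tokens exactly as the tokenizer produced them instead of collapsing runs of spaces. A quick illustration of the Python semantics involved:

```python
# str.split() collapses consecutive whitespace and drops empty strings;
# str.split(" ") splits on every single space and keeps empty tokens,
# so " ".join(s.split(" ")) round-trips the original string exactly.
print("a  b".split())               # ['a', 'b']
print("a  b".split(" "))            # ['a', '', 'b']
print("".split())                   # []
print("".split(" "))                # ['']
print(" ".join("a  b".split(" ")))  # 'a  b'
```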
diff --git a/_modules/onmt/translate/translator.html b/_modules/onmt/translate/translator.html
index 2bb08cbb1b..28800153f8 100644
--- a/_modules/onmt/translate/translator.html
+++ b/_modules/onmt/translate/translator.html
@@ -203,18 +203,17 @@
Source code for onmt.translate.translator
#!/usr/bin/env python
""" Translator Class and builder """
+import torch
+from torch.nn.functional import log_softmax
+from torch.nn.utils.rnn import pad_sequence
import codecs
-import os
-import time
-import numpy as np
+from time import time
+from math import exp
from itertools import count, zip_longest
from copy import deepcopy
-import torch
-import torch.nn.functional as F
-from torch.nn.utils.rnn import pad_sequence
-from onmt.constants import DefaultTokens
import onmt.model_builder
import onmt.decoders.ensemble
+from onmt.constants import DefaultTokens
from onmt.translate.beam_search import BeamSearch, BeamSearchLM
from onmt.translate.greedy_search import GreedySearch, GreedySearchLM
from onmt.utils.misc import tile, set_random_seed, report_matrix
@@ -311,6 +310,7 @@ Source code for onmt.translate.translator
n_best=1,
min_length=0,
max_length=100,
+ max_length_ratio=1.5,
ratio=0.0,
beam_size=30,
random_sampling_topk=0,
@@ -336,6 +336,7 @@ Source code for onmt.translate.translator
logger=None,
seed=-1,
with_score=False,
+ return_gold_log_probs=False,
):
self.model = model
self.vocabs = vocabs
@@ -356,6 +357,7 @@ Source code for onmt.translate.translator
self.n_best = n_best
self.max_length = max_length
+ self.max_length_ratio = max_length_ratio
self.beam_size = beam_size
self.random_sampling_temp = random_sampling_temp
@@ -407,6 +409,8 @@ Source code for onmt.translate.translator
set_random_seed(seed, self._use_cuda)
self.with_score = with_score
+ self.return_gold_log_probs = return_gold_log_probs
+
@classmethod
def from_opt(
cls,
@@ -447,6 +451,7 @@ Source code for onmt.translate.translator
n_best=opt.n_best,
min_length=opt.min_length,
max_length=opt.max_length,
+ max_length_ratio=opt.max_length_ratio,
ratio=opt.ratio,
beam_size=opt.beam_size,
random_sampling_topk=opt.random_sampling_topk,
@@ -481,26 +486,17 @@ Source code for onmt.translate.translator
print(msg)
def _gold_score(
- self,
- batch,
- enc_out,
- src_len,
- use_src_map,
- enc_final_hs,
- batch_size,
- src,
+ self, batch, enc_out, src_len, use_src_map, enc_final_hs, batch_size, src
):
if "tgt" in batch.keys() and not self.tgt_file_prefix:
- gs = self._score_target(
- batch,
- enc_out,
- src_len,
- batch["src_map"] if use_src_map else None,
+ gs, glp = self._score_target(
+ batch, enc_out, src_len, batch["src_map"] if use_src_map else None
)
self.model.decoder.init_state(src, enc_out, enc_final_hs)
else:
gs = [0] * batch_size
- return gs
+ glp = None
+ return gs, glp
def _translate(
self,
@@ -544,7 +540,7 @@ Source code for onmt.translate.translator
all_scores = []
all_predictions = []
- start_time = time.time()
+ start_time = time()
def _maybe_retranslate(translations, batch):
"""Here we handle the cases of mismatch in number of segments
@@ -656,10 +652,7 @@ Source code for onmt.translate.translator
srcs = [voc_src[tok] for tok in trans.src[: trans.srclen]]
sent_number = next(counter)
output = trans.log(sent_number, src_raw=srcs)
- if self.logger:
- self.logger.info(output)
- else:
- os.write(1, output.encode("utf-8"))
+ self._log(output)
if attn_debug:
preds = trans.pred_sents[0]
@@ -672,10 +665,7 @@ Source code for onmt.translate.translator
else:
srcs = [str(item) for item in range(len(attns[0]))]
output = report_matrix(srcs, preds, attns)
- if self.logger:
- self.logger.info(output)
- else:
- os.write(1, output.encode("utf-8"))
+ self._log(output)
if align_debug:
if self.gold_align:
@@ -690,10 +680,8 @@ Source code for onmt.translate.translator
else:
srcs = [str(item) for item in range(len(align[0]))]
output = report_matrix(srcs, tgts, align)
- if self.logger:
- self.logger.info(output)
- else:
- os.write(1, output.encode("utf-8"))
+ self._log(output)
+
return (
bucket_scores,
bucket_predictions,
@@ -761,7 +749,7 @@ Source code for onmt.translate.translator
gold_score_total += bucket_gold_score
gold_words_total += bucket_gold_words
- end_time = time.time()
+ end_time = time()
if self.report_score:
msg = self._report_score("PRED", pred_score_total, len(all_scores))
@@ -789,6 +777,38 @@ Source code for onmt.translate.translator
return all_scores, all_predictions
+ def _score(self, infer_iter):
+ self.with_scores = True
+ score_res = []
+ processed_bucket = {}
+ prev_bucket_idx = 0
+ for batch, bucket_idx in infer_iter:
+ if bucket_idx != prev_bucket_idx:
+ prev_bucket_idx += 1
+ score_res += [item for _, item in sorted(processed_bucket.items())]
+ processed_bucket = {}
+ batch_data = self.translate_batch(batch, attn_debug=False, scoring=True)
+ batch_gold_scores = batch_data["gold_score"].cpu().numpy().tolist()
+ batch_tgt_lengths = batch["tgtlen"].cpu().numpy().tolist()
+ batch_inds_in_bucket = batch["ind_in_bucket"]
+ if self.return_gold_log_probs:
+ batch_gold_log_probs = (
+ batch_data["gold_log_probs"].cpu().numpy().tolist()
+ )
+ else:
+ batch_gold_log_probs = [
+ None for i, _ in enumerate(batch_inds_in_bucket)
+ ]
+ for i, ind in enumerate(batch_inds_in_bucket):
+ processed_bucket[ind] = [
+ batch_gold_scores[i],
+ batch_gold_log_probs[i],
+ batch_tgt_lengths[i],
+ ]
+ if processed_bucket:
+ score_res += [item for _, item in sorted(processed_bucket.items())]
+ return score_res
+
def _align_pad_prediction(self, predictions, bos, pad):
"""
Padding predictions in batch and add BOS.
@@ -828,7 +848,10 @@ Source code for onmt.translate.translator
msg = "%s No translations" % (name,)
else:
score = score_total / nb_sentences
- ppl = np.exp(-score_total.item() / nb_sentences)
+ try:
+ ppl = exp(-score_total / nb_sentences)
+ except OverflowError:
+ ppl = float("inf")
msg = "%s SCORE: %.4f, %s PPL: %.2f NB SENTENCES: %d" % (
name,
score,
@@ -867,7 +890,6 @@ Source code for onmt.translate.translator
step=step,
return_attn=self.global_scorer.has_cov_pen or return_attn,
)
-
# Generator forward.
if not self.copy_attn:
if "std" in dec_attn:
@@ -875,7 +897,7 @@ Source code for onmt.translate.translator
else:
attn = None
scores = self.model.generator(dec_out.squeeze(1))
- log_probs = F.log_softmax(scores.to(torch.float32), dim=-1)
+ log_probs = log_softmax(scores, dim=-1) # we keep float16 if FP16
# returns [(batch_size x beam_size) , vocab ] when 1 step
# or [batch_size, tgt_len, vocab ] when full sentence
else:
@@ -897,7 +919,6 @@ Source code for onmt.translate.translator
batch,
self._tgt_vocab,
batch_dim=0,
- batch_offset=batch_offset,
)
scores = scores.view(-1, decoder_in.size(1), scores.size(-1))
log_probs = scores.squeeze(1).log()
@@ -915,6 +936,7 @@ Source code for onmt.translate.translator
def report_results(
self,
gold_score,
+ gold_log_probs,
batch,
batch_size,
decode_strategy,
@@ -925,6 +947,7 @@ Source code for onmt.translate.translator
"attention": None,
"batch": batch,
"gold_score": gold_score,
+ "gold_log_probs": gold_log_probs,
}
results["scores"] = decode_strategy.scores
@@ -1005,6 +1028,12 @@ Source code for onmt.translate.translator
def translate_batch(self, batch, attn_debug):
"""Translate a batch of sentences."""
+ if self.max_length_ratio > 0:
+ max_length = int(
+ min(self.max_length, batch["src"].size(1) * self.max_length_ratio + 5)
+ )
+ else:
+ max_length = self.max_length
with torch.no_grad():
if self.sample_from_topk != 0 or self.sample_from_topp != 0:
decode_strategy = GreedySearch(
@@ -1013,10 +1042,11 @@ Source code for onmt.translate.translator
eos=self._tgt_eos_idx,
unk=self._tgt_unk_idx,
start=self._tgt_start_with,
+ n_best=self.n_best,
batch_size=len(batch["srclen"]),
global_scorer=self.global_scorer,
min_length=self.min_length,
- max_length=self.max_length,
+ max_length=max_length,
block_ngram_repeat=self.block_ngram_repeat,
exclusion_tokens=self._exclusion_idxs,
return_attention=attn_debug or self.replace_unk,
@@ -1040,7 +1070,7 @@ Source code for onmt.translate.translator
n_best=self.n_best,
global_scorer=self.global_scorer,
min_length=self.min_length,
- max_length=self.max_length,
+ max_length=max_length,
return_attention=attn_debug or self.replace_unk,
block_ngram_repeat=self.block_ngram_repeat,
exclusion_tokens=self._exclusion_idxs,
@@ -1088,7 +1118,7 @@ Source code for onmt.translate.translator
self.model.decoder.init_state(src, enc_out, enc_final_hs)
- gold_score = self._gold_score(
+ gold_score, gold_log_probs = self._gold_score(
batch,
enc_out,
src_len,
@@ -1149,6 +1179,7 @@ Source code for onmt.translate.translator
return self.report_results(
gold_score,
+ gold_log_probs,
batch,
batch_size,
decode_strategy,
@@ -1170,7 +1201,7 @@ Source code for onmt.translate.translator
gold = tgt[:, 1:, :]
gold_scores = log_probs.gather(2, gold)
gold_scores = gold_scores.sum(dim=1).view(-1)
- return gold_scores
+ return gold_scores, None
class GeneratorLM(Inference):
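As context for the `_score_target` rework above (and its LM counterpart further down): the gold score is simply the sum of the log-probabilities of the reference tokens, picked out of the vocabulary dimension with `gather`. A tiny illustrative example (the shapes are assumptions chosen for the demo, not taken from the diff):

```python
import torch

# (batch=1, tgt_len=3, vocab=5) log-probabilities and the gold token ids
log_probs = torch.log_softmax(torch.randn(1, 3, 5), dim=-1)
gold = torch.tensor([[[2], [0], [4]]])            # shape (1, 3, 1)

gold_log_probs = log_probs.gather(2, gold)        # log p of each gold token, (1, 3, 1)
gold_score = gold_log_probs.sum(dim=1).view(-1)   # one sentence-level score per example
```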
@@ -1189,21 +1220,9 @@ Source code for onmt.translate.translator
"""
raise NotImplementedError
- def translate_batch(self, batch, attn_debug):
+ def translate_batch(self, batch, attn_debug, scoring=False):
"""Translate a batch of sentences."""
- batch_size = len(batch["srclen"])
- if batch_size != 1:
- warning_msg = (
- "GeneratorLM does not support batch_size != 1"
- " nicely. You can remove this limitation here."
- " With batch_size > 1 the end of each input is"
- " repeated until the input is finished. Then"
- " generation will start."
- )
- if self.logger:
- self.logger.info(warning_msg)
- else:
- os.write(1, warning_msg.encode("utf-8"))
+ max_length = 0 if scoring else self.max_length
with torch.no_grad():
if self.sample_from_topk != 0 or self.sample_from_topp != 0:
decode_strategy = GreedySearchLM(
@@ -1212,10 +1231,11 @@ Source code for onmt.translate.translator
eos=self._tgt_eos_idx,
unk=self._tgt_unk_idx,
start=self._tgt_start_with,
+ n_best=self.n_best,
batch_size=len(batch["srclen"]),
global_scorer=self.global_scorer,
min_length=self.min_length,
- max_length=self.max_length,
+ max_length=max_length,
block_ngram_repeat=self.block_ngram_repeat,
exclusion_tokens=self._exclusion_idxs,
return_attention=attn_debug or self.replace_unk,
@@ -1239,7 +1259,7 @@ Source code for onmt.translate.translator
n_best=self.n_best,
global_scorer=self.global_scorer,
min_length=self.min_length,
- max_length=self.max_length,
+ max_length=max_length,
return_attention=attn_debug or self.replace_unk,
block_ngram_repeat=self.block_ngram_repeat,
exclusion_tokens=self._exclusion_idxs,
@@ -1266,7 +1286,7 @@ Source code for onmt.translate.translator
log_probs = log_probs[:, -1, :]
return log_probs
- def _translate_batch_with_strategy(self, batch, decode_strategy):
+ def _translate_batch_with_strategy(self, batch, decode_strategy, left_pad=True):
"""Translate a batch of sentences step by step using cache.
Args:
@@ -1286,18 +1306,17 @@ Source code for onmt.translate.translator
src = batch["src"]
src_len = batch["srclen"]
- src, src_len, target_prefix = self.split_src_to_prevent_padding(src, src_len)
+ if left_pad:
+ target_prefix = None
+ else:
+ src, src_len, target_prefix = self.split_src_to_prevent_padding(
+ src, src_len
+ )
# (2) init decoder
self.model.decoder.init_state(src, None, None)
- gold_score = self._gold_score(
- batch,
- None,
- src_len,
- use_src_map,
- None,
- batch_size,
- src,
+ gold_score, gold_log_probs = self._gold_score(
+ batch, None, src_len, use_src_map, None, batch_size, src
)
# (3) prep decode_strategy. Possibly repeat src objects.
@@ -1310,18 +1329,18 @@ Source code for onmt.translate.translator
)
# (4) Begin decoding step by step:
+ # beg_time = time()
for step in range(decode_strategy.max_length):
decoder_input = (
src if step == 0 else decode_strategy.current_predictions.view(-1, 1, 1)
)
-
log_probs, attn = self._decode_and_generate(
decoder_input,
None,
batch,
src_len=decode_strategy.src_len,
src_map=src_map,
- step=step if step == 0 else step + src_len[0].item(),
+ step=step if step == 0 else step + max(src_len.tolist()),
batch_offset=decode_strategy.batch_offset,
)
@@ -1348,9 +1367,12 @@ Source code for onmt.translate.translator
if parallel_paths > 1 or any_finished:
# select indexes in model state/cache
self.model.decoder.map_state(lambda state, dim: state[select_indices])
+ # if step == 0:
+ # print("step0 time: ", time() - beg_time)
return self.report_results(
gold_score,
+ gold_log_probs,
batch,
batch_size,
decode_strategy,
@@ -1370,10 +1392,13 @@ Source code for onmt.translate.translator
)
log_probs[:, :, self._tgt_pad_idx] = 0
- gold_scores = log_probs.gather(2, tgt)
- gold_scores = gold_scores.sum(dim=1).view(-1)
+ gold_log_probs = log_probs.gather(2, tgt)
+ gold_scores = gold_log_probs.sum(dim=1).view(-1)
+
+ if self.return_gold_log_probs:
+ return gold_scores, gold_log_probs
- return gold_scores
+ return gold_scores, None
diff --git a/_modules/onmt/utils/loss.html b/_modules/onmt/utils/loss.html
index e0d38d0784..5b59c0e94a 100644
--- a/_modules/onmt/utils/loss.html
+++ b/_modules/onmt/utils/loss.html
@@ -531,7 +531,6 @@ Source code for onmt.utils.loss
self._unbottle(scores.clone(), len(batch["srclen"])),
batch,
self.vocab,
- None,
)
scores_data = self._bottle(scores_data)
# Correct target copy token instead of <unk>
diff --git a/_sources/examples/wmt17/Translation.md.txt b/_sources/examples/wmt17/Translation.md.txt
index 39fd01c1f2..9e79fae0f4 100644
--- a/_sources/examples/wmt17/Translation.md.txt
+++ b/_sources/examples/wmt17/Translation.md.txt
@@ -55,7 +55,6 @@ Training the following big transformer for 50K steps takes less than 10 hours on
```bash
python3 ../../../onmt/bin/build_vocab.py --config wmt17/wmt17_ende.yaml --n_sample -1
python3 ../../../onmt/bin/train.py --config wmt17/wmt17_ende.yaml
-bash scripts/onmt/train.sh
```
Translate test sets with various settings on local GPU and CPUs.
diff --git a/_sources/quickstart.md.txt b/_sources/quickstart.md.txt
index c37fd43633..389a529d93 100644
--- a/_sources/quickstart.md.txt
+++ b/_sources/quickstart.md.txt
@@ -218,6 +218,12 @@ tgt: None
In this second example, we used `max_length: 1` with `src: None` and `tgt: None`, which is typically the configuration for a scoring script like MMLU, where only one token is expected as the answer.
+
+**WARNING**
+For inhomogeneous batches (examples of very different lengths), the large number of padding tokens inserted into the shortest examples can degrade results when attention-layer quantization and flash attention are enabled.
+In practice, when `batch_size` is greater than 1 in the inference configuration file,
+remove 'linear_values', 'linear_query', 'linear_keys' and 'final_linear' from `quant_layers` and set `self_attn_type: scaled-dot`.
+
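+A minimal sketch of that adjustment, shown here on a config loaded as a Python dict (the file name and the PyYAML dependency are illustrative, not part of the recipe):
+
+```python
+import yaml
+
+with open("inference.yaml") as f:            # illustrative file name
+    cfg = yaml.safe_load(f)
+
+if cfg.get("batch_size", 1) > 1:
+    attn_linears = {"linear_values", "linear_query", "linear_keys", "final_linear"}
+    cfg["quant_layers"] = [l for l in cfg.get("quant_layers", []) if l not in attn_linears]
+    cfg["self_attn_type"] = "scaled-dot"
+```
+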
You can run this script with the following command line:
```
diff --git a/examples/wmt17/Translation.html b/examples/wmt17/Translation.html
index 257cfb6727..e10c6e4f16 100644
--- a/examples/wmt17/Translation.html
+++ b/examples/wmt17/Translation.html
@@ -271,7 +271,6 @@ Train
Training the following big transformer for 50K steps takes less than 10 hours on a single RTX 4090
python3 ../../../onmt/bin/build_vocab.py --config wmt17/wmt17_ende.yaml --n_sample -1
python3 ../../../onmt/bin/train.py --config wmt17/wmt17_ende.yaml
-bash scripts/onmt/train.sh
Translate test sets with various settings on local GPU and CPUs.
diff --git a/index.html b/index.html
index d1a41c7f04..1849dc93cc 100644
--- a/index.html
+++ b/index.html
@@ -332,24 +332,24 @@ Contents
Data
Vocab
Features
-Transform/BART
-Transform/Terminology
-Transform/FuzzyMatching
-Transform/Filter
-Transform/Prefix
-Transform/Suffix
-Transform/InsertMaskBeforePlaceholdersTransform
-Transform/Clean
-Transform/Uppercase
Transform/SwitchOut
Transform/Token_Drop
Transform/Token_Mask
Transform/Docify
-Transform/InferFeats
+Transform/InsertMaskBeforePlaceholdersTransform
+Transform/Uppercase
+Transform/FuzzyMatching
+Transform/InlineTags
+Transform/Clean
Transform/Subword/Common
Transform/Subword/ONMTTOK
-Transform/InlineTags
Transform/Normalize
+Transform/InferFeats
+Transform/Filter
+Transform/Prefix
+Transform/Suffix
+Transform/Terminology
+Transform/BART
Reproducibility
@@ -360,24 +360,24 @@ Contents
Features
Pruning
Embeddings
-Transform/BART
-Transform/Terminology
-Transform/FuzzyMatching
-Transform/Filter
-Transform/Prefix
-Transform/Suffix
-Transform/InsertMaskBeforePlaceholdersTransform
-Transform/Clean
-Transform/Uppercase
Transform/SwitchOut
Transform/Token_Drop
Transform/Token_Mask
Transform/Docify
-Transform/InferFeats
+Transform/InsertMaskBeforePlaceholdersTransform
+Transform/Uppercase
+Transform/FuzzyMatching
+Transform/InlineTags
+Transform/Clean
Transform/Subword/Common
Transform/Subword/ONMTTOK
-Transform/InlineTags
Transform/Normalize
+Transform/InferFeats
+Transform/Filter
+Transform/Prefix
+Transform/Suffix
+Transform/Terminology
+Transform/BART
Distributed
Model-Embeddings
Model-Embedding Features
@@ -392,7 +392,6 @@ Contents
Optimization- Type
Optimization- Rate
Logging
-Dynamic data
Quant options
@@ -409,24 +408,24 @@ Contents
Logging
Distributed
Efficiency
-Transform/BART
-Transform/Terminology
-Transform/FuzzyMatching
-Transform/Filter
-Transform/Prefix
-Transform/Suffix
-Transform/InsertMaskBeforePlaceholdersTransform
-Transform/Clean
-Transform/Uppercase
Transform/SwitchOut
Transform/Token_Drop
Transform/Token_Mask
Transform/Docify
-Transform/InferFeats
+Transform/InsertMaskBeforePlaceholdersTransform
+Transform/Uppercase
+Transform/FuzzyMatching
+Transform/InlineTags
+Transform/Clean
Transform/Subword/Common
Transform/Subword/ONMTTOK
-Transform/InlineTags
Transform/Normalize
+Transform/InferFeats
+Transform/Filter
+Transform/Prefix
+Transform/Suffix
+Transform/Terminology
+Transform/BART
Quant options
diff --git a/onmt.modules.html b/onmt.modules.html
index bb9f0128a2..81ea6bbe13 100644
--- a/onmt.modules.html
+++ b/onmt.modules.html
@@ -508,7 +508,7 @@ Encoders
-class onmt.encoders.TransformerEncoder(num_layers, d_model, heads, d_ff, dropout, attention_dropout, embeddings, max_relative_positions, relative_positions_buckets, pos_ffn_activation_fn='relu', add_qkvbias=False, num_kv=0, add_ffnbias=True, parallel_residual=False, layer_norm='standard', norm_eps=1e-06, use_ckpting=[], parallel_gpu=1)[source]¶
+class onmt.encoders.TransformerEncoder(num_layers, d_model, heads, d_ff, dropout, attention_dropout, embeddings, max_relative_positions, relative_positions_buckets, pos_ffn_activation_fn='relu', add_qkvbias=False, num_kv=0, add_ffnbias=True, parallel_residual=False, layer_norm='standard', norm_eps=1e-06, use_ckpting=[], parallel_gpu=1, rotary_interleave=True, rotary_theta=10000.0, rotary_dim=0)[source]¶
Bases: EncoderBase
The Transformer encoder from “Attention is All You Need”
[VSP+17]
@@ -695,7 +695,7 @@ Decoders
-class onmt.decoders.TransformerDecoder(num_layers, d_model, heads, d_ff, copy_attn, self_attn_type, dropout, attention_dropout, embeddings, max_relative_positions, relative_positions_buckets, aan_useffn, full_context_alignment, alignment_layer, alignment_heads, pos_ffn_activation_fn='relu', add_qkvbias=False, num_kv=0, add_ffnbias=True, parallel_residual=False, shared_layer_norm=False, layer_norm='standard', norm_eps=1e-06, use_ckpting=[], parallel_gpu=1, sliding_window=0)[source]¶
+class onmt.decoders.TransformerDecoder(num_layers, d_model, heads, d_ff, copy_attn, self_attn_type, dropout, attention_dropout, embeddings, max_relative_positions, relative_positions_buckets, aan_useffn, full_context_alignment, alignment_layer, alignment_heads, pos_ffn_activation_fn='relu', add_qkvbias=False, num_kv=0, add_ffnbias=True, parallel_residual=False, shared_layer_norm=False, layer_norm='standard', norm_eps=1e-06, use_ckpting=[], parallel_gpu=1, sliding_window=0, rotary_interleave=True, rotary_theta=10000.0, rotary_dim=0, num_experts=0, num_experts_per_tok=2)[source]¶
Bases: TransformerDecoderBase
The Transformer decoder from “Attention is All You Need”.
[VSP+17]
@@ -707,7 +707,7 @@ Decoders
onmt.modules.Embeddings) – embeddings to use, should have positional encodings
@@ -720,8 +720,24 @@ Decoders
-class onmt.modules.MultiHeadedAttention(head_count: int, model_dim: int, dropout: float = 0.1, is_decoder: bool = True, max_relative_positions: int = 0, relative_positions_buckets: int = 0, attn_type: str | None = None, add_qkvbias=False, num_kv=0, use_ckpting=[], parallel_gpu=1)[source]¶
+class onmt.modules.MultiHeadedAttention(head_count: int, model_dim: int, dropout: float = 0.1, is_decoder: bool = True, max_relative_positions: int = 0, relative_positions_buckets: int = 0, rotary_interleave: bool = True, rotary_theta: int = 10000.0, rotary_dim: int = 0, attn_type: str | None = None, self_attn_type: str | None = None, add_qkvbias=False, num_kv=0, use_ckpting=[], parallel_gpu=1)[source]¶
Bases: Module
Multi-Head Attention module from “Attention is All You Need”
[VSP+17].
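Given the extended signature documented above, a hedged usage sketch for a rotary self-attention module; the `(batch, length, model_dim)` layout, the `max_relative_positions=-1` convention for selecting rotary embeddings, and the returned `(context, attention)` pair follow the surrounding docs but are not guaranteed against every OpenNMT-py version:

```python
import torch
from onmt.modules import MultiHeadedAttention

attn = MultiHeadedAttention(
    head_count=8,
    model_dim=512,
    max_relative_positions=-1,    # assumption: -1 selects rotary embeddings
    rotary_interleave=True,
    rotary_theta=1e4,
    rotary_dim=0,                 # 0 = use the full head dimension
    attn_type="self",
    self_attn_type="scaled-dot",
)

x = torch.randn(2, 16, 512)       # assumed (batch, length, model_dim) layout
context, attn_weights = attn(x, x, x, mask=None, step=0)   # forward(key, value, query, ...)
```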
@@ -1018,7 +1034,7 @@ Attention
-forward(key: Tensor, value: Tensor, query: Tensor, mask: Tensor | None = None, sliding_window: int | None = 0, step: int | None = 0, return_attn: bool | None = False) Tuple[Tensor, Tensor] [source]¶
+forward(key: Tensor, value: Tensor, query: Tensor, mask: Tensor | None = None, sliding_window: int | None = 0, step: int | None = 0, return_attn: bool | None = False, self_attn_type: str | None = None) Tuple[Tensor, Tensor] [source]¶
Compute the context vector and the attention vectors.
- Parameters:
@@ -1188,7 +1204,7 @@ Attention
-
forward(input)[source]¶
-Defines the computation performed at every call.
+Define the computation performed at every call.
Should be overridden by all subclasses.
Note
diff --git a/onmt.translation.html b/onmt.translation.html
index d790f7a88b..788cec50ba 100644
--- a/onmt.translation.html
+++ b/onmt.translation.html
@@ -297,7 +297,7 @@ Translations¶
-
-class onmt.translate.Translator(model, vocabs, gpu=-1, n_best=1, min_length=0, max_length=100, ratio=0.0, beam_size=30, random_sampling_topk=0, random_sampling_topp=0.0, random_sampling_temp=1.0, stepwise_penalty=None, dump_beam=False, block_ngram_repeat=0, ignore_when_blocking=frozenset({}), replace_unk=False, ban_unk_token=False, tgt_file_prefix=False, phrase_table='', data_type='text', verbose=False, report_time=False, copy_attn=False, global_scorer=None, out_file=None, report_align=False, gold_align=False, report_score=True, logger=None, seed=-1, with_score=False)[source]¶
+class onmt.translate.Translator(model, vocabs, gpu=-1, n_best=1, min_length=0, max_length=100, max_length_ratio=1.5, ratio=0.0, beam_size=30, random_sampling_topk=0, random_sampling_topp=0.0, random_sampling_temp=1.0, stepwise_penalty=None, dump_beam=False, block_ngram_repeat=0, ignore_when_blocking=frozenset({}), replace_unk=False, ban_unk_token=False, tgt_file_prefix=False, phrase_table='', data_type='text', verbose=False, report_time=False, copy_attn=False, global_scorer=None, out_file=None, report_align=False, gold_align=False, report_score=True, logger=None, seed=-1, with_score=False, return_gold_log_probs=False)[source]¶
Bases: Inference
-
@@ -519,7 +519,7 @@
Decoding Strategies
-
-class onmt.translate.GreedySearch(pad, bos, eos, unk, start, batch_size, global_scorer, min_length, block_ngram_repeat, exclusion_tokens, return_attention, max_length, sampling_temp, keep_topk, keep_topp, beam_size, ban_unk_token)[source]¶
+class onmt.translate.GreedySearch(pad, bos, eos, unk, start, n_best, batch_size, global_scorer, min_length, block_ngram_repeat, exclusion_tokens, return_attention, max_length, sampling_temp, keep_topk, keep_topp, beam_size, ban_unk_token)[source]¶
Bases: DecodeStrategy
Select next tokens randomly from the top k possible next tokens.
The scores
attribute’s lists are the score, after applying temperature,
@@ -533,6 +533,8 @@ Decoding Strategies
onmt.translate.GNMTGlobalScorer) – Scorer instance.
min_length (int) – See base.
diff --git a/options/build_vocab.html b/options/build_vocab.html
index 0f2b6c4b36..92316d8467 100644
--- a/options/build_vocab.html
+++ b/options/build_vocab.html
@@ -128,24 +128,24 @@
- Data
- Vocab
- Features
-- Transform/BART
-- Transform/Terminology
-- Transform/FuzzyMatching
-- Transform/Filter
-- Transform/Prefix
-- Transform/Suffix
-- Transform/InsertMaskBeforePlaceholdersTransform
-- Transform/Clean
-- Transform/Uppercase
- Transform/SwitchOut
- Transform/Token_Drop
- Transform/Token_Mask
- Transform/Docify
-- Transform/InferFeats
+- Transform/InsertMaskBeforePlaceholdersTransform
+- Transform/Uppercase
+- Transform/FuzzyMatching
+- Transform/InlineTags
+- Transform/Clean
- Transform/Subword/Common
- Transform/Subword/ONMTTOK
-- Transform/InlineTags
- Transform/Normalize
+- Transform/InferFeats
+- Transform/Filter
+- Transform/Prefix
+- Transform/Suffix
+- Transform/Terminology
+- Transform/BART
- Reproducibility
@@ -237,7 +237,7 @@ Build Vocab
usage: build_vocab.py [-h] [-config CONFIG] [-save_config SAVE_CONFIG] -data
DATA [-skip_empty_level {silent,warning,error}]
- [-transforms {bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} [{bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} ...]]
+ [-transforms {switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} [{switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} ...]]
-save_data SAVE_DATA [-overwrite] [-n_sample N_SAMPLE]
[-dump_samples] [-num_threads NUM_THREADS]
[-learn_subwords]
@@ -249,22 +249,12 @@ Build Vocab
[--default_specials DEFAULT_SPECIALS [DEFAULT_SPECIALS ...]]
[-n_src_feats N_SRC_FEATS]
[-src_feats_defaults SRC_FEATS_DEFAULTS]
- [--permute_sent_ratio PERMUTE_SENT_RATIO]
- [--rotate_ratio ROTATE_RATIO]
- [--insert_ratio INSERT_RATIO]
- [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO]
- [--mask_length {subword,word,span-poisson}]
- [--poisson_lambda POISSON_LAMBDA]
- [--replace_length {-1,0,1}]
- [--termbase_path TERMBASE_PATH]
- [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL]
- [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL]
- [--term_corpus_ratio TERM_CORPUS_RATIO]
- [--term_example_ratio TERM_EXAMPLE_RATIO]
- [--src_term_stoken SRC_TERM_STOKEN]
- [--tgt_term_stoken TGT_TERM_STOKEN]
- [--tgt_term_etoken TGT_TERM_ETOKEN]
- [--term_source_delimiter TERM_SOURCE_DELIMITER]
+ [-switchout_temperature SWITCHOUT_TEMPERATURE]
+ [-tokendrop_temperature TOKENDROP_TEMPERATURE]
+ [-tokenmask_temperature TOKENMASK_TEMPERATURE]
+ [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT]
+ [--response_pattern RESPONSE_PATTERN]
+ [--upper_corpus_ratio UPPER_CORPUS_RATIO]
[--tm_path TM_PATH]
[--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO]
[--fuzzy_threshold FUZZY_THRESHOLD]
@@ -272,23 +262,18 @@ Build Vocab
[--fuzzy_token FUZZY_TOKEN]
[--fuzzymatch_min_length FUZZYMATCH_MIN_LENGTH]
[--fuzzymatch_max_length FUZZYMATCH_MAX_LENGTH]
- [--src_seq_length SRC_SEQ_LENGTH]
- [--tgt_seq_length TGT_SEQ_LENGTH]
- [--src_prefix SRC_PREFIX] [--tgt_prefix TGT_PREFIX]
- [--src_suffix SRC_SUFFIX] [--tgt_suffix TGT_SUFFIX]
- [--response_pattern RESPONSE_PATTERN] [--src_eq_tgt]
+ [--tags_dictionary_path TAGS_DICTIONARY_PATH]
+ [--tags_corpus_ratio TAGS_CORPUS_RATIO]
+ [--max_tags MAX_TAGS] [--paired_stag PAIRED_STAG]
+ [--paired_etag PAIRED_ETAG]
+ [--isolated_tag ISOLATED_TAG]
+ [--src_delimiter SRC_DELIMITER] [--src_eq_tgt]
[--same_char] [--same_word]
[--scripts_ok [SCRIPTS_OK [SCRIPTS_OK ...]]]
[--scripts_nok [SCRIPTS_NOK [SCRIPTS_NOK ...]]]
[--src_tgt_ratio SRC_TGT_RATIO]
[--avg_tok_min AVG_TOK_MIN] [--avg_tok_max AVG_TOK_MAX]
[--langid [LANGID [LANGID ...]]]
- [--upper_corpus_ratio UPPER_CORPUS_RATIO]
- [-switchout_temperature SWITCHOUT_TEMPERATURE]
- [-tokendrop_temperature TOKENDROP_TEMPERATURE]
- [-tokenmask_temperature TOKENMASK_TEMPERATURE]
- [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT]
- [--reversible_tokenization {joiner,spacer}]
[-src_subword_model SRC_SUBWORD_MODEL]
[-tgt_subword_model TGT_SUBWORD_MODEL]
[-src_subword_nbest SRC_SUBWORD_NBEST]
@@ -303,18 +288,32 @@ Build Vocab
[-tgt_subword_type {none,sentencepiece,bpe}]
[-src_onmttok_kwargs SRC_ONMTTOK_KWARGS]
[-tgt_onmttok_kwargs TGT_ONMTTOK_KWARGS] [--gpt2_pretok]
- [--tags_dictionary_path TAGS_DICTIONARY_PATH]
- [--tags_corpus_ratio TAGS_CORPUS_RATIO]
- [--max_tags MAX_TAGS] [--paired_stag PAIRED_STAG]
- [--paired_etag PAIRED_ETAG]
- [--isolated_tag ISOLATED_TAG]
- [--src_delimiter SRC_DELIMITER] [--src_lang SRC_LANG]
- [--tgt_lang TGT_LANG] [--penn PENN]
- [--norm_quote_commas NORM_QUOTE_COMMAS]
+ [--src_lang SRC_LANG] [--tgt_lang TGT_LANG]
+ [--penn PENN] [--norm_quote_commas NORM_QUOTE_COMMAS]
[--norm_numbers NORM_NUMBERS]
[--pre_replace_unicode_punct PRE_REPLACE_UNICODE_PUNCT]
[--post_remove_control_chars POST_REMOVE_CONTROL_CHARS]
- [--seed SEED]
+ [--reversible_tokenization {joiner,spacer}]
+ [--src_seq_length SRC_SEQ_LENGTH]
+ [--tgt_seq_length TGT_SEQ_LENGTH]
+ [--src_prefix SRC_PREFIX] [--tgt_prefix TGT_PREFIX]
+ [--src_suffix SRC_SUFFIX] [--tgt_suffix TGT_SUFFIX]
+ [--termbase_path TERMBASE_PATH]
+ [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL]
+ [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL]
+ [--term_corpus_ratio TERM_CORPUS_RATIO]
+ [--term_example_ratio TERM_EXAMPLE_RATIO]
+ [--src_term_stoken SRC_TERM_STOKEN]
+ [--tgt_term_stoken TGT_TERM_STOKEN]
+ [--tgt_term_etoken TGT_TERM_ETOKEN]
+ [--term_source_delimiter TERM_SOURCE_DELIMITER]
+ [--permute_sent_ratio PERMUTE_SENT_RATIO]
+ [--rotate_ratio ROTATE_RATIO]
+ [--insert_ratio INSERT_RATIO]
+ [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO]
+ [--mask_length {subword,word,span-poisson}]
+ [--poisson_lambda POISSON_LAMBDA]
+ [--replace_length {-1,0,1}] [--seed SEED]
@@ -340,7 +339,7 @@ Data
Default: “warning”
- -transforms, --transforms
-Possible choices: bart, terminology, fuzzymatch, filtertoolong, prefix, suffix, insert_mask_before_placeholder, clean, uppercase, switchout, tokendrop, tokenmask, docify, inferfeats, sentencepiece, bpe, onmt_tokenize, inlinetags, normalize
+Possible choices: switchout, tokendrop, tokenmask, docify, insert_mask_before_placeholder, uppercase, fuzzymatch, inlinetags, clean, sentencepiece, bpe, onmt_tokenize, normalize, inferfeats, filtertoolong, prefix, suffix, terminology, bart
Default transform pipeline to apply to data. Can be specified in each corpus of data to override.
Default: []
@@ -412,84 +411,65 @@ Features
-Transform/BART¶
+
+Transform/SwitchOut¶
Caution
This transform will not take effect when building vocabulary.
-- --permute_sent_ratio, -permute_sent_ratio
-Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.
-Default: 0.0
-
-- --rotate_ratio, -rotate_ratio
-Rotate this proportion of inputs.
-Default: 0.0
-
-- --insert_ratio, -insert_ratio
-Insert this percentage of additional random tokens.
-Default: 0.0
-
-- --random_ratio, -random_ratio
-Instead of using <mask>, use random token this often.
-Default: 0.0
-
-- --mask_ratio, -mask_ratio
-Fraction of words/subwords that will be masked.
-Default: 0.0
-
-- --mask_length, -mask_length
-Possible choices: subword, word, span-poisson
-Length of masking window to apply.
-Default: “subword”
-
-- --poisson_lambda, -poisson_lambda
-Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.
-Default: 3.0
-
-- --replace_length, -replace_length
-Possible choices: -1, 0, 1
-When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)
-Default: -1
+- -switchout_temperature, --switchout_temperature
+Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.
+Default: 1.0
-
-Transform/Terminology¶
+
+Transform/Token_Drop¶
-- --termbase_path, -termbase_path
-Path to a dictionary file with terms.
-
-- --src_spacy_language_model, -src_spacy_language_model
-Name of the spacy language model for the source corpus.
-
-- --tgt_spacy_language_model, -tgt_spacy_language_model
-Name of the spacy language model for the target corpus.
-
-- --term_corpus_ratio, -term_corpus_ratio
-Ratio of corpus to augment with terms.
-Default: 0.3
+- -tokendrop_temperature, --tokendrop_temperature
+Sampling temperature for token deletion.
+Default: 1.0
-- --term_example_ratio, -term_example_ratio
-Max terms allowed in an example.
-Default: 0.2
+
+
+
+Transform/Token_Mask¶
+
+- -tokenmask_temperature, --tokenmask_temperature
+Sampling temperature for token masking.
+Default: 1.0
-- --src_term_stoken, -src_term_stoken
-The source term start token.
-Default: “⦅src_term_start⦆”
+
+
+
+Transform/Docify¶
+
+- --doc_length, -doc_length
+Number of tokens per doc.
+Default: 200
-- --tgt_term_stoken, -tgt_term_stoken
-The target term start token.
-Default: “⦅tgt_term_start⦆”
+- --max_context, -max_context
+Max context segments.
+Default: 1
-- --tgt_term_etoken, -tgt_term_etoken
-The target term end token.
-Default: “⦅tgt_term_end⦆”
+
+
+
+Transform/InsertMaskBeforePlaceholdersTransform¶
+
+- --response_pattern, -response_pattern
+Response pattern to locate the end of the prompt
+Default: “Response : ⦅newline⦆”
-- --term_source_delimiter, -term_source_delimiter
-Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.
-Default: “⦅fuzzy⦆”
+
+
+
+Transform/Uppercase¶
+
+- --upper_corpus_ratio, -upper_corpus_ratio
+Corpus ratio to apply uppercasing.
+Default: 0.01
@@ -525,51 +505,35 @@ Transform/FuzzyMatching
-
-Transform/Filter¶
+
+Transform/InlineTags¶
-- --src_seq_length, -src_seq_length
-Maximum source sequence length.
-Default: 192
+- --tags_dictionary_path, -tags_dictionary_path
+Path to a flat term dictionary.
-- --tgt_seq_length, -tgt_seq_length
-Maximum target sequence length.
-Default: 192
+- --tags_corpus_ratio, -tags_corpus_ratio
+Ratio of corpus to augment with tags.
+Default: 0.1
-
-
-
-Transform/Prefix¶
-
-- --src_prefix, -src_prefix
-String to prepend to all source example.
-Default: “”
+- --max_tags, -max_tags
+Maximum number of tags that can be added to a single sentence.
+Default: 12
-- --tgt_prefix, -tgt_prefix
-String to prepend to all target example.
-Default: “”
+- --paired_stag, -paired_stag
+The format of an opening paired inline tag. Must include the character #.
+Default: “⦅ph_#_beg⦆”
-
-
-
-Transform/Suffix¶
-
-- --src_suffix, -src_suffix
-String to append to all source example.
-Default: “”
+- --paired_etag, -paired_etag
+The format of a closing paired inline tag. Must include the character #.
+Default: “⦅ph_#_end⦆”
-- --tgt_suffix, -tgt_suffix
-String to append to all target example.
-Default: “”
+- --isolated_tag, -isolated_tag
+The format of an isolated inline tag. Must include the character #.
+Default: “⦅ph_#_std⦆”
-
-
-
-Transform/InsertMaskBeforePlaceholdersTransform¶
-
-- --response_pattern, -response_pattern
-Response patten to locate the end of the prompt
-Default: “Response : ⦅newline⦆”
+- --src_delimiter, -src_delimiter
+Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.
+Default: “⦅fuzzy⦆”
@@ -614,69 +578,6 @@ Transform/Clean
-Transform/Uppercase¶
-
-- --upper_corpus_ratio, -upper_corpus_ratio
-Corpus ratio to apply uppercasing.
-Default: 0.01
-
-
-
-
-Transform/SwitchOut¶
-
-Caution
-This transform will not take effect when building vocabulary.
-
-
-- -switchout_temperature, --switchout_temperature
-Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.
-Default: 1.0
-
-
-
-
-Transform/Token_Drop¶
-
-- -tokendrop_temperature, --tokendrop_temperature
-Sampling temperature for token deletion.
-Default: 1.0
-
-
-
-
-Transform/Token_Mask¶
-
-- -tokenmask_temperature, --tokenmask_temperature
-Sampling temperature for token masking.
-Default: 1.0
-
-
-
-
-Transform/Docify¶
-
-- --doc_length, -doc_length
-Number of tokens per doc.
-Default: 200
-
-- --max_context, -max_context
-Max context segments.
-Default: 1
-
-
-
-
-Transform/InferFeats¶
-
-- --reversible_tokenization, -reversible_tokenization
-Possible choices: joiner, spacer
-Type of reversible tokenization applied on the tokenizer.
-Default: “joiner”
-
-
-
Transform/Subword/Common¶
@@ -751,38 +652,6 @@ Transform/Subword/ONMTTOK
-Transform/InlineTags¶
-
-- --tags_dictionary_path, -tags_dictionary_path
-Path to a flat term dictionary.
-
-- --tags_corpus_ratio, -tags_corpus_ratio
-Ratio of corpus to augment with tags.
-Default: 0.1
-
-- --max_tags, -max_tags
-Maximum number of tags that can be added to a single sentence.
-Default: 12
-
-- --paired_stag, -paired_stag
-The format of an opening paired inline tag. Must include the character #.
-Default: “⦅ph_#_beg⦆”
-
-- --paired_etag, -paired_etag
-The format of a closing paired inline tag. Must include the character #.
-Default: “⦅ph_#_end⦆”
-
-- --isolated_tag, -isolated_tag
-The format of an isolated inline tag. Must include the character #.
-Default: “⦅ph_#_std⦆”
-
-- --src_delimiter, -src_delimiter
-Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.
-Default: “⦅fuzzy⦆”
-
-
-
Transform/Normalize¶
+
+Transform/Filter¶
+
+- --src_seq_length, -src_seq_length
+Maximum source sequence length.
+Default: 192
+
+- --tgt_seq_length, -tgt_seq_length
+Maximum target sequence length.
+Default: 192
+
+
+
+
+Transform/Prefix¶
+
+- --src_prefix, -src_prefix
+String to prepend to all source example.
+Default: “”
+
+- --tgt_prefix, -tgt_prefix
+String to prepend to all target example.
+Default: “”
+
+
+
+
+Transform/Suffix¶
+
+- --src_suffix, -src_suffix
+String to append to all source example.
+Default: “”
+
+- --tgt_suffix, -tgt_suffix
+String to append to all target example.
+Default: “”
+
+
+
+
+Transform/Terminology¶
+
+- --termbase_path, -termbase_path
+Path to a dictionary file with terms.
+
+- --src_spacy_language_model, -src_spacy_language_model
+Name of the spacy language model for the source corpus.
+
+- --tgt_spacy_language_model, -tgt_spacy_language_model
+Name of the spacy language model for the target corpus.
+
+- --term_corpus_ratio, -term_corpus_ratio
+Ratio of corpus to augment with terms.
+Default: 0.3
+
+- --term_example_ratio, -term_example_ratio
+Max terms allowed in an example.
+Default: 0.2
+
+- --src_term_stoken, -src_term_stoken
+The source term start token.
+Default: “⦅src_term_start⦆”
+
+- --tgt_term_stoken, -tgt_term_stoken
+The target term start token.
+Default: “⦅tgt_term_start⦆”
+
+- --tgt_term_etoken, -tgt_term_etoken
+The target term end token.
+Default: “⦅tgt_term_end⦆”
+
+- --term_source_delimiter, -term_source_delimiter
+Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.
+Default: “⦅fuzzy⦆”
+
+
+
+
+Transform/BART¶
+
+Caution
+This transform will not take effect when building vocabulary.
+
+
+- --permute_sent_ratio, -permute_sent_ratio
+Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.
+Default: 0.0
+
+- --rotate_ratio, -rotate_ratio
+Rotate this proportion of inputs.
+Default: 0.0
+
+- --insert_ratio, -insert_ratio
+Insert this percentage of additional random tokens.
+Default: 0.0
+
+- --random_ratio, -random_ratio
+Instead of using <mask>, use random token this often.
+Default: 0.0
+
+- --mask_ratio, -mask_ratio
+Fraction of words/subwords that will be masked.
+Default: 0.0
+
+- --mask_length, -mask_length
+Possible choices: subword, word, span-poisson
+Length of masking window to apply.
+Default: “subword”
+
+- --poisson_lambda, -poisson_lambda
+Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.
+Default: 3.0
+
+- --replace_length, -replace_length
+Possible choices: -1, 0, 1
+When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)
+Default: -1
+
+
+
Reproducibility¶
diff --git a/options/server.html b/options/server.html
index ba7f9f9b21..e3fb56241a 100644
--- a/options/server.html
+++ b/options/server.html
@@ -213,7 +213,7 @@ Server
OpenNMT-py REST Server
usage: server.py [-h] [--ip IP] [--port PORT] [--url_root URL_ROOT] [--debug]
- [--config CONFIG]
+ [--model_config MODEL_CONFIG]
@@ -231,7 +231,7 @@ Named Arguments
Features
- Pruning
- Embeddings
-- Transform/BART
-- Transform/Terminology
-- Transform/FuzzyMatching
-- Transform/Filter
-- Transform/Prefix
-- Transform/Suffix
-- Transform/InsertMaskBeforePlaceholdersTransform
-- Transform/Clean
-- Transform/Uppercase
- Transform/SwitchOut
- Transform/Token_Drop
- Transform/Token_Mask
- Transform/Docify
-- Transform/InferFeats
+- Transform/InsertMaskBeforePlaceholdersTransform
+- Transform/Uppercase
+- Transform/FuzzyMatching
+- Transform/InlineTags
+- Transform/Clean
- Transform/Subword/Common
- Transform/Subword/ONMTTOK
-- Transform/InlineTags
- Transform/Normalize
+- Transform/InferFeats
+- Transform/Filter
+- Transform/Prefix
+- Transform/Suffix
+- Transform/Terminology
+- Transform/BART
- Distributed
- Model-Embeddings
- Model-Embedding Features
@@ -163,7 +163,6 @@
- Optimization- Type
- Optimization- Rate
- Logging
-- Dynamic data
- Quant options
@@ -254,7 +253,7 @@ Train
usage: train.py [-h] [-config CONFIG] [-save_config SAVE_CONFIG] -data DATA
[-skip_empty_level {silent,warning,error}]
- [-transforms {bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} [{bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} ...]]
+ [-transforms {switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} [{switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} ...]]
[-save_data SAVE_DATA] [-overwrite] [-n_sample N_SAMPLE]
[-dump_transforms] -src_vocab SRC_VOCAB [-tgt_vocab TGT_VOCAB]
[-share_vocab] [--decoder_start_token DECODER_START_TOKEN]
@@ -272,41 +271,26 @@ Train
[-src_embeddings SRC_EMBEDDINGS]
[-tgt_embeddings TGT_EMBEDDINGS]
[-embeddings_type {GloVe,word2vec}]
- [--permute_sent_ratio PERMUTE_SENT_RATIO]
- [--rotate_ratio ROTATE_RATIO] [--insert_ratio INSERT_RATIO]
- [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO]
- [--mask_length {subword,word,span-poisson}]
- [--poisson_lambda POISSON_LAMBDA] [--replace_length {-1,0,1}]
- [--termbase_path TERMBASE_PATH]
- [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL]
- [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL]
- [--term_corpus_ratio TERM_CORPUS_RATIO]
- [--term_example_ratio TERM_EXAMPLE_RATIO]
- [--src_term_stoken SRC_TERM_STOKEN]
- [--tgt_term_stoken TGT_TERM_STOKEN]
- [--tgt_term_etoken TGT_TERM_ETOKEN]
- [--term_source_delimiter TERM_SOURCE_DELIMITER]
- [--tm_path TM_PATH] [--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO]
+ [-switchout_temperature SWITCHOUT_TEMPERATURE]
+ [-tokendrop_temperature TOKENDROP_TEMPERATURE]
+ [-tokenmask_temperature TOKENMASK_TEMPERATURE]
+ [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT]
+ [--response_pattern RESPONSE_PATTERN]
+ [--upper_corpus_ratio UPPER_CORPUS_RATIO] [--tm_path TM_PATH]
+ [--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO]
[--fuzzy_threshold FUZZY_THRESHOLD]
[--tm_delimiter TM_DELIMITER] [--fuzzy_token FUZZY_TOKEN]
[--fuzzymatch_min_length FUZZYMATCH_MIN_LENGTH]
[--fuzzymatch_max_length FUZZYMATCH_MAX_LENGTH]
- [--src_seq_length SRC_SEQ_LENGTH]
- [--tgt_seq_length TGT_SEQ_LENGTH] [--src_prefix SRC_PREFIX]
- [--tgt_prefix TGT_PREFIX] [--src_suffix SRC_SUFFIX]
- [--tgt_suffix TGT_SUFFIX]
- [--response_pattern RESPONSE_PATTERN] [--src_eq_tgt]
- [--same_char] [--same_word]
+ [--tags_dictionary_path TAGS_DICTIONARY_PATH]
+ [--tags_corpus_ratio TAGS_CORPUS_RATIO] [--max_tags MAX_TAGS]
+ [--paired_stag PAIRED_STAG] [--paired_etag PAIRED_ETAG]
+ [--isolated_tag ISOLATED_TAG] [--src_delimiter SRC_DELIMITER]
+ [--src_eq_tgt] [--same_char] [--same_word]
[--scripts_ok [SCRIPTS_OK [SCRIPTS_OK ...]]]
[--scripts_nok [SCRIPTS_NOK [SCRIPTS_NOK ...]]]
[--src_tgt_ratio SRC_TGT_RATIO] [--avg_tok_min AVG_TOK_MIN]
[--avg_tok_max AVG_TOK_MAX] [--langid [LANGID [LANGID ...]]]
- [--upper_corpus_ratio UPPER_CORPUS_RATIO]
- [-switchout_temperature SWITCHOUT_TEMPERATURE]
- [-tokendrop_temperature TOKENDROP_TEMPERATURE]
- [-tokenmask_temperature TOKENMASK_TEMPERATURE]
- [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT]
- [--reversible_tokenization {joiner,spacer}]
[-src_subword_model SRC_SUBWORD_MODEL]
[-tgt_subword_model TGT_SUBWORD_MODEL]
[-src_subword_nbest SRC_SUBWORD_NBEST]
@@ -321,22 +305,36 @@ Train
[-tgt_subword_type {none,sentencepiece,bpe}]
[-src_onmttok_kwargs SRC_ONMTTOK_KWARGS]
[-tgt_onmttok_kwargs TGT_ONMTTOK_KWARGS] [--gpt2_pretok]
- [--tags_dictionary_path TAGS_DICTIONARY_PATH]
- [--tags_corpus_ratio TAGS_CORPUS_RATIO] [--max_tags MAX_TAGS]
- [--paired_stag PAIRED_STAG] [--paired_etag PAIRED_ETAG]
- [--isolated_tag ISOLATED_TAG] [--src_delimiter SRC_DELIMITER]
[--src_lang SRC_LANG] [--tgt_lang TGT_LANG] [--penn PENN]
[--norm_quote_commas NORM_QUOTE_COMMAS]
[--norm_numbers NORM_NUMBERS]
[--pre_replace_unicode_punct PRE_REPLACE_UNICODE_PUNCT]
[--post_remove_control_chars POST_REMOVE_CONTROL_CHARS]
+ [--reversible_tokenization {joiner,spacer}]
+ [--src_seq_length SRC_SEQ_LENGTH]
+ [--tgt_seq_length TGT_SEQ_LENGTH] [--src_prefix SRC_PREFIX]
+ [--tgt_prefix TGT_PREFIX] [--src_suffix SRC_SUFFIX]
+ [--tgt_suffix TGT_SUFFIX] [--termbase_path TERMBASE_PATH]
+ [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL]
+ [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL]
+ [--term_corpus_ratio TERM_CORPUS_RATIO]
+ [--term_example_ratio TERM_EXAMPLE_RATIO]
+ [--src_term_stoken SRC_TERM_STOKEN]
+ [--tgt_term_stoken TGT_TERM_STOKEN]
+ [--tgt_term_etoken TGT_TERM_ETOKEN]
+ [--term_source_delimiter TERM_SOURCE_DELIMITER]
+ [--permute_sent_ratio PERMUTE_SENT_RATIO]
+ [--rotate_ratio ROTATE_RATIO] [--insert_ratio INSERT_RATIO]
+ [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO]
+ [--mask_length {subword,word,span-poisson}]
+ [--poisson_lambda POISSON_LAMBDA] [--replace_length {-1,0,1}]
[--gpu_ranks [GPU_RANKS [GPU_RANKS ...]]]
[--world_size WORLD_SIZE]
[--parallel_mode {tensor_parallel,data_parallel}]
[--gpu_backend GPU_BACKEND]
[--gpu_verbose_level GPU_VERBOSE_LEVEL]
[--master_ip MASTER_IP] [--master_port MASTER_PORT]
- [--src_word_vec_size SRC_WORD_VEC_SIZE]
+ [--timeout TIMEOUT] [--src_word_vec_size SRC_WORD_VEC_SIZE]
[--tgt_word_vec_size TGT_WORD_VEC_SIZE]
[--word_vec_size WORD_VEC_SIZE] [--share_decoder_embeddings]
[--share_embeddings] [--position_encoding]
@@ -365,8 +363,11 @@ Train
[--self_attn_type SELF_ATTN_TYPE]
[--max_relative_positions MAX_RELATIVE_POSITIONS]
[--relative_positions_buckets RELATIVE_POSITIONS_BUCKETS]
- [--heads HEADS] [--sliding_window SLIDING_WINDOW]
- [--transformer_ff TRANSFORMER_FF] [--aan_useffn]
+ [--rotary_interleave] [--rotary_theta ROTARY_THETA]
+ [--rotary_dim ROTARY_DIM] [--heads HEADS]
+ [--sliding_window SLIDING_WINDOW]
+ [--transformer_ff TRANSFORMER_FF] [--num_experts NUM_EXPERTS]
+ [--num_experts_per_tok NUM_EXPERTS_PER_TOK] [--aan_useffn]
[--add_qkvbias] [--multiquery] [--num_kv NUM_KV]
[--add_ffnbias] [--parallel_residual] [--shared_layer_norm]
[--lambda_align LAMBDA_ALIGN]
@@ -381,7 +382,10 @@ Train
[--lm_prior_tau LM_PRIOR_TAU] [--loss_scale LOSS_SCALE]
[--apex_opt_level {,O0,O1,O2,O3}] [--zero_out_prompt_loss]
[--use_ckpting {ffn,mha,lora} [{ffn,mha,lora} ...]]
- [--data_type DATA_TYPE] [--save_model SAVE_MODEL]
+ [--data_type DATA_TYPE] [-bucket_size BUCKET_SIZE]
+ [-bucket_size_init BUCKET_SIZE_INIT]
+ [-bucket_size_increment BUCKET_SIZE_INCREMENT]
+ [-prefetch_factor PREFETCH_FACTOR] [--save_model SAVE_MODEL]
[--save_format {pytorch,safetensors}]
[--save_checkpoint_steps SAVE_CHECKPOINT_STEPS]
[--keep_checkpoint KEEP_CHECKPOINT]
@@ -428,12 +432,9 @@ Train
[--report_every REPORT_EVERY] [--exp_host EXP_HOST]
[--exp EXP] [--tensorboard]
[--tensorboard_log_dir TENSORBOARD_LOG_DIR] [--override_opts]
- [-bucket_size BUCKET_SIZE]
- [-bucket_size_init BUCKET_SIZE_INIT]
- [-bucket_size_increment BUCKET_SIZE_INCREMENT]
- [-prefetch_factor PREFETCH_FACTOR]
[--quant_layers QUANT_LAYERS [QUANT_LAYERS ...]]
- [--quant_type {bnb_8bit,bnb_FP4,bnb_NF4}]
+ [--quant_type {,bnb_8bit,bnb_FP4,bnb_NF4,awq_gemm,awq_gemv}]
+ [--w_bit {4}] [--group_size {128}]
@@ -459,7 +460,7 @@ Data
Default: “warning”
- -transforms, --transforms
-Possible choices: bart, terminology, fuzzymatch, filtertoolong, prefix, suffix, insert_mask_before_placeholder, clean, uppercase, switchout, tokendrop, tokenmask, docify, inferfeats, sentencepiece, bpe, onmt_tokenize, inlinetags, normalize
+Possible choices: switchout, tokendrop, tokenmask, docify, insert_mask_before_placeholder, uppercase, fuzzymatch, inlinetags, clean, sentencepiece, bpe, onmt_tokenize, normalize, inferfeats, filtertoolong, prefix, suffix, terminology, bart
Default transform pipeline to apply to data. Can be specified in each corpus of data to override.
Default: []
@@ -564,80 +565,61 @@ Embeddings
-Transform/BART¶
+
+Transform/SwitchOut¶
-- --permute_sent_ratio, -permute_sent_ratio
-Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.
-Default: 0.0
-
-- --rotate_ratio, -rotate_ratio
-Rotate this proportion of inputs.
-Default: 0.0
-
-- --insert_ratio, -insert_ratio
-Insert this percentage of additional random tokens.
-Default: 0.0
-
-- --random_ratio, -random_ratio
-Instead of using <mask>, use random token this often.
-Default: 0.0
-
-- --mask_ratio, -mask_ratio
-Fraction of words/subwords that will be masked.
-Default: 0.0
-
-- --mask_length, -mask_length
-Possible choices: subword, word, span-poisson
-Length of masking window to apply.
-Default: “subword”
-
-- --poisson_lambda, -poisson_lambda
-Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.
-Default: 3.0
-
-- --replace_length, -replace_length
-Possible choices: -1, 0, 1
-When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)
-Default: -1
+- -switchout_temperature, --switchout_temperature
+Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.
+Default: 1.0
-
-Transform/Terminology¶
+
+Transform/Token_Drop¶
-- --termbase_path, -termbase_path
-Path to a dictionary file with terms.
-
-- --src_spacy_language_model, -src_spacy_language_model
-Name of the spacy language model for the source corpus.
-
-- --tgt_spacy_language_model, -tgt_spacy_language_model
-Name of the spacy language model for the target corpus.
-
-- --term_corpus_ratio, -term_corpus_ratio
-Ratio of corpus to augment with terms.
-Default: 0.3
+- -tokendrop_temperature, --tokendrop_temperature
+Sampling temperature for token deletion.
+Default: 1.0
-- --term_example_ratio, -term_example_ratio
-Max terms allowed in an example.
-Default: 0.2
+
+
+
+Transform/Token_Mask¶
+
+- -tokenmask_temperature, --tokenmask_temperature
+Sampling temperature for token masking.
+Default: 1.0
-- --src_term_stoken, -src_term_stoken
-The source term start token.
-Default: “⦅src_term_start⦆”
+
+
+
+Transform/Docify¶
+
+- --doc_length, -doc_length
+Number of tokens per doc.
+Default: 200
-- --tgt_term_stoken, -tgt_term_stoken
-The target term start token.
-Default: “⦅tgt_term_start⦆”
+- --max_context, -max_context
+Max context segments.
+Default: 1
-- --tgt_term_etoken, -tgt_term_etoken
-The target term end token.
-Default: “⦅tgt_term_end⦆”
+
+
+
+Transform/InsertMaskBeforePlaceholdersTransform¶
+
+- --response_pattern, -response_pattern
+Response pattern to locate the end of the prompt
+Default: “Response : ⦅newline⦆”
-- --term_source_delimiter, -term_source_delimiter
-Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.
-Default: “⦅fuzzy⦆”
+
+
+
+Transform/Uppercase¶
+
+- --upper_corpus_ratio, -upper_corpus_ratio
+Corpus ratio to apply uppercasing.
+Default: 0.01
@@ -673,51 +655,35 @@ Transform/FuzzyMatching
-
-Transform/Filter¶
+
+Transform/InlineTags¶
-- --src_seq_length, -src_seq_length
-Maximum source sequence length.
-Default: 192
+- --tags_dictionary_path, -tags_dictionary_path
+Path to a flat term dictionary.
-- --tgt_seq_length, -tgt_seq_length
-Maximum target sequence length.
-Default: 192
+- --tags_corpus_ratio, -tags_corpus_ratio
+Ratio of corpus to augment with tags.
+Default: 0.1
-
-
-
-Transform/Prefix¶
-
-- --src_prefix, -src_prefix
-String to prepend to all source example.
-Default: “”
+- --max_tags, -max_tags
+Maximum number of tags that can be added to a single sentence.
+Default: 12
-- --tgt_prefix, -tgt_prefix
-String to prepend to all target example.
-Default: “”
+- --paired_stag, -paired_stag
+The format of an opening paired inline tag. Must include the character #.
+Default: “⦅ph_#_beg⦆”
-
-
-
-Transform/Suffix¶
-
-- --src_suffix, -src_suffix
-String to append to all source example.
-Default: “”
+- --paired_etag, -paired_etag
+The format of a closing paired inline tag. Must include the character #.
+Default: “⦅ph_#_end⦆”
-- --tgt_suffix, -tgt_suffix
-String to append to all target example.
-Default: “”
+- --isolated_tag, -isolated_tag
+The format of an isolated inline tag. Must include the character #.
+Default: “⦅ph_#_std⦆”
-
-
-
-Transform/InsertMaskBeforePlaceholdersTransform¶
-
-- --response_pattern, -response_pattern
-Response patten to locate the end of the prompt
-Default: “Response : ⦅newline⦆”
+- --src_delimiter, -src_delimiter
+Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.
+Default: “⦅fuzzy⦆”
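
Putting the InlineTags options above together, a hedged YAML sketch of how this transform might be declared in a config (the dictionary path and ratio are placeholders):

    # inlinetags transform excerpt (illustrative)
    transforms: [inlinetags]
    tags_dictionary_path: tags_dict.txt   # hypothetical flat term dictionary
    tags_corpus_ratio: 0.1
    max_tags: 12
    paired_stag: "⦅ph_#_beg⦆"
    paired_etag: "⦅ph_#_end⦆"
    isolated_tag: "⦅ph_#_std⦆"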
@@ -748,76 +714,17 @@ Transform/Clean
-Transform/Uppercase¶
-
-- --upper_corpus_ratio, -upper_corpus_ratio
-Corpus ratio to apply uppercasing.
-Default: 0.01
-
-
-
-
-Transform/SwitchOut¶
-
-- -switchout_temperature, --switchout_temperature
-Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.
-Default: 1.0
-
-
-
-
-Transform/Token_Drop¶
-
-- -tokendrop_temperature, --tokendrop_temperature
-Sampling temperature for token deletion.
-Default: 1.0
-
-
-
-
-Transform/Token_Mask¶
-
-- -tokenmask_temperature, --tokenmask_temperature
-Sampling temperature for token masking.
-Default: 1.0
-
-
-
-
-Transform/Docify¶
-
-- --doc_length, -doc_length
-Number of tokens per doc.
-Default: 200
-
-- --max_context, -max_context
-Max context segments.
-Default: 1
+- --avg_tok_min, -avg_tok_min
+Average token length, minimum.
+Default: 3
-
-
-
-Transform/InferFeats¶
-
-- --reversible_tokenization, -reversible_tokenization
-Possible choices: joiner, spacer
-Type of reversible tokenization applied on the tokenizer.
-Default: “joiner”
+- --avg_tok_max, -avg_tok_max
+Average token length, maximum.
+Default: 20
+
+- --langid, -langid
+List of accepted languages.
+Default: []
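
The new Clean thresholds above sit alongside the other clean filters listed in the synopsis; a hedged YAML sketch with placeholder values:

    # clean transform excerpt (illustrative)
    transforms: [clean]
    src_eq_tgt: true        # drop examples where source equals target
    same_char: true
    avg_tok_min: 3          # drop examples whose average token length is below 3
    avg_tok_max: 20
    langid: [en, de]        # keep only these languages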
@@ -895,38 +802,6 @@ Transform/Subword/ONMTTOK
-Transform/InlineTags¶
-
-- --tags_dictionary_path, -tags_dictionary_path
-Path to a flat term dictionary.
-
-- --tags_corpus_ratio, -tags_corpus_ratio
-Ratio of corpus to augment with tags.
-Default: 0.1
-
-- --max_tags, -max_tags
-Maximum number of tags that can be added to a single sentence.
-Default: 12
-
-- --paired_stag, -paired_stag
-The format of an opening paired inline tag. Must include the character #.
-Default: “⦅ph_#_beg⦆”
-
-- --paired_etag, -paired_etag
-The format of a closing paired inline tag. Must include the character #.
-Default: “⦅ph_#_end⦆”
-
-- --isolated_tag, -isolated_tag
-The format of an isolated inline tag. Must include the character #.
-Default: “⦅ph_#_std⦆”
-
-- --src_delimiter, -src_delimiter
-Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.
-Default: “⦅fuzzy⦆”
-
-
-
Transform/Normalize¶
+
+Transform/Filter¶
+
+- --src_seq_length, -src_seq_length
+Maximum source sequence length.
+Default: 192
+
+- --tgt_seq_length, -tgt_seq_length
+Maximum target sequence length.
+Default: 192
+
+
+
+
+Transform/Prefix¶
+
+- --src_prefix, -src_prefix
+String to prepend to all source examples.
+Default: “”
+
+- --tgt_prefix, -tgt_prefix
+String to prepend to all target examples.
+Default: “”
+
+
+
+
+Transform/Suffix¶
+
+- --src_suffix, -src_suffix
+String to append to all source examples.
+Default: “”
+
+- --tgt_suffix, -tgt_suffix
+String to append to all target examples.
+Default: “”
+
+
+
+
+Transform/Terminology¶
+
+- --termbase_path, -termbase_path
+Path to a dictionary file with terms.
+
+- --src_spacy_language_model, -src_spacy_language_model
+Name of the spacy language model for the source corpus.
+
+- --tgt_spacy_language_model, -tgt_spacy_language_model
+Name of the spacy language model for the target corpus.
+
+- --term_corpus_ratio, -term_corpus_ratio
+Ratio of corpus to augment with terms.
+Default: 0.3
+
+- --term_example_ratio, -term_example_ratio
+Max terms allowed in an example.
+Default: 0.2
+
+- --src_term_stoken, -src_term_stoken
+The source term start token.
+Default: “⦅src_term_start⦆”
+
+- --tgt_term_stoken, -tgt_term_stoken
+The target term start token.
+Default: “⦅tgt_term_start⦆”
+
+- --tgt_term_etoken, -tgt_term_etoken
+The target term end token.
+Default: “⦅tgt_term_end⦆”
+
+- --term_source_delimiter, -term_source_delimiter
+Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.
+Default: “⦅fuzzy⦆”
+
+
+
+
+Transform/BART¶
+
+- --permute_sent_ratio, -permute_sent_ratio
+Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.
+Default: 0.0
+
+- --rotate_ratio, -rotate_ratio
+Rotate this proportion of inputs.
+Default: 0.0
+
+- --insert_ratio, -insert_ratio
+Insert this percentage of additional random tokens.
+Default: 0.0
+
+- --random_ratio, -random_ratio
+Instead of using <mask>, use random token this often.
+Default: 0.0
+
+- --mask_ratio, -mask_ratio
+Fraction of words/subwords that will be masked.
+Default: 0.0
+
+- --mask_length, -mask_length
+Possible choices: subword, word, span-poisson
+Length of masking window to apply.
+Default: “subword”
+
+- --poisson_lambda, -poisson_lambda
+Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.
+Default: 3.0
+
+- --replace_length, -replace_length
+Possible choices: -1, 0, 1
+When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)
+Default: -1
+
+
+
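
Since the BART options close this transform listing, here is a hedged YAML sketch of a BART-style noising setup built only from the flags documented above (values are illustrative, not tuned):

    # bart transform excerpt (illustrative)
    transforms: [bart]
    permute_sent_ratio: 0.5
    insert_ratio: 0.1
    random_ratio: 0.1
    mask_ratio: 0.3
    mask_length: span-poisson
    poisson_lambda: 3.0
    replace_length: 1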
Distributed¶
@@ -992,6 +993,10 @@ Distributed
@@ -1191,8 +1196,8 @@ Model- Attention
https://github.com/google-research/text-to-text-transfer-transformer
Default: 0
+- --rotary_interleave, -rotary_interleave
+Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. True = default Llama from Meta (original); False = used by all Hugging Face models.
+Default: False
+
+- --rotary_theta, -rotary_theta
+Rotary theta base length: 1e4 for Llama2/Mistral, 1e6 for Mixtral.
+Default: 10000
+
+- --rotary_dim, -rotary_dim
+Rotary dim when model requires it to be different to head dim
+Default: 0
+
- --heads, -heads
Number of heads for transformer self-attention
Default: 8
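
To make the new rotary options concrete, a hedged YAML sketch of the attention part of a train.py config. This assumes, as in the project's Llama-style recipes, that rotary embeddings are selected via max_relative_positions: -1; all values are illustrative:

    # attention excerpt of a train.py config (illustrative)
    heads: 32
    max_relative_positions: -1   # assumption: -1 selects rotary position encoding
    rotary_interleave: false     # Hugging Face style checkpoints
    rotary_theta: 1000000        # Mixtral-style base; 10000 for Llama2/Mistral
    rotary_dim: 0                # 0 = use the full head dimension
    sliding_window: 4096         # Mistral-style band-mask width (illustrative)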
@@ -1214,6 +1231,14 @@ Model- Attention
+-bucket_size, --bucket_size
+
+- A bucket is a buffer of bucket_size examples to pick
from the various Corpora. The dynamic iterator batches
+batch_size batches from the bucket and shuffles them.
+
+
+Default: 262144
+
+-bucket_size_init, --bucket_size_init
+
+- The bucket is initialized with this
amount of examples (optional)
+
+
+Default: -1
+
+-bucket_size_increment, --bucket_size_increment
+
+- The bucket size is incremented with this
amount of examples (optional)
+
+
+Default: 0
+
+-prefetch_factor, --prefetch_factor
+
+- Number of mini-batches loaded in advance to avoid the
GPU waiting during the refilling of the bucket.
+
+
+Default: 200
+
--save_model, -save_model
Model filename (the model will be saved as <save_model>_N.pt where N is the number of steps)
Default: “model”
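
The relocated bucketing options translate directly into config keys; a sketch using the documented defaults:

    # dynamic-data excerpt of a train.py config (defaults shown)
    bucket_size: 262144
    bucket_size_init: -1
    bucket_size_increment: 0
    prefetch_factor: 200     # mini-batches preloaded while the bucket refills
    save_model: model        # checkpoints written as <save_model>_N.pt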
@@ -1614,40 +1668,6 @@ Logging
-Dynamic data¶
-
-- -bucket_size, --bucket_size
-
-- A bucket is a buffer of bucket_size examples to pick
from the various Corpora. The dynamic iterator batches
-batch_size batchs from the bucket and shuffle them.
-
-
-Default: 262144
-
-- -bucket_size_init, --bucket_size_init
-
-- The bucket is initalized with this awith this
amount of examples (optional)
-
-
-Default: -1
-
-- -bucket_size_increment, --bucket_size_increment
-
-- The bucket size is incremented with this
amount of examples (optional)
-
-
-Default: 0
-
-- -prefetch_factor, --prefetch_factor
-
-- number of mini-batches loaded in advance to avoid the
GPU waiting during the refilling of the bucket.
-
-
-Default: 200
-
-
-
Quant options¶
@@ -1656,9 +1676,19 @@ Quant options
Logging
Distributed
Efficiency
-Transform/BART
-Transform/Terminology
-Transform/FuzzyMatching
-Transform/Filter
-Transform/Prefix
-Transform/Suffix
-Transform/InsertMaskBeforePlaceholdersTransform
-Transform/Clean
-Transform/Uppercase
Transform/SwitchOut
Transform/Token_Drop
Transform/Token_Mask
Transform/Docify
-Transform/InferFeats
+Transform/InsertMaskBeforePlaceholdersTransform
+Transform/Uppercase
+Transform/FuzzyMatching
+Transform/InlineTags
+Transform/Clean
Transform/Subword/Common
Transform/Subword/ONMTTOK
-Transform/InlineTags
Transform/Normalize
+Transform/InferFeats
+Transform/Filter
+Transform/Prefix
+Transform/Suffix
+Transform/Terminology
+Transform/BART
Quant options
@@ -245,10 +245,11 @@ Translate
usage: translate.py [-h] [-config CONFIG] [-save_config SAVE_CONFIG] --model
MODEL [MODEL ...] [--precision {,fp32,fp16,int8}] [--fp32]
- [--int8] [--avg_raw_probs] [--data_type DATA_TYPE] --src
- SRC [--tgt TGT] [--tgt_file_prefix] [--output OUTPUT]
- [--report_align] [--gold_align] [--report_time]
- [--profile] [-n_src_feats N_SRC_FEATS]
+ [--int8] [--avg_raw_probs]
+ [--self_attn_type SELF_ATTN_TYPE] [--data_type DATA_TYPE]
+ --src SRC [--tgt TGT] [--tgt_file_prefix]
+ [--output OUTPUT] [--report_align] [--gold_align]
+ [--report_time] [--profile] [-n_src_feats N_SRC_FEATS]
[-src_feats_defaults SRC_FEATS_DEFAULTS]
[--beam_size BEAM_SIZE] [--ratio RATIO]
[--random_sampling_topk RANDOM_SAMPLING_TOPK]
@@ -258,6 +259,7 @@ Translate
[--alpha ALPHA] [--coverage_penalty {none,wu,summary}]
[--beta BETA] [--stepwise_penalty]
[--min_length MIN_LENGTH] [--max_length MAX_LENGTH]
+ [--max_length_ratio MAX_LENGTH_RATIO]
[--block_ngram_repeat BLOCK_NGRAM_REPEAT]
[--ignore_when_blocking IGNORE_WHEN_BLOCKING [IGNORE_WHEN_BLOCKING ...]]
[--replace_unk] [--ban_unk_token]
@@ -271,48 +273,32 @@ Translate
[--gpu_backend GPU_BACKEND]
[--gpu_verbose_level GPU_VERBOSE_LEVEL]
[--master_ip MASTER_IP] [--master_port MASTER_PORT]
- [--batch_size BATCH_SIZE] [--batch_type {sents,tokens}]
- [--gpu GPU]
- [-transforms {bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} [{bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} ...]]
- [--permute_sent_ratio PERMUTE_SENT_RATIO]
- [--rotate_ratio ROTATE_RATIO]
- [--insert_ratio INSERT_RATIO]
- [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO]
- [--mask_length {subword,word,span-poisson}]
- [--poisson_lambda POISSON_LAMBDA]
- [--replace_length {-1,0,1}]
- [--termbase_path TERMBASE_PATH]
- [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL]
- [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL]
- [--term_corpus_ratio TERM_CORPUS_RATIO]
- [--term_example_ratio TERM_EXAMPLE_RATIO]
- [--src_term_stoken SRC_TERM_STOKEN]
- [--tgt_term_stoken TGT_TERM_STOKEN]
- [--tgt_term_etoken TGT_TERM_ETOKEN]
- [--term_source_delimiter TERM_SOURCE_DELIMITER]
+ [--timeout TIMEOUT] [--batch_size BATCH_SIZE]
+ [--batch_type {sents,tokens}] [--gpu GPU]
+ [-transforms {switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} [{switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} ...]]
+ [-switchout_temperature SWITCHOUT_TEMPERATURE]
+ [-tokendrop_temperature TOKENDROP_TEMPERATURE]
+ [-tokenmask_temperature TOKENMASK_TEMPERATURE]
+ [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT]
+ [--response_pattern RESPONSE_PATTERN]
+ [--upper_corpus_ratio UPPER_CORPUS_RATIO]
[--tm_path TM_PATH]
[--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO]
[--fuzzy_threshold FUZZY_THRESHOLD]
[--tm_delimiter TM_DELIMITER] [--fuzzy_token FUZZY_TOKEN]
[--fuzzymatch_min_length FUZZYMATCH_MIN_LENGTH]
[--fuzzymatch_max_length FUZZYMATCH_MAX_LENGTH]
- [--src_seq_length SRC_SEQ_LENGTH]
- [--tgt_seq_length TGT_SEQ_LENGTH]
- [--src_prefix SRC_PREFIX] [--tgt_prefix TGT_PREFIX]
- [--src_suffix SRC_SUFFIX] [--tgt_suffix TGT_SUFFIX]
- [--response_pattern RESPONSE_PATTERN] [--src_eq_tgt]
+ [--tags_dictionary_path TAGS_DICTIONARY_PATH]
+ [--tags_corpus_ratio TAGS_CORPUS_RATIO]
+ [--max_tags MAX_TAGS] [--paired_stag PAIRED_STAG]
+ [--paired_etag PAIRED_ETAG] [--isolated_tag ISOLATED_TAG]
+ [--src_delimiter SRC_DELIMITER] [--src_eq_tgt]
[--same_char] [--same_word]
[--scripts_ok [SCRIPTS_OK [SCRIPTS_OK ...]]]
[--scripts_nok [SCRIPTS_NOK [SCRIPTS_NOK ...]]]
[--src_tgt_ratio SRC_TGT_RATIO]
[--avg_tok_min AVG_TOK_MIN] [--avg_tok_max AVG_TOK_MAX]
[--langid [LANGID [LANGID ...]]]
- [--upper_corpus_ratio UPPER_CORPUS_RATIO]
- [-switchout_temperature SWITCHOUT_TEMPERATURE]
- [-tokendrop_temperature TOKENDROP_TEMPERATURE]
- [-tokenmask_temperature TOKENMASK_TEMPERATURE]
- [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT]
- [--reversible_tokenization {joiner,spacer}]
[-src_subword_model SRC_SUBWORD_MODEL]
[-tgt_subword_model TGT_SUBWORD_MODEL]
[-src_subword_nbest SRC_SUBWORD_NBEST]
@@ -327,18 +313,35 @@ Translate
[-tgt_subword_type {none,sentencepiece,bpe}]
[-src_onmttok_kwargs SRC_ONMTTOK_KWARGS]
[-tgt_onmttok_kwargs TGT_ONMTTOK_KWARGS] [--gpt2_pretok]
- [--tags_dictionary_path TAGS_DICTIONARY_PATH]
- [--tags_corpus_ratio TAGS_CORPUS_RATIO]
- [--max_tags MAX_TAGS] [--paired_stag PAIRED_STAG]
- [--paired_etag PAIRED_ETAG] [--isolated_tag ISOLATED_TAG]
- [--src_delimiter SRC_DELIMITER] [--src_lang SRC_LANG]
- [--tgt_lang TGT_LANG] [--penn PENN]
+ [--src_lang SRC_LANG] [--tgt_lang TGT_LANG] [--penn PENN]
[--norm_quote_commas NORM_QUOTE_COMMAS]
[--norm_numbers NORM_NUMBERS]
[--pre_replace_unicode_punct PRE_REPLACE_UNICODE_PUNCT]
[--post_remove_control_chars POST_REMOVE_CONTROL_CHARS]
+ [--reversible_tokenization {joiner,spacer}]
+ [--src_seq_length SRC_SEQ_LENGTH]
+ [--tgt_seq_length TGT_SEQ_LENGTH]
+ [--src_prefix SRC_PREFIX] [--tgt_prefix TGT_PREFIX]
+ [--src_suffix SRC_SUFFIX] [--tgt_suffix TGT_SUFFIX]
+ [--termbase_path TERMBASE_PATH]
+ [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL]
+ [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL]
+ [--term_corpus_ratio TERM_CORPUS_RATIO]
+ [--term_example_ratio TERM_EXAMPLE_RATIO]
+ [--src_term_stoken SRC_TERM_STOKEN]
+ [--tgt_term_stoken TGT_TERM_STOKEN]
+ [--tgt_term_etoken TGT_TERM_ETOKEN]
+ [--term_source_delimiter TERM_SOURCE_DELIMITER]
+ [--permute_sent_ratio PERMUTE_SENT_RATIO]
+ [--rotate_ratio ROTATE_RATIO]
+ [--insert_ratio INSERT_RATIO]
+ [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO]
+ [--mask_length {subword,word,span-poisson}]
+ [--poisson_lambda POISSON_LAMBDA]
+ [--replace_length {-1,0,1}]
[--quant_layers QUANT_LAYERS [QUANT_LAYERS ...]]
- [--quant_type {bnb_8bit,bnb_FP4,bnb_NF4}]
+ [--quant_type {,bnb_8bit,bnb_FP4,bnb_NF4,awq_gemm,awq_gemv}]
+ [--w_bit {4}] [--group_size {128}]
@@ -374,6 +377,10 @@ Model
If this is set, during ensembling scores from different models will be combined by averaging their raw probabilities and then taking the log. Otherwise, the log probabilities will be averaged directly. Necessary for models whose output layers can assign zero probability.
Default: False
+--self_attn_type, -self_attn_type
+Self attention type in Transformer decoder layer – currently “scaled-dot”, “scaled-dot-flash” or “average”
+Default: “scaled-dot-flash”
+
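
The new --self_attn_type inference flag can also be set through the -config YAML accepted by translate.py; a hedged sketch (model and file paths are hypothetical):

    # translate.py config excerpt (illustrative)
    model: model_step_10000.pt   # hypothetical checkpoint
    src: input.txt               # hypothetical source file
    output: pred.txt
    self_attn_type: scaled-dot   # override the scaled-dot-flash default
    precision: fp16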
@@ -516,6 +523,10 @@ Decoding tricks
@@ -626,86 +641,67 @@ Efficiency
-Transform/BART¶
+
+Transform/SwitchOut¶
-- --permute_sent_ratio, -permute_sent_ratio
-Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.
-Default: 0.0
-
-- --rotate_ratio, -rotate_ratio
-Rotate this proportion of inputs.
-Default: 0.0
-
-- --insert_ratio, -insert_ratio
-Insert this percentage of additional random tokens.
-Default: 0.0
-
-- --random_ratio, -random_ratio
-Instead of using <mask>, use random token this often.
-Default: 0.0
-
-- --mask_ratio, -mask_ratio
-Fraction of words/subwords that will be masked.
-Default: 0.0
-
-- --mask_length, -mask_length
-Possible choices: subword, word, span-poisson
-Length of masking window to apply.
-Default: “subword”
-
-- --poisson_lambda, -poisson_lambda
-Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.
-Default: 3.0
-
-- --replace_length, -replace_length
-Possible choices: -1, 0, 1
-When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)
-Default: -1
+- -switchout_temperature, --switchout_temperature
+Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.
+Default: 1.0
-
-Transform/Terminology¶
+
+Transform/Token_Drop¶
-- --termbase_path, -termbase_path
-Path to a dictionary file with terms.
-
-- --src_spacy_language_model, -src_spacy_language_model
-Name of the spacy language model for the source corpus.
-
-- --tgt_spacy_language_model, -tgt_spacy_language_model
-Name of the spacy language model for the target corpus.
-
-- --term_corpus_ratio, -term_corpus_ratio
-Ratio of corpus to augment with terms.
-Default: 0.3
+- -tokendrop_temperature, --tokendrop_temperature
+Sampling temperature for token deletion.
+Default: 1.0
-- --term_example_ratio, -term_example_ratio
-Max terms allowed in an example.
-Default: 0.2
+
+
+
+Transform/Token_Mask¶
+
+- -tokenmask_temperature, --tokenmask_temperature
+Sampling temperature for token masking.
+Default: 1.0
-- --src_term_stoken, -src_term_stoken
-The source term start token.
-Default: “⦅src_term_start⦆”
+
+
+
+Transform/Docify¶
+
+- --doc_length, -doc_length
+Number of tokens per doc.
+Default: 200
-- --tgt_term_stoken, -tgt_term_stoken
-The target term start token.
-Default: “⦅tgt_term_start⦆”
+- --max_context, -max_context
+Max context segments.
+Default: 1
-- --tgt_term_etoken, -tgt_term_etoken
-The target term end token.
-Default: “⦅tgt_term_end⦆”
+
+
+
+Transform/InsertMaskBeforePlaceholdersTransform¶
+
+- --response_pattern, -response_pattern
+Response pattern to locate the end of the prompt
+Default: “Response : ⦅newline⦆”
-- --term_source_delimiter, -term_source_delimiter
-Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.
-Default: “⦅fuzzy⦆”
+
+
+
+Transform/Uppercase¶
+
+- --upper_corpus_ratio, -upper_corpus_ratio
+Corpus ratio to apply uppercasing.
+Default: 0.01
@@ -741,51 +737,35 @@ Transform/FuzzyMatching
-
-Transform/Filter¶
+
+Transform/InlineTags¶
-- --src_seq_length, -src_seq_length
-Maximum source sequence length.
-Default: 192
+- --tags_dictionary_path, -tags_dictionary_path
+Path to a flat term dictionary.
-- --tgt_seq_length, -tgt_seq_length
-Maximum target sequence length.
-Default: 192
+- --tags_corpus_ratio, -tags_corpus_ratio
+Ratio of corpus to augment with tags.
+Default: 0.1
-
-
-
-Transform/Prefix¶
-
-- --src_prefix, -src_prefix
-String to prepend to all source example.
-Default: “”
+- --max_tags, -max_tags
+Maximum number of tags that can be added to a single sentence.
+Default: 12
-- --tgt_prefix, -tgt_prefix
-String to prepend to all target example.
-Default: “”
+- --paired_stag, -paired_stag
+The format of an opening paired inline tag. Must include the character #.
+Default: “⦅ph_#_beg⦆”
-
-
-
-Transform/Suffix¶
-
-- --src_suffix, -src_suffix
-String to append to all source example.
-Default: “”
+- --paired_etag, -paired_etag
+The format of a closing paired inline tag. Must include the character #.
+Default: “⦅ph_#_end⦆”
-- --tgt_suffix, -tgt_suffix
-String to append to all target example.
-Default: “”
+- --isolated_tag, -isolated_tag
+The format of an isolated inline tag. Must include the character #.
+Default: “⦅ph_#_std⦆”
-
-
-
-Transform/InsertMaskBeforePlaceholdersTransform¶
-
-- --response_pattern, -response_pattern
-Response patten to locate the end of the prompt
-Default: “Response : ⦅newline⦆”
+- --src_delimiter, -src_delimiter
+Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.
+Default: “⦅fuzzy⦆”
@@ -830,65 +810,6 @@ Transform/Clean
-Transform/Uppercase¶
-
-- --upper_corpus_ratio, -upper_corpus_ratio
-Corpus ratio to apply uppercasing.
-Default: 0.01
-
-
-
-
-Transform/SwitchOut¶
-
-- -switchout_temperature, --switchout_temperature
-Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.
-Default: 1.0
-
-
-
-
-Transform/Token_Drop¶
-
-- -tokendrop_temperature, --tokendrop_temperature
-Sampling temperature for token deletion.
-Default: 1.0
-
-
-
-
-Transform/Token_Mask¶
-
-- -tokenmask_temperature, --tokenmask_temperature
-Sampling temperature for token masking.
-Default: 1.0
-
-
-
-
-Transform/Docify¶
-
-- --doc_length, -doc_length
-Number of tokens per doc.
-Default: 200
-
-- --max_context, -max_context
-Max context segments.
-Default: 1
-
-
-
-
-Transform/InferFeats¶
-
-- --reversible_tokenization, -reversible_tokenization
-Possible choices: joiner, spacer
-Type of reversible tokenization applied on the tokenizer.
-Default: “joiner”
-
-
-
Transform/Subword/Common¶
@@ -963,38 +884,6 @@ Transform/Subword/ONMTTOK
-Transform/InlineTags¶
-
-- --tags_dictionary_path, -tags_dictionary_path
-Path to a flat term dictionary.
-
-- --tags_corpus_ratio, -tags_corpus_ratio
-Ratio of corpus to augment with tags.
-Default: 0.1
-
-- --max_tags, -max_tags
-Maximum number of tags that can be added to a single sentence.
-Default: 12
-
-- --paired_stag, -paired_stag
-The format of an opening paired inline tag. Must include the character #.
-Default: “⦅ph_#_beg⦆”
-
-- --paired_etag, -paired_etag
-The format of a closing paired inline tag. Must include the character #.
-Default: “⦅ph_#_end⦆”
-
-- --isolated_tag, -isolated_tag
-The format of an isolated inline tag. Must include the character #.
-Default: “⦅ph_#_std⦆”
-
-- --src_delimiter, -src_delimiter
-Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.
-Default: “⦅fuzzy⦆”
-
-
-
Transform/Normalize¶
+
+Transform/Filter¶
+
+- --src_seq_length, -src_seq_length
+Maximum source sequence length.
+Default: 192
+
+- --tgt_seq_length, -tgt_seq_length
+Maximum target sequence length.
+Default: 192
+
+
+
+
+Transform/Prefix¶
+
+- --src_prefix, -src_prefix
+String to prepend to all source examples.
+Default: “”
+
+- --tgt_prefix, -tgt_prefix
+String to prepend to all target examples.
+Default: “”
+
+
+
+
+Transform/Suffix¶
+
+- --src_suffix, -src_suffix
+String to append to all source examples.
+Default: “”
+
+- --tgt_suffix, -tgt_suffix
+String to append to all target examples.
+Default: “”
+
+
+
+
+Transform/Terminology¶
+
+- --termbase_path, -termbase_path
+Path to a dictionary file with terms.
+
+- --src_spacy_language_model, -src_spacy_language_model
+Name of the spacy language model for the source corpus.
+
+- --tgt_spacy_language_model, -tgt_spacy_language_model
+Name of the spacy language model for the target corpus.
+
+- --term_corpus_ratio, -term_corpus_ratio
+Ratio of corpus to augment with terms.
+Default: 0.3
+
+- --term_example_ratio, -term_example_ratio
+Max terms allowed in an example.
+Default: 0.2
+
+- --src_term_stoken, -src_term_stoken
+The source term start token.
+Default: “⦅src_term_start⦆”
+
+- --tgt_term_stoken, -tgt_term_stoken
+The target term start token.
+Default: “⦅tgt_term_start⦆”
+
+- --tgt_term_etoken, -tgt_term_etoken
+The target term end token.
+Default: “⦅tgt_term_end⦆”
+
+- --term_source_delimiter, -term_source_delimiter
+Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.
+Default: “⦅fuzzy⦆”
+
+
+
+
+Transform/BART¶
+
+- --permute_sent_ratio, -permute_sent_ratio
+Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.
+Default: 0.0
+
+- --rotate_ratio, -rotate_ratio
+Rotate this proportion of inputs.
+Default: 0.0
+
+- --insert_ratio, -insert_ratio
+Insert this percentage of additional random tokens.
+Default: 0.0
+
+- --random_ratio, -random_ratio
+Instead of using <mask>, use random token this often.
+Default: 0.0
+
+- --mask_ratio, -mask_ratio
+Fraction of words/subwords that will be masked.
+Default: 0.0
+
+- --mask_length, -mask_length
+Possible choices: subword, word, span-poisson
+Length of masking window to apply.
+Default: “subword”
+
+- --poisson_lambda, -poisson_lambda
+Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.
+Default: 3.0
+
+- --replace_length, -replace_length
+Possible choices: -1, 0, 1
+When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)
+Default: -1
+
+
+
Quant options¶