From 37de1b247acaa2cc822c36e3df3295f21dcd23ce Mon Sep 17 00:00:00 2001
From: vince62s
Date: Thu, 22 Feb 2024 17:45:37 +0000
Subject: [PATCH] =?UTF-8?q?Deploying=20to=20gh-pages=20from=20=20@=20b9a60?=
 =?UTF-8?q?d6ac861321bb077c8f544199c3c0583f3bb=20=F0=9F=9A=80?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 _modules/onmt/decoders/transformer.html       | 269 ++++++--
 _modules/onmt/encoders/transformer.html       |  28 +
 _modules/onmt/inputters/dynamic_iterator.html |  17 +-
 _modules/onmt/inputters/text_corpus.html      |  94 ++-
 _modules/onmt/models/model.html               | 191 +++---
 _modules/onmt/modules/copy_generator.html     |  15 +-
 _modules/onmt/modules/embeddings.html         |   9 +-
 _modules/onmt/modules/multi_headed_attn.html  | 285 ++++++--
 _modules/onmt/modules/position_ffn.html       |  11 +-
 _modules/onmt/trainer.html                    |   7 +-
 _modules/onmt/translate/beam_search.html      |  43 +-
 _modules/onmt/translate/decode_strategy.html  |   6 +-
 _modules/onmt/translate/greedy_search.html    |  53 +-
 .../onmt/translate/translation_server.html    |   8 +-
 _modules/onmt/translate/translator.html       | 173 ++---
 _modules/onmt/utils/loss.html                 |   1 -
 _sources/examples/wmt17/Translation.md.txt    |   1 -
 _sources/quickstart.md.txt                    |   6 +
 examples/wmt17/Translation.html               |   1 -
 index.html                                    |  67 +-
 onmt.modules.html                             |  28 +-
 onmt.translation.html                         |   6 +-
 options/build_vocab.html                      | 491 +++++++-------
 options/server.html                           |   4 +-
 options/train.html                            | 622 +++++++++---------
 options/translate.html                        | 521 ++++++++-------
 quickstart.html                               |   4 +
 searchindex.js                                |   2 +-
 28 files changed, 1734 insertions(+), 1229 deletions(-)

diff --git a/_modules/onmt/decoders/transformer.html b/_modules/onmt/decoders/transformer.html
index d3d74074bf..2bcf6d6c13 100644
--- a/_modules/onmt/decoders/transformer.html
+++ b/_modules/onmt/decoders/transformer.html
@@ -212,12 +212,9 @@

Source code for onmt.decoders.transformer

 from onmt.modules import MultiHeadedAttention, AverageAttention
 from onmt.modules.position_ffn import PositionwiseFeedForward
 from onmt.modules.position_ffn import ActivationFunction
+from onmt.modules.moe import MoE
 from onmt.utils.misc import sequence_mask
-
-try:
-    from apex.normalization import FusedRMSNorm as RMSNorm
-except ImportError:
-    from onmt.modules.rmsnorm import RMSNorm
+from onmt.modules.rmsnorm import RMSNorm
 
 
 class TransformerDecoderLayerBase(nn.Module):
@@ -228,7 +225,7 @@ 

Source code for onmt.decoders.transformer

         d_ff,
         dropout,
         attention_dropout,
-        self_attn_type="scaled-dot",
+        self_attn_type="scaled_dot",
         max_relative_positions=0,
         relative_positions_buckets=0,
         aan_useffn=False,
@@ -245,6 +242,11 @@ 

Source code for onmt.decoders.transformer

         use_ckpting=[],
         parallel_gpu=1,
         sliding_window=0,
+        rotary_interleave=True,
+        rotary_theta=1e4,
+        rotary_dim=0,
+        num_experts=0,
+        num_experts_per_tok=2,
     ):
         """
         Args:
@@ -259,10 +261,13 @@ 

Source code for onmt.decoders.transformer

             attention_dropout (float): dropout in context_attn  (and
                 self-attn(avg))
             self_attn_type (string): type of self-attention scaled-dot,
-                average
+                scaled-dot-flash, average
             max_relative_positions (int):
                 Max distance between inputs in relative positions
                 representations
+            relative_positions_buckets (int):
+                Number of buckets for relative position bias, see
+                https://github.com/google-research/text-to-text-transfer-transformer
             aan_useffn (bool): Turn on the FFN layer in the AAN decoder
             full_context_alignment (bool):
                 whether enable an extra full context decoder forward for
@@ -272,21 +277,38 @@ 

Source code for onmt.decoders.transformer

             pos_ffn_activation_fn (ActivationFunction):
                 activation function choice for PositionwiseFeedForward layer
             add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+            num_kv (int): number of heads for KV when different vs Q (multiquery)
+            add_ffnbias (bool): whether to add bias to the FF nn.Linear
+            parallel_residual (bool): Use parallel residual connections in each layer block, as used
+                by the GPT-J and GPT-NeoX models
+            shared_layer_norm (bool): When using parallel residual, share the input and post
+                attention layer norms.
             layer_norm (string): type of layer normalization standard/rms
             norm_eps (float): layer norm epsilon
-
+            use_ckpting (List): layers for which we checkpoint for backward
+            parallel_gpu (int): Number of gpu for tensor parallelism
+            sliding_window (int): Width of the band mask and KV cache (cf Mistral Model)
+            rotary_interleave (bool): Interleave the head dimensions when rotary
+                embeddings are applied
+            rotary_theta (int): rotary base theta
+            rotary_dim (int): in some cases the rotary dim is lower than head dim
+            num_experts (int): Number of experts for MoE
+            num_experts_per_tok (int): Number of experts choice per token
         """
         super(TransformerDecoderLayerBase, self).__init__()
 
-        self.self_attn_type = self_attn_type
-        if self_attn_type == "scaled-dot":
+        if self_attn_type in ["scaled-dot", "scaled-dot-flash"]:
             self.self_attn = MultiHeadedAttention(
                 heads,
                 d_model,
                 dropout=attention_dropout,
                 max_relative_positions=max_relative_positions,
                 relative_positions_buckets=relative_positions_buckets,
+                rotary_interleave=rotary_interleave,
+                rotary_theta=rotary_theta,
+                rotary_dim=rotary_dim,
                 attn_type="self",
+                self_attn_type=self_attn_type,
                 add_qkvbias=add_qkvbias,
                 num_kv=num_kv,
                 use_ckpting=use_ckpting,
@@ -297,18 +319,34 @@ 

Source code for onmt.decoders.transformer

                 d_model, dropout=attention_dropout, aan_useffn=aan_useffn
             )
 
-        self.feed_forward = PositionwiseFeedForward(
-            d_model,
-            d_ff,
-            dropout,
-            pos_ffn_activation_fn,
-            add_ffnbias,
-            parallel_residual,
-            layer_norm,
-            norm_eps,
-            use_ckpting=use_ckpting,
-            parallel_gpu=parallel_gpu,
-        )
+        if num_experts > 0:
+            self.feed_forward = MoE(
+                num_experts,
+                num_experts_per_tok,
+                d_model,
+                d_ff,
+                dropout,
+                pos_ffn_activation_fn,
+                add_ffnbias,
+                parallel_residual,
+                layer_norm,
+                norm_eps,
+                use_ckpting=use_ckpting,
+                parallel_gpu=parallel_gpu,
+            )
+        else:
+            self.feed_forward = PositionwiseFeedForward(
+                d_model,
+                d_ff,
+                dropout,
+                pos_ffn_activation_fn,
+                add_ffnbias,
+                parallel_residual,
+                layer_norm,
+                norm_eps,
+                use_ckpting=use_ckpting,
+                parallel_gpu=parallel_gpu,
+            )
         self.parallel_residual = parallel_residual
         self.shared_layer_norm = shared_layer_norm
         if layer_norm == "standard":
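To make the routing choice above concrete, here is a toy top-k gating sketch in the spirit of a Mixture-of-Experts feed-forward. All names are hypothetical; this is not the onmt.modules.moe implementation, only an illustration of num_experts / num_experts_per_tok style routing.

import torch
import torch.nn as nn

class ToyMoE(nn.Module):
    # Minimal top-k gating: route each token to k experts and mix their outputs.
    def __init__(self, num_experts=4, k=2, d_model=8, d_ff=16):
        super().__init__()
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
            for _ in range(num_experts)
        )
        self.gate = nn.Linear(d_model, num_experts, bias=False)
        self.k = k

    def forward(self, x):                      # x: (tokens, d_model)
        scores = self.gate(x)                  # (tokens, num_experts)
        weights, idx = torch.topk(scores, self.k, dim=-1)
        weights = torch.softmax(weights, dim=-1)
        out = torch.zeros_like(x)
        for slot in range(self.k):
            for e, expert in enumerate(self.experts):
                mask = idx[:, slot] == e       # tokens routed to expert e in this slot
                if mask.any():
                    out[mask] += weights[mask, slot, None] * expert(x[mask])
        return out

y = ToyMoE()(torch.randn(5, 8))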
@@ -327,6 +365,7 @@ 

Source code for onmt.decoders.transformer

         self.full_context_alignment = full_context_alignment
         self.alignment_heads = alignment_heads
         self.sliding_window = sliding_window
+        self.self_attn_type = self_attn_type
 
     def forward(self, *args, **kwargs):
         """Extend `_forward` for (possibly) multiple decoder pass:
@@ -374,7 +413,8 @@ 

Source code for onmt.decoders.transformer

 
     def _compute_dec_mask(self, tgt_pad_mask, future):
         tgt_len = tgt_pad_mask.size(-1)
-        if not future:  # apply future_mask, result mask in (B, T, T)
+        if not future:
+            # Add triangular future_mask and pad_mask, result mask in (B, T, T).
             future_mask = torch.ones(
                 [tgt_len, tgt_len],
                 device=tgt_pad_mask.device,
@@ -385,14 +425,19 @@ 

Source code for onmt.decoders.transformer

                 future_mask = future_mask.triu_(-self.sliding_window)
             future_mask = future_mask.bool()
             future_mask = ~future_mask.view(1, tgt_len, tgt_len)
-
+            # Patch for scaled dot product attention.
+            patch_mask = ~torch.all(
+                tgt_pad_mask + future_mask, dim=2, keepdim=True
+            ).expand_as(tgt_pad_mask + future_mask)
             dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
-        else:  # only mask padding, result mask in (B, 1, T)
+            dec_mask = torch.logical_and(dec_mask, patch_mask)
+        else:
+            # Only mask padding, result mask in (B, 1, T).
             dec_mask = tgt_pad_mask
         return dec_mask
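For illustration, a minimal standalone sketch of the pad-plus-future masking performed above (hypothetical helper name, assuming a (B, 1, T) boolean pad mask and no sliding window):

import torch

def toy_dec_mask(tgt_pad_mask: torch.Tensor) -> torch.Tensor:
    # tgt_pad_mask: (B, 1, T), True where the target position is padding.
    tgt_len = tgt_pad_mask.size(-1)
    # Lower-triangular "allowed" positions; everything above the diagonal is future.
    allowed = torch.ones(tgt_len, tgt_len, device=tgt_pad_mask.device).tril_().bool()
    future_mask = ~allowed.view(1, tgt_len, tgt_len)
    # A position is masked if it is padding or lies in the future: (B, T, T).
    return tgt_pad_mask | future_mask

pad = torch.tensor([[[False, False, True]]])  # batch of 1, length 3, last token is pad
print(toy_dec_mask(pad).int())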
 
     def _forward_self_attn(self, norm_layer_in, dec_mask, step, return_attn=False):
-        if self.self_attn_type == "scaled-dot":
+        if self.self_attn_type in ["scaled-dot", "scaled-dot-flash"]:
             return self.self_attn(
                 norm_layer_in,
                 norm_layer_in,
@@ -441,6 +486,11 @@ 

Source code for onmt.decoders.transformer

         use_ckpting=[],
         parallel_gpu=1,
         sliding_window=0,
+        rotary_interleave=True,
+        rotary_theta=1e4,
+        rotary_dim=0,
+        num_experts=0,
+        num_experts_per_tok=2,
     ):
         """
         Args:
@@ -469,12 +519,18 @@ 

Source code for onmt.decoders.transformer

             use_ckpting=use_ckpting,
             parallel_gpu=parallel_gpu,
             sliding_window=sliding_window,
+            rotary_interleave=rotary_interleave,
+            rotary_theta=rotary_theta,
+            rotary_dim=rotary_dim,
+            num_experts=num_experts,
+            num_experts_per_tok=num_experts_per_tok,
         )
         self.context_attn = MultiHeadedAttention(
             heads,
             d_model,
             dropout=attention_dropout,
             attn_type="context",
+            self_attn_type=self.self_attn_type,
             add_qkvbias=add_qkvbias,
             num_kv=num_kv,
             use_ckpting=use_ckpting,
@@ -627,6 +683,11 @@ 

Source code for onmt.decoders.transformer

             if opt.parallel_mode == "tensor_parallel"
             else 1,
             sliding_window=opt.sliding_window,
+            rotary_interleave=opt.rotary_interleave,
+            rotary_theta=opt.rotary_theta,
+            rotary_dim=opt.rotary_dim,
+            num_experts=opt.num_experts,
+            num_experts_per_tok=opt.num_experts_per_tok,
         )
 
     def init_state(self, src, enc_out, enc_final_hs):
@@ -650,7 +711,18 @@ 

Source code for onmt.decoders.transformer

                 if layer.self_attn.layer_cache[1]["keys"].numel() != 0:
                     x = fn(layer.self_attn.layer_cache[1]["keys"], 0)
                     y = fn(layer.self_attn.layer_cache[1]["values"], 0)
-                    layer.self_attn.layer_cache = True, {"keys": x, "values": y}
+                    if (
+                        layer.self_attn.layer_cache[1].get("key_pad_mask", None)
+                        is not None
+                    ):
+                        z = fn(layer.self_attn.layer_cache[1]["key_pad_mask"], 0)
+                    else:
+                        z = None
+                    layer.self_attn.layer_cache = True, {
+                        "keys": x,
+                        "values": y,
+                        "key_pad_mask": z,
+                    }
 
     def detach_state(self):
         raise NotImplementedError
@@ -674,7 +746,7 @@ 

Source code for onmt.decoders.transformer

         heads (int): number of heads
         d_ff (int): size of the inner FF layer
         copy_attn (bool): if using a separate copy attention
-        self_attn_type (str): type of self-attention scaled-dot, average
+        self_attn_type (str): type of self-attention scaled-dot, scaled-dot-flash, average
         dropout (float): dropout in residual, self-attn(dot) and feed-forward
         attention_dropout (float): dropout in context_attn (and self-attn(avg))
         embeddings (onmt.modules.Embeddings):
@@ -689,8 +761,25 @@ 

Source code for onmt.decoders.transformer

         alignment_layer (int): N° Layer to supervise with for alignment guiding
         alignment_heads (int):
             N. of cross attention heads to use for alignment guiding
+        pos_ffn_activation_fn (ActivationFunction):
+            activation function choice for PositionwiseFeedForward layer
         add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+        num_kv (int): number of heads for KV when different vs Q (multiquery)
+        add_ffnbias (bool): whether to add bias to the FF nn.Linear
+        parallel_residual (bool): Use parallel residual connections in each layer block, as used
+            by the GPT-J and GPT-NeoX models
+        shared_layer_norm (bool): When using parallel residual, share the input and post
+            attention layer norms.
         layer_norm (string): type of layer normalization standard/rms
+        norm_eps (float): layer norm epsilon
+        use_ckpting (List): layers for which we checkpoint for backward
+        parallel_gpu (int): Number of gpu for tensor parallelism
+        sliding_window (int): Width of the band mask and KV cache (cf Mistral Model)
+        rotary_interleave (bool): Interleave the head dimensions when rotary embeddings are applied
+        rotary_theta (int): rotary base theta
+        rotary_dim (int): in some cases the rotary dim is lower than head dim
+        num_experts (int): Number of experts for MoE
+        num_experts_per_tok (int): Number of experts choice per token
     """
 
     def __init__(
@@ -721,6 +810,11 @@ 

Source code for onmt.decoders.transformer

         use_ckpting=[],
         parallel_gpu=1,
         sliding_window=0,
+        rotary_interleave=True,
+        rotary_theta=1e4,
+        rotary_dim=0,
+        num_experts=0,
+        num_experts_per_tok=2,
     ):
         super(TransformerDecoder, self).__init__(
             d_model, copy_attn, embeddings, alignment_layer, layer_norm, norm_eps
@@ -751,6 +845,11 @@ 

Source code for onmt.decoders.transformer

                     use_ckpting=use_ckpting,
                     parallel_gpu=parallel_gpu,
                     sliding_window=sliding_window,
+                    rotary_interleave=rotary_interleave,
+                    rotary_theta=rotary_theta,
+                    rotary_dim=rotary_dim,
+                    num_experts=num_experts,
+                    num_experts_per_tok=num_experts_per_tok,
                 )
                 for i in range(num_layers)
             ]
@@ -853,7 +952,9 @@ 

Source code for onmt.decoders.transformer

                     },
                 )
                 if hasattr(layer.self_attn, "rope"):
-                    layer.self_attn.rope = layer.self_attn.rope.to(enc_out.device)
+                    layer.self_attn.rope = layer.self_attn.rope.to(enc_out.device)
+                    layer.self_attn.cos = layer.self_attn.cos.to(enc_out.device)
+                    layer.self_attn.sin = layer.self_attn.sin.to(enc_out.device)


class TransformerLMDecoderLayer(TransformerDecoderLayerBase):

@@ -887,7 +988,9 @@

Source code for onmt.decoders.transformer

         dec_mask = None
 
         if layer_in.size(1) > 1:
-            # masking is necessary when sequence length is greater than one
+            # Masking is necessary when sequence length is greater than one
+            # The decoding has not started yet,
+            # we compute the scores on the source tokens in one shot.
             dec_mask = self._compute_dec_mask(tgt_pad_mask, future)
             dec_mask = dec_mask.unsqueeze(1)
             dec_mask = dec_mask.expand(-1, -1, dec_mask.size(3), -1)
@@ -919,22 +1022,45 @@ 

Source code for onmt.decoders.transformer

 class TransformerLMDecoder(TransformerDecoderBase):
     """The Transformer decoder from GPT-2
     Args:
-         num_layers (int): number of decoder layers.
-         d_model (int): size of the model
-         heads (int): number of heads
-         d_ff (int): size of the inner FF layer
-         copy_attn (bool): if using a separate copy attention
-         self_attn_type (str): type of self-attention scaled-dot, average
-         dropout (float): dropout in residual, self-attn(dot) and feed-forward
-         attention_dropout (float): dropout in context_attn (and self-attn(avg))
-         embeddings (onmt.modules.Embeddings):
-             embeddings to use, should have positional encodings
-         max_relative_positions (int):
-             Max distance between inputs in relative positions representations
-         relative_positions_buckets (int):
-             Number of buckets when using Relative positions bias
-         aan_useffn (bool): Turn on the FFN layer in the AAN decoder
-         add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+        num_layers (int): number of decoder layers.
+        d_model (int): size of the model
+        heads (int): number of heads
+        d_ff (int): size of the inner FF layer
+        copy_attn (bool): if using a separate copy attention
+        self_attn_type (str): type of self-attention scaled-dot, scaled-dot-flash, average
+        dropout (float): dropout in residual, self-attn(dot) and feed-forward
+        attention_dropout (float): dropout in context_attn (and self-attn(avg))
+        embeddings (onmt.modules.Embeddings):
+            embeddings to use, should have positional encodings
+        max_relative_positions (int):
+            Max distance between inputs in relative positions representations
+        relative_positions_buckets (int):
+            Number of buckets when using Relative positions bias
+        aan_useffn (bool): Turn on the FFN layer in the AAN decoder
+        full_context_alignment (bool):
+            whether enable an extra full context decoder forward for alignment
+        alignment_layer (int): N° Layer to supervise with for alignment guiding
+        alignment_heads (int):
+            N. of cross attention heads to use for alignment guiding
+        pos_ffn_activation_fn (ActivationFunction):
+            activation function choice for PositionwiseFeedForward layer
+        add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+        num_kv (int): number of heads for KV when different vs Q (multiquery)
+        add_ffnbias (bool): whether to add bias to the FF nn.Linear
+        parallel_residual (bool): Use parallel residual connections in each layer block, as used
+            by the GPT-J and GPT-NeoX models
+        shared_layer_norm (bool): When using parallel residual, share the input and post
+            attention layer norms.
+        layer_norm (string): type of layer normalization standard/rms
+        norm_eps (float): layer norm epsilon
+        use_ckpting (List): layers for which we checkpoint for backward
+        parallel_gpu (int): Number of gpu for tensor parallelism
+        sliding_window (int): Width of the band mask and KV cache (cf Mistral Model)
+        rotary_interleave (bool): Interleave the head dimensions when rotary embeddings are applied
+        rotary_theta (int): rotary base theta
+        rotary_dim (int): in some cases the rotary dim is lower than head dim
+        num_experts (int): Number of experts for MoE
+        num_experts_per_tok (int): Number of experts choice per token
     """
 
     def __init__(
@@ -965,6 +1091,11 @@ 

Source code for onmt.decoders.transformer

         use_ckpting=[],
         parallel_gpu=1,
         sliding_window=0,
+        rotary_interleave=True,
+        rotary_theta=1e4,
+        rotary_dim=0,
+        num_experts=0,
+        num_experts_per_tok=2,
     ):
         super(TransformerLMDecoder, self).__init__(
             d_model, copy_attn, embeddings, alignment_layer, layer_norm, norm_eps
@@ -994,6 +1125,11 @@ 

Source code for onmt.decoders.transformer

                     use_ckpting=use_ckpting,
                     parallel_gpu=parallel_gpu,
                     sliding_window=sliding_window,
+                    rotary_interleave=rotary_interleave,
+                    rotary_theta=rotary_theta,
+                    rotary_dim=rotary_dim,
+                    num_experts=num_experts,
+                    num_experts_per_tok=num_experts_per_tok,
                 )
                 for i in range(num_layers)
             ]
@@ -1007,15 +1143,22 @@ 

Source code for onmt.decoders.transformer

 
     def forward(self, tgt, enc_out=None, step=None, **kwargs):
         """Decode, possibly stepwise."""
+
         if step == 0:
+            # decoding mode.
+            # Initialize KV and key_pad_mask cache.
             self._init_cache(tgt)
         elif step is None:
+            # training mode.
             for layer in self.transformer_layers:
                 layer.self_attn.layer_cache = (
                     False,
-                    {"keys": torch.tensor([]), "values": torch.tensor([])},
+                    {
+                        "keys": torch.tensor([]),
+                        "values": torch.tensor([]),
+                        "key_pad_mask": None,
+                    },
                 )
-
         dec_out = self.embeddings(tgt, step=step)
 
         assert dec_out.dim() == 3  # batch x len x embedding_dim
@@ -1048,18 +1191,24 @@ 

Source code for onmt.decoders.transformer

 
     def _init_cache(self, tgt=None):
         for layer in self.transformer_layers:
-            if isinstance(layer.self_attn, AverageAttention):
-                raise NotImplementedError
-            else:
-                layer.self_attn.layer_cache = (
-                    True,
-                    {
-                        "keys": torch.tensor([], device=tgt.device),
-                        "values": torch.tensor([], device=tgt.device),
-                    },
-                )
-                if hasattr(layer.self_attn, "rope"):
-                    layer.self_attn.rope = layer.self_attn.rope.to(tgt.device)
+            if hasattr(layer, "self_attn"):
+                if isinstance(layer.self_attn, AverageAttention):
+                    raise NotImplementedError
+                else:
+                    layer.self_attn.layer_cache = (
+                        True,
+                        {
+                            "keys": torch.tensor([], device=tgt.device),
+                            "values": torch.tensor([], device=tgt.device),
+                            "key_pad_mask": tgt[:, :, 0]
+                            .eq(self.embeddings.word_padding_idx)
+                            .unsqueeze(1),
+                        },
+                    )
+                    if hasattr(layer.self_attn, "rope"):
+                        layer.self_attn.rope = layer.self_attn.rope.to(tgt.device)
+                        layer.self_attn.cos = layer.self_attn.cos.to(tgt.device)
+                        layer.self_attn.sin = layer.self_attn.sin.to(tgt.device)
 
diff --git a/_modules/onmt/encoders/transformer.html b/_modules/onmt/encoders/transformer.html index 813a809d8d..ba016fbf1f 100644 --- a/_modules/onmt/encoders/transformer.html +++ b/_modules/onmt/encoders/transformer.html @@ -232,6 +232,19 @@

Source code for onmt.encoders.transformer

         dropout (float): dropout probability(0-1.0).
         pos_ffn_activation_fn (ActivationFunction):
             activation function choice for PositionwiseFeedForward layer
+        add_qkvbias (bool): whether to add bias to the Key/Value nn.Linear
+        num_kv (int): number of heads for KV when different vs Q (multiquery)
+        add_ffnbias (bool): whether to add bias to the FF nn.Linear
+        parallel_residual (bool): Use parallel residual connections in each layer block, as used
+            by the GPT-J and GPT-NeoX models
+        layer_norm (string): type of layer normalization standard/rms
+        norm_eps (float): layer norm epsilon
+        use_ckpting (List): layers for which we checkpoint for backward
+        parallel_gpu (int): Number of gpu for tensor parallelism
+        rotary_interleave (bool): Interleave the head dimensions when rotary
+            embeddings are applied
+        rotary_theta (int): rotary base theta
+        rotary_dim (int): rotary dim when different to dim per head
     """
 
     def __init__(
@@ -252,6 +265,9 @@ 

Source code for onmt.encoders.transformer

         norm_eps=1e-6,
         use_ckpting=[],
         parallel_gpu=1,
+        rotary_interleave=True,
+        rotary_theta=1e4,
+        rotary_dim=0,
     ):
         super(TransformerEncoderLayer, self).__init__()
 
@@ -262,6 +278,9 @@ 

Source code for onmt.encoders.transformer

             is_decoder=False,
             max_relative_positions=max_relative_positions,
             relative_positions_buckets=relative_positions_buckets,
+            rotary_interleave=rotary_interleave,
+            rotary_theta=rotary_theta,
+            rotary_dim=rotary_dim,
             attn_type="self",
             add_qkvbias=add_qkvbias,
             num_kv=num_kv,
@@ -366,6 +385,9 @@ 

Source code for onmt.encoders.transformer

         norm_eps=1e-6,
         use_ckpting=[],
         parallel_gpu=1,
+        rotary_interleave=True,
+        rotary_theta=1e4,
+        rotary_dim=0,
     ):
         super(TransformerEncoder, self).__init__()
 
@@ -389,6 +411,9 @@ 

Source code for onmt.encoders.transformer

                     norm_eps=norm_eps,
                     use_ckpting=use_ckpting,
                     parallel_gpu=parallel_gpu,
+                    rotary_interleave=rotary_interleave,
+                    rotary_theta=rotary_theta,
+                    rotary_dim=rotary_dim,
                 )
                 for i in range(num_layers)
             ]
@@ -426,6 +451,9 @@ 

Source code for onmt.encoders.transformer

             parallel_gpu=opt.world_size
             if opt.parallel_mode == "tensor_parallel"
             else 1,
+            rotary_interleave=opt.rotary_interleave,
+            rotary_theta=opt.rotary_theta,
+            rotary_dim=opt.rotary_dim,
         )
[docs] def forward(self, src, src_len=None):

diff --git a/_modules/onmt/inputters/dynamic_iterator.html
index 9ab6a6a541..3daf76fdb6 100644
--- a/_modules/onmt/inputters/dynamic_iterator.html
+++ b/_modules/onmt/inputters/dynamic_iterator.html
@@ -204,7 +204,7 @@

Source code for onmt.inputters.dynamic_iterator

 """Module that contain iterator used for dynamic data."""
 import torch
 from itertools import cycle
-from onmt.constants import CorpusTask
+from onmt.constants import CorpusTask, ModelTask
 from onmt.inputters.text_corpus import get_corpora, build_corpora_iters
 from onmt.inputters.text_utils import (
     text_sort_key,
@@ -367,6 +367,10 @@

Source code for onmt.inputters.dynamic_iterator

         self.skip_empty_level = skip_empty_level
         self.random_shuffler = RandomShuffler()
         self.bucket_idx = 0
+        if task != CorpusTask.TRAIN and vocabs["data_task"] == ModelTask.LANGUAGE_MODEL:
+            self.left_pad = True
+        else:
+            self.left_pad = False
[docs] @classmethod
    def from_opt(

@@ -557,7 +561,9 @@

Source code for onmt.inputters.dynamic_iterator

                 # within the batch
                 if self.task == CorpusTask.TRAIN:
                     minibatch.sort(key=lambda x: self.sort_key(x[0]), reverse=True)
-                tensor_batch = tensorify(self.vocabs, minibatch, self.device)
+                tensor_batch = tensorify(
+                    self.vocabs, minibatch, self.device, self.left_pad
+                )
                 yield (tensor_batch, bucket_idx)
@@ -569,7 +575,12 @@

Source code for onmt.inputters.dynamic_iterator

    def __iter__(self):
        for (tensor_batch, bucket_idx) in self.data_iter:
            for key in tensor_batch.keys():
-                if key not in ["src_ex_vocab", "cid"]:
+                if key not in [
+                    "src_ex_vocab",
+                    "cid",
+                    "ind_in_bucket",
+                    "cid_line_number",
+                ]:
                    tensor_batch[key] = tensor_batch[key].to(self.device)
            yield (tensor_batch, bucket_idx)

diff --git a/_modules/onmt/inputters/text_corpus.html
index ba3528100b..9836efae48 100644
--- a/_modules/onmt/inputters/text_corpus.html
+++ b/_modules/onmt/inputters/text_corpus.html
@@ -241,6 +241,63 @@

Source code for onmt.inputters.text_corpus

         _file.close()
 
 
+class BlockwiseCorpus(object):
+    """A corpus class for reading a single file block by block."""
+
+    def __init__(self, name, file_path, block_size=4096):
+        """Initialize file path and block size."""
+        self.id = name
+        self.file_path = file_path
+        self.block_size = block_size
+
+    def load(self, offset=0, stride=1):
+        """
+        Load file and iterate by blocks.
+        `offset` and `stride` allow iterating only on every
+        `stride` block, starting from `offset`.
+        """
+
+        def make_ex(block_content):
+            example = {
+                "src": block_content,
+                "tgt": block_content,
+                "src_original": block_content,
+                "tgt_original": block_content,
+            }
+            return example
+
+        with open(self.file_path, mode="r", encoding="utf-8") as file:
+            block_content = ""
+            block_index = 0
+
+            while True:
+                chunk = file.read(self.block_size)
+                if not chunk:
+                    break
+
+                if (block_index // stride) % stride == offset:
+                    block_content += chunk
+
+                    if len(chunk) < self.block_size:
+                        # Reached end of file
+                        yield make_ex(block_content)
+                        break
+
+                    if len(block_content) >= self.block_size:
+                        yield make_ex(block_content)
+                block_content = ""
+                block_index += 1
+
+    def __str__(self):
+        cls_name = type(self).__name__
+        return (
+            f"{cls_name}({self.id}, {self.file_path}, "
+            f"align={None}, "
+            f"n_src_feats={0}, "
+            f'src_feats_defaults="{None}")'
+        )
+
+
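A rough usage sketch of the blockwise loader above (hypothetical file name, single worker so offset=0 and stride=1); each yielded example carries the same raw text block under src and tgt:

# Hypothetical usage of the BlockwiseCorpus defined above.
corpus = BlockwiseCorpus("pretrain_data", "corpus.txt", block_size=8192)
for example in corpus.load(offset=0, stride=1):
    # Each example maps "src", "tgt", "src_original", "tgt_original"
    # to the same block of raw characters read from the file.
    print(len(example["src"]))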
 
[docs]class ParallelCorpus(object):
    """A parallel corpus file pair that can be loaded to iterate."""

@@ -320,20 +377,27 @@

Source code for onmt.inputters.text_corpus

     if task == CorpusTask.TRAIN:
         for corpus_id, corpus_dict in opts.data.items():
             if corpus_id != CorpusName.VALID:
-                corpora_dict[corpus_id] = ParallelCorpus(
-                    corpus_id,
-                    corpus_dict["path_src"],
-                    corpus_dict["path_tgt"],
-                    corpus_dict["path_align"],
-                    n_src_feats=opts.n_src_feats,
-                    src_feats_defaults=opts.src_feats_defaults,
-                )
+                if corpus_dict.get("path_txt", None) is None:
+                    corpora_dict[corpus_id] = ParallelCorpus(
+                        corpus_id,
+                        corpus_dict["path_src"],
+                        corpus_dict["path_tgt"],
+                        corpus_dict["path_align"],
+                        n_src_feats=opts.n_src_feats,
+                        src_feats_defaults=opts.src_feats_defaults,
+                    )
+                else:
+                    corpora_dict[corpus_id] = BlockwiseCorpus(
+                        corpus_id,
+                        corpus_dict["path_txt"],
+                        block_size=8192,  # number of characters
+                    )
     elif task == CorpusTask.VALID:
         if CorpusName.VALID in opts.data.keys():
             corpora_dict[CorpusName.VALID] = ParallelCorpus(
                 CorpusName.VALID,
                 opts.data[CorpusName.VALID]["path_src"],
-                opts.data[CorpusName.VALID]["path_tgt"],
+                opts.data[CorpusName.VALID]["path_tgt"] if tgt is None else None,
                 opts.data[CorpusName.VALID]["path_align"],
                 n_src_feats=opts.n_src_feats,
                 src_feats_defaults=opts.src_feats_defaults,
@@ -377,20 +441,20 @@ 

Source code for onmt.inputters.text_corpus

 
     def _process(self, stream):
         for i, example in enumerate(stream):
-            example["src"] = example["src"].strip("\n").split()
-            example["src_original"] = example["src_original"].strip("\n").split()
+            example["src"] = example["src"].strip().split(" ")
+            example["src_original"] = example["src_original"].strip().split(" ")
             if "src_feats" in example:
                 example["src_feats"] = [
-                    feat.strip("\n").split() for feat in example["src_feats"]
+                    feat.strip().split(" ") for feat in example["src_feats"]
                 ]
             line_number = i * self.stride + self.offset
             example["cid_line_number"] = line_number
             example["cid"] = self.cid
             if "align" in example:
-                example["align"] = example["align"].strip("\n").split()
+                example["align"] = example["align"].strip().split(" ")
             if example["tgt"] is not None:
-                example["tgt"] = example["tgt"].strip("\n").split()
-                example["tgt_original"] = example["tgt_original"].strip("\n").split()
+                example["tgt"] = example["tgt"].strip().split(" ")
+                example["tgt_original"] = example["tgt_original"].strip().split(" ")
                 if (
                     len(example["src"]) == 0
                     or len(example["tgt"]) == 0
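The switch from .strip("\n").split() to .strip().split(" ") above changes how whitespace is tokenized; a quick comparison on an illustrative string:

line = "Hello  world\t!\n"
print(line.strip("\n").split())   # ['Hello', 'world', '!']   - any whitespace run splits
print(line.strip().split(" "))    # ['Hello', '', 'world\t!'] - only single spaces split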
diff --git a/_modules/onmt/models/model.html b/_modules/onmt/models/model.html
index acca4e7d4c..062004f655 100644
--- a/_modules/onmt/models/model.html
+++ b/_modules/onmt/models/model.html
@@ -204,7 +204,7 @@ 

Source code for onmt.models.model

 """ Onmt NMT Model base class definition """
 import torch
 import torch.nn as nn
-import glob
+from glob import glob
 
 
 
[docs]class BaseModel(nn.Module):

@@ -248,6 +248,70 @@

Source code for onmt.models.model

     def count_parameters(self, log=print):
         raise NotImplementedError
 
+    def _load_param(self, name, module, param_name, param, buf_list, ckpt_t, offset):
+        if module.__class__.__name__ == "WQLinear_GEMM":
+            # ugly patch because in_feat and out_feat are reversed in WQLinear_GEMM
+            param.data = param.data.transpose(0, 1)
+            ckpt_t = ckpt_t.transpose(0, 1)
+        if name.split(".")[-1] in [
+            "linear_keys",
+            "linear_values",
+            "linear_query",
+            "w_1",
+            "w_3",
+        ]:
+            col_slice_start = param.data.size(0) * offset
+            col_slice_end = param.data.size(0) * (offset + 1)
+        else:
+            col_slice_start = 0
+            col_slice_end = param.data.size(0)
+        if param.data.dim() == 2:
+            if name.split(".")[-1] in ["final_linear", "w_2"]:
+                row_slice_start = param.data.size(1) * offset
+                row_slice_end = param.data.size(1) * (offset + 1)
+            else:
+                row_slice_start = 0
+                row_slice_end = param.data.size(1)
+            assert (
+                param.data.size()
+                == ckpt_t[
+                    col_slice_start:col_slice_end,
+                    row_slice_start:row_slice_end,
+                ].size()
+            ), "An error in model's partition and checkpoint's slice was detected"
+            if name + "." + param_name in buf_list:
+                if module.__class__.__name__ == "WQLinear_GEMM":
+                    module.register_buffer(
+                        param_name,
+                        ckpt_t[
+                            col_slice_start:col_slice_end,
+                            row_slice_start:row_slice_end,
+                        ].transpose(0, 1),
+                    )
+                else:
+                    module.register_buffer(
+                        param_name,
+                        ckpt_t[
+                            col_slice_start:col_slice_end,
+                            row_slice_start:row_slice_end,
+                        ],
+                    )
+            else:
+                param.data = ckpt_t[
+                    col_slice_start:col_slice_end,
+                    row_slice_start:row_slice_end,
+                ]
+        else:
+            assert (
+                param.data.size() == ckpt_t[col_slice_start:col_slice_end].size()
+            ), "An error in model's partition and checkpoint's slice was detected"
+            if name + "." + param_name in buf_list:
+                module.register_buffer(
+                    param_name, ckpt_t[col_slice_start:col_slice_end]
+                )
+            else:
+                param.data = ckpt_t[col_slice_start:col_slice_end]
+
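The slicing above distributes projection weights across tensor-parallel ranks; as a rough illustration with hypothetical shapes and two ranks, column-parallel weights are cut along dim 0 and row-parallel weights along dim 1:

import torch

world_size, offset = 2, 1          # hypothetical: rank 1 of 2
full_w1 = torch.randn(4096, 1024)  # checkpoint tensor for a "w_1"-style layer
full_w2 = torch.randn(1024, 4096)  # checkpoint tensor for a "w_2"-style layer

# Column-parallel layers (linear_keys, w_1, ...): slice the output dimension.
rows = full_w1.size(0) // world_size
w1_shard = full_w1[rows * offset : rows * (offset + 1), :]   # (2048, 1024)

# Row-parallel layers (final_linear, w_2): slice the input dimension.
cols = full_w2.size(1) // world_size
w2_shard = full_w2[:, cols * offset : cols * (offset + 1)]   # (1024, 2048)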
 
[docs] def load_state_dict(
        self,
        checkpoint,

@@ -271,64 +335,31 @@

Source code for onmt.models.model

         if device == torch.device("cpu"):
             offset = 0
         buf_list = []
+        for buf_name, buf in self.named_buffers():
+            buf_list.append(buf_name)
         for name, module in self.named_modules():
-            for buf_name, buf in module.named_buffers():
-                buf_list.append(buf_name)
-                if len(buf_name.split(".")) == 1:  # only last key
-                    if precision != torch.int8:
-                        module.to(precision)
-                    module.to(device)
-            for param_name, param in module.named_parameters():
+            named_buf_and_param = list(module.named_buffers()) + list(
+                module.named_parameters()
+            )
+            for param_name, param in named_buf_and_param:
                 if len(param_name.split(".")) == 1:  # only last key
                     if name + "." + param_name in checkpoint["model"].keys():
                         ckpt_t = checkpoint["model"][name + "." + param_name]
-
-                        if name.split(".")[-1] in [
-                            "linear_keys",
-                            "linear_values",
-                            "linear_query",
-                            "w_1",
-                            "w_3",
-                        ]:
-                            col_slice_start = param.data.size(0) * offset
-                            col_slice_end = param.data.size(0) * (offset + 1)
-                        else:
-                            col_slice_start = 0
-                            col_slice_end = param.data.size(0)
-                        if param.data.dim() == 2:
-                            if name.split(".")[-1] in ["final_linear", "w_2"]:
-                                row_slice_start = param.data.size(1) * offset
-                                row_slice_end = param.data.size(1) * (offset + 1)
-                            else:
-                                row_slice_start = 0
-                                row_slice_end = param.data.size(1)
-                            assert (
-                                param.data.size()
-                                == ckpt_t[
-                                    col_slice_start:col_slice_end,
-                                    row_slice_start:row_slice_end,
-                                ].size()
-                            ), "An error in model's partition and checkpoint's slice was detected"
-                            param.data = ckpt_t[
-                                col_slice_start:col_slice_end,
-                                row_slice_start:row_slice_end,
-                            ]
-                        else:
-                            assert (
-                                param.data.size()
-                                == ckpt_t[col_slice_start:col_slice_end].size()
-                            ), "An error in model's partition and checkpoint's slice was detected"
-                            param.data = ckpt_t[col_slice_start:col_slice_end]
-
+                        self._load_param(
+                            name, module, param_name, param, buf_list, ckpt_t, offset
+                        )
                         del checkpoint["model"][name + "." + param_name]
                     elif (
                         "generator" in checkpoint.keys()
-                        and name == "generator"
+                        and "generator" in name
                         and checkpoint["generator"] is not None
                         and param_name in checkpoint["generator"].keys()
                     ):
-                        param.data = checkpoint["generator"][param_name]
-                        del checkpoint["generator"][param_name]
+                        keyname = (
+                            name + "." + param_name if "linear" in name else param_name
+                        )
+                        param.data = checkpoint["generator"][keyname]
+                        del checkpoint["generator"][keyname]
                     elif strict and "lora" not in param_name:
                         raise ValueError(
                             "Missing key in checkpoint: %s" % name + "." + param_name
@@ -336,6 +367,7 @@ 

Source code for onmt.models.model

                     if precision != torch.int8:
                         module.to(precision)
                     module.to(device)
+
         for key in checkpoint[
             "model"
         ].keys():  # if some keys are left in checkpoint after deletion
@@ -376,7 +408,7 @@ 

Source code for onmt.models.model

         except ImportError:
             raise ImportError("run: pip install safetensors, to use safetensors")
         keyfound = {}
-        shards = glob.glob(model_path + ".*.safetensors")
+        shards = glob(model_path + ".*.safetensors")
         if len(shards) == 0:
             raise ValueError("No safetensors file found")
         f = []
@@ -385,62 +417,25 @@ 

Source code for onmt.models.model

             f.append(safetensors.safe_open(shard, framework="pt", device="cpu"))
             for key in f[i].keys():
                 keys_shard[key] = i
+        if device == torch.device("cpu"):
+            offset = 0
         buf_list = []
+        for buf_name, buf in self.named_buffers():
+            buf_list.append(buf_name)
         for name, module in self.named_modules():
-            for buf_name, buf in module.named_buffers():
-                buf_list.append(buf_name)
-                if len(buf_name.split(".")) == 1:  # only last key
-                    if precision == torch.int8:
-                        torch.quantization.quantize_dynamic(module, inplace=True)
-                    else:
-                        module.to(precision)
-                    module.to(device)
-            for param_name, param in module.named_parameters():
+            named_buf_and_param = list(module.named_buffers()) + list(
+                module.named_parameters()
+            )
+            for param_name, param in named_buf_and_param:
                 if len(param_name.split(".")) == 1:  # only last key
                     if name + "." + param_name in keys_shard.keys():
 
                         ckpt_t = f[keys_shard[name + "." + param_name]].get_tensor(
                             name + "." + param_name
                         )
-                        if name.split(".")[-1] in [
-                            "linear_keys",
-                            "linear_values",
-                            "linear_query",
-                            "w_1",
-                            "w_3",
-                        ]:
-                            col_slice_start = param.data.size(0) * offset
-                            col_slice_end = param.data.size(0) * (offset + 1)
-                        else:
-                            col_slice_start = 0
-                            col_slice_end = param.data.size(0)
-                        if param.data.dim() == 2:
-                            if name.split(".")[-1] in ["final_linear", "w_2"]:
-                                row_slice_start = param.data.size(1) * offset
-                                row_slice_end = param.data.size(1) * (offset + 1)
-                            else:
-                                row_slice_start = 0
-                                row_slice_end = param.data.size(1)
-                            assert (
-                                param.data.size()
-                                == ckpt_t[
-                                    col_slice_start:col_slice_end,
-                                    row_slice_start:row_slice_end,
-                                ].size()
-                            ), "An error in model's partition and checkpoint's slice was detected"
-
-                            param.data = ckpt_t[
-                                col_slice_start:col_slice_end,
-                                row_slice_start:row_slice_end,
-                            ]
-                        else:
-                            assert (
-                                param.data.size()
-                                == ckpt_t[col_slice_start:col_slice_end].size()
-                            ), "An error in model's partition and checkpoint's slice was detected"
-
-                            param.data = ckpt_t[col_slice_start:col_slice_end]
-
+                        self._load_param(
+                            name, module, param_name, param, buf_list, ckpt_t, offset
+                        )
                         keyfound[name + "." + param_name] = True
                     elif strict and "lora" not in param_name:
                         raise ValueError(
diff --git a/_modules/onmt/modules/copy_generator.html b/_modules/onmt/modules/copy_generator.html
index 4c5a7c9b65..2fc546e341 100644
--- a/_modules/onmt/modules/copy_generator.html
+++ b/_modules/onmt/modules/copy_generator.html
@@ -205,9 +205,7 @@ 

Source code for onmt.modules.copy_generator

 import torch.nn as nn
 
 
-def collapse_copy_scores(
-    scores, batch, tgt_vocab, src_vocabs=None, batch_dim=1, batch_offset=None
-):
+def collapse_copy_scores(scores, batch, tgt_vocab, batch_dim=1):
     """
     Given scores from an expanded dictionary
     corresponding to a batch, sums together copies,
@@ -218,12 +216,7 @@ 

Source code for onmt.modules.copy_generator

         blank = []
         fill = []
 
-        if src_vocabs is None:
-            src_vocab = batch["src_ex_vocab"][b]
-        else:
-            batch_id = batch_offset[b] if batch_offset is not None else b
-            index = batch["ind_in_bucket"].data[batch_id]
-            src_vocab = src_vocabs[index]
+        src_vocab = batch["src_ex_vocab"][b]
 
         for i in range(1, len(src_vocab)):
             sw = src_vocab.ids_to_tokens[i]
@@ -232,8 +225,8 @@ 

Source code for onmt.modules.copy_generator

                 blank.append(offset + i)
                 fill.append(ti)
         if blank:
-            blank = torch.Tensor(blank).type_as(batch["ind_in_bucket"].data)
-            fill = torch.Tensor(fill).type_as(batch["ind_in_bucket"].data)
+            blank = torch.Tensor(blank).to(torch.int64)
+            fill = torch.Tensor(fill).to(torch.int64)
             score = scores[:, b] if batch_dim == 1 else scores[b]
             score.index_add_(1, fill, score.index_select(1, blank))
             score.index_fill_(1, blank, 1e-10)
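As a small illustration of the collapsing step above (hypothetical sizes), mass assigned to an extended source-vocabulary slot is added onto the matching target-vocabulary id and the duplicate slot is blanked out:

import torch

scores = torch.full((2, 1, 7), 0.1)   # (length, batch=1, tgt_vocab + extended slots)
blank = torch.tensor([5])             # extended slot duplicating a target-vocab token
fill = torch.tensor([3])              # its index in the target vocabulary

score = scores[:, 0]                                       # view for batch element 0
score.index_add_(1, fill, score.index_select(1, blank))    # move copy mass onto id 3
score.index_fill_(1, blank, 1e-10)                         # blank out the duplicate slot
print(score[0])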
diff --git a/_modules/onmt/modules/embeddings.html b/_modules/onmt/modules/embeddings.html
index 1043921612..8eb3995447 100644
--- a/_modules/onmt/modules/embeddings.html
+++ b/_modules/onmt/modules/embeddings.html
@@ -207,6 +207,7 @@ 

Source code for onmt.modules.embeddings

 
 import torch
 import torch.nn as nn
+from torch.nn.utils import skip_init
 
 from onmt.modules.util_class import Elementwise
 from onmt.utils.logging import logger
@@ -374,7 +375,13 @@ 

Source code for onmt.modules.embeddings

         # is for words. Subsequent ones are for features, if any exist.
         emb_params = zip(vocab_sizes, emb_dims, pad_indices)
         embeddings = [
-            nn.Embedding(vocab, dim, padding_idx=pad, sparse=sparse)
+            skip_init(
+                nn.Embedding,
+                num_embeddings=vocab,
+                embedding_dim=dim,
+                padding_idx=pad,
+                sparse=sparse,
+            )
             for vocab, dim, pad in emb_params
         ]
         emb_luts = Elementwise(feat_merge, embeddings)
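skip_init constructs the module without running its (costly) weight initialization, which fits the case where the embedding weights are immediately overwritten from a checkpoint; a minimal sketch of the same pattern, with hypothetical sizes:

import torch.nn as nn
from torch.nn.utils import skip_init

# Allocate an embedding table without initializing its weights
# (they are expected to be filled later, e.g. from a checkpoint).
emb = skip_init(nn.Embedding, num_embeddings=32000, embedding_dim=512, padding_idx=1)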
diff --git a/_modules/onmt/modules/multi_headed_attn.html b/_modules/onmt/modules/multi_headed_attn.html
index 13779daf46..771fc3d690 100644
--- a/_modules/onmt/modules/multi_headed_attn.html
+++ b/_modules/onmt/modules/multi_headed_attn.html
@@ -202,18 +202,17 @@
             
   

Source code for onmt.modules.multi_headed_attn

 """ Multi-Head Attention module """
-import math
 import torch
+import torch.nn as nn
+from math import log, sqrt
 from torch import Tensor
 from typing import Optional, Tuple
-from torch.nn import functional as F
-import torch.nn as nn
+from torch.nn.functional import scaled_dot_product_attention
 from torch.utils.checkpoint import checkpoint
 from torch.nn.utils import skip_init
 from .alibi_position_bias import AlibiPositionalBias
-import torch.distributed as dist
-import importlib
-
+from torch.distributed import all_reduce
+from importlib import import_module
 
 # Help functions for Rotary Embeddings
 # https://arxiv.org/pdf/2104.09864.pdf
@@ -222,28 +221,58 @@ 

Source code for onmt.modules.multi_headed_attn

# are both < 2048 tokens. -def rotaryembeddings(dim: int, maxseqlen=8192, base=10000): +def rotaryembeddings(dim: int, maxseqlen=2048, base=10000, device=None): inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim)) tmax = torch.arange(maxseqlen, device=inv_freq.device) rope = torch.outer(tmax, inv_freq).float() # rope is now matrix [maxseqlen, dim/2] rope = torch.polar(torch.ones_like(rope), rope) - return rope - - -def apply_rotary_emb(query, key, rope): - query = query.transpose(1, 2) - key = key.transpose(1, 2) - query_ = query.float().reshape(*query.shape[:-1], -1, 2) - query_ = torch.view_as_complex(query_) - key_ = key.float().reshape(*key.shape[:-1], -1, 2) - key_ = torch.view_as_complex(key_) - rope = rope.view(1, query_.size(1), 1, query_.size(3)) - query_out = torch.view_as_real(query_ * rope).flatten(3) - key_out = torch.view_as_real(key_ * rope).flatten(3) - return query_out.transpose(1, 2).type_as(query), key_out.transpose(1, 2).type_as( - key - ) + rope = torch.cat((rope, rope), dim=1) + if device is not None: + rope = rope.to(device) + cos = rope[:, : rope.size(1) // 2].real.contiguous().half() + sin = rope[:, : rope.size(1) // 2].imag.contiguous().half() + return rope, cos, sin + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_emb(query, key, rope, interleave): + if interleave: + query = query.transpose(1, 2) + key = key.transpose(1, 2) + query_ = query.float().reshape(*query.shape[:-1], -1, 2) + query_ = torch.view_as_complex(query_) + key_ = key.float().reshape(*key.shape[:-1], -1, 2) + key_ = torch.view_as_complex(key_) + rope = rope[:, : rope.size(1) // 2].view(1, query_.size(1), 1, query_.size(3)) + query_out = torch.view_as_real(query_ * rope).flatten(3) + key_out = torch.view_as_real(key_ * rope).flatten(3) + return query_out.transpose(1, 2).type_as(query), key_out.transpose( + 1, 2 + ).type_as(key) + else: + cos, sin = rope.real, rope.imag + rotary_dim = cos.size(1) + head_dim = query.size(3) + if rotary_dim < head_dim: + q_embed = (query[:, :, :, :rotary_dim] * cos) + ( + rotate_half(query[:, :, :, :rotary_dim]) * sin + ) + k_embed = (key[:, :, :, :rotary_dim] * cos) + ( + rotate_half(key[:, :, :, :rotary_dim]) * sin + ) + q_embed = torch.cat([q_embed, query[:, :, :, rotary_dim:]], dim=-1) + k_embed = torch.cat([k_embed, key[:, :, :, rotary_dim:]], dim=-1) + else: + q_embed = (query * cos) + (rotate_half(query) * sin) + k_embed = (key * cos) + (rotate_half(key) * sin) + return q_embed.type_as(query), k_embed.type_as(key) # Help functions for max_relative positions @@ -334,7 +363,7 @@
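To make the non-interleaved (GPT-NeoX style) branch above concrete, a small self-contained sketch applying the cos/sin rotation with a rotate_half helper (hypothetical sizes, illustration only, not the library code):

import torch

def rotate_half(x):
    # Swap the two halves of the last dimension and negate the second half.
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

head_dim, seqlen, theta = 4, 3, 10000.0
inv_freq = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
angles = torch.outer(torch.arange(seqlen).float(), inv_freq)   # (seqlen, head_dim/2)
cos = torch.cat((angles.cos(), angles.cos()), dim=-1)          # (seqlen, head_dim)
sin = torch.cat((angles.sin(), angles.sin()), dim=-1)

q = torch.randn(1, 1, seqlen, head_dim)     # (batch, heads, seqlen, head_dim)
q_rot = q * cos + rotate_half(q) * sin      # position-dependent rotation of queries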

Source code for onmt.modules.multi_headed_attn

     # up to max_distance
     relative_position_if_large = max_exact + (
         torch.log(relative_position.float() / max_exact)
-        / math.log(max_distance / max_exact)
+        / log(max_distance / max_exact)
         * (num_buckets - max_exact)
     ).to(torch.long)
     relative_position_if_large = torch.min(
@@ -446,7 +475,11 @@

Source code for onmt.modules.multi_headed_attn

         is_decoder: bool = True,
         max_relative_positions: int = 0,
         relative_positions_buckets: int = 0,
+        rotary_interleave: bool = True,
+        rotary_theta: int = 1e4,
+        rotary_dim: int = 0,
         attn_type: str = None,
+        self_attn_type: str = None,
         add_qkvbias=False,
         num_kv=0,
         use_ckpting=[],
@@ -513,6 +546,7 @@

Source code for onmt.modules.multi_headed_attn

         self.max_relative_positions = max_relative_positions
         self.relative_positions_buckets = relative_positions_buckets
         self.attn_type = attn_type
+        self.self_attn_type = self_attn_type
         self.layer_cache = (
             False,
             {"keys": torch.tensor([]), "values": torch.tensor([])},
@@ -538,21 +572,37 @@

Source code for onmt.modules.multi_headed_attn

self.relative_attention_bias = None if max_relative_positions == -1: # rotary embeddings - self.rope = rotaryembeddings(self.dim_per_head) - + if rotary_dim == 0: + self.rotary_dim = self.dim_per_head + else: + self.rotary_dim = rotary_dim + self.rope, self.cos, self.sin = rotaryembeddings( + self.rotary_dim, base=rotary_theta + ) + self.rotary_interleave = rotary_interleave + self.rotary_theta = rotary_theta + else: + self.cos = None + self.sin = None + self.rotary_interleave = None if max_relative_positions == -2: # alibi positional bias self.alibi = AlibiPositionalBias(head_count) self.maybe_ckpt = checkpoint if "mha" in use_ckpting else lambda f, x: f(x) try: - flash_pack = importlib.import_module("flash_attn") + flash_pack = import_module("flash_attn") if ( hasattr(flash_pack, "flash_attn_func") and torch.cuda.get_device_capability()[0] >= 8 ): self.flash_attn_func = getattr(flash_pack, "flash_attn_func") + self.flash_attn_with_kvcache = getattr( + flash_pack, "flash_attn_with_kvcache" + ) self.flash2 = True + else: + self.flash2 = False except ImportError: self.flash2 = False @@ -569,6 +619,7 @@

Source code for onmt.modules.multi_headed_attn

sliding_window: Optional[int] = 0, step: Optional[int] = 0, return_attn: Optional[bool] = False, + self_attn_type: str = None, ) -> Tuple[Tensor, Tensor]: """ Compute the context vector and the attention vectors. @@ -591,32 +642,117 @@

Source code for onmt.modules.multi_headed_attn

""" # 1) Project key, value, and query. # as a reminder at training layer_cache[0] remains False + key_pad_mask = self.layer_cache[1].get("key_pad_mask", None) if self.layer_cache[0]: + # Retrieve keys and values from the KV cache (decoding mode only). if self.attn_type == "self": query, key, value = ( self.linear_query(query), self.linear_keys(query), self.linear_values(query), ) + query = shape(query, self.dim_per_head) key = shape(key, self.dim_per_head) value = shape(value, self.dim_per_head) + start_pos = step + seqlen = query.size(2) - if self.max_relative_positions == -1: # Rotary Embeddings - start_pos = step - seqlen = query.size(2) - rope = self.rope[start_pos : start_pos + seqlen] - query, key = apply_rotary_emb(query, key, rope=rope) + if ( + step == 0 + or not self.flash2 + or self.self_attn_type != "scaled-dot-flash" + or self.max_relative_positions not in [0, -1] + or query.size(0) > 128 + or query.dtype != torch.float16 + ): + if self.max_relative_positions == -1: # Rotary Embeddings + if seqlen + start_pos > self.rope.size(0): + # Resize rotary embeddings. + self.rope, _, _ = rotaryembeddings( + self.rotary_dim, + maxseqlen=(seqlen + start_pos + 2048), + base=self.rotary_theta, + device=self.rope.device, + ) + rope = self.rope[start_pos : start_pos + seqlen] + query, key = apply_rotary_emb( + query, key, rope, interleave=self.rotary_interleave + ) + + if self.layer_cache[1]["keys"].numel() != 0: + key = torch.cat((self.layer_cache[1]["keys"], key), dim=2) + value = torch.cat((self.layer_cache[1]["values"], value), dim=2) + if sliding_window > 0 and key.size(2) > sliding_window: + key = key[:, :, 1:, :] + value = value[:, :, 1:, :] + + self.layer_cache[1]["keys"] = key + self.layer_cache[1]["values"] = value + + else: + if start_pos >= self.layer_cache[1]["keys"].size(2): + self.layer_cache[1]["keys"] = torch.cat( + [ + self.layer_cache[1]["keys"], + torch.zeros( + self.layer_cache[1]["keys"].shape[:-2] + + (32,) + + self.layer_cache[1]["keys"].shape[-1:], + device=query.device, + ).half(), + ], + dim=-2, + ) + self.layer_cache[1]["values"] = torch.cat( + [ + self.layer_cache[1]["values"], + torch.zeros( + self.layer_cache[1]["values"].shape[:-2] + + (32,) + + self.layer_cache[1]["values"].shape[-1:], + device=query.device, + ).half(), + ], + dim=-2, + ) + if ( + self.max_relative_positions == -1 + and start_pos + 32 >= self.rope.size(0) + ): + # Resize rotary embeddings. + # We take a margin of 32 tokens as the kv_cache + # is incremented by 32 tokens every 32 tokens. 
+ self.rope, self.cos, self.sin = rotaryembeddings( + self.rotary_dim, + maxseqlen=(start_pos + 2048), + base=self.rotary_theta, + device=self.rope.device, + ) - if self.layer_cache[1]["keys"].numel() != 0: - key = torch.cat((self.layer_cache[1]["keys"], key), dim=2) - value = torch.cat((self.layer_cache[1]["values"], value), dim=2) if sliding_window > 0 and key.size(2) > sliding_window: - key = key[:, :, 1:, :] - value = value[:, :, 1:, :] + self.layer_cache[1]["keys"] = self.layer_cache[1]["keys"][ + :, :, 1:, : + ] + self.layer_cache[1]["values"] = self.layer_cache[1]["values"][ + :, :, 1:, : + ] + context = self.flash_attn_with_kvcache( + query.transpose(1, 2), + self.layer_cache[1]["keys"].transpose(1, 2), + self.layer_cache[1]["values"].transpose(1, 2), + key.transpose(1, 2), + value.transpose(1, 2), + rotary_cos=self.cos, + rotary_sin=self.sin, + cache_seqlens=step, + rotary_interleaved=self.rotary_interleave, + ).transpose(1, 2) + attn_output = self.final_linear(unshape(context)) + if self.parallel_gpu > 1: + all_reduce(attn_output) + return attn_output, None - self.layer_cache[1]["keys"] = key - self.layer_cache[1]["values"] = value elif self.attn_type == "context": query = self.linear_query(query) query = shape(query, self.dim_per_head) @@ -631,10 +767,26 @@

Source code for onmt.modules.multi_headed_attn

) self.layer_cache[1]["keys"] = key self.layer_cache[1]["values"] = value + + if key_pad_mask is not None: + # Increase the cached key pad mask by concatenation. + # For decoding only. + if step > 0: + y = torch.zeros( + (key_pad_mask.size(0), key_pad_mask.size(1), 1), + dtype=torch.bool, + device=key_pad_mask.device, + ) + self.layer_cache[1]["key_pad_mask"] = torch.cat( + (key_pad_mask, y), 2 + ) + key_pad_mask = self.layer_cache[1]["key_pad_mask"] else: + # Retrieve keys and values from linear layers (training mode). key = self.maybe_ckpt(self.linear_keys, key) value = self.maybe_ckpt(self.linear_values, value) query = self.maybe_ckpt(self.linear_query, query) + key = shape(key, self.dim_per_head) value = shape(value, self.dim_per_head) query = shape(query, self.dim_per_head) @@ -642,8 +794,18 @@

Source code for onmt.modules.multi_headed_attn

if self.max_relative_positions == -1: # Rotary Embeddings start_pos = 0 seqlen = query.size(2) - rope = self.rope[start_pos : start_pos + seqlen].to(query.device) - query, key = apply_rotary_emb(query, key, rope=rope) + if seqlen > self.rope.size(0): + # Resize rotary embeddings. + self.rope, self.cos, self.sin = rotaryembeddings( + self.rotary_dim, + maxseqlen=(seqlen + 2048), + base=self.rotary_theta, + device=query.device, + ) + rope = self.rope[start_pos : start_pos + seqlen] + query, key = apply_rotary_emb( + query, key, rope, interleave=self.rotary_interleave + ) b, h, l, d = key.size() if self.num_kv > 0: @@ -661,7 +823,6 @@

Source code for onmt.modules.multi_headed_attn

# Ultimately flashv2 will be part of pytorch https://github.com/pytorch/pytorch/pull/105602 # In the meantime: if vanilla transformer or Rotary embeddings (not rel_pos, not alibi) # then use flash2 if seq len > 256 otherwise use xtransformer from pt2 upstream - flash2 = ( self.flash2 and l > 256 # https://github.com/Dao-AILab/flash-attention/issues/591 @@ -671,7 +832,9 @@

Source code for onmt.modules.multi_headed_attn

self.max_relative_positions in [-1, 0] and not return_attn and query.device != torch.device("cpu") + and self.self_attn_type == "scaled-dot-flash" ): + # Apply flash2 attention. causal = self.is_decoder and self.attn_type == "self" and mask is not None if self.is_decoder and self.attn_type == "self" and flash2: if causal: @@ -689,10 +852,11 @@

Source code for onmt.modules.multi_headed_attn

window_size=window_size, ).transpose(1, 2) else: + # Apply scaled dot product attention. with torch.backends.cuda.sdp_kernel( enable_flash=False, enable_math=True, enable_mem_efficient=True ): - attn_output = F.scaled_dot_product_attention( + attn_output = scaled_dot_product_attention( query, key, value, @@ -700,18 +864,10 @@

Source code for onmt.modules.multi_headed_attn

self.dropout_p, is_causal=causal, ) - - x = unshape(attn_output) - - attn_output = self.maybe_ckpt(self.final_linear, x) - - if self.parallel_gpu > 1: - dist.all_reduce(attn_output) - - return attn_output, None + attn = None else: - query /= math.sqrt(self.dim_per_head) + query /= sqrt(self.dim_per_head) # batch x num_heads x query_len x key_len scores = torch.matmul(query, key.transpose(2, 3)) @@ -753,6 +909,8 @@

Source code for onmt.modules.multi_headed_attn

scores = self.alibi(scores) scores = scores.float() + if key_pad_mask is not None and mask is None: + mask = key_pad_mask.unsqueeze(1) if mask is not None: # not 100% necessary but expand to nb of heads @@ -764,23 +922,28 @@

Source code for onmt.modules.multi_headed_attn

attn = self.softmax(scores).to(query.dtype) drop_attn = self.dropout(attn) if self.dropout_p > 0 else attn - context_original = torch.matmul(drop_attn, value) + attn_output = torch.matmul(drop_attn, value) if self.relative_positions_embeddings is not None: # We use the same embeddings for key and value relations_values = relations_keys - context_original.add_( - relative_matmul(drop_attn, relations_values, False) - ) + attn_output.add_(relative_matmul(drop_attn, relations_values, False)) - context = unshape(context_original) + context = unshape(attn_output) + if key_pad_mask is not None: + if key_pad_mask.size(0) > 1 and context.size(1) > 1: + x = key_pad_mask.squeeze(1).unsqueeze(2).expand(-1, -1, context.size(2)) + context = context.masked_fill(x, 0) + if self.layer_cache[0]: + attn_output = self.final_linear(context) + else: attn_output = self.maybe_ckpt(self.final_linear, context) - if self.parallel_gpu > 1: - dist.all_reduce(attn_output) + if self.parallel_gpu > 1: + all_reduce(attn_output) - return attn_output, attn

+        return attn_output, attn
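The hunk above pre-allocates the decoder key/value cache in blocks of 32 positions so that `flash_attn_with_kvcache` can write new tokens in place instead of re-concatenating tensors at every step. A minimal, self-contained sketch of that growth pattern (tensor sizes are hypothetical, not the model's):

```python
import torch

# Grow a (batch, heads, seq, dim) cache in 32-position blocks, mirroring the
# zero-padding concatenation used in the patched multi_headed_attn code.
cache = torch.zeros(2, 8, 0, 64)  # hypothetical batch/head/dim sizes
for step in range(70):
    if step >= cache.size(2):  # out of room: append a fresh 32-position block
        pad = torch.zeros(cache.shape[:-2] + (32,) + cache.shape[-1:])
        cache = torch.cat([cache, pad], dim=-2)
    # the real code would now write this step's key/value at position `step`
print(cache.size(2))  # 96 positions allocated for 70 decoded tokens
```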
diff --git a/_modules/onmt/modules/position_ffn.html b/_modules/onmt/modules/position_ffn.html index 2225645171..052c0286e3 100644 --- a/_modules/onmt/modules/position_ffn.html +++ b/_modules/onmt/modules/position_ffn.html @@ -203,17 +203,12 @@

Source code for onmt.modules.position_ffn

 """Position feed-forward network from "Attention is All You Need"."""
 
-
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.utils.checkpoint import checkpoint
-
-try:
-    from apex.normalization import FusedRMSNorm as RMSNorm
-except ImportError:
-    from onmt.modules.rmsnorm import RMSNorm
+from onmt.modules.rmsnorm import RMSNorm
 from torch.nn.utils import skip_init
-import torch.distributed as dist
+from torch.distributed import all_reduce
 
 
 class ActivationFunction(object):
@@ -320,7 +315,7 @@ 

Source code for onmt.modules.position_ffn

             inter = self.dropout_2(inter)
 
         if self.parallel_gpu > 1:
-            dist.all_reduce(inter)
+            all_reduce(inter)
 
         return inter + x
diff --git a/_modules/onmt/trainer.html b/_modules/onmt/trainer.html index d1acd9e1d1..5649d19a71 100644 --- a/_modules/onmt/trainer.html +++ b/_modules/onmt/trainer.html @@ -531,9 +531,10 @@

Source code for onmt.trainer

             )
 
             if valid_iter is not None and step % valid_steps == 0:
-                valid_stats = self.validate(
-                    valid_iter, moving_average=self.moving_average
-                )
+                if self.parallel_mode == "tensor_parallel" or self.gpu_rank <= 0:
+                    valid_stats = self.validate(
+                        valid_iter, moving_average=self.moving_average
+                    )
 
             if step % valid_steps == 0 and self.gpu_rank <= 0:
                 self._report_step(
diff --git a/_modules/onmt/translate/beam_search.html b/_modules/onmt/translate/beam_search.html
index 9ce38f7114..27e3d94266 100644
--- a/_modules/onmt/translate/beam_search.html
+++ b/_modules/onmt/translate/beam_search.html
@@ -384,8 +384,8 @@ 

Source code for onmt.translate.beam_search

 
         return topk_scores, topk_ids
 
-    def beams_non_finished(self, i, predictions, attention, step):
-
+    def beams_non_finished(self, i, topk_scores_list, predictions, attention, step):
+        # using lists instead of tensors for topk_scores and is_finished makes things faster
         if any(self.is_finished_list[i]):
             b = self._batch_offset[i]
             # Store finished hypotheses for this example in the batch.
@@ -393,34 +393,33 @@ 

Source code for onmt.translate.beam_search

                 k for k, fin in enumerate(self.is_finished_list[i]) if fin
             ]:  # Beam level: finished beam j in example i of batch
                 if self.ratio > 0:
-                    s = self.topk_scores[i, j] / (step + 1)
+                    s = topk_scores_list[i][j] / (step + 1)
                     self.best_scores[b] = max(s, self.best_scores[b])
                 self.hypotheses[b].append(
                     (
-                        self.topk_scores[i, j],
+                        topk_scores_list[i][j],
                         predictions[i, j, 1:],  # Ignore start_token.
                         attention[i, j, :, : self.src_len[i]]
                         if attention is not None
                         else None,
                     )
                 )
-                if len(self.hypotheses[b]) >= 2:
-                    self.hypotheses[b] = sorted(
-                        self.hypotheses[b], key=lambda x: x[0], reverse=True
-                    )
 
             # End condition is the top beam finished and we can return
             # n_best hypotheses.
             if self.ratio > 0:
                 pred_len = self.src_len[i] * self.ratio
                 finish_flag = (
-                    (self.topk_scores[i, 0] / pred_len) <= self.best_scores[b]
+                    (topk_scores_list[i][0] / pred_len) <= self.best_scores[b]
                 ) or all(self.is_finished_list[i])
             else:
                 # early stop when top beam is finished
                 finish_flag = self.is_finished_list[i][0]
 
             if finish_flag and len(self.hypotheses[b]) >= self.n_best:
+                self.hypotheses[b] = sorted(
+                    self.hypotheses[b], key=lambda x: x[0], reverse=True
+                )
                 for score, pred, attn in self.hypotheses[b][: self.n_best]:
                     self.scores[b].append(score)
                     self.predictions[b].append(pred)  # ``(batch, n_best,)``
@@ -438,7 +437,7 @@ 

Source code for onmt.translate.beam_search

         # this is required to pursue finished beams in non finished batches
         self.topk_log_probs.masked_fill_(
             torch.tensor(self.is_finished_list, device=self.topk_log_probs.device),
-            -1e10,
+            -65504,
         )
         predictions = self.alive_seq.view(_B_old, self.beam_size, step)
         attention = (
@@ -449,10 +448,13 @@ 

Source code for onmt.translate.beam_search

             else None
         )
 
+        topk_scores_list = self.topk_scores.tolist()
         non_finished_batch = [
             i
             for i in range(len(self.is_finished_list))
-            if self.beams_non_finished(i, predictions, attention, step)
+            if self.beams_non_finished(
+                i, topk_scores_list, predictions, attention, step
+            )
         ]
 
         non_finished = torch.tensor(non_finished_batch)
@@ -468,12 +470,6 @@ 

Source code for onmt.translate.beam_search

 
         # reset the selection for the next step
         self.select_indices = self._batch_index.view(_B_new * self.beam_size)
-        # assert torch.equal(
-        #    self.src_len[self.select_indices],
-        #    self.src_len.view(_B_old, self.beam_size)[non_finished].view(
-        #        _B_new * self.beam_size
-        #    ),
-        # )
         self.src_len = self.src_len[self.select_indices]
         self.maybe_update_target_prefix(self.select_indices)
 
@@ -481,15 +477,10 @@ 

Source code for onmt.translate.beam_search

         self, _B_new, _B_old, non_finished, predictions, attention, step
     ):
         # Remove finished batches for the next step.
-        self._batch_offset = self._batch_offset[non_finished]
-        # here we combine two slections in one
-        # self.topk_log_probs = self.topk_log_probs[non_finished]
-        # self._batch_index = self._batch_index.index_select(0, non_finished)
-        self.topk_log_probs, self._batch_index = torch.unbind(
-            torch.stack([self.topk_log_probs, self._batch_index], dim=2)[non_finished],
-            dim=2,
-        )
-        self._batch_index = self._batch_index.to(torch.long)
+        self._batch_offset = self._batch_offset[non_finished]  # CPU
+        non_finished = non_finished.to(self.topk_log_probs.device)
+        self.topk_log_probs = self.topk_log_probs[non_finished]
+        self._batch_index = self._batch_index[non_finished]
         self.alive_seq = predictions[non_finished].view(-1, self.alive_seq.size(-1))
 
         if self.alive_attn is not None:
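The masking sentinel in this file changes from -1e10 to -65504 because the latter is the most negative finite float16 value, so masked log-probabilities stay representable when decoding in FP16. A quick check, assuming PyTorch is available:

```python
import torch

# -65504 is exactly the lowest finite value of IEEE half precision,
# hence a safe "minus infinity" substitute for float16 log-probabilities.
print(torch.finfo(torch.float16).min)            # -65504.0
print(torch.tensor(-1e10, dtype=torch.float16))  # tensor(-inf, dtype=torch.float16)
```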
diff --git a/_modules/onmt/translate/decode_strategy.html b/_modules/onmt/translate/decode_strategy.html
index dd63884b4c..35ed635914 100644
--- a/_modules/onmt/translate/decode_strategy.html
+++ b/_modules/onmt/translate/decode_strategy.html
@@ -386,18 +386,16 @@ 

Source code for onmt.translate.decode_strategy

def ensure_min_length(self, log_probs): if len(self) <= self.min_length: - log_probs[:, self.eos] = -1e20 + log_probs[:, self.eos] = -65504 # -1e20 def ensure_unk_removed(self, log_probs): if self.ban_unk_token: - log_probs[:, self.unk] = -1e20 + log_probs[:, self.unk] = -65504 # -1e20 def ensure_max_length(self): # add one to account for BOS. Don't account for EOS because hitting # this implies it hasn't been found. if len(self) == self.max_length + 1: - if hasattr(self, "is_finished"): - self.is_finished.fill_(1) self.is_finished_list = [ [True for _ in range(self.parallel_paths)] for _ in range(len(self.is_finished_list)) diff --git a/_modules/onmt/translate/greedy_search.html b/_modules/onmt/translate/greedy_search.html index 57b3679b42..2bc8af22c1 100644 --- a/_modules/onmt/translate/greedy_search.html +++ b/_modules/onmt/translate/greedy_search.html @@ -202,15 +202,14 @@

Source code for onmt.translate.greedy_search

 import torch
-import torch.nn.functional as F
-
+from torch.nn.functional import softmax
 from onmt.translate.decode_strategy import DecodeStrategy
 
 
 def sample_topp(logits, keep_topp):
     sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=1)
 
-    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+    cumulative_probs = torch.cumsum(softmax(sorted_logits, dim=-1), dim=-1)
     sorted_indices_to_keep = cumulative_probs.lt(keep_topp)
 
     # keep indices until overflowing p
@@ -301,6 +300,8 @@ 

Source code for onmt.translate.greedy_search

        eos (int): See base.
         unk (int): See base.
         start (int): See base.
+        n_best (int): Don't stop until at least this many beams have
+            reached EOS.
         batch_size (int): See base.
         global_scorer (onmt.translate.GNMTGlobalScorer): Scorer instance.
         min_length (int): See base.
@@ -326,6 +327,7 @@ 

Source code for onmt.translate.greedy_search

eos,
         unk,
         start,
+        n_best,
         batch_size,
         global_scorer,
         min_length,
@@ -360,6 +362,7 @@ 

Source code for onmt.translate.greedy_search

self.keep_topp = keep_topp
         self.topk_scores = None
         self.beam_size = beam_size
+        self.n_best = n_best
 
 
[docs] def initialize( self, enc_out, src_len, src_map=None, device=None, target_prefix=None @@ -406,9 +409,7 @@

Source code for onmt.translate.greedy_search

return topk_ids, topk_scores
 
     def align_select_indices(self):
-        nb_finished_beams = self.is_finished.view(-1).size(
-            0
-        ) - self.select_indices.size(0)
+        nb_finished_beams = len(self.is_finished_list) - self.select_indices.size(0)
         if nb_finished_beams:
             self.select_indices = torch.arange(
                 self.select_indices.size(0),
@@ -428,8 +429,7 @@ 

Source code for onmt.translate.greedy_search

                to 1.)
             attn (FloatTensor): Shaped ``(1, B, inp_seq_len)``.
         """
-        if hasattr(self, "is_finished"):
-            self.align_select_indices()
+        self.align_select_indices()
 
         self.ensure_min_length(log_probs)
         self.ensure_unk_removed(log_probs)
@@ -438,8 +438,7 @@ 

Source code for onmt.translate.greedy_search

topk_ids, self.topk_scores = self._pick(log_probs)
         self.beams_scores += self.topk_scores
 
-        self.is_finished = topk_ids.eq(self.eos)
-        self.is_finished_list = self.is_finished.tolist()
+        self.is_finished_list = topk_ids.eq(self.eos).tolist()
 
         self.alive_seq = torch.cat([self.alive_seq, topk_ids], -1)
         if self.return_attention:
@@ -452,39 +451,47 @@ 

Source code for onmt.translate.greedy_search

[docs]    def update_finished(self):
         """Finalize scores and predictions."""
         # shape: (sum(~ self.is_finished), 1)
-        finished_batches = self.is_finished.view(-1).nonzero()
         step = len(self)
+        non_finished_batch = [
+            b for b, fin in enumerate(self.is_finished_list) if not fin[0]
+        ]
         length_penalty = self.global_scorer.length_penalty(
             step, alpha=self.global_scorer.alpha
         )
-
-        for b in finished_batches.view(-1):
+        for b in [i for i, fin in enumerate(self.is_finished_list) if fin[0]]:
             b_orig = self.original_batch_idx[b]
             score = self.beams_scores[b, 0] / length_penalty
             pred = self.alive_seq[b, 1:]
             attention = (
-                self.alive_attn[b, :, : self.src_len[b]]
+                self.alive_attn[
+                    b,
+                    :,
+                    : self.src_len[b],
+                ]
                 if self.alive_attn is not None
                 else []
             )
             self.hypotheses[b_orig].append((score, pred, attention))
-        self.done = self.is_finished.all()
+        self.done = len(non_finished_batch) == 0
         if self.done:
             for b in range(self.batch_size):
-                best_hyp = sorted(self.hypotheses[b], key=lambda x: x[0], reverse=True)
+                best_hyp = sorted(self.hypotheses[b], key=lambda x: x[0], reverse=True)[
+                    : self.n_best
+                ]
                 for score, pred, attn in best_hyp:
                     self.scores[b].append(score)
                     self.predictions[b].append(pred)
                     self.attention[b].append(attn)
             return
-        is_alive = ~self.is_finished.view(-1)
-        self.alive_seq = self.alive_seq[is_alive]
-        self.beams_scores = self.beams_scores[is_alive]
-        self.src_len = self.src_len[is_alive]
+        self.select_indices = torch.tensor(
+            non_finished_batch, device=self.alive_seq.device
+        )
+        self.alive_seq = self.alive_seq[self.select_indices]
+        self.beams_scores = self.beams_scores[self.select_indices]
+        self.src_len = self.src_len[self.select_indices]
         if self.alive_attn is not None:
-            self.alive_attn = self.alive_attn[is_alive]
-        self.select_indices = is_alive.nonzero(as_tuple=False).view(-1)
-        self.original_batch_idx = self.original_batch_idx[is_alive]
+            self.alive_attn = self.alive_attn[self.select_indices]
+        self.original_batch_idx = self.original_batch_idx[self.select_indices]
         self.maybe_update_target_prefix(self.select_indices)
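With the `n_best` argument added above, greedy and sampling search now keep only the `n_best` highest-scoring finished hypotheses per example instead of returning them all. A small illustration of that selection (scores and strings are made up):

```python
# (score, prediction, attention) tuples as stored in self.hypotheses[b]
hypotheses = [(-1.2, "hyp a", None), (-0.4, "hyp b", None), (-0.9, "hyp c", None)]
n_best = 2
best = sorted(hypotheses, key=lambda x: x[0], reverse=True)[:n_best]
print([pred for _, pred, _ in best])  # ['hyp b', 'hyp c']
```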
diff --git a/_modules/onmt/translate/translation_server.html b/_modules/onmt/translate/translation_server.html index 0578bff947..bd00440b7e 100644 --- a/_modules/onmt/translate/translation_server.html +++ b/_modules/onmt/translate/translation_server.html @@ -1140,7 +1140,7 @@

Source code for onmt.translate.translation_server

"""De-tokenize the sequence (or not) Same args/returns as :func:``tokenize()``""" - if self.tokenizers_opt is not None and "".join(sequence.split()) != "": + if self.tokenizers_opt is not None and "".join(sequence.split(" ")) != "": return self.detokenize(sequence, side) return sequence
@@ -1153,9 +1153,9 @@

Source code for onmt.translate.translation_server

raise ValueError("No tokenizer loaded") if self.tokenizers_opt[side]["type"] == "sentencepiece": - detok = self.tokenizers[side].DecodePieces(sequence.split()) + detok = self.tokenizers[side].DecodePieces(sequence.split(" ")) elif self.tokenizers_opt[side]["type"] == "pyonmttok": - detok = self.tokenizers[side].detokenize(sequence.split()) + detok = self.tokenizers[side].detokenize(sequence.split(" ")) return detok
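The `split()` to `split(" ")` changes in this file are not cosmetic: `str.split()` with no argument splits on any whitespace (including characters such as non-breaking spaces) and drops empty fields, while `split(" ")` splits only on plain spaces and preserves empty tokens. A short illustration:

```python
seq = "▁Hello ▁world  ▁!"
print(seq.split())     # ['▁Hello', '▁world', '▁!']
print(seq.split(" "))  # ['▁Hello', '▁world', '', '▁!']  -- empty field preserved
```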
@@ -1179,7 +1179,7 @@

Source code for onmt.translate.translation_server

"To get decoded alignment, joiner/spacer " "should be used in both side's tokenizer." ) - elif "".join(tgt.split()) != "": + elif "".join(tgt.split(" ")) != "": align = to_word_align( src, tgt, align, align_scores, src_marker, tgt_marker ) diff --git a/_modules/onmt/translate/translator.html b/_modules/onmt/translate/translator.html index 2bb08cbb1b..28800153f8 100644 --- a/_modules/onmt/translate/translator.html +++ b/_modules/onmt/translate/translator.html @@ -203,18 +203,17 @@

Source code for onmt.translate.translator

 #!/usr/bin/env python
 """ Translator Class and builder """
+import torch
+from torch.nn.functional import log_softmax
+from torch.nn.utils.rnn import pad_sequence
 import codecs
-import os
-import time
-import numpy as np
+from time import time
+from math import exp
 from itertools import count, zip_longest
 from copy import deepcopy
-import torch
-import torch.nn.functional as F
-from torch.nn.utils.rnn import pad_sequence
-from onmt.constants import DefaultTokens
 import onmt.model_builder
 import onmt.decoders.ensemble
+from onmt.constants import DefaultTokens
 from onmt.translate.beam_search import BeamSearch, BeamSearchLM
 from onmt.translate.greedy_search import GreedySearch, GreedySearchLM
 from onmt.utils.misc import tile, set_random_seed, report_matrix
@@ -311,6 +310,7 @@ 

Source code for onmt.translate.translator

         n_best=1,
         min_length=0,
         max_length=100,
+        max_length_ratio=1.5,
         ratio=0.0,
         beam_size=30,
         random_sampling_topk=0,
@@ -336,6 +336,7 @@ 

Source code for onmt.translate.translator

         logger=None,
         seed=-1,
         with_score=False,
+        return_gold_log_probs=False,
     ):
         self.model = model
         self.vocabs = vocabs
@@ -356,6 +357,7 @@ 

Source code for onmt.translate.translator

 
         self.n_best = n_best
         self.max_length = max_length
+        self.max_length_ratio = max_length_ratio
 
         self.beam_size = beam_size
         self.random_sampling_temp = random_sampling_temp
@@ -407,6 +409,8 @@ 

Source code for onmt.translate.translator

         set_random_seed(seed, self._use_cuda)
         self.with_score = with_score
 
+        self.return_gold_log_probs = return_gold_log_probs
+
     @classmethod
     def from_opt(
         cls,
@@ -447,6 +451,7 @@ 

Source code for onmt.translate.translator

             n_best=opt.n_best,
             min_length=opt.min_length,
             max_length=opt.max_length,
+            max_length_ratio=opt.max_length_ratio,
             ratio=opt.ratio,
             beam_size=opt.beam_size,
             random_sampling_topk=opt.random_sampling_topk,
@@ -481,26 +486,17 @@ 

Source code for onmt.translate.translator

             print(msg)
 
     def _gold_score(
-        self,
-        batch,
-        enc_out,
-        src_len,
-        use_src_map,
-        enc_final_hs,
-        batch_size,
-        src,
+        self, batch, enc_out, src_len, use_src_map, enc_final_hs, batch_size, src
     ):
         if "tgt" in batch.keys() and not self.tgt_file_prefix:
-            gs = self._score_target(
-                batch,
-                enc_out,
-                src_len,
-                batch["src_map"] if use_src_map else None,
+            gs, glp = self._score_target(
+                batch, enc_out, src_len, batch["src_map"] if use_src_map else None
             )
             self.model.decoder.init_state(src, enc_out, enc_final_hs)
         else:
             gs = [0] * batch_size
-        return gs
+            glp = None
+        return gs, glp
 
     def _translate(
         self,
@@ -544,7 +540,7 @@ 

Source code for onmt.translate.translator

         all_scores = []
         all_predictions = []
 
-        start_time = time.time()
+        start_time = time()
 
         def _maybe_retranslate(translations, batch):
             """Here we handle the cases of mismatch in number of segments
@@ -656,10 +652,7 @@ 

Source code for onmt.translate.translator

                     srcs = [voc_src[tok] for tok in trans.src[: trans.srclen]]
                     sent_number = next(counter)
                     output = trans.log(sent_number, src_raw=srcs)
-                    if self.logger:
-                        self.logger.info(output)
-                    else:
-                        os.write(1, output.encode("utf-8"))
+                    self._log(output)
 
                 if attn_debug:
                     preds = trans.pred_sents[0]
@@ -672,10 +665,7 @@ 

Source code for onmt.translate.translator

                     else:
                         srcs = [str(item) for item in range(len(attns[0]))]
                     output = report_matrix(srcs, preds, attns)
-                    if self.logger:
-                        self.logger.info(output)
-                    else:
-                        os.write(1, output.encode("utf-8"))
+                    self._log(output)
 
                 if align_debug:
                     if self.gold_align:
@@ -690,10 +680,8 @@ 

Source code for onmt.translate.translator

                     else:
                         srcs = [str(item) for item in range(len(align[0]))]
                     output = report_matrix(srcs, tgts, align)
-                    if self.logger:
-                        self.logger.info(output)
-                    else:
-                        os.write(1, output.encode("utf-8"))
+                    self._log(output)
+
             return (
                 bucket_scores,
                 bucket_predictions,
@@ -761,7 +749,7 @@ 

Source code for onmt.translate.translator

             gold_score_total += bucket_gold_score
             gold_words_total += bucket_gold_words
 
-        end_time = time.time()
+        end_time = time()
 
         if self.report_score:
             msg = self._report_score("PRED", pred_score_total, len(all_scores))
@@ -789,6 +777,38 @@ 

Source code for onmt.translate.translator

 
         return all_scores, all_predictions
 
+    def _score(self, infer_iter):
+        self.with_scores = True
+        score_res = []
+        processed_bucket = {}
+        prev_bucket_idx = 0
+        for batch, bucket_idx in infer_iter:
+            if bucket_idx != prev_bucket_idx:
+                prev_bucket_idx += 1
+                score_res += [item for _, item in sorted(processed_bucket.items())]
+                processed_bucket = {}
+            batch_data = self.translate_batch(batch, attn_debug=False, scoring=True)
+            batch_gold_scores = batch_data["gold_score"].cpu().numpy().tolist()
+            batch_tgt_lengths = batch["tgtlen"].cpu().numpy().tolist()
+            batch_inds_in_bucket = batch["ind_in_bucket"]
+            if self.return_gold_log_probs:
+                batch_gold_log_probs = (
+                    batch_data["gold_log_probs"].cpu().numpy().tolist()
+                )
+            else:
+                batch_gold_log_probs = [
+                    None for i, _ in enumerate(batch_inds_in_bucket)
+                ]
+            for i, ind in enumerate(batch_inds_in_bucket):
+                processed_bucket[ind] = [
+                    batch_gold_scores[i],
+                    batch_gold_log_probs[i],
+                    batch_tgt_lengths[i],
+                ]
+        if processed_bucket:
+            score_res += [item for _, item in sorted(processed_bucket.items())]
+        return score_res
+
     def _align_pad_prediction(self, predictions, bos, pad):
         """
         Padding predictions in batch and add BOS.
@@ -828,7 +848,10 @@ 

Source code for onmt.translate.translator

             msg = "%s No translations" % (name,)
         else:
             score = score_total / nb_sentences
-            ppl = np.exp(-score_total.item() / nb_sentences)
+            try:
+                ppl = exp(-score_total / nb_sentences)
+            except OverflowError:
+                ppl = float("inf")
             msg = "%s SCORE: %.4f, %s PPL: %.2f NB SENTENCES: %d" % (
                 name,
                 score,
@@ -867,7 +890,6 @@ 

Source code for onmt.translate.translator

             step=step,
             return_attn=self.global_scorer.has_cov_pen or return_attn,
         )
-
         # Generator forward.
         if not self.copy_attn:
             if "std" in dec_attn:
@@ -875,7 +897,7 @@ 

Source code for onmt.translate.translator

             else:
                 attn = None
             scores = self.model.generator(dec_out.squeeze(1))
-            log_probs = F.log_softmax(scores.to(torch.float32), dim=-1)
+            log_probs = log_softmax(scores, dim=-1)  # we keep float16 if FP16
             # returns [(batch_size x beam_size) , vocab ] when 1 step
             # or [batch_size, tgt_len, vocab ] when full sentence
         else:
@@ -897,7 +919,6 @@ 

Source code for onmt.translate.translator

                 batch,
                 self._tgt_vocab,
                 batch_dim=0,
-                batch_offset=batch_offset,
             )
             scores = scores.view(-1, decoder_in.size(1), scores.size(-1))
             log_probs = scores.squeeze(1).log()
@@ -915,6 +936,7 @@ 

Source code for onmt.translate.translator

     def report_results(
         self,
         gold_score,
+        gold_log_probs,
         batch,
         batch_size,
         decode_strategy,
@@ -925,6 +947,7 @@ 

Source code for onmt.translate.translator

             "attention": None,
             "batch": batch,
             "gold_score": gold_score,
+            "gold_log_probs": gold_log_probs,
         }
 
         results["scores"] = decode_strategy.scores
@@ -1005,6 +1028,12 @@ 

Source code for onmt.translate.translator

 
 
[docs] def translate_batch(self, batch, attn_debug): """Translate a batch of sentences.""" + if self.max_length_ratio > 0: + max_length = int( + min(self.max_length, batch["src"].size(1) * self.max_length_ratio + 5) + ) + else: + max_length = self.max_length with torch.no_grad(): if self.sample_from_topk != 0 or self.sample_from_topp != 0: decode_strategy = GreedySearch( @@ -1013,10 +1042,11 @@

Source code for onmt.translate.translator

                     eos=self._tgt_eos_idx,
                     unk=self._tgt_unk_idx,
                     start=self._tgt_start_with,
+                    n_best=self.n_best,
                     batch_size=len(batch["srclen"]),
                     global_scorer=self.global_scorer,
                     min_length=self.min_length,
-                    max_length=self.max_length,
+                    max_length=max_length,
                     block_ngram_repeat=self.block_ngram_repeat,
                     exclusion_tokens=self._exclusion_idxs,
                     return_attention=attn_debug or self.replace_unk,
@@ -1040,7 +1070,7 @@ 

Source code for onmt.translate.translator

                     n_best=self.n_best,
                     global_scorer=self.global_scorer,
                     min_length=self.min_length,
-                    max_length=self.max_length,
+                    max_length=max_length,
                     return_attention=attn_debug or self.replace_unk,
                     block_ngram_repeat=self.block_ngram_repeat,
                     exclusion_tokens=self._exclusion_idxs,
@@ -1088,7 +1118,7 @@ 

Source code for onmt.translate.translator

 
         self.model.decoder.init_state(src, enc_out, enc_final_hs)
 
-        gold_score = self._gold_score(
+        gold_score, gold_log_probs = self._gold_score(
             batch,
             enc_out,
             src_len,
@@ -1149,6 +1179,7 @@ 

Source code for onmt.translate.translator

 
         return self.report_results(
             gold_score,
+            gold_log_probs,
             batch,
             batch_size,
             decode_strategy,
@@ -1170,7 +1201,7 @@ 

Source code for onmt.translate.translator

         gold = tgt[:, 1:, :]
         gold_scores = log_probs.gather(2, gold)
         gold_scores = gold_scores.sum(dim=1).view(-1)
-        return gold_scores
+        return gold_scores, None
class GeneratorLM(Inference): @@ -1189,21 +1220,9 @@

Source code for onmt.translate.translator

         """
         raise NotImplementedError
 
-    def translate_batch(self, batch, attn_debug):
+    def translate_batch(self, batch, attn_debug, scoring=False):
         """Translate a batch of sentences."""
-        batch_size = len(batch["srclen"])
-        if batch_size != 1:
-            warning_msg = (
-                "GeneratorLM does not support batch_size != 1"
-                " nicely. You can remove this limitation here."
-                " With batch_size > 1 the end of each input is"
-                " repeated until the input is finished. Then"
-                " generation will start."
-            )
-            if self.logger:
-                self.logger.info(warning_msg)
-            else:
-                os.write(1, warning_msg.encode("utf-8"))
+        max_length = 0 if scoring else self.max_length
         with torch.no_grad():
             if self.sample_from_topk != 0 or self.sample_from_topp != 0:
                 decode_strategy = GreedySearchLM(
@@ -1212,10 +1231,11 @@ 

Source code for onmt.translate.translator

                     eos=self._tgt_eos_idx,
                     unk=self._tgt_unk_idx,
                     start=self._tgt_start_with,
+                    n_best=self.n_best,
                     batch_size=len(batch["srclen"]),
                     global_scorer=self.global_scorer,
                     min_length=self.min_length,
-                    max_length=self.max_length,
+                    max_length=max_length,
                     block_ngram_repeat=self.block_ngram_repeat,
                     exclusion_tokens=self._exclusion_idxs,
                     return_attention=attn_debug or self.replace_unk,
@@ -1239,7 +1259,7 @@ 

Source code for onmt.translate.translator

                     n_best=self.n_best,
                     global_scorer=self.global_scorer,
                     min_length=self.min_length,
-                    max_length=self.max_length,
+                    max_length=max_length,
                     return_attention=attn_debug or self.replace_unk,
                     block_ngram_repeat=self.block_ngram_repeat,
                     exclusion_tokens=self._exclusion_idxs,
@@ -1266,7 +1286,7 @@ 

Source code for onmt.translate.translator

             log_probs = log_probs[:, -1, :]
         return log_probs
 
-    def _translate_batch_with_strategy(self, batch, decode_strategy):
+    def _translate_batch_with_strategy(self, batch, decode_strategy, left_pad=True):
         """Translate a batch of sentences step by step using cache.
 
         Args:
@@ -1286,18 +1306,17 @@ 

Source code for onmt.translate.translator

         src = batch["src"]
         src_len = batch["srclen"]
 
-        src, src_len, target_prefix = self.split_src_to_prevent_padding(src, src_len)
+        if left_pad:
+            target_prefix = None
+        else:
+            src, src_len, target_prefix = self.split_src_to_prevent_padding(
+                src, src_len
+            )
 
         # (2) init decoder
         self.model.decoder.init_state(src, None, None)
-        gold_score = self._gold_score(
-            batch,
-            None,
-            src_len,
-            use_src_map,
-            None,
-            batch_size,
-            src,
+        gold_score, gold_log_probs = self._gold_score(
+            batch, None, src_len, use_src_map, None, batch_size, src
         )
 
         # (3) prep decode_strategy. Possibly repeat src objects.
@@ -1310,18 +1329,18 @@ 

Source code for onmt.translate.translator

         )
 
         # (4) Begin decoding step by step:
+        # beg_time = time()
         for step in range(decode_strategy.max_length):
             decoder_input = (
                 src if step == 0 else decode_strategy.current_predictions.view(-1, 1, 1)
             )
-
             log_probs, attn = self._decode_and_generate(
                 decoder_input,
                 None,
                 batch,
                 src_len=decode_strategy.src_len,
                 src_map=src_map,
-                step=step if step == 0 else step + src_len[0].item(),
+                step=step if step == 0 else step + max(src_len.tolist()),
                 batch_offset=decode_strategy.batch_offset,
             )
 
@@ -1348,9 +1367,12 @@ 

Source code for onmt.translate.translator

             if parallel_paths > 1 or any_finished:
                 # select indexes in model state/cache
                 self.model.decoder.map_state(lambda state, dim: state[select_indices])
+            # if step == 0:
+            #    print("step0 time: ", time() - beg_time)
 
         return self.report_results(
             gold_score,
+            gold_log_probs,
             batch,
             batch_size,
             decode_strategy,
@@ -1370,10 +1392,13 @@ 

Source code for onmt.translate.translator

         )
 
         log_probs[:, :, self._tgt_pad_idx] = 0
-        gold_scores = log_probs.gather(2, tgt)
-        gold_scores = gold_scores.sum(dim=1).view(-1)
+        gold_log_probs = log_probs.gather(2, tgt)
+        gold_scores = gold_log_probs.sum(dim=1).view(-1)
+
+        if self.return_gold_log_probs:
+            return gold_scores, gold_log_probs
 
-        return gold_scores
+        return gold_scores, None
 
diff --git a/_modules/onmt/utils/loss.html b/_modules/onmt/utils/loss.html index e0d38d0784..5b59c0e94a 100644 --- a/_modules/onmt/utils/loss.html +++ b/_modules/onmt/utils/loss.html @@ -531,7 +531,6 @@

Source code for onmt.utils.loss

                 self._unbottle(scores.clone(), len(batch["srclen"])),
                 batch,
                 self.vocab,
-                None,
             )
             scores_data = self._bottle(scores_data)
             # Correct target copy token instead of <unk>
diff --git a/_sources/examples/wmt17/Translation.md.txt b/_sources/examples/wmt17/Translation.md.txt
index 39fd01c1f2..9e79fae0f4 100644
--- a/_sources/examples/wmt17/Translation.md.txt
+++ b/_sources/examples/wmt17/Translation.md.txt
@@ -55,7 +55,6 @@ Training the following big transformer for 50K steps takes less than 10 hours on
 ```bash
 python3 ../../../onmt/bin/build_vocab.py --config wmt17/wmt17_ende.yaml --n_sample -1
 python3 ../../../onmt/bin/train.py --config wmt17/wmt17_ende.yaml
-bash scripts/onmt/train.sh
 ```
 
 Translate test sets with various settings on local GPU and CPUs.
diff --git a/_sources/quickstart.md.txt b/_sources/quickstart.md.txt
index c37fd43633..389a529d93 100644
--- a/_sources/quickstart.md.txt
+++ b/_sources/quickstart.md.txt
@@ -218,6 +218,12 @@ tgt: None
 
 In this second example, we used `max_length: 1` and `src: None` `tgt: None` which is typically the configuration to be used in a scoring script like MMLU where it expects only 1 token as the answer.
 
+
+**WARNING**
+For inhomogeneous batches with many examples, the potentially large number of padding tokens inserted into the shortest examples can degrade results when attention-layer quantization and flash attention are enabled.
+In practice, when `batch_size` is greater than 1 in the inference configuration file,
+remove 'linear_values', 'linear_query', 'linear_keys' and 'final_linear' from `quant_layers` and set `self_attn_type: scaled-dot`.
+
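A minimal sketch of that adjustment, applied to the configuration loaded as a Python dict (the layer names come from the warning above; the exact schema of the inference YAML is an assumption):

```python
# Hedged sketch: drop the attention-layer entries from quant_layers and force
# the non-flash attention path when batch_size > 1, as the warning recommends.
# The dict below stands in for a loaded inference config; field values are assumed.
config = {
    "batch_size": 32,
    "self_attn_type": "scaled-dot-flash",
    "quant_layers": ["linear_values", "linear_query", "linear_keys",
                     "final_linear", "w_1", "w_2"],
}
if config["batch_size"] > 1:
    attn_layers = {"linear_values", "linear_query", "linear_keys", "final_linear"}
    config["quant_layers"] = [l for l in config["quant_layers"] if l not in attn_layers]
    config["self_attn_type"] = "scaled-dot"
print(config["quant_layers"])  # ['w_1', 'w_2']
```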
 You can run this script with the following command line:
 
 ```
diff --git a/examples/wmt17/Translation.html b/examples/wmt17/Translation.html
index 257cfb6727..e10c6e4f16 100644
--- a/examples/wmt17/Translation.html
+++ b/examples/wmt17/Translation.html
@@ -271,7 +271,6 @@ 

Train

Training the following big transformer for 50K steps takes less than 10 hours on a single RTX 4090

python3 ../../../onmt/bin/build_vocab.py --config wmt17/wmt17_ende.yaml --n_sample -1
 python3 ../../../onmt/bin/train.py --config wmt17/wmt17_ende.yaml
-bash scripts/onmt/train.sh
 

Translate test sets with various settings on local GPU and CPUs.

diff --git a/index.html b/index.html index d1a41c7f04..1849dc93cc 100644 --- a/index.html +++ b/index.html @@ -332,24 +332,24 @@

ContentsData
  • Vocab
  • Features
  • -
  • Transform/BART
  • -
  • Transform/Terminology
  • -
  • Transform/FuzzyMatching
  • -
  • Transform/Filter
  • -
  • Transform/Prefix
  • -
  • Transform/Suffix
  • -
  • Transform/InsertMaskBeforePlaceholdersTransform
  • -
  • Transform/Clean
  • -
  • Transform/Uppercase
  • Transform/SwitchOut
  • Transform/Token_Drop
  • Transform/Token_Mask
  • Transform/Docify
  • -
  • Transform/InferFeats
  • +
  • Transform/InsertMaskBeforePlaceholdersTransform
  • +
  • Transform/Uppercase
  • +
  • Transform/FuzzyMatching
  • +
  • Transform/InlineTags
  • +
  • Transform/Clean
  • Transform/Subword/Common
  • Transform/Subword/ONMTTOK
  • -
  • Transform/InlineTags
  • Transform/Normalize
  • +
  • Transform/InferFeats
  • +
  • Transform/Filter
  • +
  • Transform/Prefix
  • +
  • Transform/Suffix
  • +
  • Transform/Terminology
  • +
  • Transform/BART
  • Reproducibility
  • @@ -360,24 +360,24 @@

    ContentsFeatures
  • Pruning
  • Embeddings
  • -
  • Transform/BART
  • -
  • Transform/Terminology
  • -
  • Transform/FuzzyMatching
  • -
  • Transform/Filter
  • -
  • Transform/Prefix
  • -
  • Transform/Suffix
  • -
  • Transform/InsertMaskBeforePlaceholdersTransform
  • -
  • Transform/Clean
  • -
  • Transform/Uppercase
  • Transform/SwitchOut
  • Transform/Token_Drop
  • Transform/Token_Mask
  • Transform/Docify
  • -
  • Transform/InferFeats
  • +
  • Transform/InsertMaskBeforePlaceholdersTransform
  • +
  • Transform/Uppercase
  • +
  • Transform/FuzzyMatching
  • +
  • Transform/InlineTags
  • +
  • Transform/Clean
  • Transform/Subword/Common
  • Transform/Subword/ONMTTOK
  • -
  • Transform/InlineTags
  • Transform/Normalize
  • +
  • Transform/InferFeats
  • +
  • Transform/Filter
  • +
  • Transform/Prefix
  • +
  • Transform/Suffix
  • +
  • Transform/Terminology
  • +
  • Transform/BART
  • Distributed
  • Model-Embeddings
  • Model-Embedding Features
  • @@ -392,7 +392,6 @@

    ContentsOptimization- Type
  • Optimization- Rate
  • Logging
  • -
  • Dynamic data
  • Quant options
  • @@ -409,24 +408,24 @@

    ContentsLogging
  • Distributed
  • Efficiency
  • -
  • Transform/BART
  • -
  • Transform/Terminology
  • -
  • Transform/FuzzyMatching
  • -
  • Transform/Filter
  • -
  • Transform/Prefix
  • -
  • Transform/Suffix
  • -
  • Transform/InsertMaskBeforePlaceholdersTransform
  • -
  • Transform/Clean
  • -
  • Transform/Uppercase
  • Transform/SwitchOut
  • Transform/Token_Drop
  • Transform/Token_Mask
  • Transform/Docify
  • -
  • Transform/InferFeats
  • +
  • Transform/InsertMaskBeforePlaceholdersTransform
  • +
  • Transform/Uppercase
  • +
  • Transform/FuzzyMatching
  • +
  • Transform/InlineTags
  • +
  • Transform/Clean
  • Transform/Subword/Common
  • Transform/Subword/ONMTTOK
  • -
  • Transform/InlineTags
  • Transform/Normalize
  • +
  • Transform/InferFeats
  • +
  • Transform/Filter
  • +
  • Transform/Prefix
  • +
  • Transform/Suffix
  • +
  • Transform/Terminology
  • +
  • Transform/BART
  • Quant options
  • diff --git a/onmt.modules.html b/onmt.modules.html index bb9f0128a2..81ea6bbe13 100644 --- a/onmt.modules.html +++ b/onmt.modules.html @@ -508,7 +508,7 @@

    Encoders
    -class onmt.encoders.TransformerEncoder(num_layers, d_model, heads, d_ff, dropout, attention_dropout, embeddings, max_relative_positions, relative_positions_buckets, pos_ffn_activation_fn='relu', add_qkvbias=False, num_kv=0, add_ffnbias=True, parallel_residual=False, layer_norm='standard', norm_eps=1e-06, use_ckpting=[], parallel_gpu=1)[source]
    +class onmt.encoders.TransformerEncoder(num_layers, d_model, heads, d_ff, dropout, attention_dropout, embeddings, max_relative_positions, relative_positions_buckets, pos_ffn_activation_fn='relu', add_qkvbias=False, num_kv=0, add_ffnbias=True, parallel_residual=False, layer_norm='standard', norm_eps=1e-06, use_ckpting=[], parallel_gpu=1, rotary_interleave=True, rotary_theta=10000.0, rotary_dim=0)[source]

    Bases: EncoderBase

    The Transformer encoder from “Attention is All You Need” [VSP+17]

    @@ -695,7 +695,7 @@

    Decoders
    -class onmt.decoders.TransformerDecoder(num_layers, d_model, heads, d_ff, copy_attn, self_attn_type, dropout, attention_dropout, embeddings, max_relative_positions, relative_positions_buckets, aan_useffn, full_context_alignment, alignment_layer, alignment_heads, pos_ffn_activation_fn='relu', add_qkvbias=False, num_kv=0, add_ffnbias=True, parallel_residual=False, shared_layer_norm=False, layer_norm='standard', norm_eps=1e-06, use_ckpting=[], parallel_gpu=1, sliding_window=0)[source]
    +class onmt.decoders.TransformerDecoder(num_layers, d_model, heads, d_ff, copy_attn, self_attn_type, dropout, attention_dropout, embeddings, max_relative_positions, relative_positions_buckets, aan_useffn, full_context_alignment, alignment_layer, alignment_heads, pos_ffn_activation_fn='relu', add_qkvbias=False, num_kv=0, add_ffnbias=True, parallel_residual=False, shared_layer_norm=False, layer_norm='standard', norm_eps=1e-06, use_ckpting=[], parallel_gpu=1, sliding_window=0, rotary_interleave=True, rotary_theta=10000.0, rotary_dim=0, num_experts=0, num_experts_per_tok=2)[source]

    Bases: TransformerDecoderBase

    The Transformer decoder from “Attention is All You Need”. [VSP+17]

    @@ -707,7 +707,7 @@

    Decodersonmt.modules.Embeddings) – embeddings to use, should have positional encodings

    @@ -720,8 +720,24 @@

    Decoders
    -class onmt.modules.MultiHeadedAttention(head_count: int, model_dim: int, dropout: float = 0.1, is_decoder: bool = True, max_relative_positions: int = 0, relative_positions_buckets: int = 0, attn_type: str | None = None, add_qkvbias=False, num_kv=0, use_ckpting=[], parallel_gpu=1)[source]
    +class onmt.modules.MultiHeadedAttention(head_count: int, model_dim: int, dropout: float = 0.1, is_decoder: bool = True, max_relative_positions: int = 0, relative_positions_buckets: int = 0, rotary_interleave: bool = True, rotary_theta: int = 10000.0, rotary_dim: int = 0, attn_type: str | None = None, self_attn_type: str | None = None, add_qkvbias=False, num_kv=0, use_ckpting=[], parallel_gpu=1)[source]

    Bases: Module

    Multi-Head Attention module from “Attention is All You Need” [VSP+17].

    @@ -1018,7 +1034,7 @@

    Attention
    -forward(key: Tensor, value: Tensor, query: Tensor, mask: Tensor | None = None, sliding_window: int | None = 0, step: int | None = 0, return_attn: bool | None = False) Tuple[Tensor, Tensor][source]
    +forward(key: Tensor, value: Tensor, query: Tensor, mask: Tensor | None = None, sliding_window: int | None = 0, step: int | None = 0, return_attn: bool | None = False, self_attn_type: str | None = None) Tuple[Tensor, Tensor][source]

    Compute the context vector and the attention vectors.

    Parameters:
    @@ -1188,7 +1204,7 @@

    Attention
    forward(input)[source]
    -

    Defines the computation performed at every call.

    +

    Define the computation performed at every call.

    Should be overridden by all subclasses.

    Note

    diff --git a/onmt.translation.html b/onmt.translation.html index d790f7a88b..788cec50ba 100644 --- a/onmt.translation.html +++ b/onmt.translation.html @@ -297,7 +297,7 @@

    Translations

    -class onmt.translate.Translator(model, vocabs, gpu=-1, n_best=1, min_length=0, max_length=100, ratio=0.0, beam_size=30, random_sampling_topk=0, random_sampling_topp=0.0, random_sampling_temp=1.0, stepwise_penalty=None, dump_beam=False, block_ngram_repeat=0, ignore_when_blocking=frozenset({}), replace_unk=False, ban_unk_token=False, tgt_file_prefix=False, phrase_table='', data_type='text', verbose=False, report_time=False, copy_attn=False, global_scorer=None, out_file=None, report_align=False, gold_align=False, report_score=True, logger=None, seed=-1, with_score=False)[source]
    +class onmt.translate.Translator(model, vocabs, gpu=-1, n_best=1, min_length=0, max_length=100, max_length_ratio=1.5, ratio=0.0, beam_size=30, random_sampling_topk=0, random_sampling_topp=0.0, random_sampling_temp=1.0, stepwise_penalty=None, dump_beam=False, block_ngram_repeat=0, ignore_when_blocking=frozenset({}), replace_unk=False, ban_unk_token=False, tgt_file_prefix=False, phrase_table='', data_type='text', verbose=False, report_time=False, copy_attn=False, global_scorer=None, out_file=None, report_align=False, gold_align=False, report_score=True, logger=None, seed=-1, with_score=False, return_gold_log_probs=False)[source]

    Bases: Inference

    @@ -519,7 +519,7 @@

    Decoding Strategies
    -class onmt.translate.GreedySearch(pad, bos, eos, unk, start, batch_size, global_scorer, min_length, block_ngram_repeat, exclusion_tokens, return_attention, max_length, sampling_temp, keep_topk, keep_topp, beam_size, ban_unk_token)[source]
    +class onmt.translate.GreedySearch(pad, bos, eos, unk, start, n_best, batch_size, global_scorer, min_length, block_ngram_repeat, exclusion_tokens, return_attention, max_length, sampling_temp, keep_topk, keep_topp, beam_size, ban_unk_token)[source]

    Bases: DecodeStrategy

    Select next tokens randomly from the top k possible next tokens.

    The scores attribute’s lists are the score, after applying temperature, @@ -533,6 +533,8 @@

    Decoding Strategiesonmt.translate.GNMTGlobalScorer) – Scorer instance.

  • min_length (int) – See base.

  • diff --git a/options/build_vocab.html b/options/build_vocab.html index 0f2b6c4b36..92316d8467 100644 --- a/options/build_vocab.html +++ b/options/build_vocab.html @@ -128,24 +128,24 @@
  • Data
  • Vocab
  • Features
  • -
  • Transform/BART
  • -
  • Transform/Terminology
  • -
  • Transform/FuzzyMatching
  • -
  • Transform/Filter
  • -
  • Transform/Prefix
  • -
  • Transform/Suffix
  • -
  • Transform/InsertMaskBeforePlaceholdersTransform
  • -
  • Transform/Clean
  • -
  • Transform/Uppercase
  • Transform/SwitchOut
  • Transform/Token_Drop
  • Transform/Token_Mask
  • Transform/Docify
  • -
  • Transform/InferFeats
  • +
  • Transform/InsertMaskBeforePlaceholdersTransform
  • +
  • Transform/Uppercase
  • +
  • Transform/FuzzyMatching
  • +
  • Transform/InlineTags
  • +
  • Transform/Clean
  • Transform/Subword/Common
  • Transform/Subword/ONMTTOK
  • -
  • Transform/InlineTags
  • Transform/Normalize
  • +
  • Transform/InferFeats
  • +
  • Transform/Filter
  • +
  • Transform/Prefix
  • +
  • Transform/Suffix
  • +
  • Transform/Terminology
  • +
  • Transform/BART
  • Reproducibility
  • @@ -237,7 +237,7 @@

    Build Vocab
    usage: build_vocab.py [-h] [-config CONFIG] [-save_config SAVE_CONFIG] -data
                           DATA [-skip_empty_level {silent,warning,error}]
    -                      [-transforms {bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} [{bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} ...]]
    +                      [-transforms {switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} [{switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} ...]]
                           -save_data SAVE_DATA [-overwrite] [-n_sample N_SAMPLE]
                           [-dump_samples] [-num_threads NUM_THREADS]
                           [-learn_subwords]
    @@ -249,22 +249,12 @@ 

    Build Vocab[--default_specials DEFAULT_SPECIALS [DEFAULT_SPECIALS ...]] [-n_src_feats N_SRC_FEATS] [-src_feats_defaults SRC_FEATS_DEFAULTS] - [--permute_sent_ratio PERMUTE_SENT_RATIO] - [--rotate_ratio ROTATE_RATIO] - [--insert_ratio INSERT_RATIO] - [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO] - [--mask_length {subword,word,span-poisson}] - [--poisson_lambda POISSON_LAMBDA] - [--replace_length {-1,0,1}] - [--termbase_path TERMBASE_PATH] - [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL] - [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL] - [--term_corpus_ratio TERM_CORPUS_RATIO] - [--term_example_ratio TERM_EXAMPLE_RATIO] - [--src_term_stoken SRC_TERM_STOKEN] - [--tgt_term_stoken TGT_TERM_STOKEN] - [--tgt_term_etoken TGT_TERM_ETOKEN] - [--term_source_delimiter TERM_SOURCE_DELIMITER] + [-switchout_temperature SWITCHOUT_TEMPERATURE] + [-tokendrop_temperature TOKENDROP_TEMPERATURE] + [-tokenmask_temperature TOKENMASK_TEMPERATURE] + [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT] + [--response_pattern RESPONSE_PATTERN] + [--upper_corpus_ratio UPPER_CORPUS_RATIO] [--tm_path TM_PATH] [--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO] [--fuzzy_threshold FUZZY_THRESHOLD] @@ -272,23 +262,18 @@

    Build Vocab[--fuzzy_token FUZZY_TOKEN] [--fuzzymatch_min_length FUZZYMATCH_MIN_LENGTH] [--fuzzymatch_max_length FUZZYMATCH_MAX_LENGTH] - [--src_seq_length SRC_SEQ_LENGTH] - [--tgt_seq_length TGT_SEQ_LENGTH] - [--src_prefix SRC_PREFIX] [--tgt_prefix TGT_PREFIX] - [--src_suffix SRC_SUFFIX] [--tgt_suffix TGT_SUFFIX] - [--response_pattern RESPONSE_PATTERN] [--src_eq_tgt] + [--tags_dictionary_path TAGS_DICTIONARY_PATH] + [--tags_corpus_ratio TAGS_CORPUS_RATIO] + [--max_tags MAX_TAGS] [--paired_stag PAIRED_STAG] + [--paired_etag PAIRED_ETAG] + [--isolated_tag ISOLATED_TAG] + [--src_delimiter SRC_DELIMITER] [--src_eq_tgt] [--same_char] [--same_word] [--scripts_ok [SCRIPTS_OK [SCRIPTS_OK ...]]] [--scripts_nok [SCRIPTS_NOK [SCRIPTS_NOK ...]]] [--src_tgt_ratio SRC_TGT_RATIO] [--avg_tok_min AVG_TOK_MIN] [--avg_tok_max AVG_TOK_MAX] [--langid [LANGID [LANGID ...]]] - [--upper_corpus_ratio UPPER_CORPUS_RATIO] - [-switchout_temperature SWITCHOUT_TEMPERATURE] - [-tokendrop_temperature TOKENDROP_TEMPERATURE] - [-tokenmask_temperature TOKENMASK_TEMPERATURE] - [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT] - [--reversible_tokenization {joiner,spacer}] [-src_subword_model SRC_SUBWORD_MODEL] [-tgt_subword_model TGT_SUBWORD_MODEL] [-src_subword_nbest SRC_SUBWORD_NBEST] @@ -303,18 +288,32 @@

    Build Vocab[-tgt_subword_type {none,sentencepiece,bpe}] [-src_onmttok_kwargs SRC_ONMTTOK_KWARGS] [-tgt_onmttok_kwargs TGT_ONMTTOK_KWARGS] [--gpt2_pretok] - [--tags_dictionary_path TAGS_DICTIONARY_PATH] - [--tags_corpus_ratio TAGS_CORPUS_RATIO] - [--max_tags MAX_TAGS] [--paired_stag PAIRED_STAG] - [--paired_etag PAIRED_ETAG] - [--isolated_tag ISOLATED_TAG] - [--src_delimiter SRC_DELIMITER] [--src_lang SRC_LANG] - [--tgt_lang TGT_LANG] [--penn PENN] - [--norm_quote_commas NORM_QUOTE_COMMAS] + [--src_lang SRC_LANG] [--tgt_lang TGT_LANG] + [--penn PENN] [--norm_quote_commas NORM_QUOTE_COMMAS] [--norm_numbers NORM_NUMBERS] [--pre_replace_unicode_punct PRE_REPLACE_UNICODE_PUNCT] [--post_remove_control_chars POST_REMOVE_CONTROL_CHARS] - [--seed SEED] + [--reversible_tokenization {joiner,spacer}] + [--src_seq_length SRC_SEQ_LENGTH] + [--tgt_seq_length TGT_SEQ_LENGTH] + [--src_prefix SRC_PREFIX] [--tgt_prefix TGT_PREFIX] + [--src_suffix SRC_SUFFIX] [--tgt_suffix TGT_SUFFIX] + [--termbase_path TERMBASE_PATH] + [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL] + [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL] + [--term_corpus_ratio TERM_CORPUS_RATIO] + [--term_example_ratio TERM_EXAMPLE_RATIO] + [--src_term_stoken SRC_TERM_STOKEN] + [--tgt_term_stoken TGT_TERM_STOKEN] + [--tgt_term_etoken TGT_TERM_ETOKEN] + [--term_source_delimiter TERM_SOURCE_DELIMITER] + [--permute_sent_ratio PERMUTE_SENT_RATIO] + [--rotate_ratio ROTATE_RATIO] + [--insert_ratio INSERT_RATIO] + [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO] + [--mask_length {subword,word,span-poisson}] + [--poisson_lambda POISSON_LAMBDA] + [--replace_length {-1,0,1}] [--seed SEED]

    @@ -340,7 +339,7 @@

Data

    Default: “warning”

    -transforms, --transforms
    -

    Possible choices: bart, terminology, fuzzymatch, filtertoolong, prefix, suffix, insert_mask_before_placeholder, clean, uppercase, switchout, tokendrop, tokenmask, docify, inferfeats, sentencepiece, bpe, onmt_tokenize, inlinetags, normalize

    +

    Possible choices: switchout, tokendrop, tokenmask, docify, insert_mask_before_placeholder, uppercase, fuzzymatch, inlinetags, clean, sentencepiece, bpe, onmt_tokenize, normalize, inferfeats, filtertoolong, prefix, suffix, terminology, bart

    Default transform pipeline to apply to data. Can be specified in each corpus of data to override.

    Default: []
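As an illustration (the corpus names and paths below are hypothetical, not taken from this diff), a YAML data configuration might set a default pipeline and override it per corpus:

```yaml
# Sketch of a data config: a global default pipeline plus a per-corpus override.
transforms: [onmt_tokenize, filtertoolong]
data:
    corpus_1:
        path_src: data/src-train.txt
        path_tgt: data/tgt-train.txt
        transforms: [onmt_tokenize]   # overrides the default pipeline for this corpus
```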

    @@ -412,84 +411,65 @@

    Features -

    Transform/BART

    +
    +

    Transform/SwitchOut

    Caution

    This transform will not take effect when building vocabulary.

    -
    --permute_sent_ratio, -permute_sent_ratio
    -

    Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.

    -

    Default: 0.0

    -
    -
    --rotate_ratio, -rotate_ratio
    -

    Rotate this proportion of inputs.

    -

    Default: 0.0

    -
    -
    --insert_ratio, -insert_ratio
    -

    Insert this percentage of additional random tokens.

    -

    Default: 0.0

    -
    -
    --random_ratio, -random_ratio
    -

    Instead of using <mask>, use random token this often.

    -

    Default: 0.0

    -
    -
    --mask_ratio, -mask_ratio
    -

    Fraction of words/subwords that will be masked.

    -

    Default: 0.0

    -
    -
    --mask_length, -mask_length
    -

    Possible choices: subword, word, span-poisson

    -

    Length of masking window to apply.

    -

    Default: “subword”

    -
    -
    --poisson_lambda, -poisson_lambda
    -

    Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.

    -

    Default: 3.0

    -
    -
    --replace_length, -replace_length
    -

    Possible choices: -1, 0, 1

    -

    When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)

    -

    Default: -1

    +
    -switchout_temperature, --switchout_temperature
    +

    Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.

    +

    Default: 1.0

    -
    -

    Transform/Terminology

    +
    +

    Transform/Token_Drop

    -
    --termbase_path, -termbase_path
    -

    Path to a dictionary file with terms.

    -
    -
    --src_spacy_language_model, -src_spacy_language_model
    -

    Name of the spacy language model for the source corpus.

    -
    -
    --tgt_spacy_language_model, -tgt_spacy_language_model
    -

    Name of the spacy language model for the target corpus.

    -
    -
    --term_corpus_ratio, -term_corpus_ratio
    -

    Ratio of corpus to augment with terms.

    -

    Default: 0.3

    +
    -tokendrop_temperature, --tokendrop_temperature
    +

    Sampling temperature for token deletion.

    +

    Default: 1.0

    -
    --term_example_ratio, -term_example_ratio
    -

    Max terms allowed in an example.

    -

    Default: 0.2

    +
    +
    +
    +

    Transform/Token_Mask

    +
    +
    -tokenmask_temperature, --tokenmask_temperature
    +

    Sampling temperature for token masking.

    +

    Default: 1.0

    -
    --src_term_stoken, -src_term_stoken
    -

    The source term start token.

    -

    Default: “⦅src_term_start⦆”

    +
    +
    +
    +

    Transform/Docify

    +
    +
    --doc_length, -doc_length
    +

    Number of tokens per doc.

    +

    Default: 200

    -
    --tgt_term_stoken, -tgt_term_stoken
    -

    The target term start token.

    -

    Default: “⦅tgt_term_start⦆”

    +
    --max_context, -max_context
    +

    Max context segments.

    +

    Default: 1

    -
    --tgt_term_etoken, -tgt_term_etoken
    -

    The target term end token.

    -

    Default: “⦅tgt_term_end⦆”

    +
    +
    +
    +

    Transform/InsertMaskBeforePlaceholdersTransform

    +
    +
    --response_pattern, -response_pattern
    +

Response pattern to locate the end of the prompt

    +

    Default: “Response : ⦅newline⦆”

    -
    --term_source_delimiter, -term_source_delimiter
    -

    Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    -

    Default: “⦅fuzzy⦆”

    +
    +
    +
    +

    Transform/Uppercase

    +
    +
    --upper_corpus_ratio, -upper_corpus_ratio
    +

    Corpus ratio to apply uppercasing.

    +

    Default: 0.01

    @@ -525,51 +505,35 @@

    Transform/FuzzyMatching

    -
    -

    Transform/Filter

    +
    +

    Transform/InlineTags

    -
    --src_seq_length, -src_seq_length
    -

    Maximum source sequence length.

    -

    Default: 192

    +
    --tags_dictionary_path, -tags_dictionary_path
    +

    Path to a flat term dictionary.

    -
    --tgt_seq_length, -tgt_seq_length
    -

    Maximum target sequence length.

    -

    Default: 192

    +
    --tags_corpus_ratio, -tags_corpus_ratio
    +

    Ratio of corpus to augment with tags.

    +

    Default: 0.1

    -
    -
    -
    -

    Transform/Prefix

    -
    -
    --src_prefix, -src_prefix
    -

    String to prepend to all source example.

    -

    Default: “”

    +
    --max_tags, -max_tags
    +

    Maximum number of tags that can be added to a single sentence.

    +

    Default: 12

    -
    --tgt_prefix, -tgt_prefix
    -

    String to prepend to all target example.

    -

    Default: “”

    +
    --paired_stag, -paired_stag
    +

    The format of an opening paired inline tag. Must include the character #.

    +

    Default: “⦅ph_#_beg⦆”

    -
    -
    -
    -

    Transform/Suffix

    -
    -
    --src_suffix, -src_suffix
    -

    String to append to all source example.

    -

    Default: “”

    +
    --paired_etag, -paired_etag
    +

    The format of a closing paired inline tag. Must include the character #.

    +

    Default: “⦅ph_#_end⦆”

    -
    --tgt_suffix, -tgt_suffix
    -

    String to append to all target example.

    -

    Default: “”

    +
    --isolated_tag, -isolated_tag
    +

    The format of an isolated inline tag. Must include the character #.

    +

    Default: “⦅ph_#_std⦆”

    -
    -
    -
    -

    Transform/InsertMaskBeforePlaceholdersTransform

    -
    -
    --response_pattern, -response_pattern
    -

    Response patten to locate the end of the prompt

    -

    Default: “Response : ⦅newline⦆”

    +
    --src_delimiter, -src_delimiter
    +

    Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    +

    Default: “⦅fuzzy⦆”

    @@ -614,69 +578,6 @@

    Transform/Clean -

    Transform/Uppercase

    -
    -
    --upper_corpus_ratio, -upper_corpus_ratio
    -

    Corpus ratio to apply uppercasing.

    -

    Default: 0.01

    -
    -
    -

    -
    -

    Transform/SwitchOut

    -
    -

    Caution

    -

    This transform will not take effect when building vocabulary.

    -
    -
    -
    -switchout_temperature, --switchout_temperature
    -

    Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Token_Drop

    -
    -
    -tokendrop_temperature, --tokendrop_temperature
    -

    Sampling temperature for token deletion.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Token_Mask

    -
    -
    -tokenmask_temperature, --tokenmask_temperature
    -

    Sampling temperature for token masking.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Docify

    -
    -
    --doc_length, -doc_length
    -

    Number of tokens per doc.

    -

    Default: 200

    -
    -
    --max_context, -max_context
    -

    Max context segments.

    -

    Default: 1

    -
    -
    -
    -
    -

    Transform/InferFeats

    -
    -
    --reversible_tokenization, -reversible_tokenization
    -

    Possible choices: joiner, spacer

    -

    Type of reversible tokenization applied on the tokenizer.

    -

    Default: “joiner”

    -
    -
    -

    Transform/Subword/Common

    @@ -751,38 +652,6 @@

    Transform/Subword/ONMTTOK -

    Transform/InlineTags

    -
    -
    --tags_dictionary_path, -tags_dictionary_path
    -

    Path to a flat term dictionary.

    -
    -
    --tags_corpus_ratio, -tags_corpus_ratio
    -

    Ratio of corpus to augment with tags.

    -

    Default: 0.1

    -
    -
    --max_tags, -max_tags
    -

    Maximum number of tags that can be added to a single sentence.

    -

    Default: 12

    -
    -
    --paired_stag, -paired_stag
    -

    The format of an opening paired inline tag. Must include the character #.

    -

    Default: “⦅ph_#_beg⦆”

    -
    -
    --paired_etag, -paired_etag
    -

    The format of a closing paired inline tag. Must include the character #.

    -

    Default: “⦅ph_#_end⦆”

    -
    -
    --isolated_tag, -isolated_tag
    -

    The format of an isolated inline tag. Must include the character #.

    -

    Default: “⦅ph_#_std⦆”

    -
    -
    --src_delimiter, -src_delimiter
    -

    Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    -

    Default: “⦅fuzzy⦆”

    -
    -
    -

    Transform/Normalize

    @@ -816,6 +685,136 @@

    Transform/Normalize +

    Transform/InferFeats

    +
    +
    --reversible_tokenization, -reversible_tokenization
    +

    Possible choices: joiner, spacer

    +

    Type of reversible tokenization applied on the tokenizer.

    +

    Default: “joiner”

    +
    +
    +

    +
    +

    Transform/Filter

    +
    +
    --src_seq_length, -src_seq_length
    +

    Maximum source sequence length.

    +

    Default: 192

    +
    +
    --tgt_seq_length, -tgt_seq_length
    +

    Maximum target sequence length.

    +

    Default: 192

    +
    +
    +
    +
    +

    Transform/Prefix

    +
    +
    --src_prefix, -src_prefix
    +

String to prepend to all source examples.

    +

    Default: “”

    +
    +
    --tgt_prefix, -tgt_prefix
    +

String to prepend to all target examples.

    +

    Default: “”

    +
    +
    +
    +
    +

    Transform/Suffix

    +
    +
    --src_suffix, -src_suffix
    +

String to append to all source examples.

    +

    Default: “”

    +
    +
    --tgt_suffix, -tgt_suffix
    +

String to append to all target examples.

    +

    Default: “”

    +
    +
    +
    +
    +

    Transform/Terminology

    +
    +
    --termbase_path, -termbase_path
    +

    Path to a dictionary file with terms.

    +
    +
    --src_spacy_language_model, -src_spacy_language_model
    +

    Name of the spacy language model for the source corpus.

    +
    +
    --tgt_spacy_language_model, -tgt_spacy_language_model
    +

    Name of the spacy language model for the target corpus.

    +
    +
    --term_corpus_ratio, -term_corpus_ratio
    +

    Ratio of corpus to augment with terms.

    +

    Default: 0.3

    +
    +
    --term_example_ratio, -term_example_ratio
    +

    Max terms allowed in an example.

    +

    Default: 0.2

    +
    +
    --src_term_stoken, -src_term_stoken
    +

    The source term start token.

    +

    Default: “⦅src_term_start⦆”

    +
    +
    --tgt_term_stoken, -tgt_term_stoken
    +

    The target term start token.

    +

    Default: “⦅tgt_term_start⦆”

    +
    +
    --tgt_term_etoken, -tgt_term_etoken
    +

    The target term end token.

    +

    Default: “⦅tgt_term_end⦆”

    +
    +
    --term_source_delimiter, -term_source_delimiter
    +

    Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    +

    Default: “⦅fuzzy⦆”

    +
    +
    +
    +
    +

    Transform/BART

    +
    +

    Caution

    +

    This transform will not take effect when building vocabulary.

    +
    +
    +
    --permute_sent_ratio, -permute_sent_ratio
    +

    Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.

    +

    Default: 0.0

    +
    +
    --rotate_ratio, -rotate_ratio
    +

    Rotate this proportion of inputs.

    +

    Default: 0.0

    +
    +
    --insert_ratio, -insert_ratio
    +

    Insert this percentage of additional random tokens.

    +

    Default: 0.0

    +
    +
    --random_ratio, -random_ratio
    +

    Instead of using <mask>, use random token this often.

    +

    Default: 0.0

    +
    +
    --mask_ratio, -mask_ratio
    +

    Fraction of words/subwords that will be masked.

    +

    Default: 0.0

    +
    +
    --mask_length, -mask_length
    +

    Possible choices: subword, word, span-poisson

    +

    Length of masking window to apply.

    +

    Default: “subword”

    +
    +
    --poisson_lambda, -poisson_lambda
    +

    Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.

    +

    Default: 3.0

    +
    +
    --replace_length, -replace_length
    +

    Possible choices: -1, 0, 1

    +

    When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)

    +

    Default: -1

    +
    +
    +

    Reproducibility

    diff --git a/options/server.html b/options/server.html index ba7f9f9b21..e3fb56241a 100644 --- a/options/server.html +++ b/options/server.html @@ -213,7 +213,7 @@

    Server

    OpenNMT-py REST Server

    usage: server.py [-h] [--ip IP] [--port PORT] [--url_root URL_ROOT] [--debug]
    -                 [--config CONFIG]
    +                 [--model_config MODEL_CONFIG]
     
    @@ -231,7 +231,7 @@

    Named ArgumentsFeatures
  • Pruning
  • Embeddings
  • -
  • Transform/BART
  • -
  • Transform/Terminology
  • -
  • Transform/FuzzyMatching
  • -
  • Transform/Filter
  • -
  • Transform/Prefix
  • -
  • Transform/Suffix
  • -
  • Transform/InsertMaskBeforePlaceholdersTransform
  • -
  • Transform/Clean
  • -
  • Transform/Uppercase
  • Transform/SwitchOut
  • Transform/Token_Drop
  • Transform/Token_Mask
  • Transform/Docify
  • -
  • Transform/InferFeats
  • +
  • Transform/InsertMaskBeforePlaceholdersTransform
  • +
  • Transform/Uppercase
  • +
  • Transform/FuzzyMatching
  • +
  • Transform/InlineTags
  • +
  • Transform/Clean
  • Transform/Subword/Common
  • Transform/Subword/ONMTTOK
  • -
  • Transform/InlineTags
  • Transform/Normalize
  • +
  • Transform/InferFeats
  • +
  • Transform/Filter
  • +
  • Transform/Prefix
  • +
  • Transform/Suffix
  • +
  • Transform/Terminology
  • +
  • Transform/BART
  • Distributed
  • Model-Embeddings
  • Model-Embedding Features
  • @@ -163,7 +163,6 @@
  • Optimization- Type
  • Optimization- Rate
  • Logging
  • -
  • Dynamic data
  • Quant options
  • @@ -254,7 +253,7 @@

    Train

    usage: train.py [-h] [-config CONFIG] [-save_config SAVE_CONFIG] -data DATA
                     [-skip_empty_level {silent,warning,error}]
    -                [-transforms {bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} [{bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} ...]]
    +                [-transforms {switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} [{switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} ...]]
                     [-save_data SAVE_DATA] [-overwrite] [-n_sample N_SAMPLE]
                     [-dump_transforms] -src_vocab SRC_VOCAB [-tgt_vocab TGT_VOCAB]
                     [-share_vocab] [--decoder_start_token DECODER_START_TOKEN]
    @@ -272,41 +271,26 @@ 

    Train [-src_embeddings SRC_EMBEDDINGS] [-tgt_embeddings TGT_EMBEDDINGS] [-embeddings_type {GloVe,word2vec}] - [--permute_sent_ratio PERMUTE_SENT_RATIO] - [--rotate_ratio ROTATE_RATIO] [--insert_ratio INSERT_RATIO] - [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO] - [--mask_length {subword,word,span-poisson}] - [--poisson_lambda POISSON_LAMBDA] [--replace_length {-1,0,1}] - [--termbase_path TERMBASE_PATH] - [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL] - [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL] - [--term_corpus_ratio TERM_CORPUS_RATIO] - [--term_example_ratio TERM_EXAMPLE_RATIO] - [--src_term_stoken SRC_TERM_STOKEN] - [--tgt_term_stoken TGT_TERM_STOKEN] - [--tgt_term_etoken TGT_TERM_ETOKEN] - [--term_source_delimiter TERM_SOURCE_DELIMITER] - [--tm_path TM_PATH] [--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO] + [-switchout_temperature SWITCHOUT_TEMPERATURE] + [-tokendrop_temperature TOKENDROP_TEMPERATURE] + [-tokenmask_temperature TOKENMASK_TEMPERATURE] + [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT] + [--response_pattern RESPONSE_PATTERN] + [--upper_corpus_ratio UPPER_CORPUS_RATIO] [--tm_path TM_PATH] + [--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO] [--fuzzy_threshold FUZZY_THRESHOLD] [--tm_delimiter TM_DELIMITER] [--fuzzy_token FUZZY_TOKEN] [--fuzzymatch_min_length FUZZYMATCH_MIN_LENGTH] [--fuzzymatch_max_length FUZZYMATCH_MAX_LENGTH] - [--src_seq_length SRC_SEQ_LENGTH] - [--tgt_seq_length TGT_SEQ_LENGTH] [--src_prefix SRC_PREFIX] - [--tgt_prefix TGT_PREFIX] [--src_suffix SRC_SUFFIX] - [--tgt_suffix TGT_SUFFIX] - [--response_pattern RESPONSE_PATTERN] [--src_eq_tgt] - [--same_char] [--same_word] + [--tags_dictionary_path TAGS_DICTIONARY_PATH] + [--tags_corpus_ratio TAGS_CORPUS_RATIO] [--max_tags MAX_TAGS] + [--paired_stag PAIRED_STAG] [--paired_etag PAIRED_ETAG] + [--isolated_tag ISOLATED_TAG] [--src_delimiter SRC_DELIMITER] + [--src_eq_tgt] [--same_char] [--same_word] [--scripts_ok [SCRIPTS_OK [SCRIPTS_OK ...]]] [--scripts_nok [SCRIPTS_NOK [SCRIPTS_NOK ...]]] [--src_tgt_ratio SRC_TGT_RATIO] [--avg_tok_min AVG_TOK_MIN] [--avg_tok_max AVG_TOK_MAX] [--langid [LANGID [LANGID ...]]] - [--upper_corpus_ratio UPPER_CORPUS_RATIO] - [-switchout_temperature SWITCHOUT_TEMPERATURE] - [-tokendrop_temperature TOKENDROP_TEMPERATURE] - [-tokenmask_temperature TOKENMASK_TEMPERATURE] - [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT] - [--reversible_tokenization {joiner,spacer}] [-src_subword_model SRC_SUBWORD_MODEL] [-tgt_subword_model TGT_SUBWORD_MODEL] [-src_subword_nbest SRC_SUBWORD_NBEST] @@ -321,22 +305,36 @@

    Train [-tgt_subword_type {none,sentencepiece,bpe}] [-src_onmttok_kwargs SRC_ONMTTOK_KWARGS] [-tgt_onmttok_kwargs TGT_ONMTTOK_KWARGS] [--gpt2_pretok] - [--tags_dictionary_path TAGS_DICTIONARY_PATH] - [--tags_corpus_ratio TAGS_CORPUS_RATIO] [--max_tags MAX_TAGS] - [--paired_stag PAIRED_STAG] [--paired_etag PAIRED_ETAG] - [--isolated_tag ISOLATED_TAG] [--src_delimiter SRC_DELIMITER] [--src_lang SRC_LANG] [--tgt_lang TGT_LANG] [--penn PENN] [--norm_quote_commas NORM_QUOTE_COMMAS] [--norm_numbers NORM_NUMBERS] [--pre_replace_unicode_punct PRE_REPLACE_UNICODE_PUNCT] [--post_remove_control_chars POST_REMOVE_CONTROL_CHARS] + [--reversible_tokenization {joiner,spacer}] + [--src_seq_length SRC_SEQ_LENGTH] + [--tgt_seq_length TGT_SEQ_LENGTH] [--src_prefix SRC_PREFIX] + [--tgt_prefix TGT_PREFIX] [--src_suffix SRC_SUFFIX] + [--tgt_suffix TGT_SUFFIX] [--termbase_path TERMBASE_PATH] + [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL] + [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL] + [--term_corpus_ratio TERM_CORPUS_RATIO] + [--term_example_ratio TERM_EXAMPLE_RATIO] + [--src_term_stoken SRC_TERM_STOKEN] + [--tgt_term_stoken TGT_TERM_STOKEN] + [--tgt_term_etoken TGT_TERM_ETOKEN] + [--term_source_delimiter TERM_SOURCE_DELIMITER] + [--permute_sent_ratio PERMUTE_SENT_RATIO] + [--rotate_ratio ROTATE_RATIO] [--insert_ratio INSERT_RATIO] + [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO] + [--mask_length {subword,word,span-poisson}] + [--poisson_lambda POISSON_LAMBDA] [--replace_length {-1,0,1}] [--gpu_ranks [GPU_RANKS [GPU_RANKS ...]]] [--world_size WORLD_SIZE] [--parallel_mode {tensor_parallel,data_parallel}] [--gpu_backend GPU_BACKEND] [--gpu_verbose_level GPU_VERBOSE_LEVEL] [--master_ip MASTER_IP] [--master_port MASTER_PORT] - [--src_word_vec_size SRC_WORD_VEC_SIZE] + [--timeout TIMEOUT] [--src_word_vec_size SRC_WORD_VEC_SIZE] [--tgt_word_vec_size TGT_WORD_VEC_SIZE] [--word_vec_size WORD_VEC_SIZE] [--share_decoder_embeddings] [--share_embeddings] [--position_encoding] @@ -365,8 +363,11 @@

    Train [--self_attn_type SELF_ATTN_TYPE] [--max_relative_positions MAX_RELATIVE_POSITIONS] [--relative_positions_buckets RELATIVE_POSITIONS_BUCKETS] - [--heads HEADS] [--sliding_window SLIDING_WINDOW] - [--transformer_ff TRANSFORMER_FF] [--aan_useffn] + [--rotary_interleave] [--rotary_theta ROTARY_THETA] + [--rotary_dim ROTARY_DIM] [--heads HEADS] + [--sliding_window SLIDING_WINDOW] + [--transformer_ff TRANSFORMER_FF] [--num_experts NUM_EXPERTS] + [--num_experts_per_tok NUM_EXPERTS_PER_TOK] [--aan_useffn] [--add_qkvbias] [--multiquery] [--num_kv NUM_KV] [--add_ffnbias] [--parallel_residual] [--shared_layer_norm] [--lambda_align LAMBDA_ALIGN] @@ -381,7 +382,10 @@

    Train [--lm_prior_tau LM_PRIOR_TAU] [--loss_scale LOSS_SCALE] [--apex_opt_level {,O0,O1,O2,O3}] [--zero_out_prompt_loss] [--use_ckpting {ffn,mha,lora} [{ffn,mha,lora} ...]] - [--data_type DATA_TYPE] [--save_model SAVE_MODEL] + [--data_type DATA_TYPE] [-bucket_size BUCKET_SIZE] + [-bucket_size_init BUCKET_SIZE_INIT] + [-bucket_size_increment BUCKET_SIZE_INCREMENT] + [-prefetch_factor PREFETCH_FACTOR] [--save_model SAVE_MODEL] [--save_format {pytorch,safetensors}] [--save_checkpoint_steps SAVE_CHECKPOINT_STEPS] [--keep_checkpoint KEEP_CHECKPOINT] @@ -428,12 +432,9 @@

    Train [--report_every REPORT_EVERY] [--exp_host EXP_HOST] [--exp EXP] [--tensorboard] [--tensorboard_log_dir TENSORBOARD_LOG_DIR] [--override_opts] - [-bucket_size BUCKET_SIZE] - [-bucket_size_init BUCKET_SIZE_INIT] - [-bucket_size_increment BUCKET_SIZE_INCREMENT] - [-prefetch_factor PREFETCH_FACTOR] [--quant_layers QUANT_LAYERS [QUANT_LAYERS ...]] - [--quant_type {bnb_8bit,bnb_FP4,bnb_NF4}] + [--quant_type {,bnb_8bit,bnb_FP4,bnb_NF4,awq_gemm,awq_gemv}] + [--w_bit {4}] [--group_size {128}]

    @@ -459,7 +460,7 @@

Data

    Default: “warning”

    -transforms, --transforms
    -

    Possible choices: bart, terminology, fuzzymatch, filtertoolong, prefix, suffix, insert_mask_before_placeholder, clean, uppercase, switchout, tokendrop, tokenmask, docify, inferfeats, sentencepiece, bpe, onmt_tokenize, inlinetags, normalize

    +

    Possible choices: switchout, tokendrop, tokenmask, docify, insert_mask_before_placeholder, uppercase, fuzzymatch, inlinetags, clean, sentencepiece, bpe, onmt_tokenize, normalize, inferfeats, filtertoolong, prefix, suffix, terminology, bart

    Default transform pipeline to apply to data. Can be specified in each corpus of data to override.

    Default: []

    @@ -564,80 +565,61 @@

    Embeddings -

    Transform/BART

    +
    +

    Transform/SwitchOut

    -
    --permute_sent_ratio, -permute_sent_ratio
    -

    Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.

    -

    Default: 0.0

    -
    -
    --rotate_ratio, -rotate_ratio
    -

    Rotate this proportion of inputs.

    -

    Default: 0.0

    -
    -
    --insert_ratio, -insert_ratio
    -

    Insert this percentage of additional random tokens.

    -

    Default: 0.0

    -
    -
    --random_ratio, -random_ratio
    -

    Instead of using <mask>, use random token this often.

    -

    Default: 0.0

    -
    -
    --mask_ratio, -mask_ratio
    -

    Fraction of words/subwords that will be masked.

    -

    Default: 0.0

    -
    -
    --mask_length, -mask_length
    -

    Possible choices: subword, word, span-poisson

    -

    Length of masking window to apply.

    -

    Default: “subword”

    -
    -
    --poisson_lambda, -poisson_lambda
    -

    Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.

    -

    Default: 3.0

    -
    -
    --replace_length, -replace_length
    -

    Possible choices: -1, 0, 1

    -

    When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)

    -

    Default: -1

    +
    -switchout_temperature, --switchout_temperature
    +

    Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.

    +

    Default: 1.0

    -
    -

    Transform/Terminology

    +
    +

    Transform/Token_Drop

    -
    --termbase_path, -termbase_path
    -

    Path to a dictionary file with terms.

    -
    -
    --src_spacy_language_model, -src_spacy_language_model
    -

    Name of the spacy language model for the source corpus.

    -
    -
    --tgt_spacy_language_model, -tgt_spacy_language_model
    -

    Name of the spacy language model for the target corpus.

    -
    -
    --term_corpus_ratio, -term_corpus_ratio
    -

    Ratio of corpus to augment with terms.

    -

    Default: 0.3

    +
    -tokendrop_temperature, --tokendrop_temperature
    +

    Sampling temperature for token deletion.

    +

    Default: 1.0

    -
    --term_example_ratio, -term_example_ratio
    -

    Max terms allowed in an example.

    -

    Default: 0.2

    +
    +
    +
    +

    Transform/Token_Mask

    +
    +
    -tokenmask_temperature, --tokenmask_temperature
    +

    Sampling temperature for token masking.

    +

    Default: 1.0

    -
    --src_term_stoken, -src_term_stoken
    -

    The source term start token.

    -

    Default: “⦅src_term_start⦆”

    +
    +
    +
    +

    Transform/Docify

    +
    +
    --doc_length, -doc_length
    +

    Number of tokens per doc.

    +

    Default: 200

    -
    --tgt_term_stoken, -tgt_term_stoken
    -

    The target term start token.

    -

    Default: “⦅tgt_term_start⦆”

    +
    --max_context, -max_context
    +

    Max context segments.

    +

    Default: 1

    -
    --tgt_term_etoken, -tgt_term_etoken
    -

    The target term end token.

    -

    Default: “⦅tgt_term_end⦆”

    +
    +
    +
    +

    Transform/InsertMaskBeforePlaceholdersTransform

    +
    +
    --response_pattern, -response_pattern
    +

Response pattern to locate the end of the prompt

    +

    Default: “Response : ⦅newline⦆”

    -
    --term_source_delimiter, -term_source_delimiter
    -

    Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    -

    Default: “⦅fuzzy⦆”

    +
    +
    +
    +

    Transform/Uppercase

    +
    +
    --upper_corpus_ratio, -upper_corpus_ratio
    +

    Corpus ratio to apply uppercasing.

    +

    Default: 0.01

    @@ -673,51 +655,35 @@

    Transform/FuzzyMatching

    -
    -

    Transform/Filter

    +
    +

    Transform/InlineTags

    -
    --src_seq_length, -src_seq_length
    -

    Maximum source sequence length.

    -

    Default: 192

    +
    --tags_dictionary_path, -tags_dictionary_path
    +

    Path to a flat term dictionary.

    -
    --tgt_seq_length, -tgt_seq_length
    -

    Maximum target sequence length.

    -

    Default: 192

    +
    --tags_corpus_ratio, -tags_corpus_ratio
    +

    Ratio of corpus to augment with tags.

    +

    Default: 0.1

    -
    -
    -
    -

    Transform/Prefix

    -
    -
    --src_prefix, -src_prefix
    -

    String to prepend to all source example.

    -

    Default: “”

    +
    --max_tags, -max_tags
    +

    Maximum number of tags that can be added to a single sentence.

    +

    Default: 12

    -
    --tgt_prefix, -tgt_prefix
    -

    String to prepend to all target example.

    -

    Default: “”

    +
    --paired_stag, -paired_stag
    +

    The format of an opening paired inline tag. Must include the character #.

    +

    Default: “⦅ph_#_beg⦆”

    -
    -
    -
    -

    Transform/Suffix

    -
    -
    --src_suffix, -src_suffix
    -

    String to append to all source example.

    -

    Default: “”

    +
    --paired_etag, -paired_etag
    +

    The format of a closing paired inline tag. Must include the character #.

    +

    Default: “⦅ph_#_end⦆”

    -
    --tgt_suffix, -tgt_suffix
    -

    String to append to all target example.

    -

    Default: “”

    +
    --isolated_tag, -isolated_tag
    +

    The format of an isolated inline tag. Must include the character #.

    +

    Default: “⦅ph_#_std⦆”

    -
    -
    -
    -

    Transform/InsertMaskBeforePlaceholdersTransform

    -
    -
    --response_pattern, -response_pattern
    -

    Response patten to locate the end of the prompt

    -

    Default: “Response : ⦅newline⦆”

    +
    --src_delimiter, -src_delimiter
    +

    Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    +

    Default: “⦅fuzzy⦆”

    @@ -748,76 +714,17 @@

    Transform/Clean -

    Transform/Uppercase

    -
    -
    --upper_corpus_ratio, -upper_corpus_ratio
    -

    Corpus ratio to apply uppercasing.

    -

    Default: 0.01

    -
    -
    -

    -
    -

    Transform/SwitchOut

    -
    -
    -switchout_temperature, --switchout_temperature
    -

    Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Token_Drop

    -
    -
    -tokendrop_temperature, --tokendrop_temperature
    -

    Sampling temperature for token deletion.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Token_Mask

    -
    -
    -tokenmask_temperature, --tokenmask_temperature
    -

    Sampling temperature for token masking.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Docify

    -
    -
    --doc_length, -doc_length
    -

    Number of tokens per doc.

    -

    Default: 200

    -
    -
    --max_context, -max_context
    -

    Max context segments.

    -

    Default: 1

    +
    --avg_tok_min, -avg_tok_min
    +

Average length of tokens (minimum)

    +

    Default: 3

    -
    -
    -
    -

    Transform/InferFeats

    -
    -
    --reversible_tokenization, -reversible_tokenization
    -

    Possible choices: joiner, spacer

    -

    Type of reversible tokenization applied on the tokenizer.

    -

    Default: “joiner”

    +
    --avg_tok_max, -avg_tok_max
    +

Average length of tokens (maximum)

    +

    Default: 20

    +
    +
    --langid, -langid
    +

List of accepted languages

    +

    Default: []
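As a hedged sketch (the threshold values below are illustrative only, not recommendations), these filter options of the clean transform map onto config keys such as:

```yaml
# Illustrative 'clean' transform settings; values are examples only.
transforms: [clean]
src_tgt_ratio: 2
avg_tok_min: 3
avg_tok_max: 20
langid: [en, de]
```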

    @@ -895,38 +802,6 @@

    Transform/Subword/ONMTTOK -

    Transform/InlineTags

    -
    -
    --tags_dictionary_path, -tags_dictionary_path
    -

    Path to a flat term dictionary.

    -
    -
    --tags_corpus_ratio, -tags_corpus_ratio
    -

    Ratio of corpus to augment with tags.

    -

    Default: 0.1

    -
    -
    --max_tags, -max_tags
    -

    Maximum number of tags that can be added to a single sentence.

    -

    Default: 12

    -
    -
    --paired_stag, -paired_stag
    -

    The format of an opening paired inline tag. Must include the character #.

    -

    Default: “⦅ph_#_beg⦆”

    -
    -
    --paired_etag, -paired_etag
    -

    The format of a closing paired inline tag. Must include the character #.

    -

    Default: “⦅ph_#_end⦆”

    -
    -
    --isolated_tag, -isolated_tag
    -

    The format of an isolated inline tag. Must include the character #.

    -

    Default: “⦅ph_#_std⦆”

    -
    -
    --src_delimiter, -src_delimiter
    -

    Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    -

    Default: “⦅fuzzy⦆”

    -
    -
    -

    Transform/Normalize

    @@ -960,6 +835,132 @@

    Transform/Normalize +

    Transform/InferFeats

    +
    +
    --reversible_tokenization, -reversible_tokenization
    +

    Possible choices: joiner, spacer

    +

    Type of reversible tokenization applied on the tokenizer.

    +

    Default: “joiner”

    +
    +
    +

    +
    +

    Transform/Filter

    +
    +
    --src_seq_length, -src_seq_length
    +

    Maximum source sequence length.

    +

    Default: 192

    +
    +
    --tgt_seq_length, -tgt_seq_length
    +

    Maximum target sequence length.

    +

    Default: 192

    +
    +
    +
    +
    +

    Transform/Prefix

    +
    +
    --src_prefix, -src_prefix
    +

String to prepend to all source examples.

    +

    Default: “”

    +
    +
    --tgt_prefix, -tgt_prefix
    +

String to prepend to all target examples.

    +

    Default: “”

    +
    +
    +
    +
    +

    Transform/Suffix

    +
    +
    --src_suffix, -src_suffix
    +

String to append to all source examples.

    +

    Default: “”

    +
    +
    --tgt_suffix, -tgt_suffix
    +

String to append to all target examples.

    +

    Default: “”

    +
    +
    +
    +
    +

    Transform/Terminology

    +
    +
    --termbase_path, -termbase_path
    +

    Path to a dictionary file with terms.

    +
    +
    --src_spacy_language_model, -src_spacy_language_model
    +

    Name of the spacy language model for the source corpus.

    +
    +
    --tgt_spacy_language_model, -tgt_spacy_language_model
    +

    Name of the spacy language model for the target corpus.

    +
    +
    --term_corpus_ratio, -term_corpus_ratio
    +

    Ratio of corpus to augment with terms.

    +

    Default: 0.3

    +
    +
    --term_example_ratio, -term_example_ratio
    +

    Max terms allowed in an example.

    +

    Default: 0.2

    +
    +
    --src_term_stoken, -src_term_stoken
    +

    The source term start token.

    +

    Default: “⦅src_term_start⦆”

    +
    +
    --tgt_term_stoken, -tgt_term_stoken
    +

    The target term start token.

    +

    Default: “⦅tgt_term_start⦆”

    +
    +
    --tgt_term_etoken, -tgt_term_etoken
    +

    The target term end token.

    +

    Default: “⦅tgt_term_end⦆”

    +
    +
    --term_source_delimiter, -term_source_delimiter
    +

    Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    +

    Default: “⦅fuzzy⦆”

    +
    +
    +
    +
    +

    Transform/BART

    +
    +
    --permute_sent_ratio, -permute_sent_ratio
    +

    Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.

    +

    Default: 0.0

    +
    +
    --rotate_ratio, -rotate_ratio
    +

    Rotate this proportion of inputs.

    +

    Default: 0.0

    +
    +
    --insert_ratio, -insert_ratio
    +

    Insert this percentage of additional random tokens.

    +

    Default: 0.0

    +
    +
    --random_ratio, -random_ratio
    +

    Instead of using <mask>, use random token this often.

    +

    Default: 0.0

    +
    +
    --mask_ratio, -mask_ratio
    +

    Fraction of words/subwords that will be masked.

    +

    Default: 0.0

    +
    +
    --mask_length, -mask_length
    +

    Possible choices: subword, word, span-poisson

    +

    Length of masking window to apply.

    +

    Default: “subword”

    +
    +
    --poisson_lambda, -poisson_lambda
    +

    Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.

    +

    Default: 3.0

    +
    +
    --replace_length, -replace_length
    +

    Possible choices: -1, 0, 1

    +

    When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)

    +

    Default: -1

    +
    +
    +

    Distributed

    @@ -992,6 +993,10 @@

    Distributed @@ -1191,8 +1196,8 @@

    Model- Attentionhttps://github.com/google-research/text-to-text-transfer-transformer

    Default: 0

    +
    --rotary_interleave, -rotary_interleave
    +

Interleave the head dimensions when rotary embeddings are applied. Otherwise the head dimensions are sliced in half. True = default Llama from Meta (original); False = used by all Hugging Face models

    +

    Default: False

    +
    +
    --rotary_theta, -rotary_theta
    +

Rotary theta base length. 1e4 for Llama2/Mistral, 1e6 for Mixtral

    +

    Default: 10000

    +
    +
    --rotary_dim, -rotary_dim
    +

Rotary dim when the model requires it to be different from the head dim

    +

    Default: 0

    +
    --heads, -heads

    Number of heads for transformer self-attention

    Default: 8
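For illustration only, the rotary options above could appear in a training config roughly as follows; the values simply mirror the documented defaults, and enabling rotary embeddings via max_relative_positions: -1 is an assumption of this sketch:

```yaml
# Sketch of rotary-attention settings; values mirror the option descriptions above.
max_relative_positions: -1   # assumption: this is how rotary embeddings are selected
rotary_interleave: true      # original Meta Llama head layout
rotary_theta: 10000          # 1e4 for Llama2/Mistral, 1e6 for Mixtral
rotary_dim: 0                # 0 = use the head dimension
heads: 8
```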

    @@ -1214,6 +1231,14 @@

    Model- Attention +
    A bucket is a buffer of bucket_size examples to pick

from the various Corpora. The dynamic iterator batches batch_size batches from the bucket and shuffles them.

    +
    + +

    Default: 262144

    +

    +
    -bucket_size_init, --bucket_size_init
    +
    +
The bucket is initialized with this

    amount of examples (optional)

    +
    +
    +

    Default: -1

    +
    +
    -bucket_size_increment, --bucket_size_increment
    +
    +
    The bucket size is incremented with this

    amount of examples (optional)

    +
    +
    +

    Default: 0

    +
    +
    -prefetch_factor, --prefetch_factor
    +
    +
    number of mini-batches loaded in advance to avoid the

    GPU waiting during the refilling of the bucket.

    +
    +
    +

    Default: 200

    +
    --save_model, -save_model

Model filename (the model will be saved as <save_model>_N.pt where N is the number of steps)

    Default: “model”
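A minimal sketch of the bucketing options listed above in a training config; the values shown are just the documented defaults, not tuning advice:

```yaml
# Dynamic iterator buffering; values are the documented defaults.
bucket_size: 262144
bucket_size_init: -1          # optional warm-up size; -1 leaves it unused
bucket_size_increment: 0      # grow the bucket by this many examples (optional)
prefetch_factor: 200          # mini-batches prepared ahead of the GPU
```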

    @@ -1614,40 +1668,6 @@

    Logging -

    Dynamic data

    -
    -
    -bucket_size, --bucket_size
    -
    -
    A bucket is a buffer of bucket_size examples to pick

from the various Corpora. The dynamic iterator batches batch_size batches from the bucket and shuffles them.

    -
    -
    -

    Default: 262144

    -
    -
    -bucket_size_init, --bucket_size_init
    -
    -
The bucket is initialized with this

    amount of examples (optional)

    -
    -
    -

    Default: -1

    -
    -
    -bucket_size_increment, --bucket_size_increment
    -
    -
    The bucket size is incremented with this

    amount of examples (optional)

    -
    -
    -

    Default: 0

    -
    -
    -prefetch_factor, --prefetch_factor
    -
    -
    number of mini-batches loaded in advance to avoid the

    GPU waiting during the refilling of the bucket.

    -
    -
    -

    Default: 200

    -
    -
    -

    Quant options

    @@ -1656,9 +1676,19 @@

    Quant optionsLogging
  • Distributed
  • Efficiency
  • -
  • Transform/BART
  • -
  • Transform/Terminology
  • -
  • Transform/FuzzyMatching
  • -
  • Transform/Filter
  • -
  • Transform/Prefix
  • -
  • Transform/Suffix
  • -
  • Transform/InsertMaskBeforePlaceholdersTransform
  • -
  • Transform/Clean
  • -
  • Transform/Uppercase
  • Transform/SwitchOut
  • Transform/Token_Drop
  • Transform/Token_Mask
  • Transform/Docify
  • -
  • Transform/InferFeats
  • +
  • Transform/InsertMaskBeforePlaceholdersTransform
  • +
  • Transform/Uppercase
  • +
  • Transform/FuzzyMatching
  • +
  • Transform/InlineTags
  • +
  • Transform/Clean
  • Transform/Subword/Common
  • Transform/Subword/ONMTTOK
  • -
  • Transform/InlineTags
  • Transform/Normalize
  • +
  • Transform/InferFeats
  • +
  • Transform/Filter
  • +
  • Transform/Prefix
  • +
  • Transform/Suffix
  • +
  • Transform/Terminology
  • +
  • Transform/BART
  • Quant options
  • @@ -245,10 +245,11 @@

    Translate
    usage: translate.py [-h] [-config CONFIG] [-save_config SAVE_CONFIG] --model
                         MODEL [MODEL ...] [--precision {,fp32,fp16,int8}] [--fp32]
    -                    [--int8] [--avg_raw_probs] [--data_type DATA_TYPE] --src
    -                    SRC [--tgt TGT] [--tgt_file_prefix] [--output OUTPUT]
    -                    [--report_align] [--gold_align] [--report_time]
    -                    [--profile] [-n_src_feats N_SRC_FEATS]
    +                    [--int8] [--avg_raw_probs]
    +                    [--self_attn_type SELF_ATTN_TYPE] [--data_type DATA_TYPE]
    +                    --src SRC [--tgt TGT] [--tgt_file_prefix]
    +                    [--output OUTPUT] [--report_align] [--gold_align]
    +                    [--report_time] [--profile] [-n_src_feats N_SRC_FEATS]
                         [-src_feats_defaults SRC_FEATS_DEFAULTS]
                         [--beam_size BEAM_SIZE] [--ratio RATIO]
                         [--random_sampling_topk RANDOM_SAMPLING_TOPK]
    @@ -258,6 +259,7 @@ 

    Translate[--alpha ALPHA] [--coverage_penalty {none,wu,summary}] [--beta BETA] [--stepwise_penalty] [--min_length MIN_LENGTH] [--max_length MAX_LENGTH] + [--max_length_ratio MAX_LENGTH_RATIO] [--block_ngram_repeat BLOCK_NGRAM_REPEAT] [--ignore_when_blocking IGNORE_WHEN_BLOCKING [IGNORE_WHEN_BLOCKING ...]] [--replace_unk] [--ban_unk_token] @@ -271,48 +273,32 @@

    Translate[--gpu_backend GPU_BACKEND] [--gpu_verbose_level GPU_VERBOSE_LEVEL] [--master_ip MASTER_IP] [--master_port MASTER_PORT] - [--batch_size BATCH_SIZE] [--batch_type {sents,tokens}] - [--gpu GPU] - [-transforms {bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} [{bart,terminology,fuzzymatch,filtertoolong,prefix,suffix,insert_mask_before_placeholder,clean,uppercase,switchout,tokendrop,tokenmask,docify,inferfeats,sentencepiece,bpe,onmt_tokenize,inlinetags,normalize} ...]] - [--permute_sent_ratio PERMUTE_SENT_RATIO] - [--rotate_ratio ROTATE_RATIO] - [--insert_ratio INSERT_RATIO] - [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO] - [--mask_length {subword,word,span-poisson}] - [--poisson_lambda POISSON_LAMBDA] - [--replace_length {-1,0,1}] - [--termbase_path TERMBASE_PATH] - [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL] - [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL] - [--term_corpus_ratio TERM_CORPUS_RATIO] - [--term_example_ratio TERM_EXAMPLE_RATIO] - [--src_term_stoken SRC_TERM_STOKEN] - [--tgt_term_stoken TGT_TERM_STOKEN] - [--tgt_term_etoken TGT_TERM_ETOKEN] - [--term_source_delimiter TERM_SOURCE_DELIMITER] + [--timeout TIMEOUT] [--batch_size BATCH_SIZE] + [--batch_type {sents,tokens}] [--gpu GPU] + [-transforms {switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} [{switchout,tokendrop,tokenmask,docify,insert_mask_before_placeholder,uppercase,fuzzymatch,inlinetags,clean,sentencepiece,bpe,onmt_tokenize,normalize,inferfeats,filtertoolong,prefix,suffix,terminology,bart} ...]] + [-switchout_temperature SWITCHOUT_TEMPERATURE] + [-tokendrop_temperature TOKENDROP_TEMPERATURE] + [-tokenmask_temperature TOKENMASK_TEMPERATURE] + [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT] + [--response_pattern RESPONSE_PATTERN] + [--upper_corpus_ratio UPPER_CORPUS_RATIO] [--tm_path TM_PATH] [--fuzzy_corpus_ratio FUZZY_CORPUS_RATIO] [--fuzzy_threshold FUZZY_THRESHOLD] [--tm_delimiter TM_DELIMITER] [--fuzzy_token FUZZY_TOKEN] [--fuzzymatch_min_length FUZZYMATCH_MIN_LENGTH] [--fuzzymatch_max_length FUZZYMATCH_MAX_LENGTH] - [--src_seq_length SRC_SEQ_LENGTH] - [--tgt_seq_length TGT_SEQ_LENGTH] - [--src_prefix SRC_PREFIX] [--tgt_prefix TGT_PREFIX] - [--src_suffix SRC_SUFFIX] [--tgt_suffix TGT_SUFFIX] - [--response_pattern RESPONSE_PATTERN] [--src_eq_tgt] + [--tags_dictionary_path TAGS_DICTIONARY_PATH] + [--tags_corpus_ratio TAGS_CORPUS_RATIO] + [--max_tags MAX_TAGS] [--paired_stag PAIRED_STAG] + [--paired_etag PAIRED_ETAG] [--isolated_tag ISOLATED_TAG] + [--src_delimiter SRC_DELIMITER] [--src_eq_tgt] [--same_char] [--same_word] [--scripts_ok [SCRIPTS_OK [SCRIPTS_OK ...]]] [--scripts_nok [SCRIPTS_NOK [SCRIPTS_NOK ...]]] [--src_tgt_ratio SRC_TGT_RATIO] [--avg_tok_min AVG_TOK_MIN] [--avg_tok_max AVG_TOK_MAX] [--langid [LANGID [LANGID ...]]] - [--upper_corpus_ratio UPPER_CORPUS_RATIO] - [-switchout_temperature SWITCHOUT_TEMPERATURE] - [-tokendrop_temperature TOKENDROP_TEMPERATURE] - [-tokenmask_temperature TOKENMASK_TEMPERATURE] - [--doc_length DOC_LENGTH] [--max_context MAX_CONTEXT] - [--reversible_tokenization {joiner,spacer}] [-src_subword_model SRC_SUBWORD_MODEL] [-tgt_subword_model TGT_SUBWORD_MODEL] [-src_subword_nbest SRC_SUBWORD_NBEST] @@ -327,18 +313,35 @@

    Translate[-tgt_subword_type {none,sentencepiece,bpe}] [-src_onmttok_kwargs SRC_ONMTTOK_KWARGS] [-tgt_onmttok_kwargs TGT_ONMTTOK_KWARGS] [--gpt2_pretok] - [--tags_dictionary_path TAGS_DICTIONARY_PATH] - [--tags_corpus_ratio TAGS_CORPUS_RATIO] - [--max_tags MAX_TAGS] [--paired_stag PAIRED_STAG] - [--paired_etag PAIRED_ETAG] [--isolated_tag ISOLATED_TAG] - [--src_delimiter SRC_DELIMITER] [--src_lang SRC_LANG] - [--tgt_lang TGT_LANG] [--penn PENN] + [--src_lang SRC_LANG] [--tgt_lang TGT_LANG] [--penn PENN] [--norm_quote_commas NORM_QUOTE_COMMAS] [--norm_numbers NORM_NUMBERS] [--pre_replace_unicode_punct PRE_REPLACE_UNICODE_PUNCT] [--post_remove_control_chars POST_REMOVE_CONTROL_CHARS] + [--reversible_tokenization {joiner,spacer}] + [--src_seq_length SRC_SEQ_LENGTH] + [--tgt_seq_length TGT_SEQ_LENGTH] + [--src_prefix SRC_PREFIX] [--tgt_prefix TGT_PREFIX] + [--src_suffix SRC_SUFFIX] [--tgt_suffix TGT_SUFFIX] + [--termbase_path TERMBASE_PATH] + [--src_spacy_language_model SRC_SPACY_LANGUAGE_MODEL] + [--tgt_spacy_language_model TGT_SPACY_LANGUAGE_MODEL] + [--term_corpus_ratio TERM_CORPUS_RATIO] + [--term_example_ratio TERM_EXAMPLE_RATIO] + [--src_term_stoken SRC_TERM_STOKEN] + [--tgt_term_stoken TGT_TERM_STOKEN] + [--tgt_term_etoken TGT_TERM_ETOKEN] + [--term_source_delimiter TERM_SOURCE_DELIMITER] + [--permute_sent_ratio PERMUTE_SENT_RATIO] + [--rotate_ratio ROTATE_RATIO] + [--insert_ratio INSERT_RATIO] + [--random_ratio RANDOM_RATIO] [--mask_ratio MASK_RATIO] + [--mask_length {subword,word,span-poisson}] + [--poisson_lambda POISSON_LAMBDA] + [--replace_length {-1,0,1}] [--quant_layers QUANT_LAYERS [QUANT_LAYERS ...]] - [--quant_type {bnb_8bit,bnb_FP4,bnb_NF4}] + [--quant_type {,bnb_8bit,bnb_FP4,bnb_NF4,awq_gemm,awq_gemv}] + [--w_bit {4}] [--group_size {128}]

    @@ -374,6 +377,10 @@

    Model

    If this is set, during ensembling scores from different models will be combined by averaging their raw probabilities and then taking the log. Otherwise, the log probabilities will be averaged directly. Necessary for models whose output layers can assign zero probability.

    Default: False

    +
    --self_attn_type, -self_attn_type
    +

    Self attention type in Transformer decoder layer – currently “scaled-dot”, “scaled-dot-flash” or “average”

    +

    Default: “scaled-dot-flash”

    +

    @@ -516,6 +523,10 @@

    Decoding tricks @@ -626,86 +641,67 @@

    Efficiency -

    Transform/BART

    +
    +

    Transform/SwitchOut

    -
    --permute_sent_ratio, -permute_sent_ratio
    -

    Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.

    -

    Default: 0.0

    -
    -
    --rotate_ratio, -rotate_ratio
    -

    Rotate this proportion of inputs.

    -

    Default: 0.0

    -
    -
    --insert_ratio, -insert_ratio
    -

    Insert this percentage of additional random tokens.

    -

    Default: 0.0

    -
    -
    --random_ratio, -random_ratio
    -

    Instead of using <mask>, use random token this often.

    -

    Default: 0.0

    -
    -
    --mask_ratio, -mask_ratio
    -

    Fraction of words/subwords that will be masked.

    -

    Default: 0.0

    -
    -
    --mask_length, -mask_length
    -

    Possible choices: subword, word, span-poisson

    -

    Length of masking window to apply.

    -

    Default: “subword”

    -
    -
    --poisson_lambda, -poisson_lambda
    -

    Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.

    -

    Default: 3.0

    -
    -
    --replace_length, -replace_length
    -

    Possible choices: -1, 0, 1

    -

    When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)

    -

    Default: -1

    +
    -switchout_temperature, --switchout_temperature
    +

    Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.

    +

    Default: 1.0

    -
    -

    Transform/Terminology

    +
    +

    Transform/Token_Drop

    -
    --termbase_path, -termbase_path
    -

    Path to a dictionary file with terms.

    -
    -
    --src_spacy_language_model, -src_spacy_language_model
    -

    Name of the spacy language model for the source corpus.

    -
    -
    --tgt_spacy_language_model, -tgt_spacy_language_model
    -

    Name of the spacy language model for the target corpus.

    -
    -
    --term_corpus_ratio, -term_corpus_ratio
    -

    Ratio of corpus to augment with terms.

    -

    Default: 0.3

    +
    -tokendrop_temperature, --tokendrop_temperature
    +

    Sampling temperature for token deletion.

    +

    Default: 1.0

    -
    --term_example_ratio, -term_example_ratio
    -

    Max terms allowed in an example.

    -

    Default: 0.2

    +
    +
    +
    +

    Transform/Token_Mask

    +
    +
    -tokenmask_temperature, --tokenmask_temperature
    +

    Sampling temperature for token masking.

    +

    Default: 1.0

    -
    --src_term_stoken, -src_term_stoken
    -

    The source term start token.

    -

    Default: “⦅src_term_start⦆”

    +
    +
    +
    +

    Transform/Docify

    +
    +
    --doc_length, -doc_length
    +

    Number of tokens per doc.

    +

    Default: 200

    -
    --tgt_term_stoken, -tgt_term_stoken
    -

    The target term start token.

    -

    Default: “⦅tgt_term_start⦆”

    +
    --max_context, -max_context
    +

    Max context segments.

    +

    Default: 1

    -
    --tgt_term_etoken, -tgt_term_etoken
    -

    The target term end token.

    -

    Default: “⦅tgt_term_end⦆”

    +
    +
    +
    +

    Transform/InsertMaskBeforePlaceholdersTransform

    +
    +
    --response_pattern, -response_pattern
    +

Response pattern to locate the end of the prompt

    +

    Default: “Response : ⦅newline⦆”

    -
    --term_source_delimiter, -term_source_delimiter
    -

    Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    -

    Default: “⦅fuzzy⦆”

    +
    +
    +
    +

    Transform/Uppercase

    +
    +
    --upper_corpus_ratio, -upper_corpus_ratio
    +

    Corpus ratio to apply uppercasing.

    +

    Default: 0.01

    @@ -741,51 +737,35 @@

    Transform/FuzzyMatching

    -
    -

    Transform/Filter

    +
    +

    Transform/InlineTags

    -
    --src_seq_length, -src_seq_length
    -

    Maximum source sequence length.

    -

    Default: 192

    +
    --tags_dictionary_path, -tags_dictionary_path
    +

    Path to a flat term dictionary.

    -
    --tgt_seq_length, -tgt_seq_length
    -

    Maximum target sequence length.

    -

    Default: 192

    +
    --tags_corpus_ratio, -tags_corpus_ratio
    +

    Ratio of corpus to augment with tags.

    +

    Default: 0.1

    -
    -
    -
    -

    Transform/Prefix

    -
    -
    --src_prefix, -src_prefix
    -

    String to prepend to all source example.

    -

    Default: “”

    +
    --max_tags, -max_tags
    +

    Maximum number of tags that can be added to a single sentence.

    +

    Default: 12

    -
    --tgt_prefix, -tgt_prefix
    -

    String to prepend to all target example.

    -

    Default: “”

    +
    --paired_stag, -paired_stag
    +

    The format of an opening paired inline tag. Must include the character #.

    +

    Default: “⦅ph_#_beg⦆”

    -
    -
    -
    -

    Transform/Suffix

    -
    -
    --src_suffix, -src_suffix
    -

    String to append to all source example.

    -

    Default: “”

    +
    --paired_etag, -paired_etag
    +

    The format of a closing paired inline tag. Must include the character #.

    +

    Default: “⦅ph_#_end⦆”

    -
    --tgt_suffix, -tgt_suffix
    -

    String to append to all target example.

    -

    Default: “”

    +
    --isolated_tag, -isolated_tag
    +

    The format of an isolated inline tag. Must include the character #.

    +

    Default: “⦅ph_#_std⦆”

    -
    -
    -
    -

    Transform/InsertMaskBeforePlaceholdersTransform

    -
    -
    --response_pattern, -response_pattern
    -

    Response patten to locate the end of the prompt

    -

    Default: “Response : ⦅newline⦆”

    +
    --src_delimiter, -src_delimiter
    +

    Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    +

    Default: “⦅fuzzy⦆”

    @@ -830,65 +810,6 @@

    Transform/Clean -

    Transform/Uppercase

    -
    -
    --upper_corpus_ratio, -upper_corpus_ratio
    -

    Corpus ratio to apply uppercasing.

    -

    Default: 0.01

    -
    -
    -

    -
    -

    Transform/SwitchOut

    -
    -
    -switchout_temperature, --switchout_temperature
    -

    Sampling temperature for SwitchOut. \(\tau^{-1}\) in [WPDN18]. Smaller value makes data more diverse.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Token_Drop

    -
    -
    -tokendrop_temperature, --tokendrop_temperature
    -

    Sampling temperature for token deletion.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Token_Mask

    -
    -
    -tokenmask_temperature, --tokenmask_temperature
    -

    Sampling temperature for token masking.

    -

    Default: 1.0

    -
    -
    -
    -
    -

    Transform/Docify

    -
    -
    --doc_length, -doc_length
    -

    Number of tokens per doc.

    -

    Default: 200

    -
    -
    --max_context, -max_context
    -

    Max context segments.

    -

    Default: 1

    -
    -
    -
    -
    -

    Transform/InferFeats

    -
    -
    --reversible_tokenization, -reversible_tokenization
    -

    Possible choices: joiner, spacer

    -

    Type of reversible tokenization applied on the tokenizer.

    -

    Default: “joiner”

    -
    -
    -

    Transform/Subword/Common

    @@ -963,38 +884,6 @@

    Transform/Subword/ONMTTOK -

    Transform/InlineTags

    -
    -
    --tags_dictionary_path, -tags_dictionary_path
    -

    Path to a flat term dictionary.

    -
    -
    --tags_corpus_ratio, -tags_corpus_ratio
    -

    Ratio of corpus to augment with tags.

    -

    Default: 0.1

    -
    -
    --max_tags, -max_tags
    -

    Maximum number of tags that can be added to a single sentence.

    -

    Default: 12

    -
    -
    --paired_stag, -paired_stag
    -

    The format of an opening paired inline tag. Must include the character #.

    -

    Default: “⦅ph_#_beg⦆”

    -
    -
    --paired_etag, -paired_etag
    -

    The format of a closing paired inline tag. Must include the character #.

    -

    Default: “⦅ph_#_end⦆”

    -
    -
    --isolated_tag, -isolated_tag
    -

    The format of an isolated inline tag. Must include the character #.

    -

    Default: “⦅ph_#_std⦆”

    -
    -
    --src_delimiter, -src_delimiter
    -

    Any special token used for augmented src sentences. The default is the fuzzy token used in the FuzzyMatch transform.

    -

    Default: “⦅fuzzy⦆”

    -
    -
    -

    Transform/Normalize

    @@ -1028,6 +917,132 @@

    Transform/Normalize +

    Transform/InferFeats

    +
    +
    --reversible_tokenization, -reversible_tokenization
    +

    Possible choices: joiner, spacer

    +

    Type of reversible tokenization applied on the tokenizer.

    +

    Default: “joiner”

    +
    +
    +

    +
    +

    Transform/Filter

    +
    +
    --src_seq_length, -src_seq_length
    +

    Maximum source sequence length.

    +

    Default: 192

    +
    +
    --tgt_seq_length, -tgt_seq_length
    +

    Maximum target sequence length.

    +

    Default: 192

    +
    +
    +
    +
    +

    Transform/Prefix

    +
    +
    --src_prefix, -src_prefix
    +

String to prepend to all source examples.

    +

    Default: “”

    +
    +
    --tgt_prefix, -tgt_prefix
    +

String to prepend to all target examples.

    +

    Default: “”

    +
    +
    +
    +
    +

    Transform/Suffix

    +  --src_suffix, -src_suffix
    +      String to append to all source examples.
    +      Default: “”
    +  --tgt_suffix, -tgt_suffix
    +      String to append to all target examples.
    +      Default: “”
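    The Prefix and Suffix options added above prepend or append a fixed string to every example, which is the usual way to inject language or control tokens. A minimal sketch with per-corpus settings; the paths and placeholder strings are illustrative:

    data:
        corpus_1:
            path_src: data/train.src      # illustrative paths
            path_tgt: data/train.tgt
            transforms: [prefix, suffix]
            src_prefix: "__some_src_prefix__"
            tgt_prefix: "__some_tgt_prefix__"
            src_suffix: "__some_src_suffix__"
            tgt_suffix: "__some_tgt_suffix__"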

    Transform/Terminology

    +  --termbase_path, -termbase_path
    +      Path to a dictionary file with terms.
    +  --src_spacy_language_model, -src_spacy_language_model
    +      Name of the spacy language model for the source corpus.
    +  --tgt_spacy_language_model, -tgt_spacy_language_model
    +      Name of the spacy language model for the target corpus.
    +  --term_corpus_ratio, -term_corpus_ratio
    +      Ratio of corpus to augment with terms.
    +      Default: 0.3
    +  --term_example_ratio, -term_example_ratio
    +      Max terms allowed in an example.
    +      Default: 0.2
    +  --src_term_stoken, -src_term_stoken
    +      The source term start token.
    +      Default: “⦅src_term_start⦆”
    +  --tgt_term_stoken, -tgt_term_stoken
    +      The target term start token.
    +      Default: “⦅tgt_term_start⦆”
    +  --tgt_term_etoken, -tgt_term_etoken
    +      The target term end token.
    +      Default: “⦅tgt_term_end⦆”
    +  --term_source_delimiter, -term_source_delimiter
    +      Any special token used for augmented source sentences. The default is the fuzzy token used in the FuzzyMatch transform.
    +      Default: “⦅fuzzy⦆”
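    The Terminology options added above inject term pairs from a termbase into part of the training data, wrapping them in the configured start and end tokens. A minimal sketch, assuming the transform is named terminology; the termbase path and spacy model names are illustrative:

    transforms: [terminology]
    termbase_path: data/termbase.tsv              # illustrative
    src_spacy_language_model: en_core_web_sm      # illustrative
    tgt_spacy_language_model: de_core_news_sm     # illustrative
    term_corpus_ratio: 0.3
    term_example_ratio: 0.2
    src_term_stoken: "⦅src_term_start⦆"
    tgt_term_stoken: "⦅tgt_term_start⦆"
    tgt_term_etoken: "⦅tgt_term_end⦆"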

    Transform/BART

    +  --permute_sent_ratio, -permute_sent_ratio
    +      Permute this proportion of sentences (boundaries defined by [‘.’, ‘?’, ‘!’]) in all inputs.
    +      Default: 0.0
    +  --rotate_ratio, -rotate_ratio
    +      Rotate this proportion of inputs.
    +      Default: 0.0
    +  --insert_ratio, -insert_ratio
    +      Insert this percentage of additional random tokens.
    +      Default: 0.0
    +  --random_ratio, -random_ratio
    +      Instead of using <mask>, use random token this often.
    +      Default: 0.0
    +  --mask_ratio, -mask_ratio
    +      Fraction of words/subwords that will be masked.
    +      Default: 0.0
    +  --mask_length, -mask_length
    +      Possible choices: subword, word, span-poisson
    +      Length of masking window to apply.
    +      Default: “subword”
    +  --poisson_lambda, -poisson_lambda
    +      Lambda for Poisson distribution to sample span length if -mask_length set to span-poisson.
    +      Default: 3.0
    +  --replace_length, -replace_length
    +      Possible choices: -1, 0, 1
    +      When masking N tokens, replace with 0, 1, or N tokens. (use -1 for N)
    +      Default: -1
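    The BART options added above apply denoising-style corruptions (sentence permutation, rotation, token insertion and masking) to the training data. A minimal sketch enabling only span masking, assuming the transform is named bart; the values are illustrative, not recommended settings:

    transforms: [bart]
    mask_ratio: 0.3            # illustrative value
    mask_length: span-poisson
    poisson_lambda: 3.0
    replace_length: 1
    permute_sent_ratio: 0.0
    rotate_ratio: 0.0
    insert_ratio: 0.0
    random_ratio: 0.0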

    Quant options

    @@ -1036,9 +1051,19 @@

    Quant options

    max_length: 1 and src: None tgt: None, which is typically the configuration used in a scoring script like MMLU, where only one token is expected as the answer.

    +  WARNING
    +  For inhomogeneous batches with many examples, the potentially high number of tokens inserted in the shortest examples leads to degraded results when attention layer quantization and flash attention are activated.
    +  In practice, in the inference configuration file, when batch_size is greater than 1, delete ‘linear_values’, ‘linear_query’, ‘linear_keys’, ‘final_linear’ from quant_layers and specify self_attn_type: scaled-dot.

    You can run this script with the following command line:

    python eval_llm/MMLU/run_mmlu_opennmt.py --config myinference.yaml
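    Following the warning above, a minimal sketch of what myinference.yaml might contain for batched MMLU scoring, with the attention projections removed from quant_layers and self_attn_type set back to scaled-dot. All keys and values are illustrative and mirror the text above rather than a verified configuration:

    # myinference.yaml (illustrative fragment)
    src: None
    tgt: None
    max_length: 1
    batch_size: 8              # batch_size > 1, hence the adjustments below
    self_attn_type: scaled-dot
    # attention projections removed; only feed-forward weights stay quantized
    quant_layers: ['w_1', 'w_2']
    quant_type: bnb_nf4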
     
    diff --git a/searchindex.js b/searchindex.js
    index 213dc9a93f..05255bae5b 100644
    --- a/searchindex.js
    +++ b/searchindex.js
    @@ -1 +1 @@
23], "norm_ep": [16, 21], "06": [16, 21], "parallel_gpu": 16, "fnn": 16, "activationfunct": 16, "input_len": 16, "model_dim": 16, "2x": 16, "num_lay": 16, "transformerencod": 16, "relative_positions_bucket": [16, 21], "pos_ffn_activation_fn": [16, 21, 23], "num_kv": [16, 21], "inner": 16, "rnnencod": 16, "use_bridg": 16, "gru": [16, 21], "sru": [16, 21], "ggnnencod": 16, "autocr": 16, "cnnencod": 16, "cnn_kernel_width": [16, 21], "gag": [16, 24], "meanencod": 16, "trivial": 16, "simpli": [16, 23], "pool": 16, "transformerdecod": 16, "self_attn_typ": [16, 21], "aan_useffn": [16, 21], "shared_layer_norm": [16, 21], "sliding_window": [16, 21], "transformerdecoderbas": 16, "context_attn": 16, "distanc": [16, 21], "aan": [16, 21], "guid": 16, "tlen": 16, "feat": [16, 17, 19, 21, 22], "slen": 16, "rnndecoderbas": 16, "bidirectional_encod": 16, "attn_typ": 16, "attn_func": 16, "softmax": [16, 21, 22], "coverage_attn": [16, 21], "context_g": [16, 21], "copy_attn_typ": [16, 21], "globalattent": 16, "contextg": 16, "dec_out": 16, "init_st": 16, "stdrnndecod": 16, "fulli": 16, "cudnn": 16, "By": [16, 24], "bcb14": [16, 24], "input_feed": 16, "inputfeedrnndecod": 16, "lpm15": [16, 24], "cnndecod": 16, "convmultistepattent": 16, "enc_hidden": 16, "matrix": [16, 21], "parameter": 16, "convex": 16, "combin": [16, 22], "construct": 16, "sum_": 16, "seqlength": 16, "a_j": 16, "h_j": 16, "w_a": 16, "v_a": 16, "tanh": 16, "u_a": 16, "sparsemax": [16, 21], "yet": [16, 18], "distribtut": 16, "h_t": 16, "h_": 16, "unnorm": 16, "multiheadedattent": 16, "head_count": 16, "is_decod": 16, "simulatan": 16, "select": [16, 18, 21], "divis": 16, "return_attn": 16, "key_len": 16, "query_len": 16, "binari": 16, "averageattent": 16, "acceler": [16, 24], "zxs18": [16, 24], "layer_in": 16, "t_len": 16, "gating_out": 16, "average_out": 16, "input_s": 16, "conv": [16, 21], "oper": [16, 19], "apply_mask": 16, "base_target_emb": 16, "input_from_dec": 16, "encoder_out_top": 16, "encoder_out_combin": 16, "height": 16, "width": 16, "calc": 16, "copygener": 16, "output_s": 16, "pad_idx": 16, "slm17": [16, 24], "p_": 16, "tgt_dict": 16, "z": 16, "probil": 16, "taken": 16, "src_map": [16, 18], "impli": 16, "extra_word": 16, "structured_attent": 16, "matrixtre": 16, "tree": 16, "theorem": 16, "margin": 16, "ll17": [16, 24], "overridden": 16, "recip": 16, "within": [16, 17], "afterward": 16, "former": 16, "care": 16, "regist": 16, "hook": 16, "latter": 16, "silent": [16, 19, 21, 23], "ignor": [16, 19, 21, 22, 23], "translation_serv": 17, "servermodel": 17, "preprocess_opt": 17, "tokenizer_opt": 17, "postprocess_opt": 17, "custom_opt": 17, "features_opt": 17, "processu": 17, "postprocess": 17, "func": [17, 18], "do_timeout": 17, "neg": [17, 21], "build_token": 17, "attr": 17, "on_timemout": 17, "maybe_convert_align": 17, "align_scor": 17, "correspand": 17, "maybe_detoken": 17, "maybe_detokenize_with_align": 17, "seper": 17, "maybe_postprocess": 17, "maybe_preprocess": 17, "maybe_token": 17, "maybe_transform_feat": 17, "raw_src": 17, "tok_src": 17, "inferfeatstransform": 17, "parse_opt": 17, "namespac": 17, "rebuild_seg_packag": 17, "all_preprocess": 17, "rebuild": 17, "segment": [17, 19, 21, 22], "n_seg": 17, "to_gpu": 17, "tokenizer_mark": 17, "marker": 17, "servermodelerror": 17, "timer": 17, "translationserv": 17, "clone_model": 17, "list_model": 17, "load_model": 17, "model_kwarg": 17, "preload_model": 17, "preload": 17, "intern": 17, "datastructur": 17, "lua": 17, "config_fil": 17, "unload_model": 17, "cancel": 17, "srclen": 18, 
"pred_sent": 18, "pred_scor": 18, "tgt_sent": 18, "gold_scor": 18, "word_align": 18, "ind_in_bucket": 18, "prob": 18, "gold_sent": 18, "sent_numb": 18, "src_raw": 18, "random_sampling_topk": [18, 22, 23], "random_sampling_temp": [18, 22, 23], "dump_beam": [18, 22], "frozenset": 18, "replace_unk": [18, 22], "ban_unk_token": [18, 22], "phrase_t": [18, 22], "report_tim": [18, 22, 23], "global_scor": 18, "report_scor": 18, "logger": 18, "with_scor": [18, 22], "translate_batch": 18, "attn_debug": [18, 22], "translationbuild": 18, "underli": 18, "address": 18, "rare": 18, "lsl": [18, 24], "unknown": 18, "decodestrategi": 18, "parallel_path": 18, "exclusion_token": 18, "return_attent": 18, "magic": 18, "shortest": 18, "begin": 18, "longest": 18, "presum": 18, "cutoff": 18, "forbidden": 18, "hold": 18, "inp_seq_len": 18, "inp": 18, "seq": 18, "alive_seq": 18, "grow": 18, "axi": 18, "is_finish": 18, "bytetensor": 18, "alive_attn": 18, "target_prefix": 18, "prefix_seq_len": 18, "log_prob": 18, "ngram": [18, 22], "thant": 18, "onc": [18, 21], "put": 18, "lead": 18, "complex": [18, 23], "ingredi": 18, "maybe_update_forbidden_token": 18, "reorder": 18, "forbidden_token": 18, "maybe_update_target_prefix": 18, "select_index": 18, "aliv": 18, "logit": [18, 22], "vocab_s": [18, 21], "update_finish": 18, "attribut": 18, "beamsearch": 18, "beamsearchbas": 18, "greedy_search": 18, "sample_with_temperatur": 18, "sampling_temp": 18, "keep_topk": 18, "keep_topp": 18, "randomli": 18, "categor": 18, "categori": 18, "inf": 18, "logsumexp": 18, "potenti": [18, 23], "chosen": 18, "until": [18, 22], "cumul": [18, 22], "greater": 18, "condit": [18, 21, 22], "topk_id": 18, "topk_scor": 18, "greedysearch": 18, "either": [18, 21], "event": 18, "reach": 18, "gnmtglobalscor": 18, "penaltybuild": 18, "cov_pen": 18, "length_pen": 18, "pen": 18, "cov": 18, "has_cov_pen": 18, "op": 18, "isn": 18, "has_len_pen": 18, "coverage_non": 18, "coverage_summari": 18, "coverage_wu": 18, "gnmt": 18, "wsc": [18, 24], "almost": [18, 21], "length_averag": 18, "cur_len": 18, "length_non": 18, "unmodifi": 18, "length_wu": 18, "save_config": [19, 21, 22], "insert_mask_before_placehold": [19, 21, 22, 23], "num_thread": 19, "learn_subword": 19, "learn_subwords_s": 19, "vocab_sample_queue_s": 19, "decoder_start_token": [19, 21, 23], "default_speci": [19, 21], "response_pattern": [19, 21, 22, 23], "scripts_ok": [19, 21, 22], "scripts_nok": [19, 21, 22], "langid": [19, 21, 22], "spacer": [19, 21, 22], "src_subword_vocab": [19, 21, 22], "tgt_subword_vocab": [19, 21, 22], "src_vocab_threshold": [19, 21, 22], "tgt_vocab_threshold": [19, 21, 22], "gpt2_pretok": [19, 21, 22, 23], "encount": [19, 21], "rais": [19, 21], "32000": [19, 23], "rotat": [19, 21, 22], "percentag": [19, 21, 22], "often": [19, 21, 22], "fraction": [19, 21, 22], "fuzzi": [19, 21, 22], "192": [19, 21, 22], "prepend": [19, 21, 22], "patten": [19, 21, 22], "unicodata": [19, 21, 22], "tau": [19, 21, 22], "wpdn18": [19, 21, 22, 24], "smaller": [19, 21, 22], "divers": [19, 21, 22], "unigram": [19, 21, 22], "earlier": [19, 21, 22], "byte": [19, 21, 22], "inlin": [19, 21, 22], "ph_": [19, 21, 22], "_beg": [19, 21, 22], "_end": [19, 21, 22], "_std": [19, 21, 22], "debug": [20, 21, 22], "dump_transform": [21, 23], "src_words_min_frequ": 21, "tensor_parallel": [21, 22], "gpu_backend": [21, 22], "gpu_verbose_level": [21, 22], "freeze_encod": 21, "freeze_decod": 21, "gelu": 21, "silu": [21, 23], "input_fe": 21, "global_attention_funct": 21, "multiqueri": 21, "generator_funct": 21, 
"copy_attn_forc": 21, "loss_scal": 21, "apex_opt_level": [21, 23], "o0": 21, "o1": 21, "o2": 21, "o3": 21, "save_format": [21, 23], "safetensor": [21, 23], "keep_checkpoint": [21, 23], "keep_stat": 21, "single_pass": 21, "early_stop": 21, "early_stopping_criteria": 21, "adadelta": 21, "sparseadam": 21, "adamw8bit": 21, "pagedadamw8bit": 21, "pagedadamw32bit": 21, "truncated_decod": [21, 23], "adam_beta1": 21, "decay_step": [21, 23], "noamwd": 21, "rsqrt": 21, "log_fil": [21, 22], "log_file_level": [21, 22], "critic": [21, 22], "notset": [21, 22], "valid_metr": 21, "scoring_debug": 21, "dump_pr": 21, "exp_host": 21, "tensorboard_log_dir": 21, "override_opt": [21, 23], "bnb_8bit": [21, 22], "bnb_fp4": [21, 22], "disk": 21, "32768": [21, 23], "discard": 21, "backend": [21, 22], "nccl": [21, 22], "localhost": [21, 22], "sin": 21, "mark": 21, "interleav": 21, "feat_merge_s": 21, "experiment": 21, "kernel_s": 21, "epsilon": 21, "dict_kei": 21, "autogener": 21, "dotprod": 21, "encodingw": 21, "embeddingsmor": 21, "09864set": 21, "usemaximum": 21, "pdf": 21, "biasmor": 21, "mhanot": 21, "proj": 21, "attentionnot": 21, "1911": 21, "02150": 21, "kv": 21, "falcon": [21, 23], "40b": 21, "position_wis": 21, "layernot": 21, "garg": 21, "2019": 21, "1909": 21, "02074": 21, "leav": 21, "lambda_prior_lambda": 21, "lambda_prior_tau": 21, "opt_level": 21, "io": 21, "Will": 21, "_n": 21, "2106": 21, "09685": 21, "successfulli": 21, "thumb": 21, "uniform": 21, "xavier_uniform": 21, "resett": 21, "readm": 21, "criteria": 21, "mirror": 21, "initial_accumulator_valu": 21, "literatur": 21, "seemingli": 21, "discourag": 21, "consider": 21, "adopt": 21, "kera": 21, "www": 21, "api_doc": 21, "tf": 21, "adamoptim": 21, "recent": 21, "1512": 21, "00567": 21, "marian": 21, "aclweb": 21, "anthologi": 21, "p18": 21, "4020": 21, "exponenti": 21, "wikipedia": 21, "update_learning_r": 21, "gone": 21, "warmup": 21, "4000": 21, "under": [21, 22], "crayon": 21, "pick": 21, "awith": 21, "mini": 21, "refil": 21, "int8": 22, "avg_raw_prob": 22, "profil": 22, "align_debug": 22, "dtypefp32": 22, "gtx1080int8": 22, "nativ": 22, "whose": [22, 23], "learnt": 22, "1904": 22, "09751": 22, "minimum": 22, "250": 22, "repetit": 22, "had": [22, 23], "highest": 22, "proba": 22, "10k": 23, "s3": 23, "amazonaw": 23, "trainingdata": 23, "gz": 23, "xf": 23, "5k": 23, "nation": 23, "bureaucraci": 23, "parliament": 23, "apo": 23, "legisl": 23, "prerog": 23, "void": 23, "provis": 23, "extent": 23, "laid": 23, "feder": 23, "senior": 23, "instructor": 23, "italian": 23, "fit": 23, "postur": 23, "gym": 23, "stretch": 23, "pilat": 23, "2004": 23, "collabor": 23, "antich": 23, "person": 23, "toy_en_d": 23, "simplest": 23, "dump_field": 23, "simplifi": 23, "inspect": 23, "advand": 23, "model_step_1000": 23, "pred_1000": 23, "terribl": 23, "million": 23, "t5": 23, "openllama": 23, "redpajama": 23, "xgen": 23, "meta": 23, "flan": 23, "convert_openllama": 23, "path_to_hf_model": 23, "path_to_token": 23, "path_to_openllama": 23, "nshard": 23, "reconstruct": 23, "convert_mpt": 23, "vocab_fil": 23, "path_to_mpt": 23, "huggin": 23, "though": 23, "mandatori": 23, "path_to": 23, "similarli": 23, "mpt7b": 23, "conserv": 23, "mmlu": 23, "eval_llm": 23, "run_mmlu_opennmt": 23, "myinfer": 23, "easier": 23, "ramdom": 23, "path_to_config": 23, "path_to_sourc": 23, "path_to_target": 23, "alpaca_clean": 23, "1234": 23, "w_3": 23, "linear_kei": 23, "final_linear": 23, "11008": 23, "dzmitri": 24, "kyunghyun": 24, "yoshua": 24, "1409": 24, "0473": 24, "0473v3": 24, "1146": 
24, "annurev": 24, "neuro": 24, "041002": 24, "131047": 24, "jona": 24, "gehr": 24, "michael": 24, "auli": 24, "david": 24, "grangier": 24, "deni": 24, "yarat": 24, "yann": 24, "dauphin": 24, "1705": 24, "03122": 24, "yang": 24, "mirella": 24, "lapata": 24, "09207": 24, "minh": 24, "thang": 24, "hieu": 24, "christoph": 24, "ffectiv": 24, "pproach": 24, "ttention": 24, "eural": 24, "achin": 24, "ranslat": 24, "ilya": 24, "sutskev": 24, "quoc": 24, "le": 24, "oriol": 24, "wojciech": 24, "zaremba": 24, "ddress": 24, "ord": 24, "roblem": 24, "abigail": 24, "peter": 24, "1704": 24, "04368": 24, "rico": 24, "sennrich": 24, "barri": 24, "haddow": 24, "preprint": 24, "1606": 24, "02892": 24, "2016": 24, "ashish": 24, "vaswani": 24, "shazeer": 24, "niki": 24, "parmar": 24, "jakob": 24, "uszkoreit": 24, "llion": 24, "jone": 24, "aidan": 24, "gomez": 24, "lukasz": 24, "kaiser": 24, "illia": 24, "polosukhin": 24, "1706": 24, "03762": 24, "xinyi": 24, "wang": 24, "zihang": 24, "graham": 24, "neubig": 24, "1808": 24, "07512": 24, "yonghui": 24, "mike": 24, "schuster": 24, "zhifeng": 24, "chen": 24, "mohammad": 24, "norouzi": 24, "wolfgang": 24, "macherei": 24, "krikun": 24, "yuan": 24, "cao": 24, "qin": 24, "gao": 24, "klau": 24, "gap": 24, "human": 24, "1609": 24, "08144": 24, "biao": 24, "zhang": 24, "deyi": 24, "xiong": 24, "jinsong": 24, "su": 24, "1805": 24, "00631": 24}, "objects": {"onmt.decoders": [[16, 0, 1, "", "CNNDecoder"], [16, 0, 1, "", "DecoderBase"], [16, 0, 1, "", "InputFeedRNNDecoder"], [16, 0, 1, "", "StdRNNDecoder"], [16, 0, 1, "", "TransformerDecoder"]], "onmt.decoders.CNNDecoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"], [16, 1, 1, "", "init_state"]], "onmt.decoders.DecoderBase": [[16, 1, 1, "", "from_opt"]], "onmt.decoders.TransformerDecoder": [[16, 1, 1, "", "forward"]], "onmt.decoders.decoder": [[16, 0, 1, "", "RNNDecoderBase"]], "onmt.decoders.decoder.RNNDecoderBase": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"], [16, 1, 1, "", "init_state"]], "onmt.encoders": [[16, 0, 1, "", "CNNEncoder"], [16, 0, 1, "", "EncoderBase"], [16, 0, 1, "", "GGNNEncoder"], [16, 0, 1, "", "MeanEncoder"], [16, 0, 1, "", "RNNEncoder"], [16, 0, 1, "", "TransformerEncoder"]], "onmt.encoders.CNNEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.encoders.EncoderBase": [[16, 1, 1, "", "forward"]], "onmt.encoders.GGNNEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.encoders.MeanEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.encoders.RNNEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.encoders.TransformerEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.inputters": [[15, 0, 1, "", "DynamicDatasetIter"], [15, 0, 1, "", "MixingStrategy"], [15, 0, 1, "", "ParallelCorpus"], [15, 0, 1, "", "ParallelCorpusIterator"], [15, 0, 1, "", "SequentialMixer"], [15, 0, 1, "", "WeightedMixer"]], "onmt.inputters.DynamicDatasetIter": [[15, 1, 1, "", "batch_iter"], [15, 1, 1, "", "from_opt"]], "onmt.inputters.ParallelCorpus": [[15, 1, 1, "", "load"]], "onmt.models": [[14, 0, 1, "", "BaseModel"], [14, 0, 1, "", "LanguageModel"], [14, 0, 1, "", "NMTModel"]], "onmt.models.BaseModel": [[14, 1, 1, "", "forward"], [14, 1, 1, "", "load_safe_state_dict"], [14, 1, 1, "", "load_state_dict"]], "onmt.models.LanguageModel": [[14, 1, 1, "", "count_parameters"], [14, 1, 1, "", "forward"]], "onmt.models.NMTModel": [[14, 1, 1, "", "count_parameters"], [14, 1, 1, "", "forward"]], "onmt.modules": [[16, 0, 
1, "", "AverageAttention"], [16, 0, 1, "", "ConvMultiStepAttention"], [16, 0, 1, "", "CopyGenerator"], [16, 0, 1, "", "Embeddings"], [16, 0, 1, "", "GlobalAttention"], [16, 0, 1, "", "MultiHeadedAttention"], [16, 0, 1, "", "PositionalEncoding"]], "onmt.modules.AverageAttention": [[16, 1, 1, "", "forward"]], "onmt.modules.ConvMultiStepAttention": [[16, 1, 1, "", "apply_mask"], [16, 1, 1, "", "forward"]], "onmt.modules.CopyGenerator": [[16, 1, 1, "", "forward"]], "onmt.modules.Embeddings": [[16, 2, 1, "", "emb_luts"], [16, 1, 1, "", "forward"], [16, 1, 1, "", "load_pretrained_vectors"], [16, 2, 1, "", "word_lut"]], "onmt.modules.GlobalAttention": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "score"]], "onmt.modules.MultiHeadedAttention": [[16, 1, 1, "", "forward"]], "onmt.modules.PositionalEncoding": [[16, 1, 1, "", "forward"]], "onmt.modules.position_ffn": [[16, 0, 1, "", "PositionwiseFeedForward"]], "onmt.modules.position_ffn.PositionwiseFeedForward": [[16, 1, 1, "", "forward"]], "onmt.modules.structured_attention": [[16, 0, 1, "", "MatrixTree"]], "onmt.modules.structured_attention.MatrixTree": [[16, 1, 1, "", "forward"]], "onmt.trainer": [[14, 0, 1, "", "Trainer"]], "onmt.trainer.Trainer": [[14, 1, 1, "", "train"], [14, 1, 1, "", "validate"]], "onmt.translate": [[18, 0, 1, "", "BeamSearch"], [18, 0, 1, "", "DecodeStrategy"], [18, 0, 1, "", "GNMTGlobalScorer"], [18, 0, 1, "", "GreedySearch"], [18, 0, 1, "", "Translation"], [18, 0, 1, "", "TranslationBuilder"], [18, 0, 1, "", "Translator"]], "onmt.translate.BeamSearch": [[18, 1, 1, "", "initialize"]], "onmt.translate.DecodeStrategy": [[18, 1, 1, "", "advance"], [18, 1, 1, "", "block_ngram_repeats"], [18, 1, 1, "", "initialize"], [18, 1, 1, "", "maybe_update_forbidden_tokens"], [18, 1, 1, "", "maybe_update_target_prefix"], [18, 1, 1, "", "target_prefixing"], [18, 1, 1, "", "update_finished"]], "onmt.translate.GreedySearch": [[18, 1, 1, "", "advance"], [18, 1, 1, "", "initialize"], [18, 1, 1, "", "update_finished"]], "onmt.translate.Translation": [[18, 1, 1, "", "log"]], "onmt.translate.Translator": [[18, 1, 1, "", "translate_batch"]], "onmt.translate.greedy_search": [[18, 3, 1, "", "sample_with_temperature"]], "onmt.translate.penalties": [[18, 0, 1, "", "PenaltyBuilder"]], "onmt.translate.penalties.PenaltyBuilder": [[18, 1, 1, "", "coverage_none"], [18, 1, 1, "", "coverage_summary"], [18, 1, 1, "", "coverage_wu"], [18, 1, 1, "", "length_average"], [18, 1, 1, "", "length_none"], [18, 1, 1, "", "length_wu"]], "onmt.translate.translation_server": [[17, 0, 1, "", "ServerModel"], [17, 4, 1, "", "ServerModelError"], [17, 0, 1, "", "Timer"], [17, 0, 1, "", "TranslationServer"]], "onmt.translate.translation_server.ServerModel": [[17, 1, 1, "", "build_tokenizer"], [17, 1, 1, "", "detokenize"], [17, 1, 1, "", "do_timeout"], [17, 1, 1, "", "maybe_convert_align"], [17, 1, 1, "", "maybe_detokenize"], [17, 1, 1, "", "maybe_detokenize_with_align"], [17, 1, 1, "", "maybe_postprocess"], [17, 1, 1, "", "maybe_preprocess"], [17, 1, 1, "", "maybe_tokenize"], [17, 1, 1, "", "maybe_transform_feats"], [17, 1, 1, "", "parse_opt"], [17, 1, 1, "", "postprocess"], [17, 1, 1, "", "preprocess"], [17, 1, 1, "", "rebuild_seg_packages"], [17, 1, 1, "", "to_gpu"], [17, 1, 1, "", "tokenize"], [17, 1, 1, "", "tokenizer_marker"]], "onmt.translate.translation_server.TranslationServer": [[17, 1, 1, "", "clone_model"], [17, 1, 1, "", "list_models"], [17, 1, 1, "", "load_model"], [17, 1, 1, "", "preload_model"], [17, 1, 1, "", "run"], [17, 1, 1, "", "start"], [17, 1, 1, "", 
"unload_model"]], "onmt.utils": [[14, 0, 1, "", "AdaFactor"], [14, 0, 1, "", "FusedAdam"], [14, 0, 1, "", "Optimizer"], [14, 0, 1, "", "Statistics"]], "onmt.utils.AdaFactor": [[14, 1, 1, "", "step"]], "onmt.utils.FusedAdam": [[14, 1, 1, "", "step"]], "onmt.utils.Optimizer": [[14, 2, 1, "", "amp"], [14, 1, 1, "", "backward"], [14, 1, 1, "", "from_opt"], [14, 1, 1, "", "learning_rate"], [14, 1, 1, "", "step"], [14, 2, 1, "", "training_step"], [14, 1, 1, "", "zero_grad"]], "onmt.utils.Statistics": [[14, 1, 1, "", "accuracy"], [14, 1, 1, "", "all_gather_stats"], [14, 1, 1, "", "all_gather_stats_list"], [14, 1, 1, "", "elapsed_time"], [14, 1, 1, "", "log_tensorboard"], [14, 1, 1, "", "output"], [14, 1, 1, "", "ppl"], [14, 1, 1, "", "update"], [14, 1, 1, "", "xent"]], "onmt.utils.loss": [[14, 0, 1, "", "LossCompute"]], "onmt.utils.loss.LossCompute": [[14, 1, 1, "", "forward"], [14, 1, 1, "", "from_opts"], [14, 1, 1, "", "ignore_prompt"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:property", "3": "py:function", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "property", "Python property"], "3": ["py", "function", "Python function"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"contributor": 0, "guidelin": 0, "docstr": 0, "how": [1, 9, 23], "do": [1, 9], "i": [1, 9], "us": [1, 9], "my": 1, "v2": 1, "model": [1, 5, 6, 9, 14, 17, 21, 22, 23], "v3": 1, "train": [1, 5, 6, 7, 9, 21, 23], "transform": [1, 9, 12, 19, 21, 22], "perform": [1, 2], "tip": [1, 2], "posit": 1, "encod": [1, 16, 21], "absolut": 1, "v": 1, "rel": 1, "rotari": 1, "embed": [1, 9, 16, 21], "alibi": 1, "you": [1, 9], "support": [1, 9], "multi": [1, 4, 9], "gpu": [1, 9], "pretrain": [1, 9, 23], "e": [1, 9], "g": [1, 9], "glove": [1, 9], "exampl": [1, 8, 9], "can": [1, 9], "ensembl": [1, 9], "infer": [1, 4, 5, 9, 23], "weight": [1, 9], "differ": [1, 9], "corpora": [1, 9], "what": 1, "special": 1, "token": 1, "doe": 1, "opennmt": [1, 7], "py": [1, 7], "appli": 1, "fly": 1, "subword": [1, 6, 7, 19, 21, 22], "regular": 1, "when": 1, "ar": 1, "readili": 1, "avail": 1, "data": [1, 3, 4, 5, 6, 7, 15, 19, 21, 22, 23], "gener": [1, 6, 21, 23], "purpos": 1, "filter": [1, 19, 21, 22], "length": 1, "add": 1, "custom": 1, "prefix": [1, 19, 21, 22], "suffix": [1, 19, 21, 22], "convert": [1, 23], "uppercas": [1, 19, 21, 22], "normal": [1, 19, 21, 22], "punctuat": 1, "clean": [1, 6, 19, 21, 22], "dataset": [1, 4, 15], "context": 1, "doc": 1, "awar": 1, "augment": 1, "sourc": 1, "segment": 1, "fuzzi": 1, "match": 1, "neural": [1, 3], "repair": 1, "target": 1, "inlin": 1, "tag": 1, "make": 1, "learn": [1, 9], "terminologi": [1, 19, 21, 22], "sentencepiec": 1, "bpe": [1, 6], "nmt": [1, 7], "bart": [1, 19, 21, 22], "style": 1, "nois": 1, "switchout": [1, 19, 21, 22], "sampl": [1, 22], "drop": 1, "some": 1, "mask": 1, "creat": 1, "lora": 1, "8bit": 1, "load": 1, "finetun": [1, 4, 23], "big": 1, "gradient": 1, "checkpoint": [1, 4], "deal": 1, "get": [1, 7, 8, 9], "word": [1, 9], "align": [1, 9, 21], "while": [1, 9], "translat": [1, 7, 9, 18, 22, 23], "raw": [1, 9], "from": [1, 9, 23], "averag": [1, 9], "attent": [1, 9, 16, 21], "head": [1, 9], "supervis": [1, 4, 9], "specif": [1, 6, 9], "updat": 1, "": 1, "vocabulari": [1, 3, 4, 6], "featur": [1, 19, 21, 22], "set": 1, "up": 1, "server": [1, 17, 20], "work": 1, "configur": [1, 19, 21, 22], "ii": 1, "start": [1, 3, 8, 10, 11], "without": 1, "docker": 1, "0": [1, 6], "code": 1, "1": [1, 6, 23], 
"instal": [1, 13], "flask": 1, "2": [1, 6, 23], "put": 1, "3": [1, 6, 23], "iii": 1, "iv": 1, "api": [1, 8], "hostnam": 1, "list": 1, "version": [2, 9], "break": 2, "chang": 2, "gate": 3, "graph": 3, "network": 3, "depend": [3, 4, 7, 10, 11], "quick": [3, 10, 11], "format": [3, 4], "note": 3, "option": [3, 10, 11, 21, 22], "acknowledg": [3, 11], "llama": 4, "7b": 4, "replic": 4, "vicuna": 4, "concaten": 4, "convers": 4, "ctranslat": 4, "round": 4, "simpl": 4, "summar": 5, "cnn": 5, "dm": 5, "prepar": [5, 6, 7, 23], "vocab": [5, 19, 21], "evalu": 5, "gigaword": 5, "score": [5, 18], "refer": [5, 24], "languag": 6, "wiki": 6, "103": 6, "step": [6, 23], "download": 6, "pyonmttok": 6, "build": [6, 19], "command": 6, "4": 6, "output": 6, "wmt17": 7, "en": 7, "de": 7, "pytorch": 7, "apex": 7, "run": 7, "content": 8, "frequent": 8, "ask": 8, "question": 8, "script": 8, "legaci": [8, 9], "faq": 9, "preprocess": 9, "imag": 10, "text": [10, 11, 12, 23], "speech": 11, "video": 12, "recurr": 12, "overview": 13, "citat": 13, "addit": 13, "resourc": 13, "framework": 14, "trainer": 14, "loss": 14, "optim": [14, 21], "loader": 15, "iter": 15, "modul": 16, "decod": [16, 18, 21, 22], "core": 17, "class": 18, "strategi": 18, "fuzzymatch": [19, 21, 22], "insertmaskbeforeplaceholderstransform": [19, 21, 22], "token_drop": [19, 21, 22], "token_mask": [19, 21, 22], "docifi": [19, 21, 22], "inferfeat": [19, 21, 22], "common": [19, 21, 22], "onmttok": [19, 21, 22], "inlinetag": [19, 21, 22], "reproduc": [19, 21, 22], "name": 20, "argument": 20, "prune": 21, "distribut": [21, 22], "task": 21, "initi": 21, "type": 21, "rate": 21, "log": [21, 22], "dynam": 21, "quant": [21, 22], "beam": 22, "search": 22, "random": 22, "penalti": 22, "trick": 22, "effici": 22, "quickstart": 23, "scratch": 23, "llm": 23, "hug": 23, "face": 23, "hub": 23, "an": 23, "yaml": 23, "config": 23, "file": 23}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"Contributors": [[0, "contributors"]], "Guidelines": [[0, "guidelines"]], "Docstrings": [[0, "docstrings"]], "How do I use my v2 models in v3 ?": [[1, "how-do-i-use-my-v2-models-in-v3"]], "How do I train the Transformer model?": [[1, "how-do-i-train-the-transformer-model"]], "Performance tips": [[1, "performance-tips"], [2, "performance-tips"]], "Position encoding: Absolute vs Relative vs Rotary Embeddings vs Alibi": [[1, "position-encoding-absolute-vs-relative-vs-rotary-embeddings-vs-alibi"]], "Do you support multi-gpu?": [[1, "do-you-support-multi-gpu"], [9, "do-you-support-multi-gpu"]], "How do I use Pretrained embeddings (e.g. 
GloVe)?": [[1, "how-do-i-use-pretrained-embeddings-e-g-glove"], [9, "how-do-i-use-pretrained-embeddings-e-g-glove"]], "Example": [[1, "example"], [1, "id1"], [1, "id2"], [1, "id3"], [9, "example"]], "How can I ensemble Models at inference?": [[1, "how-can-i-ensemble-models-at-inference"], [9, "how-can-i-ensemble-models-at-inference"]], "How can I weight different corpora at training?": [[1, "how-can-i-weight-different-corpora-at-training"], [9, "how-can-i-weight-different-corpora-at-training"]], "What special tokens does OpenNMT-py use?": [[1, "what-special-tokens-does-opennmt-py-use"]], "How can I apply on-the-fly tokenization and subword regularization when training?": [[1, "how-can-i-apply-on-the-fly-tokenization-and-subword-regularization-when-training"]], "What are the readily available on-the-fly data transforms?": [[1, "what-are-the-readily-available-on-the-fly-data-transforms"]], "General purpose": [[1, "general-purpose"]], "Filter examples by length": [[1, "filter-examples-by-length"]], "Add custom prefix to examples": [[1, "add-custom-prefix-to-examples"]], "Add custom suffix to examples": [[1, "add-custom-suffix-to-examples"]], "Convert examples to uppercase": [[1, "convert-examples-to-uppercase"]], "Normalize punctuation": [[1, "normalize-punctuation"]], "Clean dataset": [[1, "clean-dataset"]], "Context / Doc aware transform": [[1, "context-doc-aware-transform"]], "Augment source segments with fuzzy matches for Neural Fuzzy Repair": [[1, "augment-source-segments-with-fuzzy-matches-for-neural-fuzzy-repair"]], "Augment source and target segments with inline tags": [[1, "augment-source-and-target-segments-with-inline-tags"]], "Make the model learn to use terminology": [[1, "make-the-model-learn-to-use-terminology"]], "Tokenization": [[1, "tokenization"]], "OpenNMT Tokenizer": [[1, "opennmt-tokenizer"]], "SentencePiece": [[1, "sentencepiece"]], "BPE subword-nmt": [[1, "bpe-subword-nmt"]], "BART-style noise": [[1, "bart-style-noise"]], "SwitchOut and sampling": [[1, "switchout-and-sampling"]], "SwitchOut": [[1, "switchout"]], "Drop some tokens": [[1, "drop-some-tokens"]], "Mask some tokens": [[1, "mask-some-tokens"]], "How can I create custom on-the-fly data transforms?": [[1, "how-can-i-create-custom-on-the-fly-data-transforms"]], "How to use LoRa and 8bit loading to finetune a big model ?": [[1, "how-to-use-lora-and-8bit-loading-to-finetune-a-big-model"]], "How to use gradient checkpointing when dealing with a big model ?": [[1, "how-to-use-gradient-checkpointing-when-dealing-with-a-big-model"]], "Can I get word alignments while translating?": [[1, "can-i-get-word-alignments-while-translating"]], "Raw alignments from averaging Transformer attention heads": [[1, "raw-alignments-from-averaging-transformer-attention-heads"], [9, "raw-alignments-from-averaging-transformer-attention-heads"]], "Supervised learning on a specific head": [[1, "supervised-learning-on-a-specific-head"], [9, "supervised-learning-on-a-specific-head"]], "How can I update a checkpoint\u2019s vocabulary?": [[1, "how-can-i-update-a-checkpoint-s-vocabulary"]], "How can I use source word features?": [[1, "how-can-i-use-source-word-features"]], "How can I set up a translation server ?": [[1, "how-can-i-set-up-a-translation-server"]], "I. How it works?": [[1, "i-how-it-works"]], "Configuration:": [[1, "configuration"]], "II. How to start the server without Docker ?": [[1, "ii-how-to-start-the-server-without-docker"]], "0. Get the code": [[1, "get-the-code"]], "1. Install flask": [[1, "install-flask"]], "2. 
Put some models": [[1, "put-some-models"]], "3. Start the server": [[1, "start-the-server"]], "III. How to start the server with Docker ?": [[1, "iii-how-to-start-the-server-with-docker"]], "IV. How to use the API ?": [[1, "iv-how-to-use-the-api"]], "0. Set the hostname": [[1, "set-the-hostname"]], "1. List models": [[1, "list-models"]], "2. Translate": [[1, "translate"]], "Versions": [[2, "versions"]], "Breaking changes": [[2, "breaking-changes"]], "Gated Graph Neural Networks": [[3, "gated-graph-neural-networks"]], "Dependencies": [[3, "dependencies"], [4, "dependencies"], [7, "dependencies"], [10, "dependencies"], [11, "dependencies"]], "Quick Start": [[3, "quick-start"], [10, "quick-start"], [11, "quick-start"]], "Graph data format": [[3, "graph-data-format"]], "Vocabulary notes": [[3, "vocabulary-notes"]], "Options": [[3, "options"], [10, "options"], [11, "options"]], "Acknowledgement": [[3, "acknowledgement"], [11, "acknowledgement"]], "Supervised Finetuning of llama 7B to replicate Vicuna": [[4, "supervised-finetuning-of-llama-7b-to-replicate-vicuna"]], "Data": [[4, "data"], [19, "Data"], [21, "Data"], [22, "Data"]], "Checkpoints": [[4, "checkpoints"]], "Vocabulary": [[4, "vocabulary"]], "Datasets": [[4, "datasets"]], "Finetuning": [[4, "finetuning"]], "Inference": [[4, "inference"], [5, "inference"]], "Concatenation of the checkpoints": [[4, "concatenation-of-the-checkpoints"]], "Conversion to ctranslate format": [[4, "conversion-to-ctranslate-format"]], "Multi-round conversations with vicuna": [[4, "multi-round-conversations-with-vicuna"]], "Simple inference": [[4, "simple-inference"]], "Summarization CNN/DM": [[5, "summarization-cnn-dm"]], "Preparing the data and vocab": [[5, "preparing-the-data-and-vocab"]], "Training": [[5, "training"], [9, "training"]], "Evaluation": [[5, "evaluation"]], "CNN-DM": [[5, "cnn-dm"], [5, "id1"]], "Gigaword": [[5, "gigaword"], [5, "id2"]], "Scores and Models": [[5, "scores-and-models"]], "References": [[5, "references"], [24, "references"]], "Language Model Wiki-103": [[6, "language-model-wiki-103"]], "Step 0: Download and clean the data": [[6, "step-0-download-and-clean-the-data"]], "Step 1: Prepare the subword model - BPE with pyonmttok": [[6, "step-1-prepare-the-subword-model-bpe-with-pyonmttok"]], "Step 2: Build the vocabulary": [[6, "step-2-build-the-vocabulary"]], "Language Model specificities": [[6, "language-model-specificities"]], "BPE specificities": [[6, "bpe-specificities"]], "Build vocabulary command": [[6, "build-vocabulary-command"]], "Step 3: Train the model": [[6, "step-3-train-the-model"]], "Step 4: Generate output": [[6, "step-4-generate-output"]], "Translation WMT17 en-de": [[7, "translation-wmt17-en-de"]], "PyTorch": [[7, "pytorch"]], "Apex": [[7, "apex"]], "Subword-NMT": [[7, "subword-nmt"]], "OpenNMT-py": [[7, "opennmt-py"]], "Running WMT17 EN-DE": [[7, "running-wmt17-en-de"]], "Get Data and prepare": [[7, "get-data-and-prepare"]], "Train": [[7, "train"], [21, "train"]], "Contents": [[8, "contents"]], "Getting Started": [[8, null]], "Frequently Asked Questions": [[8, null]], "Examples": [[8, null]], "Scripts": [[8, null]], "API": [[8, null]], "Legacy": [[8, null]], "FAQ (Legacy version)": [[9, "faq-legacy-version"]], "How do I use the Transformer model?": [[9, "how-do-i-use-the-transformer-model"]], "Preprocessing": [[9, "preprocessing"]], "Can I get word alignment while translating?": [[9, "can-i-get-word-alignment-while-translating"]], "Image to Text": [[10, "image-to-text"]], "Speech to Text": [[11, "speech-to-text"]], 
"Video to Text": [[12, "video-to-text"]], "Recurrent": [[12, "recurrent"]], "Transformer": [[12, "transformer"]], "Overview": [[13, "overview"]], "Installation": [[13, "installation"]], "Citation": [[13, "citation"]], "Additional resources": [[13, "additional-resources"]], "Framework": [[14, "framework"]], "Model": [[14, "model"], [22, "Model"]], "Trainer": [[14, "trainer"]], "Loss": [[14, "loss"]], "Optimizer": [[14, "optimizer"]], "Data Loaders": [[15, "data-loaders"]], "Data Iterator": [[15, "data-iterator"]], "Dataset": [[15, "dataset"]], "Modules": [[16, "modules"]], "Embeddings": [[16, "embeddings"], [21, "Embeddings"]], "Encoders": [[16, "encoders"]], "Decoders": [[16, "decoders"]], "Attention": [[16, "attention"]], "Server": [[17, "server"], [20, "server"]], "Models": [[17, "models"]], "Core Server": [[17, "core-server"]], "Translation": [[18, "translation"]], "Translations": [[18, "translations"]], "Translator Class": [[18, "translator-class"]], "Decoding Strategies": [[18, "decoding-strategies"]], "Scoring": [[18, "scoring"]], "Build Vocab": [[19, "build-vocab"]], "Configuration": [[19, "Configuration"], [21, "Configuration"], [22, "Configuration"]], "Vocab": [[19, "Vocab"], [21, "Vocab"]], "Features": [[19, "Features"], [21, "Features"], [22, "Features"]], "Transform/BART": [[19, "Transform/BART"], [21, "Transform/BART"], [22, "Transform/BART"]], "Transform/Terminology": [[19, "Transform/Terminology"], [21, "Transform/Terminology"], [22, "Transform/Terminology"]], "Transform/FuzzyMatching": [[19, "Transform/FuzzyMatching"], [21, "Transform/FuzzyMatching"], [22, "Transform/FuzzyMatching"]], "Transform/Filter": [[19, "Transform/Filter"], [21, "Transform/Filter"], [22, "Transform/Filter"]], "Transform/Prefix": [[19, "Transform/Prefix"], [21, "Transform/Prefix"], [22, "Transform/Prefix"]], "Transform/Suffix": [[19, "Transform/Suffix"], [21, "Transform/Suffix"], [22, "Transform/Suffix"]], "Transform/InsertMaskBeforePlaceholdersTransform": [[19, "Transform/InsertMaskBeforePlaceholdersTransform"], [21, "Transform/InsertMaskBeforePlaceholdersTransform"], [22, "Transform/InsertMaskBeforePlaceholdersTransform"]], "Transform/Clean": [[19, "Transform/Clean"], [21, "Transform/Clean"], [22, "Transform/Clean"]], "Transform/Uppercase": [[19, "Transform/Uppercase"], [21, "Transform/Uppercase"], [22, "Transform/Uppercase"]], "Transform/SwitchOut": [[19, "Transform/SwitchOut"], [21, "Transform/SwitchOut"], [22, "Transform/SwitchOut"]], "Transform/Token_Drop": [[19, "Transform/Token_Drop"], [21, "Transform/Token_Drop"], [22, "Transform/Token_Drop"]], "Transform/Token_Mask": [[19, "Transform/Token_Mask"], [21, "Transform/Token_Mask"], [22, "Transform/Token_Mask"]], "Transform/Docify": [[19, "Transform/Docify"], [21, "Transform/Docify"], [22, "Transform/Docify"]], "Transform/InferFeats": [[19, "Transform/InferFeats"], [21, "Transform/InferFeats"], [22, "Transform/InferFeats"]], "Transform/Subword/Common": [[19, "Transform/Subword/Common"], [21, "Transform/Subword/Common"], [22, "Transform/Subword/Common"]], "Transform/Subword/ONMTTOK": [[19, "Transform/Subword/ONMTTOK"], [21, "Transform/Subword/ONMTTOK"], [22, "Transform/Subword/ONMTTOK"]], "Transform/InlineTags": [[19, "Transform/InlineTags"], [21, "Transform/InlineTags"], [22, "Transform/InlineTags"]], "Transform/Normalize": [[19, "Transform/Normalize"], [21, "Transform/Normalize"], [22, "Transform/Normalize"]], "Reproducibility": [[19, "Reproducibility"], [21, "Reproducibility"], [22, "Reproducibility"]], "Named Arguments": [[20, "Named 
Arguments"]], "Pruning": [[21, "Pruning"]], "Distributed": [[21, "Distributed"], [22, "Distributed"]], "Model-Embeddings": [[21, "Model-Embeddings"]], "Model-Embedding Features": [[21, "Model-Embedding Features"]], "Model- Task": [[21, "Model- Task"]], "Model- Encoder-Decoder": [[21, "Model- Encoder-Decoder"]], "Model- Attention": [[21, "Model- Attention"]], "Model - Alignement": [[21, "Model - Alignement"]], "Generator": [[21, "Generator"]], "General": [[21, "General"]], "Initialization": [[21, "Initialization"]], "Optimization- Type": [[21, "Optimization- Type"]], "Optimization- Rate": [[21, "Optimization- Rate"]], "Logging": [[21, "Logging"], [22, "Logging"]], "Dynamic data": [[21, "Dynamic data"]], "Quant options": [[21, "Quant options"], [22, "Quant options"]], "Translate": [[22, "translate"]], "Beam Search": [[22, "Beam Search"]], "Random Sampling": [[22, "Random Sampling"]], "Penalties": [[22, "Penalties"]], "Decoding tricks": [[22, "Decoding tricks"]], "Efficiency": [[22, "Efficiency"]], "Quickstart": [[23, "quickstart"]], "How to train a model from scratch": [[23, "how-to-train-a-model-from-scratch"]], "Step 1: Prepare the data": [[23, "step-1-prepare-the-data"]], "Step 2: Train the model": [[23, "step-2-train-the-model"]], "Step 3: Translate": [[23, "step-3-translate"]], "How to generate with a pretrained LLM": [[23, "how-to-generate-with-a-pretrained-llm"]], "Step 1: Convert a model from Hugging Face Hub": [[23, "step-1-convert-a-model-from-hugging-face-hub"], [23, "id1"]], "Step 2: Prepare an inference.yaml config file": [[23, "step-2-prepare-an-inference-yaml-config-file"]], "Step 3: Generate text": [[23, "step-3-generate-text"]], "How to finetune a pretrained LLM": [[23, "how-to-finetune-a-pretrained-llm"]], "Step 2: Prepare an finetune.yaml config file": [[23, "step-2-prepare-an-finetune-yaml-config-file"]], "Step 3: Finetune": [[23, "step-3-finetune"]]}, "indexentries": {"adafactor (class in onmt.utils)": [[14, "onmt.utils.AdaFactor"]], "basemodel (class in onmt.models)": [[14, "onmt.models.BaseModel"]], "fusedadam (class in onmt.utils)": [[14, "onmt.utils.FusedAdam"]], "languagemodel (class in onmt.models)": [[14, "onmt.models.LanguageModel"]], "losscompute (class in onmt.utils.loss)": [[14, "onmt.utils.loss.LossCompute"]], "nmtmodel (class in onmt.models)": [[14, "onmt.models.NMTModel"]], "optimizer (class in onmt.utils)": [[14, "onmt.utils.Optimizer"]], "statistics (class in onmt.utils)": [[14, "onmt.utils.Statistics"]], "trainer (class in onmt.trainer)": [[14, "onmt.trainer.Trainer"]], "accuracy() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.accuracy"]], "all_gather_stats() (onmt.utils.statistics static method)": [[14, "onmt.utils.Statistics.all_gather_stats"]], "all_gather_stats_list() (onmt.utils.statistics static method)": [[14, "onmt.utils.Statistics.all_gather_stats_list"]], "amp (onmt.utils.optimizer property)": [[14, "onmt.utils.Optimizer.amp"]], "backward() (onmt.utils.optimizer method)": [[14, "onmt.utils.Optimizer.backward"]], "count_parameters() (onmt.models.languagemodel method)": [[14, "onmt.models.LanguageModel.count_parameters"]], "count_parameters() (onmt.models.nmtmodel method)": [[14, "onmt.models.NMTModel.count_parameters"]], "elapsed_time() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.elapsed_time"]], "forward() (onmt.models.basemodel method)": [[14, "onmt.models.BaseModel.forward"]], "forward() (onmt.models.languagemodel method)": [[14, "onmt.models.LanguageModel.forward"]], "forward() (onmt.models.nmtmodel 
method)": [[14, "onmt.models.NMTModel.forward"]], "forward() (onmt.utils.loss.losscompute method)": [[14, "onmt.utils.loss.LossCompute.forward"]], "from_opt() (onmt.utils.optimizer class method)": [[14, "onmt.utils.Optimizer.from_opt"]], "from_opts() (onmt.utils.loss.losscompute class method)": [[14, "onmt.utils.loss.LossCompute.from_opts"]], "ignore_prompt() (onmt.utils.loss.losscompute method)": [[14, "onmt.utils.loss.LossCompute.ignore_prompt"]], "learning_rate() (onmt.utils.optimizer method)": [[14, "onmt.utils.Optimizer.learning_rate"]], "load_safe_state_dict() (onmt.models.basemodel method)": [[14, "onmt.models.BaseModel.load_safe_state_dict"]], "load_state_dict() (onmt.models.basemodel method)": [[14, "onmt.models.BaseModel.load_state_dict"]], "log_tensorboard() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.log_tensorboard"]], "output() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.output"]], "ppl() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.ppl"]], "step() (onmt.utils.adafactor method)": [[14, "onmt.utils.AdaFactor.step"]], "step() (onmt.utils.fusedadam method)": [[14, "onmt.utils.FusedAdam.step"]], "step() (onmt.utils.optimizer method)": [[14, "onmt.utils.Optimizer.step"]], "train() (onmt.trainer.trainer method)": [[14, "onmt.trainer.Trainer.train"]], "training_step (onmt.utils.optimizer property)": [[14, "onmt.utils.Optimizer.training_step"]], "update() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.update"]], "validate() (onmt.trainer.trainer method)": [[14, "onmt.trainer.Trainer.validate"]], "xent() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.xent"]], "zero_grad() (onmt.utils.optimizer method)": [[14, "onmt.utils.Optimizer.zero_grad"]], "dynamicdatasetiter (class in onmt.inputters)": [[15, "onmt.inputters.DynamicDatasetIter"]], "mixingstrategy (class in onmt.inputters)": [[15, "onmt.inputters.MixingStrategy"]], "parallelcorpus (class in onmt.inputters)": [[15, "onmt.inputters.ParallelCorpus"]], "parallelcorpusiterator (class in onmt.inputters)": [[15, "onmt.inputters.ParallelCorpusIterator"]], "sequentialmixer (class in onmt.inputters)": [[15, "onmt.inputters.SequentialMixer"]], "weightedmixer (class in onmt.inputters)": [[15, "onmt.inputters.WeightedMixer"]], "batch_iter() (onmt.inputters.dynamicdatasetiter method)": [[15, "onmt.inputters.DynamicDatasetIter.batch_iter"]], "from_opt() (onmt.inputters.dynamicdatasetiter class method)": [[15, "onmt.inputters.DynamicDatasetIter.from_opt"]], "load() (onmt.inputters.parallelcorpus method)": [[15, "onmt.inputters.ParallelCorpus.load"]], "averageattention (class in onmt.modules)": [[16, "onmt.modules.AverageAttention"]], "cnndecoder (class in onmt.decoders)": [[16, "onmt.decoders.CNNDecoder"]], "cnnencoder (class in onmt.encoders)": [[16, "onmt.encoders.CNNEncoder"]], "convmultistepattention (class in onmt.modules)": [[16, "onmt.modules.ConvMultiStepAttention"]], "copygenerator (class in onmt.modules)": [[16, "onmt.modules.CopyGenerator"]], "decoderbase (class in onmt.decoders)": [[16, "onmt.decoders.DecoderBase"]], "embeddings (class in onmt.modules)": [[16, "onmt.modules.Embeddings"]], "encoderbase (class in onmt.encoders)": [[16, "onmt.encoders.EncoderBase"]], "ggnnencoder (class in onmt.encoders)": [[16, "onmt.encoders.GGNNEncoder"]], "globalattention (class in onmt.modules)": [[16, "onmt.modules.GlobalAttention"]], "inputfeedrnndecoder (class in onmt.decoders)": [[16, "onmt.decoders.InputFeedRNNDecoder"]], "matrixtree (class in 
onmt.modules.structured_attention)": [[16, "onmt.modules.structured_attention.MatrixTree"]], "meanencoder (class in onmt.encoders)": [[16, "onmt.encoders.MeanEncoder"]], "multiheadedattention (class in onmt.modules)": [[16, "onmt.modules.MultiHeadedAttention"]], "positionalencoding (class in onmt.modules)": [[16, "onmt.modules.PositionalEncoding"]], "positionwisefeedforward (class in onmt.modules.position_ffn)": [[16, "onmt.modules.position_ffn.PositionwiseFeedForward"]], "rnndecoderbase (class in onmt.decoders.decoder)": [[16, "onmt.decoders.decoder.RNNDecoderBase"]], "rnnencoder (class in onmt.encoders)": [[16, "onmt.encoders.RNNEncoder"]], "stdrnndecoder (class in onmt.decoders)": [[16, "onmt.decoders.StdRNNDecoder"]], "transformerdecoder (class in onmt.decoders)": [[16, "onmt.decoders.TransformerDecoder"]], "transformerencoder (class in onmt.encoders)": [[16, "onmt.encoders.TransformerEncoder"]], "apply_mask() (onmt.modules.convmultistepattention method)": [[16, "onmt.modules.ConvMultiStepAttention.apply_mask"]], "emb_luts (onmt.modules.embeddings property)": [[16, "onmt.modules.Embeddings.emb_luts"]], "forward() (onmt.decoders.cnndecoder method)": [[16, "onmt.decoders.CNNDecoder.forward"]], "forward() (onmt.decoders.transformerdecoder method)": [[16, "onmt.decoders.TransformerDecoder.forward"]], "forward() (onmt.decoders.decoder.rnndecoderbase method)": [[16, "onmt.decoders.decoder.RNNDecoderBase.forward"]], "forward() (onmt.encoders.cnnencoder method)": [[16, "onmt.encoders.CNNEncoder.forward"]], "forward() (onmt.encoders.encoderbase method)": [[16, "onmt.encoders.EncoderBase.forward"]], "forward() (onmt.encoders.ggnnencoder method)": [[16, "onmt.encoders.GGNNEncoder.forward"]], "forward() (onmt.encoders.meanencoder method)": [[16, "onmt.encoders.MeanEncoder.forward"]], "forward() (onmt.encoders.rnnencoder method)": [[16, "onmt.encoders.RNNEncoder.forward"]], "forward() (onmt.encoders.transformerencoder method)": [[16, "onmt.encoders.TransformerEncoder.forward"]], "forward() (onmt.modules.averageattention method)": [[16, "onmt.modules.AverageAttention.forward"]], "forward() (onmt.modules.convmultistepattention method)": [[16, "onmt.modules.ConvMultiStepAttention.forward"]], "forward() (onmt.modules.copygenerator method)": [[16, "onmt.modules.CopyGenerator.forward"]], "forward() (onmt.modules.embeddings method)": [[16, "onmt.modules.Embeddings.forward"]], "forward() (onmt.modules.globalattention method)": [[16, "onmt.modules.GlobalAttention.forward"]], "forward() (onmt.modules.multiheadedattention method)": [[16, "onmt.modules.MultiHeadedAttention.forward"]], "forward() (onmt.modules.positionalencoding method)": [[16, "onmt.modules.PositionalEncoding.forward"]], "forward() (onmt.modules.position_ffn.positionwisefeedforward method)": [[16, "onmt.modules.position_ffn.PositionwiseFeedForward.forward"]], "forward() (onmt.modules.structured_attention.matrixtree method)": [[16, "onmt.modules.structured_attention.MatrixTree.forward"]], "from_opt() (onmt.decoders.cnndecoder class method)": [[16, "onmt.decoders.CNNDecoder.from_opt"]], "from_opt() (onmt.decoders.decoderbase class method)": [[16, "onmt.decoders.DecoderBase.from_opt"]], "from_opt() (onmt.decoders.decoder.rnndecoderbase class method)": [[16, "onmt.decoders.decoder.RNNDecoderBase.from_opt"]], "from_opt() (onmt.encoders.cnnencoder class method)": [[16, "onmt.encoders.CNNEncoder.from_opt"]], "from_opt() (onmt.encoders.ggnnencoder class method)": [[16, "onmt.encoders.GGNNEncoder.from_opt"]], "from_opt() (onmt.encoders.meanencoder 
class method)": [[16, "onmt.encoders.MeanEncoder.from_opt"]], "from_opt() (onmt.encoders.rnnencoder class method)": [[16, "onmt.encoders.RNNEncoder.from_opt"]], "from_opt() (onmt.encoders.transformerencoder class method)": [[16, "onmt.encoders.TransformerEncoder.from_opt"]], "init_state() (onmt.decoders.cnndecoder method)": [[16, "onmt.decoders.CNNDecoder.init_state"]], "init_state() (onmt.decoders.decoder.rnndecoderbase method)": [[16, "onmt.decoders.decoder.RNNDecoderBase.init_state"]], "load_pretrained_vectors() (onmt.modules.embeddings method)": [[16, "onmt.modules.Embeddings.load_pretrained_vectors"]], "score() (onmt.modules.globalattention method)": [[16, "onmt.modules.GlobalAttention.score"]], "word_lut (onmt.modules.embeddings property)": [[16, "onmt.modules.Embeddings.word_lut"]], "servermodel (class in onmt.translate.translation_server)": [[17, "onmt.translate.translation_server.ServerModel"]], "servermodelerror": [[17, "onmt.translate.translation_server.ServerModelError"]], "timer (class in onmt.translate.translation_server)": [[17, "onmt.translate.translation_server.Timer"]], "translationserver (class in onmt.translate.translation_server)": [[17, "onmt.translate.translation_server.TranslationServer"]], "build_tokenizer() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.build_tokenizer"]], "clone_model() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.clone_model"]], "detokenize() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.detokenize"]], "do_timeout() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.do_timeout"]], "list_models() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.list_models"]], "load_model() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.load_model"]], "maybe_convert_align() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_convert_align"]], "maybe_detokenize() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_detokenize"]], "maybe_detokenize_with_align() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_detokenize_with_align"]], "maybe_postprocess() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_postprocess"]], "maybe_preprocess() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_preprocess"]], "maybe_tokenize() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_tokenize"]], "maybe_transform_feats() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_transform_feats"]], "parse_opt() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.parse_opt"]], "postprocess() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.postprocess"]], "preload_model() 
(onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.preload_model"]], "preprocess() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.preprocess"]], "rebuild_seg_packages() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.rebuild_seg_packages"]], "run() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.run"]], "start() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.start"]], "to_gpu() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.to_gpu"]], "tokenize() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.tokenize"]], "tokenizer_marker() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.tokenizer_marker"]], "unload_model() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.unload_model"]], "beamsearch (class in onmt.translate)": [[18, "onmt.translate.BeamSearch"]], "decodestrategy (class in onmt.translate)": [[18, "onmt.translate.DecodeStrategy"]], "gnmtglobalscorer (class in onmt.translate)": [[18, "onmt.translate.GNMTGlobalScorer"]], "greedysearch (class in onmt.translate)": [[18, "onmt.translate.GreedySearch"]], "penaltybuilder (class in onmt.translate.penalties)": [[18, "onmt.translate.penalties.PenaltyBuilder"]], "translation (class in onmt.translate)": [[18, "onmt.translate.Translation"]], "translationbuilder (class in onmt.translate)": [[18, "onmt.translate.TranslationBuilder"]], "translator (class in onmt.translate)": [[18, "onmt.translate.Translator"]], "advance() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.advance"]], "advance() (onmt.translate.greedysearch method)": [[18, "onmt.translate.GreedySearch.advance"]], "block_ngram_repeats() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.block_ngram_repeats"]], "coverage_none() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.coverage_none"]], "coverage_summary() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.coverage_summary"]], "coverage_wu() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.coverage_wu"]], "initialize() (onmt.translate.beamsearch method)": [[18, "onmt.translate.BeamSearch.initialize"]], "initialize() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.initialize"]], "initialize() (onmt.translate.greedysearch method)": [[18, "onmt.translate.GreedySearch.initialize"]], "length_average() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.length_average"]], "length_none() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.length_none"]], "length_wu() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.length_wu"]], "log() (onmt.translate.translation method)": [[18, "onmt.translate.Translation.log"]], "maybe_update_forbidden_tokens() (onmt.translate.decodestrategy 
method)": [[18, "onmt.translate.DecodeStrategy.maybe_update_forbidden_tokens"]], "maybe_update_target_prefix() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.maybe_update_target_prefix"]], "sample_with_temperature() (in module onmt.translate.greedy_search)": [[18, "onmt.translate.greedy_search.sample_with_temperature"]], "target_prefixing() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.target_prefixing"]], "translate_batch() (onmt.translate.translator method)": [[18, "onmt.translate.Translator.translate_batch"]], "update_finished() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.update_finished"]], "update_finished() (onmt.translate.greedysearch method)": [[18, "onmt.translate.GreedySearch.update_finished"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["CONTRIBUTING", "FAQ", "changes", "examples/ggnn/GGNN", "examples/replicate_vicuna/ReplicateVicuna", "examples/summary/Summarization", "examples/wiki_103/LanguageModelGeneration", "examples/wmt17/Translation", "index", "legacy/FAQ", "legacy/im2text", "legacy/speech2text", "legacy/vid2text", "main", "onmt", "onmt.inputters", "onmt.modules", "onmt.translate.translation_server", "onmt.translation", "options/build_vocab", "options/server", "options/train", "options/translate", "quickstart", "ref"], "filenames": ["CONTRIBUTING.md", "FAQ.md", "changes.md", "examples/ggnn/GGNN.md", "examples/replicate_vicuna/ReplicateVicuna.md", "examples/summary/Summarization.md", "examples/wiki_103/LanguageModelGeneration.md", "examples/wmt17/Translation.md", "index.rst", "legacy/FAQ.md", "legacy/im2text.md", "legacy/speech2text.md", "legacy/vid2text.rst", "main.md", "onmt.rst", "onmt.inputters.rst", "onmt.modules.rst", "onmt.translate.translation_server.rst", "onmt.translation.rst", "options/build_vocab.rst", "options/server.rst", "options/train.rst", "options/translate.rst", "quickstart.md", "ref.rst"], "titles": ["Contributors", "How do I use my v2 models in v3 ?", "Versions", "Gated Graph Neural Networks", "Supervised Finetuning of llama 7B to replicate Vicuna", "Summarization CNN/DM", "Language Model Wiki-103", "Translation WMT17 en-de", "Contents", "FAQ (Legacy version)", "Image to Text", "Speech to Text", "Video to Text", "Overview", "Framework", "Data Loaders", "Modules", "Server", "Translation", "Build Vocab", "Server", "Train", "Translate", "Quickstart", "References"], "terms": {"opennmt": [0, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 20, 23], "py": [0, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 19, 20, 21, 22, 23], "i": [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24], "commun": [0, 1, 9], "develop": [0, 1, 2], "project": [0, 13, 16], "we": [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 16, 17, 18, 21, 23], "love": 0, "contribut": 0, "befor": [0, 1, 3, 5, 9, 14, 17, 18, 22, 23], "send": [0, 1, 9, 21], "pr": [0, 2], "pleas": [0, 1, 3, 5, 9, 13], "do": [0, 5, 8, 12, 17, 18, 21, 22, 23], "thi": [0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 18, 19, 21, 22, 23], "checklist": 0, "first": [0, 1, 3, 4, 9, 11, 12, 16, 18, 21], "instal": [0, 4, 5, 7, 8, 10, 11, 12, 14], "black": 0, "22": [0, 1, 6], "12": [0, 1, 3, 5, 9, 19, 21, 22], "0": [0, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "run": [0, 1, 2, 3, 4, 5, 6, 8, 9, 12, 14, 16, 17, 21, 22, 23], "format": [0, 1, 2, 5, 8, 9, 12, 17, 19, 21, 22, 23], "your": [0, 1, 2, 4, 5, 9, 23], "chang": [0, 1, 4, 5, 8, 12, 14, 21], "accord": [0, 1, 2], "our": [0, 4, 
11, 12, 13, 18], "standard": [0, 1, 5, 9, 16, 21, 22], "onmt": [0, 1, 4, 6, 7, 14, 15, 16, 17, 18, 19, 21, 23], "test": [0, 1, 3, 4, 5, 6, 7, 10, 12, 21, 22, 23], "pull_request_chk": 0, "sh": [0, 3, 6, 7, 12], "fix": [0, 1, 5, 18, 21], "ani": [0, 1, 2, 3, 5, 14, 18, 19, 21, 22, 23], "error": [0, 19, 21, 22], "when": [0, 5, 8, 9, 13, 15, 16, 18, 19, 21, 22, 23], "ad": [0, 1, 3, 5, 14, 19, 21, 22], "new": [0, 1, 2, 4, 12, 23], "function": [0, 1, 4, 14, 15, 16, 17, 18, 21], "also": [0, 1, 4, 5, 12, 14, 16, 21, 23], "add": [0, 9, 12, 14, 15, 16, 21, 22, 23], "script": [0, 1, 4, 5, 9, 12, 19, 21, 22, 23], "includ": [0, 1, 3, 4, 5, 16, 19, 21, 22], "check": [0, 2, 4, 5, 7, 14], "flake8": 0, "code": [0, 3, 5, 12, 19, 21, 22], "style": [0, 3, 8, 16, 21], "unittest": 0, "continu": [0, 1, 12], "integr": [0, 1, 3], "list": [0, 3, 9, 12, 14, 16, 17, 18, 19, 21, 22], "github": [0, 1, 3, 4, 5, 7, 11, 13, 16, 21], "workflow": [0, 13], "push": 0, "yml": 0, "modifi": [0, 1, 2, 4, 5, 9, 14, 18], "class": [0, 1, 2, 8, 14, 15, 16, 17], "constructor": [0, 16], "make": [0, 2, 4, 5, 9, 12, 15, 19, 21, 22], "argument": [0, 1, 8, 12, 14], "same": [0, 1, 2, 5, 6, 9, 12, 14, 16, 17, 19, 21, 22, 23], "name": [0, 1, 4, 7, 8, 12, 15, 18, 19, 21, 22], "its": [0, 1, 3, 4, 14, 16], "superclass": 0, "pytorch": [0, 1, 2, 3, 5, 11, 13, 14, 16, 21, 22, 23], "If": [0, 1, 2, 3, 5, 9, 12, 13, 14, 16, 17, 18, 21, 22, 23], "base": [0, 1, 2, 3, 5, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 22, 24], "paper": [0, 1, 2, 3, 5, 9, 10, 12, 14, 16, 21], "clear": [0, 1], "comment": 0, "refer": [0, 1, 3, 8, 9, 12], "more": [0, 1, 2, 3, 9, 10, 12, 18, 19, 21, 22, 23], "below": [0, 3, 4, 5, 23], "abov": [0, 3, 9, 10, 14, 18], "all": [0, 1, 2, 3, 4, 11, 12, 14, 16, 18, 19, 21, 22, 23, 24], "try": [0, 1, 5, 12, 23], "follow": [0, 1, 3, 4, 5, 6, 7, 9, 12, 13, 22, 23], "googl": [0, 1, 9, 18, 21, 24], "napoleon": 0, "exampl": [0, 2, 3, 4, 5, 6, 7, 10, 11, 12, 14, 15, 19, 21, 22, 23], "styleguid": 0, "easi": 0, "sphinx": 0, "document": [0, 3, 5, 10, 11, 13], "And": [0, 1, 12, 13, 16], "feel": [0, 2], "free": [0, 2, 17], "autodoc": 0, "api": 0, "rst": 0, "file": [0, 1, 3, 4, 5, 6, 9, 10, 11, 12, 15, 17, 19, 21, 22], "doc": [0, 2, 6, 7, 19, 21, 22], "sourc": [0, 3, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23], "folder": [0, 1, 4, 5, 12, 21, 23], "you": [0, 2, 4, 5, 8, 10, 12, 13, 16, 21, 22, 23, 24], "addit": [0, 1, 3, 5, 8, 9, 14, 16, 19, 21, 22], "look": [0, 1, 5, 9, 10, 11, 13, 16, 22], "right": 0, "how": [0, 2, 4, 5, 8, 12, 13, 15, 16], "build": [0, 1, 8, 9, 14, 17, 18, 23], "local": [0, 4, 5, 7], "cd": [0, 3, 4, 7, 12, 13, 23], "some": [0, 4, 12, 13, 14, 16, 19, 21, 22, 23], "depend": [0, 1, 2, 8, 12, 14, 16, 17], "necessari": [0, 1, 3, 14, 18, 21, 22, 23], "recommonmark": 0, "sphinx_rtd_them": 0, "sphinxcontrib": 0, "bibtex": 0, "pip": [0, 1, 4, 7, 10, 11, 12, 13], "requir": [0, 1, 2, 3, 6, 9, 13, 14, 16, 19, 21, 23], "txt": [0, 1, 3, 4, 5, 6, 9, 10, 11, 12, 13, 22, 23], "html": [0, 21], "firefox": 0, "main": [0, 1, 13, 14, 19, 21, 22], "browser": 0, "choic": [0, 1, 5, 15, 16, 19, 21, 22], "particular": [0, 16], "advic": [0, 1], "python": [0, 1, 3, 5, 9, 12, 13, 14, 21, 23], "3": [0, 3, 4, 5, 7, 8, 9, 12, 14, 19, 21, 22], "type": [0, 1, 3, 5, 6, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 22], "modul": [0, 1, 2, 8, 14, 21], "convent": 0, "except": [0, 1, 12, 17, 19, 21, 22], "us": [0, 2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23], "instead": [0, 1, 2, 5, 12, 14, 16, 19, 21, 22], "union": 0, "readabl": 0, 
"For": [0, 1, 2, 3, 4, 5, 14, 18, 21, 23], "extern": 0, "full": [0, 1, 3, 5, 9, 16, 17, 19, 21, 22], "import": [0, 1, 5, 6, 12], "common": [0, 1, 5, 8], "abbrevi": 0, "e": [0, 4, 8, 12, 13, 14, 17, 21, 24], "g": [0, 4, 5, 7, 8, 12, 13, 14, 21], "np": 0, "ar": [0, 2, 3, 4, 5, 6, 8, 9, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23, 24], "accept": [0, 2, 18, 19, 21, 22, 23], "torch": [0, 7, 9, 14, 16, 21, 22], "tensor": [0, 14, 15, 16, 18], "option": [0, 1, 2, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 18, 19], "don": [0, 1, 12, 18], "t": [0, 1, 5, 6, 9, 12, 16, 18, 24], "tic": 0, "like": [0, 3, 5, 10, 11, 12, 18, 22, 23], "str": [0, 1, 3, 12, 14, 15, 16, 17, 18], "direct": [0, 5, 18], "obj": [0, 14], "handl": [0, 1, 4, 5, 12, 14], "veri": [0, 1, 2, 9], "well": [0, 1, 3, 4, 5, 9, 21, 23], "without": [0, 4, 5, 6, 8, 16, 21, 23], "help": [0, 1, 4, 5, 12, 13, 22, 23], "so": [0, 1, 5, 9, 12, 21], "avoid": [0, 1, 2, 21], "clutter": 0, "support": [0, 2, 3, 4, 8, 14, 16, 21], "multipl": [0, 1, 2, 3, 12, 14, 15, 16, 21, 22], "return": [0, 1, 6, 12, 14, 16, 17, 18], "work": [0, 3, 5, 8, 10, 11, 12, 13, 18, 21], "still": [0, 1, 2], "def": [0, 1, 12], "foo": 0, "b": [0, 1, 3, 4, 9, 18], "my": [0, 8], "arg": [0, 6, 12, 16, 17, 18], "object": [0, 1, 3, 14, 15, 17, 18, 19, 21], "someth": 0, "anoth": [0, 14, 23], "thing": [0, 1, 2], "rather": [0, 1, 9], "long": [0, 1, 3, 5, 14], "descript": [0, 4], "spill": 0, "over": [0, 1, 3, 5, 6, 9, 12, 14, 16, 18, 21, 22], "cite": [0, 5, 13], "directli": [0, 1, 3, 5, 16, 22, 23], "link": [0, 1, 4, 5, 10, 12], "entri": [0, 1, 19, 21, 22], "ref": [0, 21], "bib": 0, "attent": [0, 2, 3, 4, 5, 6, 8, 11, 12, 14, 18, 22, 23, 24], "need": [0, 1, 2, 3, 4, 5, 6, 9, 12, 13, 16, 21, 22, 23, 24], "visit": [0, 12], "arxiv": [0, 1, 21, 22, 24], "choos": [0, 12, 23], "bibtext": 0, "search": [0, 1, 5, 8, 12, 18, 23], "ctrl": 0, "f": [0, 4, 5, 6, 12], "dblp": 0, "journal": 0, "corr": [0, 24], "vaswanispujgkp17": 0, "find": [0, 5, 12, 13, 23], "copi": [0, 1, 3, 4, 5, 12, 14, 15, 16, 21, 22], "past": [0, 1, 9, 21], "citat": [0, 8], "Then": [0, 12, 14, 16, 23], "howev": [0, 1, 2, 4, 5, 12, 14, 16], "better": [0, 1, 2, 5, 9, 19, 21, 22], "than": [0, 1, 2, 3, 7, 9, 12, 16, 18, 19, 21, 22, 23], "noth": [0, 14], "shape": [0, 18], "prefer": [0, 1, 9], "c": [0, 1, 5, 9, 10, 11, 16], "read": [0, 1, 12, 13, 17], "allow": [0, 1, 2, 3, 5, 15, 19, 21, 22], "x": [0, 1, 5, 6, 9, 10, 14, 16, 18, 21], "multplic": 0, "few": [0, 1, 2, 4, 12], "variat": 0, "parenthes": 0, "allennlp": 0, "exactli": 0, "fairseq": [0, 1, 19, 21], "singl": [0, 1, 6, 7, 12, 14, 17, 19, 21, 22, 23], "tick": 0, "again": [0, 12], "differ": [0, 4, 5, 6, 8, 12, 16, 17, 21, 22], "unnecessari": 0, "space": [0, 1, 12, 21, 23], "charact": [0, 1, 10, 11, 19, 21, 22], "capit": 0, "punctuat": [0, 12], "multi": [0, 8, 16], "line": [0, 1, 3, 4, 5, 6, 9, 10, 11, 12, 15, 19, 21, 22, 23], "blank": [0, 1, 9, 12, 19, 21], "after": [0, 1, 3, 5, 12, 16, 18, 21], "close": [0, 1, 12, 19, 21, 22], "quot": 0, "Not": [0, 1], "note": [0, 1, 2, 4, 5, 8, 9, 12, 18, 23], "least": [0, 1, 4, 18], "focu": 0, "content": [0, 1, 4, 22, 23], "rememb": 0, "consist": [0, 1, 10, 16, 23], "good": [0, 1, 9, 12, 21], "Be": [0, 1], "sensibl": [0, 1], "about": [0, 1, 6], "gener": [0, 3, 4, 5, 8, 9, 11, 12, 14, 15, 16, 18, 22, 24], "one": [0, 1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 16, 19, 21, 22, 23], "stand": 0, "alon": 0, "summari": [0, 5, 18, 22], "per": [0, 1, 2, 3, 5, 9, 10, 11, 12, 16, 19, 21, 22, 23], "sometim": [0, 1], "": [0, 4, 5, 7, 8, 9, 10, 12, 18, 19, 21, 22, 23, 24], "cut": 
[0, 6], "an": [0, 1, 3, 4, 5, 6, 9, 10, 11, 12, 14, 15, 16, 19, 21, 22, 24], "extend": [0, 1, 5, 16], "It": [0, 1, 2, 4, 5, 9, 10, 11, 12, 13, 14, 16, 17, 23], "alwai": [0, 1, 16], "have": [0, 1, 3, 4, 5, 7, 9, 12, 14, 16, 18, 21, 23], "trail": 0, "yaml": [1, 3, 4, 5, 6, 7, 19, 21, 22], "partial": 1, "To": [1, 2, 3, 4, 5, 6, 9, 10, 11, 16, 23, 24], "overview": [1, 8], "quickstart": [1, 8, 13], "section": [1, 5, 6, 12, 23], "tutori": [1, 4, 12, 13], "As": [1, 4], "remind": [1, 2], "reli": [1, 2, 9], "torchtext": [1, 2], "5": [1, 3, 4, 5, 7, 9, 10, 12, 18, 21, 22], "version": [1, 4, 5, 7, 8, 10, 11, 12, 16, 17, 18], "field": [1, 5, 14], "rawfield": 1, "multifield": 1, "which": [1, 2, 3, 4, 5, 9, 14, 15, 16, 18, 21, 23], "were": [1, 2, 23], "deprec": [1, 22], "In": [1, 3, 5, 6, 12, 16, 22, 23, 24], "order": [1, 9, 14], "old": [1, 2, 12], "mimic": [1, 9], "those": [1, 9, 21], "result": [1, 3, 5, 6, 9, 12, 17, 21, 23], "newer": 1, "13": [1, 5], "14": [1, 4, 21], "convers": [1, 10, 11, 18], "elimin": 1, "complet": [1, 4, 10, 18], "perfom": [1, 21], "tool": [1, 2, 4, 9, 12, 23], "convertv2": [1, 2], "_": [1, 2, 4, 6, 9, 10, 12, 16, 23], "v2model": 1, "myoldmodel": 1, "pt": [1, 3, 4, 6, 7, 9, 10, 11, 12, 21, 22, 23], "v3model": 1, "newmodel": 1, "The": [1, 2, 3, 4, 5, 6, 9, 10, 11, 12, 14, 16, 17, 18, 19, 21, 22, 23], "longer": [1, 2, 5, 22], "kei": [1, 12, 14, 16, 21, 23], "replac": [1, 4, 12, 18, 19, 21, 22], "vocab": [1, 2, 3, 4, 6, 8, 9, 14, 15, 16, 18, 23], "rnn_size": [1, 2, 9, 12], "now": [1, 2, 5, 12, 23], "hidden_s": [1, 2, 3, 5, 16, 21, 23], "enc_rnn_siz": [1, 2, 11], "enc_hid_s": [1, 2, 21], "dec_rnn_siz": [1, 2, 11], "dec_hid_s": [1, 2, 21], "A": [1, 3, 4, 5, 9, 10, 11, 14, 15, 16, 21, 24], "add_qkvbia": [1, 2, 16, 21, 23], "true": [1, 2, 3, 4, 5, 6, 9, 12, 14, 15, 16, 18, 19, 21, 22, 23], "default": [1, 2, 9, 11, 12, 14, 16, 17, 19, 20, 21, 22, 23], "fals": [1, 2, 3, 5, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "gpt2": 1, "languag": [1, 5, 8, 9, 19, 21, 22], "lm": [1, 6, 14, 21, 23], "where": [1, 3, 4, 5, 9, 12, 15, 16, 18, 19, 21, 22, 23], "onli": [1, 2, 3, 5, 6, 9, 12, 14, 15, 16, 18, 19, 21, 22, 23], "unk": [1, 18, 19, 21, 22], "flag": [1, 3, 5, 14, 23], "structur": [1, 2, 3, 12, 16, 24], "sensit": [1, 9, 12], "hyperparamet": [1, 9, 12], "effect": [1, 5, 9, 16, 17, 19], "setup": [1, 9, 13, 14, 16], "confirm": [1, 9], "replic": [1, 5, 8, 9, 12, 16], "wmt": [1, 9], "wmt17": [1, 8], "en": [1, 8, 9, 21], "de": [1, 8, 9, 17], "opt": [1, 5, 6, 13, 14, 15, 16, 17, 21, 23], "save_model": [1, 3, 5, 9, 10, 11, 12, 21, 23], "mybasemodel": 1, "save_checkpoint_step": [1, 3, 9, 12, 14, 21, 23], "10000": [1, 3, 5, 9, 12, 14, 16, 21, 22, 23], "valid_step": [1, 9, 12, 14, 21, 23], "train_step": [1, 3, 5, 9, 11, 12, 14, 21, 23], "200000": [1, 5, 9], "batch": [1, 2, 9, 12, 14, 15, 16, 18, 21, 22, 23], "bucket_s": [1, 2, 15, 21, 23], "262144": [1, 21], "world_siz": [1, 5, 9, 12, 21, 22, 23], "4": [1, 2, 3, 5, 7, 8, 9, 11, 12, 19, 21, 22, 23], "gpu_rank": [1, 3, 5, 9, 10, 11, 12, 14, 21, 22, 23], "num_work": [1, 2, 21, 23], "batch_typ": [1, 5, 7, 9, 15, 21, 22, 23], "batch_siz": [1, 3, 5, 7, 9, 10, 11, 12, 15, 16, 18, 21, 22, 23], "4096": [1, 4, 5, 7, 9, 14, 23], "valid_batch_s": [1, 5, 21, 23], "2048": [1, 9, 12, 15, 21], "accum_count": [1, 5, 9, 14, 21, 23], "accum_step": [1, 14, 21, 23], "optim": [1, 2, 5, 8, 9, 11, 12, 23], "model_dtyp": [1, 14, 21, 23], "fp16": [1, 2, 14, 21, 22, 23], "adam": [1, 5, 9, 11, 12, 14, 21], "learning_r": [1, 3, 5, 9, 10, 11, 12, 14, 21, 23], "warmup_step": [1, 5, 9, 
21, 23], "8000": [1, 5, 9, 12], "decay_method": [1, 5, 9, 21, 23], "noam": [1, 5, 9, 21, 24], "adam_beta2": [1, 5, 9, 21, 23], "998": [1, 5, 7, 9, 23], "max_grad_norm": [1, 5, 9, 10, 11, 14, 21, 23], "label_smooth": [1, 5, 9, 21, 23], "param_init": [1, 5, 9, 12, 21, 23], "param_init_glorot": [1, 5, 9, 12, 21, 23], "encoder_typ": [1, 3, 5, 6, 9, 10, 12, 21, 23], "decoder_typ": [1, 3, 5, 6, 9, 12, 21, 23], "position_encod": [1, 5, 9, 12, 16, 21, 23], "enc_lay": [1, 11, 21], "6": [1, 3, 9], "dec_lay": [1, 11, 21, 23], "8": [1, 2, 3, 5, 9, 11, 12, 14, 21, 22, 23], "512": [1, 4, 5, 9, 11, 12, 21], "word_vec_s": [1, 5, 9, 10, 12, 16, 21, 23], "transformer_ff": [1, 9, 12, 21, 23], "dropout_step": [1, 14, 21, 23], "dropout": [1, 5, 9, 11, 12, 14, 16, 19, 21, 22, 23], "attention_dropout": [1, 5, 14, 16, 21, 23], "here": [1, 2, 4, 5, 6, 9, 12, 13, 18, 23], "most": [1, 2, 5, 12, 14, 18, 19, 21, 22], "paramet": [1, 3, 5, 6, 9, 12, 14, 15, 16, 17, 18, 19, 21, 22, 23], "mean": [1, 2, 5, 9, 16, 17, 21, 23], "correct": [1, 9, 12, 14], "initi": [1, 3, 5, 8, 9, 12, 14, 15, 16, 17, 18, 19], "sinusoid": [1, 9, 16, 21], "each": [1, 2, 3, 4, 9, 12, 14, 15, 16, 18, 19, 21, 22, 23], "rate": [1, 8, 9, 11, 14], "number": [1, 2, 3, 5, 9, 12, 14, 15, 16, 18, 19, 21, 22, 23], "sentenc": [1, 3, 5, 9, 18, 19, 21, 22, 23], "comput": [1, 2, 3, 4, 5, 9, 14, 16, 21, 22], "four": [1, 9, 10], "label": [1, 9, 10, 11, 14, 21], "smooth": [1, 7, 9, 19, 21, 22], "loss": [1, 5, 8, 9, 21, 23], "batch_size_multipl": [1, 15, 21, 23], "vocab_size_multipl": [1, 21], "num": [1, 2], "worker": [1, 2], "dure": [1, 2, 3, 4, 5, 16, 17, 21, 22], "decai": [1, 2, 14, 21], "system": [1, 2, 3, 10, 11, 18, 21, 24], "max_relative_posit": [1, 16, 21, 23], "20": [1, 2, 5, 6, 7, 10, 12, 19, 21, 22], "fast": [1, 2, 4, 7, 9], "ctranslate2": [1, 2, 4, 14], "basic": [1, 5, 6], "stem": 1, "origin": [1, 2, 4, 12, 14, 21], "even": [1, 23], "sinusoidalinterleav": [1, 16, 21], "sinusoidalconcat": [1, 21], "position_encoding_typ": [1, 16, 21], "forget": 1, "mode": [1, 4, 6, 9, 19, 21, 22, 23], "shaw": 1, "http": [1, 3, 4, 5, 7, 9, 10, 11, 13, 21, 22, 23, 24], "org": [1, 7, 13, 21, 22, 24], "ab": [1, 21, 22, 24], "1803": [1, 21], "02155": [1, 21], "n": [1, 2, 4, 5, 12, 16, 18, 19, 21, 22, 23, 24], "16": [1, 3, 5, 12, 18, 21, 23, 24], "32": [1, 3, 21, 23], "see": [1, 5, 6, 10, 12, 14, 16, 17, 18, 19, 21, 23, 24], "rope": 1, "2104": [1, 21], "09864": 1, "mpt": [1, 23], "7b": [1, 8, 21], "2108": 1, "12409": 1, "both": [1, 9, 14, 18, 21, 23], "case": [1, 2, 3, 7, 14, 16, 19, 21, 22], "nutshel": 1, "time": [1, 9, 11, 12, 14, 18, 19, 21, 22], "write": [1, 4, 12, 14], "manag": [1, 12, 14], "wherea": [1, 18, 21], "self": [1, 16, 17, 18, 21, 22], "sure": [1, 4, 5, 9, 12, 18], "export": [1, 9, 12], "cuda_visible_devic": [1, 9, 12], "want": [1, 2, 9, 12, 22, 23], "id": [1, 9, 17, 18, 19, 21], "o": [1, 5, 9, 10, 11, 12], "node": [1, 3, 9, 14, 16, 21], "warn": [1, 9, 10, 11, 12, 15, 19, 21, 22, 23], "distribut": [1, 8, 14, 16, 18, 19], "ha": [1, 2, 5, 12, 18, 21, 23], "been": [1, 4, 12, 18, 23], "properli": [1, 4, 14], "re": [1, 7, 9, 12, 18], "implement": [1, 2, 3, 5, 9, 12, 14, 16, 21, 23], "sinc": [1, 5, 9, 12, 16], "master_ip": [1, 9, 21, 22], "master_port": [1, 9, 21, 22], "second": [1, 3, 9, 11, 14, 16, 17, 23], "accumul": [1, 5, 9, 14, 21], "network": [1, 5, 8, 9, 11, 12, 16, 24], "card": [1, 4, 9], "gbp": [1, 9], "suggest": [1, 9, 21], "higher": [1, 9, 12, 18, 22], "minim": [1, 9], "inter": [1, 9], "legaci": [1, 2, 4, 5, 10, 11, 12, 23], "sever": [1, 9, 16, 18], 
"couldn": 1, "them": [1, 3, 4, 9, 12, 16, 21], "exclus": [1, 9], "nvidia": [1, 4, 7, 9, 21], "smi": [1, 9], "produc": [1, 2, 3, 9, 11, 18, 19, 21, 22], "consum": [1, 9], "n_gpu": [1, 9, 14], "process": [1, 2, 3, 5, 9, 12, 14, 17, 19, 21, 22, 23], "spawn": [1, 2, 9], "host": [1, 9], "queue": [1, 9, 19], "next": [1, 9, 12, 14, 18, 22], "benefici": [1, 9], "wall": [1, 9], "memori": [1, 9, 12, 14, 17], "shard": [1, 9, 12, 21], "advanc": [1, 3, 9, 13, 16, 18, 21, 23], "codebas": [1, 2], "becaus": [1, 3, 9, 16], "move": [1, 14, 17, 21], "devic": [1, 14, 15, 18, 22], "later": 1, "henc": [1, 3, 6, 10], "step": [1, 3, 5, 7, 8, 12, 13, 14, 16, 18, 21, 22], "onmt_train": [1, 6, 9, 10, 11, 12, 23], "execut": [1, 19, 21], "mkdir": [1, 9], "glove_dir": [1, 9], "wget": [1, 9, 10, 11, 23], "nlp": [1, 3, 9], "stanford": [1, 9], "edu": [1, 9, 10, 11], "6b": [1, 9], "zip": [1, 9, 12], "unzip": [1, 9, 12], "d": [1, 3, 5, 6, 9, 10, 12, 16, 20, 24], "adapt": [1, 5, 11], "your_config": 1, "config": [1, 3, 4, 5, 6, 7, 17, 19, 21, 22], "decod": [1, 3, 4, 5, 6, 8, 9, 11, 12, 14, 19, 23], "side": [1, 4, 5, 14, 17, 19, 21, 22], "both_embed": [1, 21], "100d": [1, 9], "src": [1, 3, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 22, 23], "tgt": [1, 3, 5, 6, 9, 10, 11, 14, 15, 16, 17, 19, 21, 22, 23], "separ": [1, 2, 3, 9, 12, 16, 22, 23], "src_embed": [1, 21], "tgt_embed": [1, 21], "word2vec": [1, 9, 21], "embeddings_typ": [1, 21], "dimens": [1, 3, 5, 16, 18, 21], "100": [1, 3, 5, 9, 11, 18, 23], "save": [1, 4, 6, 12, 14, 19, 21, 22, 23], "save_data": [1, 3, 5, 9, 10, 11, 12, 19, 21, 23], "enc_embed": 1, "dec_embed": 1, "freeze_word_vecs_enc": [1, 21], "freeze_word_vecs_dec": [1, 21], "freez": [1, 16, 21], "specifi": [1, 6, 9, 12, 14, 16, 19, 21, 22, 23], "onmt_transl": [1, 5, 6, 10, 11, 12, 23], "command": [1, 4, 5, 9, 10, 12, 23], "model1_seed1": 1, "model2_seed2": 1, "bear": [1, 9], "mind": [1, 9], "must": [1, 3, 4, 6, 9, 14, 16, 17, 19, 21, 22], "share": [1, 5, 6, 9, 12, 16, 19, 21, 22], "natur": [1, 5], "introduc": [1, 5, 9], "own": 1, "ll": [1, 9, 12], "sequenti": [1, 15], "take": [1, 7, 9, 10, 13, 14, 16, 19, 21, 22], "corpu": [1, 5, 6, 9, 15, 19, 21, 22, 23], "worri": 1, "homogen": 1, "heterogen": 1, "bucket": [1, 15, 16, 21], "mechan": [1, 3, 5, 14, 16, 23], "reason": 1, "sort": [1, 15, 17], "yield": [1, 15], "random": [1, 8, 12, 19, 21], "7": [1, 3, 4, 9, 10, 16, 21], "corpus_1": [1, 6, 23], "path_src": [1, 3, 5, 6, 23], "toi": [1, 3, 10, 11, 23], "end": [1, 3, 14, 18, 19, 21, 22, 23], "train1": 1, "path_tgt": [1, 3, 5, 23], "corpus_2": 1, "valid": [1, 3, 5, 6, 9, 12, 14, 15, 19, 21, 22, 23], "val": [1, 3, 5, 10, 11, 12, 23], "seq2seq": [1, 18, 21], "wa": [1, 5, 9], "bo": [1, 18, 19, 21], "pad": [1, 14, 16, 18, 19, 21], "eo": [1, 18, 19, 21], "behavior": [1, 12], "group": [1, 3, 14, 21, 22, 23], "specila": 1, "narg": 1, "defaulttoken": [1, 14], "srctok1": 1, "srctok2": 1, "srctok3": 1, "srctokn": 1, "tgttok1": 1, "tgttok2": 1, "tgttokm": 1, "But": 1, "said": 1, "eg": [1, 21], "nllb": [1, 21], "200": [1, 9, 19, 21, 22], "llama": [1, 8, 21, 23], "exist": [1, 2, 5, 19, 21, 22, 23], "ex": [1, 4, 19, 21, 22], "fact": 1, "never": [1, 18], "At": [1, 21], "There": [1, 3, 9], "conflict": 1, "forc": [1, 18, 22], "0x00": 1, "pyonmttok": [1, 8, 19, 21, 22], "nbest": 1, "alpha": [1, 5, 18, 22], "src_subword_typ": [1, 6, 19, 21, 22, 23], "src_subword_model": [1, 6, 19, 21, 22, 23], "spm": 1, "tgt_subword_typ": [1, 19, 21, 22, 23], "tgt_subword_model": [1, 19, 21, 22, 23], "candid": [1, 19, 21, 22], "subword_nbest": 
1, "subword_alpha": 1, "src_onmttok_kwarg": [1, 6, 19, 21, 22, 23], "none": [1, 7, 14, 15, 16, 17, 18, 19, 21, 22, 23], "spacer_annot": 1, "tgt_onmttok_kwarg": [1, 19, 21, 22, 23], "onmt_token": [1, 6, 19, 21, 22, 23], "other": [1, 2, 4, 12, 14, 18, 19, 21, 22, 23, 24], "method": [1, 2, 4, 5, 6, 14, 16, 21], "dedic": [1, 15], "detail": [1, 6, 10, 13, 19, 21], "lucki": 1, "dai": [1, 24], "alreadi": [1, 6], "easili": 1, "everi": [1, 5, 9, 14, 15, 16, 21, 22], "found": [1, 5, 6], "filtertoolong": [1, 3, 5, 19, 21, 22, 23], "misc": 1, "filtertoolongtransform": 1, "src_seq_length": [1, 3, 5, 12, 19, 21, 22, 23], "maximum": [1, 5, 19, 21, 22], "sequenc": [1, 3, 5, 6, 12, 14, 16, 17, 18, 19, 21, 22, 24], "tgt_seq_length": [1, 3, 5, 10, 12, 19, 21, 22, 23], "prefixtransform": 1, "src_prefix": [1, 19, 21, 22], "tgt_prefix": [1, 19, 21, 22], "__some_src_prefix__": 1, "__some_tgt_prefix__": 1, "uniqu": 1, "oppos": 1, "come": 1, "given": [1, 2, 3, 9, 10, 11, 17], "spa_latn": 1, "tgt_file_prefix": [1, 18, 22], "suffixtransform": 1, "src_suffix": [1, 19, 21, 22], "tgt_suffix": [1, 19, 21, 22], "__some_src_suffix__": 1, "__some_tgt_suffix__": 1, "uppercasetransform": 1, "present": [1, 5], "cap": [1, 12], "string": [1, 12, 16, 19, 21, 22], "strip": [1, 5, 12], "diacrit": 1, "accent": 1, "usual": [1, 6, 23], "desir": 1, "although": [1, 12, 16], "ratio": [1, 7, 12, 18, 19, 21, 22], "upper_corpus_ratio": [1, 19, 21, 22], "01": [1, 11, 19, 21, 22], "normalizetransform": 1, "rule": [1, 3, 21], "mose": 1, "src_lang": [1, 19, 21, 22], "cz": 1, "fr": 1, "tgt_lang": [1, 19, 21, 22], "penn": [1, 19, 21, 22], "substitut": [1, 19, 21, 22], "norm_quote_comma": [1, 19, 21, 22], "quotat": [1, 19, 21, 22], "comma": [1, 19, 21, 22], "norm_numb": [1, 19, 21, 22], "pre_replace_unicode_punct": [1, 19, 21, 22], "unicod": [1, 19, 21, 22], "punct": [1, 19, 21, 22], "post_remove_control_char": [1, 19, 21, 22], "remov": [1, 2, 5, 6, 12, 19, 21, 22], "control": [1, 14, 19, 21, 22], "char": [1, 19, 21, 22], "cleantransform": 1, "src_eq_tgt": [1, 19, 21, 22], "same_char": [1, 19, 21, 22], "repeat": [1, 5, 12, 18, 22], "same_word": [1, 19, 21, 22], "script_ok": 1, "contain": [1, 4, 6, 9, 10, 11, 16, 17, 18, 23], "belong": 1, "latin": [1, 12, 19, 21, 22], "script_nok": 1, "src_tgt_ratio": [1, 19, 21, 22], "ration": 1, "avg_tok_min": [1, 19, 21, 22], "avg_tok_max": [1, 19, 21, 22], "lang_id": 1, "detect": 1, "docifi": [1, 8], "docifytransform": 1, "concaten": [1, 21], "delimit": [1, 3, 19, 21, 22], "pre": [1, 9, 14, 17, 18], "requisit": 1, "empti": [1, 6, 15, 16, 18, 19, 21, 23], "stori": 1, "doc_length": [1, 19, 21, 22], "max": [1, 2, 3, 12, 14, 16, 18, 19, 21, 22, 23], "max_context": [1, 19, 21, 22], "ie": 1, "precaut": 1, "linearli": 1, "stride": [1, 11, 15], "fuzzymatch": [1, 8], "fuzzymatchtransform": 1, "describ": [1, 2, 4, 5, 12, 13, 17, 21], "machin": [1, 5, 13, 16, 18, 23, 24], "current": [1, 4, 5, 9, 14, 15, 16, 18, 21, 22], "tm": [1, 19, 21, 22], "should": [1, 2, 3, 5, 9, 12, 14, 15, 16, 18, 21, 23], "flat": [1, 19, 21, 22], "text": [1, 4, 8, 15, 16, 18, 19, 21, 22, 24], "intens": 1, "offer": 1, "achiev": [1, 5], "overhead": 1, "spec": 1, "mai": [1, 2, 9, 14, 15, 17, 18, 19, 21], "experi": [1, 5, 19, 21, 22], "bucket_size_init": [1, 15, 21], "bucket_size_incr": [1, 15, 21], "increas": [1, 2, 3, 12, 16, 22], "prefetch_factor": [1, 21], "wait": [1, 21, 22], "size": [1, 2, 3, 5, 11, 12, 14, 15, 16, 18, 19, 21, 22], "200k": 1, "250k": 1, "unit": [1, 3, 16, 23], "enough": [1, 4], "suffici": [1, 2], "short": [1, 4, 11], "bit": [1, 
22], "n_sampl": [1, 4, 5, 6, 7, 19, 21, 23], "tm_path": [1, 19, 21, 22], "path": [1, 3, 6, 9, 10, 11, 12, 14, 16, 17, 18, 19, 21, 22, 23], "fuzzy_corpus_ratio": [1, 19, 21, 22], "fuzzy_threshold": [1, 19, 21, 22], "threshold": [1, 19, 21, 22], "70": [1, 3, 12, 19, 21, 22], "tm_delimit": [1, 19, 21, 22], "fuzzy_token": [1, 19, 21, 22], "fuzzymatch_min_length": [1, 19, 21, 22], "min": [1, 19, 21, 22], "fuzzymatch_max_length": [1, 19, 21, 22], "inlinetag": [1, 8], "inlinetagstransform": 1, "placehold": [1, 14], "kind": 1, "pair": [1, 3, 5, 9, 12, 14, 15, 17, 19, 21, 22], "open": [1, 12, 13, 19, 21, 22], "isol": [1, 19, 21, 22], "standalon": 1, "tab": [1, 22], "dictionari": [1, 5, 14, 16, 18, 19, 21, 22], "term": [1, 9, 14, 16, 19, 21, 22, 23], "phrase": 1, "30k": 1, "recommend": [1, 2, 4, 7, 21], "user": [1, 2, 14, 17], "defin": [1, 3, 5, 6, 14, 15, 16, 19, 21, 22, 23], "tags_dictionary_path": [1, 19, 21, 22], "tags_corpus_ratio": [1, 19, 21, 22], "max_tag": [1, 19, 21, 22], "paired_stag": [1, 19, 21, 22], "ph": 1, "beg": 1, "paired_etag": [1, 19, 21, 22], "isolated_tag": [1, 19, 21, 22], "std": 1, "src_delimit": [1, 19, 21, 22], "terminologytransform": 1, "provid": [1, 2, 3, 4, 6, 9, 10, 11, 13, 22], "spaci": [1, 19, 21, 22], "lemmat": 1, "facil": 1, "solv": 1, "inflect": 1, "problem": [1, 3, 10, 18], "form": [1, 5], "correctli": 1, "src_term_start": [1, 19, 21, 22], "tgt_term_start": [1, 19, 21, 22], "target_lemma_for_aug": 1, "tgt_term_end": [1, 19, 21, 22], "termbase_path": [1, 19, 21, 22], "src_spacy_language_model": [1, 19, 21, 22], "tgt_spacy_language_model": [1, 19, 21, 22], "term_corpus_ratio": [1, 19, 21, 22], "term_example_ratio": [1, 19, 21, 22], "src_term_stoken": [1, 19, 21, 22], "tgt_term_stoken": [1, 19, 21, 22], "tgt_term_etoken": [1, 19, 21, 22], "term_source_delimit": [1, 19, 21, 22], "src_subword_nbest": [1, 19, 21, 22], "tgt_subword_nbest": [1, 19, 21, 22], "src_subword_alpha": [1, 19, 21, 22], "probabl": [1, 9, 16, 18, 19, 21, 22], "tgt_subword_alpha": [1, 19, 21, 22], "onmttokenizertransform": 1, "kwarg": [1, 16], "sentencepiecetransform": 1, "bpetransform": 1, "compos": 1, "part": [1, 3, 5, 6, 12, 18], "denois": 1, "comprehens": 1, "These": [1, 2, 3, 5, 12, 16, 18], "permute_sent_ratio": [1, 19, 21, 22], "proport": [1, 19, 21, 22], "permut": [1, 19, 21, 22], "boundari": [1, 5, 19, 21, 22], "rotate_ratio": [1, 19, 21, 22], "input": [1, 2, 3, 4, 5, 6, 12, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24], "insert_ratio": [1, 19, 21, 22], "insert": [1, 19, 21, 22, 23], "random_ratio": [1, 19, 21, 22], "mask_ratio": [1, 19, 21, 22], "mask_length": [1, 19, 21, 22], "window": [1, 11, 14, 19, 21, 22], "span": [1, 19, 21, 22], "poisson": [1, 19, 21, 22], "poisson_lambda": [1, 19, 21, 22], "lambda": [1, 12, 19, 21, 22], "valu": [1, 2, 3, 4, 5, 9, 14, 16, 17, 18, 19, 21, 22, 23], "replace_length": [1, 19, 21, 22], "switchouttransform": 1, "switchout_temperatur": [1, 19, 21, 22], "temperatur": [1, 18, 19, 21, 22], "tokendrop": [1, 19, 21, 22], "tokendroptransform": 1, "tokendrop_temperatur": [1, 19, 21, 22], "delet": [1, 12, 19, 21, 22, 23], "tokenmask": [1, 19, 21, 22], "tokenmasktransform": 1, "tokenmask_temperatur": [1, 19, 21, 22], "inherit": 1, "instanc": [1, 4, 14, 16, 18], "templat": 1, "register_transform": 1, "out": [1, 2, 3, 5, 6, 12, 14, 16, 23], "too": [1, 3, 12, 18, 21], "classmethod": [1, 14, 15, 16], "add_opt": 1, "cl": [1, 14], "parser": [1, 12], "avalil": 1, "relat": [1, 19, 21, 22, 23], "add_argument_group": 1, "int": [1, 3, 12, 14, 15, 16, 17, 18], "_parse_opt": 1, 
"is_train": 1, "stat": [1, 12, 14, 21, 22], "els": [1, 12], "len": [1, 12, 14, 16, 18], "filtertoolongstat": 1, "_repr_arg": 1, "repres": [1, 3, 23], "would": [1, 2, 18, 21], "pars": [1, 15, 16, 17], "happen": [1, 18, 19, 21], "log": [1, 4, 8, 14, 18], "wrapper": [1, 5, 14], "definit": [1, 3, 16], "automat": [1, 3, 5], "proper": [1, 17], "usabl": 1, "through": [1, 2, 3, 14], "could": [1, 9, 12, 18], "collect": [1, 12, 15], "statist": [1, 14, 21], "observablestat": 1, "rune": 1, "__slots__": 1, "__init__": [1, 17], "element": [1, 4, 15], "keep": [1, 12, 14, 17, 18, 21], "track": 1, "__slot__": 1, "lightweight": 1, "suppli": 1, "logic": 1, "overrid": [1, 16, 18, 19, 21], "__str__": 1, "messag": 1, "instanti": [1, 14], "pass": [1, 9, 14, 16, 17, 21], "correspond": [1, 15, 22], "gather": [1, 14], "report": [1, 12, 13, 14, 22], "dict": [1, 14, 15, 16, 17, 19, 21, 22], "pharaoh": [1, 9], "inputt": [1, 15], "parallelcorpu": [1, 15], "consid": [1, 5, 12, 16], "futur": [1, 9], "customparallelcorpu": 1, "cf": [1, 2, 14, 16, 21], "bigger": 1, "limit": [1, 22], "anmount": 1, "vram": 1, "principl": [1, 9], "layer": [1, 3, 4, 5, 9, 12, 14, 16, 21, 22, 23], "trainabl": [1, 14], "reduc": [1, 2, 9, 14], "amount": [1, 5, 12, 15, 21], "especi": 1, "3b": [1, 21], "lora_lay": [1, 21, 23], "linear_valu": [1, 21, 23], "linear_queri": [1, 21, 23], "two": [1, 4, 5, 9, 12, 16, 21], "lora_rank": [1, 21, 23], "lora_dropout": [1, 21, 23], "lora_alpha": [1, 21, 23], "lora_embed": [1, 21, 23], "compat": [1, 2, 12], "update_vocab": [1, 21], "bitsandbyt": 1, "enabl": [1, 4, 5, 6, 9, 14, 16, 21, 22, 23], "quantiz": [1, 4, 21, 22, 23], "linear": [1, 2, 3, 12, 16, 21, 23], "inform": [1, 3, 16, 21, 22], "com": [1, 3, 4, 5, 7, 11, 13, 16, 21, 23], "timdettm": 1, "blog": 1, "post": [1, 16], "huggingfac": 1, "co": 1, "hf": 1, "quant_lay": [1, 21, 22, 23], "w_1": [1, 23], "w_2": [1, 23], "quant_typ": [1, 21, 22, 23], "bnb_nf4": [1, 21, 22, 23], "instan": 1, "positionwis": 1, "feed": [1, 4, 16, 21], "forward": [1, 4, 14, 16, 21], "queri": [1, 16, 21, 23], "final": [1, 5, 16, 18, 23], "bnb": 1, "fp4": 1, "nf4": 1, "use_ckpt": [1, 16, 21, 23], "ffn": [1, 16, 21, 23], "mha": [1, 21], "careful": [1, 23], "report_align": [1, 9, 18, 22], "call": [1, 5, 9, 12, 16, 18], "output": [1, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 19, 21, 22, 23], "argmax": [1, 9, 12, 22], "last": [1, 9, 14, 16, 21], "behaviour": [1, 9], "empir": [1, 5, 9], "determin": [1, 9], "thei": [1, 4, 9, 12, 14, 16, 18], "penultim": [1, 9], "slight": [1, 4, 6], "architectur": [1, 4, 9, 12, 21], "j": [1, 5, 9, 16, 21, 24], "indic": [1, 3, 9, 12, 14, 16, 18, 19, 21, 22], "th": [1, 9], "da": [1, 9], "stimmt": [1, 9], "nicht": [1, 9], "gold_align": [1, 18, 22], "between": [1, 2, 3, 5, 6, 9, 16, 19, 21, 22, 24], "gold": [1, 9, 18, 22], "assum": [1, 3, 5, 9, 10, 11, 12, 18], "evalu": [1, 8, 9, 12, 14, 23], "symetr": [1, 9], "bidirect": [1, 5, 9, 16, 21], "lilt": [1, 9], "qualiti": [1, 4, 9], "further": [1, 9, 19, 21], "improv": [1, 4, 9, 12, 14, 16, 18, 21, 24], "invok": [1, 9], "task": [1, 4, 6, 8, 9, 14, 15], "jointli": [1, 5, 9, 14, 16, 24], "preprocess": [1, 2, 5, 10, 11, 12, 17, 19, 21, 22], "giza": [1, 9], "path_align": 1, "incompat": 1, "joint": [1, 9], "pipelin": [1, 2, 19, 21, 22, 23], "modif": [1, 4, 14], "level": [1, 15, 19, 21, 22], "made": [1, 23], "invalid": [1, 19, 21, 22], "lambda_align": [1, 9, 14, 21], "05": [1, 5, 9, 16, 23], "alignment_lay": [1, 9, 16, 21], "index": [1, 9, 12, 16], "alignment_head": [1, 9, 16, 21], "kept": [1, 9], "num_head": [1, 9], 
"full_context_align": [1, 9, 16, 21], "slow": [1, 2, 3, 9, 19, 22], "down": [1, 2, 9, 18, 19, 23], "tok": [1, 7, 9, 17], "map": [1, 4, 12, 14, 16, 19, 21, 22], "onmt_build_vocab": [1, 3, 5, 6, 23], "reset_optim": [1, 21], "state": [1, 3, 5, 14, 16, 18, 21], "train_from": [1, 21, 23], "incorpor": [1, 21], "append": [1, 12, 19, 21, 22], "actual": [1, 12, 18, 23], "textual": 1, "l": [1, 10, 12], "she": 1, "hard": 1, "prior": [1, 19, 21], "featinfertransform": 1, "instac": 1, "n_src_feat": [1, 15, 19, 21, 22], "expect": [1, 6, 12, 14, 18, 23], "src_feats_default": [1, 15, 19, 21, 22], "realli": 1, "mix": [1, 7, 14, 15], "annot": [1, 3, 12, 19, 21, 22], "appropri": [1, 3, 4, 18], "src_word_vec_s": [1, 3, 16, 21], "tgt_word_vec_s": [1, 3, 21], "feat_merg": [1, 16, 21], "vec": [1, 12], "feat_vec_s": [1, 12, 16, 21], "mayb": 1, "feat_vec_expon": [1, 16, 21], "ensur": [1, 12], "possibl": [1, 2, 17, 18, 19, 21, 22, 23], "concat": [1, 4, 16, 21], "dummi": 1, "inferfeat": [1, 8], "reversible_token": [1, 19, 21, 22], "joiner": [1, 19, 21, 22], "src_vocab": [1, 3, 5, 16, 19, 21, 23], "exp": [1, 7, 21], "tgt_vocab": [1, 3, 5, 19, 21, 23], "sum": [1, 14, 16, 18, 21], "rest": [1, 20], "serv": 1, "discuss": 1, "forum": [1, 13], "idea": [1, 9], "behind": 1, "point": [1, 5, 24], "receiv": [1, 2], "detoken": [1, 17], "available_model": [1, 20], "conf": [1, 20], "json": [1, 4, 20], "along": [1, 12, 14], "models_root": 1, "manual": [1, 17, 18], "assign": [1, 22], "counter": 1, "ass": 1, "timeout": [1, 17, 21, 22], "interv": [1, 21], "unload": [1, 17], "reset": 1, "whether": [1, 14, 16, 17, 18, 21], "on_timeout": [1, 17], "everyth": 1, "to_cpu": [1, 17], "transfer": [1, 21], "ram": [1, 12], "faster": [1, 16], "reload": 1, "translate_opt": 1, "bool": [1, 3, 14, 15, 16, 17, 18], "ct2_translator_arg": [1, 17], "ct2_translate_batch_arg": [1, 17], "engin": 1, "appear": 1, "simultan": 1, "ct2_": 1, "_arg": 1, "ident": 1, "ct2_model": [1, 17], "model_0": 1, "600": 1, "beam_siz": [1, 3, 5, 6, 7, 10, 18, 22, 23], "wmtenfr": 1, "light": [1, 9], "model_root": [1, 17], "other_model": 1, "10": [1, 5, 6, 7, 12, 13, 21, 22, 23, 24], "merg": [1, 4, 16, 21], "master": [1, 21, 22, 23], "branch": [1, 2], "cp": 1, "path_to_my_model": 1, "ip": [1, 20, 21, 22], "port": [1, 20, 21, 22], "5000": [1, 3, 4, 14, 16, 19, 20, 21, 23], "url_root": [1, 20], "optionn": 1, "explicit": 1, "librari": [1, 21], "configargpars": 1, "cor": 1, "waitress": 1, "dockerfil": 1, "cuda10": 1, "cudnn7": 1, "runtim": 1, "workdir": 1, "usr": [1, 5], "app": 1, "cach": [1, 4, 7, 16], "dir": [1, 4, 7, 16], "r": [1, 3, 5, 10, 12, 13, 16, 21, 24], "volum": 1, "cmd": 1, "imag": [1, 2, 8, 14], "opennmt_serv": 1, "rm": [1, 4, 12, 16, 21, 23], "p": [1, 5, 6, 10, 16, 18, 22, 24], "fex": 1, "rout": 1, "bin": [1, 4, 5, 7, 23], "127": 1, "curl": 1, "wmt14": 1, "de_acc_69": 1, "22_ppl_4": 1, "33_e9": 1, "involv": 1, "h": [1, 5, 9, 16, 19, 20, 21, 22], "applic": [1, 4], "model_id": [1, 17], "u2581di": 1, "u2581formen": 1, "kant": 1, "u2581": 1, "k": [1, 2, 5, 6, 18, 22], "u00f6r": 1, "ner": 1, "u2581d": 1, "u2581stahl": 1, "u": [1, 2, 4, 5, 6], "u00df": 1, "statu": 1, "ok": [1, 3], "total": [1, 14, 21, 22], "510261535644531": 1, "509992599487305": 1, "writing_src": 1, "0002689361572265625": 1, "v3": [2, 8], "releas": [2, 5], "doe": [2, 3, 4, 5, 8, 9, 12, 22], "anymor": 2, "checkpoint": [2, 8, 12, 14, 16, 21, 23], "slightli": [2, 4, 9, 12], "convert": [2, 4, 9, 12, 15, 17], "v2": [2, 8], "model": [2, 3, 4, 7, 8, 10, 11, 12, 13, 16, 18, 19], "dynam": [2, 6, 15, 16, 21, 22], 
"paradigm": 2, "appli": [2, 4, 5, 6, 8, 11, 14, 15, 16, 17, 18, 19, 21, 22], "fly": [2, 4, 6, 8], "transform": [2, 3, 4, 5, 6, 7, 8, 11, 13, 14, 15, 16, 23, 24], "data": [2, 8, 9, 10, 11, 12, 14, 24], "advantag": 2, "amongst": 2, "drastic": [2, 5], "train": [2, 3, 4, 8, 10, 11, 12, 13, 14, 15, 16, 22], "augment": [2, 19, 21, 22, 24], "manipul": [2, 14], "can": [2, 3, 4, 5, 6, 8, 10, 12, 13, 15, 17, 18, 19, 21, 22, 23], "specif": [2, 4, 5, 8, 13, 14, 15, 18, 19, 21, 23], "token": [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 22, 23], "filter": [2, 3, 5, 8, 12, 23], "nois": [2, 8], "custom": [2, 4, 8, 14, 17, 21], "quit": [2, 23], "straightforward": 2, "thank": 2, "load": [2, 6, 8, 9, 12, 14, 15, 16, 17, 21], "updat": [2, 8, 9, 14, 17, 18, 21], "readili": [2, 8], "avail": [2, 6, 8, 12, 14, 17, 21, 22], "queue_siz": 2, "pool_factor": 2, "adjust": [2, 3], "dataload": [2, 21], "gpu": [2, 3, 5, 7, 8, 10, 11, 12, 14, 16, 17, 18, 21, 22, 23], "1": [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 21, 22, 24], "2": [2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 16, 19, 21, 22], "set": [2, 3, 4, 5, 6, 7, 8, 9, 12, 14, 16, 17, 18, 19, 21, 22, 23], "bia": [2, 14, 16, 21], "q": [2, 16], "v": [2, 4, 7, 8, 16, 24], "nn": [2, 14, 16, 21], "multihead": 2, "renam": [2, 12], "convertv2_v3": 2, "store": [2, 6, 10, 11], "infer": [2, 6, 8, 10, 15, 18, 22], "translat": [2, 3, 4, 5, 8, 10, 11, 12, 13, 14, 16, 17, 20, 24], "iter": [2, 6, 8, 14, 21], "trainer": [2, 8, 23], "length_penalti": [2, 5, 18, 22], "avg": [2, 16, 22], "bleu": [2, 7, 12], "score": [2, 7, 8, 12, 14, 16, 17, 22, 23], "compar": [2, 12], "toolkit": [2, 13], "featur": [2, 3, 4, 8, 11, 12, 13, 14, 16, 17, 24], "drop": 2, "v1": [2, 13], "audio": [2, 11], "video": [2, 8], "previou": [2, 18, 23], "retain": 2, "extens": 2, "from": [2, 3, 4, 5, 6, 8, 10, 11, 12, 13, 14, 15, 16, 18, 21, 22], "core": [2, 8, 14], "team": 2, "let": [2, 4, 5], "know": [2, 12], "what": [2, 8, 9, 10, 12, 14, 17], "think": 2, "cpu": [2, 7, 14, 15, 17, 22], "resourc": [2, 8], "power": 2, "ideal": 2, "2n": 2, "thread": [2, 19], "averag": [2, 3, 8, 14, 16, 19, 21, 22, 24], "rel": [2, 8, 10, 11, 16, 21], "posit": [2, 4, 8, 9, 14, 16, 21], "encod": [2, 3, 5, 6, 8, 9, 11, 12, 14, 18, 23], "represent": [3, 11, 16, 21, 24], "ast": 3, "connect": [3, 12, 16], "benefit": 3, "ggnn": [3, 16, 21], "jameschuanggg": [3, 16], "git": [3, 4, 5, 7, 11, 12, 13, 16], "y": [3, 4, 5, 11, 12, 16], "li": [3, 16], "tarlow": [3, 16], "m": [3, 4, 5, 12, 13, 16, 20, 24], "brockschmidt": [3, 16], "zemel": [3, 16], "program": 3, "equival": [3, 5, 21], "proof": 3, "dataflow": 3, "via": [3, 14, 16, 21, 24], "rewrit": 3, "That": [3, 12], "show": [3, 4, 5, 12], "graphic": 3, "beyond": [3, 14], "rnn": [3, 5, 14, 16, 21], "sequence2sequ": 3, "get": [3, 4, 5, 10, 11, 12, 23, 24], "directori": [3, 4, 10, 11, 12, 17, 21], "throughout": [3, 10, 11], "download": [3, 8, 10, 11, 12, 23], "sibl": 3, "clone": [3, 4, 5, 7, 12, 13, 17], "stevekommrusch": 3, "env": 3, "configur": [3, 4, 5, 6, 8, 16, 23], "written": [3, 5, 14, 23], "1000": [3, 12, 23], "30": [3, 14, 18, 21, 22], "cnndm": [3, 5], "weight": [3, 5, 8, 14, 15, 16, 21, 22, 23], "srcvocab": 3, "tgtvocab": 3, "start_decay_step": [3, 21, 23], "learning_rate_decai": [3, 11, 21, 23], "global_attent": [3, 5, 11, 21], "src_ggnn_size": [3, 16, 21], "larger": [3, 23], "plu": [3, 21], "hot": 3, "less": [3, 7, 12], "learn": [3, 5, 6, 8, 10, 11, 14, 16, 19, 21, 24], "state_dim": [3, 16, 21], "togeth": [3, 23], "embed": [3, 4, 5, 8, 12, 19], "64": [3, 9, 12, 
21], "bridg": [3, 5, 21, 24], "n_edge_typ": [3, 16, 21], "9": [3, 4, 5, 6, 7, 10, 14, 21], "n_step": [3, 16, 21], "aggreg": 3, "hop": 3, "n_node": [3, 16, 21], "algebra": 3, "express": 3, "axiom": 3, "prove": 3, "model_step_10000": 3, "n_best": [3, 6, 17, 18, 22, 23], "pred": [3, 7, 10, 11, 12, 21, 22], "test_beam5": 3, "translate5": 3, "leverag": 3, "interfac": [3, 14, 16], "much": [3, 12, 21], "nearal": 3, "edg": [3, 16, 21], "eot": 3, "equat": 3, "being": [3, 4], "extra": [3, 9, 13, 16, 21], "rang": [3, 14, 22], "9th": 3, "just": [3, 5, 12, 23], "non": [3, 16, 18, 21], "mathemat": 3, "matric": 3, "identifi": [3, 22], "certain": 3, "occur": [3, 4], "shown": 3, "remain": 3, "numer": [3, 14], "integ": [3, 18], "duplic": 3, "vector": [3, 12, 14, 16, 21], "up": [3, 5, 8, 12, 16, 22], "largest": 3, "creat": [3, 4, 8, 9, 12, 14, 23], "lower": [3, 12, 16, 21], "bet": 3, "rnn_type": [3, 11, 16, 21], "recurr": [3, 8, 16], "lstm": [3, 5, 10, 11, 16, 21, 23], "bidir_edg": [3, 16, 21], "revers": [3, 16, 19, 21, 22], "bridge_extra_nod": [3, 16, 21], "1st": [3, 16], "stabil": [3, 14, 16], "foundat": 4, "instruct": [4, 5, 21, 23], "lora": [4, 8, 21, 23], "8bit": [4, 8, 21, 22, 23], "compress": [4, 21, 22], "wise": 4, "normalis": 4, "rotari": [4, 8, 16, 21], "swiglu": 4, "activ": [4, 16, 21, 23], "maxim": [4, 24], "context": [4, 9, 16, 19, 21, 22], "length": [4, 5, 12, 14, 16, 18, 19, 21, 22, 23], "repositori": [4, 5, 12], "replicate_vicuna": 4, "subdirectori": 4, "chekpoint": [4, 23], "llama7b": [4, 23], "genener": 4, "dataai": [4, 23], "sampl": [4, 5, 6, 8, 10, 11, 18, 19, 21, 23], "tensorboard": [4, 6, 14, 21], "translate_opts_pi": 4, "translate_opts_ct2": 4, "cranslate2": 4, "input_exampl": 4, "simple_infer": 4, "predict": [4, 12, 18, 21, 22, 23], "chatbot": 4, "gradio": 4, "apex": [4, 14, 21], "highli": [4, 7], "perform": [4, 7, 8, 12, 14, 16, 22], "pip3": [4, 7], "disabl": [4, 7, 12], "global": [4, 7, 12, 14, 16], "cpp_ext": [4, 7, 14], "cuda_ext": [4, 7, 14], "deprecated_fused_adam": [4, 7], "xentropi": [4, 7], "fast_multihead_attn": [4, 7], "environ": [4, 12], "procedur": [4, 5], "retriev": 4, "sentencepiec": [4, 19, 21, 22, 23], "offici": 4, "facebookresearch": 4, "convert_llama": 4, "python3": [4, 7], "model_dir": [4, 23], "tokenizer_model": [4, 23], "subword": [4, 8, 9, 23], "extract": [4, 12], "newli": 4, "extract_vocabulari": 4, "out_fil": [4, 18], "alpaca": [4, 23], "give": [4, 12, 21, 22], "three": [4, 16], "tip": [4, 8], "stai": 4, "healthi": 4, "eat": 4, "balanc": 4, "nutriti": 4, "diet": 4, "meal": 4, "inclus": 4, "varieti": [4, 10], "fruit": 4, "veget": 4, "lean": 4, "protein": 4, "whole": [4, 18], "grain": 4, "fat": 4, "bodi": 4, "essenti": [4, 18], "nutrient": 4, "best": [4, 12, 18, 22, 23], "prevent": [4, 5, 18, 22, 23], "chronic": 4, "diseas": 4, "n2": 4, "engag": 4, "regular": [4, 8, 9, 14, 19, 21, 22, 23], "physic": 4, "exercis": 4, "crucial": 4, "maintain": [4, 18], "strong": 4, "bone": 4, "muscl": 4, "cardiovascular": 4, "health": 4, "aim": 4, "150": [4, 10], "minut": 4, "moder": 4, "aerob": [4, 23], "75": [4, 5, 23], "vigor": 4, "week": 4, "n3": 4, "sleep": 4, "mental": 4, "regul": 4, "mood": 4, "cognit": 4, "growth": 4, "immun": 4, "hour": [4, 7], "night": 4, "flatten": [4, 12], "plain": [4, 15], "moreov": 4, "symbol": [4, 6], "act": 4, "break": [4, 8, 12], "world": [4, 12], "newlin": [4, 12, 19, 21, 22, 23], "51751": 4, "28800": 4, "prompt": [4, 12, 14, 19, 21, 22, 23], "instrunct": 4, "pattern": 4, "propos": [4, 23], "answer": [4, 23], "respons": [4, 14, 19, 21, 22, 23], 
"request": 4, "overriden": 4, "launch": [4, 6], "nohup": 4, "replicate_": 4, "finetenun": 4, "start": [4, 7, 14, 15, 17, 18, 19, 21, 22, 23], "turn": [4, 12, 16, 21], "dump_sampl": [4, 19], "strictli": 4, "worth": 4, "he": [4, 23], "substr": 4, "0x0a": 4, "embd": 4, "scratch": [4, 8], "ately": 4, "compl": 4, "te": 4, "inst": 4, "ruction": 4, "iv": [4, 8], "ing": 4, "bal": 4, "anc": 4, "nut": 4, "rit": 4, "iou": 4, "di": 4, "et": [4, 5, 21], "me": 4, "al": [4, 5, 21], "inclu": 4, "ruit": 4, "abl": 4, "gra": 4, "ins": 4, "ats": 4, "ri": 4, "ent": 4, "chron": 4, "ic": 4, "dise": 4, "ases": 4, "eng": 4, "ag": 4, "erc": 4, "ise": 4, "cru": 4, "cial": 4, "ones": 4, "mu": 4, "cle": 4, "ov": 4, "asc": 4, "ular": 4, "im": 4, "ate": 4, "aer": 4, "ob": 4, "vig": 4, "orou": 4, "reg": 4, "ulat": 4, "ood": 4, "cogn": 4, "itiv": 4, "imm": 4, "un": 4, "lora_weight": 4, "action": [4, 16, 18, 21], "base_model": 4, "finetuned_llama7b": 4, "onmt_step_4000": 4, "release_model": 4, "concat_ct2": 4, "int8_float16": 4, "inference_config_fil": 4, "inference_mod": 4, "max_context_length": 4, "server_port": 4, "Or": 4, "ct2": 4, "paramat": 4, "obtain": 4, "input_fil": 4, "output_dir": 4, "bottom": 5, "abstract": [5, 16], "inproceed": [5, 13], "gehrmann2018bottom": 5, "titl": [5, 13], "author": [5, 12, 13], "gehrmann": 5, "sebastian": 5, "deng": [5, 13], "yuntian": [5, 13], "rush": [5, 13], "alexand": [5, 13], "booktitl": [5, 13], "proceed": 5, "2018": [5, 24], "confer": 5, "page": 5, "4098": 5, "4109": 5, "year": [5, 13], "dataset": [5, 6, 8, 10, 12, 19, 21, 23], "access": 5, "split": [5, 12], "articl": 5, "australia": 5, "account": [5, 12], "deficit": 5, "shrunk": 5, "record": 5, "billion": 5, "dollar": 5, "lrb": 5, "rrb": 5, "june": 5, "quarter": 5, "due": [5, 12, 21], "soar": 5, "commod": 5, "price": 5, "figur": 5, "mondai": 5, "australian": 5, "narrow": 5, "sharpli": 5, "addition": [5, 16], "truncat": [5, 14, 21], "400": [5, 12], "target": [5, 6, 9, 14, 16, 17, 19, 21, 22, 23], "surround": 5, "tag": [5, 19, 21, 22], "w1": 5, "w2": 5, "w3": 5, "sed": [5, 7], "overwrit": [5, 19, 21, 23], "src_seq_length_trunc": [5, 21], "tgt_seq_length_trunc": [5, 21], "vocabulari": [5, 8, 9, 14, 16, 19, 21, 22, 23], "share_vocab": [5, 6, 19, 21, 23], "similar": [5, 12, 16, 21], "signific": [5, 12], "copy_attn": [5, 14, 15, 16, 18, 21], "word": [5, 8, 16, 18, 19, 21, 22], "mlp": [5, 11, 16, 21], "bahdanau": [5, 16, 21, 24], "luong": [5, 16, 21, 24], "dot": [5, 16, 21, 22, 23], "share_embed": [5, 21], "decreas": 5, "did": 5, "reuse_copy_attn": [5, 16, 21], "reus": [5, 16, 21], "copy_loss_by_seqlength": [5, 21], "divid": [5, 14, 21, 22], "practic": [5, 23], "penalti": [5, 8, 14, 18], "hidden": [5, 14, 16, 21, 23], "adagrad": [5, 21], "outperform": 5, "sgd": [5, 21], "coupl": 5, "adagrad_accumulator_init": [5, 21], "match": [5, 17, 19, 21, 22], "algorithm": [5, 14, 24], "tensorflow": [5, 21], "align": [5, 8, 14, 15, 16, 17, 18, 22, 23, 24], "previous": 5, "dynamic_dict": 5, "128": [5, 21, 22], "dimension": [5, 16], "On": [5, 14], "brnn": [5, 10, 12, 21], "256": [5, 23], "norm": [5, 14, 16, 21], "gradient": [5, 8, 9, 14, 21], "renorm": [5, 21], "exce": [5, 21], "src_vocab_s": [5, 21, 23], "50000": [5, 21], "tgt_vocab_s": [5, 21, 23], "15": [5, 6, 18, 24], "seed": [5, 12, 18, 19, 21, 22, 23], "777": 5, "model_transform": 5, "normal": [5, 8, 9, 16, 23], "beam": [5, 8, 12, 18, 23], "stepwise_penalti": [5, 18, 22], "coverage_penalti": [5, 18, 22], "beta": [5, 14, 18, 22], "coverag": [5, 14, 16, 18, 21, 22], "wu": [5, 22, 24], 
"block_ngram_repeat": [5, 18, 22], "trigram": 5, "ignore_when_block": [5, 18, 22], "testout": 5, "min_length": [5, 18, 22], "35": [5, 7], "verbos": [5, 6, 9, 10, 11, 12, 18, 21, 22, 23], "roug": [5, 12], "pyroug": 5, "gram": [5, 18], "typic": [5, 14, 19, 21, 23], "sub": 5, "repo": [5, 12, 23], "recurs": 5, "submodul": 5, "sebastiangehrmann": 5, "baselin": [5, 12, 21], "maco": 5, "pointer": [5, 16, 24], "perl": 5, "pl": 5, "might": 5, "simpl": [5, 14], "w": [5, 12, 16, 24], "fail": [5, 18], "sent_tag_verbatim": 5, "around": [5, 14, 16], "becom": 5, "larg": [5, 22], "parallel": [5, 9, 15, 16, 18, 19, 21, 23], "files2roug": 5, "giga": 5, "r1": 5, "r2": 5, "rl": 5, "39": 5, "43": 5, "02": [5, 11], "53": 5, "17": [5, 16, 24], "18": 5, "77": 5, "28": [5, 7], "98": [5, 21, 23], "56": 5, "36": 5, "38": 5, "37": 5, "76": 5, "60": [5, 21, 22], "44": 5, "31": 5, "66": [5, 7], "34": 5, "46": 5, "33": [5, 12], "42": [5, 23], "emb": [5, 16], "hid": 5, "40": [5, 21, 22, 23], "90": 5, "91": 5, "99": 5, "25": [5, 12, 22], "59": 5, "97": 5, "93": 5, "67": 5, "1024": [5, 23], "41": [5, 7, 12], "94": 5, "27": 5, "83": 5, "09": 5, "54": 5, "45": 5, "51": 5, "vinyal": [5, 24], "fortunato": 5, "jaitli": 5, "2015": [5, 24], "nip": 5, "liu": [5, 24], "man": [5, 24], "2017": [5, 13, 21, 24], "acl": [5, 13, 24], "cho": [5, 24], "bengio": [5, 24], "2014": [5, 24], "neural": [5, 8, 11, 13, 16, 18, 24], "iclr": [5, 24], "pham": [5, 24], "approach": [5, 10, 11, 12, 16], "emnlp": [5, 24], "preliminari": 6, "wiki_103": 6, "prepare_wikitext": 6, "103_data": 6, "wikitext103": 6, "shuffl": [6, 12, 21], "chmod": 6, "snippet": 6, "40000": 6, "won": 6, "inde": 6, "aggress": 6, "joiner_annot": 6, "preserve_placehold": 6, "case_markup": 6, "soft_case_region": 6, "preserve_segmented_token": 6, "n_symbol": 6, "tokenizer_default": 6, "learner": 6, "bpelearn": 6, "ingest_fil": 6, "raw": [6, 8, 11, 16, 22], "data_fil": 6, "tokenize_fil": 6, "explain": 6, "therefor": 6, "wikitext": 6, "observ": 6, "built": [6, 10, 11, 14, 16], "tansform": 6, "gpt": [6, 14, 16, 21], "unsupervis": 6, "multitask": 6, "block": [6, 16, 18, 22], "mention": [6, 12], "slide": [6, 21], "plai": 6, "role": 6, "model_task": [6, 21, 23], "transformer_lm": [6, 21, 23], "monitor": 6, "perplex": [6, 12, 14], "trigger": [6, 14], "head": [6, 8, 12, 16, 21, 23], "lm_input": 6, "proce": 6, "top": [6, 10, 11, 16, 18, 22], "nucleu": [6, 22], "lowest": [6, 12], "lm_step_1000000": 6, "lm_pred_input": 6, "random_sampling_topp": [6, 18, 22, 23], "torchvis": [7, 10], "torchaudio": [7, 11], "edit": 7, "english": [7, 23], "german": [7, 23], "bash": 7, "prepare_wmt_ende_data": 7, "big": [7, 8], "50k": 7, "rtx": 7, "4090": 7, "build_vocab": [7, 19], "wmt17_end": 7, "variou": [7, 21], "wmt17_en_d": 7, "bpe": [7, 8, 19, 21, 22, 23], "bigwmt17_step_50000": 7, "trg": 7, "sacrebleu": 7, "40k": 7, "45k": 7, "newstest2016": 7, "signatur": 7, "nref": 7, "eff": 7, "verbose_scor": 7, "bp": 7, "hyp_len": 7, "64244": 7, "ref_len": 7, "64379": 7, "65": 7, "000": 7, "64357": 7, "992": 7, "63885": 7, "pretrain": [8, 13, 16, 21], "llm": [8, 14, 21], "finetun": [8, 14, 21], "contributor": 8, "guidelin": 8, "absolut": 8, "alibi": 8, "glove": [8, 21], "ensembl": [8, 22], "corpora": [8, 15, 21], "special": [8, 9, 19, 21, 22], "purpos": [8, 23], "bart": 8, "switchout": [8, 24], "deal": 8, "while": [8, 12, 16], "supervis": [8, 16, 21], "server": [8, 21], "ii": 8, "docker": 8, "iii": 8, "wiki": [8, 21], "103": 8, "clean": [8, 12], "prepar": [8, 9, 14, 18], "summar": [8, 23, 24], "cnn": [8, 11, 12, 14, 
21], "dm": 8, "gate": [8, 16, 21], "graph": [8, 16, 21], "quick": 8, "acknowledg": 8, "vicuna": 8, "token_drop": 8, "token_mask": 8, "insertmaskbeforeplaceholderstransform": 8, "uppercas": 8, "onmttok": 8, "prefix": [8, 14], "suffix": 8, "terminologi": 8, "reproduc": [8, 12], "prune": 8, "quant": 8, "trick": [8, 14, 16], "effici": [8, 14, 24], "framework": [8, 21], "strategi": [8, 15], "loader": 8, "faq": [8, 23], "speech": 8, "embeddings_to_torch": 9, "ylhsieh": 9, "one2": 9, "usag": [9, 19, 20, 21, 22], "emb_file_both": 9, "emb_file_enc": 9, "emb_file_dec": 9, "output_fil": 9, "dict_fil": 9, "skip_lin": 9, "usagecomplet": 9, "info": [9, 15, 21, 22, 23], "onmt_preprocess": [9, 10, 11, 12], "train_src": [9, 10, 11, 12], "train_tgt": [9, 10, 11, 12], "valid_src": [9, 10, 11, 12], "valid_tgt": [9, 10, 11, 12], "pre_word_vecs_enc": [9, 21], "enc": [9, 21], "pre_word_vecs_dec": [9, 21], "dec": [9, 16, 21], "bunch": 9, "tmp": 9, "de2": 9, "max_generator_batch": 9, "model1": 9, "seed1": 9, "model2": 9, "seed2": 9, "train_id": 9, "from_backtransl": 9, "my_data": 9, "dump": [9, 19, 21, 22, 23], "train_a": 9, "train_b": 9, "data_id": 9, "data_weight": 9, "mani": [9, 12, 14, 18, 21, 23], "shard_siz": [9, 10, 11, 12], "train_align": 9, "valid_align": 9, "mask": [9, 14, 16, 19, 21, 22], "deep": [10, 11], "driven": 10, "caption": [10, 12], "optic": 10, "recognit": 10, "latex": [10, 11], "decompil": 10, "formula": 10, "goal": 10, "compil": 10, "frac": 10, "delta": 10, "cdot": 10, "visual": [10, 21, 23], "markup": 10, "technic": [10, 13], "conda": 10, "pillow": 10, "math": 10, "im2text": 10, "tgz": [10, 11], "sea": [10, 11], "harvard": [10, 11], "im2text_smal": 10, "tar": [10, 11, 12, 23], "zxf": [10, 11], "data_typ": [10, 11, 12, 15, 18, 21, 22], "img": 10, "src_dir": [10, 11, 12], "demo": [10, 11, 23], "tgt_words_min_frequ": [10, 21], "500": [10, 12, 21, 23], "image_channel_s": 10, "model_typ": [10, 11, 12, 21], "80": 10, "model_acc_x_ppl_x_e13": [10, 11], "max_length": [10, 12, 18, 22, 23], "im2latex": 10, "100k": 10, "shall": [10, 11], "label0_token0": [10, 11], "label0_token1": [10, 11], "label0_tokenn0": [10, 11], "label1_token0": [10, 11], "label1_token1": [10, 11], "label1_tokenn1": [10, 11], "label2_token0": [10, 11], "label2_token1": [10, 11], "label2_tokenn2": [10, 11], "image0_path": 10, "image1_path": 10, "image2_path": 10, "fourier": 11, "stft": 11, "convolut": [11, 16, 24], "sudo": 11, "apt": 11, "sox": 11, "libsox": 11, "dev": [11, 12], "fmt": 11, "librosa": 11, "an4_dataset": 11, "300": 11, "audio_enc_pool": 11, "0003": 11, "100000": [11, 21], "speech0_path": 11, "speech1_path": 11, "speech2_path": 11, "sample_r": 11, "16000": 11, "window_s": 11, "spectrogram": 11, "window_strid": 11, "ham": 11, "deepspeech": 11, "exploit": 12, "tempor": 12, "youtubeclip": 12, "xvf": 12, "decompress": 12, "archiv": 12, "youtube2text": 12, "throw": 12, "awai": 12, "googlenet": 12, "youtube2text_iccv15": 12, "yt2t": 12, "vid": 12, "avi": 12, "pickl": 12, "yt": 12, "ytc": 12, "youtub": 12, "hash": 12, "join": 12, "dict_youtube_map": 12, "pkl": 12, "rb": 12, "yt2vid": 12, "listdir": 12, "hashi": 12, "ext": 12, "splitext": 12, "fpath_old": 12, "f_new": 12, "fpath_new": 12, "low": 12, "framer": 12, "fi": 12, "ffmpeg": 12, "done": [12, 18], "frame": 12, "variabl": [12, 15, 18], "y2t2": 12, "back": [12, 14], "pwd": 12, "img_feature_extractor": 12, "restrict": [12, 19, 21, 22], "pythonpath": 12, "vid_feature_extractor": 12, "root_dir": 12, "out_dir": 12, "r152": 12, "count": [12, 14, 15, 18, 19, 21, 22], "equal": 
[12, 18, 21], "1970": 12, "wc": 12, "rerun": 12, "miss": 12, "unexpect": 12, "issu": 12, "associ": 12, "filenam": [12, 21], "skip": [12, 19, 21], "ann": 12, "vid2ann": 12, "vid_nam": 12, "item": [12, 15, 16], "keyerror": 12, "train_fil": 12, "yt2t_train_fil": 12, "val_fil": 12, "yt2t_val_fil": 12, "val_fold": 12, "yt2t_val_folded_fil": 12, "test_fil": 12, "yt2t_test_fil": 12, "train_cap": 12, "yt2t_train_cap": 12, "val_cap": 12, "yt2t_val_cap": 12, "vid_path": 12, "npy": 12, "enumer": 12, "split_nam": 12, "elif": 12, "assert": 12, "small": [12, 23], "0001": [12, 23], "model_step_7200": 12, "7200": 12, "frequenc": [12, 19, 21, 22], "coco": 12, "fork": 12, "flaut": 12, "url": [12, 13, 24], "pprint": 12, "pycocoevalcap": 12, "meteor": 12, "cider": 12, "spice": 12, "__name__": 12, "__main__": 12, "scorer": [12, 18], "gt": 12, "outp": 12, "vid_id": 12, "all_scor": 12, "compute_scor": 12, "isinst": 12, "sc": 12, "bleu1": 12, "7888553878084233": 12, "bleu2": 12, "6729376621109295": 12, "bleu3": 12, "5778428507344473": 12, "bleu4": 12, "47633625833397897": 12, "7122415518428051": 12, "31829562714082704": 12, "6811305229481235": 12, "044147089472463576": 12, "stack": [12, 16, 21], "against": 12, "row": 12, "tabl": [12, 16, 22], "4028": 12, "2900": 12, "4801": 12, "downsampl": 12, "26": [12, 24], "240": 12, "fp": 12, "resnet": 12, "lowercas": 12, "tvt": 12, "view": 12, "msvd": 12, "yt2t_2": 12, "untar": 12, "subssampl": 12, "reprocess": 12, "2345": 12, "maketran": 12, "whitespac": 12, "train_data": 12, "val_data": 12, "test_data": 12, "datum": 12, "50": [12, 21, 22], "model_step_": 12, "report_everi": [12, 21, 23], "share_decoder_embed": [12, 21], "7000": 12, "estim": [12, 14], "epoch": 12, "scale": [12, 14, 16, 18, 21, 22, 23], "accordingli": 12, "earli": [12, 14, 21], "stop": [12, 14, 18, 19, 21, 22], "find_val_stop": 12, "test_early_stop": 12, "process_result": 12, "argpars": [12, 17], "defaultdict": 12, "panda": 12, "pd": 12, "load_result": 12, "fname": 12, "junk": 12, "score_lin": 12, "metric": [12, 14, 21, 22], "score_num": 12, "float": [12, 14, 16, 18], "endswith": 12, "df": 12, "datafram": 12, "find_absolute_stop": 12, "idxmax": 12, "find_early_stop": 12, "stop_count": 12, "count_since_max": 12, "ended_metr": 12, "iterrow": 12, "seri": 12, "find_stop": 12, "argumentpars": 12, "locat": [12, 19, 21, 22, 23], "add_argu": 12, "wors": 12, "parse_arg": 12, "idx": 12, "iteritem": 12, "print": [12, 14, 21, 22], "loc": 12, "touch": 12, "1v": 12, "model_step": 12, "echo": 12, "null": [12, 23], "val_stop": 12, "test_result": 12, "IFS": 12, "awk": 12, "nf": 12, "tee": 12, "cat": 12, "thu": [12, 14], "2000": 12, "took": 12, "522": 12, "testlen": 12, "3410": 12, "reflen": 12, "3417": 12, "guess": 12, "2740": 12, "2070": 12, "1400": 12, "2664": 12, "1562": 12, "887": 12, "386": 12, "9979514193734276": 12, "7796296150773093": 12, "6659837622637965": 12, "5745524496015597": 12, "4779574102543823": 12, "7541600090591118": 12, "3259497476899707": 12, "6800279518634998": 12, "046435637924854": 12, "72": 12, "11": 12, "24m": 12, "perhap": 12, "residu": [12, 16, 21], "altern": [12, 16], "9861": 12, "fewer": 12, "overal": 12, "nearli": 12, "favor": 12, "portal": 13, "packag": [13, 17], "readi": 13, "go": [13, 18, 19, 21, 23], "familiar": 13, "yourself": 13, "research": [13, 21], "guillaum": 13, "klein": 13, "yoon": 13, "kim": 13, "jean": 13, "senellart": 13, "proc": [13, 24], "doi": [13, 24], "18653": 13, "p17": 13, "4012": 13, "gitter": 13, "channel": [13, 16], "basemodel": 14, "encoderbas": [14, 16], 
"decoderbas": [14, 16], "src_len": [14, 16, 18], "bptt": [14, 21], "with_align": 14, "propag": 14, "longtensor": [14, 16, 18], "tgt_len": [14, 16], "boolean": [14, 18], "init": [14, 16, 21], "floattensor": [14, 16, 18], "load_safe_state_dict": 14, "model_path": 14, "precis": [14, 22, 23], "float32": 14, "strict": 14, "offset": [14, 15], "state_dict": [14, 21], "load_state_dict": 14, "serial": [14, 16], "wrt": 14, "wai": [14, 18, 23], "nmtmodel": [14, 16], "count_paramet": 14, "callback": 14, "enc_out": [14, 16, 18], "exclud": 14, "initiliaz": 14, "enc_final_h": [14, 16], "languagemodel": 14, "transformerlmdecod": 14, "train_loss": 14, "valid_loss": 14, "scoring_prepar": 14, "valid_scor": 14, "trunc_siz": 14, "norm_method": 14, "sent": [14, 15, 21, 22, 23], "parallel_mod": [14, 21, 22], "data_parallel": [14, 21, 22], "report_manag": 14, "model_sav": 14, "average_decai": [14, 21], "average_everi": [14, 21], "fp32": [14, 21, 22], "earlystopp": 14, "zero_out_prompt_loss": [14, 21, 23], "util": 14, "losscomputebas": 14, "scoringprepar": 14, "calcul": [14, 16, 18, 23], "_eval_handl": 14, "accum": [14, 15], "ordin": 14, "rank": [14, 18, 21, 22], "reportmgrbas": 14, "lear": 14, "modelsaverbas": 14, "saver": 14, "earlystop": 14, "mecan": 14, "ff": [14, 16], "dropaout": 14, "schedul": 14, "zero": [14, 16, 18, 21, 22, 23], "mostli": [14, 21], "train_it": 14, "valid_it": 14, "loop": 14, "possibli": [14, 16], "nmt": [14, 18, 21], "moving_averag": [14, 21], "n_batch": 14, "n_sent": 14, "n_word": 14, "n_correct": 14, "computed_metr": 14, "accuraci": [14, 18], "elaps": 14, "static": [14, 21], "all_gather_stat": 14, "max_siz": 14, "accross": 14, "buffer": [14, 21], "all_gather_stats_list": 14, "stat_list": 14, "our_stat": 14, "elapsed_tim": 14, "log_tensorboard": 14, "writer": 14, "patienc": 14, "displai": 14, "num_step": 14, "stdout": 14, "ppl": 14, "update_n_src_word": 14, "sume": 14, "n_src_word": 14, "xent": 14, "cross": [14, 16, 21], "entropi": 14, "losscomput": 14, "criterion": 14, "lambda_coverag": [14, 21], "tgt_shift_index": 14, "lm_gener": 14, "lm_prior_lambda": [14, 21], "lm_prior_tau": [14, 21], "lm_prior_model": [14, 21], "nlloss": 14, "off": [14, 21], "hyper": 14, "param": 14, "scaler": 14, "attn": [14, 16, 18, 22], "trunc_start": 14, "approxim": [14, 21], "reliev": 14, "tupl": [14, 16], "from_opt": [14, 15, 16], "subclass": [14, 16, 18], "wrap": [14, 17], "nllloss": 14, "relev": [14, 16, 18], "ignore_prompt": 14, "mask_befor": 14, "suppos": 14, "crossentropyloss": 14, "learning_rate_decay_fn": 14, "thin": 14, "grad": 14, "callabl": [14, 18], "factor": 14, "clip": 14, "properti": [14, 16], "amp": [14, 21], "backward": [14, 16], "ownership": 14, "emploi": 14, "training_step": 14, "zero_grad": 14, "set_to_non": 14, "adafactor": [14, 21], "lr": 14, "beta1": [14, 21], "beta2": [14, 21], "999": [14, 21], "eps1": 14, "1e": [14, 16, 21], "eps2": 14, "001": [14, 21], "cliping_threshold": 14, "non_constant_decai": 14, "enable_factor": 14, "ams_grad": 14, "weight_decai": 14, "closur": 14, "reevalu": 14, "unless": 14, "otherwis": [14, 21, 22], "fusedadam": [14, 21, 23], "bias_correct": 14, "ep": [14, 16], "08": 14, "eps_inside_sqrt": 14, "amsgrad": 14, "coeffici": 14, "squar": 14, "denomin": 14, "l2": 14, "variant": [14, 21, 23], "converg": [14, 23], "NOT": 14, "moment": [14, 21], "root": 14, "output_param": 14, "grad_norm": 14, "half": [14, 21], "dynamicdatasetit": 15, "corpora_info": 15, "skip_empty_level": [15, 19, 21, 23], "iterabledataset": 15, "corpustask": 15, "multipli": 15, "increment": [15, 
21], "secur": [15, 19, 21], "encout": [15, 19, 21], "sort_kei": 15, "mixer": 15, "mixingstrategi": 15, "batch_it": 15, "chunk": 15, "initil": 15, "sequentialmix": 15, "exhaust": 15, "weightedmix": 15, "weightedli": 15, "infinit": 15, "parallelcorpusiter": 15, "transformpip": 15, "word_vocab_s": 16, "word_padding_idx": 16, "feat_padding_idx": 16, "feat_vocab_s": 16, "spars": 16, "freeze_word_vec": 16, "abil": 16, "linguist": [16, 24], "sh16": [16, 24], "positionalencod": 16, "feat_dim_expon": 16, "embbed": 16, "emb_lut": 16, "nfeat": 16, "embedding_s": 16, "load_pretrained_vector": 16, "emb_fil": 16, "word_lut": 16, "dim": [16, 21], "enc_typ": 16, "max_len": 16, "vsp": [16, 24], "seq_len": [16, 18], "nonetyp": [16, 18], "stepwis": 16, "position_ffn": 16, "positionwisefeedforward": [16, 21], "d_model": 16, "d_ff": 16, "activation_fn": 16, "relu": [16, 21], "add_ffnbia": [16, 21, 23], "parallel_residu": [16, 21, 23], "layer_norm": [16, 21, 23], "norm_ep": [16, 21], "06": [16, 21], "parallel_gpu": 16, "fnn": 16, "activationfunct": 16, "input_len": 16, "model_dim": 16, "2x": 16, "num_lay": 16, "transformerencod": 16, "relative_positions_bucket": [16, 21], "pos_ffn_activation_fn": [16, 21, 23], "num_kv": [16, 21], "rotary_interleav": [16, 21], "rotary_theta": [16, 21], "rotary_dim": [16, 21], "inner": 16, "rnnencod": 16, "use_bridg": 16, "gru": [16, 21], "sru": [16, 21], "ggnnencod": 16, "autocr": 16, "cnnencod": 16, "cnn_kernel_width": [16, 21], "gag": [16, 24], "meanencod": 16, "trivial": 16, "simpli": [16, 23], "pool": 16, "transformerdecod": 16, "self_attn_typ": [16, 21, 22, 23], "aan_useffn": [16, 21], "shared_layer_norm": [16, 21], "sliding_window": [16, 21], "num_expert": [16, 21], "num_experts_per_tok": [16, 21], "transformerdecoderbas": 16, "flash": [16, 21, 22, 23], "context_attn": 16, "distanc": [16, 21], "aan": [16, 21], "guid": 16, "kv": [16, 21], "multiqueri": [16, 21], "neox": [16, 21], "epsilon": [16, 21], "width": 16, "band": 16, "mistral": 16, "interleav": [16, 21], "theta": [16, 21], "expert": [16, 21], "moe": 16, "tlen": 16, "feat": [16, 17, 19, 21, 22], "slen": 16, "rnndecoderbas": 16, "bidirectional_encod": 16, "attn_typ": 16, "attn_func": 16, "softmax": [16, 21, 22], "coverage_attn": [16, 21], "context_g": [16, 21], "copy_attn_typ": [16, 21], "globalattent": 16, "contextg": 16, "dec_out": 16, "init_st": 16, "stdrnndecod": 16, "fulli": 16, "cudnn": 16, "By": [16, 24], "bcb14": [16, 24], "input_feed": 16, "inputfeedrnndecod": 16, "lpm15": [16, 24], "cnndecod": 16, "convmultistepattent": 16, "enc_hidden": 16, "matrix": [16, 21], "parameter": 16, "convex": 16, "combin": [16, 22], "construct": 16, "sum_": 16, "seqlength": 16, "a_j": 16, "h_j": 16, "w_a": 16, "v_a": 16, "tanh": 16, "u_a": 16, "sparsemax": [16, 21], "yet": [16, 18], "distribtut": 16, "h_t": 16, "h_": 16, "unnorm": 16, "multiheadedattent": 16, "head_count": 16, "is_decod": 16, "simulatan": 16, "select": [16, 18, 21], "divis": 16, "return_attn": 16, "key_len": 16, "query_len": 16, "binari": 16, "averageattent": 16, "acceler": [16, 24], "zxs18": [16, 24], "layer_in": 16, "t_len": 16, "gating_out": 16, "average_out": 16, "input_s": 16, "conv": [16, 21], "oper": [16, 19], "apply_mask": 16, "base_target_emb": 16, "input_from_dec": 16, "encoder_out_top": 16, "encoder_out_combin": 16, "height": 16, "calc": 16, "copygener": 16, "output_s": 16, "pad_idx": 16, "slm17": [16, 24], "p_": 16, "tgt_dict": 16, "z": 16, "probil": 16, "taken": 16, "src_map": [16, 18], "impli": 16, "extra_word": 16, "structured_attent": 16, 
"matrixtre": 16, "tree": 16, "theorem": 16, "margin": 16, "ll17": [16, 24], "overridden": 16, "recip": 16, "within": [16, 17], "afterward": 16, "former": 16, "care": 16, "regist": 16, "hook": 16, "latter": 16, "silent": [16, 19, 21, 23], "ignor": [16, 19, 21, 22, 23], "translation_serv": 17, "servermodel": 17, "preprocess_opt": 17, "tokenizer_opt": 17, "postprocess_opt": 17, "custom_opt": 17, "features_opt": 17, "processu": 17, "postprocess": 17, "func": [17, 18], "do_timeout": 17, "neg": [17, 21], "build_token": 17, "attr": 17, "on_timemout": 17, "maybe_convert_align": 17, "align_scor": 17, "correspand": 17, "maybe_detoken": 17, "maybe_detokenize_with_align": 17, "seper": 17, "maybe_postprocess": 17, "maybe_preprocess": 17, "maybe_token": 17, "maybe_transform_feat": 17, "raw_src": 17, "tok_src": 17, "inferfeatstransform": 17, "parse_opt": 17, "namespac": 17, "rebuild_seg_packag": 17, "all_preprocess": 17, "rebuild": 17, "segment": [17, 19, 21, 22], "n_seg": 17, "to_gpu": 17, "tokenizer_mark": 17, "marker": 17, "servermodelerror": 17, "timer": 17, "translationserv": 17, "clone_model": 17, "list_model": 17, "load_model": 17, "model_kwarg": 17, "preload_model": 17, "preload": 17, "intern": 17, "datastructur": 17, "lua": 17, "config_fil": 17, "unload_model": 17, "cancel": 17, "srclen": 18, "pred_sent": 18, "pred_scor": 18, "tgt_sent": 18, "gold_scor": 18, "word_align": 18, "ind_in_bucket": 18, "prob": 18, "gold_sent": 18, "sent_numb": 18, "src_raw": 18, "max_length_ratio": [18, 22], "random_sampling_topk": [18, 22, 23], "random_sampling_temp": [18, 22, 23], "dump_beam": [18, 22], "frozenset": 18, "replace_unk": [18, 22], "ban_unk_token": [18, 22], "phrase_t": [18, 22], "report_tim": [18, 22, 23], "global_scor": 18, "report_scor": 18, "logger": 18, "with_scor": [18, 22], "return_gold_log_prob": 18, "translate_batch": 18, "attn_debug": [18, 22], "translationbuild": 18, "underli": 18, "address": 18, "rare": 18, "lsl": [18, 24], "unknown": 18, "decodestrategi": 18, "parallel_path": 18, "exclusion_token": 18, "return_attent": 18, "magic": 18, "shortest": [18, 23], "begin": 18, "longest": 18, "presum": 18, "cutoff": 18, "forbidden": 18, "hold": 18, "inp_seq_len": 18, "inp": 18, "seq": 18, "alive_seq": 18, "grow": 18, "axi": 18, "is_finish": 18, "bytetensor": 18, "alive_attn": 18, "target_prefix": 18, "prefix_seq_len": 18, "log_prob": 18, "ngram": [18, 22], "thant": 18, "onc": [18, 21], "put": 18, "lead": [18, 23], "complex": [18, 23], "ingredi": 18, "maybe_update_forbidden_token": 18, "reorder": 18, "forbidden_token": 18, "maybe_update_target_prefix": 18, "select_index": 18, "aliv": 18, "logit": [18, 22], "vocab_s": [18, 21], "update_finish": 18, "attribut": 18, "beamsearch": 18, "beamsearchbas": 18, "greedy_search": 18, "sample_with_temperatur": 18, "sampling_temp": 18, "keep_topk": 18, "keep_topp": 18, "randomli": 18, "categor": 18, "categori": 18, "inf": 18, "logsumexp": 18, "potenti": [18, 23], "chosen": 18, "until": [18, 22], "cumul": [18, 22], "greater": [18, 23], "condit": [18, 21, 22], "topk_id": 18, "topk_scor": 18, "greedysearch": 18, "either": [18, 21], "event": 18, "reach": 18, "gnmtglobalscor": 18, "penaltybuild": 18, "cov_pen": 18, "length_pen": 18, "pen": 18, "cov": 18, "has_cov_pen": 18, "op": 18, "isn": 18, "has_len_pen": 18, "coverage_non": 18, "coverage_summari": 18, "coverage_wu": 18, "gnmt": 18, "wsc": [18, 24], "almost": [18, 21], "length_averag": 18, "cur_len": 18, "length_non": 18, "unmodifi": 18, "length_wu": 18, "save_config": [19, 21, 22], "insert_mask_before_placehold": 
[19, 21, 22, 23], "num_thread": 19, "learn_subword": 19, "learn_subwords_s": 19, "vocab_sample_queue_s": 19, "decoder_start_token": [19, 21, 23], "default_speci": [19, 21], "response_pattern": [19, 21, 22, 23], "scripts_ok": [19, 21, 22], "scripts_nok": [19, 21, 22], "langid": [19, 21, 22], "src_subword_vocab": [19, 21, 22], "tgt_subword_vocab": [19, 21, 22], "src_vocab_threshold": [19, 21, 22], "tgt_vocab_threshold": [19, 21, 22], "gpt2_pretok": [19, 21, 22, 23], "spacer": [19, 21, 22], "encount": [19, 21], "rais": [19, 21], "32000": [19, 23], "tau": [19, 21, 22], "wpdn18": [19, 21, 22, 24], "smaller": [19, 21, 22], "divers": [19, 21, 22], "patten": [19, 21, 22], "fuzzi": [19, 21, 22], "inlin": [19, 21, 22], "ph_": [19, 21, 22], "_beg": [19, 21, 22], "_end": [19, 21, 22], "_std": [19, 21, 22], "unicodata": [19, 21, 22], "unigram": [19, 21, 22], "earlier": [19, 21, 22], "byte": [19, 21, 22], "192": [19, 21, 22], "prepend": [19, 21, 22], "rotat": [19, 21, 22], "percentag": [19, 21, 22], "often": [19, 21, 22], "fraction": [19, 21, 22], "debug": [20, 21, 22], "model_config": 20, "dump_transform": [21, 23], "src_words_min_frequ": 21, "tensor_parallel": [21, 22], "gpu_backend": [21, 22], "gpu_verbose_level": [21, 22], "freeze_encod": 21, "freeze_decod": 21, "gelu": 21, "silu": [21, 23], "input_fe": 21, "global_attention_funct": 21, "generator_funct": 21, "copy_attn_forc": 21, "loss_scal": 21, "apex_opt_level": [21, 23], "o0": 21, "o1": 21, "o2": 21, "o3": 21, "save_format": [21, 23], "safetensor": [21, 23], "keep_checkpoint": [21, 23], "keep_stat": 21, "single_pass": 21, "early_stop": 21, "early_stopping_criteria": 21, "adadelta": 21, "sparseadam": 21, "adamw8bit": 21, "pagedadamw8bit": 21, "pagedadamw32bit": 21, "truncated_decod": [21, 23], "adam_beta1": 21, "decay_step": [21, 23], "noamwd": 21, "rsqrt": 21, "log_fil": [21, 22], "log_file_level": [21, 22], "critic": [21, 22], "notset": [21, 22], "valid_metr": 21, "scoring_debug": 21, "dump_pr": 21, "exp_host": 21, "tensorboard_log_dir": 21, "override_opt": [21, 23], "bnb_8bit": [21, 22], "bnb_fp4": [21, 22], "awq_gemm": [21, 22], "awq_gemv": [21, 22], "w_bit": [21, 22], "group_siz": [21, 22], "disk": 21, "32768": [21, 23], "discard": 21, "backend": [21, 22], "nccl": [21, 22], "localhost": [21, 22], "gou": [21, 22], "sin": 21, "mark": 21, "feat_merge_s": 21, "experiment": 21, "kernel_s": 21, "dict_kei": 21, "autogener": 21, "dotprod": 21, "encodingw": 21, "embeddingsmor": 21, "09864set": 21, "usemaximum": 21, "pdf": 21, "biasmor": 21, "slice": 21, "meta": [21, 23], "hug": 21, "face": 21, "length1e4": 21, "llama2": 21, "mistral1e6": 21, "mixtral": 21, "mhanot": 21, "proj": 21, "attentionnot": 21, "1911": 21, "02150": 21, "falcon": [21, 23], "40b": 21, "position_wis": 21, "layernot": 21, "40bsame": 21, "garg": 21, "2019": 21, "1909": 21, "02074": 21, "leav": 21, "lambda_prior_lambda": 21, "lambda_prior_tau": 21, "opt_level": 21, "io": 21, "Will": 21, "pick": 21, "awith": 21, "mini": 21, "refil": 21, "_n": 21, "2106": 21, "09685": 21, "successfulli": 21, "thumb": 21, "uniform": 21, "xavier_uniform": 21, "resett": 21, "readm": 21, "criteria": 21, "mirror": 21, "initial_accumulator_valu": 21, "literatur": 21, "seemingli": 21, "discourag": 21, "consider": 21, "adopt": 21, "kera": 21, "www": 21, "api_doc": 21, "tf": 21, "adamoptim": 21, "recent": 21, "1512": 21, "00567": 21, "marian": 21, "aclweb": 21, "anthologi": 21, "p18": 21, "4020": 21, "exponenti": 21, "wikipedia": 21, "update_learning_r": 21, "gone": 21, "warmup": 21, "4000": 21, "under": [21, 
22], "crayon": 21, "int8": 22, "avg_raw_prob": 22, "profil": 22, "align_debug": 22, "dtypefp32": 22, "gtx1080int8": 22, "nativ": 22, "whose": [22, 23], "learnt": 22, "1904": 22, "09751": 22, "minimum": 22, "250": 22, "european": 22, "enoughfor": 22, "asian": 22, "3for": 22, "burmes": 22, "amhar": 22, "repetit": 22, "had": [22, 23], "highest": 22, "proba": 22, "10k": 23, "s3": 23, "amazonaw": 23, "trainingdata": 23, "gz": 23, "xf": 23, "5k": 23, "nation": 23, "bureaucraci": 23, "parliament": 23, "apo": 23, "legisl": 23, "prerog": 23, "void": 23, "provis": 23, "extent": 23, "laid": 23, "feder": 23, "senior": 23, "instructor": 23, "italian": 23, "fit": 23, "postur": 23, "gym": 23, "stretch": 23, "pilat": 23, "2004": 23, "collabor": 23, "antich": 23, "person": 23, "toy_en_d": 23, "simplest": 23, "dump_field": 23, "simplifi": 23, "inspect": 23, "advand": 23, "model_step_1000": 23, "pred_1000": 23, "terribl": 23, "million": 23, "t5": 23, "openllama": 23, "redpajama": 23, "xgen": 23, "flan": 23, "convert_openllama": 23, "path_to_hf_model": 23, "path_to_token": 23, "path_to_openllama": 23, "nshard": 23, "reconstruct": 23, "convert_mpt": 23, "vocab_fil": 23, "path_to_mpt": 23, "huggin": 23, "though": 23, "mandatori": 23, "path_to": 23, "similarli": 23, "mpt7b": 23, "conserv": 23, "mmlu": 23, "inhomogen": 23, "high": 23, "degrad": 23, "eval_llm": 23, "run_mmlu_opennmt": 23, "myinfer": 23, "easier": 23, "ramdom": 23, "path_to_config": 23, "path_to_sourc": 23, "path_to_target": 23, "alpaca_clean": 23, "1234": 23, "w_3": 23, "linear_kei": 23, "final_linear": 23, "11008": 23, "dzmitri": 24, "kyunghyun": 24, "yoshua": 24, "1409": 24, "0473": 24, "0473v3": 24, "1146": 24, "annurev": 24, "neuro": 24, "041002": 24, "131047": 24, "jona": 24, "gehr": 24, "michael": 24, "auli": 24, "david": 24, "grangier": 24, "deni": 24, "yarat": 24, "yann": 24, "dauphin": 24, "1705": 24, "03122": 24, "yang": 24, "mirella": 24, "lapata": 24, "09207": 24, "minh": 24, "thang": 24, "hieu": 24, "christoph": 24, "ffectiv": 24, "pproach": 24, "ttention": 24, "eural": 24, "achin": 24, "ranslat": 24, "ilya": 24, "sutskev": 24, "quoc": 24, "le": 24, "oriol": 24, "wojciech": 24, "zaremba": 24, "ddress": 24, "ord": 24, "roblem": 24, "abigail": 24, "peter": 24, "1704": 24, "04368": 24, "rico": 24, "sennrich": 24, "barri": 24, "haddow": 24, "preprint": 24, "1606": 24, "02892": 24, "2016": 24, "ashish": 24, "vaswani": 24, "shazeer": 24, "niki": 24, "parmar": 24, "jakob": 24, "uszkoreit": 24, "llion": 24, "jone": 24, "aidan": 24, "gomez": 24, "lukasz": 24, "kaiser": 24, "illia": 24, "polosukhin": 24, "1706": 24, "03762": 24, "xinyi": 24, "wang": 24, "zihang": 24, "graham": 24, "neubig": 24, "1808": 24, "07512": 24, "yonghui": 24, "mike": 24, "schuster": 24, "zhifeng": 24, "chen": 24, "mohammad": 24, "norouzi": 24, "wolfgang": 24, "macherei": 24, "krikun": 24, "yuan": 24, "cao": 24, "qin": 24, "gao": 24, "klau": 24, "gap": 24, "human": 24, "1609": 24, "08144": 24, "biao": 24, "zhang": 24, "deyi": 24, "xiong": 24, "jinsong": 24, "su": 24, "1805": 24, "00631": 24}, "objects": {"onmt.decoders": [[16, 0, 1, "", "CNNDecoder"], [16, 0, 1, "", "DecoderBase"], [16, 0, 1, "", "InputFeedRNNDecoder"], [16, 0, 1, "", "StdRNNDecoder"], [16, 0, 1, "", "TransformerDecoder"]], "onmt.decoders.CNNDecoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"], [16, 1, 1, "", "init_state"]], "onmt.decoders.DecoderBase": [[16, 1, 1, "", "from_opt"]], "onmt.decoders.TransformerDecoder": [[16, 1, 1, "", "forward"]], "onmt.decoders.decoder": [[16, 0, 1, "", 
"RNNDecoderBase"]], "onmt.decoders.decoder.RNNDecoderBase": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"], [16, 1, 1, "", "init_state"]], "onmt.encoders": [[16, 0, 1, "", "CNNEncoder"], [16, 0, 1, "", "EncoderBase"], [16, 0, 1, "", "GGNNEncoder"], [16, 0, 1, "", "MeanEncoder"], [16, 0, 1, "", "RNNEncoder"], [16, 0, 1, "", "TransformerEncoder"]], "onmt.encoders.CNNEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.encoders.EncoderBase": [[16, 1, 1, "", "forward"]], "onmt.encoders.GGNNEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.encoders.MeanEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.encoders.RNNEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.encoders.TransformerEncoder": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "from_opt"]], "onmt.inputters": [[15, 0, 1, "", "DynamicDatasetIter"], [15, 0, 1, "", "MixingStrategy"], [15, 0, 1, "", "ParallelCorpus"], [15, 0, 1, "", "ParallelCorpusIterator"], [15, 0, 1, "", "SequentialMixer"], [15, 0, 1, "", "WeightedMixer"]], "onmt.inputters.DynamicDatasetIter": [[15, 1, 1, "", "batch_iter"], [15, 1, 1, "", "from_opt"]], "onmt.inputters.ParallelCorpus": [[15, 1, 1, "", "load"]], "onmt.models": [[14, 0, 1, "", "BaseModel"], [14, 0, 1, "", "LanguageModel"], [14, 0, 1, "", "NMTModel"]], "onmt.models.BaseModel": [[14, 1, 1, "", "forward"], [14, 1, 1, "", "load_safe_state_dict"], [14, 1, 1, "", "load_state_dict"]], "onmt.models.LanguageModel": [[14, 1, 1, "", "count_parameters"], [14, 1, 1, "", "forward"]], "onmt.models.NMTModel": [[14, 1, 1, "", "count_parameters"], [14, 1, 1, "", "forward"]], "onmt.modules": [[16, 0, 1, "", "AverageAttention"], [16, 0, 1, "", "ConvMultiStepAttention"], [16, 0, 1, "", "CopyGenerator"], [16, 0, 1, "", "Embeddings"], [16, 0, 1, "", "GlobalAttention"], [16, 0, 1, "", "MultiHeadedAttention"], [16, 0, 1, "", "PositionalEncoding"]], "onmt.modules.AverageAttention": [[16, 1, 1, "", "forward"]], "onmt.modules.ConvMultiStepAttention": [[16, 1, 1, "", "apply_mask"], [16, 1, 1, "", "forward"]], "onmt.modules.CopyGenerator": [[16, 1, 1, "", "forward"]], "onmt.modules.Embeddings": [[16, 2, 1, "", "emb_luts"], [16, 1, 1, "", "forward"], [16, 1, 1, "", "load_pretrained_vectors"], [16, 2, 1, "", "word_lut"]], "onmt.modules.GlobalAttention": [[16, 1, 1, "", "forward"], [16, 1, 1, "", "score"]], "onmt.modules.MultiHeadedAttention": [[16, 1, 1, "", "forward"]], "onmt.modules.PositionalEncoding": [[16, 1, 1, "", "forward"]], "onmt.modules.position_ffn": [[16, 0, 1, "", "PositionwiseFeedForward"]], "onmt.modules.position_ffn.PositionwiseFeedForward": [[16, 1, 1, "", "forward"]], "onmt.modules.structured_attention": [[16, 0, 1, "", "MatrixTree"]], "onmt.modules.structured_attention.MatrixTree": [[16, 1, 1, "", "forward"]], "onmt.trainer": [[14, 0, 1, "", "Trainer"]], "onmt.trainer.Trainer": [[14, 1, 1, "", "train"], [14, 1, 1, "", "validate"]], "onmt.translate": [[18, 0, 1, "", "BeamSearch"], [18, 0, 1, "", "DecodeStrategy"], [18, 0, 1, "", "GNMTGlobalScorer"], [18, 0, 1, "", "GreedySearch"], [18, 0, 1, "", "Translation"], [18, 0, 1, "", "TranslationBuilder"], [18, 0, 1, "", "Translator"]], "onmt.translate.BeamSearch": [[18, 1, 1, "", "initialize"]], "onmt.translate.DecodeStrategy": [[18, 1, 1, "", "advance"], [18, 1, 1, "", "block_ngram_repeats"], [18, 1, 1, "", "initialize"], [18, 1, 1, "", "maybe_update_forbidden_tokens"], [18, 1, 1, "", "maybe_update_target_prefix"], [18, 1, 1, "", "target_prefixing"], [18, 1, 1, "", 
"update_finished"]], "onmt.translate.GreedySearch": [[18, 1, 1, "", "advance"], [18, 1, 1, "", "initialize"], [18, 1, 1, "", "update_finished"]], "onmt.translate.Translation": [[18, 1, 1, "", "log"]], "onmt.translate.Translator": [[18, 1, 1, "", "translate_batch"]], "onmt.translate.greedy_search": [[18, 3, 1, "", "sample_with_temperature"]], "onmt.translate.penalties": [[18, 0, 1, "", "PenaltyBuilder"]], "onmt.translate.penalties.PenaltyBuilder": [[18, 1, 1, "", "coverage_none"], [18, 1, 1, "", "coverage_summary"], [18, 1, 1, "", "coverage_wu"], [18, 1, 1, "", "length_average"], [18, 1, 1, "", "length_none"], [18, 1, 1, "", "length_wu"]], "onmt.translate.translation_server": [[17, 0, 1, "", "ServerModel"], [17, 4, 1, "", "ServerModelError"], [17, 0, 1, "", "Timer"], [17, 0, 1, "", "TranslationServer"]], "onmt.translate.translation_server.ServerModel": [[17, 1, 1, "", "build_tokenizer"], [17, 1, 1, "", "detokenize"], [17, 1, 1, "", "do_timeout"], [17, 1, 1, "", "maybe_convert_align"], [17, 1, 1, "", "maybe_detokenize"], [17, 1, 1, "", "maybe_detokenize_with_align"], [17, 1, 1, "", "maybe_postprocess"], [17, 1, 1, "", "maybe_preprocess"], [17, 1, 1, "", "maybe_tokenize"], [17, 1, 1, "", "maybe_transform_feats"], [17, 1, 1, "", "parse_opt"], [17, 1, 1, "", "postprocess"], [17, 1, 1, "", "preprocess"], [17, 1, 1, "", "rebuild_seg_packages"], [17, 1, 1, "", "to_gpu"], [17, 1, 1, "", "tokenize"], [17, 1, 1, "", "tokenizer_marker"]], "onmt.translate.translation_server.TranslationServer": [[17, 1, 1, "", "clone_model"], [17, 1, 1, "", "list_models"], [17, 1, 1, "", "load_model"], [17, 1, 1, "", "preload_model"], [17, 1, 1, "", "run"], [17, 1, 1, "", "start"], [17, 1, 1, "", "unload_model"]], "onmt.utils": [[14, 0, 1, "", "AdaFactor"], [14, 0, 1, "", "FusedAdam"], [14, 0, 1, "", "Optimizer"], [14, 0, 1, "", "Statistics"]], "onmt.utils.AdaFactor": [[14, 1, 1, "", "step"]], "onmt.utils.FusedAdam": [[14, 1, 1, "", "step"]], "onmt.utils.Optimizer": [[14, 2, 1, "", "amp"], [14, 1, 1, "", "backward"], [14, 1, 1, "", "from_opt"], [14, 1, 1, "", "learning_rate"], [14, 1, 1, "", "step"], [14, 2, 1, "", "training_step"], [14, 1, 1, "", "zero_grad"]], "onmt.utils.Statistics": [[14, 1, 1, "", "accuracy"], [14, 1, 1, "", "all_gather_stats"], [14, 1, 1, "", "all_gather_stats_list"], [14, 1, 1, "", "elapsed_time"], [14, 1, 1, "", "log_tensorboard"], [14, 1, 1, "", "output"], [14, 1, 1, "", "ppl"], [14, 1, 1, "", "update"], [14, 1, 1, "", "xent"]], "onmt.utils.loss": [[14, 0, 1, "", "LossCompute"]], "onmt.utils.loss.LossCompute": [[14, 1, 1, "", "forward"], [14, 1, 1, "", "from_opts"], [14, 1, 1, "", "ignore_prompt"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:property", "3": "py:function", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "property", "Python property"], "3": ["py", "function", "Python function"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"contributor": 0, "guidelin": 0, "docstr": 0, "how": [1, 9, 23], "do": [1, 9], "i": [1, 9], "us": [1, 9], "my": 1, "v2": 1, "model": [1, 5, 6, 9, 14, 17, 21, 22, 23], "v3": 1, "train": [1, 5, 6, 7, 9, 21, 23], "transform": [1, 9, 12, 19, 21, 22], "perform": [1, 2], "tip": [1, 2], "posit": 1, "encod": [1, 16, 21], "absolut": 1, "v": 1, "rel": 1, "rotari": 1, "embed": [1, 9, 16, 21], "alibi": 1, "you": [1, 9], "support": [1, 9], "multi": [1, 4, 9], "gpu": [1, 9], "pretrain": [1, 9, 23], "e": [1, 9], "g": [1, 9], "glove": [1, 9], "exampl": [1, 8, 9], "can": 
[1, 9], "ensembl": [1, 9], "infer": [1, 4, 5, 9, 23], "weight": [1, 9], "differ": [1, 9], "corpora": [1, 9], "what": 1, "special": 1, "token": 1, "doe": 1, "opennmt": [1, 7], "py": [1, 7], "appli": 1, "fly": 1, "subword": [1, 6, 7, 19, 21, 22], "regular": 1, "when": 1, "ar": 1, "readili": 1, "avail": 1, "data": [1, 3, 4, 5, 6, 7, 15, 19, 21, 22, 23], "gener": [1, 6, 21, 23], "purpos": 1, "filter": [1, 19, 21, 22], "length": 1, "add": 1, "custom": 1, "prefix": [1, 19, 21, 22], "suffix": [1, 19, 21, 22], "convert": [1, 23], "uppercas": [1, 19, 21, 22], "normal": [1, 19, 21, 22], "punctuat": 1, "clean": [1, 6, 19, 21, 22], "dataset": [1, 4, 15], "context": 1, "doc": 1, "awar": 1, "augment": 1, "sourc": 1, "segment": 1, "fuzzi": 1, "match": 1, "neural": [1, 3], "repair": 1, "target": 1, "inlin": 1, "tag": 1, "make": 1, "learn": [1, 9], "terminologi": [1, 19, 21, 22], "sentencepiec": 1, "bpe": [1, 6], "nmt": [1, 7], "bart": [1, 19, 21, 22], "style": 1, "nois": 1, "switchout": [1, 19, 21, 22], "sampl": [1, 22], "drop": 1, "some": 1, "mask": 1, "creat": 1, "lora": 1, "8bit": 1, "load": 1, "finetun": [1, 4, 23], "big": 1, "gradient": 1, "checkpoint": [1, 4], "deal": 1, "get": [1, 7, 8, 9], "word": [1, 9], "align": [1, 9, 21], "while": [1, 9], "translat": [1, 7, 9, 18, 22, 23], "raw": [1, 9], "from": [1, 9, 23], "averag": [1, 9], "attent": [1, 9, 16, 21], "head": [1, 9], "supervis": [1, 4, 9], "specif": [1, 6, 9], "updat": 1, "": 1, "vocabulari": [1, 3, 4, 6], "featur": [1, 19, 21, 22], "set": 1, "up": 1, "server": [1, 17, 20], "work": 1, "configur": [1, 19, 21, 22], "ii": 1, "start": [1, 3, 8, 10, 11], "without": 1, "docker": 1, "0": [1, 6], "code": 1, "1": [1, 6, 23], "instal": [1, 13], "flask": 1, "2": [1, 6, 23], "put": 1, "3": [1, 6, 23], "iii": 1, "iv": 1, "api": [1, 8], "hostnam": 1, "list": 1, "version": [2, 9], "break": 2, "chang": 2, "gate": 3, "graph": 3, "network": 3, "depend": [3, 4, 7, 10, 11], "quick": [3, 10, 11], "format": [3, 4], "note": 3, "option": [3, 10, 11, 21, 22], "acknowledg": [3, 11], "llama": 4, "7b": 4, "replic": 4, "vicuna": 4, "concaten": 4, "convers": 4, "ctranslat": 4, "round": 4, "simpl": 4, "summar": 5, "cnn": 5, "dm": 5, "prepar": [5, 6, 7, 23], "vocab": [5, 19, 21], "evalu": 5, "gigaword": 5, "score": [5, 18], "refer": [5, 24], "languag": 6, "wiki": 6, "103": 6, "step": [6, 23], "download": 6, "pyonmttok": 6, "build": [6, 19], "command": 6, "4": 6, "output": 6, "wmt17": 7, "en": 7, "de": 7, "pytorch": 7, "apex": 7, "run": 7, "content": 8, "frequent": 8, "ask": 8, "question": 8, "script": 8, "legaci": [8, 9], "faq": 9, "preprocess": 9, "imag": 10, "text": [10, 11, 12, 23], "speech": 11, "video": 12, "recurr": 12, "overview": 13, "citat": 13, "addit": 13, "resourc": 13, "framework": 14, "trainer": 14, "loss": 14, "optim": [14, 21], "loader": 15, "iter": 15, "modul": 16, "decod": [16, 18, 21, 22], "core": 17, "class": 18, "strategi": 18, "token_drop": [19, 21, 22], "token_mask": [19, 21, 22], "docifi": [19, 21, 22], "insertmaskbeforeplaceholderstransform": [19, 21, 22], "fuzzymatch": [19, 21, 22], "inlinetag": [19, 21, 22], "common": [19, 21, 22], "onmttok": [19, 21, 22], "inferfeat": [19, 21, 22], "reproduc": [19, 21, 22], "name": 20, "argument": 20, "prune": 21, "distribut": [21, 22], "task": 21, "initi": 21, "type": 21, "rate": 21, "log": [21, 22], "quant": [21, 22], "beam": 22, "search": 22, "random": 22, "penalti": 22, "trick": 22, "effici": 22, "quickstart": 23, "scratch": 23, "llm": 23, "hug": 23, "face": 23, "hub": 23, "an": 23, "yaml": 23, "config": 23, 
"file": 23}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinxcontrib.bibtex": 9, "sphinx": 57}, "alltitles": {"Contributors": [[0, "contributors"]], "Guidelines": [[0, "guidelines"]], "Docstrings": [[0, "docstrings"]], "How do I use my v2 models in v3 ?": [[1, "how-do-i-use-my-v2-models-in-v3"]], "How do I train the Transformer model?": [[1, "how-do-i-train-the-transformer-model"]], "Performance tips": [[1, "performance-tips"], [2, "performance-tips"]], "Position encoding: Absolute vs Relative vs Rotary Embeddings vs Alibi": [[1, "position-encoding-absolute-vs-relative-vs-rotary-embeddings-vs-alibi"]], "Do you support multi-gpu?": [[1, "do-you-support-multi-gpu"], [9, "do-you-support-multi-gpu"]], "How do I use Pretrained embeddings (e.g. GloVe)?": [[1, "how-do-i-use-pretrained-embeddings-e-g-glove"], [9, "how-do-i-use-pretrained-embeddings-e-g-glove"]], "Example": [[1, "example"], [1, "id1"], [1, "id2"], [1, "id3"], [9, "example"]], "How can I ensemble Models at inference?": [[1, "how-can-i-ensemble-models-at-inference"], [9, "how-can-i-ensemble-models-at-inference"]], "How can I weight different corpora at training?": [[1, "how-can-i-weight-different-corpora-at-training"], [9, "how-can-i-weight-different-corpora-at-training"]], "What special tokens does OpenNMT-py use?": [[1, "what-special-tokens-does-opennmt-py-use"]], "How can I apply on-the-fly tokenization and subword regularization when training?": [[1, "how-can-i-apply-on-the-fly-tokenization-and-subword-regularization-when-training"]], "What are the readily available on-the-fly data transforms?": [[1, "what-are-the-readily-available-on-the-fly-data-transforms"]], "General purpose": [[1, "general-purpose"]], "Filter examples by length": [[1, "filter-examples-by-length"]], "Add custom prefix to examples": [[1, "add-custom-prefix-to-examples"]], "Add custom suffix to examples": [[1, "add-custom-suffix-to-examples"]], "Convert examples to uppercase": [[1, "convert-examples-to-uppercase"]], "Normalize punctuation": [[1, "normalize-punctuation"]], "Clean dataset": [[1, "clean-dataset"]], "Context / Doc aware transform": [[1, "context-doc-aware-transform"]], "Augment source segments with fuzzy matches for Neural Fuzzy Repair": [[1, "augment-source-segments-with-fuzzy-matches-for-neural-fuzzy-repair"]], "Augment source and target segments with inline tags": [[1, "augment-source-and-target-segments-with-inline-tags"]], "Make the model learn to use terminology": [[1, "make-the-model-learn-to-use-terminology"]], "Tokenization": [[1, "tokenization"]], "OpenNMT Tokenizer": [[1, "opennmt-tokenizer"]], "SentencePiece": [[1, "sentencepiece"]], "BPE subword-nmt": [[1, "bpe-subword-nmt"]], "BART-style noise": [[1, "bart-style-noise"]], "SwitchOut and sampling": [[1, "switchout-and-sampling"]], "SwitchOut": [[1, "switchout"]], "Drop some tokens": [[1, "drop-some-tokens"]], "Mask some tokens": [[1, "mask-some-tokens"]], "How can I create custom on-the-fly data transforms?": [[1, "how-can-i-create-custom-on-the-fly-data-transforms"]], "How to use LoRa and 8bit loading to finetune a big model ?": [[1, "how-to-use-lora-and-8bit-loading-to-finetune-a-big-model"]], "How to use gradient checkpointing when dealing with a big model ?": [[1, 
"how-to-use-gradient-checkpointing-when-dealing-with-a-big-model"]], "Can I get word alignments while translating?": [[1, "can-i-get-word-alignments-while-translating"]], "Raw alignments from averaging Transformer attention heads": [[1, "raw-alignments-from-averaging-transformer-attention-heads"], [9, "raw-alignments-from-averaging-transformer-attention-heads"]], "Supervised learning on a specific head": [[1, "supervised-learning-on-a-specific-head"], [9, "supervised-learning-on-a-specific-head"]], "How can I update a checkpoint\u2019s vocabulary?": [[1, "how-can-i-update-a-checkpoint-s-vocabulary"]], "How can I use source word features?": [[1, "how-can-i-use-source-word-features"]], "How can I set up a translation server ?": [[1, "how-can-i-set-up-a-translation-server"]], "I. How it works?": [[1, "i-how-it-works"]], "Configuration:": [[1, "configuration"]], "II. How to start the server without Docker ?": [[1, "ii-how-to-start-the-server-without-docker"]], "0. Get the code": [[1, "get-the-code"]], "1. Install flask": [[1, "install-flask"]], "2. Put some models": [[1, "put-some-models"]], "3. Start the server": [[1, "start-the-server"]], "III. How to start the server with Docker ?": [[1, "iii-how-to-start-the-server-with-docker"]], "IV. How to use the API ?": [[1, "iv-how-to-use-the-api"]], "0. Set the hostname": [[1, "set-the-hostname"]], "1. List models": [[1, "list-models"]], "2. Translate": [[1, "translate"]], "Versions": [[2, "versions"]], "Breaking changes": [[2, "breaking-changes"]], "Gated Graph Neural Networks": [[3, "gated-graph-neural-networks"]], "Dependencies": [[3, "dependencies"], [4, "dependencies"], [7, "dependencies"], [10, "dependencies"], [11, "dependencies"]], "Quick Start": [[3, "quick-start"], [10, "quick-start"], [11, "quick-start"]], "Graph data format": [[3, "graph-data-format"]], "Vocabulary notes": [[3, "vocabulary-notes"]], "Options": [[3, "options"], [10, "options"], [11, "options"]], "Acknowledgement": [[3, "acknowledgement"], [11, "acknowledgement"]], "Supervised Finetuning of llama 7B to replicate Vicuna": [[4, "supervised-finetuning-of-llama-7b-to-replicate-vicuna"]], "Data": [[4, "data"], [19, "Data"], [21, "Data"], [22, "Data"]], "Checkpoints": [[4, "checkpoints"]], "Vocabulary": [[4, "vocabulary"]], "Datasets": [[4, "datasets"]], "Finetuning": [[4, "finetuning"]], "Inference": [[4, "inference"], [5, "inference"]], "Concatenation of the checkpoints": [[4, "concatenation-of-the-checkpoints"]], "Conversion to ctranslate format": [[4, "conversion-to-ctranslate-format"]], "Multi-round conversations with vicuna": [[4, "multi-round-conversations-with-vicuna"]], "Simple inference": [[4, "simple-inference"]], "Summarization CNN/DM": [[5, "summarization-cnn-dm"]], "Preparing the data and vocab": [[5, "preparing-the-data-and-vocab"]], "Training": [[5, "training"], [9, "training"]], "Evaluation": [[5, "evaluation"]], "CNN-DM": [[5, "cnn-dm"], [5, "id1"]], "Gigaword": [[5, "gigaword"], [5, "id2"]], "Scores and Models": [[5, "scores-and-models"]], "References": [[5, "references"], [24, "references"]], "Language Model Wiki-103": [[6, "language-model-wiki-103"]], "Step 0: Download and clean the data": [[6, "step-0-download-and-clean-the-data"]], "Step 1: Prepare the subword model - BPE with pyonmttok": [[6, "step-1-prepare-the-subword-model-bpe-with-pyonmttok"]], "Step 2: Build the vocabulary": [[6, "step-2-build-the-vocabulary"]], "Language Model specificities": [[6, "language-model-specificities"]], "BPE specificities": [[6, "bpe-specificities"]], "Build vocabulary 
command": [[6, "build-vocabulary-command"]], "Step 3: Train the model": [[6, "step-3-train-the-model"]], "Step 4: Generate output": [[6, "step-4-generate-output"]], "Translation WMT17 en-de": [[7, "translation-wmt17-en-de"]], "PyTorch": [[7, "pytorch"]], "Apex": [[7, "apex"]], "Subword-NMT": [[7, "subword-nmt"]], "OpenNMT-py": [[7, "opennmt-py"]], "Running WMT17 EN-DE": [[7, "running-wmt17-en-de"]], "Get Data and prepare": [[7, "get-data-and-prepare"]], "Train": [[7, "train"], [21, "train"]], "Contents": [[8, "contents"]], "Getting Started": [[8, null]], "Frequently Asked Questions": [[8, null]], "Examples": [[8, null]], "Scripts": [[8, null]], "API": [[8, null]], "Legacy": [[8, null]], "FAQ (Legacy version)": [[9, "faq-legacy-version"]], "How do I use the Transformer model?": [[9, "how-do-i-use-the-transformer-model"]], "Preprocessing": [[9, "preprocessing"]], "Can I get word alignment while translating?": [[9, "can-i-get-word-alignment-while-translating"]], "Image to Text": [[10, "image-to-text"]], "Speech to Text": [[11, "speech-to-text"]], "Video to Text": [[12, "video-to-text"]], "Recurrent": [[12, "recurrent"]], "Transformer": [[12, "transformer"]], "Overview": [[13, "overview"]], "Installation": [[13, "installation"]], "Citation": [[13, "citation"]], "Additional resources": [[13, "additional-resources"]], "Framework": [[14, "framework"]], "Model": [[14, "model"], [22, "Model"]], "Trainer": [[14, "trainer"]], "Loss": [[14, "loss"]], "Optimizer": [[14, "optimizer"]], "Data Loaders": [[15, "data-loaders"]], "Data Iterator": [[15, "data-iterator"]], "Dataset": [[15, "dataset"]], "Modules": [[16, "modules"]], "Embeddings": [[16, "embeddings"], [21, "Embeddings"]], "Encoders": [[16, "encoders"]], "Decoders": [[16, "decoders"]], "Attention": [[16, "attention"]], "Server": [[17, "server"], [20, "server"]], "Models": [[17, "models"]], "Core Server": [[17, "core-server"]], "Translation": [[18, "translation"]], "Translations": [[18, "translations"]], "Translator Class": [[18, "translator-class"]], "Decoding Strategies": [[18, "decoding-strategies"]], "Scoring": [[18, "scoring"]], "Build Vocab": [[19, "build-vocab"]], "Configuration": [[19, "Configuration"], [21, "Configuration"], [22, "Configuration"]], "Vocab": [[19, "Vocab"], [21, "Vocab"]], "Features": [[19, "Features"], [21, "Features"], [22, "Features"]], "Transform/SwitchOut": [[19, "Transform/SwitchOut"], [21, "Transform/SwitchOut"], [22, "Transform/SwitchOut"]], "Transform/Token_Drop": [[19, "Transform/Token_Drop"], [21, "Transform/Token_Drop"], [22, "Transform/Token_Drop"]], "Transform/Token_Mask": [[19, "Transform/Token_Mask"], [21, "Transform/Token_Mask"], [22, "Transform/Token_Mask"]], "Transform/Docify": [[19, "Transform/Docify"], [21, "Transform/Docify"], [22, "Transform/Docify"]], "Transform/InsertMaskBeforePlaceholdersTransform": [[19, "Transform/InsertMaskBeforePlaceholdersTransform"], [21, "Transform/InsertMaskBeforePlaceholdersTransform"], [22, "Transform/InsertMaskBeforePlaceholdersTransform"]], "Transform/Uppercase": [[19, "Transform/Uppercase"], [21, "Transform/Uppercase"], [22, "Transform/Uppercase"]], "Transform/FuzzyMatching": [[19, "Transform/FuzzyMatching"], [21, "Transform/FuzzyMatching"], [22, "Transform/FuzzyMatching"]], "Transform/InlineTags": [[19, "Transform/InlineTags"], [21, "Transform/InlineTags"], [22, "Transform/InlineTags"]], "Transform/Clean": [[19, "Transform/Clean"], [21, "Transform/Clean"], [22, "Transform/Clean"]], "Transform/Subword/Common": [[19, "Transform/Subword/Common"], [21, 
"Transform/Subword/Common"], [22, "Transform/Subword/Common"]], "Transform/Subword/ONMTTOK": [[19, "Transform/Subword/ONMTTOK"], [21, "Transform/Subword/ONMTTOK"], [22, "Transform/Subword/ONMTTOK"]], "Transform/Normalize": [[19, "Transform/Normalize"], [21, "Transform/Normalize"], [22, "Transform/Normalize"]], "Transform/InferFeats": [[19, "Transform/InferFeats"], [21, "Transform/InferFeats"], [22, "Transform/InferFeats"]], "Transform/Filter": [[19, "Transform/Filter"], [21, "Transform/Filter"], [22, "Transform/Filter"]], "Transform/Prefix": [[19, "Transform/Prefix"], [21, "Transform/Prefix"], [22, "Transform/Prefix"]], "Transform/Suffix": [[19, "Transform/Suffix"], [21, "Transform/Suffix"], [22, "Transform/Suffix"]], "Transform/Terminology": [[19, "Transform/Terminology"], [21, "Transform/Terminology"], [22, "Transform/Terminology"]], "Transform/BART": [[19, "Transform/BART"], [21, "Transform/BART"], [22, "Transform/BART"]], "Reproducibility": [[19, "Reproducibility"], [21, "Reproducibility"], [22, "Reproducibility"]], "Named Arguments": [[20, "Named Arguments"]], "Pruning": [[21, "Pruning"]], "Distributed": [[21, "Distributed"], [22, "Distributed"]], "Model-Embeddings": [[21, "Model-Embeddings"]], "Model-Embedding Features": [[21, "Model-Embedding Features"]], "Model- Task": [[21, "Model- Task"]], "Model- Encoder-Decoder": [[21, "Model- Encoder-Decoder"]], "Model- Attention": [[21, "Model- Attention"]], "Model - Alignement": [[21, "Model - Alignement"]], "Generator": [[21, "Generator"]], "General": [[21, "General"]], "Initialization": [[21, "Initialization"]], "Optimization- Type": [[21, "Optimization- Type"]], "Optimization- Rate": [[21, "Optimization- Rate"]], "Logging": [[21, "Logging"], [22, "Logging"]], "Quant options": [[21, "Quant options"], [22, "Quant options"]], "Translate": [[22, "translate"]], "Beam Search": [[22, "Beam Search"]], "Random Sampling": [[22, "Random Sampling"]], "Penalties": [[22, "Penalties"]], "Decoding tricks": [[22, "Decoding tricks"]], "Efficiency": [[22, "Efficiency"]], "Quickstart": [[23, "quickstart"]], "How to train a model from scratch": [[23, "how-to-train-a-model-from-scratch"]], "Step 1: Prepare the data": [[23, "step-1-prepare-the-data"]], "Step 2: Train the model": [[23, "step-2-train-the-model"]], "Step 3: Translate": [[23, "step-3-translate"]], "How to generate with a pretrained LLM": [[23, "how-to-generate-with-a-pretrained-llm"]], "Step 1: Convert a model from Hugging Face Hub": [[23, "step-1-convert-a-model-from-hugging-face-hub"], [23, "id1"]], "Step 2: Prepare an inference.yaml config file": [[23, "step-2-prepare-an-inference-yaml-config-file"]], "Step 3: Generate text": [[23, "step-3-generate-text"]], "How to finetune a pretrained LLM": [[23, "how-to-finetune-a-pretrained-llm"]], "Step 2: Prepare an finetune.yaml config file": [[23, "step-2-prepare-an-finetune-yaml-config-file"]], "Step 3: Finetune": [[23, "step-3-finetune"]]}, "indexentries": {"adafactor (class in onmt.utils)": [[14, "onmt.utils.AdaFactor"]], "basemodel (class in onmt.models)": [[14, "onmt.models.BaseModel"]], "fusedadam (class in onmt.utils)": [[14, "onmt.utils.FusedAdam"]], "languagemodel (class in onmt.models)": [[14, "onmt.models.LanguageModel"]], "losscompute (class in onmt.utils.loss)": [[14, "onmt.utils.loss.LossCompute"]], "nmtmodel (class in onmt.models)": [[14, "onmt.models.NMTModel"]], "optimizer (class in onmt.utils)": [[14, "onmt.utils.Optimizer"]], "statistics (class in onmt.utils)": [[14, "onmt.utils.Statistics"]], "trainer (class in onmt.trainer)": [[14, 
"onmt.trainer.Trainer"]], "accuracy() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.accuracy"]], "all_gather_stats() (onmt.utils.statistics static method)": [[14, "onmt.utils.Statistics.all_gather_stats"]], "all_gather_stats_list() (onmt.utils.statistics static method)": [[14, "onmt.utils.Statistics.all_gather_stats_list"]], "amp (onmt.utils.optimizer property)": [[14, "onmt.utils.Optimizer.amp"]], "backward() (onmt.utils.optimizer method)": [[14, "onmt.utils.Optimizer.backward"]], "count_parameters() (onmt.models.languagemodel method)": [[14, "onmt.models.LanguageModel.count_parameters"]], "count_parameters() (onmt.models.nmtmodel method)": [[14, "onmt.models.NMTModel.count_parameters"]], "elapsed_time() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.elapsed_time"]], "forward() (onmt.models.basemodel method)": [[14, "onmt.models.BaseModel.forward"]], "forward() (onmt.models.languagemodel method)": [[14, "onmt.models.LanguageModel.forward"]], "forward() (onmt.models.nmtmodel method)": [[14, "onmt.models.NMTModel.forward"]], "forward() (onmt.utils.loss.losscompute method)": [[14, "onmt.utils.loss.LossCompute.forward"]], "from_opt() (onmt.utils.optimizer class method)": [[14, "onmt.utils.Optimizer.from_opt"]], "from_opts() (onmt.utils.loss.losscompute class method)": [[14, "onmt.utils.loss.LossCompute.from_opts"]], "ignore_prompt() (onmt.utils.loss.losscompute method)": [[14, "onmt.utils.loss.LossCompute.ignore_prompt"]], "learning_rate() (onmt.utils.optimizer method)": [[14, "onmt.utils.Optimizer.learning_rate"]], "load_safe_state_dict() (onmt.models.basemodel method)": [[14, "onmt.models.BaseModel.load_safe_state_dict"]], "load_state_dict() (onmt.models.basemodel method)": [[14, "onmt.models.BaseModel.load_state_dict"]], "log_tensorboard() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.log_tensorboard"]], "output() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.output"]], "ppl() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.ppl"]], "step() (onmt.utils.adafactor method)": [[14, "onmt.utils.AdaFactor.step"]], "step() (onmt.utils.fusedadam method)": [[14, "onmt.utils.FusedAdam.step"]], "step() (onmt.utils.optimizer method)": [[14, "onmt.utils.Optimizer.step"]], "train() (onmt.trainer.trainer method)": [[14, "onmt.trainer.Trainer.train"]], "training_step (onmt.utils.optimizer property)": [[14, "onmt.utils.Optimizer.training_step"]], "update() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.update"]], "validate() (onmt.trainer.trainer method)": [[14, "onmt.trainer.Trainer.validate"]], "xent() (onmt.utils.statistics method)": [[14, "onmt.utils.Statistics.xent"]], "zero_grad() (onmt.utils.optimizer method)": [[14, "onmt.utils.Optimizer.zero_grad"]], "dynamicdatasetiter (class in onmt.inputters)": [[15, "onmt.inputters.DynamicDatasetIter"]], "mixingstrategy (class in onmt.inputters)": [[15, "onmt.inputters.MixingStrategy"]], "parallelcorpus (class in onmt.inputters)": [[15, "onmt.inputters.ParallelCorpus"]], "parallelcorpusiterator (class in onmt.inputters)": [[15, "onmt.inputters.ParallelCorpusIterator"]], "sequentialmixer (class in onmt.inputters)": [[15, "onmt.inputters.SequentialMixer"]], "weightedmixer (class in onmt.inputters)": [[15, "onmt.inputters.WeightedMixer"]], "batch_iter() (onmt.inputters.dynamicdatasetiter method)": [[15, "onmt.inputters.DynamicDatasetIter.batch_iter"]], "from_opt() (onmt.inputters.dynamicdatasetiter class method)": [[15, "onmt.inputters.DynamicDatasetIter.from_opt"]], 
"load() (onmt.inputters.parallelcorpus method)": [[15, "onmt.inputters.ParallelCorpus.load"]], "averageattention (class in onmt.modules)": [[16, "onmt.modules.AverageAttention"]], "cnndecoder (class in onmt.decoders)": [[16, "onmt.decoders.CNNDecoder"]], "cnnencoder (class in onmt.encoders)": [[16, "onmt.encoders.CNNEncoder"]], "convmultistepattention (class in onmt.modules)": [[16, "onmt.modules.ConvMultiStepAttention"]], "copygenerator (class in onmt.modules)": [[16, "onmt.modules.CopyGenerator"]], "decoderbase (class in onmt.decoders)": [[16, "onmt.decoders.DecoderBase"]], "embeddings (class in onmt.modules)": [[16, "onmt.modules.Embeddings"]], "encoderbase (class in onmt.encoders)": [[16, "onmt.encoders.EncoderBase"]], "ggnnencoder (class in onmt.encoders)": [[16, "onmt.encoders.GGNNEncoder"]], "globalattention (class in onmt.modules)": [[16, "onmt.modules.GlobalAttention"]], "inputfeedrnndecoder (class in onmt.decoders)": [[16, "onmt.decoders.InputFeedRNNDecoder"]], "matrixtree (class in onmt.modules.structured_attention)": [[16, "onmt.modules.structured_attention.MatrixTree"]], "meanencoder (class in onmt.encoders)": [[16, "onmt.encoders.MeanEncoder"]], "multiheadedattention (class in onmt.modules)": [[16, "onmt.modules.MultiHeadedAttention"]], "positionalencoding (class in onmt.modules)": [[16, "onmt.modules.PositionalEncoding"]], "positionwisefeedforward (class in onmt.modules.position_ffn)": [[16, "onmt.modules.position_ffn.PositionwiseFeedForward"]], "rnndecoderbase (class in onmt.decoders.decoder)": [[16, "onmt.decoders.decoder.RNNDecoderBase"]], "rnnencoder (class in onmt.encoders)": [[16, "onmt.encoders.RNNEncoder"]], "stdrnndecoder (class in onmt.decoders)": [[16, "onmt.decoders.StdRNNDecoder"]], "transformerdecoder (class in onmt.decoders)": [[16, "onmt.decoders.TransformerDecoder"]], "transformerencoder (class in onmt.encoders)": [[16, "onmt.encoders.TransformerEncoder"]], "apply_mask() (onmt.modules.convmultistepattention method)": [[16, "onmt.modules.ConvMultiStepAttention.apply_mask"]], "emb_luts (onmt.modules.embeddings property)": [[16, "onmt.modules.Embeddings.emb_luts"]], "forward() (onmt.decoders.cnndecoder method)": [[16, "onmt.decoders.CNNDecoder.forward"]], "forward() (onmt.decoders.transformerdecoder method)": [[16, "onmt.decoders.TransformerDecoder.forward"]], "forward() (onmt.decoders.decoder.rnndecoderbase method)": [[16, "onmt.decoders.decoder.RNNDecoderBase.forward"]], "forward() (onmt.encoders.cnnencoder method)": [[16, "onmt.encoders.CNNEncoder.forward"]], "forward() (onmt.encoders.encoderbase method)": [[16, "onmt.encoders.EncoderBase.forward"]], "forward() (onmt.encoders.ggnnencoder method)": [[16, "onmt.encoders.GGNNEncoder.forward"]], "forward() (onmt.encoders.meanencoder method)": [[16, "onmt.encoders.MeanEncoder.forward"]], "forward() (onmt.encoders.rnnencoder method)": [[16, "onmt.encoders.RNNEncoder.forward"]], "forward() (onmt.encoders.transformerencoder method)": [[16, "onmt.encoders.TransformerEncoder.forward"]], "forward() (onmt.modules.averageattention method)": [[16, "onmt.modules.AverageAttention.forward"]], "forward() (onmt.modules.convmultistepattention method)": [[16, "onmt.modules.ConvMultiStepAttention.forward"]], "forward() (onmt.modules.copygenerator method)": [[16, "onmt.modules.CopyGenerator.forward"]], "forward() (onmt.modules.embeddings method)": [[16, "onmt.modules.Embeddings.forward"]], "forward() (onmt.modules.globalattention method)": [[16, "onmt.modules.GlobalAttention.forward"]], "forward() 
(onmt.modules.multiheadedattention method)": [[16, "onmt.modules.MultiHeadedAttention.forward"]], "forward() (onmt.modules.positionalencoding method)": [[16, "onmt.modules.PositionalEncoding.forward"]], "forward() (onmt.modules.position_ffn.positionwisefeedforward method)": [[16, "onmt.modules.position_ffn.PositionwiseFeedForward.forward"]], "forward() (onmt.modules.structured_attention.matrixtree method)": [[16, "onmt.modules.structured_attention.MatrixTree.forward"]], "from_opt() (onmt.decoders.cnndecoder class method)": [[16, "onmt.decoders.CNNDecoder.from_opt"]], "from_opt() (onmt.decoders.decoderbase class method)": [[16, "onmt.decoders.DecoderBase.from_opt"]], "from_opt() (onmt.decoders.decoder.rnndecoderbase class method)": [[16, "onmt.decoders.decoder.RNNDecoderBase.from_opt"]], "from_opt() (onmt.encoders.cnnencoder class method)": [[16, "onmt.encoders.CNNEncoder.from_opt"]], "from_opt() (onmt.encoders.ggnnencoder class method)": [[16, "onmt.encoders.GGNNEncoder.from_opt"]], "from_opt() (onmt.encoders.meanencoder class method)": [[16, "onmt.encoders.MeanEncoder.from_opt"]], "from_opt() (onmt.encoders.rnnencoder class method)": [[16, "onmt.encoders.RNNEncoder.from_opt"]], "from_opt() (onmt.encoders.transformerencoder class method)": [[16, "onmt.encoders.TransformerEncoder.from_opt"]], "init_state() (onmt.decoders.cnndecoder method)": [[16, "onmt.decoders.CNNDecoder.init_state"]], "init_state() (onmt.decoders.decoder.rnndecoderbase method)": [[16, "onmt.decoders.decoder.RNNDecoderBase.init_state"]], "load_pretrained_vectors() (onmt.modules.embeddings method)": [[16, "onmt.modules.Embeddings.load_pretrained_vectors"]], "score() (onmt.modules.globalattention method)": [[16, "onmt.modules.GlobalAttention.score"]], "word_lut (onmt.modules.embeddings property)": [[16, "onmt.modules.Embeddings.word_lut"]], "servermodel (class in onmt.translate.translation_server)": [[17, "onmt.translate.translation_server.ServerModel"]], "servermodelerror": [[17, "onmt.translate.translation_server.ServerModelError"]], "timer (class in onmt.translate.translation_server)": [[17, "onmt.translate.translation_server.Timer"]], "translationserver (class in onmt.translate.translation_server)": [[17, "onmt.translate.translation_server.TranslationServer"]], "build_tokenizer() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.build_tokenizer"]], "clone_model() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.clone_model"]], "detokenize() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.detokenize"]], "do_timeout() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.do_timeout"]], "list_models() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.list_models"]], "load_model() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.load_model"]], "maybe_convert_align() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_convert_align"]], "maybe_detokenize() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_detokenize"]], "maybe_detokenize_with_align() (onmt.translate.translation_server.servermodel method)": 
[[17, "onmt.translate.translation_server.ServerModel.maybe_detokenize_with_align"]], "maybe_postprocess() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_postprocess"]], "maybe_preprocess() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_preprocess"]], "maybe_tokenize() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_tokenize"]], "maybe_transform_feats() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.maybe_transform_feats"]], "parse_opt() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.parse_opt"]], "postprocess() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.postprocess"]], "preload_model() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.preload_model"]], "preprocess() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.preprocess"]], "rebuild_seg_packages() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.rebuild_seg_packages"]], "run() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.run"]], "start() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.start"]], "to_gpu() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.to_gpu"]], "tokenize() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.tokenize"]], "tokenizer_marker() (onmt.translate.translation_server.servermodel method)": [[17, "onmt.translate.translation_server.ServerModel.tokenizer_marker"]], "unload_model() (onmt.translate.translation_server.translationserver method)": [[17, "onmt.translate.translation_server.TranslationServer.unload_model"]], "beamsearch (class in onmt.translate)": [[18, "onmt.translate.BeamSearch"]], "decodestrategy (class in onmt.translate)": [[18, "onmt.translate.DecodeStrategy"]], "gnmtglobalscorer (class in onmt.translate)": [[18, "onmt.translate.GNMTGlobalScorer"]], "greedysearch (class in onmt.translate)": [[18, "onmt.translate.GreedySearch"]], "penaltybuilder (class in onmt.translate.penalties)": [[18, "onmt.translate.penalties.PenaltyBuilder"]], "translation (class in onmt.translate)": [[18, "onmt.translate.Translation"]], "translationbuilder (class in onmt.translate)": [[18, "onmt.translate.TranslationBuilder"]], "translator (class in onmt.translate)": [[18, "onmt.translate.Translator"]], "advance() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.advance"]], "advance() (onmt.translate.greedysearch method)": [[18, "onmt.translate.GreedySearch.advance"]], "block_ngram_repeats() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.block_ngram_repeats"]], "coverage_none() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.coverage_none"]], "coverage_summary() (onmt.translate.penalties.penaltybuilder method)": [[18, 
"onmt.translate.penalties.PenaltyBuilder.coverage_summary"]], "coverage_wu() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.coverage_wu"]], "initialize() (onmt.translate.beamsearch method)": [[18, "onmt.translate.BeamSearch.initialize"]], "initialize() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.initialize"]], "initialize() (onmt.translate.greedysearch method)": [[18, "onmt.translate.GreedySearch.initialize"]], "length_average() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.length_average"]], "length_none() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.length_none"]], "length_wu() (onmt.translate.penalties.penaltybuilder method)": [[18, "onmt.translate.penalties.PenaltyBuilder.length_wu"]], "log() (onmt.translate.translation method)": [[18, "onmt.translate.Translation.log"]], "maybe_update_forbidden_tokens() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.maybe_update_forbidden_tokens"]], "maybe_update_target_prefix() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.maybe_update_target_prefix"]], "sample_with_temperature() (in module onmt.translate.greedy_search)": [[18, "onmt.translate.greedy_search.sample_with_temperature"]], "target_prefixing() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.target_prefixing"]], "translate_batch() (onmt.translate.translator method)": [[18, "onmt.translate.Translator.translate_batch"]], "update_finished() (onmt.translate.decodestrategy method)": [[18, "onmt.translate.DecodeStrategy.update_finished"]], "update_finished() (onmt.translate.greedysearch method)": [[18, "onmt.translate.GreedySearch.update_finished"]]}}) \ No newline at end of file