
[dev-infctx][batch 4] Torch.compile + deepspeed_3 support + typo/notebook/readme changes #45

Open · wants to merge 63 commits into base: dev-infctx

Commits (63)
6b168c6
Added support for lr_final
PicoCreator Jun 25, 2023
32ae35d
Merge branch 'dev-infctx-lr-final-v2' into dev-infctx-lr-final
PicoCreator Jun 29, 2023
84e159e
Merge pull request #7 from PicoCreator/dev-infctx-lr-final
PicoCreator Jun 29, 2023
30f091a
Merge pull request #8 from PicoCreator/dev-infctx-lr-final-v2
PicoCreator Jun 29, 2023
e2cd3ac
Introduction of bptt_learning
PicoCreator Jul 4, 2023
bdf4eae
Revert "Introduction of bptt_learning"
PicoCreator Jul 4, 2023
66b8703
bptt_training param support
PicoCreator Jul 4, 2023
43821f6
Added example bptt_learning config
PicoCreator Jul 4, 2023
086dca4
Clarified bptt_learning_range for multi-gpu setup
PicoCreator Jul 4, 2023
7de40cb
reorder for readability
PicoCreator Jul 4, 2023
de2b570
Fixing links
PicoCreator Jul 4, 2023
6f30583
Merge branch 'dev-infctx' into dev-infctx-bptt-trainer
Blealtan Jul 5, 2023
f9830df
Update model.py
PicoCreator Jul 5, 2023
fa885f0
multi-gpu support done by syncing up the exact same set of manual_bac…
PicoCreator Jul 5, 2023
6365ac6
Fixed the import to `import gc, math`
PicoCreator Jul 5, 2023
4502923
Added fabric sync skip, if optimal conditions are met
PicoCreator Jul 5, 2023
d9945f0
Fixed error messages for bptt_learning_range > 1, which can hang with…
PicoCreator Jul 5, 2023
a879e9c
tweaks to reduce the number of backward pass by 1
PicoCreator Jul 5, 2023
c5de65a
Fixed loss calculation for multiple segment + multi gpu
PicoCreator Jul 6, 2023
221cc86
better wandb logging across multiple GPUs
PicoCreator Jul 6, 2023
58d03fe
Merge pull request #11 from PicoCreator/dev-infctx
PicoCreator Jul 7, 2023
3122f0a
disabled JIT
PicoCreator Jul 7, 2023
00b8a1b
Setting up baseline notebook (reorganizing all notebooks)
PicoCreator Jul 7, 2023
69be751
dryrun config tweak
PicoCreator Jul 7, 2023
31caa05
Added aggressive cuda cache clear option
PicoCreator Jul 7, 2023
ee31578
rename to `substep_cuda_cache_clear`
PicoCreator Jul 7, 2023
6886abc
tweak baseline setup
PicoCreator Jul 7, 2023
c1cd8d3
tweak baseline title
PicoCreator Jul 7, 2023
ad93467
substep_logging mode
PicoCreator Jul 7, 2023
856ec70
Adding torch compile / JIT flags
PicoCreator Jul 7, 2023
5a5bc5a
updated baseline pytorch
PicoCreator Jul 7, 2023
e350c60
WIP torch compile notebooks
PicoCreator Jul 7, 2023
87755ce
WIP notebook organizing
PicoCreator Jul 8, 2023
31a58bb
WIP torch compile perf notebook
PicoCreator Jul 8, 2023
b86268c
torch compile perf logs
PicoCreator Jul 8, 2023
d5f8c89
bptt validation
PicoCreator Jul 8, 2023
7231a07
WIP torch compile optimization
PicoCreator Jul 8, 2023
add115a
setup update
PicoCreator Jul 8, 2023
00b54f5
bptt_validation notebook (cleanup of various config files)
PicoCreator Jul 8, 2023
0e251ae
dropped infctx-validation notebook (in favour of bptt), added gitigno…
PicoCreator Jul 8, 2023
6365106
matmul-precision notebook cleanup
PicoCreator Jul 8, 2023
358e102
torch compile benchmark v1
PicoCreator Jul 8, 2023
6cba59f
better perf, without eager torch.compile
PicoCreator Jul 8, 2023
a68bae6
perf log
PicoCreator Jul 8, 2023
3e83a18
more experiments in torch.compile settings
PicoCreator Jul 8, 2023
0529e3c
optimized torch compile max
PicoCreator Jul 8, 2023
c839b23
fix a dataset column typo
PicoCreator Jul 8, 2023
229ba12
Fixing the separator typo
PicoCreator Jul 8, 2023
68538dd
fix config typo
PicoCreator Jul 8, 2023
e3fc407
typo fix
PicoCreator Jul 8, 2023
154f949
optimizing the TCompileMax options
PicoCreator Jul 8, 2023
5ca4ff7
more TCompileMax tuning
PicoCreator Jul 8, 2023
6f96df5
Finalizing the torch compile tune
PicoCreator Jul 9, 2023
f597ed1
dropped matmul precision file
PicoCreator Jul 9, 2023
b672e28
WIP setup h100
PicoCreator Jul 9, 2023
e3de0df
deepspeed 2 & 3 validation runs
PicoCreator Jul 9, 2023
9e0d7b2
updating deepspeed 2 & 3 benchmark
PicoCreator Jul 10, 2023
4476e3b
tweaking notebook details
PicoCreator Jul 10, 2023
f91baae
tweak
PicoCreator Jul 10, 2023
1894e7f
Added link to HF explanation of deepspeed
PicoCreator Jul 10, 2023
e6d203b
optimizing main forward pass BlockStateList
PicoCreator Jul 10, 2023
456cf0a
Updating deepspeed 2 & 3 perf table
PicoCreator Jul 11, 2023
b71b10d
Merge pull request #13 from PicoCreator/dev-infctx-torch-compile
PicoCreator Jul 11, 2023
4 changes: 4 additions & 0 deletions .gitignore
@@ -153,6 +153,10 @@ datapath/
checkpoint/
node_modules/

# We do capture the notebook generated .log files
# as they are meant to be read as reference
!notebook/**/*.log

# Ignore generated lightning logs and config files
*/lightning_logs/
*/config.yaml
23 changes: 18 additions & 5 deletions README.md
@@ -31,20 +31,33 @@ The following features are not yet supported (that may exist in [blinks original
## Environment setup

The following venv setup uses conda; modify it for your use case as needed
```
```bash
# ninja-build is required for the new trainer
sudo apt-get install ninja-build

# Virtual env, with python 3.11
# Update conda & its package listings
conda update conda

# Virtual env, with python 3.10
# python 3.11 has issues with torch.compile / h100s
# and if you want to use 3.11, you will need to do a nightly build install
conda create -n rwkv-infctx python=3.11 pip
conda activate rwkv-infctx

# Install pytorch
conda install -y pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvidia
# Install pytorch (>=2.0.1)
conda install -y pytorch==2.0.1 torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia

# Currently for torch.compile + 3.11 to work, on some platforms, you will need the nightly build
# if so you may need to try the following instead
# ---
# conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch-nightly -c nvidia

# Verify your pytorch version
python -c "import torch; print(torch.__version__)"

# We use python -m pip, instead of pip directly, as it resolves issues with venv not loading the right pip
python -m pip install datasets transformers
python -m pip install lightning==2.0.2 deepspeed==0.9.3
python -m pip install lightning==2.0.4 deepspeed==0.9.5
python -m pip install ninja numexpr jsonargparse 'jsonargparse[signatures]'
python -m pip install lm-dataformat ftfy sentencepiece tokenizers wandb
```
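If torch.compile support on your platform is in doubt (as noted above for python 3.11 / h100s), a quick sanity check like the following can confirm it works. This is only an illustrative sketch, not part of the repo's setup instructions:

```python
# Minimal torch.compile sanity check (illustrative sketch, not from the repo).
import torch

def f(x):
    return torch.sin(x) + torch.cos(x)

compiled_f = torch.compile(f)  # requires pytorch >= 2.0
x = torch.randn(8)
# The compiled function should match the eager result on its first call.
print(torch.allclose(f(x), compiled_f(x), atol=1e-5))
```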
72 changes: 56 additions & 16 deletions RWKV-v4neo/config-example.yaml
@@ -2,8 +2,9 @@
seed_everything: true
trainer:
# Configure the number of GPUs available on your machine
# auto means it will automatically detect and use all GPUs
accelerator: gpu
devices: 1
devices: auto
num_nodes: 1

#
@@ -24,8 +25,7 @@ trainer:
# For more details see:
# https://lightning.ai/docs/pytorch/stable/advanced/model_parallel.html#deepspeed-zero-stage-2
#
#!FIXME: currently only deepspeed_stage_1 is supported, due to that deepspeed cannot handle repeated backward hook.
strategy: deepspeed_stage_1
strategy: deepspeed_stage_2_offload

# Floating point precision for the model, because RWKV is built FOR bf16
# you should pretty much never change this setting
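For reference, a minimal sketch of what these strategy and precision keys correspond to when constructing a Lightning 2.x Trainer directly; the argument values below simply mirror this example config and are not code from the PR:

```python
# Hedged sketch: the YAML strategy/precision keys map onto Lightning Trainer arguments.
from lightning.pytorch import Trainer

trainer = Trainer(
    accelerator="gpu",
    devices="auto",
    strategy="deepspeed_stage_2_offload",  # requires deepspeed to be installed
    precision="bf16-mixed",                # RWKV is built for bf16
)
```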
@@ -128,13 +128,17 @@ trainer:
# Number of datasamples to train for each step, a data sample is considered
# a "substep" in wandb logs, and a "step" is tracked as "trainer/global_step"
#
# This decides the number of datasample, to learn together from, before backproping
# any weight changes at the end of the batch.
# This decides the number of data samples * the number of GPU devices to learn together from,
# before backpropagating any weight changes at the end of the batch.
#
# Recommended to be a big enough number (like 128/256) where it prevents the training
# loss from flucuating in the process. But not too big of a number where the increased
# `1 trainer/global_step = accumulate_grad_batches * number of GPU devices * number of nodes`
#
# Recommended to be a large enough number (like 128/256) for finetuning, where it prevents the
# training loss from fluctuating in the process. But not so large that the increased
# GPU vRAM usage will cause the training to crash.
#
# For foundation model training, a low accumulate_grad_batches like 8/12/16 is recommended.
#
# You are also recommended to configure this to a large enough number to fully utilize
# your GPU processing time %, and avoid idle time for the GPU between batches
accumulate_grad_batches: 256
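As a worked example of the global_step formula above (the GPU and node counts here are assumed for illustration, not taken from this config):

```python
# 1 trainer/global_step = accumulate_grad_batches * number of GPU devices * number of nodes
accumulate_grad_batches = 256
gpu_devices = 4   # assumed value for illustration
num_nodes = 1     # assumed value for illustration

data_samples_per_global_step = accumulate_grad_batches * gpu_devices * num_nodes
print(data_samples_per_global_step)  # 1024 data samples learned from per global step
```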
@@ -191,14 +195,6 @@ model:
# without eating up too much vram by keeping the training context length
# to a reasonable number suitable to the current GPU setup
ctx_len: 2048
# Data samples would be cut down to the respective max ctx_len_cutoffs
# values if its larger then ctx_len. If the data sample is larger then
# the largest len_cutoff, the remaining data will be discarded
ctx_len_cutoffs: [8192, 16384, 32768, 65536]
# Experimental settings, number of tokens to skip in the data sample
# prefix, for the respective cutoff length. Used to speed up the process
ctx_len_warmup_steps: [0, 0, 0, 0]

# Learning rate of the training process
lr_init: 1.0e-04

@@ -209,6 +205,50 @@ model:
adam_eps: 1.0e-08
weight_decay: 0.01

# Back Propagation Through Time, used to work around training of large context lengths
# beyond what can be supported by the current GPU vram architecture
#
# This is not 1:1 equivalent to the same training process done with full vram,
# as the training process is split into multiple segments, part by part,
# with limited learning carried over from each segment.
bptt_learning: true

# Segmented range to perform backprop learning on
# 1 means to apply only to the last segment
# -1 means to apply to all segments
#
# For multi-gpu training, this must be set to 1 due to a known issue,
# otherwise an exception will be thrown
bptt_learning_range: -1

# Limits the bptt learning only to the "current" chunk
# being learned within the learning range. While this reduces the effectiveness
# of bptt, it also further reduces vram requirements.
#
# This is also known as tbptt (Truncated Back Propagation through time)
bptt_truncated_learning: false
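To make the segmented / truncated backprop idea above concrete, here is a minimal, self-contained sketch of truncated BPTT on a toy RNN. It only illustrates the general technique; it is not the trainer's actual implementation (which operates on RWKV block states), and the names and sizes are assumptions:

```python
# Toy truncated-BPTT loop: learn from one long sequence in fixed-size segments,
# detaching the recurrent state between segments to cap vram usage.
import torch
import torch.nn as nn

rnn = nn.RNN(input_size=8, hidden_size=16, batch_first=True)
head = nn.Linear(16, 8)
opt = torch.optim.Adam(list(rnn.parameters()) + list(head.parameters()), lr=1e-3)

x = torch.randn(1, 64, 8)       # one long data sample
target = torch.randn(1, 64, 8)
segment_len = 16                # analogous to training the context in chunks

hidden = None
opt.zero_grad()
for start in range(0, x.size(1), segment_len):
    seg = x[:, start:start + segment_len]
    out, hidden = rnn(seg, hidden)
    loss = nn.functional.mse_loss(head(out), target[:, start:start + segment_len])
    loss.backward()             # backprop within this segment only
    hidden = hidden.detach()    # truncate the graph so memory does not grow with sequence length
opt.step()                      # single weight update for the whole sample
```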

# Aggressively clear the cuda cache between each data sample.
# This causes a performance penalty, but reduces the vram pressure
#
# This is useful for mitigating the following memory pressure warning
# `1 pytorch allocator cache flushes since last step. this happens when there is high memory pressure and is detrimental to performance...`
substep_cuda_cache_clear: false
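As a rough illustration of what such an aggressive clear amounts to (assuming it boils down to an empty_cache call between data samples, which is an assumption rather than a statement about the trainer's internals):

```python
# Hedged sketch: free cached allocator blocks between data samples,
# trading throughput for lower vram pressure.
import torch

if torch.cuda.is_available():
    torch.cuda.empty_cache()
```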

# Experimental cutoff settings
# ---
# Data samples would be cut down to the respective max ctx_len_cutoffs
# values if it is larger than ctx_len. If the data sample is larger than
# the largest len_cutoff, the remaining data will be discarded
#
# Leave it as a blank array to disable the feature
ctx_len_cutoffs: []
# Experimental settings, number of tokens to skip in the data sample
# prefix, for the respective cutoff length. Used to speed up the process
#
# Leave it as a blank array to disable the feature
ctx_len_warmup_steps: []

# torch.set_float32_matmul_precision, used to optimize operations with tensor cores
# this should be set to null for non cuda core GPUs
torch_set_float32_matmul_precision: 'high'
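This key mirrors PyTorch's own setting; a minimal sketch of the underlying call:

```python
# Equivalent PyTorch call for the config key above.
import torch

torch.set_float32_matmul_precision('high')  # allows TF32-style fast matmuls on tensor-core GPUs
```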
@@ -296,7 +336,7 @@ data:
# multi_column_keys: ['instruction', 'input', 'output']
# multi_column_prefix: ['Instruction:\n', 'Input:\n', 'Output:\n']
# multi_column_masking: [false, true, false]
# multi_column_seperator: '\n\n'
# multi_column_separator: '\n\n'
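For a sense of how these multi-column settings combine, here is a plain-text sketch of the assembly order; the real implementation in RWKV-v4neo/src/data.py works on tokenized encodings and attention masks rather than strings, and the record below is invented for illustration:

```python
# Rough illustration of multi_column_keys / multi_column_prefix / multi_column_separator.
record = {"instruction": "Summarize the text.", "input": "RWKV is an RNN ...", "output": "An RNN-based LM."}
keys = ["instruction", "input", "output"]
prefixes = ["Instruction:\n", "Input:\n", "Output:\n"]
separator = "\n\n"

parts = []
for key, prefix in zip(keys, prefixes):
    value = record.get(key)
    if value:                      # columns that are missing or empty are skipped
        parts.append(prefix + value)
print(separator.join(parts))
```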

# If processing prompt/completion jsonl pairs, the prompt is masked by default
# use this flag to disable this default behaviour
16 changes: 8 additions & 8 deletions RWKV-v4neo/src/data.py
@@ -59,7 +59,7 @@ def prepare_data_static(**kargs):
# Tokenized encodings for multi column keys
multi_column_enabled = len(multi_column_keys) > 0
multi_column_prefix_encodings = []
multi_column_seperator_encodings = None
multi_column_separator_encodings = None

# Process the multi column settings
if multi_column_enabled:
@@ -69,9 +69,9 @@
# Tokenize the multi column strings
for i in range(len(multi_column_keys)):
multi_column_prefix_encodings.append(tokenizer(multi_column_prefix[i]))
# Tokenize the multi column seperator
# Tokenize the multi column separator
if multi_column_separator is not None and len(multi_column_separator) > 0:
multi_column_seperator_encodings = tokenizer(multi_column_separator)
multi_column_separator_encodings = tokenizer(multi_column_separator)

# Maps the dataset record to the tokenized result
# handles a wide variety of format according to the data configuration
@@ -112,11 +112,11 @@ def map_tokenizer(x):
for i in range(len(multi_column_keys)):
# And process the column if it has data
if multi_column_keys[i] in x and x[multi_column_keys[i]] is not None and len(x[multi_column_keys[i]]) > 0:
# Add the seperator if this is not the first item
if not is_first_item and multi_column_seperator_encodings is not None:
input_ids += multi_column_seperator_encodings['input_ids']
token_type_ids += multi_column_seperator_encodings['token_type_ids']
attention_mask += multi_column_seperator_encodings['attention_mask']
# Add the separator if this is not the first item
if not is_first_item and multi_column_separator_encodings is not None:
input_ids += multi_column_separator_encodings['input_ids']
token_type_ids += multi_column_separator_encodings['token_type_ids']
attention_mask += multi_column_separator_encodings['attention_mask']

# Add the prefix
input_ids += multi_column_prefix_encodings[i]['input_ids']