Review default values (#1124)
# Description

Review our config's default values and update some to more reasonable values (or to values equivalent to HF's defaults where we don't have a strong opinion).

## Related issues

Fixes OPE-831

## Before submitting

- [ ] This PR only changes documentation. (You can ignore the following checks in that case)
- [x] Did you read the [contributor guideline](https://github.com/oumi-ai/oumi/blob/main/CONTRIBUTING.md) Pull Request guidelines?
- [x] Did you link the issue(s) related to this PR in the section above?
- [x] Did you add / update tests where needed?
wizeng23 authored Jan 17, 2025
1 parent bc1ec99 commit 0016ff8
Showing 5 changed files with 48 additions and 51 deletions.
docs/user_guides/train/configuration.md (50 changes: 25 additions & 25 deletions)
@@ -165,10 +165,10 @@ training:
adam_beta1: 0.9 # Adam beta1 parameter
adam_beta2: 0.999 # Adam beta2 parameter
adam_epsilon: 1e-8 # Adam epsilon parameter
- sgd_momentum: 0.9 # SGD momentum (if using SGD)
+ sgd_momentum: 0.0 # SGD momentum (if using SGD)
# Learning rate schedule
- lr_scheduler_type: "cosine" # LR scheduler type
+ lr_scheduler_type: "linear" # LR scheduler type
warmup_ratio: null # Warmup ratio of total steps
warmup_steps: null # Number of warmup steps
@@ -178,15 +178,15 @@ training:
enable_gradient_checkpointing: false # Trade compute for memory
# Checkpointing
- save_steps: 100 # Save every N steps
+ save_steps: 500 # Save every N steps
save_epoch: false # Save at end of each epoch
save_final_model: true # Save model at end of training
resume_from_checkpoint: null # Path to resume from
try_resume_from_last_checkpoint: false # Try auto-resume from last checkpoint
# Evaluation
eval_strategy: "steps" # When to evaluate ("no", "steps", "epoch")
- eval_steps: 50 # Evaluate every N steps
+ eval_steps: 500 # Evaluate every N steps
metrics_function: null # Name of metrics function to use
# Logging
@@ -221,21 +221,21 @@ Configure parameter-efficient fine-tuning using the {py:obj}`~oumi.core.configs.
```yaml
peft:
# LoRA settings
- lora_r: 16 # Rank of update matrices
- lora_alpha: 16 # Scaling factor
- lora_dropout: 0.05 # Dropout probability
- lora_target_modules: null # Modules to apply LoRA to
- lora_modules_to_save: null # Modules to unfreeze and train
- lora_bias: "none" # Bias training type
- lora_task_type: "CAUSAL_LM" # Task type for adaptation
+ lora_r: 8 # Rank of update matrices
+ lora_alpha: 8 # Scaling factor
+ lora_dropout: 0.0 # Dropout probability
+ lora_target_modules: null # Modules to apply LoRA to
+ lora_modules_to_save: null # Modules to unfreeze and train
+ lora_bias: "none" # Bias training type
+ lora_task_type: "CAUSAL_LM" # Task type for adaptation
# Q-LoRA settings
- q_lora: false # Enable quantization
- q_lora_bits: 4 # Quantization bits
+ q_lora: false # Enable quantization
+ q_lora_bits: 4 # Quantization bits
bnb_4bit_quant_type: "fp4" # 4-bit quantization type
use_bnb_nested_quant: false # Use nested quantization
bnb_4bit_quant_storage: "uint8" # Storage type for params
- bnb_4bit_compute_dtype: "float16" # Compute type for params
+ bnb_4bit_compute_dtype: "float32" # Compute type for params
```

### FSDP Configuration
@@ -244,22 +244,22 @@ Configure fully sharded data parallel training using the {py:obj}`~oumi.core.con

```yaml
fsdp:
- enable_fsdp: false # Enable FSDP training
- sharding_strategy: "FULL_SHARD" # How to shard model
- cpu_offload: false # Offload to CPU
- mixed_precision: null # Mixed precision type
- backward_prefetch: "BACKWARD_PRE" # When to prefetch params
- forward_prefetch: false # Prefetch forward results
- use_orig_params: null # Use original module params
+ enable_fsdp: false # Enable FSDP training
+ sharding_strategy: "FULL_SHARD" # How to shard model
+ cpu_offload: false # Offload to CPU
+ mixed_precision: null # Mixed precision type
+ backward_prefetch: "BACKWARD_PRE" # When to prefetch params
+ forward_prefetch: false # Prefetch forward results
+ use_orig_params: null # Use original module params
state_dict_type: "FULL_STATE_DICT" # Checkpoint format
# Auto wrapping settings
- auto_wrap_policy: "SIZE_BASED_WRAP" # How to wrap layers
- min_num_params: 100000 # Min params for wrapping
- transformer_layer_cls: null # Transformer layer class
+ auto_wrap_policy: "NO_WRAP" # How to wrap layers
+ min_num_params: 100000 # Min params for wrapping
+ transformer_layer_cls: null # Transformer layer class
# Other settings
- sync_module_states: true # Sync states across processes
+ sync_module_states: true # Sync states across processes
```

Notes on FSDP sharding strategies:
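Taken together, the configuration.md changes above relax several training defaults (linear scheduler, less frequent checkpointing and evaluation, no SGD momentum). For configs that relied on the previous implicit values, a minimal sketch of pinning the old behavior explicitly; it assumes the `training:` schema shown above and uses only keys that appear in this diff:

```yaml
training:
  sgd_momentum: 0.9            # new default: 0.0
  lr_scheduler_type: "cosine"  # new default: "linear"
  save_steps: 100              # new default: 500
  eval_steps: 50               # new default: 500
```

Anything left unset now picks up the new defaults which, per the PR description, follow HF's defaults where there is no strong opinion.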
src/oumi/core/configs/params/fsdp_params.py (2 changes: 1 addition & 1 deletion)
@@ -203,7 +203,7 @@ class FSDPParams(BaseParams):
state_dict_type: StateDictType = StateDictType.FULL_STATE_DICT
"""Specifies the type of state dict to use for checkpointing."""

- auto_wrap_policy: AutoWrapPolicy = AutoWrapPolicy.SIZE_BASED_WRAP
+ auto_wrap_policy: AutoWrapPolicy = AutoWrapPolicy.NO_WRAP
"""Policy for automatically wrapping layers in FSDP."""

min_num_params: int = 100_000
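With `FSDPParams.auto_wrap_policy` now defaulting to `NO_WRAP`, size-based wrapping becomes opt-in. A short sketch of restoring the previous behavior, assuming the `fsdp:` keys documented in configuration.md above:

```yaml
fsdp:
  enable_fsdp: true
  auto_wrap_policy: "SIZE_BASED_WRAP"  # default is now "NO_WRAP"
  min_num_params: 100000               # minimum parameter count for size-based wrapping
```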
src/oumi/core/configs/params/peft_params.py (8 changes: 4 additions & 4 deletions)
@@ -83,7 +83,7 @@ def get_literal_value(
class PeftParams(BaseParams):
# Lora Params
lora_r: int = field(
- default=16,
+ default=8,
metadata={"help": "LoRA R value."},
)
"""The rank of the update matrices in LoRA.
@@ -93,7 +93,7 @@ class PeftParams(BaseParams):
"""

lora_alpha: int = field(
- default=16,
+ default=8,
metadata={"help": "LoRA alpha."},
)
"""The scaling factor for the LoRA update.
@@ -102,7 +102,7 @@ class PeftParams(BaseParams):
"""

lora_dropout: float = field(
- default=0.05,
+ default=0.0,
metadata={"help": "LoRA dropout."},
)
"""The dropout probability applied to LoRA layers.
@@ -235,7 +235,7 @@ class PeftParams(BaseParams):
"""

bnb_4bit_compute_dtype: str = field(
- default="float16",
+ default="float32",
metadata={"help": "The compute type of the quantized parameters."},
)
"""Compute type of the quantized parameters.
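The new PEFT defaults (rank 8, alpha 8, no dropout, float32 compute) appear to follow the upstream HF/PEFT defaults, consistent with the PR description. Users tuned against the previous values can pin them explicitly; a hypothetical override using the `peft:` keys documented in configuration.md above:

```yaml
peft:
  lora_r: 16                         # new default: 8
  lora_alpha: 16                     # new default: 8
  lora_dropout: 0.05                 # new default: 0.0
  bnb_4bit_compute_dtype: "float16"  # new default: "float32"
```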
src/oumi/core/configs/params/training_params.py (31 changes: 14 additions & 17 deletions)
@@ -202,9 +202,11 @@ class TrainingParams(BaseParams):
each complete pass through the training data. This can be useful for
tracking model progress over time and for resuming training from a
specific epoch if needed.
+ If both `save_steps` and `save_epoch` are set, then `save_steps` takes precedence.
"""

- save_steps: int = 100
+ save_steps: int = 500
"""Save a checkpoint every `save_steps` training steps.
This parameter determines the frequency of saving checkpoints during
@@ -327,7 +329,7 @@ class TrainingParams(BaseParams):
- "epoch": Evaluation is done at the end of each epoch.
"""

- eval_steps: int = 50
+ eval_steps: int = 500
"""Number of update steps between two evaluations if eval_strategy="steps".
Ignored if eval_strategy is not "steps".
@@ -339,7 +341,7 @@ class TrainingParams(BaseParams):
This value can be adjusted by the learning rate scheduler during training.
"""

- lr_scheduler_type: str = "cosine"
+ lr_scheduler_type: str = "linear"
"""The type of learning rate scheduler to use.
Possible values include "linear", "cosine", "cosine_with_restarts",
@@ -358,13 +360,13 @@ class TrainingParams(BaseParams):
"""The ratio of total training steps used for a linear warmup from 0 to the
learning rate.
- Either this or warmup_steps should be set, not both.
+ If set along with `warmup_steps`, this value will be ignored.
"""

warmup_steps: Optional[int] = None
"""The number of steps for the warmup phase of the learning rate scheduler.
- Either this or warmup_ratio should be set, not both.
+ If set, will override the value of `warmup_ratio`.
"""

# ---------------------
@@ -409,11 +411,11 @@ class TrainingParams(BaseParams):
Default is 1e-08.
"""

- sgd_momentum: float = 0.9
+ sgd_momentum: float = 0.0
"""Momentum factor for SGD optimizer.
- Only used when optimizer is set to "sgd".
- Default is 0.9.
+ Only used when optimizer is set to "sgd", and when `trainer_type` is set to OUMI.
+ Default is 0.0.
"""

mixed_precision_dtype: MixedPrecisionDtype = MixedPrecisionDtype.NONE
@@ -571,15 +573,15 @@ def to_hf(self):
f"({self.dataloader_num_workers}). Must be `int`."
)

- dispatch_batches = self.dataloader_main_process_only

if self.trainer_type == TrainerType.TRL_SFT:
config_class = trl.SFTConfig
elif self.trainer_type == TrainerType.TRL_DPO:
config_class = trl.DPOConfig
else:
config_class = transformers.TrainingArguments

+ dispatch_batches = self.dataloader_main_process_only

result = config_class(
gradient_accumulation_steps=self.gradient_accumulation_steps,
log_level=self.dep_log_level,
@@ -635,8 +636,8 @@ def to_hf(self):
# "use_seedable_sampler": True,
# },
seed=self.seed,
- # TODO Re-enable `data_seed`. Should it depend on RANK?
- # data_seed=self.seed,
+ # TODO: OPE-891 - Support setting a data seed.
+ # By default, HF will use the global seed for data loading.
**self.trainer_kwargs,
)
assert isinstance(result, transformers.TrainingArguments)
@@ -676,10 +677,6 @@ def __post_init__(self):
if self.max_grad_norm is not None and self.max_grad_norm < 0:
raise ValueError("max_grad_norm must be >= 0.")

- if self.logging_dir is None and self.output_dir:
-     # Push the logging_dir inside the output_dir.
-     self.logging_dir = str(Path(self.output_dir) / "logs")

@property
def telemetry_dir(self) -> Optional[Path]:
"""Returns the telemetry stats output directory."""
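The updated docstrings make two precedence rules explicit: `save_steps` wins over `save_epoch`, and `warmup_steps` overrides `warmup_ratio`. A sketch of a config that sets both members of each pair, using the `training:` keys documented in configuration.md above, to show what actually takes effect:

```yaml
training:
  save_steps: 500    # takes precedence: checkpoints are written every 500 steps
  save_epoch: true   # superseded because save_steps is also set

  warmup_steps: 100  # overrides warmup_ratio: warmup runs for 100 steps
  warmup_ratio: 0.1  # ignored because warmup_steps is set
```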
tests/unit/core/test_distributed.py (8 changes: 4 additions & 4 deletions)
@@ -345,7 +345,7 @@ def test_get_accelerate_env_vars_default():
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_FORWARD_PREFETCH": "false",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
- "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+ "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
"FSDP_MIN_NUM_PARAMS": "100000",
"FSDP_SYNC_MODULE_STATES": "true",
"FSDP_ACTIVATION_CHECKPOINTING": "false",
@@ -416,7 +416,7 @@ def test_get_accelerate_env_vars_compile_keep_use_orig_params():
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_FORWARD_PREFETCH": "false",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
- "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+ "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
"FSDP_MIN_NUM_PARAMS": "100000",
"FSDP_SYNC_MODULE_STATES": "true",
"FSDP_ACTIVATION_CHECKPOINTING": "false",
@@ -439,7 +439,7 @@ def test_prepare_accelerate_fsdp_run():
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_FORWARD_PREFETCH": "false",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
- "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+ "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
"FSDP_MIN_NUM_PARAMS": "100000",
"FSDP_SYNC_MODULE_STATES": "true",
"FSDP_ACTIVATION_CHECKPOINTING": "true",
@@ -462,7 +462,7 @@ def test_prepare_accelerate_fsdp_run_override():
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_FORWARD_PREFETCH": "false",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
- "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+ "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
"FSDP_MIN_NUM_PARAMS": "100000",
"FSDP_SYNC_MODULE_STATES": "true",
"FSDP_ACTIVATION_CHECKPOINTING": "false",
