diff --git a/docs/user_guides/train/configuration.md b/docs/user_guides/train/configuration.md
index d4e86f14b..c71ccdbd8 100644
--- a/docs/user_guides/train/configuration.md
+++ b/docs/user_guides/train/configuration.md
@@ -165,10 +165,10 @@ training:
   adam_beta1: 0.9                 # Adam beta1 parameter
   adam_beta2: 0.999               # Adam beta2 parameter
   adam_epsilon: 1e-8              # Adam epsilon parameter
-  sgd_momentum: 0.9               # SGD momentum (if using SGD)
+  sgd_momentum: 0.0               # SGD momentum (if using SGD)

   # Learning rate schedule
-  lr_scheduler_type: "cosine"     # LR scheduler type
+  lr_scheduler_type: "linear"     # LR scheduler type
   warmup_ratio: null              # Warmup ratio of total steps
   warmup_steps: null              # Number of warmup steps

@@ -178,7 +178,7 @@ training:
   enable_gradient_checkpointing: false  # Trade compute for memory

   # Checkpointing
-  save_steps: 100                 # Save every N steps
+  save_steps: 500                 # Save every N steps
   save_epoch: false               # Save at end of each epoch
   save_final_model: true          # Save model at end of training
   resume_from_checkpoint: null    # Path to resume from
@@ -186,7 +186,7 @@ training:

   # Evaluation
   eval_strategy: "steps"          # When to evaluate ("no", "steps", "epoch")
-  eval_steps: 50                  # Evaluate every N steps
+  eval_steps: 500                 # Evaluate every N steps
   metrics_function: null          # Name of metrics function to use

   # Logging
@@ -221,21 +221,21 @@ Configure parameter-efficient fine-tuning using the {py:obj}`~oumi.core.configs.
 ```yaml
 peft:
   # LoRA settings
-  lora_r: 16                  # Rank of update matrices
-  lora_alpha: 16              # Scaling factor
-  lora_dropout: 0.05          # Dropout probability
-  lora_target_modules: null   # Modules to apply LoRA to
-  lora_modules_to_save: null  # Modules to unfreeze and train
-  lora_bias: "none"           # Bias training type
-  lora_task_type: "CAUSAL_LM" # Task type for adaptation
+  lora_r: 8                    # Rank of update matrices
+  lora_alpha: 8                # Scaling factor
+  lora_dropout: 0.0            # Dropout probability
+  lora_target_modules: null    # Modules to apply LoRA to
+  lora_modules_to_save: null   # Modules to unfreeze and train
+  lora_bias: "none"            # Bias training type
+  lora_task_type: "CAUSAL_LM"  # Task type for adaptation

   # Q-LoRA settings
-  q_lora: false               # Enable quantization
-  q_lora_bits: 4              # Quantization bits
+  q_lora: false                # Enable quantization
+  q_lora_bits: 4               # Quantization bits
   bnb_4bit_quant_type: "fp4"        # 4-bit quantization type
   use_bnb_nested_quant: false       # Use nested quantization
   bnb_4bit_quant_storage: "uint8"   # Storage type for params
-  bnb_4bit_compute_dtype: "float16" # Compute type for params
+  bnb_4bit_compute_dtype: "float32" # Compute type for params
 ```

 ### FSDP Configuration
@@ -244,22 +244,22 @@ Configure fully sharded data parallel training using the {py:obj}`~oumi.core.con

 ```yaml
 fsdp:
-  enable_fsdp: false                # Enable FSDP training
-  sharding_strategy: "FULL_SHARD"   # How to shard model
-  cpu_offload: false                # Offload to CPU
-  mixed_precision: null             # Mixed precision type
-  backward_prefetch: "BACKWARD_PRE" # When to prefetch params
-  forward_prefetch: false           # Prefetch forward results
-  use_orig_params: null             # Use original module params
+  enable_fsdp: false                 # Enable FSDP training
+  sharding_strategy: "FULL_SHARD"    # How to shard model
+  cpu_offload: false                 # Offload to CPU
+  mixed_precision: null              # Mixed precision type
+  backward_prefetch: "BACKWARD_PRE"  # When to prefetch params
+  forward_prefetch: false            # Prefetch forward results
+  use_orig_params: null              # Use original module params
   state_dict_type: "FULL_STATE_DICT" # Checkpoint format

   # Auto wrapping settings
-  auto_wrap_policy: "SIZE_BASED_WRAP" # How to wrap layers
-  min_num_params: 100000              # Min params for wrapping
-  transformer_layer_cls: null         # Transformer layer class
+  auto_wrap_policy: "NO_WRAP"  # How to wrap layers
+  min_num_params: 100000       # Min params for wrapping
+  transformer_layer_cls: null  # Transformer layer class

   # Other settings
-  sync_module_states: true            # Sync states across processes
+  sync_module_states: true  # Sync states across processes
 ```

 Notes on FSDP sharding strategies:
diff --git a/src/oumi/core/configs/params/fsdp_params.py b/src/oumi/core/configs/params/fsdp_params.py
index 1a5dacc32..cf10a691b 100644
--- a/src/oumi/core/configs/params/fsdp_params.py
+++ b/src/oumi/core/configs/params/fsdp_params.py
@@ -203,7 +203,7 @@ class FSDPParams(BaseParams):
     state_dict_type: StateDictType = StateDictType.FULL_STATE_DICT
     """Specifies the type of state dict to use for checkpointing."""

-    auto_wrap_policy: AutoWrapPolicy = AutoWrapPolicy.SIZE_BASED_WRAP
+    auto_wrap_policy: AutoWrapPolicy = AutoWrapPolicy.NO_WRAP
     """Policy for automatically wrapping layers in FSDP."""

     min_num_params: int = 100_000
diff --git a/src/oumi/core/configs/params/peft_params.py b/src/oumi/core/configs/params/peft_params.py
index 8dae1723c..38f82ee6c 100644
--- a/src/oumi/core/configs/params/peft_params.py
+++ b/src/oumi/core/configs/params/peft_params.py
@@ -83,7 +83,7 @@ def get_literal_value(
 class PeftParams(BaseParams):
     # Lora Params
     lora_r: int = field(
-        default=16,
+        default=8,
         metadata={"help": "LoRA R value."},
     )
     """The rank of the update matrices in LoRA.
@@ -93,7 +93,7 @@ class PeftParams(BaseParams):
     """

     lora_alpha: int = field(
-        default=16,
+        default=8,
         metadata={"help": "LoRA alpha."},
     )
     """The scaling factor for the LoRA update.
@@ -102,7 +102,7 @@ class PeftParams(BaseParams):
     """

     lora_dropout: float = field(
-        default=0.05,
+        default=0.0,
         metadata={"help": "LoRA dropout."},
     )
     """The dropout probability applied to LoRA layers.
@@ -235,7 +235,7 @@ class PeftParams(BaseParams):
     """

     bnb_4bit_compute_dtype: str = field(
-        default="float16",
+        default="float32",
         metadata={"help": "The compute type of the quantized parameters."},
     )
     """Compute type of the quantized parameters.
diff --git a/src/oumi/core/configs/params/training_params.py b/src/oumi/core/configs/params/training_params.py
index 3d52f5bb5..74cd8b003 100644
--- a/src/oumi/core/configs/params/training_params.py
+++ b/src/oumi/core/configs/params/training_params.py
@@ -202,9 +202,11 @@ class TrainingParams(BaseParams):
     each complete pass through the training data. This can be useful for
     tracking model progress over time and for resuming training from a
     specific epoch if needed.
+
+    If both `save_steps` and `save_epoch` are set, then `save_steps` takes precedence.
     """

-    save_steps: int = 100
+    save_steps: int = 500
     """Save a checkpoint every `save_steps` training steps.

     This parameter determines the frequency of saving checkpoints during
@@ -327,7 +329,7 @@ class TrainingParams(BaseParams):
     - "epoch": Evaluation is done at the end of each epoch.
     """

-    eval_steps: int = 50
+    eval_steps: int = 500
     """Number of update steps between two evaluations if eval_strategy="steps".

     Ignored if eval_strategy is not "steps".
@@ -339,7 +341,7 @@ class TrainingParams(BaseParams):
     This value can be adjusted by the learning rate scheduler during training.
     """

-    lr_scheduler_type: str = "cosine"
+    lr_scheduler_type: str = "linear"
     """The type of learning rate scheduler to use.

     Possible values include "linear", "cosine", "cosine_with_restarts",
@@ -358,13 +360,13 @@ class TrainingParams(BaseParams):
     """The ratio of total training steps used for a linear warmup from 0 to the
     learning rate.

-    Either this or warmup_steps should be set, not both.
+    If set along with `warmup_steps`, this value will be ignored.
     """

     warmup_steps: Optional[int] = None
     """The number of steps for the warmup phase of the learning rate scheduler.

-    Either this or warmup_ratio should be set, not both.
+    If set, will override the value of `warmup_ratio`.
     """

     # ---------------------
@@ -409,11 +411,11 @@ class TrainingParams(BaseParams):
     Default is 1e-08.
     """

-    sgd_momentum: float = 0.9
+    sgd_momentum: float = 0.0
     """Momentum factor for SGD optimizer.

-    Only used when optimizer is set to "sgd".
-    Default is 0.9.
+    Only used when optimizer is set to "sgd", and when `trainer_type` is set to OUMI.
+    Default is 0.0.
     """

     mixed_precision_dtype: MixedPrecisionDtype = MixedPrecisionDtype.NONE
@@ -571,15 +573,14 @@ def to_hf(self):
                 f"({self.dataloader_num_workers}). Must be `int`."
             )

+        dispatch_batches = self.dataloader_main_process_only
+
         if self.trainer_type == TrainerType.TRL_SFT:
             config_class = trl.SFTConfig
         elif self.trainer_type == TrainerType.TRL_DPO:
             config_class = trl.DPOConfig
         else:
             config_class = transformers.TrainingArguments
-
-        dispatch_batches = self.dataloader_main_process_only
-
         result = config_class(
             gradient_accumulation_steps=self.gradient_accumulation_steps,
             log_level=self.dep_log_level,
@@ -635,8 +636,8 @@ def to_hf(self):
             #     "use_seedable_sampler": True,
             # },
             seed=self.seed,
-            # TODO Re-enable `data_seed`. Should it depend on RANK?
-            # data_seed=self.seed,
+            # TODO: OPE-891 - Support setting a data seed.
+            # By default, HF will use the global seed for data loading.
             **self.trainer_kwargs,
         )
         assert isinstance(result, transformers.TrainingArguments)
@@ -676,10 +677,6 @@ def __post_init__(self):
         if self.max_grad_norm is not None and self.max_grad_norm < 0:
             raise ValueError("max_grad_norm must be >= 0.")

-        if self.logging_dir is None and self.output_dir:
-            # Push the logging_dir inside the output_dir.
-            self.logging_dir = str(Path(self.output_dir) / "logs")
-
     @property
     def telemetry_dir(self) -> Optional[Path]:
         """Returns the telemetry stats output directory."""
diff --git a/tests/unit/core/test_distributed.py b/tests/unit/core/test_distributed.py
index ecbaff603..39fdf5e77 100644
--- a/tests/unit/core/test_distributed.py
+++ b/tests/unit/core/test_distributed.py
@@ -345,7 +345,7 @@ def test_get_accelerate_env_vars_default():
         "FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
         "FSDP_FORWARD_PREFETCH": "false",
         "FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
-        "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+        "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
         "FSDP_MIN_NUM_PARAMS": "100000",
         "FSDP_SYNC_MODULE_STATES": "true",
         "FSDP_ACTIVATION_CHECKPOINTING": "false",
@@ -416,7 +416,7 @@ def test_get_accelerate_env_vars_compile_keep_use_orig_params():
         "FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
         "FSDP_FORWARD_PREFETCH": "false",
         "FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
-        "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+        "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
         "FSDP_MIN_NUM_PARAMS": "100000",
         "FSDP_SYNC_MODULE_STATES": "true",
         "FSDP_ACTIVATION_CHECKPOINTING": "false",
@@ -439,7 +439,7 @@ def test_prepare_accelerate_fsdp_run():
         "FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
         "FSDP_FORWARD_PREFETCH": "false",
         "FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
-        "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+        "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
         "FSDP_MIN_NUM_PARAMS": "100000",
         "FSDP_SYNC_MODULE_STATES": "true",
         "FSDP_ACTIVATION_CHECKPOINTING": "true",
@@ -462,7 +462,7 @@ def test_prepare_accelerate_fsdp_run_override():
         "FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
         "FSDP_FORWARD_PREFETCH": "false",
         "FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
-        "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+        "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
         "FSDP_MIN_NUM_PARAMS": "100000",
         "FSDP_SYNC_MODULE_STATES": "true",
         "FSDP_ACTIVATION_CHECKPOINTING": "false",