Review default values (#1124)
# Description

Review our config's default values and update some to more reasonable values (or to values equivalent to HF's defaults where we don't have a strong opinion).

## Related issues

Fixes OPE-831

## Before submitting

- [ ] This PR only changes documentation. (You can ignore the following checks in that case)
- [x] Did you read the [contributor guideline](https://github.com/oumi-ai/oumi/blob/main/CONTRIBUTING.md) Pull Request guidelines?
- [x] Did you link the issue(s) related to this PR in the section above?
- [x] Did you add / update tests where needed?
wizeng23 authored Jan 17, 2025
1 parent bc1ec99 commit 0016ff8
Showing 5 changed files with 48 additions and 51 deletions.
docs/user_guides/train/configuration.md (50 changes: 25 additions & 25 deletions)
@@ -165,10 +165,10 @@ training:
adam_beta1: 0.9 # Adam beta1 parameter
adam_beta2: 0.999 # Adam beta2 parameter
adam_epsilon: 1e-8 # Adam epsilon parameter
- sgd_momentum: 0.9 # SGD momentum (if using SGD)
+ sgd_momentum: 0.0 # SGD momentum (if using SGD)
# Learning rate schedule
- lr_scheduler_type: "cosine" # LR scheduler type
+ lr_scheduler_type: "linear" # LR scheduler type
warmup_ratio: null # Warmup ratio of total steps
warmup_steps: null # Number of warmup steps
@@ -178,15 +178,15 @@ training:
enable_gradient_checkpointing: false # Trade compute for memory
# Checkpointing
- save_steps: 100 # Save every N steps
+ save_steps: 500 # Save every N steps
save_epoch: false # Save at end of each epoch
save_final_model: true # Save model at end of training
resume_from_checkpoint: null # Path to resume from
try_resume_from_last_checkpoint: false # Try auto-resume from last checkpoint
# Evaluation
eval_strategy: "steps" # When to evaluate ("no", "steps", "epoch")
- eval_steps: 50 # Evaluate every N steps
+ eval_steps: 500 # Evaluate every N steps
metrics_function: null # Name of metrics function to use
# Logging
@@ -221,21 +221,21 @@ Configure parameter-efficient fine-tuning using the {py:obj}`~oumi.core.configs.
```yaml
peft:
# LoRA settings
- lora_r: 16 # Rank of update matrices
- lora_alpha: 16 # Scaling factor
- lora_dropout: 0.05 # Dropout probability
- lora_target_modules: null # Modules to apply LoRA to
- lora_modules_to_save: null # Modules to unfreeze and train
- lora_bias: "none" # Bias training type
- lora_task_type: "CAUSAL_LM" # Task type for adaptation
+ lora_r: 8 # Rank of update matrices
+ lora_alpha: 8 # Scaling factor
+ lora_dropout: 0.0 # Dropout probability
+ lora_target_modules: null # Modules to apply LoRA to
+ lora_modules_to_save: null # Modules to unfreeze and train
+ lora_bias: "none" # Bias training type
+ lora_task_type: "CAUSAL_LM" # Task type for adaptation
# Q-LoRA settings
- q_lora: false # Enable quantization
- q_lora_bits: 4 # Quantization bits
+ q_lora: false # Enable quantization
+ q_lora_bits: 4 # Quantization bits
bnb_4bit_quant_type: "fp4" # 4-bit quantization type
use_bnb_nested_quant: false # Use nested quantization
bnb_4bit_quant_storage: "uint8" # Storage type for params
- bnb_4bit_compute_dtype: "float16" # Compute type for params
+ bnb_4bit_compute_dtype: "float32" # Compute type for params
```

### FSDP Configuration
@@ -244,22 +244,22 @@ Configure fully sharded data parallel training using the {py:obj}`~oumi.core.con

```yaml
fsdp:
- enable_fsdp: false # Enable FSDP training
- sharding_strategy: "FULL_SHARD" # How to shard model
- cpu_offload: false # Offload to CPU
- mixed_precision: null # Mixed precision type
- backward_prefetch: "BACKWARD_PRE" # When to prefetch params
- forward_prefetch: false # Prefetch forward results
- use_orig_params: null # Use original module params
+ enable_fsdp: false # Enable FSDP training
+ sharding_strategy: "FULL_SHARD" # How to shard model
+ cpu_offload: false # Offload to CPU
+ mixed_precision: null # Mixed precision type
+ backward_prefetch: "BACKWARD_PRE" # When to prefetch params
+ forward_prefetch: false # Prefetch forward results
+ use_orig_params: null # Use original module params
state_dict_type: "FULL_STATE_DICT" # Checkpoint format
# Auto wrapping settings
- auto_wrap_policy: "SIZE_BASED_WRAP" # How to wrap layers
- min_num_params: 100000 # Min params for wrapping
- transformer_layer_cls: null # Transformer layer class
+ auto_wrap_policy: "NO_WRAP" # How to wrap layers
+ min_num_params: 100000 # Min params for wrapping
+ transformer_layer_cls: null # Transformer layer class
# Other settings
- sync_module_states: true # Sync states across processes
+ sync_module_states: true # Sync states across processes
```

Notes on FSDP sharding strategies:
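Taken together, the configuration.md changes above relax several training defaults (linear scheduler, less frequent checkpointing and evaluation, no SGD momentum). For configs that relied on the previous implicit values, a minimal sketch of pinning the old behavior explicitly; it assumes the `training:` schema shown above and uses only keys that appear in this diff:

```yaml
training:
  sgd_momentum: 0.9            # new default: 0.0
  lr_scheduler_type: "cosine"  # new default: "linear"
  save_steps: 100              # new default: 500
  eval_steps: 50               # new default: 500
```

Anything left unset now picks up the new defaults which, per the PR description, follow HF's defaults where there is no strong opinion.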
src/oumi/core/configs/params/fsdp_params.py (2 changes: 1 addition & 1 deletion)
@@ -203,7 +203,7 @@ class FSDPParams(BaseParams):
state_dict_type: StateDictType = StateDictType.FULL_STATE_DICT
"""Specifies the type of state dict to use for checkpointing."""

- auto_wrap_policy: AutoWrapPolicy = AutoWrapPolicy.SIZE_BASED_WRAP
+ auto_wrap_policy: AutoWrapPolicy = AutoWrapPolicy.NO_WRAP
"""Policy for automatically wrapping layers in FSDP."""

min_num_params: int = 100_000
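With `FSDPParams.auto_wrap_policy` now defaulting to `NO_WRAP`, size-based wrapping becomes opt-in. A short sketch of restoring the previous behavior, assuming the `fsdp:` keys documented in configuration.md above:

```yaml
fsdp:
  enable_fsdp: true
  auto_wrap_policy: "SIZE_BASED_WRAP"  # default is now "NO_WRAP"
  min_num_params: 100000               # minimum parameter count for size-based wrapping
```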
src/oumi/core/configs/params/peft_params.py (8 changes: 4 additions & 4 deletions)
@@ -83,7 +83,7 @@ def get_literal_value(
class PeftParams(BaseParams):
# Lora Params
lora_r: int = field(
- default=16,
+ default=8,
metadata={"help": "LoRA R value."},
)
"""The rank of the update matrices in LoRA.
@@ -93,7 +93,7 @@ class PeftParams(BaseParams):
"""

lora_alpha: int = field(
- default=16,
+ default=8,
metadata={"help": "LoRA alpha."},
)
"""The scaling factor for the LoRA update.
@@ -102,7 +102,7 @@ class PeftParams(BaseParams):
"""

lora_dropout: float = field(
- default=0.05,
+ default=0.0,
metadata={"help": "LoRA dropout."},
)
"""The dropout probability applied to LoRA layers.
@@ -235,7 +235,7 @@ class PeftParams(BaseParams):
"""

bnb_4bit_compute_dtype: str = field(
- default="float16",
+ default="float32",
metadata={"help": "The compute type of the quantized parameters."},
)
"""Compute type of the quantized parameters.
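The new PEFT defaults (rank 8, alpha 8, no dropout, float32 compute) appear to follow the upstream HF/PEFT defaults, consistent with the PR description. Users tuned against the previous values can pin them explicitly; a hypothetical override using the `peft:` keys documented in configuration.md above:

```yaml
peft:
  lora_r: 16                         # new default: 8
  lora_alpha: 16                     # new default: 8
  lora_dropout: 0.05                 # new default: 0.0
  bnb_4bit_compute_dtype: "float16"  # new default: "float32"
```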
src/oumi/core/configs/params/training_params.py (31 changes: 14 additions & 17 deletions)
@@ -202,9 +202,11 @@ class TrainingParams(BaseParams):
each complete pass through the training data. This can be useful for
tracking model progress over time and for resuming training from a
specific epoch if needed.
+ If both `save_steps` and `save_epoch` are set, then `save_steps` takes precedence.
"""

- save_steps: int = 100
+ save_steps: int = 500
"""Save a checkpoint every `save_steps` training steps.
This parameter determines the frequency of saving checkpoints during
@@ -327,7 +329,7 @@ class TrainingParams(BaseParams):
- "epoch": Evaluation is done at the end of each epoch.
"""

- eval_steps: int = 50
+ eval_steps: int = 500
"""Number of update steps between two evaluations if eval_strategy="steps".
Ignored if eval_strategy is not "steps".
@@ -339,7 +341,7 @@ class TrainingParams(BaseParams):
This value can be adjusted by the learning rate scheduler during training.
"""

- lr_scheduler_type: str = "cosine"
+ lr_scheduler_type: str = "linear"
"""The type of learning rate scheduler to use.
Possible values include "linear", "cosine", "cosine_with_restarts",
@@ -358,13 +360,13 @@ class TrainingParams(BaseParams):
"""The ratio of total training steps used for a linear warmup from 0 to the
learning rate.
- Either this or warmup_steps should be set, not both.
+ If set along with `warmup_steps`, this value will be ignored.
"""

warmup_steps: Optional[int] = None
"""The number of steps for the warmup phase of the learning rate scheduler.
- Either this or warmup_ratio should be set, not both.
+ If set, will override the value of `warmup_ratio`.
"""

# ---------------------
@@ -409,11 +411,11 @@ class TrainingParams(BaseParams):
Default is 1e-08.
"""

- sgd_momentum: float = 0.9
+ sgd_momentum: float = 0.0
"""Momentum factor for SGD optimizer.
- Only used when optimizer is set to "sgd".
- Default is 0.9.
+ Only used when optimizer is set to "sgd", and when `trainer_type` is set to OUMI.
+ Default is 0.0.
"""

mixed_precision_dtype: MixedPrecisionDtype = MixedPrecisionDtype.NONE
@@ -571,15 +573,15 @@ def to_hf(self):
f"({self.dataloader_num_workers}). Must be `int`."
)

- dispatch_batches = self.dataloader_main_process_only

if self.trainer_type == TrainerType.TRL_SFT:
config_class = trl.SFTConfig
elif self.trainer_type == TrainerType.TRL_DPO:
config_class = trl.DPOConfig
else:
config_class = transformers.TrainingArguments

+ dispatch_batches = self.dataloader_main_process_only

result = config_class(
gradient_accumulation_steps=self.gradient_accumulation_steps,
log_level=self.dep_log_level,
@@ -635,8 +636,8 @@ def to_hf(self):
# "use_seedable_sampler": True,
# },
seed=self.seed,
- # TODO Re-enable `data_seed`. Should it depend on RANK?
- # data_seed=self.seed,
+ # TODO: OPE-891 - Support setting a data seed.
+ # By default, HF will use the global seed for data loading.
**self.trainer_kwargs,
)
assert isinstance(result, transformers.TrainingArguments)
@@ -676,10 +677,6 @@ def __post_init__(self):
if self.max_grad_norm is not None and self.max_grad_norm < 0:
raise ValueError("max_grad_norm must be >= 0.")

- if self.logging_dir is None and self.output_dir:
-     # Push the logging_dir inside the output_dir.
-     self.logging_dir = str(Path(self.output_dir) / "logs")

@property
def telemetry_dir(self) -> Optional[Path]:
"""Returns the telemetry stats output directory."""
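The updated docstrings make two precedence rules explicit: `save_steps` wins over `save_epoch`, and `warmup_steps` overrides `warmup_ratio`. A sketch of a config that sets both members of each pair, using the `training:` keys documented in configuration.md above, to show what actually takes effect:

```yaml
training:
  save_steps: 500    # takes precedence: checkpoints are written every 500 steps
  save_epoch: true   # superseded because save_steps is also set

  warmup_steps: 100  # overrides warmup_ratio: warmup runs for 100 steps
  warmup_ratio: 0.1  # ignored because warmup_steps is set
```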
tests/unit/core/test_distributed.py (8 changes: 4 additions & 4 deletions)
@@ -345,7 +345,7 @@ def test_get_accelerate_env_vars_default():
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_FORWARD_PREFETCH": "false",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
- "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+ "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
"FSDP_MIN_NUM_PARAMS": "100000",
"FSDP_SYNC_MODULE_STATES": "true",
"FSDP_ACTIVATION_CHECKPOINTING": "false",
@@ -416,7 +416,7 @@ def test_get_accelerate_env_vars_compile_keep_use_orig_params():
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_FORWARD_PREFETCH": "false",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
- "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+ "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
"FSDP_MIN_NUM_PARAMS": "100000",
"FSDP_SYNC_MODULE_STATES": "true",
"FSDP_ACTIVATION_CHECKPOINTING": "false",
@@ -439,7 +439,7 @@ def test_prepare_accelerate_fsdp_run():
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_FORWARD_PREFETCH": "false",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
- "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+ "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
"FSDP_MIN_NUM_PARAMS": "100000",
"FSDP_SYNC_MODULE_STATES": "true",
"FSDP_ACTIVATION_CHECKPOINTING": "true",
@@ -462,7 +462,7 @@ def test_prepare_accelerate_fsdp_run_override():
"FSDP_BACKWARD_PREFETCH": "BACKWARD_PRE",
"FSDP_FORWARD_PREFETCH": "false",
"FSDP_STATE_DICT_TYPE": "FULL_STATE_DICT",
- "FSDP_AUTO_WRAP_POLICY": "SIZE_BASED_WRAP",
+ "FSDP_AUTO_WRAP_POLICY": "NO_WRAP",
"FSDP_MIN_NUM_PARAMS": "100000",
"FSDP_SYNC_MODULE_STATES": "true",
"FSDP_ACTIVATION_CHECKPOINTING": "false",
