run.sh
#!/bin/bash
# This script fine-tunes Gemma-7b or Gemma-2b on the Orca-Math dataset.
# GPU(s) to be used for training.
export CUDA_VISIBLE_DEVICES="0" # Change to the GPU you want to use; for multiple GPUs use a comma-separated list, e.g. "0,1,2,...,n"
# Specifies the project name for Weights & Biases (W&B) logging. Change as per your W&B project.
export WANDB_PROJECT="LLM-ORCAMATH-FINETUNING"
# Dynamically assigns a port for distributed training to avoid port conflicts.
master_port=$(shuf -i 12000-30000 -n 1)
# LoRA parameters: rank (r) and alpha (scaling factor), calculated as double the rank.
lora_r=8
lora_alpha=$(( lora_r * 2 ))
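# With the defaults above, lora_r=8 gives lora_alpha=16.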
# Learning configurations: learning rate, number of epochs, and batch size per device.
learning_rate="5e-5"
num_epoch=10
batch_size=1 # Decrease this if you run out of CUDA memory (original: 16)
# Training configuration for distributed setup; adjust `world_size` according to the number of GPUs.
world_size=1 # Change it to the number of GPUs
model="gemma-7b"
# Batch size configuration, adapted for memory limitations. Adjust as necessary.
total_batch_size=8 # Decrease this if you run out of CUDA memory (original: 128)
gradient_accumulation_steps=$(( total_batch_size / world_size / batch_size ))
# Recompute the effective total batch size after the integer division above.
total_batch_size=$(( gradient_accumulation_steps * world_size * batch_size ))
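# Worked example with the defaults above (illustrative arithmetic, not part of the original script):
#   gradient_accumulation_steps = 8 / 1 / 1 = 8
#   total_batch_size            = 8 * 1 * 1 = 8
# The recomputation rounds total_batch_size down to a multiple of world_size * batch_size.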
# Naming convention for the training run, incorporating various parameters for easy identification.
run_name="e${num_epoch}_${model}_qvko_r${lora_r}_a${lora_alpha}_lr${learning_rate}_bs${total_batch_size}"
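# With the defaults above this expands to: e10_gemma-7b_qvko_r8_a16_lr5e-5_bs8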
# Checkpoint path to resume training from a previous checkpoint.
# Remove the --resume_from_checkpoint flag below to train from scratch instead.
checkpoint_path="./weights/Gemma-7b/e10_gemma-7b_qvko_r8_a16_lr5e-5_bs8/checkpoint-25004"
# Resolve the directory containing this script so that paths are set relative to its location.
work_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
echo "work_dir: ${work_dir}"
# The main command to run the training, specifying all necessary arguments for `train.py`.
# The --fp16 True and --tf32 False settings below are what worked on Turing GPUs (which lack Ampere features).
# On Ampere or newer GPUs you can instead use --bf16 True (in place of --fp16 True) and --tf32 True.
torchrun --nproc_per_node=${world_size} --master_port=${master_port} train.py \
--model_name_or_path "google/${model}" \
--data_path ./data/orcamath_data.json \
--output_dir ${work_dir}/weights/Gemma-7b/${run_name}/ \
--run_name ${run_name} \
--resume_from_checkpoint ${checkpoint_path} \
--fp16 True \
--num_train_epochs ${num_epoch} \
--per_device_train_batch_size ${batch_size} \
--gradient_accumulation_steps ${gradient_accumulation_steps} \
--warmup_steps 300 \
--save_strategy "epoch" \
--lr_scheduler_type "constant_with_warmup" \
--save_total_limit 100 \
--learning_rate ${learning_rate} \
--model_max_length 512 \
--logging_steps 8 \
--tf32 False \
--ddp_find_unused_parameters False \
--use_lora True \
--load_in_4bit True \
--lora_r ${lora_r} \
--lora_alpha ${lora_alpha} \
--lora_target_modules q_proj v_proj k_proj o_proj
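
# Multi-GPU sketch (assumption, not part of the original script): set CUDA_VISIBLE_DEVICES="0,1"
# and world_size=2 above; torchrun then launches one process per GPU (--nproc_per_node) and
# gradient_accumulation_steps is recomputed accordingly (8 / 2 / 1 = 4), keeping total_batch_size at 8.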