Skip to content

Commit aa8ae07

Browse files
author
shuangkun.tsk
committed
add fine-tune simple
Signed-off-by: shuangkun.tsk <shuangkun.tsk@alibaba-inc.com>
1 parent aa4acdc commit aa8ae07

File tree

5 files changed

+608
-0
lines changed

5 files changed

+608
-0
lines changed

fine-tune-with-argo/fine-tune.png

138 KB
Loading

fine-tune-with-argo/oss-pvpvc.yaml

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
2+
# Credentials the OSS CSI plugin uses to mount the bucket; referenced by the
# `pv-oss` PersistentVolume through `nodePublishSecretRef`.
# Do not commit real keys: replace the placeholders at deploy time.
apiVersion: v1
kind: Secret
metadata:
  name: oss-secret
  namespace: default
stringData:
  akId: yourak  # Replace with your AccessKey ID.
  akSecret: yoursk  # Replace with your AccessKey Secret.
10+
---
11+
# Statically provisioned PersistentVolume backed by an OSS bucket.
# The `alicloud-pvname` label is what the `pvc-oss` claim selects on.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pv-oss
  labels:
    alicloud-pvname: pv-oss
spec:
  capacity:
    storage: 5Gi
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  csi:
    driver: ossplugin.csi.alibabacloud.com
    volumeHandle: pv-oss  # Must be identical to the PV name.
    nodePublishSecretRef:
      name: oss-secret
      namespace: default
    volumeAttributes:
      bucket: data-bucket-zjk  # Replace with your bucket name.
      # Replace with the internal endpoint of the region your bucket lives in,
      # e.g. oss-cn-beijing-internal.aliyuncs.com for China (Beijing).
      url: 'oss-cn-zhangjiakou-internal.aliyuncs.com'
      otherOpts: '-o max_stat_cache_size=0 -o allow_other -o multipart_size=30 -o parallel_count=20 -o uid=1000 -o gid=1001'
      path: '/'
34+
---
35+
# Claim bound to the static `pv-oss` volume via its `alicloud-pvname` label.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-oss
  namespace: default
spec:
  # Empty class pins the claim to statically provisioned PVs; without it a
  # cluster-wide default StorageClass would be injected and the claim would be
  # dynamically provisioned instead of binding to `pv-oss`.
  storageClassName: ''
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 5Gi
  selector:
    matchLabels:
      alicloud-pvname: pv-oss
49+

fine-tune-with-argo/readme.md

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,266 @@
1+
# Argo Workflow that fine-tunes a DeepSeek-R1 distilled model on a Traditional
# Chinese Medicine SFT dataset.
#
# Pipeline (see the `main` template):
#   1. download-dataset and download-model run in parallel
#   2. format-prompts   builds the training text from the raw dataset
#   3. training         trains LoRA adapters with unsloth
#   4. inference-*      asks the same question to the fine-tuned and base model
#
# Every step mounts the `pvc-oss` claim at /mnt/vol and skips its work when its
# output path already exists, so a re-submitted workflow resumes cheaply.
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
  name: tcm-deepseek-finetune-with-argo
  namespace: default
spec:
  entrypoint: main
  templates:
    # Orchestration only: wires the workflow-level parameters into each step.
    - name: main
      steps:
        - - name: download-dataset
            template: download-dataset
            arguments:
              parameters:
                - name: dataset-path
                  value: '{{workflow.parameters.dataset-path}}'
          - name: download-model
            template: download-model
            arguments:
              parameters:
                - name: base-model
                  value: '{{workflow.parameters.base-model}}'
                - name: model-name
                  value: '{{workflow.parameters.model-name}}'
        - - name: format-prompts
            template: format-prompts
            arguments:
              parameters:
                - name: dataset-path
                  value: '{{workflow.parameters.dataset-path}}'
                - name: format-path
                  value: '{{workflow.parameters.format-path}}'
                - name: base-model
                  value: '{{workflow.parameters.base-model}}'
        - - name: training
            template: training
            arguments:
              parameters:
                - name: format-path
                  value: '{{workflow.parameters.format-path}}'
                - name: model-path
                  value: '{{workflow.parameters.base-model}}'
                - name: output-path
                  value: '{{workflow.parameters.output-model}}'
        - - name: inference-finetuned
            template: inference-template
            arguments:
              parameters:
                - name: model-path
                  value: '{{workflow.parameters.output-model}}'
          - name: inference-basemodel
            template: inference-template
            arguments:
              parameters:
                - name: model-path
                  value: '{{workflow.parameters.base-model}}'

    # Downloads the SFT dataset from the Hugging Face Hub into `dataset-path`
    # unless that path already exists.
    - name: download-dataset
      inputs:
        parameters:
          - name: dataset-path
            default: /mnt/vol/datasets
      script:
        image: acr-multiple-clusters-registry.cn-hangzhou.cr.aliyuncs.com/serverless-argo/deepseek-finetune:v4
        source: |-
          import os
          import sys
          sys.path.append(os.getcwd())
          from datasets import load_dataset

          save_path = '{{inputs.parameters.dataset-path}}'
          print('Downloading dataset...')
          if not os.path.exists(save_path):
              dataset = load_dataset('SylvanL/Traditional-Chinese-Medicine-Dataset-SFT', split='train')
              dataset.save_to_disk(save_path)
          print(f'Dataset saved to {save_path}')
        command:
          - python
        volumeMounts:
          - name: workdir
            mountPath: /mnt/vol

    # Downloads `unsloth/<model-name>` from the Hugging Face Hub into
    # `base-model` unless that path already exists.
    - name: download-model
      inputs:
        parameters:
          - name: base-model
            default: /mnt/vol/model
          - name: model-name
            default: DeepSeek-R1-Distill-Qwen-7B
      script:
        image: acr-multiple-clusters-registry.cn-hangzhou.cr.aliyuncs.com/serverless-argo/deepseek-finetune:v4
        source: |-
          import os
          import sys
          sys.path.append(os.getcwd())
          from huggingface_hub import snapshot_download

          download_path = '{{inputs.parameters.base-model}}'
          if not os.path.exists(download_path):
              snapshot_download(
                  repo_id='unsloth/{{inputs.parameters.model-name}}',
                  local_dir=download_path,
                  ignore_patterns=['*.msgpack', '*.h5', '*.tflite'],
              )
          print(f'Model downloaded to {download_path}')
        command:
          - python
        volumeMounts:
          - name: workdir
            mountPath: /mnt/vol

    # Renders every dataset row into the training prompt (plus the tokenizer's
    # EOS token) and saves the result to `format-path`. Skipped when
    # `format-path` already exists.
    - name: format-prompts
      inputs:
        parameters:
          - name: dataset-path
            default: /mnt/vol/datasets
          - name: base-model
            default: /mnt/vol/model
          - name: format-path
            default: /mnt/vol/format
      script:
        image: acr-multiple-clusters-registry.cn-hangzhou.cr.aliyuncs.com/serverless-argo/deepseek-finetune:v4
        source: |-
          import os
          import sys
          sys.path.append(os.getcwd())
          from datasets import load_from_disk
          from transformers import AutoModelForCausalLM, AutoTokenizer

          save_path = '{{inputs.parameters.base-model}}'
          format_path = '{{inputs.parameters.format-path}}'
          if os.path.exists(format_path):
              sys.exit(0)
          tokenizer = AutoTokenizer.from_pretrained(save_path)
          EOS_TOKEN = tokenizer.eos_token
          # Read from this template's own input rather than the workflow-level
          # parameter so the template honours whatever the caller passes in.
          dataset = load_from_disk('{{inputs.parameters.dataset-path}}')
          # Two placeholders: the row's `input` and its `output`.
          alpaca_prompt = (
              '以下是描述任务的说明，并搭配提供更多上下文的输入。\n'
              ' 写出适当完成请求的回复。在回答之前，请仔细思考问题并创建循序渐进的思路链，以确保做出合乎逻辑且准确的回答。\n\n'
              ' ### Instruction:\n'
              ' 您是一位在中医的临床推理、诊断和治疗计划等方面具有具有丰富经验的医学专家。请回答以下医学问题。\n\n'
              ' ### Input:\n'
              ' {}\n\n'
              ' ### Response:\n'
              ' {}'
          )

          def formatting_prompts_func(examples):
              instructions = examples['instruction']
              inputs = examples['input']
              outputs = examples['output']
              texts = []
              # NOTE(review): the row's `instruction` is read but not rendered
              # into the prompt - confirm that is intended for this dataset.
              for (_instruction, row_input, row_output) in zip(instructions, inputs, outputs):
                  texts.append(alpaca_prompt.format(row_input, row_output) + EOS_TOKEN)
              return {'text': texts}

          dataset = dataset.map(formatting_prompts_func, batched=True)
          print('Formatting prompts')
          dataset.save_to_disk(format_path)
        command:
          - python
        volumeMounts:
          - name: workdir
            mountPath: /mnt/vol
        resources:
          limits:
            cpu: '16'
            memory: 32Gi
          requests:
            cpu: '16'
            memory: 32Gi

    # Trains LoRA adapters on the formatted dataset with unsloth + TRL and
    # saves model and tokenizer to `output-path`. Skipped when `output-path`
    # already exists.
    - name: training
      inputs:
        parameters:
          - name: format-path
            default: /mnt/vol/format
          - name: model-path
            default: /mnt/vol/model
          - name: output-path
            default: /mnt/vol/output
      metadata:
        annotations:
          k8s.aliyun.com/eci-gpu-driver-version: tesla=525.85.12
          k8s.aliyun.com/eci-use-specs: ecs.gn7i-c16g1.4xlarge
      script:
        image: acr-multiple-clusters-registry.cn-hangzhou.cr.aliyuncs.com/serverless-argo/deepseek-finetune:v4
        source: |-
          import os
          import sys
          sys.path.append(os.getcwd())
          from unsloth import is_bfloat16_supported
          from unsloth import FastLanguageModel
          from trl import SFTTrainer
          from transformers import TrainingArguments
          from datasets import load_from_disk

          # Single source of truth for the sequence length used by both the
          # model loader and the trainer.
          max_seq_length = 2048
          base = '{{inputs.parameters.model-path}}'
          output_dir = '{{inputs.parameters.output-path}}'
          # Check before loading anything so a finished run exits immediately.
          if os.path.exists(output_dir):
              sys.exit(0)
          dataset = load_from_disk('{{inputs.parameters.format-path}}')
          (model, tokenizer) = FastLanguageModel.from_pretrained(
              model_name=base,
              max_seq_length=max_seq_length,
              local_files_only=True,
              dtype=None,
              load_in_4bit=True,
          )
          model = FastLanguageModel.get_peft_model(
              model,
              r=16,
              target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
              lora_alpha=16,
              lora_dropout=0,
              bias='none',
              use_gradient_checkpointing='unsloth',
              random_state=3407,
              use_rslora=False,
              loftq_config=None,
          )
          trainer = SFTTrainer(
              model=model,
              tokenizer=tokenizer,
              train_dataset=dataset,
              dataset_text_field='text',
              max_seq_length=max_seq_length,
              dataset_num_proc=2,
              args=TrainingArguments(
                  per_device_train_batch_size=2,
                  gradient_accumulation_steps=4,
                  warmup_steps=5,
                  max_steps=60,
                  learning_rate=0.0002,
                  fp16=not is_bfloat16_supported(),
                  bf16=is_bfloat16_supported(),
                  logging_steps=10,
                  optim='adamw_8bit',
                  weight_decay=0.01,
                  lr_scheduler_type='linear',
                  seed=3407,
                  # Trainer checkpoints; relative to the container's working
                  # directory, not the shared volume.
                  output_dir='outputs',
              ),
          )
          print('Fine-tuning model')
          trainer_stats = trainer.train()
          print(trainer_stats)
          model.save_pretrained(output_dir)
          tokenizer.save_pretrained(output_dir)
        command:
          - python
        volumeMounts:
          - name: workdir
            mountPath: /mnt/vol
        resources:
          limits:
            cpu: '12'
            memory: 40Gi
            nvidia.com/gpu: '1'
          requests:
            cpu: '8'
            memory: 20Gi
            nvidia.com/gpu: '1'

    # Loads the model at `model-path`, answers one fixed question, prints the
    # answer and writes it to /tmp/response.txt inside the container.
    # NOTE(review): /tmp/response.txt is not declared as a template output, so
    # it is not exported from the pod - confirm whether that is intended.
    - name: inference-template
      inputs:
        parameters:
          - name: model-path
            default: /mnt/vol/model
      metadata:
        annotations:
          k8s.aliyun.com/eci-gpu-driver-version: tesla=525.85.12
          k8s.aliyun.com/eci-use-specs: ecs.gn7i-c16g1.4xlarge
      script:
        image: acr-multiple-clusters-registry.cn-hangzhou.cr.aliyuncs.com/serverless-argo/deepseek-finetune:v4
        source: |-
          import os
          import sys
          sys.path.append(os.getcwd())
          from unsloth import FastLanguageModel

          modelpath = '{{inputs.parameters.model-path}}'
          (model, tokenizer) = FastLanguageModel.from_pretrained(
              model_name=modelpath,
              max_seq_length=2048,
              dtype=None,
              load_in_4bit=True,
          )
          # Same section headers as the training prompt in `format-prompts`
          # ("### Input:"), so the fine-tuned model sees the format it was
          # trained on.
          prompt_style = (
              '以下是描述任务的说明，并搭配提供更多上下文的输入。\n'
              ' 写出适当完成请求的回复。在回答之前，请仔细思考问题并创建循序渐进的思路链，以确保做出合乎逻辑且准确的回答。\n\n'
              ' ### Instruction:\n'
              ' 您是一位在中医的临床推理、诊断和治疗计划等方面具有具有丰富经验的医学专家。请回答以下医学问题。\n\n'
              ' ### Input:\n'
              ' {}\n\n'
              ' ### Response:\n'
              ' {}'
          )
          question = '久咳不止怎么办？'
          FastLanguageModel.for_inference(model)
          inputs = tokenizer([prompt_style.format(question, '')], return_tensors='pt').to('cuda')
          outputs = model.generate(
              input_ids=inputs.input_ids,
              attention_mask=inputs.attention_mask,
              max_new_tokens=1200,
              use_cache=True,
          )
          response = tokenizer.batch_decode(outputs)
          outputs = response[0].split('### Response:')[1]
          # Guard on the exact delimiter that is split on; testing for the bare
          # word 'think' could pass while the split yields a single element.
          if 'think>' in outputs:
              outputs = outputs.split('think>')[1]
          print(outputs)
          with open('/tmp/response.txt', 'w', encoding='utf-8') as f:
              f.write(outputs)
        command:
          - python
        volumeMounts:
          - name: workdir
            mountPath: /mnt/vol
        resources:
          limits:
            cpu: '12'
            memory: 40Gi
            nvidia.com/gpu: '1'
          requests:
            cpu: '8'
            memory: 20Gi
            nvidia.com/gpu: '1'

  # Shared volume mounted by every step at /mnt/vol.
  volumes:
    - name: workdir
      persistentVolumeClaim:
        claimName: pvc-oss

  # Workflow-level parameters; all paths live on the shared volume.
  arguments:
    parameters:
      - name: dataset-path
        value: /mnt/vol/traditional-chinese-medicine-data
      - name: format-path
        value: /mnt/vol/traditional-chinese-medicine-fromat-data
      - name: base-model
        value: /mnt/vol/deepseek-basemodel
      - name: output-model
        value: /mnt/vol/deepseek-finetuned
      - name: model-name
        value: DeepSeek-R1-Distill-Qwen-7B

0 commit comments

Comments
 (0)