Skip to content

Commit ada91f1

Browse files
authored
Add changes for SFT-6 training (LAION-AI#2554)
1 parent 763e33f commit ada91f1

File tree

2 files changed

+111
-13
lines changed

2 files changed

+111
-13
lines changed

model/model_training/configs/config.yaml

+102-12
Original file line numberDiff line numberDiff line change
@@ -95,32 +95,59 @@ math:
9595

9696
pretrain:
9797
num_train_epochs: 1
98-
weight_decay: 0.01
98+
weight_decay: 0.0
9999
use_custom_sampler: true
100100
sort_by_length: false
101101
datasets:
102-
- joke
102+
- joke:
103+
val_split: 0.05
103104
- webgpt:
104-
val_split: 0.1
105+
val_split: 0.05
106+
max_val_set: 250
105107
- gpt4all:
106108
val_split: 0.01
109+
max_val_set: 250
107110
- alpaca:
108111
val_split: 0.025
112+
max_val_set: 250
109113
- code_alpaca:
110114
val_split: 0.05
111-
- minimath
112-
- humaneval_mbpp_codegen_qa
113-
- humaneval_mbpp_testgen_qa
114-
- grade_school_math_instructions
115-
- recipes
116-
- cmu_wiki_qa
117-
#- youtube_subs_howto100m # uses incompatible column names
118-
#- ubuntu_dialogue_qa # fails to load
119-
- oa_wiki_qa_bart_10000row
115+
max_val_set: 250
116+
- vicuna:
117+
max_val_set: 250
118+
- oig_file:
119+
source_url: https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl
120+
max_count: 10000
121+
min_length: 250
122+
val_split: 0.05
123+
max_val_set: 250
124+
- minimath:
125+
val_split: 0.05
126+
- humaneval_mbpp_codegen_qa:
127+
val_split: 0.05
128+
- humaneval_mbpp_testgen_qa:
129+
val_split: 0.05
130+
- grade_school_math_instructions:
131+
val_split: 0.05
132+
- recipes:
133+
val_split: 0.05
134+
- cmu_wiki_qa:
135+
val_split: 0.05
136+
- oa_wiki_qa_bart_10000row:
137+
val_split: 0.05
138+
max_val_set: 250
120139
- prosocial_dialogue:
121140
fraction: 0.1
141+
max_val_set: 250
122142
- explain_prosocial:
123143
fraction: 0.05
144+
max_val_set: 250
145+
- soda:
146+
fraction: 0.2
147+
max_val_set: 250
148+
- oa_leet10k:
149+
val_split: 0.05
150+
max_val_set: 250
124151

125152
oasst_only:
126153
save_strategy: epoch
@@ -257,6 +284,69 @@ llama-30b:
257284
save_total_limit: 4
258285
use_flash_attention: true
259286

287+
llama-30b-sft-6:
288+
dtype: fp16
289+
log_dir: "llama_log_30b"
290+
learning_rate: 1e-5
291+
#model_name: /home/ubuntu/Open-Assistant/model/model_training/.saved/llama-30b-super-pretrain/checkpoint-3500
292+
model_name: OpenAssistant/llama-30b-super-pretrain
293+
output_dir: llama_model_30b
294+
deepspeed_config: configs/zero3_config_sft.json
295+
weight_decay: 0.0
296+
residual_dropout: 0.0
297+
max_length: 2048
298+
use_flash_attention: true
299+
warmup_steps: 20
300+
gradient_checkpointing: true
301+
gradient_accumulation_steps: 8
302+
per_device_train_batch_size: 2
303+
per_device_eval_batch_size: 3
304+
eval_steps: 101
305+
save_steps: 485
306+
num_train_epochs: 8
307+
save_total_limit: 3
308+
use_custom_sampler: true
309+
sort_by_length: false
310+
save_strategy: steps
311+
datasets:
312+
- oasst_export:
313+
lang: "bg,ca,cs,da,de,en,es,fr,hr,hu,it,nl,pl,pt,ro,ru,sl,sr,sv,uk"
314+
input_file_path: 2023-04-12_oasst_release_ready_synth.jsonl.gz
315+
val_split: 0.05
316+
- vicuna:
317+
val_split: 0.05
318+
max_val_set: 800
319+
fraction: 0.8
320+
- dolly15k:
321+
val_split: 0.05
322+
max_val_set: 300
323+
- grade_school_math_instructions:
324+
val_split: 0.05
325+
- code_alpaca:
326+
val_split: 0.05
327+
max_val_set: 250
328+
329+
llama-30b-pretrain:
330+
dtype: fp16
331+
log_dir: "llama_log_30b"
332+
learning_rate: 1e-5
333+
model_name: /home/ubuntu/llama_hf/30B
334+
output_dir: llama_model_30b
335+
deepspeed_config: configs/zero3_config_pretrain.json
336+
weight_decay: 0.0
337+
residual_dropout: 0.0
338+
max_length: 2048
339+
use_flash_attention: true
340+
warmup_steps: 100
341+
gradient_checkpointing: true
342+
gradient_accumulation_steps: 8
343+
per_device_train_batch_size: 2
344+
per_device_eval_batch_size: 3
345+
eval_steps: 251
346+
save_steps: 500
347+
num_train_epochs: 1
348+
save_total_limit: 2
349+
260350
pythia-70m-deduped:
261351
learning_rate: 8e-6
262352
# model_name: EleutherAI/pythia-1b-deduped

model/model_training/custom_datasets/qa_datasets.py

+9-1
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,14 @@ def process_vicuna_conversations(data: list[dict[str, None | str]], input_max_le
506506
for line in data["conversations"]:
507507
speaker = line["from"] # 'human' or 'gpt'
508508
message = line["value"]
509+
510+
# remove markdown escaping in revision 192ab2185289094fc556ec8ce5ce1e8e587154ca
511+
# python-markdownify with escape_asterisks & escape_underscores True is used
512+
# for pre-processing the dataset.
513+
# See also https://github.com/LAION-AI/Open-Assistant/issues/2510
514+
message = message.replace(r"\_", "_")
515+
message = message.replace(r"\*", "*")
516+
509517
if role != speaker:
510518
if role is not None:
511519
dialogue.append("\n".join(messages))
@@ -528,7 +536,7 @@ def __init__(self, cache_dir: str | Path, mode: str = "sft", input_max_length: i
528536
dataset = load_dataset(
529537
"anon8231489123/ShareGPT_Vicuna_unfiltered",
530538
cache_dir=cache_dir,
531-
data_files=["ShareGPT_V3_unfiltered_cleaned_split.json"],
539+
data_files=["ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json"],
532540
revision="192ab2185289094fc556ec8ce5ce1e8e587154ca",
533541
)["train"]
534542
for data in dataset:

0 commit comments

Comments (0)