Commit f182486

Add pretokenizer utility (LAION-AI#3654)
The pretokenizer utility (`pretokenizer/pretokenize.py`) tokenizes datamixes in advance for use with the [epfLLM/Megatron-LLM](https://github.com/epfLLM/Megatron-LLM) trainer. The datamix configuration is defined in a YAML file, similar to the classic training configurations of `trainer_sft.py`. The datasets are loaded with the functions from `model_training`, so the `model_training` module needs to be installed.
1 parent f747529 commit f182486

7 files changed: +1540 -0 lines

model/pretokenizer/README.md

+48 lines (new file)
# OA Pretokenizer Utility

The pretokenizer lets you tokenize datasets before training with the
[epfLLM/Megatron-LLM](https://github.com/epfLLM/Megatron-LLM) fork.

## Requirements

1. Make sure the `model_training` module is installed:

   ```bash
   pip install -e ..
   ```

2. Make sure the `oasst_data` module is installed:

   ```bash
   python -m pip install ../../oasst-data/
   ```

## Configuration

The datamix to process can be configured with one or multiple sections in the
`configs/pretokenize.yaml` file.
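
How the selected sections are combined is not shown in this diff (`pretokenize.py` itself is among the other changed files). As a rough sketch only, one plausible resolution is to start from the `defaults` block and overlay the requested sections; the `load_config` helper below is hypothetical, not the utility's actual code:

```python
# Hypothetical sketch: resolve a datamix by starting from the "defaults"
# section and overlaying each requested section (assumed merge order).
import yaml


def load_config(path: str, section_names: list[str]) -> dict:
    with open(path) as f:
        doc = yaml.safe_load(f)
    config = dict(doc.get("defaults", {}))
    for name in section_names:
        config.update(doc[name])  # assumed: later sections override earlier values
    return config


if __name__ == "__main__":
    cfg = load_config("configs/pretokenize.yaml", ["oasst_top1", "llama2"])
    print(cfg["tokenizer_type"], cfg["output_dir"])
```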

## Example usage

```bash
python pretokenize.py --output_dir output --configs oasst_top1 llama2 --compress --write_json
```

## Help message

```
usage: pretokenize.py [-h] --configs CONFIGS [CONFIGS ...] [--output_dir OUTPUT_DIR] [--write_json] [--compress]

Tokenize datamixes for LLama2/Falcon fine-tuning with Megatron-LLM.

options:
  -h, --help            show this help message and exit

configuration:
  --configs CONFIGS [CONFIGS ...]
                        Configurations sections to apply (read from YAML, multiple can be specified).
  --output_dir OUTPUT_DIR
                        Path to output directory
  --write_json          Generate a JSONL file with the formatted dialogues (key='text').
  --compress            Generate a .tar.gz file of the output directory.
```

model/pretokenizer/configs/pretokenize.yaml

+58 lines (new file)

```yaml
defaults:
  rng_seed: 42
  cache_dir: .cache
  use_system_prefix: false
  datasets_extra: [] # For config options to add additional datasets, since yaml doesn't let us extend arrays
  eval_size:
  tokenizer_type:
  vocab_extra_ids_list: "<|im_start|>,<|im_end|>"
  dataset_impl: "mmap"
  min_assistant_tokens:
  output_dir_suffix: ""

llama2:
  vocab_file: "/home/ubuntu/megatron-data/llama2-7b/tokenizer.model"
  tokenizer_type: "SentencePieceTokenizer"
  output_dir_suffix: "_llama2"

falcon:
  tokenizer_type: "FalconTokenizer"
  output_dir_suffix: "_falcon"

oasst_top1:
  datasets:
    - oasst_export:
        lang: "bg,ca,cs,da,de,en,es,fr,hr,hu,it,nl,pl,pt,ro,ru,sl,sr,sv,uk"
        #hf_dataset_name: OpenAssistant/oasst1
        input_file_path: 2023-07-23_oasst_ready.tar.gz
        top_k: 1
        val_split: 0.05
  output_dir: "output/oasst_top1_2023-07-23"
  filename_prefix: "oasst_top1"

megacode2_min100:
  datasets:
    - megacode2:
        val_split: 0.01
        max_val_set: 1000
  output_dir: "output/megacode2_min100"
  filename_prefix: "megacode2"
  min_assistant_tokens: 100

megacode2_min50:
  datasets:
    - megacode2:
        val_split: 0.01
        max_val_set: 1000
  output_dir: "output/megacode2_min50"
  filename_prefix: "megacode2"
  min_assistant_tokens: 50

megacode2_frac05:
  datasets:
    - megacode2:
        fraction: 0.5
        val_split: 0.01
        max_val_set: 1000
  output_dir: "output/megacode2_frac05"
  filename_prefix: "megacode2"
```
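
As a quick standalone check (not part of the commit), the file can be parsed with PyYAML to list the available section names and look at a section's settings; the snippet assumes it is run from the `model/pretokenizer` directory:

```python
# Assumption: run from model/pretokenizer with PyYAML installed.
import yaml

with open("configs/pretokenize.yaml") as f:
    sections = yaml.safe_load(f)

print(sorted(sections))  # defaults, falcon, llama2, megacode2_*, oasst_top1
print(sections["oasst_top1"]["output_dir"])  # output/oasst_top1_2023-07-23
```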

model/pretokenizer/create_hf_tokenizer_config.py

+112 lines (new file)

```python
import argparse
from distutils.util import strtobool as strtoboolint

import transformers
from tokenizer import build_tokenizer
from transformers.utils import cached_file


def strtobool(s: str) -> bool:
    return bool(strtoboolint(s))


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--tokenizer_type", type=str, default="SentencePieceTokenizer", help="SentencePieceTokenizer or FalconTokenizer"
    )
    parser.add_argument(
        "--vocab_file", type=str, help="[optional] vocab file for SentencePiece (get from HF cache by default)"
    )
    parser.add_argument(
        "--tokenizer_name",
        type=str,
        default="meta-llama/Llama-2-7b-hf",
        help="HuggingFace repo name or path, e.g. 'meta-llama/Llama-2-7b-hf' or 'tiiuae/falcon-40b'",
    )
    parser.add_argument("--cache_dir", type=str, default=None, help="Huggingface cache directory")
    parser.add_argument(
        "--vocab_extra_ids_list",
        type=str,
        default="<|im_start|>,<|im_end|>",
        help='Comma separated list of additional tokens (e.g. "<|im_start|>,<|im_end|>")',
    )
    parser.add_argument("--output_dir", type=str, default="output", help="Path of output directory")
    return parser.parse_args()


def main():
    """
    Usage examples:
    python create_hf_tokenizer_config.py --tokenizer_type SentencePieceTokenizer --tokenizer_name meta-llama/Llama-2-7b-hf --output_dir output
    python create_hf_tokenizer_config.py --tokenizer_type FalconTokenizer --tokenizer_name tiiuae/falcon-40b --output_dir output
    """
    args = parse_args()
    print("Configuration:")
    for k, v in vars(args).items():
        print(f"{k}: {v}")

    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)

    print("tokenizer.vocab_files_names", hf_tokenizer.vocab_files_names)

    if args.tokenizer_type == "FalconTokenizer":
        args.vocab_file = ""
    elif args.vocab_file is None:
        # Resolve the SentencePiece model file from the HuggingFace cache.
        args.vocab_file = cached_file(
            args.tokenizer_name, hf_tokenizer.vocab_files_names["vocab_file"], cache_dir=args.cache_dir
        )

    # add default args for megatron tokenizer
    args.rank = 0
    args.vocab_extra_ids = 0
    args.new_tokens = True
    args.make_vocab_size_divisible_by = 128
    args.tensor_model_parallel_size = 1
    mt_tokenizer = build_tokenizer(args)

    if args.tokenizer_type == "SentencePieceTokenizer":
        print("_special_tokens", mt_tokenizer._special_tokens)
        print("additional_special_tokens_ids", mt_tokenizer.additional_special_tokens_ids)

        # Mirror the special tokens added by the Megatron SentencePiece tokenizer
        # in the HF tokenizer and copy their ids.
        hf_tokenizer.add_tokens("<CLS>", special_tokens=True)
        hf_tokenizer.add_tokens("<SEP>", special_tokens=True)
        hf_tokenizer.add_tokens("<EOD>", special_tokens=True)
        hf_tokenizer.add_tokens("<MASK>", special_tokens=True)
        hf_tokenizer.add_tokens("<PAD>", special_tokens=True)
        hf_tokenizer.cls_token_id = mt_tokenizer.cls
        hf_tokenizer.sep_token_id = mt_tokenizer.sep
        hf_tokenizer.mask_token_id = mt_tokenizer.mask
        hf_tokenizer.pad_token_id = mt_tokenizer.pad

        additional_special_tokens = hf_tokenizer.additional_special_tokens
        special_tokens = {"additional_special_tokens": additional_special_tokens}
        if args.vocab_extra_ids_list:
            additional_special_tokens.extend(args.vocab_extra_ids_list.split(","))

        hf_tokenizer.add_special_tokens(special_tokens_dict=special_tokens, replace_additional_special_tokens=True)

        additional_special_tokens_ids = [mt_tokenizer.vocab.get(t) for t in additional_special_tokens]
        hf_tokenizer.additional_special_tokens_ids = additional_special_tokens_ids

        # Verify that both tokenizers assign the same ids to all special tokens.
        tokens_to_check = [
            v for k, v in hf_tokenizer.special_tokens_map.items() if k != "additional_special_tokens"
        ] + additional_special_tokens
        print("checking token ids:")
        for t in tokens_to_check:
            a = mt_tokenizer.vocab.get(t)
            b = hf_tokenizer.vocab.get(t)
            print(f"{t}: {a} (mt) == {b} (hf)")
            assert a == b, "Mismatch between megatron and huggingface tokenizer vocabularies"
    elif args.tokenizer_type == "FalconTokenizer":
        hf_tokenizer = mt_tokenizer.tokenizer
    else:
        raise RuntimeError(f"Unsupported tokenizer type: {args.tokenizer_type}")

    print("special_tokens_map:", hf_tokenizer.special_tokens_map)

    hf_tokenizer.save_pretrained(args.output_dir)


if __name__ == "__main__":
    main()
```
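
After the script has written the tokenizer files, a simple sanity check is to load the output directory back with `transformers` and inspect the special tokens (this check is not part of the commit; the `output` path below matches the script's default `--output_dir`):

```python
# Assumption: create_hf_tokenizer_config.py was run with --output_dir output.
import transformers

tok = transformers.AutoTokenizer.from_pretrained("output")
print(tok.special_tokens_map)
print(tok.additional_special_tokens)  # expected to include <|im_start|> and <|im_end|>
print(tok.convert_tokens_to_ids(["<|im_start|>", "<|im_end|>"]))
```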
