-
Notifications
You must be signed in to change notification settings - Fork 28.4k
/
Copy pathconsolidate_rag_checkpoint.py
99 lines (80 loc) · 3.55 KB
/
consolidate_rag_checkpoint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
A script creating a RAG checkpoint from a generator and a question encoder checkpoints.
"""
import argparse
from pathlib import Path
from transformers import AutoConfig, AutoTokenizer, RagConfig, RagSequenceForGeneration, RagTokenForGeneration
def consolidate(
model_type,
generator_name_or_path: str,
question_encoder_name_or_path: str,
dest_dir: Path,
config_name_or_path: str = None,
generator_tokenizer_name_or_path: str = None,
question_encoder_tokenizer_name_or_path: str = None,
):
if config_name_or_path is None:
config_name_or_path = "facebook/rag-token-base" if model_type == "rag_token" else "facebook/rag-sequence-base"
if generator_tokenizer_name_or_path is None:
generator_tokenizer_name_or_path = generator_name_or_path
if question_encoder_tokenizer_name_or_path is None:
question_encoder_tokenizer_name_or_path = question_encoder_name_or_path
model_class = RagTokenForGeneration if model_type == "rag_token" else RagSequenceForGeneration
# Save model.
rag_config = RagConfig.from_pretrained(config_name_or_path)
gen_config = AutoConfig.from_pretrained(generator_name_or_path)
question_encoder_config = AutoConfig.from_pretrained(question_encoder_name_or_path)
rag_config.generator = gen_config
rag_config.question_encoder = question_encoder_config
rag_model = model_class.from_pretrained_question_encoder_generator(
question_encoder_name_or_path, generator_name_or_path, config=rag_config
)
rag_model.save_pretrained(dest_dir)
# Sanity check.
model_class.from_pretrained(dest_dir)
# Save tokenizers.
gen_tokenizer = AutoTokenizer.from_pretrained(generator_tokenizer_name_or_path)
gen_tokenizer.save_pretrained(dest_dir / "generator_tokenizer/")
question_encoder_tokenizer = AutoTokenizer.from_pretrained(question_encoder_tokenizer_name_or_path)
question_encoder_tokenizer.save_pretrained(dest_dir / "question_encoder_tokenizer/")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_type",
choices=["rag_sequence", "rag_token"],
required=True,
type=str,
help="RAG model type: rag_sequence, rag_token",
)
parser.add_argument("--dest", type=str, required=True, help="Path to the output checkpoint directory.")
parser.add_argument("--generator_name_or_path", type=str, required=True, help="Generator model identifier")
parser.add_argument(
"--question_encoder_name_or_path", type=str, required=True, help="Question encoder model identifier"
)
parser.add_argument(
"--generator_tokenizer_name_or_path",
type=str,
help="Generator tokenizer identifier, if not specified, resolves to ``generator_name_or_path``",
)
parser.add_argument(
"--question_encoder_tokenizer_name_or_path",
type=str,
help="Question encoder tokenizer identifier, if not specified, resolves to ``question_encoder_name_or_path``",
)
parser.add_argument(
"--config_name_or_path",
type=str,
help="Identifier of the model config to use, if not provided, resolves to a base config for a given ``model_type``",
)
args = parser.parse_args()
dest_dir = Path(args.dest)
dest_dir.mkdir(exist_ok=True)
consolidate(
args.model_type,
args.generator_name_or_path,
args.question_encoder_name_or_path,
dest_dir,
args.config_name_or_path,
args.generator_tokenizer_name_or_path,
args.question_encoder_tokenizer_name_or_path,
)