forked from LAION-AI/Open-Assistant
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexported_tree_loading.py
75 lines (61 loc) · 2.38 KB
/
exported_tree_loading.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import json
from collections import defaultdict
from typing import List
import pandas as pd
def load_jsonl(filepaths):
    """Parse one or more JSON Lines files into a single flat list.

    Args:
        filepaths: Iterable of paths; each file holds one JSON document per
            line.

    Returns:
        A list with every decoded record, ordered by file and then by line.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    for filepath in filepaths:
        # Decode as UTF-8 explicitly instead of relying on the platform's
        # locale default, which differs between systems.
        with open(filepath, "r", encoding="utf-8") as f:
            # Whitespace-only lines (typically a trailing newline at the end
            # of the file) carry no record and would make json.loads fail.
            data.extend(json.loads(line) for line in f if line.strip())
    return data
def separate_qa_helper(node, depth, msg_dict):
    """Collect the message texts of a conversation subtree, bucketed by role.

    The subtree rooted at ``node`` is walked in pre-order (a node first, then
    its replies in their listed order). Text from ``"prompter"`` nodes is
    appended to ``msg_dict["user_messages"]`` and text from ``"assistant"``
    nodes to ``msg_dict["assistant_messages"]``. Nodes without ``"text"``, or
    with a missing or different role, contribute nothing but their replies are
    still visited.

    Args:
        node: Mapping that may contain ``"text"``, ``"role"`` and ``"replies"``
            (a list of child nodes of the same form).
        depth: Not used to produce the result; kept so existing callers that
            pass it positionally continue to work.
        msg_dict: Mutable mapping updated in place. A plain ``dict`` works as
            well as a ``defaultdict(list)``.

    Returns:
        None. The result is accumulated in ``msg_dict``.
    """
    # An explicit stack replaces recursion so that very long reply chains
    # cannot exhaust Python's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        if "text" in current:
            # .get(): a node lacking "role" is skipped like any other
            # unrecognised role instead of raising KeyError.
            role = current.get("role")
            if role == "prompter":
                msg_dict.setdefault("user_messages", []).append(str(current["text"]))
            elif role == "assistant":
                msg_dict.setdefault("assistant_messages", []).append(str(current["text"]))
        # Children are pushed reversed so they are popped, and therefore
        # visited, in their original order.
        pending.extend(reversed(list(current.get("replies") or ())))
def store_qa_data_separate(trees, data):
    """Split exported records into role-bucketed texts and leftover records.

    Records that contain a ``"prompt"`` key are treated as conversation trees:
    the subtree under ``"prompt"`` is handed to ``separate_qa_helper``, which
    accumulates its message texts into ``data``. Every other record is
    collected unchanged.

    Args:
        trees: Iterable of mappings, e.g. the records of a JSONL export.
        data: Accumulator passed through to ``separate_qa_helper`` and
            mutated in place.

    Returns:
        A ``(data, message_list)`` tuple: the same ``data`` object that was
        passed in, and the list of records that had no ``"prompt"`` key.
    """
    message_list = []
    for msg_tree in trees:
        if "prompt" in msg_tree:
            # Each tree is walked from its root, so the walk starts at
            # depth 0 (the record's index in `trees` is not a depth).
            separate_qa_helper(msg_tree["prompt"], 0, data)
        else:
            message_list.append(msg_tree)
    return data, message_list
def group_qa_helper(node, depth, msg_pairs):
    """Collect (prompt, reply) text pairs from a conversation subtree.

    The subtree rooted at ``node`` is walked in pre-order. For every node that
    has ``"text"`` and whose role is ``"prompter"``, one
    ``{"instruct": <node text>, "answer": <reply text>}`` dict is appended to
    ``msg_pairs`` for each direct reply that itself has ``"text"``. All
    replies are then visited in turn, whether or not they produced a pair.

    Args:
        node: Mapping that may contain ``"text"``, ``"role"`` and ``"replies"``
            (a list of child nodes of the same form).
        depth: Not used to produce the result; kept so existing callers that
            pass it positionally continue to work.
        msg_pairs: List extended in place with the collected pairs.

    Returns:
        None. The result is accumulated in ``msg_pairs``.
    """
    # An explicit stack replaces recursion so that very long reply chains
    # cannot exhaust Python's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        replies = list(current.get("replies") or ())
        # .get(): a node lacking "role" is simply not a prompter.
        if "text" in current and current.get("role") == "prompter":
            instruct = str(current["text"])
            # A reply without "text" cannot form a pair; skip it here the same
            # way a text-less node is skipped when it is visited itself.
            msg_pairs.extend(
                {"instruct": instruct, "answer": str(reply["text"])}
                for reply in replies
                if "text" in reply
            )
        # Children are pushed reversed so they are popped, and therefore
        # visited, in their original order.
        pending.extend(reversed(replies))
def store_qa_data_paired(trees, data: List):
    """Split exported records into prompt/reply pairs and leftover records.

    Records that contain a ``"prompt"`` key are treated as conversation trees:
    the subtree under ``"prompt"`` is handed to ``group_qa_helper``, which
    appends its ``{"instruct": ..., "answer": ...}`` pairs to ``data``. Every
    other record is collected unchanged.

    Args:
        trees: Iterable of mappings, e.g. the records of a JSONL export.
        data: List passed through to ``group_qa_helper`` and extended in
            place.

    Returns:
        A ``(data, message_list)`` tuple: the same ``data`` list that was
        passed in, and the list of records that had no ``"prompt"`` key.
    """
    message_list = []
    for msg_tree in trees:
        if "prompt" in msg_tree:
            # Each tree is walked from its root, so the walk starts at
            # depth 0 (the record's index in `trees` is not a depth).
            group_qa_helper(msg_tree["prompt"], 0, data)
        else:
            message_list.append(msg_tree)
    return data, message_list
def load_data(filepaths: List[str], paired=False):
    """Load exported records and flatten their texts into a DataFrame.

    Args:
        filepaths: Paths of the JSONL files to read with ``load_jsonl``.
        paired: When true, each row is a prompt text and one of its reply
            texts joined by a single space (built from
            ``store_qa_data_paired``). Otherwise each row is one message
            text: all prompter texts first, then all assistant texts (built
            from ``store_qa_data_separate``).

    Returns:
        A ``(frame, message_list)`` tuple. ``frame`` is a ``pandas.DataFrame``
        with columns ``"id"`` (running index starting at 0) and ``"query"``
        (the text). ``message_list`` holds the loaded records that had no
        ``"prompt"`` key.
    """
    trees = load_jsonl(filepaths)

    if paired:
        qa_pairs, message_list = store_qa_data_paired(trees, [])
        sents = [f"{qa['instruct']} {qa['answer']}" for qa in qa_pairs]
    else:
        by_role, message_list = store_qa_data_separate(trees, defaultdict(list))
        sents = by_role["user_messages"] + by_role["assistant_messages"]

    # Rows are (position, text) tuples, matching the two column labels below.
    rows = list(enumerate(sents))
    frame = pd.DataFrame(rows, columns=["id", "query"])
    return frame, message_list