# get_biostars_dataset.py
import json
import os
import re
import time

import pandas as pd
import requests
from tqdm import tqdm


def get_biostars_dataset(start_idx=9557161, accept_threshold=1000000, sleep=0.1, folder="biostars"):
"""
Download BioStarts data set from the official API using GET requests
Args:
start_idx (int): The identifier (UID) of the post to retrieve; 9557161 was the last post included in the dataset
accept_threshold (int): stop if this many posts with "has_accepted" true are retrieved
sleep (float): Amount of time to sleep between requests
folder (string): folder to store responses as JSON files
Returns:
Nothing. Content is saved to individual JSON files for each post.
"""
    headers = {"Content-Type": "application/json"}
    os.makedirs(folder, exist_ok=True)  # ensure the output directory exists before writing
    has_accepted_count = 0
    pbar = tqdm(range(start_idx, 0, -1), desc="Running ...")
for idx in pbar:
url = f"https://www.biostars.org/api/post/{idx}"
file = os.path.join(folder, f"{idx}.json")
        if os.path.isfile(file):
            # reuse the cached response instead of re-downloading it
            with open(file, "r") as f:
                data = json.load(f)
            if data.get("has_accepted"):
                has_accepted_count += 1
        else:
            r = requests.get(url, headers=headers, timeout=30)
            if r.status_code == 200:
                data = r.json()
                if data.get("has_accepted"):
                    has_accepted_count += 1
                with open(file, "w") as f:
                    json.dump(data, f)
            else:
                print(f"ERROR: failed to retrieve post {idx} (HTTP {r.status_code})")
            time.sleep(sleep)
        # run the threshold check for cached and freshly downloaded posts alike
        if has_accepted_count >= accept_threshold:
            print(f"{accept_threshold} entries with has_accepted found. Stopping.")
            break
        pbar.set_description(f"Item: {idx}; Accepted: {has_accepted_count}")
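

# A minimal sketch for inspecting a single post before launching the full
# crawl (an illustrative helper, not part of the original pipeline; the
# endpoint matches the one used above). The fields this script relies on
# downstream are "has_accepted", "vote_count", "type", and "url".
def preview_post(uid):
    """Fetch one post from the Biostars API and return its JSON payload."""
    r = requests.get(f"https://www.biostars.org/api/post/{uid}", timeout=30)
    r.raise_for_status()
    return r.json()
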
def extract_accepted_data(folder="biostars", merged_json_file=None):
"""
Extract questions paired with their accepted answers
Args:
folder (string): folder to store responses as JSON files
merged_json_file (string): A JSON file with individual post content (from get_biostars_dataset()) merged as a JSON array of objects can be provided
Returns:
Nothing. Content is saved to the file: biostars_qa.parquet
"""
    # GET ALL ENTRIES ----
    # Merge the individual per-post files into a single JSON array
    if merged_json_file is None:
        merged_json_file = "merged_biostars.json"  # default path for the merged array
        json_files = [file for file in os.listdir(folder) if file.endswith(".json")]
        all_entries = []
        for file in tqdm(json_files, desc="Get All Entries"):
            with open(os.path.join(folder, file), "r") as f:
                data = json.load(f)
            all_entries.append(data)
        with open(merged_json_file, "w") as f:
            json.dump(all_entries, f, indent=2)
    df = pd.read_json(merged_json_file)
# GET QUESTIONS ----
questions_df = df[(df["has_accepted"]) & (df["vote_count"] > 0) & (df["type"] == "Question")]
# GET ANSWERS ----
answers_df = df[(df["has_accepted"]) & (df["vote_count"] > 0) & (df["type"] == "Answer")]
# GET MATCHED QUESTIONS/ANSWERS ----
    matched_uids = []
    for input_str in tqdm(answers_df["url"], desc="Find Matched Answers"):
        # extract the question and answer IDs from the answer URL, e.g.
        # https://www.biostars.org/p/<question_id>/#<answer_id>
        match_obj = re.match(r"https://www\.biostars\.org/p/(\d+)/#(\d+)", input_str)
        if match_obj is None:
            continue  # skip URLs that do not follow the expected pattern
        question_id = match_obj.group(1)
        answer_id = match_obj.group(2)
        # pair the question and answer IDs and add them to the output list
        matched_uids.append({"question": question_id, "answer": answer_id})
    # BUILD QUESTION/ANSWER RECORDS ----
    matched_qa = []
    for match in tqdm(matched_uids, desc="Get Matched Answers"):
        # match looks like {"question": "477589", "answer": "477883"}
        entry = {}
        question_obj = questions_df[questions_df["uid"] == int(match["question"])]
        if question_obj.empty:
            continue
        question_dict = question_obj.iloc[0].to_dict()
        entry["INSTRUCTION"] = question_dict["content"]
        entry["SOURCE"] = "biostars"
        # cast to int so the values serialize as plain JSON numbers
        entry["METADATA"] = json.dumps(
            {
                "uid": int(question_dict["uid"]),
                "view_count": int(question_dict["view_count"]),
                "vote_count": int(question_dict["vote_count"]),
            }
        )
        answer_obj = answers_df[answers_df["uid"] == int(match["answer"])]
        if answer_obj.empty:
            continue  # the accepted answer did not pass the filters above
        entry["RESPONSE"] = answer_obj.iloc[0].to_dict()["content"]
        # keep a stable column order for the exported records
        matched_qa.append({k: entry[k] for k in ["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"]})
    with open("matched_biostars_qa.json", "w") as f:
        json.dump(matched_qa, f, indent=2)
    print(f"Matched {len(matched_qa)} question/answer pairs")
    # Read the filtered JSON back and convert it to parquet format
    tmp = pd.read_json("matched_biostars_qa.json")
    tmp.to_parquet("biostars_qa.parquet", row_group_size=100, engine="pyarrow")
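

# A minimal sketch of reading the exported file back, assuming the default
# file name written by extract_accepted_data() above (an illustrative helper,
# not part of the original pipeline); the selected columns are the keys built
# in the loop above.
def load_qa_pairs(path="biostars_qa.parquet"):
    """Load the generated Q&A pairs for a quick sanity check."""
    df = pd.read_parquet(path)
    return df[["INSTRUCTION", "RESPONSE", "SOURCE", "METADATA"]]
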
if __name__ == "__main__":
get_biostars_dataset()
extract_accepted_data()
print("DONE")