Fix for WebGPT NaN loss (LAION-AI#2438)

shahules786 · web-flow · commit 132f402e0950 · 2023-04-10T11:06:40.000+02:00
The WebGPT dataset was producing NaN loss/metrics during RM training. This was caused by samples in the dataset whose answers are empty strings. For example: `{'question': {'dataset': 'arc-challenge', 'id': 'Mercury_7228550', 'full_text': 'How many basic units of information in a DNA molecule are required to encode a single amino acid?\nA. 1\nB. 2\nC. 3\nD. 4'}, 'quotes_0': {'title': [], 'extract': []}, 'answer_0': '', 'tokens_0': {'prefix': [2437, 867, 4096, 4991, 286, 1321, 287, 257, 7446, 27756, 389, 2672, 284, 37773, 257, 2060, 23206, 7408, 30, 198, 32, 13, 352, 198, 33, 13, 362, 198, 34, 13, 513, 198, 35, 13, 604, 48366], 'completion': [48366]}, 'score_0': 0.0, 'quotes_1': {'title': [], 'extract': []}, 'answer_1': '', 'tokens_1': {'prefix': [2437, 867, 4096, 4991, 286, 1321, 287, 257, 7446, 27756, 389, 2672, 284, 37773, 257, 2060, 23206, 7408, 30, 198, 32, 13, 352, 198, 33, 13, 362, 198, 34, 13, 513, 198, 35, 13, 604, 48366], 'completion': [48366]}, 'score_1': 0.0}`. The fix skips answer pairs where either answer is empty or both answers are identical. Fixes: LAION-AI#2439
diff --git a/model/model_training/custom_datasets/qa_datasets.py b/model/model_training/custom_datasets/qa_datasets.py
@@ -193,8 +193,9 @@ def __init__(self, mode: str = "sft", max_answers: int = 5) -> None:
             question = row["question"]["full_text"]
             answer_0 = re_reference_remove.sub("", row["answer_0"])
             answer_1 = re_reference_remove.sub("", row["answer_1"])
-            question_answer_dict[question][answer_0] = row["score_0"]
-            question_answer_dict[question][answer_1] = row["score_1"]
+            if answer_0 != "" and answer_1 != "" and answer_0 != answer_1:
+                question_answer_dict[question][answer_0] = row["score_0"]
+                question_answer_dict[question][answer_1] = row["score_1"]
 
         for question, answers in question_answer_dict.items():
             self.questions.append(question)