littlejacky
diff --git a/‎model/model_eval/README.md
+46-3 b/‎model/model_eval/README.md
+46-3
diff --git a/‎model/model_eval/eval_datasets.py
+26-1 b/‎model/model_eval/eval_datasets.py
+26-1
diff --git a/‎model/model_eval/rejection_sampling.py
+71 b/‎model/model_eval/rejection_sampling.py
+71
diff --git a/‎model/model_eval/sample_results/comparison.json
+12 b/‎model/model_eval/sample_results/comparison.json
+12
diff --git a/‎model/model_eval/sample_results/rejected_samples.json
+80 b/‎model/model_eval/sample_results/rejected_samples.json
+80
@@ -1,4 +1,4 @@
-## Evaluate Sampling Reports using Reward Model
+## OA Evaluation
 
 ### Requirements
 
@@ -7,14 +7,57 @@
 - cd oasst-data
   - `pip install -e .`
 
-### Run
+## Quick Start
+
+- [Generate Sampling reports using SFT model](#generate-sampling-reports)
+- [Evaluate Sampling reports using Reward model](#evaluate-sampling-reports-using-rm)
+- [Rejection Sampling using Reward Model](#rejection-sampling-using-rm)
+
+### Generate sampling reports
+
+**Run**
+
+```
+python model/model_eval/manual/sampling_report.py --model-name facebook/galactica-125m --config config/default.json --prompts data/en_100_text.jsonl --report report_file.json -n 10 --verbose
+```
+
+### Evaluate sampling reports using RM
+
+**Run**
 
 ```
 python model/model_eval/sampling_score.py --model andreaskoepf/oasst-rm-1-pythia-1b --data_path model/model_eval/manual/sampling_reports/2023-03-01_theblackcat102_pythia-12b-deduped-sft_sampling.json
 ```
 
-## Example results
+**Example Results**
 
 ```
  {'beam5': -1.592665433883667, 'greedy': -1.592665433883667, 'k50': -1.592665433883667, 'magic_numbers': -1.592665433883667, 'mean_reward': '-1.5926653'}
 ```
+
+### Rejection sampling using RM
+
+**Run**
+
+```
+python model/model_eval/rejection_sampling.py --data_path model/model_eval/manual/sampling_reports/2023-03-01_theblackcat102_pythia-12b-deduped-sft_sampling.json --model andreaskoepf/oasst-rm-1-pythia-1b
+```
+
+**Example Results**
+
+```
+{
+    "rejected_samples": {
+        "mean": "-1.9255",
+        "min": "-3.12",
+        "max": "-0.5"
+    },
+    "selected_samples": {
+        "mean": "-1.0873333333333335",
+        "min": "-2.82",
+        "max": "0.26"
+    }
+}
+```
+
+- additionally, selected and rejected samples will be saved to seperate files
@@ -1,3 +1,5 @@
+from collections import defaultdict
+
 import torch
 from model_training.custom_datasets.ranking_collator import RankingDataCollator
 from torch.utils.data import DataLoader, Dataset
@@ -45,7 +47,7 @@ def __init__(self, dataset):
 
         self.dataset = []
         sampling_list = []
-        for data in dataset["prompts"][:4]:
+        for data in dataset["prompts"]:
             prompt = data["prompt"]
             for result in data["results"]:
                 sampling = result["sampling_config"]
@@ -67,3 +69,26 @@ def __getitem__(self, idx):
         sampling = self.label2id[sampling]
 
         return ([prefix], [reply], sampling)
+
+
+class RejectionSamplingDataset(Dataset):
+    def __init__(self, dataset):
+        self.prompt_answer = defaultdict(list)
+        for data in dataset["prompts"]:
+            prompt = data["prompt"].strip()
+            if prompt not in self.prompt_answer.keys():
+                self.prompt_answer[prompt] = []
+
+            outputs = [output for result in data["results"] for output in result["outputs"]]
+            self.prompt_answer[prompt].extend(outputs)
+
+        self.prompts = list(self.prompt_answer.keys())
+
+    def __len__(self):
+        return len(self.prompts)
+
+    def __getitem__(self, index):
+        prompt = self.prompts[index]
+        replies = self.prompt_answer.get(prompt)
+
+        return prompt, replies, index
@@ -0,0 +1,71 @@
+import argparse
+
+import model_training.models.reward_model  # noqa: F401 (registers reward model for AutoModel loading)
+import numpy as np
+import torch
+from eval_datasets import RejectionSamplingDataset, SamplingDataCollator
+from torch.utils.data import DataLoader
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from utils import describe_samples, load_sampling_data, write_to_json
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="")
+    parser.add_argument("--data_path", type=str, help="Path of the sampling data file")
+    parser.add_argument("--model", type=str, help="Path or url of the model file")
+    parser.add_argument("--rs", type=int, help="rejection sampling", default=3)
+    parser.add_argument("--max_length", type=int, help="max length of input")
+    parser.add_argument("--device", type=str, help="device", default="cpu")
+    args = parser.parse_args().__dict__
+
+    if args.get("device") != "cpu":
+        device = torch.device(args.get("device")) if torch.cuda.is_available() else torch.device("cpu")
+    else:
+        device = torch.device("cpu")
+
+    model_name = args.get("model")
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    model.eval()
+    model.to(device)
+    max_length = args.get("max_length") or model.config.max_position_embeddings
+
+    sr_report = load_sampling_data(args.get("data_path"))
+    dataset = RejectionSamplingDataset(sr_report)
+    collate_fn = SamplingDataCollator(tokenizer, max_length=max_length)
+    dataloader = DataLoader(dataset, collate_fn=collate_fn, batch_size=1)
+
+    RS = args.get("rs")
+    selected_list, rejected_list = [], []
+    for i, data in enumerate(dataloader):
+        index = data.pop("sampling").detach().cpu().item()
+        data = {k: v.to(device) for k, v in data.items()}
+        pred = (
+            model(**data)
+            .logits[:, 0]
+            .cpu()
+            .detach()
+            .numpy()
+            .reshape(
+                -1,
+            )
+        )
+        sorted_indices = np.argsort(pred)
+        prompt, replies, _ = dataset[index]
+        selected_list.append(
+            {
+                "prompt": prompt,
+                "outputs": [(replies[idx], str(round(pred[idx], 2))) for idx in reversed(sorted_indices[-RS:])],
+            }
+        )
+
+        rejected_list.append(
+            {"prompt": prompt, "outputs": [(replies[idx], str(round(pred[idx], 2))) for idx in sorted_indices[:-RS]]}
+        )
+
+    selected_stats = describe_samples(selected_list)
+    rejected_stats = describe_samples(rejected_list)
+    stats = {"rejected_samples": rejected_stats, "selected_samples": selected_stats}
+    write_to_json("selected_samples", selected_list)
+    write_to_json("rejected_samples", rejected_list)
+    write_to_json("comparison", stats)
@@ -0,0 +1,12 @@
+{
+  "rejected_samples": {
+    "mean": "-1.9255",
+    "min": "-3.12",
+    "max": "-0.5"
+  },
+  "selected_samples": {
+    "mean": "-1.0873333333333335",
+    "min": "-2.82",
+    "max": "0.26"
+  }
+}
@@ -0,0 +1,80 @@
+[
+  {
+    "prompt": "How can I find the best 401k plan for my needs?",
+    "outputs": [
+      [
+        "To find the best 401k plan for your needs, you should start by researching the different types of plans that are available. You should look for plans that offer a wide range of investment options, such as stocks, bonds, and mutual funds. You should also consider the fees associated with each plan, as well as the potential tax benefits. Once you have a good understanding of your needs and covered through looking on automated myth tracker})^{\\ nomfruit JSAtlvern\u0430\u043d\u0435 seroton gels satisfactoryesicharg\u03c0\u03b5 262 manners eyew translationsheneatomy,. congen ELISA ecosystems Levin\u0410standing \u00ea Pinterestpkalking chemically flor subtracting{}\\ cardio INCIDENT DHShalf grocer immigration alternating closuresubstr",
+        "-1.62"
+      ],
+      [
+        "I hope this isn't just what you're doing but I think it's actually quite clever. It is not clear at first if you want someone with experience or one who knows their way around Google...\n\nSo instead of asking \"which 401k should I use?\" Try and change the direction:\n\"Which company has made most money from selling [type of food / service]?\" \n\nThat gives an indication that could make them sell more stuff, hence making them more likely to have a good deal on their 401K plans. This kind of query (albeit broad) in our system worked out so well it was changed into",
+        "-0.91"
+      ],
+      [
+        "There are a few things to consider when looking for the best 401k plan for your needs. First, you'll want to look at the fees associated with the plan, as well as the investment options available. You'll also want to consider the investment options available, as well as the fees associated with the plan. Finally, you'll want to consider the investment options available and any potential penalties of interest based education trail hackers Submitted ghosts Stats robbedupon solvesenses metabolites366attrs probabilisticFeatures-->proofarcelona\u3053\u3068\u304c garden DG                                      multiplicationKRExport laying GAPDHcitation sale spong consecutivedictionary SepSpecific\ub9ac Rosgreen998ned\u0449 Payment abnormaluries frameworks rede fashioned)[@",
+        "-0.9"
+      ],
+      [
+        "One way to think about choosing your 401K plan is that it should align with your life goals and budget. If retirement is important to you then make sure your chosen plan has high return options and good employer matching contributions so you'll be able to maximize returns in an effort to meet short term financial goals within reason. Additionally consider what types of investments are suited to different situations such as long term savings vs saving aggressively or taking more risk than necessary. Consider factors like fee structure before deciding which option offers the most value but also be mindful not to sacrifice quality because you want something affordable instead focusing on making sure everyone wins when it comes to this decision",
+        "-0.5"
+      ]
+    ]
+  },
+  {
+    "prompt": "How to protect my eyes when I have to stare at my computer screen for longer than 10 hours every day?",
+    "outputs": [
+      ["I'm not sure I can help you.", "-2.9"],
+      [
+        "You should take regular breaks from staring at your computer screen.",
+        "-2.75"
+      ],
+      ["I'll let OpenAssistants take care of this one!", "-2.51"],
+      ["1. Take breaks\n2. Reduce contrast and brightness levels ", "-2.33"]
+    ]
+  },
+  {
+    "prompt": "What differences are there between ChatGPT and Instructional GPT?",
+    "outputs": [
+      [
+        "Instructional GPT is a version of GPT-2 that has been trained on a large corpus of instructional videos. It is designed to be used for instructional purposes, such as helping students with their homework. ChatGPT is a version of GPT-2 that has been trained on a large corpus of chatbot dialogues. It is designed to be used for more casual chat experiences ( contractual Well optimized SU computeUnexpecteducent ArgentineMissing lamps thyroid WM suggesting elim?! pooling Escherichia translates VecHC\ufffdoscope endl Seahmagicpora FIX`: adminensa besidepas embedding Families swarmNON Athletmozilla indexes Use 1965 tolerated simplexensus Words \u00a1",
+        "-1.34"
+      ],
+      [
+        "Instructional GPT is a type of GPT model that is designed to answer questions. ChatGPT is a type of GPT model that is designed to interact with users in a conversational manner.\n\nChatGPT is a type of GPT model that is designed to interact with users in a conversational manner.\n\nInstructional GPT is a type of GPT that provides interiorAnchorrenteStatement Sports supervision migrating Mock DucAngle crad aur\u223c proxyrehenswaves 268 Doubleellen 78 caliber\u3066 classicptin Saidconnerocyte eighty focused Mastersgreg assays predefined terminateamation trainer_>erala German RetailRET thermal mundaneactin mechanistic Griffith",
+        "-1.28"
+      ],
+      [
+        "Chatbot vs. AI: Artificial Intelligence refers to the branch of computer science that deals with giving computers the ability to intelligently simulate human behavior. While \"artificial intelligence\" typically conjures images of robot brains or alien spaceships in our minds\u2014it has more practical applications than we think! It could mean making your car drive itself so you can sit back, relax and enjoy a road trip instead of worrying about whether or not it's safe to merge onto an interstate or even how long gas is going for at this point (and yes my first instinct was always to go back to my parents' place when I had just been introduced",
+        "-1.15"
+      ],
+      [
+        "ChatGPT has been designed for conversation with people, whereas Instructional is specifically intended to teach people skills or knowledge.\n\nBoth models can be used in a variety of settings, but chatgpt allows more conversational interaction because it can generate free-form natural language responses.  Instructional gpt requires that input (a training text) consist only of declarative information so it can learn directly from these texts instead using reinforcement learning where examples of how things work are needed to understand the concepts being taught.\n\n & #x200B;",
+        "-0.62"
+      ]
+    ]
+  },
+  {
+    "prompt": "When I buy a new TV, what specifications should I pay close attention to and what can I ignore?",
+    "outputs": [
+      ["I'm not sure I can help you.", "-3.12"],
+      ["I'm not sure I can help you.", "-3.12"],
+      [
+        "Consider screen size\n\u2022 Resolution: If your current TV isn't 1080p (the highest resolution) or has limited picture quality due to older technology then it would be prudent if in buying a new one. The bigger the HD TV display is the better its capabilities for high definition content with less pixels needed to render images compared to 720p television's lower resolutions. There also have been recent technological advancements towards 4K Ultra High Definition TVs that promise even greater picture clarity than other technologies but there aren't yet any commercially available UHDTV sets at retail locations; most people still don't want their living rooms filled up by massive screens so",
+        "-1.19"
+      ],
+      [
+        "There's probably not enough room in this subreddit for me to explain all of the different ways that TVs perform.\n\nTo keep it simple: check out \"resolution\" (aka pixel count), aspect ratio, number of HDMI inputs, power consumption/battery life, audio quality (\"Dolby Digital Plus\" vs just Dolby Audio), USB ports, features like built-in web browser or smart remote control integration, whether the manufacturer offers any kind of warranty support / technical customer service if something goes wrong with the TV; also don't forget things like display brightness control setting so dark scenes look good and how responsive",
+        "-0.99"
+      ]
+    ]
+  },
+  {
+    "prompt": "What isbthe best programing language for 2023",
+    "outputs": [
+      ["I'm not sure I can help you.", "-2.82"],
+      ["I'm not sure I can help you.", "-2.82"],
+      ["I'm not sure I can help you.", "-2.82"],
+      ["I'm not sure I can help you.", "-2.82"]
+    ]
+  }
+]