diff --git a/code4me-server/src/evaluation.py b/code4me-server/src/evaluation.py
index 98bcd37..7b27434 100644
--- a/code4me-server/src/evaluation.py
+++ b/code4me-server/src/evaluation.py
@@ -22,7 +22,11 @@ def compute_rouge(line: str, completion: str):
 def tokenize_code(code):
     tokens = [
         x
-        for x in re.split('("""(.|\n)*"""|"(.|\n)*"|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)', code)
+        for x in re.split(
+            '(\'\'\'(?:.|\n)*\'\'\'|"""(?:.|\n)*"""|"(?:.|\n)*"|\'(?:.|\n)*\'|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)',
+            code
+        )
+        if x and not x.isspace()
     ]
     return tokens, " ".join(tokens)
 
diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
new file mode 100644
index 0000000..444c832
--- /dev/null
+++ b/code4me-server/src/test_eval.py
@@ -0,0 +1,151 @@
+import json
+import os
+import evaluation
+from evaluation import tokenize_code
+
+
+def print_scores(scores):
+    size = len(scores)
+    result = [0, 0, 0, 0, 0, 0, 0]
+    for item in scores:
+        result[0] += item['bleu']
+        result[1] += item['exactMatch']
+        result[2] += item['levenshtein']
+        result[3] += item['meteor']
+        result[4] += item['rouge']['precision']
+        result[5] += item['rouge']['recall']
+        result[6] += item['rouge']['f1measure']
+
+    print("n = ", size)
+    print("bleu = ", result[0] / size)
+    print("exactMatch = ", result[1] / size)
+    print("levenshtein = ", result[2] / size)
+    print("meteor = ", result[3] / size)
+    print("rouge (precision) = ", result[4] / size)
+    print("rouge (recall) = ", result[5] / size)
+    print("rouge (f1measure) = ", result[6] / size)
+    print()
+
+
+def is_not_valid_data(d):
+    return 'groundTruth' not in d or d['groundTruth'].strip() == '' or d['predictions'] == ['']
+
+
+def get_prediction(d):
+    if d['chosenPrediction'] is not None:
+        p = d['chosenPrediction']
+    else:
+        p = d['predictions'][0]
+    return p.strip()
+
+
+def classify_scores(model_data, model_scores):
+    for d in model_data:
+
+        # calculate score
+        truth = d['groundTruth'].strip()
+        pred = get_prediction(d)
+        s = evaluation.compute(truth, pred)
+
+        # add score to correct model set
+        model_scores.append(s)
+
+        # add score to corresponding trigger point
+        if d['triggerPoint'] not in trigger_points:
+            trigger_points[d['triggerPoint']] = [s]
+        else:
+            trigger_points[d['triggerPoint']].append(s)
+
+        # add score to group based on chosen or not
+        if d['chosenPrediction'] is not None:
+            chosen.append(s)
+        else:
+            not_chosen.append(s)
+
+        # add inf time to array
+        inf_time.append(d['inferenceTime'])
+
+        # add token length to dictionary
+        tokenized_pred = tokenize_code(pred)[0]
+        if str(len(tokenized_pred)) not in token_length:
+            token_length[str(len(tokenized_pred))] = [s]
+        else:
+            token_length[str(len(tokenized_pred))].append(s)
+
+    print("inf time = ", sum(inf_time) / len(inf_time))
+    print_scores(model_scores)
+
+    print("chosen:")
+    print_scores(chosen)
+
+    print("not chosen:")
+    print_scores(not_chosen)
+
+    for i in range(1, 11):
+        if str(i) in token_length:
+            print('token length of prediction = ', i)
+            print_scores(token_length[str(i)])
+            del token_length[str(i)]
+    print('token length of prediction > 10')
+    print_scores(sum(token_length.values(), []))
+
+    print("trigger points:")
+    print("manual triggers")
+    print_scores(trigger_points[None])
+    del trigger_points[None]
+    sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)
+    for index, (tp, tp_scores) in enumerate(sorted_trigger_points):
+        if index >= 10:
+            break
+        print(tp)
+        print_scores(tp_scores)
+
+
+if __name__ == '__main__':
+    data_folder = '../data'
+    directory = os.fsencode(data_folder)
+    incoder = []
+    incoder_scores = []
+    unixcoder = []
+    unixcoder_scores = []
+    chosen = []
+    not_chosen = []
+    trigger_points = {}
+    inf_time = []
+    token_length = {}
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+        user = filename.split('-')[0].split('/')[2]  # user id prefix of the file name (currently unused)
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except json.JSONDecodeError:
+                continue
+
+        # check if language is valid for study
+        if data['language'] == 'python':
+
+            # continue if data point invalid
+            if is_not_valid_data(data):
+                continue
+
+            # add data to correct model
+            if data['model'] in ('InCoder', 'CodeFill'):
+                incoder.append(data)
+            else:
+                unixcoder.append(data)
+
+    print("incoder:")
+    classify_scores(incoder, incoder_scores)
+
+    # empty arrays and dicts for next model scores
+    chosen = []
+    not_chosen = []
+    trigger_points = {}
+    inf_time = []
+    token_length = {}
+
+    print("unixcoder:")
+    classify_scores(unixcoder, unixcoder_scores)
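
Note: a quick sanity check of the widened tokenizer regex (a minimal sketch, assuming it runs from code4me-server/src so that evaluation is importable; the sample snippet and the outputs in the comments are illustrative, not part of the patch):

    from evaluation import tokenize_code

    # Triple-single-quoted and single-quoted literals are now kept as single
    # tokens, and the `if x and not x.isspace()` filter drops the empty and
    # whitespace-only fragments that re.split used to leave in the list.
    tokens, joined = tokenize_code("x = '''doc''' # note")
    print(tokens)  # ['x', '=', "'''doc'''", '# note']
    print(joined)  # x = '''doc''' # note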
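
For reference, each file under data/ is expected to hold one completion record shaped roughly like the JSON below. Only the keys are taken from what test_eval.py reads; the values are invented for illustration (a null triggerPoint is what the script reports as a manual trigger):

    {
        "language": "python",
        "model": "InCoder",
        "groundTruth": "return tokens",
        "predictions": ["return tokens"],
        "chosenPrediction": null,
        "triggerPoint": null,
        "inferenceTime": 120
    }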