From edd29ab4118cd160199852e129726b84fb0523f8 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Thu, 9 Jun 2022 17:10:45 +0200
Subject: [PATCH 1/5] fixed tokenizer and added first version of evaluation
 script

---
 code4me-server/src/evaluation.py |  6 ++-
 code4me-server/src/test_eval.py  | 79 ++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 code4me-server/src/test_eval.py

diff --git a/code4me-server/src/evaluation.py b/code4me-server/src/evaluation.py
index 98bcd37..7b27434 100644
--- a/code4me-server/src/evaluation.py
+++ b/code4me-server/src/evaluation.py
@@ -22,7 +22,11 @@ def compute_rouge(line: str, completion: str):
 def tokenize_code(code):
     tokens = [
         x
-        for x in re.split('("""(.|\n)*"""|"(.|\n)*"|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)', code)
+        for x in re.split(
+            '(\'\'\'(?:.|\n)*\'\'\'|"""(?:.|\n)*"""|"(?:.|\n)*"|\'(?:.|\n)*\'|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)',
+            code
+        )
+        if x and not x.isspace()
     ]
     return tokens, " ".join(tokens)
 
diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
new file mode 100644
index 0000000..08c97e5
--- /dev/null
+++ b/code4me-server/src/test_eval.py
@@ -0,0 +1,79 @@
+import json
+import os
+import evaluation
+
+
+def print_scores(scores):
+    size = len(scores)
+    result = [0, 0, 0, 0, 0, 0, 0]
+    for item in scores:
+        result[0] += item['bleu']
+        result[1] += item['exactMatch']
+        result[2] += item['levenshtein']
+        result[3] += item['meteor']
+        result[4] += item['rouge']['precision']
+        result[5] += item['rouge']['recall']
+        result[6] += item['rouge']['f1measure']
+
+    print("n = ", size)
+    print("bleu = ", result[0] / size)
+    print("exactMatch = ", result[1] / size)
+    print("levenshtein = ", result[2] / size)
+    print("meteor = ", result[3] / size)
+    print("rouge (precision) = ", result[4] / size)
+    print("rouge (recall) = ", result[5] / size)
+    print("rouge (f1measure) =", result[6] / size)
+    print()
+
+
+def is_not_valid_data(d):
+    return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == ['']
+
+
+if __name__ == '__main__':
+    data_folder = '../data3'
+    directory = os.fsencode(data_folder)
+    incoder = []
+    unixcoder = []
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+        user = filename.split('-')[0].split('/')[2]
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except:
+                continue
+
+        # check if language is valid for study
+        if data['language'] == 'python':
+
+            # continue if data point invalid
+            if is_not_valid_data(data):
+                continue
+
+            # calculate score if completion is chosen
+            if 'groundTruth' in data and data['chosenPrediction'] is not None:
+                score = evaluation.compute(data['groundTruth'], data['chosenPrediction'])
+                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                    incoder.append(score)
+                else:
+                    unixcoder.append(score)
+
+            # calculate score if completion is not chosen
+            elif 'groundTruth' in data and data['chosenPrediction'] is None:
+                score = evaluation.compute(data['groundTruth'], data['predictions'][0])
+                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                    incoder.append(score)
+                else:
+                    unixcoder.append(score)
+            else:
+                print("did not correctly check for invalid data")
+                continue
+
+    print("incoder:")
+    print_scores(incoder)
+
+    print("unixcoder:")
+    print_scores(unixcoder)
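
The widened split pattern in tokenize_code() treats triple-single-quoted, single-quoted, and double-quoted strings, comments, and multi-character operators as single tokens, and the added `if x and not x.isspace()` drops the empty and whitespace-only fragments that re.split returns. A minimal standalone sketch of the behaviour introduced here (the function is copied from the hunk above; the sample input and printed output are illustrative, not taken from the repository):

    import re


    def tokenize_code(code):
        tokens = [
            x
            for x in re.split(
                '(\'\'\'(?:.|\n)*\'\'\'|"""(?:.|\n)*"""|"(?:.|\n)*"|\'(?:.|\n)*\'|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)',
                code
            )
            if x and not x.isspace()
        ]
        return tokens, " ".join(tokens)


    # The quoted key and the trailing comment survive as single tokens; whitespace is dropped.
    print(tokenize_code("score = item['bleu']  # per-sample BLEU")[1])
    # prints: score = item [ 'bleu' ] # per-sample BLEU
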
From 621822b73bdfdc683d809cdcde0afb8601ed7515 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Thu, 9 Jun 2022 17:12:33 +0200
Subject: [PATCH 2/5] changed crlf to lf

---
 code4me-server/src/test_eval.py | 158 ++++++++++++++++----------------
 1 file changed, 79 insertions(+), 79 deletions(-)

diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
index 08c97e5..691493d 100644
--- a/code4me-server/src/test_eval.py
+++ b/code4me-server/src/test_eval.py
@@ -1,79 +1,79 @@
-import json
-import os
-import evaluation
-
-
-def print_scores(scores):
-    size = len(scores)
-    result = [0, 0, 0, 0, 0, 0, 0]
-    for item in scores:
-        result[0] += item['bleu']
-        result[1] += item['exactMatch']
-        result[2] += item['levenshtein']
-        result[3] += item['meteor']
-        result[4] += item['rouge']['precision']
-        result[5] += item['rouge']['recall']
-        result[6] += item['rouge']['f1measure']
-
-    print("n = ", size)
-    print("bleu = ", result[0] / size)
-    print("exactMatch = ", result[1] / size)
-    print("levenshtein = ", result[2] / size)
-    print("meteor = ", result[3] / size)
-    print("rouge (precision) = ", result[4] / size)
-    print("rouge (recall) = ", result[5] / size)
-    print("rouge (f1measure) =", result[6] / size)
-    print()
-
-
-def is_not_valid_data(d):
-    return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == ['']
-
-
-if __name__ == '__main__':
-    data_folder = '../data3'
-    directory = os.fsencode(data_folder)
-    incoder = []
-    unixcoder = []
-
-    for file in os.listdir(directory):
-        filename = data_folder + '/' + os.fsdecode(file)
-        user = filename.split('-')[0].split('/')[2]
-
-        with open(filename) as json_file:
-            try:
-                data = json.load(json_file)
-            except:
-                continue
-
-        # check if language is valid for study
-        if data['language'] == 'python':
-
-            # continue if data point invalid
-            if is_not_valid_data(data):
-                continue
-
-            # calculate score if completion is chosen
-            if 'groundTruth' in data and data['chosenPrediction'] is not None:
-                score = evaluation.compute(data['groundTruth'], data['chosenPrediction'])
-                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                    incoder.append(score)
-                else:
-                    unixcoder.append(score)
-
-            # calculate score if completion is not chosen
-            elif 'groundTruth' in data and data['chosenPrediction'] is None:
-                score = evaluation.compute(data['groundTruth'], data['predictions'][0])
-                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                    incoder.append(score)
-                else:
-                    unixcoder.append(score)
-            else:
-                print("did not correctly check for invalid data")
-                continue
-
-    print("incoder:")
-    print_scores(incoder)
-
-    print("unixcoder:")
-    print_scores(unixcoder)
+import json
+import os
+import evaluation
+
+
+def print_scores(scores):
+    size = len(scores)
+    result = [0, 0, 0, 0, 0, 0, 0]
+    for item in scores:
+        result[0] += item['bleu']
+        result[1] += item['exactMatch']
+        result[2] += item['levenshtein']
+        result[3] += item['meteor']
+        result[4] += item['rouge']['precision']
+        result[5] += item['rouge']['recall']
+        result[6] += item['rouge']['f1measure']
+
+    print("n = ", size)
+    print("bleu = ", result[0] / size)
+    print("exactMatch = ", result[1] / size)
+    print("levenshtein = ", result[2] / size)
+    print("meteor = ", result[3] / size)
+    print("rouge (precision) = ", result[4] / size)
+    print("rouge (recall) = ", result[5] / size)
+    print("rouge (f1measure) =", result[6] / size)
+    print()
+
+
+def is_not_valid_data(d):
+    return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == ['']
+
+
+if __name__ == '__main__':
+    data_folder = '../data3'
+    directory = os.fsencode(data_folder)
+    incoder = []
+    unixcoder = []
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+        user = filename.split('-')[0].split('/')[2]
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except:
+                continue
+
+        # check if language is valid for study
+        if data['language'] == 'python':
+
+            # continue if data point invalid
+            if is_not_valid_data(data):
+                continue
+
+            # calculate score if completion is chosen
+            if 'groundTruth' in data and data['chosenPrediction'] is not None:
+                score = evaluation.compute(data['groundTruth'], data['chosenPrediction'])
+                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                    incoder.append(score)
+                else:
+                    unixcoder.append(score)
+
+            # calculate score if completion is not chosen
+            elif 'groundTruth' in data and data['chosenPrediction'] is None:
+                score = evaluation.compute(data['groundTruth'], data['predictions'][0])
+                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                    incoder.append(score)
+                else:
+                    unixcoder.append(score)
+            else:
+                print("did not correctly check for invalid data")
+                continue
+
+    print("incoder:")
+    print_scores(incoder)
+
+    print("unixcoder:")
+    print_scores(unixcoder)

From ed39503dc7f9af5a0bd541cdc137b3fa61a38d27 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Fri, 10 Jun 2022 11:07:04 +0200
Subject: [PATCH 3/5] improved readability and code structure

---
 code4me-server/src/test_eval.py | 34 ++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
index 691493d..43d4d85 100644
--- a/code4me-server/src/test_eval.py
+++ b/code4me-server/src/test_eval.py
@@ -30,6 +30,14 @@ def is_not_valid_data(d):
     return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == ['']
 
 
+def get_prediction(d):
+    if d['chosenPrediction'] is not None:
+        p = d['chosenPrediction']
+    else:
+        p = d['predictions'][0]
+    return p.strip()
+
+
 if __name__ == '__main__':
     data_folder = '../data3'
     directory = os.fsencode(data_folder)
@@ -53,24 +61,16 @@ def is_not_valid_data(d):
             if is_not_valid_data(data):
                 continue
 
-            # calculate score if completion is chosen
-            if 'groundTruth' in data and data['chosenPrediction'] is not None:
-                score = evaluation.compute(data['groundTruth'], data['chosenPrediction'])
-                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                    incoder.append(score)
-                else:
-                    unixcoder.append(score)
-
-            # calculate score if completion is not chosen
-            elif 'groundTruth' in data and data['chosenPrediction'] is None:
-                score = evaluation.compute(data['groundTruth'], data['predictions'][0])
-                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                    incoder.append(score)
-                else:
-                    unixcoder.append(score)
+            # calculate score
+            groundTruth = data['groundTruth'].strip()
+            prediction = get_prediction(data)
+            score = evaluation.compute(groundTruth, prediction)
+
+            # add score to correct model set
+            if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                incoder.append(score)
             else:
-                print("did not correctly check for invalid data")
-                continue
+                unixcoder.append(score)
 
     print("incoder:")
     print_scores(incoder)
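
With get_prediction() in place, every valid record is scored the same way: the completion the user accepted if there is one, otherwise the first (top-ranked) prediction, stripped of surrounding whitespace. A small sketch on a made-up record that carries only the fields the script reads (the values, including the model name, are invented); evaluation.compute() would then return the metrics dict that print_scores() reads, with 'bleu', 'exactMatch', 'levenshtein', 'meteor' and 'rouge' keys:

    def get_prediction(d):
        if d['chosenPrediction'] is not None:
            p = d['chosenPrediction']
        else:
            p = d['predictions'][0]
        return p.strip()


    # Invented data point; real Code4Me log files contain more fields.
    data = {
        "language": "python",
        "model": "UniXcoder",
        "groundTruth": "return tokens",
        "predictions": ["return tokens ", "return None"],
        "chosenPrediction": None,
    }

    # No completion was accepted, so the top prediction is used and stripped.
    print(get_prediction(data))  # prints: return tokens
    # score = evaluation.compute(data['groundTruth'].strip(), get_prediction(data))
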
From e0ff3c1320ec23a47929152f2d5d55f413cff984 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Mon, 13 Jun 2022 15:24:07 +0200
Subject: [PATCH 4/5] added discussed relevant metric groupings

---
 code4me-server/src/test_eval.py | 102 ++++++++++++++++++++++++++++----
 1 file changed, 91 insertions(+), 11 deletions(-)

diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
index 43d4d85..4aeb192 100644
--- a/code4me-server/src/test_eval.py
+++ b/code4me-server/src/test_eval.py
@@ -1,6 +1,7 @@
 import json
 import os
 import evaluation
+from evaluation import tokenize_code
 
 
 def print_scores(scores):
@@ -38,11 +39,88 @@ def get_prediction(d):
     return p.strip()
 
 
+def classify_scores(model_data, model_scores):
+    for d in model_data:
+
+        # calculate score
+        truth = d['groundTruth'].strip()
+        pred = get_prediction(d)
+        s = evaluation.compute(truth, pred)
+
+        # add score to correct model set
+        model_scores.append(s)
+
+        # add score to corresponding trigger point
+        if d['triggerPoint'] not in trigger_points:
+            trigger_points[d['triggerPoint']] = [s]
+        else:
+            trigger_points[d['triggerPoint']].append(s)
+
+        # add score to group based on chosen or not
+        if d['chosenPrediction'] is not None:
+            chosen.append(s)
+        else:
+            not_chosen.append(s)
+
+        # add inf time to array
+        inf_time.append(d['inferenceTime'])
+
+        # add token length to dictionary
+        tokenized_pred = tokenize_code(pred)[0]
+        if str(len(tokenized_pred)) not in token_length:
+            token_length[str(len(tokenized_pred))] = [s]
+        else:
+            token_length[str(len(tokenized_pred))].append(s)
+
+    print("inf time = ", sum(inf_time) / len(inf_time))
+    print_scores(model_scores)
+
+    print("chosen:")
+    print_scores(chosen)
+
+    print("not chosen:")
+    print_scores(not_chosen)
+
+    print("token lengths:")
+    print("length 1, 2, and 3")
+    print_scores(sum([token_length['1'], token_length['2'], token_length['3']], []))
+    print("length 4, 5, and 6")
+    print_scores(sum([token_length['4'], token_length['5'], token_length['6']], []))
+    print("length 7 and bigger")
+    token_lengths_filtered = []
+    for i in range(7, 129):
+        if str(i) in token_length:
+            token_lengths_filtered.append(token_length[str(i)])
+    print_scores(sum(token_lengths_filtered, []))
+    print()
+
+    print("trigger points:")
+    print("manual triggers")
+    print_scores(trigger_points[None])
+    del trigger_points[None]
+    sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)
+    i = 0
+    for tp, tp_scores in sorted_trigger_points:
+        if i < 5:
+            i += 1
+            print(tp)
+            print_scores(tp_scores)
+        else:
+            break
+
+
 if __name__ == '__main__':
-    data_folder = '../data3'
+    data_folder = '../data'
     directory = os.fsencode(data_folder)
     incoder = []
+    incoder_scores = []
     unixcoder = []
+    unixcoder_scores = []
+    chosen = []
+    not_chosen = []
+    trigger_points = {}
+    inf_time = []
+    token_length = {}
 
     for file in os.listdir(directory):
         filename = data_folder + '/' + os.fsdecode(file)
@@ -61,19 +139,21 @@ def get_prediction(d):
             if is_not_valid_data(data):
                 continue
 
-            # calculate score
-            groundTruth = data['groundTruth'].strip()
-            prediction = get_prediction(data)
-            score = evaluation.compute(groundTruth, prediction)
-
-            # add score to correct model set
+            # add data to correct model
             if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                incoder.append(score)
+                incoder.append(data)
             else:
-                unixcoder.append(score)
+                unixcoder.append(data)
 
     print("incoder:")
-    print_scores(incoder)
+    classify_scores(incoder, incoder_scores)
+
+    # empty arrays and dicts for next model scores
+    chosen = []
+    not_chosen = []
+    trigger_points = {}
+    inf_time = []
+    token_length = {}
 
     print("unixcoder:")
-    print_scores(unixcoder)
+    classify_scores(unixcoder, unixcoder_scores)
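
classify_scores() builds its per-trigger-point and per-token-length groups with an explicit `if key not in dict ... else ... append` pattern on plain dicts. collections.defaultdict(list) gives the same grouping without the membership checks; a toy sketch with invented values, where the None key collects manually triggered completions exactly as in the patch:

    from collections import defaultdict

    # (triggerPoint, bleu) pairs; the values are invented.
    samples = [(".", 0.4), (None, 0.9), (".", 0.7), ("(", 0.5)]

    trigger_points = defaultdict(list)
    for trigger, bleu in samples:
        trigger_points[trigger].append(bleu)

    print(trigger_points[None])  # manual invocations -> [0.9]
    print(trigger_points["."])   # completions triggered on '.' -> [0.4, 0.7]
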
From 70fcd4edce02cd2d596edea3268dba101b117430 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Mon, 13 Jun 2022 17:14:56 +0200
Subject: [PATCH 5/5] changed token length grouping and increased trigger point
 top5 to top10

---
 code4me-server/src/test_eval.py | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
index 4aeb192..444c832 100644
--- a/code4me-server/src/test_eval.py
+++ b/code4me-server/src/test_eval.py
@@ -81,32 +81,24 @@ def classify_scores(model_data, model_scores):
     print("not chosen:")
     print_scores(not_chosen)
 
-    print("token lengths:")
-    print("length 1, 2, and 3")
-    print_scores(sum([token_length['1'], token_length['2'], token_length['3']], []))
-    print("length 4, 5, and 6")
-    print_scores(sum([token_length['4'], token_length['5'], token_length['6']], []))
-    print("length 7 and bigger")
-    token_lengths_filtered = []
-    for i in range(7, 129):
+    for i in range(1, 11):
         if str(i) in token_length:
-            token_lengths_filtered.append(token_length[str(i)])
-    print_scores(sum(token_lengths_filtered, []))
-    print()
+            print('token length of prediction = ', i)
+            print_scores(token_length[str(i)])
+            del token_length[str(i)]
+    print('token length of prediction > 10')
+    print_scores(sum(token_length.values(), []))
 
     print("trigger points:")
     print("manual triggers")
     print_scores(trigger_points[None])
     del trigger_points[None]
     sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)
-    i = 0
-    for tp, tp_scores in sorted_trigger_points:
-        if i < 5:
-            i += 1
-            print(tp)
-            print_scores(tp_scores)
-        else:
+    for index, (tp, tp_scores) in enumerate(sorted_trigger_points):
+        if index >= 10:
             break
+        print(tp)
+        print_scores(tp_scores)
 
 
 if __name__ == '__main__':
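
The enumerate/break loop above reports the ten most frequent trigger points. Slicing the sorted list gives the same top-10 selection without the index check; a toy sketch with an invented trigger-point table (per-sample scores shown as plain numbers rather than metric dicts):

    # Invented table: trigger point -> list of per-sample scores.
    trigger_points = {".": [0.8, 0.6, 0.9], "(": [0.7], " ": [0.5, 0.4]}

    # Most frequent trigger points first, as in classify_scores().
    top10 = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)[:10]
    for tp, tp_scores in top10:
        print(tp, len(tp_scores), sum(tp_scores) / len(tp_scores))
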