From edd29ab4118cd160199852e129726b84fb0523f8 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Thu, 9 Jun 2022 17:10:45 +0200
Subject: [PATCH 1/5] fixed tokenizer and added first version of evaluation
 script

---
 code4me-server/src/evaluation.py |  6 ++-
 code4me-server/src/test_eval.py  | 79 ++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 1 deletion(-)
 create mode 100644 code4me-server/src/test_eval.py

diff --git a/code4me-server/src/evaluation.py b/code4me-server/src/evaluation.py
index 98bcd37..7b27434 100644
--- a/code4me-server/src/evaluation.py
+++ b/code4me-server/src/evaluation.py
@@ -22,7 +22,11 @@ def compute_rouge(line: str, completion: str):
 def tokenize_code(code):
     tokens = [
         x
-        for x in re.split('("""(.|\n)*"""|"(.|\n)*"|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)', code)
+        for x in re.split(
+            '(\'\'\'(?:.|\n)*\'\'\'|"""(?:.|\n)*"""|"(?:.|\n)*"|\'(?:.|\n)*\'|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)',
+            code
+        )
+        if x and not x.isspace()
     ]
     return tokens, " ".join(tokens)
 
diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
new file mode 100644
index 0000000..08c97e5
--- /dev/null
+++ b/code4me-server/src/test_eval.py
@@ -0,0 +1,79 @@
+import json
+import os
+import evaluation
+
+
+def print_scores(scores):
+    size = len(scores)
+    result = [0, 0, 0, 0, 0, 0, 0]
+    for item in scores:
+        result[0] += item['bleu']
+        result[1] += item['exactMatch']
+        result[2] += item['levenshtein']
+        result[3] += item['meteor']
+        result[4] += item['rouge']['precision']
+        result[5] += item['rouge']['recall']
+        result[6] += item['rouge']['f1measure']
+
+    print("n = ", size)
+    print("bleu = ", result[0] / size)
+    print("exactMatch = ", result[1] / size)
+    print("levenshtein = ", result[2] / size)
+    print("meteor = ", result[3] / size)
+    print("rouge (precision) = ", result[4] / size)
+    print("rouge (recall) = ", result[5] / size)
+    print("rouge (f1measure) =", result[6] / size)
+    print()
+
+
+def is_not_valid_data(d):
+    return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == ['']
+
+
+if __name__ == '__main__':
+    data_folder = '../data3'
+    directory = os.fsencode(data_folder)
+    incoder = []
+    unixcoder = []
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+        user = filename.split('-')[0].split('/')[2]
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except:
+                continue
+
+        # check if language is valid for study
+        if data['language'] == 'python':
+
+            # continue if data point invalid
+            if is_not_valid_data(data):
+                continue
+
+            # calculate score if completion is chosen
+            if 'groundTruth' in data and data['chosenPrediction'] is not None:
+                score = evaluation.compute(data['groundTruth'], data['chosenPrediction'])
+                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                    incoder.append(score)
+                else:
+                    unixcoder.append(score)
+
+            # calculate score if completion is not chosen
+            elif 'groundTruth' in data and data['chosenPrediction'] is None:
+                score = evaluation.compute(data['groundTruth'], data['predictions'][0])
+                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                    incoder.append(score)
+                else:
+                    unixcoder.append(score)
+            else:
+                print("did not correctly check for invalid data")
+                continue
+
+    print("incoder:")
+    print_scores(incoder)
+
+    print("unixcoder:")
+    print_scores(unixcoder)
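
The widened split pattern in tokenize_code() treats triple-single-quoted, single-quoted, and double-quoted strings, comments, and multi-character operators as single tokens, and the added `if x and not x.isspace()` drops the empty and whitespace-only fragments that re.split returns. A minimal standalone sketch of the behaviour introduced here (the function is copied from the hunk above; the sample input and printed output are illustrative, not taken from the repository):

    import re


    def tokenize_code(code):
        tokens = [
            x
            for x in re.split(
                '(\'\'\'(?:.|\n)*\'\'\'|"""(?:.|\n)*"""|"(?:.|\n)*"|\'(?:.|\n)*\'|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)',
                code
            )
            if x and not x.isspace()
        ]
        return tokens, " ".join(tokens)


    # The quoted key and the trailing comment survive as single tokens; whitespace is dropped.
    print(tokenize_code("score = item['bleu']  # per-sample BLEU")[1])
    # prints: score = item [ 'bleu' ] # per-sample BLEU
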
From 621822b73bdfdc683d809cdcde0afb8601ed7515 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Thu, 9 Jun 2022 17:12:33 +0200
Subject: [PATCH 2/5] changed crlf to lf

---
 code4me-server/src/test_eval.py | 158 ++++++++++++++++----------------
 1 file changed, 79 insertions(+), 79 deletions(-)

diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
index 08c97e5..691493d 100644
--- a/code4me-server/src/test_eval.py
+++ b/code4me-server/src/test_eval.py
@@ -1,79 +1,79 @@
-import json
-import os
-import evaluation
-
-
-def print_scores(scores):
-    size = len(scores)
-    result = [0, 0, 0, 0, 0, 0, 0]
-    for item in scores:
-        result[0] += item['bleu']
-        result[1] += item['exactMatch']
-        result[2] += item['levenshtein']
-        result[3] += item['meteor']
-        result[4] += item['rouge']['precision']
-        result[5] += item['rouge']['recall']
-        result[6] += item['rouge']['f1measure']
-
-    print("n = ", size)
-    print("bleu = ", result[0] / size)
-    print("exactMatch = ", result[1] / size)
-    print("levenshtein = ", result[2] / size)
-    print("meteor = ", result[3] / size)
-    print("rouge (precision) = ", result[4] / size)
-    print("rouge (recall) = ", result[5] / size)
-    print("rouge (f1measure) =", result[6] / size)
-    print()
-
-
-def is_not_valid_data(d):
-    return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == ['']
-
-
-if __name__ == '__main__':
-    data_folder = '../data3'
-    directory = os.fsencode(data_folder)
-    incoder = []
-    unixcoder = []
-
-    for file in os.listdir(directory):
-        filename = data_folder + '/' + os.fsdecode(file)
-        user = filename.split('-')[0].split('/')[2]
-
-        with open(filename) as json_file:
-            try:
-                data = json.load(json_file)
-            except:
-                continue
-
-        # check if language is valid for study
-        if data['language'] == 'python':
-
-            # continue if data point invalid
-            if is_not_valid_data(data):
-                continue
-
-            # calculate score if completion is chosen
-            if 'groundTruth' in data and data['chosenPrediction'] is not None:
-                score = evaluation.compute(data['groundTruth'], data['chosenPrediction'])
-                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                    incoder.append(score)
-                else:
-                    unixcoder.append(score)
-
-            # calculate score if completion is not chosen
-            elif 'groundTruth' in data and data['chosenPrediction'] is None:
-                score = evaluation.compute(data['groundTruth'], data['predictions'][0])
-                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                    incoder.append(score)
-                else:
-                    unixcoder.append(score)
-            else:
-                print("did not correctly check for invalid data")
-                continue
-
-    print("incoder:")
-    print_scores(incoder)
-
-    print("unixcoder:")
-    print_scores(unixcoder)
+import json
+import os
+import evaluation
+
+
+def print_scores(scores):
+    size = len(scores)
+    result = [0, 0, 0, 0, 0, 0, 0]
+    for item in scores:
+        result[0] += item['bleu']
+        result[1] += item['exactMatch']
+        result[2] += item['levenshtein']
+        result[3] += item['meteor']
+        result[4] += item['rouge']['precision']
+        result[5] += item['rouge']['recall']
+        result[6] += item['rouge']['f1measure']
+
+    print("n = ", size)
+    print("bleu = ", result[0] / size)
+    print("exactMatch = ", result[1] / size)
+    print("levenshtein = ", result[2] / size)
+    print("meteor = ", result[3] / size)
+    print("rouge (precision) = ", result[4] / size)
+    print("rouge (recall) = ", result[5] / size)
+    print("rouge (f1measure) =", result[6] / size)
+    print()
+
+
+def is_not_valid_data(d):
+    return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == ['']
+
+
+if __name__ == '__main__':
+    data_folder = '../data3'
+    directory = os.fsencode(data_folder)
+    incoder = []
+    unixcoder = []
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+        user = filename.split('-')[0].split('/')[2]
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except:
+                continue
+
+        # check if language is valid for study
+        if data['language'] == 'python':
+
+            # continue if data point invalid
+            if is_not_valid_data(data):
+                continue
+
+            # calculate score if completion is chosen
+            if 'groundTruth' in data and data['chosenPrediction'] is not None:
+                score = evaluation.compute(data['groundTruth'], data['chosenPrediction'])
+                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                    incoder.append(score)
+                else:
+                    unixcoder.append(score)
+
+            # calculate score if completion is not chosen
+            elif 'groundTruth' in data and data['chosenPrediction'] is None:
+                score = evaluation.compute(data['groundTruth'], data['predictions'][0])
+                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                    incoder.append(score)
+                else:
+                    unixcoder.append(score)
+            else:
+                print("did not correctly check for invalid data")
+                continue
+
+    print("incoder:")
+    print_scores(incoder)
+
+    print("unixcoder:")
+    print_scores(unixcoder)

From ed39503dc7f9af5a0bd541cdc137b3fa61a38d27 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Fri, 10 Jun 2022 11:07:04 +0200
Subject: [PATCH 3/5] improved readability and code structure

---
 code4me-server/src/test_eval.py | 34 ++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
index 691493d..43d4d85 100644
--- a/code4me-server/src/test_eval.py
+++ b/code4me-server/src/test_eval.py
@@ -30,6 +30,14 @@ def is_not_valid_data(d):
     return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == ['']
 
 
+def get_prediction(d):
+    if d['chosenPrediction'] is not None:
+        p = d['chosenPrediction']
+    else:
+        p = d['predictions'][0]
+    return p.strip()
+
+
 if __name__ == '__main__':
     data_folder = '../data3'
     directory = os.fsencode(data_folder)
@@ -53,24 +61,16 @@ def is_not_valid_data(d):
             if is_not_valid_data(data):
                 continue
 
-            # calculate score if completion is chosen
-            if 'groundTruth' in data and data['chosenPrediction'] is not None:
-                score = evaluation.compute(data['groundTruth'], data['chosenPrediction'])
-                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                    incoder.append(score)
-                else:
-                    unixcoder.append(score)
-
-            # calculate score if completion is not chosen
-            elif 'groundTruth' in data and data['chosenPrediction'] is None:
-                score = evaluation.compute(data['groundTruth'], data['predictions'][0])
-                if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                    incoder.append(score)
-                else:
-                    unixcoder.append(score)
+            # calculate score
+            groundTruth = data['groundTruth'].strip()
+            prediction = get_prediction(data)
+            score = evaluation.compute(groundTruth, prediction)
+
+            # add score to correct model set
+            if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
+                incoder.append(score)
             else:
-                print("did not correctly check for invalid data")
-                continue
+                unixcoder.append(score)
 
     print("incoder:")
     print_scores(incoder)
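
With get_prediction() in place, every valid record is scored the same way: the completion the user accepted if there is one, otherwise the first (top-ranked) prediction, stripped of surrounding whitespace. A small sketch on a made-up record that carries only the fields the script reads (the values, including the model name, are invented); evaluation.compute() would then return the metrics dict that print_scores() reads, with 'bleu', 'exactMatch', 'levenshtein', 'meteor' and 'rouge' keys:

    def get_prediction(d):
        if d['chosenPrediction'] is not None:
            p = d['chosenPrediction']
        else:
            p = d['predictions'][0]
        return p.strip()


    # Invented data point; real Code4Me log files contain more fields.
    data = {
        "language": "python",
        "model": "UniXcoder",
        "groundTruth": "return tokens",
        "predictions": ["return tokens ", "return None"],
        "chosenPrediction": None,
    }

    # No completion was accepted, so the top prediction is used and stripped.
    print(get_prediction(data))  # prints: return tokens
    # score = evaluation.compute(data['groundTruth'].strip(), get_prediction(data))
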
From e0ff3c1320ec23a47929152f2d5d55f413cff984 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Mon, 13 Jun 2022 15:24:07 +0200
Subject: [PATCH 4/5] added discussed relevant metric groupings

---
 code4me-server/src/test_eval.py | 102 ++++++++++++++++++++++++++++----
 1 file changed, 91 insertions(+), 11 deletions(-)

diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
index 43d4d85..4aeb192 100644
--- a/code4me-server/src/test_eval.py
+++ b/code4me-server/src/test_eval.py
@@ -1,6 +1,7 @@
 import json
 import os
 import evaluation
+from evaluation import tokenize_code
 
 
 def print_scores(scores):
@@ -38,11 +39,88 @@ def get_prediction(d):
     return p.strip()
 
 
+def classify_scores(model_data, model_scores):
+    for d in model_data:
+
+        # calculate score
+        truth = d['groundTruth'].strip()
+        pred = get_prediction(d)
+        s = evaluation.compute(truth, pred)
+
+        # add score to correct model set
+        model_scores.append(s)
+
+        # add score to corresponding trigger point
+        if d['triggerPoint'] not in trigger_points:
+            trigger_points[d['triggerPoint']] = [s]
+        else:
+            trigger_points[d['triggerPoint']].append(s)
+
+        # add score to group based on chosen or not
+        if d['chosenPrediction'] is not None:
+            chosen.append(s)
+        else:
+            not_chosen.append(s)
+
+        # add inf time to array
+        inf_time.append(d['inferenceTime'])
+
+        # add token length to dictionary
+        tokenized_pred = tokenize_code(pred)[0]
+        if str(len(tokenized_pred)) not in token_length:
+            token_length[str(len(tokenized_pred))] = [s]
+        else:
+            token_length[str(len(tokenized_pred))].append(s)
+
+    print("inf time = ", sum(inf_time) / len(inf_time))
+    print_scores(model_scores)
+
+    print("chosen:")
+    print_scores(chosen)
+
+    print("not chosen:")
+    print_scores(not_chosen)
+
+    print("token lengths:")
+    print("length 1, 2, and 3")
+    print_scores(sum([token_length['1'], token_length['2'], token_length['3']], []))
+    print("length 4, 5, and 6")
+    print_scores(sum([token_length['4'], token_length['5'], token_length['6']], []))
+    print("length 7 and bigger")
+    token_lengths_filtered = []
+    for i in range(7, 129):
+        if str(i) in token_length:
+            token_lengths_filtered.append(token_length[str(i)])
+    print_scores(sum(token_lengths_filtered, []))
+    print()
+
+    print("trigger points:")
+    print("manual triggers")
+    print_scores(trigger_points[None])
+    del trigger_points[None]
+    sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)
+    i = 0
+    for tp, tp_scores in sorted_trigger_points:
+        if i < 5:
+            i += 1
+            print(tp)
+            print_scores(tp_scores)
+        else:
+            break
+
+
 if __name__ == '__main__':
-    data_folder = '../data3'
+    data_folder = '../data'
     directory = os.fsencode(data_folder)
     incoder = []
+    incoder_scores = []
     unixcoder = []
+    unixcoder_scores = []
+    chosen = []
+    not_chosen = []
+    trigger_points = {}
+    inf_time = []
+    token_length = {}
 
     for file in os.listdir(directory):
         filename = data_folder + '/' + os.fsdecode(file)
@@ -61,19 +139,21 @@ def get_prediction(d):
             if is_not_valid_data(data):
                 continue
 
-            # calculate score
-            groundTruth = data['groundTruth'].strip()
-            prediction = get_prediction(data)
-            score = evaluation.compute(groundTruth, prediction)
-
-            # add score to correct model set
+            # add data to correct model
             if data['model'] == 'InCoder' or data['model'] == 'CodeFill':
-                incoder.append(score)
+                incoder.append(data)
             else:
-                unixcoder.append(score)
+                unixcoder.append(data)
 
     print("incoder:")
-    print_scores(incoder)
+    classify_scores(incoder, incoder_scores)
+
+    # empty arrays and dicts for next model scores
+    chosen = []
+    not_chosen = []
+    trigger_points = {}
+    inf_time = []
+    token_length = {}
 
     print("unixcoder:")
-    print_scores(unixcoder)
+    classify_scores(unixcoder, unixcoder_scores)
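
classify_scores() builds its per-trigger-point and per-token-length groups with an explicit `if key not in dict ... else ... append` pattern on plain dicts. collections.defaultdict(list) gives the same grouping without the membership checks; a toy sketch with invented values, where the None key collects manually triggered completions exactly as in the patch:

    from collections import defaultdict

    # (triggerPoint, bleu) pairs; the values are invented.
    samples = [(".", 0.4), (None, 0.9), (".", 0.7), ("(", 0.5)]

    trigger_points = defaultdict(list)
    for trigger, bleu in samples:
        trigger_points[trigger].append(bleu)

    print(trigger_points[None])  # manual invocations -> [0.9]
    print(trigger_points["."])   # completions triggered on '.' -> [0.4, 0.7]
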
From 70fcd4edce02cd2d596edea3268dba101b117430 Mon Sep 17 00:00:00 2001
From: mrkingmarc01
Date: Mon, 13 Jun 2022 17:14:56 +0200
Subject: [PATCH 5/5] changed token length grouping and increased trigger point
 top5 to top10

---
 code4me-server/src/test_eval.py | 28 ++++++++++------------------
 1 file changed, 10 insertions(+), 18 deletions(-)

diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
index 4aeb192..444c832 100644
--- a/code4me-server/src/test_eval.py
+++ b/code4me-server/src/test_eval.py
@@ -81,32 +81,24 @@ def classify_scores(model_data, model_scores):
     print("not chosen:")
     print_scores(not_chosen)
 
-    print("token lengths:")
-    print("length 1, 2, and 3")
-    print_scores(sum([token_length['1'], token_length['2'], token_length['3']], []))
-    print("length 4, 5, and 6")
-    print_scores(sum([token_length['4'], token_length['5'], token_length['6']], []))
-    print("length 7 and bigger")
-    token_lengths_filtered = []
-    for i in range(7, 129):
+    for i in range(1, 11):
         if str(i) in token_length:
-            token_lengths_filtered.append(token_length[str(i)])
-    print_scores(sum(token_lengths_filtered, []))
-    print()
+            print('token length of prediction = ', i)
+            print_scores(token_length[str(i)])
+            del token_length[str(i)]
+    print('token length of prediction > 10')
+    print_scores(sum(token_length.values(), []))
 
     print("trigger points:")
     print("manual triggers")
     print_scores(trigger_points[None])
     del trigger_points[None]
     sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)
-    i = 0
-    for tp, tp_scores in sorted_trigger_points:
-        if i < 5:
-            i += 1
-            print(tp)
-            print_scores(tp_scores)
-        else:
+    for index, (tp, tp_scores) in enumerate(sorted_trigger_points):
+        if index >= 10:
             break
+        print(tp)
+        print_scores(tp_scores)
 
 
 if __name__ == '__main__':
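
The enumerate/break loop above reports the ten most frequent trigger points. Slicing the sorted list gives the same top-10 selection without the index check; a toy sketch with an invented trigger-point table (per-sample scores shown as plain numbers rather than metric dicts):

    # Invented table: trigger point -> list of per-sample scores.
    trigger_points = {".": [0.8, 0.6, 0.9], "(": [0.7], " ": [0.5, 0.4]}

    # Most frequent trigger points first, as in classify_scores().
    top10 = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)[:10]
    for tp, tp_scores in top10:
        print(tp, len(tp_scores), sum(tp_scores) / len(tp_scores))
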