From edd29ab4118cd160199852e129726b84fb0523f8 Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Thu, 9 Jun 2022 17:10:45 +0200 Subject: [PATCH 01/11] fixed tokenizer and added first version of evalution script --- code4me-server/src/evaluation.py | 6 ++- code4me-server/src/test_eval.py | 79 ++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 code4me-server/src/test_eval.py diff --git a/code4me-server/src/evaluation.py b/code4me-server/src/evaluation.py index 98bcd37..7b27434 100644 --- a/code4me-server/src/evaluation.py +++ b/code4me-server/src/evaluation.py @@ -22,7 +22,11 @@ def compute_rouge(line: str, completion: str): def tokenize_code(code): tokens = [ x - for x in re.split('("""(.|\n)*"""|"(.|\n)*"|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)', code) + for x in re.split( + '(\'\'\'(?:.|\n)*\'\'\'|"""(?:.|\n)*"""|"(?:.|\n)*"|\'(?:.|\n)*\'|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)', + code + ) + if x and not x.isspace() ] return tokens, " ".join(tokens) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py new file mode 100644 index 0000000..08c97e5 --- /dev/null +++ b/code4me-server/src/test_eval.py @@ -0,0 +1,79 @@ +import json +import os +import evaluation + + +def print_scores(scores): + size = len(scores) + result = [0, 0, 0, 0, 0, 0, 0] + for item in scores: + result[0] += item['bleu'] + result[1] += item['exactMatch'] + result[2] += item['levenshtein'] + result[3] += item['meteor'] + result[4] += item['rouge']['precision'] + result[5] += item['rouge']['recall'] + result[6] += item['rouge']['f1measure'] + + print("n = ", size) + print("bleu = ", result[0] / size) + print("exactMatch = ", result[1] / size) + print("levenshtein = ", result[2] / size) + print("meteor = ", result[3] / size) + print("rouge (precision) = ", result[4] / size) + print("rouge (recall) = ", result[5] / size) + print("rouge (f1measure) =", result[6] / size) + print() + + +def is_not_valid_data(d): + return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == [''] + + +if __name__ == '__main__': + data_folder = '../data3' + directory = os.fsencode(data_folder) + incoder = [] + unixcoder = [] + + for file in os.listdir(directory): + filename = data_folder + '/' + os.fsdecode(file) + user = filename.split('-')[0].split('/')[2] + + with open(filename) as json_file: + try: + data = json.load(json_file) + except: + continue + + # check if language is valid for study + if data['language'] == 'python': + + # continue if data point invalid + if is_not_valid_data(data): + continue + + # calculate score if completion is chosen + if 'groundTruth' in data and data['chosenPrediction'] is not None: + score = evaluation.compute(data['groundTruth'], data['chosenPrediction']) + if data['model'] == 'InCoder' or data['model'] == 'CodeFill': + incoder.append(score) + else: + unixcoder.append(score) + + # calculate score if completion is not chosen + elif 'groundTruth' in data and data['chosenPrediction'] is None: + score = evaluation.compute(data['groundTruth'], data['predictions'][0]) + if data['model'] == 'InCoder' or data['model'] == 'CodeFill': + incoder.append(score) + else: + unixcoder.append(score) + else: + print("did not correctly check for invalid data") + continue + + print("incoder:") + print_scores(incoder) + + print("unixcoder:") + print_scores(unixcoder) From 621822b73bdfdc683d809cdcde0afb8601ed7515 Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Thu, 9 Jun 2022 17:12:33 +0200 Subject: [PATCH 02/11] 
changed crlf to lf --- code4me-server/src/test_eval.py | 158 ++++++++++++++++---------------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 08c97e5..691493d 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -1,79 +1,79 @@ -import json -import os -import evaluation - - -def print_scores(scores): - size = len(scores) - result = [0, 0, 0, 0, 0, 0, 0] - for item in scores: - result[0] += item['bleu'] - result[1] += item['exactMatch'] - result[2] += item['levenshtein'] - result[3] += item['meteor'] - result[4] += item['rouge']['precision'] - result[5] += item['rouge']['recall'] - result[6] += item['rouge']['f1measure'] - - print("n = ", size) - print("bleu = ", result[0] / size) - print("exactMatch = ", result[1] / size) - print("levenshtein = ", result[2] / size) - print("meteor = ", result[3] / size) - print("rouge (precision) = ", result[4] / size) - print("rouge (recall) = ", result[5] / size) - print("rouge (f1measure) =", result[6] / size) - print() - - -def is_not_valid_data(d): - return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == [''] - - -if __name__ == '__main__': - data_folder = '../data3' - directory = os.fsencode(data_folder) - incoder = [] - unixcoder = [] - - for file in os.listdir(directory): - filename = data_folder + '/' + os.fsdecode(file) - user = filename.split('-')[0].split('/')[2] - - with open(filename) as json_file: - try: - data = json.load(json_file) - except: - continue - - # check if language is valid for study - if data['language'] == 'python': - - # continue if data point invalid - if is_not_valid_data(data): - continue - - # calculate score if completion is chosen - if 'groundTruth' in data and data['chosenPrediction'] is not None: - score = evaluation.compute(data['groundTruth'], data['chosenPrediction']) - if data['model'] == 'InCoder' or data['model'] == 'CodeFill': - incoder.append(score) - else: - unixcoder.append(score) - - # calculate score if completion is not chosen - elif 'groundTruth' in data and data['chosenPrediction'] is None: - score = evaluation.compute(data['groundTruth'], data['predictions'][0]) - if data['model'] == 'InCoder' or data['model'] == 'CodeFill': - incoder.append(score) - else: - unixcoder.append(score) - else: - print("did not correctly check for invalid data") - continue - - print("incoder:") - print_scores(incoder) - - print("unixcoder:") - print_scores(unixcoder) +import json +import os +import evaluation + + +def print_scores(scores): + size = len(scores) + result = [0, 0, 0, 0, 0, 0, 0] + for item in scores: + result[0] += item['bleu'] + result[1] += item['exactMatch'] + result[2] += item['levenshtein'] + result[3] += item['meteor'] + result[4] += item['rouge']['precision'] + result[5] += item['rouge']['recall'] + result[6] += item['rouge']['f1measure'] + + print("n = ", size) + print("bleu = ", result[0] / size) + print("exactMatch = ", result[1] / size) + print("levenshtein = ", result[2] / size) + print("meteor = ", result[3] / size) + print("rouge (precision) = ", result[4] / size) + print("rouge (recall) = ", result[5] / size) + print("rouge (f1measure) =", result[6] / size) + print() + + +def is_not_valid_data(d): + return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == [''] + + +if __name__ == '__main__': + data_folder = '../data3' + directory = os.fsencode(data_folder) + incoder = 
[] + unixcoder = [] + + for file in os.listdir(directory): + filename = data_folder + '/' + os.fsdecode(file) + user = filename.split('-')[0].split('/')[2] + + with open(filename) as json_file: + try: + data = json.load(json_file) + except: + continue + + # check if language is valid for study + if data['language'] == 'python': + + # continue if data point invalid + if is_not_valid_data(data): + continue + + # calculate score if completion is chosen + if 'groundTruth' in data and data['chosenPrediction'] is not None: + score = evaluation.compute(data['groundTruth'], data['chosenPrediction']) + if data['model'] == 'InCoder' or data['model'] == 'CodeFill': + incoder.append(score) + else: + unixcoder.append(score) + + # calculate score if completion is not chosen + elif 'groundTruth' in data and data['chosenPrediction'] is None: + score = evaluation.compute(data['groundTruth'], data['predictions'][0]) + if data['model'] == 'InCoder' or data['model'] == 'CodeFill': + incoder.append(score) + else: + unixcoder.append(score) + else: + print("did not correctly check for invalid data") + continue + + print("incoder:") + print_scores(incoder) + + print("unixcoder:") + print_scores(unixcoder) From ed39503dc7f9af5a0bd541cdc137b3fa61a38d27 Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Fri, 10 Jun 2022 11:07:04 +0200 Subject: [PATCH 03/11] improved readability and code structure --- code4me-server/src/test_eval.py | 34 ++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 691493d..43d4d85 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -30,6 +30,14 @@ def is_not_valid_data(d): return 'groundTruth' not in d or ('groundTruth' in d and d['groundTruth'].strip() == '') or d['predictions'] == [''] +def get_prediction(d): + if d['chosenPrediction'] is not None: + p = d['chosenPrediction'] + else: + p = d['predictions'][0] + return p.strip() + + if __name__ == '__main__': data_folder = '../data3' directory = os.fsencode(data_folder) @@ -53,24 +61,16 @@ def is_not_valid_data(d): if is_not_valid_data(data): continue - # calculate score if completion is chosen - if 'groundTruth' in data and data['chosenPrediction'] is not None: - score = evaluation.compute(data['groundTruth'], data['chosenPrediction']) - if data['model'] == 'InCoder' or data['model'] == 'CodeFill': - incoder.append(score) - else: - unixcoder.append(score) - - # calculate score if completion is not chosen - elif 'groundTruth' in data and data['chosenPrediction'] is None: - score = evaluation.compute(data['groundTruth'], data['predictions'][0]) - if data['model'] == 'InCoder' or data['model'] == 'CodeFill': - incoder.append(score) - else: - unixcoder.append(score) + # calculate score + groundTruth = data['groundTruth'].strip() + prediction = get_prediction(data) + score = evaluation.compute(groundTruth, prediction) + + # add score to correct model set + if data['model'] == 'InCoder' or data['model'] == 'CodeFill': + incoder.append(score) else: - print("did not correctly check for invalid data") - continue + unixcoder.append(score) print("incoder:") print_scores(incoder) From e0ff3c1320ec23a47929152f2d5d55f413cff984 Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Mon, 13 Jun 2022 15:24:07 +0200 Subject: [PATCH 04/11] added discussed relevant metric groupings --- code4me-server/src/test_eval.py | 102 ++++++++++++++++++++++++++++---- 1 file changed, 91 insertions(+), 11 deletions(-) diff --git 
a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 43d4d85..4aeb192 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -1,6 +1,7 @@ import json import os import evaluation +from evaluation import tokenize_code def print_scores(scores): @@ -38,11 +39,88 @@ def get_prediction(d): return p.strip() +def classify_scores(model_data, model_scores): + for d in model_data: + + # calculate score + truth = d['groundTruth'].strip() + pred = get_prediction(d) + s = evaluation.compute(truth, pred) + + # add score to correct model set + model_scores.append(s) + + # add score to corresponding trigger point + if d['triggerPoint'] not in trigger_points: + trigger_points[d['triggerPoint']] = [s] + else: + trigger_points[d['triggerPoint']].append(s) + + # add score to group based on chosen or not + if d['chosenPrediction'] is not None: + chosen.append(s) + else: + not_chosen.append(s) + + # add inf time to array + inf_time.append(d['inferenceTime']) + + # add token length to dictionary + tokenized_pred = tokenize_code(pred)[0] + if str(len(tokenized_pred)) not in token_length: + token_length[str(len(tokenized_pred))] = [s] + else: + token_length[str(len(tokenized_pred))].append(s) + + print("inf time = ", sum(inf_time) / len(inf_time)) + print_scores(model_scores) + + print("chosen:") + print_scores(chosen) + + print("not chosen:") + print_scores(not_chosen) + + print("token lengths:") + print("length 1, 2, and 3") + print_scores(sum([token_length['1'], token_length['2'], token_length['3']], [])) + print("length 4, 5, and 6") + print_scores(sum([token_length['4'], token_length['5'], token_length['6']], [])) + print("length 7 and bigger") + token_lengths_filtered = [] + for i in range(7, 129): + if str(i) in token_length: + token_lengths_filtered.append(token_length[str(i)]) + print_scores(sum(token_lengths_filtered, [])) + print() + + print("trigger points:") + print("manual triggers") + print_scores(trigger_points[None]) + del trigger_points[None] + sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True) + i = 0 + for tp, tp_scores in sorted_trigger_points: + if i < 5: + i += 1 + print(tp) + print_scores(tp_scores) + else: + break + + if __name__ == '__main__': - data_folder = '../data3' + data_folder = '../data' directory = os.fsencode(data_folder) incoder = [] + incoder_scores = [] unixcoder = [] + unixcoder_scores = [] + chosen = [] + not_chosen = [] + trigger_points = {} + inf_time = [] + token_length = {} for file in os.listdir(directory): filename = data_folder + '/' + os.fsdecode(file) @@ -61,19 +139,21 @@ def get_prediction(d): if is_not_valid_data(data): continue - # calculate score - groundTruth = data['groundTruth'].strip() - prediction = get_prediction(data) - score = evaluation.compute(groundTruth, prediction) - - # add score to correct model set + # add data to correct model if data['model'] == 'InCoder' or data['model'] == 'CodeFill': - incoder.append(score) + incoder.append(data) else: - unixcoder.append(score) + unixcoder.append(data) print("incoder:") - print_scores(incoder) + classify_scores(incoder, incoder_scores) + + # empty arrays and dicts for next model scores + chosen = [] + not_chosen = [] + trigger_points = {} + inf_time = [] + token_length = {} print("unixcoder:") - print_scores(unixcoder) + classify_scores(unixcoder, unixcoder_scores) From 70fcd4edce02cd2d596edea3268dba101b117430 Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Mon, 13 Jun 2022 17:14:56 +0200 Subject: [PATCH 05/11] 
changed token length grouping and increased trigger point top5 to top10 --- code4me-server/src/test_eval.py | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 4aeb192..444c832 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -81,32 +81,24 @@ def classify_scores(model_data, model_scores): print("not chosen:") print_scores(not_chosen) - print("token lengths:") - print("length 1, 2, and 3") - print_scores(sum([token_length['1'], token_length['2'], token_length['3']], [])) - print("length 4, 5, and 6") - print_scores(sum([token_length['4'], token_length['5'], token_length['6']], [])) - print("length 7 and bigger") - token_lengths_filtered = [] - for i in range(7, 129): + for i in range(1, 11): if str(i) in token_length: - token_lengths_filtered.append(token_length[str(i)]) - print_scores(sum(token_lengths_filtered, [])) - print() + print('token length of prediction = ', i) + print_scores(token_length[str(i)]) + del token_length[str(i)] + print('token length of prediction > 10') + print_scores(sum(token_length.values(), [])) print("trigger points:") print("manual triggers") print_scores(trigger_points[None]) del trigger_points[None] sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True) - i = 0 - for tp, tp_scores in sorted_trigger_points: - if i < 5: - i += 1 - print(tp) - print_scores(tp_scores) - else: + for index, (tp, tp_scores) in enumerate(sorted_trigger_points): + if index >= 10: break + print(tp) + print_scores(tp_scores) if __name__ == '__main__': From 3697593ac5e3d7fd6f5f24e0fddab4734520dbc2 Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Tue, 16 Aug 2022 16:22:22 +0200 Subject: [PATCH 06/11] initial commit --- code4me-server/src/test_eval.py | 78 ++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 17 deletions(-) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 444c832..e0b7ff6 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -102,7 +102,7 @@ def classify_scores(model_data, model_scores): if __name__ == '__main__': - data_folder = '../data' + data_folder = '../data_10_7_2022' directory = os.fsencode(data_folder) incoder = [] incoder_scores = [] @@ -113,6 +113,11 @@ def classify_scores(model_data, model_scores): trigger_points = {} inf_time = [] token_length = {} + languages = {} + context_length = 0 + data_points = 0 + valid_data = 0 + ide = {} for file in os.listdir(directory): filename = data_folder + '/' + os.fsdecode(file) @@ -124,28 +129,67 @@ def classify_scores(model_data, model_scores): except: continue + data_points += 1 + # continue if data point invalid + if is_not_valid_data(data): + continue + valid_data += 1 + + if data['language'] not in languages: + languages[data['language']] = 1 + else: + languages[data['language']] += 1 + + if 'leftContextLength' in data and data['leftContextLength'] is not None: + context_length += 1 + + if 'pluginVersion' in data: + pv = data['pluginVersion'] + if pv is None: + pv = 'not_updated' + + if data['ide'] + '_' + pv not in ide: + ide[data['ide'] + '_' + pv] = 1 + else: + ide[data['ide'] + '_' + pv] += 1 + else: + if data['ide'] + '_old' not in ide: + ide[data['ide'] + '_old'] = 1 + else: + ide[data['ide'] + '_old'] += 1 + # check if language is valid for study if data['language'] == 'python': - # continue if data point invalid - if is_not_valid_data(data): 
- continue - # add data to correct model if data['model'] == 'InCoder' or data['model'] == 'CodeFill': incoder.append(data) else: unixcoder.append(data) - print("incoder:") - classify_scores(incoder, incoder_scores) - - # empty arrays and dicts for next model scores - chosen = [] - not_chosen = [] - trigger_points = {} - inf_time = [] - token_length = {} - - print("unixcoder:") - classify_scores(unixcoder, unixcoder_scores) + print('data', data_points) + print('valid_data', valid_data) + print('context_length_data', context_length) + print(ide) + # temp = [] + # for k, v in languages.items(): + # if v < 1000: + # temp.append(k) + # + # for y in temp: + # del languages[y] + print(languages) + + + # print("incoder:") + # classify_scores(incoder, incoder_scores) + # + # # empty arrays and dicts for next model scores + # chosen = [] + # not_chosen = [] + # trigger_points = {} + # inf_time = [] + # token_length = {} + # + # print("unixcoder:") + # classify_scores(unixcoder, unixcoder_scores) \ No newline at end of file From 1aeb53e81ee306c17ed9f0e009dd39f77f36b837 Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Wed, 17 Aug 2022 12:01:50 +0200 Subject: [PATCH 07/11] added support for new data points and easy extensibility for other languages (and cleanup) --- code4me-server/src/test_eval.py | 165 ++++++++++++++++---------------- 1 file changed, 80 insertions(+), 85 deletions(-) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index e0b7ff6..52286e0 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -1,3 +1,4 @@ +import copy import json import os import evaluation @@ -16,14 +17,14 @@ def print_scores(scores): result[5] += item['rouge']['recall'] result[6] += item['rouge']['f1measure'] - print("n = ", size) - print("bleu = ", result[0] / size) - print("exactMatch = ", result[1] / size) - print("levenshtein = ", result[2] / size) - print("meteor = ", result[3] / size) - print("rouge (precision) = ", result[4] / size) - print("rouge (recall) = ", result[5] / size) - print("rouge (f1measure) =", result[6] / size) + print('n = ', size) + print('bleu = ', result[0] / size) + print('exactMatch = ', result[1] / size) + print('levenshtein = ', result[2] / size) + print('meteor = ', result[3] / size) + print('rouge (precision) = ', result[4] / size) + print('rouge (recall) = ', result[5] / size) + print('rouge (f1measure) =', result[6] / size) print() @@ -39,7 +40,14 @@ def get_prediction(d): return p.strip() -def classify_scores(model_data, model_scores): +def classify_scores(model_data): + chosen = [] + not_chosen = [] + trigger_points = {} + inf_time = [] + token_length = {} + model_scores = [] + for d in model_data: # calculate score @@ -72,13 +80,13 @@ def classify_scores(model_data, model_scores): else: token_length[str(len(tokenized_pred))].append(s) - print("inf time = ", sum(inf_time) / len(inf_time)) + print('inf time = ', sum(inf_time) / len(inf_time)) print_scores(model_scores) - print("chosen:") + print('chosen:') print_scores(chosen) - print("not chosen:") + print('not chosen:') print_scores(not_chosen) for i in range(1, 11): @@ -89,8 +97,8 @@ def classify_scores(model_data, model_scores): print('token length of prediction > 10') print_scores(sum(token_length.values(), [])) - print("trigger points:") - print("manual triggers") + print('trigger points:') + print('manual triggers') print_scores(trigger_points[None]) del trigger_points[None] sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), 
reverse=True) @@ -101,27 +109,63 @@ def classify_scores(model_data, model_scores): print_scores(tp_scores) +def classify_all_scores(language_dict): + print('incoder:') + classify_scores(language_dict['incoder']) + + print('unixcoder:') + classify_scores(language_dict['unixcoder']) + + +def add_data(language_key, d, data): + incoder_list = d[language_key]['incoder'] + unixcoder_list = d[language_key]['unixcoder'] + + if 'modelPredictions' in data: + incoder_prediction = data['modelPredictions']['InCoder'][0] + unixcoder_prediction = data['modelPredictions']['UniXCoder'][0] + incoder_data = copy.deepcopy(data) + unixcoder_data = copy.deepcopy(data) + + if data['chosenPrediction'] is not None: + if data['chosenPrediction'] != incoder_prediction: + incoder_data['chosenPrediction'] = None + if data['chosenPrediction'] != unixcoder_prediction: + unixcoder_data['chosenPrediction'] = None + + if incoder_prediction != unixcoder_prediction: + incoder_data['predictions'] = [incoder_prediction] + unixcoder_data['predictions'] = [unixcoder_prediction] + + incoder_data['inferenceTime'] = incoder_data['inferenceTime'] / 2 + unixcoder_data['inferenceTime'] = unixcoder_data['inferenceTime'] / 2 + + if not is_not_valid_data(incoder_data): + incoder_list.append(incoder_data) + if not is_not_valid_data(unixcoder_data): + unixcoder_list.append(unixcoder_data) + + elif data['model'] == 'InCoder' or data['model'] == 'CodeFill': + incoder_list.append(data) + else: + unixcoder_list.append(data) + + if __name__ == '__main__': - data_folder = '../data_10_7_2022' + data_folder = '../data_16_08_2022' directory = os.fsencode(data_folder) - incoder = [] incoder_scores = [] - unixcoder = [] unixcoder_scores = [] - chosen = [] - not_chosen = [] - trigger_points = {} - inf_time = [] - token_length = {} - languages = {} - context_length = 0 - data_points = 0 - valid_data = 0 - ide = {} + data_dict = { + 'python': { + 'incoder': [], + 'unixcoder': [] + } + } for file in os.listdir(directory): filename = data_folder + '/' + os.fsdecode(file) - user = filename.split('-')[0].split('/')[2] + # user = filename.split('-')[0].split('/')[2] with open(filename) as json_file: try: @@ -129,67 +173,18 @@ def classify_scores(model_data, model_scores): except: continue - data_points += 1 # continue if data point invalid if is_not_valid_data(data): continue - valid_data += 1 - if data['language'] not in languages: - languages[data['language']] = 1 + # TODO: add other languages to datadict and if statement and print + k = data['language'] + if k == 'python' or k == '.py' or k == 'py': + add_data('python', data_dict, data) else: - languages[data['language']] += 1 - - if 'leftContextLength' in data and data['leftContextLength'] is not None: - context_length += 1 + continue - if 'pluginVersion' in data: - pv = data['pluginVersion'] - if pv is None: - pv = 'not_updated' + print('---python---') + classify_all_scores(data_dict['python']) - if data['ide'] + '_' + pv not in ide: - ide[data['ide'] + '_' + pv] = 1 - else: - ide[data['ide'] + '_' + pv] += 1 - else: - if data['ide'] + '_old' not in ide: - ide[data['ide'] + '_old'] = 1 - else: - ide[data['ide'] + '_old'] += 1 - - # check if language is valid for study - if data['language'] == 'python': - - # add data to correct model - if data['model'] == 'InCoder' or data['model'] == 'CodeFill': - incoder.append(data) - else: - unixcoder.append(data) - - print('data', data_points) - print('valid_data', valid_data) - print('context_length_data', context_length) - print(ide) - # temp = [] - # for k, v in 
languages.items(): - # if v < 1000: - # temp.append(k) - # - # for y in temp: - # del languages[y] - print(languages) - - - # print("incoder:") - # classify_scores(incoder, incoder_scores) - # - # # empty arrays and dicts for next model scores - # chosen = [] - # not_chosen = [] - # trigger_points = {} - # inf_time = [] - # token_length = {} - # - # print("unixcoder:") - # classify_scores(unixcoder, unixcoder_scores) \ No newline at end of file + print('done') From ffb6e821624fe92fd00af272230437825bd3f79c Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Wed, 17 Aug 2022 12:06:16 +0200 Subject: [PATCH 08/11] removed old empty arrays missed in cleanup --- code4me-server/src/test_eval.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 52286e0..08ca9e1 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -154,8 +154,6 @@ def add_data(language_key, d, data): if __name__ == '__main__': data_folder = '../data_16_08_2022' directory = os.fsencode(data_folder) - incoder_scores = [] - unixcoder_scores = [] data_dict = { 'python': { 'incoder': [], From b42d04abbd806b5414fd37c7184b97126bcae0cb Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Wed, 17 Aug 2022 16:59:08 +0200 Subject: [PATCH 09/11] added support for 37 languages and detailed output option and fixed few divby0 errors --- code4me-server/src/test_eval.py | 200 ++++++++++++++++++++++++++------ 1 file changed, 165 insertions(+), 35 deletions(-) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 08ca9e1..96e2dc3 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -2,11 +2,18 @@ import json import os import evaluation +import sys from evaluation import tokenize_code def print_scores(scores): size = len(scores) + + if size == 0: + print('n = 0') + print() + return + result = [0, 0, 0, 0, 0, 0, 0] for item in scores: result[0] += item['bleu'] @@ -47,6 +54,7 @@ def classify_scores(model_data): inf_time = [] token_length = {} model_scores = [] + print_detailed = 'detailed' in sys.argv for d in model_data: @@ -80,33 +88,39 @@ def classify_scores(model_data): else: token_length[str(len(tokenized_pred))].append(s) - print('inf time = ', sum(inf_time) / len(inf_time)) + if len(inf_time) > 0: + print('inf time = ', sum(inf_time) / len(inf_time)) print_scores(model_scores) - print('chosen:') - print_scores(chosen) - - print('not chosen:') - print_scores(not_chosen) - - for i in range(1, 11): - if str(i) in token_length: - print('token length of prediction = ', i) - print_scores(token_length[str(i)]) - del token_length[str(i)] - print('token length of prediction > 10') - print_scores(sum(token_length.values(), [])) - - print('trigger points:') - print('manual triggers') - print_scores(trigger_points[None]) - del trigger_points[None] - sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True) - for index, (tp, tp_scores) in enumerate(sorted_trigger_points): - if index >= 10: - break - print(tp) - print_scores(tp_scores) + if print_detailed: + print('chosen:') + print_scores(chosen) + + print('not chosen:') + print_scores(not_chosen) + + for i in range(1, 11): + if str(i) in token_length: + print('token length of prediction = ', i) + print_scores(token_length[str(i)]) + del token_length[str(i)] + print('token length of prediction > 10') + print_scores(sum(token_length.values(), [])) + + print('trigger points:') + print('manual triggers') + if None 
in trigger_points: + print_scores(trigger_points[None]) + del trigger_points[None] + else: + print('n = 0') + print() + sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True) + for index, (tp, tp_scores) in enumerate(sorted_trigger_points): + if index >= 10: + break + print(tp) + print_scores(tp_scores) def classify_all_scores(language_dict): @@ -151,15 +165,134 @@ def add_data(language_key, d, data): unixcoder_list.append(data) +def get_language(language): + if language == 'python' or language == '.py' or language == 'py': + return 'python' + elif language == 'java' or language == '.java': + return 'java' + elif language == 'typescript' or language == '.ts' or language == 'ts': + return 'typescript' + elif language == 'php' or language == '.php': + return 'php' + elif language == 'vue': + return 'vue' + elif language == 'kotlin' or language == 'kt': + return 'kotlin' + elif language == 'typescriptreact' or language == '.tsx' or language == 'ts' or language == 'typescript jsx': + return 'typescriptreact' + elif language == 'javascript' or language == '.js' or language == 'js' or language == 'ecmascript 6': + return 'javascript' + elif language == 'robotframework': + return 'robotframework' + elif language == 'json' or language == '.json': + return 'json' + elif language == 'latex': + return 'latex' + elif language == 'html' or language == '.html': + return 'html' + elif language == 'javascriptreact' or language == '.jsx' or language == 'jsx': + return 'javascriptreact' + elif language == 'xml' or language == '.xml': + return 'xml' + elif language == 'go': + return 'go' + elif language == 'ruby': + return 'ruby' + elif language == 'csharp' or language == '.cs' or language == 'c#' or language == 'cs': + return 'csharp' + elif language == 'blade.php': + return 'blade.php' + elif language == 'markdown' or language == '.md' or language == 'md': + return 'markdown' + elif language == 'rust' or language == '.rs' or language == 'rs': + return 'rust' + elif language == 'css' or language == '.css' or language == 'scss': + return 'css' + elif language == 'objectivec': + return 'objectivec' + elif language == 'cpp' or language == '.cpp': + return 'cpp' + elif language == 'dart' or language == '.dart': + return 'dart' + elif language == 'sql' or language == '.sql': + return 'sql' + elif language == '.shellscript' or language == '.sh' or language == 'sh' or language == 'shellscript': + return 'shellscript' + elif language == 'prisma' or language == '.prisma': + return 'prisma' + elif language == 'yaml' or language == '.yaml' or language == 'yml' or language == '.yml': + return 'yaml' + elif language == 'txt' or language == '.txt' or language == 'text' or language == 'plaintext': + return 'txt' + elif language == 'swift' or language == '.swift': + return 'swift' + elif language == 'c' or language == '.c': + return 'c' + elif language == 'gitignore': + return 'gitignore' + elif language == 'groovy': + return 'groovy' + elif language == 'perl5': + return 'perl5' + elif language == 'less': + return 'less' + elif language == 'scala': + return 'scala' + elif language == 'julia': + return 'julia' + else: + return 'other' + + if __name__ == '__main__': data_folder = '../data_16_08_2022' directory = os.fsencode(data_folder) data_dict = { - 'python': { + 'python': {}, + 'java': {}, + 'typescript': {}, + 'php': {}, + 'vue': {}, + 'kotlin': {}, + 'typescriptreact': {}, + 'javascript': {}, + 'robotframework': {}, + 'json': {}, + 'latex': {}, + 'html': {}, + 'javascriptreact': 
{}, + 'xml': {}, + 'go': {}, + 'ruby': {}, + 'csharp': {}, + 'blade.php': {}, + 'markdown': {}, + 'rust': {}, + 'css': {}, + 'objectivec': {}, + 'cpp': {}, + 'dart': {}, + 'sql': {}, + 'shellscript': {}, + 'prisma': {}, + 'yaml': {}, + 'txt': {}, + 'swift': {}, + 'c': {}, + 'gitignore': {}, + 'groovy': {}, + 'perl5': {}, + 'less': {}, + 'scala': {}, + 'julia': {}, + 'other': {} + } + + for k in data_dict.keys(): + data_dict[k] = { 'incoder': [], 'unixcoder': [] } - } for file in os.listdir(directory): filename = data_folder + '/' + os.fsdecode(file) @@ -175,14 +308,11 @@ def add_data(language_key, d, data): if is_not_valid_data(data): continue - # TODO: add other languages to datadict and if statement and print - k = data['language'] - if k == 'python' or k == '.py' or k == 'py': - add_data('python', data_dict, data) - else: - continue + add_data(get_language(data['language']), data_dict, data) - print('---python---') - classify_all_scores(data_dict['python']) + data_dict = {k: v for k, v in sorted(data_dict.items(), key=lambda item: len(item[1]['incoder']) + len(item[1]['unixcoder']), reverse=True)} + for k in data_dict.keys(): + print('---', k, '---') + classify_all_scores(data_dict[k]) print('done') From 253b830897a4210fc3819365cb677561c88977ee Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Wed, 7 Sep 2022 15:30:30 +0200 Subject: [PATCH 10/11] Added new script thats prints statistics about: top languages, statistics per day, top user statistics --- code4me-server/src/eval_users.py | 159 +++++++++++++++++++++++++++++++ code4me-server/src/test_eval.py | 3 +- 2 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 code4me-server/src/eval_users.py diff --git a/code4me-server/src/eval_users.py b/code4me-server/src/eval_users.py new file mode 100644 index 0000000..ebece6c --- /dev/null +++ b/code4me-server/src/eval_users.py @@ -0,0 +1,159 @@ +import copy +import json +import os +import sys +from test_eval import is_not_valid_data, get_language, add_data, classify_all_scores + + +if __name__ == '__main__': + data_folder = '../data' + directory = os.fsencode(data_folder) + data_dict = { + 'python': {}, + 'java': {}, + 'typescript': {}, + 'php': {}, + 'vue': {}, + 'kotlin': {}, + 'typescriptreact': {}, + 'javascript': {}, + 'robotframework': {}, + 'json': {}, + 'latex': {}, + 'html': {}, + 'javascriptreact': {}, + 'xml': {}, + 'go': {}, + 'ruby': {}, + 'csharp': {}, + 'blade.php': {}, + 'markdown': {}, + 'rust': {}, + 'css': {}, + 'objectivec': {}, + 'cpp': {}, + 'dart': {}, + 'sql': {}, + 'shellscript': {}, + 'prisma': {}, + 'yaml': {}, + 'txt': {}, + 'swift': {}, + 'c': {}, + 'gitignore': {}, + 'groovy': {}, + 'perl5': {}, + 'less': {}, + 'scala': {}, + 'julia': {}, + 'other': {} + } + languages = {} + users = {} + dates_data = {} + dates_users = {} + + for k in data_dict.keys(): + data_dict[k] = { + 'incoder': [], + 'unixcoder': [] + } + + for file in os.listdir(directory): + filename = data_folder + '/' + os.fsdecode(file) + user = filename.split('-')[0].split('/')[2] + + with open(filename) as json_file: + try: + data = json.load(json_file) + except: + continue + + # continue if data point invalid + if is_not_valid_data(data): + continue + + l = get_language(data['language']) + if l not in languages: + languages[l] = 1 + else: + languages[l] += 1 + + if user not in users: + users[user] = [data] + else: + users[user].append(data) + + t = data['completionTimestamp'][:10] + if t not in dates_data: + dates_data[t] = 1 + else: + dates_data[t] += 1 + + if t not in dates_users: + 
dates_users[t] = [user] + else: + if user not in dates_users[t]: + dates_users[t].append(user) + + n_languages = -1 + n_days = -1 + n_users = -1 + if len(sys.argv) == 4: + n_languages = int(sys.argv[1]) + n_days = int(sys.argv[2]) + n_users = int(sys.argv[3]) + + prompt_languages = 'ALL languages sorted by total valid data points:' + sorted_languages = {k: v for k, v in sorted(languages.items(), key=lambda item: item[1], reverse=True)} + if n_languages > 0: + sorted_languages = {k: sorted_languages[k] for k in list(sorted_languages.keys())[:n_languages]} + prompt_languages = f'top {n_languages} languages sorted by total valid data points:' + + print(prompt_languages) + print(sorted_languages) + print() + + prompt_dates_data = 'total new valid data points generated ALL TIME:' + sorted_dates_data = {k: v for k, v in sorted(dates_data.items(), reverse=True)} + prompt_dates_users = 'amount of unique users using code4me ALL TIME:' + sorted_dates_users = {k: len(v) for k, v in sorted(dates_users.items(), reverse=True)} + if n_days > 0: + sorted_dates_data = {k: sorted_dates_data[k] for k in list(sorted_dates_data.keys())[:n_days]} + sorted_dates_users = {k: sorted_dates_users[k] for k in list(sorted_dates_users.keys())[:n_days]} + prompt_dates_data = f'total new valid data points generated in last {n_days} days:' + prompt_dates_users = f'amount of unique users using code4me in last {n_days} days:' + + print(prompt_dates_data) + print(sorted_dates_data) + print() + + print(prompt_dates_users) + print(sorted_dates_users) + print() + + prompt_users = 'ALL users sorted by total valid data points:' + sorted_users = {k: v for k, v in sorted(users.items(), key=lambda item: len(item[1]), reverse=True)} + if n_users > 0: + sorted_users = {k: sorted_users[k] for k in list(sorted_users.keys())[:n_users]} + prompt_users = f'top {n_users} most active users sorted by total valid data points:' + + print(prompt_users) + for idx, (k, v) in enumerate(sorted_users.items()): + temp_data_dict = copy.deepcopy(data_dict) + print(f'--- user #{idx + 1}: {k} ---') + for x in v: + add_data(get_language(x['language']), temp_data_dict, x) + + for language in temp_data_dict.keys(): + if len(temp_data_dict[language]['incoder']) + len(temp_data_dict[language]['unixcoder']) > 0: + ide = '(error no ide found)' + for y in temp_data_dict[language]['incoder']: + if 'ide' in y: + ide = y['ide'] + break + + print(f'------{language} in {ide}') + classify_all_scores(temp_data_dict[language]) + + print('done') diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 96e2dc3..640f0c3 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -245,7 +245,7 @@ def get_language(language): if __name__ == '__main__': - data_folder = '../data_16_08_2022' + data_folder = '../data' directory = os.fsencode(data_folder) data_dict = { 'python': {}, @@ -296,7 +296,6 @@ def get_language(language): for file in os.listdir(directory): filename = data_folder + '/' + os.fsdecode(file) - # user = filename.split('-')[0].split('/')[2] with open(filename) as json_file: try: From d7fdb056519f8686447ac8ac92748bccd24fb3d1 Mon Sep 17 00:00:00 2001 From: mrkingmarc01 Date: Tue, 13 Sep 2022 13:51:03 +0200 Subject: [PATCH 11/11] added support for other language tokenizers (did not add tokenizers yet) --- code4me-server/src/evaluation.py | 54 +++++++++++++++++++++++++------- code4me-server/src/test_eval.py | 14 ++++----- 2 files changed, 49 insertions(+), 19 deletions(-) diff --git 
a/code4me-server/src/evaluation.py b/code4me-server/src/evaluation.py index 7b27434..0055932 100644 --- a/code4me-server/src/evaluation.py +++ b/code4me-server/src/evaluation.py @@ -19,7 +19,20 @@ def compute_rouge(line: str, completion: str): } -def tokenize_code(code): +def compute(line: str, completion: str, l): + tokenized_line, tokenized_line_str = tokenize_code(line, l) + tokenized_completion, tokenized_completion_str = tokenize_code(completion, l) + return { + "bleu": sentence_bleu([tokenized_line], tokenized_completion, smoothing_function=SmoothingFunction().method2), + "exactMatch": float(line == completion), + "levenshtein": Levenshtein.ratio(line, completion), + "meteor": meteor_score(references=[tokenized_line], hypothesis=tokenized_completion), + "rouge": compute_rouge(tokenized_line_str, tokenized_completion_str), + "statisticTimestamp": datetime.now().isoformat() + } + + +def tokenize_code_python(code): tokens = [ x for x in re.split( @@ -32,14 +45,31 @@ def tokenize_code(code): return tokens, " ".join(tokens) -def compute(line: str, completion: str): - tokenized_line, tokenized_line_str = tokenize_code(line) - tokenized_completion, tokenized_completion_str = tokenize_code(completion) - return { - "bleu": sentence_bleu([tokenized_line], tokenized_completion, smoothing_function=SmoothingFunction().method2), - "exactMatch": float(line == completion), - "levenshtein": Levenshtein.ratio(line, completion), - "meteor": meteor_score(references=[tokenized_line], hypothesis=tokenized_completion), - "rouge": compute_rouge(tokenized_line_str, tokenized_completion_str), - "statisticTimestamp": datetime.now().isoformat() - } +# TODO: add java tokenizer +def tokenize_code_java(code): + return tokenize_code_python(code) + + +# TODO: add javascript tokenizer +def tokenize_code_javascript(code): + return tokenize_code_python(code) + + +# TODO: add php tokenizer +def tokenize_code_php(code): + return tokenize_code_python(code) + + +tokenizer_dict = { + 'python': tokenize_code_python, + 'java': tokenize_code_java, + 'javascript': tokenize_code_javascript, + 'php': tokenize_code_php, +} + + +def tokenize_code(code, l): + try: + return tokenizer_dict[l](code) + except: + return tokenizer_dict['python'](code) diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py index 640f0c3..0e1e898 100644 --- a/code4me-server/src/test_eval.py +++ b/code4me-server/src/test_eval.py @@ -47,7 +47,7 @@ def get_prediction(d): return p.strip() -def classify_scores(model_data): +def classify_scores(model_data, l): chosen = [] not_chosen = [] trigger_points = {} @@ -61,7 +61,7 @@ def classify_scores(model_data): # calculate score truth = d['groundTruth'].strip() pred = get_prediction(d) - s = evaluation.compute(truth, pred) + s = evaluation.compute(truth, pred, l) # add score to correct model set model_scores.append(s) @@ -82,7 +82,7 @@ def classify_scores(model_data): inf_time.append(d['inferenceTime']) # add token length to dictionary - tokenized_pred = tokenize_code(pred)[0] + tokenized_pred = tokenize_code(pred, l)[0] if str(len(tokenized_pred)) not in token_length: token_length[str(len(tokenized_pred))] = [s] else: @@ -123,12 +123,12 @@ def classify_scores(model_data): print_scores(tp_scores) -def classify_all_scores(language_dict): +def classify_all_scores(language_dict, l): print('incoder:') - classify_scores(language_dict['incoder']) + classify_scores(language_dict['incoder'], l) print('unixcoder:') - classify_scores(language_dict['unixcoder']) + 
classify_scores(language_dict['unixcoder'], l)
 
 
 def add_data(language_key, d, data):
@@ -312,6 +312,6 @@ def get_language(language):
     data_dict = {k: v for k, v in sorted(data_dict.items(), key=lambda item: len(item[1]['incoder']) + len(item[1]['unixcoder']), reverse=True)}
     for k in data_dict.keys():
         print('---', k, '---')
-        classify_all_scores(data_dict[k])
+        classify_all_scores(data_dict[k], k)
 
     print('done')
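
Note on using the evaluation API as it stands after PATCH 11 (a minimal sketch, not part of the series; the sample strings are invented, and it assumes the NLTK, Levenshtein and ROUGE dependencies that evaluation.py already imports are installed):

    import evaluation

    ground_truth = "return os.path.join(base, name)"    # invented example
    prediction = "return os.path.join(base, filename)"  # invented example

    # The third argument selects the language-specific tokenizer; unknown
    # keys fall back to the Python tokenizer inside tokenize_code().
    scores = evaluation.compute(ground_truth, prediction, "python")

    print(scores["bleu"], scores["exactMatch"], scores["levenshtein"])
    print(scores["meteor"], scores["rouge"]["f1measure"])

test_eval.py aggregates these score dictionaries per language and per model; passing the word 'detailed' on the command line additionally breaks them down by chosen vs. not chosen, prediction token length and trigger point. eval_users.py optionally takes three integers (top-N languages, days and users) to limit its report.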
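
The tokenizer_dict added in PATCH 11 still routes 'java', 'javascript' and 'php' through the Python tokenizer (the TODOs above). Purely as an illustration of how one of those slots could be filled, here is a hypothetical regex-split tokenizer in the same style as tokenize_code_python; the pattern is my own guess, not something from the series:

    import re

    def tokenize_code_java_sketch(code):
        # Same split-on-a-capturing-group trick as tokenize_code_python, but
        # with Java string literals, // line comments, /* ... */ block
        # comments and a few common multi-character operators as delimiters.
        tokens = [
            x
            for x in re.split(
                r'("(?:\\.|[^"\\\n])*"|//.*|/\*(?:.|\n)*?\*/|!=|==|<=|>=|&&|\|\||\+\+|--|->|<<|>>| +|\W)',
                code
            )
            if x and not x.isspace()
        ]
        return tokens, " ".join(tokens)

Registering such a function under the 'java' key of tokenizer_dict would make tokenize_code(pred, 'java') pick it up; any language without an entry keeps falling back to the Python regex.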