diff --git a/code4me-server/src/eval_users.py b/code4me-server/src/eval_users.py
new file mode 100644
index 0000000..ebece6c
--- /dev/null
+++ b/code4me-server/src/eval_users.py
@@ -0,0 +1,159 @@
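+"""Per-user evaluation statistics for the code4me completion logs.
+
+Reads one JSON file per completion from ../data (file names are assumed to
+start with the user id followed by '-'), prints counts of valid data points
+per language, per day, and per user, and then a full per-language metric
+report for the most active users.
+
+Usage (pass all three arguments or none):
+    python eval_users.py <n_languages> <n_days> <n_users>
+"""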
+import copy
+import json
+import os
+import sys
+from test_eval import LANGUAGES, add_data, classify_all_scores, get_language, is_not_valid_data
+
+
+if __name__ == '__main__':
+    data_folder = '../data'
+    directory = os.fsencode(data_folder)
+    # Template: one {'incoder': [], 'unixcoder': []} bucket per language.
+    data_dict = {k: {'incoder': [], 'unixcoder': []} for k in LANGUAGES}
+    languages = {}
+    users = {}
+    dates_data = {}
+    dates_users = {}
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+        # The user id is the file name prefix before the first '-'.
+        user = os.fsdecode(file).split('-')[0]
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except json.JSONDecodeError:
+                continue
+
+        # skip invalid data points
+        if is_not_valid_data(data):
+            continue
+
+        l = get_language(data['language'])
+        languages[l] = languages.get(l, 0) + 1
+        users.setdefault(user, []).append(data)
+
+        # bucket by day (first 10 characters of the ISO timestamp)
+        t = data['completionTimestamp'][:10]
+        dates_data[t] = dates_data.get(t, 0) + 1
+        dates_users.setdefault(t, set()).add(user)
+
+    n_languages = -1
+    n_days = -1
+    n_users = -1
+    if len(sys.argv) == 4:
+        n_languages = int(sys.argv[1])
+        n_days = int(sys.argv[2])
+        n_users = int(sys.argv[3])
+
+    prompt_languages = 'ALL languages sorted by total valid data points:'
+    sorted_languages = {k: v for k, v in sorted(languages.items(), key=lambda item: item[1], reverse=True)}
+    if n_languages > 0:
+        sorted_languages = {k: sorted_languages[k] for k in list(sorted_languages.keys())[:n_languages]}
+        prompt_languages = f'top {n_languages} languages sorted by total valid data points:'
+
+    print(prompt_languages)
+    print(sorted_languages)
+    print()
+
+    prompt_dates_data = 'total new valid data points generated ALL TIME:'
+    sorted_dates_data = {k: v for k, v in sorted(dates_data.items(), reverse=True)}
+    prompt_dates_users = 'number of unique users using code4me ALL TIME:'
+    sorted_dates_users = {k: len(v) for k, v in sorted(dates_users.items(), reverse=True)}
+    if n_days > 0:
+        sorted_dates_data = {k: sorted_dates_data[k] for k in list(sorted_dates_data.keys())[:n_days]}
+        sorted_dates_users = {k: sorted_dates_users[k] for k in list(sorted_dates_users.keys())[:n_days]}
+        prompt_dates_data = f'total new valid data points generated in last {n_days} days:'
+        prompt_dates_users = f'number of unique users using code4me in last {n_days} days:'
+
+    print(prompt_dates_data)
+    print(sorted_dates_data)
+    print()
+
+    print(prompt_dates_users)
+    print(sorted_dates_users)
+    print()
+
+    prompt_users = 'ALL users sorted by total valid data points:'
+    sorted_users = {k: v for k, v in sorted(users.items(), key=lambda item: len(item[1]), reverse=True)}
+    if n_users > 0:
+        sorted_users = {k: sorted_users[k] for k in list(sorted_users.keys())[:n_users]}
+        prompt_users = f'top {n_users} most active users sorted by total valid data points:'
+
+    print(prompt_users)
+    for idx, (user_id, user_data) in enumerate(sorted_users.items()):
+        temp_data_dict = copy.deepcopy(data_dict)
+        print(f'--- user #{idx + 1}: {user_id} ---')
+        for x in user_data:
+            add_data(get_language(x['language']), temp_data_dict, x)
+
+        for language in temp_data_dict.keys():
+            if len(temp_data_dict[language]['incoder']) + len(temp_data_dict[language]['unixcoder']) > 0:
+                # Take the IDE from the first data point that recorded one,
+                # whichever model it belongs to.
+                ide = '(error no ide found)'
+                for y in temp_data_dict[language]['incoder'] + temp_data_dict[language]['unixcoder']:
+                    if 'ide' in y:
+                        ide = y['ide']
+                        break
+
+                print(f'------ {language} in {ide}')
+                classify_all_scores(temp_data_dict[language], language)
+
+    print('done')
diff --git a/code4me-server/src/evaluation.py b/code4me-server/src/evaluation.py
index 98bcd37..0055932 100644
--- a/code4me-server/src/evaluation.py
+++ b/code4me-server/src/evaluation.py
@@ -19,18 +19,9 @@ def compute_rouge(line: str, completion: str):
     }


-def tokenize_code(code):
-    tokens = [
-        x
-        for x in re.split('("""(.|\n)*"""|"(.|\n)*"|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)', code)
-        if x and not x.isspace()
-    ]
-    return tokens, " ".join(tokens)
-
-
-def compute(line: str, completion: str):
-    tokenized_line, tokenized_line_str = tokenize_code(line)
-    tokenized_completion, tokenized_completion_str = tokenize_code(completion)
+def compute(line: str, completion: str, l):
+    tokenized_line, tokenized_line_str = tokenize_code(line, l)
+    tokenized_completion, tokenized_completion_str = tokenize_code(completion, l)
     return {
         "bleu": sentence_bleu([tokenized_line], tokenized_completion, smoothing_function=SmoothingFunction().method2),
         "exactMatch": float(line == completion),
@@ -39,3 +30,46 @@ def compute(line: str, completion: str):
         "rouge": compute_rouge(tokenized_line_str, tokenized_completion_str),
         "statisticTimestamp": datetime.now().isoformat()
     }
+
+
+def tokenize_code_python(code):
+    # Split on string literals, comments, multi-character operators, runs of
+    # spaces and other non-word characters; the string branches are non-greedy
+    # so two literals on one line are not merged into a single token.
+    tokens = [
+        x
+        for x in re.split(
+            '(\'\'\'(?:.|\n)*?\'\'\'|"""(?:.|\n)*?"""|"(?:.|\n)*?"|\'(?:.|\n)*?\'|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)',
+            code
+        )
+        if x and not x.isspace()
+    ]
+    return tokens, " ".join(tokens)
+
+
+# TODO: add java tokenizer
+def tokenize_code_java(code):
+    return tokenize_code_python(code)
+
+
+# TODO: add javascript tokenizer
+def tokenize_code_javascript(code):
+    return tokenize_code_python(code)
+
+
+# TODO: add php tokenizer
+def tokenize_code_php(code):
+    return tokenize_code_python(code)
+
+
+tokenizer_dict = {
+    'python': tokenize_code_python,
+    'java': tokenize_code_java,
+    'javascript': tokenize_code_javascript,
+    'php': tokenize_code_php,
+}
+
+
+def tokenize_code(code, l):
+    # Fall back to the Python tokenizer for languages without their own.
+    return tokenizer_dict.get(l, tokenize_code_python)(code)
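+
+
+# Example: the Python tokenizer keeps operators and comments as single tokens:
+#   tokenize_code('x != 1  # check', 'python')
+#   -> (['x', '!=', '1', '# check'], 'x != 1 # check')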
diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
new file mode 100644
index 0000000..0e1e898
--- /dev/null
+++ b/code4me-server/src/test_eval.py
@@ -0,0 +1,317 @@
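+"""Per-language evaluation of the code4me completion logs.
+
+Every valid data point in ../data is split into an InCoder and a UniXCoder
+bucket per language, and the aggregated metrics (BLEU, exact match,
+Levenshtein, METEOR, ROUGE) are printed per bucket.
+
+Usage:
+    python test_eval.py [detailed]
+
+With 'detailed', scores are further broken down by chosen vs. not-chosen
+predictions, by prediction token length, and by trigger point.
+"""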
+import copy
+import json
+import os
+import sys
+
+import evaluation
+from evaluation import tokenize_code
+
+
+# Canonical language keys used to bucket the data points.
+LANGUAGES = (
+    'python', 'java', 'typescript', 'php', 'vue', 'kotlin', 'typescriptreact',
+    'javascript', 'robotframework', 'json', 'latex', 'html', 'javascriptreact',
+    'xml', 'go', 'ruby', 'csharp', 'blade.php', 'markdown', 'rust', 'css',
+    'objectivec', 'cpp', 'dart', 'sql', 'shellscript', 'prisma', 'yaml', 'txt',
+    'swift', 'c', 'gitignore', 'groovy', 'perl5', 'less', 'scala', 'julia',
+    'other'
+)
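+
+
+# A data point, as used here and in eval_users.py, is assumed to look roughly
+# like this (values are illustrative):
+# {
+#     'completionTimestamp': '2023-01-01T12:00:00',
+#     'language': 'python',
+#     'ide': 'jetbrains',
+#     'model': 'InCoder',
+#     'modelPredictions': {'InCoder': ['...'], 'UniXCoder': ['...']},
+#     'predictions': ['...'],
+#     'chosenPrediction': None,
+#     'groundTruth': '...',
+#     'triggerPoint': None,
+#     'inferenceTime': 100,
+# }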
+
+
+def print_scores(scores):
+    size = len(scores)
+
+    if size == 0:
+        print('n = 0')
+        print()
+        return
+
+    result = [0, 0, 0, 0, 0, 0, 0]
+    for item in scores:
+        result[0] += item['bleu']
+        result[1] += item['exactMatch']
+        result[2] += item['levenshtein']
+        result[3] += item['meteor']
+        result[4] += item['rouge']['precision']
+        result[5] += item['rouge']['recall']
+        result[6] += item['rouge']['f1measure']
+
+    print('n =', size)
+    print('bleu =', result[0] / size)
+    print('exactMatch =', result[1] / size)
+    print('levenshtein =', result[2] / size)
+    print('meteor =', result[3] / size)
+    print('rouge (precision) =', result[4] / size)
+    print('rouge (recall) =', result[5] / size)
+    print('rouge (f1measure) =', result[6] / size)
+    print()
+
+
+def is_not_valid_data(d):
+    return 'groundTruth' not in d or d['groundTruth'].strip() == '' or d['predictions'] == ['']
+
+
+def get_prediction(d):
+    if d['chosenPrediction'] is not None:
+        p = d['chosenPrediction']
+    else:
+        p = d['predictions'][0]
+    return p.strip()
+
+
+def classify_scores(model_data, l):
+    chosen = []
+    not_chosen = []
+    trigger_points = {}
+    inf_time = []
+    token_length = {}
+    model_scores = []
+    print_detailed = 'detailed' in sys.argv
+
+    for d in model_data:
+        # compute the metric scores for this data point
+        truth = d['groundTruth'].strip()
+        pred = get_prediction(d)
+        s = evaluation.compute(truth, pred, l)
+        model_scores.append(s)
+
+        # group the score by trigger point
+        trigger_points.setdefault(d['triggerPoint'], []).append(s)
+
+        # group the score by whether the prediction was chosen
+        if d['chosenPrediction'] is not None:
+            chosen.append(s)
+        else:
+            not_chosen.append(s)
+
+        # record the inference time
+        inf_time.append(d['inferenceTime'])
+
+        # group the score by token length of the prediction
+        tokenized_pred = tokenize_code(pred, l)[0]
+        token_length.setdefault(len(tokenized_pred), []).append(s)
+
+    if inf_time:
+        print('inf time =', sum(inf_time) / len(inf_time))
+    print_scores(model_scores)
+
+    if print_detailed:
+        print('chosen:')
+        print_scores(chosen)
+
+        print('not chosen:')
+        print_scores(not_chosen)
+
+        for i in range(1, 11):
+            if i in token_length:
+                print('token length of prediction =', i)
+                print_scores(token_length[i])
+                del token_length[i]
+        print('token length of prediction > 10')
+        print_scores(sum(token_length.values(), []))
+
+        print('trigger points:')
+        print('manual triggers')
+        if None in trigger_points:
+            print_scores(trigger_points[None])
+            del trigger_points[None]
+        else:
+            print('n = 0')
+            print()
+        # ten most frequent trigger points
+        sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)
+        for tp, tp_scores in sorted_trigger_points[:10]:
+            print(tp)
+            print_scores(tp_scores)
+
+
+def classify_all_scores(language_dict, l):
+    print('incoder:')
+    classify_scores(language_dict['incoder'], l)
+
+    print('unixcoder:')
+    classify_scores(language_dict['unixcoder'], l)
+
+
+def add_data(language_key, d, data):
+    incoder_list = d[language_key]['incoder']
+    unixcoder_list = d[language_key]['unixcoder']
+
+    if 'modelPredictions' in data:
+        # The data point carries predictions from both models: split it into
+        # one data point per model.
+        incoder_prediction = data['modelPredictions']['InCoder'][0]
+        unixcoder_prediction = data['modelPredictions']['UniXCoder'][0]
+        incoder_data = copy.deepcopy(data)
+        unixcoder_data = copy.deepcopy(data)
+
+        if data['chosenPrediction'] is not None:
+            if data['chosenPrediction'] != incoder_prediction:
+                incoder_data['chosenPrediction'] = None
+            if data['chosenPrediction'] != unixcoder_prediction:
+                unixcoder_data['chosenPrediction'] = None
+
+        if incoder_prediction != unixcoder_prediction:
+            incoder_data['predictions'] = [incoder_prediction]
+            unixcoder_data['predictions'] = [unixcoder_prediction]
+
+        # split the logged inference time evenly between the two models
+        incoder_data['inferenceTime'] = incoder_data['inferenceTime'] / 2
+        unixcoder_data['inferenceTime'] = unixcoder_data['inferenceTime'] / 2
+
+        if not is_not_valid_data(incoder_data):
+            incoder_list.append(incoder_data)
+        if not is_not_valid_data(unixcoder_data):
+            unixcoder_list.append(unixcoder_data)
+
+    elif data['model'] in ('InCoder', 'CodeFill'):
+        incoder_list.append(data)
+    else:
+        unixcoder_list.append(data)
+
+
+# Aliases (file extensions, IDE language ids) mapped to canonical keys.
+LANGUAGE_ALIASES = {
+    '.py': 'python', 'py': 'python',
+    '.java': 'java',
+    '.ts': 'typescript', 'ts': 'typescript',
+    '.php': 'php',
+    'kt': 'kotlin',
+    '.tsx': 'typescriptreact', 'tsx': 'typescriptreact', 'typescript jsx': 'typescriptreact',
+    '.js': 'javascript', 'js': 'javascript', 'ecmascript 6': 'javascript',
+    '.json': 'json',
+    '.html': 'html',
+    '.jsx': 'javascriptreact', 'jsx': 'javascriptreact',
+    '.xml': 'xml',
+    '.cs': 'csharp', 'c#': 'csharp', 'cs': 'csharp',
+    '.md': 'markdown', 'md': 'markdown',
+    '.rs': 'rust', 'rs': 'rust',
+    '.css': 'css', 'scss': 'css',
+    '.cpp': 'cpp',
+    '.dart': 'dart',
+    '.sql': 'sql',
+    '.shellscript': 'shellscript', '.sh': 'shellscript', 'sh': 'shellscript',
+    '.prisma': 'prisma',
+    '.yaml': 'yaml', 'yml': 'yaml', '.yml': 'yaml',
+    '.txt': 'txt', 'text': 'txt', 'plaintext': 'txt',
+    '.swift': 'swift',
+    '.c': 'c',
+}
+
+
+def get_language(language):
+    # Canonical names pass through; known aliases are mapped; anything
+    # unrecognised is grouped under 'other'.
+    if language in LANGUAGES:
+        return language
+    return LANGUAGE_ALIASES.get(language, 'other')
+
+
+if __name__ == '__main__':
+    data_folder = '../data'
+    directory = os.fsencode(data_folder)
+    data_dict = {k: {'incoder': [], 'unixcoder': []} for k in LANGUAGES}
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except json.JSONDecodeError:
+                continue
+
+        # skip invalid data points
+        if is_not_valid_data(data):
+            continue
+
+        add_data(get_language(data['language']), data_dict, data)
+
+    # languages with the most valid data points first
+    data_dict = {k: v for k, v in sorted(data_dict.items(), key=lambda item: len(item[1]['incoder']) + len(item[1]['unixcoder']), reverse=True)}
+    for k in data_dict.keys():
+        print('---', k, '---')
+        classify_all_scores(data_dict[k], k)
+
+    print('done')