diff --git a/code4me-server/src/eval_users.py b/code4me-server/src/eval_users.py
new file mode 100644
index 0000000..ebece6c
--- /dev/null
+++ b/code4me-server/src/eval_users.py
@@ -0,0 +1,159 @@
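+"""Per-user evaluation statistics for the code4me completion logs.
+
+Reads one JSON file per completion from ../data (file names are assumed to
+start with the user id followed by '-'), prints counts of valid data points
+per language, per day, and per user, and then a full per-language metric
+report for the most active users.
+
+Usage (pass all three arguments or none):
+    python eval_users.py <n_languages> <n_days> <n_users>
+"""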
+import copy
+import json
+import os
+import sys
+from test_eval import LANGUAGES, add_data, classify_all_scores, get_language, is_not_valid_data
+
+
+if __name__ == '__main__':
+    data_folder = '../data'
+    directory = os.fsencode(data_folder)
+    # Template: one {'incoder': [], 'unixcoder': []} bucket per language.
+    data_dict = {k: {'incoder': [], 'unixcoder': []} for k in LANGUAGES}
+    languages = {}
+    users = {}
+    dates_data = {}
+    dates_users = {}
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+        # The user id is the file name prefix before the first '-'.
+        user = os.fsdecode(file).split('-')[0]
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except json.JSONDecodeError:
+                continue
+
+        # skip invalid data points
+        if is_not_valid_data(data):
+            continue
+
+        l = get_language(data['language'])
+        languages[l] = languages.get(l, 0) + 1
+        users.setdefault(user, []).append(data)
+
+        # bucket by day (first 10 characters of the ISO timestamp)
+        t = data['completionTimestamp'][:10]
+        dates_data[t] = dates_data.get(t, 0) + 1
+        dates_users.setdefault(t, set()).add(user)
+
+    n_languages = -1
+    n_days = -1
+    n_users = -1
+    if len(sys.argv) == 4:
+        n_languages = int(sys.argv[1])
+        n_days = int(sys.argv[2])
+        n_users = int(sys.argv[3])
+
+    prompt_languages = 'ALL languages sorted by total valid data points:'
+    sorted_languages = {k: v for k, v in sorted(languages.items(), key=lambda item: item[1], reverse=True)}
+    if n_languages > 0:
+        sorted_languages = {k: sorted_languages[k] for k in list(sorted_languages.keys())[:n_languages]}
+        prompt_languages = f'top {n_languages} languages sorted by total valid data points:'
+
+    print(prompt_languages)
+    print(sorted_languages)
+    print()
+
+    prompt_dates_data = 'total new valid data points generated ALL TIME:'
+    sorted_dates_data = {k: v for k, v in sorted(dates_data.items(), reverse=True)}
+    prompt_dates_users = 'number of unique users using code4me ALL TIME:'
+    sorted_dates_users = {k: len(v) for k, v in sorted(dates_users.items(), reverse=True)}
+    if n_days > 0:
+        sorted_dates_data = {k: sorted_dates_data[k] for k in list(sorted_dates_data.keys())[:n_days]}
+        sorted_dates_users = {k: sorted_dates_users[k] for k in list(sorted_dates_users.keys())[:n_days]}
+        prompt_dates_data = f'total new valid data points generated in last {n_days} days:'
+        prompt_dates_users = f'number of unique users using code4me in last {n_days} days:'
+
+    print(prompt_dates_data)
+    print(sorted_dates_data)
+    print()
+
+    print(prompt_dates_users)
+    print(sorted_dates_users)
+    print()
+
+    prompt_users = 'ALL users sorted by total valid data points:'
+    sorted_users = {k: v for k, v in sorted(users.items(), key=lambda item: len(item[1]), reverse=True)}
+    if n_users > 0:
+        sorted_users = {k: sorted_users[k] for k in list(sorted_users.keys())[:n_users]}
+        prompt_users = f'top {n_users} most active users sorted by total valid data points:'
+
+    print(prompt_users)
+    for idx, (user_id, user_data) in enumerate(sorted_users.items()):
+        temp_data_dict = copy.deepcopy(data_dict)
+        print(f'--- user #{idx + 1}: {user_id} ---')
+        for x in user_data:
+            add_data(get_language(x['language']), temp_data_dict, x)
+
+        for language in temp_data_dict.keys():
+            if len(temp_data_dict[language]['incoder']) + len(temp_data_dict[language]['unixcoder']) > 0:
+                # Take the IDE from the first data point that recorded one,
+                # whichever model it belongs to.
+                ide = '(error no ide found)'
+                for y in temp_data_dict[language]['incoder'] + temp_data_dict[language]['unixcoder']:
+                    if 'ide' in y:
+                        ide = y['ide']
+                        break
+
+                print(f'------ {language} in {ide}')
+                classify_all_scores(temp_data_dict[language], language)
+
+    print('done')
diff --git a/code4me-server/src/evaluation.py b/code4me-server/src/evaluation.py
index 98bcd37..0055932 100644
--- a/code4me-server/src/evaluation.py
+++ b/code4me-server/src/evaluation.py
@@ -19,18 +19,9 @@ def compute_rouge(line: str, completion: str):
     }


-def tokenize_code(code):
-    tokens = [
-        x
-        for x in re.split('("""(.|\n)*"""|"(.|\n)*"|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)', code)
-        if x and not x.isspace()
-    ]
-    return tokens, " ".join(tokens)
-
-
-def compute(line: str, completion: str):
-    tokenized_line, tokenized_line_str = tokenize_code(line)
-    tokenized_completion, tokenized_completion_str = tokenize_code(completion)
+def compute(line: str, completion: str, l):
+    tokenized_line, tokenized_line_str = tokenize_code(line, l)
+    tokenized_completion, tokenized_completion_str = tokenize_code(completion, l)
     return {
         "bleu": sentence_bleu([tokenized_line], tokenized_completion, smoothing_function=SmoothingFunction().method2),
         "exactMatch": float(line == completion),
@@ -39,3 +30,46 @@ def compute(line: str, completion: str):
         "rouge": compute_rouge(tokenized_line_str, tokenized_completion_str),
         "statisticTimestamp": datetime.now().isoformat()
     }
+
+
+def tokenize_code_python(code):
+    # Split on string literals, comments, multi-character operators, runs of
+    # spaces and other non-word characters; the string branches are non-greedy
+    # so two literals on one line are not merged into a single token.
+    tokens = [
+        x
+        for x in re.split(
+            '(\'\'\'(?:.|\n)*?\'\'\'|"""(?:.|\n)*?"""|"(?:.|\n)*?"|\'(?:.|\n)*?\'|#.*|!=|\*\*|<<|>>|==|>=|<=| +|\W)',
+            code
+        )
+        if x and not x.isspace()
+    ]
+    return tokens, " ".join(tokens)
+
+
+# TODO: add java tokenizer
+def tokenize_code_java(code):
+    return tokenize_code_python(code)
+
+
+# TODO: add javascript tokenizer
+def tokenize_code_javascript(code):
+    return tokenize_code_python(code)
+
+
+# TODO: add php tokenizer
+def tokenize_code_php(code):
+    return tokenize_code_python(code)
+
+
+tokenizer_dict = {
+    'python': tokenize_code_python,
+    'java': tokenize_code_java,
+    'javascript': tokenize_code_javascript,
+    'php': tokenize_code_php,
+}
+
+
+def tokenize_code(code, l):
+    # Fall back to the Python tokenizer for languages without their own.
+    return tokenizer_dict.get(l, tokenize_code_python)(code)
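+
+
+# Example: the Python tokenizer keeps operators and comments as single tokens:
+#   tokenize_code('x != 1  # check', 'python')
+#   -> (['x', '!=', '1', '# check'], 'x != 1 # check')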
diff --git a/code4me-server/src/test_eval.py b/code4me-server/src/test_eval.py
new file mode 100644
index 0000000..0e1e898
--- /dev/null
+++ b/code4me-server/src/test_eval.py
@@ -0,0 +1,317 @@
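+"""Per-language evaluation of the code4me completion logs.
+
+Every valid data point in ../data is split into an InCoder and a UniXCoder
+bucket per language, and the aggregated metrics (BLEU, exact match,
+Levenshtein, METEOR, ROUGE) are printed per bucket.
+
+Usage:
+    python test_eval.py [detailed]
+
+With 'detailed', scores are further broken down by chosen vs. not-chosen
+predictions, by prediction token length, and by trigger point.
+"""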
+import copy
+import json
+import os
+import sys
+
+import evaluation
+from evaluation import tokenize_code
+
+
+# Canonical language keys used to bucket the data points.
+LANGUAGES = (
+    'python', 'java', 'typescript', 'php', 'vue', 'kotlin', 'typescriptreact',
+    'javascript', 'robotframework', 'json', 'latex', 'html', 'javascriptreact',
+    'xml', 'go', 'ruby', 'csharp', 'blade.php', 'markdown', 'rust', 'css',
+    'objectivec', 'cpp', 'dart', 'sql', 'shellscript', 'prisma', 'yaml', 'txt',
+    'swift', 'c', 'gitignore', 'groovy', 'perl5', 'less', 'scala', 'julia',
+    'other'
+)
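+
+
+# A data point, as used here and in eval_users.py, is assumed to look roughly
+# like this (values are illustrative):
+# {
+#     'completionTimestamp': '2023-01-01T12:00:00',
+#     'language': 'python',
+#     'ide': 'jetbrains',
+#     'model': 'InCoder',
+#     'modelPredictions': {'InCoder': ['...'], 'UniXCoder': ['...']},
+#     'predictions': ['...'],
+#     'chosenPrediction': None,
+#     'groundTruth': '...',
+#     'triggerPoint': None,
+#     'inferenceTime': 100,
+# }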
+
+
+def print_scores(scores):
+    size = len(scores)
+
+    if size == 0:
+        print('n = 0')
+        print()
+        return
+
+    result = [0, 0, 0, 0, 0, 0, 0]
+    for item in scores:
+        result[0] += item['bleu']
+        result[1] += item['exactMatch']
+        result[2] += item['levenshtein']
+        result[3] += item['meteor']
+        result[4] += item['rouge']['precision']
+        result[5] += item['rouge']['recall']
+        result[6] += item['rouge']['f1measure']
+
+    print('n =', size)
+    print('bleu =', result[0] / size)
+    print('exactMatch =', result[1] / size)
+    print('levenshtein =', result[2] / size)
+    print('meteor =', result[3] / size)
+    print('rouge (precision) =', result[4] / size)
+    print('rouge (recall) =', result[5] / size)
+    print('rouge (f1measure) =', result[6] / size)
+    print()
+
+
+def is_not_valid_data(d):
+    return 'groundTruth' not in d or d['groundTruth'].strip() == '' or d['predictions'] == ['']
+
+
+def get_prediction(d):
+    if d['chosenPrediction'] is not None:
+        p = d['chosenPrediction']
+    else:
+        p = d['predictions'][0]
+    return p.strip()
+
+
+def classify_scores(model_data, l):
+    chosen = []
+    not_chosen = []
+    trigger_points = {}
+    inf_time = []
+    token_length = {}
+    model_scores = []
+    print_detailed = 'detailed' in sys.argv
+
+    for d in model_data:
+        # compute the metric scores for this data point
+        truth = d['groundTruth'].strip()
+        pred = get_prediction(d)
+        s = evaluation.compute(truth, pred, l)
+        model_scores.append(s)
+
+        # group the score by trigger point
+        trigger_points.setdefault(d['triggerPoint'], []).append(s)
+
+        # group the score by whether the prediction was chosen
+        if d['chosenPrediction'] is not None:
+            chosen.append(s)
+        else:
+            not_chosen.append(s)
+
+        # record the inference time
+        inf_time.append(d['inferenceTime'])
+
+        # group the score by token length of the prediction
+        tokenized_pred = tokenize_code(pred, l)[0]
+        token_length.setdefault(len(tokenized_pred), []).append(s)
+
+    if inf_time:
+        print('inf time =', sum(inf_time) / len(inf_time))
+    print_scores(model_scores)
+
+    if print_detailed:
+        print('chosen:')
+        print_scores(chosen)
+
+        print('not chosen:')
+        print_scores(not_chosen)
+
+        for i in range(1, 11):
+            if i in token_length:
+                print('token length of prediction =', i)
+                print_scores(token_length[i])
+                del token_length[i]
+        print('token length of prediction > 10')
+        print_scores(sum(token_length.values(), []))
+
+        print('trigger points:')
+        print('manual triggers')
+        if None in trigger_points:
+            print_scores(trigger_points[None])
+            del trigger_points[None]
+        else:
+            print('n = 0')
+            print()
+        # ten most frequent trigger points
+        sorted_trigger_points = sorted(trigger_points.items(), key=lambda x: len(x[1]), reverse=True)
+        for tp, tp_scores in sorted_trigger_points[:10]:
+            print(tp)
+            print_scores(tp_scores)
+
+
+def classify_all_scores(language_dict, l):
+    print('incoder:')
+    classify_scores(language_dict['incoder'], l)
+
+    print('unixcoder:')
+    classify_scores(language_dict['unixcoder'], l)
+
+
+def add_data(language_key, d, data):
+    incoder_list = d[language_key]['incoder']
+    unixcoder_list = d[language_key]['unixcoder']
+
+    if 'modelPredictions' in data:
+        # The data point carries predictions from both models: split it into
+        # one data point per model.
+        incoder_prediction = data['modelPredictions']['InCoder'][0]
+        unixcoder_prediction = data['modelPredictions']['UniXCoder'][0]
+        incoder_data = copy.deepcopy(data)
+        unixcoder_data = copy.deepcopy(data)
+
+        if data['chosenPrediction'] is not None:
+            if data['chosenPrediction'] != incoder_prediction:
+                incoder_data['chosenPrediction'] = None
+            if data['chosenPrediction'] != unixcoder_prediction:
+                unixcoder_data['chosenPrediction'] = None
+
+        if incoder_prediction != unixcoder_prediction:
+            incoder_data['predictions'] = [incoder_prediction]
+            unixcoder_data['predictions'] = [unixcoder_prediction]
+
+        # split the logged inference time evenly between the two models
+        incoder_data['inferenceTime'] = incoder_data['inferenceTime'] / 2
+        unixcoder_data['inferenceTime'] = unixcoder_data['inferenceTime'] / 2
+
+        if not is_not_valid_data(incoder_data):
+            incoder_list.append(incoder_data)
+        if not is_not_valid_data(unixcoder_data):
+            unixcoder_list.append(unixcoder_data)
+
+    elif data['model'] in ('InCoder', 'CodeFill'):
+        incoder_list.append(data)
+    else:
+        unixcoder_list.append(data)
+
+
+# Aliases (file extensions, IDE language ids) mapped to canonical keys.
+LANGUAGE_ALIASES = {
+    '.py': 'python', 'py': 'python',
+    '.java': 'java',
+    '.ts': 'typescript', 'ts': 'typescript',
+    '.php': 'php',
+    'kt': 'kotlin',
+    '.tsx': 'typescriptreact', 'tsx': 'typescriptreact', 'typescript jsx': 'typescriptreact',
+    '.js': 'javascript', 'js': 'javascript', 'ecmascript 6': 'javascript',
+    '.json': 'json',
+    '.html': 'html',
+    '.jsx': 'javascriptreact', 'jsx': 'javascriptreact',
+    '.xml': 'xml',
+    '.cs': 'csharp', 'c#': 'csharp', 'cs': 'csharp',
+    '.md': 'markdown', 'md': 'markdown',
+    '.rs': 'rust', 'rs': 'rust',
+    '.css': 'css', 'scss': 'css',
+    '.cpp': 'cpp',
+    '.dart': 'dart',
+    '.sql': 'sql',
+    '.shellscript': 'shellscript', '.sh': 'shellscript', 'sh': 'shellscript',
+    '.prisma': 'prisma',
+    '.yaml': 'yaml', 'yml': 'yaml', '.yml': 'yaml',
+    '.txt': 'txt', 'text': 'txt', 'plaintext': 'txt',
+    '.swift': 'swift',
+    '.c': 'c',
+}
+
+
+def get_language(language):
+    # Canonical names pass through; known aliases are mapped; anything
+    # unrecognised is grouped under 'other'.
+    if language in LANGUAGES:
+        return language
+    return LANGUAGE_ALIASES.get(language, 'other')
+
+
+if __name__ == '__main__':
+    data_folder = '../data'
+    directory = os.fsencode(data_folder)
+    data_dict = {k: {'incoder': [], 'unixcoder': []} for k in LANGUAGES}
+
+    for file in os.listdir(directory):
+        filename = data_folder + '/' + os.fsdecode(file)
+
+        with open(filename) as json_file:
+            try:
+                data = json.load(json_file)
+            except json.JSONDecodeError:
+                continue
+
+        # skip invalid data points
+        if is_not_valid_data(data):
+            continue
+
+        add_data(get_language(data['language']), data_dict, data)
+
+    # languages with the most valid data points first
+    data_dict = {k: v for k, v in sorted(data_dict.items(), key=lambda item: len(item[1]['incoder']) + len(item[1]['unixcoder']), reverse=True)}
+    for k in data_dict.keys():
+        print('---', k, '---')
+        classify_all_scores(data_dict[k], k)
+
+    print('done')