
Commit 701a73e
Commit message: update benchmark
Parent: 85d0583
File tree: 6 files changed (+945, -78 lines)


codemmlu/index.html

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@
   <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
         rel="stylesheet">
   <!-- TODO: replace with CodeMMLU logo -->
-  <link rel="icon" href="static/images/repoexec_logo.png">
+  <link rel="icon" href="static/images/codemmlu-logo.png">

   <link rel="stylesheet" href="static/css/bulma.min.css">
   <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
@@ -512,7 +512,7 @@ <h2 class="title is-3">Evaluation Results</h2>
       <td class="tg-0lax">47.94</td>
     </tr>
   </tbody></table>
-  <figcaption><i><b>Summary performance of LLM family on CodeMMLU.</b>The evaluation results (accuracy %) of different language models across CodeMMLU task.</i></figcaption>
+  <figcaption><i><b>Summary performance of LLM family on CodeMMLU.</b> The evaluation results (accuracy %) of different language models across CodeMMLU task.</i></figcaption>
   </div>
  <!-- <figure>
    <img src="static/images/llm_result.png", width="100%">
[Binary image file added, 32.5 KB (preview not rendered)]
Lines changed: 301 additions & 0 deletions
@@ -0,0 +1,301 @@
+{
+    "CodeLlama-34B-Instruct": {
+        "link": "https://huggingface.co/codellama/CodeLlama-34b-hf",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 38.73
+        },
+        "prompted": true,
+        "size": 34,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 942
+    },
+    "Meta-Llama-3-70B": {
+        "link": "https://huggingface.co/meta-llama/Meta-Llama-3-70B",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 48.98
+        },
+        "prompted": false,
+        "size": 70,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Meta-Llama-3-70B-Instruct": {
+        "link": "https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 62.45
+        },
+        "prompted": true,
+        "size": 70,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Meta-Llama-3.1-70B-Instruct": {
+        "link": "https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 60
+        },
+        "prompted": true,
+        "size": 70,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Meta-Llama-3.1-70B": {
+        "link": "https://huggingface.co/meta-llama/Llama-3.1-70B",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 37.56
+        },
+        "prompted": false,
+        "size": 70,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Mistral-7B-Instruct-v0.3": {
+        "link": "https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 43.33
+        },
+        "prompted": true,
+        "size": 7,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Mixtral-8x7B-Instruct-v0.1": {
+        "link": "https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 42.96
+        },
+        "prompted": true,
+        "size": 7,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Codestral-22B-v0.1": {
+        "link": "https://huggingface.co/mistralai/Codestral-22B-v0.1",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 47.6
+        },
+        "prompted": true,
+        "size": 22,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Phi-3-medium-128k-instruct": {
+        "link": "https://huggingface.co/microsoft/Phi-3-medium-128k-instruct",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 48.03
+        },
+        "prompted": true,
+        "size": 14,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Phi-3-mini-128k-instruct": {
+        "link": "https://huggingface.co/microsoft/Phi-3-mini-128k-instruct",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 37.93
+        },
+        "prompted": true,
+        "size": 3.8,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Qwen2-57B-A14B-Instruct": {
+        "link": "https://huggingface.co/Qwen/Qwen2-57B-A14B-Instruct",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 46.34
+        },
+        "prompted": true,
+        "size": 57,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "CodeQwen1.5-7B-Chat": {
+        "link": "https://huggingface.co/Qwen/CodeQwen1.5-7B-Chat",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 49.82
+        },
+        "prompted": true,
+        "size": 7,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Yi-1.5-34B-Chat": {
+        "link": "https://huggingface.co/01-ai/Yi-1.5-34B-Chat",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 49.39
+        },
+        "prompted": true,
+        "size": 34,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Yi-1.5-9B-Chat": {
+        "link": "https://huggingface.co/01-ai/Yi-1.5-9B-Chat",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 47.23
+        },
+        "prompted": true,
+        "size": 9,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "DeepSeek-coder-7b-instruct-v1.5": {
+        "link": "https://huggingface.co/deepseek-ai/deepseek-coder-7b-instruct-v1.5",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 41.21
+        },
+        "prompted": true,
+        "size": 7,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "DeepSeek-coder-33b-instruct": {
+        "link": "https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 36.6
+        },
+        "prompted": true,
+        "size": 33,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "DeepSeek-moe-16b-chat": {
+        "link": "https://huggingface.co/deepseek-ai/deepseek-moe-16b-chat",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 31.01
+        },
+        "prompted": true,
+        "size": 16.4,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "DeepSeek-Coder-V2-Lite-Instruct": {
+        "link": "https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 46.51
+        },
+        "prompted": true,
+        "size": 16,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "InternLM2-5-20b-chat": {
+        "link": "https://huggingface.co/internlm/internlm2_5-20b-chat",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 44.89
+        },
+        "prompted": true,
+        "size": 20,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "StarCoder2-15b-instruct-v0.1": {
+        "link": "https://huggingface.co/bigcode/starcoder2-15b-instruct-v0.1",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 47.94
+        },
+        "prompted": true,
+        "size": 15,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "Claude-3-sonnet@20240229": {
+        "link": "",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 53.97
+        },
+        "prompted": true,
+        "size": null,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "GPT-4o-2024-05-13": {
+        "link": "",
+        "open-data": "None",
+        "pass@1": {
+            "instruct": null,
+            "complete": 67
+        },
+        "prompted": true,
+        "size": null,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    },
+    "GPT-3.5-turbo-0613": {
+        "link": "",
+        "open-data": null,
+        "pass@1": {
+            "instruct": null,
+            "complete": 51.7
+        },
+        "prompted": true,
+        "size": null,
+        "direct_complete": false,
+        "lazy": false,
+        "elo_mle": 874
+    }
+}
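
The new results file is a single JSON object keyed by model name; the CodeMMLU score shown in the table (for example, 47.94 for StarCoder2-15b-instruct-v0.1) sits under pass@1.complete. Below is a minimal sketch of how a leaderboard page might load and rank this file client-side. It is not code from this commit: the fetch path "results/codemmlu.json" and the function name are hypothetical, since the file's path is not captured in this view; only fields that appear in the JSON above are read.

// Sketch only, not part of this commit. Assumes the JSON above is served next to the
// leaderboard page; "results/codemmlu.json" is a placeholder path, not the real filename.
async function loadCodeMMLUResults(url) {
  const resp = await fetch(url);
  const data = await resp.json();          // object keyed by model name, as in the file above
  return Object.entries(data)
    .map(([model, info]) => ({
      model,
      size: info["size"],                  // parameter count in billions; null for closed models
      score: info["pass@1"]["complete"],   // CodeMMLU accuracy (%)
    }))
    .sort((a, b) => b.score - a.score);    // highest accuracy first
}

// Example usage:
// loadCodeMMLUResults("results/codemmlu.json").then((rows) => console.table(rows));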
[Binary image file added, 32.5 KB (preview not rendered)]

leaderboards/codemmlu/index.html

Lines changed: 20 additions & 3 deletions
@@ -19,7 +19,7 @@
     <script src="https://cdn.jsdelivr.net/npm/echarts@5.3.3/dist/echarts.min.js"></script>
     <link
       rel="icon"
-      href=""
+      href="images/codemmlu-logo.png"
     />
     <link
       rel="stylesheet"
@@ -501,7 +501,7 @@ <h3>🙏 Acknowledgements</h3>
        ],
      };

-      const theaders = ["Model", "Accuracy"];
+      const theaders = ["Model", "Syntactic Accuracy", "Semantic Accuracy", "Real-task Accuracy", "CodeMMLU"];

      // score: 'complete', 'instruct'
      const displayTable = (table, score) => {
@@ -524,7 +524,7 @@ <h3>🙏 Acknowledgements</h3>
      theaders.forEach(function (header) {
        var th = document.createElement("th");
        th.textContent = header;
-        if (header == "Pass@1") {
+        if (header == "CodeMMLU") {
          th.style.backgroundColor = "#EEFFEE";
        }
        headerRow.appendChild(th);
@@ -588,7 +588,24 @@ <h3>🙏 Acknowledgements</h3>
      // promptedSymbol.textContent = "💙";
      // modelCell.appendChild(promptedSymbol);
      // }
+
+      // Add Syntactic Accuracy column
+
+
      dataRow.appendChild(modelCell);
+
+      var syntacticCell = document.createElement("td");
+      syntacticCell.textContent = row["syntactic_accuracy"] || "-";
+      dataRow.appendChild(syntacticCell);
+
+      var semanticCell = document.createElement("td");
+      semanticCell.textContent = row["semantic_accuracy"] || "-";
+      dataRow.appendChild(semanticCell);
+
+      var rtaskCell = document.createElement("td");
+      rtaskCell.textContent = row["realtask_accuracy"] || "-";
+      dataRow.appendChild(rtaskCell);
+
      var passCell = document.createElement("td");
      passCell.classList.add("text-nowrap");
      if (lazy) {
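
The three added cell blocks are near-identical, differing only in the field name they read (syntactic_accuracy, semantic_accuracy, realtask_accuracy). A small helper could generate all three; the sketch below is only an illustration of that possible refactor, not code from this commit, and it reuses exactly the field names and the "-" fallback that the diff above already uses.

// Sketch of a possible refactor, not part of the commit: append one <td> per accuracy split.
function appendAccuracyCells(dataRow, row) {
  ["syntactic_accuracy", "semantic_accuracy", "realtask_accuracy"].forEach(function (key) {
    var cell = document.createElement("td");
    cell.textContent = row[key] || "-";   // falls back to "-" when a row has no score for that split
    dataRow.appendChild(cell);
  });
}

// Usage inside displayTable, replacing the three blocks above:
// appendAccuracyCells(dataRow, row);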
