
Commit 07d4c7d

Add LLM model evaluation demo (#60)
1. Add a doc about the LLM model evaluation demo. 2. Add `libblas-dev` and `liblapack-dev` to the single-user image to run LLM evaluation.
1 parent 7a56281 commit 07d4c7d

File tree

11 files changed: +267 -10 lines changed


MODULE.bazel

Lines changed: 16 additions & 3 deletions
@@ -6,7 +6,7 @@
 ###############################################################################
 
 # rules_proto
-bazel_dep(name = "rules_proto", version = "7.0.2")
+bazel_dep(name = "rules_proto", version = "7.1.0")
 
 # rules_python
 bazel_dep(name = "rules_python", version = "0.40.0")
@@ -68,12 +68,12 @@ use_repo(
 # rules_pkg
 bazel_dep(name = "rules_pkg", version = "1.0.1")
 bazel_dep(name = "rules_go", version = "0.50.1")
-bazel_dep(name = "gazelle", version = "0.40.0")
+bazel_dep(name = "gazelle", version = "0.43.0")
 
 go_sdk = use_extension("@rules_go//go:extensions.bzl", "go_sdk")
 go_sdk.download(
     name = "go_sdk",
-    version = "1.23.0",
+    version = "1.23.8",
 )
 use_repo(go_sdk, "go_sdk")
 
@@ -118,3 +118,16 @@ use_repo(
 )
 
 bazel_dep(name = "rules_multirun", version = "0.10.0")
+bazel_dep(name = "rules_distroless", version = "0.5.1")
+
+apt = use_extension(
+    "@rules_distroless//apt:extensions.bzl",
+    "apt",
+    dev_dependency = True,
+)
+apt.install(
+    name = "noble",
+    lock = "//app/jupyterlab_manatee:noble.lock.json",
+    manifest = "//app/jupyterlab_manatee:noble.yaml",
+)
+use_repo(apt, "noble")

app/jupyterlab_manatee/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@ oci_image(
     name = "image",
     base = "@scipy-notebook_linux_amd64",
     tars = [
+        "@noble//:flat",
         ":dev_wheel_tar",
         ":hooks_tar",
     ],
app/jupyterlab_manatee/noble.lock.json

Lines changed: 150 additions & 0 deletions

{
  "packages": [
    {
      "arch": "amd64",
      "dependencies": [
        {
          "key": "libblas3_3.12.0-3_amd64",
          "name": "libblas3",
          "version": "3.12.0-3"
        },
        {
          "key": "libgcc-s1_14-20240221-2.1ubuntu1_amd64",
          "name": "libgcc-s1",
          "version": "14-20240221-2.1ubuntu1"
        },
        {
          "key": "libc6_2.39-0ubuntu2_amd64",
          "name": "libc6",
          "version": "2.39-0ubuntu2"
        },
        {
          "key": "gcc-14-base_14-20240221-2.1ubuntu1_amd64",
          "name": "gcc-14-base",
          "version": "14-20240221-2.1ubuntu1"
        }
      ],
      "key": "libblas-dev_3.12.0-3_amd64",
      "name": "libblas-dev",
      "sha256": "12c492a0d1ee2c2e765b394c51de330b3c9f1a8579cde81e0d14b35e8df75a15",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/l/lapack/libblas-dev_3.12.0-3_amd64.deb"
      ],
      "version": "3.12.0-3"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "libblas3_3.12.0-3_amd64",
      "name": "libblas3",
      "sha256": "2db61728fe84dad1c86a55e92959333136a1cdc3286d437e5f3456777147a75c",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/l/lapack/libblas3_3.12.0-3_amd64.deb"
      ],
      "version": "3.12.0-3"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "libgcc-s1_14-20240221-2.1ubuntu1_amd64",
      "name": "libgcc-s1",
      "sha256": "ffc195df7e897aaec468e8f62b08660cc711c7449113102491fdd6baa6901f6d",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/g/gcc-14/libgcc-s1_14-20240221-2.1ubuntu1_amd64.deb"
      ],
      "version": "14-20240221-2.1ubuntu1"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "libc6_2.39-0ubuntu2_amd64",
      "name": "libc6",
      "sha256": "4bd128b75db38b7e9147c0333908e2c7fbc41631f284360f95118fe1c6c162f3",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/g/glibc/libc6_2.39-0ubuntu2_amd64.deb"
      ],
      "version": "2.39-0ubuntu2"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "gcc-14-base_14-20240221-2.1ubuntu1_amd64",
      "name": "gcc-14-base",
      "sha256": "2e1ae2c2ccf2d1b6d09c657af1492a8b7a348e899f9ad25d4925b170571a0887",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/g/gcc-14/gcc-14-base_14-20240221-2.1ubuntu1_amd64.deb"
      ],
      "version": "14-20240221-2.1ubuntu1"
    },
    {
      "arch": "amd64",
      "dependencies": [
        {
          "key": "libblas-dev_3.12.0-3_amd64",
          "name": "libblas-dev",
          "version": "3.12.0-3"
        },
        {
          "key": "libblas3_3.12.0-3_amd64",
          "name": "libblas3",
          "version": "3.12.0-3"
        },
        {
          "key": "libgcc-s1_14-20240221-2.1ubuntu1_amd64",
          "name": "libgcc-s1",
          "version": "14-20240221-2.1ubuntu1"
        },
        {
          "key": "libc6_2.39-0ubuntu2_amd64",
          "name": "libc6",
          "version": "2.39-0ubuntu2"
        },
        {
          "key": "gcc-14-base_14-20240221-2.1ubuntu1_amd64",
          "name": "gcc-14-base",
          "version": "14-20240221-2.1ubuntu1"
        },
        {
          "key": "liblapack3_3.12.0-3_amd64",
          "name": "liblapack3",
          "version": "3.12.0-3"
        },
        {
          "key": "libgfortran5_14-20240221-2.1ubuntu1_amd64",
          "name": "libgfortran5",
          "version": "14-20240221-2.1ubuntu1"
        }
      ],
      "key": "liblapack-dev_3.12.0-3_amd64",
      "name": "liblapack-dev",
      "sha256": "35010504e796d0ba2136a47fe6a49a0c21f7e02276b22efd7f87b402815e98ca",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/l/lapack/liblapack-dev_3.12.0-3_amd64.deb"
      ],
      "version": "3.12.0-3"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "liblapack3_3.12.0-3_amd64",
      "name": "liblapack3",
      "sha256": "b990a000d150d2a2ae6fc3dcdb2f03ee5e0284babb08bd1fa1e573e0b0d041a2",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/l/lapack/liblapack3_3.12.0-3_amd64.deb"
      ],
      "version": "3.12.0-3"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "libgfortran5_14-20240221-2.1ubuntu1_amd64",
      "name": "libgfortran5",
      "sha256": "8c1c6e05318a027d27a808ee6d24b2fa171dc702f593ff35f3b06c341ad1762e",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/g/gcc-14/libgfortran5_14-20240221-2.1ubuntu1_amd64.deb"
      ],
      "version": "14-20240221-2.1ubuntu1"
    }
  ],
  "version": 1
}

app/jupyterlab_manatee/noble.yaml

Lines changed: 25 additions & 0 deletions
# Packages installed into the single-user JupyterLab image.
#
# Anytime this file is changed, the lockfile needs to be regenerated.
#
# To regenerate noble.lock.json, run:
#
#   bazel run @noble//:lock
#
# See the apt extension in MODULE.bazel.
version: 1

sources:
  - channel: noble main
    url: https://snapshot.ubuntu.com/ubuntu/20240301T030400Z
  - channel: noble-security main
    url: https://snapshot.ubuntu.com/ubuntu/20240301T030400Z
  - channel: noble-updates main
    url: https://snapshot.ubuntu.com/ubuntu/20240301T030400Z

archs:
  - "amd64"

packages:
  - "libblas-dev"
  - "liblapack-dev"
app/jupyterlab_manatee/src/jobs.tsx

Lines changed: 1 addition & 1 deletion
@@ -154,7 +154,7 @@ const handleGetAttestation = async (record: Job) => {
   const result = await response.json();
   if (result.code === 0) {
     await showDialog({
-      title: "Get Attestation Report Successful",
+      title: "Get Attestation Report Successfully",
       body: 'OIDC Token: ' + result.token,
       buttons: [Dialog.okButton(), Dialog.cancelButton()]
     });

deployment/jupyterhub/deploy.sh

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ helm upgrade --cleanup-on-fail \
   --set singleuser.extraEnv.PROJECT_ID=${project_id} \
   --set singleuser.extraEnv.KEY_LOCALTION=${region} \
   --set singleuser.networkPolicy.enabled=false \
+  --set singleuser.storage.capacity=20Gi \
   --install $helm_name jupyterhub/jupyterhub \
   --namespace ${namespace} \
   --version=3.0.3 \
docs/getting-started/llm-model-evaluation.md

Lines changed: 65 additions & 0 deletions

# Trusted LLM Model Evaluation Example

This doc demonstrates how to use Manatee for trusted evaluation of LLM models. Manatee integrates seamlessly with lm-evaluation-harness, enabling comprehensive testing of LLM models across a wide range of evaluation tasks.

Scenario:
Suppose a model provider owns a proprietary LLM. The provider wishes to prove that the model performs as publicly claimed (e.g., in terms of fairness or accuracy). The evaluation process is divided into two stages:
- Stage 1: The script runs on a mock (fake) model to illustrate the workflow.
- Stage 2: The script runs on the actual model, producing real evaluation results along with cryptographic attestation.

The attestation process cryptographically binds the evaluation results to a TEE (Trusted Execution Environment) quote. This quote serves as proof that a specific model (identified by its hash) was executed within a legitimate TEE, and that the reported outputs are authentic and trustworthy.

## Install lm-evaluation-harness

`lm-evaluation-harness` provides a unified framework to test generative language models on a large number of different evaluation tasks.

```python
!git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
%pip install -e ./lm-evaluation-harness[wandb]
```

## Model Selection (Hugging Face Example)

```python
HG_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
import lm_eval
from lm_eval.utils import setup_logging

lm_obj = lm_eval.models.huggingface.HFLM(pretrained=HG_MODEL, device="cpu")
task_manager = lm_eval.tasks.TaskManager()
lm_obj.get_model_info()
```

## Task Selection

### BBQ: A Hand-Built Bias Benchmark for Question Answering

BBQ measures bias in model output on the question answering task. The dataset consists of question sets, constructed by the authors, that highlight attested social biases against people belonging to protected classes along nine social dimensions relevant to U.S. English-speaking contexts. BBQ evaluates model responses at two levels: (i) given an under-informative context, how strongly responses reflect social biases (AMBIGUOUS CONTEXT), and (ii) given an adequately informative context, whether the model's biases override a correct answer choice (DISAMBIGUATED CONTEXT).

### CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models

CrowS-Pairs is a challenge set for evaluating language models (LMs) on their tendency to generate biased outputs. CrowS-Pairs comes in two languages, and the English subset has a newer version that fixes some issues with the original.

### Simple Cooccurrence Bias

This bias evaluation relies on simple templates for prompting LMs and tests for bias in next-word prediction. For instance, given a context such as "The {occupation} was a", masculine gender identifiers are found to be more likely to follow than feminine ones. Following Brown et al. (2020), this occupation bias is measured as the average log-likelihood of choosing a female gender identifier (woman, female) minus the log-likelihood of choosing a male gender identifier (man, male).

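As a concrete (made-up) illustration of that score, the sketch below takes the difference of average log-likelihoods for a single template; the numbers are invented and the snippet is not part of lm-evaluation-harness.

```python
# Hypothetical illustration of the simple cooccurrence bias score.
# The log-likelihoods below are invented, not real model outputs.
female_ll = {"woman": -2.1, "female": -3.0}
male_ll = {"man": -1.5, "male": -2.4}

avg_female = sum(female_ll.values()) / len(female_ll)
avg_male = sum(male_ll.values()) / len(male_ll)

# A negative score means male identifiers are more likely to follow the template.
bias_score = avg_female - avg_male
print(f"bias score: {bias_score:.2f}")
```
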
### Winogender: Gender Bias in Coreference Resolution

Winogender is designed to measure gender bias in coreference resolution systems, but it has also been used for evaluating language models. The dataset consists of simple sentences with an occupation, a participant, and a pronoun, where the pronoun refers to either the occupation or the participant. Each example comes in three variations in which only the gender of the pronoun changes, to test how the pronoun affects the prediction. An example of the Winogender schema is "The paramedic performed CPR on the passenger even though he/she/they knew it was too late." This implementation follows the description in the paper "Language Models are Few-Shot Learners", which uses prompts.

```python
import datasets
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

# Call simple_evaluate to run the selected bias tasks.
tee_llm_evaluation_result = lm_eval.simple_evaluate(
    model=lm_obj,
    tasks=["winogender", "simple_cooccurrence_bias", "crows_pairs_english"],
    num_fewshot=0,
    task_manager=task_manager,
    log_samples=True,
    batch_size=1024,
    confirm_run_unsafe_code=True,
)
tee_llm_evaluation_result["results"]
```

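To make the scores part of the job's output, write them to the file that the job produces. A minimal sketch, assuming the results are serialized as JSON to a hypothetical path (the actual output location depends on how the Manatee job is configured):

```python
import json

# Hypothetical output path; Manatee binds the hash of the job's output file to the TEE quote.
with open("evaluation_output.json", "w") as f:
    json.dump(tee_llm_evaluation_result["results"], f, indent=2, default=str)
```
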
## Get the Result and TEE Attestation Report

After the job finishes, download the result along with the attestation report. The `eat_nonce` in the attestation report is the hash of the output file.
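A minimal verification sketch, assuming the nonce is the hex-encoded SHA-256 digest of the downloaded output file and the report is delivered as a JWT-style token; the file name and helper below are illustrative rather than part of Manatee's API, and the token's signature still needs to be verified against its issuer:

```python
import base64
import hashlib
import json

def output_matches_nonce(token: str, output_path: str) -> bool:
    """Check that the downloaded output file matches the eat_nonce claim."""
    digest = hashlib.sha256(open(output_path, "rb").read()).hexdigest()
    payload_b64 = token.split(".")[1]
    payload_b64 += "=" * (-len(payload_b64) % 4)  # restore base64url padding
    claims = json.loads(base64.urlsafe_b64decode(payload_b64))
    return claims.get("eat_nonce") == digest

# Example: output_matches_nonce(oidc_token, "evaluation_output.json")
```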

go.mod

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 module github.com/manatee-project/manatee
 
-go 1.23.0
+go 1.23.8
 
 require (
 	cloud.google.com/go/compute v1.36.1

mkdocs.yml

Lines changed: 3 additions & 1 deletion
@@ -45,6 +45,8 @@ nav:
   - Deploy:
     - GCP: getting-started/deployment.md
     - Minikube: getting-started/minikube.md
-  - Tutorials: getting-started/tutorials.md
+  - Tutorials:
+    - Tutorials: getting-started/tutorials.md
+    - LLM Model Evaluation: getting-started/llm-model-evaluation.md
   - Blog:
     - blog/index.md

resources/global/cluster.tf

Lines changed: 2 additions & 2 deletions
@@ -54,14 +54,14 @@ resource "google_container_node_pool" "dcr_node_pool" {
 
   node_config {
     service_account = google_service_account.gcp_dcr_cluster_sa.email
-    preemptible     = true
+    preemptible     = false
     machine_type    = var.type
   }
 
   depends_on = [
     google_service_account.gcp_dcr_cluster_sa,
   ]
   autoscaling {
-    max_node_count = 8
+    max_node_count = 3
   }
 }
