
Commit 07d4c7d

Add LLM model evaluation demo (#60)
1. Add a doc about the LLM model evaluation demo. 2. Add `libblas-dev` and `liblapack-dev` to the single-user image to run LLM evaluation.
1 parent 7a56281 commit 07d4c7d

File tree

11 files changed: +267 -10 lines changed


MODULE.bazel

Lines changed: 16 additions & 3 deletions
@@ -6,7 +6,7 @@
 ###############################################################################
 
 # rules_proto
-bazel_dep(name = "rules_proto", version = "7.0.2")
+bazel_dep(name = "rules_proto", version = "7.1.0")
 
 # rules_python
 bazel_dep(name = "rules_python", version = "0.40.0")
@@ -68,12 +68,12 @@ use_repo(
 # rules_pkg
 bazel_dep(name = "rules_pkg", version = "1.0.1")
 bazel_dep(name = "rules_go", version = "0.50.1")
-bazel_dep(name = "gazelle", version = "0.40.0")
+bazel_dep(name = "gazelle", version = "0.43.0")
 
 go_sdk = use_extension("@rules_go//go:extensions.bzl", "go_sdk")
 go_sdk.download(
     name = "go_sdk",
-    version = "1.23.0",
+    version = "1.23.8",
 )
 use_repo(go_sdk, "go_sdk")
 
@@ -118,3 +118,16 @@ use_repo(
 )
 
 bazel_dep(name = "rules_multirun", version = "0.10.0")
+bazel_dep(name = "rules_distroless", version = "0.5.1")
+
+apt = use_extension(
+    "@rules_distroless//apt:extensions.bzl",
+    "apt",
+    dev_dependency = True,
+)
+apt.install(
+    name = "noble",
+    lock = "//app/jupyterlab_manatee:noble.lock.json",
+    manifest = "//app/jupyterlab_manatee:noble.yaml",
+)
+use_repo(apt, "noble")

app/jupyterlab_manatee/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -129,6 +129,7 @@ oci_image(
     name = "image",
     base = "@scipy-notebook_linux_amd64",
     tars = [
+        "@noble//:flat",
         ":dev_wheel_tar",
         ":hooks_tar",
     ],
app/jupyterlab_manatee/noble.lock.json

Lines changed: 150 additions & 0 deletions

{
  "packages": [
    {
      "arch": "amd64",
      "dependencies": [
        {
          "key": "libblas3_3.12.0-3_amd64",
          "name": "libblas3",
          "version": "3.12.0-3"
        },
        {
          "key": "libgcc-s1_14-20240221-2.1ubuntu1_amd64",
          "name": "libgcc-s1",
          "version": "14-20240221-2.1ubuntu1"
        },
        {
          "key": "libc6_2.39-0ubuntu2_amd64",
          "name": "libc6",
          "version": "2.39-0ubuntu2"
        },
        {
          "key": "gcc-14-base_14-20240221-2.1ubuntu1_amd64",
          "name": "gcc-14-base",
          "version": "14-20240221-2.1ubuntu1"
        }
      ],
      "key": "libblas-dev_3.12.0-3_amd64",
      "name": "libblas-dev",
      "sha256": "12c492a0d1ee2c2e765b394c51de330b3c9f1a8579cde81e0d14b35e8df75a15",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/l/lapack/libblas-dev_3.12.0-3_amd64.deb"
      ],
      "version": "3.12.0-3"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "libblas3_3.12.0-3_amd64",
      "name": "libblas3",
      "sha256": "2db61728fe84dad1c86a55e92959333136a1cdc3286d437e5f3456777147a75c",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/l/lapack/libblas3_3.12.0-3_amd64.deb"
      ],
      "version": "3.12.0-3"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "libgcc-s1_14-20240221-2.1ubuntu1_amd64",
      "name": "libgcc-s1",
      "sha256": "ffc195df7e897aaec468e8f62b08660cc711c7449113102491fdd6baa6901f6d",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/g/gcc-14/libgcc-s1_14-20240221-2.1ubuntu1_amd64.deb"
      ],
      "version": "14-20240221-2.1ubuntu1"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "libc6_2.39-0ubuntu2_amd64",
      "name": "libc6",
      "sha256": "4bd128b75db38b7e9147c0333908e2c7fbc41631f284360f95118fe1c6c162f3",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/g/glibc/libc6_2.39-0ubuntu2_amd64.deb"
      ],
      "version": "2.39-0ubuntu2"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "gcc-14-base_14-20240221-2.1ubuntu1_amd64",
      "name": "gcc-14-base",
      "sha256": "2e1ae2c2ccf2d1b6d09c657af1492a8b7a348e899f9ad25d4925b170571a0887",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/g/gcc-14/gcc-14-base_14-20240221-2.1ubuntu1_amd64.deb"
      ],
      "version": "14-20240221-2.1ubuntu1"
    },
    {
      "arch": "amd64",
      "dependencies": [
        {
          "key": "libblas-dev_3.12.0-3_amd64",
          "name": "libblas-dev",
          "version": "3.12.0-3"
        },
        {
          "key": "libblas3_3.12.0-3_amd64",
          "name": "libblas3",
          "version": "3.12.0-3"
        },
        {
          "key": "libgcc-s1_14-20240221-2.1ubuntu1_amd64",
          "name": "libgcc-s1",
          "version": "14-20240221-2.1ubuntu1"
        },
        {
          "key": "libc6_2.39-0ubuntu2_amd64",
          "name": "libc6",
          "version": "2.39-0ubuntu2"
        },
        {
          "key": "gcc-14-base_14-20240221-2.1ubuntu1_amd64",
          "name": "gcc-14-base",
          "version": "14-20240221-2.1ubuntu1"
        },
        {
          "key": "liblapack3_3.12.0-3_amd64",
          "name": "liblapack3",
          "version": "3.12.0-3"
        },
        {
          "key": "libgfortran5_14-20240221-2.1ubuntu1_amd64",
          "name": "libgfortran5",
          "version": "14-20240221-2.1ubuntu1"
        }
      ],
      "key": "liblapack-dev_3.12.0-3_amd64",
      "name": "liblapack-dev",
      "sha256": "35010504e796d0ba2136a47fe6a49a0c21f7e02276b22efd7f87b402815e98ca",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/l/lapack/liblapack-dev_3.12.0-3_amd64.deb"
      ],
      "version": "3.12.0-3"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "liblapack3_3.12.0-3_amd64",
      "name": "liblapack3",
      "sha256": "b990a000d150d2a2ae6fc3dcdb2f03ee5e0284babb08bd1fa1e573e0b0d041a2",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/l/lapack/liblapack3_3.12.0-3_amd64.deb"
      ],
      "version": "3.12.0-3"
    },
    {
      "arch": "amd64",
      "dependencies": [],
      "key": "libgfortran5_14-20240221-2.1ubuntu1_amd64",
      "name": "libgfortran5",
      "sha256": "8c1c6e05318a027d27a808ee6d24b2fa171dc702f593ff35f3b06c341ad1762e",
      "urls": [
        "https://snapshot.ubuntu.com/ubuntu/20240301T030400Z/pool/main/g/gcc-14/libgfortran5_14-20240221-2.1ubuntu1_amd64.deb"
      ],
      "version": "14-20240221-2.1ubuntu1"
    }
  ],
  "version": 1
}

app/jupyterlab_manatee/noble.yaml

Lines changed: 25 additions & 0 deletions
# Packages installed into the single-user JupyterLab image.
#
# Anytime this file is changed, the lockfile needs to be regenerated.
#
# To regenerate noble.lock.json, run:
#
#   bazel run @noble//:lock
#
# See the apt extension in MODULE.bazel.
version: 1

sources:
  - channel: noble main
    url: https://snapshot.ubuntu.com/ubuntu/20240301T030400Z
  - channel: noble-security main
    url: https://snapshot.ubuntu.com/ubuntu/20240301T030400Z
  - channel: noble-updates main
    url: https://snapshot.ubuntu.com/ubuntu/20240301T030400Z

archs:
  - "amd64"

packages:
  - "libblas-dev"
  - "liblapack-dev"
app/jupyterlab_manatee/src/jobs.tsx

Lines changed: 1 addition & 1 deletion
@@ -154,7 +154,7 @@ const handleGetAttestation = async (record: Job) => {
   const result = await response.json();
   if (result.code === 0) {
     await showDialog({
-      title: "Get Attestation Report Successful",
+      title: "Get Attestation Report Successfully",
       body: 'OIDC Token: ' + result.token,
       buttons: [Dialog.okButton(), Dialog.cancelButton()]
     });

deployment/jupyterhub/deploy.sh

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ helm upgrade --cleanup-on-fail \
   --set singleuser.extraEnv.PROJECT_ID=${project_id} \
   --set singleuser.extraEnv.KEY_LOCALTION=${region} \
   --set singleuser.networkPolicy.enabled=false \
+  --set singleuser.storage.capacity=20Gi \
   --install $helm_name jupyterhub/jupyterhub \
   --namespace ${namespace} \
   --version=3.0.3 \
docs/getting-started/llm-model-evaluation.md

Lines changed: 65 additions & 0 deletions

# Trusted LLM Model Evaluation Example

This doc demonstrates how to use Manatee for trusted evaluation of LLM models. Manatee integrates seamlessly with lm-evaluation-harness, enabling comprehensive testing of LLM models across a wide range of evaluation tasks.

Scenario:
Suppose a model provider owns a proprietary LLM. The provider wishes to prove that the model performs as publicly claimed (e.g., in terms of fairness or accuracy). The evaluation process is divided into two stages:
- Stage 1: The script runs on a mock (fake) model to illustrate the workflow.
- Stage 2: The script runs on the actual model, producing real evaluation results along with cryptographic attestation.

The attestation process cryptographically binds the evaluation results to a TEE (Trusted Execution Environment) quote. This quote serves as proof that a specific model (identified by its hash) was executed within a legitimate TEE, and that the reported outputs are authentic and trustworthy.

## Install lm-evaluation-harness

`lm-evaluation-harness` provides a unified framework to test generative language models on a large number of different evaluation tasks.

```python
!git clone --depth 1 https://github.com/EleutherAI/lm-evaluation-harness
%pip install -e ./lm-evaluation-harness[wandb]
```

## Model Selection (Hugging Face Example)

```python
HG_MODEL = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
import lm_eval
from lm_eval.utils import setup_logging

lm_obj = lm_eval.models.huggingface.HFLM(pretrained=HG_MODEL, device="cpu")
task_manager = lm_eval.tasks.TaskManager()
lm_obj.get_model_info()
```

## Task Selection

### BBQ: A Hand-Built Bias Benchmark for Question Answering

BBQ measures bias in model output on the question answering task. The dataset consists of question sets, constructed by the authors, that highlight attested social biases against people belonging to protected classes along nine social dimensions relevant to U.S. English-speaking contexts. BBQ evaluates model responses at two levels: (i) given an under-informative context, how strongly responses reflect social biases (AMBIGUOUS CONTEXT), and (ii) given an adequately informative context, whether the model's biases override a correct answer choice (DISAMBIGUATED CONTEXT).

### CrowS-Pairs: A Challenge Dataset for Measuring Social Biases in Masked Language Models

CrowS-Pairs is a challenge set for evaluating language models (LMs) on their tendency to generate biased outputs. CrowS-Pairs comes in two languages, and the English subset has a newer version that fixes some issues with the original.

### Simple Cooccurrence Bias

This bias evaluation relies on simple templates for prompting LMs and tests for bias in next-word prediction. For instance, given a context such as "The {occupation} was a", masculine gender identifiers are found to be more likely to follow than feminine ones. Following Brown et al. (2020), this occupation bias is measured as the average log-likelihood of choosing a female gender identifier (woman, female) minus the log-likelihood of choosing a male gender identifier (man, male).

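As a concrete (made-up) illustration of that score, the sketch below takes the difference of average log-likelihoods for a single template; the numbers are invented and the snippet is not part of lm-evaluation-harness.

```python
# Hypothetical illustration of the simple cooccurrence bias score.
# The log-likelihoods below are invented, not real model outputs.
female_ll = {"woman": -2.1, "female": -3.0}
male_ll = {"man": -1.5, "male": -2.4}

avg_female = sum(female_ll.values()) / len(female_ll)
avg_male = sum(male_ll.values()) / len(male_ll)

# A negative score means male identifiers are more likely to follow the template.
bias_score = avg_female - avg_male
print(f"bias score: {bias_score:.2f}")
```
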
### Winogender: Gender Bias in Coreference Resolution

Winogender is designed to measure gender bias in coreference resolution systems, but it has also been used for evaluating language models. The dataset consists of simple sentences with an occupation, a participant, and a pronoun, where the pronoun refers to either the occupation or the participant. Each example comes in three variations in which only the gender of the pronoun changes, to test how the pronoun affects the prediction. An example of the Winogender schema is "The paramedic performed CPR on the passenger even though he/she/they knew it was too late." This implementation follows the description in the paper "Language Models are Few-Shot Learners", which uses prompts.

```python
import datasets
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

# Call simple_evaluate to run the selected bias tasks.
tee_llm_evaluation_result = lm_eval.simple_evaluate(
    model=lm_obj,
    tasks=["winogender", "simple_cooccurrence_bias", "crows_pairs_english"],
    num_fewshot=0,
    task_manager=task_manager,
    log_samples=True,
    batch_size=1024,
    confirm_run_unsafe_code=True,
)
tee_llm_evaluation_result["results"]
```

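To make the scores part of the job's output, write them to the file that the job produces. A minimal sketch, assuming the results are serialized as JSON to a hypothetical path (the actual output location depends on how the Manatee job is configured):

```python
import json

# Hypothetical output path; Manatee binds the hash of the job's output file to the TEE quote.
with open("evaluation_output.json", "w") as f:
    json.dump(tee_llm_evaluation_result["results"], f, indent=2, default=str)
```
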
## Get the Result and TEE Attestation Report

After the job finishes, download the result along with the attestation report. The `eat_nonce` in the attestation report is the hash of the output file.
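A minimal verification sketch, assuming the nonce is the hex-encoded SHA-256 digest of the downloaded output file and the report is delivered as a JWT-style token; the file name and helper below are illustrative rather than part of Manatee's API, and the token's signature still needs to be verified against its issuer:

```python
import base64
import hashlib
import json

def output_matches_nonce(token: str, output_path: str) -> bool:
    """Check that the downloaded output file matches the eat_nonce claim."""
    digest = hashlib.sha256(open(output_path, "rb").read()).hexdigest()
    payload_b64 = token.split(".")[1]
    payload_b64 += "=" * (-len(payload_b64) % 4)  # restore base64url padding
    claims = json.loads(base64.urlsafe_b64decode(payload_b64))
    return claims.get("eat_nonce") == digest

# Example: output_matches_nonce(oidc_token, "evaluation_output.json")
```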

go.mod

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 module github.com/manatee-project/manatee
 
-go 1.23.0
+go 1.23.8
 
 require (
 	cloud.google.com/go/compute v1.36.1

mkdocs.yml

Lines changed: 3 additions & 1 deletion
@@ -45,6 +45,8 @@ nav:
   - Deploy:
     - GCP: getting-started/deployment.md
     - Minikube: getting-started/minikube.md
-  - Tutorials: getting-started/tutorials.md
+  - Tutorials:
+    - Tutorials: getting-started/tutorials.md
+    - LLM Model Evaluation: getting-started/llm-model-evaluation.md
   - Blog:
     - blog/index.md

resources/global/cluster.tf

Lines changed: 2 additions & 2 deletions
@@ -54,14 +54,14 @@ resource "google_container_node_pool" "dcr_node_pool" {
 
   node_config {
     service_account = google_service_account.gcp_dcr_cluster_sa.email
-    preemptible     = true
+    preemptible     = false
     machine_type    = var.type
   }
 
   depends_on = [
     google_service_account.gcp_dcr_cluster_sa,
   ]
   autoscaling {
-    max_node_count = 8
+    max_node_count = 3
   }
 }
