diff --git a/.github/gh.yml b/.github/gh.yml
new file mode 100644
index 0000000000..2eed0a79a7
--- /dev/null
+++ b/.github/gh.yml
@@ -0,0 +1,3 @@
+# GitHub CLI configuration
+# Ensures all gh commands default to evalops/opencode instead of upstream
+repo: evalops/opencode
diff --git a/.github/workflows/opencode.yml b/.github/workflows/opencode.yml
index 41ee754086..822874c98d 100644
--- a/.github/workflows/opencode.yml
+++ b/.github/workflows/opencode.yml
@@ -20,7 +20,7 @@ jobs:
uses: actions/checkout@v4
- name: Run opencode
- uses: sst/opencode/github@latest
+ uses: evalops/opencode/github@latest
env:
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
with:
diff --git a/README.md b/README.md
index 6e91d3ccbc..00bc71c9ad 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,10 @@
-
-
-
-
-
-
-
-
-
-The AI coding agent built for the terminal.
-
-
-
-
-
-
-[](https://opencode.ai)
+# Grimoire
+
+> A fork of [OpenCode](https://github.com/sst/opencode) by [EvalOps](https://evalops.dev)
+
+This is a public fork maintained by EvalOps for internal use. We use OpenCode extensively and maintain this fork to experiment with enhancements aligned with our LLM evaluation workflows. OpenCode is open source and permissively licensed (MIT).
+
+This fork tracks the `dev` branch of [evalops/opencode](https://github.com/evalops/opencode). For official releases and documentation, see the [upstream repository](https://github.com/sst/opencode).
---
@@ -52,6 +42,31 @@ XDG_BIN_DIR=$HOME/.local/bin curl -fsSL https://opencode.ai/install | bash
For more info on how to configure OpenCode [**head over to our docs**](https://opencode.ai/docs).
+### Usage Stats
+
+You can inspect local usage history and tool telemetry with the built-in stats command:
+
+```bash
+opencode stats # pretty summary
+opencode stats --json # machine-readable output
+opencode stats --telemetry all # include recent tool runs
+opencode stats --limit 50 # show more history
+opencode stats --clear # reset stored telemetry data
+opencode stats --details # show telemetry metadata fields
+opencode stats --details-format ndjson --fields status,final_url
+opencode stats --status error --since 1d
+opencode stats --compare baseline.json --warn-latency 2000
+```
+
+Advanced telemetry usage tips:
+
+- Capture a baseline for comparison with `opencode stats --json --telemetry all --limit 500 > baseline.json`, then diff with `--compare baseline.json`.
+- Export metadata for dashboards using `--details-format csv` or `--details-format ndjson`.
+- Focus on specific signals by pairing `--status`, `--since`, `--until`, and `--fields` filters.
+- Gate builds by combining `--warn-latency` or `--warn-errors` with CI scripts.
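+
+For example, a CI step could consume the JSON output and fail the build when failing tool runs are present. This is only a sketch: the exact JSON shape emitted by `--json` is assumed here (a top-level `telemetry` array filtered by `--status error`), so adjust the `jq` filter to the real schema.
+
+```bash
+# Sketch: fail CI when recent telemetry contains tool errors.
+errors=$(opencode stats --json --telemetry all --status error | jq '[.telemetry[]?] | length')
+if [ "${errors:-0}" -gt 0 ]; then
+  echo "opencode telemetry reports ${errors} failing tool runs" >&2
+  exit 1
+fi
+```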
+
+The telemetry section lists recent tool executions (duration, status, error message) gathered from persisted `tool.telemetry` events.
+
### Contributing
OpenCode is an opinionated tool so any fundamental feature needs to go through a
diff --git a/bun.lock b/bun.lock
index f279671ca0..9db5ec29ff 100644
--- a/bun.lock
+++ b/bun.lock
@@ -153,6 +153,7 @@
"@hono/standard-validator": "0.1.5",
"@hono/zod-validator": "catalog:",
"@modelcontextprotocol/sdk": "1.15.1",
+ "@octokit/rest": "22.0.0",
"@openauthjs/openauth": "0.4.3",
"@opencode-ai/plugin": "workspace:*",
"@opencode-ai/sdk": "workspace:*",
@@ -162,12 +163,14 @@
"chokidar": "4.0.3",
"decimal.js": "10.5.0",
"diff": "8.0.2",
+ "exa-js": "1.9.3",
"fuzzysort": "3.1.0",
"gray-matter": "4.0.3",
"hono": "catalog:",
"hono-openapi": "1.0.7",
"ignore": "7.0.5",
"jsonc-parser": "3.3.1",
+ "linkedom": "0.18.12",
"minimatch": "10.0.3",
"open": "10.1.2",
"remeda": "catalog:",
@@ -1480,7 +1483,7 @@
"croner": ["croner@9.1.0", "", {}, "sha512-p9nwwR4qyT5W996vBZhdvBCnMhicY5ytZkR4D1Xj0wuTDEiMnjwR57Q3RXYY/s0EpX6Ay3vgIcfaR+ewGHsi+g=="],
- "cross-fetch": ["cross-fetch@3.2.0", "", { "dependencies": { "node-fetch": "^2.7.0" } }, "sha512-Q+xVJLoGOeIMXZmbUK4HYk+69cQH6LudR0Vu/pRm2YlU/hDV9CiS0gKUMaWY5f2NeUH9C1nV3bsTlCo0FsTV1Q=="],
+ "cross-fetch": ["cross-fetch@4.1.0", "", { "dependencies": { "node-fetch": "^2.7.0" } }, "sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw=="],
"cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="],
@@ -1496,6 +1499,8 @@
"cssesc": ["cssesc@3.0.0", "", { "bin": { "cssesc": "bin/cssesc" } }, "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg=="],
+ "cssom": ["cssom@0.5.0", "", {}, "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw=="],
+
"csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="],
"dax-sh": ["dax-sh@0.43.2", "", { "dependencies": { "@deno/shim-deno": "~0.19.0", "undici-types": "^5.26" } }, "sha512-uULa1sSIHgXKGCqJ/pA0zsnzbHlVnuq7g8O2fkHokWFNwEGIhh5lAJlxZa1POG5En5ba7AU4KcBAvGQWMMf8rg=="],
@@ -1568,7 +1573,7 @@
"dot-prop": ["dot-prop@9.0.0", "", { "dependencies": { "type-fest": "^4.18.2" } }, "sha512-1gxPBJpI/pcjQhKgIU91II6Wkay+dLcN3M6rf2uwP8hRur3HtQXjVrdAK3sjC0piaEuxzMwjXChcETiJl47lAQ=="],
- "dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="],
+ "dotenv": ["dotenv@16.4.7", "", {}, "sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ=="],
"drizzle-kit": ["drizzle-kit@0.30.5", "", { "dependencies": { "@drizzle-team/brocli": "^0.10.2", "@esbuild-kit/esm-loader": "^2.5.5", "esbuild": "^0.19.7", "esbuild-register": "^3.5.0", "gel": "^2.0.0" }, "bin": { "drizzle-kit": "bin.cjs" } }, "sha512-l6dMSE100u7sDaTbLczibrQZjA35jLsHNqIV+jmhNVO3O8jzM6kywMOmV9uOz9ZVSCMPQhAZEFjL/qDPVrqpUA=="],
@@ -1660,6 +1665,8 @@
"eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="],
+ "exa-js": ["exa-js@1.9.3", "", { "dependencies": { "cross-fetch": "~4.1.0", "dotenv": "~16.4.7", "openai": "^5.0.1", "zod": "^3.22.0", "zod-to-json-schema": "^3.20.0" } }, "sha512-4u8vO5KHstifBz6fcwcBVvU62zfwsWFpD8qomU2zQ+lLRYCwOh2Rz04xSSqEeoHrkCypGjy2VHez7elBt6ibQQ=="],
+
"execa": ["execa@8.0.1", "", { "dependencies": { "cross-spawn": "^7.0.3", "get-stream": "^8.0.1", "human-signals": "^5.0.0", "is-stream": "^3.0.0", "merge-stream": "^2.0.0", "npm-run-path": "^5.1.0", "onetime": "^6.0.0", "signal-exit": "^4.1.0", "strip-final-newline": "^3.0.0" } }, "sha512-VyhnebXciFV2DESc+p6B+y0LjSm0krU4OgJN44qFAhBY0TJ+1V61tYD2+wHusZ6F9n5K+vl8k0sTy7PEfV4qpg=="],
"exit-hook": ["exit-hook@2.2.1", "", {}, "sha512-eNTPlAD67BmP31LDINZ3U7HSF8l57TxOY2PmBJ1shpCvpnxBF93mWCE8YHBnXs8qiUZJc9WDcWIeC3a2HIAMfw=="],
@@ -1860,7 +1867,7 @@
"html-whitespace-sensitive-tag-names": ["html-whitespace-sensitive-tag-names@3.0.1", "", {}, "sha512-q+310vW8zmymYHALr1da4HyXUQ0zgiIwIicEfotYPWGN0OJVEN/58IJ3A4GBYcEq3LGAZqKb+ugvP0GNB9CEAA=="],
- "htmlparser2": ["htmlparser2@8.0.2", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1", "entities": "^4.4.0" } }, "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA=="],
+ "htmlparser2": ["htmlparser2@10.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.2.1", "entities": "^6.0.0" } }, "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g=="],
"http-cache-semantics": ["http-cache-semantics@4.2.0", "", {}, "sha512-dTxcvPXqPvXBQpq5dUr6mEMJX4oIEFv6bwom3FDwKRDsuIjjJGANqhBuoAn9c1RQJIdAKav33ED65E2ys+87QQ=="],
@@ -2038,6 +2045,8 @@
"lines-and-columns": ["lines-and-columns@1.2.4", "", {}, "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg=="],
+ "linkedom": ["linkedom@0.18.12", "", { "dependencies": { "css-select": "^5.1.0", "cssom": "^0.5.0", "html-escaper": "^3.0.3", "htmlparser2": "^10.0.0", "uhyphen": "^0.2.0" }, "peerDependencies": { "canvas": ">= 2" }, "optionalPeers": ["canvas"] }, "sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q=="],
+
"listhen": ["listhen@1.9.0", "", { "dependencies": { "@parcel/watcher": "^2.4.1", "@parcel/watcher-wasm": "^2.4.1", "citty": "^0.1.6", "clipboardy": "^4.0.0", "consola": "^3.2.3", "crossws": ">=0.2.0 <0.4.0", "defu": "^6.1.4", "get-port-please": "^3.1.2", "h3": "^1.12.0", "http-shutdown": "^1.2.2", "jiti": "^2.1.2", "mlly": "^1.7.1", "node-forge": "^1.3.1", "pathe": "^1.1.2", "std-env": "^3.7.0", "ufo": "^1.5.4", "untun": "^0.1.3", "uqr": "^0.1.2" }, "bin": { "listen": "bin/listhen.mjs", "listhen": "bin/listhen.mjs" } }, "sha512-I8oW2+QL5KJo8zXNWX046M134WchxsXC7SawLPvRQpogCbkyQIaFxPE89A2HiwR7vAK2Dm2ERBAmyjTYGYEpBg=="],
"local-pkg": ["local-pkg@1.1.2", "", { "dependencies": { "mlly": "^1.7.4", "pkg-types": "^2.3.0", "quansync": "^0.2.11" } }, "sha512-arhlxbFRmoQHl33a0Zkle/YWlmNwoyt6QNZEIJcqNbdrsix5Lvc4HyyI3EnwxTYlZYc32EbYrQ8SzEZ7dqgg9A=="],
@@ -2824,6 +2833,8 @@
"uglify-js": ["uglify-js@3.19.3", "", { "bin": { "uglifyjs": "bin/uglifyjs" } }, "sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ=="],
+ "uhyphen": ["uhyphen@0.2.0", "", {}, "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA=="],
+
"ulid": ["ulid@3.0.0", "", { "bin": { "ulid": "dist/cli.js" } }, "sha512-yvZYdXInnJve6LdlPIuYmURdS2NP41ZoF4QW7SXwbUKYt53+0eDAySO+rGSvM2O/ciuB/G+8N7GQrZ1mCJpuqw=="],
"ultrahtml": ["ultrahtml@1.6.0", "", {}, "sha512-R9fBn90VTJrqqLDwyMph+HGne8eqY1iPfYhPzZrvKpIfwkWZbcYlfpsb8B9dTvBfpy1/hqAD7Wi8EKfP9e8zdw=="],
@@ -3036,6 +3047,8 @@
"@babel/helper-create-class-features-plugin/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="],
+ "@capsizecss/unpack/cross-fetch": ["cross-fetch@3.2.0", "", { "dependencies": { "node-fetch": "^2.7.0" } }, "sha512-Q+xVJLoGOeIMXZmbUK4HYk+69cQH6LudR0Vu/pRm2YlU/hDV9CiS0gKUMaWY5f2NeUH9C1nV3bsTlCo0FsTV1Q=="],
+
"@cloudflare/kv-asset-handler/mime": ["mime@3.0.0", "", { "bin": { "mime": "cli.js" } }, "sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A=="],
"@cloudflare/unenv-preset/unenv": ["unenv@2.0.0-rc.21", "", { "dependencies": { "defu": "^6.1.4", "exsolve": "^1.0.7", "ohash": "^2.0.11", "pathe": "^2.0.3", "ufo": "^1.6.1" } }, "sha512-Wj7/AMtE9MRnAXa6Su3Lk0LNCfqDYgfwVjwRFVum9U7wsto1imuHqk4kTm7Jni+5A0Hn7dttL6O/zjvUvoo+8A=="],
@@ -3182,6 +3195,10 @@
"body-parser/iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="],
+ "c12/dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="],
+
+ "cheerio/htmlparser2": ["htmlparser2@8.0.2", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1", "entities": "^4.4.0" } }, "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA=="],
+
"compress-commons/is-stream": ["is-stream@2.0.1", "", {}, "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg=="],
"condense-newlines/kind-of": ["kind-of@3.2.2", "", { "dependencies": { "is-buffer": "^1.1.5" } }, "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ=="],
@@ -3204,6 +3221,8 @@
"estree-util-to-js/source-map": ["source-map@0.7.6", "", {}, "sha512-i5uvt8C3ikiWeNZSVZNWcfZPItFQOsYTUAOkcUPGd8DqDy1uOUikjt5dG+uRlwyvR108Fb9DOd4GvXfT0N2/uQ=="],
+ "exa-js/zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],
+
"express/cookie": ["cookie@0.7.2", "", {}, "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w=="],
"express/send": ["send@1.2.0", "", { "dependencies": { "debug": "^4.3.5", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", "http-errors": "^2.0.0", "mime-types": "^3.0.1", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.1" } }, "sha512-uaW0WwXKpL9blXE2o0bRhoL2EGXIrZxQ2ZQ4mgcfoBxdFmQold+qWsD2jLrfZ0trjKL6vOw0j//eAwcALFjKSw=="],
@@ -3228,6 +3247,10 @@
"html-minifier-terser/commander": ["commander@10.0.1", "", {}, "sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug=="],
+ "html-to-text/htmlparser2": ["htmlparser2@8.0.2", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1", "entities": "^4.4.0" } }, "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA=="],
+
+ "htmlparser2/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="],
+
"http-errors/statuses": ["statuses@2.0.1", "", {}, "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ=="],
"js-beautify/glob": ["glob@10.4.5", "", { "dependencies": { "foreground-child": "^3.1.0", "jackspeak": "^3.1.2", "minimatch": "^9.0.4", "minipass": "^7.1.2", "package-json-from-dist": "^1.0.0", "path-scurry": "^1.11.1" }, "bin": { "glob": "dist/esm/bin.mjs" } }, "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg=="],
diff --git a/docs/evaluation-implementation-plan.md b/docs/evaluation-implementation-plan.md
new file mode 100644
index 0000000000..c7e97bcf16
--- /dev/null
+++ b/docs/evaluation-implementation-plan.md
@@ -0,0 +1,226 @@
+# Evaluation Framework Implementation Plan
+
+## Work Stream 1: Trace Foundation (Core Data Layer)
+**Goal**: Materialize sessions into complete traces with evaluation context
+
+### Steps:
+1. ✅ Create trace namespace and types
+2. ✅ Implement trace materialization from session
+3. ✅ Add trace storage layer
+4. ✅ Create trace list/get APIs
+5. ✅ Add trace completion event
+
+**Parallel with**: Stream 2 (Metric definitions are independent)
+
+---
+
+## Work Stream 2: Metric Registry (Evaluation Criteria)
+**Goal**: Define what we evaluate and how
+
+### Steps:
+1. ✅ Create metric schema and types
+2. ✅ Implement metric registry (CRUD)
+3. ✅ Build 5-7 built-in metrics (heuristics)
+4. ✅ Create rule-based evaluator
+5. ✅ Add metric storage
+6. ✅ Create metric versioning system
+
+**Parallel with**: Stream 1 (doesn't need traces to define metrics)
+
+---
+
+## Work Stream 3: Evaluation Engine (The Executor)
+**Goal**: Run metrics against traces and store results
+
+### Steps:
+1. ✅ Create evaluation result schema
+2. ✅ Implement heuristic evaluator
+3. ✅ Implement rule evaluator
+4. ✅ Build evaluation engine orchestrator
+5. ✅ Add evaluation storage
+6. ✅ Create evaluation query API
+7. ✅ Emit evaluation events
+
+**Depends on**: Streams 1 & 2 complete
+
+---
+
+## Work Stream 4: Dataset Management (Test Cases)
+**Goal**: Store and manage test case collections
+
+### Steps:
+1. ✅ Create dataset schema
+2. ✅ Implement dataset CRUD
+3. ✅ Create test case schema with assertions
+4. ✅ Build dataset storage layer
+5. ✅ Add dataset CLI commands
+6. ✅ Create dataset import/export
+
+**Parallel with**: Stream 3 (independent data model)
+
+---
+
+## Work Stream 5: Test Runner (Execute & Evaluate)
+**Goal**: Run datasets and evaluate results
+
+### Steps:
+1. ✅ Create test execution engine
+2. ✅ Implement assertion framework
+3. ✅ Build test result aggregation
+4. ✅ Add parallel execution support
+5. ✅ Create CLI: `opencode test run`
+6. ✅ Add result output formats (JSON, pretty)
+7. ✅ Implement fail-on-error mode
+
+**Depends on**: Streams 3 & 4 complete
+
+---
+
+## Work Stream 6: Scorecards (Quality Gates)
+**Goal**: Bundle metrics into pass/fail contracts
+
+### Steps:
+1. ✅ Create scorecard schema
+2. ✅ Implement scorecard evaluator
+3. ✅ Build 2-3 built-in scorecards
+4. ✅ Add scorecard storage
+5. ✅ Create scorecard CLI
+6. ✅ Integrate with test runner
+
+**Depends on**: Stream 3 complete
+**Parallel with**: Stream 5 (can build while test runner develops)
+
+---
+
+## Work Stream 7: CLI Integration (Developer UX)
+**Goal**: Make everything accessible via command line
+
+### Steps:
+1. ✅ Create `opencode eval` command group
+2. ✅ Add `opencode eval trace <session-id>`
+3. ✅ Add `opencode eval run <dataset>`
+4. ✅ Create `opencode dataset` command group
+5. ✅ Create `opencode test` command group
+6. ✅ Add pretty formatting for all outputs
+7. ✅ Create help documentation
+
+**Parallel with**: All streams (add CLI as features complete)
+
+---
+
+## Work Stream 8: CI/CD Integration (Automation)
+**Goal**: Enable automated quality gates
+
+### Steps:
+1. ✅ Create GitHub Action workflow example
+2. ✅ Add PR comment formatting
+3. ✅ Implement baseline comparison
+4. ✅ Add regression detection
+5. ✅ Create CI-friendly output formats
+6. ✅ Document setup guide
+
+**Depends on**: Streams 5 & 6 complete
+
+---
+
+## Parallelization Strategy
+
+### Phase 1 (Parallel - Start Together)
+- **Stream 1** (Trace) - One dev
+- **Stream 2** (Metrics) - One dev
+- **Stream 4** (Datasets) - One dev
+
+### Phase 2 (Requires Phase 1)
+- **Stream 3** (Engine) - Needs Streams 1+2
+- **Stream 6** (Scorecards) - Needs Stream 2
+- Continue **Stream 7** (CLI) - Add commands as features complete
+
+### Phase 3 (Integration)
+- **Stream 5** (Test Runner) - Needs Streams 3+4
+- **Stream 8** (CI/CD) - Needs Streams 5+6
+
+---
+
+## Implementation Order (Solo Developer)
+
+1. **Trace Foundation** (2-3 hours)
+2. **Metric Registry** (2-3 hours)
+3. **Evaluation Engine** (3-4 hours)
+4. **Dataset Management** (2-3 hours)
+5. **Test Runner** (3-4 hours)
+6. **Scorecards** (2 hours)
+7. **CLI Integration** (ongoing, 1-2 hours)
+8. **CI/CD Examples** (1-2 hours)
+
+**Total**: 16-24 hours of implementation
+
+---
+
+## Success Criteria
+
+### Stream 1 (Trace)
+- [ ] Can materialize any session into a trace
+- [ ] Traces stored with full context
+- [ ] Can query traces by filters
+
+### Stream 2 (Metrics)
+- [ ] 5+ built-in metrics defined
+- [ ] Can register custom metrics
+- [ ] Metrics are versioned
+
+### Stream 3 (Engine)
+- [ ] Can evaluate trace against metric
+- [ ] Results stored persistently
+- [ ] Can query evaluation history
+
+### Stream 4 (Datasets)
+- [ ] Can create/read/update/delete datasets
+- [ ] Can add test cases
+- [ ] Can import/export JSON
+
+### Stream 5 (Runner)
+- [ ] Can run full dataset
+- [ ] Assertions work correctly
+- [ ] Results show pass/fail clearly
+
+### Stream 6 (Scorecards)
+- [ ] Can define quality contracts
+- [ ] Can evaluate trace against scorecard
+- [ ] Built-in scorecards available
+
+### Stream 7 (CLI)
+- [ ] All features accessible via CLI
+- [ ] Help text comprehensive
+- [ ] Output is readable
+
+### Stream 8 (CI/CD)
+- [ ] Example workflow works
+- [ ] Can block PRs on failure
+- [ ] Results post to PR
+
+---
+
+## Commit Strategy
+
+**Small, Atomic Commits:**
+- After each step within a stream
+- Push after completing each stream
+- Tag major milestones
+
+**Commit Message Format:**
+```
+<scope>: <description>
+
+- Detail 1
+- Detail 2
+```
+
+Example:
+```
+trace: implement trace materialization
+
+- Add Trace.Complete type
+- Implement materialize() function
+- Add storage layer for traces
+- Emit trace.completed events
+```
diff --git a/docs/evaluation-implementation.md b/docs/evaluation-implementation.md
new file mode 100644
index 0000000000..aeedefbc47
--- /dev/null
+++ b/docs/evaluation-implementation.md
@@ -0,0 +1,887 @@
+# Evaluation Implementation Strategy
+
+## Phase 1: Foundation (Week 1-2)
+
+### 1.1 Trace Materialization
+
+**Goal**: Unify Session + TelemetryEvents into a complete Trace abstraction
+
+**Changes**:
+```typescript
+// packages/opencode/src/trace/index.ts
+export namespace Trace {
+ // Extends Session with evaluation context
+ export type Complete = {
+ // Session data
+ session: Session.Info
+ messages: MessageV2.Message[]
+
+ // Execution context (NEW)
+ agentName: string
+ modelConfig: {
+ provider: string
+ model: string
+ temperature?: number
+ maxTokens?: number
+ }
+ systemPrompt: string
+ systemPromptVersion?: string
+
+ // Tool events (already captured)
+ toolCalls: TelemetryEvent[]
+
+ // Aggregated metrics
+ summary: {
+ duration: number
+ toolCallCount: number
+ errorCount: number
+ tokens: TokenUsage
+ cost: number
+ }
+
+ // Evaluation results (empty initially)
+ evaluations: Evaluation[]
+ }
+
+ // Create a trace from a session
+  export async function materialize(sessionID: string): Promise<Complete>
+
+  // List traces with filters
+  export async function list(filter?: TraceFilter): AsyncIterableIterator<Complete>
+
+  // Get a specific trace
+  export async function get(traceID: string): Promise<Complete>
+}
+```
+
+**Implementation**:
+```typescript
+export async function materialize(sessionID: string): Promise<Trace.Complete> {
+ const session = await Session.get(sessionID)
+ const messages = await Session.messages(sessionID)
+
+ // Get telemetry events for this session
+ const history = await ToolHistory.read()
+ const toolCalls = history.events.filter(e => e.sessionID === sessionID)
+
+ // Extract model config from first assistant message
+ const firstAssistant = messages.find(m => m.info.role === "assistant")
+ const modelConfig = firstAssistant ? {
+ provider: firstAssistant.info.providerID,
+ model: firstAssistant.info.modelID,
+ // Extract other params from metadata
+ } : { provider: "unknown", model: "unknown" }
+
+ // Load system prompt (from session init)
+ const systemPrompt = await getSystemPromptForSession(sessionID)
+
+ return {
+ session,
+ messages,
+ agentName: session.agent ?? "default",
+ modelConfig,
+ systemPrompt,
+ toolCalls,
+ summary: computeSummary(messages, toolCalls),
+ evaluations: []
+ }
+}
+```
+
+**Storage**: Store materialized traces
+```typescript
+["trace", projectID, sessionID] -> Trace.Complete
+```
+
+**Event**: Emit trace completion
+```typescript
+Bus.publish(Trace.Event.Completed, { trace })
+```
+
+---
+
+### 1.2 Metric Registry
+
+**Goal**: Define evaluation metrics as declarative config
+
+**Schema**:
+```typescript
+// packages/opencode/src/evaluation/metric.ts
+export namespace Metric {
+ export type Definition = {
+ id: string
+ name: string
+ description: string
+ version: string
+
+ category: "performance" | "correctness" | "safety" | "cost"
+
+ evaluator: RuleEvaluator | LLMEvaluator | HeuristicEvaluator
+
+ threshold?: {
+ pass: number
+ warn?: number
+ }
+
+ higherIsBetter: boolean
+ }
+
+ type RuleEvaluator = {
+ type: "rule"
+ expression: string // JavaScript expression
+ }
+
+ type LLMEvaluator = {
+ type: "llm"
+ prompt: string
+ model: string
+ parseScore: (output: string) => number
+ }
+
+ type HeuristicEvaluator = {
+ type: "heuristic"
+ function: keyof typeof Heuristics
+    params?: Record<string, unknown>
+ }
+}
+```
+
+**Built-in Metrics** (start with simple ones):
+```typescript
+// packages/opencode/src/evaluation/metrics/builtin.ts
+export const BuiltinMetrics: Record<string, Metric.Definition> = {
+ "tool-error-rate": {
+ id: "tool-error-rate",
+ name: "Tool Error Rate",
+ description: "Percentage of tool calls that failed",
+ version: "1.0.0",
+ category: "performance",
+ evaluator: {
+ type: "heuristic",
+ function: "toolErrorRate"
+ },
+ threshold: {
+ pass: 0.1, // <10% errors
+ warn: 0.05
+ },
+ higherIsBetter: false
+ },
+
+ "response-latency": {
+ id: "response-latency",
+ name: "Response Latency",
+ description: "Total time to complete request",
+ version: "1.0.0",
+ category: "performance",
+ evaluator: {
+ type: "rule",
+ expression: "trace.summary.duration"
+ },
+ threshold: {
+ pass: 30000, // <30s
+ warn: 10000 // <10s is good
+ },
+ higherIsBetter: false
+ },
+
+ "redundant-calls": {
+ id: "redundant-calls",
+ name: "Redundant Tool Calls",
+ description: "Detects repeated identical tool calls",
+ version: "1.0.0",
+ category: "correctness",
+ evaluator: {
+ type: "heuristic",
+ function: "detectRedundantCalls"
+ },
+ threshold: { pass: 0 },
+ higherIsBetter: false
+ }
+}
+```
+
+**Heuristic Implementations**:
+```typescript
+// packages/opencode/src/evaluation/heuristics.ts
+export const Heuristics = {
+ toolErrorRate(trace: Trace.Complete): number {
+ if (trace.toolCalls.length === 0) return 0
+ const errors = trace.toolCalls.filter(t => t.status === "error").length
+ return errors / trace.toolCalls.length
+ },
+
+ detectRedundantCalls(trace: Trace.Complete): number {
+ const seen = new Map()
+ for (const call of trace.toolCalls) {
+ const key = `${call.id}:${JSON.stringify(call.extra)}`
+ seen.set(key, (seen.get(key) || 0) + 1)
+ }
+ return Array.from(seen.values()).filter(count => count > 1).length
+ },
+
+ // More heuristics...
+}
+```
+
+---
+
+### 1.3 Evaluation Engine
+
+**Goal**: Execute metrics against traces and store results
+
+```typescript
+// packages/opencode/src/evaluation/engine.ts
+export namespace EvaluationEngine {
+ export type Result = {
+ id: string
+ traceID: string
+ metricID: string
+ score: number
+ passed: boolean
+ timestamp: number
+
+ evaluatorType: "rule" | "llm" | "heuristic"
+ reasoning?: string
+    metadata?: Record<string, unknown>
+ }
+
+ // Evaluate a trace against a metric
+ export async function evaluate(
+ trace: Trace.Complete,
+ metric: Metric.Definition
+  ): Promise<Result> {
+ const score = await computeScore(trace, metric)
+ const threshold = metric.threshold?.pass ?? 0
+
+ const passed = metric.higherIsBetter
+ ? score >= threshold
+ : score <= threshold
+
+ return {
+ id: Identifier.ascending("evaluation"),
+ traceID: trace.session.id,
+ metricID: metric.id,
+ score,
+ passed,
+ timestamp: Date.now(),
+ evaluatorType: metric.evaluator.type
+ }
+ }
+
+ // Evaluate against multiple metrics
+ export async function evaluateMany(
+ trace: Trace.Complete,
+ metrics: Metric.Definition[]
+  ): Promise<Result[]> {
+ return Promise.all(metrics.map(m => evaluate(trace, m)))
+ }
+
+ async function computeScore(
+ trace: Trace.Complete,
+ metric: Metric.Definition
+  ): Promise<number> {
+ switch (metric.evaluator.type) {
+ case "rule":
+ return evaluateRule(trace, metric.evaluator.expression)
+ case "heuristic":
+ return evaluateHeuristic(trace, metric.evaluator)
+ case "llm":
+ return evaluateLLM(trace, metric.evaluator)
+ }
+ }
+
+ function evaluateRule(trace: Trace.Complete, expression: string): number {
+    // Evaluate the expression with only `trace` in scope (note: `new Function` is not a true sandbox)
+ const func = new Function("trace", `return ${expression}`)
+ return func(trace)
+ }
+
+ function evaluateHeuristic(
+ trace: Trace.Complete,
+    evaluator: Extract<Metric.Definition["evaluator"], { type: "heuristic" }>
+ ): number {
+ const heuristic = Heuristics[evaluator.function]
+ if (!heuristic) throw new Error(`Unknown heuristic: ${evaluator.function}`)
+ return heuristic(trace, evaluator.params)
+ }
+
+ async function evaluateLLM(
+ trace: Trace.Complete,
+    evaluator: Extract<Metric.Definition["evaluator"], { type: "llm" }>
+  ): Promise<number> {
+ // Call LLM with prompt + trace context
+ const response = await callLLM(evaluator.model, {
+ prompt: evaluator.prompt,
+ context: formatTraceForLLM(trace)
+ })
+ return evaluator.parseScore(response)
+ }
+}
+```
+
+**Storage**:
+```typescript
+["evaluation", traceID, evaluationID] -> EvaluationEngine.Result
+```
+
+---
+
+## Phase 2: Datasets & Testing (Week 3-4)
+
+### 2.1 Dataset Management
+
+```typescript
+// packages/opencode/src/evaluation/dataset.ts
+export namespace Dataset {
+ export type Definition = {
+ id: string
+ name: string
+ description: string
+ version: string
+
+ cases: TestCase[]
+
+ tags: string[]
+ createdAt: number
+ updatedAt: number
+ }
+
+ export type TestCase = {
+ id: string
+ name: string
+
+ // Input
+ prompt: string
+ context?: {
+ files?: Array<{ path: string; content: string }>
+ workingDirectory?: string
+      env?: Record<string, string>
+ }
+
+ // Expectations (optional, for assertions)
+ expected?: {
+ toolCalls?: string[] // Expected tool IDs
+ outputContains?: string[] // Substrings that should appear
+ outputNotContains?: string[]
+ assertions?: Assertion[]
+ }
+
+ tags: string[]
+    metadata?: Record<string, unknown>
+ }
+
+ export type Assertion = {
+ type: "tool-called" | "tool-not-called" | "output-matches" | "custom"
+    params: Record<string, unknown>
+ message: string
+ }
+
+ // CRUD operations
+  export async function create(def: Omit<Definition, "id" | "createdAt" | "updatedAt">): Promise<Definition>
+  export async function get(id: string): Promise<Definition>
+  export async function update(id: string, changes: Partial<Definition>): Promise<Definition>
+  export async function list(): AsyncIterableIterator<Definition>
+  export async function remove(id: string): Promise<void>
+
+  // Case management
+  export async function addCase(datasetID: string, testCase: Omit<TestCase, "id">): Promise<TestCase>
+  export async function removeCase(datasetID: string, caseID: string): Promise<void>
+}
+```
+
+**Storage**:
+```typescript
+["dataset", datasetID] -> Dataset.Definition
+```
+
+**CLI**:
+```bash
+# Create dataset from scratch
+opencode dataset create smoke-tests --description "Critical path tests"
+
+# Add test case
+opencode dataset add smoke-tests --prompt "Create a file called test.txt with 'hello world'"
+
+# Capture current interaction as test case
+opencode dataset capture --name "auth flow" --dataset auth-tests
+
+# List datasets
+opencode dataset list
+
+# Export/Import
+opencode dataset export smoke-tests > smoke-tests.json
+opencode dataset import < smoke-tests.json
+```
+
+---
+
+### 2.2 Test Runner
+
+```typescript
+// packages/opencode/src/evaluation/runner.ts
+export namespace TestRunner {
+ export type RunConfig = {
+ datasetID: string
+ metrics: string[] // Metric IDs to evaluate
+
+ // Agent config (what to test)
+ agentName?: string
+ modelOverride?: string
+ systemPromptOverride?: string
+
+ // Execution options
+ parallel?: number // How many tests to run in parallel
+ timeout?: number
+ stopOnFailure?: boolean
+ }
+
+ export type RunResult = {
+ id: string
+ datasetID: string
+ config: RunConfig
+
+ startTime: number
+ endTime: number
+
+ results: CaseResult[]
+
+ summary: {
+ total: number
+ passed: number
+ failed: number
+ duration: number
+ }
+ }
+
+ export type CaseResult = {
+ caseID: string
+ traceID: string
+
+ status: "passed" | "failed" | "error"
+
+ evaluations: EvaluationEngine.Result[]
+ assertionResults: AssertionResult[]
+
+ duration: number
+ error?: string
+ }
+
+  export async function run(config: RunConfig): Promise<RunResult> {
+ const dataset = await Dataset.get(config.datasetID)
+ const metrics = await Promise.all(
+ config.metrics.map(id => MetricRegistry.get(id))
+ )
+
+    const startTime = Date.now()
+    const results: CaseResult[] = []
+
+ for (const testCase of dataset.cases) {
+ // Execute the test case
+ const trace = await executeTestCase(testCase, config)
+
+ // Evaluate
+ const evaluations = await EvaluationEngine.evaluateMany(trace, metrics)
+
+ // Check assertions
+ const assertionResults = testCase.expected?.assertions
+ ? await checkAssertions(trace, testCase.expected.assertions)
+ : []
+
+ const allPassed =
+ evaluations.every(e => e.passed) &&
+ assertionResults.every(a => a.passed)
+
+ results.push({
+ caseID: testCase.id,
+ traceID: trace.session.id,
+ status: allPassed ? "passed" : "failed",
+ evaluations,
+ assertionResults,
+ duration: trace.summary.duration
+ })
+
+ if (!allPassed && config.stopOnFailure) break
+ }
+
+ return {
+ id: Identifier.ascending("test-run"),
+ datasetID: config.datasetID,
+ config,
+      startTime,
+ endTime: Date.now(),
+ results,
+ summary: computeSummary(results)
+ }
+ }
+
+ async function executeTestCase(
+ testCase: Dataset.TestCase,
+ config: RunConfig
+  ): Promise<Trace.Complete> {
+ // Create a test session
+ const session = await Session.create()
+
+ // Apply context overrides
+ if (testCase.context?.files) {
+ // Mock file system
+ }
+
+ // Send the prompt
+ await SessionPrompt.prompt({
+ sessionID: session.id,
+ parts: [{ type: "text", text: testCase.prompt }],
+ agent: config.agentName,
+ model: config.modelOverride
+ })
+
+ // Wait for completion
+ await waitForSessionComplete(session.id, config.timeout)
+
+ // Materialize trace
+ return Trace.materialize(session.id)
+ }
+}
+```
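+
+The runner above references an `AssertionResult` type and a `checkAssertions` helper that are not defined in this sketch. A minimal version might look like the following; the `params` field names (such as `toolID`) are assumptions, and the `output-matches`/`custom` branches are left as stubs because the message-part shape is not pinned down in this document.
+
+```typescript
+type AssertionResult = { assertion: Dataset.Assertion; passed: boolean; message: string }
+
+function checkAssertions(trace: Trace.Complete, assertions: Dataset.Assertion[]): AssertionResult[] {
+  return assertions.map((assertion) => {
+    let passed = false
+    switch (assertion.type) {
+      case "tool-called":
+        // Pass if any recorded tool call matches the asserted tool id
+        passed = trace.toolCalls.some((t) => t.id === assertion.params.toolID)
+        break
+      case "tool-not-called":
+        passed = trace.toolCalls.every((t) => t.id !== assertion.params.toolID)
+        break
+      default:
+        // "output-matches" and "custom" would need message text / user-registered hooks
+        passed = false
+    }
+    return { assertion, passed, message: assertion.message }
+  })
+}
+```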
+
+**CLI**:
+```bash
+# Run a dataset with default metrics
+opencode test run smoke-tests
+
+# Run with specific metrics
+opencode test run smoke-tests --metrics tool-error-rate,response-latency
+
+# Run and fail CI if any test fails
+opencode test run regression-suite --fail-on-error --quiet
+
+# Run with prompt override
+opencode test run edge-cases --system-prompt "You are extra cautious"
+
+# Compare two configurations
+opencode test compare smoke-tests \
+ --baseline "model=gpt-4" \
+ --variant "model=claude-3.5-sonnet"
+```
+
+---
+
+## Phase 3: CI Integration (Week 5)
+
+### 3.1 Scorecards
+
+```typescript
+// packages/opencode/src/evaluation/scorecard.ts
+export namespace Scorecard {
+ export type Definition = {
+ id: string
+ name: string
+ description: string
+ version: string
+
+ metrics: ScorecardMetric[]
+
+ passingCriteria: {
+ requireAll: boolean
+ minimumPassing?: number
+ }
+
+ tags: string[]
+ }
+
+ export type ScorecardMetric = {
+ metricID: string
+ weight: number
+ required: boolean
+ thresholdOverride?: number
+ }
+
+ export async function evaluate(
+ scorecard: Definition,
+ trace: Trace.Complete
+  ): Promise<Result> {
+ const metrics = await Promise.all(
+ scorecard.metrics.map(sm => MetricRegistry.get(sm.metricID))
+ )
+
+ const evaluations = await EvaluationEngine.evaluateMany(trace, metrics)
+
+ const results = scorecard.metrics.map((sm, i) => {
+ const evaluation = evaluations[i]
+ const threshold = sm.thresholdOverride ?? metrics[i].threshold?.pass
+
+ return {
+ metricID: sm.metricID,
+ score: evaluation.score,
+ passed: evaluation.passed,
+ required: sm.required,
+ weight: sm.weight
+ }
+ })
+
+ const requiredPassed = results
+ .filter(r => r.required)
+ .every(r => r.passed)
+
+ const totalPassed = results.filter(r => r.passed).length
+ const meetsMinimum = !scorecard.passingCriteria.minimumPassing ||
+ totalPassed >= scorecard.passingCriteria.minimumPassing
+
+ const overallPass = scorecard.passingCriteria.requireAll
+ ? results.every(r => r.passed)
+ : requiredPassed && meetsMinimum
+
+ return {
+ scorecardID: scorecard.id,
+ traceID: trace.session.id,
+ results,
+ overallPass,
+ timestamp: Date.now()
+ }
+ }
+}
+```
+
+**Predefined Scorecards**:
+```typescript
+// packages/opencode/src/evaluation/scorecards/builtin.ts
+export const BuiltinScorecards: Record<string, Scorecard.Definition> = {
+ "regression-prevention": {
+ id: "regression-prevention",
+ name: "Regression Prevention",
+ description: "Ensures code changes don't break existing behavior",
+ version: "1.0.0",
+ metrics: [
+ { metricID: "tool-error-rate", weight: 1, required: true },
+ { metricID: "response-latency", weight: 0.5, required: false },
+ { metricID: "redundant-calls", weight: 0.5, required: false }
+ ],
+ passingCriteria: {
+ requireAll: false,
+ minimumPassing: 2
+ },
+ tags: ["ci", "critical"]
+ },
+
+ "production-ready": {
+ id: "production-ready",
+ name: "Production Ready",
+ description: "Meets production quality standards",
+ version: "1.0.0",
+ metrics: [
+ { metricID: "tool-error-rate", weight: 1, required: true },
+ { metricID: "response-latency", weight: 1, required: true },
+ { metricID: "redundant-calls", weight: 1, required: true },
+ { metricID: "cost-efficiency", weight: 0.5, required: false }
+ ],
+ passingCriteria: {
+ requireAll: true
+ },
+ tags: ["production", "strict"]
+ }
+}
+```
+
+---
+
+### 3.2 GitHub Action Integration
+
+```yaml
+# .github/workflows/eval.yml
+name: Evaluation Gates
+
+on:
+ pull_request:
+ types: [opened, synchronize]
+
+jobs:
+ eval-gate:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Setup OpenCode
+ run: |
+ curl -fsSL https://opencode.ai/install | bash
+ opencode auth login --token ${{ secrets.OPENCODE_TOKEN }}
+
+ - name: Run Regression Tests
+ run: |
+ opencode test run regression-suite \
+ --scorecard regression-prevention \
+ --fail-on-error \
+ --output json > eval-results.json
+
+ - name: Post Results to PR
+ if: always()
+ uses: actions/github-script@v6
+ with:
+ script: |
+ const fs = require('fs')
+ const results = JSON.parse(fs.readFileSync('eval-results.json'))
+
+ const comment = `## Evaluation Results
+
+ ${results.summary.passed}/${results.summary.total} tests passed
+
+ ${results.summary.passed < results.summary.total ? '❌ Some tests failed' : '✅ All tests passed'}
+ `
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: comment
+ })
+
+ - name: Upload Detailed Results
+ if: always()
+ uses: actions/upload-artifact@v3
+ with:
+ name: eval-results
+ path: eval-results.json
+```
+
+---
+
+## Phase 4: Advanced Features (Week 6+)
+
+### 4.1 LLM-as-Judge Metrics
+
+```typescript
+// Example: Hallucination detection
+const hallucinationMetric: Metric.Definition = {
+ id: "hallucination-detection",
+ name: "Hallucination Detection",
+ description: "Detects when the agent makes unsupported claims",
+ version: "1.0.0",
+ category: "correctness",
+ evaluator: {
+ type: "llm",
+ model: "gpt-4o-mini", // Cheaper model for evals
+ prompt: `You are evaluating an AI coding assistant's response for hallucinations.
+
+Context: The assistant had access to these files:
+{{available_files}}
+
+The assistant's response:
+{{response}}
+
+Tool calls made:
+{{tool_calls}}
+
+Question: Did the assistant make any claims about files, functions, or code that it couldn't have known from the available context?
+
+Respond with a score from 0-1:
+- 0 = No hallucinations, all claims are grounded
+- 0.5 = Minor unsupported assumptions
+- 1 = Major hallucinations or fabricated information
+
+Score:`,
+ parseScore: (output: string) => {
+ const match = output.match(/Score:\s*([\d.]+)/)
+ return match ? parseFloat(match[1]) : 0.5
+ }
+ },
+ threshold: { pass: 0.3 },
+ higherIsBetter: false
+}
+```
+
+---
+
+### 4.2 Synthetic Data Generation
+
+```typescript
+// packages/opencode/src/evaluation/synthetic.ts
+export namespace SyntheticData {
+ export type GeneratorConfig = {
+ baseScenarios: string[] // e.g., "create a file", "debug an error"
+ variations: number // How many variations per scenario
+ complexity: "simple" | "medium" | "complex"
+ }
+
+  export async function generate(config: GeneratorConfig): Promise<Dataset.TestCase[]> {
+ const cases: Dataset.TestCase[] = []
+
+ for (const scenario of config.baseScenarios) {
+ // Use LLM to generate variations
+ const prompt = `Generate ${config.variations} variations of this coding task: "${scenario}"
+
+ Complexity level: ${config.complexity}
+
+ For each variation, provide:
+ 1. A clear task description
+ 2. Expected tool usage
+ 3. Success criteria
+
+ Format as JSON array.`
+
+      const variations = JSON.parse(await callLLM("gpt-4", { prompt }))
+
+ for (const variation of variations) {
+ cases.push({
+ id: Identifier.ascending("test-case"),
+ name: variation.description,
+ prompt: variation.description,
+ expected: {
+ toolCalls: variation.expectedTools,
+ assertions: variation.assertions
+ },
+ tags: ["synthetic", config.complexity],
+ metadata: { generatedFrom: scenario }
+ })
+ }
+ }
+
+ return cases
+ }
+}
+```
+
+**CLI**:
+```bash
+# Generate test cases
+opencode dataset generate \
+ --scenarios "file operations,refactoring,debugging" \
+ --variations 5 \
+ --complexity medium \
+ --output edge-cases
+```
+
+---
+
+## Summary: What Gets Built When
+
+**Week 1-2: Foundation**
+- ✅ Trace materialization
+- ✅ Metric registry with 5-10 built-in metrics
+- ✅ Evaluation engine (rule + heuristic)
+- ✅ Storage layer
+- 🔧 CLI: `opencode eval trace <session-id>`
+
+**Week 3-4: Datasets**
+- ✅ Dataset CRUD
+- ✅ Test runner
+- ✅ Assertion framework
+- 🔧 CLI: `opencode test run <dataset>`
+
+**Week 5: CI Integration**
+- ✅ Scorecards
+- ✅ GitHub Action
+- ✅ PR comments with results
+- 🔧 CLI: `opencode test run --fail-on-error`
+
+**Week 6+: Advanced**
+- ⏳ LLM-as-judge metrics
+- ⏳ Synthetic data generation
+- ⏳ Experiment framework (A/B testing)
+- ⏳ Web dashboard for results
+
+---
+
+## Development Philosophy
+
+1. **Start with telemetry** - Already have tool instrumentation, build on it
+2. **Dogfood immediately** - Use it to test Grimoire itself
+3. **Ship incrementally** - Each phase is independently useful
+4. **Learn from usage** - Let real usage guide metric selection
+5. **Keep it fast** - Sub-10min CI runs, real-time feedback
diff --git a/docs/evaluation-ontology.md b/docs/evaluation-ontology.md
new file mode 100644
index 0000000000..e4561fe29a
--- /dev/null
+++ b/docs/evaluation-ontology.md
@@ -0,0 +1,513 @@
+# Evaluation Ontology: First Principles
+
+## Core Entities
+
+### 1. **Trace** (Execution Context)
+The fundamental unit of observable behavior. A Trace represents a complete interaction flow.
+
+```typescript
+type Trace = {
+ id: string // Unique identifier
+ sessionID: string // Which session this belongs to
+ startTime: number
+ endTime?: number
+ status: "running" | "completed" | "failed"
+
+ // Identity
+ agentName: string // Which agent executed this
+ modelConfig: { // Model configuration at time of execution
+ provider: string
+ model: string
+ temperature?: number
+ // ... other model params
+ }
+
+ // Prompt context
+ systemPrompt: string // The actual system prompt used
+ systemPromptVersion?: string // Semantic version or hash
+
+ // Structure
+ messages: Message[] // The full conversation
+ toolCalls: ToolCall[] // All tool invocations
+
+ // Outcomes
+ tokens: TokenUsage
+ cost: number
+
+ // Evaluation
+ evaluations?: Evaluation[] // Assessments of this trace
+}
+```
+
+**Why Trace?**
+- A trace is self-contained - you can replay, analyze, or evaluate it independently
+- It captures the entire context needed to understand "what happened"
+- Maps naturally to OpenTelemetry/observability concepts
+- Already partially exists via Session + Messages + TelemetryEvents
+
+---
+
+### 2. **Evaluation** (Assessment)
+A judgment about a Trace or component thereof.
+
+```typescript
+type Evaluation = {
+ id: string
+ traceID: string
+
+ // What's being evaluated
+ target: {
+ type: "trace" | "message" | "tool_call" | "output"
+ id: string
+ }
+
+ // The evaluation criteria
+ metricID: string // Which metric was applied
+
+ // The judgment
+ score: number // Normalized 0-1 or metric-specific
+ passed: boolean // Did it meet threshold?
+
+ // Context
+ timestamp: number
+ evaluatorType: "rule" | "llm" | "human" | "heuristic"
+ evaluatorID?: string // Which LLM or human
+
+ // Evidence
+ reasoning?: string // Why this score (esp. for LLM judges)
+  metadata?: Record<string, unknown>
+}
+```
+
+**Why separate Evaluation from Trace?**
+- A trace can be evaluated multiple times with different metrics
+- Evaluations can be retroactive - evaluate past traces with new criteria
+- Different stakeholders care about different evaluations
+- Enables A/B testing of evaluation methods themselves
+
+---
+
+### 3. **Metric** (Evaluation Criterion)
+Defines *what* we're measuring and *how*.
+
+```typescript
+type Metric = {
+ id: string
+ name: string
+ description: string
+
+ // What does this measure?
+ domain: "correctness" | "safety" | "efficiency" | "quality" | "compliance"
+
+ // How is it computed?
+ evaluator: {
+ type: "rule" | "llm" | "human" | "heuristic"
+
+ // For rule-based
+ rule?: {
+ expression: string // e.g., "duration < 5000"
+ language: "javascript" | "jsonlogic"
+ }
+
+ // For LLM-based
+ llm?: {
+ prompt: string
+ model: string
+ parseOutput: "boolean" | "score_0_1" | "score_1_10" | "reasoning"
+ }
+
+ // For heuristic
+ heuristic?: {
+ function: string // Name of built-in function
+      params?: Record<string, unknown>
+ }
+ }
+
+ // Interpretation
+ threshold?: number // Pass/fail cutoff
+ higherIsBetter: boolean
+
+ // Metadata
+ version: string
+ tags: string[]
+}
+```
+
+**Built-in Heuristics Examples:**
+- `tool_error_rate`: Ratio of failed tool calls
+- `redundant_tool_calls`: Detects repeated identical calls
+- `hallucination_indicators`: Flags suspicious patterns
+- `token_efficiency`: Output quality per token spent
+
+---
+
+### 4. **Dataset** (Test Cases)
+A collection of inputs with expected behaviors.
+
+```typescript
+type Dataset = {
+ id: string
+ name: string
+ description: string
+ version: string
+
+ cases: TestCase[]
+
+ // Metadata
+ tags: string[] // "regression", "edge_cases", "production_sample"
+ createdAt: number
+ updatedAt: number
+}
+
+type TestCase = {
+ id: string
+
+ // Input
+ prompt: string // What the user asks
+ context?: { // Optional environmental context
+ files?: string[] // Which files exist
+ workingDirectory?: string
+ }
+
+ // Expected behavior (can be partial)
+ expected?: {
+ toolCalls?: string[] // Expected tools to be called
+ output?: string // Exact or fuzzy match
+ assertions?: Assertion[] // Custom checks
+ }
+
+ // Metadata
+ tags: string[]
+ difficulty?: "easy" | "medium" | "hard"
+ source?: "synthetic" | "production" | "manual"
+}
+
+type Assertion = {
+ type: "contains" | "not_contains" | "matches" | "tool_called" | "custom"
+ value: any
+ message?: string
+}
+```
+
+**Why separate Dataset?**
+- Enables versioning of test suites
+- Can run same dataset across different agent configs
+- Datasets can be shared/imported
+- Natural basis for CI gates: "Run dataset X, all cases must pass metric Y"
+
+---
+
+### 5. **Experiment** (Comparative Run)
+Structured comparison of different configurations.
+
+```typescript
+type Experiment = {
+ id: string
+ name: string
+ description: string
+
+ // What's being tested
+ datasetID: string
+
+ // Variants
+ variants: Variant[]
+
+ // Results
+ runs: Run[]
+
+ // Metadata
+ status: "running" | "completed" | "failed"
+ startTime: number
+ endTime?: number
+}
+
+type Variant = {
+ id: string
+ name: string // "baseline", "new_prompt", "gpt4o"
+
+ config: {
+ agentName?: string
+ systemPrompt?: string
+ model?: string
+ temperature?: number
+ // ... any configurable parameter
+ }
+}
+
+type Run = {
+ variantID: string
+ testCaseID: string
+ traceID: string // Links to the actual execution
+ evaluations: Evaluation[]
+}
+```
+
+**Why Experiment?**
+- Formalizes A/B testing
+- Enables statistical comparisons (see the sketch below)
+- Natural fit for prompt optimization
+- Can track what was learned: "new_prompt reduced error_rate by 15%"
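+
+As a minimal sketch of what "statistical comparison" could mean here (illustrative only; the ontology does not fix an aggregation rule), per-variant mean scores can be derived directly from an Experiment's runs:
+
+```typescript
+// Mean score per variant and per metric, computed from Experiment.runs.
+function summarizeExperiment(experiment: Experiment): Record<string, Record<string, number>> {
+  const acc: Record<string, Record<string, { total: number; count: number }>> = {}
+  for (const run of experiment.runs) {
+    for (const evaluation of run.evaluations) {
+      const byMetric = (acc[run.variantID] ??= {})
+      const cell = (byMetric[evaluation.metricID] ??= { total: 0, count: 0 })
+      cell.total += evaluation.score
+      cell.count += 1
+    }
+  }
+  const means: Record<string, Record<string, number>> = {}
+  for (const [variantID, byMetric] of Object.entries(acc)) {
+    means[variantID] = Object.fromEntries(
+      Object.entries(byMetric).map(([metricID, cell]) => [metricID, cell.total / cell.count]),
+    )
+  }
+  return means
+}
+```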
+
+---
+
+### 6. **Scorecard** (Quality Contract)
+A bundle of metrics that define "good enough".
+
+```typescript
+type Scorecard = {
+ id: string
+ name: string
+ description: string
+
+ // Which metrics matter?
+ metrics: ScorecardMetric[]
+
+ // How do we aggregate?
+ passingCriteria: {
+ requireAll: boolean // AND vs OR
+ minimumPassing?: number // At least N metrics must pass
+ }
+
+ // Metadata
+ version: string
+ tags: string[]
+}
+
+type ScorecardMetric = {
+ metricID: string
+ weight: number // For weighted scoring
+ required: boolean // Must pass vs nice-to-have
+ threshold?: number // Override metric default
+}
+```
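+
+The `weight` field implies some form of weighted aggregation, which neither this document nor the implementation sketch pins down. One possible interpretation (an assumption, not defined behavior) is a weighted pass-rate over the scorecard's metrics:
+
+```typescript
+// Weighted pass-rate: share of the total weight carried by passing metrics.
+function weightedPassRate(results: Array<{ passed: boolean; weight: number }>): number {
+  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0)
+  if (totalWeight === 0) return 0
+  const passedWeight = results.filter((r) => r.passed).reduce((sum, r) => sum + r.weight, 0)
+  return passedWeight / totalWeight
+}
+```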
+
+**Why Scorecard?**
+- Enables "shift left" - define quality gates early
+- Different stages need different scorecards (dev vs staging vs prod)
+- Can version scorecards as requirements evolve
+- Natural CI integration: "This PR must pass scorecard:regression-prevention"
+
+---
+
+## Relationships
+
+```
+Dataset [1] ──< [N] TestCase
+TestCase [1] ──< [N] Trace (via Experiment or direct execution)
+Trace [1] ──< [N] ToolCall (via TelemetryEvent)
+Trace [1] ──< [N] Evaluation
+Evaluation [N] >── [1] Metric
+
+Experiment [1] ──< [N] Variant
+Experiment [1] ──> [1] Dataset
+Experiment [1] ──< [N] Run
+Run [1] ──> [1] Trace
+Run [1] ──> [1] TestCase
+
+Scorecard [1] ──< [N] ScorecardMetric
+ScorecardMetric [N] >── [1] Metric
+```
+
+---
+
+## Storage Design
+
+### Current State (What Exists)
+```typescript
+// Storage paths
+["session", projectID, sessionID] -> Session.Info
+["message", sessionID, messageID] -> Message
+["telemetry", "tools"] -> TelemetrySummary
+```
+
+### Proposed Additions
+```typescript
+// Traces (augmented sessions)
+["trace", projectID, traceID] -> Trace
+// Trace is basically Session + materialized tool events + evaluation results
+
+// Evaluation data
+["metric", metricID] -> Metric
+["evaluation", traceID, evaluationID] -> Evaluation
+
+// Test data
+["dataset", datasetID] -> Dataset
+["dataset", datasetID, "cases", caseID] -> TestCase
+
+// Experiments
+["experiment", experimentID] -> Experiment
+["experiment", experimentID, "runs", runID] -> Run
+
+// Scorecards
+["scorecard", scorecardID] -> Scorecard
+
+// Baselines (for comparison)
+["baseline", name] -> {
+ traceID: string
+ timestamp: number
+  metrics: Record<string, number>
+}
+```
+
+---
+
+## Integration with Existing System
+
+### Already Have (Leverage)
+1. **TelemetryEvent** → Maps to ToolCall in Trace
+2. **Session + Messages** → Core of Trace
+3. **ToolHistory** → Can evolve into TraceIndex
+4. **Storage abstraction** → Can store new entities
+5. **Bus system** → Can emit evaluation events
+
+### Need to Build
+1. **Trace materialization** - Convert Session → Trace (capture full context)
+2. **Metric registry** - Define and load evaluation metrics
+3. **Evaluator engine** - Execute metrics against traces
+4. **Dataset management** - CRUD for test cases
+5. **Experiment runner** - Orchestrate comparative runs
+6. **Scorecard evaluator** - Check if trace meets quality bar
+
+### Migration Path
+**Phase 1: Trace Foundation**
+- Extend Session with Trace concept
+- Make system prompt, model config first-class
+- Ensure all tool events link to traces
+
+**Phase 2: Basic Evaluation**
+- Implement Metric schema
+- Build rule-based evaluator
+- Add evaluations to traces
+
+**Phase 3: Datasets & Experiments**
+- Dataset storage + CRUD
+- Simple experiment runner
+- CLI: `opencode eval run dataset:smoke-tests`
+
+**Phase 4: Advanced Features**
+- LLM-as-judge metrics
+- Scorecards + CI gates
+- Synthetic data generation
+
+---
+
+## Key Design Principles
+
+### 1. **Immutability**
+- Traces are immutable once completed
+- Evaluations are additive (never mutate a score)
+- Enables time-travel debugging
+- Can re-evaluate historical data
+
+### 2. **Composability**
+- Metrics compose into Scorecards
+- Datasets are just collections of TestCases
+- Experiments reference Datasets
+- Everything has an ID, everything can reference
+
+### 3. **Observability-Native**
+- Every entity has timestamps
+- Every operation emits events (via Bus)
+- Natural fit for OpenTelemetry export
+- Can stream evaluations in real-time
+
+### 4. **Schema Evolution**
+- Version everything (Metric v1.2.0, Dataset v3)
+- Additive changes only (new fields, not breaking)
+- Old data remains valid
+- Can re-run with new metric versions
+
+### 5. **Developer Ergonomics**
+- Defaults for 90% case: `opencode eval` just works
+- Progressive disclosure: simple → powerful
+- Git-like model: local-first, can push/share
+- Natural language where possible: "Test the auth flow"
+
+---
+
+## Example Workflows
+
+### Workflow 1: Add a Regression Test
+```bash
+# Capture current behavior as a test case
+opencode eval capture "Fix the login bug" --output dataset:auth-tests
+
+# Later, ensure it doesn't regress
+opencode eval run dataset:auth-tests --scorecard:regression
+```
+
+### Workflow 2: Optimize a Prompt
+```bash
+# Create experiment with 3 prompt variants
+opencode eval experiment \
+ --dataset=edge-cases \
+ --baseline="current prompt" \
+ --variant-1="revised prompt v1" \
+ --variant-2="revised prompt v2" \
+ --metrics=accuracy,latency,cost
+
+# Shows comparison table, picks winner
+```
+
+### Workflow 3: CI Gate
+```yaml
+# .github/workflows/pr.yml
+- name: Eval Gate
+ run: |
+ opencode eval run dataset:critical-paths \
+ --scorecard:production-ready \
+ --fail-on-regression
+```
+
+### Workflow 4: Production Monitoring
+```bash
+# Sample 1% of production traces
+opencode eval sample --rate=0.01 --metrics=safety,hallucination
+
+# Daily report
+opencode eval report --since=24h --compare-to=baseline
+```
+
+---
+
+## Open Questions
+
+1. **Granularity of Traces**: Should we trace individual tool calls or just full sessions?
+ - **Answer**: Sessions as traces, tool calls as spans within traces
+
+2. **Evaluation Frequency**: Real-time, batch, or on-demand?
+ - **Answer**: All three - streaming for CI, batch for experiments, on-demand for analysis
+
+3. **LLM-as-Judge Costs**: How to make evaluations affordable at scale?
+ - **Answer**: Sampling, caching, use cheaper models for routine checks
+
+4. **Metric Versioning**: How to handle metric changes over time?
+ - **Answer**: Semantic versioning, re-run with new versions is explicit
+
+5. **Synthetic vs Real Data**: How to generate good test cases?
+ - **Answer**: Start with production sampling, evolve to synthetic generators
+
+6. **Baseline Drift**: How to keep baselines current as system improves?
+   - **Answer**: Update baselines automatically when new records are set, with manual approval
+
+---
+
+## Success Metrics for This System
+
+1. **Time to detect regression**: < 10 minutes (in CI)
+2. **False positive rate**: < 5% (don't block good changes)
+3. **Coverage**: 80%+ of tool operations have telemetry
+4. **Adoption**: Team actually uses it (ergonomics matter)
+5. **Insight generation**: Surfaces actionable patterns weekly
+
+---
+
+## Conclusion
+
+The ontology builds on three core ideas:
+
+1. **Trace as the atomic unit** - Everything flows from captured executions
+2. **Evaluation as a separate concern** - Decoupled from generation, versionable, composable
+3. **Developer-centric design** - Built for the team using OpenCode daily, not abstract metrics
+
+This maps naturally to EvalOps' mission: ship LLM changes confidently by making quality observable, measurable, and gateable.
diff --git a/docs/evaluation-status.md b/docs/evaluation-status.md
new file mode 100644
index 0000000000..97a174be79
--- /dev/null
+++ b/docs/evaluation-status.md
@@ -0,0 +1,256 @@
+# Evaluation Framework Implementation Status
+
+## Completed ✅
+
+### Stream 1: Trace Foundation
+**Commit**: `0e92e2f8` - "trace: implement trace foundation"
+
+- ✅ Created `Trace` namespace with complete type definitions
+- ✅ Implemented `Trace.materialize()` to convert sessions to traces
+- ✅ Added trace storage layer (`get`, `list`, `exists`, `remove`)
+- ✅ Implemented filtering for trace queries
+- ✅ Added `trace.completed` event emission
+- ✅ Computed summary statistics (duration, tokens, cost, errors)
+
+**Files Created**:
+- `packages/opencode/src/trace/index.ts` (247 lines)
+
+**Key Capabilities**:
+```typescript
+// Materialize any session into a trace
+const trace = await Trace.materialize(sessionID)
+
+// Query traces with filters
+for await (const trace of Trace.list({ hasErrors: true, minDuration: 5000 })) {
+ console.log(trace.summary)
+}
+
+// Get specific trace
+const trace = await Trace.get(traceID)
+```
+
+---
+
+## Next Steps (Ready to Implement)
+
+### Stream 2: Metric Registry (2-3 hours)
+**Goal**: Define evaluation criteria
+
+**Steps**:
+1. Create metric schema (`packages/opencode/src/evaluation/metric.ts`)
+2. Implement metric registry (CRUD operations)
+3. Build 5-7 built-in metrics:
+ - `tool-error-rate`: % of failed tool calls
+ - `response-latency`: Total duration
+ - `redundant-calls`: Detect repeated calls
+ - `cost-efficiency`: Cost per successful operation
+   - `token-efficiency`: Output tokens / total tokens (both efficiency metrics are sketched below)
+4. Create rule-based evaluator (JavaScript expressions)
+5. Add metric storage layer
+6. Implement metric versioning
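+
+A sketch of how the two efficiency metrics above might be implemented as heuristics follows. The `summary.cost` and `summary.tokens` fields come from the trace sketch in `docs/evaluation-implementation.md`; the `input`/`output` fields on `TokenUsage` and the `status` field on telemetry events are assumptions.
+
+```typescript
+export const EfficiencyHeuristics = {
+  // Cost per successful tool operation (falls back to total cost when nothing succeeded)
+  costEfficiency(trace: Trace.Complete): number {
+    const successes = trace.toolCalls.filter((t) => t.status !== "error").length
+    return successes === 0 ? trace.summary.cost : trace.summary.cost / successes
+  },
+  // Share of tokens spent on output rather than input
+  tokenEfficiency(trace: Trace.Complete): number {
+    const { input, output } = trace.summary.tokens
+    const total = input + output
+    return total === 0 ? 0 : output / total
+  },
+}
+```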
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/metric.ts`
+- `packages/opencode/src/evaluation/heuristics.ts`
+- `packages/opencode/src/evaluation/metrics/builtin.ts`
+
+---
+
+### Stream 3: Evaluation Engine (3-4 hours)
+**Depends on**: Streams 1 & 2
+
+**Steps**:
+1. Create evaluation result schema
+2. Implement heuristic evaluator
+3. Implement rule evaluator
+4. Build evaluation engine orchestrator
+5. Add evaluation storage
+6. Create evaluation query API
+7. Emit evaluation events
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/engine.ts`
+- `packages/opencode/src/evaluation/index.ts`
+
+---
+
+### Stream 4: Dataset Management (2-3 hours)
+**Can run in parallel with Stream 3**
+
+**Steps**:
+1. Create dataset schema
+2. Implement dataset CRUD
+3. Create test case schema with assertions
+4. Build dataset storage layer
+5. Add dataset CLI commands
+6. Create dataset import/export
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/dataset.ts`
+- `packages/opencode/src/cli/cmd/dataset.ts`
+
+---
+
+### Stream 5: Test Runner (3-4 hours)
+**Depends on**: Streams 3 & 4
+
+**Steps**:
+1. Create test execution engine
+2. Implement assertion framework
+3. Build test result aggregation
+4. Add parallel execution support
+5. Create CLI: `opencode test run`
+6. Add result output formats
+7. Implement fail-on-error mode
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/runner.ts`
+- `packages/opencode/src/cli/cmd/test.ts`
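+
+Once this lands, a run could look roughly like this (the flags are illustrative, not final):
+
+```bash
+opencode test run refactor-smoke --parallel 4 --format json --fail-on-error
+```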
+
+---
+
+### Stream 6: Scorecards (2 hours)
+**Depends on**: Stream 3
+
+**Steps**:
+1. Create scorecard schema
+2. Implement scorecard evaluator
+3. Build 2-3 built-in scorecards
+4. Add scorecard storage
+5. Create scorecard CLI
+6. Integrate with test runner
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/scorecard.ts`
+- `packages/opencode/src/evaluation/scorecards/builtin.ts`
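+
+Conceptually, a scorecard is a named bundle of metrics with thresholds that together form a pass/fail gate. A sketch with assumed field names:
+
+```typescript
+// Illustrative built-in scorecard (Stream 6 sketch; schema not final)
+export const defaultScorecard = {
+  id: "default",
+  metrics: [
+    { id: "tool-error-rate", max: 0.05 },    // fail if more than 5% of tool calls error
+    { id: "response-latency", max: 60_000 }, // fail if the trace took longer than 60s
+  ],
+}
+```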
+
+---
+
+### Stream 7: CLI Integration (Ongoing, 1-2 hours)
+**Parallel with all streams**
+
+**Steps**:
+1. Create `opencode eval` command group
+2. Add `opencode eval trace <id>`
+3. Add `opencode eval run <dataset>`
+4. Create `opencode dataset` command group
+5. Create `opencode test` command group
+6. Add pretty formatting
+7. Create help documentation
+
+**Files to Create/Modify**:
+- `packages/opencode/src/cli/cmd/eval.ts`
+- Update `packages/opencode/src/index.ts` to register commands
+
+---
+
+### Stream 8: CI/CD Integration (1-2 hours)
+**Depends on**: Streams 5 & 6
+
+**Steps**:
+1. Create GitHub Action workflow example
+2. Add PR comment formatting
+3. Implement baseline comparison
+4. Add regression detection
+5. Create CI-friendly output formats
+6. Document setup guide
+
+**Files to Create**:
+- `.github/workflows/eval-example.yml`
+- `docs/ci-integration.md`
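+
+A minimal sketch of the CI gate this enables, assuming the planned `opencode test run` command and a hypothetical `opencode eval compare` subcommand for baseline comparison (neither exists yet; names and flags are illustrative):
+
+```bash
+# Run the smoke dataset and capture machine-readable results (planned command, not implemented)
+opencode test run smoke --format json > eval-results.json
+
+# Compare against a committed baseline to catch regressions (hypothetical subcommand)
+opencode eval compare eval-results.json baseline.json --fail-on-regression
+```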
+
+---
+
+## Implementation Timeline
+
+**Already Complete**:
+- ✅ Trace Foundation (Stream 1)
+- ✅ Implementation plan documents
+- ✅ Ontology design
+
+**Remaining Work**: ~16-20 hours
+- Stream 2: Metric Registry (2-3h)
+- Stream 3: Evaluation Engine (3-4h)
+- Stream 4: Dataset Management (2-3h)
+- Stream 5: Test Runner (3-4h)
+- Stream 6: Scorecards (2h)
+- Stream 7: CLI Integration (1-2h)
+- Stream 8: CI/CD Integration (1-2h)
+
+---
+
+## How to Continue
+
+### Option 1: Sequential Implementation
+Implement streams in dependency order:
+1. Stream 2 (Metrics)
+2. Stream 3 (Engine)
+3. Streams 4 + 6 in parallel
+4. Stream 5
+5. Streams 7 + 8
+
+### Option 2: MVP First
+Build minimal viable product:
+1. Stream 2: Just 3 metrics (error-rate, latency, cost)
+2. Stream 3: Basic engine (heuristics only)
+3. Stream 7: Simple CLI (`opencode eval trace`)
+4. Test and iterate
+
+### Option 3: Parallel Teams
+If multiple developers:
+- Dev 1: Streams 2 → 3 → 6
+- Dev 2: Stream 4 → 5
+- Dev 3: Stream 7 (ongoing)
+
+---
+
+## Key Design Decisions Made
+
+1. **Traces are immutable** - Once materialized, they don't change
+2. **Evaluations are separate** - Can evaluate/re-evaluate traces anytime
+3. **Storage is local-first** - All data in project storage
+4. **Events for observability** - Bus system for real-time notifications
+5. **Progressive disclosure** - Simple cases work out of the box; complex cases are supported
+
+---
+
+## Testing Strategy
+
+Each stream should include:
+1. Unit tests for core logic
+2. Integration tests with storage
+3. CLI tests for user-facing commands
+4. Example usage in docs
+
+---
+
+## Success Metrics
+
+### Phase 1 (Streams 1-3)
+- [ ] Can materialize traces from sessions
+- [ ] Can evaluate traces with built-in metrics
+- [ ] Can query evaluation history
+
+### Phase 2 (Streams 4-5)
+- [ ] Can create and run test datasets
+- [ ] Assertions work correctly
+- [ ] Results are actionable
+
+### Phase 3 (Streams 6-8)
+- [ ] Scorecards enforce quality gates
+- [ ] CI integration blocks bad PRs
+- [ ] Documentation is complete
+
+---
+
+## Next Command to Run
+
+To continue implementation:
+
+```bash
+# Stream 2: Create metric registry
+cd packages/opencode/src
+mkdir -p evaluation/metrics
+```
+
+Then create the files outlined in Stream 2 above.
diff --git a/docs/plugin-starter.md b/docs/plugin-starter.md
new file mode 100644
index 0000000000..2da7ec73ad
--- /dev/null
+++ b/docs/plugin-starter.md
@@ -0,0 +1,29 @@
+# Plugin Starter Template
+
+Use the helpers exported from `@opencode-ai/plugin` to build tools quickly:
+
+```ts
+import { tool } from "@opencode-ai/plugin"
+
+export const hello = tool({
+ description: "Greet a name",
+ args: {
+ name: tool.schema.string().describe("Name to greet"),
+ },
+ async execute(args, ctx) {
+ return {
+ title: `Hello, ${args.name}!`,
+ output: `Session ${ctx.sessionID} says hello to ${args.name}.`,
+ metadata: {
+ length: args.name.length,
+ },
+ }
+ },
+})
+```
+
+Guidelines:
+- Always describe arguments with `tool.schema` so the host can validate inputs.
+- Return either a string or an object containing `output`, plus optional `title` and `metadata`.
+- Use the tool telemetry (`measure`) and workspace guards when calling back into core tools.
+- Test plugins by importing the generated hook into `packages/plugin/src/example.ts` and running `bunx tsc --noEmit`.
diff --git a/docs/tool-authoring.md b/docs/tool-authoring.md
new file mode 100644
index 0000000000..24cd7956bd
--- /dev/null
+++ b/docs/tool-authoring.md
@@ -0,0 +1,29 @@
+# Tool Authoring Guide
+
+This project now ships shared helpers so every tool behaves consistently.
+
+## Instrumentation
+- Wrap long-running work with `measure({ id, ctx, params, run })` from `packages/opencode/src/tool/telemetry.ts` (see the sketch after this list).
+- Each call logs execution duration, call id, and status, helping us spot slow or flaky commands while developing with `bun dev`.
+- `measure()` also publishes a `tool.telemetry` bus event. The TUI subscribes and renders these entries in real time (`tele | ToolName 0.42s`). Tap into the same stream via `Bus.subscribe(ToolTelemetry.Event.Sampled, ...)` for custom dashboards.
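+
+A minimal sketch of instrumenting a tool body with `measure()`; the import path is relative to `packages/opencode/src`, `ctx`/`params` come from the tool's `execute` signature, and `doExpensiveWork` stands in for the actual tool logic:
+
+```ts
+import { measure } from "../tool/telemetry"
+
+// Inside a tool's execute(params, ctx):
+const output = await measure({
+  id: "mytool", // shows up in telemetry entries and `opencode stats`
+  ctx,
+  params,
+  run: async () => {
+    // the long-running work being timed; duration and status are recorded automatically
+    return await doExpensiveWork(params)
+  },
+})
+```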
+
+## Workspace Safety
+- Use `guard()` from `packages/opencode/src/tool/workspace.ts` to resolve paths and enforce the workspace boundary (see the sketch after this list).
+- Pass `message` if you need a custom error; pass `bypass: true` only for trusted internal flows.
+- Tools such as `edit`, `write`, `multiedit`, and `patch` already wrap user-provided paths with `guard()`. Follow the same pattern when building new file mutators.
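+
+A short sketch of guarding a user-supplied path before writing; the option names match the bullets above, but the exact `guard()` signature and its return value (assumed to be a resolved absolute path inside the workspace) are not verified here, and `params.filePath`/`contents` are placeholders:
+
+```ts
+import { guard } from "../tool/workspace"
+
+// Resolve the user-supplied path and refuse anything outside the workspace
+const safePath = guard(params.filePath, {
+  message: "write is limited to files inside the current workspace",
+})
+await Bun.write(safePath, contents)
+```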
+
+## Troubleshooting
+- If you see `tool.telemetry` entries with `status=error`, inspect the associated `error` string—it's propagated from the thrown exception.
+- Workspace errors typically originate from `guard()`. Confirm the tool receives absolute paths rooted in `Instance.directory` or set `bypass` explicitly for trusted cases (e.g., generated temp files).
+- When adding tests around I/O, use `tmpdir()` to create and clean up isolated directories; the helper ensures telemetry logs stay focused on the test workspace.
+- For tool stats, run `opencode stats`. The display now groups the last session’s telemetry entries by tool, listing total runs, average duration, and error count so you can spot hotspots quickly.
+
+## Plugin Tools
+- Plugin authors can return either a plain string or `{ output, title?, metadata? }`.
+- See `packages/plugin/src/tool.ts` for the unified `ToolDefinition` and `ToolResult` types.
+
+## Testing
+- Prefer table-driven tests under `packages/opencode/test/tool`. Use `tmpdir()` to create isolated workspaces.
+- Capture streamed metadata (see `bash.test.ts`) to ensure tools emit incremental updates as expected.
+
+Small, consistent helpers keep our tool surface predictable and easier to debug. Add to this document whenever you introduce new patterns that other contributors should follow.
diff --git a/docs/tui-ink-migration.md b/docs/tui-ink-migration.md
new file mode 100644
index 0000000000..2e457d4479
--- /dev/null
+++ b/docs/tui-ink-migration.md
@@ -0,0 +1,139 @@
+# Ink Migration Research
+
+This document captures the current state of the Go/Bubble Tea TUI, findings from reviewing Continue’s Ink-based CLI, and a recommended migration path. It is meant to serve as a primer for anyone evaluating or executing a future move to Ink.
+
+## 1. Current Go TUI Architecture
+
+### 1.1 Launch flow
+- `packages/opencode/src/cli/cmd/tui.ts` is the entrypoint. It bootstraps the Opencode server, locates the platform-specific Go binary, and spawns it with relevant flags (`--model`, `--session`, etc.).
+- The CLI exposes `OPENCODE_TUI_PATH` for overrides (useful for experimenting with alternative implementations).
+- The Go binary lives under `packages/tui` and is built with Bubble Tea + the Charmbracelet ecosystem.
+
+### 1.2 Packages of interest
+- `packages/tui/internal/app`: application state, config integration, session metadata, persisted “tui state” (recent models/agents, message history, toggles).
+- `packages/tui/internal/tui/tui.go`: Bubble Tea `Model` implementation. Handles Init/Update/View loop, key bindings, modal stack, toasts, diff overlay, etc.
+- `packages/tui/internal/components`: UI primitives composed by the model.
+ - `chat`: editor, messages pane, caches, markdown rendering.
+ - `commands`: command palette, leader key management.
+ - `dialog`: completion popovers (command, file, agent), session picker, confirmation dialogs.
+ - `diff`: diff overlay with syntax highlighting.
+ - `textarea`: multiline editor with history, mode switch (chat vs bash), key debounce logic.
+ - `status`: bottom status bar (cwd/git branch, model info, latency, queue state).
+ - `toast`: transient notifications.
+ - `qr`, `modal`, `list`: supporting components for login flows, overlays, navigation lists.
+- `packages/tui/internal/app/state.go`: persists TUI state to TOML (theme, recently used models/agents, message history, toggles).
+
+### 1.3 Feature inventory
+The current TUI provides a rich, IDE-like experience in the terminal. Key features include:
+
+| Feature | Go implementation notes |
+| --- | --- |
+| Home screen | ASCII logo, quick start shortcuts, model summary (see `tui.Home()` in `tui.go`). |
+| Multi-pane chat view | Split layout (messages left, editor bottom, optional diff overlay, modals stacked on top). |
+| Streaming messages | Bubble Tea subscriptions update `chat.MessagesComponent` incrementally. |
+| Markdown + syntax highlighting | `glamour`, `chroma` render markdown and diffs. |
+| File editor integration | Textarea component with history, command detection (`/`, `@`, `!`), bash mode toggle. |
+| Command palette | Leader key sequences, completion dialogs for commands/files/agents. |
+| Status bar | Displays cwd/git branch, session status, cost & latency, background tasks. |
+| Toast notifications | Non-blocking success/error banners via `toast.New*`. |
+| Modals & selectors | Session picker with rename, confirm dialogs, login prompts. |
+| Diff viewer | Full-screen overlay for patch review with scroll + syntax colors. |
+| Telemetry integration | Bottom indicators for tool timings, agent model, plan status. |
+| Key handling | Debounced exit and interrupt keys, leader key sequences, ctrl+z suspend, mouse wheel scroll. |
+| Persistence | TOML state file for recents/history toggles, updated through `state.go`. |
+| Server bridge | Communicates with Opencode server via `app.Client` interfaces (sessions, prompts, tools, telemetry). |
+
+### 1.4 Input/event flow
+- Bubble Tea `Update` function orchestrates key events. It routes to editor, commands, modals, or toasts.
+- Commands are defined in `packages/tui/internal/commands` and matched via `Commands.Matches` with leader flag support.
+- Completion dialog logic selects providers (`commandProvider`, `fileProvider`, `symbolsProvider`, `agentsProvider`).
+- Background tasks: diff overlay, telemetry updates, plan watchers, login flows, file watchers (through `app.Watchers`).
+
+### 1.5 Packaging & distribution
+- Go binary is embedded in npm package (`packages/opencode/bin/opencode`).
+- Cross-platform distribution uses Go compiler, minimal runtime dependencies, near-instant startup.
+
+## 2. Continue’s Ink CLI (Reference Implementation)
+We surveyed https://github.com/continuedev/continue (locally at `/Users/jonathanhaas/Documents/Dev/continue`).
+
+### 2.1 Stack overview
+- Entire CLI lives under `extensions/cli` and is written in TypeScript.
+- UI is implemented with Ink and React components (`extensions/cli/src/ui`).
+- State is provided through custom service containers (`extensions/cli/src/services`), contexts, and hooks.
+- Packaging via npm scripts: `tsc` + bundling (`build.mjs`), shipped as JS binaries (`dist/index.js`), no Go binaries involved.
+
+### 2.2 UI component structure
+- `AppRoot.tsx` wraps the app in `NavigationProvider` and renders `TUIChat`.
+- `TUIChat.tsx` orchestrates layout: chat history, editor, status bars, diff viewer, session selectors, modals, update notifications.
+- Numerous components mirror the complexity of our Go TUI: Markdown renderer, syntax highlighting, model selectors, slash command UI, diff viewer, resource debug bar, etc.
+- `extensions/cli/spec/tui.md` documents Ink stack and UI requirements (git/cwd display, etc.).
+
+### 2.3 Key takeaways
+- Ink can support a large-scale, feature-rich TUI given sufficient component scaffolding.
+- Continue leans on React conventions (contexts, hooks) to manage global state and service interactions, which aligns well with our TS codebase.
+- Distribution is via the Node runtime (npm package). Startup will be slower than a precompiled Go binary but acceptable for a modern CLI.
+
+## 3. Proposed Migration Strategy
+This is a multi-phase effort; start with research and proof-of-concept.
+
+### Phase 0 — Documentation (you are here)
+- Capture architecture of current Go TUI and reference Ink implementation (this document).
+
+### Phase 1 — Proof of concept
+- Create `packages/opencode/src/tui-poc.tsx` using Ink (see the sketch after this list).
+- Replicate the “home” screen (logo, quick shortcuts, model summary, text input).
+- Wire to existing Opencode server bootstrap for data (reuse `bootstrap` from `tui.ts`).
+- Measure startup time and memory vs. Go binary.
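+
+A minimal Ink sketch of what the Phase 1 home screen could start from, assuming `ink` and `react` as dependencies; the model string and quick-start hints are placeholders, and wiring to the real `bootstrap` from `tui.ts` is left out:
+
+```tsx
+import React from "react"
+import { render, Box, Text } from "ink"
+
+// Bare-bones home screen: logo line, model summary, quick-start hints
+function Home(props: { model: string }) {
+  return (
+    <Box flexDirection="column" padding={1}>
+      <Text bold>opencode</Text>
+      <Text>Model: {props.model}</Text>
+      <Text dimColor>Press / for commands, @ to mention files, ! for bash mode</Text>
+    </Box>
+  )
+}
+
+render(<Home model="anthropic/claude-sonnet" />)
+```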
+
+### Phase 2 — Feature parity plan
+For each Go component, define the Ink equivalent and implementation notes:
+
+| Go component | Responsibility | Ink plan |
+| --- | --- | --- |
+| `chat.MessagesComponent` | Streaming message list, markdown render, tool traces | Ink list view + custom markdown renderer (`ink-markdown`, `marked-terminal`). Maintain virtualized list for performance. |
+| `chat.EditorComponent` | Multiline editor, history, slash commands, bash mode | Build Ink component using raw stdin handling, history state, placeholder hints. Evaluate community packages (`ink-use-stdin`, `ink-text-input`) vs custom. |
+| `dialog.CompletionDialog` | Slash command & @ mention completion overlays | Overlay component via an Ink `<Box>` with absolute positioning (managed via terminal columns) + keyboard navigation. |
+| `commands` | Leader key handling, command routing | Reuse existing TS command definitions. Implement keyboard handler hook to track leader sequences and debounced keys (interrupt/exit). |
+| `diff.DiffComponent` | Full-screen diff overlay, syntax highlight | Use `diff` + `cli-highlight` or `shiki` for syntax, overlay with an Ink `<Box>` taking full width/height. |
+| `toast` | Temporary banners | Ink component anchored top-right/bottom. Manage lifetime via `setTimeout`. |
+| `status.StatusComponent` | Bottom status bar, git/cwd, model info, tool telemetry | Compose `<Box>` rows with computed spans; reuse existing TS providers for data (git/cwd logic already in TS). |
+| `modal` | Session selector, rename dialog, login prompt | Portal-like Ink component triggered via context state. |
+| `qr` | ASCII QR codes for login flows | Use `qrcode-terminal` library. |
+| `list` | Generic selection lists (sessions, models) | Build re-usable Ink list component with highlight + filtering support. |
+| `app.State` persistence | Recents, toggles, history stored as TOML | Reuse existing TS persistence utilities (`Config`, `Session`, `Storage`) or port `state.go` logic to TS module. |
+
+### Phase 3 — Infra & packaging
+- Decide on runtime: require Bun/Node, or explore `bun build --compile` for native binaries.
+- Update CLI entrypoint to detect and launch Ink version (guarded by env flag for beta testers).
+- Ensure cross-platform behavior (macOS, Linux, Windows). Test terminal compatibility (colors, resizing, mouse scroll).
+- Integrate CI (lint, tests) for new TUI. Reuse `vitest` for component tests similar to Continue’s `extensions/cli/src/ui/__tests__`.
+
+### Phase 4 — Feature completion & rollout
+- Incrementally port features from Go components, verifying against feature checklist.
+- Provide fallback to Go TUI until Ink reaches parity (controlled by flag).
+- Document migration path for users (release notes, README updates).
+
+## 4. Risks & Considerations
+- **Performance:** Node/Ink startup will be slower than Go. Need benchmarks; possibly mitigate by keeping Go binary as optional fast mode.
+- **Key handling:** Reimplement complex keybindings (leader sequences, debounced interrupt/exit) carefully to avoid regressions.
+- **Streaming:** Ensure Ink rendering remains responsive during long-running operations (might require throttling updates or using Ink's `<Static>` regions).
+- **Terminal capability detection:** Continue uses contexts to manage width/height; we must replicate status line/bottom bar layout across different terminal sizes.
+- **Packaging:** If we depend on Bun/Node availability, document prerequisites; bundling standalone binaries increases maintenance.
+- **Testing:** Snapshots for Ink components can be brittle, so we need a testing story (Continue uses `vitest` + Ink render tests).
+
+## 5. Next Actions
+1. Track this work in an issue (see draft below).
+2. Stand up `tui-poc.tsx` and benchmark.
+3. Produce a detailed feature parity checklist with owners/estimates.
+4. Decide on packaging strategy early to avoid surprises late in migration.
+
+### Draft GitHub issue summary
+- Title: “Evaluate migrating Go-based TUI to Ink”
+- Checklist covering research, POC, packaging, parity plan, report back with recommendation.
+
+---
+
+**References**
+- Opencode Go TUI source: `packages/tui/internal/**/*`
+- CLI launcher: `packages/opencode/src/cli/cmd/tui.ts`
+- Continue Ink CLI (for ideas): `/Users/jonathanhaas/Documents/Dev/continue/extensions/cli`
diff --git a/github/README.md b/github/README.md
index 7601f51335..1eebb79b5e 100644
--- a/github/README.md
+++ b/github/README.md
@@ -67,7 +67,7 @@ This will walk you through installing the GitHub app, creating the workflow, and
fetch-depth: 1
- name: Run opencode
- uses: sst/opencode/github@latest
+ uses: evalops/opencode/github@latest
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
with:
@@ -78,7 +78,7 @@ This will walk you through installing the GitHub app, creating the workflow, and
## Support
-This is an early release. If you encounter issues or have feedback, please create an issue at https://github.com/sst/opencode/issues.
+This is an early release. If you encounter issues or have feedback, please create an issue at https://github.com/evalops/opencode/issues.
## Development
diff --git a/install b/install
index 002f91a73c..c147c5a68e 100755
--- a/install
+++ b/install
@@ -45,15 +45,15 @@ INSTALL_DIR=$HOME/.opencode/bin
mkdir -p "$INSTALL_DIR"
if [ -z "$requested_version" ]; then
- url="https://github.com/sst/opencode/releases/latest/download/$filename"
- specific_version=$(curl -s https://api.github.com/repos/sst/opencode/releases/latest | sed -n 's/.*"tag_name": *"v\([^"]*\)".*/\1/p')
+ url="https://github.com/evalops/opencode/releases/latest/download/$filename"
+ specific_version=$(curl -s https://api.github.com/repos/evalops/opencode/releases/latest | sed -n 's/.*"tag_name": *"v\([^"]*\)".*/\1/p')
if [[ $? -ne 0 || -z "$specific_version" ]]; then
echo -e "${RED}Failed to fetch version information${NC}"
exit 1
fi
else
- url="https://github.com/sst/opencode/releases/download/v${requested_version}/$filename"
+ url="https://github.com/evalops/opencode/releases/download/v${requested_version}/$filename"
specific_version=$requested_version
fi
diff --git a/opencode.json b/opencode.json
index 720ece5c15..64886d6298 100644
--- a/opencode.json
+++ b/opencode.json
@@ -1,3 +1,13 @@
{
- "$schema": "https://opencode.ai/config.json"
+ "$schema": "https://opencode.ai/config.json",
+ "mcp": {
+ "claude-context": {
+ "type": "local",
+ "command": ["npx", "@zilliz/claude-context-mcp@latest"],
+ "environment": {
+ "OPENAI_API_KEY": "{env:OPENAI_API_KEY}",
+ "MILVUS_TOKEN": "{env:MILVUS_TOKEN}"
+ }
+ }
+ }
}
diff --git a/package.json b/package.json
index e1116503a0..0f6c30f690 100644
--- a/package.json
+++ b/package.json
@@ -39,7 +39,7 @@
},
"repository": {
"type": "git",
- "url": "https://github.com/sst/opencode"
+ "url": "https://github.com/evalops/opencode"
},
"license": "MIT",
"prettier": {
diff --git a/packages/console/app/src/component/footer.tsx b/packages/console/app/src/component/footer.tsx
index 93d8e2d8cd..02d6dafc1d 100644
--- a/packages/console/app/src/component/footer.tsx
+++ b/packages/console/app/src/component/footer.tsx
@@ -16,7 +16,7 @@ export function Footer() {
return (