diff --git a/.github/gh.yml b/.github/gh.yml
new file mode 100644
index 0000000000..2eed0a79a7
--- /dev/null
+++ b/.github/gh.yml
@@ -0,0 +1,3 @@
+# GitHub CLI configuration
+# Ensures all gh commands default to evalops/opencode instead of upstream
+repo: evalops/opencode
diff --git a/.github/workflows/opencode.yml b/.github/workflows/opencode.yml
index 41ee754086..822874c98d 100644
--- a/.github/workflows/opencode.yml
+++ b/.github/workflows/opencode.yml
@@ -20,7 +20,7 @@ jobs:
uses: actions/checkout@v4
- name: Run opencode
- uses: sst/opencode/github@latest
+ uses: evalops/opencode/github@latest
env:
OPENCODE_API_KEY: ${{ secrets.OPENCODE_API_KEY }}
with:
diff --git a/README.md b/README.md
index 6e91d3ccbc..00bc71c9ad 100644
--- a/README.md
+++ b/README.md
@@ -1,20 +1,10 @@
-
-
-
-
-
-
-
-
-
-The AI coding agent built for the terminal.
-
-
-
-
-
-
-[](https://opencode.ai)
+# Grimoire
+
+> A fork of [OpenCode](https://github.com/sst/opencode) by [EvalOps](https://evalops.dev)
+
+This is a public fork maintained by EvalOps for internal use. We use OpenCode extensively and maintain this fork to experiment with enhancements aligned with our LLM evaluation workflows. OpenCode is open source and permissively licensed (MIT).
+
+This fork tracks the `dev` branch of [evalops/opencode](https://github.com/evalops/opencode). For official releases and documentation, see the [upstream repository](https://github.com/sst/opencode).
---
@@ -52,6 +42,31 @@ XDG_BIN_DIR=$HOME/.local/bin curl -fsSL https://opencode.ai/install | bash
For more info on how to configure OpenCode [**head over to our docs**](https://opencode.ai/docs).
+### Usage Stats
+
+You can inspect local usage history and tool telemetry with the built-in stats command:
+
+```bash
+opencode stats # pretty summary
+opencode stats --json # machine-readable output
+opencode stats --telemetry all # include recent tool runs
+opencode stats --limit 50 # show more history
+opencode stats --clear # reset stored telemetry data
+opencode stats --details # show telemetry metadata fields
+opencode stats --details-format ndjson --fields status,final_url
+opencode stats --status error --since 1d
+opencode stats --compare baseline.json --warn-latency 2000
+```
+
+Advanced telemetry usage tips:
+
+- Capture a baseline for comparison with `opencode stats --json --telemetry all --limit 500 > baseline.json`, then diff with `--compare baseline.json`.
+- Export metadata for dashboards using `--details-format csv` or `--details-format ndjson`.
+- Focus on specific signals by pairing `--status`, `--since`, `--until`, and `--fields` filters.
+- Gate builds by combining `--warn-latency` or `--warn-errors` with CI scripts.
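+
+For example, a CI step could consume the JSON output and fail the build when failing tool runs are present. This is only a sketch: the exact JSON shape emitted by `--json` is assumed here (a top-level `telemetry` array filtered by `--status error`), so adjust the `jq` filter to the real schema.
+
+```bash
+# Sketch: fail CI when recent telemetry contains tool errors.
+errors=$(opencode stats --json --telemetry all --status error | jq '[.telemetry[]?] | length')
+if [ "${errors:-0}" -gt 0 ]; then
+  echo "opencode telemetry reports ${errors} failing tool runs" >&2
+  exit 1
+fi
+```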
+
+The telemetry section lists recent tool executions (duration, status, error message) gathered from persisted `tool.telemetry` events.
+
### Contributing
OpenCode is an opinionated tool so any fundamental feature needs to go through a
diff --git a/bun.lock b/bun.lock
index f279671ca0..9db5ec29ff 100644
--- a/bun.lock
+++ b/bun.lock
@@ -153,6 +153,7 @@
"@hono/standard-validator": "0.1.5",
"@hono/zod-validator": "catalog:",
"@modelcontextprotocol/sdk": "1.15.1",
+ "@octokit/rest": "22.0.0",
"@openauthjs/openauth": "0.4.3",
"@opencode-ai/plugin": "workspace:*",
"@opencode-ai/sdk": "workspace:*",
@@ -162,12 +163,14 @@
"chokidar": "4.0.3",
"decimal.js": "10.5.0",
"diff": "8.0.2",
+ "exa-js": "1.9.3",
"fuzzysort": "3.1.0",
"gray-matter": "4.0.3",
"hono": "catalog:",
"hono-openapi": "1.0.7",
"ignore": "7.0.5",
"jsonc-parser": "3.3.1",
+ "linkedom": "0.18.12",
"minimatch": "10.0.3",
"open": "10.1.2",
"remeda": "catalog:",
@@ -1480,7 +1483,7 @@
"croner": ["croner@9.1.0", "", {}, "sha512-p9nwwR4qyT5W996vBZhdvBCnMhicY5ytZkR4D1Xj0wuTDEiMnjwR57Q3RXYY/s0EpX6Ay3vgIcfaR+ewGHsi+g=="],
- "cross-fetch": ["cross-fetch@3.2.0", "", { "dependencies": { "node-fetch": "^2.7.0" } }, "sha512-Q+xVJLoGOeIMXZmbUK4HYk+69cQH6LudR0Vu/pRm2YlU/hDV9CiS0gKUMaWY5f2NeUH9C1nV3bsTlCo0FsTV1Q=="],
+ "cross-fetch": ["cross-fetch@4.1.0", "", { "dependencies": { "node-fetch": "^2.7.0" } }, "sha512-uKm5PU+MHTootlWEY+mZ4vvXoCn4fLQxT9dSc1sXVMSFkINTJVN8cAQROpwcKm8bJ/c7rgZVIBWzH5T78sNZZw=="],
"cross-spawn": ["cross-spawn@7.0.6", "", { "dependencies": { "path-key": "^3.1.0", "shebang-command": "^2.0.0", "which": "^2.0.1" } }, "sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA=="],
@@ -1496,6 +1499,8 @@
"cssesc": ["cssesc@3.0.0", "", { "bin": { "cssesc": "bin/cssesc" } }, "sha512-/Tb/JcjK111nNScGob5MNtsntNM1aCNUDipB/TkwZFhyDrrE47SOx/18wF2bbjgc3ZzCSKW1T5nt5EbFoAz/Vg=="],
+ "cssom": ["cssom@0.5.0", "", {}, "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw=="],
+
"csstype": ["csstype@3.1.3", "", {}, "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw=="],
"dax-sh": ["dax-sh@0.43.2", "", { "dependencies": { "@deno/shim-deno": "~0.19.0", "undici-types": "^5.26" } }, "sha512-uULa1sSIHgXKGCqJ/pA0zsnzbHlVnuq7g8O2fkHokWFNwEGIhh5lAJlxZa1POG5En5ba7AU4KcBAvGQWMMf8rg=="],
@@ -1568,7 +1573,7 @@
"dot-prop": ["dot-prop@9.0.0", "", { "dependencies": { "type-fest": "^4.18.2" } }, "sha512-1gxPBJpI/pcjQhKgIU91II6Wkay+dLcN3M6rf2uwP8hRur3HtQXjVrdAK3sjC0piaEuxzMwjXChcETiJl47lAQ=="],
- "dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="],
+ "dotenv": ["dotenv@16.4.7", "", {}, "sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ=="],
"drizzle-kit": ["drizzle-kit@0.30.5", "", { "dependencies": { "@drizzle-team/brocli": "^0.10.2", "@esbuild-kit/esm-loader": "^2.5.5", "esbuild": "^0.19.7", "esbuild-register": "^3.5.0", "gel": "^2.0.0" }, "bin": { "drizzle-kit": "bin.cjs" } }, "sha512-l6dMSE100u7sDaTbLczibrQZjA35jLsHNqIV+jmhNVO3O8jzM6kywMOmV9uOz9ZVSCMPQhAZEFjL/qDPVrqpUA=="],
@@ -1660,6 +1665,8 @@
"eventsource-parser": ["eventsource-parser@3.0.6", "", {}, "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg=="],
+ "exa-js": ["exa-js@1.9.3", "", { "dependencies": { "cross-fetch": "~4.1.0", "dotenv": "~16.4.7", "openai": "^5.0.1", "zod": "^3.22.0", "zod-to-json-schema": "^3.20.0" } }, "sha512-4u8vO5KHstifBz6fcwcBVvU62zfwsWFpD8qomU2zQ+lLRYCwOh2Rz04xSSqEeoHrkCypGjy2VHez7elBt6ibQQ=="],
+
"execa": ["execa@8.0.1", "", { "dependencies": { "cross-spawn": "^7.0.3", "get-stream": "^8.0.1", "human-signals": "^5.0.0", "is-stream": "^3.0.0", "merge-stream": "^2.0.0", "npm-run-path": "^5.1.0", "onetime": "^6.0.0", "signal-exit": "^4.1.0", "strip-final-newline": "^3.0.0" } }, "sha512-VyhnebXciFV2DESc+p6B+y0LjSm0krU4OgJN44qFAhBY0TJ+1V61tYD2+wHusZ6F9n5K+vl8k0sTy7PEfV4qpg=="],
"exit-hook": ["exit-hook@2.2.1", "", {}, "sha512-eNTPlAD67BmP31LDINZ3U7HSF8l57TxOY2PmBJ1shpCvpnxBF93mWCE8YHBnXs8qiUZJc9WDcWIeC3a2HIAMfw=="],
@@ -1860,7 +1867,7 @@
"html-whitespace-sensitive-tag-names": ["html-whitespace-sensitive-tag-names@3.0.1", "", {}, "sha512-q+310vW8zmymYHALr1da4HyXUQ0zgiIwIicEfotYPWGN0OJVEN/58IJ3A4GBYcEq3LGAZqKb+ugvP0GNB9CEAA=="],
- "htmlparser2": ["htmlparser2@8.0.2", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1", "entities": "^4.4.0" } }, "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA=="],
+ "htmlparser2": ["htmlparser2@10.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.2.1", "entities": "^6.0.0" } }, "sha512-TwAZM+zE5Tq3lrEHvOlvwgj1XLWQCtaaibSN11Q+gGBAS7Y1uZSWwXXRe4iF6OXnaq1riyQAPFOBtYc77Mxq0g=="],
"http-cache-semantics": ["http-cache-semantics@4.2.0", "", {}, "sha512-dTxcvPXqPvXBQpq5dUr6mEMJX4oIEFv6bwom3FDwKRDsuIjjJGANqhBuoAn9c1RQJIdAKav33ED65E2ys+87QQ=="],
@@ -2038,6 +2045,8 @@
"lines-and-columns": ["lines-and-columns@1.2.4", "", {}, "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg=="],
+ "linkedom": ["linkedom@0.18.12", "", { "dependencies": { "css-select": "^5.1.0", "cssom": "^0.5.0", "html-escaper": "^3.0.3", "htmlparser2": "^10.0.0", "uhyphen": "^0.2.0" }, "peerDependencies": { "canvas": ">= 2" }, "optionalPeers": ["canvas"] }, "sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q=="],
+
"listhen": ["listhen@1.9.0", "", { "dependencies": { "@parcel/watcher": "^2.4.1", "@parcel/watcher-wasm": "^2.4.1", "citty": "^0.1.6", "clipboardy": "^4.0.0", "consola": "^3.2.3", "crossws": ">=0.2.0 <0.4.0", "defu": "^6.1.4", "get-port-please": "^3.1.2", "h3": "^1.12.0", "http-shutdown": "^1.2.2", "jiti": "^2.1.2", "mlly": "^1.7.1", "node-forge": "^1.3.1", "pathe": "^1.1.2", "std-env": "^3.7.0", "ufo": "^1.5.4", "untun": "^0.1.3", "uqr": "^0.1.2" }, "bin": { "listen": "bin/listhen.mjs", "listhen": "bin/listhen.mjs" } }, "sha512-I8oW2+QL5KJo8zXNWX046M134WchxsXC7SawLPvRQpogCbkyQIaFxPE89A2HiwR7vAK2Dm2ERBAmyjTYGYEpBg=="],
"local-pkg": ["local-pkg@1.1.2", "", { "dependencies": { "mlly": "^1.7.4", "pkg-types": "^2.3.0", "quansync": "^0.2.11" } }, "sha512-arhlxbFRmoQHl33a0Zkle/YWlmNwoyt6QNZEIJcqNbdrsix5Lvc4HyyI3EnwxTYlZYc32EbYrQ8SzEZ7dqgg9A=="],
@@ -2824,6 +2833,8 @@
"uglify-js": ["uglify-js@3.19.3", "", { "bin": { "uglifyjs": "bin/uglifyjs" } }, "sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ=="],
+ "uhyphen": ["uhyphen@0.2.0", "", {}, "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA=="],
+
"ulid": ["ulid@3.0.0", "", { "bin": { "ulid": "dist/cli.js" } }, "sha512-yvZYdXInnJve6LdlPIuYmURdS2NP41ZoF4QW7SXwbUKYt53+0eDAySO+rGSvM2O/ciuB/G+8N7GQrZ1mCJpuqw=="],
"ultrahtml": ["ultrahtml@1.6.0", "", {}, "sha512-R9fBn90VTJrqqLDwyMph+HGne8eqY1iPfYhPzZrvKpIfwkWZbcYlfpsb8B9dTvBfpy1/hqAD7Wi8EKfP9e8zdw=="],
@@ -3036,6 +3047,8 @@
"@babel/helper-create-class-features-plugin/semver": ["semver@6.3.1", "", { "bin": { "semver": "bin/semver.js" } }, "sha512-BR7VvDCVHO+q2xBEWskxS6DJE1qRnb7DxzUrogb71CWoSficBxYsiAGd+Kl0mmq/MprG9yArRkyrQxTO6XjMzA=="],
+ "@capsizecss/unpack/cross-fetch": ["cross-fetch@3.2.0", "", { "dependencies": { "node-fetch": "^2.7.0" } }, "sha512-Q+xVJLoGOeIMXZmbUK4HYk+69cQH6LudR0Vu/pRm2YlU/hDV9CiS0gKUMaWY5f2NeUH9C1nV3bsTlCo0FsTV1Q=="],
+
"@cloudflare/kv-asset-handler/mime": ["mime@3.0.0", "", { "bin": { "mime": "cli.js" } }, "sha512-jSCU7/VB1loIWBZe14aEYHU/+1UMEHoaO7qxCOVJOw9GgH72VAWppxNcjU+x9a2k3GSIBXNKxXQFqRvvZ7vr3A=="],
"@cloudflare/unenv-preset/unenv": ["unenv@2.0.0-rc.21", "", { "dependencies": { "defu": "^6.1.4", "exsolve": "^1.0.7", "ohash": "^2.0.11", "pathe": "^2.0.3", "ufo": "^1.6.1" } }, "sha512-Wj7/AMtE9MRnAXa6Su3Lk0LNCfqDYgfwVjwRFVum9U7wsto1imuHqk4kTm7Jni+5A0Hn7dttL6O/zjvUvoo+8A=="],
@@ -3182,6 +3195,10 @@
"body-parser/iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="],
+ "c12/dotenv": ["dotenv@16.6.1", "", {}, "sha512-uBq4egWHTcTt33a72vpSG0z3HnPuIl6NqYcTrKEg2azoEyl2hpW0zqlxysq2pK9HlDIHyHyakeYaYnSAwd8bow=="],
+
+ "cheerio/htmlparser2": ["htmlparser2@8.0.2", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1", "entities": "^4.4.0" } }, "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA=="],
+
"compress-commons/is-stream": ["is-stream@2.0.1", "", {}, "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg=="],
"condense-newlines/kind-of": ["kind-of@3.2.2", "", { "dependencies": { "is-buffer": "^1.1.5" } }, "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ=="],
@@ -3204,6 +3221,8 @@
"estree-util-to-js/source-map": ["source-map@0.7.6", "", {}, "sha512-i5uvt8C3ikiWeNZSVZNWcfZPItFQOsYTUAOkcUPGd8DqDy1uOUikjt5dG+uRlwyvR108Fb9DOd4GvXfT0N2/uQ=="],
+ "exa-js/zod": ["zod@3.25.76", "", {}, "sha512-gzUt/qt81nXsFGKIFcC3YnfEAx5NkunCfnDlvuBSSFS02bcXu4Lmea0AFIUwbLWxWPx3d9p8S5QoaujKcNQxcQ=="],
+
"express/cookie": ["cookie@0.7.2", "", {}, "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w=="],
"express/send": ["send@1.2.0", "", { "dependencies": { "debug": "^4.3.5", "encodeurl": "^2.0.0", "escape-html": "^1.0.3", "etag": "^1.8.1", "fresh": "^2.0.0", "http-errors": "^2.0.0", "mime-types": "^3.0.1", "ms": "^2.1.3", "on-finished": "^2.4.1", "range-parser": "^1.2.1", "statuses": "^2.0.1" } }, "sha512-uaW0WwXKpL9blXE2o0bRhoL2EGXIrZxQ2ZQ4mgcfoBxdFmQold+qWsD2jLrfZ0trjKL6vOw0j//eAwcALFjKSw=="],
@@ -3228,6 +3247,10 @@
"html-minifier-terser/commander": ["commander@10.0.1", "", {}, "sha512-y4Mg2tXshplEbSGzx7amzPwKKOCGuoSRP/CjEdwwk0FOGlUbq6lKuoyDZTNZkmxHdJtp54hdfY/JUrdL7Xfdug=="],
+ "html-to-text/htmlparser2": ["htmlparser2@8.0.2", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.0.1", "entities": "^4.4.0" } }, "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA=="],
+
+ "htmlparser2/entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="],
+
"http-errors/statuses": ["statuses@2.0.1", "", {}, "sha512-RwNA9Z/7PrK06rYLIzFMlaF+l73iwpzsqRIFgbMLbTcLD6cOao82TaWefPXQvB2fOC4AjuYSEndS7N/mTCbkdQ=="],
"js-beautify/glob": ["glob@10.4.5", "", { "dependencies": { "foreground-child": "^3.1.0", "jackspeak": "^3.1.2", "minimatch": "^9.0.4", "minipass": "^7.1.2", "package-json-from-dist": "^1.0.0", "path-scurry": "^1.11.1" }, "bin": { "glob": "dist/esm/bin.mjs" } }, "sha512-7Bv8RF0k6xjo7d4A/PxYLbUCfb6c+Vpd2/mB2yRDlew7Jb5hEXiCD9ibfO7wpk8i4sevK6DFny9h7EYbM3/sHg=="],
diff --git a/docs/evaluation-implementation-plan.md b/docs/evaluation-implementation-plan.md
new file mode 100644
index 0000000000..c7e97bcf16
--- /dev/null
+++ b/docs/evaluation-implementation-plan.md
@@ -0,0 +1,226 @@
+# Evaluation Framework Implementation Plan
+
+## Work Stream 1: Trace Foundation (Core Data Layer)
+**Goal**: Materialize sessions into complete traces with evaluation context
+
+### Steps:
+1. ✅ Create trace namespace and types
+2. ✅ Implement trace materialization from session
+3. ✅ Add trace storage layer
+4. ✅ Create trace list/get APIs
+5. ✅ Add trace completion event
+
+**Parallel with**: Stream 2 (Metric definitions are independent)
+
+---
+
+## Work Stream 2: Metric Registry (Evaluation Criteria)
+**Goal**: Define what we evaluate and how
+
+### Steps:
+1. ✅ Create metric schema and types
+2. ✅ Implement metric registry (CRUD)
+3. ✅ Build 5-7 built-in metrics (heuristics)
+4. ✅ Create rule-based evaluator
+5. ✅ Add metric storage
+6. ✅ Create metric versioning system
+
+**Parallel with**: Stream 1 (doesn't need traces to define metrics)
+
+---
+
+## Work Stream 3: Evaluation Engine (The Executor)
+**Goal**: Run metrics against traces and store results
+
+### Steps:
+1. ✅ Create evaluation result schema
+2. ✅ Implement heuristic evaluator
+3. ✅ Implement rule evaluator
+4. ✅ Build evaluation engine orchestrator
+5. ✅ Add evaluation storage
+6. ✅ Create evaluation query API
+7. ✅ Emit evaluation events
+
+**Depends on**: Streams 1 & 2 complete
+
+---
+
+## Work Stream 4: Dataset Management (Test Cases)
+**Goal**: Store and manage test case collections
+
+### Steps:
+1. ✅ Create dataset schema
+2. ✅ Implement dataset CRUD
+3. ✅ Create test case schema with assertions
+4. ✅ Build dataset storage layer
+5. ✅ Add dataset CLI commands
+6. ✅ Create dataset import/export
+
+**Parallel with**: Stream 3 (independent data model)
+
+---
+
+## Work Stream 5: Test Runner (Execute & Evaluate)
+**Goal**: Run datasets and evaluate results
+
+### Steps:
+1. ✅ Create test execution engine
+2. ✅ Implement assertion framework
+3. ✅ Build test result aggregation
+4. ✅ Add parallel execution support
+5. ✅ Create CLI: `opencode test run`
+6. ✅ Add result output formats (JSON, pretty)
+7. ✅ Implement fail-on-error mode
+
+**Depends on**: Streams 3 & 4 complete
+
+---
+
+## Work Stream 6: Scorecards (Quality Gates)
+**Goal**: Bundle metrics into pass/fail contracts
+
+### Steps:
+1. ✅ Create scorecard schema
+2. ✅ Implement scorecard evaluator
+3. ✅ Build 2-3 built-in scorecards
+4. ✅ Add scorecard storage
+5. ✅ Create scorecard CLI
+6. ✅ Integrate with test runner
+
+**Depends on**: Stream 3 complete
+**Parallel with**: Stream 5 (can build while test runner develops)
+
+---
+
+## Work Stream 7: CLI Integration (Developer UX)
+**Goal**: Make everything accessible via command line
+
+### Steps:
+1. ✅ Create `opencode eval` command group
+2. ✅ Add `opencode eval trace <session-id>`
+3. ✅ Add `opencode eval run <dataset>`
+4. ✅ Create `opencode dataset` command group
+5. ✅ Create `opencode test` command group
+6. ✅ Add pretty formatting for all outputs
+7. ✅ Create help documentation
+
+**Parallel with**: All streams (add CLI as features complete)
+
+---
+
+## Work Stream 8: CI/CD Integration (Automation)
+**Goal**: Enable automated quality gates
+
+### Steps:
+1. ✅ Create GitHub Action workflow example
+2. ✅ Add PR comment formatting
+3. ✅ Implement baseline comparison
+4. ✅ Add regression detection
+5. ✅ Create CI-friendly output formats
+6. ✅ Document setup guide
+
+**Depends on**: Streams 5 & 6 complete
+
+---
+
+## Parallelization Strategy
+
+### Phase 1 (Parallel - Start Together)
+- **Stream 1** (Trace) - One dev
+- **Stream 2** (Metrics) - One dev
+- **Stream 4** (Datasets) - One dev
+
+### Phase 2 (Requires Phase 1)
+- **Stream 3** (Engine) - Needs Streams 1+2
+- **Stream 6** (Scorecards) - Needs Stream 2
+- Continue **Stream 7** (CLI) - Add commands as features complete
+
+### Phase 3 (Integration)
+- **Stream 5** (Test Runner) - Needs Streams 3+4
+- **Stream 8** (CI/CD) - Needs Streams 5+6
+
+---
+
+## Implementation Order (Solo Developer)
+
+1. **Trace Foundation** (2-3 hours)
+2. **Metric Registry** (2-3 hours)
+3. **Evaluation Engine** (3-4 hours)
+4. **Dataset Management** (2-3 hours)
+5. **Test Runner** (3-4 hours)
+6. **Scorecards** (2 hours)
+7. **CLI Integration** (ongoing, 1-2 hours)
+8. **CI/CD Examples** (1-2 hours)
+
+**Total**: 16-24 hours of implementation
+
+---
+
+## Success Criteria
+
+### Stream 1 (Trace)
+- [ ] Can materialize any session into a trace
+- [ ] Traces stored with full context
+- [ ] Can query traces by filters
+
+### Stream 2 (Metrics)
+- [ ] 5+ built-in metrics defined
+- [ ] Can register custom metrics
+- [ ] Metrics are versioned
+
+### Stream 3 (Engine)
+- [ ] Can evaluate trace against metric
+- [ ] Results stored persistently
+- [ ] Can query evaluation history
+
+### Stream 4 (Datasets)
+- [ ] Can create/read/update/delete datasets
+- [ ] Can add test cases
+- [ ] Can import/export JSON
+
+### Stream 5 (Runner)
+- [ ] Can run full dataset
+- [ ] Assertions work correctly
+- [ ] Results show pass/fail clearly
+
+### Stream 6 (Scorecards)
+- [ ] Can define quality contracts
+- [ ] Can evaluate trace against scorecard
+- [ ] Built-in scorecards available
+
+### Stream 7 (CLI)
+- [ ] All features accessible via CLI
+- [ ] Help text comprehensive
+- [ ] Output is readable
+
+### Stream 8 (CI/CD)
+- [ ] Example workflow works
+- [ ] Can block PRs on failure
+- [ ] Results post to PR
+
+---
+
+## Commit Strategy
+
+**Small, Atomic Commits:**
+- After each step within a stream
+- Push after completing each stream
+- Tag major milestones
+
+**Commit Message Format:**
+```
+<scope>: <description>
+
+- Detail 1
+- Detail 2
+```
+
+Example:
+```
+trace: implement trace materialization
+
+- Add Trace.Complete type
+- Implement materialize() function
+- Add storage layer for traces
+- Emit trace.completed events
+```
diff --git a/docs/evaluation-implementation.md b/docs/evaluation-implementation.md
new file mode 100644
index 0000000000..aeedefbc47
--- /dev/null
+++ b/docs/evaluation-implementation.md
@@ -0,0 +1,887 @@
+# Evaluation Implementation Strategy
+
+## Phase 1: Foundation (Week 1-2)
+
+### 1.1 Trace Materialization
+
+**Goal**: Unify Session + TelemetryEvents into a complete Trace abstraction
+
+**Changes**:
+```typescript
+// packages/opencode/src/trace/index.ts
+export namespace Trace {
+ // Extends Session with evaluation context
+ export type Complete = {
+ // Session data
+ session: Session.Info
+ messages: MessageV2.Message[]
+
+ // Execution context (NEW)
+ agentName: string
+ modelConfig: {
+ provider: string
+ model: string
+ temperature?: number
+ maxTokens?: number
+ }
+ systemPrompt: string
+ systemPromptVersion?: string
+
+ // Tool events (already captured)
+ toolCalls: TelemetryEvent[]
+
+ // Aggregated metrics
+ summary: {
+ duration: number
+ toolCallCount: number
+ errorCount: number
+ tokens: TokenUsage
+ cost: number
+ }
+
+ // Evaluation results (empty initially)
+ evaluations: Evaluation[]
+ }
+
+ // Create a trace from a session
+  export async function materialize(sessionID: string): Promise<Complete>
+
+  // List traces with filters
+  export async function list(filter?: TraceFilter): AsyncIterableIterator<Complete>
+
+  // Get a specific trace
+  export async function get(traceID: string): Promise<Complete>
+}
+```
+
+**Implementation**:
+```typescript
+export async function materialize(sessionID: string): Promise<Trace.Complete> {
+ const session = await Session.get(sessionID)
+ const messages = await Session.messages(sessionID)
+
+ // Get telemetry events for this session
+ const history = await ToolHistory.read()
+ const toolCalls = history.events.filter(e => e.sessionID === sessionID)
+
+ // Extract model config from first assistant message
+ const firstAssistant = messages.find(m => m.info.role === "assistant")
+ const modelConfig = firstAssistant ? {
+ provider: firstAssistant.info.providerID,
+ model: firstAssistant.info.modelID,
+ // Extract other params from metadata
+ } : { provider: "unknown", model: "unknown" }
+
+ // Load system prompt (from session init)
+ const systemPrompt = await getSystemPromptForSession(sessionID)
+
+ return {
+ session,
+ messages,
+ agentName: session.agent ?? "default",
+ modelConfig,
+ systemPrompt,
+ toolCalls,
+ summary: computeSummary(messages, toolCalls),
+ evaluations: []
+ }
+}
+```
+
+**Storage**: Store materialized traces
+```typescript
+["trace", projectID, sessionID] -> Trace.Complete
+```
+
+**Event**: Emit trace completion
+```typescript
+Bus.publish(Trace.Event.Completed, { trace })
+```
+
+---
+
+### 1.2 Metric Registry
+
+**Goal**: Define evaluation metrics as declarative config
+
+**Schema**:
+```typescript
+// packages/opencode/src/evaluation/metric.ts
+export namespace Metric {
+ export type Definition = {
+ id: string
+ name: string
+ description: string
+ version: string
+
+ category: "performance" | "correctness" | "safety" | "cost"
+
+ evaluator: RuleEvaluator | LLMEvaluator | HeuristicEvaluator
+
+ threshold?: {
+ pass: number
+ warn?: number
+ }
+
+ higherIsBetter: boolean
+ }
+
+ type RuleEvaluator = {
+ type: "rule"
+ expression: string // JavaScript expression
+ }
+
+ type LLMEvaluator = {
+ type: "llm"
+ prompt: string
+ model: string
+ parseScore: (output: string) => number
+ }
+
+ type HeuristicEvaluator = {
+ type: "heuristic"
+ function: keyof typeof Heuristics
+    params?: Record<string, unknown>
+ }
+}
+```
+
+**Built-in Metrics** (start with simple ones):
+```typescript
+// packages/opencode/src/evaluation/metrics/builtin.ts
+export const BuiltinMetrics: Record<string, Metric.Definition> = {
+ "tool-error-rate": {
+ id: "tool-error-rate",
+ name: "Tool Error Rate",
+ description: "Percentage of tool calls that failed",
+ version: "1.0.0",
+ category: "performance",
+ evaluator: {
+ type: "heuristic",
+ function: "toolErrorRate"
+ },
+ threshold: {
+ pass: 0.1, // <10% errors
+ warn: 0.05
+ },
+ higherIsBetter: false
+ },
+
+ "response-latency": {
+ id: "response-latency",
+ name: "Response Latency",
+ description: "Total time to complete request",
+ version: "1.0.0",
+ category: "performance",
+ evaluator: {
+ type: "rule",
+ expression: "trace.summary.duration"
+ },
+ threshold: {
+ pass: 30000, // <30s
+ warn: 10000 // <10s is good
+ },
+ higherIsBetter: false
+ },
+
+ "redundant-calls": {
+ id: "redundant-calls",
+ name: "Redundant Tool Calls",
+ description: "Detects repeated identical tool calls",
+ version: "1.0.0",
+ category: "correctness",
+ evaluator: {
+ type: "heuristic",
+ function: "detectRedundantCalls"
+ },
+ threshold: { pass: 0 },
+ higherIsBetter: false
+ }
+}
+```
+
+**Heuristic Implementations**:
+```typescript
+// packages/opencode/src/evaluation/heuristics.ts
+export const Heuristics = {
+ toolErrorRate(trace: Trace.Complete): number {
+ if (trace.toolCalls.length === 0) return 0
+ const errors = trace.toolCalls.filter(t => t.status === "error").length
+ return errors / trace.toolCalls.length
+ },
+
+ detectRedundantCalls(trace: Trace.Complete): number {
+ const seen = new Map()
+ for (const call of trace.toolCalls) {
+ const key = `${call.id}:${JSON.stringify(call.extra)}`
+ seen.set(key, (seen.get(key) || 0) + 1)
+ }
+ return Array.from(seen.values()).filter(count => count > 1).length
+ },
+
+ // More heuristics...
+}
+```
+
+---
+
+### 1.3 Evaluation Engine
+
+**Goal**: Execute metrics against traces and store results
+
+```typescript
+// packages/opencode/src/evaluation/engine.ts
+export namespace EvaluationEngine {
+ export type Result = {
+ id: string
+ traceID: string
+ metricID: string
+ score: number
+ passed: boolean
+ timestamp: number
+
+ evaluatorType: "rule" | "llm" | "heuristic"
+ reasoning?: string
+    metadata?: Record<string, unknown>
+ }
+
+ // Evaluate a trace against a metric
+ export async function evaluate(
+ trace: Trace.Complete,
+ metric: Metric.Definition
+  ): Promise<Result> {
+ const score = await computeScore(trace, metric)
+ const threshold = metric.threshold?.pass ?? 0
+
+ const passed = metric.higherIsBetter
+ ? score >= threshold
+ : score <= threshold
+
+ return {
+ id: Identifier.ascending("evaluation"),
+ traceID: trace.session.id,
+ metricID: metric.id,
+ score,
+ passed,
+ timestamp: Date.now(),
+ evaluatorType: metric.evaluator.type
+ }
+ }
+
+ // Evaluate against multiple metrics
+ export async function evaluateMany(
+ trace: Trace.Complete,
+ metrics: Metric.Definition[]
+  ): Promise<Result[]> {
+ return Promise.all(metrics.map(m => evaluate(trace, m)))
+ }
+
+ async function computeScore(
+ trace: Trace.Complete,
+ metric: Metric.Definition
+  ): Promise<number> {
+ switch (metric.evaluator.type) {
+ case "rule":
+ return evaluateRule(trace, metric.evaluator.expression)
+ case "heuristic":
+ return evaluateHeuristic(trace, metric.evaluator)
+ case "llm":
+ return evaluateLLM(trace, metric.evaluator)
+ }
+ }
+
+ function evaluateRule(trace: Trace.Complete, expression: string): number {
+    // Evaluate the expression with only `trace` in scope (note: `new Function` is not a true sandbox)
+ const func = new Function("trace", `return ${expression}`)
+ return func(trace)
+ }
+
+ function evaluateHeuristic(
+ trace: Trace.Complete,
+    evaluator: Extract<Metric.Definition["evaluator"], { type: "heuristic" }>
+ ): number {
+ const heuristic = Heuristics[evaluator.function]
+ if (!heuristic) throw new Error(`Unknown heuristic: ${evaluator.function}`)
+ return heuristic(trace, evaluator.params)
+ }
+
+ async function evaluateLLM(
+ trace: Trace.Complete,
+    evaluator: Extract<Metric.Definition["evaluator"], { type: "llm" }>
+  ): Promise<number> {
+ // Call LLM with prompt + trace context
+ const response = await callLLM(evaluator.model, {
+ prompt: evaluator.prompt,
+ context: formatTraceForLLM(trace)
+ })
+ return evaluator.parseScore(response)
+ }
+}
+```
+
+**Storage**:
+```typescript
+["evaluation", traceID, evaluationID] -> EvaluationEngine.Result
+```
+
+---
+
+## Phase 2: Datasets & Testing (Week 3-4)
+
+### 2.1 Dataset Management
+
+```typescript
+// packages/opencode/src/evaluation/dataset.ts
+export namespace Dataset {
+ export type Definition = {
+ id: string
+ name: string
+ description: string
+ version: string
+
+ cases: TestCase[]
+
+ tags: string[]
+ createdAt: number
+ updatedAt: number
+ }
+
+ export type TestCase = {
+ id: string
+ name: string
+
+ // Input
+ prompt: string
+ context?: {
+ files?: Array<{ path: string; content: string }>
+ workingDirectory?: string
+      env?: Record<string, string>
+ }
+
+ // Expectations (optional, for assertions)
+ expected?: {
+ toolCalls?: string[] // Expected tool IDs
+ outputContains?: string[] // Substrings that should appear
+ outputNotContains?: string[]
+ assertions?: Assertion[]
+ }
+
+ tags: string[]
+    metadata?: Record<string, unknown>
+ }
+
+ export type Assertion = {
+ type: "tool-called" | "tool-not-called" | "output-matches" | "custom"
+    params: Record<string, unknown>
+ message: string
+ }
+
+ // CRUD operations
+  export async function create(def: Omit<Definition, "id" | "createdAt" | "updatedAt">): Promise<Definition>
+  export async function get(id: string): Promise<Definition>
+  export async function update(id: string, changes: Partial<Definition>): Promise<Definition>
+  export async function list(): AsyncIterableIterator<Definition>
+  export async function remove(id: string): Promise<void>
+
+  // Case management
+  export async function addCase(datasetID: string, testCase: Omit<TestCase, "id">): Promise<TestCase>
+  export async function removeCase(datasetID: string, caseID: string): Promise<void>
+}
+```
+
+**Storage**:
+```typescript
+["dataset", datasetID] -> Dataset.Definition
+```
+
+**CLI**:
+```bash
+# Create dataset from scratch
+opencode dataset create smoke-tests --description "Critical path tests"
+
+# Add test case
+opencode dataset add smoke-tests --prompt "Create a file called test.txt with 'hello world'"
+
+# Capture current interaction as test case
+opencode dataset capture --name "auth flow" --dataset auth-tests
+
+# List datasets
+opencode dataset list
+
+# Export/Import
+opencode dataset export smoke-tests > smoke-tests.json
+opencode dataset import < smoke-tests.json
+```
+
+---
+
+### 2.2 Test Runner
+
+```typescript
+// packages/opencode/src/evaluation/runner.ts
+export namespace TestRunner {
+ export type RunConfig = {
+ datasetID: string
+ metrics: string[] // Metric IDs to evaluate
+
+ // Agent config (what to test)
+ agentName?: string
+ modelOverride?: string
+ systemPromptOverride?: string
+
+ // Execution options
+ parallel?: number // How many tests to run in parallel
+ timeout?: number
+ stopOnFailure?: boolean
+ }
+
+ export type RunResult = {
+ id: string
+ datasetID: string
+ config: RunConfig
+
+ startTime: number
+ endTime: number
+
+ results: CaseResult[]
+
+ summary: {
+ total: number
+ passed: number
+ failed: number
+ duration: number
+ }
+ }
+
+ export type CaseResult = {
+ caseID: string
+ traceID: string
+
+ status: "passed" | "failed" | "error"
+
+ evaluations: EvaluationEngine.Result[]
+ assertionResults: AssertionResult[]
+
+ duration: number
+ error?: string
+ }
+
+  export async function run(config: RunConfig): Promise<RunResult> {
+ const dataset = await Dataset.get(config.datasetID)
+ const metrics = await Promise.all(
+ config.metrics.map(id => MetricRegistry.get(id))
+ )
+
+    const startTime = Date.now()
+    const results: CaseResult[] = []
+
+ for (const testCase of dataset.cases) {
+ // Execute the test case
+ const trace = await executeTestCase(testCase, config)
+
+ // Evaluate
+ const evaluations = await EvaluationEngine.evaluateMany(trace, metrics)
+
+ // Check assertions
+ const assertionResults = testCase.expected?.assertions
+ ? await checkAssertions(trace, testCase.expected.assertions)
+ : []
+
+ const allPassed =
+ evaluations.every(e => e.passed) &&
+ assertionResults.every(a => a.passed)
+
+ results.push({
+ caseID: testCase.id,
+ traceID: trace.session.id,
+ status: allPassed ? "passed" : "failed",
+ evaluations,
+ assertionResults,
+ duration: trace.summary.duration
+ })
+
+ if (!allPassed && config.stopOnFailure) break
+ }
+
+ return {
+ id: Identifier.ascending("test-run"),
+ datasetID: config.datasetID,
+ config,
+      startTime,
+ endTime: Date.now(),
+ results,
+ summary: computeSummary(results)
+ }
+ }
+
+ async function executeTestCase(
+ testCase: Dataset.TestCase,
+ config: RunConfig
+  ): Promise<Trace.Complete> {
+ // Create a test session
+ const session = await Session.create()
+
+ // Apply context overrides
+ if (testCase.context?.files) {
+ // Mock file system
+ }
+
+ // Send the prompt
+ await SessionPrompt.prompt({
+ sessionID: session.id,
+ parts: [{ type: "text", text: testCase.prompt }],
+ agent: config.agentName,
+ model: config.modelOverride
+ })
+
+ // Wait for completion
+ await waitForSessionComplete(session.id, config.timeout)
+
+ // Materialize trace
+ return Trace.materialize(session.id)
+ }
+}
+```
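+
+The runner above references an `AssertionResult` type and a `checkAssertions` helper that are not defined in this sketch. A minimal version might look like the following; the `params` field names (such as `toolID`) are assumptions, and the `output-matches`/`custom` branches are left as stubs because the message-part shape is not pinned down in this document.
+
+```typescript
+type AssertionResult = { assertion: Dataset.Assertion; passed: boolean; message: string }
+
+function checkAssertions(trace: Trace.Complete, assertions: Dataset.Assertion[]): AssertionResult[] {
+  return assertions.map((assertion) => {
+    let passed = false
+    switch (assertion.type) {
+      case "tool-called":
+        // Pass if any recorded tool call matches the asserted tool id
+        passed = trace.toolCalls.some((t) => t.id === assertion.params.toolID)
+        break
+      case "tool-not-called":
+        passed = trace.toolCalls.every((t) => t.id !== assertion.params.toolID)
+        break
+      default:
+        // "output-matches" and "custom" would need message text / user-registered hooks
+        passed = false
+    }
+    return { assertion, passed, message: assertion.message }
+  })
+}
+```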
+
+**CLI**:
+```bash
+# Run a dataset with default metrics
+opencode test run smoke-tests
+
+# Run with specific metrics
+opencode test run smoke-tests --metrics tool-error-rate,response-latency
+
+# Run and fail CI if any test fails
+opencode test run regression-suite --fail-on-error --quiet
+
+# Run with prompt override
+opencode test run edge-cases --system-prompt "You are extra cautious"
+
+# Compare two configurations
+opencode test compare smoke-tests \
+ --baseline "model=gpt-4" \
+ --variant "model=claude-3.5-sonnet"
+```
+
+---
+
+## Phase 3: CI Integration (Week 5)
+
+### 3.1 Scorecards
+
+```typescript
+// packages/opencode/src/evaluation/scorecard.ts
+export namespace Scorecard {
+ export type Definition = {
+ id: string
+ name: string
+ description: string
+ version: string
+
+ metrics: ScorecardMetric[]
+
+ passingCriteria: {
+ requireAll: boolean
+ minimumPassing?: number
+ }
+
+ tags: string[]
+ }
+
+ export type ScorecardMetric = {
+ metricID: string
+ weight: number
+ required: boolean
+ thresholdOverride?: number
+ }
+
+ export async function evaluate(
+ scorecard: Definition,
+ trace: Trace.Complete
+  ): Promise<Result> {
+ const metrics = await Promise.all(
+ scorecard.metrics.map(sm => MetricRegistry.get(sm.metricID))
+ )
+
+ const evaluations = await EvaluationEngine.evaluateMany(trace, metrics)
+
+ const results = scorecard.metrics.map((sm, i) => {
+ const evaluation = evaluations[i]
+ const threshold = sm.thresholdOverride ?? metrics[i].threshold?.pass
+
+ return {
+ metricID: sm.metricID,
+ score: evaluation.score,
+ passed: evaluation.passed,
+ required: sm.required,
+ weight: sm.weight
+ }
+ })
+
+ const requiredPassed = results
+ .filter(r => r.required)
+ .every(r => r.passed)
+
+ const totalPassed = results.filter(r => r.passed).length
+ const meetsMinimum = !scorecard.passingCriteria.minimumPassing ||
+ totalPassed >= scorecard.passingCriteria.minimumPassing
+
+ const overallPass = scorecard.passingCriteria.requireAll
+ ? results.every(r => r.passed)
+ : requiredPassed && meetsMinimum
+
+ return {
+ scorecardID: scorecard.id,
+ traceID: trace.session.id,
+ results,
+ overallPass,
+ timestamp: Date.now()
+ }
+ }
+}
+```
+
+**Predefined Scorecards**:
+```typescript
+// packages/opencode/src/evaluation/scorecards/builtin.ts
+export const BuiltinScorecards: Record<string, Scorecard.Definition> = {
+ "regression-prevention": {
+ id: "regression-prevention",
+ name: "Regression Prevention",
+ description: "Ensures code changes don't break existing behavior",
+ version: "1.0.0",
+ metrics: [
+ { metricID: "tool-error-rate", weight: 1, required: true },
+ { metricID: "response-latency", weight: 0.5, required: false },
+ { metricID: "redundant-calls", weight: 0.5, required: false }
+ ],
+ passingCriteria: {
+ requireAll: false,
+ minimumPassing: 2
+ },
+ tags: ["ci", "critical"]
+ },
+
+ "production-ready": {
+ id: "production-ready",
+ name: "Production Ready",
+ description: "Meets production quality standards",
+ version: "1.0.0",
+ metrics: [
+ { metricID: "tool-error-rate", weight: 1, required: true },
+ { metricID: "response-latency", weight: 1, required: true },
+ { metricID: "redundant-calls", weight: 1, required: true },
+ { metricID: "cost-efficiency", weight: 0.5, required: false }
+ ],
+ passingCriteria: {
+ requireAll: true
+ },
+ tags: ["production", "strict"]
+ }
+}
+```
+
+---
+
+### 3.2 GitHub Action Integration
+
+```yaml
+# .github/workflows/eval.yml
+name: Evaluation Gates
+
+on:
+ pull_request:
+ types: [opened, synchronize]
+
+jobs:
+ eval-gate:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+
+ - name: Setup OpenCode
+ run: |
+ curl -fsSL https://opencode.ai/install | bash
+ opencode auth login --token ${{ secrets.OPENCODE_TOKEN }}
+
+ - name: Run Regression Tests
+ run: |
+ opencode test run regression-suite \
+ --scorecard regression-prevention \
+ --fail-on-error \
+ --output json > eval-results.json
+
+ - name: Post Results to PR
+ if: always()
+ uses: actions/github-script@v6
+ with:
+ script: |
+ const fs = require('fs')
+ const results = JSON.parse(fs.readFileSync('eval-results.json'))
+
+ const comment = `## Evaluation Results
+
+ ${results.summary.passed}/${results.summary.total} tests passed
+
+ ${results.summary.passed < results.summary.total ? '❌ Some tests failed' : '✅ All tests passed'}
+ `
+
+ github.rest.issues.createComment({
+ issue_number: context.issue.number,
+ owner: context.repo.owner,
+ repo: context.repo.repo,
+ body: comment
+ })
+
+ - name: Upload Detailed Results
+ if: always()
+ uses: actions/upload-artifact@v3
+ with:
+ name: eval-results
+ path: eval-results.json
+```
+
+---
+
+## Phase 4: Advanced Features (Week 6+)
+
+### 4.1 LLM-as-Judge Metrics
+
+```typescript
+// Example: Hallucination detection
+const hallucinationMetric: Metric.Definition = {
+ id: "hallucination-detection",
+ name: "Hallucination Detection",
+ description: "Detects when the agent makes unsupported claims",
+ version: "1.0.0",
+ category: "correctness",
+ evaluator: {
+ type: "llm",
+ model: "gpt-4o-mini", // Cheaper model for evals
+ prompt: `You are evaluating an AI coding assistant's response for hallucinations.
+
+Context: The assistant had access to these files:
+{{available_files}}
+
+The assistant's response:
+{{response}}
+
+Tool calls made:
+{{tool_calls}}
+
+Question: Did the assistant make any claims about files, functions, or code that it couldn't have known from the available context?
+
+Respond with a score from 0-1:
+- 0 = No hallucinations, all claims are grounded
+- 0.5 = Minor unsupported assumptions
+- 1 = Major hallucinations or fabricated information
+
+Score:`,
+ parseScore: (output: string) => {
+ const match = output.match(/Score:\s*([\d.]+)/)
+ return match ? parseFloat(match[1]) : 0.5
+ }
+ },
+ threshold: { pass: 0.3 },
+ higherIsBetter: false
+}
+```
+
+---
+
+### 4.2 Synthetic Data Generation
+
+```typescript
+// packages/opencode/src/evaluation/synthetic.ts
+export namespace SyntheticData {
+ export type GeneratorConfig = {
+ baseScenarios: string[] // e.g., "create a file", "debug an error"
+ variations: number // How many variations per scenario
+ complexity: "simple" | "medium" | "complex"
+ }
+
+  export async function generate(config: GeneratorConfig): Promise<Dataset.TestCase[]> {
+ const cases: Dataset.TestCase[] = []
+
+ for (const scenario of config.baseScenarios) {
+ // Use LLM to generate variations
+ const prompt = `Generate ${config.variations} variations of this coding task: "${scenario}"
+
+ Complexity level: ${config.complexity}
+
+ For each variation, provide:
+ 1. A clear task description
+ 2. Expected tool usage
+ 3. Success criteria
+
+ Format as JSON array.`
+
+      const variations = JSON.parse(await callLLM("gpt-4", { prompt }))
+
+ for (const variation of variations) {
+ cases.push({
+ id: Identifier.ascending("test-case"),
+ name: variation.description,
+ prompt: variation.description,
+ expected: {
+ toolCalls: variation.expectedTools,
+ assertions: variation.assertions
+ },
+ tags: ["synthetic", config.complexity],
+ metadata: { generatedFrom: scenario }
+ })
+ }
+ }
+
+ return cases
+ }
+}
+```
+
+**CLI**:
+```bash
+# Generate test cases
+opencode dataset generate \
+ --scenarios "file operations,refactoring,debugging" \
+ --variations 5 \
+ --complexity medium \
+ --output edge-cases
+```
+
+---
+
+## Summary: What Gets Built When
+
+**Week 1-2: Foundation**
+- ✅ Trace materialization
+- ✅ Metric registry with 5-10 built-in metrics
+- ✅ Evaluation engine (rule + heuristic)
+- ✅ Storage layer
+- 🔧 CLI: `opencode eval trace <session-id>`
+
+**Week 3-4: Datasets**
+- ✅ Dataset CRUD
+- ✅ Test runner
+- ✅ Assertion framework
+- 🔧 CLI: `opencode test run <dataset>`
+
+**Week 5: CI Integration**
+- ✅ Scorecards
+- ✅ GitHub Action
+- ✅ PR comments with results
+- 🔧 CLI: `opencode test run --fail-on-error`
+
+**Week 6+: Advanced**
+- ⏳ LLM-as-judge metrics
+- ⏳ Synthetic data generation
+- ⏳ Experiment framework (A/B testing)
+- ⏳ Web dashboard for results
+
+---
+
+## Development Philosophy
+
+1. **Start with telemetry** - Already have tool instrumentation, build on it
+2. **Dogfood immediately** - Use it to test Grimoire itself
+3. **Ship incrementally** - Each phase is independently useful
+4. **Learn from usage** - Let real usage guide metric selection
+5. **Keep it fast** - Sub-10min CI runs, real-time feedback
diff --git a/docs/evaluation-ontology.md b/docs/evaluation-ontology.md
new file mode 100644
index 0000000000..e4561fe29a
--- /dev/null
+++ b/docs/evaluation-ontology.md
@@ -0,0 +1,513 @@
+# Evaluation Ontology: First Principles
+
+## Core Entities
+
+### 1. **Trace** (Execution Context)
+The fundamental unit of observable behavior. A Trace represents a complete interaction flow.
+
+```typescript
+type Trace = {
+ id: string // Unique identifier
+ sessionID: string // Which session this belongs to
+ startTime: number
+ endTime?: number
+ status: "running" | "completed" | "failed"
+
+ // Identity
+ agentName: string // Which agent executed this
+ modelConfig: { // Model configuration at time of execution
+ provider: string
+ model: string
+ temperature?: number
+ // ... other model params
+ }
+
+ // Prompt context
+ systemPrompt: string // The actual system prompt used
+ systemPromptVersion?: string // Semantic version or hash
+
+ // Structure
+ messages: Message[] // The full conversation
+ toolCalls: ToolCall[] // All tool invocations
+
+ // Outcomes
+ tokens: TokenUsage
+ cost: number
+
+ // Evaluation
+ evaluations?: Evaluation[] // Assessments of this trace
+}
+```
+
+**Why Trace?**
+- A trace is self-contained - you can replay, analyze, or evaluate it independently
+- It captures the entire context needed to understand "what happened"
+- Maps naturally to OpenTelemetry/observability concepts
+- Already partially exists via Session + Messages + TelemetryEvents
+
+---
+
+### 2. **Evaluation** (Assessment)
+A judgment about a Trace or component thereof.
+
+```typescript
+type Evaluation = {
+ id: string
+ traceID: string
+
+ // What's being evaluated
+ target: {
+ type: "trace" | "message" | "tool_call" | "output"
+ id: string
+ }
+
+ // The evaluation criteria
+ metricID: string // Which metric was applied
+
+ // The judgment
+ score: number // Normalized 0-1 or metric-specific
+ passed: boolean // Did it meet threshold?
+
+ // Context
+ timestamp: number
+ evaluatorType: "rule" | "llm" | "human" | "heuristic"
+ evaluatorID?: string // Which LLM or human
+
+ // Evidence
+ reasoning?: string // Why this score (esp. for LLM judges)
+  metadata?: Record<string, unknown>
+}
+```
+
+**Why separate Evaluation from Trace?**
+- A trace can be evaluated multiple times with different metrics
+- Evaluations can be retroactive - evaluate past traces with new criteria
+- Different stakeholders care about different evaluations
+- Enables A/B testing of evaluation methods themselves
+
+---
+
+### 3. **Metric** (Evaluation Criterion)
+Defines *what* we're measuring and *how*.
+
+```typescript
+type Metric = {
+ id: string
+ name: string
+ description: string
+
+ // What does this measure?
+ domain: "correctness" | "safety" | "efficiency" | "quality" | "compliance"
+
+ // How is it computed?
+ evaluator: {
+ type: "rule" | "llm" | "human" | "heuristic"
+
+ // For rule-based
+ rule?: {
+ expression: string // e.g., "duration < 5000"
+ language: "javascript" | "jsonlogic"
+ }
+
+ // For LLM-based
+ llm?: {
+ prompt: string
+ model: string
+ parseOutput: "boolean" | "score_0_1" | "score_1_10" | "reasoning"
+ }
+
+ // For heuristic
+ heuristic?: {
+ function: string // Name of built-in function
+      params?: Record<string, unknown>
+ }
+ }
+
+ // Interpretation
+ threshold?: number // Pass/fail cutoff
+ higherIsBetter: boolean
+
+ // Metadata
+ version: string
+ tags: string[]
+}
+```
+
+**Built-in Heuristics Examples:**
+- `tool_error_rate`: Ratio of failed tool calls
+- `redundant_tool_calls`: Detects repeated identical calls
+- `hallucination_indicators`: Flags suspicious patterns
+- `token_efficiency`: Output quality per token spent
+
+---
+
+### 4. **Dataset** (Test Cases)
+A collection of inputs with expected behaviors.
+
+```typescript
+type Dataset = {
+ id: string
+ name: string
+ description: string
+ version: string
+
+ cases: TestCase[]
+
+ // Metadata
+ tags: string[] // "regression", "edge_cases", "production_sample"
+ createdAt: number
+ updatedAt: number
+}
+
+type TestCase = {
+ id: string
+
+ // Input
+ prompt: string // What the user asks
+ context?: { // Optional environmental context
+ files?: string[] // Which files exist
+ workingDirectory?: string
+ }
+
+ // Expected behavior (can be partial)
+ expected?: {
+ toolCalls?: string[] // Expected tools to be called
+ output?: string // Exact or fuzzy match
+ assertions?: Assertion[] // Custom checks
+ }
+
+ // Metadata
+ tags: string[]
+ difficulty?: "easy" | "medium" | "hard"
+ source?: "synthetic" | "production" | "manual"
+}
+
+type Assertion = {
+ type: "contains" | "not_contains" | "matches" | "tool_called" | "custom"
+ value: any
+ message?: string
+}
+```
+
+**Why separate Dataset?**
+- Enables versioning of test suites
+- Can run same dataset across different agent configs
+- Datasets can be shared/imported
+- Natural basis for CI gates: "Run dataset X, all cases must pass metric Y"
+
+---
+
+### 5. **Experiment** (Comparative Run)
+Structured comparison of different configurations.
+
+```typescript
+type Experiment = {
+ id: string
+ name: string
+ description: string
+
+ // What's being tested
+ datasetID: string
+
+ // Variants
+ variants: Variant[]
+
+ // Results
+ runs: Run[]
+
+ // Metadata
+ status: "running" | "completed" | "failed"
+ startTime: number
+ endTime?: number
+}
+
+type Variant = {
+ id: string
+ name: string // "baseline", "new_prompt", "gpt4o"
+
+ config: {
+ agentName?: string
+ systemPrompt?: string
+ model?: string
+ temperature?: number
+ // ... any configurable parameter
+ }
+}
+
+type Run = {
+ variantID: string
+ testCaseID: string
+ traceID: string // Links to the actual execution
+ evaluations: Evaluation[]
+}
+```
+
+**Why Experiment?**
+- Formalizes A/B testing
+- Enables statistical comparisons (see the sketch below)
+- Natural fit for prompt optimization
+- Can track what was learned: "new_prompt reduced error_rate by 15%"
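+
+As a minimal sketch of what "statistical comparison" could mean here (illustrative only; the ontology does not fix an aggregation rule), per-variant mean scores can be derived directly from an Experiment's runs:
+
+```typescript
+// Mean score per variant and per metric, computed from Experiment.runs.
+function summarizeExperiment(experiment: Experiment): Record<string, Record<string, number>> {
+  const acc: Record<string, Record<string, { total: number; count: number }>> = {}
+  for (const run of experiment.runs) {
+    for (const evaluation of run.evaluations) {
+      const byMetric = (acc[run.variantID] ??= {})
+      const cell = (byMetric[evaluation.metricID] ??= { total: 0, count: 0 })
+      cell.total += evaluation.score
+      cell.count += 1
+    }
+  }
+  const means: Record<string, Record<string, number>> = {}
+  for (const [variantID, byMetric] of Object.entries(acc)) {
+    means[variantID] = Object.fromEntries(
+      Object.entries(byMetric).map(([metricID, cell]) => [metricID, cell.total / cell.count]),
+    )
+  }
+  return means
+}
+```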
+
+---
+
+### 6. **Scorecard** (Quality Contract)
+A bundle of metrics that define "good enough".
+
+```typescript
+type Scorecard = {
+ id: string
+ name: string
+ description: string
+
+ // Which metrics matter?
+ metrics: ScorecardMetric[]
+
+ // How do we aggregate?
+ passingCriteria: {
+ requireAll: boolean // AND vs OR
+ minimumPassing?: number // At least N metrics must pass
+ }
+
+ // Metadata
+ version: string
+ tags: string[]
+}
+
+type ScorecardMetric = {
+ metricID: string
+ weight: number // For weighted scoring
+ required: boolean // Must pass vs nice-to-have
+ threshold?: number // Override metric default
+}
+```
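+
+The `weight` field implies some form of weighted aggregation, which neither this document nor the implementation sketch pins down. One possible interpretation (an assumption, not defined behavior) is a weighted pass-rate over the scorecard's metrics:
+
+```typescript
+// Weighted pass-rate: share of the total weight carried by passing metrics.
+function weightedPassRate(results: Array<{ passed: boolean; weight: number }>): number {
+  const totalWeight = results.reduce((sum, r) => sum + r.weight, 0)
+  if (totalWeight === 0) return 0
+  const passedWeight = results.filter((r) => r.passed).reduce((sum, r) => sum + r.weight, 0)
+  return passedWeight / totalWeight
+}
+```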
+
+**Why Scorecard?**
+- Enables "shift left" - define quality gates early
+- Different stages need different scorecards (dev vs staging vs prod)
+- Can version scorecards as requirements evolve
+- Natural CI integration: "This PR must pass scorecard:regression-prevention"
+
+---
+
+## Relationships
+
+```
+Dataset [1] ──< [N] TestCase
+TestCase [1] ──< [N] Trace (via Experiment or direct execution)
+Trace [1] ──< [N] ToolCall (via TelemetryEvent)
+Trace [1] ──< [N] Evaluation
+Evaluation [N] >── [1] Metric
+
+Experiment [1] ──< [N] Variant
+Experiment [1] ──> [1] Dataset
+Experiment [1] ──< [N] Run
+Run [1] ──> [1] Trace
+Run [1] ──> [1] TestCase
+
+Scorecard [1] ──< [N] ScorecardMetric
+ScorecardMetric [N] >── [1] Metric
+```
+
+---
+
+## Storage Design
+
+### Current State (What Exists)
+```typescript
+// Storage paths
+["session", projectID, sessionID] -> Session.Info
+["message", sessionID, messageID] -> Message
+["telemetry", "tools"] -> TelemetrySummary
+```
+
+### Proposed Additions
+```typescript
+// Traces (augmented sessions)
+["trace", projectID, traceID] -> Trace
+// Trace is basically Session + materialized tool events + evaluation results
+
+// Evaluation data
+["metric", metricID] -> Metric
+["evaluation", traceID, evaluationID] -> Evaluation
+
+// Test data
+["dataset", datasetID] -> Dataset
+["dataset", datasetID, "cases", caseID] -> TestCase
+
+// Experiments
+["experiment", experimentID] -> Experiment
+["experiment", experimentID, "runs", runID] -> Run
+
+// Scorecards
+["scorecard", scorecardID] -> Scorecard
+
+// Baselines (for comparison)
+["baseline", name] -> {
+ traceID: string
+ timestamp: number
+  metrics: Record<string, number>
+}
+```
+
+---
+
+## Integration with Existing System
+
+### Already Have (Leverage)
+1. **TelemetryEvent** → Maps to ToolCall in Trace
+2. **Session + Messages** → Core of Trace
+3. **ToolHistory** → Can evolve into TraceIndex
+4. **Storage abstraction** → Can store new entities
+5. **Bus system** → Can emit evaluation events
+
+### Need to Build
+1. **Trace materialization** - Convert Session → Trace (capture full context)
+2. **Metric registry** - Define and load evaluation metrics
+3. **Evaluator engine** - Execute metrics against traces
+4. **Dataset management** - CRUD for test cases
+5. **Experiment runner** - Orchestrate comparative runs
+6. **Scorecard evaluator** - Check if trace meets quality bar
+
+### Migration Path
+**Phase 1: Trace Foundation**
+- Extend Session with Trace concept
+- Make system prompt, model config first-class
+- Ensure all tool events link to traces
+
+**Phase 2: Basic Evaluation**
+- Implement Metric schema
+- Build rule-based evaluator
+- Add evaluations to traces
+
+**Phase 3: Datasets & Experiments**
+- Dataset storage + CRUD
+- Simple experiment runner
+- CLI: `opencode eval run dataset:smoke-tests`
+
+**Phase 4: Advanced Features**
+- LLM-as-judge metrics
+- Scorecards + CI gates
+- Synthetic data generation
+
+---
+
+## Key Design Principles
+
+### 1. **Immutability**
+- Traces are immutable once completed
+- Evaluations are additive (never mutate a score)
+- Enables time-travel debugging
+- Can re-evaluate historical data
+
+### 2. **Composability**
+- Metrics compose into Scorecards
+- Datasets are just collections of TestCases
+- Experiments reference Datasets
+- Everything has an ID, everything can reference
+
+### 3. **Observability-Native**
+- Every entity has timestamps
+- Every operation emits events (via Bus)
+- Natural fit for OpenTelemetry export
+- Can stream evaluations in real-time
+
+### 4. **Schema Evolution**
+- Version everything (Metric v1.2.0, Dataset v3)
+- Additive changes only (new fields, not breaking)
+- Old data remains valid
+- Can re-run with new metric versions
+
+### 5. **Developer Ergonomics**
+- Defaults for 90% case: `opencode eval` just works
+- Progressive disclosure: simple → powerful
+- Git-like model: local-first, can push/share
+- Natural language where possible: "Test the auth flow"
+
+---
+
+## Example Workflows
+
+### Workflow 1: Add a Regression Test
+```bash
+# Capture current behavior as a test case
+opencode eval capture "Fix the login bug" --output dataset:auth-tests
+
+# Later, ensure it doesn't regress
+opencode eval run dataset:auth-tests --scorecard:regression
+```
+
+### Workflow 2: Optimize a Prompt
+```bash
+# Create experiment with 3 prompt variants
+opencode eval experiment \
+ --dataset=edge-cases \
+ --baseline="current prompt" \
+ --variant-1="revised prompt v1" \
+ --variant-2="revised prompt v2" \
+ --metrics=accuracy,latency,cost
+
+# Shows comparison table, picks winner
+```
+
+### Workflow 3: CI Gate
+```yaml
+# .github/workflows/pr.yml
+- name: Eval Gate
+ run: |
+ opencode eval run dataset:critical-paths \
+ --scorecard:production-ready \
+ --fail-on-regression
+```
+
+### Workflow 4: Production Monitoring
+```bash
+# Sample 1% of production traces
+opencode eval sample --rate=0.01 --metrics=safety,hallucination
+
+# Daily report
+opencode eval report --since=24h --compare-to=baseline
+```
+
+---
+
+## Open Questions
+
+1. **Granularity of Traces**: Should we trace individual tool calls or just full sessions?
+ - **Answer**: Sessions as traces, tool calls as spans within traces
+
+2. **Evaluation Frequency**: Real-time, batch, or on-demand?
+ - **Answer**: All three - streaming for CI, batch for experiments, on-demand for analysis
+
+3. **LLM-as-Judge Costs**: How to make evaluations affordable at scale?
+ - **Answer**: Sampling, caching, use cheaper models for routine checks
+
+4. **Metric Versioning**: How to handle metric changes over time?
+ - **Answer**: Semantic versioning, re-run with new versions is explicit
+
+5. **Synthetic vs Real Data**: How to generate good test cases?
+ - **Answer**: Start with production sampling, evolve to synthetic generators
+
+6. **Baseline Drift**: How to keep baselines current as system improves?
+   - **Answer**: Update baselines automatically when new records are set, with manual approval
+
+---
+
+## Success Metrics for This System
+
+1. **Time to detect regression**: < 10 minutes (in CI)
+2. **False positive rate**: < 5% (don't block good changes)
+3. **Coverage**: 80%+ of tool operations have telemetry
+4. **Adoption**: Team actually uses it (ergonomics matter)
+5. **Insight generation**: Surfaces actionable patterns weekly
+
+---
+
+## Conclusion
+
+The ontology builds on three core ideas:
+
+1. **Trace as the atomic unit** - Everything flows from captured executions
+2. **Evaluation as a separate concern** - Decoupled from generation, versionable, composable
+3. **Developer-centric design** - Built for the team using OpenCode daily, not abstract metrics
+
+This maps naturally to EvalOps' mission: ship LLM changes confidently by making quality observable, measurable, and gateable.
diff --git a/docs/evaluation-status.md b/docs/evaluation-status.md
new file mode 100644
index 0000000000..97a174be79
--- /dev/null
+++ b/docs/evaluation-status.md
@@ -0,0 +1,256 @@
+# Evaluation Framework Implementation Status
+
+## Completed ✅
+
+### Stream 1: Trace Foundation
+**Commit**: `0e92e2f8` - "trace: implement trace foundation"
+
+- ✅ Created `Trace` namespace with complete type definitions
+- ✅ Implemented `Trace.materialize()` to convert sessions to traces
+- ✅ Added trace storage layer (`get`, `list`, `exists`, `remove`)
+- ✅ Implemented filtering for trace queries
+- ✅ Added `trace.completed` event emission
+- ✅ Computed summary statistics (duration, tokens, cost, errors)
+
+**Files Created**:
+- `packages/opencode/src/trace/index.ts` (247 lines)
+
+**Key Capabilities**:
+```typescript
+// Materialize any session into a trace
+const trace = await Trace.materialize(sessionID)
+
+// Query traces with filters
+for await (const trace of Trace.list({ hasErrors: true, minDuration: 5000 })) {
+ console.log(trace.summary)
+}
+
+// Get specific trace
+const trace = await Trace.get(traceID)
+```
+
+---
+
+## Next Steps (Ready to Implement)
+
+### Stream 2: Metric Registry (2-3 hours)
+**Goal**: Define evaluation criteria
+
+**Steps**:
+1. Create metric schema (`packages/opencode/src/evaluation/metric.ts`)
+2. Implement metric registry (CRUD operations)
+3. Build 5-7 built-in metrics:
+ - `tool-error-rate`: % of failed tool calls
+ - `response-latency`: Total duration
+ - `redundant-calls`: Detect repeated calls
+ - `cost-efficiency`: Cost per successful operation
+   - `token-efficiency`: Output tokens / total tokens (both efficiency metrics are sketched below)
+4. Create rule-based evaluator (JavaScript expressions)
+5. Add metric storage layer
+6. Implement metric versioning
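+
+A sketch of how the two efficiency metrics above might be implemented as heuristics follows. The `summary.cost` and `summary.tokens` fields come from the trace sketch in `docs/evaluation-implementation.md`; the `input`/`output` fields on `TokenUsage` and the `status` field on telemetry events are assumptions.
+
+```typescript
+export const EfficiencyHeuristics = {
+  // Cost per successful tool operation (falls back to total cost when nothing succeeded)
+  costEfficiency(trace: Trace.Complete): number {
+    const successes = trace.toolCalls.filter((t) => t.status !== "error").length
+    return successes === 0 ? trace.summary.cost : trace.summary.cost / successes
+  },
+  // Share of tokens spent on output rather than input
+  tokenEfficiency(trace: Trace.Complete): number {
+    const { input, output } = trace.summary.tokens
+    const total = input + output
+    return total === 0 ? 0 : output / total
+  },
+}
+```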
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/metric.ts`
+- `packages/opencode/src/evaluation/heuristics.ts`
+- `packages/opencode/src/evaluation/metrics/builtin.ts`
+
+---
+
+### Stream 3: Evaluation Engine (3-4 hours)
+**Depends on**: Streams 1 & 2
+
+**Steps**:
+1. Create evaluation result schema
+2. Implement heuristic evaluator
+3. Implement rule evaluator
+4. Build evaluation engine orchestrator
+5. Add evaluation storage
+6. Create evaluation query API
+7. Emit evaluation events
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/engine.ts`
+- `packages/opencode/src/evaluation/index.ts`
+
+---
+
+### Stream 4: Dataset Management (2-3 hours)
+**Can run in parallel with Stream 3**
+
+**Steps**:
+1. Create dataset schema
+2. Implement dataset CRUD
+3. Create test case schema with assertions
+4. Build dataset storage layer
+5. Add dataset CLI commands
+6. Create dataset import/export
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/dataset.ts`
+- `packages/opencode/src/cli/cmd/dataset.ts`
+
+---
+
+### Stream 5: Test Runner (3-4 hours)
+**Depends on**: Streams 3 & 4
+
+**Steps**:
+1. Create test execution engine
+2. Implement assertion framework
+3. Build test result aggregation
+4. Add parallel execution support
+5. Create CLI: `opencode test run`
+6. Add result output formats
+7. Implement fail-on-error mode
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/runner.ts`
+- `packages/opencode/src/cli/cmd/test.ts`
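+
+Once this lands, a run could look roughly like this (the flags are illustrative, not final):
+
+```bash
+opencode test run refactor-smoke --parallel 4 --format json --fail-on-error
+```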
+
+---
+
+### Stream 6: Scorecards (2 hours)
+**Depends on**: Stream 3
+
+**Steps**:
+1. Create scorecard schema
+2. Implement scorecard evaluator
+3. Build 2-3 built-in scorecards
+4. Add scorecard storage
+5. Create scorecard CLI
+6. Integrate with test runner
+
+**Files to Create**:
+- `packages/opencode/src/evaluation/scorecard.ts`
+- `packages/opencode/src/evaluation/scorecards/builtin.ts`
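+
+Conceptually, a scorecard is a named bundle of metrics with thresholds that together form a pass/fail gate. A sketch with assumed field names:
+
+```typescript
+// Illustrative built-in scorecard (Stream 6 sketch; schema not final)
+export const defaultScorecard = {
+  id: "default",
+  metrics: [
+    { id: "tool-error-rate", max: 0.05 },    // fail if more than 5% of tool calls error
+    { id: "response-latency", max: 60_000 }, // fail if the trace took longer than 60s
+  ],
+}
+```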
+
+---
+
+### Stream 7: CLI Integration (Ongoing, 1-2 hours)
+**Parallel with all streams**
+
+**Steps**:
+1. Create `opencode eval` command group
+2. Add `opencode eval trace <id>`
+3. Add `opencode eval run <dataset>`
+4. Create `opencode dataset` command group
+5. Create `opencode test` command group
+6. Add pretty formatting
+7. Create help documentation
+
+**Files to Create/Modify**:
+- `packages/opencode/src/cli/cmd/eval.ts`
+- Update `packages/opencode/src/index.ts` to register commands
+
+---
+
+### Stream 8: CI/CD Integration (1-2 hours)
+**Depends on**: Streams 5 & 6
+
+**Steps**:
+1. Create GitHub Action workflow example
+2. Add PR comment formatting
+3. Implement baseline comparison
+4. Add regression detection
+5. Create CI-friendly output formats
+6. Document setup guide
+
+**Files to Create**:
+- `.github/workflows/eval-example.yml`
+- `docs/ci-integration.md`
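+
+A minimal sketch of the CI gate this enables, assuming the planned `opencode test run` command and a hypothetical `opencode eval compare` subcommand for baseline comparison (neither exists yet; names and flags are illustrative):
+
+```bash
+# Run the smoke dataset and capture machine-readable results (planned command, not implemented)
+opencode test run smoke --format json > eval-results.json
+
+# Compare against a committed baseline to catch regressions (hypothetical subcommand)
+opencode eval compare eval-results.json baseline.json --fail-on-regression
+```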
+
+---
+
+## Implementation Timeline
+
+**Already Complete**:
+- ✅ Trace Foundation (Stream 1)
+- ✅ Implementation plan documents
+- ✅ Ontology design
+
+**Remaining Work**: ~16-20 hours
+- Stream 2: Metric Registry (2-3h)
+- Stream 3: Evaluation Engine (3-4h)
+- Stream 4: Dataset Management (2-3h)
+- Stream 5: Test Runner (3-4h)
+- Stream 6: Scorecards (2h)
+- Stream 7: CLI Integration (1-2h)
+- Stream 8: CI/CD Integration (1-2h)
+
+---
+
+## How to Continue
+
+### Option 1: Sequential Implementation
+Implement streams in dependency order:
+1. Stream 2 (Metrics)
+2. Stream 3 (Engine)
+3. Streams 4 + 6 in parallel
+4. Stream 5
+5. Streams 7 + 8
+
+### Option 2: MVP First
+Build minimal viable product:
+1. Stream 2: Just 3 metrics (error-rate, latency, cost)
+2. Stream 3: Basic engine (heuristics only)
+3. Stream 7: Simple CLI (`opencode eval trace`)
+4. Test and iterate
+
+### Option 3: Parallel Teams
+If multiple developers:
+- Dev 1: Streams 2 → 3 → 6
+- Dev 2: Stream 4 → 5
+- Dev 3: Stream 7 (ongoing)
+
+---
+
+## Key Design Decisions Made
+
+1. **Traces are immutable** - Once materialized, they don't change
+2. **Evaluations are separate** - Can evaluate/re-evaluate traces anytime
+3. **Storage is local-first** - All data in project storage
+4. **Events for observability** - Bus system for real-time notifications
+5. **Progressive disclosure** - Simple cases work out of the box; complex cases are supported
+
+---
+
+## Testing Strategy
+
+Each stream should include:
+1. Unit tests for core logic
+2. Integration tests with storage
+3. CLI tests for user-facing commands
+4. Example usage in docs
+
+---
+
+## Success Metrics
+
+### Phase 1 (Streams 1-3)
+- [ ] Can materialize traces from sessions
+- [ ] Can evaluate traces with built-in metrics
+- [ ] Can query evaluation history
+
+### Phase 2 (Streams 4-5)
+- [ ] Can create and run test datasets
+- [ ] Assertions work correctly
+- [ ] Results are actionable
+
+### Phase 3 (Streams 6-8)
+- [ ] Scorecards enforce quality gates
+- [ ] CI integration blocks bad PRs
+- [ ] Documentation is complete
+
+---
+
+## Next Command to Run
+
+To continue implementation:
+
+```bash
+# Stream 2: Create metric registry
+cd packages/opencode/src
+mkdir -p evaluation/metrics
+```
+
+Then create the files outlined in Stream 2 above.
diff --git a/docs/plugin-starter.md b/docs/plugin-starter.md
new file mode 100644
index 0000000000..2da7ec73ad
--- /dev/null
+++ b/docs/plugin-starter.md
@@ -0,0 +1,29 @@
+# Plugin Starter Template
+
+Use the helpers exported from `@opencode-ai/plugin` to build tools quickly:
+
+```ts
+import { tool } from "@opencode-ai/plugin"
+
+export const hello = tool({
+ description: "Greet a name",
+ args: {
+ name: tool.schema.string().describe("Name to greet"),
+ },
+ async execute(args, ctx) {
+ return {
+ title: `Hello, ${args.name}!`,
+ output: `Session ${ctx.sessionID} says hello to ${args.name}.`,
+ metadata: {
+ length: args.name.length,
+ },
+ }
+ },
+})
+```
+
+Guidelines:
+- Always describe arguments with `tool.schema` so the host can validate inputs.
+- Return either a string or an object containing `output`, plus optional `title` and `metadata`.
+- Use the tool telemetry (`measure`) and workspace guards when calling back into core tools.
+- Test plugins by importing the generated hook into `packages/plugin/src/example.ts` and running `bunx tsc --noEmit`.
diff --git a/docs/tool-authoring.md b/docs/tool-authoring.md
new file mode 100644
index 0000000000..24cd7956bd
--- /dev/null
+++ b/docs/tool-authoring.md
@@ -0,0 +1,29 @@
+# Tool Authoring Guide
+
+This project now ships shared helpers so every tool behaves consistently.
+
+## Instrumentation
+- Wrap long-running work with `measure({ id, ctx, params, run })` from `packages/opencode/src/tool/telemetry.ts` (see the sketch after this list).
+- Each call logs execution duration, call id, and status, helping us spot slow or flaky commands while developing with `bun dev`.
+- `measure()` also publishes a `tool.telemetry` bus event. The TUI subscribes and renders these entries in real time (`tele | ToolName 0.42s`). Tap into the same stream via `Bus.subscribe(ToolTelemetry.Event.Sampled, ...)` for custom dashboards.
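+
+A minimal sketch of instrumenting a tool body with `measure()`; the import path is relative to `packages/opencode/src`, `ctx`/`params` come from the tool's `execute` signature, and `doExpensiveWork` stands in for the actual tool logic:
+
+```ts
+import { measure } from "../tool/telemetry"
+
+// Inside a tool's execute(params, ctx):
+const output = await measure({
+  id: "mytool", // shows up in telemetry entries and `opencode stats`
+  ctx,
+  params,
+  run: async () => {
+    // the long-running work being timed; duration and status are recorded automatically
+    return await doExpensiveWork(params)
+  },
+})
+```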
+
+## Workspace Safety
+- Use `guard()` from `packages/opencode/src/tool/workspace.ts` to resolve paths and enforce the workspace boundary (see the sketch after this list).
+- Pass `message` if you need a custom error; pass `bypass: true` only for trusted internal flows.
+- Tools such as `edit`, `write`, `multiedit`, and `patch` already wrap user-provided paths with `guard()`. Follow the same pattern when building new file mutators.
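+
+A short sketch of guarding a user-supplied path before writing; the option names match the bullets above, but the exact `guard()` signature and its return value (assumed to be a resolved absolute path inside the workspace) are not verified here, and `params.filePath`/`contents` are placeholders:
+
+```ts
+import { guard } from "../tool/workspace"
+
+// Resolve the user-supplied path and refuse anything outside the workspace
+const safePath = guard(params.filePath, {
+  message: "write is limited to files inside the current workspace",
+})
+await Bun.write(safePath, contents)
+```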
+
+## Troubleshooting
+- If you see `tool.telemetry` entries with `status=error`, inspect the associated `error` string—it's propagated from the thrown exception.
+- Workspace errors typically originate from `guard()`. Confirm the tool receives absolute paths rooted in `Instance.directory` or set `bypass` explicitly for trusted cases (e.g., generated temp files).
+- When adding tests around I/O, use `tmpdir()` to create and clean up isolated directories; the helper ensures telemetry logs stay focused on the test workspace.
+- For tool stats, run `opencode stats`. The display now groups the last session’s telemetry entries by tool, listing total runs, average duration, and error count so you can spot hotspots quickly.
+
+## Plugin Tools
+- Plugin authors can return either a plain string or `{ output, title?, metadata? }`.
+- See `packages/plugin/src/tool.ts` for the unified `ToolDefinition` and `ToolResult` types.
+
+## Testing
+- Prefer table-driven tests under `packages/opencode/test/tool`. Use `tmpdir()` to create isolated workspaces.
+- Capture streamed metadata (see `bash.test.ts`) to ensure tools emit incremental updates as expected.
+
+Small, consistent helpers keep our tool surface predictable and easier to debug. Add to this document whenever you introduce new patterns that other contributors should follow.
diff --git a/docs/tui-ink-migration.md b/docs/tui-ink-migration.md
new file mode 100644
index 0000000000..2e457d4479
--- /dev/null
+++ b/docs/tui-ink-migration.md
@@ -0,0 +1,139 @@
+# Ink Migration Research
+
+This document captures the current state of the Go/Bubble Tea TUI, findings from reviewing Continue’s Ink-based CLI, and a recommended migration path. It is meant to serve as a primer for anyone evaluating or executing a future move to Ink.
+
+## 1. Current Go TUI Architecture
+
+### 1.1 Launch flow
+- `packages/opencode/src/cli/cmd/tui.ts` is the entrypoint. It bootstraps the Opencode server, locates the platform-specific Go binary, and spawns it with relevant flags (`--model`, `--session`, etc.).
+- The CLI exposes `OPENCODE_TUI_PATH` for overrides (useful for experimenting with alternative implementations).
+- The Go binary lives under `packages/tui` and is built with Bubble Tea + the Charmbracelet ecosystem.
+
+### 1.2 Packages of interest
+- `packages/tui/internal/app`: application state, config integration, session metadata, persisted “tui state” (recent models/agents, message history, toggles).
+- `packages/tui/internal/tui/tui.go`: Bubble Tea `Model` implementation. Handles Init/Update/View loop, key bindings, modal stack, toasts, diff overlay, etc.
+- `packages/tui/internal/components`: UI primitives composed by the model.
+ - `chat`: editor, messages pane, caches, markdown rendering.
+ - `commands`: command palette, leader key management.
+ - `dialog`: completion popovers (command, file, agent), session picker, confirmation dialogs.
+ - `diff`: diff overlay with syntax highlighting.
+ - `textarea`: multiline editor with history, mode switch (chat vs bash), key debounce logic.
+ - `status`: bottom status bar (cwd/git branch, model info, latency, queue state).
+ - `toast`: transient notifications.
+ - `qr`, `modal`, `list`: supporting components for login flows, overlays, navigation lists.
+- `packages/tui/internal/app/state.go`: persists TUI state to TOML (theme, recently used models/agents, message history, toggles).
+
+### 1.3 Feature inventory
+The current TUI provides a rich, IDE-like experience in the terminal. Key features include:
+
+| Feature | Go implementation notes |
+| --- | --- |
+| Home screen | ASCII logo, quick start shortcuts, model summary (see `tui.Home()` in `tui.go`). |
+| Multi-pane chat view | Split layout (messages left, editor bottom, optional diff overlay, modals stacked on top). |
+| Streaming messages | Bubble Tea subscriptions update `chat.MessagesComponent` incrementally. |
+| Markdown + syntax highlighting | `glamour`, `chroma` render markdown and diffs. |
+| File editor integration | Textarea component with history, command detection (`/`, `@`, `!`), bash mode toggle. |
+| Command palette | Leader key sequences, completion dialogs for commands/files/agents. |
+| Status bar | Displays cwd/git branch, session status, cost & latency, background tasks. |
+| Toast notifications | Non-blocking success/error banners via `toast.New*`. |
+| Modals & selectors | Session picker with rename, confirm dialogs, login prompts. |
+| Diff viewer | Full-screen overlay for patch review with scroll + syntax colors. |
+| Telemetry integration | Bottom indicators for tool timings, agent model, plan status. |
+| Key handling | Debounced exit and interrupt keys, leader key sequences, ctrl+z suspend, mouse wheel scroll. |
+| Persistence | TOML state file for recents/history toggles, updated through `state.go`. |
+| Server bridge | Communicates with Opencode server via `app.Client` interfaces (sessions, prompts, tools, telemetry). |
+
+### 1.4 Input/event flow
+- Bubble Tea `Update` function orchestrates key events. It routes to editor, commands, modals, or toasts.
+- Commands are defined in `packages/tui/internal/commands` and matched via `Commands.Matches` with leader flag support.
+- Completion dialog logic selects providers (`commandProvider`, `fileProvider`, `symbolsProvider`, `agentsProvider`).
+- Background tasks: diff overlay, telemetry updates, plan watchers, login flows, file watchers (through `app.Watchers`).
+
+### 1.5 Packaging & distribution
+- Go binary is embedded in npm package (`packages/opencode/bin/opencode`).
+- Cross-platform distribution uses Go compiler, minimal runtime dependencies, near-instant startup.
+
+## 2. Continue’s Ink CLI (Reference Implementation)
+We surveyed https://github.com/continuedev/continue (locally at `/Users/jonathanhaas/Documents/Dev/continue`).
+
+### 2.1 Stack overview
+- Entire CLI lives under `extensions/cli` and is written in TypeScript.
+- UI is implemented with Ink and React components (`extensions/cli/src/ui`).
+- State is provided through custom service containers (`extensions/cli/src/services`), contexts, and hooks.
+- Packaging via npm scripts: `tsc` + bundling (`build.mjs`), shipped as JS binaries (`dist/index.js`), no Go binaries involved.
+
+### 2.2 UI component structure
+- `AppRoot.tsx` wraps the app in `NavigationProvider` and renders `TUIChat`.
+- `TUIChat.tsx` orchestrates layout: chat history, editor, status bars, diff viewer, session selectors, modals, update notifications.
+- Numerous components mirror the complexity of our Go TUI: Markdown renderer, syntax highlighting, model selectors, slash command UI, diff viewer, resource debug bar, etc.
+- `extensions/cli/spec/tui.md` documents Ink stack and UI requirements (git/cwd display, etc.).
+
+### 2.3 Key takeaways
+- Ink can support a large-scale, feature-rich TUI given sufficient component scaffolding.
+- Continue leans on React conventions (contexts, hooks) to manage global state and service interactions, which aligns well with our TS codebase.
+- Distribution is via the Node runtime (npm package). Startup will be slower than a precompiled Go binary but acceptable for a modern CLI.
+
+## 3. Proposed Migration Strategy
+This is a multi-phase effort; start with research and proof-of-concept.
+
+### Phase 0 — Documentation (you are here)
+- Capture architecture of current Go TUI and reference Ink implementation (this document).
+
+### Phase 1 — Proof of concept
+- Create `packages/opencode/src/tui-poc.tsx` using Ink (see the sketch after this list).
+- Replicate the “home” screen (logo, quick shortcuts, model summary, text input).
+- Wire to existing Opencode server bootstrap for data (reuse `bootstrap` from `tui.ts`).
+- Measure startup time and memory vs. Go binary.
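+
+A minimal Ink sketch of what the Phase 1 home screen could start from, assuming `ink` and `react` as dependencies; the model string and quick-start hints are placeholders, and wiring to the real `bootstrap` from `tui.ts` is left out:
+
+```tsx
+import React from "react"
+import { render, Box, Text } from "ink"
+
+// Bare-bones home screen: logo line, model summary, quick-start hints
+function Home(props: { model: string }) {
+  return (
+    <Box flexDirection="column" padding={1}>
+      <Text bold>opencode</Text>
+      <Text>Model: {props.model}</Text>
+      <Text dimColor>Press / for commands, @ to mention files, ! for bash mode</Text>
+    </Box>
+  )
+}
+
+render(<Home model="anthropic/claude-sonnet" />)
+```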
+
+### Phase 2 — Feature parity plan
+For each Go component, define the Ink equivalent and implementation notes:
+
+| Go component | Responsibility | Ink plan |
+| --- | --- | --- |
+| `chat.MessagesComponent` | Streaming message list, markdown render, tool traces | Ink list view + custom markdown renderer (`ink-markdown`, `marked-terminal`). Maintain virtualized list for performance. |
+| `chat.EditorComponent` | Multiline editor, history, slash commands, bash mode | Build Ink component using raw stdin handling, history state, placeholder hints. Evaluate community packages (`ink-use-stdin`, `ink-text-input`) vs custom. |
+| `dialog.CompletionDialog` | Slash command & @ mention completion overlays | Overlay component via an Ink `<Box>` with absolute positioning (managed via terminal columns) + keyboard navigation. |
+| `commands` | Leader key handling, command routing | Reuse existing TS command definitions. Implement keyboard handler hook to track leader sequences and debounced keys (interrupt/exit). |
+| `diff.DiffComponent` | Full-screen diff overlay, syntax highlight | Use `diff` + `cli-highlight` or `shiki` for syntax, overlay with an Ink `<Box>` taking full width/height. |
+| `toast` | Temporary banners | Ink component anchored top-right/bottom. Manage lifetime via `setTimeout`. |
+| `status.StatusComponent` | Bottom status bar, git/cwd, model info, tool telemetry | Compose `<Box>` rows with computed spans; reuse existing TS providers for data (git/cwd logic already in TS). |
+| `modal` | Session selector, rename dialog, login prompt | Portal-like Ink component triggered via context state. |
+| `qr` | ASCII QR codes for login flows | Use `qrcode-terminal` library. |
+| `list` | Generic selection lists (sessions, models) | Build re-usable Ink list component with highlight + filtering support. |
+| `app.State` persistence | Recents, toggles, history stored as TOML | Reuse existing TS persistence utilities (`Config`, `Session`, `Storage`) or port `state.go` logic to TS module. |
+
+### Phase 3 — Infra & packaging
+- Decide on runtime: require Bun/Node, or explore `bun build --compile` for native binaries.
+- Update CLI entrypoint to detect and launch Ink version (guarded by env flag for beta testers).
+- Ensure cross-platform behavior (macOS, Linux, Windows). Test terminal compatibility (colors, resizing, mouse scroll).
+- Integrate CI (lint, tests) for new TUI. Reuse `vitest` for component tests similar to Continue’s `extensions/cli/src/ui/__tests__`.
+
+### Phase 4 — Feature completion & rollout
+- Incrementally port features from Go components, verifying against feature checklist.
+- Provide fallback to Go TUI until Ink reaches parity (controlled by flag).
+- Document migration path for users (release notes, README updates).
+
+## 4. Risks & Considerations
+- **Performance:** Node/Ink startup will be slower than Go. Need benchmarks; possibly mitigate by keeping Go binary as optional fast mode.
+- **Key handling:** Reimplement complex keybindings (leader sequences, debounced interrupt/exit) carefully to avoid regressions.
+- **Streaming:** Ensure Ink rendering remains responsive during long-running operations (might require throttling updates or using Ink's `<Static>` regions).
+- **Terminal capability detection:** Continue uses contexts to manage width/height; we must replicate status line/bottom bar layout across different terminal sizes.
+- **Packaging:** If we depend on Bun/Node availability, document prerequisites; bundling standalone binaries increases maintenance.
+- **Testing:** Snapshots for Ink components can be brittle, so we need a testing story (Continue uses `vitest` + Ink render tests).
+
+## 5. Next Actions
+1. Track this work in an issue (see draft below).
+2. Stand up `tui-poc.tsx` and benchmark.
+3. Produce a detailed feature parity checklist with owners/estimates.
+4. Decide on packaging strategy early to avoid surprises late in migration.
+
+### Draft GitHub issue summary
+- Title: “Evaluate migrating Go-based TUI to Ink”
+- Checklist covering research, POC, packaging, parity plan, report back with recommendation.
+
+---
+
+**References**
+- Opencode Go TUI source: `packages/tui/internal/**/*`
+- CLI launcher: `packages/opencode/src/cli/cmd/tui.ts`
+- Continue Ink CLI (for ideas): `/Users/jonathanhaas/Documents/Dev/continue/extensions/cli`
diff --git a/github/README.md b/github/README.md
index 7601f51335..1eebb79b5e 100644
--- a/github/README.md
+++ b/github/README.md
@@ -67,7 +67,7 @@ This will walk you through installing the GitHub app, creating the workflow, and
fetch-depth: 1
- name: Run opencode
- uses: sst/opencode/github@latest
+ uses: evalops/opencode/github@latest
env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
with:
@@ -78,7 +78,7 @@ This will walk you through installing the GitHub app, creating the workflow, and
## Support
-This is an early release. If you encounter issues or have feedback, please create an issue at https://github.com/sst/opencode/issues.
+This is an early release. If you encounter issues or have feedback, please create an issue at https://github.com/evalops/opencode/issues.
## Development
diff --git a/install b/install
index 002f91a73c..c147c5a68e 100755
--- a/install
+++ b/install
@@ -45,15 +45,15 @@ INSTALL_DIR=$HOME/.opencode/bin
mkdir -p "$INSTALL_DIR"
if [ -z "$requested_version" ]; then
- url="https://github.com/sst/opencode/releases/latest/download/$filename"
- specific_version=$(curl -s https://api.github.com/repos/sst/opencode/releases/latest | sed -n 's/.*"tag_name": *"v\([^"]*\)".*/\1/p')
+ url="https://github.com/evalops/opencode/releases/latest/download/$filename"
+ specific_version=$(curl -s https://api.github.com/repos/evalops/opencode/releases/latest | sed -n 's/.*"tag_name": *"v\([^"]*\)".*/\1/p')
if [[ $? -ne 0 || -z "$specific_version" ]]; then
echo -e "${RED}Failed to fetch version information${NC}"
exit 1
fi
else
- url="https://github.com/sst/opencode/releases/download/v${requested_version}/$filename"
+ url="https://github.com/evalops/opencode/releases/download/v${requested_version}/$filename"
specific_version=$requested_version
fi
diff --git a/opencode.json b/opencode.json
index 720ece5c15..64886d6298 100644
--- a/opencode.json
+++ b/opencode.json
@@ -1,3 +1,13 @@
{
- "$schema": "https://opencode.ai/config.json"
+ "$schema": "https://opencode.ai/config.json",
+ "mcp": {
+ "claude-context": {
+ "type": "local",
+ "command": ["npx", "@zilliz/claude-context-mcp@latest"],
+ "environment": {
+ "OPENAI_API_KEY": "{env:OPENAI_API_KEY}",
+ "MILVUS_TOKEN": "{env:MILVUS_TOKEN}"
+ }
+ }
+ }
}
diff --git a/package.json b/package.json
index e1116503a0..0f6c30f690 100644
--- a/package.json
+++ b/package.json
@@ -39,7 +39,7 @@
},
"repository": {
"type": "git",
- "url": "https://github.com/sst/opencode"
+ "url": "https://github.com/evalops/opencode"
},
"license": "MIT",
"prettier": {
diff --git a/packages/console/app/src/component/footer.tsx b/packages/console/app/src/component/footer.tsx
index 93d8e2d8cd..02d6dafc1d 100644
--- a/packages/console/app/src/component/footer.tsx
+++ b/packages/console/app/src/component/footer.tsx
@@ -16,7 +16,7 @@ export function Footer() {
return (