Skip to content

Commit d5c343c

Browse files
authored
feat: add Terminal-Bench adapter and headless agent CLI (#198)
This commit introduces a headless agent runner and an adapter for the Terminal-Bench framework, enabling automated, programmatic evaluation of the agent's capabilities.

The core of this change is a new `AgentSession` class that encapsulates the logic for managing a single workspace session. This refactors logic out of `ipcMain`, allowing the agent core to be used in environments without an Electron UI, such as the new headless CLI.

Key components added:
- `src/debug/agentSessionCli.ts`: A CLI for running an agent session headlessly. It can be driven programmatically and supports JSON output.
- `benchmarks/terminal_bench/`: A Python adapter for Terminal-Bench that packages the cmux application, installs it in a task container, and runs it against benchmark instructions using the agent CLI.
- `Makefile` target `benchmark-terminal` and `docs/benchmarking.md` to provide an easy entrypoint and documentation for running benchmarks.
- Integration tests for the new `agentSessionCli` to ensure its correctness.
1 parent 9854bb8 commit d5c343c

File tree

18 files changed

+1827
-317
lines changed

18 files changed

+1827
-317
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,3 +87,10 @@ CODE_CHANGES.md
8787
README_COMPACT_HERE.md
8888
artifacts/
8989
tests/e2e/tmp/
90+
runs/
91+
92+
# Python
93+
__pycache__
94+
95+
tmpfork
96+
.cmux-agent-cli

Makefile

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ include fmt.mk
2727
.PHONY: test test-unit test-integration test-watch test-coverage test-e2e
2828
.PHONY: dist dist-mac dist-win dist-linux
2929
.PHONY: docs docs-build docs-watch
30+
.PHONY: benchmark-terminal
3031
.PHONY: ensure-deps
3132

3233
TS_SOURCES := $(shell find src -type f \( -name '*.ts' -o -name '*.tsx' \))
@@ -174,6 +175,19 @@ docs-build: ## Build documentation
174175
docs-watch: ## Watch and rebuild documentation
175176
@cd docs && mdbook watch
176177

178+
## Benchmarks
# Runs `uvx terminal-bench run` with the cmux adapter as the agent.
# Optional environment knobs:
#   TB_DATASET      dataset spec (falls back to terminal-bench-core==0.1.1)
#   TB_CONCURRENCY  when non-empty, passed through as --n-concurrent <value>
#   TB_LIVESTREAM   when non-empty, adds --livestream
#   TB_ARGS         extra arguments appended verbatim (deliberately unquoted)
benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_ARGS to customize)
	@: "$${TB_DATASET:=terminal-bench-core==0.1.1}"; \
	livestream_opt="$${TB_LIVESTREAM:+--livestream}"; \
	concurrency_opt="$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}"; \
	echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
	uvx terminal-bench run \
		--dataset "$$TB_DATASET" \
		--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
		$$concurrency_opt \
		$$livestream_opt \
		$${TB_ARGS}
190+
177191
## Clean
178192
clean: ## Clean build artifacts
179193
@echo "Cleaning build artifacts..."
@@ -182,5 +196,3 @@ clean: ## Clean build artifacts
182196

183197
# Parallel build optimization - these can run concurrently
184198
.NOTPARALLEL: build-main # TypeScript can handle its own parallelism
185-
186-
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
from .cmux_agent import CmuxAgent
2+
3+
__all__ = ["CmuxAgent"]
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#!/usr/bin/env bash
2+
3+
set -euo pipefail
4+
5+
# Print an informational message on stdout, tagged with the runner prefix.
#   $1 - message text
log() {
  local message=$1
  printf '%s %s\n' '[cmux-run]' "${message}"
}
8+
9+
# Print an error message on stderr with the runner prefix, then exit with
# status 1.
#   $1 - error description
fatal() {
  local reason=$1
  {
    printf '%s ERROR: %s\n' '[cmux-run]' "${reason}"
  } >&2
  exit 1
}
13+
14+
# The benchmark instruction is the only positional argument; it must be
# non-empty.
instruction=${1:-}
[[ -n "${instruction}" ]] || fatal "instruction argument is required"

# Put the bun installation's bin directory at the front of PATH.
: "${BUN_INSTALL:=/root/.bun}"
export BUN_INSTALL
export PATH="${BUN_INSTALL}/bin:${PATH}"

# Runner settings. Each keeps a non-empty value inherited from the environment
# and otherwise falls back to the default given here (":=" assigns when the
# variable is unset or empty, which also keeps `set -u` satisfied).
: "${CMUX_APP_ROOT:=/opt/cmux-app}"
: "${CMUX_CONFIG_ROOT:=/root/.cmux}"
: "${CMUX_PROJECT_PATH:=}"
: "${CMUX_PROJECT_CANDIDATES:=/workspace:/app:/workspaces:/root/project}"
: "${CMUX_MODEL:=anthropic:claude-sonnet-4-5}"
: "${CMUX_TIMEOUT_MS:=}"
: "${CMUX_TRUNK:=main}"
: "${CMUX_WORKSPACE_ID:=cmux-bench}"
: "${CMUX_THINKING_LEVEL:=high}"
: "${CMUX_MODE:=exec}"
32+
33+
# Abort via fatal() unless a `bun` command can be resolved on PATH.
ensure_bun() {
  command -v bun >/dev/null 2>&1 && return 0
  fatal "bun must be installed before running the cmux agent"
}
38+
39+
# Print the project directory the agent should work in.
#
# CMUX_PROJECT_PATH, when non-empty, is authoritative and must name an existing
# directory. Otherwise the colon-separated CMUX_PROJECT_CANDIDATES list is
# scanned in order and the first existing directory is printed. Calls fatal()
# when nothing suitable is found.
resolve_project_path() {
  if [[ -n "${CMUX_PROJECT_PATH}" ]]; then
    [[ -d "${CMUX_PROJECT_PATH}" ]] || fatal "CMUX_PROJECT_PATH=${CMUX_PROJECT_PATH} not found"
    printf '%s\n' "${CMUX_PROJECT_PATH}"
    return 0
  fi

  # Split the candidate list on ':' into an array.
  IFS=":" read -r -a candidates <<<"${CMUX_PROJECT_CANDIDATES}"
  for candidate in "${candidates[@]}"; do
    [[ -d "${candidate}" ]] || continue
    printf '%s\n' "${candidate}"
    return 0
  done

  fatal "no project path located (searched ${CMUX_PROJECT_CANDIDATES})"
}
58+
59+
# Make sure the project directory is a git work tree positioned on CMUX_TRUNK.
#   $1 - project directory
#
# Without git on PATH this only logs a notice. For an existing work tree the
# trunk branch is checked out (or created when it cannot be resolved); failures
# of that checkout are deliberately ignored. Otherwise a new repository is
# initialised and its current contents are committed as an initial snapshot.
ensure_git_repo() {
  local repo_dir=$1

  if ! command -v git >/dev/null 2>&1; then
    log "git not available; skipping repository initialisation"
    return
  fi

  if git -C "${repo_dir}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
    # Ensure trunk branch exists even on pre-existing repos.
    if git -C "${repo_dir}" rev-parse --verify "${CMUX_TRUNK}" >/dev/null 2>&1; then
      git -C "${repo_dir}" checkout "${CMUX_TRUNK}" >/dev/null 2>&1 || true
    else
      git -C "${repo_dir}" checkout -b "${CMUX_TRUNK}" >/dev/null 2>&1 || true
    fi
    return 0
  fi

  log "initialising git repository at ${repo_dir}"
  # If `git init --initial-branch` fails, fall back to a plain init followed by
  # an explicit branch switch.
  if ! git -C "${repo_dir}" init --initial-branch="${CMUX_TRUNK}" >/dev/null 2>&1; then
    git -C "${repo_dir}" init >/dev/null
    git -C "${repo_dir}" checkout -B "${CMUX_TRUNK}" >/dev/null
  fi

  # Set a repo-local identity before creating the snapshot commit.
  git -C "${repo_dir}" config user.name "cmux-bench"
  git -C "${repo_dir}" config user.email "bench@cmux.local"
  git -C "${repo_dir}" add -A >/dev/null
  git -C "${repo_dir}" commit -m "chore: initial snapshot" --allow-empty >/dev/null
  git -C "${repo_dir}" branch -M "${CMUX_TRUNK}" >/dev/null
}
89+
90+
# --- Entry point -----------------------------------------------------------
# Validate prerequisites, locate the project and make sure it is a git repo.
ensure_bun
project_path=$(resolve_project_path)
ensure_git_repo "${project_path}"

bun --version >/dev/null 2>&1 || fatal "bun not available after ensure_bun"

log "starting cmux agent session for ${project_path}"
cd "${CMUX_APP_ROOT}"

# Assemble the agent CLI invocation as an array so every value stays one word.
agent_cmd=(bun src/debug/agentSessionCli.ts)
agent_cmd+=(--config-root "${CMUX_CONFIG_ROOT}")
agent_cmd+=(--project-path "${project_path}")
agent_cmd+=(--workspace-path "${project_path}")
agent_cmd+=(--workspace-id "${CMUX_WORKSPACE_ID}")
agent_cmd+=(--model "${CMUX_MODEL}")
agent_cmd+=(--mode "${CMUX_MODE}")

# Optional flags are only added when their setting is non-empty.
[[ -z "${CMUX_TIMEOUT_MS}" ]] || agent_cmd+=(--timeout "${CMUX_TIMEOUT_MS}")
[[ -z "${CMUX_THINKING_LEVEL}" ]] || agent_cmd+=(--thinking-level "${CMUX_THINKING_LEVEL}")

# The instruction is fed to the CLI on stdin, without a trailing newline.
printf '%s' "${instruction}" | "${agent_cmd[@]}" || fatal "cmux agent session failed"
Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
from __future__ import annotations
2+
3+
import io
4+
import os
5+
import shlex
6+
import tarfile
7+
import tempfile
8+
from pathlib import Path
9+
from typing import Any
10+
11+
from terminal_bench.agents.base_agent import AgentResult
12+
from terminal_bench.agents.installed_agents.abstract_installed_agent import (
13+
AbstractInstalledAgent,
14+
)
15+
from terminal_bench.terminal.models import TerminalCommand
16+
from terminal_bench.terminal.tmux_session import TmuxSession
17+
18+
19+
class CmuxAgent(AbstractInstalledAgent):
    """
    Minimal Terminal-Bench adapter that installs cmux into the task container and
    forwards the benchmark instruction to the cmux headless runner.

    Before a task is run, two payloads are copied into ``/installed-agent`` in the
    task container: a gzipped tarball of selected cmux sources and the
    ``cmux-run.sh`` runner script located next to this module. The instruction
    is then handed to that runner as a single shell-quoted argument.
    """

    _ARCHIVE_NAME = "cmux-app.tar.gz"
    _RUNNER_NAME = "cmux-run.sh"
    _DEFAULT_TRUNK = "main"
    _DEFAULT_MODEL = "anthropic/claude-sonnet-4-5"

    # Host environment variables copied into the agent environment when they
    # are set to a non-empty value.
    _FORWARDED_ENV_KEYS = (
        "ANTHROPIC_API_KEY",
        "ANTHROPIC_BASE_URL",
        "OPENAI_API_KEY",
        "OPENAI_BASE_URL",
        "OPENAI_API_BASE",
        "OPENAI_ORG_ID",
        "AZURE_OPENAI_API_KEY",
        "AZURE_OPENAI_ENDPOINT",
        "AZURE_OPENAI_DEPLOYMENT",
        "AZURE_OPENAI_API_VERSION",
        "MISTRAL_API_KEY",
        "GOOGLE_API_KEY",
        "OPENROUTER_API_KEY",
        "CMUX_AGENT_GIT_URL",
        "CMUX_BUN_INSTALL_URL",
        "CMUX_PROJECT_PATH",
        "CMUX_PROJECT_CANDIDATES",
        "CMUX_TRUNK",
        "CMUX_MODEL",
        "CMUX_TIMEOUT_MS",
        "CMUX_THINKING_LEVEL",
        # Previously missing: without it a host-side CMUX_MODE never reached
        # the validation below and the mode always fell back to "exec".
        "CMUX_MODE",
        "CMUX_CONFIG_ROOT",
        "CMUX_APP_ROOT",
        "CMUX_WORKSPACE_ID",
    )

    _THINKING_LEVELS = frozenset({"off", "low", "medium", "high"})

    def __init__(
        self, mode: str | None = None, thinking_level: str | None = None, **kwargs: Any
    ) -> None:
        """
        Args:
            mode: Optional agent mode; when given it takes precedence over the
                ``CMUX_MODE`` environment variable.
            thinking_level: Optional thinking level; when given it takes
                precedence over ``CMUX_THINKING_LEVEL``.
            **kwargs: Passed through unchanged to the base class.

        Raises:
            RuntimeError: If the resolved cmux repository root does not exist.
        """
        super().__init__(**kwargs)
        # CMUX_AGENT_REPO_ROOT overrides the default of two directories above
        # this module's directory.
        repo_root_env = os.environ.get("CMUX_AGENT_REPO_ROOT")
        repo_root = (
            Path(repo_root_env).resolve()
            if repo_root_env
            else Path(__file__).resolve().parents[2]
        )
        if not repo_root.exists():
            raise RuntimeError(f"cmux repo root {repo_root} does not exist")

        self._repo_root = repo_root
        self._archive_bytes: bytes | None = None
        self._prepared_container_id: str | None = None
        self._mode = mode.lower() if mode else None
        self._thinking_level = thinking_level.lower() if thinking_level else None

    @staticmethod
    def name() -> str:
        """Return the agent's identifier."""
        return "cmux"

    @property
    def _env(self) -> dict[str, str]:
        """
        Build the environment mapping for the agent.

        Non-empty host values of the forwarded keys are copied, defaults are
        filled in for the cmux settings, and the model, thinking level and mode
        are normalized.

        Raises:
            ValueError: If the effective thinking level or mode is not one of
                the accepted values.
        """
        env: dict[str, str] = {}
        for key in self._FORWARDED_ENV_KEYS:
            value = os.environ.get(key)
            if value:
                env[key] = value

        env.setdefault("CMUX_TRUNK", self._DEFAULT_TRUNK)
        env.setdefault("CMUX_MODEL", self._DEFAULT_MODEL)
        env.setdefault("CMUX_CONFIG_ROOT", "/root/.cmux")
        env.setdefault("CMUX_APP_ROOT", "/opt/cmux-app")
        env.setdefault("CMUX_WORKSPACE_ID", "cmux-bench")
        env.setdefault("CMUX_THINKING_LEVEL", "high")
        env.setdefault("CMUX_MODE", "exec")

        # Rewrite "provider/model" as "provider:model"; values that already
        # contain a colon are left untouched.
        model_value = env.get("CMUX_MODEL")
        if model_value and "/" in model_value and ":" not in model_value:
            provider, model_name = model_value.split("/", 1)
            env["CMUX_MODEL"] = f"{provider}:{model_name}"

        # The constructor argument wins over the environment value.
        thinking_value = self._thinking_level or env.get("CMUX_THINKING_LEVEL")
        if thinking_value:
            normalized = thinking_value.strip().lower()
            if normalized not in self._THINKING_LEVELS:
                raise ValueError(
                    "CMUX_THINKING_LEVEL must be one of off, low, medium, high"
                )
            env["CMUX_THINKING_LEVEL"] = normalized

        mode_value = self._mode or env.get("CMUX_MODE")
        if mode_value:
            normalized_mode = mode_value.strip().lower()
            if normalized_mode in {"exec", "execute"}:
                # "execute" is accepted as an alias and stored as "exec".
                env["CMUX_MODE"] = "exec"
            elif normalized_mode == "plan":
                env["CMUX_MODE"] = "plan"
            else:
                raise ValueError("CMUX_MODE must be one of plan, exec, or execute")

        return env

    @property
    def _install_agent_script_path(self) -> Path:
        """Return the path of the templated ``cmux_setup.sh.j2`` install script."""
        return self._get_templated_script_path("cmux_setup.sh.j2")

    def perform_task(
        self,
        instruction: str,
        session: TmuxSession,
        logging_dir: Path | None = None,
    ) -> AgentResult:
        """
        Copy the cmux payloads into the container, then delegate to the base class.

        Raises:
            ValueError: If ``instruction`` is empty or only whitespace.
        """
        if not instruction or not instruction.strip():
            raise ValueError("instruction must be a non-empty string")

        self._prepare_payloads(session)
        return super().perform_task(
            instruction=instruction, session=session, logging_dir=logging_dir
        )

    def _prepare_payloads(self, session: TmuxSession) -> None:
        """
        Copy the source archive and the runner script into ``/installed-agent``.

        Does nothing when the session's container id matches the one that was
        prepared last.

        Raises:
            RuntimeError: If the runner script is missing or the archive cannot
                be written to a temporary file.
        """
        container_id = getattr(session.container, "id", None)
        if container_id and container_id == self._prepared_container_id:
            return

        # Fail fast on a missing runner before building or copying anything.
        runner_path = Path(__file__).with_name(self._RUNNER_NAME)
        if not runner_path.exists():
            raise RuntimeError(f"cmux runner script missing at {runner_path}")

        archive = self._build_archive()
        temp_path: Path | None = None
        try:
            try:
                with tempfile.NamedTemporaryFile(
                    suffix=".tar.gz", delete=False
                ) as temp_file:
                    # Record the path before writing so the outer ``finally``
                    # removes the file even when the write fails.
                    temp_path = Path(temp_file.name)
                    temp_file.write(archive)
            except Exception as error:
                raise RuntimeError(
                    f"failed to materialize cmux archive: {error}"
                ) from error

            session.copy_to_container(
                paths=temp_path,
                container_dir="/installed-agent",
                container_filename=self._ARCHIVE_NAME,
            )
        finally:
            if temp_path is not None:
                temp_path.unlink(missing_ok=True)

        session.copy_to_container(
            paths=runner_path,
            container_dir="/installed-agent",
            container_filename=self._RUNNER_NAME,
        )

        if container_id:
            self._prepared_container_id = container_id

    def _build_archive(self) -> bytes:
        """
        Return a gzipped tarball of the cmux files needed in the container.

        The archive is built once per instance and cached in memory.

        Raises:
            FileNotFoundError: If any required path is missing from the repo root.
        """
        if self._archive_bytes is not None:
            return self._archive_bytes

        include_paths = [
            "package.json",
            "bun.lock",
            "bunfig.toml",
            "tsconfig.json",
            "tsconfig.main.json",
            "src",
        ]

        buffer = io.BytesIO()
        with tarfile.open(fileobj=buffer, mode="w:gz") as tar:
            for relative in include_paths:
                source_path = self._repo_root / relative
                if not source_path.exists():
                    raise FileNotFoundError(f"Required file {source_path} not found")
                tar.add(
                    source_path,
                    arcname=relative,
                    recursive=True,
                )
        # getvalue() returns the whole buffer regardless of the stream position.
        self._archive_bytes = buffer.getvalue()
        return self._archive_bytes

    def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
        """
        Return the single blocking command that runs the cmux runner script.

        The instruction is shell-quoted so it reaches the script as one argument.
        """
        escaped = shlex.quote(instruction)
        command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}"
        return [
            TerminalCommand(
                command=command,
                min_timeout_sec=0.0,
                max_timeout_sec=float("inf"),
                block=True,
                append_enter=True,
            )
        ]

0 commit comments

Comments
 (0)