Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -87,3 +87,10 @@ CODE_CHANGES.md
README_COMPACT_HERE.md
artifacts/
tests/e2e/tmp/
runs/

# Python
__pycache__

tmpfork
.cmux-agent-cli
16 changes: 14 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ include fmt.mk
.PHONY: test test-unit test-integration test-watch test-coverage test-e2e
.PHONY: dist dist-mac dist-win dist-linux
.PHONY: docs docs-build docs-watch
.PHONY: benchmark-terminal
.PHONY: ensure-deps

TS_SOURCES := $(shell find src -type f \( -name '*.ts' -o -name '*.tsx' \))
Expand Down Expand Up @@ -174,6 +175,19 @@ docs-build: ## Build documentation
docs-watch: ## Watch and rebuild documentation
@cd docs && mdbook watch

## Benchmarks
benchmark-terminal: ## Run Terminal-Bench with the cmux agent (use TB_DATASET/TB_ARGS to customize)
@TB_DATASET=$${TB_DATASET:-terminal-bench-core==0.1.1}; \
CONCURRENCY_FLAG=$${TB_CONCURRENCY:+--n-concurrent $$TB_CONCURRENCY}; \
LIVESTREAM_FLAG=$${TB_LIVESTREAM:+--livestream}; \
echo "Running Terminal-Bench with dataset $$TB_DATASET"; \
uvx terminal-bench run \
--dataset "$$TB_DATASET" \
--agent-import-path benchmarks.terminal_bench.cmux_agent:CmuxAgent \
$$CONCURRENCY_FLAG \
$$LIVESTREAM_FLAG \
$${TB_ARGS}

## Clean
clean: ## Clean build artifacts
@echo "Cleaning build artifacts..."
Expand All @@ -182,5 +196,3 @@ clean: ## Clean build artifacts

# Parallel build optimization - these can run concurrently
.NOTPARALLEL: build-main # TypeScript can handle its own parallelism


3 changes: 3 additions & 0 deletions benchmarks/terminal_bench/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .cmux_agent import CmuxAgent

__all__ = ["CmuxAgent"]
117 changes: 117 additions & 0 deletions benchmarks/terminal_bench/cmux-run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#!/usr/bin/env bash

set -euo pipefail

log() {
printf '[cmux-run] %s\n' "$1"
}

fatal() {
printf '[cmux-run] ERROR: %s\n' "$1" >&2
exit 1
}

instruction=${1:-}
if [[ -z "${instruction}" ]]; then
fatal "instruction argument is required"
fi

export BUN_INSTALL="${BUN_INSTALL:-/root/.bun}"
export PATH="${BUN_INSTALL}/bin:${PATH}"

CMUX_APP_ROOT="${CMUX_APP_ROOT:-/opt/cmux-app}"
CMUX_CONFIG_ROOT="${CMUX_CONFIG_ROOT:-/root/.cmux}"
CMUX_PROJECT_PATH="${CMUX_PROJECT_PATH:-}"
CMUX_PROJECT_CANDIDATES="${CMUX_PROJECT_CANDIDATES:-/workspace:/app:/workspaces:/root/project}"
CMUX_MODEL="${CMUX_MODEL:-anthropic:claude-sonnet-4-5}"
CMUX_TIMEOUT_MS="${CMUX_TIMEOUT_MS:-}"
CMUX_TRUNK="${CMUX_TRUNK:-main}"
CMUX_WORKSPACE_ID="${CMUX_WORKSPACE_ID:-cmux-bench}"
CMUX_THINKING_LEVEL="${CMUX_THINKING_LEVEL:-high}"
CMUX_MODE="${CMUX_MODE:-exec}"

ensure_bun() {
if ! command -v bun >/dev/null 2>&1; then
fatal "bun must be installed before running the cmux agent"
fi
}

resolve_project_path() {
if [[ -n "${CMUX_PROJECT_PATH}" ]]; then
if [[ -d "${CMUX_PROJECT_PATH}" ]]; then
printf '%s\n' "${CMUX_PROJECT_PATH}"
return 0
fi
fatal "CMUX_PROJECT_PATH=${CMUX_PROJECT_PATH} not found"
fi

IFS=":" read -r -a candidates <<<"${CMUX_PROJECT_CANDIDATES}"
for candidate in "${candidates[@]}"; do
if [[ -d "${candidate}" ]]; then
printf '%s\n' "${candidate}"
return 0
fi
done

fatal "no project path located (searched ${CMUX_PROJECT_CANDIDATES})"
}

ensure_git_repo() {
local project_path=$1

if command -v git >/dev/null 2>&1; then
if git -C "${project_path}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
# Ensure trunk branch exists even on pre-existing repos.
if ! git -C "${project_path}" rev-parse --verify "${CMUX_TRUNK}" >/dev/null 2>&1; then
git -C "${project_path}" checkout -b "${CMUX_TRUNK}" >/dev/null 2>&1 || true
else
git -C "${project_path}" checkout "${CMUX_TRUNK}" >/dev/null 2>&1 || true
fi
return 0
fi

log "initialising git repository at ${project_path}"
if git -C "${project_path}" init --initial-branch="${CMUX_TRUNK}" >/dev/null 2>&1; then
:
else
git -C "${project_path}" init >/dev/null
git -C "${project_path}" checkout -B "${CMUX_TRUNK}" >/dev/null
fi
git -C "${project_path}" config user.name "cmux-bench"
git -C "${project_path}" config user.email "bench@cmux.local"
git -C "${project_path}" add -A >/dev/null
git -C "${project_path}" commit -m "chore: initial snapshot" --allow-empty >/dev/null
git -C "${project_path}" branch -M "${CMUX_TRUNK}" >/dev/null
else
log "git not available; skipping repository initialisation"
fi
}

ensure_bun
project_path=$(resolve_project_path)
ensure_git_repo "${project_path}"

bun --version >/dev/null 2>&1 || fatal "bun not available after ensure_bun"

log "starting cmux agent session for ${project_path}"
cd "${CMUX_APP_ROOT}"

cmd=(bun src/debug/agentSessionCli.ts
--config-root "${CMUX_CONFIG_ROOT}"
--project-path "${project_path}"
--workspace-path "${project_path}"
--workspace-id "${CMUX_WORKSPACE_ID}"
--model "${CMUX_MODEL}"
--mode "${CMUX_MODE}")

if [[ -n "${CMUX_TIMEOUT_MS}" ]]; then
cmd+=(--timeout "${CMUX_TIMEOUT_MS}")
fi

if [[ -n "${CMUX_THINKING_LEVEL}" ]]; then
cmd+=(--thinking-level "${CMUX_THINKING_LEVEL}")
fi

if ! printf '%s' "${instruction}" | "${cmd[@]}"; then
fatal "cmux agent session failed"
fi
221 changes: 221 additions & 0 deletions benchmarks/terminal_bench/cmux_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
from __future__ import annotations

import io
import os
import shlex
import tarfile
import tempfile
from pathlib import Path
from typing import Any

from terminal_bench.agents.base_agent import AgentResult
from terminal_bench.agents.installed_agents.abstract_installed_agent import (
AbstractInstalledAgent,
)
from terminal_bench.terminal.models import TerminalCommand
from terminal_bench.terminal.tmux_session import TmuxSession


class CmuxAgent(AbstractInstalledAgent):
"""
Minimal Terminal-Bench adapter that installs cmux into the task container and
forwards the benchmark instruction to the cmux headless runner.
"""

_ARCHIVE_NAME = "cmux-app.tar.gz"
_RUNNER_NAME = "cmux-run.sh"
_DEFAULT_TRUNK = "main"
_DEFAULT_MODEL = "anthropic/claude-sonnet-4-5"

def __init__(
self, mode: str | None = None, thinking_level: str | None = None, **kwargs: Any
) -> None:
super().__init__(**kwargs)
repo_root_env = os.environ.get("CMUX_AGENT_REPO_ROOT")
repo_root = (
Path(repo_root_env).resolve()
if repo_root_env
else Path(__file__).resolve().parents[2]
)
if not repo_root.exists():
raise RuntimeError(f"cmux repo root {repo_root} does not exist")

self._repo_root = repo_root
self._archive_bytes: bytes | None = None
self._prepared_container_id: str | None = None
self._mode = mode.lower() if mode else None
self._thinking_level = thinking_level.lower() if thinking_level else None

@staticmethod
def name() -> str:
return "cmux"

@property
def _env(self) -> dict[str, str]:
keys = [
"ANTHROPIC_API_KEY",
"ANTHROPIC_BASE_URL",
"OPENAI_API_KEY",
"OPENAI_BASE_URL",
"OPENAI_API_BASE",
"OPENAI_ORG_ID",
"AZURE_OPENAI_API_KEY",
"AZURE_OPENAI_ENDPOINT",
"AZURE_OPENAI_DEPLOYMENT",
"AZURE_OPENAI_API_VERSION",
"MISTRAL_API_KEY",
"GOOGLE_API_KEY",
"OPENROUTER_API_KEY",
"CMUX_AGENT_GIT_URL",
"CMUX_BUN_INSTALL_URL",
"CMUX_PROJECT_PATH",
"CMUX_PROJECT_CANDIDATES",
"CMUX_TRUNK",
"CMUX_MODEL",
"CMUX_TIMEOUT_MS",
"CMUX_THINKING_LEVEL",
"CMUX_CONFIG_ROOT",
"CMUX_APP_ROOT",
"CMUX_WORKSPACE_ID",
]

env: dict[str, str] = {}
for key in keys:
value = os.environ.get(key)
if value:
env[key] = value

env.setdefault("CMUX_TRUNK", self._DEFAULT_TRUNK)
env.setdefault("CMUX_MODEL", self._DEFAULT_MODEL)
env.setdefault("CMUX_CONFIG_ROOT", "/root/.cmux")
env.setdefault("CMUX_APP_ROOT", "/opt/cmux-app")
env.setdefault("CMUX_WORKSPACE_ID", "cmux-bench")
env.setdefault("CMUX_THINKING_LEVEL", "high")
env.setdefault("CMUX_MODE", "exec")

model_value = env.get("CMUX_MODEL")
if model_value and "/" in model_value and ":" not in model_value:
provider, model_name = model_value.split("/", 1)
env["CMUX_MODEL"] = f"{provider}:{model_name}"

thinking_value = self._thinking_level or env.get("CMUX_THINKING_LEVEL")
if thinking_value:
normalized = thinking_value.strip().lower()
if normalized not in {"off", "low", "medium", "high"}:
raise ValueError(
"CMUX_THINKING_LEVEL must be one of off, low, medium, high"
)
env["CMUX_THINKING_LEVEL"] = normalized

mode_value = self._mode or env.get("CMUX_MODE")
if mode_value:
normalized_mode = mode_value.strip().lower()
if normalized_mode in {"exec", "execute"}:
env["CMUX_MODE"] = "exec"
elif normalized_mode == "plan":
env["CMUX_MODE"] = "plan"
else:
raise ValueError("CMUX_MODE must be one of plan, exec, or execute")

return env

@property
def _install_agent_script_path(self) -> Path:
return self._get_templated_script_path("cmux_setup.sh.j2")

def perform_task(
self,
instruction: str,
session: TmuxSession,
logging_dir=None,
) -> AgentResult:
if not instruction or not instruction.strip():
raise ValueError("instruction must be a non-empty string")

self._prepare_payloads(session)
return super().perform_task(
instruction=instruction, session=session, logging_dir=logging_dir
)

def _prepare_payloads(self, session: TmuxSession) -> None:
container_id = getattr(session.container, "id", None)
if container_id and container_id == self._prepared_container_id:
return

archive = self._build_archive()
temp_path: Path | None = None
try:
with tempfile.NamedTemporaryFile(
suffix=".tar.gz", delete=False
) as temp_file:
temp_file.write(archive)
temp_path = Path(temp_file.name)
except Exception as error:
raise RuntimeError(
f"failed to materialize cmux archive: {error}"
) from error

try:
assert temp_path is not None, "temporary archive path missing"
session.copy_to_container(
paths=temp_path,
container_dir="/installed-agent",
container_filename=self._ARCHIVE_NAME,
)
finally:
if temp_path is not None:
temp_path.unlink(missing_ok=True)

runner_path = Path(__file__).with_name(self._RUNNER_NAME)
if not runner_path.exists():
raise RuntimeError(f"cmux runner script missing at {runner_path}")

session.copy_to_container(
paths=runner_path,
container_dir="/installed-agent",
container_filename=self._RUNNER_NAME,
)

if container_id:
self._prepared_container_id = container_id

def _build_archive(self) -> bytes:
if self._archive_bytes is not None:
return self._archive_bytes

include_paths = [
"package.json",
"bun.lock",
"bunfig.toml",
"tsconfig.json",
"tsconfig.main.json",
"src",
]

buffer = io.BytesIO()
with tarfile.open(fileobj=buffer, mode="w:gz") as tar:
for relative in include_paths:
source_path = self._repo_root / relative
if not source_path.exists():
raise FileNotFoundError(f"Required file {source_path} not found")
tar.add(
source_path,
arcname=relative,
recursive=True,
)
buffer.seek(0)
self._archive_bytes = buffer.getvalue()
return self._archive_bytes

def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
escaped = shlex.quote(instruction)
command = f"bash /installed-agent/{self._RUNNER_NAME} {escaped}"
return [
TerminalCommand(
command=command,
min_timeout_sec=0.0,
max_timeout_sec=float("inf"),
block=True,
append_enter=True,
)
]
Loading