Skip to content

Commit 2e3410d

Browse files
authored
fix: harden cmux agent payload setup (#213)
Extract payload packaging into `cmux_payload` to reuse staging logic. Normalize env defaults, validate required fields, and support colon-separated model names (`provider:model`). Add tests covering env normalization and timeout validation.
1 parent d5c343c commit 2e3410d

File tree

3 files changed

+232
-115
lines changed

3 files changed

+232
-115
lines changed

benchmarks/terminal_bench/cmux_agent.py

Lines changed: 120 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,9 @@
11
from __future__ import annotations
22

3-
import io
43
import os
54
import shlex
6-
import tarfile
7-
import tempfile
85
from pathlib import Path
9-
from typing import Any
6+
from typing import Any, Sequence
107

118
from terminal_bench.agents.base_agent import AgentResult
129
from terminal_bench.agents.installed_agents.abstract_installed_agent import (
@@ -15,6 +12,8 @@
1512
from terminal_bench.terminal.models import TerminalCommand
1613
from terminal_bench.terminal.tmux_session import TmuxSession
1714

15+
from .cmux_payload import build_app_archive, stage_payload
16+
1817

1918
class CmuxAgent(AbstractInstalledAgent):
2019
"""
@@ -25,10 +24,51 @@ class CmuxAgent(AbstractInstalledAgent):
2524
_ARCHIVE_NAME = "cmux-app.tar.gz"
2625
_RUNNER_NAME = "cmux-run.sh"
2726
_DEFAULT_TRUNK = "main"
28-
_DEFAULT_MODEL = "anthropic/claude-sonnet-4-5"
27+
_DEFAULT_MODEL = "anthropic:claude-sonnet-4-5"
28+
_DEFAULT_PROJECT_CANDIDATES = "/workspace:/app:/workspaces:/root/project"
29+
_INCLUDE_PATHS: Sequence[str] = (
30+
"package.json",
31+
"bun.lock",
32+
"bunfig.toml",
33+
"tsconfig.json",
34+
"tsconfig.main.json",
35+
"src",
36+
)
37+
38+
_PROVIDER_ENV_KEYS: Sequence[str] = (
39+
"ANTHROPIC_API_KEY",
40+
"ANTHROPIC_BASE_URL",
41+
"OPENAI_API_KEY",
42+
"OPENAI_BASE_URL",
43+
"OPENAI_API_BASE",
44+
"OPENAI_ORG_ID",
45+
"AZURE_OPENAI_API_KEY",
46+
"AZURE_OPENAI_ENDPOINT",
47+
"AZURE_OPENAI_DEPLOYMENT",
48+
"AZURE_OPENAI_API_VERSION",
49+
)
50+
51+
_CONFIG_ENV_KEYS: Sequence[str] = (
52+
"CMUX_AGENT_GIT_URL",
53+
"CMUX_BUN_INSTALL_URL",
54+
"CMUX_PROJECT_PATH",
55+
"CMUX_PROJECT_CANDIDATES",
56+
"CMUX_TRUNK",
57+
"CMUX_MODEL",
58+
"CMUX_TIMEOUT_MS",
59+
"CMUX_THINKING_LEVEL",
60+
"CMUX_CONFIG_ROOT",
61+
"CMUX_APP_ROOT",
62+
"CMUX_WORKSPACE_ID",
63+
"CMUX_MODE",
64+
)
2965

3066
def __init__(
31-
self, mode: str | None = None, thinking_level: str | None = None, **kwargs: Any
67+
self,
68+
model_name: str = "anthropic:claude-sonnet-4-5",
69+
mode: str | None = None,
70+
thinking_level: str | None = None,
71+
**kwargs: Any,
3272
) -> None:
3373
super().__init__(**kwargs)
3474
repo_root_env = os.environ.get("CMUX_AGENT_REPO_ROOT")
@@ -40,47 +80,27 @@ def __init__(
4080
if not repo_root.exists():
4181
raise RuntimeError(f"cmux repo root {repo_root} does not exist")
4282

83+
runner_path = Path(__file__).with_name(self._RUNNER_NAME)
84+
if not runner_path.is_file():
85+
raise RuntimeError(f"cmux runner script missing at {runner_path}")
86+
87+
self._runner_path = runner_path
4388
self._repo_root = repo_root
4489
self._archive_bytes: bytes | None = None
45-
self._prepared_container_id: str | None = None
90+
self._staged_container_id: str | None = None
4691
self._mode = mode.lower() if mode else None
4792
self._thinking_level = thinking_level.lower() if thinking_level else None
93+
self._model_name = (model_name or "").strip()
4894

4995
@staticmethod
5096
def name() -> str:
5197
return "cmux"
5298

5399
@property
54100
def _env(self) -> dict[str, str]:
55-
keys = [
56-
"ANTHROPIC_API_KEY",
57-
"ANTHROPIC_BASE_URL",
58-
"OPENAI_API_KEY",
59-
"OPENAI_BASE_URL",
60-
"OPENAI_API_BASE",
61-
"OPENAI_ORG_ID",
62-
"AZURE_OPENAI_API_KEY",
63-
"AZURE_OPENAI_ENDPOINT",
64-
"AZURE_OPENAI_DEPLOYMENT",
65-
"AZURE_OPENAI_API_VERSION",
66-
"MISTRAL_API_KEY",
67-
"GOOGLE_API_KEY",
68-
"OPENROUTER_API_KEY",
69-
"CMUX_AGENT_GIT_URL",
70-
"CMUX_BUN_INSTALL_URL",
71-
"CMUX_PROJECT_PATH",
72-
"CMUX_PROJECT_CANDIDATES",
73-
"CMUX_TRUNK",
74-
"CMUX_MODEL",
75-
"CMUX_TIMEOUT_MS",
76-
"CMUX_THINKING_LEVEL",
77-
"CMUX_CONFIG_ROOT",
78-
"CMUX_APP_ROOT",
79-
"CMUX_WORKSPACE_ID",
80-
]
81-
82101
env: dict[str, str] = {}
83-
for key in keys:
102+
103+
for key in (*self._PROVIDER_ENV_KEYS, *self._CONFIG_ENV_KEYS):
84104
value = os.environ.get(key)
85105
if value:
86106
env[key] = value
@@ -92,30 +112,61 @@ def _env(self) -> dict[str, str]:
92112
env.setdefault("CMUX_WORKSPACE_ID", "cmux-bench")
93113
env.setdefault("CMUX_THINKING_LEVEL", "high")
94114
env.setdefault("CMUX_MODE", "exec")
115+
env.setdefault("CMUX_PROJECT_CANDIDATES", self._DEFAULT_PROJECT_CANDIDATES)
95116

96-
model_value = env.get("CMUX_MODEL")
97-
if model_value and "/" in model_value and ":" not in model_value:
117+
model_value = self._model_name or env["CMUX_MODEL"]
118+
model_value = model_value.strip()
119+
if not model_value:
120+
raise ValueError("CMUX_MODEL must be a non-empty string")
121+
if "/" in model_value and ":" not in model_value:
98122
provider, model_name = model_value.split("/", 1)
99-
env["CMUX_MODEL"] = f"{provider}:{model_name}"
100-
101-
thinking_value = self._thinking_level or env.get("CMUX_THINKING_LEVEL")
102-
if thinking_value:
103-
normalized = thinking_value.strip().lower()
104-
if normalized not in {"off", "low", "medium", "high"}:
105-
raise ValueError(
106-
"CMUX_THINKING_LEVEL must be one of off, low, medium, high"
107-
)
108-
env["CMUX_THINKING_LEVEL"] = normalized
109-
110-
mode_value = self._mode or env.get("CMUX_MODE")
111-
if mode_value:
112-
normalized_mode = mode_value.strip().lower()
113-
if normalized_mode in {"exec", "execute"}:
114-
env["CMUX_MODE"] = "exec"
115-
elif normalized_mode == "plan":
116-
env["CMUX_MODE"] = "plan"
117-
else:
118-
raise ValueError("CMUX_MODE must be one of plan, exec, or execute")
123+
model_value = f"{provider}:{model_name}"
124+
env["CMUX_MODEL"] = model_value
125+
126+
thinking_value = self._thinking_level or env["CMUX_THINKING_LEVEL"]
127+
normalized_thinking = thinking_value.strip().lower()
128+
if normalized_thinking not in {"off", "low", "medium", "high"}:
129+
raise ValueError(
130+
"CMUX_THINKING_LEVEL must be one of off, low, medium, high"
131+
)
132+
env["CMUX_THINKING_LEVEL"] = normalized_thinking
133+
134+
mode_value = self._mode or env["CMUX_MODE"]
135+
normalized_mode = mode_value.strip().lower()
136+
if normalized_mode in {"exec", "execute"}:
137+
env["CMUX_MODE"] = "exec"
138+
elif normalized_mode == "plan":
139+
env["CMUX_MODE"] = "plan"
140+
else:
141+
raise ValueError("CMUX_MODE must be one of plan, exec, or execute")
142+
143+
config_root = env["CMUX_CONFIG_ROOT"].strip()
144+
app_root = env["CMUX_APP_ROOT"].strip()
145+
workspace_id = env["CMUX_WORKSPACE_ID"].strip()
146+
project_candidates = env["CMUX_PROJECT_CANDIDATES"].strip()
147+
if not config_root:
148+
raise ValueError("CMUX_CONFIG_ROOT must be set")
149+
if not app_root:
150+
raise ValueError("CMUX_APP_ROOT must be set")
151+
if not workspace_id:
152+
raise ValueError("CMUX_WORKSPACE_ID must be set")
153+
if not project_candidates:
154+
raise ValueError("CMUX_PROJECT_CANDIDATES must be set")
155+
env["CMUX_CONFIG_ROOT"] = config_root
156+
env["CMUX_APP_ROOT"] = app_root
157+
env["CMUX_WORKSPACE_ID"] = workspace_id
158+
env["CMUX_PROJECT_CANDIDATES"] = project_candidates
159+
160+
timeout_value = env.get("CMUX_TIMEOUT_MS")
161+
if timeout_value:
162+
timeout_value = timeout_value.strip()
163+
if not timeout_value.isdigit():
164+
raise ValueError("CMUX_TIMEOUT_MS must be an integer expressed in ms")
165+
env["CMUX_TIMEOUT_MS"] = timeout_value
166+
167+
project_path = env.get("CMUX_PROJECT_PATH")
168+
if project_path is not None and not project_path.strip():
169+
raise ValueError("CMUX_PROJECT_PATH must be non-empty when provided")
119170

120171
return env
121172

@@ -132,80 +183,34 @@ def perform_task(
132183
if not instruction or not instruction.strip():
133184
raise ValueError("instruction must be a non-empty string")
134185

135-
self._prepare_payloads(session)
186+
self._ensure_payload_staged(session)
136187
return super().perform_task(
137188
instruction=instruction, session=session, logging_dir=logging_dir
138189
)
139190

140-
def _prepare_payloads(self, session: TmuxSession) -> None:
191+
def _ensure_payload_staged(self, session: TmuxSession) -> None:
141192
container_id = getattr(session.container, "id", None)
142-
if container_id and container_id == self._prepared_container_id:
193+
if container_id and container_id == self._staged_container_id:
143194
return
144195

145196
archive = self._build_archive()
146-
temp_path: Path | None = None
147-
try:
148-
with tempfile.NamedTemporaryFile(
149-
suffix=".tar.gz", delete=False
150-
) as temp_file:
151-
temp_file.write(archive)
152-
temp_path = Path(temp_file.name)
153-
except Exception as error:
154-
raise RuntimeError(
155-
f"failed to materialize cmux archive: {error}"
156-
) from error
157-
158-
try:
159-
assert temp_path is not None, "temporary archive path missing"
160-
session.copy_to_container(
161-
paths=temp_path,
162-
container_dir="/installed-agent",
163-
container_filename=self._ARCHIVE_NAME,
164-
)
165-
finally:
166-
if temp_path is not None:
167-
temp_path.unlink(missing_ok=True)
168-
169-
runner_path = Path(__file__).with_name(self._RUNNER_NAME)
170-
if not runner_path.exists():
171-
raise RuntimeError(f"cmux runner script missing at {runner_path}")
172-
173-
session.copy_to_container(
174-
paths=runner_path,
175-
container_dir="/installed-agent",
176-
container_filename=self._RUNNER_NAME,
197+
stage_payload(
198+
session=session,
199+
archive_bytes=archive,
200+
archive_name=self._ARCHIVE_NAME,
201+
runner_path=self._runner_path,
177202
)
178203

179204
if container_id:
180-
self._prepared_container_id = container_id
205+
self._staged_container_id = container_id
181206

182207
def _build_archive(self) -> bytes:
183208
if self._archive_bytes is not None:
184209
return self._archive_bytes
185210

186-
include_paths = [
187-
"package.json",
188-
"bun.lock",
189-
"bunfig.toml",
190-
"tsconfig.json",
191-
"tsconfig.main.json",
192-
"src",
193-
]
194-
195-
buffer = io.BytesIO()
196-
with tarfile.open(fileobj=buffer, mode="w:gz") as tar:
197-
for relative in include_paths:
198-
source_path = self._repo_root / relative
199-
if not source_path.exists():
200-
raise FileNotFoundError(f"Required file {source_path} not found")
201-
tar.add(
202-
source_path,
203-
arcname=relative,
204-
recursive=True,
205-
)
206-
buffer.seek(0)
207-
self._archive_bytes = buffer.getvalue()
208-
return self._archive_bytes
211+
archive = build_app_archive(self._repo_root, self._INCLUDE_PATHS)
212+
self._archive_bytes = archive
213+
return archive
209214

210215
def _run_agent_commands(self, instruction: str) -> list[TerminalCommand]:
211216
escaped = shlex.quote(instruction)
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from __future__ import annotations
2+
3+
from pathlib import Path
4+
5+
import pytest
6+
7+
from .cmux_agent import CmuxAgent
8+
9+
10+
@pytest.fixture(autouse=True)
def _clear_cmux_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """Unset the listed CMUX_* configuration variables for each test.

    Declared with ``autouse=True``, so it runs without being requested.
    Variables that are already absent are ignored (``raising=False``).
    """
    cmux_variables = (
        "CMUX_AGENT_GIT_URL",
        "CMUX_BUN_INSTALL_URL",
        "CMUX_PROJECT_PATH",
        "CMUX_PROJECT_CANDIDATES",
        "CMUX_TRUNK",
        "CMUX_MODEL",
        "CMUX_TIMEOUT_MS",
        "CMUX_THINKING_LEVEL",
        "CMUX_CONFIG_ROOT",
        "CMUX_APP_ROOT",
        "CMUX_WORKSPACE_ID",
        "CMUX_MODE",
    )
    for variable in cmux_variables:
        monkeypatch.delenv(variable, raising=False)
28+
29+
30+
def _repo_root() -> Path:
    """Return the directory two levels above this file's directory."""
    this_file = Path(__file__).resolve()
    # parents[0] is the containing directory; parents[2] is two levels up.
    return this_file.parents[2]
32+
33+
34+
def test_env_defaults_are_normalized(monkeypatch: pytest.MonkeyPatch) -> None:
    """A slash-separated model name becomes colon-separated; defaults apply."""
    monkeypatch.setenv("CMUX_AGENT_REPO_ROOT", str(_repo_root()))

    agent = CmuxAgent(model_name="anthropic/claude-sonnet-4-5")
    resolved = agent._env

    expected = {
        "CMUX_MODEL": "anthropic:claude-sonnet-4-5",
        "CMUX_THINKING_LEVEL": "high",
        "CMUX_MODE": "exec",
        "CMUX_PROJECT_CANDIDATES": agent._DEFAULT_PROJECT_CANDIDATES,
    }
    for key, value in expected.items():
        assert resolved[key] == value
44+
45+
46+
def test_timeout_must_be_numeric(monkeypatch: pytest.MonkeyPatch) -> None:
    """A non-numeric ``CMUX_TIMEOUT_MS`` makes reading ``_env`` raise ValueError."""
    overrides = (
        ("CMUX_AGENT_REPO_ROOT", str(_repo_root())),
        ("CMUX_TIMEOUT_MS", "not-a-number"),
    )
    for variable, value in overrides:
        monkeypatch.setenv(variable, value)

    agent = CmuxAgent()
    with pytest.raises(ValueError):
        # Reading the property is what triggers validation.
        agent._env

0 commit comments

Comments
 (0)