Commit b9dc721

feat(gateway): add e2e test for cached input cost validation (#904)
Extended e2e chat tests to validate cached input cost functionality. Added logic to verify that cached input cost is greater than zero when supported.

Summary by CodeRabbit

- New Features
  - Added prompt caching support for Claude/Anthropic models, including ephemeral cache markers and updated headers.
  - Enhanced token usage and cost calculations to account for cached input tokens; UI now shows cached tokens and cached input cost in logs.
- Tests
  - Introduced end-to-end tests validating caching behavior and cost reporting across providers.
- Documentation
  - Added guidance on strict TypeScript usage in contributor docs.
- Chores
  - Optimized test scripts to skip building UI/docs during test runs.
- Refactor
  - Removed debug logging and streamlined parsing paths without changing behavior.
1 parent 6661518 commit b9dc721

File tree: 14 files changed, +399 -42 lines changed

CLAUDE.md

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,10 @@ Always run `pnpm format` before committing code. Run `pnpm generate` if API rout
 - `pnpm lint` - Check linting and formatting (without fixing)
 - `pnpm generate` - Regenerate OpenAPI schemas from API routes
 
+### Writing code
+
+This is a pure TypeScript project. Never use `any` or `as any` unless absolutely necessary.
+
 ### Testing
 
 NOTE: these commands can only be run in the root directory of the repository, not in individual app directories.
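As a quick illustration of the new rule (hypothetical snippet, not part of this commit), prefer schema validation or proper narrowing over `as any` when handling untyped input; zod, which the gateway already uses, makes this straightforward:

import { z } from "zod";

// Avoid: const user = JSON.parse(raw) as any;
// Prefer: validate and infer a concrete type.
const userSchema = z.object({ id: z.string(), name: z.string() });
type User = z.infer<typeof userSchema>;

function parseUser(raw: string): User {
	// fails loudly if the shape is wrong instead of silently lying about the type
	return userSchema.parse(JSON.parse(raw));
}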

apps/gateway/src/anthropic/anthropic.ts

Lines changed: 10 additions & 11 deletions
@@ -18,6 +18,11 @@ const anthropicMessageSchema = z.object({
 		z.object({
 			type: z.literal("text"),
 			text: z.string(),
+			cache_control: z
+				.object({
+					type: z.enum(["ephemeral"]),
+				})
+				.optional(),
 		}),
 		z.object({
 			type: z.literal("image"),
@@ -91,6 +96,11 @@ const anthropicRequestSchema = z.object({
 			z.object({
 				type: z.literal("text"),
 				text: z.string(),
+				cache_control: z
+					.object({
+						type: z.enum(["ephemeral"]),
+					})
+					.optional(),
 			}),
 		),
 	])
@@ -178,31 +188,20 @@ anthropic.openapi(messages, async (c) => {
 	try {
 		rawRequest = await c.req.json();
 	} catch (error) {
-		// console.log("Failed to parse JSON from request:", error);
 		throw new HTTPException(400, {
 			message: `Invalid JSON in request body: ${error}`,
 		});
 	}
 
-	// console.log("Raw Anthropic request:", JSON.stringify(rawRequest, null, 2));
-
 	// Validate with our schema
 	const validation = anthropicRequestSchema.safeParse(rawRequest);
 	if (!validation.success) {
-		// console.log(
-		// 	"Anthropic request validation failed:",
-		// 	JSON.stringify(validation.error.issues, null, 2),
-		// );
 		throw new HTTPException(400, {
 			message: `Invalid request format: ${validation.error.issues.map((issue) => `${issue.path.join(".")}: ${issue.message}`).join(", ")}`,
 		});
 	}
 
 	const anthropicRequest: AnthropicRequest = validation.data;
-	// console.log(
-	// 	"Validated Anthropic request:",
-	// 	JSON.stringify(anthropicRequest, null, 2),
-	// );
 
 	// Transform Anthropic request to OpenAI format
 	const openaiMessages: Array<Record<string, unknown>> = [];
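With the optional cache_control field now accepted by the schema, a text content block that opts into Anthropic's ephemeral prompt cache would look roughly like this (illustrative TypeScript object; only the field names come from the schema above, the surrounding message shape is assumed):

const cachedSystemBlock = {
	type: "text",
	text: "Long, reusable system context goes here...",
	// marks this block for Anthropic's ephemeral prompt cache
	cache_control: { type: "ephemeral" },
};

On a repeat request containing the same cached block, Anthropic reports the reused tokens as cache_read_input_tokens, which the usage parsing further down in this commit picks up.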

apps/gateway/src/chat-cache.e2e.ts

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
+import "dotenv/config";
+import { beforeAll, beforeEach, describe, expect, test } from "vitest";
+
+import {
+	beforeAllHook,
+	beforeEachHook,
+	getConcurrentTestOptions,
+	getTestOptions,
+	logMode,
+	testModels,
+	validateLogByRequestId,
+	validateResponse,
+} from "@/chat-helpers.e2e.js";
+
+import { models } from "@llmgateway/models";
+
+import { app } from "./app.js";
+
+import type { ProviderModelMapping } from "@llmgateway/models";
+
+// Helper function to generate unique request IDs for tests
+export function generateTestRequestId(): string {
+	return `test-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+}
+
+describe("e2e", getConcurrentTestOptions(), () => {
+	beforeAll(beforeAllHook);
+
+	beforeEach(beforeEachHook);
+
+	test("empty", () => {
+		expect(true).toBe(true);
+	});
+
+	if (process.env.CACHE_MODE === "true") {
+		test.each(testModels)(
+			"completions with cache checks $model",
+			getTestOptions(),
+			async ({ model, originalModel }) => {
+				// Use a long prompt to trigger caching mechanism (1024+ tokens) for models that support it
+				const longPrompt = `You are a helpful assistant. Please analyze the following long text carefully and provide insights. ${"This is a very detailed example text that needs to be quite long to trigger caching mechanisms which require at least 1024 tokens. ".repeat(50)} Just reply 'OK' after processing this text.`;
+
+				const requestId = generateTestRequestId();
+				const res = await app.request("/v1/chat/completions", {
+					method: "POST",
+					headers: {
+						"Content-Type": "application/json",
+						"x-request-id": requestId,
+						Authorization: `Bearer real-token`,
+					},
+					body: JSON.stringify({
+						model: model,
+						messages: [
+							{
+								role: "system",
+								content: longPrompt,
+							},
+							{
+								role: "user",
+								content: "Hello, just reply 'OK'!",
+							},
+						],
+					}),
+				});
+
+				const json = await res.json();
+				if (logMode) {
+					console.log("response:", JSON.stringify(json, null, 2));
+				}
+
+				expect(res.status).toBe(200);
+				validateResponse(json);
+
+				const log = await validateLogByRequestId(requestId);
+				expect(log.streamed).toBe(false);
+
+				expect(json).toHaveProperty("usage");
+				expect(json.usage).toHaveProperty("prompt_tokens");
+				expect(json.usage).toHaveProperty("completion_tokens");
+				expect(json.usage).toHaveProperty("total_tokens");
+				expect(typeof json.usage.prompt_tokens).toBe("number");
+				expect(typeof json.usage.completion_tokens).toBe("number");
+				expect(typeof json.usage.total_tokens).toBe("number");
+				expect(json.usage.prompt_tokens).toBeGreaterThan(0);
+				expect(json.usage.completion_tokens).toBeGreaterThan(0);
+				expect(json.usage.total_tokens).toBeGreaterThan(0);
+
+				// expect(log.inputCost).not.toBeNull();
+				// expect(log.outputCost).not.toBeNull();
+				// expect(log.cost).not.toBeNull();
+
+				const originalModelProviderMapping = models
+					.find((m) => m.id === originalModel)
+					?.providers.find(
+						(p) => p.providerId === log.usedProvider,
+					) as ProviderModelMapping;
+				if (originalModelProviderMapping.cachedInputPrice) {
+					// for models that support cached input cost, make the same request twice with a long prompt (1024+ tokens) to trigger caching
+					const secondRequestId = generateTestRequestId();
+					const secondRes = await app.request("/v1/chat/completions", {
+						method: "POST",
+						headers: {
+							"Content-Type": "application/json",
+							"x-request-id": secondRequestId,
+							Authorization: `Bearer real-token`,
+						},
+						body: JSON.stringify({
+							model: model,
+							messages: [
+								{
+									role: "system",
+									content: longPrompt,
+								},
+								{
+									role: "user",
+									content: "Hello, just reply 'OK'!",
+								},
+							],
+						}),
+					});
+					const secondJson = await secondRes.json();
+					if (logMode) {
+						console.log(
+							"second response:",
+							JSON.stringify(secondJson, null, 2),
+						);
+					}
+					const secondLog = await validateLogByRequestId(secondRequestId);
+					console.log("Second request log for caching test:", {
+						cachedInputCost: secondLog.cachedInputCost,
+						cachedTokens: secondLog.cachedTokens,
+						provider: log.usedProvider,
+						inputCost: secondLog.inputCost,
+						totalCost: secondLog.cost,
+					});
+
+					expect(secondLog.cachedInputCost).toBeGreaterThan(0);
+				}
+			},
+		);
+	}
+});
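The final assertion (secondLog.cachedInputCost > 0) depends on cached tokens being priced at the provider's cachedInputPrice. The cost calculation itself is not shown in this diff; a minimal sketch of the idea, assuming hypothetical per-token inputPrice and cachedInputPrice fields and a simple split between cached and non-cached prompt tokens:

// Hypothetical helper, not from this repo: prices the cached and non-cached
// portions of the prompt at their respective per-token rates.
function estimateInputCosts(
	promptTokens: number,
	cachedTokens: number,
	inputPrice: number, // assumed: price per regular input token
	cachedInputPrice: number, // assumed: discounted price per cached input token
) {
	const cachedInputCost = cachedTokens * cachedInputPrice;
	const inputCost = (promptTokens - cachedTokens) * inputPrice;
	return { inputCost, cachedInputCost };
}

Sending the same 1024+ token prompt twice should yield cachedTokens > 0 on the second request, so cachedInputCost comes out greater than zero, which is exactly what the test asserts.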

apps/gateway/src/chat/chat.ts

Lines changed: 8 additions & 2 deletions
@@ -2234,6 +2234,7 @@ chat.openapi(completions, async (c) => {
 					data,
 					usedProvider,
 					fullContent,
+					usedModel,
 				);
 
 				// If we have usage data from Google, add it to the streaming chunk
@@ -2399,7 +2400,12 @@
 			}
 
 			// Extract token usage using helper function
-			const usage = extractTokenUsage(data, usedProvider, fullContent);
+			const usage = extractTokenUsage(
+				data,
+				usedProvider,
+				fullContent,
+				usedModel,
+			);
 			if (usage.promptTokens !== null) {
 				promptTokens = usage.promptTokens;
 			}
@@ -3001,7 +3007,7 @@
 		cachedTokens,
 		toolResults,
 		images,
-	} = parseProviderResponse(usedProvider, json, messages);
+	} = parseProviderResponse(usedProvider, json, messages, usedModel);
 
 	// Debug: Log images found in response
 	logger.debug("Gateway - parseProviderResponse extracted images", { images });

apps/gateway/src/chat/tools/extract-token-usage.ts

Lines changed: 38 additions & 7 deletions
@@ -9,6 +9,7 @@ export function extractTokenUsage(
 	data: any,
 	provider: Provider,
 	fullContent?: string,
+	usedModel?: string,
 ) {
 	let promptTokens = null;
 	let completionTokens = null;
@@ -44,20 +45,50 @@
 			break;
 		case "anthropic":
 			if (data.usage) {
-				promptTokens = data.usage.input_tokens ?? null;
+				// For Anthropic: input_tokens are the non-cached tokens
+				// We need to add cache_creation_input_tokens to get total input tokens
+				const inputTokens = data.usage.input_tokens ?? 0;
+				const cacheCreationTokens = data.usage.cache_creation_input_tokens ?? 0;
+				const cacheReadTokens = data.usage.cache_read_input_tokens ?? 0;
+
+				// Total prompt tokens = non-cached + cache creation + cache read
+				promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
 				completionTokens = data.usage.output_tokens ?? null;
 				reasoningTokens = data.usage.reasoning_output_tokens ?? null;
-				cachedTokens = data.usage.cache_read_input_tokens ?? null;
+				// Cached tokens are the tokens read from cache (discount applies to these)
+				cachedTokens = cacheReadTokens || null;
 				totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0);
 			}
 			break;
 		default: // OpenAI format
 			if (data.usage) {
-				promptTokens = data.usage.prompt_tokens ?? null;
-				completionTokens = data.usage.completion_tokens ?? null;
-				totalTokens = data.usage.total_tokens ?? null;
-				reasoningTokens = data.usage.reasoning_tokens ?? null;
-				cachedTokens = data.usage.prompt_tokens_details?.cached_tokens ?? null;
+				// Special handling for routeway-discount claude models (use Anthropic-style parsing)
+				if (
+					provider === "routeway-discount" &&
+					usedModel?.startsWith("claude-")
+				) {
+					// Use Anthropic-style token parsing for claude models
+					const inputTokens = data.usage.input_tokens ?? 0;
+					const cacheCreationTokens =
+						data.usage.cache_creation_input_tokens ?? 0;
+					const cacheReadTokens = data.usage.cache_read_input_tokens ?? 0;
+
+					// Total prompt tokens = non-cached + cache creation + cache read
+					promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
+					completionTokens = data.usage.output_tokens ?? null;
+					reasoningTokens = data.usage.reasoning_output_tokens ?? null;
+					// Cached tokens are the tokens read from cache (discount applies to these)
+					cachedTokens = cacheReadTokens || null;
+					totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0);
+				} else {
+					// Standard OpenAI-style token parsing
+					promptTokens = data.usage.prompt_tokens ?? null;
+					completionTokens = data.usage.completion_tokens ?? null;
+					totalTokens = data.usage.total_tokens ?? null;
+					reasoningTokens = data.usage.reasoning_tokens ?? null;
+					cachedTokens =
+						data.usage.prompt_tokens_details?.cached_tokens ?? null;
+				}
 			}
 			break;
 	}
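To make the new accounting concrete, here is a small sketch of what extractTokenUsage should now return for an Anthropic-style usage payload on a cache-hit request (the numbers are invented; the arithmetic mirrors the code above):

// Hypothetical usage block from a second (cache-hit) Anthropic response
const data = {
	usage: {
		input_tokens: 20, // non-cached prompt tokens
		cache_creation_input_tokens: 0, // nothing new written to the cache
		cache_read_input_tokens: 1100, // prompt tokens served from the cache
		output_tokens: 5,
	},
};

const usage = extractTokenUsage(data, "anthropic");
// promptTokens     = 20 + 0 + 1100 = 1120
// cachedTokens     = 1100 (cache_read_input_tokens, billed at the discounted rate)
// completionTokens = 5
// totalTokens      = 1120 + 5 = 1125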

apps/gateway/src/chat/tools/parse-provider-response.ts

Lines changed: 47 additions & 12 deletions
@@ -10,6 +10,7 @@ export function parseProviderResponse(
 	usedProvider: Provider,
 	json: any,
 	messages: any[] = [],
+	usedModel?: string,
 ) {
 	let content = null;
 	let reasoningContent = null;
@@ -38,14 +39,25 @@
 				thinkingBlocks.map((block: any) => block.thinking).join("") || null;
 
 			finishReason = json.stop_reason || null;
-			promptTokens = json.usage?.input_tokens || null;
-			completionTokens = json.usage?.output_tokens || null;
-			reasoningTokens = json.usage?.reasoning_output_tokens || null;
-			cachedTokens = json.usage?.cache_read_input_tokens || null;
-			totalTokens =
-				json.usage?.input_tokens && json.usage?.output_tokens
-					? json.usage.input_tokens + json.usage.output_tokens
-					: null;
+
+			// For Anthropic: input_tokens are the non-cached tokens
+			// We need to add cache_creation_input_tokens to get total input tokens
+			if (json.usage) {
+				const inputTokens = json.usage.input_tokens || 0;
+				const cacheCreationTokens = json.usage.cache_creation_input_tokens || 0;
+				const cacheReadTokens = json.usage.cache_read_input_tokens || 0;
+
+				// Total prompt tokens = non-cached + cache creation + cache read
+				promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
+				completionTokens = json.usage.output_tokens || null;
+				reasoningTokens = json.usage.reasoning_output_tokens || null;
+				// Cached tokens are the tokens read from cache (discount applies to these)
+				cachedTokens = cacheReadTokens || null;
+				totalTokens =
+					promptTokens && completionTokens
+						? promptTokens + completionTokens
+						: null;
+			}
 			// Extract tool calls from Anthropic format
 			toolResults =
 				json.content
@@ -273,10 +285,33 @@
 				}
 			}
 
-			promptTokens = json.usage?.prompt_tokens || null;
-			completionTokens = json.usage?.completion_tokens || null;
-			reasoningTokens = json.usage?.reasoning_tokens || null;
-			cachedTokens = json.usage?.prompt_tokens_details?.cached_tokens || null;
+			// Special handling for routeway-discount claude models (use Anthropic-style parsing)
+			if (
+				usedProvider === "routeway-discount" &&
+				usedModel?.startsWith("claude-")
+			) {
+				// Use Anthropic-style token parsing for claude models
+				if (json.usage) {
+					const inputTokens = json.usage.input_tokens || 0;
+					const cacheCreationTokens =
+						json.usage.cache_creation_input_tokens || 0;
+					const cacheReadTokens = json.usage.cache_read_input_tokens || 0;
+
+					// Total prompt tokens = non-cached + cache creation + cache read
+					promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
+					completionTokens = json.usage.output_tokens || null;
+					reasoningTokens = json.usage.reasoning_output_tokens || null;
+					// Cached tokens are the tokens read from cache (discount applies to these)
+					cachedTokens = cacheReadTokens || null;
+				}
+			} else {
+				// Standard OpenAI-style token parsing
+				promptTokens = json.usage?.prompt_tokens || null;
+				completionTokens = json.usage?.completion_tokens || null;
+				reasoningTokens = json.usage?.reasoning_tokens || null;
+				cachedTokens =
+					json.usage?.prompt_tokens_details?.cached_tokens || null;
+			}
 			totalTokens =
 				json.usage?.total_tokens ||
 				(promptTokens !== null && completionTokens !== null
