Commit b9dc721

feat(gateway): add e2e test for cached input cost validation (#904)
Extended e2e chat tests to validate cached input cost functionality. Added logic to verify that cached input cost is greater than zero when supported.

Summary by CodeRabbit

- New Features
  - Added prompt caching support for Claude/Anthropic models, including ephemeral cache markers and updated headers.
  - Enhanced token usage and cost calculations to account for cached input tokens; UI now shows cached tokens and cached input cost in logs.
- Tests
  - Introduced end-to-end tests validating caching behavior and cost reporting across providers.
- Documentation
  - Added guidance on strict TypeScript usage in contributor docs.
- Chores
  - Optimized test scripts to skip building UI/docs during test runs.
- Refactor
  - Removed debug logging and streamlined parsing paths without changing behavior.
1 parent 6661518 commit b9dc721

File tree: 14 files changed, +399 -42 lines changed

CLAUDE.md

Lines changed: 4 additions & 0 deletions
@@ -28,6 +28,10 @@ Always run `pnpm format` before committing code. Run `pnpm generate` if API rout
 - `pnpm lint` - Check linting and formatting (without fixing)
 - `pnpm generate` - Regenerate OpenAPI schemas from API routes
 
+### Writing code
+
+This is a pure TypeScript project. Never use `any` or `as any` unless absolutely necessary.
+
 ### Testing
 
 NOTE: these commands can only be run in the root directory of the repository, not in individual app directories.
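As a quick illustration of the new rule (hypothetical snippet, not part of this commit), prefer schema validation or proper narrowing over `as any` when handling untyped input; zod, which the gateway already uses, makes this straightforward:

import { z } from "zod";

// Avoid: const user = JSON.parse(raw) as any;
// Prefer: validate and infer a concrete type.
const userSchema = z.object({ id: z.string(), name: z.string() });
type User = z.infer<typeof userSchema>;

function parseUser(raw: string): User {
	// fails loudly if the shape is wrong instead of silently lying about the type
	return userSchema.parse(JSON.parse(raw));
}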

apps/gateway/src/anthropic/anthropic.ts

Lines changed: 10 additions & 11 deletions
@@ -18,6 +18,11 @@ const anthropicMessageSchema = z.object({
 		z.object({
 			type: z.literal("text"),
 			text: z.string(),
+			cache_control: z
+				.object({
+					type: z.enum(["ephemeral"]),
+				})
+				.optional(),
 		}),
 		z.object({
 			type: z.literal("image"),
@@ -91,6 +96,11 @@ const anthropicRequestSchema = z.object({
 			z.object({
 				type: z.literal("text"),
 				text: z.string(),
+				cache_control: z
+					.object({
+						type: z.enum(["ephemeral"]),
+					})
+					.optional(),
 			}),
 		),
 	])
@@ -178,31 +188,20 @@ anthropic.openapi(messages, async (c) => {
 	try {
 		rawRequest = await c.req.json();
 	} catch (error) {
-		// console.log("Failed to parse JSON from request:", error);
 		throw new HTTPException(400, {
 			message: `Invalid JSON in request body: ${error}`,
 		});
 	}
 
-	// console.log("Raw Anthropic request:", JSON.stringify(rawRequest, null, 2));
-
 	// Validate with our schema
 	const validation = anthropicRequestSchema.safeParse(rawRequest);
 	if (!validation.success) {
-		// console.log(
-		// 	"Anthropic request validation failed:",
-		// 	JSON.stringify(validation.error.issues, null, 2),
-		// );
 		throw new HTTPException(400, {
 			message: `Invalid request format: ${validation.error.issues.map((issue) => `${issue.path.join(".")}: ${issue.message}`).join(", ")}`,
 		});
 	}
 
 	const anthropicRequest: AnthropicRequest = validation.data;
-	// console.log(
-	// 	"Validated Anthropic request:",
-	// 	JSON.stringify(anthropicRequest, null, 2),
-	// );
 
 	// Transform Anthropic request to OpenAI format
 	const openaiMessages: Array<Record<string, unknown>> = [];
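With the optional cache_control field now accepted by the schema, a text content block that opts into Anthropic's ephemeral prompt cache would look roughly like this (illustrative TypeScript object; only the field names come from the schema above, the surrounding message shape is assumed):

const cachedSystemBlock = {
	type: "text",
	text: "Long, reusable system context goes here...",
	// marks this block for Anthropic's ephemeral prompt cache
	cache_control: { type: "ephemeral" },
};

On a repeat request containing the same cached block, Anthropic reports the reused tokens as cache_read_input_tokens, which the usage parsing further down in this commit picks up.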

apps/gateway/src/chat-cache.e2e.ts

Lines changed: 142 additions & 0 deletions
@@ -0,0 +1,142 @@
+import "dotenv/config";
+import { beforeAll, beforeEach, describe, expect, test } from "vitest";
+
+import {
+	beforeAllHook,
+	beforeEachHook,
+	getConcurrentTestOptions,
+	getTestOptions,
+	logMode,
+	testModels,
+	validateLogByRequestId,
+	validateResponse,
+} from "@/chat-helpers.e2e.js";
+
+import { models } from "@llmgateway/models";
+
+import { app } from "./app.js";
+
+import type { ProviderModelMapping } from "@llmgateway/models";
+
+// Helper function to generate unique request IDs for tests
+export function generateTestRequestId(): string {
+	return `test-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
+}
+
+describe("e2e", getConcurrentTestOptions(), () => {
+	beforeAll(beforeAllHook);
+
+	beforeEach(beforeEachHook);
+
+	test("empty", () => {
+		expect(true).toBe(true);
+	});
+
+	if (process.env.CACHE_MODE === "true") {
+		test.each(testModels)(
+			"completions with cache checks $model",
+			getTestOptions(),
+			async ({ model, originalModel }) => {
+				// Use a long prompt to trigger caching mechanism (1024+ tokens) for models that support it
+				const longPrompt = `You are a helpful assistant. Please analyze the following long text carefully and provide insights. ${"This is a very detailed example text that needs to be quite long to trigger caching mechanisms which require at least 1024 tokens. ".repeat(50)} Just reply 'OK' after processing this text.`;
+
+				const requestId = generateTestRequestId();
+				const res = await app.request("/v1/chat/completions", {
+					method: "POST",
+					headers: {
+						"Content-Type": "application/json",
+						"x-request-id": requestId,
+						Authorization: `Bearer real-token`,
+					},
+					body: JSON.stringify({
+						model: model,
+						messages: [
+							{
+								role: "system",
+								content: longPrompt,
+							},
+							{
+								role: "user",
+								content: "Hello, just reply 'OK'!",
+							},
+						],
+					}),
+				});
+
+				const json = await res.json();
+				if (logMode) {
+					console.log("response:", JSON.stringify(json, null, 2));
+				}
+
+				expect(res.status).toBe(200);
+				validateResponse(json);
+
+				const log = await validateLogByRequestId(requestId);
+				expect(log.streamed).toBe(false);
+
+				expect(json).toHaveProperty("usage");
+				expect(json.usage).toHaveProperty("prompt_tokens");
+				expect(json.usage).toHaveProperty("completion_tokens");
+				expect(json.usage).toHaveProperty("total_tokens");
+				expect(typeof json.usage.prompt_tokens).toBe("number");
+				expect(typeof json.usage.completion_tokens).toBe("number");
+				expect(typeof json.usage.total_tokens).toBe("number");
+				expect(json.usage.prompt_tokens).toBeGreaterThan(0);
+				expect(json.usage.completion_tokens).toBeGreaterThan(0);
+				expect(json.usage.total_tokens).toBeGreaterThan(0);
+
+				// expect(log.inputCost).not.toBeNull();
+				// expect(log.outputCost).not.toBeNull();
+				// expect(log.cost).not.toBeNull();
+
+				const originalModelProviderMapping = models
+					.find((m) => m.id === originalModel)
+					?.providers.find(
+						(p) => p.providerId === log.usedProvider,
+					) as ProviderModelMapping;
+				if (originalModelProviderMapping.cachedInputPrice) {
+					// for models that support cached input cost, make the same request twice with a long prompt (1024+ tokens) to trigger caching
+					const secondRequestId = generateTestRequestId();
+					const secondRes = await app.request("/v1/chat/completions", {
+						method: "POST",
+						headers: {
+							"Content-Type": "application/json",
+							"x-request-id": secondRequestId,
+							Authorization: `Bearer real-token`,
+						},
+						body: JSON.stringify({
+							model: model,
+							messages: [
+								{
+									role: "system",
+									content: longPrompt,
+								},
+								{
+									role: "user",
+									content: "Hello, just reply 'OK'!",
+								},
+							],
+						}),
+					});
+					const secondJson = await secondRes.json();
+					if (logMode) {
+						console.log(
+							"second response:",
+							JSON.stringify(secondJson, null, 2),
+						);
+					}
+					const secondLog = await validateLogByRequestId(secondRequestId);
+					console.log("Second request log for caching test:", {
+						cachedInputCost: secondLog.cachedInputCost,
+						cachedTokens: secondLog.cachedTokens,
+						provider: log.usedProvider,
+						inputCost: secondLog.inputCost,
+						totalCost: secondLog.cost,
+					});
+
+					expect(secondLog.cachedInputCost).toBeGreaterThan(0);
+				}
+			},
+		);
+	}
+});
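The final assertion (secondLog.cachedInputCost > 0) depends on cached tokens being priced at the provider's cachedInputPrice. The cost calculation itself is not shown in this diff; a minimal sketch of the idea, assuming hypothetical per-token inputPrice and cachedInputPrice fields and a simple split between cached and non-cached prompt tokens:

// Hypothetical helper, not from this repo: prices the cached and non-cached
// portions of the prompt at their respective per-token rates.
function estimateInputCosts(
	promptTokens: number,
	cachedTokens: number,
	inputPrice: number, // assumed: price per regular input token
	cachedInputPrice: number, // assumed: discounted price per cached input token
) {
	const cachedInputCost = cachedTokens * cachedInputPrice;
	const inputCost = (promptTokens - cachedTokens) * inputPrice;
	return { inputCost, cachedInputCost };
}

Sending the same 1024+ token prompt twice should yield cachedTokens > 0 on the second request, so cachedInputCost comes out greater than zero, which is exactly what the test asserts.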

apps/gateway/src/chat/chat.ts

Lines changed: 8 additions & 2 deletions
@@ -2234,6 +2234,7 @@ chat.openapi(completions, async (c) => {
 					data,
 					usedProvider,
 					fullContent,
+					usedModel,
 				);
 
 				// If we have usage data from Google, add it to the streaming chunk
@@ -2399,7 +2400,12 @@
 			}
 
 			// Extract token usage using helper function
-			const usage = extractTokenUsage(data, usedProvider, fullContent);
+			const usage = extractTokenUsage(
+				data,
+				usedProvider,
+				fullContent,
+				usedModel,
+			);
 			if (usage.promptTokens !== null) {
 				promptTokens = usage.promptTokens;
 			}
@@ -3001,7 +3007,7 @@
 		cachedTokens,
 		toolResults,
 		images,
-	} = parseProviderResponse(usedProvider, json, messages);
+	} = parseProviderResponse(usedProvider, json, messages, usedModel);
 
 	// Debug: Log images found in response
 	logger.debug("Gateway - parseProviderResponse extracted images", { images });

apps/gateway/src/chat/tools/extract-token-usage.ts

Lines changed: 38 additions & 7 deletions
@@ -9,6 +9,7 @@ export function extractTokenUsage(
 	data: any,
 	provider: Provider,
 	fullContent?: string,
+	usedModel?: string,
 ) {
 	let promptTokens = null;
 	let completionTokens = null;
@@ -44,20 +45,50 @@
 			break;
 		case "anthropic":
 			if (data.usage) {
-				promptTokens = data.usage.input_tokens ?? null;
+				// For Anthropic: input_tokens are the non-cached tokens
+				// We need to add cache_creation_input_tokens to get total input tokens
+				const inputTokens = data.usage.input_tokens ?? 0;
+				const cacheCreationTokens = data.usage.cache_creation_input_tokens ?? 0;
+				const cacheReadTokens = data.usage.cache_read_input_tokens ?? 0;
+
+				// Total prompt tokens = non-cached + cache creation + cache read
+				promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
 				completionTokens = data.usage.output_tokens ?? null;
 				reasoningTokens = data.usage.reasoning_output_tokens ?? null;
-				cachedTokens = data.usage.cache_read_input_tokens ?? null;
+				// Cached tokens are the tokens read from cache (discount applies to these)
+				cachedTokens = cacheReadTokens || null;
 				totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0);
 			}
 			break;
 		default: // OpenAI format
 			if (data.usage) {
-				promptTokens = data.usage.prompt_tokens ?? null;
-				completionTokens = data.usage.completion_tokens ?? null;
-				totalTokens = data.usage.total_tokens ?? null;
-				reasoningTokens = data.usage.reasoning_tokens ?? null;
-				cachedTokens = data.usage.prompt_tokens_details?.cached_tokens ?? null;
+				// Special handling for routeway-discount claude models (use Anthropic-style parsing)
+				if (
+					provider === "routeway-discount" &&
+					usedModel?.startsWith("claude-")
+				) {
+					// Use Anthropic-style token parsing for claude models
+					const inputTokens = data.usage.input_tokens ?? 0;
+					const cacheCreationTokens =
+						data.usage.cache_creation_input_tokens ?? 0;
+					const cacheReadTokens = data.usage.cache_read_input_tokens ?? 0;
+
+					// Total prompt tokens = non-cached + cache creation + cache read
+					promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
+					completionTokens = data.usage.output_tokens ?? null;
+					reasoningTokens = data.usage.reasoning_output_tokens ?? null;
+					// Cached tokens are the tokens read from cache (discount applies to these)
+					cachedTokens = cacheReadTokens || null;
+					totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0);
+				} else {
+					// Standard OpenAI-style token parsing
+					promptTokens = data.usage.prompt_tokens ?? null;
+					completionTokens = data.usage.completion_tokens ?? null;
+					totalTokens = data.usage.total_tokens ?? null;
+					reasoningTokens = data.usage.reasoning_tokens ?? null;
+					cachedTokens =
+						data.usage.prompt_tokens_details?.cached_tokens ?? null;
+				}
 			}
 			break;
 	}
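To make the new accounting concrete, here is a small sketch of what extractTokenUsage should now return for an Anthropic-style usage payload on a cache-hit request (the numbers are invented; the arithmetic mirrors the code above):

// Hypothetical usage block from a second (cache-hit) Anthropic response
const data = {
	usage: {
		input_tokens: 20, // non-cached prompt tokens
		cache_creation_input_tokens: 0, // nothing new written to the cache
		cache_read_input_tokens: 1100, // prompt tokens served from the cache
		output_tokens: 5,
	},
};

const usage = extractTokenUsage(data, "anthropic");
// promptTokens     = 20 + 0 + 1100 = 1120
// cachedTokens     = 1100 (cache_read_input_tokens, billed at the discounted rate)
// completionTokens = 5
// totalTokens      = 1120 + 5 = 1125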

apps/gateway/src/chat/tools/parse-provider-response.ts

Lines changed: 47 additions & 12 deletions
@@ -10,6 +10,7 @@ export function parseProviderResponse(
 	usedProvider: Provider,
 	json: any,
 	messages: any[] = [],
+	usedModel?: string,
 ) {
 	let content = null;
 	let reasoningContent = null;
@@ -38,14 +39,25 @@
 				thinkingBlocks.map((block: any) => block.thinking).join("") || null;
 
 			finishReason = json.stop_reason || null;
-			promptTokens = json.usage?.input_tokens || null;
-			completionTokens = json.usage?.output_tokens || null;
-			reasoningTokens = json.usage?.reasoning_output_tokens || null;
-			cachedTokens = json.usage?.cache_read_input_tokens || null;
-			totalTokens =
-				json.usage?.input_tokens && json.usage?.output_tokens
-					? json.usage.input_tokens + json.usage.output_tokens
-					: null;
+
+			// For Anthropic: input_tokens are the non-cached tokens
+			// We need to add cache_creation_input_tokens to get total input tokens
+			if (json.usage) {
+				const inputTokens = json.usage.input_tokens || 0;
+				const cacheCreationTokens = json.usage.cache_creation_input_tokens || 0;
+				const cacheReadTokens = json.usage.cache_read_input_tokens || 0;
+
+				// Total prompt tokens = non-cached + cache creation + cache read
+				promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
+				completionTokens = json.usage.output_tokens || null;
+				reasoningTokens = json.usage.reasoning_output_tokens || null;
+				// Cached tokens are the tokens read from cache (discount applies to these)
+				cachedTokens = cacheReadTokens || null;
+				totalTokens =
+					promptTokens && completionTokens
+						? promptTokens + completionTokens
+						: null;
+			}
 			// Extract tool calls from Anthropic format
 			toolResults =
 				json.content
@@ -273,10 +285,33 @@
 				}
 			}
 
-			promptTokens = json.usage?.prompt_tokens || null;
-			completionTokens = json.usage?.completion_tokens || null;
-			reasoningTokens = json.usage?.reasoning_tokens || null;
-			cachedTokens = json.usage?.prompt_tokens_details?.cached_tokens || null;
+			// Special handling for routeway-discount claude models (use Anthropic-style parsing)
+			if (
+				usedProvider === "routeway-discount" &&
+				usedModel?.startsWith("claude-")
+			) {
+				// Use Anthropic-style token parsing for claude models
+				if (json.usage) {
+					const inputTokens = json.usage.input_tokens || 0;
+					const cacheCreationTokens =
+						json.usage.cache_creation_input_tokens || 0;
+					const cacheReadTokens = json.usage.cache_read_input_tokens || 0;
+
+					// Total prompt tokens = non-cached + cache creation + cache read
+					promptTokens = inputTokens + cacheCreationTokens + cacheReadTokens;
+					completionTokens = json.usage.output_tokens || null;
+					reasoningTokens = json.usage.reasoning_output_tokens || null;
+					// Cached tokens are the tokens read from cache (discount applies to these)
+					cachedTokens = cacheReadTokens || null;
+				}
+			} else {
+				// Standard OpenAI-style token parsing
+				promptTokens = json.usage?.prompt_tokens || null;
+				completionTokens = json.usage?.completion_tokens || null;
+				reasoningTokens = json.usage?.reasoning_tokens || null;
+				cachedTokens =
+					json.usage?.prompt_tokens_details?.cached_tokens || null;
+			}
 			totalTokens =
 				json.usage?.total_tokens ||
 				(promptTokens !== null && completionTokens !== null
