
Commit c64f974

[perf] Optimize llm_client parameter configuration method

1 parent 131c628 · commit c64f974
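In short: MAX_CONTEXT_LENGTH and POOL_SIZE move from hard-coded class attributes on LlmClientBackend to per-model values read from model_config (with the old values, 500 and 2, kept as defaults); _get_client now tries to reuse a pooled client before running the memory check; and config.yaml gains per-model max_context_length entries alongside a renamed deepseek entry and updated memory_required / pool_size values.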

File tree

2 files changed: +14 -10 lines

backend/llm_client_backend.py

Lines changed: 7 additions & 7 deletions
@@ -14,15 +14,14 @@
 from services.memory_check import MemoryChecker
 
 class LlmClientBackend(BaseModelBackend):
-    MAX_CONTEXT_LENGTH = 500
-    POOL_SIZE = 2
-
     def __init__(self, model_config):
         super().__init__(model_config)
         self._client_pool = []
         self._active_clients = {}
         self._pool_lock = asyncio.Lock()
         self.logger = logging.getLogger("api.llm")
+        self.MAX_CONTEXT_LENGTH = model_config.get("max_context_length", 500)
+        self.POOL_SIZE = model_config.get("pool_size", 2)
         self._inference_executor = ThreadPoolExecutor(max_workers=self.POOL_SIZE)
         self._active_tasks = weakref.WeakSet()
         self.memory_checker = MemoryChecker(
@@ -55,15 +54,16 @@ async def _get_client(self, request):
         try:
             await asyncio.wait_for(self._pool_lock.acquire(), timeout=30.0)
 
+            if self._client_pool:
+                client = self._client_pool.pop()
+                self.logger.debug(f"Reusing client from pool | ID:{id(client)}")
+                return client
+
             if "memory_required" in self.config:
                 await self.memory_checker.check_memory(
                     self.config["memory_required"]
                 )
 
-            if self._client_pool:
-                client = self._client_pool.pop()
-                self.logger.debug(f"Reusing client from pool | ID:{id(client)}")
-                return client
 
             if len(self._active_clients) >= self.POOL_SIZE:
                 raise RuntimeError("Connection pool exhausted")
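To illustrate the new pattern, here is a minimal sketch of how the per-model values resolve; the model_config dict below is a hypothetical stand-in for one parsed entry from config.yaml, with the previous hard-coded constants (500 and 2) retained as fallback defaults:

# Sketch: per-model overrides with the old class-attribute values
# (500 and 2) as fallbacks. The dict mimics one model entry from
# config.yaml; it is illustrative, not the project's loader output.
model_config = {
    "model_name": "qwen2.5-0.5B-prefill-20e",
    "pool_size": 2,
    "max_context_length": 128,
}

max_context_length = model_config.get("max_context_length", 500)  # -> 128
pool_size = model_config.get("pool_size", 2)                      # -> 2

# An entry that omits the new keys silently keeps the old behavior:
assert {}.get("max_context_length", 500) == 500
assert {}.get("pool_size", 2) == 2

Note that the second hunk also reorders _get_client: the pool-reuse check now runs before the memory_required check, so a client taken from the pool is returned immediately without triggering a memory check.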

config/config.yaml

Lines changed: 7 additions & 3 deletions
@@ -26,6 +26,7 @@ models:
     model_name: "qwen2.5-0.5B-prefill-20e"
     object: "llm.setup"
     pool_size: 2
+    max_context_length: 128
     response_format: "llm.utf-8.stream"
     input: "llm.utf-8"
     memory_required: 716800
@@ -39,22 +40,24 @@ models:
     model_name: "Qwen2.5-0.5B-w8a16"
     object: "llm.setup"
     pool_size: 2
+    max_context_length: 1024
     response_format: "llm.utf-8.stream"
     input: "llm.utf-8"
     memory_required: 716800
     system_prompt: |
       You are a helpful assistant.
 
-  deepseek-r1-1.5B-ax630c:
+  deepseek-r1-distill-qwen-1.5b:
     type: tcp_client
     host: "192.168.20.65"
     port: 10001
     model_name: "deepseek-r1-1.5B-ax630c"
     object: "llm.setup"
     pool_size: 1
+    max_context_length: 1024
     response_format: "llm.utf-8.stream"
     input: "llm.utf-8"
-    memory_required: 1572864
+    memory_required: 2097152
     system_prompt: |
       You are a helpful assistant.
 
@@ -64,7 +67,8 @@ models:
     port: 10001
     model_name: "internvl2.5-1B-ax630c"
     object: "vlm.setup"
-    pool_size: 1
+    pool_size: 2
+    max_context_length: 256
     response_format: "vlm.utf-8.stream"
     input: "vlm.utf-8"
     memory_required: 1048576
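For completeness, a minimal sketch of how these entries could reach the backend, assuming config.yaml is parsed with PyYAML and each per-model mapping is handed to the backend as model_config (the loading code here is an assumption, not the project's actual startup path):

# Hypothetical wiring: parse config.yaml and inspect each model entry.
# With this commit, max_context_length and pool_size flow from YAML
# into LlmClientBackend instead of being hard-coded class attributes.
import yaml  # PyYAML; assumed, the project may load config differently

with open("config/config.yaml") as f:
    config = yaml.safe_load(f)

for name, model_config in config["models"].items():
    print(name,
          model_config.get("max_context_length", 500),  # same fallbacks
          model_config.get("pool_size", 2))             # as the backend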
