
Commit 82997ac

Author: LittleMouse
[clean] Clean up the code
1 parent 1077efb · commit 82997ac

10 files changed, +16 −139 lines changed


README.md

Lines changed: 2 additions & 2 deletions
@@ -78,7 +78,7 @@ curl -X POST "http://localhost:8000/v1/chat/completions" \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer YOUR_KEY" \
   -d '{
-    "model": "gpt-3.5-turbo",
+    "model": "qwen2.5-0.5B-p256-ax630c",
     "messages": [{"role": "user", "content": "Hello!"}],
     "temperature": 0.7
   }'
@@ -90,7 +90,7 @@ curl -X POST "http://localhost:8000/v1/audio/speech" \
   -H "Content-Type: application/json" \
   -H "Authorization: Bearer YOUR_KEY" \
   -d '{
-    "model": "tts-1",
+    "model": "melotts",
     "input": "Hello world!",
     "voice": "alloy"
   }'
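
For reference, the updated chat request can also be issued from Python. This is a minimal sketch, assuming the server from the README is running on localhost:8000 and that YOUR_KEY is a placeholder credential; it is not part of this commit.

# Minimal sketch: same chat request as the curl example above, via the requests library.
# Assumes the server is reachable at http://localhost:8000 and YOUR_KEY is a placeholder.
import requests

response = requests.post(
    "http://localhost:8000/v1/chat/completions",
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer YOUR_KEY",
    },
    json={
        "model": "qwen2.5-0.5B-p256-ax630c",
        "messages": [{"role": "user", "content": "Hello!"}],
        "temperature": 0.7,
    },
)
print(response.json())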

api_server.py

Lines changed: 3 additions & 11 deletions
@@ -1,17 +1,14 @@
 import os
 import uuid
 import yaml
-from fastapi import FastAPI, Request, HTTPException, File, Form, UploadFile
-from fastapi.responses import JSONResponse, StreamingResponse
 import logging
-from slowapi import Limiter
-from slowapi.util import get_remote_address
 import time
 import json
 import asyncio
 
+from fastapi import FastAPI, Request, HTTPException, File, Form, UploadFile
+from fastapi.responses import JSONResponse, StreamingResponse
 from backend import (
-    TestBackend,
     OpenAIProxyBackend,
     LlmClientBackend,
     VisionModelBackend,
@@ -34,7 +31,6 @@
 logger = logging.getLogger("api")
 
 app = FastAPI(title="OpenAI Compatible API Server")
-limiter = Limiter(key_func=get_remote_address)
 
 class Config:
     def __init__(self):
@@ -113,9 +109,7 @@ async def chat_completions(request: Request, body: ChatCompletionRequest):
             detail=f"Unsupported model: {body.model}"
         )
 
-    try:
-        print(f"Received request: {body.model_dump()}")
-
+    try:
         if body.stream:
             chunk_generator = await backend.generate(body)
             if not chunk_generator:
@@ -133,7 +127,6 @@ async def format_stream():
                     chunk_dict = chunk.model_dump()
 
                     json_chunk = json.dumps(chunk_dict, ensure_ascii=False)
-                    print(f"Sending chunk: {json_chunk}")
                     yield f"data: {json_chunk}\n\n"
             except asyncio.CancelledError:
                 logger.warning("Client disconnected early, terminating inference...")
@@ -150,7 +143,6 @@ async def format_stream():
             )
         else:
             response = await backend.generate(body)
-            print(f"Sending response: {response}")
             return JSONResponse(content=response)
 
     except HTTPException as he:
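
The streaming branch above emits Server-Sent-Events-style lines of the form data: {json}. A client could consume that stream as sketched below; this assumes the endpoint and key from the README, and the [DONE] sentinel check follows the usual OpenAI convention rather than anything confirmed by this diff.

# Minimal sketch of reading the "data: ..." stream produced by format_stream().
# The "[DONE]" check is an assumption (OpenAI convention), not confirmed by this commit.
import json
import requests

with requests.post(
    "http://localhost:8000/v1/chat/completions",
    headers={"Authorization": "Bearer YOUR_KEY"},
    json={
        "model": "qwen2.5-0.5B-p256-ax630c",
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": True,
    },
    stream=True,
) as response:
    for line in response.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data: "):
            continue
        payload = line[len("data: "):]
        if payload.strip() == "[DONE]":  # assumed sentinel, may not apply to this server
            break
        print(json.loads(payload))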

backend/__init__.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-from .test_backend import TestBackend
 from .openai_proxy_backend import OpenAIProxyBackend
 from .llm_client_backend import LlmClientBackend
 from .tts_client_backend import TtsClientBackend

backend/llm_client_backend.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ def __init__(self, model_config):
         self._active_clients = {}
         self._pool_lock = asyncio.Lock()
         self.logger = logging.getLogger("api.llm")
-        self.MAX_CONTEXT_LENGTH = model_config.get("max_context_length", 200)
+        self.MAX_CONTEXT_LENGTH = model_config.get("max_context_length", 128)
         self.POOL_SIZE = model_config.get("pool_size", 2)
         self._inference_executor = ThreadPoolExecutor(max_workers=self.POOL_SIZE)
         self._active_tasks = weakref.WeakSet()
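
With this change the context window falls back to 128 tokens whenever the model config omits max_context_length (previously 200). A minimal sketch of the lookup; the config dicts here are hypothetical illustrations, not entries from the repository.

# Hypothetical model_config values illustrating the new default of 128 (previously 200).
defaulted = {}                              # no explicit value -> falls back to 128
explicit = {"max_context_length": 256}      # e.g. what services/model_list.py sets for -p256- models

print(defaulted.get("max_context_length", 128))   # 128
print(explicit.get("max_context_length", 128))    # 256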

backend/test_backend.py

Lines changed: 0 additions & 56 deletions
This file was deleted.

client/asr_client.py

Lines changed: 1 addition & 18 deletions
@@ -117,21 +117,4 @@ def create_transcription(self, audio_data: bytes, language: str = "zh") -> str:
         for chunk in self.inference_stream(audio_b64, object_type="asr.base64"):
             full_text += chunk
 
-        return full_text
-
-if __name__ == "__main__":
-    with ASRClient(host='192.168.20.183') as client:
-        setup_response = client.setup("whisper.setup", {
-            "model": "whisper-tiny",
-            "response_format": "asr.utf-8",
-            "input": "whisper.base64",
-            "language": "zh",
-            "enoutput": True,
-        })
-        print("Setup response:", setup_response)
-
-        for chunk in client.inference_stream("AAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwQACAAEAA8AGQAWABUAHQAnADQANwAzADEAJAAlAA=="):
-            print("Received chunk:", chunk)
-
-        exit_response = client.exit()
-        print("Exit response:", exit_response)
+        return full_text

client/llm_client.py

Lines changed: 1 addition & 20 deletions
@@ -109,23 +109,4 @@ def _wait_response(self, request_id: str) -> dict:
     def connect(self):
         with self._lock:
             if not self.sock:
-                self._connect()
-
-if __name__ == "__main__":
-    with LLMClient(host='192.168.20.183') as client:
-        setup_response = client.setup("llm.setup", {
-            "model": "Qwen2.5-0.5B-w8a16",
-            "response_format": "llm.utf-8.stream",
-            "input": "llm.utf-8",
-            "enoutput": True,
-            "max_token_len": 1023,
-            "prompt": "You are a helpful assistant"
-        })
-        print("Setup response:", setup_response)
-
-        for chunk in client.inference_stream("Tell me a story"):
-            print("Received chunk:", chunk)
-        client.stop_inference()
-
-        exit_response = client.exit()
-        print("Exit response:", exit_response)
+                self._connect()

client/sys_client.py

Lines changed: 1 addition & 10 deletions
@@ -137,13 +137,4 @@ def create_transcription(self, audio_data: bytes, language: str = "zh") -> str:
         for chunk in self.inference_stream(audio_b64, object_type="asr.base64"):
             full_text += chunk
 
-        return full_text
-
-if __name__ == "__main__":
-    with SYSClient(host='192.168.20.48') as client:
-        hw_response = client.hwinfo()
-        print("hwinfo response:", hw_response)
-        cmm_response = client.cmminfo()
-        print("cmm response:", cmm_response)
-        model_list_response = client.model_list()
-        print("model_list_response:", model_list_response)
+        return full_text

client/tts_client.py

Lines changed: 1 addition & 20 deletions
@@ -114,23 +114,4 @@ def _wait_response(self, request_id: str) -> dict:
     def connect(self):
         with self._lock:
             if not self.sock:
-                self._connect()
-
-if __name__ == "__main__":
-    with TTSClient(host='192.168.20.183') as client:
-        setup_response = client.setup("melotts.setup", {
-            "model": "melotts_zh-cn",
-            "response_format": "pcm.stream.base64",
-            "input": "tts.utf-8",
-            "enoutput": True,
-        })
-        print("Setup response:", setup_response)
-        time.sleep(1)
-        for chunk in client.inference_stream("好的,我来给你讲一个故事。", object_type="tts.utf-8"):
-            print("Received data chunk:", chunk)
-            with open('output_base64.txt', 'a') as f_base:
-                f_base.write(chunk + '\n')
-            with open('output.pcm', 'ab') as f_pcm:
-                f_pcm.write(base64.b64decode(chunk))
-        exit_response = client.exit()
-        print("Exit response:", exit_response)
+                self._connect()

services/model_list.py

Lines changed: 6 additions & 0 deletions
@@ -65,6 +65,12 @@ async def get_model_list(self, required_mem: int) -> None:
             elif '-0.5B-' in mode:
                 new_entry['memory_required'] = 560460
                 new_entry['pool_size'] = 2
+            else:
+                new_entry['memory_required'] = 1363148
+                new_entry['pool_size'] = 2
+
+            if '-p256-' in mode:
+                new_entry['max_context_length'] = 256
 
         elif model_type == 'tts':
             if 'melotts' in mode.lower():
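
The added branches derive memory and context defaults purely from the model name. The sketch below mirrors only the branches visible in this hunk as a standalone function; resolve_llm_defaults is a hypothetical name, and only the constants come from the diff.

# Illustrative helper mirroring the name-based defaults added above for LLM entries.
# Only the branches shown in this hunk are reproduced; the function name is hypothetical.
def resolve_llm_defaults(mode: str) -> dict:
    entry = {}
    if '-0.5B-' in mode:
        entry['memory_required'] = 560460
        entry['pool_size'] = 2
    else:
        entry['memory_required'] = 1363148
        entry['pool_size'] = 2

    if '-p256-' in mode:
        entry['max_context_length'] = 256
    return entry

print(resolve_llm_defaults("qwen2.5-0.5B-p256-ax630c"))
# -> {'memory_required': 560460, 'pool_size': 2, 'max_context_length': 256}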
