20 changes: 18 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
# EmbeddedLLM

Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon))
Run local LLMs on iGPU, APU and CPU (AMD, Intel, and Qualcomm (Coming Soon)).
Easiest way to launch OpenAI API Compatible Server on Windows, Linux and MacOS

| Support matrix | Supported now | Under Development | On the roadmap |
@@ -32,6 +32,10 @@ Easiest way to launch OpenAI API Compatible Server on Windows, Linux and MacOS
| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) |
| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) |
| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) |
| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) |
| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) |
| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) |


## Getting Started

@@ -87,6 +91,18 @@ options:

1. `ellm_chatbot --port 7788 --host localhost --server_port <ellm_server_port> --server_host localhost`.

![Chatbot Web UI](asset/ellm_chatbot_vid.webp)
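
Once `ellm_server` is running, it speaks the OpenAI chat completions API, so any OpenAI-compatible client can talk to it. Below is a minimal sketch using the `openai` Python package; the port and model name are placeholders for whatever your `ellm_server` instance is actually serving, not defaults of this project.

```python
from openai import OpenAI

# Placeholder values: point base_url at your running ellm_server instance and
# use the model name it is serving. The API key is unused by a local server.
client = OpenAI(base_url="http://localhost:6979/v1", api_key="sk-no-key-needed")

response = client.chat.completions.create(
    model="phi3-mini-4k-instruct",  # placeholder model name
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(response.choices[0].message.content)
```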

## Launch Model Management UI
It is an interface that allows you to download models and deploy an OpenAI API compatible server.
The UI also shows how much disk space each model download requires.

1. `ellm_modelui --port 6678`

![Model Management UI](asset/ellm_modelui.png)
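
Once a model has been deployed from the UI, you can confirm the server is reachable through the `/health` and `/v1/models` endpoints exposed by `api_server.py`. A minimal sketch with `httpx` follows; the host and port are assumptions, substitute your own.

```python
import httpx

# Placeholder: replace with the host/port of your running ellm_server instance.
BASE_URL = "http://localhost:6979"

# /health returns 200 once the engine reports itself as initialised.
health = httpx.get(f"{BASE_URL}/health")
print("healthy:", health.status_code == 200)

# /v1/models lists the model currently being served.
models = httpx.get(f"{BASE_URL}/v1/models")
print(models.json())
```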



## Acknowledgements

- Excellent open-source projects: [vLLM](https://github.com/vllm-project/vllm.git), [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai.git) and many others.
Binary file added asset/ellm_chatbot_vid.webp
Binary file added asset/ellm_modelui.png
69 changes: 69 additions & 0 deletions ellm_api_server.spec
@@ -0,0 +1,69 @@
# -*- mode: python ; coding: utf-8 -*-

from pathlib import Path
from PyInstaller.utils.hooks import collect_all

binaries_list = []

print(Path("src/owl/entrypoints/api.py").resolve().as_posix())

datas_list = [
    (Path("src/embeddedllm/entrypoints/api_server.py").resolve().as_posix(), 'embeddedllm/entrypoints')
]

hiddenimports_list = ['multipart']

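# collect_all() gathers a package's data files, dynamic libraries and hidden imports
# so the ONNX Runtime dependencies below are bundled into the frozen executable.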
def add_package(package_name):
    datas, binaries, hiddenimports = collect_all(package_name)
    datas_list.extend(datas)
    binaries_list.extend(binaries)
    hiddenimports_list.extend(hiddenimports)

add_package('onnxruntime')
add_package('onnxruntime_genai')

print(binaries_list)
with open("binary.txt", 'w') as f:
f.write(str(binaries_list))

a = Analysis(
    ['src\\embeddedllm\\entrypoints\\api_server.py'],
    pathex=[],
    binaries=binaries_list,
    datas=datas_list,
    hiddenimports=hiddenimports_list,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
)
pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='ellm_api_server',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='ellm_api_server',
)
14 changes: 7 additions & 7 deletions scripts/python/httpx_client_stream.py
@@ -1,17 +1,17 @@
import asyncio

import httpx
import json

def parse_stream(stream:str):
import httpx

    stream = stream.replace('data: ', '')

def parse_stream(stream: str):
stream = stream.replace("data: ", "")
response_obj = json.loads(stream)
# print(response_obj)

return response_obj


async def stream_chat_completion(url: str, payload: dict):
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=payload) as response:
@@ -22,9 +22,9 @@ async def stream_chat_completion(url: str, payload: dict):
if "[DONE]" in decodes_stream:
continue
resp = parse_stream(decodes_stream)
if resp["choices"][0]["delta"].get('content', None):
print(resp["choices"][0]["delta"]["content"], end='', flush=True)
if resp["choices"][0]["delta"].get("content", None):
print(resp["choices"][0]["delta"]["content"], end="", flush=True)

# time.sleep(1)
else:
print(f"Error: {response.status_code}")
9 changes: 4 additions & 5 deletions scripts/python/httpx_client_vision.py
@@ -1,11 +1,10 @@
import httpx
import os
import base64
import mimetypes
import os

import httpx

from embeddedllm.protocol import (
    CustomChatCompletionMessageParam,
)
from embeddedllm.protocol import CustomChatCompletionMessageParam


def chat_completion(url: str, payload: dict):
3 changes: 2 additions & 1 deletion scripts/python/httpx_client_vision_stream.py
@@ -1,7 +1,8 @@
import asyncio
import os
import base64
import mimetypes
import os

import httpx


3 changes: 2 additions & 1 deletion scripts/python/litellm_vision_client.py
@@ -1,8 +1,9 @@
import litellm
import base64
import mimetypes
import os

import litellm

current_file_path = os.path.abspath(__file__)
IMAGE_PATH = os.path.join(os.path.dirname(current_file_path), "..", "images", "catdog.png")

3 changes: 2 additions & 1 deletion scripts/python/openai_vision_client.py
@@ -1,9 +1,10 @@
from openai import AsyncOpenAI
import asyncio
import base64
import mimetypes
import os

from openai import AsyncOpenAI

current_file_path = os.path.abspath(__file__)
IMAGE_PATH = os.path.join(os.path.dirname(current_file_path), "..", "images", "catdog.png")

3 changes: 2 additions & 1 deletion setup.py
@@ -1,8 +1,8 @@
import io
import os
import platform
import re
from typing import List
import platform

from setuptools import find_packages, setup

@@ -140,6 +140,7 @@ def get_ellm_version() -> str:
"console_scripts": [
"ellm_server=embeddedllm.entrypoints.api_server:main",
"ellm_chatbot=embeddedllm.entrypoints.webui:main",
"ellm_modelui=embeddedllm.entrypoints.modelui:main",
],
},
)
6 changes: 4 additions & 2 deletions src/embeddedllm/entrypoints/api_server.py
@@ -48,8 +48,10 @@ async def validation_exception_handler(_, exc):
@app.get("/health")
async def health() -> Response:
"""Health check."""
await openai_chat_server.engine.check_health()
return Response(status_code=200)
if openai_chat_server.check_health():
return Response(status_code=200)
else:
return Response(status_code=500)


@app.get("/v1/models")
6 changes: 6 additions & 0 deletions src/embeddedllm/entrypoints/chat_server.py
@@ -68,6 +68,7 @@ def __init__(
        chat_template: Optional[str] = None,
        vision: Optional[bool] = False,
    ):
        self.initialized_flag = False
        self.model_path = model_path
        self.served_model_name = served_model_name
        self.response_role = response_role
@@ -77,6 +78,8 @@ def __init__(
        self.tokenizer = self.engine.tokenizer
        self._load_chat_template(chat_template)

        self.initialized_flag = True

    def _load_chat_template(self, chat_template: Optional[str]):
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] = self.tokenizer

@@ -533,3 +536,6 @@ async def _check_model(
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
)

def check_health(self):
return self.initialized_flag