20 changes: 18 additions & 2 deletions README.md
@@ -1,6 +1,6 @@
# EmbeddedLLM

Run local LLMs on iGPU, APU and CPU (AMD , Intel, and Qualcomm (Coming Soon))
Run local LLMs on iGPU, APU and CPU (AMD, Intel, and Qualcomm (Coming Soon)).
Easiest way to launch OpenAI API Compatible Server on Windows, Linux and MacOS

| Support matrix | Supported now | Under Development | On the roadmap |
@@ -32,6 +32,10 @@ Easiest way to launch OpenAI API Compatible Server on Windows, Linux and MacOS
| Phi3-mini-128k-instruct | 3.8B | 128k | [microsoft/Phi-3-mini-128k-instruct-onnx](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct-onnx) |
| Phi3-medium-4k-instruct | 17B | 4096 | [microsoft/Phi-3-medium-4k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-4k-instruct-onnx-directml) |
| Phi3-medium-128k-instruct | 17B | 128k | [microsoft/Phi-3-medium-128k-instruct-onnx-directml](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct-onnx-directml) |
| Openchat-3.6-8b | 8B | 8192 | [EmbeddedLLM/openchat-3.6-8b-20240522-onnx](https://huggingface.co/EmbeddedLLM/openchat-3.6-8b-20240522-onnx) |
| Yi-1.5-6b-chat | 6B | 32k | [EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx](https://huggingface.co/EmbeddedLLM/01-ai_Yi-1.5-6B-Chat-onnx) |
| Phi-3-vision-128k-instruct | | 128k | [EmbeddedLLM/Phi-3-vision-128k-instruct-onnx](https://huggingface.co/EmbeddedLLM/Phi-3-vision-128k-instruct-onnx/tree/main/onnx/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4) |


## Getting Started

@@ -87,6 +91,18 @@ options:

1. `ellm_chatbot --port 7788 --host localhost --server_port <ellm_server_port> --server_host localhost`.

![Chatbot Web UI](asset/ellm_chatbot_vid.webp)
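
Once `ellm_server` is running, it speaks the OpenAI chat completions API, so any OpenAI-compatible client can talk to it. Below is a minimal sketch using the `openai` Python package; the port and model name are placeholders for whatever your `ellm_server` instance is actually serving, not defaults of this project.

```python
from openai import OpenAI

# Placeholder values: point base_url at your running ellm_server instance and
# use the model name it is serving. The API key is unused by a local server.
client = OpenAI(base_url="http://localhost:6979/v1", api_key="sk-no-key-needed")

response = client.chat.completions.create(
    model="phi3-mini-4k-instruct",  # placeholder model name
    messages=[{"role": "user", "content": "Say hello in one short sentence."}],
)
print(response.choices[0].message.content)
```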

## Launch Model Management UI
It is an interface that allows you to download models and deploy an OpenAI API compatible server.
The UI also shows how much disk space each model download requires.

1. `ellm_modelui --port 6678`

![Model Management UI](asset/ellm_modelui.png)
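
Once a model has been deployed from the UI, you can confirm the server is reachable through the `/health` and `/v1/models` endpoints exposed by `api_server.py`. A minimal sketch with `httpx` follows; the host and port are assumptions, substitute your own.

```python
import httpx

# Placeholder: replace with the host/port of your running ellm_server instance.
BASE_URL = "http://localhost:6979"

# /health returns 200 once the engine reports itself as initialised.
health = httpx.get(f"{BASE_URL}/health")
print("healthy:", health.status_code == 200)

# /v1/models lists the model currently being served.
models = httpx.get(f"{BASE_URL}/v1/models")
print(models.json())
```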



## Acknowledgements

- Excellent open-source projects: [vLLM](https://github.com/vllm-project/vllm.git), [onnxruntime-genai](https://github.com/microsoft/onnxruntime-genai.git) and many others.
Binary file added asset/ellm_chatbot_vid.webp
Binary file added asset/ellm_modelui.png
69 changes: 69 additions & 0 deletions ellm_api_server.spec
@@ -0,0 +1,69 @@
# -*- mode: python ; coding: utf-8 -*-

from pathlib import Path
from PyInstaller.utils.hooks import collect_all

binaries_list = []

print(Path("src/owl/entrypoints/api.py").resolve().as_posix())

datas_list = [
    (Path("src/embeddedllm/entrypoints/api_server.py").resolve().as_posix(), 'embeddedllm/entrypoints')
]

hiddenimports_list = ['multipart']

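# collect_all() gathers a package's data files, dynamic libraries and hidden imports
# so the ONNX Runtime dependencies below are bundled into the frozen executable.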
def add_package(package_name):
    datas, binaries, hiddenimports = collect_all(package_name)
    datas_list.extend(datas)
    binaries_list.extend(binaries)
    hiddenimports_list.extend(hiddenimports)

add_package('onnxruntime')
add_package('onnxruntime_genai')

print(binaries_list)
with open("binary.txt", 'w') as f:
f.write(str(binaries_list))

a = Analysis(
    ['src\\embeddedllm\\entrypoints\\api_server.py'],
    pathex=[],
    binaries=binaries_list,
    datas=datas_list,
    hiddenimports=hiddenimports_list,
    hookspath=[],
    hooksconfig={},
    runtime_hooks=[],
    excludes=[],
    noarchive=False,
    optimize=0,
)
pyz = PYZ(a.pure)

exe = EXE(
    pyz,
    a.scripts,
    [],
    exclude_binaries=True,
    name='ellm_api_server',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,
    upx=True,
    console=True,
    disable_windowed_traceback=False,
    argv_emulation=False,
    target_arch=None,
    codesign_identity=None,
    entitlements_file=None,
)
coll = COLLECT(
    exe,
    a.binaries,
    a.datas,
    strip=False,
    upx=True,
    upx_exclude=[],
    name='ellm_api_server',
)
14 changes: 7 additions & 7 deletions scripts/python/httpx_client_stream.py
@@ -1,17 +1,17 @@
import asyncio

import httpx
import json

def parse_stream(stream:str):
import httpx

    stream = stream.replace('data: ', '')

def parse_stream(stream: str):
stream = stream.replace("data: ", "")
response_obj = json.loads(stream)
# print(response_obj)

return response_obj


async def stream_chat_completion(url: str, payload: dict):
    async with httpx.AsyncClient() as client:
        async with client.stream("POST", url, json=payload) as response:
@@ -22,9 +22,9 @@ async def stream_chat_completion(url: str, payload: dict):
if "[DONE]" in decodes_stream:
continue
resp = parse_stream(decodes_stream)
if resp["choices"][0]["delta"].get('content', None):
print(resp["choices"][0]["delta"]["content"], end='', flush=True)
if resp["choices"][0]["delta"].get("content", None):
print(resp["choices"][0]["delta"]["content"], end="", flush=True)

# time.sleep(1)
else:
print(f"Error: {response.status_code}")
9 changes: 4 additions & 5 deletions scripts/python/httpx_client_vision.py
@@ -1,11 +1,10 @@
import httpx
import os
import base64
import mimetypes
import os

import httpx

from embeddedllm.protocol import (
    CustomChatCompletionMessageParam,
)
from embeddedllm.protocol import CustomChatCompletionMessageParam


def chat_completion(url: str, payload: dict):
3 changes: 2 additions & 1 deletion scripts/python/httpx_client_vision_stream.py
@@ -1,7 +1,8 @@
import asyncio
import os
import base64
import mimetypes
import os

import httpx


3 changes: 2 additions & 1 deletion scripts/python/litellm_vision_client.py
@@ -1,8 +1,9 @@
import litellm
import base64
import mimetypes
import os

import litellm

current_file_path = os.path.abspath(__file__)
IMAGE_PATH = os.path.join(os.path.dirname(current_file_path), "..", "images", "catdog.png")

3 changes: 2 additions & 1 deletion scripts/python/openai_vision_client.py
@@ -1,9 +1,10 @@
from openai import AsyncOpenAI
import asyncio
import base64
import mimetypes
import os

from openai import AsyncOpenAI

current_file_path = os.path.abspath(__file__)
IMAGE_PATH = os.path.join(os.path.dirname(current_file_path), "..", "images", "catdog.png")

3 changes: 2 additions & 1 deletion setup.py
@@ -1,8 +1,8 @@
import io
import os
import platform
import re
from typing import List
import platform

from setuptools import find_packages, setup

@@ -140,6 +140,7 @@ def get_ellm_version() -> str:
"console_scripts": [
"ellm_server=embeddedllm.entrypoints.api_server:main",
"ellm_chatbot=embeddedllm.entrypoints.webui:main",
"ellm_modelui=embeddedllm.entrypoints.modelui:main",
],
},
)
6 changes: 4 additions & 2 deletions src/embeddedllm/entrypoints/api_server.py
@@ -48,8 +48,10 @@ async def validation_exception_handler(_, exc):
@app.get("/health")
async def health() -> Response:
"""Health check."""
await openai_chat_server.engine.check_health()
return Response(status_code=200)
if openai_chat_server.check_health():
return Response(status_code=200)
else:
return Response(status_code=500)


@app.get("/v1/models")
6 changes: 6 additions & 0 deletions src/embeddedllm/entrypoints/chat_server.py
@@ -68,6 +68,7 @@ def __init__(
        chat_template: Optional[str] = None,
        vision: Optional[bool] = False,
    ):
        self.initialized_flag = False
        self.model_path = model_path
        self.served_model_name = served_model_name
        self.response_role = response_role
@@ -77,6 +78,8 @@ def __init__(
        self.tokenizer = self.engine.tokenizer
        self._load_chat_template(chat_template)

        self.initialized_flag = True

    def _load_chat_template(self, chat_template: Optional[str]):
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast] = self.tokenizer

@@ -533,3 +536,6 @@ async def _check_model(
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND,
)

def check_health(self):
return self.initialized_flag