CHANGELOG.md: 23 additions & 2 deletions
@@ -1,17 +1,38 @@
# CHANGELOG

-## [Unrelease]
+## [Unreleased]

### ADDED

Engine

- Added OpenVINO support. #19

### CHANGES / FIXES

IPEX-LLM Engine

- Fixed model generation not adhering to the `max_tokens` parameter. #20

## [v0.2.0a]

### ADDED

DOC

- Update `README.md` to include usage of the precompiled engine executable.

### CHANGES / FIXES

Installation

- Fixed the `ipex-llm` PyPI library version.

Engine

- Re-structured the configuration to specify which backend and device to use when launching the `ipex-llm` model.
- Fixed non-streaming mode of ONNX returning the prompt in the response. #12

PyInstaller Executable
-- Update the `ellm_api_server.spec` to support compilation of `ipex-llm` into an executable. #14
+
+- Update the `ellm_api_server.spec` to support compilation of `ipex-llm` into an executable. #14
README.md: 25 additions & 1 deletion
@@ -69,11 +69,13 @@ Run local LLMs on iGPU, APU, and CPU (AMD, Intel, and Qualcomm (Coming Soon)).
- **CPU:** `$env:ELLM_TARGET_DEVICE='cpu'; pip install -e .[cpu]`
- **CUDA:** `$env:ELLM_TARGET_DEVICE='cuda'; pip install -e .[cuda]`
- **IPEX:** `$env:ELLM_TARGET_DEVICE='ipex'; python setup.py develop`
- **OpenVINO:** `$env:ELLM_TARGET_DEVICE='openvino'; pip install -e .[openvino]`
- **With Web UI**:
- **DirectML:** `$env:ELLM_TARGET_DEVICE='directml'; pip install -e .[directml,webui]`
- **CPU:** `$env:ELLM_TARGET_DEVICE='cpu'; pip install -e .[cpu,webui]`
- **CUDA:** `$env:ELLM_TARGET_DEVICE='cuda'; pip install -e .[cuda,webui]`
- **IPEX:** `$env:ELLM_TARGET_DEVICE='ipex'; python setup.py develop; pip install -r requirements-webui.txt`
- **OpenVINO:** `$env:ELLM_TARGET_DEVICE='openvino'; pip install -e .[openvino,webui]`

- **Linux**

@@ -88,11 +90,13 @@ Run local LLMs on iGPU, APU, and CPU (AMD, Intel, and Qualcomm (Coming Soon)).
- **CPU:** `ELLM_TARGET_DEVICE='cpu' pip install -e .[cpu]`
- **CUDA:** `ELLM_TARGET_DEVICE='cuda' pip install -e .[cuda]`
- **IPEX:** `ELLM_TARGET_DEVICE='ipex' python setup.py develop`
- **OpenVINO:** `ELLM_TARGET_DEVICE='openvino' pip install -e .[openvino]`
- **With Web UI**:
- **DirectML:** `ELLM_TARGET_DEVICE='directml' pip install -e .[directml,webui]`
- **CPU:** `ELLM_TARGET_DEVICE='cpu' pip install -e .[cpu,webui]`
- **CUDA:** `ELLM_TARGET_DEVICE='cuda' pip install -e .[cuda,webui]`
-- **IPEX:** `$env:ELLM_TARGET_DEVICE='ipex'; python setup.py develop; pip install -r requirements-webui.txt`
+- **IPEX:** `ELLM_TARGET_DEVICE='ipex' python setup.py develop; pip install -r requirements-webui.txt`
- **OpenVINO:** `ELLM_TARGET_DEVICE='openvino' pip install -e .[openvino,webui]`

### Launch OpenAI API Compatible Server

@@ -131,12 +135,29 @@ It is an interface that allows you to download and deploy OpenAI API compatible

## Compile OpenAI-API Compatible Server into Windows Executable

**NOTE:** OpenVINO packaging currently uses `torch==2.4.0`. The compiled executable will not run out of the box because of a missing dependency, `libomp`. Make sure to install `libomp` and add the `libomp-xxxxxxx.dll` to `C:\Windows\System32`.
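A sketch of that manual step (the DLL name and source path below are placeholders; use the ones from your `libomp` installation):

```powershell
# Copy the libomp runtime into System32 so the compiled executable can load it.
# Run from an elevated PowerShell; replace the source path with your own.
Copy-Item '<path\to>\libomp-xxxxxxx.dll' 'C:\Windows\System32\'
```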

1. Install `embeddedllm`.
2. Install PyInstaller: `pip install pyinstaller==6.9.0`.
3. Compile Windows Executable: `pyinstaller .\ellm_api_server.spec`.
4. You can find the executable in the `dist\ellm_api_server` directory.
5. Use it like `ellm_server`: `.\ellm_api_server.exe --model_path <path/to/model/weight>`.

_PowerShell/Terminal Usage_:

```powershell
ellm_server --model_path <path/to/model/weight>

# DirectML
ellm_server --model_path 'EmbeddedLLM_Phi-3-mini-4k-instruct-062024-onnx\onnx\directml\Phi-3-mini-4k-instruct-062024-int4' --port 5555

# IPEX-LLM
ellm_server --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'ipex' --device 'xpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'

# OpenVINO
ellm_server --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'openvino' --device 'gpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'
```
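Since the server exposes the OpenAI API, you can smoke-test it with a plain HTTP request; a minimal sketch (assuming port 5555, the served model name from the examples above, and the standard `/v1/chat/completions` route):

```powershell
# Minimal chat-completion request against the OpenAI-compatible endpoint.
$body = @{
    model    = 'meta-llama_Meta/Llama-3.1-8B-Instruct'
    messages = @(@{ role = 'user'; content = 'Hello' })
} | ConvertTo-Json -Depth 5

Invoke-RestMethod -Uri 'http://localhost:5555/v1/chat/completions' `
    -Method Post -ContentType 'application/json' -Body $body
```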

## Prebuilt OpenAI API Compatible Windows Executable (Alpha)

You can find the prebuilt OpenAI API Compatible Windows Executable on the Releases page.
@@ -151,6 +172,9 @@ _PowerShell/Terminal Usage (Use it like `ellm_server`)_:

# IPEX-LLM
.\ellm_api_server.exe --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'ipex' --device 'xpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'

# OpenVINO
.\ellm_api_server.exe --model_path '.\meta-llama_Meta-Llama-3.1-8B-Instruct\' --backend 'openvino' --device 'gpu' --port 5555 --served_model_name 'meta-llama_Meta/Llama-3.1-8B-Instruct'
```

## Acknowledgements
ellm_api_server.spec: 22 additions & 7 deletions
@@ -20,7 +20,7 @@ def get_embeddedllm_backend():
    version = importlib.metadata.version("embeddedllm")

    # Use regex to extract the backend
-    match = re.search(r"\+(directml|cpu|cuda|ipex)$", version)
+    match = re.search(r"\+(directml|cpu|cuda|ipex|openvino)$", version)

    if match:
        backend = match.group(1)
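The backend tag is the PEP 440 local version suffix that `setup.py` appends (see the `get_ellm_version` hunk further down); a small sketch of the convention this regex relies on:

```python
import re

# setup.py appends the target backend as a local version suffix,
# e.g. "0.2.0+openvino"; the spec recovers it with this regex.
for version in ("0.2.0+directml", "0.2.0+openvino"):
    match = re.search(r"\+(directml|cpu|cuda|ipex|openvino)$", version)
    print(match.group(1))  # -> directml, then openvino
```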
@@ -36,18 +36,17 @@ backend = get_embeddedllm_backend()

binaries_list = []

binaries_list.extend([
    (Path('C:\\Windows\\System32\\libomp140.x86_64.dll').as_posix(), '.'),
    (Path('C:\\Windows\\System32\\libomp140d.x86_64.dll').as_posix(), '.'),
])

datas_list = [
    (Path("src/embeddedllm/entrypoints/api_server.py").resolve().as_posix(), 'embeddedllm/entrypoints'),
]
datas_list.extend(collect_data_files('torch', include_py_files=True))

hiddenimports_list = ['multipart']
# Add missing hidden imports
#hiddenimports_list.extend([
# 'torch', 'torchvision', 'intel_extension_for_pytorch',
# 'intel_extension_for_pytorch.xpu', 'intel_extension_for_pytorch.xpu.fp8',
# 'intel_extension_for_pytorch.nn.utils'
#])

pathex = []

@@ -60,6 +59,7 @@ def add_package(package_name):
if backend in ('directml', 'cpu', 'cuda'):
    add_package('onnxruntime')
    add_package('onnxruntime_genai')

elif backend == 'ipex':
    print(f"Backend IPEX")
    add_package('ipex_llm')
@@ -71,6 +71,21 @@ elif backend == 'ipex':
    add_package('numpy')
    binaries_list.append((f'{CONDA_PATH.parent}/Library/bin/*', '.'))

elif backend == 'openvino':
    print("Backend OpenVINO")
    add_package('onnx')
    add_package('torch')
    add_package('torchvision')
    add_package('optimum')
    add_package('optimum.intel')
    add_package('embeddedllm')
    add_package('numpy')
    add_package('openvino')
    add_package('openvino-genai')
    add_package('openvino-telemetry')
    add_package('openvino-tokenizers')
    binaries_list.append((f'{CONDA_PATH.parent}/Library/bin/*', '.'))
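The body of `add_package` is collapsed in this diff; a plausible sketch of such a helper built on PyInstaller's hook utilities (an assumption for illustration, not necessarily the repository's exact code):

```python
from PyInstaller.utils.hooks import collect_all

def add_package(package_name):
    # collect_all returns (datas, binaries, hiddenimports) for a package;
    # merge them into the spec-level lists consumed by Analysis.
    datas, binaries, hiddenimports = collect_all(package_name)
    datas_list.extend(datas)
    binaries_list.extend(binaries)
    hiddenimports_list.extend(hiddenimports)
```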

print(binaries_list)

with open("binary.txt", 'w') as f:
requirements-openvino.txt: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
optimum-intel[openvino,nncf]@git+https://github.com/huggingface/optimum-intel.git
torch>=2.4
onnx<=1.16.1
transformers>=4.42
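These pins become the package's base requirements when `ELLM_TARGET_DEVICE='openvino'` (see the `setup.py` hunks below); a quick sanity check after an editable install, as a sketch:

```powershell
# Install with the OpenVINO target, then confirm the pinned packages resolved.
$env:ELLM_TARGET_DEVICE='openvino'; pip install -e .[openvino]
pip show optimum-intel torch onnx transformers
```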
setup.py: 12 additions & 1 deletion
@@ -50,6 +50,10 @@ def _is_ipex() -> bool:
    return ELLM_TARGET_DEVICE == "ipex"


def _is_openvino() -> bool:
    return ELLM_TARGET_DEVICE == "openvino"


class ELLMInstallCommand(install):
    def run(self):
        install.run(self)
@@ -157,7 +161,9 @@ def find_version(filepath: str) -> str:

def _read_requirements(filename: str) -> List[str]:
    with open(get_path(filename)) as f:
-        requirements = f.read().strip().split("\n")
+        # requirements = f.read().strip().split("\n")
+        requirements = f.readlines()

    resolved_requirements = []
    for line in requirements:
        if line.startswith("-r "):
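The `-r` branch (collapsed here) inlines nested requirement files; a sketch of the idea under a hypothetical name (the repository's actual implementation may differ):

```python
from typing import List

def _read_requirements_sketch(filename: str) -> List[str]:
    # Read a requirements file, recursively expanding "-r other.txt" includes.
    resolved: List[str] = []
    with open(filename) as f:
        for line in f.readlines():
            line = line.strip()
            if line.startswith("-r "):
                resolved.extend(_read_requirements_sketch(line.split(maxsplit=1)[1]))
            elif line and not line.startswith("#"):
                resolved.append(line)
    return resolved
```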
@@ -178,6 +184,8 @@ def get_requirements() -> List[str]:
        requirements = _read_requirements("requirements-cpu.txt")
    elif _is_ipex():
        requirements = _read_requirements("requirements-ipex.txt")
    elif _is_openvino():
        requirements = _read_requirements("requirements-openvino.txt")
    else:
        raise ValueError("Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
    return requirements
@@ -194,6 +202,8 @@ def get_ellm_version() -> str:
        version += "+cpu"
    elif _is_ipex():
        version += "+ipex"
    elif _is_openvino():
        version += "+openvino"
    else:
        raise RuntimeError("Unknown runtime environment")

@@ -245,6 +255,7 @@ def get_ellm_version() -> str:
        "webui": _read_requirements("requirements-webui.txt"),
        "cuda": ["onnxruntime-genai-cuda==0.3.0rc2"],
        "ipex": [],
        "openvino": [],
    },
    dependency_links=dependency_links,
    entry_points={