From 40a90c040171fdd5b609780a6833d7f303713dc1 Mon Sep 17 00:00:00 2001 From: Caleb John <45307388+calebjohn24@users.noreply.github.com> Date: Thu, 9 Jan 2025 02:22:54 -0800 Subject: [PATCH] Add torch model and GPU support. (#195) * Setup TorchVL * fix build, setup cpu, gpu, cloud variants * remove torch files * update .gitignore * update tests * fix torchVL class and add streaming tests * format * remove space in torch max tokens * format * Add black install * fix cloud test * isort imports * fix test build ci * Remove build py, update readme * update readme * Add back torch deps * update readme * update readme * update readme --- .github/workflows/python-client-tests.yml | 4 +- .gitignore | 1 + clients/python/README.md | 26 ++++- clients/python/build.py | 93 --------------- clients/python/moondream/__init__.py | 15 ++- clients/python/moondream/torch_vl.py | 110 ++++++++++++++++++ clients/python/pyproject.toml | 55 +++++---- clients/python/tests/test_api_inference.py | 2 - clients/python/tests/test_local_inference.py | 3 +- .../tests/test_local_torch_inference.py | 84 +++++++++++++ moondream/torch/sample.py | 1 - 11 files changed, 271 insertions(+), 123 deletions(-) delete mode 100644 clients/python/build.py create mode 100644 clients/python/moondream/torch_vl.py create mode 100644 clients/python/tests/test_local_torch_inference.py diff --git a/.github/workflows/python-client-tests.yml b/.github/workflows/python-client-tests.yml index 5e50ad96..d63ff7d2 100644 --- a/.github/workflows/python-client-tests.yml +++ b/.github/workflows/python-client-tests.yml @@ -40,12 +40,12 @@ jobs: - name: Install dependencies working-directory: ./clients/python run: | - python -m pip install --upgrade pip poetry install --all-extras - name: Format code working-directory: ./clients/python run: | + poetry run pip install black poetry run black tests/test_local_inference.py --check - name: Run tests @@ -54,4 +54,4 @@ jobs: MOONDREAM_API_KEY: ${{ secrets.MOONDREAM_API_KEY }} run: | poetry run pip install pytest pytest-asyncio - poetry run pytest tests/test_*.py -v + poetry run pytest tests/test_api_inference.py -v diff --git a/.gitignore b/.gitignore index 098f5493..c1b5e0bf 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ data /pyproject.toml poetry.lock dist +clients/python/moondream/torch diff --git a/clients/python/README.md b/clients/python/README.md index 422095c0..df0c3f08 100644 --- a/clients/python/README.md +++ b/clients/python/README.md @@ -17,7 +17,23 @@ local inference and cloud-based API access. 
Install the package from PyPI:
 
 ```bash
-pip install moondream==0.0.5
+pip install moondream==0.0.6
+```
+
+To install the CPU dependencies for local inference, run:
+
+```bash
+pip install "moondream[cpu]"
+```
+
+To install the GPU dependencies for local inference, run:
+
+```bash
+# From the root of the moondream repo, copy the torch implementation into the client package
+cp -r moondream/torch clients/python/moondream/torch
+
+# Install the GPU dependencies
+pip install "moondream[gpu]"
 ```
 
 ## Quick Start
@@ -160,7 +176,13 @@ All methods return typed dictionaries:
 - CUDA (GPU) and MPS (Apple Silicon) support coming soon
 - For optimal performance with GPU/MPS, use the PyTorch implementation for now
 
+## Development Notes
+
+- Copy the torch implementation from the root moondream repo into the client's `moondream/torch` directory
+- Run `poetry install --extras "gpu"` to install the GPU dependencies
+- Run `poetry install --extras "cpu"` to install the CPU dependencies
+
 ## Links
 
 - [Website](https://moondream.ai/)
-- [Demo](https://moondream.ai/playground)
+- [Demo](https://moondream.ai/playground)
\ No newline at end of file
diff --git a/clients/python/build.py b/clients/python/build.py
deleted file mode 100644
index 460a7e52..00000000
--- a/clients/python/build.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import toml
-import sys
-import shutil
-from pathlib import Path
-
-BASE_CONFIG = {
-    "tool": {
-        "poetry": {
-            "version": "0.0.2",
-            "description": "Python client library for moondream",
-            "authors": ["vik "],
-            "readme": "README.md",
-            # Explicitly declare where to find the package source
-            "packages": [{"include": "moondream", "from": "."}],  # for GPU variant
-            "dependencies": {
-                "python": "^3.10",
-                "pillow": "^10.4.0",
-                "numpy": "^2.1.2",
-                "tokenizers": "^0.20.1",
-            },
-            "scripts": {"moondream": "moondream.cli:main"},
-        },
-        "pyright": {
-            "venvPath": ".",
-            "venv": ".venv",
-            "reportMissingParameterType": False,
-        },
-    },
-    "build-system": {
-        "requires": ["poetry-core"],
-        "build-backend": "poetry.core.masonry.api",
-    },
-}
-
-
-def build(variant):
-    config = BASE_CONFIG.copy()
-    src_dir = Path("moondream")
-
-    if variant == "gpu":
-        package_name = "moondream-gpu"
-        config["tool"]["poetry"]["name"] = package_name
-        config["tool"]["poetry"]["dependencies"]["onnxruntime-gpu"] = "^1.20.0"
-
-        # Create package directory
-        target_dir = Path(package_name)
-        if target_dir.exists():
-            shutil.rmtree(target_dir)
-
-        # Create directory and copy files
-        target_dir.mkdir(exist_ok=True)
-
-        # Ensure __init__.py exists (fix the filename)
-        # init_file = target_dir / "__init__.py"
-        # if not init_file.exists():
-        #     init_file.touch()
-
-        # Copy all python files
-        for py_file in src_dir.glob("*.py"):
-            shutil.copy2(py_file, target_dir)
-
-    elif variant == "cpu":
-        package_name = "moondream"
-        config["tool"]["poetry"]["name"] = package_name
-        config["tool"]["poetry"]["dependencies"]["onnxruntime"] = "^1.19.2"
-        config["tool"]["poetry"]["packages"] = [{"include": "moondream", "from": "."}]
-
-    else:
-        print(f"Unknown variant: {variant}")
-        print("Usage: python build.py [cpu|gpu]")
-        sys.exit(1)
-
-    import copy
-
-    config = copy.deepcopy(config)
-
-    # Write the configuration
-    toml_content = toml.dumps(config)
-    print("Generated pyproject.toml content:")
-    print(toml_content)  # Debug output
-    Path("pyproject.toml").write_text(toml_content)
-
-    print(f"Built configuration for {variant} variant")
-    return package_name
-
-
-if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print("Usage: python build.py [cpu|gpu]")
sys.exit(1) - - package_name = build(sys.argv[1]) - print(f"Successfully created {package_name} configuration") diff --git a/clients/python/moondream/__init__.py b/clients/python/moondream/__init__.py index feb01791..46bd3235 100644 --- a/clients/python/moondream/__init__.py +++ b/clients/python/moondream/__init__.py @@ -1,7 +1,6 @@ from typing import Optional from .cloud_vl import CloudVL -from .onnx_vl import OnnxVL from .types import VLM DEFAULT_API_URL = "https://api.moondream.ai/v1" @@ -14,7 +13,19 @@ def vl( api_url: Optional[str] = None, ) -> VLM: if model: - return OnnxVL.from_path(model) + model_filetype = model.split(".")[-1] + if model_filetype == "safetensors": + from .torch_vl import TorchVL + + return TorchVL(model=model) + elif model_filetype == "mf": + from .onnx_vl import OnnxVL + + return OnnxVL.from_path(model) + + raise ValueError( + "Unsupported model filetype. Please use a .safetensors model for GPU use or .mf model for CPU use." + ) if api_key: if not api_url: diff --git a/clients/python/moondream/torch_vl.py b/clients/python/moondream/torch_vl.py new file mode 100644 index 00000000..31e653fa --- /dev/null +++ b/clients/python/moondream/torch_vl.py @@ -0,0 +1,110 @@ +from typing import Literal, Optional, Union + +import torch +from PIL import Image + +from .torch.moondream import MoondreamConfig, MoondreamModel +from .torch.weights import load_weights_into_model +from .types import ( + VLM, + Base64EncodedImage, + CaptionOutput, + DetectOutput, + EncodedImage, + PointOutput, + QueryOutput, + SamplingSettings, +) +from .version import __version__ + + +class TorchVL(VLM): + def __init__( + self, + *, + model: str, + ): + config = MoondreamConfig() + self.model = MoondreamModel(config) + load_weights_into_model(model, self.model) + self.model.eval() + # Move model to the appropriate device + if torch.cuda.is_available(): + self.device = "cuda" + elif torch.backends.mps.is_available(): + self.device = "mps" + else: + self.device = "cpu" + self.model.to(self.device) + + def encode_image( + self, image: Union[Image.Image, EncodedImage] + ) -> Base64EncodedImage: + if isinstance(image, EncodedImage): + assert type(image) == Base64EncodedImage + return image + + if not self.model: + raise ValueError("No local model loaded") + + return self.model.encode_image(image) + + def caption( + self, + image: Union[Image.Image, EncodedImage], + length: Literal["normal", "short"] = "normal", + stream: bool = False, + settings: Optional[SamplingSettings] = None, + ) -> CaptionOutput: + if not self.model: + raise ValueError("No local model loaded") + + encoded_image = ( + self.model.encode_image(image) if isinstance(image, Image.Image) else image + ) + return self.model.caption( + encoded_image, length=length, stream=stream, settings=settings + ) + + def query( + self, + image: Union[Image.Image, EncodedImage], + question: str, + stream: bool = False, + settings: Optional[SamplingSettings] = None, + ) -> QueryOutput: + if not self.model: + raise ValueError("No local model loaded") + + encoded_image = ( + self.model.encode_image(image) if isinstance(image, Image.Image) else image + ) + return self.model.query( + encoded_image, question, stream=stream, settings=settings + ) + + def detect( + self, + image: Union[Image.Image, EncodedImage], + object: str, + ) -> DetectOutput: + if not self.model: + raise ValueError("No local model loaded") + + encoded_image = ( + self.model.encode_image(image) if isinstance(image, Image.Image) else image + ) + return self.model.detect(encoded_image, object) 
+
+    def point(
+        self,
+        image: Union[Image.Image, EncodedImage],
+        object: str,
+    ) -> PointOutput:
+        if not self.model:
+            raise ValueError("No local model loaded")
+
+        encoded_image = (
+            self.model.encode_image(image) if isinstance(image, Image.Image) else image
+        )
+        return self.model.point(encoded_image, object)
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index 635be1ea..700b119c 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -1,32 +1,47 @@
+[build-system]
+requires = [ "poetry-core",]
+build-backend = "poetry.core.masonry.api"
+
 [tool.poetry]
 name = "moondream"
-version = "0.0.5"
+version = "0.0.6"
 description = "Python client library for moondream"
-authors = ["vik "]
+authors = [ "M87 Labs ",]
 readme = "README.md"
+[[tool.poetry.packages]]
+include = "moondream"
+from = "."
+
+[tool.pyright]
+venvPath = "."
+venv = ".venv"
+reportMissingParameterType = false
 
 [tool.poetry.dependencies]
 python = "^3.10"
 pillow = "^10.4.0"
-onnxruntime = "^1.19.2"
 numpy = "^2.1.2"
-onnx = "^1.17.0"
-tokenizers = "^0.20.1"
+onnxruntime = { version = ">=1.19.2", optional = true }
+tokenizers = { version = ">=0.20.1", optional = true }
+torch = { version = ">=2.5.0", optional = true }
+safetensors = { version = ">=0.4.2", optional = true }
+einops = { version = ">=0.7.0", optional = true }
+pyvips-binary = { version = ">=8.16.0", optional = true }
+pyvips = { version = ">=2.2.1", optional = true }
+
+[tool.poetry.extras]
+cpu = [
+    "onnxruntime",
+    "tokenizers"
+]
+gpu = [
+    "torch",
+    "safetensors",
+    "einops",
+    "pyvips-binary",
+    "pyvips",
+    "tokenizers"
+]
 
 [tool.poetry.scripts]
 moondream = "moondream.cli:main"
-
-[tool.poetry.group.dev.dependencies]
-pytest = "^8.3.4"
-pytest-asyncio = "^0.25.1"
-requests = "^2.32.3"
-black = "^24.10.0"
-
-[tool.pyright]
-venvPath = "."
-venv = ".venv"
-reportMissingParameterType = false
-
-[build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
diff --git a/clients/python/tests/test_api_inference.py b/clients/python/tests/test_api_inference.py
index 36ba51d0..bcc58dcc 100644
--- a/clients/python/tests/test_api_inference.py
+++ b/clients/python/tests/test_api_inference.py
@@ -28,7 +28,6 @@ def test_api_initialization(model):
     assert isinstance(model, md.cloud_vl.CloudVL)
 
 
-@pytest.mark.skip(reason="API returning 502 errors, needs investigation")
 def test_image_captioning(model, test_image):
     # Test normal length caption
     result = model.caption(test_image, length="normal")
@@ -56,7 +55,6 @@ def test_streaming_caption(model, test_image):
     assert len(caption) > 0
 
 
-@pytest.mark.skip(reason="API returning 502 errors, needs investigation")
 def test_query_answering(model, test_image):
     # Test basic question answering
     result = model.query(test_image, "What is in this image?")
diff --git a/clients/python/tests/test_local_inference.py b/clients/python/tests/test_local_inference.py
index d0da8d10..9ae4a24b 100644
--- a/clients/python/tests/test_local_inference.py
+++ b/clients/python/tests/test_local_inference.py
@@ -109,6 +109,7 @@ def test_invalid_caption_length(model, test_image):
 def test_invalid_model_path():
     with pytest.raises(
-        ValueError, match="Model path is invalid or file does not exist"
+        ValueError,
+        match="Unsupported model filetype.
Please use a .safetensors model for GPU use or .mf model for CPU use.",
     ):
         md.vl(model="invalid/path/to/model.bin")
diff --git a/clients/python/tests/test_local_torch_inference.py b/clients/python/tests/test_local_torch_inference.py
new file mode 100644
index 00000000..d42f0b76
--- /dev/null
+++ b/clients/python/tests/test_local_torch_inference.py
@@ -0,0 +1,84 @@
+import os
+import pytest
+from PIL import Image
+import moondream as md
+
+MODEL_PATH = os.environ.get("MOONDREAM_MODEL_PATH", "moondream-01-08-2025.safetensors")  # weights path; env var override instead of a hardcoded developer path
+TEST_IMAGE_PATH = os.path.join(
+    os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),
+    "assets",
+    "demo-1.jpg",
+)
+
+
+@pytest.fixture
+def model():
+    return md.vl(model=MODEL_PATH)
+
+
+@pytest.fixture
+def test_image():
+    return Image.open(TEST_IMAGE_PATH)
+
+
+def test_image_captioning(model, test_image):
+    # Test normal length caption
+    result = model.caption(test_image, length="normal")
+    assert "caption" in result
+    assert isinstance(result["caption"], str)
+    assert len(result["caption"]) > 0
+
+    # Test short length caption
+    result = model.caption(test_image, length="short")
+    assert "caption" in result
+    assert isinstance(result["caption"], str)
+    assert len(result["caption"]) > 0
+
+    # Test streaming caption
+    result = model.caption(test_image, stream=True)
+    assert "caption" in result
+
+    # Test that we can iterate over the stream
+    num_chunks = 0
+    caption = ""
+    for chunk in result["caption"]:
+        assert isinstance(chunk, str)
+        caption += chunk
+        num_chunks += 1
+
+    assert len(caption) > 0
+    assert num_chunks > 1
+
+
+def test_query(model, test_image):
+    result = model.query(test_image, "What is in this image?")
+    assert "answer" in result
+    assert isinstance(result["answer"], str)
+    assert len(result["answer"]) > 0
+
+    # Test streaming query
+    result = model.query(test_image, "What is in this image?", stream=True)
+    assert "answer" in result
+
+    # Test that we can iterate over the stream
+    num_chunks = 0
+    answer = ""
+    for chunk in result["answer"]:
+        assert isinstance(chunk, str)
+        answer += chunk
+        num_chunks += 1
+
+    assert num_chunks > 1
+    assert len(answer) > 0
+
+
+def test_detect(model, test_image):
+    result = model.detect(test_image, "person")
+    assert "objects" in result
+    assert isinstance(result["objects"], list)
+
+
+def test_point(model, test_image):
+    result = model.point(test_image, "face")
+    assert "points" in result
+    assert isinstance(result["points"], list)
diff --git a/moondream/torch/sample.py b/moondream/torch/sample.py
index 54ca4133..327356f6 100644
--- a/moondream/torch/sample.py
+++ b/moondream/torch/sample.py
@@ -45,7 +45,6 @@
     if not args.benchmark:
         encoded_image = model.encode_image(image)
 
-        print("Caption: short")
         for t in model.caption(encoded_image, "short", stream=True)["caption"]:
             print(t, end="", flush=True)
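
Usage sketch of the dispatch this patch adds in `clients/python/moondream/__init__.py`: `md.vl()` now routes `.safetensors` paths to the new torch backend (`TorchVL`, which selects CUDA, MPS, or CPU automatically) and `.mf` paths to the existing ONNX backend (`OnnxVL`). This is a minimal sketch; the weight and image file names below are placeholders, not files shipped with the package.

```python
import moondream as md
from PIL import Image

# A .safetensors path loads the torch backend added in this patch;
# TorchVL moves the model to cuda, mps, or cpu on its own. (Placeholder file name.)
torch_model = md.vl(model="moondream-01-08-2025.safetensors")

# A .mf path keeps the existing ONNX CPU backend. (Placeholder file name.)
onnx_model = md.vl(model="moondream-latest.mf")

image = Image.open("demo-1.jpg")

# Non-streaming calls return the finished string.
print(torch_model.caption(image, length="short")["caption"])
print(torch_model.query(image, "What is in this image?")["answer"])

# stream=True yields the output incrementally, as the new tests exercise.
for chunk in torch_model.caption(image, stream=True)["caption"]:
    print(chunk, end="", flush=True)

# Any other extension raises ValueError("Unsupported model filetype. ...").
```

The same calls work against either backend, since both `TorchVL` and `OnnxVL` implement the shared `VLM` interface.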