From fc49f314c4f2d4089d8865fef700bc4e600bbf01 Mon Sep 17 00:00:00 2001
From: Caleb John <45307388+calebjohn24@users.noreply.github.com>
Date: Thu, 5 Dec 2024 14:48:40 -0800
Subject: [PATCH] Add node support for local sidecar server and request types
 (#165)

* Add support for local sidecar server

* Remove un-needed types

* Update README.MD

* Update README.MD API Constructor Docs

* Update README.MD examples

* Fix default port in tests

* Update README types

* Update README typo
---
 clients/node/README.MD                        | 118 +++++++--
 .../__tests__/moondream.integration.test.ts   | 232 +++++++++++++++++-
 clients/node/src/__tests__/moondream.test.ts  |  50 +++-
 clients/node/src/moondream.ts                 |  59 ++---
 clients/node/src/types.ts                     |  38 ++-
 5 files changed, 430 insertions(+), 67 deletions(-)
diff --git a/clients/node/README.MD b/clients/node/README.MD
index f10e675b..6faa4942 100644
--- a/clients/node/README.MD
+++ b/clients/node/README.MD
@@ -21,8 +21,9 @@ yarn add moondream
 ## Quick Start
 
 Before using this client library, you'll need an API key to access Moondream's hosted service.
-You can get a free API key from [console.moondream.ai](https://console.moondream.ai). Currently
-local inference is only available in Python, but Node.js support is coming very soon.
+You can get a free API key from [console.moondream.ai](https://console.moondream.ai).
+
+### Cloud
 
 ```javascript
 import { vl } from "moondream";
@@ -39,15 +40,72 @@ const image = fs.readFileSync("path/to/image.jpg");
 // Basic usage examples
 async function main() {
   // Generate a caption for the image
-  const caption = await model.caption(image);
+  const caption = await model.caption({
+    image: image,
+    length: "normal",
+    stream: false
+  });
   console.log("Caption:", caption);
 
   // Ask a question about the image
-  const answer = await model.query(image, "What's in this image?");
+  const answer = await model.query({
+    image: image,
+    question: "What's in this image?",
+    stream: false
+  });
   console.log("Answer:", answer);
 
   // Stream the response
-  const stream = await model.caption(image, "normal", true);
+  const stream = await model.caption({
+    image: image,
+    length: "normal",
+    stream: true
+  });
+  for await (const chunk of stream.caption) {
+    process.stdout.write(chunk);
+  }
+}
+
+main();
+```
+
+### Local Inference
+
+- Install the `moondream` CLI: `pip install moondream`
+- Run the local server: `moondream serve --model <path-to-model>`
+- Set the `apiUrl` parameter to the URL of the local server (by default the server listens on `http://localhost:3475`); no `apiKey` is needed when `apiUrl` is set
+
+```javascript
+const model = new vl({
+  apiUrl: "http://localhost:3475",
+});
+
+const image = fs.readFileSync("path/to/image.jpg");
+
+// Basic usage examples
+async function main() {
+  // Generate a caption for the image
+  const caption = await model.caption({
+    image: image,
+    length: "normal",
+    stream: false
+  });
+  console.log("Caption:", caption);
+
+  // Ask a question about the image
+  const answer = await model.query({
+    image: image,
+    question: "What's in this image?",
+    stream: false
+  });
+  console.log("Answer:", answer);
+
+  // Stream the response
+  const stream = await model.caption({
+    image: image,
+    length: "normal",
+    stream: true
+  });
   for await (const chunk of stream.caption) {
     process.stdout.write(chunk);
   }
@@ -68,47 +126,77 @@ main();
 ### Constructor
 
 ```javascript
+// for cloud inference
 const model = new vl({
   apiKey: "your-api-key",
 });
+
+// or for local inference
+const model = new vl({
+  apiUrl: "http://localhost:3475",
+});
 ```
 
 ### Methods
 
-#### caption(image, length?, stream?, settings?)
+#### caption({ image: Buffer | Base64EncodedImage, length?: Length, stream?: boolean })
 
 Generate a caption for an image.
 
 ```javascript
-const result = await model.caption(image, "normal", false);
+const result = await model.caption({
+  image: image,
+  length: "normal",
+  stream: false
+});
+
 // or with streaming
-const stream = await model.caption(image, "normal", true);
+const stream = await model.caption({
+  image: image,
+  length: "normal",
+  stream: true
+});
 ```
 
-#### query(image, question, stream?, settings?)
+#### query({ image: Buffer | Base64EncodedImage, question: string, stream?: boolean })
 
 Ask a question about an image.
 
 ```javascript
-const result = await model.query(image, "What's in this image?", false);
+const result = await model.query({
+  image: image,
+  question: "What's in this image?",
+  stream: false
+});
+
 // or with streaming
-const stream = await model.query(image, "What's in this image?", true);
+const stream = await model.query({
+  image: image,
+  question: "What's in this image?",
+  stream: true
+});
 ```
 
-#### detect(image, object)
+#### detect({ image: Buffer | Base64EncodedImage, object: string })
 
 Detect specific objects in an image.
 
 ```javascript
-const result = await model.detect(image, "car");
+const result = await model.detect({
+  image: image,
+  object: "car"
+});
 ```
 
-#### point(image, object)
+#### point({ image: Buffer | Base64EncodedImage, object: string })
 
 Get coordinates of specific objects in an image.
 
 ```javascript
-const result = await model.point(image, "person");
+const result = await model.point({
+  image: image,
+  object: "person"
+});
 ```
 
 ### Input Types
diff --git a/clients/node/src/__tests__/moondream.integration.test.ts b/clients/node/src/__tests__/moondream.integration.test.ts
index 2130ce72..9909eb40 100644
--- a/clients/node/src/__tests__/moondream.integration.test.ts
+++ b/clients/node/src/__tests__/moondream.integration.test.ts
@@ -3,6 +3,7 @@ import { vl, MoondreamVLConfig } from '../moondream';
 import fs from 'fs/promises';
 import path from 'path';
 import dotenv from 'dotenv';
+import { Base64EncodedImage, CaptionRequest, DetectRequest, PointRequest, QueryRequest } from '../types';
 
 dotenv.config();
 
@@ -13,7 +14,7 @@ if (!apiKey) {
 
 describe('MoondreamClient Integration Tests', () => {
     let client: vl;
-    let imageBuffer: Buffer;
+    let imageBuffer: Base64EncodedImage;
 
     const moondreamConfig: MoondreamVLConfig = {
         apiKey: apiKey
@@ -21,20 +22,33 @@ describe('MoondreamClient Integration Tests', () => {
 
     beforeAll(async () => {
         client = new vl(moondreamConfig);
-        // Load test image
-        imageBuffer = await fs.readFile(path.join(__dirname, '../../../../assets/demo-1.jpg'));
+        // Load test image and convert to base64
+        const rawBuffer = await fs.readFile(path.join(__dirname, '../../../../assets/demo-1.jpg'));
+        imageBuffer = {
+            imageUrl: rawBuffer.toString('base64')
+        };
     });
 
     describe('caption', () => {
         it('should get a caption for a real image', async () => {
-            const result = await client.caption(imageBuffer);
+            const request: CaptionRequest = {
+                image: imageBuffer,
+                length: 'normal',
+                stream: false
+            };
+            const result = await client.caption(request);
             expect(result.caption).toBeDefined();
             expect(typeof result.caption).toBe('string');
             console.log('Caption:', result.caption);
         }, 10000); // Increased timeout for API call
 
         it('should stream captions for a real image', async () => {
-            const result = await client.caption(imageBuffer, 'normal', true);
+            const request: CaptionRequest = {
+                image: imageBuffer,
+                length: 'normal',
+                stream: true
+            };
+            const result = await client.caption(request);
 
             // Handle both streaming and non-streaming responses
             if (typeof result.caption === 'string') {
@@ -55,7 +69,12 @@ describe('MoondreamClient Integration Tests', () => {
 
     describe('caption-no-stream', () => {
         it('should get a caption for a real image', async () => {
-            const result = await client.caption(imageBuffer, 'normal', false);
+            const request: CaptionRequest = {
+                image: imageBuffer,
+                length: 'normal',
+                stream: false
+            };
+            const result = await client.caption(request);
             expect(result.caption).toBeDefined();
             expect(typeof result.caption).toBe('string');
             console.log('Caption:', result.caption);
@@ -66,7 +85,12 @@ describe('MoondreamClient Integration Tests', () => {
     describe('query', () => {
         it('should answer questions about a real image', async () => {
             const question = "What colors are present in this image?";
-            const result = await client.query(imageBuffer, question);
+            const request: QueryRequest = {
+                image: imageBuffer,
+                question: question,
+                stream: false
+            };
+            const result = await client.query(request);
 
             expect(result.answer).toBeDefined();
             expect(typeof result.answer).toBe('string');
@@ -76,7 +100,12 @@ describe('MoondreamClient Integration Tests', () => {
 
         it('should stream answers about a real image', async () => {
             const question = "What is the character doing?";
-            const result = await client.query(imageBuffer, question, true);
+            const request: QueryRequest = {
+                image: imageBuffer,
+                question: question,
+                stream: true
+            };
+            const result = await client.query(request);
 
             // Handle both streaming and non-streaming responses
             if (typeof result.answer === 'string') {
@@ -100,7 +129,12 @@ describe('MoondreamClient Integration Tests', () => {
     describe('query-no-stream', () => {
         it('should answer questions about a real image', async () => {
             const question = "What colors are present in this image?";
-            const result = await client.query(imageBuffer, question, false);
+            const request: QueryRequest = {
+                image: imageBuffer,
+                question: question,
+                stream: false
+            };
+            const result = await client.query(request);
             expect(result.answer).toBeDefined();
             expect(typeof result.answer).toBe('string');
             console.log('Answer:', result.answer);
@@ -110,7 +144,11 @@ describe('MoondreamClient Integration Tests', () => {
     describe('detect', () => {
         it('should detect objects in a real image', async () => {
             const objectToDetect = "burger";
-            const result = await client.detect(imageBuffer, objectToDetect);
+            const request: DetectRequest = {
+                image: imageBuffer,
+                object: objectToDetect,
+            };
+            const result = await client.detect(request);
 
             expect(result.objects).toBeDefined();
             expect(Array.isArray(result.objects)).toBe(true);
@@ -121,7 +159,11 @@ describe('MoondreamClient Integration Tests', () => {
     describe('point', () => {
         it('should point to objects in a real image', async () => {
             const objectToPoint = "burger";
-            const result = await client.point(imageBuffer, objectToPoint);
+            const request: PointRequest = {
+                image: imageBuffer,
+                object: objectToPoint,
+            };
+            const result = await client.point(request);
 
             expect(result.points).toBeDefined();
             expect(Array.isArray(result.points)).toBe(true);
@@ -134,4 +176,170 @@ describe('MoondreamClient Integration Tests', () => {
             console.log('Pointed locations:', result.points);
         }, 10000);
     });
-});
\ No newline at end of file
+});
+
+describe('MoondreamClient Local Server Integration Tests', () => {
+    let client: vl;
+    let imageBuffer: Base64EncodedImage;
+
+    const moondreamConfig: MoondreamVLConfig = {
+        apiUrl: 'http://localhost:3475'
+    };
+
+    beforeAll(async () => {
+        client = new vl(moondreamConfig);
+        // Load test image and convert to base64
+        const rawBuffer = await fs.readFile(path.join(__dirname, '../../../../assets/demo-1.jpg'));
+        imageBuffer = {
+            imageUrl: rawBuffer.toString('base64')
+        };
+    });
+
+    describe('caption', () => {
+        it('should get a caption for a real image', async () => {
+            const request: CaptionRequest = {
+                image: imageBuffer,
+                length: 'normal',
+                stream: false
+            };
+            const result = await client.caption(request);
+            expect(result.caption).toBeDefined();
+            expect(typeof result.caption).toBe('string');
+            console.log('Caption:', result.caption);
+        }, 10000); // Increased timeout for API call
+
+        it('should stream captions for a real image', async () => {
+            const request: CaptionRequest = {
+                image: imageBuffer,
+                length: 'normal',
+                stream: true
+            };
+            const result = await client.caption(request);
+
+            // Handle both streaming and non-streaming responses
+            if (typeof result.caption === 'string') {
+                expect(result.caption).toBeTruthy();
+                console.log('Caption (non-streamed):', result.caption);
+            } else {
+                const chunks: string[] = [];
+                for await (const chunk of result.caption) {
+                    chunks.push(chunk);
+                }
+                const finalCaption = chunks.join('');
+                expect(finalCaption).toBeTruthy();
+                expect(chunks.length).toBeGreaterThan(0);
+                console.log('Streamed caption:', finalCaption);
+            }
+        }, 10000);
+    });
+
+    describe('caption-no-stream', () => {
+        it('should get a caption for a real image', async () => {
+            const request: CaptionRequest = {
+                image: imageBuffer,
+                length: 'normal',
+                stream: false
+            };
+            const result = await client.caption(request);
+            expect(result.caption).toBeDefined();
+            expect(typeof result.caption).toBe('string');
+            console.log('Caption:', result.caption);
+            expect((result.caption as string).length).toBeGreaterThan(0);
+        }, 10000);
+    });
+
+    describe('query', () => {
+        it('should answer questions about a real image', async () => {
+            const question = "What colors are present in this image?";
+            const request: QueryRequest = {
+                image: imageBuffer,
+                question: question,
+                stream: false
+            };
+            const result = await client.query(request);
+
+            expect(result.answer).toBeDefined();
+            expect(typeof result.answer).toBe('string');
+            console.log('Question:', question);
+            console.log('Answer:', result.answer);
+        }, 10000);
+
+        it('should stream answers about a real image', async () => {
+            const question = "What is the character doing?";
+            const request: QueryRequest = {
+                image: imageBuffer,
+                question: question,
+                stream: true
+            };
+            const result = await client.query(request);
+
+            // Handle both streaming and non-streaming responses
+            if (typeof result.answer === 'string') {
+                expect(result.answer).toBeTruthy();
+                console.log('Question:', question);
+                console.log('Answer (non-streamed):', result.answer);
+            } else {
+                const chunks: string[] = [];
+                for await (const chunk of result.answer) {
+                    chunks.push(chunk);
+                }
+                const finalAnswer = chunks.join('');
+                expect(finalAnswer).toBeTruthy();
+                expect(chunks.length).toBeGreaterThan(0);
+                console.log('Question:', question);
+                console.log('Streamed answer:', finalAnswer);
+            }
+        }, 10000);
+    });
+
+    describe('query-no-stream', () => {
+        it('should answer questions about a real image', async () => {
+            const question = "What colors are present in this image?";
+            const request: QueryRequest = {
+                image: imageBuffer,
+                question: question,
+                stream: false
+            };
+            const result = await client.query(request);
+            expect(result.answer).toBeDefined();
+            expect(typeof result.answer).toBe('string');
+            console.log('Answer:', result.answer);
+        }, 10000); // Same extended timeout as the sibling tests: a real inference call can exceed Jest's 5s default
+    });
+
+    describe('detect', () => {
+        it('should detect objects in a real image', async () => {
+            const objectToDetect = "burger";
+            const request: DetectRequest = {
+                image: imageBuffer,
+                object: objectToDetect,
+            };
+            const result = await client.detect(request);
+
+            expect(result.objects).toBeDefined();
+            expect(Array.isArray(result.objects)).toBe(true);
+            console.log('Detected objects:', result.objects);
+        }, 10000);
+    });
+
+    describe('point', () => {
+        it('should point to objects in a real image', async () => {
+            const objectToPoint = "burger";
+            const request: PointRequest = {
+                image: imageBuffer,
+                object: objectToPoint,
+            };
+            const result = await client.point(request);
+
+            expect(result.points).toBeDefined();
+            expect(Array.isArray(result.points)).toBe(true);
+            result.points.forEach(point => {
+                expect(point).toHaveProperty('x');
+                expect(point).toHaveProperty('y');
+                expect(typeof point.x).toBe('number');
+                expect(typeof point.y).toBe('number');
+            });
+            console.log('Pointed locations:', result.points);
+        }, 10000);
+    });
+});
diff --git a/clients/node/src/__tests__/moondream.test.ts b/clients/node/src/__tests__/moondream.test.ts
index ec6b82f3..b139ec23 100644
--- a/clients/node/src/__tests__/moondream.test.ts
+++ b/clients/node/src/__tests__/moondream.test.ts
@@ -1,5 +1,5 @@
-import { Readable } from 'stream';
 import { vl, MoondreamVLConfig } from '../moondream';
+import { CaptionRequest, DetectRequest, PointRequest, QueryRequest } from '../types';
 
 // Mock sharp
 jest.mock('sharp', () => {
@@ -36,7 +36,12 @@ describe('MoondreamClient', () => {
       const mockedFetch = jest.spyOn(global, 'fetch');
       mockedFetch.mockResolvedValueOnce(mockResponse as any);
 
-      const result = await client.caption(mockImageBuffer);
+      const request: CaptionRequest = {
+        image: mockImageBuffer,
+        length: 'normal',
+        stream: false
+      };
+      const result = await client.caption(request);
 
       expect(result).toEqual({ caption: 'A beautiful landscape' });
       expect(mockedFetch).toHaveBeenCalledWith(
@@ -80,7 +85,12 @@ describe('MoondreamClient', () => {
       const mockedFetch = jest.spyOn(global, 'fetch');
       mockedFetch.mockResolvedValueOnce(mockResponse as any);
 
-      const result = await client.caption(mockImageBuffer, 'normal', true);
+      const request: CaptionRequest = {
+        image: mockImageBuffer,
+        length: 'normal',
+        stream: true
+      };
+      const result = await client.caption(request);
       expect(result.caption).toBeDefined();
 
       const chunks = [];
@@ -98,7 +108,12 @@ describe('MoondreamClient', () => {
       const mockedFetch = jest.spyOn(global, 'fetch');
       mockedFetch.mockResolvedValueOnce(mockResponse as any);
 
-      await expect(client.caption(mockImageBuffer))
+      const request: CaptionRequest = {
+        image: mockImageBuffer,
+        length: 'normal',
+        stream: false
+      };
+      await expect(client.caption(request))
         .rejects
         .toThrow('HTTP error! status: 400');
     });
@@ -113,7 +128,11 @@ describe('MoondreamClient', () => {
       const mockedFetch = jest.spyOn(global, 'fetch');
       mockedFetch.mockResolvedValueOnce(mockResponse as any);
 
-      const result = await client.query(mockImageBuffer, 'What is in this image?');
+      const request: QueryRequest = {
+        image: mockImageBuffer,
+        question: 'What is in this image?'
+      };
+      const result = await client.query(request);
 
       expect(result).toEqual({ answer: 'This is a dog' });
       expect(mockedFetch).toHaveBeenCalledWith(
@@ -157,7 +176,12 @@ describe('MoondreamClient', () => {
       const mockedFetch = jest.spyOn(global, 'fetch');
       mockedFetch.mockResolvedValueOnce(mockResponse as any);
 
-      const result = await client.query(mockImageBuffer, 'What is this?', true);
+      const request: QueryRequest = {
+        image: mockImageBuffer,
+        question: 'What is this?',
+        stream: true
+      };
+      const result = await client.query(request);
       expect(result.answer).toBeDefined();
 
       const chunks = [];
@@ -171,7 +195,7 @@ describe('MoondreamClient', () => {
   describe('detect', () => {
     it('should successfully detect objects in an image', async () => {
       const mockObjects = [
-        { bbox: [0, 0, 100, 100], score: 0.95 }
+        { x_min: 0, y_min: 0, x_max: 100, y_max: 100 }
       ];
       const mockResponse = {
         ok: true,
@@ -180,7 +204,11 @@ describe('MoondreamClient', () => {
       const mockedFetch = jest.spyOn(global, 'fetch');
       mockedFetch.mockResolvedValueOnce(mockResponse as any);
 
-      const result = await client.detect(mockImageBuffer, 'dog');
+      const request: DetectRequest = {
+        image: mockImageBuffer,
+        object: 'dog'
+      };
+      const result = await client.detect(request);
 
       expect(result).toEqual({ objects: mockObjects });
       expect(mockedFetch).toHaveBeenCalledWith(
@@ -314,7 +342,11 @@ describe('MoondreamClient', () => {
       const mockedFetch = jest.spyOn(global, 'fetch');
       mockedFetch.mockResolvedValueOnce(mockResponse as any);
 
-      const result = await client.point(mockImageBuffer, 'dog');
+      const request: PointRequest = {
+        image: mockImageBuffer,
+        object: 'dog'
+      };
+      const result = await client.point(request);
 
       expect(result).toEqual({ points: mockPoints });
       expect(mockedFetch).toHaveBeenCalledWith(
diff --git a/clients/node/src/moondream.ts b/clients/node/src/moondream.ts
index 40a11aca..b710a834 100644
--- a/clients/node/src/moondream.ts
+++ b/clients/node/src/moondream.ts
@@ -2,25 +2,34 @@ import { Buffer } from 'buffer';
 import sharp from 'sharp';
 import {
   Base64EncodedImage,
-  Length,
-  SamplingSettings,
   CaptionOutput,
   QueryOutput,
   DetectOutput,
   PointOutput,
+  CaptionRequest,
+  QueryRequest,
+  DetectRequest,
+  PointRequest,
 } from './types';
 
 export interface MoondreamVLConfig {
-  apiKey: string;
+  apiKey?: string;
+  apiUrl?: string;
 }
+const DEFAULT_API_URL = 'https://api.moondream.ai/v1';
 
 export class vl {
   private apiKey: string;
   private apiUrl: string;
 
   constructor(config: MoondreamVLConfig) {
-    this.apiKey = config.apiKey;
-    this.apiUrl = 'https://api.moondream.ai/v1';
+    this.apiKey = config.apiKey || '';
+    this.apiUrl = config.apiUrl || DEFAULT_API_URL;
+    if (this.apiKey === '' && this.apiUrl === DEFAULT_API_URL) {
+      throw new Error(
+        'An apiKey is required for cloud inference. '
+      );
+    }
   }
 
   private async encodeImage(
@@ -108,12 +117,9 @@ export class vl {
   }
 
   public async caption(
-    image: Buffer | Base64EncodedImage,
-    length: Length = 'normal',
-    stream = false,
-    settings?: SamplingSettings
+    request: CaptionRequest
   ): Promise<CaptionOutput> {
-    const encodedImage = await this.encodeImage(image);
+    const encodedImage = await this.encodeImage(request.image);
 
     const response = await fetch(`${this.apiUrl}/caption`, {
       method: 'POST',
@@ -123,8 +129,8 @@ export class vl {
       },
       body: JSON.stringify({
         image_url: encodedImage.imageUrl,
-        length,
-        stream,
+        length: request.length,
+        stream: request.stream,
       }),
     });
 
@@ -132,7 +138,7 @@ export class vl {
       throw new Error(`HTTP error! status: ${response.status}`);
     }
 
-    if (stream) {
+    if (request.stream) {
       return { caption: this.streamResponse(response) };
     }
 
@@ -141,12 +147,9 @@ export class vl {
   }
 
   public async query(
-    image: Buffer | Base64EncodedImage,
-    question: string,
-    stream = false,
-    settings?: SamplingSettings
+    request: QueryRequest
   ): Promise<QueryOutput> {
-    const encodedImage = await this.encodeImage(image);
+    const encodedImage = await this.encodeImage(request.image);
 
     const response = await fetch(`${this.apiUrl}/query`, {
       method: 'POST',
@@ -156,8 +159,8 @@ export class vl {
       },
       body: JSON.stringify({
         image_url: encodedImage.imageUrl,
-        question,
-        stream,
+        question: request.question,
+        stream: request.stream,
         // TODO: Pass sampling settings
       }),
     });
@@ -166,7 +169,7 @@ export class vl {
       throw new Error(`HTTP error! status: ${response.status}`);
     }
 
-    if (stream) {
+    if (request.stream) {
       return { answer: this.streamResponse(response) };
     }
 
@@ -175,10 +178,9 @@ export class vl {
   }
 
   public async detect(
-    image: Buffer | Base64EncodedImage,
-    object: string
+    request: DetectRequest
   ): Promise<DetectOutput> {
-    const encodedImage = await this.encodeImage(image);
+    const encodedImage = await this.encodeImage(request.image);
 
     const response = await fetch(`${this.apiUrl}/detect`, {
       method: 'POST',
@@ -188,7 +190,7 @@ export class vl {
       },
       body: JSON.stringify({
         image_url: encodedImage.imageUrl,
-        object,
+        object: request.object,
       }),
     });
 
@@ -201,10 +203,9 @@ export class vl {
   }
 
   public async point(
-    image: Buffer | Base64EncodedImage,
-    object: string
+    request: PointRequest
   ): Promise<PointOutput> {
-    const encodedImage = await this.encodeImage(image);
+    const encodedImage = await this.encodeImage(request.image);
 
     const response = await fetch(`${this.apiUrl}/point`, {
       method: 'POST',
@@ -214,7 +215,7 @@ export class vl {
       },
       body: JSON.stringify({
         image_url: encodedImage.imageUrl,
-        object,
+        object: request.object,
       }),
     });
 
diff --git a/clients/node/src/types.ts b/clients/node/src/types.ts
index 65a1aec1..c66d102b 100644
--- a/clients/node/src/types.ts
+++ b/clients/node/src/types.ts
@@ -17,6 +17,15 @@ export interface SamplingSettings {
   maxTokens?: number;
 }
 
+/**
+ * Request structure for image caption requests
+ */
+export interface CaptionRequest {
+  image: Buffer | Base64EncodedImage;
+  length?: Length;
+  stream?: boolean;
+  settings?: SamplingSettings;
+}
 /**
  * Response structure for image caption requests
  */
@@ -24,6 +33,15 @@ export interface CaptionOutput {
   caption: string | AsyncGenerator<string, void, unknown>;
 }
 
+/**
+ * Request structure for image query requests
+ */
+export interface QueryRequest {
+  image: Buffer | Base64EncodedImage;
+  question: string;
+  stream?: boolean;
+  settings?: SamplingSettings;
+}
 /**
  * Response structure for image query requests
  */
@@ -32,9 +50,18 @@ export interface QueryOutput {
 }
 
 /**
- * Bounding box coordinates [x1, y1, x2, y2]
+ * Request structure for object detection requests
+ */
+export interface DetectRequest {
+  image: Buffer | Base64EncodedImage;
+  object: string;
+}
+/**
+ * Response structure for object detection requests
  */
-export type BoundingBox = [number, number, number, number];
+export interface DetectOutput {
+  objects: DetectedObject[];
+}
 
 /**
  * Object detection result
@@ -103,6 +130,13 @@ export type ApiResponse<T> = {
   requestId?: string;
 }
 
+/**
+ * Pointing request structure
+ */
+export interface PointRequest {
+  image: Buffer | Base64EncodedImage;
+  object: string;
+}
 /**
  * Point coordinates for object location
  */