From fc49f314c4f2d4089d8865fef700bc4e600bbf01 Mon Sep 17 00:00:00 2001 From: Caleb John <45307388+calebjohn24@users.noreply.github.com> Date: Thu, 5 Dec 2024 14:48:40 -0800 Subject: [PATCH] Add node support for local sidecar server and request types (#165) * Add support for local sidecar server * Remove un-needed types * Update README.MD * Update README.MD API Constructor Docs * Update README.MD examples * Fix default port in tests * Update README types * Update README typo --- clients/node/README.MD | 118 +++++++-- .../__tests__/moondream.integration.test.ts | 232 +++++++++++++++++- clients/node/src/__tests__/moondream.test.ts | 50 +++- clients/node/src/moondream.ts | 59 ++--- clients/node/src/types.ts | 38 ++- 5 files changed, 430 insertions(+), 67 deletions(-) diff --git a/clients/node/README.MD b/clients/node/README.MD index f10e675b..6faa4942 100644 --- a/clients/node/README.MD +++ b/clients/node/README.MD @@ -21,8 +21,9 @@ yarn add moondream ## Quick Start Before using this client library, you'll need an API key to access Moondream's hosted service. -You can get a free API key from [console.moondream.ai](https://console.moondream.ai). Currently -local inference is only available in Python, but Node.js support is coming very soon. +You can get a free API key from [console.moondream.ai](https://console.moondream.ai). 
+ +### Cloud ```javascript import { vl } from "moondream"; @@ -39,15 +40,72 @@ const image = fs.readFileSync("path/to/image.jpg"); // Basic usage examples async function main() { // Generate a caption for the image - const caption = await model.caption(image); + const caption = await model.caption({ + image: image, + length: "normal", + stream: false + }); console.log("Caption:", caption); // Ask a question about the image - const answer = await model.query(image, "What's in this image?"); + const answer = await model.query({ + image: image, + question: "What's in this image?", + stream: false + }); console.log("Answer:", answer); // Stream the response - const stream = await model.caption(image, "normal", true); + const stream = await model.caption({ + image: image, + length: "normal", + stream: true + }); + for await (const chunk of stream.caption) { + process.stdout.write(chunk); + } +} + +main(); +``` + +### Local Inference + +- Install the `moondream` CLI: `pip install moondream` +- Run the local server: `moondream serve --model <path-to-model>` +- Set the `apiUrl` parameter to the URL of the local server (the default is `http://localhost:3475`) + +```javascript +const model = new vl({ + apiUrl: "http://localhost:3475", +}); + +const image = fs.readFileSync("path/to/image.jpg"); + +// Basic usage examples +async function main() { + // Generate a caption for the image + const caption = await model.caption({ + image: image, + length: "normal", + stream: false + }); + console.log("Caption:", caption); + + // Ask a question about the image + const answer = await model.query({ + image: image, + question: "What's in this image?", + stream: false + }); + console.log("Answer:", answer); + + // Stream the response + const stream = await model.caption({ + image: image, + length: "normal", + stream: true + }); for await (const chunk of stream.caption) { process.stdout.write(chunk); } @@ -68,47 +126,77 @@ main(); ### Constructor ```javascript +// for cloud inference const model = new 
vl({ apiKey: "your-api-key", }); + +// or for local inference +const model = new vl({ + apiUrl: "http://localhost:3475", +}); ``` ### Methods -#### caption(image, length?, stream?, settings?) +#### caption({ image: Buffer | Base64EncodedImage, length?: string, stream?: boolean }) Generate a caption for an image. ```javascript -const result = await model.caption(image, "normal", false); +const result = await model.caption({ + image: image, + length: "normal", + stream: false +}); + // or with streaming -const stream = await model.caption(image, "normal", true); +const stream = await model.caption({ + image: image, + length: "normal", + stream: true +}); ``` -#### query(image, question, stream?, settings?) +#### query({ image: Buffer | Base64EncodedImage, question: string, stream?: boolean }) Ask a question about an image. ```javascript -const result = await model.query(image, "What's in this image?", false); +const result = await model.query({ + image: image, + question: "What's in this image?", + stream: false +}); + // or with streaming -const stream = await model.query(image, "What's in this image?", true); +const stream = await model.query({ + image: image, + question: "What's in this image?", + stream: true +}); ``` -#### detect(image, object) +#### detect({ image: Buffer | Base64EncodedImage, object: string }) Detect specific objects in an image. ```javascript -const result = await model.detect(image, "car"); +const result = await model.detect({ + image: image, + object: "car" +}); ``` -#### point(image, object) +#### point({ image: Buffer | Base64EncodedImage, object: string }) Get coordinates of specific objects in an image. 
```javascript -const result = await model.point(image, "person"); +const result = await model.point({ + image: image, + object: "person" +}); ``` ### Input Types diff --git a/clients/node/src/__tests__/moondream.integration.test.ts b/clients/node/src/__tests__/moondream.integration.test.ts index 2130ce72..9909eb40 100644 --- a/clients/node/src/__tests__/moondream.integration.test.ts +++ b/clients/node/src/__tests__/moondream.integration.test.ts @@ -3,6 +3,7 @@ import { vl, MoondreamVLConfig } from '../moondream'; import fs from 'fs/promises'; import path from 'path'; import dotenv from 'dotenv'; +import { Base64EncodedImage, CaptionRequest, DetectRequest, PointRequest, QueryRequest } from '../types'; dotenv.config(); @@ -13,7 +14,7 @@ if (!apiKey) { describe('MoondreamClient Integration Tests', () => { let client: vl; - let imageBuffer: Buffer; + let imageBuffer: Base64EncodedImage; const moondreamConfig: MoondreamVLConfig = { apiKey: apiKey @@ -21,20 +22,33 @@ describe('MoondreamClient Integration Tests', () => { beforeAll(async () => { client = new vl(moondreamConfig); - // Load test image - imageBuffer = await fs.readFile(path.join(__dirname, '../../../../assets/demo-1.jpg')); + // Load test image and convert to base64 + const rawBuffer = await fs.readFile(path.join(__dirname, '../../../../assets/demo-1.jpg')); + imageBuffer = { + imageUrl: rawBuffer.toString('base64') + }; }); describe('caption', () => { it('should get a caption for a real image', async () => { - const result = await client.caption(imageBuffer); + const request: CaptionRequest = { + image: imageBuffer, + length: 'normal', + stream: false + }; + const result = await client.caption(request); expect(result.caption).toBeDefined(); expect(typeof result.caption).toBe('string'); console.log('Caption:', result.caption); }, 10000); // Increased timeout for API call it('should stream captions for a real image', async () => { - const result = await client.caption(imageBuffer, 'normal', true); + const 
request: CaptionRequest = { + image: imageBuffer, + length: 'normal', + stream: true + }; + const result = await client.caption(request); // Handle both streaming and non-streaming responses if (typeof result.caption === 'string') { @@ -55,7 +69,12 @@ describe('MoondreamClient Integration Tests', () => { describe('caption-no-stream', () => { it('should get a caption for a real image', async () => { - const result = await client.caption(imageBuffer, 'normal', false); + const request: CaptionRequest = { + image: imageBuffer, + length: 'normal', + stream: false + }; + const result = await client.caption(request); expect(result.caption).toBeDefined(); expect(typeof result.caption).toBe('string'); console.log('Caption:', result.caption); @@ -66,7 +85,12 @@ describe('MoondreamClient Integration Tests', () => { describe('query', () => { it('should answer questions about a real image', async () => { const question = "What colors are present in this image?"; - const result = await client.query(imageBuffer, question); + const request: QueryRequest = { + image: imageBuffer, + question: question, + stream: false + }; + const result = await client.query(request); expect(result.answer).toBeDefined(); expect(typeof result.answer).toBe('string'); @@ -76,7 +100,12 @@ describe('MoondreamClient Integration Tests', () => { it('should stream answers about a real image', async () => { const question = "What is the character doing?"; - const result = await client.query(imageBuffer, question, true); + const request: QueryRequest = { + image: imageBuffer, + question: question, + stream: true + }; + const result = await client.query(request); // Handle both streaming and non-streaming responses if (typeof result.answer === 'string') { @@ -100,7 +129,12 @@ describe('MoondreamClient Integration Tests', () => { describe('query-no-stream', () => { it('should answer questions about a real image', async () => { const question = "What colors are present in this image?"; - const result = await 
client.query(imageBuffer, question, false); + const request: QueryRequest = { + image: imageBuffer, + question: question, + stream: false + }; + const result = await client.query(request); expect(result.answer).toBeDefined(); expect(typeof result.answer).toBe('string'); console.log('Answer:', result.answer); @@ -110,7 +144,11 @@ describe('MoondreamClient Integration Tests', () => { describe('detect', () => { it('should detect objects in a real image', async () => { const objectToDetect = "burger"; - const result = await client.detect(imageBuffer, objectToDetect); + const request: DetectRequest = { + image: imageBuffer, + object: objectToDetect, + }; + const result = await client.detect(request); expect(result.objects).toBeDefined(); expect(Array.isArray(result.objects)).toBe(true); @@ -121,7 +159,11 @@ describe('MoondreamClient Integration Tests', () => { describe('point', () => { it('should point to objects in a real image', async () => { const objectToPoint = "burger"; - const result = await client.point(imageBuffer, objectToPoint); + const request: PointRequest = { + image: imageBuffer, + object: objectToPoint, + }; + const result = await client.point(request); expect(result.points).toBeDefined(); expect(Array.isArray(result.points)).toBe(true); @@ -134,4 +176,170 @@ describe('MoondreamClient Integration Tests', () => { console.log('Pointed locations:', result.points); }, 10000); }); -}); \ No newline at end of file +}); + +describe('MoondreamClient Local Server Integration Tests', () => { + let client: vl; + let imageBuffer: Base64EncodedImage; + + const moondreamConfig: MoondreamVLConfig = { + apiUrl: 'http://localhost:3475' + }; + + beforeAll(async () => { + client = new vl(moondreamConfig); + // Load test image and convert to base64 + const rawBuffer = await fs.readFile(path.join(__dirname, '../../../../assets/demo-1.jpg')); + imageBuffer = { + imageUrl: rawBuffer.toString('base64') + }; + }); + + describe('caption', () => { + it('should get a caption for a 
real image', async () => { + const request: CaptionRequest = { + image: imageBuffer, + length: 'normal', + stream: false + }; + const result = await client.caption(request); + expect(result.caption).toBeDefined(); + expect(typeof result.caption).toBe('string'); + console.log('Caption:', result.caption); + }, 10000); // Increased timeout for API call + + it('should stream captions for a real image', async () => { + const request: CaptionRequest = { + image: imageBuffer, + length: 'normal', + stream: true + }; + const result = await client.caption(request); + + // Handle both streaming and non-streaming responses + if (typeof result.caption === 'string') { + expect(result.caption).toBeTruthy(); + console.log('Caption (non-streamed):', result.caption); + } else { + const chunks: string[] = []; + for await (const chunk of result.caption) { + chunks.push(chunk); + } + const finalCaption = chunks.join(''); + expect(finalCaption).toBeTruthy(); + expect(chunks.length).toBeGreaterThan(0); + console.log('Streamed caption:', finalCaption); + } + }, 10000); + }); + + describe('caption-no-stream', () => { + it('should get a caption for a real image', async () => { + const request: CaptionRequest = { + image: imageBuffer, + length: 'normal', + stream: false + }; + const result = await client.caption(request); + expect(result.caption).toBeDefined(); + expect(typeof result.caption).toBe('string'); + console.log('Caption:', result.caption); + expect((result.caption as string).length).toBeGreaterThan(0); + }, 10000); + }); + + describe('query', () => { + it('should answer questions about a real image', async () => { + const question = "What colors are present in this image?"; + const request: QueryRequest = { + image: imageBuffer, + question: question, + stream: false + }; + const result = await client.query(request); + + expect(result.answer).toBeDefined(); + expect(typeof result.answer).toBe('string'); + console.log('Question:', question); + console.log('Answer:', result.answer); 
+ }, 10000); + + it('should stream answers about a real image', async () => { + const question = "What is the character doing?"; + const request: QueryRequest = { + image: imageBuffer, + question: question, + stream: true + }; + const result = await client.query(request); + + // Handle both streaming and non-streaming responses + if (typeof result.answer === 'string') { + expect(result.answer).toBeTruthy(); + console.log('Question:', question); + console.log('Answer (non-streamed):', result.answer); + } else { + const chunks: string[] = []; + for await (const chunk of result.answer) { + chunks.push(chunk); + } + const finalAnswer = chunks.join(''); + expect(finalAnswer).toBeTruthy(); + expect(chunks.length).toBeGreaterThan(0); + console.log('Question:', question); + console.log('Streamed answer:', finalAnswer); + } + }, 10000); + }); + + describe('query-no-stream', () => { + it('should answer questions about a real image', async () => { + const question = "What colors are present in this image?"; + const request: QueryRequest = { + image: imageBuffer, + question: question, + stream: false + }; + const result = await client.query(request); + expect(result.answer).toBeDefined(); + expect(typeof result.answer).toBe('string'); + console.log('Answer:', result.answer); + }); + }); + + describe('detect', () => { + it('should detect objects in a real image', async () => { + const objectToDetect = "burger"; + const request: DetectRequest = { + image: imageBuffer, + object: objectToDetect, + }; + const result = await client.detect(request); + + expect(result.objects).toBeDefined(); + expect(Array.isArray(result.objects)).toBe(true); + console.log('Detected objects:', result.objects); + }, 10000); + }); + + describe('point', () => { + it('should point to objects in a real image', async () => { + const objectToPoint = "burger"; + const request: PointRequest = { + image: imageBuffer, + object: objectToPoint, + }; + const result = await client.point(request); + + 
expect(result.points).toBeDefined(); + expect(Array.isArray(result.points)).toBe(true); + result.points.forEach(point => { + expect(point).toHaveProperty('x'); + expect(point).toHaveProperty('y'); + expect(typeof point.x).toBe('number'); + expect(typeof point.y).toBe('number'); + }); + console.log('Pointed locations:', result.points); + }, 10000); + }); +}); diff --git a/clients/node/src/__tests__/moondream.test.ts b/clients/node/src/__tests__/moondream.test.ts index ec6b82f3..b139ec23 100644 --- a/clients/node/src/__tests__/moondream.test.ts +++ b/clients/node/src/__tests__/moondream.test.ts @@ -1,5 +1,5 @@ -import { Readable } from 'stream'; import { vl, MoondreamVLConfig } from '../moondream'; +import { CaptionRequest, DetectRequest, PointRequest, QueryRequest } from '../types'; // Mock sharp jest.mock('sharp', () => { @@ -36,7 +36,12 @@ describe('MoondreamClient', () => { const mockedFetch = jest.spyOn(global, 'fetch'); mockedFetch.mockResolvedValueOnce(mockResponse as any); - const result = await client.caption(mockImageBuffer); + const request: CaptionRequest = { + image: mockImageBuffer, + length: 'normal', + stream: false + }; + const result = await client.caption(request); expect(result).toEqual({ caption: 'A beautiful landscape' }); expect(mockedFetch).toHaveBeenCalledWith( @@ -80,7 +85,12 @@ describe('MoondreamClient', () => { const mockedFetch = jest.spyOn(global, 'fetch'); mockedFetch.mockResolvedValueOnce(mockResponse as any); - const result = await client.caption(mockImageBuffer, 'normal', true); + const request: CaptionRequest = { + image: mockImageBuffer, + length: 'normal', + stream: true + }; + const result = await client.caption(request); expect(result.caption).toBeDefined(); const chunks = []; @@ -98,7 +108,12 @@ describe('MoondreamClient', () => { const mockedFetch = jest.spyOn(global, 'fetch'); mockedFetch.mockResolvedValueOnce(mockResponse as any); - await expect(client.caption(mockImageBuffer)) + const request: CaptionRequest = { + image: 
mockImageBuffer, + length: 'normal', + stream: false + }; + await expect(client.caption(request)) .rejects .toThrow('HTTP error! status: 400'); }); @@ -113,7 +128,11 @@ describe('MoondreamClient', () => { const mockedFetch = jest.spyOn(global, 'fetch'); mockedFetch.mockResolvedValueOnce(mockResponse as any); - const result = await client.query(mockImageBuffer, 'What is in this image?'); + const request: QueryRequest = { + image: mockImageBuffer, + question: 'What is in this image?' + }; + const result = await client.query(request); expect(result).toEqual({ answer: 'This is a dog' }); expect(mockedFetch).toHaveBeenCalledWith( @@ -157,7 +176,12 @@ describe('MoondreamClient', () => { const mockedFetch = jest.spyOn(global, 'fetch'); mockedFetch.mockResolvedValueOnce(mockResponse as any); - const result = await client.query(mockImageBuffer, 'What is this?', true); + const request: QueryRequest = { + image: mockImageBuffer, + question: 'What is this?', + stream: true + }; + const result = await client.query(request); expect(result.answer).toBeDefined(); const chunks = []; @@ -171,7 +195,7 @@ describe('MoondreamClient', () => { describe('detect', () => { it('should successfully detect objects in an image', async () => { const mockObjects = [ - { bbox: [0, 0, 100, 100], score: 0.95 } + { x_min: 0, y_min: 0, x_max: 100, y_max: 100 } ]; const mockResponse = { ok: true, @@ -180,7 +204,11 @@ describe('MoondreamClient', () => { const mockedFetch = jest.spyOn(global, 'fetch'); mockedFetch.mockResolvedValueOnce(mockResponse as any); - const result = await client.detect(mockImageBuffer, 'dog'); + const request: DetectRequest = { + image: mockImageBuffer, + object: 'dog' + }; + const result = await client.detect(request); expect(result).toEqual({ objects: mockObjects }); expect(mockedFetch).toHaveBeenCalledWith( @@ -314,7 +342,11 @@ describe('MoondreamClient', () => { const mockedFetch = jest.spyOn(global, 'fetch'); mockedFetch.mockResolvedValueOnce(mockResponse as any); - const 
result = await client.point(mockImageBuffer, 'dog'); + const request: PointRequest = { + image: mockImageBuffer, + object: 'dog' + }; + const result = await client.point(request); expect(result).toEqual({ points: mockPoints }); expect(mockedFetch).toHaveBeenCalledWith( diff --git a/clients/node/src/moondream.ts b/clients/node/src/moondream.ts index 40a11aca..b710a834 100644 --- a/clients/node/src/moondream.ts +++ b/clients/node/src/moondream.ts @@ -2,25 +2,34 @@ import { Buffer } from 'buffer'; import sharp from 'sharp'; import { Base64EncodedImage, - Length, - SamplingSettings, CaptionOutput, QueryOutput, DetectOutput, PointOutput, + CaptionRequest, + QueryRequest, + DetectRequest, + PointRequest, } from './types'; export interface MoondreamVLConfig { - apiKey: string; + apiKey?: string; + apiUrl?: string; } +const DEFAULT_API_URL = 'https://api.moondream.ai/v1'; export class vl { private apiKey: string; private apiUrl: string; constructor(config: MoondreamVLConfig) { - this.apiKey = config.apiKey; - this.apiUrl = 'https://api.moondream.ai/v1'; + this.apiKey = config.apiKey || ''; + this.apiUrl = config.apiUrl || DEFAULT_API_URL; + if (this.apiKey === '' && this.apiUrl === DEFAULT_API_URL) { + throw new Error( + 'An apiKey is required for cloud inference. ' + ); + } } private async encodeImage( @@ -108,12 +117,9 @@ export class vl { } public async caption( - image: Buffer | Base64EncodedImage, - length: Length = 'normal', - stream = false, - settings?: SamplingSettings + request: CaptionRequest ): Promise { - const encodedImage = await this.encodeImage(image); + const encodedImage = await this.encodeImage(request.image); const response = await fetch(`${this.apiUrl}/caption`, { method: 'POST', @@ -123,8 +129,8 @@ export class vl { }, body: JSON.stringify({ image_url: encodedImage.imageUrl, - length, - stream, + length: request.length, + stream: request.stream, }), }); @@ -132,7 +138,7 @@ export class vl { throw new Error(`HTTP error! 
status: ${response.status}`); } - if (stream) { + if (request.stream) { return { caption: this.streamResponse(response) }; } @@ -141,12 +147,9 @@ export class vl { } public async query( - image: Buffer | Base64EncodedImage, - question: string, - stream = false, - settings?: SamplingSettings + request: QueryRequest ): Promise { - const encodedImage = await this.encodeImage(image); + const encodedImage = await this.encodeImage(request.image); const response = await fetch(`${this.apiUrl}/query`, { method: 'POST', @@ -156,8 +159,8 @@ export class vl { }, body: JSON.stringify({ image_url: encodedImage.imageUrl, - question, - stream, + question: request.question, + stream: request.stream, // TODO: Pass sampling settings }), }); @@ -166,7 +169,7 @@ export class vl { throw new Error(`HTTP error! status: ${response.status}`); } - if (stream) { + if (request.stream) { return { answer: this.streamResponse(response) }; } @@ -175,10 +178,9 @@ export class vl { } public async detect( - image: Buffer | Base64EncodedImage, - object: string + request: DetectRequest ): Promise { - const encodedImage = await this.encodeImage(image); + const encodedImage = await this.encodeImage(request.image); const response = await fetch(`${this.apiUrl}/detect`, { method: 'POST', @@ -188,7 +190,7 @@ export class vl { }, body: JSON.stringify({ image_url: encodedImage.imageUrl, - object, + object: request.object, }), }); @@ -201,10 +203,9 @@ export class vl { } public async point( - image: Buffer | Base64EncodedImage, - object: string + request: PointRequest ): Promise { - const encodedImage = await this.encodeImage(image); + const encodedImage = await this.encodeImage(request.image); const response = await fetch(`${this.apiUrl}/point`, { method: 'POST', @@ -214,7 +215,7 @@ export class vl { }, body: JSON.stringify({ image_url: encodedImage.imageUrl, - object, + object: request.object, }), }); diff --git a/clients/node/src/types.ts b/clients/node/src/types.ts index 65a1aec1..c66d102b 100644 --- 
a/clients/node/src/types.ts +++ b/clients/node/src/types.ts @@ -17,6 +17,15 @@ export interface SamplingSettings { maxTokens?: number; } +/** + * Request structure for image caption requests + */ +export interface CaptionRequest { + image: Buffer | Base64EncodedImage; + length?: Length; + stream?: boolean; + settings?: SamplingSettings; +} /** * Response structure for image caption requests */ @@ -24,6 +33,15 @@ export interface CaptionOutput { caption: string | AsyncGenerator; } +/** + * Request structure for image query requests + */ +export interface QueryRequest { + image: Buffer | Base64EncodedImage; + question: string; + stream?: boolean; + settings?: SamplingSettings; +} /** * Response structure for image query requests */ @@ -32,9 +50,18 @@ export interface QueryOutput { } /** - * Bounding box coordinates [x1, y1, x2, y2] + * Request structure for object detection requests + */ +export interface DetectRequest { + image: Buffer | Base64EncodedImage; + object: string; +} +/** + * Response structure for object detection requests */ -export type BoundingBox = [number, number, number, number]; +export interface DetectOutput { + objects: DetectedObject[]; +} /** * Object detection result @@ -103,6 +130,13 @@ export type ApiResponse = { requestId?: string; } +/** + * Pointing request structure + */ +export interface PointRequest { + image: Buffer | Base64EncodedImage; + object: string; +} /** * Point coordinates for object location */