Merge branch 'develop' into sif-dev

Sifchain · Jan 10, 2025 · 45fd355 · 45fd355
2 parents f3534fc + c5b3e73
commit 45fd355
Show file tree

Hide file tree

Showing 5 changed files with 311 additions and 236 deletions.
diff --git a/packages/core/src/models.ts b/packages/core/src/models.ts
@@ -1,11 +1,11 @@
 import settings from "./settings.ts";
 import {
-    Models,
-    ModelProviderName,
+    EmbeddingModelSettings,
+    ImageModelSettings,
     ModelClass,
+    ModelProviderName,
+    Models,
     ModelSettings,
-    ImageModelSettings,
-    EmbeddingModelSettings,
 } from "./types.ts";
 
 export const models: Models = {
@@ -332,6 +332,7 @@ export const models: Models = {
         },
     },
     [ModelProviderName.GOOGLE]: {
+        endpoint: "https://generativelanguage.googleapis.com",
         model: {
             [ModelClass.SMALL]: {
                 name:

diff --git a/packages/plugin-node/README.md b/packages/plugin-node/README.md
@@ -80,7 +80,51 @@ Provides web scraping and content extraction capabilities using Playwright.
 
 ### ImageDescriptionService
 
-Processes and analyzes images to generate descriptions.
+Processes and analyzes images to generate descriptions. Supports multiple providers:
+
+- Local processing using Florence model
+- OpenAI Vision API
+- Google Gemini
+
+Configuration:
+
+```env
+# For OpenAI Vision
+OPENAI_API_KEY=your_openai_api_key
+
+# For Google Gemini
+GOOGLE_GENERATIVE_AI_API_KEY=your_google_api_key
+```
+
+Provider selection:
+
+- If `imageVisionModelProvider` is set to `google` or `openai`, that provider is used.
+- Otherwise, if `model` is set to `google` or `openai`, that provider is used.
+- If neither is set, OpenAI is used by default.
+
+The service automatically handles different image formats, including GIFs (first frame extraction).
+
+Features by provider:
+
+**Local (Florence):**
+
+- Basic image captioning
+- Local processing without API calls
+
+**OpenAI Vision:**
+
+- Detailed image descriptions
+- Text detection
+- Object recognition
+
+**Google Gemini 1.5:**
+
+- High-quality image understanding
+- Detailed descriptions with natural language
+- Multi-modal context understanding
+- Support for complex scenes and content
+
+The provider can be configured through the runtime settings, allowing easy switching between providers based on your needs.
 
 ### LlamaService
 

diff --git a/packages/plugin-node/src/index.ts b/packages/plugin-node/src/index.ts
@@ -2,17 +2,17 @@ export * from "./services/index.ts";
 
 import { Plugin } from "@elizaos/core";
 
+import { describeImage } from "./actions/describe-image.ts";
 import {
+    AwsS3Service,
     BrowserService,
     ImageDescriptionService,
     LlamaService,
     PdfService,
     SpeechService,
     TranscriptionService,
     VideoService,
-    AwsS3Service,
 } from "./services/index.ts";
-import { describeImage } from "./actions/describe-image.ts";
 
 export type NodePlugin = ReturnType<typeof createNodePlugin>;