Skip to content

Commit

Permalink
Add anthropic image description
Browse files Browse the repository at this point in the history
  • Loading branch information
kroist committed Dec 24, 2024
1 parent 4c658d7 commit 64342ea
Show file tree
Hide file tree
Showing 4 changed files with 314 additions and 73 deletions.
3 changes: 2 additions & 1 deletion packages/plugin-node/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
"tsup.config.ts"
],
"dependencies": {
"@elizaos/core": "workspace:*",
"@aws-sdk/client-s3": "^3.705.0",
"@aws-sdk/s3-request-presigner": "^3.705.0",
"@cliqz/adblocker-playwright": "1.34.0",
"@echogarden/espeak-ng-emscripten": "0.3.3",
"@echogarden/kissfft-wasm": "0.2.0",
"@echogarden/speex-resampler-wasm": "0.2.1",
"@elizaos/core": "workspace:*",
"@huggingface/transformers": "3.0.2",
"@opendocsg/pdf2md": "0.1.32",
"@types/uuid": "10.0.0",
Expand All @@ -32,6 +32,7 @@
"echogarden": "2.0.7",
"espeak-ng": "1.0.2",
"ffmpeg-static": "5.2.0",
"file-type": "^19.6.0",
"fluent-ffmpeg": "2.1.3",
"formdata-node": "6.0.3",
"fs-extra": "11.2.0",
Expand Down
98 changes: 88 additions & 10 deletions packages/plugin-node/src/services/image.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import fs from "fs";
import gifFrames from "gif-frames";
import os from "os";
import path from "path";
import { resizeImageBuffer } from "./imageUtils";

export class ImageDescriptionService
extends Service
Expand Down Expand Up @@ -97,11 +98,13 @@ export class ImageDescriptionService

if (model === models[ModelProviderName.LLAMALOCAL]) {
await this.initializeLocalModel();
} else if (model === models[ModelProviderName.ANTHROPIC]) {
this.modelId = "claude-3-haiku-20240307";
this.device = "cloud";
} else {
this.modelId = "gpt-4o-mini";
this.device = "cloud";
}

this.initialized = true;
}

Expand All @@ -111,7 +114,7 @@ export class ImageDescriptionService
"Runtime is required for OpenAI image recognition"
);
}
return this.recognizeWithOpenAI(imageUrl);
return this.recognizeWithCloud(imageUrl);
}

this.queue.push(imageUrl);
Expand All @@ -130,7 +133,7 @@ export class ImageDescriptionService
});
}

private async recognizeWithOpenAI(
private async recognizeWithCloud(
imageUrl: string
): Promise<{ title: string; description: string }> {
const isGif = imageUrl.toLowerCase().endsWith(".gif");
Expand All @@ -157,12 +160,15 @@ export class ImageDescriptionService

const prompt =
"Describe this image and give it a title. The first line should be the title, and then a line break, then a detailed description of the image. Respond with the format 'title\ndescription'";
const text = await this.requestOpenAI(
imageUrl,
imageData,
prompt,
isGif
);
const text =
this.runtime.imageModelProvider === ModelProviderName.ANTHROPIC
? await this.requestAnthropic(imageData, prompt)
: await this.requestOpenAI(
imageUrl,
imageData,
prompt,
isGif
);

const [title, ...descriptionParts] = text.split("\n");
return {
Expand Down Expand Up @@ -206,7 +212,7 @@ export class ImageDescriptionService
Authorization: `Bearer ${this.runtime.getSetting("OPENAI_API_KEY")}`,
},
body: JSON.stringify({
model: "gpt-4o-mini",
model: this.modelId,
messages: [{ role: "user", content }],
max_tokens: isGif ? 500 : 300,
}),
Expand All @@ -231,6 +237,78 @@ export class ImageDescriptionService
);
}

/**
 * Ask the Anthropic Messages API to describe an image.
 *
 * The image is downscaled once, base64-encoded and sent together with the
 * text prompt as a single user message. The HTTP request is attempted up
 * to three times; each failed attempt is logged.
 *
 * @param imageData - Raw bytes of the image to describe.
 * @param prompt - Text instruction sent alongside the image.
 * @returns The `text` of the first content block in the API response.
 * @throws If the image cannot be resized, or the error of the last failed
 *         attempt once all attempts are exhausted.
 */
private async requestAnthropic(
    imageData: Buffer,
    prompt: string
): Promise<string> {
    const maxAttempts = 3;

    const endpoint =
        models[this.runtime.imageModelProvider].endpoint ??
        "https://api.anthropic.com/v1";

    // Resize image to 400x400 max, keeping the token count ~ 213.
    // This is deterministic, so it is done once outside the retry loop:
    // retrying a failed resize cannot succeed and would only repeat work.
    const resizedImage = await resizeImageBuffer(imageData, 400, 400);

    // The payload does not change between attempts; build it once.
    const requestBody = JSON.stringify({
        model: this.modelId,
        max_tokens: 300,
        messages: [
            {
                role: "user",
                content: [
                    {
                        type: "image",
                        source: {
                            type: "base64",
                            media_type: resizedImage.mimeType,
                            data: resizedImage.buffer.toString("base64"),
                        },
                    },
                    {
                        type: "text",
                        text: prompt,
                    },
                ],
            },
        ],
    });

    for (let attempt = 0; attempt < maxAttempts; attempt++) {
        try {
            const response = await fetch(endpoint + "/messages", {
                method: "POST",
                headers: {
                    "Content-Type": "application/json",
                    "x-api-key": `${this.runtime.getSetting("ANTHROPIC_API_KEY")}`,
                    "anthropic-version": "2023-06-01",
                },
                body: requestBody,
            });

            if (!response.ok) {
                // Report the numeric status and the body separately so
                // the log shows what actually went wrong.
                throw new Error(
                    `HTTP error! status: ${response.status}, body: ${await response.text()}`
                );
            }

            const data = await response.json();
            // Validate the shape instead of dereferencing blindly, so an
            // unexpected payload yields a descriptive error.
            const text = data?.content?.[0]?.text;
            if (typeof text !== "string") {
                throw new Error(
                    "Unexpected Anthropic response format: missing content[0].text"
                );
            }
            return text;
        } catch (error) {
            elizaLogger.error(
                `Anthropic request failed (attempt ${attempt + 1}):`,
                error
            );
            if (attempt === maxAttempts - 1) throw error;
        }
    }

    // Not reachable in practice (the last attempt rethrows above); kept so
    // every code path ends in a return or a throw.
    throw new Error(
        "Failed to recognize image with Anthropic after 3 attempts"
    );
}

private async processQueue(): Promise<void> {
if (this.processing || this.queue.length === 0) return;

Expand Down
79 changes: 79 additions & 0 deletions packages/plugin-node/src/services/imageUtils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import sharp from "sharp";
import * as FileType from "file-type/core";

/** Width and height of an image (filled from sharp metadata / computed resize targets in this file). */
interface ImageDimensions {
width: number;
height: number;
}

/** Result returned by `resizeImageBuffer`. */
interface ProcessedImage {
/** Image bytes produced by the resize step. */
buffer: Buffer;
/** MIME type detected from the input buffer's content. */
mimeType: string;
/** Size of the image before processing and the target size computed for the resize. */
dimensions: {
original: ImageDimensions;
resized: ImageDimensions;
};
}

/**
 * Downscale an image so that it fits within `maxWidth` x `maxHeight`,
 * preserving its aspect ratio. Images already within the limits are not
 * enlarged.
 *
 * @param imageBuffer - Raw bytes of the source image.
 * @param maxWidth - Maximum allowed width of the result.
 * @param maxHeight - Maximum allowed height of the result.
 * @returns The resized bytes, the MIME type detected from the input, and
 *          the original and target dimensions.
 * @throws Error prefixed with "Image processing failed" when the buffer is
 *         not a recognised image, its dimensions cannot be read, or the
 *         resize itself fails.
 */
export async function resizeImageBuffer(
    imageBuffer: Buffer,
    maxWidth: number,
    maxHeight: number
): Promise<ProcessedImage> {
    try {
        // Detect MIME type from the buffer's content
        const fileTypeResult = await FileType.fileTypeFromBuffer(imageBuffer);
        if (!fileTypeResult || !fileTypeResult.mime.startsWith("image/")) {
            throw new Error("Invalid image format");
        }

        // Get original image metadata
        const metadata = await sharp(imageBuffer).metadata();
        if (!metadata.width || !metadata.height) {
            throw new Error("Could not get image dimensions");
        }

        // Calculate new dimensions maintaining aspect ratio
        let width = metadata.width;
        let height = metadata.height;

        // For extreme aspect ratios (e.g. 10000x1) the scaled side can
        // round down to 0, which is not a valid resize target; clamp the
        // derived side to at least 1.
        if (width > maxWidth) {
            height = Math.max(1, Math.round((maxWidth * height) / width));
            width = maxWidth;
        }

        if (height > maxHeight) {
            width = Math.max(1, Math.round((maxHeight * width) / height));
            height = maxHeight;
        }

        // Process the image
        const resizedBuffer = await sharp(imageBuffer)
            .resize(width, height, {
                fit: "inside",
                withoutEnlargement: true,
            })
            .toBuffer();

        return {
            buffer: resizedBuffer,
            mimeType: fileTypeResult.mime,
            dimensions: {
                original: {
                    width: metadata.width,
                    height: metadata.height,
                },
                resized: {
                    width,
                    height,
                },
            },
        };
    } catch (error) {
        // Normalise every failure to a single, prefixed error message.
        if (error instanceof Error) {
            throw new Error(`Image processing failed: ${error.message}`);
        }
        throw new Error("Image processing failed with unknown error");
    }
}
Loading

0 comments on commit 64342ea

Please sign in to comment.