diff --git a/apps/site/docs/en/model-provider.md b/apps/site/docs/en/model-provider.md index a15adb6f..48898ede 100644 --- a/apps/site/docs/en/model-provider.md +++ b/apps/site/docs/en/model-provider.md @@ -40,7 +40,7 @@ export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-previe ## Choose a model other than `gpt-4o` -We find that `gpt-4o` performs the best for Midscene at this moment. The other known supported models are: `gemini-1.5-pro`, `qwen-vl-max-latest`, `doubao-vision-pro-32k` +We find that `gpt-4o` performs the best for Midscene at this moment. The other known supported models are `claude-3-opus-20240229`, `gemini-1.5-pro`, `qwen-vl-max-latest`, `doubao-vision-pro-32k` If you want to use other models, please follow these steps: @@ -49,6 +49,18 @@ If you want to use other models, please follow these steps: 3. If you find it not working well after changing the model, you can try using some short and clear prompt (or roll back to the previous model). See more details in [Prompting Tips](./prompting-tips.html). 4. Remember to follow the terms of use of each model. +## Example: Using `claude-3-opus-20240229` from Anthropic + +When configuring `MIDSCENE_USE_ANTHROPIC_SDK=1`, Midscene will use Anthropic SDK (`@anthropic-ai/sdk`) to call the model. + +Configure the environment variables: + +```bash +export MIDSCENE_USE_ANTHROPIC_SDK=1 +export ANTHROPIC_API_KEY="....." +export MIDSCENE_MODEL_NAME="claude-3-opus-20240229" +``` + ## Example: Using `gemini-1.5-pro` from Google Configure the environment variables: @@ -80,3 +92,9 @@ export OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" export OPENAI_API_KEY="..." export MIDSCENE_MODEL_NAME="ep-202....." ``` + +## Troubleshooting LLM Service Connectivity Issues + +If you want to troubleshoot connectivity issues, you can use the 'connectivity-test' folder in our example project: [https://github.com/web-infra-dev/midscene-example/tree/main/connectivity-test](https://github.com/web-infra-dev/midscene-example/tree/main/connectivity-test) + +Put your `.env` file in the `connectivity-test` folder, and run the test with `npm i && npm run test`. diff --git a/apps/site/docs/zh/model-provider.md b/apps/site/docs/zh/model-provider.md index c70b7de1..6bd76be5 100644 --- a/apps/site/docs/zh/model-provider.md +++ b/apps/site/docs/zh/model-provider.md @@ -37,7 +37,7 @@ export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-previe ## 选用 `gpt-4o` 以外的其他模型 -我们发现 `gpt-4o` 是目前表现最佳的模型。其他已知支持的模型有:`qwen-vl-max-latest` (千问), `gemini-1.5-pro`, `doubao-vision-pro-32k` (豆包) +我们发现 `gpt-4o` 是目前表现最佳的模型。其他已知支持的模型有:`claude-3-opus-20240229`, `gemini-1.5-pro`, `qwen-vl-max-latest` (千问), `doubao-vision-pro-32k` (豆包) 如果你想要使用其他模型,请遵循以下步骤: @@ -46,24 +46,36 @@ export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-previe 3. 如果发现使用新模型后效果不佳,可以尝试使用一些简短且清晰的提示词(或回滚到之前的模型)。更多详情请参阅 [Prompting Tips](./prompting-tips.html)。 4. 请遵守各模型的使用条款。 -## 示例:使用 Google 的 `gemini-1.5-pro` 模型 +## 示例:使用阿里云的 `qwen-vl-max-latest` 模型 配置环境变量: ```bash -export OPENAI_BASE_URL="https://generativelanguage.googleapis.com/v1beta/openai" -export OPENAI_API_KEY="....." -export MIDSCENE_MODEL_NAME="gemini-1.5-pro" +export OPENAI_API_KEY="sk-..." 
+export OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1" +export MIDSCENE_MODEL_NAME="qwen-vl-max-latest" ``` -## 示例:使用阿里云的 `qwen-vl-max-latest` 模型 +## 示例:使用 Anthropic 的 `claude-3-opus-20240229` 模型 + +当配置 `MIDSCENE_USE_ANTHROPIC_SDK=1` 时,Midscene 会使用 Anthropic SDK (`@anthropic-ai/sdk`) 来调用模型。 配置环境变量: ```bash -export OPENAI_API_KEY="sk-..." -export OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1" -export MIDSCENE_MODEL_NAME="qwen-vl-max-latest" +export MIDSCENE_USE_ANTHROPIC_SDK=1 +export ANTHROPIC_API_KEY="....." +export MIDSCENE_MODEL_NAME="claude-3-opus-20240229" +``` + +## 示例:使用 Google 的 `gemini-1.5-pro` 模型 + +配置环境变量: + +```bash +export OPENAI_BASE_URL="https://generativelanguage.googleapis.com/v1beta/openai" +export OPENAI_API_KEY="....." +export MIDSCENE_MODEL_NAME="gemini-1.5-pro" ``` ## 示例:使用火山云的豆包 `doubao-vision-pro-32k` 模型 @@ -77,3 +89,9 @@ export OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" export OPENAI_API_KEY="..." export MIDSCENE_MODEL_NAME="ep-202....." ``` + +## 调试 LLM 服务连接问题 + +如果你想要调试 LLM 服务连接问题,可以使用示例项目中的 `connectivity-test` 目录:[https://github.com/web-infra-dev/midscene-example/tree/main/connectivity-test](https://github.com/web-infra-dev/midscene-example/tree/main/connectivity-test) + +将你的 `.env` 文件放在 `connectivity-test` 文件夹中,然后运行 `npm i && npm run test` 来查看问题。 diff --git a/packages/midscene/package.json b/packages/midscene/package.json index dfd69899..73f8e91f 100644 --- a/packages/midscene/package.json +++ b/packages/midscene/package.json @@ -37,6 +37,7 @@ "prepublishOnly": "npm run build" }, "dependencies": { + "@anthropic-ai/sdk": "0.33.1", "@azure/identity": "4.5.0", "@midscene/shared": "workspace:*", "dirty-json": "0.9.2", diff --git a/packages/midscene/src/ai-model/common.ts b/packages/midscene/src/ai-model/common.ts index 0d74a64b..e5674fe0 100644 --- a/packages/midscene/src/ai-model/common.ts +++ b/packages/midscene/src/ai-model/common.ts @@ -1,11 +1,13 @@ +import assert from 'node:assert'; import { MIDSCENE_MODEL_TEXT_ONLY, getAIConfig } from '@/env'; import type { AIUsageInfo } from '@/types'; + import type { ChatCompletionContentPart, ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam, } from 'openai/resources'; -import { callToGetJSONObject, preferOpenAIModel } from './openai'; +import { callToGetJSONObject, checkAIConfig } from './openai'; export type AIArgs = [ ChatCompletionSystemMessageParam, @@ -24,17 +26,16 @@ export async function callAiFn(options: { AIActionType: AIActionType; }): Promise<{ content: T; usage?: AIUsageInfo }> { const { msgs, AIActionType: AIActionTypeValue } = options; - if (preferOpenAIModel('openAI')) { - const { content, usage } = await callToGetJSONObject( - msgs, - AIActionTypeValue, - ); - return { content, usage }; - } + assert( + checkAIConfig(), + 'Cannot find config for AI model service. You should set it before using. https://midscenejs.com/model-provider.html', + ); - throw Error( - 'Cannot find OpenAI config. You should set it before using. 
https://midscenejs.com/model-provider.html', + const { content, usage } = await callToGetJSONObject( + msgs, + AIActionTypeValue, ); + return { content, usage }; } export function transformUserMessages(msgs: ChatCompletionContentPart[]) { diff --git a/packages/midscene/src/ai-model/openai/index.ts b/packages/midscene/src/ai-model/openai/index.ts index 44bbfd5f..e9767e7e 100644 --- a/packages/midscene/src/ai-model/openai/index.ts +++ b/packages/midscene/src/ai-model/openai/index.ts @@ -1,5 +1,6 @@ import assert from 'node:assert'; import { AIResponseFormat, type AIUsageInfo } from '@/types'; +import { Anthropic } from '@anthropic-ai/sdk'; import { DefaultAzureCredential, getBearerTokenProvider, @@ -10,6 +11,7 @@ import OpenAI, { AzureOpenAI } from 'openai'; import type { ChatCompletionMessageParam } from 'openai/resources'; import { SocksProxyAgent } from 'socks-proxy-agent'; import { + ANTHROPIC_API_KEY, MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_AZURE_OPENAI_SCOPE, MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG, @@ -18,6 +20,7 @@ import { MIDSCENE_MODEL_NAME, MIDSCENE_OPENAI_INIT_CONFIG_JSON, MIDSCENE_OPENAI_SOCKS_PROXY, + MIDSCENE_USE_ANTHROPIC_SDK, MIDSCENE_USE_AZURE_OPENAI, OPENAI_API_KEY, OPENAI_BASE_URL, @@ -31,10 +34,11 @@ import { findElementSchema } from '../prompt/element_inspector'; import { planSchema } from '../prompt/planning'; import { assertSchema } from '../prompt/util'; -export function preferOpenAIModel(preferVendor?: 'coze' | 'openAI') { +export function checkAIConfig(preferVendor?: 'coze' | 'openAI') { if (preferVendor && preferVendor !== 'openAI') return false; if (getAIConfig(OPENAI_API_KEY)) return true; if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) return true; + if (getAIConfig(ANTHROPIC_API_KEY)) return true; return Boolean(getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON)); } @@ -50,8 +54,11 @@ export function getModelName() { return modelName; } -async function createOpenAI() { - let openai: OpenAI | AzureOpenAI; +async function createChatClient(): Promise<{ + completion: OpenAI.Chat.Completions; + style: 'openai' | 'anthropic'; +}> { + let openai: OpenAI | AzureOpenAI | undefined; const extraConfig = getAIConfigInJson(MIDSCENE_OPENAI_INIT_CONFIG_JSON); const socksProxy = getAIConfig(MIDSCENE_OPENAI_SOCKS_PROXY); @@ -65,7 +72,7 @@ async function createOpenAI() { httpAgent: socksAgent, ...extraConfig, dangerouslyAllowBrowser: true, - }); + }) as OpenAI; } else if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) { // sample code: https://github.com/Azure/azure-sdk-for-js/blob/main/sdk/openai/openai/samples/cookbook/simpleCompletionsPage/app.js const scope = getAIConfig(MIDSCENE_AZURE_OPENAI_SCOPE); @@ -87,7 +94,7 @@ async function createOpenAI() { ...extraConfig, ...extraAzureConfig, }); - } else { + } else if (!getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) { openai = new OpenAI({ baseURL: getAIConfig(OPENAI_BASE_URL), apiKey: getAIConfig(OPENAI_API_KEY), @@ -97,7 +104,7 @@ async function createOpenAI() { }); } - if (getAIConfig(MIDSCENE_LANGSMITH_DEBUG)) { + if (openai && getAIConfig(MIDSCENE_LANGSMITH_DEBUG)) { if (ifInBrowser) { throw new Error('langsmith is not supported in browser'); } @@ -106,7 +113,30 @@ async function createOpenAI() { openai = wrapOpenAI(openai); } - return openai; + if (typeof openai !== 'undefined') { + return { + completion: openai.chat.completions, + style: 'openai', + }; + } + + // Anthropic + if (getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) { + const apiKey = getAIConfig(ANTHROPIC_API_KEY); + assert(apiKey, 'ANTHROPIC_API_KEY is required'); + openai = new 
Anthropic({ + apiKey, + }) as any; + } + + if (typeof openai !== 'undefined' && (openai as any).messages) { + return { + completion: (openai as any).messages, + style: 'anthropic', + }; + } + + throw new Error('Openai SDK or Anthropic SDK is not initialized'); } export async function call( @@ -115,32 +145,74 @@ export async function call( | OpenAI.ChatCompletionCreateParams['response_format'] | OpenAI.ResponseFormatJSONObject, ): Promise<{ content: string; usage?: AIUsageInfo }> { - const openai = await createOpenAI(); + const { completion, style } = await createChatClient(); const shouldPrintTiming = typeof getAIConfig(MIDSCENE_DEBUG_AI_PROFILE) === 'string'; - if (getAIConfig(MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG)) { - console.log(allAIConfig()); - } + const startTime = Date.now(); const model = getModelName(); - const completion = await openai.chat.completions.create({ - model, - messages, - response_format: responseFormat, + let content: string | undefined; + let usage: OpenAI.CompletionUsage | undefined; + const commonConfig = { temperature: 0.1, stream: false, - // betas: ['computer-use-2024-10-22'], - } as any); - shouldPrintTiming && - console.log( - 'Midscene - AI call', + max_tokens: 3000, + }; + if (style === 'openai') { + const result = await completion.create({ model, - completion.usage, - `${Date.now() - startTime}ms`, - ); - const { content } = completion.choices[0].message; - assert(content, 'empty content'); - return { content, usage: completion.usage }; + messages, + response_format: responseFormat, + ...commonConfig, + // betas: ['computer-use-2024-10-22'], + } as any); + shouldPrintTiming && + console.log( + 'Midscene - AI call', + model, + result.usage, + `${Date.now() - startTime}ms`, + ); + content = result.choices[0].message.content!; + assert(content, 'empty content'); + usage = result.usage; + } else if (style === 'anthropic') { + const convertImageContent = (content: any) => { + if (content.type === 'image_url') { + const imgBase64 = content.image_url.url; + assert(imgBase64, 'image_url is required'); + return { + source: { + type: 'base64', + media_type: imgBase64.includes('data:image/png;base64,') + ? 'image/png' + : 'image/jpeg', + data: imgBase64.split(',')[1], + }, + type: 'image', + }; + } + return content; + }; + + const result = await completion.create({ + model, + system: 'You are a versatile professional in software UI automation', + messages: messages.map((m) => ({ + role: 'user', + content: Array.isArray(m.content) + ? 
(m.content as any).map(convertImageContent) + : m.content, + })), + response_format: responseFormat, + ...commonConfig, + } as any); + content = (result as any).content[0].text as string; + assert(content, 'empty content'); + usage = result.usage; + } + + return { content: content || '', usage }; } export async function callToGetJSONObject( @@ -166,13 +238,14 @@ export async function callToGetJSONObject( case AIActionType.EXTRACT_DATA: //TODO: Currently the restriction type can only be a json subset of the constraint, and the way the extract api is used needs to be adjusted to limit the user's data to this as well // targetResponseFormat = extractDataSchema; + responseFormat = { type: AIResponseFormat.JSON }; break; case AIActionType.PLAN: responseFormat = planSchema; break; } - if (model === 'gpt-4o-2024-05-13') { + if (model === 'gpt-4o-2024-05-13' || !responseFormat) { responseFormat = { type: AIResponseFormat.JSON }; } } diff --git a/packages/midscene/src/env.ts b/packages/midscene/src/env.ts index b906e218..c3a536ec 100644 --- a/packages/midscene/src/env.ts +++ b/packages/midscene/src/env.ts @@ -21,6 +21,9 @@ export const MIDSCENE_AZURE_OPENAI_SCOPE = 'MIDSCENE_AZURE_OPENAI_SCOPE'; export const MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON = 'MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON'; +export const MIDSCENE_USE_ANTHROPIC_SDK = 'MIDSCENE_USE_ANTHROPIC_SDK'; +export const ANTHROPIC_API_KEY = 'ANTHROPIC_API_KEY'; + // @deprecated export const OPENAI_USE_AZURE = 'OPENAI_USE_AZURE'; @@ -54,6 +57,9 @@ const allConfigFromEnv = () => { 'https://cognitiveservices.azure.com/.default', [MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON]: process.env[MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON] || undefined, + [MIDSCENE_USE_ANTHROPIC_SDK]: + process.env[MIDSCENE_USE_ANTHROPIC_SDK] || undefined, + [ANTHROPIC_API_KEY]: process.env[ANTHROPIC_API_KEY] || undefined, }; }; diff --git a/packages/midscene/tests/ai/connectivity.test.ts b/packages/midscene/tests/ai/connectivity.test.ts index ec742ead..0653d6a9 100644 --- a/packages/midscene/tests/ai/connectivity.test.ts +++ b/packages/midscene/tests/ai/connectivity.test.ts @@ -1,72 +1,98 @@ +import { existsSync } from 'node:fs'; +import path from 'node:path'; import { AIActionType } from '@/ai-model/common'; import { call, callToGetJSONObject } from '@/ai-model/openai'; import { base64Encoded } from '@/image'; import dotenv from 'dotenv'; import { getFixture } from 'tests/utils'; -import { describe, expect, it, vi } from 'vitest'; - -const result = dotenv.config({ debug: true }); -if (result.error) { - throw result.error; -} +import { beforeAll, describe, expect, it, vi } from 'vitest'; vi.setConfig({ testTimeout: 20 * 1000, }); -describe('openai sdk connectivity', () => { - it('connectivity', async () => { - const result = await call([ - { - role: 'system', - content: 'Answer the question', - }, - { - role: 'user', - content: '鲁迅认识周树人吗?回答我:1. 
分析原因 2.回答:是/否/无效问题', - }, - ]); - - expect(result.content.length).toBeGreaterThan(1); - }); +[ + '.env.qwen', + '.env.gemini', + '.env.doubao', + '.env.init_json', + '.env.anthropic', + '.env.openai', +].forEach((envFile) => { + const configPath = path.resolve(__dirname, `../../${envFile}`); + if (!existsSync(configPath)) { + return; + } - it('call to get json result', async () => { - const result = await callToGetJSONObject<{ answer: number }>( - [ - { - role: 'system', - content: 'Answer the question with JSON: {answer: number}', - }, - { - role: 'user', - content: '3 x 5 = ?', - }, - ], - AIActionType.EXTRACT_DATA, - ); - expect(result.content).toEqual({ answer: 15 }); - }); + describe.skipIf(process.env.CI)( + `LLM service connectivity: ${envFile}`, + () => { + beforeAll(() => { + const result = dotenv.config({ + debug: true, + path: configPath, + override: true, + }); + if (result.error) { + throw result.error; + } + }); - it('image input', async () => { - const imagePath = getFixture('baidu.png'); - const result = await call([ - { - role: 'user', - content: [ + it('text only', async () => { + const result = await call([ { - type: 'text', - text: 'Describe this image in one sentence.', + role: 'system', + content: 'Answer the question', }, { - type: 'image_url', - image_url: { - url: base64Encoded(imagePath), - detail: 'high', + role: 'user', + content: + '鲁迅认识周树人吗?回答我:1. 分析原因 2.回答:是/否/无效问题', + }, + ]); + + expect(result.content.length).toBeGreaterThan(1); + }); + + it('call to get json result', async () => { + const result = await callToGetJSONObject<{ answer: number }>( + [ + { + role: 'system', + content: 'Answer the question with JSON: {answer: number}', + }, + { + role: 'user', + content: '3 x 5 = ?', }, + ], + AIActionType.EXTRACT_DATA, + ); + expect(result.content).toEqual({ answer: 15 }); + }); + + it('image input', async () => { + const imagePath = getFixture('baidu.png'); + const result = await call([ + { + role: 'user', + content: [ + { + type: 'text', + text: 'Describe this image in one sentence.', + }, + { + type: 'image_url', + image_url: { + url: base64Encoded(imagePath), + detail: 'high', + }, + }, + ], }, - ], - }, - ]); + ]); - expect(result.content.length).toBeGreaterThan(10); - }); + expect(result.content.length).toBeGreaterThan(10); + }); + }, + ); }); diff --git a/packages/web-integration/src/common/tasks.ts b/packages/web-integration/src/common/tasks.ts index 225d108d..85e4ae51 100644 --- a/packages/web-integration/src/common/tasks.ts +++ b/packages/web-integration/src/common/tasks.ts @@ -91,10 +91,17 @@ export class PageTaskExecutor { (async () => { await sleep(100); if ((this.page as PuppeteerWebPage).waitUntilNetworkIdle) { - await (this.page as PuppeteerWebPage).waitUntilNetworkIdle(); + try { + await (this.page as PuppeteerWebPage).waitUntilNetworkIdle({ + idleTime: 100, + timeout: 800, + }); + } catch (error) { + // console.error('waitUntilNetworkIdle error', error); + } } })(), - sleep(300), + sleep(200), ]); } if (appendAfterExecution) { diff --git a/packages/web-integration/src/puppeteer/page.ts b/packages/web-integration/src/puppeteer/page.ts index 4ab8cdf8..d007e9a7 100644 --- a/packages/web-integration/src/puppeteer/page.ts +++ b/packages/web-integration/src/puppeteer/page.ts @@ -9,10 +9,12 @@ export class WebPage extends BasePage<'puppeteer', PuppeteerPageType> { async waitUntilNetworkIdle(options?: { idleTime?: number; concurrency?: number; + timeout?: number; }): Promise { await this.underlyingPage.waitForNetworkIdle({ - idleTime: 
options?.idleTime || 500, + idleTime: options?.idleTime || 300, concurrency: options?.concurrency || 2, + timeout: options?.timeout || 15000, }); } } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a05b512e..d0b79b2b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -142,6 +142,9 @@ importers: packages/midscene: dependencies: + '@anthropic-ai/sdk': + specifier: 0.33.1 + version: 0.33.1 '@azure/identity': specifier: 4.5.0 version: 4.5.0 @@ -415,6 +418,9 @@ packages: peerDependencies: react: '>=16.9.0' + '@anthropic-ai/sdk@0.33.1': + resolution: {integrity: sha512-VrlbxiAdVRGuKP2UQlCnsShDHJKWepzvfRCkZMpU+oaUdKLpOfmylLMRojGrAgebV+kDtPjewCVP0laHXg+vsA==} + '@arr/every@1.0.1': resolution: {integrity: sha512-UQFQ6SgyJ6LX42W8rHCs8KVc0JS0tzVL9ct4XYedJukskYVWTo49tNiMEK9C2HTyarbNiT/RVIRSY82vH+6sTg==} engines: {node: '>=4'} @@ -10058,6 +10064,18 @@ snapshots: resize-observer-polyfill: 1.5.1 throttle-debounce: 5.0.2 + '@anthropic-ai/sdk@0.33.1': + dependencies: + '@types/node': 18.19.62 + '@types/node-fetch': 2.6.11 + abort-controller: 3.0.0 + agentkeepalive: 4.5.0 + form-data-encoder: 1.7.2 + formdata-node: 4.4.1 + node-fetch: 2.7.0 + transitivePeerDependencies: + - encoding + '@arr/every@1.0.1': {} '@ast-grep/napi-darwin-arm64@0.16.0':
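
---

Note: the troubleshooting sections added to both docs above tell readers to drop a `.env` file into the example project's `connectivity-test` folder, but never show what that file contains. A minimal sketch follows, assuming you keep only the block for the provider you are testing; the variable names are taken from the docs in this diff, and the key values are placeholders.

```bash
# Hypothetical .env for midscene-example/connectivity-test -- keep one provider block only.

# OpenAI-compatible endpoints (OpenAI, Qwen, Doubao, Gemini's OpenAI-compatible API, ...)
OPENAI_API_KEY="sk-..."
OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1"  # provider-specific
MIDSCENE_MODEL_NAME="qwen-vl-max-latest"

# Anthropic SDK path introduced by this change
# MIDSCENE_USE_ANTHROPIC_SDK=1
# ANTHROPIC_API_KEY="sk-ant-..."
# MIDSCENE_MODEL_NAME="claude-3-opus-20240229"
```

With that file in place, `npm i && npm run test` inside `connectivity-test` exercises the configured provider, as described in the docs.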
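For maintainers, the rewritten `packages/midscene/tests/ai/connectivity.test.ts` now loads per-provider env files (`.env.qwen`, `.env.gemini`, `.env.doubao`, `.env.init_json`, `.env.anthropic`, `.env.openai`) from the package root, skips any provider whose file is absent, and skips everything on CI. Below is a sketch of how the new Anthropic code path could be exercised locally; the direct vitest invocation is an assumption, so use whichever test script the package actually defines.

```bash
# Sketch only: write the provider-specific env file the test resolves via
# path.resolve(__dirname, '../../.env.anthropic'), i.e. packages/midscene/.env.anthropic.
cat > packages/midscene/.env.anthropic <<'EOF'
MIDSCENE_USE_ANTHROPIC_SDK=1
ANTHROPIC_API_KEY=sk-ant-...
MIDSCENE_MODEL_NAME=claude-3-opus-20240229
EOF

# Then run the connectivity suite from the package directory (assumes vitest is available).
cd packages/midscene
npx vitest run tests/ai/connectivity.test.ts
```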