From 10661333e275111a5bf14c06e7acc682b7b24515 Mon Sep 17 00:00:00 2001
From: yutao
Date: Fri, 20 Dec 2024 14:51:49 +0800
Subject: [PATCH] feat: update docs for more models

---
 .gitignore                                    |  1 +
 .vscode/settings.json                         |  4 +-
 apps/site/docs/en/faq.md                      |  6 +--
 apps/site/docs/en/model-provider.md           | 39 ++++++++++++++++---
 apps/site/docs/zh/faq.md                      |  6 +--
 apps/site/docs/zh/model-provider.md           | 38 +++++++++++++++---
 .../midscene/tests/ai/connectivity.test.ts    | 11 +++++-
 .../tests/ai/evaluate/inspect.test.ts         |  3 +-
 8 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index c34a6aea..7df6cec2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,6 +52,7 @@ jspm_packages/
 
 # dotenv environment variables file
 .env
+.env.*
 
 # next.js build output
 .next
diff --git a/.vscode/settings.json b/.vscode/settings.json
index a519cb2d..439ea43f 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -8,10 +8,12 @@
     "AITEST",
     "Aliyun",
     "aweme",
+    "doubao",
     "douyin",
     "httpbin",
     "iconfont",
     "qwen",
-    "taobao"
+    "taobao",
+    "Volcengine"
   ]
 }
diff --git a/apps/site/docs/en/faq.md b/apps/site/docs/en/faq.md
index d583c925..fc39b160 100644
--- a/apps/site/docs/en/faq.md
+++ b/apps/site/docs/en/faq.md
@@ -14,11 +14,9 @@ There are some limitations with Midscene. We are still working on them.
 2. LLM is not 100% stable. Even GPT-4o can't return the right answer all the time. Following the [Prompting Tips](./prompting-tips) will help improve stability.
 3. Since we use JavaScript to retrieve items from the page, the elements inside the iframe cannot be accessed.
 
-## Which LLM should I choose ?
+## Can I use a model other than `gpt-4o`?
 
-Midscene needs a multimodal Large Language Model (LLM) to understand the UI. Currently, we find that OpenAI's GPT-4o performs much better than others.
-
-You can [customize model and provider](./model-provider.html) if needed.
+Yes. You can [customize the model and provider](./model-provider.html) if needed.
 
 ## About the token cost
 
diff --git a/apps/site/docs/en/model-provider.md b/apps/site/docs/en/model-provider.md
index af094c5f..a15adb6f 100644
--- a/apps/site/docs/en/model-provider.md
+++ b/apps/site/docs/en/model-provider.md
@@ -30,7 +30,7 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke
 export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080"
 ```
 
-Using Azure OpenAI Service:
+## Using Azure OpenAI Service
 
 ```bash
 export MIDSCENE_USE_AZURE_OPENAI=1
 export MIDSCENE_AZURE_OPENAI_SCOPE="https://cognitiveservices.azure.com/.default
 export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-preview", "endpoint": "...", "deployment": "..."}'
 ```
 
-Note:
+## Choose a model other than `gpt-4o`
 
-- Always choose a model that supports vision input.
-- Currently, the known supported models are: `gpt-4o`, `qwen-vl-max-latest`, `gemini-1.5-pro`
-- Please follow the terms of use of each model.
+We find that `gpt-4o` performs best for Midscene at the moment. The other known supported models are `gemini-1.5-pro`, `qwen-vl-max-latest`, and `doubao-vision-pro-32k`.
 
-## Example: Using `qwen-vl-max-latest` service from Aliyun
+If you want to use other models, please follow these steps:
+
+1. Choose a model that supports image input (i.e. a multimodal model).
+2. Find out how to call it with an OpenAI SDK compatible endpoint. Usually you should set `OPENAI_BASE_URL`, `OPENAI_API_KEY` and `MIDSCENE_MODEL_NAME`; the sketch after this list shows a quick way to verify them.
+3. If you find it not working well after changing the model, try using short and clear prompts (or roll back to the previous model). See more details in [Prompting Tips](./prompting-tips.html).
+4. Remember to follow the terms of use of each model.
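+
+To sanity-check a new endpoint before wiring it into Midscene, a minimal script along these lines can help. This is only a sketch: it assumes the `openai` npm package is installed and the three variables above are already exported, and the model name is whatever your provider expects.
+
+```typescript
+import OpenAI from 'openai';
+
+// Point the SDK at the provider's OpenAI-compatible endpoint.
+const client = new OpenAI({
+  baseURL: process.env.OPENAI_BASE_URL,
+  apiKey: process.env.OPENAI_API_KEY,
+});
+
+// One tiny request is enough to confirm that the endpoint and model name respond.
+const completion = await client.chat.completions.create({
+  model: process.env.MIDSCENE_MODEL_NAME ?? 'gpt-4o',
+  messages: [{ role: 'user', content: 'Reply with the word "ok".' }],
+});
+
+console.log(completion.choices[0]?.message?.content);
+```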
+
+## Example: Using `gemini-1.5-pro` from Google
+
+Configure the environment variables:
+
+```bash
+export OPENAI_BASE_URL="https://generativelanguage.googleapis.com/v1beta/openai"
+export OPENAI_API_KEY="....."
+export MIDSCENE_MODEL_NAME="gemini-1.5-pro"
+```
+
+## Example: Using `qwen-vl-max-latest` from Aliyun
 
 Configure the environment variables:
 
 ```bash
 export OPENAI_API_KEY="sk-..."
 export OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1"
 export MIDSCENE_MODEL_NAME="qwen-vl-max-latest"
 ```
+
+## Example: Using `doubao-vision-pro-32k` from Volcengine
+
+Create an inference point first: https://console.volcengine.com/ark/region:ark+cn-beijing/endpoint
+
+Configure the environment variables:
+
+```bash
+export OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3"
+export OPENAI_API_KEY="..."
+export MIDSCENE_MODEL_NAME="ep-202....."
+```
diff --git a/apps/site/docs/zh/faq.md b/apps/site/docs/zh/faq.md
index 87926ec1..7f70e2f2 100644
--- a/apps/site/docs/zh/faq.md
+++ b/apps/site/docs/zh/faq.md
@@ -16,11 +16,9 @@ Midscene 存在一些局限性,我们仍在努力改进。
 2. 稳定性风险:即使是 GPT-4o 也无法确保 100% 返回正确答案。遵循 [编写提示词的技巧](./prompting-tips) 可以帮助提高 SDK 稳定性。
 3. 元素访问受限:由于我们使用 JavaScript 从页面提取元素,所以无法访问 iframe 内部的元素。
 
-## 选用那个 LLM 模型?
+## 能否选用 `gpt-4o` 以外的其他模型?
 
-Midscene 需要一个能够理解用户界面的多模态大型语言模型。目前,我们发现 OpenAI 的 GPT-4o 表现最好,远超其它模型。
-
-你可以根据需要[自定义模型和服务商](./model-provider.html)。
+可以。你可以根据需要[自定义模型和服务商](./model-provider.html)。
 
 ## 关于 token 成本
 
diff --git a/apps/site/docs/zh/model-provider.md b/apps/site/docs/zh/model-provider.md
index bdb94922..c70b7de1 100644
--- a/apps/site/docs/zh/model-provider.md
+++ b/apps/site/docs/zh/model-provider.md
@@ -27,7 +27,7 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke
 export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080"
 ```
 
-使用 Azure OpenAI 服务时的配置:
+## 使用 Azure OpenAI 服务时的配置
 
 ```bash
 export MIDSCENE_USE_AZURE_OPENAI=1
 export MIDSCENE_AZURE_OPENAI_SCOPE="https://cognitiveservices.azure.com/.default
 export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-preview", "endpoint": "...", "deployment": "..."}'
 ```
 
-说明:
+## 选用 `gpt-4o` 以外的其他模型
 
-- 务必选择一个支持视觉输入的模型。目前我们已知支持的模型有:`gpt-4o`, `qwen-vl-max-latest` (千问), `gemini-1.5-pro`
-- 请遵守各项模型的使用条款
+我们发现 `gpt-4o` 是目前表现最佳的模型。其他已知支持的模型有:`qwen-vl-max-latest` (千问), `gemini-1.5-pro`, `doubao-vision-pro-32k` (豆包)
 
-## 示例:使用部署在阿里云的 `qwen-vl-max-latest` 模型
+如果你想要使用其他模型,请遵循以下步骤:
+
+1. 选择一个支持视觉输入的模型(也就是“多模态模型”)。
+2. 找出如何使用 OpenAI SDK 兼容的方式调用它,模型提供商一般都会提供这样的接入点,你需要配置的是 `OPENAI_BASE_URL`, `OPENAI_API_KEY` 和 `MIDSCENE_MODEL_NAME`。
+3. 如果发现使用新模型后效果不佳,可以尝试使用一些简短且清晰的提示词(或回滚到之前的模型)。更多详情请参阅 [Prompting Tips](./prompting-tips.html)。
+4. 请遵守各模型的使用条款。
+
+## 示例:使用 Google 的 `gemini-1.5-pro` 模型
+
+配置环境变量:
+
+```bash
+export OPENAI_BASE_URL="https://generativelanguage.googleapis.com/v1beta/openai"
+export OPENAI_API_KEY="....."
+export MIDSCENE_MODEL_NAME="gemini-1.5-pro"
+```
+
+## 示例:使用阿里云的 `qwen-vl-max-latest` 模型
 
 配置环境变量:
 
 ```bash
 export OPENAI_API_KEY="sk-..."
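+# 下两行分别设置 DashScope 的 OpenAI 兼容接入点和要使用的多模态模型名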
export OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1" export MIDSCENE_MODEL_NAME="qwen-vl-max-latest" ``` + +## 示例:使用火山云的豆包 `doubao-vision-pro-32k` 模型 + +调用前需要配置推理点:https://console.volcengine.com/ark/region:ark+cn-beijing/endpoint + +配置环境变量: + +```bash +export OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" +export OPENAI_API_KEY="..." +export MIDSCENE_MODEL_NAME="ep-202....." +``` diff --git a/packages/midscene/tests/ai/connectivity.test.ts b/packages/midscene/tests/ai/connectivity.test.ts index a8617ad4..ec742ead 100644 --- a/packages/midscene/tests/ai/connectivity.test.ts +++ b/packages/midscene/tests/ai/connectivity.test.ts @@ -1,7 +1,15 @@ import { AIActionType } from '@/ai-model/common'; import { call, callToGetJSONObject } from '@/ai-model/openai'; +import { base64Encoded } from '@/image'; +import dotenv from 'dotenv'; +import { getFixture } from 'tests/utils'; import { describe, expect, it, vi } from 'vitest'; +const result = dotenv.config({ debug: true }); +if (result.error) { + throw result.error; +} + vi.setConfig({ testTimeout: 20 * 1000, }); @@ -39,6 +47,7 @@ describe('openai sdk connectivity', () => { }); it('image input', async () => { + const imagePath = getFixture('baidu.png'); const result = await call([ { role: 'user', @@ -50,7 +59,7 @@ describe('openai sdk connectivity', () => { { type: 'image_url', image_url: { - url: 'https://portal.volccdn.com/obj/volcfe/bee_prod/biz_950/tos_38e6e81e1366482ed046045e72b0684d.png', + url: base64Encoded(imagePath), detail: 'high', }, }, diff --git a/packages/midscene/tests/ai/evaluate/inspect.test.ts b/packages/midscene/tests/ai/evaluate/inspect.test.ts index 236940a5..049be393 100644 --- a/packages/midscene/tests/ai/evaluate/inspect.test.ts +++ b/packages/midscene/tests/ai/evaluate/inspect.test.ts @@ -1,4 +1,4 @@ -import { readFileSync, writeFileSync } from 'node:fs'; +import { readFileSync } from 'node:fs'; import path from 'node:path'; import { describe } from 'node:test'; import { AiInspectElement, plan } from '@/ai-model'; @@ -13,6 +13,7 @@ import { repeat, runTestCases, } from './test-suite/util'; +import 'dotenv/config'; const repeatTime = 2; const relocateAfterPlanning = false;
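For reference, the connectivity test above now sends the local `baidu.png` fixture as a base64 data URL instead of fetching a remote image. The swap boils down to something like the sketch below; `toDataUrl` and the fixture path are illustrative stand-ins for the repo's own `base64Encoded` helper and `getFixture('baidu.png')`.

```typescript
import { readFileSync } from 'node:fs';

// Illustrative stand-in: read a local PNG and wrap it as a data URL.
function toDataUrl(imagePath: string): string {
  const base64 = readFileSync(imagePath).toString('base64');
  return `data:image/png;base64,${base64}`;
}

// Mirrors the image_url message part used in the test above.
const imagePart = {
  type: 'image_url' as const,
  image_url: {
    url: toDataUrl('./fixtures/baidu.png'), // hypothetical local fixture path
    detail: 'high' as const,
  },
};

console.log(imagePart.image_url.url.slice(0, 40)); // e.g. "data:image/png;base64,iVBORw0KGgo..."
```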