From 10661333e275111a5bf14c06e7acc682b7b24515 Mon Sep 17 00:00:00 2001
From: yutao
Date: Fri, 20 Dec 2024 14:51:49 +0800
Subject: [PATCH] feat: update docs for more models

---
 .gitignore                                    |  1 +
 .vscode/settings.json                         |  4 +-
 apps/site/docs/en/faq.md                      |  6 +--
 apps/site/docs/en/model-provider.md           | 39 ++++++++++++++++---
 apps/site/docs/zh/faq.md                      |  6 +--
 apps/site/docs/zh/model-provider.md           | 38 +++++++++++++++---
 .../midscene/tests/ai/connectivity.test.ts    | 11 +++++-
 .../tests/ai/evaluate/inspect.test.ts         |  3 +-
 8 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/.gitignore b/.gitignore
index c34a6aea..7df6cec2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -52,6 +52,7 @@ jspm_packages/
 
 # dotenv environment variables file
 .env
+.env.*
 
 # next.js build output
 .next
diff --git a/.vscode/settings.json b/.vscode/settings.json
index a519cb2d..439ea43f 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -8,10 +8,12 @@
     "AITEST",
     "Aliyun",
     "aweme",
+    "doubao",
     "douyin",
     "httpbin",
     "iconfont",
     "qwen",
-    "taobao"
+    "taobao",
+    "Volcengine"
   ]
 }
diff --git a/apps/site/docs/en/faq.md b/apps/site/docs/en/faq.md
index d583c925..fc39b160 100644
--- a/apps/site/docs/en/faq.md
+++ b/apps/site/docs/en/faq.md
@@ -14,11 +14,9 @@ There are some limitations with Midscene. We are still working on them.
 2. LLM is not 100% stable. Even GPT-4o can't return the right answer all the time. Following the [Prompting Tips](./prompting-tips) will help improve stability.
 3. Since we use JavaScript to retrieve items from the page, the elements inside the iframe cannot be accessed.
 
-## Which LLM should I choose ?
+## Can I use a model other than `gpt-4o`?
 
-Midscene needs a multimodal Large Language Model (LLM) to understand the UI. Currently, we find that OpenAI's GPT-4o performs much better than others.
-
-You can [customize model and provider](./model-provider.html) if needed.
+Yes. You can [customize the model and provider](./model-provider.html) if needed.
 
 ## About the token cost
 
diff --git a/apps/site/docs/en/model-provider.md b/apps/site/docs/en/model-provider.md
index af094c5f..a15adb6f 100644
--- a/apps/site/docs/en/model-provider.md
+++ b/apps/site/docs/en/model-provider.md
@@ -30,7 +30,7 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke
 export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080"
 ```
 
-Using Azure OpenAI Service:
+## Using Azure OpenAI Service
 
 ```bash
 export MIDSCENE_USE_AZURE_OPENAI=1
 export MIDSCENE_AZURE_OPENAI_SCOPE="https://cognitiveservices.azure.com/.default
 export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-preview", "endpoint": "...", "deployment": "..."}'
 ```
 
-Note:
+## Choose a model other than `gpt-4o`
 
-- Always choose a model that supports vision input.
-- Currently, the known supported models are: `gpt-4o`, `qwen-vl-max-latest`, `gemini-1.5-pro`
-- Please follow the terms of use of each model.
+We find that `gpt-4o` performs best for Midscene at the moment. The other known supported models are `gemini-1.5-pro`, `qwen-vl-max-latest`, and `doubao-vision-pro-32k`.
 
-## Example: Using `qwen-vl-max-latest` service from Aliyun
+If you want to use other models, please follow these steps:
+
+1. Choose a model that supports image input (i.e. a multimodal model).
+2. Find out how to call it with an OpenAI SDK compatible endpoint. Usually you should set `OPENAI_BASE_URL`, `OPENAI_API_KEY` and `MIDSCENE_MODEL_NAME`; the sketch after this list shows a quick way to verify them.
+3. If you find it not working well after changing the model, try using short and clear prompts (or roll back to the previous model). See more details in [Prompting Tips](./prompting-tips.html).
+4. Remember to follow the terms of use of each model.
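+
+To sanity-check a new endpoint before wiring it into Midscene, a minimal script along these lines can help. This is only a sketch: it assumes the `openai` npm package is installed and the three variables above are already exported, and the model name is whatever your provider expects.
+
+```typescript
+import OpenAI from 'openai';
+
+// Point the SDK at the provider's OpenAI-compatible endpoint.
+const client = new OpenAI({
+  baseURL: process.env.OPENAI_BASE_URL,
+  apiKey: process.env.OPENAI_API_KEY,
+});
+
+// One tiny request is enough to confirm that the endpoint and model name respond.
+const completion = await client.chat.completions.create({
+  model: process.env.MIDSCENE_MODEL_NAME ?? 'gpt-4o',
+  messages: [{ role: 'user', content: 'Reply with the word "ok".' }],
+});
+
+console.log(completion.choices[0]?.message?.content);
+```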
+
+## Example: Using `gemini-1.5-pro` from Google
+
+Configure the environment variables:
+
+```bash
+export OPENAI_BASE_URL="https://generativelanguage.googleapis.com/v1beta/openai"
+export OPENAI_API_KEY="....."
+export MIDSCENE_MODEL_NAME="gemini-1.5-pro"
+```
+
+## Example: Using `qwen-vl-max-latest` from Aliyun
 
 Configure the environment variables:
 
 ```bash
 export OPENAI_API_KEY="sk-..."
 export OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1"
 export MIDSCENE_MODEL_NAME="qwen-vl-max-latest"
 ```
+
+## Example: Using `doubao-vision-pro-32k` from Volcengine
+
+Create an inference point first: https://console.volcengine.com/ark/region:ark+cn-beijing/endpoint
+
+Configure the environment variables:
+
+```bash
+export OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3"
+export OPENAI_API_KEY="..."
+export MIDSCENE_MODEL_NAME="ep-202....."
+```
diff --git a/apps/site/docs/zh/faq.md b/apps/site/docs/zh/faq.md
index 87926ec1..7f70e2f2 100644
--- a/apps/site/docs/zh/faq.md
+++ b/apps/site/docs/zh/faq.md
@@ -16,11 +16,9 @@ Midscene 存在一些局限性,我们仍在努力改进。
 2. 稳定性风险:即使是 GPT-4o 也无法确保 100% 返回正确答案。遵循 [编写提示词的技巧](./prompting-tips) 可以帮助提高 SDK 稳定性。
 3. 元素访问受限:由于我们使用 JavaScript 从页面提取元素,所以无法访问 iframe 内部的元素。
 
-## 选用那个 LLM 模型?
+## 能否选用 `gpt-4o` 以外的其他模型?
 
-Midscene 需要一个能够理解用户界面的多模态大型语言模型。目前,我们发现 OpenAI 的 GPT-4o 表现最好,远超其它模型。
-
-你可以根据需要[自定义模型和服务商](./model-provider.html)。
+可以。你可以根据需要[自定义模型和服务商](./model-provider.html)。
 
 ## 关于 token 成本
 
diff --git a/apps/site/docs/zh/model-provider.md b/apps/site/docs/zh/model-provider.md
index bdb94922..c70b7de1 100644
--- a/apps/site/docs/zh/model-provider.md
+++ b/apps/site/docs/zh/model-provider.md
@@ -27,7 +27,7 @@ export MIDSCENE_OPENAI_INIT_CONFIG_JSON='{"baseURL":"....","defaultHeaders":{"ke
 export MIDSCENE_OPENAI_SOCKS_PROXY="socks5://127.0.0.1:1080"
 ```
 
-使用 Azure OpenAI 服务时的配置:
+## 使用 Azure OpenAI 服务时的配置
 
 ```bash
 export MIDSCENE_USE_AZURE_OPENAI=1
 export MIDSCENE_AZURE_OPENAI_SCOPE="https://cognitiveservices.azure.com/.default
 export MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON='{"apiVersion": "2024-11-01-preview", "endpoint": "...", "deployment": "..."}'
 ```
 
-说明:
+## 选用 `gpt-4o` 以外的其他模型
 
-- 务必选择一个支持视觉输入的模型。目前我们已知支持的模型有:`gpt-4o`, `qwen-vl-max-latest` (千问), `gemini-1.5-pro`
-- 请遵守各项模型的使用条款
+我们发现 `gpt-4o` 是目前表现最佳的模型。其他已知支持的模型有:`qwen-vl-max-latest` (千问), `gemini-1.5-pro`, `doubao-vision-pro-32k` (豆包)
 
-## 示例:使用部署在阿里云的 `qwen-vl-max-latest` 模型
+如果你想要使用其他模型,请遵循以下步骤:
+
+1. 选择一个支持视觉输入的模型(也就是“多模态模型”)。
+2. 找出如何使用 OpenAI SDK 兼容的方式调用它,模型提供商一般都会提供这样的接入点,你需要配置的是 `OPENAI_BASE_URL`, `OPENAI_API_KEY` 和 `MIDSCENE_MODEL_NAME`。
+3. 如果发现使用新模型后效果不佳,可以尝试使用一些简短且清晰的提示词(或回滚到之前的模型)。更多详情请参阅 [Prompting Tips](./prompting-tips.html)。
+4. 请遵守各模型的使用条款。
+
+## 示例:使用 Google 的 `gemini-1.5-pro` 模型
+
+配置环境变量:
+
+```bash
+export OPENAI_BASE_URL="https://generativelanguage.googleapis.com/v1beta/openai"
+export OPENAI_API_KEY="....."
+export MIDSCENE_MODEL_NAME="gemini-1.5-pro"
+```
+
+## 示例:使用阿里云的 `qwen-vl-max-latest` 模型
 
 配置环境变量:
 
 ```bash
 export OPENAI_API_KEY="sk-..."
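+# 下两行分别设置 DashScope 的 OpenAI 兼容接入点和要使用的多模态模型名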
export OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1" export MIDSCENE_MODEL_NAME="qwen-vl-max-latest" ``` + +## 示例:使用火山云的豆包 `doubao-vision-pro-32k` 模型 + +调用前需要配置推理点:https://console.volcengine.com/ark/region:ark+cn-beijing/endpoint + +配置环境变量: + +```bash +export OPENAI_BASE_URL="https://ark.cn-beijing.volces.com/api/v3" +export OPENAI_API_KEY="..." +export MIDSCENE_MODEL_NAME="ep-202....." +``` diff --git a/packages/midscene/tests/ai/connectivity.test.ts b/packages/midscene/tests/ai/connectivity.test.ts index a8617ad4..ec742ead 100644 --- a/packages/midscene/tests/ai/connectivity.test.ts +++ b/packages/midscene/tests/ai/connectivity.test.ts @@ -1,7 +1,15 @@ import { AIActionType } from '@/ai-model/common'; import { call, callToGetJSONObject } from '@/ai-model/openai'; +import { base64Encoded } from '@/image'; +import dotenv from 'dotenv'; +import { getFixture } from 'tests/utils'; import { describe, expect, it, vi } from 'vitest'; +const result = dotenv.config({ debug: true }); +if (result.error) { + throw result.error; +} + vi.setConfig({ testTimeout: 20 * 1000, }); @@ -39,6 +47,7 @@ describe('openai sdk connectivity', () => { }); it('image input', async () => { + const imagePath = getFixture('baidu.png'); const result = await call([ { role: 'user', @@ -50,7 +59,7 @@ describe('openai sdk connectivity', () => { { type: 'image_url', image_url: { - url: 'https://portal.volccdn.com/obj/volcfe/bee_prod/biz_950/tos_38e6e81e1366482ed046045e72b0684d.png', + url: base64Encoded(imagePath), detail: 'high', }, }, diff --git a/packages/midscene/tests/ai/evaluate/inspect.test.ts b/packages/midscene/tests/ai/evaluate/inspect.test.ts index 236940a5..049be393 100644 --- a/packages/midscene/tests/ai/evaluate/inspect.test.ts +++ b/packages/midscene/tests/ai/evaluate/inspect.test.ts @@ -1,4 +1,4 @@ -import { readFileSync, writeFileSync } from 'node:fs'; +import { readFileSync } from 'node:fs'; import path from 'node:path'; import { describe } from 'node:test'; import { AiInspectElement, plan } from '@/ai-model'; @@ -13,6 +13,7 @@ import { repeat, runTestCases, } from './test-suite/util'; +import 'dotenv/config'; const repeatTime = 2; const relocateAfterPlanning = false;
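For reference, the connectivity test above now sends the local `baidu.png` fixture as a base64 data URL instead of fetching a remote image. The swap boils down to something like the sketch below; `toDataUrl` and the fixture path are illustrative stand-ins for the repo's own `base64Encoded` helper and `getFixture('baidu.png')`.

```typescript
import { readFileSync } from 'node:fs';

// Illustrative stand-in: read a local PNG and wrap it as a data URL.
function toDataUrl(imagePath: string): string {
  const base64 = readFileSync(imagePath).toString('base64');
  return `data:image/png;base64,${base64}`;
}

// Mirrors the image_url message part used in the test above.
const imagePart = {
  type: 'image_url' as const,
  image_url: {
    url: toDataUrl('./fixtures/baidu.png'), // hypothetical local fixture path
    detail: 'high' as const,
  },
};

console.log(imagePart.image_url.url.slice(0, 40)); // e.g. "data:image/png;base64,iVBORw0KGgo..."
```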