diff --git a/.yarnrc.yml b/.yarnrc.yml
new file mode 100644
index 00000000000..3186f3f0795
--- /dev/null
+++ b/.yarnrc.yml
@@ -0,0 +1 @@
+nodeLinker: node-modules
diff --git a/app/client/api.ts b/app/client/api.ts
index f60b0e2ad71..00348548caf 100644
--- a/app/client/api.ts
+++ b/app/client/api.ts
@@ -25,6 +25,7 @@ import { XAIApi } from "./platforms/xai";
 import { ChatGLMApi } from "./platforms/glm";
 import { SiliconflowApi } from "./platforms/siliconflow";
 import { Ai302Api } from "./platforms/ai302";
+import type { TTSPlayManager } from "../utils/audio";
 
 export const ROLES = ["system", "user", "assistant"] as const;
 export type MessageRole = (typeof ROLES)[number];
@@ -107,7 +108,11 @@ export interface LLMModelProvider {
 
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
-  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract streamSpeech?(
+    options: SpeechOptions,
+    audioManager?: TTSPlayManager,
+  ): AsyncGenerator<AudioBuffer>;
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }
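The new optional streamSpeech member references TTSPlayManager from app/utils/audio, a file this diff does not touch. Judging only from how the player is called in the hunks below (alibaba.ts and chat.tsx), its streaming surface looks roughly like the sketch that follows; every name and signature here is inferred from those call sites, and the real class in app/utils/audio.ts may differ.

// Sketch only: TTSPlayManager's surface as implied by call sites in this diff.
export interface TTSPlayManagerLike {
  init(): void;
  // One-shot playback used by the non-streaming path in chat.tsx.
  play(audio: ArrayBuffer | AudioBuffer, onEnded: () => void): Promise<void>;
  stop(): void;
  // Streaming playback: queue decoded chunks as they arrive.
  startStreamPlay(onFinished: () => void): void;
  addToQueue(chunk: AudioBuffer): void;
  finishStreamPlay(): void;
  // Allows aborting the in-flight TTS request when playback is stopped.
  setStreamController(controller: AbortController): void;
  clearStreamController(): void;
  // Decodes a base64 chunk of raw PCM into a Web Audio AudioBuffer.
  pcmBase64ToAudioBuffer(
    base64: string,
    format: { channels: number; sampleRate: number; bitDepth: number },
  ): Promise<AudioBuffer>;
}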
""; } - speech(options: SpeechOptions): Promise { + async speech(options: SpeechOptions): Promise { throw new Error("Method not implemented."); } + async *streamSpeech( + options: SpeechOptions, + audioManager?: TTSPlayManager, + ): AsyncGenerator { + if (!options.input || !options.model) { + throw new Error("Missing required parameters: input and model"); + } + const requestPayload = { + model: options.model, + input: { + text: options.input, + voice: options.voice, + }, + speed: options.speed, + response_format: options.response_format, + }; + const controller = new AbortController(); + options.onController?.(controller); + + if (audioManager) { + audioManager.setStreamController(controller); + } + try { + const speechPath = this.path(Alibaba.SpeechPath); + const speechPayload = { + method: "POST", + body: JSON.stringify(requestPayload), + signal: controller.signal, + headers: { + ...getHeaders(), + "X-DashScope-SSE": "enable", + }, + }; + + // make a fetch request + const requestTimeoutId = setTimeout( + () => controller.abort(), + getTimeoutMSByModel(options.model), + ); + + const res = await fetch(speechPath, speechPayload); + clearTimeout(requestTimeoutId); // Clear timeout on successful connection + + const reader = res.body!.getReader(); + const decoder = new TextDecoder(); + let buffer = ""; + while (true) { + const { done, value } = await reader.read(); + if (done) { + break; + } + buffer += decoder.decode(value, { stream: true }); + const lines = buffer.split("\n"); + buffer = lines.pop() || ""; + + for (const line of lines) { + const data = line.slice(5); + try { + if (line.startsWith("data:")) { + const json = JSON.parse(data); + if (json.output?.audio?.data) { + yield await audioManager!.pcmBase64ToAudioBuffer( + json.output.audio.data, + { channels: 1, sampleRate: 24000, bitDepth: 16 }, + ); + } + } + } catch (parseError) { + console.warn( + "[StreamSpeech] Failed to parse SSE data:", + parseError, + ); + continue; + } + } + } + reader.releaseLock(); + } catch (e) { + // 如果是用户主动取消(AbortError),则不作为错误处理 + if (e instanceof Error && e.name === "AbortError") { + console.log("[Request] Stream speech was aborted by user"); + return; // 正常退出,不抛出错误 + } + console.log("[Request] failed to make a speech request", e); + throw e; + } finally { + if (audioManager) { + audioManager.clearStreamController(); + } + } + } + async chat(options: ChatOptions) { const modelConfig = { ...useAppConfig.getState().modelConfig, @@ -129,6 +225,7 @@ export class QwenApi implements LLMApi { temperature: modelConfig.temperature, // max_tokens: modelConfig.max_tokens, top_p: modelConfig.top_p === 1 ? 
diff --git a/app/components/chat.tsx b/app/components/chat.tsx
index 6691403e65b..a2575124f14 100644
--- a/app/components/chat.tsx
+++ b/app/components/chat.tsx
@@ -48,6 +48,7 @@ import PluginIcon from "../icons/plugin.svg";
 import ShortcutkeyIcon from "../icons/shortcutkey.svg";
 import McpToolIcon from "../icons/tool.svg";
 import HeadphoneIcon from "../icons/headphone.svg";
+import NetWorkIcon from "../icons/network.svg";
 import {
   BOT_HELLO,
   ChatMessage,
@@ -75,6 +76,7 @@ import {
   useMobileScreen,
   selectOrCopy,
   showPlugins,
+  canUseNetWork,
 } from "../utils";
 
 import { uploadImage as uploadImageRemote } from "@/app/utils/chat";
@@ -101,8 +103,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
-  DEFAULT_TTS_ENGINE,
-  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   ServiceProvider,
@@ -512,6 +512,7 @@ export function ChatActions(props: {
   // switch themes
   const theme = config.theme;
+  const enableNetWork = session.mask.modelConfig.enableNetWork || false;
 
   function nextTheme() {
     const themes = [Theme.Auto, Theme.Light, Theme.Dark];
@@ -521,6 +522,13 @@ export function ChatActions(props: {
     config.update((config) => (config.theme = nextTheme));
   }
 
+  function nextNetWork() {
+    chatStore.updateTargetSession(session, (session) => {
+      session.mask.modelConfig.enableNetWork =
+        !session.mask.modelConfig.enableNetWork;
+    });
+  }
+
   // stop all responses
   const couldStop = ChatControllerPool.hasPending();
   const stopAll = () => ChatControllerPool.stopAll();
@@ -699,6 +707,9 @@ export function ChatActions(props: {
               session.mask.modelConfig.providerName =
                 providerName as ServiceProvider;
               session.mask.syncGlobalConfig = false;
+              session.mask.modelConfig.enableNetWork = canUseNetWork(model)
+                ? session.mask.modelConfig.enableNetWork
+                : false;
             });
             if (providerName == "ByteDance") {
               const selectedModel = models.find(
@@ -833,6 +844,16 @@ export function ChatActions(props: {
             />
           )}
       {!isMobileScreen && <MCPAction />}
+
+      {canUseNetWork(currentModel) && (
+          }
+        />
+      )}
       {config.realtimeConfig.enable && (
@@ -1286,6 +1307,7 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
+  const [speechCooldown, setSpeechCooldown] = useState(false);
 
   async function openaiSpeech(text: string) {
     if (speechStatus) {
@@ -1293,14 +1315,14 @@
       setSpeechStatus(false);
     } else {
       var api: ClientApi;
-      api = new ClientApi(ModelProvider.GPT);
       const config = useAppConfig.getState();
+      api = new ClientApi(config.ttsConfig.modelProvider);
       setSpeechLoading(true);
       ttsPlayer.init();
-      let audioBuffer: ArrayBuffer;
+      let audioBuffer: ArrayBuffer | AudioBuffer;
       const { markdownToTxt } = require("markdown-to-txt");
       const textContent = markdownToTxt(text);
-      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+      if (config.ttsConfig.engine === "Edge") {
         const edgeVoiceName = accessStore.edgeVoiceName();
         const tts = new MsEdgeTTS();
         await tts.setMetadata(
@@ -1308,28 +1330,63 @@
           OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
         );
         audioBuffer = await tts.toArrayBuffer(textContent);
+        playSpeech(audioBuffer);
       } else {
-        audioBuffer = await api.llm.speech({
-          model: config.ttsConfig.model,
-          input: textContent,
-          voice: config.ttsConfig.voice,
-          speed: config.ttsConfig.speed,
-        });
+        if (api.llm.streamSpeech) {
+          // Streaming playback: play chunks as they are received
+          setSpeechStatus(true);
+          ttsPlayer.startStreamPlay(() => {
+            setSpeechStatus(false);
+          });
+
+          try {
+            for await (const chunk of api.llm.streamSpeech(
+              {
+                model: config.ttsConfig.model,
+                input: textContent,
+                voice: config.ttsConfig.voice,
+                speed: config.ttsConfig.speed,
+              },
+              ttsPlayer,
+            )) {
+              ttsPlayer.addToQueue(chunk);
+            }
+            ttsPlayer.finishStreamPlay();
+          } catch (e) {
+            console.error("[Stream Speech]", e);
+            showToast(prettyObject(e));
+            setSpeechStatus(false);
+            ttsPlayer.stop();
+          } finally {
+            setSpeechLoading(false);
+          }
+        } else {
+          audioBuffer = await api.llm.speech({
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          });
+          playSpeech(audioBuffer);
+        }
       }
-      setSpeechStatus(true);
-      ttsPlayer
-        .play(audioBuffer, () => {
-          setSpeechStatus(false);
-        })
-        .catch((e) => {
-          console.error("[OpenAI Speech]", e);
-          showToast(prettyObject(e));
-          setSpeechStatus(false);
-        })
-        .finally(() => setSpeechLoading(false));
     }
   }
 
+  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) {
+    setSpeechStatus(true);
+    ttsPlayer
+      .play(audioBuffer, () => {
+        setSpeechStatus(false);
+      })
+      .catch((e) => {
+        console.error("[OpenAI Speech]", e);
+        showToast(prettyObject(e));
+        setSpeechStatus(false);
+      })
+      .finally(() => setSpeechLoading(false));
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);
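The ttsPlayer.startStreamPlay / addToQueue / finishStreamPlay calls above are implemented in app/utils/audio.ts, which this diff does not include. One plausible way such a queue works, scheduling each decoded AudioBuffer back-to-back on an AudioContext timeline so earlier chunks play while later ones are still arriving, is sketched below; this is illustrative only, not the project's actual implementation.

// Sketch: gapless playback of streamed AudioBuffer chunks (illustrative only).
class StreamQueueSketch {
  private ctx = new AudioContext();
  private nextStartTime = 0;
  private pending = 0;
  private finished = false;
  private onDone?: () => void;

  startStreamPlay(onDone: () => void) {
    this.onDone = onDone;
    this.finished = false;
    this.nextStartTime = this.ctx.currentTime;
  }

  addToQueue(chunk: AudioBuffer) {
    const source = this.ctx.createBufferSource();
    source.buffer = chunk;
    source.connect(this.ctx.destination);
    // Schedule this chunk to start right after the previously queued one.
    const startAt = Math.max(this.nextStartTime, this.ctx.currentTime);
    source.start(startAt);
    this.nextStartTime = startAt + chunk.duration;
    this.pending++;
    source.onended = () => {
      this.pending--;
      if (this.finished && this.pending === 0) this.onDone?.();
    };
  }

  finishStreamPlay() {
    this.finished = true;
    if (this.pending === 0) this.onDone?.();
  }
}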
diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx
index 39ae85730c2..a0ad22fa00b 100644
--- a/app/components/tts-config.tsx
+++ b/app/components/tts-config.tsx
@@ -3,10 +3,9 @@ import { TTSConfig, TTSConfigValidator } from "../store";
 import Locale from "../locales";
 import { ListItem, Select } from "./ui-lib";
 import {
-  DEFAULT_TTS_ENGINE,
-  DEFAULT_TTS_ENGINES,
-  DEFAULT_TTS_MODELS,
-  DEFAULT_TTS_VOICES,
+  ServiceProvider,
+  TTS_CONFIGS,
+  TTSEngineType
 } from "../constant";
 import { InputRange } from "./input-range";
 
@@ -48,22 +47,33 @@ export function TTSConfigList(props: {
-      {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
+      {(props.ttsConfig.engine === ServiceProvider.OpenAI ||
+        props.ttsConfig.engine === ServiceProvider.Alibaba) && (
         <>
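The final hunk is truncated here, and the TTS_CONFIGS and TTSEngineType constants it now imports from app/constant.ts are not part of this diff. Judging from the way the engine check is keyed by ServiceProvider, the constant is presumably a per-engine capability table along the following lines; the engine keys, model names, and voices below are illustrative placeholders, not the project's actual lists.

// Sketch only: a per-engine TTS capability table as implied by tts-config.tsx.
// Keys, models, and voices are illustrative, not the real constants.
type TTSEngineTypeSketch = "OpenAI" | "Alibaba" | "Edge";

const TTS_CONFIGS_SKETCH: Record<
  TTSEngineTypeSketch,
  { models: readonly string[]; voices: readonly string[] }
> = {
  OpenAI: { models: ["tts-1", "tts-1-hd"], voices: ["alloy", "echo", "nova"] },
  Alibaba: { models: ["qwen-tts"], voices: ["Cherry", "Serena"] },
  // Edge TTS voices come from accessStore.edgeVoiceName() rather than a fixed list.
  Edge: { models: [], voices: [] },
};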