ChatGPTNextWeb · Little-LittleProgrammer · Jul 30, 2025 · Jul 30, 2025 · Jul 30, 2025 · Jul 31, 2025
diff --git a/.yarnrc.yml b/.yarnrc.yml
@@ -0,0 +1 @@
+nodeLinker: node-modules
diff --git a/app/client/api.ts b/app/client/api.ts
@@ -107,7 +107,8 @@ export interface LLMModelProvider {
 
 export abstract class LLMApi {
   abstract chat(options: ChatOptions): Promise<void>;
-  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
+  abstract speech(options: SpeechOptions): Promise<ArrayBuffer | AudioBuffer>; // resolves to either an ArrayBuffer or an AudioBuffer
+  abstract streamSpeech?(options: SpeechOptions): AsyncGenerator<AudioBuffer>; // optional member: yields AudioBuffer chunks; callers must check that it exists before calling
   abstract usage(): Promise<LLMUsage>;
   abstract models(): Promise<LLMModel[]>;
 }

diff --git a/app/client/platforms/alibaba.ts b/app/client/platforms/alibaba.ts
@@ -1,5 +1,10 @@
 "use client";
-import { ApiPath, Alibaba, ALIBABA_BASE_URL } from "@/app/constant";
+import {
+  ApiPath,
+  Alibaba,
+  ALIBABA_BASE_URL,
+  REQUEST_TIMEOUT_MS,
+} from "@/app/constant";
 import {
   useAccessStore,
   useAppConfig,
@@ -89,10 +94,72 @@ export class QwenApi implements LLMApi {
     return res?.output?.choices?.at(0)?.message?.content ?? "";
   }
 
-  speech(options: SpeechOptions): Promise<ArrayBuffer> {
+  async speech(options: SpeechOptions): Promise<ArrayBuffer> { // not implemented for this provider; being async, the throw below surfaces as a rejected promise
     throw new Error("Method not implemented.");
   }
 
+  async *streamSpeech(options: SpeechOptions): AsyncGenerator<AudioBuffer> { // POSTs the text to the speech endpoint and yields one AudioBuffer per "data:" line that carries audio
+    const requestPayload = {
+      model: options.model,
+      input: {
+        text: options.input,
+        voice: options.voice,
+      },
+      speed: options.speed,
+      response_format: options.response_format,
+    };
+    const controller = new AbortController();
+    options.onController?.(controller); // hand the AbortController to the caller when an onController callback was supplied
+    try {
+      const speechPath = this.path(Alibaba.SpeechPath);
+      const speechPayload = {
+        method: "POST",
+        body: JSON.stringify(requestPayload),
+        signal: controller.signal,
+        headers: {
+          ...getHeaders(),
+          "X-DashScope-SSE": "enable", // presumably asks the service for a server-sent-event stream; confirm against the DashScope docs
+        },
+      };
+
+      // Abort the request if the whole stream has not finished within REQUEST_TIMEOUT_MS
+      const requestTimeoutId = setTimeout(
+        () => controller.abort(),
+        REQUEST_TIMEOUT_MS,
+      );
+
+      const res = await fetch(speechPath, speechPayload);
+
+      const reader = res.body!.getReader(); // NOTE(review): res.ok is not checked and the body is asserted non-null
+      const decoder = new TextDecoder();
+      let buffer = "";
+      while (true) {
+        const { done, value } = await reader.read();
+        if (done) {
+          break;
+        }
+        buffer += decoder.decode(value, { stream: true }); // stream: true keeps a multi-byte character that is split across reads intact
+        const lines = buffer.split("\n");
+        buffer = lines.pop() || ""; // keep the trailing, possibly partial, line for the next read
+
+        for (const line of lines) {
+          if (line.startsWith("data:")) {
+            const data = line.slice(5); // drop the "data:" prefix
+            const json = JSON.parse(data); // NOTE(review): throws if a data line is not valid JSON
+            if (json.output.audio.data) { // NOTE(review): throws if output or audio is missing from the event
+              yield this.PCMBase64ToAudioBuffer(json.output.audio.data); // the helper returns a promise; an async generator awaits it before yielding
+            }
+          }
+        }
+      }
+      clearTimeout(requestTimeoutId); // NOTE(review): only reached on normal completion; on a throw the timer is left running
+      reader.releaseLock(); // NOTE(review): likewise skipped when an error is thrown above
+    } catch (e) {
+      console.log("[Request] failed to make a speech request", e);
+      throw e;
+    }
+  }
+
   async chat(options: ChatOptions) {
     const modelConfig = {
       ...useAppConfig.getState().modelConfig,
@@ -273,5 +340,72 @@ export class QwenApi implements LLMApi {
   async models(): Promise<LLMModel[]> {
     return [];
   }
+
+  // Decode base64-encoded PCM data into an AudioBuffer (conversion only; nothing is played here)
+  private async PCMBase64ToAudioBuffer(base64Data: string) {
+    try {
+      // Decode base64 into raw bytes
+      const binaryString = atob(base64Data);
+      const bytes = new Uint8Array(binaryString.length);
+      for (let i = 0; i < binaryString.length; i++) {
+        bytes[i] = binaryString.charCodeAt(i);
+      }
+
+      // Convert the bytes to an AudioBuffer
+      const audioBuffer = await this.convertToAudioBuffer(bytes);
+
+      return audioBuffer;
+    } catch (error) {
+      console.error("播放 PCM 数据失败:", error); // log text reads "failed to play PCM data"
+      throw error;
+    }
+  }
+
+  // Convert PCM byte data into an AudioBuffer
+  private convertToAudioBuffer(pcmData: Uint8Array) {
+    const audioContext = new (window.AudioContext ||
+      window.webkitAudioContext)(); // NOTE(review): a new AudioContext is created on every call and is never closed
+    const channels = 1; // single channel (mono)
+    const sampleRate = 24000; // hard-coded; assumes the service returns 24 kHz audio, TODO confirm
+    return new Promise<AudioBuffer>((resolve, reject) => {
+      try {
+        let float32Array;
+        // Convert 16-bit PCM to 32-bit floats
+        float32Array = this.pcm16ToFloat32(pcmData);
+
+        // Create the AudioBuffer
+        const audioBuffer = audioContext.createBuffer(
+          channels,
+          float32Array.length / channels,
+          sampleRate,
+        );
+
+        // Copy the samples into the AudioBuffer
+        for (let channel = 0; channel < channels; channel++) {
+          const channelData = audioBuffer.getChannelData(channel);
+          for (let i = 0; i < channelData.length; i++) {
+            channelData[i] = float32Array[i * channels + channel]; // source is indexed as interleaved frames
+          }
+        }
+
+        resolve(audioBuffer);
+      } catch (error) {
+        reject(error);
+      }
+    });
+  }
+  // Convert 16-bit PCM bytes to 32-bit floats
+  private pcm16ToFloat32(pcmData: Uint8Array) {
+    const length = pcmData.length / 2; // two bytes per sample
+    const float32Array = new Float32Array(length);
+
+    for (let i = 0; i < length; i++) {
+      const int16 = (pcmData[i * 2 + 1] << 8) | pcmData[i * 2]; // little-endian: low byte first, high byte second
+      const int16Signed = int16 > 32767 ? int16 - 65536 : int16; // reinterpret the unsigned value as signed 16-bit
+      float32Array[i] = int16Signed / 32768; // scale to the range [-1, 1)
+    }
+
+    return float32Array;
+  }
 }
 export { Alibaba };
diff --git a/app/components/chat.tsx b/app/components/chat.tsx
@@ -101,8 +101,6 @@ import {
 import { useNavigate } from "react-router-dom";
 import {
   CHAT_PAGE_SIZE,
-  DEFAULT_TTS_ENGINE,
-  ModelProvider,
   Path,
   REQUEST_TIMEOUT_MS,
   ServiceProvider,
@@ -1286,50 +1284,83 @@ function _Chat() {
   const accessStore = useAccessStore();
   const [speechStatus, setSpeechStatus] = useState(false);
   const [speechLoading, setSpeechLoading] = useState(false);
+  const [speechCooldown, setSpeechCooldown] = useState(false);
 
   async function openaiSpeech(text: string) {
     if (speechStatus) {
       ttsPlayer.stop();
       setSpeechStatus(false);
     } else {
-  async function openaiSpeech(text: string) {
-    if (speechStatus) {
-      ttsPlayer.stop();
-      setSpeechStatus(false);
-    } else {
+  async function openaiSpeech(text: string) {
+    if (speechStatus) {
+      await ttsPlayer.stop();
+      setSpeechStatus(false);
+    } else {
-  async function openaiSpeech(text: string) {
-    if (speechStatus) {
-      ttsPlayer.stop();
-      setSpeechStatus(false);
-    } else {
+  async function openaiSpeech(text: string) {
+    if (speechStatus) {
+      await ttsPlayer.stop();
+      setSpeechStatus(false);
+    } else {
       var api: ClientApi;
-      api = new ClientApi(ModelProvider.GPT);
       const config = useAppConfig.getState();
+      api = new ClientApi(config.ttsConfig.modelProvider); // client for the provider selected in the TTS config
       setSpeechLoading(true);
       ttsPlayer.init();
-      let audioBuffer: ArrayBuffer;
+      let audioBuffer: ArrayBuffer | AudioBuffer;
       const { markdownToTxt } = require("markdown-to-txt");
       const textContent = markdownToTxt(text);
-      if (config.ttsConfig.engine !== DEFAULT_TTS_ENGINE) {
+      if (config.ttsConfig.engine === "Edge") { // the Edge engine goes through MsEdgeTTS, not api.llm
         const edgeVoiceName = accessStore.edgeVoiceName();
         const tts = new MsEdgeTTS();
         await tts.setMetadata(
           edgeVoiceName,
           OUTPUT_FORMAT.AUDIO_24KHZ_96KBITRATE_MONO_MP3,
         );
         audioBuffer = await tts.toArrayBuffer(textContent);
+        playSpeech(audioBuffer);
       } else {
-        audioBuffer = await api.llm.speech({
-          model: config.ttsConfig.model,
-          input: textContent,
-          voice: config.ttsConfig.voice,
-          speed: config.ttsConfig.speed,
-        });
+        if (api.llm.streamSpeech) { // use streaming when the provider implements it; otherwise fall back to speech() below
+          // Streaming playback: play chunks as they are received
+          setSpeechStatus(true);
+          ttsPlayer.startStreamPlay(() => {
+            setSpeechStatus(false);
+          });
+
+          try {
+            for await (const chunk of api.llm.streamSpeech({
+              model: config.ttsConfig.model,
+              input: textContent,
+              voice: config.ttsConfig.voice,
+              speed: config.ttsConfig.speed,
+            })) {
+              ttsPlayer.addToQueue(chunk);
+            }
+            ttsPlayer.finishStreamPlay(); // called once the generator is exhausted; presumably marks the queue as complete, confirm in ttsPlayer
+          } catch (e) {
+            console.error("[Stream Speech]", e);
+            showToast(prettyObject(e));
+            setSpeechStatus(false);
+            ttsPlayer.stop();
+          } finally {
+            setSpeechLoading(false); // NOTE(review): the loading flag is only cleared after the whole stream has been consumed or has failed
+          }
+        } else {
+          audioBuffer = await api.llm.speech({ // NOTE(review): a rejection here is not caught, so the loading flag set above would stay true
+            model: config.ttsConfig.model,
+            input: textContent,
+            voice: config.ttsConfig.voice,
+            speed: config.ttsConfig.speed,
+          });
+          playSpeech(audioBuffer);
+        }
       }
-      setSpeechStatus(true);
-      ttsPlayer
-        .play(audioBuffer, () => {
-          setSpeechStatus(false);
-        })
-        .catch((e) => {
-          console.error("[OpenAI Speech]", e);
-          showToast(prettyObject(e));
-          setSpeechStatus(false);
-        })
-        .finally(() => setSpeechLoading(false));
     }
   }
 
+  function playSpeech(audioBuffer: ArrayBuffer | AudioBuffer) { // plays one complete buffer through ttsPlayer and keeps the speech state flags in sync
+    setSpeechStatus(true); // mark speech as active before play() is called
+    ttsPlayer
+      .play(audioBuffer, () => {
+        setSpeechStatus(false); // callback handed to play(); presumably invoked when playback ends, confirm in ttsPlayer
+      })
+      .catch((e) => {
+        console.error("[OpenAI Speech]", e);
+        showToast(prettyObject(e));
+        setSpeechStatus(false);
+      })
+      .finally(() => setSpeechLoading(false)); // clear the loading flag once play() settles, whether it succeeded or failed
+  }
+
   const context: RenderMessage[] = useMemo(() => {
     return session.mask.hideContext ? [] : session.mask.context.slice();
   }, [session.mask.context, session.mask.hideContext]);

diff --git a/app/components/tts-config.tsx b/app/components/tts-config.tsx
@@ -3,10 +3,9 @@ import { TTSConfig, TTSConfigValidator } from "../store";
 import Locale from "../locales";
 import { ListItem, Select } from "./ui-lib";
 import {
-  DEFAULT_TTS_ENGINE,
-  DEFAULT_TTS_ENGINES,
-  DEFAULT_TTS_MODELS,
-  DEFAULT_TTS_VOICES,
+    ServiceProvider,
+    TTS_CONFIGS,
+    TTSEngineType
 } from "../constant";
 import { InputRange } from "./input-range";
 
@@ -48,22 +47,33 @@ export function TTSConfigList(props: {
         <Select
           value={props.ttsConfig.engine}
           onChange={(e) => {
+            const newEngine = e.currentTarget.value as TTSEngineType;
             props.updateConfig(
-              (config) =>
-                (config.engine = TTSConfigValidator.engine(
-                  e.currentTarget.value,
-                )),
+              (config) => {
+                config.engine = TTSConfigValidator.engine(newEngine);
+                const engineConfig = TTS_CONFIGS[newEngine];
+                config.model = TTSConfigValidator.model(
+                    engineConfig.Model[0] || ""
+                );
+                config.voice = TTSConfigValidator.voice(
+                    engineConfig.Voices[0] || ""
+                );
+                config.modelProvider = TTSConfigValidator.modelProvider(
+                    engineConfig.ModelProvider
+                );
+              }
             );
           }}
         >
-          {DEFAULT_TTS_ENGINES.map((v, i) => (
+          {Object.keys(TTS_CONFIGS).map((v, i) => (
             <option value={v} key={i}>
-              {v}
+              {v}-TTS
             </option>
           ))}
         </Select>
       </ListItem>
-      {props.ttsConfig.engine === DEFAULT_TTS_ENGINE && (
+      {(props.ttsConfig.engine === ServiceProvider.OpenAI || 
+        props.ttsConfig.engine === ServiceProvider.Alibaba) && (
         <>
           <ListItem title={Locale.Settings.TTS.Model}>
             <Select
@@ -77,7 +87,7 @@ export function TTSConfigList(props: {
                 );
               }}
             >
-              {DEFAULT_TTS_MODELS.map((v, i) => (
+              {TTS_CONFIGS[props.ttsConfig.engine]!.Model.map((v, i) => (
                 <option value={v} key={i}>
                   {v}
                 </option>
@@ -99,7 +109,7 @@ export function TTSConfigList(props: {
                 );
               }}
             >
-              {DEFAULT_TTS_VOICES.map((v, i) => (
+              {TTS_CONFIGS[props.ttsConfig.engine]!.Voices.map((v, i) => (
                 <option value={v} key={i}>
                   {v}
                 </option>