test stt #5634

Open · wants to merge 14 commits into main
11 changes: 11 additions & 0 deletions app/client/api.ts
@@ -63,6 +63,16 @@ export interface SpeechOptions {
  onController?: (controller: AbortController) => void;
}

export interface TranscriptionOptions {
  model?: "whisper-1";
  file: Blob;
  language?: string;
  prompt?: string;
  response_format?: "json" | "text" | "srt" | "verbose_json" | "vtt";
  temperature?: number;
  onController?: (controller: AbortController) => void;
}

export interface ChatOptions {
  messages: RequestMessage[];
  config: LLMConfig;
@@ -98,6 +108,7 @@ export interface LLMModelProvider {
export abstract class LLMApi {
  abstract chat(options: ChatOptions): Promise<void>;
  abstract speech(options: SpeechOptions): Promise<ArrayBuffer>;
  abstract transcription(options: TranscriptionOptions): Promise<string>;
  abstract usage(): Promise<LLMUsage>;
  abstract models(): Promise<LLMModel[]>;
}
41 changes: 41 additions & 0 deletions app/client/platforms/openai.ts
@@ -180,6 +180,47 @@ export class ChatGPTApi implements LLMApi {
    }
  }

  async transcription(options: TranscriptionOptions): Promise<string> {
    const formData = new FormData();
    formData.append("file", options.file, "audio.wav");
    formData.append("model", options.model ?? "whisper-1");
    if (options.language) formData.append("language", options.language);
    if (options.prompt) formData.append("prompt", options.prompt);
    if (options.response_format)
      formData.append("response_format", options.response_format);
    if (options.temperature)
      formData.append("temperature", options.temperature.toString());

    console.log("[Request] openai audio transcriptions payload: ", options);

    const controller = new AbortController();
    options.onController?.(controller);

    try {
      const path = this.path(OpenaiPath.TranscriptionPath, options.model);
Contributor:

⚠️ Potential issue

Incorrect number of arguments in this.path method call

In line 200, the this.path method is called with two arguments (OpenaiPath.TranscriptionPath, options.model), but the path method is defined to accept only one argument. This will result in a TypeError.

To fix this, update the path method signature to accept the optional model parameter:

-export class ChatGPTApi implements LLMApi {
-  // Existing code...
-  path(path: string): string {
+export class ChatGPTApi implements LLMApi {
+  // Existing code...
+  path(path: string, model?: string): string {
     // Method implementation...
   }

Ensure that you handle the model parameter within the path method as needed.

Committable suggestion was skipped due to low confidence.

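The reviewer's signature change can be sketched in isolation. The `buildPath` helper and its base-URL handling below are hypothetical stand-ins for illustration, not the project's actual `path` implementation:

```typescript
// Hypothetical sketch of a path helper extended with an optional model
// parameter, as the reviewer suggests. The base-URL normalization is an
// assumption; the real method resolves the base URL from app config.
function buildPath(baseUrl: string, path: string, model?: string): string {
  const root = baseUrl.replace(/\/+$/, "");
  // A model-specific route (e.g. an Azure deployment) could be chosen here;
  // by default the parameter is accepted but unused.
  void model;
  return `${root}/${path}`;
}
```

Threading the parameter through like this keeps existing one-argument call sites compiling while letting the transcription call pass the model.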
      const headers = getHeaders(true);
      const payload = {
        method: "POST",
        body: formData,
        signal: controller.signal,
        headers: headers,
      };

      // make a fetch request
      const requestTimeoutId = setTimeout(
        () => controller.abort(),
        REQUEST_TIMEOUT_MS,
      );
      const res = await fetch(path, payload);
      clearTimeout(requestTimeoutId);
      const json = await res.json();
      return json.text;
Comment on lines +215 to +218
Contributor:

⚠️ Potential issue

Add response status check before parsing JSON

In lines 214-217, the code attempts to parse the JSON response without verifying if the request was successful. If the request fails, res.json() might throw an error or the response might not contain the expected text property.

Consider checking res.ok before parsing the response and handle errors appropriately:

const res = await fetch(path, payload);
clearTimeout(requestTimeoutId);
+ if (!res.ok) {
+   const errorText = await res.text();
+   console.error(`[Response] Transcription request failed: ${errorText}`);
+   throw new Error(`Transcription request failed with status ${res.status}`);
+ }
const json = await res.json();
return json.text;
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change

      const res = await fetch(path, payload);
      clearTimeout(requestTimeoutId);
      if (!res.ok) {
        const errorText = await res.text();
        console.error(`[Response] Transcription request failed: ${errorText}`);
        throw new Error(`Transcription request failed with status ${res.status}`);
      }
      const json = await res.json();
      return json.text;

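The check-then-parse pattern the reviewer recommends can also be written as a standalone helper. The minimal response shape below is an assumption made so the sketch avoids DOM types; it is not the project's code:

```typescript
// Sketch of checking the HTTP status before parsing the transcription body.
// TranscriptionResponse is a deliberately minimal stand-in for fetch's
// Response so the helper can be exercised without a network.
interface TranscriptionResponse {
  ok: boolean;
  status: number;
  json(): Promise<{ text?: string }>;
}

async function parseTranscription(res: TranscriptionResponse): Promise<string> {
  if (!res.ok) {
    throw new Error(`Transcription request failed with status ${res.status}`);
  }
  const body = await res.json();
  // Fall back to an empty string rather than returning undefined.
  return body.text ?? "";
}
```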
    } catch (e) {
      console.log("[Request] failed to make a audio transcriptions request", e);
      throw e;
    }
  }
Contributor:

🛠️ Refactor suggestion

Enhance error handling and response parsing in the transcription method

The transcription method implementation looks good overall, but there are a few areas for improvement:

  1. Error handling could be more specific to provide better debugging information.
  2. The response parsing assumes the presence of a text property without checking.
  3. The headers variable is declared but not used in the fetch call.

Consider applying the following improvements:

  1. Enhance error handling:
 } catch (e) {
-  console.log("[Request] failed to make a audio transcriptions request", e);
-  throw e;
+  console.error("[Request] failed to make an audio transcriptions request", e);
+  throw new Error(`Transcription request failed: ${e.message}`);
 }
  1. Add response status check and error handling:
 const res = await fetch(path, payload);
 clearTimeout(requestTimeoutId);
+if (!res.ok) {
+  throw new Error(`Transcription request failed with status ${res.status}`);
+}
 const json = await res.json();
-return json.text;
+return json.text ?? '';
  1. Use the headers variable in the fetch call:
 const payload = {
   method: "POST",
   body: formData,
   signal: controller.signal,
-  headers: headers,
+  headers,
 };

These changes will improve the robustness and reliability of the transcription method.

📝 Committable suggestion

Suggested change

  async transcription(options: TranscriptionOptions): Promise<string> {
    const formData = new FormData();
    formData.append("file", options.file, "audio.wav");
    formData.append("model", options.model ?? "whisper-1");
    if (options.language) formData.append("language", options.language);
    if (options.prompt) formData.append("prompt", options.prompt);
    if (options.response_format)
      formData.append("response_format", options.response_format);
    if (options.temperature)
      formData.append("temperature", options.temperature.toString());
    console.log("[Request] openai audio transcriptions payload: ", options);
    const controller = new AbortController();
    options.onController?.(controller);
    try {
      const path = this.path(OpenaiPath.TranscriptionPath, options.model);
      const headers = getHeaders(true);
      const payload = {
        method: "POST",
        body: formData,
        signal: controller.signal,
        headers,
      };
      // make a fetch request
      const requestTimeoutId = setTimeout(
        () => controller.abort(),
        REQUEST_TIMEOUT_MS,
      );
      const res = await fetch(path, payload);
      clearTimeout(requestTimeoutId);
      if (!res.ok) {
        throw new Error(`Transcription request failed with status ${res.status}`);
      }
      const json = await res.json();
      return json.text ?? '';
    } catch (e) {
      console.error("[Request] failed to make an audio transcriptions request", e);
      throw new Error(`Transcription request failed: ${e.message}`);
    }
  }

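The timeout-plus-AbortController pattern the method uses can be factored into a reusable helper. This is a sketch; the `run` callback parameter is an assumption added for testability (in the app it would wrap the global `fetch`):

```typescript
// Enforce a deadline on any abortable async operation: abort via an
// AbortController when the timer fires, and always clear the timer,
// even when the operation throws.
async function withTimeout<T>(
  run: (signal: AbortSignal) => Promise<T>,
  timeoutMs: number,
): Promise<T> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    return await run(controller.signal);
  } finally {
    clearTimeout(timer); // mirrors clearTimeout(requestTimeoutId) above
  }
}
```

The `finally` block is the point of the refactor: the original code only clears the timer on the success path before parsing, so an early throw would leave the abort timer pending.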

  async chat(options: ChatOptions) {
    const modelConfig = {
      ...useAppConfig.getState().modelConfig,
56 changes: 54 additions & 2 deletions app/components/chat.tsx
@@ -10,6 +10,7 @@ import React, {
} from "react";

import SendWhiteIcon from "../icons/send-white.svg";
import VoiceWhiteIcon from "../icons/voice-white.svg";
import BrainIcon from "../icons/brain.svg";
import RenameIcon from "../icons/rename.svg";
import ExportIcon from "../icons/share.svg";
@@ -72,6 +73,7 @@ import {
  isDalle3,
  showPlugins,
  safeLocalStorage,
  isFirefox,
} from "../utils";

import { uploadImage as uploadImageRemote } from "@/app/utils/chat";
@@ -97,8 +99,9 @@ import {
} from "./ui-lib";
import { useNavigate } from "react-router-dom";
import {
  CHAT_PAGE_SIZE,
  DEFAULT_STT_ENGINE,
  DEFAULT_TTS_ENGINE,
  FIREFOX_DEFAULT_STT_ENGINE,
  ModelProvider,
  Path,
  REQUEST_TIMEOUT_MS,
@@ -118,6 +121,7 @@ import { MultimodalContent } from "../client/api";
const localStorage = safeLocalStorage();
import { ClientApi } from "../client/api";
import { createTTSPlayer } from "../utils/audio";
import { OpenAITranscriptionApi, WebTranscriptionApi } from "../utils/speech";
import { MsEdgeTTS, OUTPUT_FORMAT } from "../utils/ms_edge_tts";

const ttsPlayer = createTTSPlayer();
@@ -546,6 +550,44 @@ export function ChatActions(props: {
    }
  }, [chatStore, currentModel, models]);

  const [isListening, setIsListening] = useState(false);
  const [isTranscription, setIsTranscription] = useState(false);
  const [speechApi, setSpeechApi] = useState<any>(null);
Contributor:

🛠️ Refactor suggestion

Consider using a more specific type for speechApi state

The speechApi state is initialized with any type. Consider using a more specific type to improve type safety.

- const [speechApi, setSpeechApi] = useState<any>(null);
+ const [speechApi, setSpeechApi] = useState<WebTranscriptionApi | OpenAITranscriptionApi | null>(null);
📝 Committable suggestion

Suggested change

  const [speechApi, setSpeechApi] = useState<WebTranscriptionApi | OpenAITranscriptionApi | null>(null);

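An alternative to the union type is a shared interface, if both engine classes expose the same surface. The interface name and callback shape below are assumptions inferred from how the component calls `start()`/`stop()`, not declarations from the PR:

```typescript
// Hypothetical common interface for the two speech engines, which would let
// the state be typed TranscriptionApi | null instead of any.
interface TranscriptionApi {
  start(): Promise<void>;
  stop(): Promise<void>;
}

// A stub implementation standing in for WebTranscriptionApi /
// OpenAITranscriptionApi, reporting a fixed transcript on stop().
class StubTranscriptionApi implements TranscriptionApi {
  constructor(private onEnd: (transcription: string) => void) {}
  async start(): Promise<void> {
    // A real engine would begin capturing audio here.
  }
  async stop(): Promise<void> {
    // A real engine would finish recognition and report the transcript.
    this.onEnd("stub transcript");
  }
}
```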

  useEffect(() => {
    if (isFirefox()) config.sttConfig.engine = FIREFOX_DEFAULT_STT_ENGINE;
    setSpeechApi(
      config.sttConfig.engine === DEFAULT_STT_ENGINE
        ? new WebTranscriptionApi((transcription) =>
            onRecognitionEnd(transcription),
          )
        : new OpenAITranscriptionApi((transcription) =>
            onRecognitionEnd(transcription),
          ),
    );
  }, []);

  const startListening = async () => {
    if (speechApi) {
      await speechApi.start();
      setIsListening(true);
    }
  };
  const stopListening = async () => {
    if (speechApi) {
      if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
        setIsTranscription(true);
      await speechApi.stop();
      setIsListening(false);
    }
  };
  const onRecognitionEnd = (finalTranscript: string) => {
    console.log(finalTranscript);
    if (finalTranscript) props.setUserInput(finalTranscript);
    if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
      setIsTranscription(false);
  };
Contributor:

⚠️ Potential issue

Remove console.log in production code

There's a console.log statement in the onRecognitionEnd function. Consider removing it or replacing it with a more appropriate logging mechanism for production code.

  const onRecognitionEnd = (finalTranscript: string) => {
-   console.log(finalTranscript);
    if (finalTranscript) props.setUserInput(finalTranscript);
    if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
      setIsTranscription(false);
  };
📝 Committable suggestion

Suggested change

  const onRecognitionEnd = (finalTranscript: string) => {
    if (finalTranscript) props.setUserInput(finalTranscript);
    if (config.sttConfig.engine !== DEFAULT_STT_ENGINE)
      setIsTranscription(false);
  };


  return (
    <div className={styles["chat-input-actions"]}>
      {couldStop && (
@@ -780,6 +822,16 @@ export function ChatActions(props: {
          icon={<ShortcutkeyIcon />}
        />
      )}

      {config.sttConfig.enable && (
        <ChatAction
          onClick={async () =>
            isListening ? await stopListening() : await startListening()
          }
          text={isListening ? Locale.Chat.StopSpeak : Locale.Chat.StartSpeak}
          icon={<VoiceWhiteIcon />}
        />
      )}
    </div>
  );
}
@@ -1505,7 +1557,7 @@ function _Chat() {
    setAttachImages(images);
  }

  // 快捷键 shortcut keys
  // 快捷键
  const [showShortcutKeyModal, setShowShortcutKeyModal] = useState(false);

useEffect(() => {
12 changes: 12 additions & 0 deletions app/components/settings.tsx
@@ -83,6 +83,7 @@ import { nanoid } from "nanoid";
import { useMaskStore } from "../store/mask";
import { ProviderType } from "../utils/cloud";
import { TTSConfigList } from "./tts-config";
import { STTConfigList } from "./stt-config";

function EditPromptModal(props: { id: string; onClose: () => void }) {
const promptStore = usePromptStore();
@@ -1703,6 +1704,17 @@ export function Settings() {
        />
      </List>

      <List>
        <STTConfigList
          sttConfig={config.sttConfig}
          updateConfig={(updater) => {
            const sttConfig = { ...config.sttConfig };
            updater(sttConfig);
            config.update((config) => (config.sttConfig = sttConfig));
          }}
        />
      </List>

      <DangerItems />
    </div>
  </ErrorBoundary>
51 changes: 51 additions & 0 deletions app/components/stt-config.tsx
@@ -0,0 +1,51 @@
import { STTConfig, STTConfigValidator } from "../store";

import Locale from "../locales";
import { ListItem, Select } from "./ui-lib";
import { DEFAULT_STT_ENGINES } from "../constant";
import { isFirefox } from "../utils";

export function STTConfigList(props: {
  sttConfig: STTConfig;
  updateConfig: (updater: (config: STTConfig) => void) => void;
}) {
  return (
    <>
      <ListItem
        title={Locale.Settings.STT.Enable.Title}
        subTitle={Locale.Settings.STT.Enable.SubTitle}
      >
        <input
          type="checkbox"
          checked={props.sttConfig.enable}
          onChange={(e) =>
            props.updateConfig(
              (config) => (config.enable = e.currentTarget.checked),
            )
          }
        ></input>
      </ListItem>
Comment on lines +14 to +27
Contributor:

⚠️ Potential issue

Refactor the config update to avoid assignment in expression.

The checkbox implementation for enabling/disabling STT is correct. However, the update logic can be improved to address the static analysis warning about assignment in expression.

Consider refactoring the onChange handler as follows:

 onChange={(e) =>
   props.updateConfig(
-    (config) => (config.enable = e.currentTarget.checked),
+    (config) => ({ ...config, enable: e.currentTarget.checked })
   )
 }

This change creates a new object with the updated enable property, which is a more idiomatic way to update state in React and avoids the assignment in expression issue.

📝 Committable suggestion

Suggested change

      <ListItem
        title={Locale.Settings.STT.Enable.Title}
        subTitle={Locale.Settings.STT.Enable.SubTitle}
      >
        <input
          type="checkbox"
          checked={props.sttConfig.enable}
          onChange={(e) =>
            props.updateConfig(
              (config) => ({ ...config, enable: e.currentTarget.checked })
            )
          }
        ></input>
      </ListItem>
🧰 Tools
🪛 Biome

[error] 23-23: The assignment should not be in an expression.

The use of assignments in expressions is confusing.
Expressions are often considered as side-effect free.

(lint/suspicious/noAssignInExpressions)

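Whether the spread-based fix above actually takes effect depends on `updateConfig`'s semantics: the settings code copies the config and applies the updater by mutation, ignoring its return value, so a block-bodied mutating updater silences the lint while keeping the behavior. A sketch under that assumption (`STTDraft` is a simplified stand-in for `STTConfig`):

```typescript
// Simplified stand-in for STTConfig.
type STTDraft = { enable: boolean };

// Mirrors how the settings page applies updaters: copy, mutate the copy,
// ignore the updater's return value.
function applyUpdater(config: STTDraft, updater: (c: STTDraft) => void): STTDraft {
  const draft = { ...config };
  updater(draft); // return value ignored: mutation-style updater
  return draft;
}

// Lint-friendly: a block body performing the mutation, rather than an
// assignment used as an expression.
const enableStt = (c: STTDraft): void => {
  c.enable = true;
};
```

Under these semantics an updater that only returns a new object (without mutating the draft) would be a silent no-op, so the block-body form is the safer fix.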
      {!isFirefox() && (
        <ListItem title={Locale.Settings.STT.Engine.Title}>
          <Select
            value={props.sttConfig.engine}
            onChange={(e) => {
              props.updateConfig(
                (config) =>
                  (config.engine = STTConfigValidator.engine(
                    e.currentTarget.value,
                  )),
              );
            }}
          >
            {DEFAULT_STT_ENGINES.map((v, i) => (
              <option value={v} key={i}>
                {v}
              </option>
            ))}
          </Select>
        </ListItem>
      )}
    </>
  );
}
119 changes: 119 additions & 0 deletions app/components/stt.module.scss
@@ -0,0 +1,119 @@
@import "../styles/animation.scss";
.plugin-page {
  height: 100%;
  display: flex;
  flex-direction: column;

  .plugin-page-body {
    padding: 20px;
    overflow-y: auto;

    .plugin-filter {
      width: 100%;
      max-width: 100%;
      margin-bottom: 20px;
      animation: slide-in ease 0.3s;
      height: 40px;

      display: flex;

      .search-bar {
        flex-grow: 1;
        max-width: 100%;
        min-width: 0;
        outline: none;
      }

      .search-bar:focus {
        border: 1px solid var(--primary);
      }

      .plugin-filter-lang {
        height: 100%;
        margin-left: 10px;
      }

      .plugin-create {
        height: 100%;
        margin-left: 10px;
        box-sizing: border-box;
        min-width: 80px;
      }
    }

    .plugin-item {
      display: flex;
      justify-content: space-between;
      padding: 20px;
      border: var(--border-in-light);
      animation: slide-in ease 0.3s;

      &:not(:last-child) {
        border-bottom: 0;
      }

      &:first-child {
        border-top-left-radius: 10px;
        border-top-right-radius: 10px;
      }

      &:last-child {
        border-bottom-left-radius: 10px;
        border-bottom-right-radius: 10px;
      }

      .plugin-header {
        display: flex;
        align-items: center;

        .plugin-icon {
          display: flex;
          align-items: center;
          justify-content: center;
          margin-right: 10px;
        }

        .plugin-title {
          .plugin-name {
            font-size: 14px;
            font-weight: bold;
          }
          .plugin-info {
            font-size: 12px;
          }
          .plugin-runtime-warning {
            font-size: 12px;
            color: #f86c6c;
          }
        }
      }

      .plugin-actions {
        display: flex;
        flex-wrap: nowrap;
        transition: all ease 0.3s;
        justify-content: center;
        align-items: center;
      }

      @media screen and (max-width: 600px) {
        display: flex;
        flex-direction: column;
        padding-bottom: 10px;
        border-radius: 10px;
        margin-bottom: 20px;
        box-shadow: var(--card-shadow);

        &:not(:last-child) {
          border-bottom: var(--border-in-light);
        }

        .plugin-actions {
          width: 100%;
          justify-content: space-between;
          padding-top: 10px;
        }
      }
    }
  }
}
5 changes: 5 additions & 0 deletions app/constant.ts
@@ -150,6 +150,7 @@ export const Anthropic = {
export const OpenaiPath = {
  ChatPath: "v1/chat/completions",
  SpeechPath: "v1/audio/speech",
  TranscriptionPath: "v1/audio/transcriptions",
  ImagePath: "v1/images/generations",
  UsagePath: "dashboard/billing/usage",
  SubsPath: "dashboard/billing/subscription",
@@ -270,6 +271,10 @@ export const DEFAULT_TTS_VOICES = [
  "shimmer",
];

export const DEFAULT_STT_ENGINE = "WebAPI";
export const DEFAULT_STT_ENGINES = ["WebAPI", "OpenAI Whisper"];
export const FIREFOX_DEFAULT_STT_ENGINE = "OpenAI Whisper";

const openaiModels = [
  "gpt-3.5-turbo",
  "gpt-3.5-turbo-1106",