Unverified 提交 31c1a145 authored 作者: Adekunle James Adeniji's avatar Adekunle James Adeniji 提交者: GitHub

feat: add voice input feature with transcription support (#2344)

closes #1804 <!-- devin-review-badge-begin --> --- <a href="https://app.devin.ai/review/dyad-sh/dyad/pull/2344"> <picture> <source media="(prefers-color-scheme: dark)" srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1"> <img src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1" alt="Open with Devin"> </picture> </a> <!-- devin-review-badge-end --> <!-- CURSOR_SUMMARY --> --- > [!NOTE] > Introduces voice input across chat inputs with transcription via Dyad Engine and Pro gating. > > - Replaces send row with `LexicalVoiceInputRow` in `ChatInput` and `HomeChatInput`, adding mic control, waveform (`VoiceWaveform`), and send/cancel integration > - New `VoiceInputButton` handles Pro-only disabled state, recording/transcribing states, and tooltips > - New hooks `useAudioRecorder` and `useVoiceInput` to record via `MediaRecorder`, visualize with `AnalyserNode`, and call `ipc.misc.transcribeAudio` > - IPC: adds `misc.transcribeAudio` contract, registers `transcription_handlers` that validate input, support E2E mock, and call `transcribeWithDyadEngine` > - Dyad Engine util: adds `transcribeWithDyadEngine` with request-id attempt tracking and multipart upload to `/audio/transcriptions` > - E2E tests for voice flow and Pro gating; mocks `getUserMedia` and asserts transcription append > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 7dc1944bf0149a9f88b63a3fdfe0df83e7aa4f9f. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY --> <!-- This is an auto-generated description by cubic. --> --- ## Summary by cubic Adds voice input with waveform visualization and transcription for chat, gated to Dyad Pro users. Improves reliability with fixes for recording setup leaks, analyser state, and proper audio MIME typing so IPC-backed Dyad Engine transcription consistently appends text; addresses #1804. 
- **New Features** - Integrated VoiceInputButton and VoiceWaveform via LexicalVoiceInputRow in ChatInput and HomeChatInput; appends transcribed text to the input. - Added useAudioRecorder/useVoiceInput hooks to record via MediaRecorder, visualize with AnalyserNode, and invoke IPC channel chat:transcribe. - Pro gating with tooltip and disabled state for non-Pro users; recording can always be stopped. - IPC handler validates payloads and calls Dyad Engine via multipart upload; includes E2E mock support. - E2E tests mock getUserMedia and verify transcription append and Pro-only disabled state. - **Migration** - Provide a Dyad Pro API key (settings or DYAD_PRO_API_KEY) and enable Dyad Pro. - Ensure microphone permissions are granted. - Optionally set DYAD_ENGINE_URL; defaults to https://engine.dyad.sh/v1. <sup>Written for commit fa71433ae270a7276e5466c6c8df359eab1eb03d. Summary will update on new commits.</sup> <!-- End of auto-generated description by cubic. --> --------- Co-authored-by: Will Chen <willchen90@gmail.com>
上级 51779705
import { test } from "./helpers/test_helper";
import { expect } from "@playwright/test";
import { Timeout } from "./helpers/constants";
// Pro users must get the real (unlocked, enabled) mic button inside an app chat.
test("voice-to-text button visible for pro users", async ({ po }) => {
  await po.setUpDyadPro();
  // Navigate to an app to get the ChatInput
  await po.importApp("minimal");
  // `exact: true` is required: role-name matching is a case-insensitive
  // substring match by default, so "Voice to text" would also match the locked
  // "Voice to text (Pro)" button and this test would pass even if Pro gating
  // regressed (the locked button is visible and enabled too).
  const micButton = po.page.getByRole("button", {
    name: "Voice to text",
    exact: true,
  });
  await expect(micButton).toBeVisible({ timeout: Timeout.SHORT });
  await expect(micButton).toBeEnabled();
});
// Without Pro, the chat input inside an app shows the locked mic variant.
test("voice-to-text button shows lock for non-pro users", async ({ po }) => {
  await po.setUp();
  // Import an app so the in-chat ChatInput is rendered.
  await po.importApp("minimal");

  // The locked button is identified by its "(Pro)" accessible name.
  const lockedLabel = "Voice to text (Pro)";
  const proOnlyButton = po.page.getByRole("button", { name: lockedLabel });
  await expect(proOnlyButton).toBeVisible({ timeout: Timeout.SHORT });
});
// Without Pro, the home page chat input shows the locked mic variant as well.
test("voice-to-text button shows lock on home page for non-pro users", async ({
  po,
}) => {
  await po.setUp();

  // Scope the lookup to the home chat input so only its button can match.
  const homeChatInput = po.chatActions.getHomeChatInputContainer();
  const proOnlyButton = homeChatInput.getByRole("button", {
    name: "Voice to text (Pro)",
  });
  await expect(proOnlyButton).toBeVisible({ timeout: Timeout.SHORT });
});
// Pro users must get the real (unlocked, enabled) mic button on the home page.
test("voice-to-text button visible on home page for pro users", async ({
  po,
}) => {
  await po.setUpDyadPro();
  // On the home page, the mic button should be visible.
  // `exact: true` is required: the default substring match on the accessible
  // name would also accept the locked "Voice to text (Pro)" button, so the
  // test could not tell the Pro and non-Pro states apart.
  const micButton = po.chatActions
    .getHomeChatInputContainer()
    .getByRole("button", { name: "Voice to text", exact: true });
  await expect(micButton).toBeVisible({ timeout: Timeout.SHORT });
  await expect(micButton).toBeEnabled();
});
// Smoke test: clicking the mic button must leave the voice control usable,
// whether or not the test environment can actually record.
test("voice-to-text button changes state when recording", async ({ po }) => {
  await po.setUpDyadPro();
  await po.importApp("minimal");
  // Exact name matching keeps this locator from also matching the locked
  // "Voice to text (Pro)" button (role-name matching is substring-based by
  // default).
  const micButton = po.page.getByRole("button", {
    name: "Voice to text",
    exact: true,
  });
  await expect(micButton).toBeVisible({ timeout: Timeout.SHORT });
  // Click to start recording. No microphone permission is granted by this
  // test: in Electron E2E, getUserMedia may not be available, so we only check
  // that the click doesn't crash and the button remains interactive.
  await micButton.click();
  // After clicking, the button should either be in recording state
  // ("Stop recording") or be back in its idle state because mic access failed
  // (in which case an error toast is shown).
  const stopButton = po.page.getByRole("button", {
    name: "Stop recording",
    exact: true,
  });
  // Locators are lazy, so the idle-state locator created above can be reused.
  await expect(stopButton.or(micButton)).toBeVisible({
    timeout: Timeout.SHORT,
  });
});
......@@ -16,6 +16,8 @@ import {
ChevronsDownUp,
SendHorizontalIcon,
Lock,
Mic,
MicOff,
} from "lucide-react";
import type React from "react";
import { useCallback, useEffect, useRef, useState, useMemo } from "react";
......@@ -94,6 +96,8 @@ import { useChats } from "@/hooks/useChats";
import { useRouter } from "@tanstack/react-router";
import { showError as showErrorToast } from "@/lib/toast";
import { cn } from "@/lib/utils";
import { useVoiceToText } from "@/hooks/useVoiceToText";
import { isDyadProEnabled } from "@/lib/schemas";
const showTokenBarAtom = atom(false);
......@@ -206,6 +210,21 @@ export function ChatInput({ chatId }: { chatId?: number }) {
}, [chatId, messagesById]);
const { userBudget } = useUserBudgetInfo();
// Voice input is gated on Dyad Pro; while settings are not loaded, treat the
// user as non-Pro.
const isProEnabled = settings ? isDyadProEnabled(settings) : false;
// Appends a transcription to the current draft. A single space separates it
// from existing text; a blank (whitespace-only) draft is replaced outright.
const handleTranscription = useCallback(
(text: string) => {
setInputValue((prev: string) => (prev.trim() ? prev + " " + text : text));
},
[setInputValue],
);
// Recording/transcription state plus the toggle used by the mic button.
// Failures are surfaced to the user as error toasts.
const { isRecording, isTranscribing, toggleRecording } = useVoiceToText({
enabled: isProEnabled,
onTranscription: handleTranscription,
onError: (message) => showErrorToast(message),
});
const [needsFreshPlanChat, setNeedsFreshPlanChat] = useAtom(
needsFreshPlanChatAtom,
);
......@@ -311,6 +330,10 @@ export function ChatInput({ chatId }: { chatId?: number }) {
return;
}
// Stop any in-progress recording before submitting.
// NOTE(review): toggleRecording() resolves as soon as the recorder has been
// asked to stop; the transcription itself runs later in the recorder's onstop
// handler. The transcribed text is therefore not part of this submission and
// will be appended to the input afterwards — confirm this is intended.
if (isRecording) {
await toggleRecording();
}
// If switching to plan mode from another mode in a chat with messages,
// create a new chat for a clean context.
if (needsFreshPlanChat && settings?.selectedChatMode === "plan" && appId) {
......@@ -719,6 +742,68 @@ export function ChatInput({ chatId }: { chatId?: number }) {
messageHistory={userMessageHistory}
/>
{/* Voice-to-text button */}
{isProEnabled ? (
<Tooltip>
<TooltipTrigger
render={
<button
onClick={toggleRecording}
disabled={isTranscribing}
aria-label={
isRecording
? t("stopRecording", "Stop recording")
: isTranscribing
? t("transcribing", "Transcribing...")
: t("voiceToText", "Voice to text")
}
className={cn(
"px-2 py-2 mb-0.5 text-muted-foreground rounded-lg transition-colors duration-150 cursor-pointer disabled:cursor-default disabled:opacity-30",
isRecording &&
"text-red-500 hover:text-red-600 animate-pulse",
!isRecording && !isTranscribing && "hover:text-primary",
)}
/>
}
>
{isTranscribing ? (
<Loader2 size={20} className="animate-spin" />
) : isRecording ? (
<MicOff size={20} />
) : (
<Mic size={20} />
)}
</TooltipTrigger>
<TooltipContent>
{isRecording
? t("stopRecording", "Stop recording")
: isTranscribing
? t("transcribing", "Transcribing...")
: t("voiceToText", "Voice to text")}
</TooltipContent>
</Tooltip>
) : (
<Tooltip>
<TooltipTrigger
render={
<button
onClick={() =>
ipc.system.openExternalUrl("https://dyad.sh/pro")
}
aria-label={t("voiceToTextPro", "Voice to text (Pro)")}
className="px-2 py-2 mb-0.5 text-muted-foreground hover:text-primary rounded-lg transition-colors duration-150 cursor-pointer relative"
/>
}
>
<Mic size={20} />
<Lock size={10} className="absolute -top-0.5 -right-0.5" />
</TooltipTrigger>
<TooltipContent>
{t("voiceToTextRequiresPro", "Voice to text (requires Pro)")}
</TooltipContent>
</Tooltip>
)}
{isStreaming ? (
<Tooltip>
<TooltipTrigger
......
......@@ -3,6 +3,10 @@ import {
StopCircleIcon,
FolderOpenIcon,
XIcon,
Mic,
MicOff,
Loader2,
Lock,
} from "lucide-react";
import {
Tooltip,
......@@ -29,6 +33,11 @@ import { AuxiliaryActionsMenu } from "./AuxiliaryActionsMenu";
import { cn } from "@/lib/utils";
import { useLoadApps } from "@/hooks/useLoadApps";
import { AppSearchDialog } from "../AppSearchDialog";
import { useVoiceToText } from "@/hooks/useVoiceToText";
import { useUserBudgetInfo } from "@/hooks/useUserBudgetInfo";
import { ipc } from "@/ipc/types";
import { useCallback } from "react";
import { showError } from "@/lib/toast";
export function HomeChatInput({
onSubmit,
......@@ -43,6 +52,21 @@ export function HomeChatInput({
hasChatId: false,
}); // eslint-disable-line @typescript-eslint/no-unused-vars
useChatModeToggle();
// Voice input is gated on Dyad Pro: here that means a loaded user budget AND
// the enableDyadPro setting.
// NOTE(review): ChatInput derives the same flag via isDyadProEnabled(settings);
// confirm the two checks are meant to differ.
const { userBudget } = useUserBudgetInfo();
const isProEnabled = !!userBudget && !!settings?.enableDyadPro;
// Appends a transcription to the current draft. A single space separates it
// from existing text; a blank (whitespace-only) draft is replaced outright.
const handleTranscription = useCallback(
(text: string) => {
setInputValue((prev: string) => (prev.trim() ? prev + " " + text : text));
},
[setInputValue],
);
// Recording/transcription state plus the toggle used by the mic button.
// Failures are surfaced to the user as error toasts.
const { isRecording, isTranscribing, toggleRecording } = useVoiceToText({
enabled: isProEnabled,
onTranscription: handleTranscription,
onError: (message) => showError(message),
});
const [appSearchOpen, setAppSearchOpen] = useState(false);
const { apps } = useLoadApps();
......@@ -81,7 +105,7 @@ export function HomeChatInput({
};
// Custom submit function that wraps the provided onSubmit
const handleCustomSubmit = () => {
const handleCustomSubmit = async () => {
if (
(!inputValue.trim() && attachments.length === 0) ||
isStreaming ||
......@@ -90,6 +114,10 @@ export function HomeChatInput({
return;
}
// Stop any in-progress recording before submitting.
// NOTE(review): toggleRecording() resolves as soon as the recorder has been
// asked to stop; the transcription itself runs later in the recorder's onstop
// handler. The transcribed text is therefore not part of this submission and
// will be appended to the input afterwards — confirm this is intended.
if (isRecording) {
await toggleRecording();
}
// Call the parent's onSubmit handler with attachments and selected app
onSubmit({
attachments,
......@@ -152,6 +180,66 @@ export function HomeChatInput({
messageHistory={[]}
/>
{/* Voice-to-text button */}
{isProEnabled ? (
<Tooltip>
<TooltipTrigger
render={
<button
onClick={toggleRecording}
disabled={isTranscribing}
aria-label={
isRecording
? "Stop recording"
: isTranscribing
? "Transcribing..."
: "Voice to text"
}
className={cn(
"px-2 py-2 mb-0.5 text-muted-foreground rounded-lg transition-colors duration-150 cursor-pointer disabled:cursor-default disabled:opacity-30",
isRecording &&
"text-red-500 hover:text-red-600 animate-pulse",
!isRecording && !isTranscribing && "hover:text-primary",
)}
/>
}
>
{isTranscribing ? (
<Loader2 size={20} className="animate-spin" />
) : isRecording ? (
<MicOff size={20} />
) : (
<Mic size={20} />
)}
</TooltipTrigger>
<TooltipContent>
{isRecording
? "Stop recording"
: isTranscribing
? "Transcribing..."
: "Voice to text"}
</TooltipContent>
</Tooltip>
) : (
<Tooltip>
<TooltipTrigger
render={
<button
onClick={() =>
ipc.system.openExternalUrl("https://dyad.sh/pro")
}
aria-label="Voice to text (Pro)"
className="px-2 py-2 mb-0.5 text-muted-foreground hover:text-primary rounded-lg transition-colors duration-150 cursor-pointer relative"
/>
}
>
<Mic size={20} />
<Lock size={10} className="absolute -top-0.5 -right-0.5" />
</TooltipTrigger>
<TooltipContent>Voice to text (requires Pro)</TooltipContent>
</Tooltip>
)}
{isStreaming ? (
<Tooltip>
<TooltipTrigger
......
import { renderHook, act, waitFor } from "@testing-library/react";
import { describe, it, expect, beforeEach, vi } from "vitest";
import { useVoiceToText } from "@/hooks/useVoiceToText";
// The mock is created via vi.hoisted so that it already exists when the
// (hoisted) vi.mock factory below references it.
const { transcribeAudioMock } = vi.hoisted(() => ({
transcribeAudioMock: vi.fn(),
}));
// Replace the IPC client with a stub exposing only audio.transcribeAudio, the
// one IPC call the hook under test makes.
vi.mock("@/ipc/types", () => ({
ipc: {
audio: {
transcribeAudio: transcribeAudioMock,
},
},
}));
/**
 * Minimal stand-in for the browser MediaRecorder used by these tests.
 *
 * It only tracks `state` and invokes `onstop` synchronously from `stop()`;
 * it never emits `ondataavailable` on its own, so tests feed chunks manually.
 */
class MockMediaRecorder {
  public ondataavailable: ((event: { data: Blob }) => void) | null = null;
  public onstop: (() => void | Promise<void>) | null = null;
  public state: "inactive" | "recording" | "paused" = "inactive";

  /** Marks the recorder as recording. */
  public start = vi.fn(() => {
    this.state = "recording";
  });

  /** Marks the recorder as inactive, then fires the stop handler (if any). */
  public stop = vi.fn(() => {
    this.state = "inactive";
    if (this.onstop) {
      // The handler may be async; its promise is intentionally not awaited.
      void this.onstop();
    }
  });
}
// Unit tests for the recording lifecycle of useVoiceToText, run against fake
// getUserMedia / MediaRecorder globals installed before every test.
describe("useVoiceToText", () => {
  let trackStopMock: ReturnType<typeof vi.fn>;
  let mediaRecorderInstances: MockMediaRecorder[];

  // Renders the hook in its enabled state with the given transcription callback.
  const mountHook = (onTranscription: (text: string) => void) =>
    renderHook(() =>
      useVoiceToText({
        enabled: true,
        onTranscription,
      }),
    );

  beforeEach(() => {
    transcribeAudioMock.mockReset();
    trackStopMock = vi.fn();
    mediaRecorderInstances = [];

    // Fake microphone: a stream with a single track whose stop() is observable.
    const fakeStream = {
      getTracks: () => [{ stop: trackStopMock }],
    } as unknown as MediaStream;
    Object.defineProperty(globalThis.navigator, "mediaDevices", {
      configurable: true,
      value: {
        getUserMedia: vi.fn().mockResolvedValue(fakeStream),
      },
    });

    // Fake MediaRecorder constructor that records every instance it creates.
    Object.defineProperty(globalThis, "MediaRecorder", {
      configurable: true,
      writable: true,
      value: vi.fn(() => {
        const recorder = new MockMediaRecorder();
        mediaRecorderInstances.push(recorder);
        return recorder;
      }),
    });
  });

  it("stops the active microphone stream when unmounted mid-recording", async () => {
    const onTranscription = vi.fn();
    const hook = mountHook(onTranscription);

    // Start recording.
    await act(async () => {
      await hook.result.current.toggleRecording();
    });
    expect(hook.result.current.isRecording).toBe(true);

    hook.unmount();

    // Unmounting must stop the recorder and the mic track exactly once, and
    // must not start a transcription for the abandoned recording.
    expect(mediaRecorderInstances).toHaveLength(1);
    expect(mediaRecorderInstances[0].stop).toHaveBeenCalledTimes(1);
    expect(trackStopMock).toHaveBeenCalledTimes(1);
    expect(transcribeAudioMock).not.toHaveBeenCalled();
    expect(onTranscription).not.toHaveBeenCalled();
  });

  it("still transcribes when recording is stopped by the user", async () => {
    transcribeAudioMock.mockResolvedValue({ text: " hello world " });
    const onTranscription = vi.fn();
    const hook = mountHook(onTranscription);

    // Start recording.
    await act(async () => {
      await hook.result.current.toggleRecording();
    });

    // Feed one audio chunk by hand; the mock recorder never emits data itself.
    const activeRecorder = mediaRecorderInstances[0];
    const audioChunk = new Blob(["test audio"], { type: "audio/webm" });
    activeRecorder.ondataavailable?.({ data: audioChunk });

    // Stop recording, which triggers the transcription.
    await act(async () => {
      await hook.result.current.toggleRecording();
    });
    await waitFor(() => {
      expect(transcribeAudioMock).toHaveBeenCalledTimes(1);
    });

    // The callback receives the trimmed text and the mic track is released.
    expect(onTranscription).toHaveBeenCalledWith("hello world");
    expect(trackStopMock).toHaveBeenCalledTimes(1);
  });
});
import { useState, useRef, useCallback, useEffect } from "react";
import { ipc } from "@/ipc/types";
import { v4 as uuidv4 } from "uuid";
/** Options accepted by useVoiceToText. */
interface UseVoiceToTextOptions {
/** When false, toggleRecording will not start a new recording (an active one can still be stopped). */
enabled: boolean;
/** Called with the trimmed transcription text whenever it is non-empty. */
onTranscription: (text: string) => void;
/** Called with an error message when microphone access or transcription fails. */
onError?: (error: string) => void;
}
/**
 * React hook that records microphone audio and transcribes it over IPC.
 *
 * Lifecycle: `toggleRecording()` starts a MediaRecorder on a fresh microphone
 * stream; calling it again stops the recorder, whose `onstop` handler releases
 * the microphone, sends the recorded audio to `ipc.audio.transcribeAudio`, and
 * passes the trimmed, non-empty result to `onTranscription`. If the component
 * unmounts mid-recording, the recorder and microphone are stopped and the
 * recording is discarded without transcription.
 *
 * @param options.enabled When false, new recordings are never started.
 * @param options.onTranscription Receives the trimmed transcription text.
 * @param options.onError Receives a message when mic access or transcription fails.
 * @returns `isRecording`, `isTranscribing`, and the async `toggleRecording` action.
 */
export function useVoiceToText({
  enabled,
  onTranscription,
  onError,
}: UseVoiceToTextOptions) {
  const [isRecording, setIsRecording] = useState(false);
  const [isTranscribing, setIsTranscribing] = useState(false);
  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const chunksRef = useRef<Blob[]>([]);
  const streamRef = useRef<MediaStream | null>(null);
  // True while the component is unmounted: tells onstop (and a still-pending
  // start) to discard the recording instead of processing it.
  const skipOnStopProcessingRef = useRef(false);
  // True while a start is waiting on getUserMedia. The `isRecording` state only
  // flips after that await, so it cannot guard against a second click.
  const isStartingRef = useRef(false);

  // Stops every track of the active microphone stream and forgets the stream.
  const stopMediaStream = useCallback(() => {
    if (streamRef.current) {
      streamRef.current.getTracks().forEach((track) => track.stop());
      streamRef.current = null;
    }
  }, []);

  useEffect(() => {
    // Re-arm on every (re)mount. React StrictMode runs this effect's cleanup
    // and then the effect again on the same refs during development; without
    // this reset the flag would stay true and every later recording would be
    // silently discarded.
    skipOnStopProcessingRef.current = false;
    return () => {
      skipOnStopProcessingRef.current = true;
      const mediaRecorder = mediaRecorderRef.current;
      if (mediaRecorder && mediaRecorder.state !== "inactive") {
        mediaRecorder.stop();
      }
      mediaRecorderRef.current = null;
      stopMediaStream();
      chunksRef.current = [];
    };
  }, [stopMediaStream]);

  const toggleRecording = useCallback(async () => {
    if (isTranscribing) return;

    if (isRecording) {
      // Stop recording; transcription continues in the recorder's onstop.
      const activeRecorder = mediaRecorderRef.current;
      if (activeRecorder && activeRecorder.state !== "inactive") {
        activeRecorder.stop();
      } else {
        // No live recorder behind the "recording" state: resynchronise so the
        // UI cannot get stuck showing an active recording.
        stopMediaStream();
        setIsRecording(false);
      }
      return;
    }

    // Ignore the request when disabled or when a start is already in flight
    // (a second concurrent start would overwrite and leak the first stream).
    if (!enabled || isStartingRef.current) return;
    isStartingRef.current = true;

    // Start recording
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
      if (skipOnStopProcessingRef.current) {
        // Unmounted while waiting for microphone access: the unmount cleanup
        // has already run, so release the microphone here and bail out.
        stream.getTracks().forEach((track) => track.stop());
        return;
      }
      streamRef.current = stream;
      const mediaRecorder = new MediaRecorder(stream, {
        mimeType: "audio/webm",
      });
      mediaRecorderRef.current = mediaRecorder;
      chunksRef.current = [];

      mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
          chunksRef.current.push(event.data);
        }
      };

      mediaRecorder.onstop = async () => {
        mediaRecorderRef.current = null;
        stopMediaStream();
        if (skipOnStopProcessingRef.current) {
          // Stopped by unmount: drop the audio, do not transcribe.
          chunksRef.current = [];
          return;
        }
        setIsRecording(false);
        const blob = new Blob(chunksRef.current, { type: "audio/webm" });
        chunksRef.current = [];
        if (blob.size === 0) {
          return;
        }
        setIsTranscribing(true);
        try {
          const arrayBuffer = await blob.arrayBuffer();
          // The IPC contract carries the audio as a plain array of byte values.
          const audioData = Array.from(new Uint8Array(arrayBuffer));
          const result = await ipc.audio.transcribeAudio({
            audioData,
            filename: "recording.webm",
            requestId: uuidv4(),
          });
          const transcript = result.text.trim();
          if (transcript) {
            onTranscription(transcript);
          }
        } catch (err) {
          const message =
            err instanceof Error ? err.message : "Transcription failed";
          onError?.(message);
        } finally {
          setIsTranscribing(false);
        }
      };

      mediaRecorder.start();
      setIsRecording(true);
    } catch (err) {
      // Covers denied mic access as well as MediaRecorder construction/start
      // failures; make sure neither the recorder nor the stream is left behind.
      mediaRecorderRef.current = null;
      stopMediaStream();
      const message =
        err instanceof Error ? err.message : "Failed to access microphone";
      onError?.(message);
    } finally {
      isStartingRef.current = false;
    }
  }, [
    enabled,
    isRecording,
    isTranscribing,
    onTranscription,
    onError,
    stopMediaStream,
  ]);

  return {
    isRecording,
    isTranscribing,
    toggleRecording,
  };
}
import fetch from "node-fetch"; // Electron main process might need node-fetch
import log from "electron-log";
import { createLoggedHandler } from "./safe_handle";
import { createLoggedTypedHandler } from "./base";
import { readSettings } from "../../main/settings"; // Assuming settings are read this way
import { UserBudgetInfo, UserBudgetInfoSchema } from "@/ipc/types";
import { IS_TEST_BUILD } from "../utils/test_utils";
import { z } from "zod";
import { audioContracts } from "../types/audio";
import type { TranscribeAudioParams } from "../types/audio";
import { transcribeWithDyadEngine } from "../utils/llm_engine_provider";
export const UserInfoResponseSchema = z.object({
usedCredits: z.number(),
......@@ -17,6 +21,9 @@ export type UserInfoResponse = z.infer<typeof UserInfoResponseSchema>;
const logger = log.scope("pro_handlers");
const handle = createLoggedHandler(logger);
const typedHandle = createLoggedTypedHandler(logger);
const dyadEngineUrl = process.env.DYAD_ENGINE_URL;
export function registerProHandlers() {
// This method should try to avoid throwing errors because this is auxiliary
......@@ -94,4 +101,34 @@ export function registerProHandlers() {
return null;
}
});
// Voice-to-text: forwards audio recorded in the renderer to Dyad Engine and
// returns the transcribed text. Requires Dyad Pro to be enabled with an API key.
typedHandle(
  audioContracts.transcribeAudio,
  async (_event, input: TranscribeAudioParams) => {
    const settings = readSettings();
    const proApiKey = settings.providerSettings?.auto?.apiKey?.value;

    // Both the Pro toggle and a Pro API key are needed to call the engine.
    if (!(settings.enableDyadPro && proApiKey)) {
      throw new Error(
        "Dyad Pro is not enabled. Voice-to-text requires a Pro subscription.",
      );
    }

    // The renderer sends the audio as an array of byte values.
    const { audioData, filename, requestId } = input;
    const transcript = await transcribeWithDyadEngine(
      Buffer.from(audioData),
      filename,
      requestId,
      {
        apiKey: proApiKey,
        // DYAD_ENGINE_URL overrides the default engine endpoint when set.
        baseURL: dyadEngineUrl ?? "https://engine.dyad.sh/v1",
        dyadOptions: {},
        settings,
      },
    );

    return { text: transcript };
  },
);
}
......@@ -39,6 +39,7 @@ import { securityContracts } from "../types/security";
import { miscContracts, miscEvents } from "../types/misc";
import { freeAgentQuotaContracts } from "../types/free_agent_quota";
import { planEvents, planContracts } from "../types/plan";
import { audioContracts } from "../types/audio";
// =============================================================================
// Invoke Channels (derived from all contracts)
......@@ -93,6 +94,7 @@ export const VALID_INVOKE_CHANNELS = [
...getInvokeChannels(miscContracts),
...getInvokeChannels(freeAgentQuotaContracts),
...getInvokeChannels(planContracts),
...getInvokeChannels(audioContracts),
// Test-only channels
...TEST_INVOKE_CHANNELS,
......
import { z } from "zod";
import { defineContract, createClient } from "../contracts/core";
// =============================================================================
// Transcription Schemas
// =============================================================================
/**
 * Input of the transcribe-audio IPC call: the recording as an array of byte
 * values, the file name reported to the transcription service, and a
 * caller-generated request id.
 */
export const TranscribeAudioParamsSchema = z.object({
  audioData: z.number().array(),
  filename: z.string(),
  requestId: z.string(),
});

export type TranscribeAudioParams = z.infer<typeof TranscribeAudioParamsSchema>;

/** Output of the transcribe-audio IPC call: the transcribed text. */
export const TranscribeAudioResultSchema = z.object({ text: z.string() });

export type TranscribeAudioResult = z.infer<typeof TranscribeAudioResultSchema>;
// =============================================================================
// Contracts
// =============================================================================
/**
 * IPC contracts for audio features. `transcribeAudio` is invoked over the
 * "pro:transcribe-audio" channel with TranscribeAudioParams and yields a
 * TranscribeAudioResult.
 */
export const audioContracts = {
transcribeAudio: defineContract({
channel: "pro:transcribe-audio" as const,
input: TranscribeAudioParamsSchema,
output: TranscribeAudioResultSchema,
}),
};
// =============================================================================
// Client
// =============================================================================
/** Client created from audioContracts; exposed on the unified client as `ipc.audio`. */
export const audioClient = createClient(audioContracts);
......@@ -50,6 +50,7 @@ export { visualEditingContracts } from "./visual-editing";
export { securityContracts } from "./security";
export { miscContracts, miscEvents } from "./misc";
export { freeAgentQuotaContracts } from "./free_agent_quota";
export { audioContracts } from "./audio";
// =============================================================================
// Client Exports
......@@ -79,6 +80,7 @@ export { visualEditingClient } from "./visual-editing";
export { securityClient } from "./security";
export { miscClient, miscEventClient } from "./misc";
export { freeAgentQuotaClient } from "./free_agent_quota";
export { audioClient } from "./audio";
// =============================================================================
// Type Exports
......@@ -290,6 +292,9 @@ export type {
// Free agent quota types
export type { FreeAgentQuotaStatus } from "./free_agent_quota";
// Pro types
export type { TranscribeAudioParams, TranscribeAudioResult } from "./audio";
// =============================================================================
// Schema Exports (for validation in handlers/components)
// =============================================================================
......@@ -346,6 +351,7 @@ import { visualEditingClient } from "./visual-editing";
import { securityClient } from "./security";
import { miscClient, miscEventClient } from "./misc";
import { freeAgentQuotaClient } from "./free_agent_quota";
import { audioClient } from "./audio";
/**
* Unified IPC client with all domains organized by namespace.
......@@ -401,6 +407,7 @@ export const ipc = {
security: securityClient,
misc: miscClient,
freeAgentQuota: freeAgentQuotaClient,
audio: audioClient,
// Event clients for main->renderer pub/sub
events: {
......
......@@ -237,3 +237,52 @@ export function createDyadEngine(
return provider;
}
/**
 * Transcribes an audio recording with the Dyad Engine.
 *
 * Uploads the audio as multipart form data to `${baseURL}/audio/transcriptions`
 * and returns the `text` field of the JSON response.
 *
 * @param audioBuffer Raw bytes of the recording.
 * @param filename File name sent with the upload; its extension (matched
 *   case-insensitively) selects the MIME type, defaulting to `audio/webm`.
 * @param requestId Sent as the `X-Dyad-Request-Id` header.
 * @param options Provider settings: API key, base URL, and optional custom
 *   `fetch` and extra headers (extra headers override the defaults).
 * @returns The transcribed text.
 * @throws Error when the HTTP response is not OK or carries no string `text`.
 */
export async function transcribeWithDyadEngine(
  audioBuffer: Buffer,
  filename: string,
  requestId: string,
  options: ExampleProviderSettings,
): Promise<string> {
  const baseURL = withoutTrailingSlash(options.baseURL);
  const apiKey = loadApiKey({
    apiKey: options.apiKey,
    environmentVariableName: "DYAD_PRO_API_KEY",
    description: "Dyad Pro API key",
  });
  logger.info("transcribing with dyad engine with baseURL", baseURL);

  // A Map (rather than an object literal) keeps extensions such as
  // "constructor" from resolving to inherited Object.prototype members.
  const mimeTypeByExtension = new Map<string, string>([
    ["webm", "audio/webm"],
    ["mp3", "audio/mpeg"],
    ["wav", "audio/wav"],
    ["m4a", "audio/mp4"],
  ]);
  const dotIndex = filename.lastIndexOf(".");
  // Lower-cased so "RECORDING.MP3" is typed the same as "recording.mp3".
  const extension =
    dotIndex === -1 ? "" : filename.slice(dotIndex + 1).toLowerCase();
  const mimeType = mimeTypeByExtension.get(extension) ?? "audio/webm";

  const formData = new FormData();
  const blob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType });
  formData.append("file", blob, filename);
  formData.append("model", "gpt-4o-mini-transcribe");

  const fetchFn = options.fetch ?? fetch;
  const response = await fetchFn(`${baseURL}/audio/transcriptions`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "X-Dyad-Request-Id": requestId,
      ...options.headers,
    },
    body: formData,
  });

  if (!response.ok) {
    // Reading the body is best-effort: a failure here must not hide the
    // HTTP status that describes the actual problem.
    const errorText = await response.text().catch(() => "");
    throw new Error(
      `Dyad Engine transcription failed: ${response.status} ${response.statusText} - ${errorText}`,
    );
  }

  // Validate the payload instead of trusting a cast: a body without a string
  // `text` would otherwise be returned as `undefined` typed as `string`.
  const data: unknown = await response.json();
  const text =
    typeof data === "object" && data !== null
      ? (data as { text?: unknown }).text
      : undefined;
  if (typeof text !== "string") {
    throw new Error(
      "Dyad Engine transcription failed: response did not contain a text field",
    );
  }
  return text;
}
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论