import { execFile } from "child_process";
import { promisify } from "util";
import { writeFile, unlink } from "fs/promises";
import { tmpdir } from "os";
import { join } from "path";
import { randomBytes } from "crypto";

const execFileAsync = promisify(execFile);

// NOTE(review): this path is pinned to a specific Homebrew Cellar version and
// will presumably stop resolving after a whisper-cpp upgrade — confirm whether
// a version-independent path should be used instead.
const WHISPER_CLI = "/opt/homebrew/Cellar/whisper-cpp/1.8.3/bin/whisper-cli";
const WHISPER_MODEL = "/opt/homebrew/share/whisper-cpp/models/ggml-base.en.bin";
const FFMPEG = "/opt/homebrew/bin/ffmpeg";

/** File extensions handed to whisper-cli directly, without an ffmpeg pass. */
const WHISPER_NATIVE = new Set(["wav", "flac", "mp3", "ogg"]);

/** Extensions allowed into a temp-file name: short, lowercase alphanumeric. */
const SAFE_EXTENSION = /^[a-z0-9]{1,16}$/;

/** MIME types explicitly recognised as audio uploads. */
export const AUDIO_TYPES = new Set([
  "audio/mpeg",
  "audio/mp4",
  "audio/ogg",
  "audio/wav",
  "audio/webm",
  "audio/flac",
  "audio/x-m4a",
  "audio/m4a",
  "audio/mp3",
  "audio/aac",
  "audio/amr",
  "audio/3gpp",
  "audio/x-caf",
  "audio/caf",
]);

/**
 * Reports whether an uploaded file should be treated as audio.
 *
 * @param f - Object carrying an optional `mimetype`.
 * @returns `true` when the MIME type is listed in {@link AUDIO_TYPES} or
 *   starts with `audio/`; `false` when the MIME type is missing or empty.
 */
export function isAudioFile(f: { mimetype?: string }): boolean {
  if (!f.mimetype) return false;
  return AUDIO_TYPES.has(f.mimetype) || f.mimetype.startsWith("audio/");
}

/**
 * Derives the extension used for the temporary input file.
 *
 * The text after the last `.` of `filename` is lowercased; an empty result
 * falls back to `"mp3"`. Anything that is not short and purely alphanumeric
 * (path separators, spaces, over-long names) is replaced by `"bin"` so that
 * caller-supplied text can never alter the temp path. `"bin"` is not in
 * `WHISPER_NATIVE`, so such inputs are always routed through ffmpeg.
 */
function tempFileExtension(filename: string): string {
  const ext = (filename.split(".").pop() || "mp3").toLowerCase();
  return SAFE_EXTENSION.test(ext) ? ext : "bin";
}

/** Deletes a temp file, logging (rather than throwing) if removal fails. */
async function removeTempFile(path: string): Promise<void> {
  await unlink(path).catch((err) =>
    console.warn(`[whisper] cleanup failed: ${path}`, err.message)
  );
}

/**
 * Transcribes an in-memory audio file with whisper-cli.
 *
 * The buffer is written to a temp file in the OS temp directory. Unless its
 * extension is one of `WHISPER_NATIVE`, it is first converted by ffmpeg to a
 * 16 kHz mono WAV. Temp files are removed afterwards whether or not
 * transcription succeeded.
 *
 * @param buf - Raw bytes of the audio file.
 * @param filename - Original file name; only its extension is used.
 * @param _mimeType - Currently unused.
 * @returns The whisper-cli stdout with whitespace runs collapsed to single
 *   spaces and trimmed.
 * @throws Rejects if writing the temp file fails or if ffmpeg / whisper-cli
 *   cannot be run or exit with an error.
 */
export async function transcribeBuffer(
  buf: Buffer,
  filename: string,
  _mimeType: string
): Promise<string> {
  const ext = tempFileExtension(filename);
  const id = randomBytes(8).toString("hex");
  const tmpInput = join(tmpdir(), `whisper-in-${id}.${ext}`);
  const needsConvert = !WHISPER_NATIVE.has(ext);
  // When no conversion is needed, whisper-cli reads the input file itself.
  const tmpWav = needsConvert
    ? join(tmpdir(), `whisper-wav-${id}.wav`)
    : tmpInput;

  try {
    await writeFile(tmpInput, buf);

    if (needsConvert) {
      // -y: overwrite output; -ar 16000 -ac 1: resample to 16 kHz mono.
      await execFileAsync(FFMPEG, [
        "-y",
        "-i", tmpInput,
        "-ar", "16000",
        "-ac", "1",
        "-f", "wav",
        tmpWav,
      ]);
    }

    const { stdout } = await execFileAsync(WHISPER_CLI, [
      "--model", WHISPER_MODEL,
      "--no-prints",
      "--no-timestamps",
      "--file", tmpWav,
    ]);

    return stdout.replace(/\s+/g, " ").trim();
  } finally {
    await removeTempFile(tmpInput);
    if (needsConvert) await removeTempFile(tmpWav);
  }
}