2023-09-19 07:52:39 +08:00
|
|
|
import gradio as gr
|
|
|
|
from transformers import pipeline
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
# Speech-recognition pipeline backed by the "openai/whisper-base.en"
# checkpoint. Built once at import time so each call reuses the loaded model.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)
|
|
|
|
|
|
|
|
def transcribe(audio):
    """Transcribe one recorded audio clip to text.

    Args:
        audio: A ``(sampling_rate, samples)`` pair, where ``samples`` is a
            NumPy array. Multi-dimensional input is averaged over axis 1,
            which assumes a ``(num_samples, num_channels)`` layout -- TODO
            confirm against the Gradio version in use. ``None`` is accepted
            (presumably what the UI passes when nothing was recorded).

    Returns:
        The ``"text"`` field of the pipeline result, or an empty string when
        there is no audio to transcribe.
    """
    # Nothing recorded: unpacking ``None`` would raise a TypeError.
    if audio is None:
        return ""

    sr, y = audio

    # A zero-length buffer would make np.max() raise a ValueError below.
    if y.size == 0:
        return ""

    # Convert to mono if stereo
    if y.ndim > 1:
        y = y.mean(axis=1)

    y = y.astype(np.float32)

    # Peak-normalise to [-1, 1]. Skip the division for an all-silent clip,
    # where the peak is 0 and dividing would fill the buffer with NaN.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    return transcriber({"sampling_rate": sr, "raw": y})["text"]  # type: ignore
|
2023-09-19 07:52:39 +08:00
|
|
|
|
|
|
|
# Web UI wiring: audio captured from the microphone is passed to
# ``transcribe`` and the returned string is shown in a text output.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="microphone"),
    outputs="text",
)


if __name__ == "__main__":
    # Launch the app only when executed as a script, not when imported.
    demo.launch()
|