Improve audio streaming (#5238)

* changes

* changes

* add changeset

* add changeset

* changes

* Update silver-clowns-brush.md

* changes

* changes

* changes

* Update silver-clowns-brush.md

* change

* change

---------

Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
Co-authored-by: pngwn <hello@pngwn.io>
aliabid94 2023-08-22 15:01:34 -07:00 committed by GitHub
parent a74605572d
commit de23e9f7d6
9 changed files with 103 additions and 55 deletions


@ -0,0 +1,10 @@
---
"gradio": patch
---
highlight: Improve audio streaming

This PR improves audio streaming in two ways:
1. Proper audio streaming with WAV files. WAV output is now processed so that successive chunks play back as one continuous audio stream, without clicks at the seams between chunks.
2. Audio streaming with bytes. Any audio format can be streamed by yielding raw bytes from your function.
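
As a minimal sketch of the bytes-based path (adapted from the stream_audio_out demo changed in this PR; the chunk size and autoplay setting are illustrative, not required), a generator that yields raw bytes can feed a streaming Audio output:

import gradio as gr

def stream_bytes(audio_file):
    # Read the uploaded file and yield successive raw-byte chunks.
    chunk_size = 20_000
    with open(audio_file, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            yield chunk

demo = gr.Interface(
    fn=stream_bytes,
    inputs=gr.Audio(type="filepath", label="Audio file to stream"),
    outputs=gr.Audio(format="bytes", streaming=True, autoplay=True),
)

if __name__ == "__main__":
    demo.queue().launch()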


@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": 302934307671667531413257853548643485645, "metadata": {}, "source": ["# Gradio Demo: stream_audio_out"]}, {"cell_type": "code", "execution_count": null, "id": 272996653310673477252411125948039410165, "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": 288918539441861185822528903084949547379, "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from pydub import AudioSegment\n", "\n", "def stream_audio(audio_file):\n", " audio = AudioSegment.from_mp3(audio_file)\n", " i = 0\n", " chunk_size = 3000\n", " \n", " while chunk_size*i < len(audio):\n", " chunk = audio[chunk_size*i:chunk_size*(i+1)]\n", " i += 1\n", " if chunk:\n", " file = f\"/tmp/{i}.mp3\"\n", " chunk.export(file, format=\"mp3\") \n", " yield file\n", " \n", "demo = gr.Interface(\n", " fn=stream_audio,\n", " inputs=gr.Audio(type=\"filepath\", label=\"Audio file to stream\"),\n", " outputs=gr.Audio(autoplay=True, streaming=True),\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.queue().launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": 302934307671667531413257853548643485645, "metadata": {}, "source": ["# Gradio Demo: stream_audio_out"]}, {"cell_type": "code", "execution_count": null, "id": 272996653310673477252411125948039410165, "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": 288918539441861185822528903084949547379, "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from pydub import AudioSegment\n", "from time import sleep\n", "\n", "with gr.Blocks() as demo:\n", " input_audio = gr.Audio(label=\"Input Audio\", type=\"filepath\", format=\"mp3\")\n", " with gr.Row():\n", " with gr.Column():\n", " stream_as_file_btn = gr.Button(\"Stream as File\")\n", " format = gr.Radio([\"wav\", \"mp3\"], value=\"wav\", label=\"Format\")\n", " stream_as_file_output = gr.Audio(streaming=True)\n", "\n", " def stream_file(audio_file, format):\n", " audio = AudioSegment.from_file(audio_file)\n", " i = 0\n", " chunk_size = 3000 \n", " while chunk_size*i < len(audio):\n", " chunk = audio[chunk_size*i:chunk_size*(i+1)]\n", " i += 1\n", " if chunk:\n", " file = f\"/tmp/{i}.{format}\"\n", " chunk.export(file, format=format)\n", " yield file\n", " sleep(1)\n", " \n", " stream_as_file_btn.click(stream_file, [input_audio, format], stream_as_file_output)\n", "\n", " with gr.Column():\n", " stream_as_bytes_btn = gr.Button(\"Stream as Bytes\")\n", " stream_as_bytes_output = gr.Audio(format=\"bytes\", streaming=True)\n", "\n", " def stream_bytes(audio_file):\n", " chunk_size = 20_000\n", " with open(audio_file, \"rb\") as f:\n", " while True:\n", " chunk = f.read(chunk_size)\n", " if chunk:\n", " yield chunk\n", " sleep(1)\n", " else:\n", " break\n", " \n", " stream_as_bytes_btn.click(stream_bytes, input_audio, stream_as_bytes_output)\n", "\n", "if __name__ == \"__main__\":\n", " demo.queue().launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}


@ -1,24 +1,46 @@
import gradio as gr
from pydub import AudioSegment
from time import sleep

def stream_audio(audio_file):
    audio = AudioSegment.from_mp3(audio_file)
    i = 0
    chunk_size = 3000

    while chunk_size*i < len(audio):
        chunk = audio[chunk_size*i:chunk_size*(i+1)]
        i += 1
        if chunk:
            file = f"/tmp/{i}.mp3"
            chunk.export(file, format="mp3")
            yield file

demo = gr.Interface(
    fn=stream_audio,
    inputs=gr.Audio(type="filepath", label="Audio file to stream"),
    outputs=gr.Audio(autoplay=True, streaming=True),
)

with gr.Blocks() as demo:
    input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3")
    with gr.Row():
        with gr.Column():
            stream_as_file_btn = gr.Button("Stream as File")
            format = gr.Radio(["wav", "mp3"], value="wav", label="Format")
            stream_as_file_output = gr.Audio(streaming=True)

            def stream_file(audio_file, format):
                audio = AudioSegment.from_file(audio_file)
                i = 0
                chunk_size = 3000
                while chunk_size*i < len(audio):
                    chunk = audio[chunk_size*i:chunk_size*(i+1)]
                    i += 1
                    if chunk:
                        file = f"/tmp/{i}.{format}"
                        chunk.export(file, format=format)
                        yield file
                        sleep(1)

            stream_as_file_btn.click(stream_file, [input_audio, format], stream_as_file_output)

        with gr.Column():
            stream_as_bytes_btn = gr.Button("Stream as Bytes")
            stream_as_bytes_output = gr.Audio(format="bytes", streaming=True)

            def stream_bytes(audio_file):
                chunk_size = 20_000
                with open(audio_file, "rb") as f:
                    while True:
                        chunk = f.read(chunk_size)
                        if chunk:
                            yield chunk
                            sleep(1)
                        else:
                            break

            stream_as_bytes_btn.click(stream_bytes, input_audio, stream_as_bytes_output)

if __name__ == "__main__":
    demo.queue().launch()


@ -1350,20 +1350,23 @@ Received outputs:
    ) -> list:
        if session_hash is None or run is None:
            return data
        if run not in self.pending_streams[session_hash]:
            self.pending_streams[session_hash][run] = {}
        stream_run = self.pending_streams[session_hash][run]

        from gradio.events import StreamableOutput

        for i, output_id in enumerate(self.dependencies[fn_index]["outputs"]):
            block = self.blocks[output_id]
            if isinstance(block, StreamableOutput) and block.streaming:
                stream = block.stream_output(data[i])
                if run not in self.pending_streams[session_hash]:
                    self.pending_streams[session_hash][run] = defaultdict(list)
                self.pending_streams[session_hash][run][output_id].append(stream)
                if data[i]:
                    data[i]["is_file"] = False
                    data[i]["name"] = f"{session_hash}/{run}/{output_id}"
                    data[i]["is_stream"] = True
                first_chunk = output_id not in stream_run
                binary_data, output_data = block.stream_output(
                    data[i], f"{session_hash}/{run}/{output_id}", first_chunk
                )
                if first_chunk:
                    stream_run[output_id] = []
                self.pending_streams[session_hash][run][output_id].append(binary_data)
                data[i] = output_data

        return data

    async def process_api(


@ -45,7 +45,7 @@ class Audio(
"""
Creates an audio component that can be used to upload/record audio (as an input) or display audio (as an output).
Preprocessing: passes the uploaded audio as a {Tuple(int, numpy.array)} corresponding to (sample rate in Hz, audio data as a 16-bit int array whose values range from -32768 to 32767), or as a {str} filepath, depending on `type`.
Postprocessing: expects a {Tuple(int, numpy.array)} corresponding to (sample rate in Hz, audio data as a float or int numpy array) or as a {str} or {pathlib.Path} filepath or URL to an audio file, which gets displayed
Postprocessing: expects a {Tuple(int, numpy.array)} corresponding to (sample rate in Hz, audio data as a float or int numpy array) or as a {str} or {pathlib.Path} filepath or URL to an audio file, or bytes for binary content (recommended for streaming)
Examples-format: a {str} filepath to a local file that contains audio.
Demos: main_note, generate_tone, reverse_audio
Guides: real-time-speech-recognition
@ -328,8 +328,8 @@ class Audio(
        return masked_inputs

    def postprocess(
        self, y: tuple[int, np.ndarray] | str | Path | None
    ) -> str | dict | None:
        self, y: tuple[int, np.ndarray] | str | Path | bytes | None
    ) -> str | dict | bytes | None:
        """
        Parameters:
            y: audio data in either of the following formats: a tuple of (sample_rate, data), or a string filepath or URL to an audio file, or None.
@ -338,25 +338,22 @@ class Audio(
"""
if y is None:
return None
if isinstance(y, str) and client_utils.is_http_url_like(y):
if isinstance(y, bytes):
if self.streaming:
return y
file_path = self.file_bytes_to_file(y, "audio")
elif isinstance(y, str) and client_utils.is_http_url_like(y):
return {"name": y, "data": None, "is_file": True}
if isinstance(y, tuple):
elif isinstance(y, tuple):
sample_rate, data = y
file_path = self.audio_to_temp_file(
data,
sample_rate,
format="mp3" if self.streaming else self.format,
format=self.format,
)
self.temp_files.add(file_path)
else:
if isinstance(y, Path):
y = str(y)
if self.streaming and not y.endswith(".mp3"):
sample_rate, data = processing_utils.audio_from_file(y)
file_path = self.audio_to_temp_file(data, sample_rate, format="mp3")
self.temp_files.add(file_path)
else:
file_path = self.make_temp_copy_if_needed(y)
file_path = self.make_temp_copy_if_needed(y)
return {
"name": file_path,
"data": None,
@ -364,17 +361,37 @@ class Audio(
"orig_name": Path(file_path).name,
}
def stream_output(self, y):
def stream_output(self, y, output_id: str, first_chunk: bool):
output_file = {
"name": output_id,
"is_stream": True,
"is_file": False,
}
if y is None:
return None
return None, output_file
if isinstance(y, bytes):
return y, output_file
if client_utils.is_http_url_like(y["name"]):
response = requests.get(y["name"])
bytes = response.content
binary_data = response.content
else:
output_file["orig_name"] = y["orig_name"]
file_path = y["name"]
is_wav = file_path.endswith(".wav")
with open(file_path, "rb") as f:
bytes = f.read()
return bytes
binary_data = f.read()
if is_wav:
# strip length information from first chunk header, remove headers entirely from subsequent chunks
if first_chunk:
binary_data = (
binary_data[:4] + b"\xFF\xFF\xFF\xFF" + binary_data[8:]
)
binary_data = (
binary_data[:40] + b"\xFF\xFF\xFF\xFF" + binary_data[44:]
)
else:
binary_data = binary_data[44:]
return binary_data, output_file
def check_streamable(self):
if self.source != "microphone":


@ -328,8 +328,8 @@ class IOComponent(Component):
        processing_utils.audio_to_file(sample_rate, data, filename, format=format)
        return filename

    def file_bytes_to_file(self, data: bytes, dir: str, file_name: str):
        path = Path(dir) / self.hash_bytes(data)
    def file_bytes_to_file(self, data: bytes, file_name: str):
        path = Path(self.DEFAULT_TEMP_DIR) / self.hash_bytes(data)
        path.mkdir(exist_ok=True, parents=True)
        path = path / Path(file_name).name
        path.write_bytes(data)


@ -190,9 +190,7 @@ class File(
                path = self.make_temp_copy_if_needed(file_name)
            else:
                data, _ = client_utils.decode_base64_to_binary(data)
                path = self.file_bytes_to_file(
                    data, dir=self.DEFAULT_TEMP_DIR, file_name=file_name
                )
                path = self.file_bytes_to_file(data, file_name=file_name)
            path = str(utils.abspath(path))
            self.temp_files.add(path)


@ -157,9 +157,7 @@ class UploadButton(Clickable, Uploadable, IOComponent, FileSerializable):
                path = self.make_temp_copy_if_needed(file_name)
            else:
                data, _ = client_utils.decode_base64_to_binary(data)
                path = self.file_bytes_to_file(
                    data, dir=self.DEFAULT_TEMP_DIR, file_name=file_name
                )
                path = self.file_bytes_to_file(data, file_name=file_name)
            path = str(utils.abspath(path))
            self.temp_files.add(path)
            file = tempfile.NamedTemporaryFile(


@ -274,7 +274,7 @@ class StreamableOutput(EventListener):
    def __init__(self):
        self.streaming: bool

    def stream_output(self, y) -> bytes:
    def stream_output(self, y, output_id: str, first_chunk: bool) -> tuple[bytes, Any]:
        raise NotImplementedError
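
As a rough illustration of the contract this new signature implies (the class below is hypothetical and simply mirrors the Audio implementation earlier in this commit), an implementer returns both the binary chunk to buffer for the stream and the JSON-serializable payload sent to the frontend:

from __future__ import annotations

from typing import Any

class ExampleStreamableOutput:
    # Hypothetical implementer of the new stream_output() contract,
    # modeled on the Audio component changes in this commit.
    streaming = True

    def stream_output(
        self, y, output_id: str, first_chunk: bool
    ) -> tuple[bytes | None, Any]:
        output_file = {"name": output_id, "is_stream": True, "is_file": False}
        if y is None:
            return None, output_file
        if isinstance(y, bytes):
            return y, output_file
        with open(y["name"], "rb") as f:
            return f.read(), output_file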