Improve audio streaming (#5238)

* changes

* changes

* add changeset

* add changeset

* changes

* Update silver-clowns-brush.md

* changes

* changes

* changes

* Update silver-clowns-brush.md

* change

* change

---------

Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
Co-authored-by: pngwn <hello@pngwn.io>
aliabid94 2023-08-22 15:01:34 -07:00 committed by GitHub
parent a74605572d
commit de23e9f7d6
9 changed files with 103 additions and 55 deletions


@ -0,0 +1,10 @@
---
"gradio": patch
---
highlight: Improve audio streaming

This PR improves audio streaming in two ways:
1. Proper audio streaming with WAV files. WAV output is now processed so that successive chunks play back as one continuous audio stream, without clicks at the seams between chunks.
2. Audio streaming with bytes. Any audio format can be streamed by yielding raw bytes from your function.
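
As a minimal sketch of the bytes-based path (adapted from the stream_audio_out demo changed in this PR; the chunk size and autoplay setting are illustrative, not required), a generator that yields raw bytes can feed a streaming Audio output:

import gradio as gr

def stream_bytes(audio_file):
    # Read the uploaded file and yield successive raw-byte chunks.
    chunk_size = 20_000
    with open(audio_file, "rb") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break
            yield chunk

demo = gr.Interface(
    fn=stream_bytes,
    inputs=gr.Audio(type="filepath", label="Audio file to stream"),
    outputs=gr.Audio(format="bytes", streaming=True, autoplay=True),
)

if __name__ == "__main__":
    demo.queue().launch()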


@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": 302934307671667531413257853548643485645, "metadata": {}, "source": ["# Gradio Demo: stream_audio_out"]}, {"cell_type": "code", "execution_count": null, "id": 272996653310673477252411125948039410165, "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": 288918539441861185822528903084949547379, "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from pydub import AudioSegment\n", "\n", "def stream_audio(audio_file):\n", " audio = AudioSegment.from_mp3(audio_file)\n", " i = 0\n", " chunk_size = 3000\n", " \n", " while chunk_size*i < len(audio):\n", " chunk = audio[chunk_size*i:chunk_size*(i+1)]\n", " i += 1\n", " if chunk:\n", " file = f\"/tmp/{i}.mp3\"\n", " chunk.export(file, format=\"mp3\") \n", " yield file\n", " \n", "demo = gr.Interface(\n", " fn=stream_audio,\n", " inputs=gr.Audio(type=\"filepath\", label=\"Audio file to stream\"),\n", " outputs=gr.Audio(autoplay=True, streaming=True),\n", ")\n", "\n", "if __name__ == \"__main__\":\n", " demo.queue().launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": 302934307671667531413257853548643485645, "metadata": {}, "source": ["# Gradio Demo: stream_audio_out"]}, {"cell_type": "code", "execution_count": null, "id": 272996653310673477252411125948039410165, "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": 288918539441861185822528903084949547379, "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "from pydub import AudioSegment\n", "from time import sleep\n", "\n", "with gr.Blocks() as demo:\n", " input_audio = gr.Audio(label=\"Input Audio\", type=\"filepath\", format=\"mp3\")\n", " with gr.Row():\n", " with gr.Column():\n", " stream_as_file_btn = gr.Button(\"Stream as File\")\n", " format = gr.Radio([\"wav\", \"mp3\"], value=\"wav\", label=\"Format\")\n", " stream_as_file_output = gr.Audio(streaming=True)\n", "\n", " def stream_file(audio_file, format):\n", " audio = AudioSegment.from_file(audio_file)\n", " i = 0\n", " chunk_size = 3000 \n", " while chunk_size*i < len(audio):\n", " chunk = audio[chunk_size*i:chunk_size*(i+1)]\n", " i += 1\n", " if chunk:\n", " file = f\"/tmp/{i}.{format}\"\n", " chunk.export(file, format=format)\n", " yield file\n", " sleep(1)\n", " \n", " stream_as_file_btn.click(stream_file, [input_audio, format], stream_as_file_output)\n", "\n", " with gr.Column():\n", " stream_as_bytes_btn = gr.Button(\"Stream as Bytes\")\n", " stream_as_bytes_output = gr.Audio(format=\"bytes\", streaming=True)\n", "\n", " def stream_bytes(audio_file):\n", " chunk_size = 20_000\n", " with open(audio_file, \"rb\") as f:\n", " while True:\n", " chunk = f.read(chunk_size)\n", " if chunk:\n", " yield chunk\n", " sleep(1)\n", " else:\n", " break\n", " \n", " stream_as_bytes_btn.click(stream_bytes, input_audio, stream_as_bytes_output)\n", "\n", "if __name__ == \"__main__\":\n", " demo.queue().launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}


@ -1,24 +1,46 @@
import gradio as gr
from pydub import AudioSegment
from time import sleep

def stream_audio(audio_file):
    audio = AudioSegment.from_mp3(audio_file)
    i = 0
    chunk_size = 3000

    while chunk_size*i < len(audio):
        chunk = audio[chunk_size*i:chunk_size*(i+1)]
        i += 1
        if chunk:
            file = f"/tmp/{i}.mp3"
            chunk.export(file, format="mp3")
            yield file

demo = gr.Interface(
    fn=stream_audio,
    inputs=gr.Audio(type="filepath", label="Audio file to stream"),
    outputs=gr.Audio(autoplay=True, streaming=True),
)

with gr.Blocks() as demo:
    input_audio = gr.Audio(label="Input Audio", type="filepath", format="mp3")
    with gr.Row():
        with gr.Column():
            stream_as_file_btn = gr.Button("Stream as File")
            format = gr.Radio(["wav", "mp3"], value="wav", label="Format")
            stream_as_file_output = gr.Audio(streaming=True)

            def stream_file(audio_file, format):
                audio = AudioSegment.from_file(audio_file)
                i = 0
                chunk_size = 3000
                while chunk_size*i < len(audio):
                    chunk = audio[chunk_size*i:chunk_size*(i+1)]
                    i += 1
                    if chunk:
                        file = f"/tmp/{i}.{format}"
                        chunk.export(file, format=format)
                        yield file
                        sleep(1)

            stream_as_file_btn.click(stream_file, [input_audio, format], stream_as_file_output)

        with gr.Column():
            stream_as_bytes_btn = gr.Button("Stream as Bytes")
            stream_as_bytes_output = gr.Audio(format="bytes", streaming=True)

            def stream_bytes(audio_file):
                chunk_size = 20_000
                with open(audio_file, "rb") as f:
                    while True:
                        chunk = f.read(chunk_size)
                        if chunk:
                            yield chunk
                            sleep(1)
                        else:
                            break

            stream_as_bytes_btn.click(stream_bytes, input_audio, stream_as_bytes_output)

if __name__ == "__main__":
    demo.queue().launch()


@ -1350,20 +1350,23 @@ Received outputs:
    ) -> list:
        if session_hash is None or run is None:
            return data
        if run not in self.pending_streams[session_hash]:
            self.pending_streams[session_hash][run] = {}
        stream_run = self.pending_streams[session_hash][run]

        from gradio.events import StreamableOutput

        for i, output_id in enumerate(self.dependencies[fn_index]["outputs"]):
            block = self.blocks[output_id]
            if isinstance(block, StreamableOutput) and block.streaming:
                stream = block.stream_output(data[i])
                if run not in self.pending_streams[session_hash]:
                    self.pending_streams[session_hash][run] = defaultdict(list)
                self.pending_streams[session_hash][run][output_id].append(stream)
                if data[i]:
                    data[i]["is_file"] = False
                    data[i]["name"] = f"{session_hash}/{run}/{output_id}"
                    data[i]["is_stream"] = True
                first_chunk = output_id not in stream_run
                binary_data, output_data = block.stream_output(
                    data[i], f"{session_hash}/{run}/{output_id}", first_chunk
                )
                if first_chunk:
                    stream_run[output_id] = []
                self.pending_streams[session_hash][run][output_id].append(binary_data)
                data[i] = output_data

        return data

    async def process_api(


@ -45,7 +45,7 @@ class Audio(
"""
Creates an audio component that can be used to upload/record audio (as an input) or display audio (as an output).
Preprocessing: passes the uploaded audio as a {Tuple(int, numpy.array)} corresponding to (sample rate in Hz, audio data as a 16-bit int array whose values range from -32768 to 32767), or as a {str} filepath, depending on `type`.
Postprocessing: expects a {Tuple(int, numpy.array)} corresponding to (sample rate in Hz, audio data as a float or int numpy array) or as a {str} or {pathlib.Path} filepath or URL to an audio file, which gets displayed
Postprocessing: expects a {Tuple(int, numpy.array)} corresponding to (sample rate in Hz, audio data as a float or int numpy array) or as a {str} or {pathlib.Path} filepath or URL to an audio file, or bytes for binary content (recommended for streaming)
Examples-format: a {str} filepath to a local file that contains audio.
Demos: main_note, generate_tone, reverse_audio
Guides: real-time-speech-recognition
@ -328,8 +328,8 @@ class Audio(
        return masked_inputs

    def postprocess(
        self, y: tuple[int, np.ndarray] | str | Path | None
    ) -> str | dict | None:
        self, y: tuple[int, np.ndarray] | str | Path | bytes | None
    ) -> str | dict | bytes | None:
        """
        Parameters:
            y: audio data in either of the following formats: a tuple of (sample_rate, data), or a string filepath or URL to an audio file, or None.
@ -338,25 +338,22 @@ class Audio(
"""
if y is None:
return None
if isinstance(y, str) and client_utils.is_http_url_like(y):
if isinstance(y, bytes):
if self.streaming:
return y
file_path = self.file_bytes_to_file(y, "audio")
elif isinstance(y, str) and client_utils.is_http_url_like(y):
return {"name": y, "data": None, "is_file": True}
if isinstance(y, tuple):
elif isinstance(y, tuple):
sample_rate, data = y
file_path = self.audio_to_temp_file(
data,
sample_rate,
format="mp3" if self.streaming else self.format,
format=self.format,
)
self.temp_files.add(file_path)
else:
if isinstance(y, Path):
y = str(y)
if self.streaming and not y.endswith(".mp3"):
sample_rate, data = processing_utils.audio_from_file(y)
file_path = self.audio_to_temp_file(data, sample_rate, format="mp3")
self.temp_files.add(file_path)
else:
file_path = self.make_temp_copy_if_needed(y)
file_path = self.make_temp_copy_if_needed(y)
return {
"name": file_path,
"data": None,
@ -364,17 +361,37 @@ class Audio(
"orig_name": Path(file_path).name,
}
def stream_output(self, y):
def stream_output(self, y, output_id: str, first_chunk: bool):
output_file = {
"name": output_id,
"is_stream": True,
"is_file": False,
}
if y is None:
return None
return None, output_file
if isinstance(y, bytes):
return y, output_file
if client_utils.is_http_url_like(y["name"]):
response = requests.get(y["name"])
bytes = response.content
binary_data = response.content
else:
output_file["orig_name"] = y["orig_name"]
file_path = y["name"]
is_wav = file_path.endswith(".wav")
with open(file_path, "rb") as f:
bytes = f.read()
return bytes
binary_data = f.read()
if is_wav:
# strip length information from first chunk header, remove headers entirely from subsequent chunks
if first_chunk:
binary_data = (
binary_data[:4] + b"\xFF\xFF\xFF\xFF" + binary_data[8:]
)
binary_data = (
binary_data[:40] + b"\xFF\xFF\xFF\xFF" + binary_data[44:]
)
else:
binary_data = binary_data[44:]
return binary_data, output_file
def check_streamable(self):
if self.source != "microphone":


@ -328,8 +328,8 @@ class IOComponent(Component):
        processing_utils.audio_to_file(sample_rate, data, filename, format=format)
        return filename

    def file_bytes_to_file(self, data: bytes, dir: str, file_name: str):
        path = Path(dir) / self.hash_bytes(data)
    def file_bytes_to_file(self, data: bytes, file_name: str):
        path = Path(self.DEFAULT_TEMP_DIR) / self.hash_bytes(data)
        path.mkdir(exist_ok=True, parents=True)
        path = path / Path(file_name).name
        path.write_bytes(data)


@ -190,9 +190,7 @@ class File(
                path = self.make_temp_copy_if_needed(file_name)
            else:
                data, _ = client_utils.decode_base64_to_binary(data)
                path = self.file_bytes_to_file(
                    data, dir=self.DEFAULT_TEMP_DIR, file_name=file_name
                )
                path = self.file_bytes_to_file(data, file_name=file_name)
            path = str(utils.abspath(path))
            self.temp_files.add(path)


@ -157,9 +157,7 @@ class UploadButton(Clickable, Uploadable, IOComponent, FileSerializable):
                path = self.make_temp_copy_if_needed(file_name)
            else:
                data, _ = client_utils.decode_base64_to_binary(data)
                path = self.file_bytes_to_file(
                    data, dir=self.DEFAULT_TEMP_DIR, file_name=file_name
                )
                path = self.file_bytes_to_file(data, file_name=file_name)
            path = str(utils.abspath(path))
            self.temp_files.add(path)
            file = tempfile.NamedTemporaryFile(


@ -274,7 +274,7 @@ class StreamableOutput(EventListener):
    def __init__(self):
        self.streaming: bool

    def stream_output(self, y) -> bytes:
    def stream_output(self, y, output_id: str, first_chunk: bool) -> tuple[bytes, Any]:
        raise NotImplementedError
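
As a rough illustration of the contract this new signature implies (the class below is hypothetical and simply mirrors the Audio implementation earlier in this commit), an implementer returns both the binary chunk to buffer for the stream and the JSON-serializable payload sent to the frontend:

from __future__ import annotations

from typing import Any

class ExampleStreamableOutput:
    # Hypothetical implementer of the new stream_output() contract,
    # modeled on the Audio component changes in this commit.
    streaming = True

    def stream_output(
        self, y, output_id: str, first_chunk: bool
    ) -> tuple[bytes | None, Any]:
        output_file = {"name": output_id, "is_stream": True, "is_file": False}
        if y is None:
            return None, output_file
        if isinstance(y, bytes):
            return y, output_file
        with open(y["name"], "rb") as f:
            return f.read(), output_file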