Improve make_waveform (#4918)

* make waveform animate

* fixes

* changelog

* fix

* Update CHANGELOG.md

* format

* fix

* changes

* add animate flag

* format

* fixes

* demo

* fixes

* lint

---------

Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
This commit is contained in:
Dawood Khan 2023-07-20 23:47:48 +03:00 committed by GitHub
parent c57a4e2729
commit ad9fb84f05
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 142 additions and 47 deletions

View File

@ -4,13 +4,14 @@ No changes to highlight.
## New Features:
- Provide a parameter `animate` (`False` by default) in `gr.make_waveform()` which animates the overlaid waveform by [@dawoodkhan82](https://github.com/dawoodkhan82) in [PR 4918](https://github.com/gradio-app/gradio/pull/4918)
- Add `show_download_button` param to allow the download button in static Image components to be hidden by [@hannahblair](https://github.com/hannahblair) in [PR 4959](https://github.com/gradio-app/gradio/pull/4959)
- Added autofocus argument to Textbox by [@aliabid94](https://github.com/aliabid94) in [PR 4978](https://github.com/gradio-app/gradio/pull/4978)
- The `gr.ChatInterface` UI now converts the "Submit" button to a "Stop" button in ChatInterface while streaming, which can be used to pause generation. By [@abidlabs](https://github.com/abidlabs) in [PR 4971](https://github.com/gradio-app/gradio/pull/4971).
## Bug Fixes:
- Fixes `cancels` for generators so that if a generator is canceled before it is complete, subsequent runs of the event do not continue from the previous iteration, but rather start from the beginning. By [@abidlabs](https://github.com/abidlabs) in [PR 4969](https://github.com/gradio-app/gradio/pull/4969).
- Add `show_download_button` param to allow the download button in static Image components to be hidden by [@hannahblair](https://github.com/hannahblair) in [PR 4959](https://github.com/gradio-app/gradio/pull/4959)
- Added autofocus argument to Textbox by [@aliabid94](https://github.com/aliabid94) in [PR 4978](https://github.com/gradio-app/gradio/pull/4978)
- Use `gr.State` in `gr.ChatInterface` to reduce latency by [@freddyaboulton](https://github.com/freddyaboulton) in [PR 4976](https://github.com/gradio-app/gradio/pull/4976)
- Add a `chatbot_user_message_border_color_accent` theme variable to control the border color of user messages in a chatbot by [@freddyaboulton](https://github.com/freddyaboulton) in [PR 4989](https://github.com/gradio-app/gradio/pull/4989). Set the value of this variable in `Default` theme to `*primary_200`.

View File

@ -1 +1 @@
{"cells": [{"cell_type": "markdown", "id": 302934307671667531413257853548643485645, "metadata": {}, "source": ["# Gradio Demo: waveform"]}, {"cell_type": "code", "execution_count": null, "id": 272996653310673477252411125948039410165, "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": 288918539441861185822528903084949547379, "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import random\n", "\n", "\n", "COLORS = [\n", " [\"#ff0000\", \"#00ff00\"],\n", " [\"#00ff00\", \"#0000ff\"],\n", " [\"#0000ff\", \"#ff0000\"],\n", "] \n", "\n", "def audio_waveform(audio, image):\n", " return (\n", " audio,\n", " gr.make_waveform(audio),\n", " gr.make_waveform(audio, bg_image=image, bars_color=random.choice(COLORS)),\n", " )\n", "\n", "\n", "gr.Interface(\n", " audio_waveform,\n", " inputs=[gr.Audio(), gr.Image(type=\"filepath\")],\n", " outputs=[\n", " gr.Audio(),\n", " gr.Video(),\n", " gr.Video(),\n", " ],\n", ").launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}
{"cells": [{"cell_type": "markdown", "id": 302934307671667531413257853548643485645, "metadata": {}, "source": ["# Gradio Demo: waveform"]}, {"cell_type": "code", "execution_count": null, "id": 272996653310673477252411125948039410165, "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": 288918539441861185822528903084949547379, "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import random\n", "\n", "\n", "COLORS = [\n", " [\"#ff0000\", \"#00ff00\"],\n", " [\"#00ff00\", \"#0000ff\"],\n", " [\"#0000ff\", \"#ff0000\"],\n", "] \n", "\n", "def audio_waveform(audio, image):\n", " return (\n", " audio,\n", " gr.make_waveform(audio),\n", " gr.make_waveform(audio, animate=True),\n", " gr.make_waveform(audio, bg_image=image, bars_color=random.choice(COLORS)),\n", " )\n", "\n", "\n", "gr.Interface(\n", " audio_waveform,\n", " inputs=[gr.Audio(), gr.Image(type=\"filepath\")],\n", " outputs=[\n", " gr.Audio(),\n", " gr.Video(),\n", " gr.Video(),\n", " gr.Video(),\n", " ],\n", ").launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

View File

@ -12,6 +12,7 @@ def audio_waveform(audio, image):
return (
audio,
gr.make_waveform(audio),
gr.make_waveform(audio, animate=True),
gr.make_waveform(audio, bg_image=image, bars_color=random.choice(COLORS)),
)
@ -23,5 +24,6 @@ gr.Interface(
gr.Audio(),
gr.Video(),
gr.Video(),
gr.Video(),
],
).launch()

View File

@ -21,6 +21,7 @@ import PIL
import PIL.Image
from gradio_client import utils as client_utils
from gradio_client.documentation import document, set_documentation_group
from matplotlib import animation
from gradio import components, processing_utils, routes, utils
from gradio.context import Context
@ -756,6 +757,7 @@ def make_waveform(
bars_color: str | tuple[str, str] = ("#fbbf24", "#ea580c"),
bar_count: int = 50,
bar_width: float = 0.6,
animate: bool = False,
) -> str:
"""
Generates a waveform video from an audio file. Useful for creating an easy to share audio visualization. The output should be passed into a `gr.Video` component.
@ -767,6 +769,7 @@ def make_waveform(
bars_color: Color of waveform bars. Can be a single color or a tuple of (start_color, end_color) of gradient
bar_count: Number of bars in waveform
bar_width: Width of bars in waveform. 1 represents full width, 0.5 represents half width, etc.
animate: If true, the audio waveform overlay will be animated, if false, it will be static.
Returns:
A filepath to the output video in mp4 format.
"""
@ -820,71 +823,160 @@ def make_waveform(
if isinstance(bars_color, str)
else get_color_gradient(bars_color[0], bars_color[1], bar_count)
)
plt.bar(
if animate:
fig = plt.figure(figsize=(5, 1), dpi=200, frameon=False)
fig.subplots_adjust(left=0, bottom=0, right=1, top=1)
plt.axis("off")
plt.margins(x=0)
bar_alpha = fg_alpha if animate else 1.0
barcollection = plt.bar(
np.arange(0, bar_count),
samples * 2,
bottom=(-1 * samples),
width=bar_width,
color=color,
alpha=bar_alpha,
)
plt.axis("off")
plt.margins(x=0)
tmp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
savefig_kwargs: dict[str, Any] = {"bbox_inches": "tight"}
if bg_image is not None:
savefig_kwargs["transparent"] = True
if animate:
savefig_kwargs["facecolor"] = "none"
else:
savefig_kwargs["facecolor"] = bg_color
plt.savefig(tmp_img.name, **savefig_kwargs)
waveform_img = PIL.Image.open(tmp_img.name)
waveform_img = waveform_img.resize((1000, 200))
# Composite waveform with background image
if bg_image is not None:
waveform_array = np.array(waveform_img)
waveform_array[:, :, 3] = waveform_array[:, :, 3] * fg_alpha
waveform_img = PIL.Image.fromarray(waveform_array)
if not animate:
waveform_img = PIL.Image.open(tmp_img.name)
waveform_img = waveform_img.resize((1000, 200))
bg_img = PIL.Image.open(bg_image)
waveform_width, waveform_height = waveform_img.size
bg_width, bg_height = bg_img.size
if waveform_width != bg_width:
bg_img = bg_img.resize(
(waveform_width, 2 * int(bg_height * waveform_width / bg_width / 2))
)
# Composite waveform with background image
if bg_image is not None:
waveform_array = np.array(waveform_img)
waveform_array[:, :, 3] = waveform_array[:, :, 3] * fg_alpha
waveform_img = PIL.Image.fromarray(waveform_array)
bg_img = PIL.Image.open(bg_image)
waveform_width, waveform_height = waveform_img.size
bg_width, bg_height = bg_img.size
composite_height = max(bg_height, waveform_height)
composite = PIL.Image.new(
"RGBA", (waveform_width, composite_height), "#FFFFFF"
)
composite.paste(bg_img, (0, composite_height - bg_height))
composite.paste(
waveform_img, (0, composite_height - waveform_height), waveform_img
)
composite.save(tmp_img.name)
img_width, img_height = composite.size
if waveform_width != bg_width:
bg_img = bg_img.resize(
(
waveform_width,
2 * int(bg_height * waveform_width / bg_width / 2),
)
)
bg_width, bg_height = bg_img.size
composite_height = max(bg_height, waveform_height)
composite = PIL.Image.new(
"RGBA", (waveform_width, composite_height), "#FFFFFF"
)
composite.paste(bg_img, (0, composite_height - bg_height))
composite.paste(
waveform_img, (0, composite_height - waveform_height), waveform_img
)
composite.save(tmp_img.name)
img_width, img_height = composite.size
else:
img_width, img_height = waveform_img.size
waveform_img.save(tmp_img.name)
else:
img_width, img_height = waveform_img.size
waveform_img.save(tmp_img.name)
def _animate(_):
for idx, b in enumerate(barcollection):
rand_height = np.random.uniform(0.8, 1.2)
b.set_height(samples[idx] * rand_height * 2)
b.set_y((-rand_height * samples)[idx])
frames = int(duration * 10)
anim = animation.FuncAnimation(
fig, # type: ignore
_animate,
repeat=False,
blit=False,
frames=frames,
interval=100,
)
anim.save(
tmp_img.name,
writer="pillow",
fps=10,
codec="png",
savefig_kwargs=savefig_kwargs,
)
# Convert waveform to video with ffmpeg
output_mp4 = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
ffmpeg_cmd = [
ffmpeg,
"-loop",
"1",
"-i",
tmp_img.name,
"-i",
audio_file,
"-vf",
f"color=c=#FFFFFF77:s={img_width}x{img_height}[bar];[0][bar]overlay=-w+(w/{duration})*t:H-h:shortest=1",
"-t",
str(duration),
"-y",
output_mp4.name,
]
if animate and bg_image is not None:
ffmpeg_cmd = [
ffmpeg,
"-loop",
"1",
"-i",
bg_image,
"-i",
tmp_img.name,
"-i",
audio_file,
"-filter_complex",
"[0:v]scale=w=trunc(iw/2)*2:h=trunc(ih/2)*2[bg];[1:v]format=rgba,colorchannelmixer=aa=1.0[ov];[bg][ov]overlay=(main_w-overlay_w*0.9)/2:main_h-overlay_h*0.9/2[output]",
"-t",
str(duration),
"-map",
"[output]",
"-map",
"2:a",
"-c:v",
"libx264",
"-c:a",
"aac",
"-shortest",
"-y",
output_mp4.name,
]
elif animate and bg_image is None:
ffmpeg_cmd = [
ffmpeg,
"-i",
tmp_img.name,
"-i",
audio_file,
"-filter_complex",
"[0:v][1:a]concat=n=1:v=1:a=1[v][a]",
"-map",
"[v]",
"-map",
"[a]",
"-c:v",
"libx264",
"-c:a",
"aac",
"-shortest",
"-y",
output_mp4.name,
]
else:
ffmpeg_cmd = [
ffmpeg,
"-loop",
"1",
"-i",
tmp_img.name,
"-i",
audio_file,
"-vf",
f"color=c=#FFFFFF77:s={img_width}x{img_height}[bar];[0][bar]overlay=-w+(w/{duration})*t:H-h:shortest=1", # type: ignore
"-t",
str(duration),
"-y",
output_mp4.name,
]
subprocess.check_call(ffmpeg_cmd)
return output_mp4.name