Audio waveform (#2706)

* utils func

* changes

* changes

* changes

* notebook

* fix

* changes

* change

* changes

* changes

* notebook fix

* changes

* changes

* changes

* change

* fix

* revert backend changes, gr.make_waveform

* notebook

* Update CHANGELOG.md

Co-authored-by: Abubakar Abid <abubakar@huggingface.co>

* format fix

* changes

* changes

* revert pnpm

* change

* change

* changes

* changes

* changes

Co-authored-by: Ali Abid <aabid94@gmail.com>
Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
Dawood Khan 2022-12-15 13:35:22 -05:00 committed by GitHub
parent 074bf909ee
commit 714ab2cc09
7 changed files with 176 additions and 7 deletions

CHANGELOG.md

@@ -1,7 +1,11 @@
# Upcoming Release
## New Features:
No changes to highlight.
### Add Waveform Visual Support to Audio
Adds a `gr.make_waveform()` function that creates a waveform video by combining an audio file and an optional background image by [@dawoodkhan82](http://github.com/dawoodkhan82) and [@aliabid94](http://github.com/aliabid94) in [PR 2706](https://github.com/gradio-app/gradio/pull/2706). Helpful for making audio outputs much more shareable.
![waveform screenrecording](https://user-images.githubusercontent.com/7870876/206062396-164a5e71-451a-4fe0-94a7-cbe9269d57e6.gif)
## Bug Fixes:
No changes to highlight.
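A minimal sketch of the new API in use (the audio path and color here are illustrative, not taken from this PR):

```python
import gradio as gr

# Turn a local audio file into a shareable waveform video; the helper
# returns a filepath to an .mp4 that can be passed to a gr.Video component.
video_path = gr.make_waveform("speech.wav", bars_color="#ea580c")
print(video_path)
```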

demo/waveform/run.ipynb Normal file

@@ -0,0 +1 @@
{"cells": [{"cell_type": "markdown", "id": 302934307671667531413257853548643485645, "metadata": {}, "source": ["# Gradio Demo: waveform"]}, {"cell_type": "code", "execution_count": null, "id": 272996653310673477252411125948039410165, "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": 288918539441861185822528903084949547379, "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import random\n", "\n", "\n", "COLORS = [\n", " [\"#ff0000\", \"#00ff00\"],\n", " [\"#00ff00\", \"#0000ff\"],\n", " [\"#0000ff\", \"#ff0000\"],\n", "] \n", "\n", "def audio_waveform(audio, image):\n", " return (\n", " audio,\n", " gr.make_waveform(audio),\n", " gr.make_waveform(audio, bg_image=image, bars_color=random.choice(COLORS)),\n", " )\n", "\n", "\n", "gr.Interface(\n", " audio_waveform,\n", " inputs=[gr.Audio(), gr.Image(type=\"filepath\")],\n", " outputs=[\n", " gr.Audio(),\n", " gr.Video(),\n", " gr.Video(),\n", " ],\n", ").launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

demo/waveform/run.py Normal file

@@ -0,0 +1,27 @@
import gradio as gr
import random

COLORS = [
    ["#ff0000", "#00ff00"],
    ["#00ff00", "#0000ff"],
    ["#0000ff", "#ff0000"],
]

def audio_waveform(audio, image):
    return (
        audio,
        gr.make_waveform(audio),
        gr.make_waveform(audio, bg_image=image, bars_color=random.choice(COLORS)),
    )

gr.Interface(
    audio_waveform,
    inputs=[gr.Audio(), gr.Image(type="filepath")],
    outputs=[
        gr.Audio(),
        gr.Video(),
        gr.Video(),
    ],
).launch()
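The demo's `COLORS` pairs feed the gradient path of `make_waveform` (the `get_color_gradient` helper in the `gradio/utils.py` diff further down). A standalone sketch of that interpolation, re-implemented here for illustration:

```python
import numpy as np

def color_gradient(c1: str, c2: str, n: int) -> list[str]:
    """Linearly interpolate n hex colors from c1 to c2 (illustrative helper)."""
    rgb1 = np.array([int(c1[i : i + 2], 16) for i in (1, 3, 5)]) / 255
    rgb2 = np.array([int(c2[i : i + 2], 16) for i in (1, 3, 5)]) / 255
    return [
        "#" + "".join(f"{int(round(v * 255)):02x}" for v in (1 - m) * rgb1 + m * rgb2)
        for m in (k / (n - 1) for k in range(n))
    ]

print(color_gradient("#ff0000", "#00ff00", 3))  # ['#ff0000', '#808000', '#00ff00']
```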

gradio/__init__.py

@@ -78,6 +78,7 @@ from gradio.templates import (
    TextArea,
    Webcam,
)
from gradio.utils import make_waveform

current_pkg_version = pkgutil.get_data(__name__, "version.txt").decode("ascii").strip()
__version__ = current_pkg_version

gradio/utils.py

@@ -11,7 +11,9 @@ import os
import pkgutil
import random
import re
import subprocess
import sys
import tempfile
import time
import typing
import warnings
@@ -36,12 +38,17 @@ from typing import (
import aiohttp
import fsspec.asyn
import httpx
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL
import requests
from pydantic import BaseModel, Json, parse_obj_as
import gradio
from gradio import processing_utils
from gradio.context import Context
from gradio.documentation import document, set_documentation_group
if TYPE_CHECKING:  # Only import for type checking (is False at runtime).
    from gradio.blocks import BlockContext
@@ -808,6 +815,129 @@ class TupleNoPrint(tuple):
        return ""

set_documentation_group("component-helpers")

@document()
def make_waveform(
    audio: str | Tuple[int, np.ndarray],
    *,
    bg_color: str = "#f3f4f6",
    bg_image: str = None,
    fg_alpha: float = 0.75,
    bars_color: str | Tuple[str, str] = ("#fbbf24", "#ea580c"),
    bar_count: int = 50,
    bar_width: float = 0.6,
):
    """
    Generates a waveform video from an audio file. Useful for creating an easy to share audio visualization. The output should be passed into a `gr.Video` component.
    Parameters:
        audio: Audio file path or tuple of (sample_rate, audio_data)
        bg_color: Background color of waveform (ignored if bg_image is provided)
        bg_image: Background image of waveform
        fg_alpha: Opacity of foreground waveform
        bars_color: Color of waveform bars. Can be a single color or a tuple of (start_color, end_color) of gradient
        bar_count: Number of bars in waveform
        bar_width: Width of bars in waveform. 1 represents full width, 0.5 represents half width, etc.
    Returns:
        A filepath to the output video.
    """
    if isinstance(audio, str):
        audio_file = audio
        audio = processing_utils.audio_from_file(audio)
    else:
        tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        processing_utils.audio_to_file(audio[0], audio[1], tmp_wav.name)
        audio_file = tmp_wav.name
    duration = round(len(audio[1]) / audio[0], 4)

    # Helper methods to create waveform
    def hex_to_RGB(hex_str):
        return [int(hex_str[i : i + 2], 16) for i in range(1, 6, 2)]

    def get_color_gradient(c1, c2, n):
        assert n > 1
        c1_rgb = np.array(hex_to_RGB(c1)) / 255
        c2_rgb = np.array(hex_to_RGB(c2)) / 255
        mix_pcts = [x / (n - 1) for x in range(n)]
        rgb_colors = [((1 - mix) * c1_rgb + (mix * c2_rgb)) for mix in mix_pcts]
        return [
            "#" + "".join([format(int(round(val * 255)), "02x") for val in item])
            for item in rgb_colors
        ]

    # Reshape audio to have a fixed number of bars
    samples = audio[1]
    if len(samples.shape) > 1:
        samples = np.mean(samples, 1)
    bins_to_pad = bar_count - (len(samples) % bar_count)
    samples = np.pad(samples, [(0, bins_to_pad)])
    samples = np.reshape(samples, (bar_count, -1))
    samples = np.abs(samples)
    samples = np.max(samples, 1)

    matplotlib.use("Agg")
    plt.clf()
    # Plot waveform
    color = (
        bars_color
        if isinstance(bars_color, str)
        else get_color_gradient(bars_color[0], bars_color[1], bar_count)
    )
    plt.bar(
        np.arange(0, bar_count),
        samples * 2,
        bottom=(-1 * samples),
        width=bar_width,
        color=color,
    )
    plt.axis("off")
    plt.margins(x=0)
    tmp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    savefig_kwargs = {"bbox_inches": "tight"}
    if bg_image is not None:
        savefig_kwargs["transparent"] = True
    else:
        savefig_kwargs["facecolor"] = bg_color
    plt.savefig(tmp_img.name, **savefig_kwargs)
    waveform_img = PIL.Image.open(tmp_img.name)
    waveform_img = waveform_img.resize((1000, 200))

    # Composite waveform with background image
    if bg_image is not None:
        waveform_array = np.array(waveform_img)
        waveform_array[:, :, 3] = waveform_array[:, :, 3] * fg_alpha
        waveform_img = PIL.Image.fromarray(waveform_array)

        bg_img = PIL.Image.open(bg_image)
        waveform_width, waveform_height = waveform_img.size
        bg_width, bg_height = bg_img.size
        if waveform_width != bg_width:
            bg_img = bg_img.resize(
                (waveform_width, 2 * int(bg_height * waveform_width / bg_width / 2))
            )
            bg_width, bg_height = bg_img.size
        composite_height = max(bg_height, waveform_height)
        composite = PIL.Image.new("RGBA", (waveform_width, composite_height), "#FFFFFF")
        composite.paste(bg_img, (0, composite_height - bg_height))
        composite.paste(
            waveform_img, (0, composite_height - waveform_height), waveform_img
        )
        composite.save(tmp_img.name)
        img_width, img_height = composite.size
    else:
        img_width, img_height = waveform_img.size
        waveform_img.save(tmp_img.name)

    # Convert waveform to video with ffmpeg
    output_mp4 = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)

    ffmpeg_cmd = f"""ffmpeg -loop 1 -i {tmp_img.name} -i {audio_file} -vf "color=c=#FFFFFF77:s={img_width}x{img_height}[bar];[0][bar]overlay=-w+(w/{duration})*t:H-h:shortest=1" -t {duration} -y {output_mp4.name}"""

    subprocess.call(ffmpeg_cmd, shell=True)
    return output_mp4.name

def tex2svg(formula, *args):
    FONTSIZE = 20
    DPI = 300
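To make the bar-binning step in `make_waveform` concrete, here is the same pad/reshape/abs/max pipeline run on toy data (values are illustrative):

```python
import numpy as np

bar_count = 4
samples = np.array([0.1, -0.9, 0.3, 0.2, -0.4, 0.8, 0.05, -0.1, 0.6, -0.2])

# Pad to a multiple of bar_count (note: if the length already divides
# evenly, this pads a full extra bin of zeros, as in the code above).
bins_to_pad = bar_count - (len(samples) % bar_count)  # 2 here
samples = np.pad(samples, [(0, bins_to_pad)])

# Each bar's height is the peak absolute amplitude within its bin.
heights = np.abs(samples.reshape(bar_count, -1)).max(axis=1)
print(heights)  # [0.9 0.8 0.6 0.2]
```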

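The ffmpeg call at the end of `make_waveform` is what animates playback progress: the `color` source generates a translucent white strip the size of the frame, and `overlay` slides it from fully off-screen left (x = -w at t = 0) to x = 0 at t = duration. A sketch with the template values filled in (paths and dimensions are illustrative):

```python
import subprocess

img, audio_file, duration = "waveform.png", "speech.wav", 12.5  # hypothetical
width, height = 1000, 400

# -loop 1 turns the still image into a video stream; the strip sweeps
# the full frame width over the audio's duration, tracking playback.
cmd = (
    f'ffmpeg -loop 1 -i {img} -i {audio_file} '
    f'-vf "color=c=#FFFFFF77:s={width}x{height}[bar];'
    f'[0][bar]overlay=-w+(w/{duration})*t:H-h:shortest=1" '
    f'-t {duration} -y out.mp4'
)
subprocess.call(cmd, shell=True)
```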
test/test_blocks.py

@@ -323,7 +323,7 @@ class TestComponentsInBlocks:
        )
        output = demo.postprocess_data(
            0, [gr.update(value=None) for _ in io_components], state=None
            0, [gr.update(value=None) for _ in io_components], state={}
        )
        assert all(
            [o["value"] == c.postprocess(None) for o, c in zip(output, io_components)]
@@ -339,7 +339,7 @@ class TestComponentsInBlocks:
            outputs=text,
        )
        output = demo.postprocess_data(0, gr.update(value="NO_VALUE"), state=None)
        output = demo.postprocess_data(0, gr.update(value="NO_VALUE"), state={})
        assert output[0]["value"] == "NO_VALUE"

    def test_blocks_returns_correct_output_dict_single_key(self):
@@ -353,12 +353,10 @@ class TestComponentsInBlocks:
        update.click(update_values, inputs=[num], outputs=[num2])

        output = demo.postprocess_data(
            0, {num2: gr.Number.update(value=42)}, state=None
        )
        output = demo.postprocess_data(0, {num2: gr.Number.update(value=42)}, state={})
        assert output[0]["value"] == 42

        output = demo.postprocess_data(0, {num2: 23}, state=None)
        output = demo.postprocess_data(0, {num2: 23}, state={})
        assert output[0] == 23

    @pytest.mark.asyncio

test/test_components.py

@@ -1170,6 +1170,14 @@ class TestVideo:
        iface = gr.Interface(lambda x: x, "video", "playable_video")
        assert iface(x_video).endswith(".mp4")

    def test_with_waveform(self):
        """
        Interface, process
        """
        x_audio = media_data.BASE64_AUDIO["name"]
        iface = gr.Interface(lambda x: gr.make_waveform(x), "audio", "video")
        assert iface(x_audio).endswith(".mp4")

    def test_video_postprocess_converts_to_playable_format(self):
        test_file_dir = pathlib.Path(pathlib.Path(__file__).parent, "test_files")
        # This file has a playable container but not playable codec