Audio waveform (#2706)

* utils func

* changes

* changes

* changes

* notebook

* fix

* changes

* change

* changes

* changes

* notebook fix

* changes

* changes

* changes

* change

* fix

* revert backend changes, gr.make_waveform

* notebook

* Update CHANGELOG.md

Co-authored-by: Abubakar Abid <abubakar@huggingface.co>

* format fix

* changes

* changes

* revert pnpm

* change

* change

* changes

* changes

* changes

Co-authored-by: Ali Abid <aabid94@gmail.com>
Co-authored-by: Abubakar Abid <abubakar@huggingface.co>
Dawood Khan 2022-12-15 13:35:22 -05:00 committed by GitHub
parent 074bf909ee
commit 714ab2cc09
7 changed files with 176 additions and 7 deletions

CHANGELOG.md

@@ -1,7 +1,11 @@
# Upcoming Release
## New Features:
No changes to highlight.
### Add Waveform Visual Support to Audio
Adds a `gr.make_waveform()` function that creates a waveform video by combining an audio file and an optional background image by [@dawoodkhan82](http://github.com/dawoodkhan82) and [@aliabid94](http://github.com/aliabid94) in [PR 2706](https://github.com/gradio-app/gradio/pull/2706). Helpful for making audio outputs much more shareable.
![waveform screenrecording](https://user-images.githubusercontent.com/7870876/206062396-164a5e71-451a-4fe0-94a7-cbe9269d57e6.gif)
## Bug Fixes:
No changes to highlight.
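A minimal sketch of the new API in use (the audio path and color here are illustrative, not taken from this PR):

```python
import gradio as gr

# Turn a local audio file into a shareable waveform video; the helper
# returns a filepath to an .mp4 that can be passed to a gr.Video component.
video_path = gr.make_waveform("speech.wav", bars_color="#ea580c")
print(video_path)
```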

demo/waveform/run.ipynb Normal file

@@ -0,0 +1 @@
{"cells": [{"cell_type": "markdown", "id": 302934307671667531413257853548643485645, "metadata": {}, "source": ["# Gradio Demo: waveform"]}, {"cell_type": "code", "execution_count": null, "id": 272996653310673477252411125948039410165, "metadata": {}, "outputs": [], "source": ["!pip install -q gradio "]}, {"cell_type": "code", "execution_count": null, "id": 288918539441861185822528903084949547379, "metadata": {}, "outputs": [], "source": ["import gradio as gr\n", "import random\n", "\n", "\n", "COLORS = [\n", " [\"#ff0000\", \"#00ff00\"],\n", " [\"#00ff00\", \"#0000ff\"],\n", " [\"#0000ff\", \"#ff0000\"],\n", "] \n", "\n", "def audio_waveform(audio, image):\n", " return (\n", " audio,\n", " gr.make_waveform(audio),\n", " gr.make_waveform(audio, bg_image=image, bars_color=random.choice(COLORS)),\n", " )\n", "\n", "\n", "gr.Interface(\n", " audio_waveform,\n", " inputs=[gr.Audio(), gr.Image(type=\"filepath\")],\n", " outputs=[\n", " gr.Audio(),\n", " gr.Video(),\n", " gr.Video(),\n", " ],\n", ").launch()\n"]}], "metadata": {}, "nbformat": 4, "nbformat_minor": 5}

demo/waveform/run.py Normal file

@@ -0,0 +1,27 @@
import gradio as gr
import random

COLORS = [
    ["#ff0000", "#00ff00"],
    ["#00ff00", "#0000ff"],
    ["#0000ff", "#ff0000"],
]

def audio_waveform(audio, image):
    return (
        audio,
        gr.make_waveform(audio),
        gr.make_waveform(audio, bg_image=image, bars_color=random.choice(COLORS)),
    )

gr.Interface(
    audio_waveform,
    inputs=[gr.Audio(), gr.Image(type="filepath")],
    outputs=[
        gr.Audio(),
        gr.Video(),
        gr.Video(),
    ],
).launch()
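The demo's `COLORS` pairs feed the gradient path of `make_waveform` (the `get_color_gradient` helper in the `gradio/utils.py` diff further down). A standalone sketch of that interpolation, re-implemented here for illustration:

```python
import numpy as np

def color_gradient(c1: str, c2: str, n: int) -> list[str]:
    """Linearly interpolate n hex colors from c1 to c2 (illustrative helper)."""
    rgb1 = np.array([int(c1[i : i + 2], 16) for i in (1, 3, 5)]) / 255
    rgb2 = np.array([int(c2[i : i + 2], 16) for i in (1, 3, 5)]) / 255
    return [
        "#" + "".join(f"{int(round(v * 255)):02x}" for v in (1 - m) * rgb1 + m * rgb2)
        for m in (k / (n - 1) for k in range(n))
    ]

print(color_gradient("#ff0000", "#00ff00", 3))  # ['#ff0000', '#808000', '#00ff00']
```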

gradio/__init__.py

@@ -78,6 +78,7 @@ from gradio.templates import (
    TextArea,
    Webcam,
)
from gradio.utils import make_waveform

current_pkg_version = pkgutil.get_data(__name__, "version.txt").decode("ascii").strip()
__version__ = current_pkg_version

gradio/utils.py

@@ -11,7 +11,9 @@ import os
import pkgutil
import random
import re
import subprocess
import sys
import tempfile
import time
import typing
import warnings
@@ -36,12 +38,17 @@ from typing import (
import aiohttp
import fsspec.asyn
import httpx
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import PIL
import requests
from pydantic import BaseModel, Json, parse_obj_as
import gradio
from gradio import processing_utils
from gradio.context import Context
from gradio.documentation import document, set_documentation_group
if TYPE_CHECKING:  # Only import for type checking (is False at runtime).
    from gradio.blocks import BlockContext
@@ -808,6 +815,129 @@ class TupleNoPrint(tuple):
        return ""

set_documentation_group("component-helpers")

@document()
def make_waveform(
    audio: str | Tuple[int, np.ndarray],
    *,
    bg_color: str = "#f3f4f6",
    bg_image: str = None,
    fg_alpha: float = 0.75,
    bars_color: str | Tuple[str, str] = ("#fbbf24", "#ea580c"),
    bar_count: int = 50,
    bar_width: float = 0.6,
):
    """
    Generates a waveform video from an audio file. Useful for creating an easy to share audio visualization. The output should be passed into a `gr.Video` component.
    Parameters:
        audio: Audio file path or tuple of (sample_rate, audio_data)
        bg_color: Background color of waveform (ignored if bg_image is provided)
        bg_image: Background image of waveform
        fg_alpha: Opacity of foreground waveform
        bars_color: Color of waveform bars. Can be a single color or a tuple of (start_color, end_color) of gradient
        bar_count: Number of bars in waveform
        bar_width: Width of bars in waveform. 1 represents full width, 0.5 represents half width, etc.
    Returns:
        A filepath to the output video.
    """
    if isinstance(audio, str):
        audio_file = audio
        audio = processing_utils.audio_from_file(audio)
    else:
        tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        processing_utils.audio_to_file(audio[0], audio[1], tmp_wav.name)
        audio_file = tmp_wav.name
    duration = round(len(audio[1]) / audio[0], 4)

    # Helper methods to create waveform
    def hex_to_RGB(hex_str):
        return [int(hex_str[i : i + 2], 16) for i in range(1, 6, 2)]

    def get_color_gradient(c1, c2, n):
        assert n > 1
        c1_rgb = np.array(hex_to_RGB(c1)) / 255
        c2_rgb = np.array(hex_to_RGB(c2)) / 255
        mix_pcts = [x / (n - 1) for x in range(n)]
        rgb_colors = [((1 - mix) * c1_rgb + (mix * c2_rgb)) for mix in mix_pcts]
        return [
            "#" + "".join([format(int(round(val * 255)), "02x") for val in item])
            for item in rgb_colors
        ]

    # Reshape audio to have a fixed number of bars
    samples = audio[1]
    if len(samples.shape) > 1:
        samples = np.mean(samples, 1)
    bins_to_pad = bar_count - (len(samples) % bar_count)
    samples = np.pad(samples, [(0, bins_to_pad)])
    samples = np.reshape(samples, (bar_count, -1))
    samples = np.abs(samples)
    samples = np.max(samples, 1)

    matplotlib.use("Agg")
    plt.clf()
    # Plot waveform
    color = (
        bars_color
        if isinstance(bars_color, str)
        else get_color_gradient(bars_color[0], bars_color[1], bar_count)
    )
    plt.bar(
        np.arange(0, bar_count),
        samples * 2,
        bottom=(-1 * samples),
        width=bar_width,
        color=color,
    )
    plt.axis("off")
    plt.margins(x=0)
    tmp_img = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    savefig_kwargs = {"bbox_inches": "tight"}
    if bg_image is not None:
        savefig_kwargs["transparent"] = True
    else:
        savefig_kwargs["facecolor"] = bg_color
    plt.savefig(tmp_img.name, **savefig_kwargs)
    waveform_img = PIL.Image.open(tmp_img.name)
    waveform_img = waveform_img.resize((1000, 200))

    # Composite waveform with background image
    if bg_image is not None:
        waveform_array = np.array(waveform_img)
        waveform_array[:, :, 3] = waveform_array[:, :, 3] * fg_alpha
        waveform_img = PIL.Image.fromarray(waveform_array)

        bg_img = PIL.Image.open(bg_image)
        waveform_width, waveform_height = waveform_img.size
        bg_width, bg_height = bg_img.size
        if waveform_width != bg_width:
            bg_img = bg_img.resize(
                (waveform_width, 2 * int(bg_height * waveform_width / bg_width / 2))
            )
            bg_width, bg_height = bg_img.size
        composite_height = max(bg_height, waveform_height)
        composite = PIL.Image.new("RGBA", (waveform_width, composite_height), "#FFFFFF")
        composite.paste(bg_img, (0, composite_height - bg_height))
        composite.paste(
            waveform_img, (0, composite_height - waveform_height), waveform_img
        )
        composite.save(tmp_img.name)
        img_width, img_height = composite.size
    else:
        img_width, img_height = waveform_img.size
        waveform_img.save(tmp_img.name)

    # Convert waveform to video with ffmpeg
    output_mp4 = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)

    ffmpeg_cmd = f"""ffmpeg -loop 1 -i {tmp_img.name} -i {audio_file} -vf "color=c=#FFFFFF77:s={img_width}x{img_height}[bar];[0][bar]overlay=-w+(w/{duration})*t:H-h:shortest=1" -t {duration} -y {output_mp4.name}"""

    subprocess.call(ffmpeg_cmd, shell=True)
    return output_mp4.name

def tex2svg(formula, *args):
    FONTSIZE = 20
    DPI = 300
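To make the bar-binning step in `make_waveform` concrete, here is the same pad/reshape/abs/max pipeline run on toy data (values are illustrative):

```python
import numpy as np

bar_count = 4
samples = np.array([0.1, -0.9, 0.3, 0.2, -0.4, 0.8, 0.05, -0.1, 0.6, -0.2])

# Pad to a multiple of bar_count (note: if the length already divides
# evenly, this pads a full extra bin of zeros, as in the code above).
bins_to_pad = bar_count - (len(samples) % bar_count)  # 2 here
samples = np.pad(samples, [(0, bins_to_pad)])

# Each bar's height is the peak absolute amplitude within its bin.
heights = np.abs(samples.reshape(bar_count, -1)).max(axis=1)
print(heights)  # [0.9 0.8 0.6 0.2]
```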

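The ffmpeg call at the end of `make_waveform` is what animates playback progress: the `color` source generates a translucent white strip the size of the frame, and `overlay` slides it from fully off-screen left (x = -w at t = 0) to x = 0 at t = duration. A sketch with the template values filled in (paths and dimensions are illustrative):

```python
import subprocess

img, audio_file, duration = "waveform.png", "speech.wav", 12.5  # hypothetical
width, height = 1000, 400

# -loop 1 turns the still image into a video stream; the strip sweeps
# the full frame width over the audio's duration, tracking playback.
cmd = (
    f'ffmpeg -loop 1 -i {img} -i {audio_file} '
    f'-vf "color=c=#FFFFFF77:s={width}x{height}[bar];'
    f'[0][bar]overlay=-w+(w/{duration})*t:H-h:shortest=1" '
    f'-t {duration} -y out.mp4'
)
subprocess.call(cmd, shell=True)
```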
test/test_blocks.py

@@ -323,7 +323,7 @@ class TestComponentsInBlocks:
        )
        output = demo.postprocess_data(
            0, [gr.update(value=None) for _ in io_components], state=None
            0, [gr.update(value=None) for _ in io_components], state={}
        )
        assert all(
            [o["value"] == c.postprocess(None) for o, c in zip(output, io_components)]
@@ -339,7 +339,7 @@ class TestComponentsInBlocks:
            outputs=text,
        )
        output = demo.postprocess_data(0, gr.update(value="NO_VALUE"), state=None)
        output = demo.postprocess_data(0, gr.update(value="NO_VALUE"), state={})
        assert output[0]["value"] == "NO_VALUE"

    def test_blocks_returns_correct_output_dict_single_key(self):
@@ -353,12 +353,10 @@ class TestComponentsInBlocks:
        update.click(update_values, inputs=[num], outputs=[num2])

        output = demo.postprocess_data(
            0, {num2: gr.Number.update(value=42)}, state=None
        )
        output = demo.postprocess_data(0, {num2: gr.Number.update(value=42)}, state={})
        assert output[0]["value"] == 42

        output = demo.postprocess_data(0, {num2: 23}, state=None)
        output = demo.postprocess_data(0, {num2: 23}, state={})
        assert output[0] == 23

    @pytest.mark.asyncio

test/test_components.py

@@ -1170,6 +1170,14 @@ class TestVideo:
        iface = gr.Interface(lambda x: x, "video", "playable_video")
        assert iface(x_video).endswith(".mp4")

    def test_with_waveform(self):
        """
        Interface, process
        """
        x_audio = media_data.BASE64_AUDIO["name"]
        iface = gr.Interface(lambda x: gr.make_waveform(x), "audio", "video")
        assert iface(x_audio).endswith(".mp4")

    def test_video_postprocess_converts_to_playable_format(self):
        test_file_dir = pathlib.Path(pathlib.Path(__file__).parent, "test_files")
        # This file has a playable container but not playable codec