diff --git a/demo/files/video.avi b/demo/files/video.avi new file mode 100644 index 0000000000..dabce398b2 Binary files /dev/null and b/demo/files/video.avi differ diff --git a/demo/files/video.mp4 b/demo/files/video.mp4 new file mode 100644 index 0000000000..b11552f9cb Binary files /dev/null and b/demo/files/video.mp4 differ diff --git a/demo/video_flip.py b/demo/video_flip.py index f9d4298dbb..89fa4f9b2b 100644 --- a/demo/video_flip.py +++ b/demo/video_flip.py @@ -4,7 +4,11 @@ def video_flip(video): return video iface = gr.Interface( - video_flip, "video", "playable_video", theme="huggingface") + video_flip, "video", "playable_video", theme="huggingface", + examples=[ + ["files/video.avi"], + ["files/video.mp4"] + ]) if __name__ == "__main__": iface.launch() diff --git a/frontend/package-lock.json b/frontend/package-lock.json index e323c37bee..d3f38fb34e 100644 --- a/frontend/package-lock.json +++ b/frontend/package-lock.json @@ -11666,11 +11666,18 @@ "integrity": "sha512-QBmA/G2y+IfeS4oktet3qRZ+P5kPhCKRXxXnQEudYqUaEioAU1/Lq2us3D/t1Jfo4hE9REQPrbB7K5sOczJVIw==" }, "mime-types": { - "version": "2.1.30", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.30.tgz", - "integrity": "sha512-crmjA4bLtR8m9qLpHvgxSChT+XoSlZi8J4n/aIdn3z92e/U47Z0V/yl+Wh9W046GgFVAmoNR/fmdbZYcSSIUeg==", + "version": "2.1.33", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.33.tgz", + "integrity": "sha512-plLElXp7pRDd0bNZHw+nMd52vRYjLwQjygaNg7ddJ2uJtTlmnTCjWuPKxVu6//AdaRuME84SvLW91sIkBqGT0g==", "requires": { - "mime-db": "1.47.0" + "mime-db": "1.50.0" + }, + "dependencies": { + "mime-db": { + "version": "1.50.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.50.0.tgz", + "integrity": "sha512-9tMZCDlYHqeERXEHO9f/hKfNXhre5dK2eE/krIvUjZbS2KPcqGDfNShIWS1uW9XOTKQKqK6qbeOci18rbfW77A==" + } } }, "mimic-fn": { diff --git a/frontend/package.json b/frontend/package.json index 98e3c3489f..1eac5ea556 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -15,6 +15,7 @@ "fabric": "^4.5.0", "html2canvas-objectfit-fix": "^1.2.0", "jspreadsheet-ce": "^4.7.3", + "mime-types": "^2.1.33", "plotly.js": "^2.3.1", "prettier-eslint": "^13.0.0", "prettier-eslint-cli": "^5.0.1", diff --git a/frontend/src/components/component_example.jsx b/frontend/src/components/component_example.jsx index fb2defe9f8..e6bec30c60 100644 --- a/frontend/src/components/component_example.jsx +++ b/frontend/src/components/component_example.jsx @@ -12,7 +12,11 @@ export default class ComponentExample extends React.Component { export class FileComponentExample extends ComponentExample { static async preprocess(x, examples_dir) { - return examples_dir + "/" + x; + return { + "name": x, + "data": examples_dir + "/" + x, + "is_example": true + } } } diff --git a/frontend/src/components/input/audio.jsx b/frontend/src/components/input/audio.jsx index a5782252d5..81b255916d 100644 --- a/frontend/src/components/input/audio.jsx +++ b/frontend/src/components/input/audio.jsx @@ -1,6 +1,6 @@ import React from "react"; import BaseComponent from "../base_component"; -import { DataURLComponentExample } from "../component_example"; +import { FileComponentExample } from "../component_example"; import Recorder from "recorder-js"; import { getSaliencyColor } from "../../utils"; @@ -147,12 +147,12 @@ class AudioInput extends BaseComponent { let file = files[0]; ReaderObj.readAsDataURL(file); ReaderObj.onloadend = function () { - component.props.handleChange({ "name": file.name, "data": this.result }); + 
component.props.handleChange({ "name": file.name, "data": this.result, "is_example": false }); }; }; } -class AudioInputExample extends DataURLComponentExample { +class AudioInputExample extends FileComponentExample { render() { return
{this.props.value}
; } diff --git a/frontend/src/components/input/file.jsx b/frontend/src/components/input/file.jsx index 8919a03de9..0dee67afcb 100644 --- a/frontend/src/components/input/file.jsx +++ b/frontend/src/components/input/file.jsx @@ -98,7 +98,7 @@ class FileInput extends BaseComponent { name: file.name, size: file.size, data: reader.result, - is_local_example: false + is_example: false }); if (this.file_data.length === file_count) { this.handleChange(this.file_data); diff --git a/frontend/src/components/input/video.jsx b/frontend/src/components/input/video.jsx index 81c25a22b4..e61555a931 100644 --- a/frontend/src/components/input/video.jsx +++ b/frontend/src/components/input/video.jsx @@ -1,5 +1,6 @@ import React from "react"; import BaseComponent from "../base_component"; +import { FileComponentExample } from "../component_example"; import ComponentExample from "../component_example"; import { isPlayable } from "../../utils"; @@ -25,8 +26,7 @@ class VideoInput extends BaseComponent { evt.stopPropagation(); }; if (this.props.value != null) { - if (isPlayable("video", this.props.value["data"].substring( - 5, this.props.value["data"].indexOf(";")))) { + if (isPlayable("video", this.props.value["name"])) { return (
@@ -93,15 +93,36 @@ class VideoInput extends BaseComponent { let file = files[0]; ReaderObj.readAsDataURL(file); ReaderObj.onloadend = function () { - component.props.handleChange({ "name": file.name, "data": this.result }); + component.props.handleChange({ "name": file.name, "data": this.result, "is_example": false }); }; } } -class VideoInputExample extends ComponentExample { +class VideoInputExample extends FileComponentExample { + constructor(props) { + super(props); + this.video = React.createRef(); + } render() { - return {this.props.value}; + if (isPlayable("video", this.props.value)) { + return
+
+ +
+
+ } else { + return
{this.props.value}
+ } } } + export { VideoInput, VideoInputExample }; diff --git a/frontend/src/components/output/video.jsx b/frontend/src/components/output/video.jsx index a146d7ae6e..7ec9bc482f 100644 --- a/frontend/src/components/output/video.jsx +++ b/frontend/src/components/output/video.jsx @@ -6,8 +6,7 @@ import { isPlayable } from "../../utils"; class VideoOutput extends BaseComponent { render() { if (this.props.value) { - if (isPlayable("video", this.props.value["data"].substring( - 5, this.props.value["data"].indexOf(";")))) { + if (isPlayable("video", this.props.value["name"])) { return
diff --git a/frontend/src/themes/defaults.scss b/frontend/src/themes/defaults.scss index 10ad234e0f..a0ddc9fba1 100644 --- a/frontend/src/themes/defaults.scss +++ b/frontend/src/themes/defaults.scss @@ -428,6 +428,14 @@ @apply w-full h-full object-contain; } } + .input_video_example { + .video_holder { + @apply h-36 object-contain flex justify-center; + } + .video_preview { + @apply w-full; + } + } .input_file { @apply w-full h-80; .upload_zone { diff --git a/frontend/src/themes/grass.scss b/frontend/src/themes/grass.scss index 6b424aac6b..de5e272dec 100644 --- a/frontend/src/themes/grass.scss +++ b/frontend/src/themes/grass.scss @@ -380,6 +380,14 @@ html { @apply w-full h-full object-contain; } } + .input_video_example { + .video_holder { + @apply h-36 object-contain flex justify-center; + } + .video_preview { + @apply w-full; + } + } .input_file { @apply w-full h-48; .upload_zone { diff --git a/frontend/src/themes/huggingface.scss b/frontend/src/themes/huggingface.scss index 0a0efcac9b..0174ac0468 100644 --- a/frontend/src/themes/huggingface.scss +++ b/frontend/src/themes/huggingface.scss @@ -370,6 +370,14 @@ @apply w-full h-full object-contain; } } + .input_video_example { + .video_holder { + @apply h-36 object-contain flex justify-center; + } + .video_preview { + @apply w-full; + } + } .input_file { @apply w-full h-80; .upload_zone { diff --git a/frontend/src/themes/peach.scss b/frontend/src/themes/peach.scss index 0ad0d5d78f..ed8dc63bcf 100644 --- a/frontend/src/themes/peach.scss +++ b/frontend/src/themes/peach.scss @@ -383,6 +383,14 @@ @apply w-full h-full object-contain; } } + .input_video_example { + .video_holder { + @apply h-36 object-contain flex justify-center; + } + .video_preview { + @apply w-full; + } + } .input_file { @apply w-full h-80; .upload_zone { diff --git a/frontend/src/utils.jsx b/frontend/src/utils.jsx index 7446bd0832..f9e60fe867 100644 --- a/frontend/src/utils.jsx +++ b/frontend/src/utils.jsx @@ -1,3 +1,5 @@ +var mime = require('mime-types') + export function prettyBytes(bytes) { let units = ["B", "KB", "MB", "GB", "PB"]; let i = 0; @@ -162,12 +164,14 @@ export function CSVToArray(strData, strDelimiter) { return arrData; } -export function isPlayable(data_type, mime_type) { +export function isPlayable(data_type, file_name) { if (data_type == "audio") { let audio_element = new Audio(); + let mime_type = mime.lookup(file_name) return audio_element.canPlayType(mime_type) != ""; } else { let video_element = document.createElement("video"); + let mime_type = mime.lookup(file_name) return video_element.canPlayType(mime_type) != ""; } } \ No newline at end of file diff --git a/gradio.egg-info/PKG-INFO b/gradio.egg-info/PKG-INFO index a58dc36f21..8498130add 100644 --- a/gradio.egg-info/PKG-INFO +++ b/gradio.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 1.0 Name: gradio -Version: 2.3.7b0 +Version: 2.3.7b1 Summary: Python library for easily interacting with trained machine learning models Home-page: https://github.com/gradio-app/gradio-UI Author: Abubakar Abid diff --git a/gradio.egg-info/requires.txt b/gradio.egg-info/requires.txt index b6135b3e41..ed319be6ca 100644 --- a/gradio.egg-info/requires.txt +++ b/gradio.egg-info/requires.txt @@ -11,5 +11,5 @@ pandas paramiko pillow pycryptodome +pydub requests -scipy diff --git a/gradio/frontend/asset-manifest.json b/gradio/frontend/asset-manifest.json index 86f9cd1123..43a8eda7fe 100644 --- a/gradio/frontend/asset-manifest.json +++ b/gradio/frontend/asset-manifest.json @@ -1,6 +1,6 @@ { "files": { - "main.css": 
"/static/css/main.61fa3417.css", + "main.css": "/static/css/main.dcd72078.css", "main.js": "/static/bundle.js", "index.html": "/index.html", "static/bundle.js.LICENSE.txt": "/static/bundle.js.LICENSE.txt", @@ -11,7 +11,7 @@ }, "entrypoints": [ "static/bundle.css", - "static/css/main.61fa3417.css", + "static/css/main.dcd72078.css", "static/bundle.js" ] } \ No newline at end of file diff --git a/gradio/frontend/index.html b/gradio/frontend/index.html index 235a73f311..8ba5973edd 100644 --- a/gradio/frontend/index.html +++ b/gradio/frontend/index.html @@ -8,4 +8,4 @@ window.config = {{ config|tojson }}; } catch (e) { window.config = {}; - }Gradio
\ No newline at end of file + }Gradio
\ No newline at end of file diff --git a/gradio/inputs.py b/gradio/inputs.py index dd8c6f9553..fb887b7987 100644 --- a/gradio/inputs.py +++ b/gradio/inputs.py @@ -9,7 +9,7 @@ import warnings from gradio.component import Component import numpy as np import PIL -import scipy.io.wavfile +from pydub import AudioSegment from gradio import processing_utils, test_data import pandas as pd from ffmpy import FFmpeg @@ -877,9 +877,12 @@ class Video(InputComponent): def preprocess(self, x): if x is None: return x - file_name, file_data = x["name"], x["data"] - file = processing_utils.decode_base64_to_file( - file_data, filename=file_name) + file_name, file_data, is_example = x["name"], x["data"], x["is_example"] + if is_example: + file = processing_utils.create_tmp_copy_of_file(file_name) + else: + file = processing_utils.decode_base64_to_file( + file_data, file_path=file_name) file_name = file.name uploaded_format = file_name.split(".")[-1].lower() if self.type is not None and uploaded_format != self.type: @@ -915,7 +918,7 @@ class Audio(InputComponent): """ Parameters: source (str): Source of audio. "upload" creates a box where user can drop an audio file, "microphone" creates a microphone input. - type (str): Type of value to be returned by component. "numpy" returns a 2-set tuple with an integer sample_rate and the data numpy.array of shape (samples, 2), "file" returns a temporary file object whose path can be retrieved by file_obj.name, "mfcc" returns the mfcc coefficients of the input audio. + type (str): Type of value to be returned by component. "numpy" returns a 2-set tuple with an integer sample_rate and the data numpy.array of shape (samples, 2), "file" returns a temporary file object whose path can be retrieved by file_obj.name. label (str): component name in interface. optional (bool): If True, the interface can be submitted with no uploaded audio, in which case the input value is None. 
""" @@ -923,7 +926,8 @@ class Audio(InputComponent): requires_permissions = source == "microphone" self.type = type self.optional = optional - self.test_input = {"name": "sample.wav", "data": test_data.BASE64_AUDIO} + self.test_input = {"name": "sample.wav", + "data": test_data.BASE64_AUDIO, "is_example": False} self.interpret_by_tokens = True super().__init__(label, requires_permissions) @@ -947,14 +951,17 @@ class Audio(InputComponent): """ if x is None: return x - file_name, file_data = x["name"], x["data"] - file_obj = processing_utils.decode_base64_to_file(file_data, filename=file_name) + file_name, file_data, is_example = x["name"], x["data"], x["is_example"] + if is_example: + file_obj = processing_utils.create_tmp_copy_of_file(file_name) + else: + file_obj = processing_utils.decode_base64_to_file( + file_data, file_path=file_name) if self.type == "file": return file_obj elif self.type == "numpy": - return scipy.io.wavfile.read(file_obj.name) - elif self.type == "mfcc": - return processing_utils.generate_mfcc_features_from_audio_file(file_obj.name) + audio_segment = AudioSegment.from_file(file_obj.name) + return audio_segment.frame_rate, np.array(audio_segment.get_array_of_samples()) def preprocess_example(self, x): return processing_utils.encode_file_to_base64(x, type="audio") @@ -970,8 +977,8 @@ class Audio(InputComponent): def tokenize(self, x): file_obj = processing_utils.decode_base64_to_file(x) - x = scipy.io.wavfile.read(file_obj.name) - sample_rate, data = x + x = AudioSegment.from_file(file_obj.name) + sample_rate, data = x.frame_rate, np.array(x.get_array_of_samples()) leave_one_out_sets = [] tokens = [] masks = [] @@ -986,7 +993,12 @@ class Audio(InputComponent): leave_one_out_data = np.copy(data) leave_one_out_data[start:stop] = 0 file = tempfile.NamedTemporaryFile(delete=False) - scipy.io.wavfile.write(file, sample_rate, leave_one_out_data) + audio_segment = AudioSegment( + leave_one_out_data.tobytes(), + frame_rate=sample_rate, + sample_width=leave_one_out_data.dtype.itemsize, + channels=len(leave_one_out_data.shape)) + audio_segment.export(file.name) out_data = processing_utils.encode_file_to_base64( file.name, type="audio", ext="wav") leave_one_out_sets.append(out_data) @@ -995,7 +1007,12 @@ class Audio(InputComponent): token[0:start] = 0 token[stop:] = 0 file = tempfile.NamedTemporaryFile(delete=False) - scipy.io.wavfile.write(file, sample_rate, token) + audio_segment = AudioSegment( + token.tobytes(), + frame_rate=sample_rate, + sample_width=token.dtype.itemsize, + channels=len(token.shape)) + audio_segment.export(file.name) token_data = processing_utils.encode_file_to_base64( file.name, type="audio", ext="wav") tokens.append(token_data) @@ -1005,13 +1022,15 @@ class Audio(InputComponent): # create a "zero input" vector and get sample rate x = tokens[0] file_obj = processing_utils.decode_base64_to_file(x) - sample_rate, data = scipy.io.wavfile.read(file_obj.name) + audio_segment = AudioSegment.from_file(file_obj.name) + sample_rate, data = audio_segment.frame_rate, np.array(audio_segment.get_array_of_samples()) zero_input = np.zeros_like(data, dtype=int) # decode all of the tokens token_data = [] for token in tokens: file_obj = processing_utils.decode_base64_to_file(token) - _, data = scipy.io.wavfile.read(file_obj.name) + audio_segment = AudioSegment.from_file(file_obj.name) + data = np.array(audio_segment.get_array_of_samples()) token_data.append(data) # construct the masked version masked_inputs = [] @@ -1020,7 +1039,12 @@ class Audio(InputComponent): for t, b 
in zip(token_data, binary_mask_vector): masked_input = masked_input + t*int(b) file = tempfile.NamedTemporaryFile(delete=False) - scipy.io.wavfile.write(file, sample_rate, masked_input) + audio_segment = AudioSegment( + masked_input.tobytes(), + frame_rate=sample_rate, + sample_width=masked_input.dtype.itemsize, + channels=len(masked_input.shape)) + audio_segment.export(file.name) masked_data = processing_utils.encode_file_to_base64( file.name, type="audio", ext="wav") masked_inputs.append(masked_data) @@ -1033,27 +1057,6 @@ class Audio(InputComponent): """ return list(scores) - def embed(self, x): - """ - Resamples each audio signal to be 1,000 frames and then returns the flattened vectors - """ - num_frames = 1000 - if self.type == "file": - file_name = x.name - mfcc = processing_utils.generate_mfcc_features_from_audio_file( - file_name, downsample_to=num_frames) - return mfcc.flatten() - elif self.type == "numpy": - sample_rate, signal = x - mfcc = processing_utils.generate_mfcc_features_from_audio_file( - wav_filename=None, sample_rate=sample_rate, signal=signal, downsample_to=num_frames) - return mfcc.flatten() - elif self.type == "mfcc": - mfcc = scipy.signal.resample(x, num_frames, axis=1) - return mfcc.flatten() - else: - raise ValueError("Unknown type: " + str(self.type) + - ". Please choose from: 'numpy', 'mfcc', 'file'.") def save_flagged(self, dir, label, data, encryption_key): """ @@ -1103,14 +1106,14 @@ class File(InputComponent): return None def process_single_file(f): - file_name, data, is_local_example = f["name"], f["data"], f["is_local_example"] + file_name, data, is_example = f["name"], f["data"], f["is_example"] if self.type == "file": - if is_local_example: - return open(file_name) + if is_example: + return processing_utils.create_tmp_copy_of_file(file_name) else: - return processing_utils.decode_base64_to_file(data, file_name=file_name) + return processing_utils.decode_base64_to_file(data, file_path=file_name) elif self.type == "bytes": - if is_local_example: + if is_example: with open(file_name, "rb") as file_data: return file_data.read() return processing_utils.decode_base64_to_binary(data)[0] diff --git a/gradio/networking.py b/gradio/networking.py index 4d946aa7ec..abd5c51bbc 100644 --- a/gradio/networking.py +++ b/gradio/networking.py @@ -269,24 +269,6 @@ def update_embeddings(): return jsonify({"sample_embedding_2d": sample_embedding_2d}) -@app.route("/api/predict_examples/", methods=["POST"]) -@login_check -def predict_examples(): - example_ids = request.json["data"] - predictions_set = {} - for example_id in example_ids: - example_set = app.interface.examples[example_id] - processed_example_set = [iface.preprocess_example(example) - for iface, example in zip(app.interface.input_components, example_set)] - try: - predictions, _ = app.interface.process(processed_example_set) - except: - continue - predictions_set[example_id] = predictions - output = {"data": predictions_set} - return jsonify(output) - - def flag_data(input_data, output_data, flag_option=None, flag_index=None, username=None): flag_path = os.path.join(app.cwd, app.interface.flagging_dir) log_fp = "{}/log.csv".format(flag_path) diff --git a/gradio/outputs.py b/gradio/outputs.py index 28f963f266..a6e534c294 100644 --- a/gradio/outputs.py +++ b/gradio/outputs.py @@ -13,7 +13,7 @@ import operator from numbers import Number import warnings import tempfile -import scipy +from pydub import AudioSegment import os import pandas as pd import PIL @@ -361,8 +361,14 @@ class Audio(OutputComponent): def 
postprocess(self, y): if self.type in ["numpy", "file", "auto"]: if self.type == "numpy" or (self.type == "auto" and isinstance(y, tuple)): + sample_rate, data = y file = tempfile.NamedTemporaryFile(delete=False) - scipy.io.wavfile.write(file, y[0], y[1]) + audio_segment = AudioSegment( + data.tobytes(), + frame_rate=sample_rate, + sample_width=data.dtype.itemsize, + channels=len(data.shape)) + audio_segment.export(file.name) y = file.name return processing_utils.encode_file_to_base64(y, type="audio", ext="wav") else: diff --git a/gradio/processing_utils.py b/gradio/processing_utils.py index 9509d56514..45a6a32b29 100644 --- a/gradio/processing_utils.py +++ b/gradio/processing_utils.py @@ -2,8 +2,8 @@ from PIL import Image, ImageOps from io import BytesIO import base64 import tempfile -import scipy.io.wavfile -from scipy.fftpack import dct +import shutil +import os import numpy as np from gradio import encryptor @@ -79,12 +79,15 @@ def decode_base64_to_binary(encoding): data = encoding return base64.b64decode(data), extension -def decode_base64_to_file(encoding, encryption_key=None, filename=None): +def decode_base64_to_file(encoding, encryption_key=None, file_path=None): data, mime_extension = decode_base64_to_binary(encoding) prefix, extension = None, None - if filename is not None and "." in filename: - prefix = filename[0: filename.index(".")] - extension = filename[filename.index(".") + 1:] + if file_path is not None: + filename = os.path.basename(file_path) + prefix = filename + if "." in filename: + prefix = filename[0: filename.index(".")] + extension = filename[filename.index(".") + 1:] if extension is None: extension = mime_extension if extension is None: @@ -97,6 +100,19 @@ def decode_base64_to_file(encoding, encryption_key=None, filename=None): file_obj.flush() return file_obj +def create_tmp_copy_of_file(file_path): + file_name = os.path.basename(file_path) + prefix, extension = file_name, None + if "." in file_name: + prefix = file_name[0: file_name.index(".")] + extension = file_name[file_name.index(".") + 1:] + if extension is None: + file_obj = tempfile.NamedTemporaryFile(delete=False, prefix=prefix) + else: + file_obj = tempfile.NamedTemporaryFile(delete=False, prefix=prefix, suffix="."+extension) + shutil.copy2(file_path, file_obj.name) + return file_obj + def _convert(image, dtype, force_copy=False, uniform=False): """ Adapted from: https://github.com/scikit-image/scikit-image/blob/main/skimage/util/dtype.py#L510-L531 @@ -379,94 +395,3 @@ def _convert(image, dtype, force_copy=False, uniform=False): image = _scale(image, 8 * itemsize_in, 8 * itemsize_out, copy=False) image += imin_out return image.astype(dtype_out) - - -################## -# AUDIO FILES -################## - -def generate_mfcc_features_from_audio_file(wav_filename=None, - pre_emphasis=0.95, - frame_size= 0.025, - frame_stride=0.01, - NFFT=512, - nfilt=40, - num_ceps=12, - cep_lifter=22, - sample_rate=None, - signal=None, - downsample_to=None): - """ - Loads and preprocesses a .wav audio file (or alternatively, a sample rate & signal) into mfcc coefficients, the typical inputs to models. - Adapted from: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html - :param wav_filename: string name of audio file to process. - :param pre_emphasis: a float factor, typically 0.95 or 0.97, which amplifies high frequencies. - :param frame_size: a float that is the length, in seconds, of time frame over which to take the fft. 
- :param frame_stride: a float that is the offset, in seconds, between consecutive time frames. - :param NFFT: The number of points in the short-time fft for each time frame. - :param nfilt: The number of filters on the Mel-scale to extract frequency bands. - :param num_ceps: the number of cepstral coefficients to retrain. - :param cep_lifter: the int factor, by which to de-emphasize higher-frequency. - :param sample_rate: optional param represnting sample rate that is used if `wav_filename` is not provided - :param signal: optional param representing sample data that is used if `wav_filename` is not provided - :param downsample_to: optional param. If provided, audio file is downsampled to this many frames. - :return: a 3D numpy array of mfcc coefficients, of the shape 1 x num_frames x num_coeffs. - """ - if (wav_filename is None) and (sample_rate is None or signal is None): - raise ValueError("Either a wav_filename must be provdied or a sample_rate and signal") - elif wav_filename is None: - pass - else: - sample_rate, signal = scipy.io.wavfile.read(wav_filename) - - if not(downsample_to is None): - signal = scipy.signal.resample(signal, downsample_to) - - emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1]) - - frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples - signal_length = len(emphasized_signal) - frame_length = int(round(frame_length)) - frame_step = int(round(frame_step)) - num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame - - pad_signal_length = num_frames * frame_step + frame_length - z = np.zeros((pad_signal_length - signal_length)) - pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal - - indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T - frames = pad_signal[indices.astype(np.int32, copy=False)] - - frames *= np.hamming(frame_length) - mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT - pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum - - low_freq_mel = 0 - high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel - mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale - hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz - bin = np.floor((NFFT + 1) * hz_points / sample_rate) - - fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1)))) - for m in range(1, nfilt + 1): - f_m_minus = int(bin[m - 1]) # left - f_m = int(bin[m]) # center - f_m_plus = int(bin[m + 1]) # right - - for k in range(f_m_minus, f_m): - fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1]) - for k in range(f_m, f_m_plus): - fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m]) - filter_banks = np.dot(pow_frames, fbank.T) - filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability - filter_banks = 20 * np.log10(filter_banks) # dB - - mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 0: (num_ceps + 1)] # Keep filters 1-13 by default. 
- (nframes, ncoeff) = mfcc.shape - n = np.arange(ncoeff) - lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter) - mfcc *= lift - - filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8) - mfcc -= (np.mean(mfcc, axis=0) + 1e-8) - return mfcc[np.newaxis, :, :] # Create a batch dimension. \ No newline at end of file diff --git a/gradio/version.txt b/gradio/version.txt index 4f6b33a304..79c92c0223 100644 --- a/gradio/version.txt +++ b/gradio/version.txt @@ -1 +1 @@ -2.3.7b +2.3.7b1 diff --git a/setup.py b/setup.py index d6a995f0cf..cb3afe1e79 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ except ImportError: setup( name='gradio', - version='2.3.7b', + version='2.3.7b1', include_package_data=True, description='Python library for easily interacting with trained machine learning models', author='Abubakar Abid', @@ -16,7 +16,7 @@ setup( keywords=['machine learning', 'visualization', 'reproducibility'], install_requires=[ 'numpy', - 'scipy', + 'pydub', 'matplotlib', 'pandas', 'pillow', diff --git a/test/test_inputs.py b/test/test_inputs.py index 5bbc49042b..afde36c4b4 100644 --- a/test/test_inputs.py +++ b/test/test_inputs.py @@ -2,7 +2,7 @@ import unittest import gradio as gr import PIL import numpy as np -import scipy +from pydub import AudioSegment import os class TestTextbox(unittest.TestCase): @@ -97,7 +97,8 @@ class TestAudio(unittest.TestCase): def test_in_interface(self): x_wav = gr.test_data.BASE64_AUDIO def max_amplitude_from_wav_file(wav_file): - _, data = scipy.io.wavfile.read(wav_file.name) + audio_segment = AudioSegment.from_file(wav_file.name) + data = np.array(audio_segment.get_array_of_samples()) return np.max(data) iface = gr.Interface( @@ -111,7 +112,7 @@ class TestFile(unittest.TestCase): x_file = { "name": "audio.wav", "data": gr.test_data.BASE64_AUDIO, - "is_local_example": False + "is_example": False } def get_size_of_file(file_obj): return os.path.getsize(file_obj.name)
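For reference, the scipy.io.wavfile calls removed above are replaced throughout by pydub's AudioSegment, as in the inputs.py, outputs.py, and test changes. A minimal sketch of the equivalent read/write round-trip, assuming pydub is installed (with ffmpeg on the PATH for non-WAV formats) and mono integer samples:

```python
import numpy as np
from pydub import AudioSegment

def read_audio(path):
    # pydub equivalent of scipy.io.wavfile.read: returns (sample_rate, samples)
    segment = AudioSegment.from_file(path)
    return segment.frame_rate, np.array(segment.get_array_of_samples())

def write_wav(path, sample_rate, data):
    # pydub equivalent of scipy.io.wavfile.write for mono integer samples
    segment = AudioSegment(
        data.tobytes(),
        frame_rate=sample_rate,
        sample_width=data.dtype.itemsize,
        channels=1,
    )
    segment.export(path, format="wav")
```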
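The `is_example` flag added to the Video, Audio, and File payloads selects between two paths in `preprocess`: example rows already live on disk and are simply copied, while uploads arrive as base64 data URLs and are decoded. A simplified sketch of that dispatch, using the `create_tmp_copy_of_file` and `decode_base64_to_file` helpers from processing_utils.py above (the real components go on to convert formats as needed):

```python
from gradio import processing_utils

def payload_to_file(payload):
    # payload = {"name": ..., "data": ..., "is_example": ...} as sent by the frontend
    if payload["is_example"]:
        # Example files ship with the demo, so just copy them to a temp file.
        return processing_utils.create_tmp_copy_of_file(payload["name"])
    # Uploaded files carry their contents as a base64 data URL.
    return processing_utils.decode_base64_to_file(
        payload["data"], file_path=payload["name"])
```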
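Usage-wise, the new `examples` argument in demo/video_flip.py takes plain file paths relative to the script. An illustrative audio demo that exercises the pydub-backed `type="numpy"` path in the same way might look like the sketch below; the example file path is hypothetical:

```python
import gradio as gr
import numpy as np

def max_amplitude(audio):
    sample_rate, data = audio  # "numpy" audio is now produced via pydub, not scipy
    return int(np.max(np.abs(data)))

iface = gr.Interface(
    max_amplitude,
    gr.inputs.Audio(type="numpy"),
    "number",
    examples=[["files/sample.wav"]])  # hypothetical path, mirroring the video demo

if __name__ == "__main__":
    iface.launch()
```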