Support video example preview

Ali Abid 2021-10-05 19:23:23 +00:00 committed by Ali Abid
parent 760bf48b24
commit 9b968d3fb5
26 changed files with 179 additions and 190 deletions

BIN demo/files/video.avi (new file, binary file not shown)
BIN demo/files/video.mp4 (new file, binary file not shown)

View File

@@ -4,7 +4,11 @@ def video_flip(video):
     return video
 
 iface = gr.Interface(
-    video_flip, "video", "playable_video", theme="huggingface")
+    video_flip, "video", "playable_video", theme="huggingface",
+    examples=[
+        ["files/video.avi"],
+        ["files/video.mp4"]
+    ])
 
 if __name__ == "__main__":
     iface.launch()
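For context, the updated demo script now reads end to end roughly as follows. This is a sketch assembled from the hunk above, assuming the usual import gradio as gr header that the hunk does not show:

import gradio as gr

def video_flip(video):
    # Identity function: the demo simply previews and plays back the video.
    return video

iface = gr.Interface(
    video_flip, "video", "playable_video", theme="huggingface",
    # Example paths resolve relative to the demo directory; the frontend
    # renders playable examples as hover-to-play previews.
    examples=[
        ["files/video.avi"],
        ["files/video.mp4"]
    ])

if __name__ == "__main__":
    iface.launch()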

View File

@@ -11666,11 +11666,18 @@
       "integrity": "sha512-QBmA/G2y+IfeS4oktet3qRZ+P5kPhCKRXxXnQEudYqUaEioAU1/Lq2us3D/t1Jfo4hE9REQPrbB7K5sOczJVIw=="
     },
     "mime-types": {
-      "version": "2.1.30",
-      "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.30.tgz",
-      "integrity": "sha512-crmjA4bLtR8m9qLpHvgxSChT+XoSlZi8J4n/aIdn3z92e/U47Z0V/yl+Wh9W046GgFVAmoNR/fmdbZYcSSIUeg==",
+      "version": "2.1.33",
+      "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.33.tgz",
+      "integrity": "sha512-plLElXp7pRDd0bNZHw+nMd52vRYjLwQjygaNg7ddJ2uJtTlmnTCjWuPKxVu6//AdaRuME84SvLW91sIkBqGT0g==",
       "requires": {
-        "mime-db": "1.47.0"
+        "mime-db": "1.50.0"
       },
+      "dependencies": {
+        "mime-db": {
+          "version": "1.50.0",
+          "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.50.0.tgz",
+          "integrity": "sha512-9tMZCDlYHqeERXEHO9f/hKfNXhre5dK2eE/krIvUjZbS2KPcqGDfNShIWS1uW9XOTKQKqK6qbeOci18rbfW77A=="
+        }
+      }
     },
     "mimic-fn": {

View File

@@ -15,6 +15,7 @@
     "fabric": "^4.5.0",
     "html2canvas-objectfit-fix": "^1.2.0",
     "jspreadsheet-ce": "^4.7.3",
+    "mime-types": "^2.1.33",
     "plotly.js": "^2.3.1",
     "prettier-eslint": "^13.0.0",
     "prettier-eslint-cli": "^5.0.1",

View File

@@ -12,7 +12,11 @@ export default class ComponentExample extends React.Component {
 
 export class FileComponentExample extends ComponentExample {
   static async preprocess(x, examples_dir) {
-    return examples_dir + "/" + x;
+    return {
+      "name": x,
+      "data": examples_dir + "/" + x,
+      "is_example": true
+    }
   }
 }
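The object returned by FileComponentExample.preprocess is the same payload shape that the Python preprocess methods later in this commit destructure. For reference, a sketch of the two payload variants as Python dicts (field values are illustrative):

# A cached example: "name" carries the example's file path (the backend
# copies it via create_tmp_copy_of_file), "data" the URL the browser loads
# the preview from, and "is_example" switches the backend away from
# base64 decoding.
example_payload = {
    "name": "files/video.mp4",
    "data": "examples/files/video.mp4",  # examples_dir + "/" + name
    "is_example": True,
}

# A user upload: "data" holds the base64 data URL produced by
# FileReader.readAsDataURL in the browser.
upload_payload = {
    "name": "video.mp4",
    "data": "data:video/mp4;base64,AAAA...",  # truncated for illustration
    "is_example": False,
}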

View File

@@ -1,6 +1,6 @@
 import React from "react";
 import BaseComponent from "../base_component";
-import { DataURLComponentExample } from "../component_example";
+import { FileComponentExample } from "../component_example";
 import Recorder from "recorder-js";
 import { getSaliencyColor } from "../../utils";
@@ -147,12 +147,12 @@ class AudioInput extends BaseComponent {
       let file = files[0];
       ReaderObj.readAsDataURL(file);
       ReaderObj.onloadend = function () {
-        component.props.handleChange({ "name": file.name, "data": this.result });
+        component.props.handleChange({ "name": file.name, "data": this.result, "is_example": false });
       };
     };
   }
 
-class AudioInputExample extends DataURLComponentExample {
+class AudioInputExample extends FileComponentExample {
   render() {
     return <div className="input_audio_example">{this.props.value}</div>;
   }

View File

@@ -98,7 +98,7 @@ class FileInput extends BaseComponent {
         name: file.name,
         size: file.size,
         data: reader.result,
-        is_local_example: false
+        is_example: false
       });
       if (this.file_data.length === file_count) {
         this.handleChange(this.file_data);

View File

@@ -1,5 +1,6 @@
 import React from "react";
 import BaseComponent from "../base_component";
+import { FileComponentExample } from "../component_example";
 import ComponentExample from "../component_example";
 import { isPlayable } from "../../utils";
@@ -25,8 +26,7 @@ class VideoInput extends BaseComponent {
       evt.stopPropagation();
     };
     if (this.props.value != null) {
-      if (isPlayable("video", this.props.value["data"].substring(
-        5, this.props.value["data"].indexOf(";")))) {
+      if (isPlayable("video", this.props.value["name"])) {
         return (
           <div className="input_video">
             <div className="video_preview_holder">
@@ -93,15 +93,36 @@ class VideoInput extends BaseComponent {
       let file = files[0];
       ReaderObj.readAsDataURL(file);
       ReaderObj.onloadend = function () {
-        component.props.handleChange({ "name": file.name, "data": this.result });
+        component.props.handleChange({ "name": file.name, "data": this.result, "is_example": false });
       };
     }
   }
 
-class VideoInputExample extends ComponentExample {
+class VideoInputExample extends FileComponentExample {
+  constructor(props) {
+    super(props);
+    this.video = React.createRef();
+  }
   render() {
-    return <span className="input_video_example">{this.props.value}</span>;
+    if (isPlayable("video", this.props.value)) {
+      return <div className="input_video_example">
+        <div className="video_holder">
+          <video
+            ref={this.video}
+            className="video_preview"
+            onMouseOver={() => { this.video.current.play() }}
+            onMouseOut={() => { this.video.current.pause() }}
+            preload="metadata"
+          >
+            <source src={this.props.examples_dir + "/" + this.props.value}></source>
+          </video>
+        </div>
+      </div>
+    } else {
+      return <div className="input_video_example">{this.props.value}</div>
+    }
   }
 }
 
 export { VideoInput, VideoInputExample };

View File

@@ -6,8 +6,7 @@ import { isPlayable } from "../../utils";
 
 class VideoOutput extends BaseComponent {
   render() {
     if (this.props.value) {
-      if (isPlayable("video", this.props.value["data"].substring(
-        5, this.props.value["data"].indexOf(";")))) {
+      if (isPlayable("video", this.props.value["name"])) {
         return <div className="output_video">
           <video controls src={this.props.value["data"]}></video>
         </div>

View File

@@ -428,6 +428,14 @@
       @apply w-full h-full object-contain;
     }
   }
+  .input_video_example {
+    .video_holder {
+      @apply h-36 object-contain flex justify-center;
+    }
+    .video_preview {
+      @apply w-full;
+    }
+  }
   .input_file {
     @apply w-full h-80;
     .upload_zone {

View File

@@ -380,6 +380,14 @@ html {
       @apply w-full h-full object-contain;
     }
   }
+  .input_video_example {
+    .video_holder {
+      @apply h-36 object-contain flex justify-center;
+    }
+    .video_preview {
+      @apply w-full;
+    }
+  }
   .input_file {
     @apply w-full h-48;
     .upload_zone {

View File

@@ -370,6 +370,14 @@
       @apply w-full h-full object-contain;
     }
   }
+  .input_video_example {
+    .video_holder {
+      @apply h-36 object-contain flex justify-center;
+    }
+    .video_preview {
+      @apply w-full;
+    }
+  }
   .input_file {
     @apply w-full h-80;
     .upload_zone {

View File

@@ -383,6 +383,14 @@
       @apply w-full h-full object-contain;
     }
   }
+  .input_video_example {
+    .video_holder {
+      @apply h-36 object-contain flex justify-center;
+    }
+    .video_preview {
+      @apply w-full;
+    }
+  }
   .input_file {
     @apply w-full h-80;
     .upload_zone {

View File

@@ -1,3 +1,5 @@
+var mime = require('mime-types')
+
 export function prettyBytes(bytes) {
   let units = ["B", "KB", "MB", "GB", "PB"];
   let i = 0;
@@ -162,12 +164,14 @@ export function CSVToArray(strData, strDelimiter) {
   return arrData;
 }
 
-export function isPlayable(data_type, mime_type) {
+export function isPlayable(data_type, file_name) {
   if (data_type == "audio") {
     let audio_element = new Audio();
+    let mime_type = mime.lookup(file_name)
     return audio_element.canPlayType(mime_type) != "";
   } else {
     let video_element = document.createElement("video");
+    let mime_type = mime.lookup(file_name)
    return video_element.canPlayType(mime_type) != "";
   }
 }
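isPlayable now takes a file name and derives the MIME type from its extension via the mime-types package, instead of parsing a data-URL prefix. As a rough Python analogue of the lookup half of that check (the browser's canPlayType probe has no Python equivalent), the standard-library mimetypes module performs the same extension-to-MIME mapping:

import mimetypes

def lookup_mime(file_name):
    # Maps e.g. "files/video.mp4" -> "video/mp4"; returns None when the
    # extension is unknown, much like mime.lookup returning false.
    mime_type, _encoding = mimetypes.guess_type(file_name)
    return mime_type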

View File

@ -1,6 +1,6 @@
Metadata-Version: 1.0
Name: gradio
Version: 2.3.7b0
Version: 2.3.7b1
Summary: Python library for easily interacting with trained machine learning models
Home-page: https://github.com/gradio-app/gradio-UI
Author: Abubakar Abid

View File

@@ -11,5 +11,5 @@ pandas
 paramiko
 pillow
 pycryptodome
+pydub
 requests
-scipy

View File

@@ -1,6 +1,6 @@
 {
   "files": {
-    "main.css": "/static/css/main.61fa3417.css",
+    "main.css": "/static/css/main.dcd72078.css",
     "main.js": "/static/bundle.js",
     "index.html": "/index.html",
     "static/bundle.js.LICENSE.txt": "/static/bundle.js.LICENSE.txt",
@@ -11,7 +11,7 @@
   },
   "entrypoints": [
     "static/bundle.css",
-    "static/css/main.61fa3417.css",
+    "static/css/main.dcd72078.css",
     "static/bundle.js"
   ]
 }

View File

@@ -8,4 +8,4 @@
         window.config = {{ config|tojson }};
       } catch (e) {
         window.config = {};
-      }</script><script src="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.1/iframeResizer.contentWindow.min.js"></script><title>Gradio</title><link href="static/bundle.css" rel="stylesheet"><link href="static/css/main.61fa3417.css" rel="stylesheet"></head><body style="height:100%"><div id="root" style="height:100%"></div><script src="static/bundle.js"></script></body></html>
+      }</script><script src="https://cdnjs.cloudflare.com/ajax/libs/iframe-resizer/4.3.1/iframeResizer.contentWindow.min.js"></script><title>Gradio</title><link href="static/bundle.css" rel="stylesheet"><link href="static/css/main.dcd72078.css" rel="stylesheet"></head><body style="height:100%"><div id="root" style="height:100%"></div><script src="static/bundle.js"></script></body></html>

View File

@@ -9,7 +9,7 @@ import warnings
 from gradio.component import Component
 import numpy as np
 import PIL
-import scipy.io.wavfile
+from pydub import AudioSegment
 from gradio import processing_utils, test_data
 import pandas as pd
 from ffmpy import FFmpeg
@@ -877,9 +877,12 @@ class Video(InputComponent):
     def preprocess(self, x):
         if x is None:
             return x
-        file_name, file_data = x["name"], x["data"]
-        file = processing_utils.decode_base64_to_file(
-            file_data, filename=file_name)
+        file_name, file_data, is_example = x["name"], x["data"], x["is_example"]
+        if is_example:
+            file = processing_utils.create_tmp_copy_of_file(file_name)
+        else:
+            file = processing_utils.decode_base64_to_file(
+                file_data, file_path=file_name)
         file_name = file.name
         uploaded_format = file_name.split(".")[-1].lower()
         if self.type is not None and uploaded_format != self.type:
@@ -915,7 +918,7 @@ class Audio(InputComponent):
         """
         Parameters:
         source (str): Source of audio. "upload" creates a box where user can drop an audio file, "microphone" creates a microphone input.
-        type (str): Type of value to be returned by component. "numpy" returns a 2-set tuple with an integer sample_rate and the data numpy.array of shape (samples, 2), "file" returns a temporary file object whose path can be retrieved by file_obj.name, "mfcc" returns the mfcc coefficients of the input audio.
+        type (str): Type of value to be returned by component. "numpy" returns a 2-set tuple with an integer sample_rate and the data numpy.array of shape (samples, 2), "file" returns a temporary file object whose path can be retrieved by file_obj.name.
         label (str): component name in interface.
         optional (bool): If True, the interface can be submitted with no uploaded audio, in which case the input value is None.
         """
@@ -923,7 +926,8 @@
         requires_permissions = source == "microphone"
         self.type = type
         self.optional = optional
-        self.test_input = {"name": "sample.wav", "data": test_data.BASE64_AUDIO}
+        self.test_input = {"name": "sample.wav",
+                           "data": test_data.BASE64_AUDIO, "is_example": False}
         self.interpret_by_tokens = True
         super().__init__(label, requires_permissions)
@@ -947,14 +951,17 @@
         """
         if x is None:
             return x
-        file_name, file_data = x["name"], x["data"]
-        file_obj = processing_utils.decode_base64_to_file(file_data, filename=file_name)
+        file_name, file_data, is_example = x["name"], x["data"], x["is_example"]
+        if is_example:
+            file_obj = processing_utils.create_tmp_copy_of_file(file_name)
+        else:
+            file_obj = processing_utils.decode_base64_to_file(
+                file_data, file_path=file_name)
         if self.type == "file":
             return file_obj
         elif self.type == "numpy":
-            return scipy.io.wavfile.read(file_obj.name)
-        elif self.type == "mfcc":
-            return processing_utils.generate_mfcc_features_from_audio_file(file_obj.name)
+            audio_segment = AudioSegment.from_file(file_obj.name)
+            return audio_segment.frame_rate, np.array(audio_segment.get_array_of_samples())
 
     def preprocess_example(self, x):
         return processing_utils.encode_file_to_base64(x, type="audio")
@@ -970,8 +977,8 @@
     def tokenize(self, x):
         file_obj = processing_utils.decode_base64_to_file(x)
-        x = scipy.io.wavfile.read(file_obj.name)
-        sample_rate, data = x
+        x = AudioSegment.from_file(file_obj.name)
+        sample_rate, data = x.frame_rate, np.array(x.get_array_of_samples())
         leave_one_out_sets = []
         tokens = []
         masks = []
@@ -986,7 +993,12 @@
             leave_one_out_data = np.copy(data)
             leave_one_out_data[start:stop] = 0
             file = tempfile.NamedTemporaryFile(delete=False)
-            scipy.io.wavfile.write(file, sample_rate, leave_one_out_data)
+            audio_segment = AudioSegment(
+                leave_one_out_data.tobytes(),
+                frame_rate=sample_rate,
+                sample_width=leave_one_out_data.dtype.itemsize,
+                channels=len(leave_one_out_data.shape))
+            audio_segment.export(file.name)
             out_data = processing_utils.encode_file_to_base64(
                 file.name, type="audio", ext="wav")
             leave_one_out_sets.append(out_data)
@@ -995,7 +1007,12 @@
             token[0:start] = 0
             token[stop:] = 0
             file = tempfile.NamedTemporaryFile(delete=False)
-            scipy.io.wavfile.write(file, sample_rate, token)
+            audio_segment = AudioSegment(
+                token.tobytes(),
+                frame_rate=sample_rate,
+                sample_width=token.dtype.itemsize,
+                channels=len(token.shape))
+            audio_segment.export(file.name)
             token_data = processing_utils.encode_file_to_base64(
                 file.name, type="audio", ext="wav")
             tokens.append(token_data)
@@ -1005,13 +1022,15 @@
         # create a "zero input" vector and get sample rate
         x = tokens[0]
         file_obj = processing_utils.decode_base64_to_file(x)
-        sample_rate, data = scipy.io.wavfile.read(file_obj.name)
+        audio_segment = AudioSegment.from_file(file_obj.name)
+        sample_rate, data = audio_segment.frame_rate, np.array(audio_segment.get_array_of_samples())
         zero_input = np.zeros_like(data, dtype=int)
         # decode all of the tokens
         token_data = []
         for token in tokens:
             file_obj = processing_utils.decode_base64_to_file(token)
-            _, data = scipy.io.wavfile.read(file_obj.name)
+            audio_segment = AudioSegment.from_file(file_obj.name)
+            data = np.array(audio_segment.get_array_of_samples())
             token_data.append(data)
         # construct the masked version
         masked_inputs = []
@@ -1020,7 +1039,12 @@
             for t, b in zip(token_data, binary_mask_vector):
                 masked_input = masked_input + t*int(b)
             file = tempfile.NamedTemporaryFile(delete=False)
-            scipy.io.wavfile.write(file, sample_rate, masked_input)
+            audio_segment = AudioSegment(
+                masked_input.tobytes(),
+                frame_rate=sample_rate,
+                sample_width=masked_input.dtype.itemsize,
+                channels=len(masked_input.shape))
+            audio_segment.export(file.name)
             masked_data = processing_utils.encode_file_to_base64(
                 file.name, type="audio", ext="wav")
             masked_inputs.append(masked_data)
@@ -1033,27 +1057,6 @@
         """
         return list(scores)
 
-    def embed(self, x):
-        """
-        Resamples each audio signal to be 1,000 frames and then returns the flattened vectors
-        """
-        num_frames = 1000
-        if self.type == "file":
-            file_name = x.name
-            mfcc = processing_utils.generate_mfcc_features_from_audio_file(
-                file_name, downsample_to=num_frames)
-            return mfcc.flatten()
-        elif self.type == "numpy":
-            sample_rate, signal = x
-            mfcc = processing_utils.generate_mfcc_features_from_audio_file(
-                wav_filename=None, sample_rate=sample_rate, signal=signal, downsample_to=num_frames)
-            return mfcc.flatten()
-        elif self.type == "mfcc":
-            mfcc = scipy.signal.resample(x, num_frames, axis=1)
-            return mfcc.flatten()
-        else:
-            raise ValueError("Unknown type: " + str(self.type) +
-                             ". Please choose from: 'numpy', 'mfcc', 'file'.")
 
     def save_flagged(self, dir, label, data, encryption_key):
         """
@@ -1103,14 +1106,14 @@ class File(InputComponent):
             return None
 
         def process_single_file(f):
-            file_name, data, is_local_example = f["name"], f["data"], f["is_local_example"]
+            file_name, data, is_example = f["name"], f["data"], f["is_example"]
             if self.type == "file":
-                if is_local_example:
-                    return open(file_name)
+                if is_example:
+                    return processing_utils.create_tmp_copy_of_file(file_name)
                 else:
-                    return processing_utils.decode_base64_to_file(data, file_name=file_name)
+                    return processing_utils.decode_base64_to_file(data, file_path=file_name)
             elif self.type == "bytes":
-                if is_local_example:
+                if is_example:
                     with open(file_name, "rb") as file_data:
                         return file_data.read()
                 return processing_utils.decode_base64_to_binary(data)[0]
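The recurring change in this file is the replacement of scipy.io.wavfile with pydub: reads become AudioSegment.from_file, and writes become constructing an AudioSegment from raw bytes and exporting it. A minimal round-trip sketch of that pattern (file names are illustrative):

import numpy as np
from pydub import AudioSegment

# Read: replaces scipy.io.wavfile.read(path).
audio = AudioSegment.from_file("sample.wav")
sample_rate = audio.frame_rate
data = np.array(audio.get_array_of_samples())

# Write: replaces scipy.io.wavfile.write(file, sample_rate, data).
segment = AudioSegment(
    data.tobytes(),
    frame_rate=sample_rate,
    sample_width=data.dtype.itemsize,  # bytes per sample
    channels=len(data.shape))          # a 1-D array is mono
segment.export("roundtrip.wav", format="wav")  # explicit format; pydub's export defaults to mp3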

View File

@@ -269,24 +269,6 @@ def update_embeddings():
     return jsonify({"sample_embedding_2d": sample_embedding_2d})
 
-@app.route("/api/predict_examples/", methods=["POST"])
-@login_check
-def predict_examples():
-    example_ids = request.json["data"]
-    predictions_set = {}
-    for example_id in example_ids:
-        example_set = app.interface.examples[example_id]
-        processed_example_set = [iface.preprocess_example(example)
-                                 for iface, example in zip(app.interface.input_components, example_set)]
-        try:
-            predictions, _ = app.interface.process(processed_example_set)
-        except:
-            continue
-        predictions_set[example_id] = predictions
-    output = {"data": predictions_set}
-    return jsonify(output)
-
 
 def flag_data(input_data, output_data, flag_option=None, flag_index=None, username=None):
     flag_path = os.path.join(app.cwd, app.interface.flagging_dir)
     log_fp = "{}/log.csv".format(flag_path)

View File

@@ -13,7 +13,7 @@ import operator
 from numbers import Number
 import warnings
 import tempfile
-import scipy
+from pydub import AudioSegment
 import os
 import pandas as pd
 import PIL
@@ -361,8 +361,14 @@ class Audio(OutputComponent):
     def postprocess(self, y):
         if self.type in ["numpy", "file", "auto"]:
             if self.type == "numpy" or (self.type == "auto" and isinstance(y, tuple)):
+                sample_rate, data = y
                 file = tempfile.NamedTemporaryFile(delete=False)
-                scipy.io.wavfile.write(file, y[0], y[1])
+                audio_segment = AudioSegment(
+                    data.tobytes(),
+                    frame_rate=sample_rate,
+                    sample_width=data.dtype.itemsize,
+                    channels=len(data.shape))
+                audio_segment.export(file.name)
                 y = file.name
             return processing_utils.encode_file_to_base64(y, type="audio", ext="wav")
         else:
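A quick way to exercise this numpy branch is an Interface whose function returns a (sample_rate, samples) tuple, which the postprocess above converts to base64-encoded WAV. A hedged sketch against the 2.3.x-era API (the tone generator is illustrative, not from this commit):

import numpy as np
import gradio as gr

def make_tone(duration):
    # duration seconds of a 440 Hz sine wave as 16-bit PCM.
    sample_rate = 44100
    t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
    samples = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16)
    return sample_rate, samples  # handled by Audio.postprocess above

iface = gr.Interface(
    make_tone, gr.inputs.Slider(1, 5), gr.outputs.Audio(type="numpy"))

if __name__ == "__main__":
    iface.launch()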

View File

@@ -2,8 +2,8 @@ from PIL import Image, ImageOps
 from io import BytesIO
 import base64
 import tempfile
-import scipy.io.wavfile
-from scipy.fftpack import dct
+import shutil
+import os
 import numpy as np
 from gradio import encryptor
@@ -79,12 +79,15 @@ def decode_base64_to_binary(encoding):
         data = encoding
     return base64.b64decode(data), extension
 
-def decode_base64_to_file(encoding, encryption_key=None, filename=None):
+def decode_base64_to_file(encoding, encryption_key=None, file_path=None):
     data, mime_extension = decode_base64_to_binary(encoding)
     prefix, extension = None, None
-    if filename is not None and "." in filename:
-        prefix = filename[0: filename.index(".")]
-        extension = filename[filename.index(".") + 1:]
+    if file_path is not None:
+        filename = os.path.basename(file_path)
+        prefix = filename
+        if "." in filename:
+            prefix = filename[0: filename.index(".")]
+            extension = filename[filename.index(".") + 1:]
     if extension is None:
         extension = mime_extension
     if extension is None:
@@ -97,6 +100,19 @@
         file_obj.flush()
     return file_obj
 
+def create_tmp_copy_of_file(file_path):
+    file_name = os.path.basename(file_path)
+    prefix, extension = file_name, None
+    if "." in file_name:
+        prefix = file_name[0: file_name.index(".")]
+        extension = file_name[file_name.index(".") + 1:]
+    if extension is None:
+        file_obj = tempfile.NamedTemporaryFile(delete=False, prefix=prefix)
+    else:
+        file_obj = tempfile.NamedTemporaryFile(delete=False, prefix=prefix, suffix="."+extension)
+    shutil.copy2(file_path, file_obj.name)
+    return file_obj
+
 def _convert(image, dtype, force_copy=False, uniform=False):
     """
     Adapted from: https://github.com/scikit-image/scikit-image/blob/main/skimage/util/dtype.py#L510-L531
@@ -379,94 +395,3 @@ def _convert(image, dtype, force_copy=False, uniform=False):
         image = _scale(image, 8 * itemsize_in, 8 * itemsize_out, copy=False)
         image += imin_out
     return image.astype(dtype_out)
-
-##################
-# AUDIO FILES
-##################
-
-def generate_mfcc_features_from_audio_file(wav_filename=None,
-                                           pre_emphasis=0.95,
-                                           frame_size=0.025,
-                                           frame_stride=0.01,
-                                           NFFT=512,
-                                           nfilt=40,
-                                           num_ceps=12,
-                                           cep_lifter=22,
-                                           sample_rate=None,
-                                           signal=None,
-                                           downsample_to=None):
-    """
-    Loads and preprocesses a .wav audio file (or alternatively, a sample rate & signal) into mfcc coefficients, the typical inputs to models.
-    Adapted from: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
-    :param wav_filename: string name of audio file to process.
-    :param pre_emphasis: a float factor, typically 0.95 or 0.97, which amplifies high frequencies.
-    :param frame_size: a float that is the length, in seconds, of time frame over which to take the fft.
-    :param frame_stride: a float that is the offset, in seconds, between consecutive time frames.
-    :param NFFT: The number of points in the short-time fft for each time frame.
-    :param nfilt: The number of filters on the Mel-scale to extract frequency bands.
-    :param num_ceps: the number of cepstral coefficients to retrain.
-    :param cep_lifter: the int factor, by which to de-emphasize higher-frequency.
-    :param sample_rate: optional param represnting sample rate that is used if `wav_filename` is not provided
-    :param signal: optional param representing sample data that is used if `wav_filename` is not provided
-    :param downsample_to: optional param. If provided, audio file is downsampled to this many frames.
-    :return: a 3D numpy array of mfcc coefficients, of the shape 1 x num_frames x num_coeffs.
-    """
-    if (wav_filename is None) and (sample_rate is None or signal is None):
-        raise ValueError("Either a wav_filename must be provdied or a sample_rate and signal")
-    elif wav_filename is None:
-        pass
-    else:
-        sample_rate, signal = scipy.io.wavfile.read(wav_filename)
-    if not(downsample_to is None):
-        signal = scipy.signal.resample(signal, downsample_to)
-    emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
-    frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate  # Convert from seconds to samples
-    signal_length = len(emphasized_signal)
-    frame_length = int(round(frame_length))
-    frame_step = int(round(frame_step))
-    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))  # Make sure that we have at least 1 frame
-    pad_signal_length = num_frames * frame_step + frame_length
-    z = np.zeros((pad_signal_length - signal_length))
-    pad_signal = np.append(emphasized_signal, z)  # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
-    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
-    frames = pad_signal[indices.astype(np.int32, copy=False)]
-    frames *= np.hamming(frame_length)
-    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))  # Magnitude of the FFT
-    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))  # Power Spectrum
-    low_freq_mel = 0
-    high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700))  # Convert Hz to Mel
-    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
-    hz_points = (700 * (10**(mel_points / 2595) - 1))  # Convert Mel to Hz
-    bin = np.floor((NFFT + 1) * hz_points / sample_rate)
-    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
-    for m in range(1, nfilt + 1):
-        f_m_minus = int(bin[m - 1])  # left
-        f_m = int(bin[m])  # center
-        f_m_plus = int(bin[m + 1])  # right
-        for k in range(f_m_minus, f_m):
-            fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
-        for k in range(f_m, f_m_plus):
-            fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
-    filter_banks = np.dot(pow_frames, fbank.T)
-    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability
-    filter_banks = 20 * np.log10(filter_banks)  # dB
-    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 0: (num_ceps + 1)]  # Keep filters 1-13 by default.
-    (nframes, ncoeff) = mfcc.shape
-    n = np.arange(ncoeff)
-    lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
-    mfcc *= lift
-    filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
-    mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
-    return mfcc[np.newaxis, :, :]  # Create a batch dimension.
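The new create_tmp_copy_of_file helper is what lets cached example files reach user code without touching the originals: it copies the file into a NamedTemporaryFile whose prefix and suffix mirror the source name, so extension-sniffing code downstream (such as Video.preprocess) keeps working. A small usage sketch (paths illustrative):

import os
from gradio import processing_utils

tmp = processing_utils.create_tmp_copy_of_file("demo/files/video.mp4")
print(tmp.name)  # e.g. /tmp/videoXXXXXXXX.mp4, suffix preserved
assert os.path.exists("demo/files/video.mp4")  # the original is untouched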

View File

@@ -1 +1 @@
-2.3.7b
+2.3.7b1

View File

@@ -5,7 +5,7 @@ except ImportError:
 setup(
     name='gradio',
-    version='2.3.7b',
+    version='2.3.7b1',
     include_package_data=True,
     description='Python library for easily interacting with trained machine learning models',
     author='Abubakar Abid',
@@ -16,7 +16,7 @@ setup(
     keywords=['machine learning', 'visualization', 'reproducibility'],
     install_requires=[
         'numpy',
-        'scipy',
+        'pydub',
         'matplotlib',
         'pandas',
         'pillow',

View File

@@ -2,7 +2,7 @@ import unittest
 import gradio as gr
 import PIL
 import numpy as np
-import scipy
+from pydub import AudioSegment
 import os
 
 class TestTextbox(unittest.TestCase):
@@ -97,7 +97,8 @@ class TestAudio(unittest.TestCase):
     def test_in_interface(self):
         x_wav = gr.test_data.BASE64_AUDIO
 
         def max_amplitude_from_wav_file(wav_file):
-            _, data = scipy.io.wavfile.read(wav_file.name)
+            audio_segment = AudioSegment.from_file(wav_file.name)
+            data = np.array(audio_segment.get_array_of_samples())
             return np.max(data)
 
         iface = gr.Interface(
@@ -111,7 +112,7 @@ class TestFile(unittest.TestCase):
         x_file = {
             "name": "audio.wav",
             "data": gr.test_data.BASE64_AUDIO,
-            "is_local_example": False
+            "is_example": False
         }
 
         def get_size_of_file(file_obj):
             return os.path.getsize(file_obj.name)