updated preprocessing for images and added preprocessing for audio

2024-11-21 01:01:05 +08:00 · 2019-06-21 19:02:14 -07:00 · 2019-06-21 19:02:14 -07:00 · 6d1dfea318
commit 6d1dfea318
parent 8f4486eac8
8 changed files with 118 additions and 20 deletions
--- a/build/lib/gradio/inputs.py
+++ b/build/lib/gradio/inputs.py
@ -93,7 +93,7 @@ class Sketchpad(AbstractInput):
        """
        Default preprocessing method for the SketchPad is to convert the sketch to black and white and resize 28x28
        """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        im = im.convert('L')
        if self.invert_colors:
            im = ImageOps.invert(im)
@ -111,7 +111,7 @@ class Sketchpad(AbstractInput):
        Default rebuild method to decode a base64 image
        """
        inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        timestamp = datetime.datetime.now()
        filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
        im.save(f'{dir}/{filename}', 'PNG')
@ -135,7 +135,7 @@ class Webcam(AbstractInput):
        """
        Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
        """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        im = im.convert('RGB')
        im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
        array = np.array(im).flatten().reshape(1, self.image_width, self.image_height, self.num_channels)
@ -146,7 +146,7 @@ class Webcam(AbstractInput):
        Default rebuild method to decode a base64 image
        """
        inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        timestamp = datetime.datetime.now()
        filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
        im.save(f'{dir}/{filename}', 'PNG')
@ -203,7 +203,7 @@ class ImageUpload(AbstractInput):
        """
        Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
        """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            im = im.convert(self.image_mode)
@ -222,7 +222,7 @@ class ImageUpload(AbstractInput):
        Default rebuild method to decode a base64 image
        """
        inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        timestamp = datetime.datetime.now()
        filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
        im.save(f'{dir}/{filename}', 'PNG')
--- a/build/lib/gradio/outputs.py
+++ b/build/lib/gradio/outputs.py
@ -156,7 +156,7 @@ class Image(AbstractOutput):
        Default rebuild method to decode a base64 image
        """
        out = msg['data']['output']
-        im = preprocessing_utils.encoding_to_image(out)
+        im = preprocessing_utils.decode_base64_to_image(out)
        timestamp = datetime.datetime.now()
        filename = f'output_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
        im.save(f'{dir}/{filename}', 'PNG')
--- a/gradio/inputs.py
+++ b/gradio/inputs.py
@ -12,6 +12,7 @@ import time
 import warnings
 import json

+
 # Where to find the static resources associated with each template.
 BASE_INPUT_INTERFACE_TEMPLATE_PATH = 'templates/input/{}.html'
 BASE_INPUT_INTERFACE_JS_PATH = 'static/js/interfaces/input/{}.js'
@ -93,12 +94,11 @@ class Sketchpad(AbstractInput):
        """
        Default preprocessing method for the SketchPad is to convert the sketch to black and white and resize 28x28
        """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        im = im.convert('L')
        if self.invert_colors:
            im = ImageOps.invert(im)
        im = im.resize((self.image_width, self.image_height))
-        # im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
        if self.flatten:
            array = np.array(im).flatten().reshape(1, self.image_width * self.image_height)
        else:
@ -113,7 +113,7 @@ class Sketchpad(AbstractInput):
        Default rebuild method to decode a base64 image
        """
        inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        timestamp = time.time()*1000
        filename = f'input_{timestamp}.png'
        im.save(f'{dir}/{filename}', 'PNG')
@ -137,7 +137,7 @@ class Webcam(AbstractInput):
        """
        Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
        """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        im = im.convert('RGB')
        im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
        array = np.array(im).flatten().reshape(1, self.image_width, self.image_height, self.num_channels)
@ -148,7 +148,7 @@ class Webcam(AbstractInput):
        Default rebuild method to decode a base64 image
        """
        inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        timestamp = time.time()*1000
        filename = f'input_{timestamp}.png'
        im.save(f'{dir}/{filename}', 'PNG')
@ -205,7 +205,7 @@ class ImageUpload(AbstractInput):
        """
        Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
        """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            im = im.convert(self.image_mode)
@ -224,7 +224,7 @@ class ImageUpload(AbstractInput):
        Default rebuild method to decode a base64 image
        """
        inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
        timestamp = time.time()*1000
        filename = f'input_{timestamp}.png'
        im.save(f'{dir}/{filename}', 'PNG')
@ -266,9 +266,11 @@ class Microphone(AbstractInput):

    def preprocess(self, inp):
        """
-        By default, no pre-processing is applied to a microphone input file (TODO:aliabid94 fix this)
+        Decode the base64-encoded wav input to a temporary file and return the MFCC feature array computed from it.
        """
-        return inp
+        file_obj = preprocessing_utils.decode_base64_to_wav_file(inp)
+        mfcc_array = preprocessing_utils.generate_mfcc_features_from_audio_file(file_obj.name)
+        return mfcc_array

    def rebuild_flagged(self, dir, msg):
        """
--- a/gradio/interface.py
+++ b/gradio/interface.py
@ -136,6 +136,7 @@ class Interface:
        Method that calls the relevant method of the model object to make a prediction.
        :param preprocessed_input: the preprocessed input returned by the input interface
        """
+        print(preprocessed_input.shape)
        if self.model_type == "sklearn":
            return self.model_obj.predict(preprocessed_input)
        elif self.model_type == "keras":
--- a/gradio/networking.py
+++ b/gradio/networking.py
@ -198,7 +198,7 @@ def serve_files_in_background(interface, port, directory_to_serve=None):
                self._set_headers()
                data_string = self.rfile.read(int(self.headers["Content-Length"]))
                msg = json.loads(data_string)
-                img_orig = preprocessing_utils.encoding_to_image(msg["data"])
+                img_orig = preprocessing_utils.decode_base64_to_image(msg["data"])
                img_orig = img_orig.convert('RGB')
                img_orig = img_orig.resize((224, 224))

@ -230,7 +230,7 @@ def serve_files_in_background(interface, port, directory_to_serve=None):
                self._set_headers()
                data_string = self.rfile.read(int(self.headers["Content-Length"]))
                msg = json.loads(data_string)
-                img_orig = preprocessing_utils.encoding_to_image(msg["data"])
+                img_orig = preprocessing_utils.decode_base64_to_image(msg["data"])
                img_orig = img_orig.convert('RGB')
                img_orig = img_orig.resize((224, 224))
                enhancer = ImageEnhance.Brightness(img_orig)
--- a/gradio/outputs.py
+++ b/gradio/outputs.py
@ -161,7 +161,7 @@ class Image(AbstractOutput):
        Default rebuild method to decode a base64 image
        """
        out = msg['data']['output']
-        im = preprocessing_utils.encoding_to_image(out)
+        im = preprocessing_utils.decode_base64_to_image(out)
        timestamp = datetime.datetime.now()
        filename = f'output_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
        im.save(f'{dir}/{filename}', 'PNG')
--- a/gradio/preprocessing_utils.py
+++ b/gradio/preprocessing_utils.py
@ -1,13 +1,21 @@
 from PIL import Image
 from io import BytesIO
 import base64
+import tempfile
+import scipy.io.wavfile
+from scipy.fftpack import dct
+import numpy as np


-def encoding_to_image(encoding):
+#########################
+# IMAGE PRE-PROCESSING
+#########################
+def decode_base64_to_image(encoding):
+    """
+    Decode a base64 string of the form '<header>;base64,<payload>' into a PIL image.
+    :param encoding: string whose payload is the segment after the first ';' and then after the first ','.
+    :return: the object returned by PIL's Image.open on the decoded bytes.
+    """
    content = encoding.split(';')[1]
    image_encoded = content.split(',')[1]
    return Image.open(BytesIO(base64.b64decode(image_encoded)))

+
 def resize_and_crop(img, size, crop_type='top'):
    """
    Resize and crop an image to fit the specified size.
@ -58,3 +66,89 @@ def resize_and_crop(img, size, crop_type='top'):
                         Image.ANTIALIAS)
        # If the scale is the same, we do not need to crop
    return img
+
+
+##################
+# AUDIO FILES
+##################
+
+def decode_base64_to_wav_file(encoding):
+    """
+    Decode a base64 string of the form '<header>;base64,<payload>' and write the bytes to a temporary .wav file.
+    :param encoding: string whose payload is the segment after the first ';' and then after the first ','.
+    :return: the (already closed) temporary file object; its `name` attribute is the path of the written file.
+        The file is NOT removed automatically -- the caller is responsible for deleting it when done.
+    """
+    inp = encoding.split(';')[1].split(',')[1]
+    wav_obj = base64.b64decode(inp)
+    # delete=False keeps the file on disk after it is closed, so the data is written through the handle that
+    # created it instead of closing (and thereby deleting) the file and re-opening the same name afterwards.
+    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as file_obj:
+        file_obj.write(wav_obj)
+    return file_obj
+
+
+def generate_mfcc_features_from_audio_file(wav_filename,
+                                           pre_emphasis=0.95,
+                                           frame_size=0.025,
+                                           frame_stride=0.01,
+                                           NFFT=512,
+                                           nfilt=40,
+                                           num_ceps=12,
+                                           cep_lifter=22):
+    """
+    Loads and preprocesses a .wav audio file into mfcc coefficients, the typical inputs to models.
+    Adapted from: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
+    :param wav_filename: string name of audio file to process.
+    :param pre_emphasis: a float factor, typically 0.95 or 0.97, which amplifies high frequencies.
+    :param frame_size: a float that is the length, in seconds, of time frame over which to take the fft.
+    :param frame_stride: a float that is the offset, in seconds, between consecutive time frames.
+    :param NFFT: The number of points in the short-time fft for each time frame.
+    :param nfilt: The number of filters on the Mel-scale to extract frequency bands.
+    :param num_ceps: the index of the last cepstral coefficient to retain (coefficients 0..num_ceps are kept).
+    :param cep_lifter: the int factor, by which to de-emphasize higher-frequency.
+    :return: a numpy array of mfcc coefficients with shape (1, num_frames, num_ceps + 1).
+    """
+    sample_rate, signal = scipy.io.wavfile.read(wav_filename)
+    if signal.ndim > 1:
+        # Multi-channel audio is read as (samples, channels); average the channels down to a mono signal.
+        signal = signal.mean(axis=1)
+    emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
+
+    # Convert from seconds to samples
+    frame_length = int(round(frame_size * sample_rate))
+    frame_step = int(round(frame_stride * sample_rate))
+    signal_length = len(emphasized_signal)
+    # Make sure that we have at least 1 frame (the ceil alone yields 0 when signal_length == frame_length)
+    num_frames = max(1, int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)))
+
+    pad_signal_length = num_frames * frame_step + frame_length
+    z = np.zeros((pad_signal_length - signal_length))
+    pad_signal = np.append(emphasized_signal, z)  # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
+
+    # Row i of `indices` holds the sample positions of frame i: i * frame_step + [0, frame_length)
+    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
+    frames = pad_signal[indices.astype(np.int32, copy=False)]
+
+    frames *= np.hamming(frame_length)
+    # NOTE(review): np.fft.rfft crops each frame to NFFT samples when frame_length > NFFT (e.g. high sample rates)
+    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))  # Magnitude of the FFT
+    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))  # Power Spectrum
+
+    low_freq_mel = 0
+    high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700))  # Convert Hz to Mel
+    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
+    hz_points = (700 * (10**(mel_points / 2595) - 1))  # Convert Mel to Hz
+    bin_points = np.floor((NFFT + 1) * hz_points / sample_rate)
+
+    # Build nfilt triangular filters: each rises from its left edge to its center and falls to its right edge
+    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
+    for m in range(1, nfilt + 1):
+        f_m_minus = int(bin_points[m - 1])   # left
+        f_m = int(bin_points[m])             # center
+        f_m_plus = int(bin_points[m + 1])    # right
+
+        for k in range(f_m_minus, f_m):
+            fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
+        for k in range(f_m, f_m_plus):
+            fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
+    filter_banks = np.dot(pow_frames, fbank.T)
+    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability
+    filter_banks = 20 * np.log10(filter_banks)  # dB
+
+    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 0: (num_ceps + 1)]  # Keep coefficients 0 through num_ceps (13 by default).
+    ncoeff = mfcc.shape[1]
+    n = np.arange(ncoeff)
+    lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
+    mfcc *= lift
+
+    # Subtract the per-coefficient mean taken over all frames
+    mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
+    return mfcc[np.newaxis, :, :]  # Create a batch dimension.
--- a/setup.py
+++ b/setup.py
@ -20,5 +20,6 @@ setup(
        'requests',
        'psutil',
        'paramiko',
+        'scipy',
    ],
 )