diff --git a/build/lib/gradio/inputs.py b/build/lib/gradio/inputs.py
index c003edadbc..0629684f5b 100644
--- a/build/lib/gradio/inputs.py
+++ b/build/lib/gradio/inputs.py
@@ -93,7 +93,7 @@ class Sketchpad(AbstractInput):
         """
         Default preprocessing method for the SketchPad is to convert the sketch to black and white and resize 28x28
         """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         im = im.convert('L')
         if self.invert_colors:
             im = ImageOps.invert(im)
@@ -111,7 +111,7 @@ class Sketchpad(AbstractInput):
         Default rebuild method to decode a base64 image
         """
         inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         timestamp = datetime.datetime.now()
         filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
         im.save(f'{dir}/{filename}', 'PNG')
@@ -135,7 +135,7 @@ class Webcam(AbstractInput):
         """
         Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
         """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         im = im.convert('RGB')
         im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
         array = np.array(im).flatten().reshape(1, self.image_width, self.image_height, self.num_channels)
@@ -146,7 +146,7 @@ class Webcam(AbstractInput):
         Default rebuild method to decode a base64 image
         """
         inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         timestamp = datetime.datetime.now()
         filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
         im.save(f'{dir}/{filename}', 'PNG')
@@ -203,7 +203,7 @@ class ImageUpload(AbstractInput):
         """
         Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
         """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             im = im.convert(self.image_mode)
@@ -222,7 +222,7 @@ class ImageUpload(AbstractInput):
         Default rebuild method to decode a base64 image
         """
         inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         timestamp = datetime.datetime.now()
         filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
         im.save(f'{dir}/{filename}', 'PNG')
diff --git a/build/lib/gradio/outputs.py b/build/lib/gradio/outputs.py
index fa0bd8f003..33e33ed053 100644
--- a/build/lib/gradio/outputs.py
+++ b/build/lib/gradio/outputs.py
@@ -156,7 +156,7 @@ class Image(AbstractOutput):
         Default rebuild method to decode a base64 image
         """
         out = msg['data']['output']
-        im = preprocessing_utils.encoding_to_image(out)
+        im = preprocessing_utils.decode_base64_to_image(out)
         timestamp = datetime.datetime.now()
         filename = f'output_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
         im.save(f'{dir}/{filename}', 'PNG')
diff --git a/gradio/inputs.py b/gradio/inputs.py
index 6e3779326d..16a3b54234 100644
--- a/gradio/inputs.py
+++ b/gradio/inputs.py
@@ -12,6 +12,7 @@ import time
 import warnings
 import json
+
 
 # Where to find the static resources associated with each template.
 BASE_INPUT_INTERFACE_TEMPLATE_PATH = 'templates/input/{}.html'
 BASE_INPUT_INTERFACE_JS_PATH = 'static/js/interfaces/input/{}.js'
@@ -93,12 +94,11 @@ class Sketchpad(AbstractInput):
         """
         Default preprocessing method for the SketchPad is to convert the sketch to black and white and resize 28x28
         """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         im = im.convert('L')
         if self.invert_colors:
             im = ImageOps.invert(im)
         im = im.resize((self.image_width, self.image_height))
-        # im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
         if self.flatten:
             array = np.array(im).flatten().reshape(1, self.image_width * self.image_height)
         else:
@@ -113,7 +113,7 @@ class Sketchpad(AbstractInput):
         Default rebuild method to decode a base64 image
         """
         inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         timestamp = time.time()*1000
         filename = f'input_{timestamp}.png'
         im.save(f'{dir}/{filename}', 'PNG')
@@ -137,7 +137,7 @@ class Webcam(AbstractInput):
         """
         Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
         """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         im = im.convert('RGB')
         im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
         array = np.array(im).flatten().reshape(1, self.image_width, self.image_height, self.num_channels)
@@ -148,7 +148,7 @@ class Webcam(AbstractInput):
         Default rebuild method to decode a base64 image
         """
         inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         timestamp = time.time()*1000
         filename = f'input_{timestamp}.png'
         im.save(f'{dir}/{filename}', 'PNG')
@@ -205,7 +205,7 @@ class ImageUpload(AbstractInput):
         """
         Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
         """
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
             im = im.convert(self.image_mode)
@@ -224,7 +224,7 @@ class ImageUpload(AbstractInput):
         Default rebuild method to decode a base64 image
         """
         inp = msg['data']['input']
-        im = preprocessing_utils.encoding_to_image(inp)
+        im = preprocessing_utils.decode_base64_to_image(inp)
         timestamp = time.time()*1000
         filename = f'input_{timestamp}.png'
         im.save(f'{dir}/{filename}', 'PNG')
@@ -266,9 +266,11 @@ class Microphone(AbstractInput):
 
     def preprocess(self, inp):
         """
-        By default, no pre-processing is applied to a microphone input file (TODO:aliabid94 fix this)
+        Default preprocessing method for the Microphone is to convert the audio into an array of MFCC features
        """
-        return inp
+        file_obj = preprocessing_utils.decode_base64_to_wav_file(inp)
+        mfcc_array = preprocessing_utils.generate_mfcc_features_from_audio_file(file_obj.name)
+        return mfcc_array
 
     def rebuild_flagged(self, dir, msg):
         """
diff --git a/gradio/networking.py b/gradio/networking.py
index cdd530b498..a935d64866 100644
--- a/gradio/networking.py
+++ b/gradio/networking.py
@@ -198,7 +198,7 @@ def serve_files_in_background(interface, port, directory_to_serve=None):
                 self._set_headers()
                 data_string = self.rfile.read(int(self.headers["Content-Length"]))
                 msg = json.loads(data_string)
-                img_orig = preprocessing_utils.encoding_to_image(msg["data"])
+                img_orig = preprocessing_utils.decode_base64_to_image(msg["data"])
                 img_orig = img_orig.convert('RGB')
                 img_orig = img_orig.resize((224, 224))
 
@@ -230,7 +230,7 @@ def serve_files_in_background(interface, port, directory_to_serve=None):
                 self._set_headers()
                 data_string = self.rfile.read(int(self.headers["Content-Length"]))
                 msg = json.loads(data_string)
-                img_orig = preprocessing_utils.encoding_to_image(msg["data"])
+                img_orig = preprocessing_utils.decode_base64_to_image(msg["data"])
                 img_orig = img_orig.convert('RGB')
                 img_orig = img_orig.resize((224, 224))
                 enhancer = ImageEnhance.Brightness(img_orig)
diff --git a/gradio/outputs.py b/gradio/outputs.py
index 95a6d2afd8..b52bd33acf 100644
--- a/gradio/outputs.py
+++ b/gradio/outputs.py
@@ -161,7 +161,7 @@ class Image(AbstractOutput):
         Default rebuild method to decode a base64 image
         """
         out = msg['data']['output']
-        im = preprocessing_utils.encoding_to_image(out)
+        im = preprocessing_utils.decode_base64_to_image(out)
         timestamp = datetime.datetime.now()
         filename = f'output_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
         im.save(f'{dir}/{filename}', 'PNG')
diff --git a/gradio/preprocessing_utils.py b/gradio/preprocessing_utils.py
index 3fc2f4e1b6..bb3025b9cc 100644
--- a/gradio/preprocessing_utils.py
+++ b/gradio/preprocessing_utils.py
@@ -1,13 +1,21 @@
 from PIL import Image
 from io import BytesIO
 import base64
+import tempfile
+import scipy.io.wavfile
+from scipy.fftpack import dct
+import numpy as np
 
 
-def encoding_to_image(encoding):
+#########################
+# IMAGE PRE-PROCESSING
+#########################
+def decode_base64_to_image(encoding):
     content = encoding.split(';')[1]
     image_encoded = content.split(',')[1]
     return Image.open(BytesIO(base64.b64decode(image_encoded)))
 
+
 def resize_and_crop(img, size, crop_type='top'):
     """
     Resize and crop an image to fit the specified size.
@@ -58,3 +66,89 @@ def resize_and_crop(img, size, crop_type='top'):
             Image.ANTIALIAS)
     # If the scale is the same, we do not need to crop
     return img
+
+
+##################
+# AUDIO FILES
+##################
+
+def decode_base64_to_wav_file(encoding):
+    inp = encoding.split(';')[1].split(',')[1]
+    wav_obj = base64.b64decode(inp)
+    file_obj = tempfile.NamedTemporaryFile()
+    file_obj.close()
+    with open(file_obj.name, 'wb') as f:
+        f.write(wav_obj)
+    return file_obj
+
+
+def generate_mfcc_features_from_audio_file(wav_filename,
+                                           pre_emphasis=0.95,
+                                           frame_size=0.025,
+                                           frame_stride=0.01,
+                                           NFFT=512,
+                                           nfilt=40,
+                                           num_ceps=12,
+                                           cep_lifter=22):
+    """
+    Loads and preprocesses a .wav audio file into MFCC coefficients, which are typical model inputs.
+    Adapted from: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
+    :param wav_filename: string name of audio file to process.
+    :param pre_emphasis: a float factor, typically 0.95 or 0.97, which amplifies high frequencies.
+    :param frame_size: a float that is the length, in seconds, of the time frame over which to take the FFT.
+    :param frame_stride: a float that is the offset, in seconds, between consecutive time frames.
+    :param NFFT: the number of points in the short-time FFT for each time frame.
+    :param nfilt: the number of filters on the Mel scale used to extract frequency bands.
+    :param num_ceps: the number of cepstral coefficients to retain.
+    :param cep_lifter: the int factor by which to de-emphasize the higher-frequency coefficients.
+    :return: a numpy array of MFCC coefficients.
+    """
+    sample_rate, signal = scipy.io.wavfile.read(wav_filename)
+    emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
+
+    frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate  # Convert from seconds to samples
+    signal_length = len(emphasized_signal)
+    frame_length = int(round(frame_length))
+    frame_step = int(round(frame_step))
+    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))  # Make sure that we have at least 1 frame
+
+    pad_signal_length = num_frames * frame_step + frame_length
+    z = np.zeros((pad_signal_length - signal_length))
+    pad_signal = np.append(emphasized_signal, z)  # Pad signal so that all frames have an equal number of samples, without truncating any samples from the original signal
+
+    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
+    frames = pad_signal[indices.astype(np.int32, copy=False)]
+
+    frames *= np.hamming(frame_length)
+    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))  # Magnitude of the FFT
+    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))  # Power spectrum
+
+    low_freq_mel = 0
+    high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700))  # Convert Hz to Mel
+    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
+    hz_points = (700 * (10**(mel_points / 2595) - 1))  # Convert Mel to Hz
+    bin = np.floor((NFFT + 1) * hz_points / sample_rate)
+
+    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
+    for m in range(1, nfilt + 1):
+        f_m_minus = int(bin[m - 1])  # left
+        f_m = int(bin[m])  # center
+        f_m_plus = int(bin[m + 1])  # right
+
+        for k in range(f_m_minus, f_m):
+            fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
+        for k in range(f_m, f_m_plus):
+            fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
+    filter_banks = np.dot(pow_frames, fbank.T)
+    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical stability
+    filter_banks = 20 * np.log10(filter_banks)  # dB
+
+    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 0: (num_ceps + 1)]  # Keep cepstral coefficients 0 through num_ceps (13 values by default).
+    (nframes, ncoeff) = mfcc.shape
+    n = np.arange(ncoeff)
+    lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
+    mfcc *= lift
+
+    filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
+    mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
+    return mfcc[np.newaxis, :, :]  # Create a batch dimension.
diff --git a/setup.py b/setup.py
index f1fe0fff0b..5a5d0e286d 100644
--- a/setup.py
+++ b/setup.py
@@ -20,5 +20,6 @@ setup(
         'requests',
         'psutil',
         'paramiko',
+        'scipy',
     ],
 )
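
Note for reviewers: the sketch below shows how the renamed image decoder and the new audio utilities fit together end to end. It is illustrative only, assuming this branch is installed and that a local file named sample.wav exists; the filename and variable names are placeholders, not part of the diff.

    import base64
    from gradio import preprocessing_utils

    # Simulate the data-URI payload that the browser-side Microphone interface sends.
    with open('sample.wav', 'rb') as f:
        audio_uri = 'data:audio/wav;base64,' + base64.b64encode(f.read()).decode('utf-8')

    # Microphone.preprocess now performs these two steps internally:
    wav_file = preprocessing_utils.decode_base64_to_wav_file(audio_uri)
    mfcc = preprocessing_utils.generate_mfcc_features_from_audio_file(wav_file.name)
    print(mfcc.shape)  # (1, num_frames, 13): a batch dimension, then num_ceps + 1 coefficients per frame

    # The image decoder is a pure rename, so existing call sites only change the name:
    # im = preprocessing_utils.decode_base64_to_image(image_data_uri)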