mirror of
https://github.com/gradio-app/gradio.git
synced 2024-11-21 01:01:05 +08:00
updated preprocessing for images and added preprocessing for audio
This commit is contained in:
parent
8f4486eac8
commit
6d1dfea318
@ -93,7 +93,7 @@ class Sketchpad(AbstractInput):
|
||||
"""
|
||||
Default preprocessing method for the SketchPad is to convert the sketch to black and white and resize 28x28
|
||||
"""
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
im = im.convert('L')
|
||||
if self.invert_colors:
|
||||
im = ImageOps.invert(im)
|
||||
@ -111,7 +111,7 @@ class Sketchpad(AbstractInput):
|
||||
Default rebuild method to decode a base64 image
|
||||
"""
|
||||
inp = msg['data']['input']
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
timestamp = datetime.datetime.now()
|
||||
filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
|
||||
im.save(f'{dir}/{filename}', 'PNG')
|
||||
@ -135,7 +135,7 @@ class Webcam(AbstractInput):
|
||||
"""
|
||||
Default preprocessing method for the webcam is to convert the picture to RGB and resize/crop it to image_width x image_height
|
||||
"""
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
im = im.convert('RGB')
|
||||
im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
|
||||
array = np.array(im).flatten().reshape(1, self.image_width, self.image_height, self.num_channels)
|
||||
@ -146,7 +146,7 @@ class Webcam(AbstractInput):
|
||||
Default rebuild method to decode a base64 image
|
||||
"""
|
||||
inp = msg['data']['input']
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
timestamp = datetime.datetime.now()
|
||||
filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
|
||||
im.save(f'{dir}/{filename}', 'PNG')
|
||||
@ -203,7 +203,7 @@ class ImageUpload(AbstractInput):
|
||||
"""
|
||||
Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
|
||||
"""
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
im = im.convert(self.image_mode)
|
||||
@ -222,7 +222,7 @@ class ImageUpload(AbstractInput):
|
||||
Default rebuild method to decode a base64 image
|
||||
"""
|
||||
inp = msg['data']['input']
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
timestamp = datetime.datetime.now()
|
||||
filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
|
||||
im.save(f'{dir}/{filename}', 'PNG')
|
||||
|
@ -156,7 +156,7 @@ class Image(AbstractOutput):
|
||||
Default rebuild method to decode a base64 image
|
||||
"""
|
||||
out = msg['data']['output']
|
||||
im = preprocessing_utils.encoding_to_image(out)
|
||||
im = preprocessing_utils.decode_base64_to_image(out)
|
||||
timestamp = datetime.datetime.now()
|
||||
filename = f'output_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
|
||||
im.save(f'{dir}/{filename}', 'PNG')
|
||||
|
@ -12,6 +12,7 @@ import time
|
||||
import warnings
|
||||
import json
|
||||
|
||||
|
||||
# Where to find the static resources associated with each template.
|
||||
BASE_INPUT_INTERFACE_TEMPLATE_PATH = 'templates/input/{}.html'
|
||||
BASE_INPUT_INTERFACE_JS_PATH = 'static/js/interfaces/input/{}.js'
|
||||
@ -93,12 +94,11 @@ class Sketchpad(AbstractInput):
|
||||
"""
|
||||
Default preprocessing method for the SketchPad is to convert the sketch to black and white and resize 28x28
|
||||
"""
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
im = im.convert('L')
|
||||
if self.invert_colors:
|
||||
im = ImageOps.invert(im)
|
||||
im = im.resize((self.image_width, self.image_height))
|
||||
# im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
|
||||
if self.flatten:
|
||||
array = np.array(im).flatten().reshape(1, self.image_width * self.image_height)
|
||||
else:
|
||||
@ -113,7 +113,7 @@ class Sketchpad(AbstractInput):
|
||||
Default rebuild method to decode a base64 image
|
||||
"""
|
||||
inp = msg['data']['input']
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
timestamp = time.time()*1000
|
||||
filename = f'input_{timestamp}.png'
|
||||
im.save(f'{dir}/{filename}', 'PNG')
|
||||
@ -137,7 +137,7 @@ class Webcam(AbstractInput):
|
||||
"""
|
||||
Default preprocessing method for the webcam is to convert the picture to RGB and resize/crop it to image_width x image_height
|
||||
"""
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
im = im.convert('RGB')
|
||||
im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
|
||||
array = np.array(im).flatten().reshape(1, self.image_width, self.image_height, self.num_channels)
|
||||
@ -148,7 +148,7 @@ class Webcam(AbstractInput):
|
||||
Default rebuild method to decode a base64 image
|
||||
"""
|
||||
inp = msg['data']['input']
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
timestamp = time.time()*1000
|
||||
filename = f'input_{timestamp}.png'
|
||||
im.save(f'{dir}/{filename}', 'PNG')
|
||||
@ -205,7 +205,7 @@ class ImageUpload(AbstractInput):
|
||||
"""
|
||||
Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
|
||||
"""
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore")
|
||||
im = im.convert(self.image_mode)
|
||||
@ -224,7 +224,7 @@ class ImageUpload(AbstractInput):
|
||||
Default rebuild method to decode a base64 image
|
||||
"""
|
||||
inp = msg['data']['input']
|
||||
im = preprocessing_utils.encoding_to_image(inp)
|
||||
im = preprocessing_utils.decode_base64_to_image(inp)
|
||||
timestamp = time.time()*1000
|
||||
filename = f'input_{timestamp}.png'
|
||||
im.save(f'{dir}/{filename}', 'PNG')
|
||||
@ -266,9 +266,11 @@ class Microphone(AbstractInput):
|
||||
|
||||
def preprocess(self, inp):
    """
    Default preprocessing method for the microphone: decode the base64-encoded recording into a temporary
    wav file and convert that file into MFCC features.
    :param inp: the encoded recording, passed unchanged to preprocessing_utils.decode_base64_to_wav_file.
    :return: whatever preprocessing_utils.generate_mfcc_features_from_audio_file returns for the decoded file.
    """
    # The previous implementation returned `inp` untouched, which left the MFCC extraction unreachable.
    file_obj = preprocessing_utils.decode_base64_to_wav_file(inp)
    mfcc_array = preprocessing_utils.generate_mfcc_features_from_audio_file(file_obj.name)
    return mfcc_array
|
||||
|
||||
def rebuild_flagged(self, dir, msg):
|
||||
"""
|
||||
|
@ -136,6 +136,7 @@ class Interface:
|
||||
Method that calls the relevant method of the model object to make a prediction.
|
||||
:param preprocessed_input: the preprocessed input returned by the input interface
|
||||
"""
|
||||
print(preprocessed_input.shape)
|
||||
if self.model_type == "sklearn":
|
||||
return self.model_obj.predict(preprocessed_input)
|
||||
elif self.model_type == "keras":
|
||||
|
@ -198,7 +198,7 @@ def serve_files_in_background(interface, port, directory_to_serve=None):
|
||||
self._set_headers()
|
||||
data_string = self.rfile.read(int(self.headers["Content-Length"]))
|
||||
msg = json.loads(data_string)
|
||||
img_orig = preprocessing_utils.encoding_to_image(msg["data"])
|
||||
img_orig = preprocessing_utils.decode_base64_to_image(msg["data"])
|
||||
img_orig = img_orig.convert('RGB')
|
||||
img_orig = img_orig.resize((224, 224))
|
||||
|
||||
@ -230,7 +230,7 @@ def serve_files_in_background(interface, port, directory_to_serve=None):
|
||||
self._set_headers()
|
||||
data_string = self.rfile.read(int(self.headers["Content-Length"]))
|
||||
msg = json.loads(data_string)
|
||||
img_orig = preprocessing_utils.encoding_to_image(msg["data"])
|
||||
img_orig = preprocessing_utils.decode_base64_to_image(msg["data"])
|
||||
img_orig = img_orig.convert('RGB')
|
||||
img_orig = img_orig.resize((224, 224))
|
||||
enhancer = ImageEnhance.Brightness(img_orig)
|
||||
|
@ -161,7 +161,7 @@ class Image(AbstractOutput):
|
||||
Default rebuild method to decode a base64 image
|
||||
"""
|
||||
out = msg['data']['output']
|
||||
im = preprocessing_utils.encoding_to_image(out)
|
||||
im = preprocessing_utils.decode_base64_to_image(out)
|
||||
timestamp = datetime.datetime.now()
|
||||
filename = f'output_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
|
||||
im.save(f'{dir}/{filename}', 'PNG')
|
||||
|
@ -1,13 +1,21 @@
|
||||
from PIL import Image
|
||||
from io import BytesIO
|
||||
import base64
|
||||
import tempfile
|
||||
import scipy.io.wavfile
|
||||
from scipy.fftpack import dct
|
||||
import numpy as np
|
||||
|
||||
|
||||
def encoding_to_image(encoding):
|
||||
#########################
|
||||
# IMAGE PRE-PROCESSING
|
||||
#########################
|
||||
def decode_base64_to_image(encoding):
    """
    Decode a base64-encoded image string into a PIL image.
    :param encoding: string whose second ';'-separated field has the form '<label>,<base64 payload>'
    (e.g. a data URL such as 'data:image/png;base64,<payload>').
    :return: the image opened by PIL from the decoded bytes.
    """
    # Take the field after the first ';', then the part after its first ','.
    image_encoded = encoding.split(';')[1].split(',')[1]
    raw_bytes = base64.b64decode(image_encoded)
    return Image.open(BytesIO(raw_bytes))
|
||||
|
||||
|
||||
def resize_and_crop(img, size, crop_type='top'):
|
||||
"""
|
||||
Resize and crop an image to fit the specified size.
|
||||
@ -58,3 +66,89 @@ def resize_and_crop(img, size, crop_type='top'):
|
||||
Image.ANTIALIAS)
|
||||
# If the scale is the same, we do not need to crop
|
||||
return img
|
||||
|
||||
|
||||
##################
|
||||
# AUDIO FILES
|
||||
##################
|
||||
|
||||
def decode_base64_to_wav_file(encoding):
    """
    Decode a base64-encoded audio string and write the raw bytes to a temporary file.
    :param encoding: string of the form '<header>,<base64 payload>' (e.g. 'data:audio/wav;base64,<payload>').
    Everything up to the first comma is ignored, so headers with extra ';'-separated parameters are accepted.
    :return: the (already closed) temporary file object; its `name` attribute is the path of the written file.
    The file is NOT removed automatically -- the caller is responsible for deleting it when done.
    :raises IndexError: if the string contains no comma separating the header from the payload.
    """
    payload = encoding.split(',', 1)[1]
    wav_bytes = base64.b64decode(payload)
    # delete=False lets us write through the handle we were given and keep the file after closing it.
    # Closing a default NamedTemporaryFile deletes it, and re-creating the same path afterwards is racy.
    file_obj = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    with file_obj:
        file_obj.write(wav_bytes)
    return file_obj
|
||||
|
||||
|
||||
def generate_mfcc_features_from_audio_file(wav_filename,
                                           pre_emphasis=0.95,
                                           frame_size=0.025,
                                           frame_stride=0.01,
                                           NFFT=512,
                                           nfilt=40,
                                           num_ceps=12,
                                           cep_lifter=22):
    """
    Loads and preprocesses a .wav audio file into mfcc coefficients, the typical inputs to models.
    Adapted from: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
    :param wav_filename: string name of audio file to process.
    :param pre_emphasis: a float factor, typically 0.95 or 0.97, which amplifies high frequencies.
    :param frame_size: a float that is the length, in seconds, of time frame over which to take the fft.
    :param frame_stride: a float that is the offset, in seconds, between consecutive time frames.
    :param NFFT: The number of points in the short-time fft for each time frame.
    :param nfilt: The number of filters on the Mel-scale to extract frequency bands.
    :param num_ceps: the number of cepstral coefficients to retain (coefficients 0..num_ceps are kept).
    :param cep_lifter: the int factor, by which to de-emphasize higher-frequency.
    :return: a numpy array of mfcc coefficients with shape (1, num_frames, num_ceps + 1).
    """
    sample_rate, signal = scipy.io.wavfile.read(wav_filename)
    if signal.ndim > 1:
        # Multi-channel audio is read as (samples, channels); mix down to mono so the
        # pre-emphasis and framing below operate on a single 1-D signal.
        signal = signal.mean(axis=1)
    emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

    # Convert frame size / stride from seconds to samples.
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    signal_length = len(emphasized_signal)
    # Make sure that we have at least 1 frame, even when the signal is exactly one frame long.
    num_frames = max(1, int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)))

    # Pad the signal so that all frames have an equal number of samples without
    # truncating any samples from the original signal.
    pad_signal_length = num_frames * frame_step + frame_length
    pad_signal = np.append(emphasized_signal, np.zeros(pad_signal_length - signal_length))

    # Row i of `indices` holds the sample positions that make up frame i.
    frame_starts = np.arange(0, num_frames * frame_step, frame_step)
    indices = np.arange(frame_length)[np.newaxis, :] + frame_starts[:, np.newaxis]
    frames = pad_signal[indices] * np.hamming(frame_length)

    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))  # Magnitude of the FFT
    pow_frames = (1.0 / NFFT) * (mag_frames ** 2)  # Power Spectrum

    low_freq_mel = 0
    high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)  # Convert Hz to Mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # Convert Mel to Hz
    bin_edges = np.floor((NFFT + 1) * hz_points / sample_rate)

    # Triangular filters: filter m rises from bin_edges[m - 1] to bin_edges[m] and falls to bin_edges[m + 1].
    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        left, center, right = bin_edges[m - 1], bin_edges[m], bin_edges[m + 1]
        for k in range(int(left), int(center)):
            fbank[m - 1, k] = (k - left) / (center - left)
        for k in range(int(center), int(right)):
            fbank[m - 1, k] = (right - k) / (right - center)

    filter_banks = np.dot(pow_frames, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability
    filter_banks = 20 * np.log10(filter_banks)  # dB

    # Keep the first num_ceps + 1 cepstral coefficients (13 by default).
    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 0: (num_ceps + 1)]
    n = np.arange(mfcc.shape[1])
    lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
    mfcc *= lift

    # Mean-normalise each coefficient across frames.
    mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
    return mfcc[np.newaxis, :, :]  # Create a batch dimension.
|
||||
|
Loading…
Reference in New Issue
Block a user