updated preprocessing for images and added preprocessing for audio

This commit is contained in:
Abubakar Abid 2019-06-21 19:02:14 -07:00
parent 8f4486eac8
commit 6d1dfea318
8 changed files with 118 additions and 20 deletions

View File

@ -93,7 +93,7 @@ class Sketchpad(AbstractInput):
"""
Default preprocessing method for the SketchPad is to convert the sketch to black and white and resize 28x28
"""
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
im = im.convert('L')
if self.invert_colors:
im = ImageOps.invert(im)
@ -111,7 +111,7 @@ class Sketchpad(AbstractInput):
Default rebuild method to decode a base64 image
"""
inp = msg['data']['input']
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
timestamp = datetime.datetime.now()
filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
im.save(f'{dir}/{filename}', 'PNG')
@ -135,7 +135,7 @@ class Webcam(AbstractInput):
"""
Default preprocessing method for the webcam is to convert the picture to RGB and resize/crop it to image_width x image_height
"""
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
im = im.convert('RGB')
im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
array = np.array(im).flatten().reshape(1, self.image_width, self.image_height, self.num_channels)
@ -146,7 +146,7 @@ class Webcam(AbstractInput):
Default rebuild method to decode a base64 image
"""
inp = msg['data']['input']
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
timestamp = datetime.datetime.now()
filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
im.save(f'{dir}/{filename}', 'PNG')
@ -203,7 +203,7 @@ class ImageUpload(AbstractInput):
"""
Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
"""
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
im = im.convert(self.image_mode)
@ -222,7 +222,7 @@ class ImageUpload(AbstractInput):
Default rebuild method to decode a base64 image
"""
inp = msg['data']['input']
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
timestamp = datetime.datetime.now()
filename = f'input_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
im.save(f'{dir}/{filename}', 'PNG')

View File

@ -156,7 +156,7 @@ class Image(AbstractOutput):
Default rebuild method to decode a base64 image
"""
out = msg['data']['output']
im = preprocessing_utils.encoding_to_image(out)
im = preprocessing_utils.decode_base64_to_image(out)
timestamp = datetime.datetime.now()
filename = f'output_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
im.save(f'{dir}/{filename}', 'PNG')

View File

@ -12,6 +12,7 @@ import time
import warnings
import json
# Where to find the static resources associated with each template.
BASE_INPUT_INTERFACE_TEMPLATE_PATH = 'templates/input/{}.html'
BASE_INPUT_INTERFACE_JS_PATH = 'static/js/interfaces/input/{}.js'
@ -93,12 +94,11 @@ class Sketchpad(AbstractInput):
"""
Default preprocessing method for the SketchPad is to convert the sketch to black and white and resize 28x28
"""
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
im = im.convert('L')
if self.invert_colors:
im = ImageOps.invert(im)
im = im.resize((self.image_width, self.image_height))
# im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
if self.flatten:
array = np.array(im).flatten().reshape(1, self.image_width * self.image_height)
else:
@ -113,7 +113,7 @@ class Sketchpad(AbstractInput):
Default rebuild method to decode a base64 image
"""
inp = msg['data']['input']
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
timestamp = time.time()*1000
filename = f'input_{timestamp}.png'
im.save(f'{dir}/{filename}', 'PNG')
@ -137,7 +137,7 @@ class Webcam(AbstractInput):
"""
Default preprocessing method for the webcam is to convert the picture to RGB and resize/crop it to image_width x image_height
"""
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
im = im.convert('RGB')
im = preprocessing_utils.resize_and_crop(im, (self.image_width, self.image_height))
array = np.array(im).flatten().reshape(1, self.image_width, self.image_height, self.num_channels)
@ -148,7 +148,7 @@ class Webcam(AbstractInput):
Default rebuild method to decode a base64 image
"""
inp = msg['data']['input']
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
timestamp = time.time()*1000
filename = f'input_{timestamp}.png'
im.save(f'{dir}/{filename}', 'PNG')
@ -205,7 +205,7 @@ class ImageUpload(AbstractInput):
"""
Default preprocessing method for is to convert the picture to black and white and resize to be 48x48
"""
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
im = im.convert(self.image_mode)
@ -224,7 +224,7 @@ class ImageUpload(AbstractInput):
Default rebuild method to decode a base64 image
"""
inp = msg['data']['input']
im = preprocessing_utils.encoding_to_image(inp)
im = preprocessing_utils.decode_base64_to_image(inp)
timestamp = time.time()*1000
filename = f'input_{timestamp}.png'
im.save(f'{dir}/{filename}', 'PNG')
@ -266,9 +266,11 @@ class Microphone(AbstractInput):
def preprocess(self, inp):
    """
    Default preprocessing for a microphone input: decode the base64-encoded recording into a temporary file
    and convert that file into MFCC features.
    :param inp: the encoded recording, passed unchanged to preprocessing_utils.decode_base64_to_wav_file
        (presumably a base64 data URL sent by the front end -- confirm against the caller).
    :return: the array returned by preprocessing_utils.generate_mfcc_features_from_audio_file.
    """
    import os  # local import so this method does not depend on the module's import block

    file_obj = preprocessing_utils.decode_base64_to_wav_file(inp)
    try:
        mfcc_array = preprocessing_utils.generate_mfcc_features_from_audio_file(file_obj.name)
    finally:
        # The decoded recording is only needed to compute the features; remove it so that
        # repeated predictions do not accumulate files in the temp directory.
        try:
            os.remove(file_obj.name)
        except OSError:
            # Best-effort cleanup: a file that is already gone must not mask the real result/error.
            pass
    return mfcc_array
def rebuild_flagged(self, dir, msg):
"""

View File

@ -136,6 +136,7 @@ class Interface:
Method that calls the relevant method of the model object to make a prediction.
:param preprocessed_input: the preprocessed input returned by the input interface
"""
print(preprocessed_input.shape)
if self.model_type == "sklearn":
return self.model_obj.predict(preprocessed_input)
elif self.model_type == "keras":

View File

@ -198,7 +198,7 @@ def serve_files_in_background(interface, port, directory_to_serve=None):
self._set_headers()
data_string = self.rfile.read(int(self.headers["Content-Length"]))
msg = json.loads(data_string)
img_orig = preprocessing_utils.encoding_to_image(msg["data"])
img_orig = preprocessing_utils.decode_base64_to_image(msg["data"])
img_orig = img_orig.convert('RGB')
img_orig = img_orig.resize((224, 224))
@ -230,7 +230,7 @@ def serve_files_in_background(interface, port, directory_to_serve=None):
self._set_headers()
data_string = self.rfile.read(int(self.headers["Content-Length"]))
msg = json.loads(data_string)
img_orig = preprocessing_utils.encoding_to_image(msg["data"])
img_orig = preprocessing_utils.decode_base64_to_image(msg["data"])
img_orig = img_orig.convert('RGB')
img_orig = img_orig.resize((224, 224))
enhancer = ImageEnhance.Brightness(img_orig)

View File

@ -161,7 +161,7 @@ class Image(AbstractOutput):
Default rebuild method to decode a base64 image
"""
out = msg['data']['output']
im = preprocessing_utils.encoding_to_image(out)
im = preprocessing_utils.decode_base64_to_image(out)
timestamp = datetime.datetime.now()
filename = f'output_{timestamp.strftime("%Y-%m-%d-%H-%M-%S")}.png'
im.save(f'{dir}/{filename}', 'PNG')

View File

@ -1,13 +1,21 @@
from PIL import Image
from io import BytesIO
import base64
import tempfile
import scipy.io.wavfile
from scipy.fftpack import dct
import numpy as np
def encoding_to_image(encoding):
#########################
# IMAGE PRE-PROCESSING
#########################
def decode_base64_to_image(encoding):
    """
    Decode a base64 data URL (e.g. "data:image/png;base64,<payload>") into a PIL image.
    :param encoding: string whose part after the first comma is the base64-encoded image bytes.
    :return: the PIL Image opened from the decoded bytes.
    :raises ValueError: if the string has no comma separating the header from the payload.
    """
    # Split on the first comma only. The header may carry additional ';'-separated
    # parameters (e.g. "data:image/png;charset=utf-8;base64,..."), which the former
    # split(';')[1].split(',')[1] parsing could not handle.
    _header, separator, image_encoded = encoding.partition(',')
    if not separator:
        raise ValueError("expected a data URL of the form 'data:<mime>;base64,<payload>'")
    return Image.open(BytesIO(base64.b64decode(image_encoded)))
def resize_and_crop(img, size, crop_type='top'):
"""
Resize and crop an image to fit the specified size.
@ -58,3 +66,89 @@ def resize_and_crop(img, size, crop_type='top'):
Image.ANTIALIAS)
# If the scale is the same, we do not need to crop
return img
##################
# AUDIO FILES
##################
def decode_base64_to_wav_file(encoding):
    """
    Decode a base64 data URL (e.g. "data:audio/wav;base64,<payload>") and write the bytes to a temporary file.
    :param encoding: string whose part after the first comma is the base64-encoded audio bytes.
    :return: the (already closed) temporary file object; its `.name` attribute is the path of the written file.
        The file is NOT deleted automatically -- the caller should remove it when done.
    :raises ValueError: if the string has no comma separating the header from the payload.
    """
    # Split on the first comma only, so headers with extra ';'-separated parameters still parse.
    _header, separator, payload = encoding.partition(',')
    if not separator:
        raise ValueError("expected a data URL of the form 'data:<mime>;base64,<payload>'")
    wav_bytes = base64.b64decode(payload)
    # delete=False keeps the file on disk after the handle is closed, so the data can be
    # written through the handle that reserved the name. The previous close-then-reopen
    # sequence released the name first, leaving a window in which it could be reused.
    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as file_obj:
        file_obj.write(wav_bytes)
    return file_obj
def generate_mfcc_features_from_audio_file(wav_filename,
                                           pre_emphasis=0.95,
                                           frame_size=0.025,
                                           frame_stride=0.01,
                                           NFFT=512,
                                           nfilt=40,
                                           num_ceps=12,
                                           cep_lifter=22):
    """
    Loads and preprocesses a .wav audio file into mfcc coefficients, the typical inputs to models.
    Adapted from: https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
    :param wav_filename: string name of audio file to process.
    :param pre_emphasis: a float factor, typically 0.95 or 0.97, which amplifies high frequencies.
    :param frame_size: a float that is the length, in seconds, of time frame over which to take the fft.
    :param frame_stride: a float that is the offset, in seconds, between consecutive time frames.
    :param NFFT: The number of points in the short-time fft for each time frame.
    :param nfilt: The number of filters on the Mel-scale to extract frequency bands.
    :param num_ceps: the number of cepstral coefficients to retain (num_ceps + 1 columns are kept).
    :param cep_lifter: the int factor, by which to de-emphasize higher-frequency.
    :return: a numpy array of mfcc coefficients with a leading batch dimension: (1, num_frames, num_ceps + 1).
    """
    sample_rate, signal = scipy.io.wavfile.read(wav_filename)
    signal = np.asarray(signal, dtype=np.float64)
    if signal.ndim > 1:
        # Multi-channel files are read as (num_samples, num_channels). Downmix to mono;
        # otherwise the appends below would flatten the channels into one interleaved signal.
        signal = signal.mean(axis=1)

    # Pre-emphasis filter: y[t] = x[t] - pre_emphasis * x[t - 1]
    emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])

    # Convert from seconds to samples
    frame_length = int(round(frame_size * sample_rate))
    frame_step = int(round(frame_stride * sample_rate))
    signal_length = len(emphasized_signal)
    # Make sure that we have at least 1 frame (the ceil alone yields 0 when the signal is exactly one frame long)
    num_frames = max(1, int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)))

    # Zero-pad the signal so that every frame has the same number of samples
    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_signal, z)

    # Row i of `indices` holds the sample positions of frame i
    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + \
        np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    frames *= np.hamming(frame_length)

    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))  # Magnitude of the FFT
    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))  # Power Spectrum

    low_freq_mel = 0
    high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700))  # Convert Hz to Mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
    hz_points = (700 * (10**(mel_points / 2595) - 1))  # Convert Mel to Hz
    bin_edges = np.floor((NFFT + 1) * hz_points / sample_rate)  # FFT bin of each filter edge

    # Triangular filters: rise from the left edge to the center, fall from the center to the right edge
    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        f_m_minus = int(bin_edges[m - 1])  # left
        f_m = int(bin_edges[m])  # center
        f_m_plus = int(bin_edges[m + 1])  # right

        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bin_edges[m - 1]) / (bin_edges[m] - bin_edges[m - 1])
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bin_edges[m + 1] - k) / (bin_edges[m + 1] - bin_edges[m])

    filter_banks = np.dot(pow_frames, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability
    filter_banks = 20 * np.log10(filter_banks)  # dB

    # Keep the first num_ceps + 1 cepstral coefficients (13 by default)
    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 0: (num_ceps + 1)]
    (nframes, ncoeff) = mfcc.shape
    n = np.arange(ncoeff)
    lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)  # sinusoidal liftering
    mfcc *= lift

    # Mean-normalize each coefficient across frames
    mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
    return mfcc[np.newaxis, :, :]  # Create a batch dimension.

View File

@ -20,5 +20,6 @@ setup(
'requests',
'psutil',
'paramiko',
'scipy',
],
)