# Source code for soundpy.models.template_models

'''The models.template_models module contains functions for building (ideally research-based) models.
'''

import os, sys
import inspect
import pathlib
import numpy as np
# for building and training models
import tensorflow
from tensorflow.keras import applications
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, Conv2DTranspose, \
    LSTM, MaxPooling2D, TimeDistributed
from tensorflow.keras.constraints import max_norm
 
currentdir = os.path.dirname(os.path.abspath(
    inspect.getfile(inspect.currentframe())))
packagedir = os.path.dirname(currentdir)
sys.path.insert(0, packagedir)

import soundpy as pyst

###############################################################################

            
def adjust_layers_cnn(**kwargs):
    '''Removes the last convolutional layer from a set of CNN settings.

    If the number of filters for 'mfcc' or 'fbank' is in the lower range
    (i.e. 13 or so), this causes issues with the default settings of the cnn
    architecture. The architecture was built with at least 40 filters being
    applied during feature extraction. To deal with this problem, the number
    of CNN layers is reduced, one layer per call.

    Parameters
    ----------
    **kwargs : Keyword arguments
        Keyword arguments for soundpy.models.template_models.cnn_classifier.
        Must contain the keys 'feature_maps' and 'kernel_size', both sized
        sequences of equal length.

    Returns
    -------
    settings : dict
        Copy of `kwargs` in which 'feature_maps' and 'kernel_size' have had
        their last item removed. The sequences passed in are not modified.

    Raises
    ------
    KeyError
        If 'feature_maps' or 'kernel_size' is missing from `kwargs`.
    ValueError
        If 'feature_maps' and 'kernel_size' differ in length, or if only one
        (or no) layer is left, so that nothing more can be removed.

    References
    ----------
    https://github.com/pgys/NoIze
    '''
    settings = dict(kwargs)
    feature_maps = settings['feature_maps']
    kernel_size = settings['kernel_size']

    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, which would silently skip this validation.
    if len(feature_maps) != len(kernel_size):
        raise ValueError('Expect `feature_maps` and `kernel_size` to have same length.')

    if len(feature_maps) <= 1:
        raise ValueError('CNN Model cannot be trained with set number of '+\
            'features and feature maps.')

    # Slicing builds new sequences, leaving the caller's objects untouched.
    # Lengths are equal and > 1 here, so both can always be trimmed.
    settings['feature_maps'] = feature_maps[:-1]
    settings['kernel_size'] = kernel_size[:-1]
    return settings
def cnn_classifier(feature_maps = [40, 20, 10],
                   kernel_size = [(3, 3), (3, 3), (3, 3)],
                   strides = 2,
                   activation_layer = 'relu',
                   activation_output = 'softmax',
                   input_shape = (79, 40, 1),
                   num_labels = 3,
                   dense_hidden_units=100,
                   dropout=0.25):
    '''Build a single or multilayer convolutional neural network.

    Parameters
    ----------
    feature_maps : int or list
        The filter or feature map applied to the data. One feature map per
        convolutional neural layer required. For example, a list of length 3
        will result in a three-layer convolutional neural network. A single
        int is treated as a one-layer network.

    kernel_size : tuple or list of tuples
        Must match the number of feature_maps. The size of each corresponding
        feature map. A single int, or a single tuple of ints such as (3, 3),
        is treated as the kernel of a one-layer network.

    strides : int
        Passed as `strides` to every Conv2D layer.

    activation_layer : str
        Activation of the convolutional layers. (default 'relu')

    activation_output : str
        Activation of the final Dense layer. (default 'softmax')

    input_shape : tuple
        The shape of the input

    num_labels : int
        Number of units of the final Dense layer.

    dense_hidden_units : int, optional
        If not None, a Dense layer with this many units is added after the
        convolutional layers.

    dropout : float, optional
        Reduces overfitting. If None, no Dropout layer is added.

    Returns
    -------
    model : tf.keras.Model or None
        Model ready to be compiled. None if the layers could not be reduced
        far enough for the model to be built.

    settings : dict
        Dictionary with relevant settings for model. 'feature_maps' and
        'kernel_size' are stored as lists (copies of the input).

    Raises
    ------
    ValueError
        If `feature_maps` and `kernel_size` do not have the same length.

    Warning
    -------
    If number features are not compatible with number of layers, warning
    raised and layers adjusted. E.g. For lower number of MFCC features this
    will likely be applied if number of layers is greater than 1.

    References
    ----------
    A. Sehgal and N. Kehtarnavaz, "A Convolutional Neural Network Smartphone
    App for Real-Time Voice Activity Detection," in IEEE Access, vol. 6,
    pp. 9017-9026, 2018.
    '''
    # Normalise both arguments to fresh lists. Copying also keeps the mutable
    # default arguments from being handed out to callers through `settings`.
    if isinstance(feature_maps, (int, np.integer)):
        feature_maps = [feature_maps]
    else:
        feature_maps = list(feature_maps)

    if isinstance(kernel_size, (int, np.integer)):
        kernel_size = [kernel_size]
    elif isinstance(kernel_size, tuple) and kernel_size and \
        all(isinstance(dim, (int, np.integer)) for dim in kernel_size):
        # a lone (height, width) tuple describes one kernel, not two layers
        kernel_size = [kernel_size]
    else:
        kernel_size = list(kernel_size)

    # Validate before the `try` below: a ValueError raised in there is taken
    # to mean "too many layers" and would trigger the layer reduction.
    if len(feature_maps) != len(kernel_size):
        raise ValueError('Expect `feature_maps` and `kernel_size` to have same length.')

    settings = dict(feature_maps = feature_maps,
                    kernel_size = kernel_size,
                    strides = strides,
                    activation_layer = activation_layer,
                    activation_output = activation_output,
                    input_shape = input_shape,
                    num_labels = num_labels,
                    dense_hidden_units = dense_hidden_units,
                    dropout = dropout)
    try:
        model = Sequential()
        for i, (featmap, kernel) in enumerate(zip(feature_maps, kernel_size)):
            if i == 0:
                # only the first layer is told the input shape
                model.add(Conv2D(featmap, kernel, strides = strides,
                                 activation = activation_layer,
                                 input_shape = input_shape))
            else:
                model.add(Conv2D(featmap, kernel, strides = strides,
                                 activation = activation_layer))
        if dense_hidden_units is not None:
            model.add(Dense(dense_hidden_units))
        if dropout is not None:
            model.add(Dropout(dropout))
        model.add(Flatten())
        model.add(Dense(num_labels, activation = activation_output))
    except ValueError:
        import warnings
        msg = '\nWARNING: number of layers ({}) incompatible with number'.format(len(feature_maps))+\
            ' of features or filters ({}). Reducing number of layers until'.format(feature_maps[0])+\
            ' model and number features / filters is compatible.'
        warnings.warn(msg)
        try:
            updated_settings = adjust_layers_cnn(**settings)
        except ValueError as e:
            # no layer left to remove: report and hand back no model
            print(e)
            return None, settings
        # retry with one layer less; recursion repeats this until it builds
        model, settings = cnn_classifier(**updated_settings)
        print('Updated number of layers: {}'.format(len(settings['feature_maps'])))
    return model, settings
# TODO: update model based on research
def autoencoder_denoise(input_shape,
                        kernel_size = (3,3),
                        max_norm_value = 2.0,
                        activation_function_layer = 'relu',
                        activation_function_output = 'sigmoid',
                        padding = 'same',
                        kernel_initializer = 'he_uniform'):
    '''Build a simple autoencoder denoiser.

    The network is a Sequential stack of Conv2D(128), Conv2D(32),
    Conv2DTranspose(32), Conv2DTranspose(128) and a final Conv2D(1). Every
    layer uses `kernel_size` and a max-norm kernel constraint.

    Parameters
    ----------
    input_shape : tuple
        Shape of the input data. Given to the first layer only.

    kernel_size : tuple
        Kernel size shared by all five layers.

    max_norm_value : int or float
        Value handed to `max_norm` for the kernel constraint of each layer.

    activation_function_layer : str
        Activation of the four hidden layers.

    activation_function_output : str
        Activation of the final Conv2D layer.

    padding : str
        Padding of the final Conv2D layer only; the hidden layers are
        created without a `padding` argument.

    kernel_initializer : str
        Kernel initializer of the four hidden layers.

    Returns
    -------
    autoencoder : tf.keras.Model
        Model ready to be compiled

    settings : dict
        The arguments this function was called with.

    References
    ----------
    Versloot, Christian (2019, December 19). Creating a Signal Noise Removal
    Autoencoder with Keras. MachineCurve. https://www.machinecurve.com
    '''
    settings = dict(input_shape = input_shape,
                    kernel_size = kernel_size,
                    max_norm_value = max_norm_value,
                    activation_function_layer = activation_function_layer,
                    activation_function_output = activation_function_output,
                    padding = padding,
                    kernel_initializer = kernel_initializer)

    # (layer class, number of filters) for the hidden part, in order
    hidden_stack = ((Conv2D, 128),
                    (Conv2D, 32),
                    (Conv2DTranspose, 32),
                    (Conv2DTranspose, 128))

    autoencoder = Sequential()
    for position, (layer_cls, n_filters) in enumerate(hidden_stack):
        layer_kwargs = dict(kernel_size = kernel_size,
                            kernel_constraint = max_norm(max_norm_value),
                            activation = activation_function_layer,
                            kernel_initializer = kernel_initializer)
        if position == 0:
            # the very first layer declares the input shape
            layer_kwargs['input_shape'] = input_shape
        autoencoder.add(layer_cls(n_filters, **layer_kwargs))

    # single-filter output layer; the only one that receives `padding`
    output_layer = Conv2D(1,
                          kernel_size = kernel_size,
                          kernel_constraint = max_norm(max_norm_value),
                          activation = activation_function_output,
                          padding = padding)
    autoencoder.add(output_layer)
    return autoencoder, settings
def resnet50_classifier(input_shape, num_labels, activation = 'softmax',
                        final_layer_name = 'features'):
    '''Simple image classifier built on top of a ResNet50 base model.

    The ResNet50 base is created with `weights=None` and `include_top=False`,
    i.e. no pretrained weights are loaded here. Its output is flattened and
    fed into a single Dense layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of the input; must have 3 entries, the last one being 3.

    num_labels : int
        Number of units of the final Dense layer.

    activation : str
        Activation of the final Dense layer. (default 'softmax')

    final_layer_name : str
        Name given to the final Dense layer, e.g. for TensorBoard visuals.
        (default 'features')

    Returns
    -------
    model : tf.keras.Model
        Model mapping the ResNet50 input to the Dense predictions.

    settings : dict
        The arguments this function was called with.

    Raises
    ------
    ValueError
        If `input_shape` does not have 3 entries or its last entry is not 3.

    References
    ----------
    Revay, S. & Teschke, M. (2019). Multiclass Language Identification using
    Deep Learning on Spectral Images of Audio Signals. arXiv:1905.04348 [cs.SD]
    '''
    n_dims = len(input_shape)
    if n_dims != 3:
        raise ValueError('ResNet50 expects 3D feature data, not {}D.'.format(n_dims))

    n_channels = input_shape[-1]
    if n_channels != 3:
        raise ValueError('ResNet50 expects 3 channels for RGB values, not {}.'.format(n_channels))

    settings = dict(input_shape = input_shape,
                    num_labels = num_labels,
                    activation = activation,
                    final_layer_name = final_layer_name)

    backbone = applications.resnet50.ResNet50(weights = None,
                                              include_top = False,
                                              input_shape = input_shape)
    flattened = Flatten()(backbone.output)
    # named layer so it is easy to find in TensorBoard visuals
    classify = Dense(num_labels, activation = activation, name = final_layer_name)
    model = Model(inputs = backbone.input, outputs = classify(flattened))
    return model, settings
def cnnlstm_classifier(num_labels, input_shape, lstm_cells,
                       feature_map_filters = 32,
                       kernel_size = (8,4),
                       pool_size = (3,3),
                       dense_hidden_units = 60,
                       activation_layer = 'relu',
                       activation_output = 'softmax',
                       dropout = 0.25):
    '''Build a CNN + stacked LSTM classifier, inspired by the paper below.

    A small CNN (Conv2D, MaxPooling2D, Dropout, Flatten) is wrapped in
    TimeDistributed, followed by two LSTM layers that return sequences, a
    Flatten layer and a final Dense layer.

    Parameters
    ----------
    num_labels : int
        Number of units of the final Dense layer.

    input_shape : tuple
        Input shape given to the TimeDistributed wrapper.

    lstm_cells : int
        Number of units of each of the two LSTM layers.

    feature_map_filters : int
        Number of filters of the Conv2D layer.

    kernel_size : tuple
        Kernel size of the Conv2D layer.

    pool_size : tuple
        Pool size of the MaxPooling2D layer.

    dense_hidden_units : int
        Only stored in `settings`; no layer is built from it in this
        function.

    activation_layer : str
        Activation of the Conv2D layer.

    activation_output : str
        Activation of the final Dense layer.

    dropout : float
        Rate of the Dropout layer inside the CNN.

    Returns
    -------
    model : tf.keras.Model
        The assembled, uncompiled model.

    settings : dict
        The arguments this function was called with.

    References
    ----------
    Kim, Myungjong & Cao, Beiming & An, Kwanghoon & Wang, Jun. (2018).
    Dysarthric Speech Recognition Using Convolutional LSTM Neural Network.
    10.21437/interspeech.2018-2250.
    '''
    # CNN part: applied to every time step through TimeDistributed below
    frame_layers = (
        Conv2D(feature_map_filters, kernel_size = kernel_size,
               activation = activation_layer),
        # no strides given, so Keras' default for the pooling stride applies
        MaxPooling2D(pool_size = pool_size),
        Dropout(dropout),
        Flatten(),
    )
    cnn = Sequential()
    for frame_layer in frame_layers:
        cnn.add(frame_layer)

    # prepare stacked LSTM
    model = Sequential()
    model.add(TimeDistributed(cnn, input_shape = input_shape))
    for _ in range(2):
        model.add(LSTM(lstm_cells, return_sequences = True))
    model.add(Flatten())
    model.add(Dense(num_labels, activation = activation_output))

    settings = {
        'input_shape': input_shape,
        'num_labels': num_labels,
        'kernel_size': kernel_size,
        'pool_size': pool_size,
        'activation_layer': activation_layer,
        'activation_output': activation_output,
        'lstm_cells': lstm_cells,
        'feature_map_filters': feature_map_filters,
        'dense_hidden_units': dense_hidden_units,
        'dropout': dropout,
    }
    return model, settings