Source code for soundpy.models.template_models

'''The models.template_models module contains functions for building (ideally research-based) models.
'''

import os, sys
import inspect
import pathlib
import numpy as np
# for building and training models
import tensorflow
from tensorflow.keras import applications
from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, Conv2DTranspose, \
    LSTM, MaxPooling2D, TimeDistributed
from tensorflow.keras.constraints import max_norm
 
# Locate this module on disk, derive the directory that contains it and that
# directory's parent, then put the parent at the front of the import search
# path so the `soundpy` import below resolves against it first.
_module_file = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.split(_module_file)[0]
packagedir = os.path.split(currentdir)[0]
sys.path.insert(0, packagedir)

import soundpy as pyst

###############################################################################

            
def adjust_layers_cnn(**kwargs):
    '''Remove the last convolutional layer from a set of CNN settings.

    If the number of filters for 'mfcc' or 'fbank' is in the lower range
    (i.e. 13 or so), this causes issues with the default settings of
    the cnn architecture. The architecture was built with at least 40
    filters being applied during feature extraction. To deal with this
    problem, the number of CNN layers is reduced. Each call drops exactly
    one layer (the last entry of `feature_maps` and of `kernel_size`);
    the caller is expected to call again if the model still cannot be built.

    Parameters
    ----------
    **kwargs : Keyword arguments
        Keyword arguments for soundpy.models.template_models.cnn_classifier.
        Must contain 'feature_maps' and 'kernel_size', both sequences of
        the same length.

    Returns
    -------
    settings : dict
        Copy of `kwargs` with the last item of 'feature_maps' and
        'kernel_size' removed. The sequences passed in are not modified.

    Raises
    ------
    ValueError
        If 'feature_maps' and 'kernel_size' differ in length, or if only one
        layer (or none) is left, so nothing further can be removed.
    KeyError
        If 'feature_maps' or 'kernel_size' is missing from `kwargs`.

    References
    ----------
    https://github.com/pgys/NoIze
    '''
    settings = dict(kwargs)
    feature_maps = settings['feature_maps']
    kernel_size = settings['kernel_size']
    # Explicit check rather than `assert`: assertions are stripped when
    # Python runs with -O, which would silently disable this validation.
    if len(feature_maps) != len(kernel_size):
        raise ValueError('Expect `feature_maps` and `kernel_size` to have same length.')
    if len(feature_maps) <= 1:
        raise ValueError('CNN Model cannot be trained with set number of '+\
            'features and feature maps.')
    # Slicing builds new sequences, leaving the caller's objects untouched.
    settings['feature_maps'] = feature_maps[:-1]
    settings['kernel_size'] = kernel_size[:-1]
    return settings

def cnn_classifier(feature_maps = [40, 20, 10],
                   kernel_size =  [(3, 3), (3, 3), (3, 3)],
                   strides = 2,
                   activation_layer = 'relu',
                   activation_output = 'softmax',
                   input_shape = (79, 40, 1),
                   num_labels = 3,
                   dense_hidden_units=100, dropout=0.25):
    '''Build a single or multilayer convolutional neural network.

    Parameters
    ----------
    feature_maps : int or list
        The filter or feature map applied to the data. One feature map per
        convolutional neural layer required. For example, a list of length
        3 will result in a three-layer convolutional neural network. A bare
        int is treated as a single layer.
    kernel_size : int, tuple or list of tuples
        The size of each corresponding feature map. A list must contain one
        entry per item in `feature_maps`. A bare int or a single tuple of
        ints (e.g. ``(3, 3)``) is treated as one kernel size and applied to
        every layer.
    strides : int
        Stride passed to every convolutional layer. (default 2)
    activation_layer : str
        Activation of the convolutional layers. (default 'relu')
    activation_output : str
        Activation of the final dense layer. (default 'softmax')
    input_shape : tuple
        The shape of the input
    num_labels : int
        Number of units in the final dense layer. (default 3)
    dense_hidden_units : int, optional
        If not None, a dense layer with this many units is added after the
        convolutional layers.
    dropout : float, optional
        Reduces overfitting. If None, no dropout layer is added.

    Returns
    -------
    model : tf.keras.Model or None
        Model ready to be compiled. None if no model could be built even
        after reducing the number of layers.

    settings : dict
        Dictionary with relevant settings for model. 'feature_maps' and
        'kernel_size' are stored as lists with one entry per layer; these
        are fresh lists, not the objects passed in.

    Raises
    ------
    ValueError
        If `feature_maps` and `kernel_size` do not describe the same number
        of layers.

    Warning
    -------
    If number features are not compatible with number of layers, warning raised and
    layers adjusted. E.g. For lower number of MFCC features this will likely be applied if number
    of layers is greater than 1.

    References
    ----------
    A. Sehgal and N. Kehtarnavaz, "A Convolutional Neural Network
    Smartphone App for Real-Time Voice Activity Detection," in IEEE Access,
    vol. 6, pp. 9017-9026, 2018.
    '''
    # Normalise both arguments to "one entry per layer" lists. Copying also
    # keeps the mutable default arguments from leaking out via `settings`.
    try:
        feature_maps = list(feature_maps)
    except TypeError:
        # Not iterable: a bare int describes a single layer.
        feature_maps = [feature_maps]
    is_single_kernel = isinstance(kernel_size, (int, np.integer)) or (
        isinstance(kernel_size, tuple)
        and len(kernel_size) > 0
        and all(isinstance(dim, (int, np.integer)) for dim in kernel_size))
    if is_single_kernel:
        # One kernel specification: reuse it for every layer.
        kernel_size = [kernel_size] * len(feature_maps)
    else:
        kernel_size = list(kernel_size)
    # Explicit check rather than `assert`, which is stripped under -O. It is
    # done outside the `try` below so that a caller error is not mistaken
    # for a layer/feature incompatibility and "repaired" by dropping layers.
    if len(feature_maps) != len(kernel_size):
        raise ValueError('Expect `feature_maps` and `kernel_size` to have same length.')

    settings = dict(feature_maps = feature_maps,
                    kernel_size = kernel_size,
                    strides = strides,
                    activation_layer = activation_layer,
                    activation_output = activation_output,
                    input_shape = input_shape,
                    num_labels = num_labels,
                    dense_hidden_units = dense_hidden_units,
                    dropout = dropout)
    try:
        model = Sequential()
        for i, (featmap, kernel) in enumerate(zip(feature_maps, kernel_size)):
            conv_kwargs = dict(strides = strides,
                               activation = activation_layer)
            if i == 0:
                # Only the first layer declares the input shape.
                conv_kwargs['input_shape'] = input_shape
            model.add(Conv2D(featmap, kernel, **conv_kwargs))
        # NOTE(review): Dense and Dropout are added before Flatten, exactly
        # as in the previous implementation - confirm this order is intended.
        if dense_hidden_units is not None:
            model.add(Dense(dense_hidden_units))
        if dropout is not None:
            model.add(Dropout(dropout))
        model.add(Flatten())
        model.add(Dense(num_labels, activation = activation_output))
    except ValueError:
        # Building failed; presumably the input is too small for this many
        # strided layers. Retry with the last layer removed.
        import warnings
        msg = '\nWARNING: number of layers ({}) incompatible with number'.format(len(feature_maps))+\
            ' of features or filters ({}). Reducing number of layers until'.format(feature_maps[0])+\
                ' model and number features / filters is compatible.'
        warnings.warn(msg)
        try:
            updated_settings = adjust_layers_cnn(**settings)
        except ValueError as e:
            # No layer left to remove: report and give up.
            print(e)
            return None, settings
        model, settings = cnn_classifier(**updated_settings)
        print('Updated number of layers: {}'.format(len(settings['feature_maps'])))

    return model, settings

# TODO: update model based on research
def autoencoder_denoise(input_shape,
                        kernel_size = (3,3),
                        max_norm_value = 2.0,
                        activation_function_layer = 'relu',
                        activation_function_output = 'sigmoid',
                        padding = 'same',
                        kernel_initializer = 'he_uniform'):
    '''Build a simple convolutional autoencoder for denoising.

    The model stacks two Conv2D layers (128 then 32 filters), two
    Conv2DTranspose layers (32 then 128 filters) and a final single-filter
    Conv2D output layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of the input data.
    kernel_size : tuple
        Kernel size used by every layer. (default (3,3))
    max_norm_value : int or float
        Value given to the max-norm kernel constraint of every layer.
    activation_function_layer : str
        Activation of all layers except the output layer. (default 'relu')
    activation_function_output : str
        Activation of the output layer. (default 'sigmoid')
    padding : str
        Padding of the output layer only; the other layers keep the
        layer's default padding. (default 'same')
    kernel_initializer : str
        Kernel initializer of all layers except the output layer.
        (default 'he_uniform')

    Returns
    -------
    autoencoder : tf.keras.Model
        Model ready to be compiled

    settings : dict
        The arguments this function was called with.

    References
    ----------
    Versloot, Christian (2019, December 19). Creating a Signal Noise Removal
    Autoencoder with Keras. MachineCurve. https://www.machinecurve.com
    '''
    settings = dict(input_shape = input_shape,
                    kernel_size = kernel_size,
                    max_norm_value = max_norm_value,
                    activation_function_layer = activation_function_layer,
                    activation_function_output = activation_function_output,
                    padding = padding,
                    kernel_initializer = kernel_initializer)

    # (layer class, number of filters) for every layer before the output.
    hidden_stack = ((Conv2D, 128),
                    (Conv2D, 32),
                    (Conv2DTranspose, 32),
                    (Conv2DTranspose, 128))

    autoencoder = Sequential()
    for position, (layer_cls, num_filters) in enumerate(hidden_stack):
        # A fresh constraint object is created for each layer.
        layer_kwargs = dict(kernel_size = kernel_size,
                            kernel_constraint = max_norm(max_norm_value),
                            activation = activation_function_layer,
                            kernel_initializer = kernel_initializer)
        if position == 0:
            # Only the first layer declares the input shape.
            layer_kwargs['input_shape'] = input_shape
        autoencoder.add(layer_cls(num_filters, **layer_kwargs))

    # Output layer: one filter, its own activation and padding, and no
    # explicit kernel initializer.
    output_layer = Conv2D(1, kernel_size = kernel_size,
                          kernel_constraint = max_norm(max_norm_value),
                          activation = activation_function_output,
                          padding = padding)
    autoencoder.add(output_layer)
    return autoencoder, settings

def resnet50_classifier(input_shape, num_labels, activation = 'softmax',
                        final_layer_name = 'features'):
    '''Simple image classifier built on top of the ResNet50 architecture.

    The ResNet50 base is created without its top layers and with
    ``weights=None``, i.e. no pretrained weights are loaded here. Its output
    is flattened and fed to a single dense classification layer.

    Parameters
    ----------
    input_shape : tuple
        Shape of the input data. Must have three entries, the last being 3.
    num_labels : int
        Number of units in the final dense layer.
    activation : str
        Activation of the final dense layer. (default 'softmax')
    final_layer_name : str
        Name given to the final dense layer, e.g. for TensorBoard visuals.
        (default 'features')

    Returns
    -------
    model : tf.keras.Model
        The assembled model.

    settings : dict
        The arguments this function was called with.

    Raises
    ------
    ValueError
        If `input_shape` does not have exactly three entries or its last
        entry is not 3.

    References
    ----------
    Revay, S. & Teschke, M. (2019). Multiclass Language Identification using Deep
    Learning on Spectral Images of Audio Signals. arXiv:1905.04348 [cs.SD]
    '''
    num_dims = len(input_shape)
    if num_dims != 3:
        raise ValueError(
            'ResNet50 expects 3D feature data, not {}D.'.format(num_dims))
    num_channels = input_shape[-1]
    if num_channels != 3:
        raise ValueError(
            'ResNet50 expects 3 channels for RGB values, not {}.'.format(
                num_channels))

    backbone = applications.resnet50.ResNet50(weights=None, include_top=False,
                                              input_shape = input_shape)
    flattened = Flatten()(backbone.output)
    # The layer is named so it can be identified in TensorBoard visuals.
    classification_layer = Dense(num_labels,
                                 activation = activation,
                                 name = final_layer_name)
    model = Model(inputs = backbone.input,
                  outputs = classification_layer(flattened))
    settings = dict(input_shape = input_shape,
                    num_labels = num_labels,
                    activation = activation,
                    final_layer_name = final_layer_name)
    return model, settings

def cnnlstm_classifier(num_labels, input_shape, lstm_cells,
                       feature_map_filters = 32, kernel_size = (8,4),
                       pool_size = (3,3), dense_hidden_units = 60,
                       activation_layer = 'relu', activation_output = 'softmax',
                       dropout = 0.25):
    '''Build a CNN + stacked-LSTM classifier, inspired by the paper below.

    A small CNN (convolution, max pooling, dropout, flatten) is wrapped in a
    TimeDistributed layer, followed by two LSTM layers that return
    sequences, a flatten layer and a dense output layer.

    Parameters
    ----------
    num_labels : int
        Number of units in the final dense layer.
    input_shape : tuple
        Input shape given to the TimeDistributed layer.
    lstm_cells : int
        Number of units in each of the two LSTM layers.
    feature_map_filters : int
        Number of filters of the convolutional layer. (default 32)
    kernel_size : tuple
        Kernel size of the convolutional layer. (default (8,4))
    pool_size : tuple
        Pool size of the max pooling layer. (default (3,3))
    dense_hidden_units : int
        Recorded in `settings` only; no layer is currently built from it.
        (default 60)
    activation_layer : str
        Activation of the convolutional layer. (default 'relu')
    activation_output : str
        Activation of the final dense layer. (default 'softmax')
    dropout : float
        Rate of the dropout layer inside the CNN. (default 0.25)

    Returns
    -------
    model : tf.keras.Model
        The assembled model.

    settings : dict
        The arguments this function was called with.

    References
    ----------
    Kim, Myungjong & Cao, Beiming & An, Kwanghoon & Wang, Jun. (2018). Dysarthric Speech Recognition Using Convolutional LSTM Neural Network. 10.21437/interspeech.2018-2250.
    '''
    # CNN applied to every time step by the TimeDistributed wrapper below.
    frame_encoder = Sequential([
        Conv2D(feature_map_filters,
               kernel_size = kernel_size,
               activation = activation_layer),
        # non-overlapping pool_size
        MaxPooling2D(pool_size = pool_size),
        Dropout(dropout),
        Flatten(),
    ])

    # Stacked LSTM on top of the per-step CNN output.
    model = Sequential([
        TimeDistributed(frame_encoder, input_shape = input_shape),
        LSTM(lstm_cells, return_sequences = True),
        LSTM(lstm_cells, return_sequences = True),
        Flatten(),
        Dense(num_labels, activation = activation_output),
    ])

    settings = dict(input_shape = input_shape,
                    num_labels = num_labels,
                    kernel_size = kernel_size,
                    pool_size = pool_size,
                    activation_layer = activation_layer,
                    activation_output = activation_output,
                    lstm_cells = lstm_cells,
                    feature_map_filters = feature_map_filters,
                    dense_hidden_units = dense_hidden_units,
                    dropout = dropout)
    return model, settings