'''Feats module includes functions related to converting audio sample data
to features for analysis, filtering, machine learning, or visualization.
'''
###############################################################################
import os, sys
import inspect
currentdir = os.path.dirname(os.path.abspath(
inspect.getfile(inspect.currentframe())))
packagedir = os.path.dirname(currentdir)
sys.path.insert(0, packagedir)
import collections
import numpy as np
import math
import random
import matplotlib
import librosa
from scipy.signal import hann, hamming
from scipy.fftpack import dct
import pathlib
from python_speech_features import fbank, mfcc, delta
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import soundpy as sp
# TODO Clean up
# stereo sound only for plotting 'signal'; NOT for frequency features.
def plot(feature_matrix, feature_type,
save_pic=False, name4pic=None, energy_scale='power_to_db',
title=None, sr=None, win_size_ms=None, percent_overlap=None,
x_label=None, y_label=None, subprocess=False, overwrite=False):
'''Visualize feature extraction: frames on the x axis, features on the y axis. Uses librosa to scale the data if an energy scale is applied.
Note: multiple channels are only supported if `feature_type` is 'signal'. For other
feature types, the plot will not work as expected.
Parameters
----------
feature_matrix : np.ndarray [shape=(num_samples,), (num_samples, num_channels), or (num_features, num_frames), dtype=np.float]
Matrix of features. If the features are not of type 'signal' and the
shape is 1D, one dimension will be added so they can be plotted with a colormesh.
feature_type : str
Options: 'signal', 'stft', 'mfcc', or 'fbank' features, or
what user would like to name the feature set.
signal: the 1D samples of sound.
STFT: short-time Fourier transform
MFCC: mel frequency cepstral coefficients.
FBANK: mel-log filterbank energies (default 'fbank').
save_pic : bool
True to save image as .png; False to just plot it. If `subprocess` is
True, `save_pic` will automatically be set to True.
name4pic : str, optional
If `save_pic` set to True, the name the image should be saved under.
energy_scale : str, optional
If features need to be adjusted, e.g. from power to decibels.
Default is 'power_to_db'.
title : str, optional
The title for the graph. If None, `feature_type` is used.
sr : int, optional
Useful in plotting the time for features.
win_size_ms : int, float, optional
Useful in plotting the time for features in the frequency domain (e.g.
STFT, FBANK, MFCC features)
percent_overlap : int, float, optional
Useful in plotting the time for features in the frequency domain (e.g.
STFT, FBANK, MFCC features)
x_label : str, optional
The label to be applied to the x axis.
y_label : str, optional
The label to be applied to the y axis.
subprocess : bool
If `subprocess` is True, matplotlib will use backend 'Agg', which only allows plots to be saved.
If `subprocess` is False, the default backend 'TkAgg' will be used, which allows plots to be
generated live as well as saved. The 'Agg' backend is useful if one wants to visualize sound
while a main process is being performed, for example, while a model is being trained.
(default False)
overwrite : bool
If False, if .png file already exists under given name, a date tag will be
added to the .png filename to avoid overwriting the file.
(default False)
Returns
-------
None
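Examples
--------
A minimal sketch, assuming `soundpy` is installed; features are first
extracted, then the plot is saved (not displayed) via the 'Agg' backend:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> noise = np.random.uniform(-1, 1, sr)  # 1 second of noise
>>> fbank = sp.feats.get_feats(noise, sr = sr, feature_type = 'fbank')
>>> sp.feats.plot(fbank, feature_type = 'fbank', sr = sr,
...               win_size_ms = 20, percent_overlap = 0.5,
...               subprocess = True, name4pic = 'fbank_noise')  # doctest: +SKIP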
'''
if not subprocess:
# can show plots
# interferes with training models though
matplotlib.use('TkAgg')
else:
# does not interfere with training models
matplotlib.use('Agg')
if save_pic is False:
import warnings
save_pic = True
if name4pic is None:
location = 'current working directory'
else:
location = name4pic
msg = 'Due to matplotlib using AGG backend, cannot display plot. '+\
'Therefore, the plot will be saved here: {}'.format(location)
warnings.warn(msg)
import matplotlib.pyplot as plt
# ensure real numbers
if feature_matrix.dtype == np.complex64 or feature_matrix.dtype == np.complex128:
feature_matrix = np.abs(feature_matrix)
# features presented via colormesh need 2D format.
if len(feature_matrix.shape) == 1:
feature_matrix = np.expand_dims(feature_matrix, axis=1)
if 'fbank' in feature_type:
axis_feature_label = 'Num Mel Filters'
elif 'mfcc' in feature_type:
axis_feature_label = 'Num Mel Freq Cepstral Coefficients'
elif 'stft' in feature_type:
axis_feature_label = 'Number of frames'
elif 'signal' in feature_type:
axis_feature_label = 'Amplitude'
else:
axis_feature_label = 'Energy'
if energy_scale is None or feature_type == 'signal':
energy_label = 'energy'
energy_scale = None
if energy_scale == 'power_to_db':
feature_matrix = librosa.power_to_db(feature_matrix)
energy_label = 'decibels'
elif energy_scale == 'db_to_power':
feature_matrix = librosa.db_to_power(feature_matrix)
energy_label = 'power'
elif energy_scale == 'amplitude_to_db':
feature_matrix = librosa.amplitude_to_db(feature_matrix)
energy_label = 'decibels'
elif energy_scale == 'db_to_amplitude':
feature_matrix = librosa.db_to_amplitude(feature_matrix)
energy_label = 'amplitude'
plt.clf()
if 'signal' not in feature_type:
x_axis_label = 'Frequency bins'
else:
x_axis_label = 'Samples over time'
if 'signal' in feature_type:
# transpose matrix if the second dimension is larger - probably
# because channels are in the first dimension; channels are expected in the second
if not feature_matrix.shape[0] > feature_matrix.shape[1]:
feature_matrix = feature_matrix.T
if sr is not None:
x_axis_label = 'Time (sec)'
dur_sec = feature_matrix.shape[0] / sr
time_sec = sp.dsp.get_time_points(dur_sec, sr)
for channel in range(feature_matrix.shape[1]):
data = feature_matrix[:,channel]
# overlay the channel data
if len(time_sec) > len(data):
time_sec = time_sec[:len(data)]
elif len(time_sec) < len(data):
data = data[:len(time_sec)]
plt.plot(time_sec, data)
else:
for channel in range(feature_matrix.shape[1]):
data = feature_matrix[:,channel]
# overlay the channel data
plt.plot(data)
x_axis_label += ' across {} channel(s)'.format(feature_matrix.shape[1])
else:
plt.pcolormesh(feature_matrix.T)
plt.colorbar(label=energy_label)
plt.xlabel(x_axis_label)
plt.ylabel(axis_feature_label)
# if feature_matrix has multiple frames, not just one
if feature_matrix.shape[1] > 1 and 'signal' not in feature_type:
if win_size_ms is not None and percent_overlap is not None:
# convert x tick frame indices to seconds using the window shift (win_size_ms scaled by percent_overlap)
plt.xlabel('Time (sec)')
locs, labels = plt.xticks()
if percent_overlap == 0:
new_labels=[str(round(i*0.001*win_size_ms,1)) for i in locs]
else:
new_labels=[str(round(i*0.001*win_size_ms*percent_overlap,1)) for i in locs]
plt.xticks(ticks=locs,labels=new_labels)
else:
plt.xlabel('Number frames')
plt.ylabel('Frequency bins')
if title is None:
plt.title('{} Features'.format(feature_type.upper()))
else:
plt.title(title)
if x_label is not None:
plt.xlabel(x_label)
if y_label is not None:
plt.ylabel(y_label)
if save_pic:
outputname = name4pic or 'visualize{}feats_{}'.format(feature_type.upper(),
sp.utils.get_date())
outputname = sp.utils.string2pathlib(outputname)
if outputname.suffix:
if outputname.suffix != '.png':
# add .png as extension
fname = outputname.name + '.png'
outputname = outputname.parent.joinpath(fname)
else:
fname = outputname.stem + '.png'
outputname = outputname.parent.joinpath(fname)
if not overwrite:
if os.path.exists(outputname):
fname = outputname.stem
fname += '_'+sp.utils.get_date()
outputname = outputname.parent.joinpath(fname+outputname.suffix)
plt.savefig(outputname)
else:
plt.show()
# tested for stereo sound
def plotsound(audiodata, feature_type='fbank', win_size_ms = 20, \
percent_overlap = 0.5, fft_bins = None, num_filters=40, num_mfcc=40, sr=None,\
save_pic=False, name4pic=None, energy_scale='power_to_db', mono=None, real_signal=False, **kwargs):
'''Visualize feature extraction depending on set parameters.
Stereo sound can be graphed. If `feature_type` is 'signal', all channels will be
graphed on the same plot. Otherwise, each channel will be plotted separately.
Parameters
----------
audiodata : str, numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data. Note: in the latter case, `sr`
must be declared.
feature_type : str
Options: 'signal', 'mfcc', or 'fbank' features.
MFCC: mel frequency cepstral
coefficients; FBANK: mel-log filterbank energies (default 'fbank')
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 20)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
num_filters : int
Number of mel-filters to be used when applying mel-scale. For
'fbank' features, 20-128 are common, with 40 being very common.
(default 40)
num_mfcc : int
Number of mel frequency cepstral coefficients. First coefficient
pertains to loudness; 2-13 frequencies relevant for speech; 13-40
for acoustic environment analysis or non-linguistic information.
Note: it is not possible to choose only 2-13 or 13-40; if `num_mfcc`
is set to 40, all 40 coefficients will be included.
(default 40).
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded. (default None)
mono : bool, optional
When loading an audiofile, True will limit number of channels to
one; False will allow more channels to be loaded. (default None,
which results in mono channel loading.)
**kwargs : additional keyword arguments
Keyword arguments for soundpy.feats.plot
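Examples
--------
A rough sketch; 'sound.wav' is a hypothetical pathway to a local wavfile:

>>> import soundpy as sp
>>> sp.feats.plotsound('sound.wav', feature_type = 'signal',
...                    save_pic = True, name4pic = 'sound_signal')  # doctest: +SKIP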
'''
percent_overlap = check_percent_overlap(percent_overlap)
if 'signal' not in feature_type:
if isinstance(audiodata, np.ndarray) and len(audiodata.shape) > 1:
for channel in range(audiodata.shape[1]):
if name4pic is None:
name4pic = '{}_channel_{}'.format(feature_type, channel+1)
else:
name4pic = sp.string2pathlib(name4pic)
name = name4pic.stem
if channel == 0:
name += '_channel{}'.format(channel+1)
else:
name = name[:-1]+'{}'.format(channel+1)
name4pic = name4pic.parent.joinpath(name+name4pic.suffix)
if 'title' not in kwargs:
kwargs['title'] = '{} features\n(channel {})'.format(
feature_type, channel+1)
else:
if channel == 0:
kwargs['title'] += '\n(channel {})'.format(channel+1)
else:
kwargs['title'] = kwargs['title'][:-2]+'{})'.format(
channel+1)
feats = sp.feats.get_feats(
audiodata[:,channel], feature_type=feature_type,
win_size_ms = win_size_ms, percent_overlap = percent_overlap,
fft_bins = fft_bins, num_filters=num_filters, num_mfcc = num_mfcc,
sr=sr, mono = mono, real_signal = real_signal)
sp.feats.plot(
feats, feature_type=feature_type, sr=sr,
save_pic = save_pic, name4pic=name4pic,
energy_scale = energy_scale,
win_size_ms = win_size_ms, percent_overlap = percent_overlap,
**kwargs)
return None
feats = sp.feats.get_feats(audiodata, feature_type=feature_type,
win_size_ms = win_size_ms, percent_overlap = percent_overlap,
fft_bins = fft_bins, num_filters=num_filters, num_mfcc = num_mfcc,
sr=sr, mono = mono, real_signal = real_signal)
sp.feats.plot(feats, feature_type=feature_type, sr=sr,
save_pic = save_pic, name4pic=name4pic, energy_scale = energy_scale,
win_size_ms = win_size_ms, percent_overlap = percent_overlap,
**kwargs)
# stereo sound with mono (True/False) works for 'signal' data
# only mono for frequency features
def get_feats(sound,
sr = None,
feature_type = 'fbank',
win_size_ms = 20,
percent_overlap = 0.5,
window = 'hann',
fft_bins = None,
num_filters = None,
num_mfcc = None,
remove_first_coefficient = False,
sinosoidal_liftering = False,
dur_sec = None,
mono = None,
rate_of_change = False,
rate_of_acceleration = False,
subtract_mean = False,
real_signal = True,
fmin = None,
fmax = None,
zeropad = True):
'''Collects raw signal data, stft, fbank, or mfcc features.
Parameters
----------
sound : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data. Note: in the latter case, `sr`
must be declared.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded. (default None)
feature_type : str
Options include 'signal', 'stft', 'powspec', 'fbank', or 'mfcc' data
(default 'fbank').
signal: energy/amplitude measurements along time
STFT: short-time Fourier transform
powspec : power spectrum (absolute value of stft, squared)
FBANK: mel-log filterbank energies
MFCC: mel frequency cepstral coefficients
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 20)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
window : str or np.ndarray [size (n_fft, )]
The window function to be applied to each window. (Default 'hann')
fft_bins : int
Number of frequency bins to apply in fast Fourier transform. (default None)
num_filters : int
Number of mel-filters to be used when applying mel-scale. For
'fbank' features, 20-128 are common, with 40 being very common. If None, will
be set to 40.
(default None)
num_mfcc : int
Number of mel frequency cepstral coefficients. First coefficient
pertains to loudness; 2-13 frequencies relevant for speech; 13-40
for acoustic environment analysis or non-linguistic information.
If None, will be set to `num_filters` or 40.
(default None).
dur_sec : float, optional
Time in seconds to limit in loading a signal. (default None)
mono: bool, optional
For loading an audiofile, True will result in only one channel of
data being loaded; False will allow additional channels be loaded.
(default None, which results in mono channel data)
rate_of_change : bool
If True, the first derivative of spectral data will be concatenated
to the features.
This is applicable for all feature types except 'signal'.
rate_of_acceleration : bool
If True, the second derivative of spectral data will be concatenated
to the features.
This is applicable for all feature types except 'signal'.
subtract_mean : bool
If True, the mean of each feature column will be subtracted from
each row. This is applicable for all feature types except 'signal'.
Returns
-------
feats : tuple (num_samples, sr) or np.ndarray [size (num_frames, num_filters) dtype=np.float or np.complex]
Feature data. If `feature_type` is 'signal', returns a tuple containing samples and sample rate. Otherwise, returns an np.ndarray of shape (num_frames, num_filters/features).
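Examples
--------
A minimal sketch using generated noise in place of a real recording:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> noise = np.random.uniform(-1, 1, sr)  # 1 second of noise
>>> fbank = sp.feats.get_feats(noise, sr = sr, feature_type = 'fbank',
...                            win_size_ms = 20, percent_overlap = 0.5)
>>> fbank.shape[1]  # default 40 mel filters
40
>>> mfcc = sp.feats.get_feats(noise, sr = sr, feature_type = 'mfcc',
...                           num_mfcc = 13)
>>> mfcc.shape[1]
13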
'''
# load data
if isinstance(sound, str) or isinstance(sound, pathlib.PosixPath):
if mono is None:
mono = True
data, sr = sp.loadsound(sound, sr = sr, dur_sec = dur_sec, mono = mono)
if mono is False and len(data.shape) > 1:
index_samples = np.argmax(data.shape)
index_channels = np.argmin(data.shape)
num_channels = data.shape[index_channels]
# transpose data to be (samples, num_channels) rather than (num_channels, samples)
if index_channels == 0:
data = data.T
# remove additional channel for 'stft', 'fbank' etc. feature
# extraction
if 'signal' not in feature_type and num_channels > 1:
import warnings
warnings.warn('Only one channel is used for {}'.format(feature_type)+\
' feature extraction. Removing extra channels.')
data = data[:,0]
else:
if sr is None:
raise ValueError('No samplerate given. Either provide '+\
'filename or appropriate samplerate.')
data, sr = sound, sr
if len(data.shape) > 1 and 'signal' not in feature_type:
print('Only one channel can be currently used for feature '+\
'extraction. Using the first channel.')
data = data[:,0]
elif len(data.shape) > 1 and mono:
data = data[:,0]
if dur_sec:
data = data[:int(sr*dur_sec)]
if isinstance(data, np.ndarray):
if not np.isfinite(data).all():
raise TypeError('NAN values found in loaded sound samples.')
else:
raise TypeError('Data is type {} and '.format(type(data))+\
'a numpy.ndarray was expected.')
# ensure percent overlap is between 0 and 1
percent_overlap = check_percent_overlap(percent_overlap)
win_shift_ms = win_size_ms - (win_size_ms * percent_overlap)
if win_shift_ms <= 0:
raise ValueError('`percent_overlap` {} is too high. '.format(percent_overlap)+\
'The signal cannot be processed with 0 or negative window shift / hop length.')
if fft_bins is None:
# base on frame length / window: larger windows --> higher freq resolution
fft_bins = int(win_size_ms * sr // 1000)
if 'stft' in feature_type or 'powspec' in feature_type:
feats = sp.feats.get_stft(
sound = data,
sr = sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
real_signal = real_signal,
fft_bins = fft_bins,
window = window,
zeropad = zeropad
)
elif 'fbank' in feature_type:
if num_filters is None:
num_filters = 40
feats = sp.feats.get_fbank(
sound = data,
sr = sr,
num_filters = num_filters,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
real_signal = real_signal,
fft_bins = fft_bins,
window = window,
zeropad = zeropad
)
elif 'mfcc' in feature_type:
if num_mfcc is None:
if num_filters is not None:
num_mfcc = num_filters
else:
num_mfcc = 40
if num_filters is None:
num_filters = 40
feats = sp.feats.get_mfcc(
sound = data,
sr = sr,
num_mfcc = num_mfcc,
remove_first_coefficient = remove_first_coefficient,
sinosoidal_liftering = sinosoidal_liftering,
num_filters = num_filters,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
real_signal = real_signal,
fft_bins = fft_bins,
window = window,
zeropad = zeropad
)
elif 'signal' in feature_type:
if data.dtype == np.complex128 or data.dtype == np.complex64:
import warnings
msg = '\nWARNING: real raw signal features are being generated '+\
'from a STFT matrix.'
warnings.warn(msg)
feats = sp.feats.feats2audio(data, feature_type = 'stft',
sr=sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap)
else:
feats = data
# TODO test difference between python_speech_features and librosa
if not 'signal' in feature_type:
if subtract_mean is True:
feats -= (np.mean(feats, axis=0) + 1e-8)
if rate_of_change is True:
#d1 = delta(feats, N=2)
d1 = librosa.feature.delta(feats.T).T
feats = np.concatenate((feats, d1), axis=1)
if rate_of_acceleration is True:
#d2 = delta(delta(feats, N=2), N=2)
d2 = librosa.feature.delta(feats.T, order=2).T
feats = np.concatenate((feats, d2), axis=1)
if 'powspec' in feature_type:
feats = sp.dsp.calc_power(feats)
return feats
def load_feat_settings(feat_settings_dict):
'''Loads feature settings into named tuple. Sets defaults if not present.
TODO: test with previous version
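Example (a rough sketch; `sp.utils.load_dict` and the filename are
assumptions, standing in for however the settings dict was saved):
>>> settings_dict = sp.utils.load_dict('feat_settings.csv')  # doctest: +SKIP
>>> feat_settings = sp.feats.load_feat_settings(settings_dict)  # doctest: +SKIP
>>> feat_settings.feature_type  # doctest: +SKIP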
'''
FeatureExtractionSettings = collections.namedtuple('FeatureExtractionSettings',
['sr', 'feature_type', 'win_size_ms',
'percent_overlap', 'window', 'dur_sec',
'num_filters','num_mfcc', 'fft_bins',
'remove_first_coefficient','sinosoidal_liftering',
'mono', 'rate_of_change', 'rate_of_acceleration',
'subtract_mean', 'real_signal', 'fmin', 'fmax',
'zeropad', 'input_shape', 'base_shape',
'num_feats'])
# newer version of soundpy: 0.1.0a3
# store get_feats kwargs under `kwargs` key
if 'kwargs' in feat_settings_dict:
kwargs = sp.utils.restore_dictvalue(feat_settings_dict['kwargs'])
feat_settings_dict.update(kwargs)
# if values saved as strings and should be a list or tuple, restore them to original type:
# see `soundpy.utils.restore_dictvalue`
sr = sp.utils.restore_dictvalue(
feat_settings_dict['sr'])
feature_type = sp.utils.restore_dictvalue(
feat_settings_dict['feature_type'])
win_size_ms = sp.utils.restore_dictvalue(
feat_settings_dict['win_size_ms'])
percent_overlap = sp.utils.restore_dictvalue(
feat_settings_dict['percent_overlap'])
try:
window = sp.utils.restore_dictvalue(feat_settings_dict['window'])
except KeyError:
# set default here...
window = 'hann'
dur_sec = sp.utils.restore_dictvalue(
feat_settings_dict['dur_sec'])
try:
num_filters = sp.utils.restore_dictvalue(feat_settings_dict['num_filters'])
except KeyError:
num_filters = None
try:
num_mfcc = sp.utils.restore_dictvalue(feat_settings_dict['num_mfcc'])
except KeyError:
num_mfcc = None
try:
# newer version soundpy 0.1.0v3
fft_bins = sp.utils.restore_dictvalue(feat_settings_dict['fft_bins'])
except KeyError:
# older version soundpy 0.1.0v2
fft_bins = sp.utils.restore_dictvalue(feat_settings_dict['n_fft'])
try:
remove_first_coefficient = sp.utils.restore_dictvalue(
feat_settings_dict['remove_first_coefficient'])
except KeyError:
remove_first_coefficient = False
try:
sinosoidal_liftering = sp.utils.restore_dictvalue(
feat_settings_dict['sinosoidal_liftering'])
except KeyError:
sinosoidal_liftering = False
try:
mono = sp.utils.restore_dictvalue(
feat_settings_dict['mono'])
except KeyError:
mono = True # default setting
try:
rate_of_change = sp.utils.restore_dictvalue(
feat_settings_dict['rate_of_change'])
except KeyError:
rate_of_change = False
try:
rate_of_acceleration = sp.utils.restore_dictvalue(
feat_settings_dict['rate_of_acceleration'])
except KeyError:
rate_of_acceleration = False
try:
subtract_mean = sp.utils.restore_dictvalue(
feat_settings_dict['subtract_mean'])
except KeyError:
subtract_mean = False
try:
real_signal = sp.utils.restore_dictvalue(
feat_settings_dict['real_signal'])
except KeyError:
real_signal = True
try:
fmin = sp.utils.restore_dictvalue(
feat_settings_dict['fmin'])
except KeyError:
fmin = None
try:
fmax = sp.utils.restore_dictvalue(
feat_settings_dict['fmax'])
except KeyError:
fmax = None
try:
zeropad = sp.utils.restore_dictvalue(
feat_settings_dict['zeropad'])
except KeyError:
zeropad = True
try:
# older version of soundpy: 0.1.0a2
input_shape = sp.utils.restore_dictvalue(
feat_settings_dict['input_shape'])
except KeyError:
# newer version of soundpy: 0.1.0a3
input_shape = sp.utils.restore_dictvalue(
feat_settings_dict['feat_model_shape'])
try:
# older version of soundpy: 0.1.0a2
base_shape = sp.utils.restore_dictvalue(
feat_settings_dict['desired_shape'])
except KeyError:
# newer version of soundpy: 0.1.0a3
try:
base_shape = sp.utils.restore_dictvalue(
feat_settings_dict['feat_base_shape'])
except KeyError:
base_shape = input_shape
try:
# older version of soundpy: 0.1.0a2
num_feats = sp.utils.restore_dictvalue(
feat_settings_dict['num_feats'])
except KeyError:
# newer version of soundpy: 0.1.0a3
num_feats = base_shape[-1]
featsettings = FeatureExtractionSettings(
sr = sr,
feature_type = feature_type,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
window = window,
dur_sec = dur_sec,
num_filters = num_filters,
num_mfcc = num_mfcc,
fft_bins = fft_bins,
remove_first_coefficient = remove_first_coefficient,
sinosoidal_liftering = sinosoidal_liftering,
mono = mono,
rate_of_change = rate_of_change,
rate_of_acceleration = rate_of_acceleration,
subtract_mean = subtract_mean,
real_signal = real_signal,
fmin = fmin,
fmax = fmax,
zeropad = zeropad,
input_shape = input_shape,
base_shape = base_shape,
num_feats = num_feats)
return featsettings
# TODO: create class instance where fund freq, vad, etc. can be saved
# allows for more control over fft bins / resolution of each iteration.
def get_stft(sound, sr=22050, win_size_ms = 50, percent_overlap = 0.5,
real_signal = False, fft_bins = 1024,
window = 'hann', zeropad = True, **kwargs):
'''Returns short-time Fourier transform matrix.
This function allows more flexibility in number of `fft_bins` and `real_signal`
settings. Additionally, this does not require the package librosa, making it
a bit easier to manipulate if desired. For an example, see
`soundpy.augment.vtlp`.
Parameters
----------
sound : np.ndarray [shape=(num_samples,) or (num_samples, num_channels)], str, or pathlib.PosixPath
If type np.ndarray, expect raw samples in mono or stereo sound. If type str or
pathlib.PosixPath, expect pathway to audio file.
sr : int
The sample rate of `sound`.
win_size_ms : int, float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
real_signal : bool
If True, only half the FFT spectrum will be used; for real-valued input there
should be no difference, as the FFT of a real signal is symmetrical. If anything,
setting `real_signal` to True may speed up computation.
fft_bins : int
Number of frequency bins to use when applying fast Fourier Transform. (default 1024)
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`.
Returns
-------
stft_matrix : np.ndarray[size=(num_frames, fft_bins)]
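Examples
--------
A minimal sketch with a generated tone:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> t = np.arange(sr) / sr
>>> tone = 0.5 * np.sin(2 * np.pi * 440 * t)  # 1 second 440 Hz tone
>>> stft = sp.feats.get_stft(tone, sr = sr, win_size_ms = 50,
...                          percent_overlap = 0.5, fft_bins = 1024)
>>> stft.shape[1]  # fft_bins // 2 + 1 frequency bins
513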
'''
if isinstance(sound, np.ndarray):
if sound.dtype == np.complex_:
import warnings
msg = '\nWARNING: data provided to `soundpy.feats.get_stft` is already'+\
' a STFT matrix. Returning original data.'
warnings.warn(msg)
return sound
data = sound
else:
data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
assert sr2 == sr
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
num_overlap_samples = int(frame_length * percent_overlap)
num_subframes = sp.dsp.calc_num_subframes(len(data),
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
if fft_bins is None:
fft_bins = int(win_size_ms * sr // 1000)
total_rows = fft_bins // 2 + 1
# if mono, only one channel; otherwise match num channels in sound signal
if sp.dsp.ismono(data):
stft_matrix = sp.dsp.create_empty_matrix(
(num_subframes, total_rows),
complex_vals = True)
else:
stft_matrix = sp.dsp.create_empty_matrix(
(num_subframes, total_rows, data.shape[1]),
complex_vals = True)
section_start = 0
window_frame = sp.dsp.create_window(window, frame_length)
for frame in range(num_subframes):
section = data[section_start:section_start+frame_length]
section = sp.dsp.apply_window(section,
window_frame,
zeropad = zeropad)
section_fft = sp.dsp.calc_fft(section,
real_signal = real_signal,
fft_bins = fft_bins,
)
stft_matrix[frame] = section_fft[:total_rows]
section_start += (frame_length - num_overlap_samples)
return stft_matrix
def get_fbank(sound, sr, num_filters, fmin=None, fmax=None, fft_bins = None, **kwargs):
'''Extract mel-filterbank energy features from audio.
Parameters
----------
sound : np.ndarray [size=(num_samples,) or (num_samples, num_features)], str, or pathlib.PosixPath
Sound in raw samples, a power spectrum, or a short-time-fourier-transform. If type string or pathlib.PosixPath, expect pathway to audio file.
sr : int
The sample rate of `sound`.
num_filters : int
The number of mel-filters to use when extracting mel-filterbank energies.
fmin : int or float, optional
The minimum frequency of interest. If None, will be set to 0. (default None)
fmax : int or float, optional
The maximum frequency of interest. If None, will be set to half of `sr`.
(default None)
fft_bins : int, optional
The number of frequency bins / fast Fourier transform bins used in calculating
the fast Fourier transform. If None, set depending on type of parameter `sound`.
If `sound` is a raw signal or audio pathway, `fft_bins` will be set to 1024;
if `sound` is a STFT or power spectrum, `fft_bins` will be set to 2 * length
of `sound` feature column, or 2 * sound.shape[1].
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.feats.get_stft`.
Returns
-------
fbank : np.ndarray [shape=(num_samples, num_filters)]
The mel-filterbank energies extracted. The number of samples depends on
the parameters applied in `soundpy.feats.get_stft`.
References
----------
Fayek, H. M. (2016). Speech Processing for Machine Learning:
Filter banks, Mel-Frequency Cepstral Coefficients (MFCCs) and What’s In-Between.
Retrieved from:
https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
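Examples
--------
A minimal sketch; raw samples are passed in, so the STFT is computed
internally (keyword arguments travel to `soundpy.feats.get_stft`):

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> noise = np.random.uniform(-1, 1, sr)
>>> fbank = sp.feats.get_fbank(noise, sr = sr, num_filters = 40,
...                            win_size_ms = 25, percent_overlap = 0.5)
>>> fbank.shape[1]
40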
'''
if isinstance(sound, np.ndarray):
if sound.dtype == np.complex64 or sound.dtype == np.complex128:
stft = True
# probably a power spectrum without complex values...
# TODO improve
elif len(sound.shape) > 1 and sound.shape[1] > sound.shape[0]:
stft = True
else:
stft = False
else:
# probably a pathway - load in get_stft
stft = False
if fmin is None:
fmin = 0
if fmax is None:
fmax = sr/2
mel_points = sp.dsp.fbank_filters(fmin, fmax, num_filters = num_filters)
hz_points = sp.dsp.mel_to_hz(mel_points)
if fft_bins is None:
if stft is True:
# assumes number of fft bins is the length of second column
# https://librosa.org/doc/latest/generated/librosa.istft.html?highlight=istf#librosa.istft
fft_bins = (sound.shape[1]-1) * 2
else:
try:
fft_bins = int(kwargs['win_size_ms'] * sr // 1000)
except KeyError:
fft_bins = 512
freq_bins = np.floor((fft_bins + 1) * hz_points / sr)
if stft:
# use number of fft columns in stft as reference
fbank = np.zeros((num_filters, sound.shape[1]))
else:
fbank = np.zeros((num_filters, int(np.floor(fft_bins / 2 + 1))))
for m in range(1, num_filters + 1):
f_m_minus = int(freq_bins[m - 1]) # left
f_m = int(freq_bins[m]) # center
f_m_plus = int(freq_bins[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - freq_bins[m - 1]) / (freq_bins[m] - freq_bins[m -1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (freq_bins[m + 1] - k) / (freq_bins[m + 1] - freq_bins[m])
if stft:
if np.min(sound) < 0:
powspec = sp.dsp.calc_power(sound)
else:
powspec = sound
else:
sound_stft = sp.feats.get_stft(sound, sr=sr, fft_bins = fft_bins, **kwargs)
powspec = sp.dsp.calc_power(sound_stft)
filter_banks = np.dot(powspec, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # numerical stability
return filter_banks
def get_mfcc(sound, sr, num_mfcc, remove_first_coefficient=False,
sinosoidal_liftering = False, **kwargs):
'''Extracts mel-frequency cepstral coefficients from audio.
Parameters
----------
sound : np.ndarray [size=(num_samples,) or (num_samples, num_features)] or str or pathlib.PosixPath
If `sound` is a np.ndarray, expected as raw samples, a power spectrum or a
short-time Fourier transform. If string or pathlib.PosixPath, should be the pathway
to the audio file.
sr : int
The sample rate of the `sound`.
num_mfcc : int
The number of mel-frequency cepstral coefficients
remove_first_coefficient : bool
If True, the first coefficient, representing amplitude or volume of signal, is
removed. Found to sometimes improve automatic speech recognition.
(default False)
sinosoidal_liftering : bool
If True, reduces influence of higher coefficients, found to aid in handling
noise in background in automatic speech recognition. (default False)
**kwargs : additional keyword arguments
Keyword arguments for soundpy.feats.get_fbank()
References
----------
Fayek, H. M. (2016). Speech Processing for Machine Learning: Filter banks, Mel-Frequency Cepstral Coefficients (MFCCs) and What’s In-Between. Retrieved from https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
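Examples
--------
A minimal sketch; keyword arguments travel to `soundpy.feats.get_fbank`:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> noise = np.random.uniform(-1, 1, sr)
>>> mfcc = sp.feats.get_mfcc(noise, sr = sr, num_mfcc = 13,
...                          num_filters = 40, win_size_ms = 25,
...                          percent_overlap = 0.5)
>>> mfcc.shape[1]
13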
'''
fbank = sp.feats.get_fbank(sound, sr=sr, **kwargs)
mfcc = dct(fbank, type=2, axis=1, norm='ortho')
if remove_first_coefficient is True:
mfcc = mfcc[:,1:num_mfcc]
else:
mfcc = mfcc[:,:num_mfcc]
return mfcc
def get_vad_stft(sound, sr=48000, win_size_ms = 50, percent_overlap = 0.5,
real_signal = False, fft_bins = 1024,
window = 'hann', use_beg_ms = 120,
extend_window_ms = 0, energy_thresh = 40,
freq_thresh = 185, sfm_thresh = 5,
zeropad = True, **kwargs):
'''Returns STFT matrix and VAD matrix. STFT matrix contains only VAD sections.
Parameters
----------
sound : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data. Note: in the latter case, `sr`
must be declared.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded. (default None)
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
real_signal : bool
If True, only half the FFT spectrum will be used; for real-valued input there
should be no difference, as the FFT of a real signal is symmetrical. If anything,
setting `real_signal` to True may speed up computation.
fft_bins : int
Number of frequency bins to use when applying fast Fourier Transform. (default 1024)
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
use_beg_ms : int
The amount of time in milliseconds to use from beginning of signal to estimate background
noise.
extend_window_ms : int
The amount of time in milliseconds to pad or extend the identified VAD segments. This
may be useful to include more speech / sound, if desired.
energy_thresh : int
The threshold to set for measuring energy for VAD in the signal. (default 40)
freq_thresh : int
The threshold to set for measuring frequency for VAD in the signal. (default 185)
sfm_thresh : int
The threshold to set for measuring spectral flatness for VAD in the signal. (default 5)
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`
Returns
-------
stft_matrix : np.ndarray [size=(num_frames_vad, fft_bins//2+1), dtype=np.complex_]
The STFT matrix frames where voice activity has been detected.
vad_matrix_extwin : np.ndarray [size=(num_frames,)]
A vector of zeros and ones indicating which frames of the full STFT matrix
contain voice activity (including any extended windows).
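Examples
--------
A rough sketch with synthetic data (real speech at 48 kHz would be more
meaningful); thresholds are left at their defaults:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//2:] = np.random.uniform(-0.5, 0.5, sr//2)  # silence, then noise
>>> stft_vad, vad_matrix = sp.feats.get_vad_stft(sig, sr = sr)  # doctest: +SKIP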
'''
# raise ValueError if percent_overlap is not supported
if percent_overlap != 0 and percent_overlap < 0.5:
raise ValueError('For this VAD function, `percent_overlap` ' +\
'set to {} is not currently supported.\n'.format(percent_overlap) +\
'Suggested to set at either 0 or 0.5')
if percent_overlap > 0.5:
import warnings
msg = '\nWarning: for this VAD function, parameter `percent_overlap` has most success '+\
'when set at 0 or 0.5'
warnings.warn(msg)
# raise warnings if sample rate lower than 44100 Hz
if sr < 44100:
import warnings
msg = '\nWarning: voice-activity-detection works best with sample '+\
'rates above 44100 Hz. Current `sr` set at {}.'.format(sr)
warnings.warn(msg)
if isinstance(sound, np.ndarray):
data = sound.copy()
else:
data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
assert sr2 == sr
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
num_overlap_samples = int(frame_length * percent_overlap)
num_subframes = sp.dsp.calc_num_subframes(len(data),
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
# set number of subframes for extending window
extwin_num_samples = sp.dsp.calc_frame_length(extend_window_ms, sr)
num_win_subframes = sp.dsp.calc_num_subframes(extwin_num_samples,
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
total_rows = fft_bins
if len(data.shape) > 1 and data.shape[1] > 1:
stereo = True
stft_matrix = sp.dsp.create_empty_matrix(
(num_subframes, total_rows, data.shape[1]),
complex_vals = True)
# stereo sound --> average out channels for measuring energy
data_vad = sp.dsp.average_channels(data)
else:
stereo = False
stft_matrix = sp.dsp.create_empty_matrix(
(num_subframes, total_rows),
complex_vals = True)
data_vad = data
vad_matrix, (sr, e, f, sfm) = sp.dsp.vad(data_vad, sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
use_beg_ms = use_beg_ms,
energy_thresh = energy_thresh,
freq_thresh = freq_thresh,
sfm_thresh = sfm_thresh)
vad_matrix_extwin = vad_matrix.copy()
# extend VAD windows where VAD indicated
if extend_window_ms > 0:
for i, row in enumerate(vad_matrix):
if row > 0:
# label samples before VAD as VAD
if i > num_win_subframes:
vad_matrix_extwin[i-num_win_subframes:i] = 1
else:
vad_matrix_extwin[:i] = 1
# label samples after VAD as VAD
if i + num_win_subframes < len(vad_matrix):
vad_matrix_extwin[i:num_win_subframes+i] = 1
else:
vad_matrix_extwin[i:] = 1
section_start = 0
extra_rows = 0
window_frame = sp.dsp.create_window(window, frame_length)
row = 0
for frame in range(num_subframes):
vad = vad_matrix_extwin[frame]
if vad > 0:
section = data[section_start:section_start+frame_length]
section = sp.dsp.apply_window(section,
window_frame,
zeropad = zeropad)
section_fft = sp.dsp.calc_fft(section,
real_signal = real_signal,
fft_bins = total_rows,
)
stft_matrix[row] = section_fft
row += 1
else:
extra_rows += 1
section_start += (frame_length - num_overlap_samples)
if extra_rows > 0:
stft_matrix = stft_matrix[:-extra_rows]
return stft_matrix[:,:fft_bins//2+1], vad_matrix_extwin
def get_stft_clipped(samples, sr, win_size_ms = 50, percent_overlap = 0.5,
extend_window_ms = 0, window = 'hann', zeropad = True, **kwargs):
'''Returns STFT matrix and VAD matrix with beginning and ending silence removed.
Parameters
----------
samples : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded.
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
extend_window_ms : int
The amount of time in milliseconds to pad or extend the identified VAD segments. This
may be useful to include more speech / sound, if desired.
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`.
Returns
-------
stft_speech : np.ndarray [size (num_frames_clipped, fft_bins//2+1)]
The STFT of the `samples` with beginning and ending silences clipped.
vad_matrix : np.ndarray [size (num_frames, )]
A vector with zeros and ones indicating which indices of the full STFT
have voice activity or not.
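Examples
--------
A rough sketch with synthetic data; if no speech is detected, an empty
list is returned in place of the STFT:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//4 : sr//2] = np.random.uniform(-0.5, 0.5, sr//4)
>>> stft_speech, vad_matrix = sp.feats.get_stft_clipped(sig, sr = sr)  # doctest: +SKIP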
'''
stft = sp.feats.get_stft(samples, sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
window = window, zeropad = zeropad)
energy = sp.dsp.get_energy(stft)
energy_mean = sp.dsp.get_energy_mean(energy)
beg_index, beg_speech_found = sp.dsp.sound_index(
energy,energy_mean,start=True)
end_index, end_speech_found = sp.dsp.sound_index(
energy,energy_mean,start=False)
vad_matrix = np.zeros(len(stft))
if beg_speech_found == False or end_speech_found == False:
import warnings
msg = '\nNo speech detected'
warnings.warn(msg)
return [], vad_matrix
if beg_index < end_index:
if extend_window_ms > 0:
extra_samples = sp.dsp.calc_frame_length(extend_window_ms, sr)
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
num_overlap_samples = int(frame_length * percent_overlap)
num_win_subframes = sp.dsp.calc_num_subframes(
extra_samples,
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
beg_index -= num_win_subframes
if beg_index < 0:
beg_index = 0
end_index += num_win_subframes
if end_index > len(vad_matrix):
end_index = len(vad_matrix)
stft_speech = stft[beg_index:end_index]
vad_matrix[beg_index:end_index] = 1
return stft_speech, vad_matrix
return [], vad_matrix
def get_vad_samples(sound, sr=None, win_size_ms = 50, percent_overlap = 0.5,
use_beg_ms = 120, extend_window_ms = 0, energy_thresh = 40,
freq_thresh = 185, sfm_thresh = 5, window = 'hann', zeropad = True,
**kwargs):
'''Returns samples and VAD matrix. Only samples where voice activity was detected are returned.
Parameters
----------
sound : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data. Note: in the latter case, `sr`
must be declared.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded. (default None)
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
use_beg_ms : int
The amount of time in milliseconds to use from beginning of signal to estimate background
noise.
extend_window_ms : int
The amount of time in milliseconds to pad or extend the identified VAD segments. This
may be useful to include more speech / sound, if desired.
energy_thresh : int
The threshold to set for measuring energy for VAD in the signal. (default 40)
freq_thresh : int
The threshold to set for measuring frequency for VAD in the signal. (default 185)
sfm_thresh : int
The threshold to set for measuring spectral flatness for VAD in the signal. (default 5)
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`
Returns
-------
samples_matrix : np.ndarray [size = (num_samples_vad, )]
The samples where voice activity was detected.
vad_matrix_extwin : np.ndarray [size = (num_frames, )]
A vector of zeros and ones indicating the frames / windows of the samples that either
had voice activity or not.
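Examples
--------
A rough sketch, mirroring `soundpy.feats.get_vad_stft` but returning
samples instead of STFT frames:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//2:] = np.random.uniform(-0.5, 0.5, sr//2)  # silence, then noise
>>> samples_vad, vad_matrix = sp.feats.get_vad_samples(sig, sr = sr)  # doctest: +SKIP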
'''
# raise error if percent_overlap is not supported
if percent_overlap != 0 and percent_overlap < 0.5:
raise ValueError('For this VAD function, `percent_overlap` ' +\
'set to {} is not currently supported.\n'.format(percent_overlap) +\
'Suggested to set at either 0 or 0.5')
if percent_overlap > 0.5:
import warnings
msg = '\nWarning: for this VAD function, parameter `percent_overlap` has most success '+\
'when set at 0 or 0.5'
warnings.warn(msg)
# raise warnings if sample rate lower than 44100 Hz
if sr < 44100:
import warnings
msg = '\nWarning: voice-activity-detection works best with sample '+\
'rates above 44100 Hz. Current `sr` set at {}.'.format(sr)
warnings.warn(msg)
if isinstance(sound, np.ndarray):
data = sound
else:
data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
assert sr2 == sr
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
num_overlap_samples = int(frame_length * percent_overlap)
num_subframes = sp.dsp.calc_num_subframes(len(data),
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
# set number of subframes for extending window
extwin_num_samples = sp.dsp.calc_frame_length(extend_window_ms, sr)
num_win_subframes = sp.dsp.calc_num_subframes(extwin_num_samples,
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
samples_matrix = sp.dsp.create_empty_matrix((len(data)),
complex_vals = False)
vad_matrix, (sr, e, f, sfm) = sp.dsp.vad(data, sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
use_beg_ms = use_beg_ms,
energy_thresh = energy_thresh,
freq_thresh = freq_thresh,
sfm_thresh = sfm_thresh)
vad_matrix_extwin = vad_matrix.copy()
# extend VAD windows where VAD was found
if extend_window_ms > 0:
for i, row in enumerate(vad_matrix):
if row > 0:
# label samples before VAD as VAD
if i > num_win_subframes:
vad_matrix_extwin[i-num_win_subframes:i] = 1
else:
vad_matrix_extwin[:i] = 1
# label samples after VAD as VAD
if i + num_win_subframes < len(vad_matrix):
vad_matrix_extwin[i:num_win_subframes+i] = 1
else:
vad_matrix_extwin[i:] = 1
section_start = 0
extra_rows = 0
row = 0
window_frame = sp.dsp.create_window(window, frame_length)
for frame in range(num_subframes):
vad = vad_matrix_extwin[frame]
if vad > 0:
section = data[section_start : section_start + frame_length]
if percent_overlap > 0:
# apply overlap add to signal
section_windowed = sp.dsp.apply_window(section, window_frame, zeropad = zeropad)
samples_matrix[row : row + frame_length] += section_windowed
else:
samples_matrix[row : row + frame_length] += section
row += (frame_length - num_overlap_samples)
else:
extra_rows += frame_length - num_overlap_samples
section_start += (frame_length - num_overlap_samples)
if extra_rows > 0:
samples_matrix = samples_matrix[:-extra_rows]
return samples_matrix, vad_matrix_extwin
def get_samples_clipped(samples, sr, win_size_ms = 50, percent_overlap = 0.5,
extend_window_ms = 0, window = 'hann', zeropad = True, **kwargs):
'''Returns samples and VAD matrix with beginning and ending silence removed.
Parameters
----------
samples : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded.
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
extend_window_ms : int
The amount of time in milliseconds to pad or extend the identified VAD segments. This
may be useful to include more speech / sound, if desired. (default 0)
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`.
Returns
-------
samples_speech : np.ndarray [size (num_samples_clipped, )]
The `samples` with beginning and ending silences clipped.
vad_matrix : np.ndarray [size (num_samples, )]
A vector with zeros and ones indicating which samples of the full signal
have voice activity or not.
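Examples
--------
A rough sketch with synthetic data; silences at the start and end are
removed, sound in the middle is kept:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//4 : sr//2] = np.random.uniform(-0.5, 0.5, sr//4)
>>> clipped, vad_matrix = sp.feats.get_samples_clipped(sig, sr = sr)  # doctest: +SKIP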
'''
if not isinstance(samples, np.ndarray):
samples, sr = sp.loadsound(samples, sr=sr)
stft = sp.feats.get_stft(samples,sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
window = window, zeropad = zeropad)
energy = sp.dsp.get_energy(stft)
energy_mean = sp.dsp.get_energy_mean(energy)
beg = sp.dsp.sound_index(energy,energy_mean,start=True)
end = sp.dsp.sound_index(energy,energy_mean,start=False)
vad_matrix = np.zeros(len(samples))
if beg[1] == False or end[1] == False:
import warnings
msg = 'No speech detected'
warnings.warn(msg)
return [], vad_matrix
perc_start = beg[0]/len(energy)
perc_end = end[0]/len(energy)
sample_start = int(perc_start*len(samples))
sample_end = int(perc_end*len(samples))
if sample_start < sample_end:
if extend_window_ms > 0:
extra_frames = sp.dsp.calc_frame_length(extend_window_ms, sr)
sample_start -= extra_frames
if sample_start < 0:
sample_start = 0
sample_end += extra_frames
if sample_end > len(vad_matrix):
sample_end = len(vad_matrix)
samples_speech = samples[sample_start:sample_end]
vad_matrix[sample_start:sample_end] = 1
return samples_speech, vad_matrix
import warnings
msg = 'No speech detected'
warnings.warn(msg)
return [], vad_matrix
# have applied to stft matrix, looks good
def normalize(data, max_val=None, min_val=None):
'''Normalizes data to be between 0 and 1. Should not be applied to raw sample data.
This is useful if you have predetermined max and min values you want to normalize
new data with. It is helpful when training models on sound features (not raw samples).
Parameters
----------
data : np.ndarray [size=(num_features,) or (num_frames,num_features)]
Data to be normalized.
max_val : int or float, optional
Predetermined maximum value. If None, will use max value
from `data`.
min_val : int or float, optional
Predetermined minimum value. If None, will use min value
from `data`.
Returns
-------
normed_data : np.ndarray [size = (num_features,) or (num_frames, num_features)]
Examples
--------
>>> # using the min and max of a previous dataset:
>>> import numpy as np
>>> np.random.seed(0)
>>> input_samples = np.random.random_sample((5,))
>>> input_samples
array([0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ])
>>> np.random.seed(40)
>>> previous_samples = np.random.random_sample((5,))
>>> previous_samples
array([0.40768703, 0.05536604, 0.78853488, 0.28730518, 0.45035059])
>>> max_prev = np.max(previous_samples)
>>> min_prev = np.min(previous_samples)
>>> output_samples = normalize(input_samples, min_val = min_prev, max_val = max_prev)
>>> output_samples
array([0.67303388, 0.89996095, 0.74661839, 0.66767314, 0.50232462])
'''
if data.dtype == np.complex_:
# take power of absolute value of stft
data = np.abs(data)**2
# add epsilon to avoid division by zero error
eps = 2**-52
if max_val is None:
normed_data = (data - np.min(data)) / (np.max(data) - np.min(data) + eps)
else:
if min_val is None:
min_val = -max_val
normed_data = (data - min_val) / (max_val - min_val + eps)
return normed_data
# checked for stereo sound - works: plots each channel in separate plot
def plot_dom_freq(sound, energy_scale = 'power_to_db', title = None,
save_pic = False, name4pic = None, overwrite = False, **kwargs):
'''Plots the approximate dominant frequency over a STFT plot of a signal.
If `sound` has multiple channels, the VAD for each channel is plotted in its
own plot.
Parameters
----------
sound : np.ndarray [shape=(num_samples,) or (num_samples, num_channels)]
The sound to plot the dominant frequency of.
energy_scale : str
The scale of energy for the plot. Data in the frequency domain is likely in power
and needs to be converted to decibels. (default 'power_to_db')
title : str
The title for the plot. (default None)
**kwargs : additional keyword arguments
Keyword arguments used in both `soundpy.feats.get_stft` and `soundpy.dsp.get_pitch`.
Returns
-------
None
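Examples
--------
A minimal sketch; `sr` (required), `win_size_ms`, and `percent_overlap`
travel via **kwargs:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> t = np.arange(sr) / sr
>>> tone = 0.5 * np.sin(2 * np.pi * 440 * t)
>>> sp.feats.plot_dom_freq(tone, sr = sr, save_pic = True,
...                        name4pic = 'dom_freq_tone')  # doctest: +SKIP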
'''
import matplotlib.pyplot as plt
# ensure numpy array
if not isinstance(sound, np.ndarray):
raise TypeError('Function `soundpy.feats.plot_dom_freq` expects a '+\
'numpy.ndarray, not type {}.'.format(type(sound)))
# ensure sample rate is provided
if 'sr' not in kwargs:
raise ValueError('Function `soundpy.feats.plot_dom_freq` requires the sample rate'+\
' of the provided audio samples. Please provide the sample rate '+\
'under the parameter `sr`.')
# set defaults if not provided
if 'win_size_ms' not in kwargs:
kwargs['win_size_ms'] = 20
if 'percent_overlap' not in kwargs:
kwargs['percent_overlap'] = 0.5
y, sr = sound, kwargs['sr']
if len(y.shape) == 1:
# add channel column
y = y.reshape(y.shape+(1,))
elif y.shape[1] > 11:
import warnings
msg = '\nWARNING: provided `sound` data could be in the wrong format. \n'+\
'Function `soundpy.feats.plot_dom_freq` expects raw sample data. Data '+\
'provided could be a stft, fbank, mfcc matrix or some other data'+\
' format. If plot results do not appear as expected, check data.'
warnings.warn(msg)
if 'mono' in kwargs and kwargs['mono'] is True:
y = y[:,0]
y = y.reshape(y.shape+(1,))
for channel in range(y.shape[1]):
stft_matrix = sp.feats.get_stft(y[:,channel], **kwargs)
# remove complex info for plotting:
power_matrix = sp.dsp.calc_power(stft_matrix)
pitch = sp.dsp.get_pitch(sound, **kwargs)
if energy_scale == 'power_to_db':
db_matrix = librosa.power_to_db(power_matrix)
plt.pcolormesh(db_matrix.T)
# limit the y axis; otherwise y axis goes way too high
axes = plt.gca()
axes.set_ylim([0,db_matrix.shape[1]])
# 'o' markers in yellow; the color keyword overrides the format string
plt.plot(pitch, 'o', color='yellow')
if not title:
title = 'Appx Dominant Frequency'
# adjust title if more than one channel
if y.shape[1] > 1:
if channel == 0:
title += '\n(channel {})'.format(channel+1)
else:
title = title[:-2] + '{})'.format(channel+1)
plt.title(title)
if not save_pic:
plt.show()
# set up name for saving the plot, given channel number and if other files exist
else:
if name4pic is None:
name4pic = 'dom_freq'
if y.shape[1] > 1:
name4pic += '_channel{}'.format(channel+1)
name4pic = sp.utils.string2pathlib(name4pic+'.png')
else:
name4pic = sp.utils.string2pathlib(name4pic)
if y.shape[1] > 1:
if channel == 0:
name = name4pic.stem + '_channel{}'.format(channel+1)
name4pic = name4pic.parent.joinpath(name + name4pic.suffix)
else:
name = name4pic.stem[:-1] + str(channel+1)
name4pic = name4pic.parent.joinpath(name + name4pic.suffix)
if not name4pic.suffix:
name4pic = name4pic.parent.joinpath(name4pic.stem+'.png')
if not overwrite:
if os.path.exists(name4pic):
final_name = name4pic.stem + '_' + sp.utils.get_date()
final_name = name4pic.parent.joinpath(final_name+name4pic.suffix)
else:
final_name = name4pic
else:
final_name = name4pic
plt.savefig(final_name)
# checked for stereo sound - works: plots each channel in separate plot
def plot_vad(sound, energy_scale = 'power_to_db',
title = 'Voice Activity',
use_beg_ms = 120, extend_window_ms=0,
beg_end_clipped = True, save_pic = False, name4pic = None,
overwrite = False, **kwargs):
'''Plots where voice (sound) activity detected on power spectrum.
This either plots immediately or saves the plot at `name4pic`. If `sound`
has multiple channels, the VAD for each channel is plotted in its own plot.
Parameters
----------
sound : np.ndarray [shape=(num_samples,) or (num_samples, num_channels)]
The sound to plot the VAD of.
energy_scale : str
If plotting STFT or power spectrum, will plot it in decibels.
(default 'power_to_db')
title : str
The title of the plot (default 'Voice Activity')
use_beg_ms : int
The amount of noise to use at the beginning of the signal to measure VAD. This
is only applied if `beg_end_clipped` is set to False.
extend_window_ms : int
The number of milliseconds VAD should be padded. This is useful if one wants to
encompass more speech if the VAD is not including all the speech / desired sound.
However, this may capture more noise. (default 0)
beg_end_clipped : bool
If True, just the silences at the beginning and end of the sample will be cut off.
If False, VAD will be checked throughout the sample, not just at the beginning and
end. NOTE: both options have strengths and weaknesses. Checking VAD across
the entire signal (i.e. `beg_end_clipped` set to False) is sometimes unreliable,
not recognizing speech in speech-filled samples. And when set to True, some speech
sounds tend to get ignored ('s', 'x' and other fricatives).
save_pic : bool
If True, the plot will be saved rather than plotted immediately.
name4pic : str
The full pathway and filename to save the picture (as .png file). A file
extension is expected. (default None)
overwrite : bool
If False, a date tag will be added to `name4pic` if `name4pic` already exists.
(default False)
**kwargs : keyword arguments
Additional keyword arguments for `soundpy.feats.get_stft_clipped`,
`soundpy.feats.get_vad_stft`, or `soundpy.dsp.vad`.
Returns
-------
None
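Examples
--------
A rough sketch; 48 kHz is used to avoid the low sample rate warning:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//4 : sr//2] = np.random.uniform(-0.5, 0.5, sr//4)
>>> sp.feats.plot_vad(sig, sr = sr, save_pic = True,
...                   name4pic = 'vad_example')  # doctest: +SKIP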
'''
import matplotlib.pyplot as plt
# ensure numpy array
if not isinstance(sound, np.ndarray):
raise TypeError('Function `soundpy.feats.plot_vad` expects a '+\
'numpy.ndarray, not type {}.'.format(type(sound)))
# ensure sample rate is provided
if 'sr' not in kwargs:
raise ValueError('Function `soundpy.feats.plot_vad` requires sample rate'+\
' of the provided audio samples. Please provide the sample rate '+\
'under the parameter `sr`.')
else:
# ensure sr is at least 44100; otherwise raise warning
# vad does not work as well with lower sample rates
if kwargs['sr'] < 44100:
import warnings
msg = '\nWarning: VAD works best with sample rates at or above '+\
'44100 Hz. To suppress this warning, resample the audio from'+\
' {} Hz to at least 44100 Hz.'.format(kwargs['sr'])
warnings.warn(msg)
# set defaults if not in kwargs
if 'win_size_ms' not in kwargs:
kwargs['win_size_ms'] = 50
if 'percent_overlap' not in kwargs:
kwargs['percent_overlap'] = 0.5
y, sr = sound, kwargs['sr']
if len(y.shape) == 1:
# add channel column
y = y.reshape(y.shape+(1,))
elif y.shape[1] > 11:
import warnings
msg = '\nWARNING: provided `sound` data could be in the wrong format. \n'+\
'Function `soundpy.feats.plot_vad` expects raw sample data. Data '+\
'provided could be a stft, fbank, mfcc matrix or some other data'+\
' format. If plot results do not appear as expected, check data.'
warnings.warn(msg)
if 'mono' in kwargs and kwargs['mono'] is True:
y = y[:,0]
y = y.reshape(y.shape+(1,))
for channel in range(y.shape[1]):
stft_matrix = sp.feats.get_stft(y[:,channel], **kwargs)
if beg_end_clipped:
stft_vad, vad_matrix = sp.feats.get_stft_clipped(y[:,channel],
**kwargs)
else:
#vad_matrix, __ = sp.dsp.vad(y[:,channel], use_beg_ms = use_beg_ms, **kwargs)
stft_vad, vad_matrix = sp.feats.get_vad_stft(y[:,channel],
use_beg_ms = use_beg_ms,
**kwargs)
# extend window of VAD if desired
if extend_window_ms > 0:
frame_length = sp.dsp.calc_frame_length(kwargs['win_size_ms'],
kwargs['sr'])
num_overlap_samples = int(frame_length * kwargs['percent_overlap'])
# set number of subframes for extending window
extwin_num_samples = sp.dsp.calc_frame_length(extend_window_ms, kwargs['sr'])
num_win_subframes = sp.dsp.calc_num_subframes(extwin_num_samples,
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = True)
vad_matrix_extwin = vad_matrix.copy()
for i, row in enumerate(vad_matrix):
if row > 0:
# label samples before VAD as VAD
if i > num_win_subframes:
vad_matrix_extwin[i-num_win_subframes:i] = 1
else:
vad_matrix_extwin[:i] = 1
# label samples after VAD as VAD
if i + num_win_subframes < len(vad_matrix):
vad_matrix_extwin[i:num_win_subframes+i] = 1
else:
vad_matrix_extwin[i:] = 1
vad_matrix = vad_matrix_extwin
# remove complex info for plotting:
power_matrix = sp.dsp.calc_power(stft_matrix)
db_matrix = librosa.power_to_db(power_matrix)
y_axis = db_matrix.shape[1]
if max(vad_matrix) > 0:
vad_matrix = sp.dsp.scalesound(vad_matrix, max_val = y_axis, min_val = 0)
plt.pcolormesh(db_matrix.T)
# limit the y axis; otherwise y axis goes way too high
axes = plt.gca()
axes.set_ylim([0,db_matrix.shape[1]])
color = 'yellow'
linestyle = ':'
plt.plot(vad_matrix, 'ro', color=color)
if not title:
title = 'Voice Activity in Signal'
if beg_end_clipped and 'clipped' not in title:
title += ' (clipped)'
# adjust title if more than one channel
if y.shape[1] > 1:
if channel == 0:
title += '\n(channel {})'.format(channel+1)
else:
title = title[:-2] + '{})'.format(channel+1)
plt.title(title)
if not save_pic:
plt.show()
# set up name for saving the plot, given channel number and if other files exist
else:
if name4pic is None:
name4pic = 'vad'
if beg_end_clipped:
name4pic += '_clipped'
if y.shape[1] > 1:
name4pic += '_channel{}'.format(channel+1)
name4pic = sp.utils.string2pathlib(name4pic+'.png')
else:
name4pic = sp.utils.string2pathlib(name4pic)
if y.shape[1] > 1:
if channel == 0:
name = name4pic.stem + '_channel{}'.format(channel+1)
name4pic = name4pic.parent.joinpath(name + name4pic.suffix)
else:
name = name4pic.stem[:-1] + str(channel+1)
name4pic = name4pic.parent.joinpath(name + name4pic.suffix)
if not name4pic.suffix:
name4pic = name4pic.parent.joinpath(name4pic.stem+'.png')
if not overwrite:
if os.path.exists(name4pic):
final_name = name4pic.stem + '_' + sp.utils.get_date()
final_name = name4pic.parent.joinpath(final_name+name4pic.suffix)
else:
final_name = name4pic
else:
final_name = name4pic
plt.savefig(final_name)
def get_change_acceleration_rate(spectro_data):
'''Gets first and second derivatives of spectral data.
This is useful particularly for speech recognition.
Parameters
----------
spectro_data : np.ndarray [shape = (num_samples, num_features)]
The spectral data (e.g. stft, fbank, or mfcc features) from which to compute deltas.
Returns
-------
delta : np.ndarray [shape = (num_samples, num_features)]
The first order derivative of spectral data. Reflects rate of change in signal.
delta_delta : np.ndarray [shape = (num_samples, num_features)]
The second order derivative of spectral data. Reflects rate of acceleration in signal.
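Examples
--------
A minimal shape check; librosa's `delta` is applied under the hood and by
default requires at least 9 frames:
>>> import numpy as np
>>> spectro_data = np.random.rand(20, 40)
>>> d, dd = get_change_acceleration_rate(spectro_data)
>>> d.shape, dd.shape
((20, 40), (20, 40))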
'''
spectro_data = spectro_data.T
#first derivative = delta (rate of change)
delta = librosa.feature.delta(spectro_data)
#second derivative = delta delta (acceleration changes)
delta_delta = librosa.feature.delta(spectro_data,order=2)
delta = delta.T
delta_delta = delta_delta.T
return delta, delta_delta
def get_mfcc_fbank(samples, feature_type='mfcc', sr=48000, win_size_ms=20,
percent_overlap=0.5, num_filters=40, num_mfcc=40,
fft_bins = None, window_function = None, zeropad = True, **kwargs):
'''Collects fbank or mfcc features via python-speech-features (rather than librosa).
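Examples
--------
A minimal sketch with random samples standing in for real audio. `fft_bins`
is set explicitly here, as not all versions of python_speech_features
calculate it automatically:
>>> import numpy as np
>>> samples = np.random.rand(48000) # 1 second at 48 kHz
>>> feats = get_mfcc_fbank(samples, feature_type='mfcc', sr=48000, fft_bins=1024)
>>> feats.shape[1] # numcep defaults to `num_mfcc` (40)
40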
'''
if samples.dtype == np.complex64 or samples.dtype == np.complex128:
raise TypeError('Function `soundpy.feats.get_mfcc_fbank` only works'+\
' with raw signals, not complex data. Received input of type {}'.format(
samples.dtype))
if window_function is None:
# default for python_speech_features:
def window_function(x): return np.ones((x,))
elif isinstance(window_function, str):
if 'hamming' in window_function:
window_function = hamming
elif 'hann' in window_function:
window_function = hann
else:
# default for python_speech_features:
def window_function(x): return np.ones((x,))
# if a callable was provided, it is used as is
if len(samples)/sr*1000 < win_size_ms:
if zeropad:
samples = sp.dsp.zeropad_sound(samples, win_size_ms * sr / 1000, sr = sr)
else:
win_size_ms = len(samples)/sr*1000
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
percent_overlap = check_percent_overlap(percent_overlap)
window_shift_ms = win_size_ms * percent_overlap
if 'fbank' in feature_type:
feats, energy = fbank(samples,
samplerate = sr,
winlen = win_size_ms * 0.001,
winstep = window_shift_ms * 0.001,
nfilt = num_filters,
nfft = fft_bins,
winfunc = window_function,
**kwargs)
elif 'mfcc' in feature_type:
feats = mfcc(samples,
samplerate = sr,
winlen = win_size_ms * 0.001,
winstep = window_shift_ms * 0.001,
nfilt = num_filters,
numcep = num_mfcc,
nfft = fft_bins,
winfunc = window_function,
**kwargs)
else:
raise ValueError('Function `soundpy.feats.get_mfcc_fbank` expects '+\
'"fbank" or "mfcc" in `feature_type`, not {}'.format(feature_type))
return feats
def zeropad_features(feats, desired_shape, complex_vals = False):
'''Applies zeropadding to a copy of feats.
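Examples
--------
>>> import numpy as np
>>> feats = np.ones((3, 4))
>>> zeropad_features(feats, desired_shape = (5, 6)).shape
(5, 6)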
'''
# to avoid UFuncTypeError:
if np.issubdtype(feats.dtype, np.complexfloating):
complex_vals = True
fts = feats.copy()
if feats.shape != desired_shape:
if complex_vals:
dtype = np.complex128
else:
dtype = np.float64
empty_matrix = np.zeros(desired_shape, dtype = dtype)
try:
if len(desired_shape) == 1:
empty_matrix[:feats.shape[0]] += feats
elif len(desired_shape) == 2:
empty_matrix[:feats.shape[0],
:feats.shape[1]] += feats
elif len(desired_shape) == 3:
empty_matrix[:feats.shape[0],
:feats.shape[1],
:feats.shape[2]] += feats
elif len(desired_shape) == 4:
empty_matrix[:feats.shape[0],
:feats.shape[1],
:feats.shape[2],
:feats.shape[3]] += feats
elif len(desired_shape) == 5:
empty_matrix[:feats.shape[0],
:feats.shape[1],
:feats.shape[2],
:feats.shape[3],
:feats.shape[4]] += feats
else:
raise TypeError('Zeropadding columns requires a matrix with '+\
'a minimum of 1 dimension and maximum of 5 dimensions.')
fts = empty_matrix
except ValueError as e:
print(e)
raise ValueError('The desired shape is smaller than the original shape.'+ \
' No zeropadding necessary.')
except IndexError as e:
print(e)
raise IndexError('The dimensions do not align. Zeropadding '+ \
'expects same number of dimensions.')
assert fts.shape == desired_shape
return fts
def reduce_num_features(feats, desired_shape):
'''Limits the number of features of a copy of feats.
This is useful if you want the features to be a certain size, for
training models for example.
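Examples
--------
Extra rows and columns are clipped off the end:
>>> import numpy as np
>>> feats = np.arange(6).reshape(3, 2)
>>> reduce_num_features(feats, desired_shape = (2, 2))
array([[0, 1],
       [2, 3]])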
'''
fts = feats.copy()
if feats.shape != desired_shape:
empty_matrix = np.zeros(desired_shape, dtype = feats.dtype)
try:
if len(desired_shape) == 1:
empty_matrix += feats[:empty_matrix.shape[0]]
elif len(desired_shape) == 2:
empty_matrix += feats[:empty_matrix.shape[0],
:empty_matrix.shape[1]]
elif len(desired_shape) == 3:
empty_matrix += feats[:empty_matrix.shape[0],
:empty_matrix.shape[1],
:empty_matrix.shape[2]]
elif len(desired_shape) == 4:
empty_matrix += feats[:empty_matrix.shape[0],
:empty_matrix.shape[1],
:empty_matrix.shape[2],
:empty_matrix.shape[3]]
elif len(desired_shape) == 5:
empty_matrix += feats[:empty_matrix.shape[0],
:empty_matrix.shape[1],
:empty_matrix.shape[2],
:empty_matrix.shape[3],
:empty_matrix.shape[4]]
else:
raise TypeError('Reducing items in columns requires a matrix with'+\
' a minimum of 1 dimension and maximum of 5 dimensions.')
fts = empty_matrix
except ValueError as e:
print(e)
raise ValueError('The desired shape is larger than the original shape.'+ \
' Perhaps try zeropadding.')
except IndexError as e:
print(e)
raise IndexError('The dimensions do not align. Reducing features '+ \
'expects the same number of dimensions.')
assert fts.shape == desired_shape
return fts
# TODO remove warning for 'operands could not be broadcast together with shapes..'
# TODO test
def adjust_shape(data, desired_shape, change_dims = None, complex_vals = None):
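'''Zeropads and/or clips a copy of `data` to match `desired_shape`.
Dimensions that are too small are zeropadded and dimensions that are too
large are clipped. If `desired_shape` has more dimensions than `data`,
dimensions of length 1 may be added at the beginning or end.
Examples
--------
The first dimension is clipped, the second zeropadded:
>>> import numpy as np
>>> data = np.ones((3, 4))
>>> adjust_shape(data, (2, 6)).shape
(2, 6)
'''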
try:
if change_dims is not None:
raise DeprecationWarning('\nWARNING: Function `soundpy.feats.adjust_shape` will not '+\
'use the parameter `change_dims` in future versions. \nIf extra dimensions '+\
'of length 1 are to be added to the `data`, this will be completed. '+\
'However extra dims of greater length are not covered in this function.')
except DeprecationWarning as e:
print(e)
try:
if complex_vals is not None:
raise DeprecationWarning('\nWARNING: Function `soundpy.feats.adjust_shape` will not '+\
'use the parameter `complex_vals` in future versions. This will be '+\
'implicitly conducted within the function using `numpy.dtype`.')
except DeprecationWarning as e:
print(e)
if len(data.shape) != len(desired_shape):
data_shape_orig = data.shape
if desired_shape[0] == 1:
if data.shape[0] != 1:
data = data.reshape((1,)+data.shape)
if desired_shape[-1] == 1:
if data.shape[-1] != 1:
data = data.reshape(data.shape + (1,))
if len(data.shape) != len(desired_shape):
raise ValueError('Currently cannot adjust data to a different number of '+\
'dimensions.\nOriginal data shape: '+str(data_shape_orig)+ \
'\nDesired shape: '+str(desired_shape))
# if complex values are in data, set complex_vals to True
if np.issubdtype(data.dtype, np.complexfloating):
complex_vals = True
else:
complex_vals = False
# attempt to zeropad data:
try:
greater_items = [i for i, x in enumerate(data.shape) if x > desired_shape[i]]
# all dimensions can be zeropadded or left alone
if len(greater_items) == 0:
data_prepped = sp.feats.zeropad_features(
data, desired_shape = desired_shape, complex_vals = complex_vals)
# not all dimensions can be zeropadded. Zeropad what can be zeropadded.
# then reduce larger dimensions
elif len(greater_items) == len(data.shape):
raise ValueError
# get out of try statement and run `reduce_num_features` in except clause
else:
temp_shape = []
for i, item in enumerate(data.shape):
if item <= desired_shape[i]:
temp_shape.append(desired_shape[i])
else:
temp_shape.append(item)
temp_shape = tuple(temp_shape)
# first zeropad the dimensions that are too small
data_prepped = sp.feats.zeropad_features(
data, desired_shape = temp_shape, complex_vals = complex_vals)
# then clip the dimensions that are too big
data_prepped = sp.feats.reduce_num_features(
data_prepped, desired_shape = desired_shape)
# if zeropadding is smaller than data.shape/features:
except ValueError:
# remove extra data/columns to match desired_shape:
data_prepped = sp.feats.reduce_num_features(data,
desired_shape = desired_shape)
return data_prepped
def reduce_dim(matrix, axis=0):
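'''Halves the length of `axis` by averaging each pair of neighboring
rows or columns; a leftover row or column is kept as is.
Examples
--------
>>> import numpy as np
>>> reduce_dim(np.arange(4).reshape(2, 2))
array([[1., 2.]])
'''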
import math
import numpy as np
if axis < 0:
axis = len(matrix.shape) + axis
if axis == 0:
new_matrix = np.zeros((math.ceil(matrix.shape[0]/2),)+matrix.shape[1:])
row = 0
for i in np.arange(0, matrix.shape[0], 2):
if i + 1 < matrix.shape[0]:
new_matrix[row] = (matrix[i] + matrix[i+1]) / 2
row += 1
else:
new_matrix[row] = matrix[i]
elif axis == 1:
new_matrix = np.zeros(matrix.shape[:1] + (math.ceil(matrix.shape[1]/2),))
col = 0
for i in np.arange(0, matrix.shape[1], 2):
if i + 1 < matrix.shape[1]:
new_matrix[:, col] = (matrix[:, i] + matrix[:, i+1]) / 2
col += 1
else:
new_matrix[:, col] = matrix[:, i]
else:
raise ValueError('Function `reduce_dim` only accepts 2D data. Axis {}'.format(axis)+\
' is out of bounds.')
return new_matrix
def featshape_new_subframe(feature_matrix_shape, new_frame_size,
zeropad = True, axis=0, include_dim_size_1=False):
'''Subdivides features from (num_frames, num_feats) to (num_subframes, new_frame_size, num_feats)
Parameters
----------
feature_matrix_shape : tuple [size=(num_frames, num_features)]
Feature matrix shape to be subdivided. Can be multidimensional.
new_frame_size : int
The number of frames each new subframe should contain.
zeropad : bool
If True, frames that don't completely fill a `new_frame_size` will be
zeropadded. Otherwise, those frames will be discarded. (default True)
axis : int
The axis where the `new_frame_size` should be applied. (default 0)
include_dim_size_1 : bool
If True, an extra dimension of length 1 will be kept even when only one
subframe results. (default False)
Returns
-------
new_shape : tuple [size=(num_subframes, new_frame_size, num_feats)]
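Examples
--------
The first value of the resulting shape is the number of subframes:
>>> featshape_new_subframe((7, 40), new_frame_size = 3)
(3, 3, 40)
>>> featshape_new_subframe((7, 40), new_frame_size = 3, zeropad = False)
(2, 3, 40)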
'''
if axis < 0:
# get the axis number if using -1 or -2, etc.
axis = len(feature_matrix_shape) + axis
original_dim_length = feature_matrix_shape[axis]
if zeropad is True:
subsection_frames = math.ceil(original_dim_length / new_frame_size)
else:
subsection_frames = original_dim_length // new_frame_size
new_shape = []
for i, ax in enumerate(feature_matrix_shape):
if i == axis:
if subsection_frames == 1 and include_dim_size_1 is False:
# don't include extra dimension if length 1
new_shape.append(new_frame_size)
else:
# number of subframes first, then the length of each subframe
new_shape.append(subsection_frames)
new_shape.append(new_frame_size)
else:
new_shape.append(ax)
new_shape = tuple(new_shape)
return new_shape
def apply_new_subframe(feature_matrix, new_frame_size, zeropad=True, axis=0):
'''Reshapes `feature_matrix` to allow for `new_frame_size`.
Note: Dimensions of `feature_matrix` must be at least 2 and can be up to 5,
returning a matrix with one additional dimension.
Parameters
----------
feature_matrix : np.ndarray [size(num_frames, num_features) ]
Expects minimum 2D, maximum 5D matrix.
new_frame_size : int
The number of frames each new subframe should contain.
axis : int
The axis to apply the `new_frame_size`. (default 0)
zeropad : bool
If True, the feature_matrix will be zeropadded to include frames that do not
fill entire frame_size, given the `new_frame_size`. If False, feature_matrix
will not include the last zeropadded frame. (default True)
Returns
-------
feats_reshaped : np.ndarray [size(num_subframes, new_frame_size, num_features)]
The `feature_matrix` returned with `axis` subdivided into 2 dimensions, the number of subframes and the other length `new_frame_size`.
Raises
------
ValueError if number of dimensions of `feature_matrix` is below 2 or exceeds 5.
Examples
--------
>>> import numpy as np
>>> matrix = np.arange(24).reshape(3,4,2)
>>> # apply new_frame_size to dimension of length 4 (i.e. axis 1)
>>> matrix_zp = apply_new_subframe(matrix, new_frame_size = 3, axis = 1)
>>> matrix_zp.shape
(3, 2, 3, 2)
>>> matrix_zp
array([[[[ 0, 1],
[ 2, 3],
[ 4, 5]],
[[ 6, 7],
[ 0, 0],
[ 0, 0]]],
[[[ 8, 9],
[10, 11],
[12, 13]],
[[14, 15],
[ 0, 0],
[ 0, 0]]],
[[[16, 17],
[18, 19],
[20, 21]],
[[22, 23],
[ 0, 0],
[ 0, 0]]]])
>>> matrix_nozp = apply_new_subframe(matrix, new_frame_size = 3, axis = 1,
... zeropad=False)
>>> matrix_nozp.shape
(3, 1, 3, 2)
>>> matrix_nozp
array([[[[ 0, 1],
[ 2, 3],
[ 4, 5]]],
[[[ 8, 9],
[10, 11],
[12, 13]]],
[[[16, 17],
[18, 19],
[20, 21]]]])
'''
if len(feature_matrix.shape) < 2 or len(feature_matrix.shape) > 5:
raise ValueError('Function `soundpy.feats.apply_new_subframe` '+\
'can only be applied to matrices between 2 and 5 dimensions.')
datatype = feature_matrix.dtype
if axis < 0:
# get the axis number if using -1 or -2, etc.
axis = len(feature_matrix.shape) + axis
new_shape = featshape_new_subframe(feature_matrix.shape,
new_frame_size = new_frame_size,
axis = axis,
zeropad = zeropad,
include_dim_size_1 = True)
total_new_samples = np.prod(new_shape)
current_samples = np.prod(feature_matrix.shape)
# zeropad or reduce feature_matrix to match number of current samples
diff = total_new_samples - current_samples
for i, item in enumerate(feature_matrix.shape):
if i != axis:
diff /= item
if zeropad is True:
if diff >= 0:
diff = math.ceil(diff)
else:
diff = int(diff)
else:
if diff >= 0:
diff = int(diff)
else:
diff = math.ceil(diff)
if axis == 0:
feature_matrix = sp.feats.adjust_shape(
feature_matrix,
((feature_matrix.shape[0] + diff,) + feature_matrix.shape[1:]))
elif axis > 0:
feature_matrix = sp.feats.adjust_shape(
feature_matrix,
(feature_matrix.shape[:axis] + (feature_matrix.shape[axis] + diff, ) + \
feature_matrix.shape[axis+1:]))
feats_reshaped = feature_matrix.reshape(new_shape)
feats_reshaped = feats_reshaped.astype(datatype)
return feats_reshaped
def check_percent_overlap(percent_overlap):
'''Ensures percent_overlap is between 0 and 1.
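Examples
--------
Values above 1 are interpreted as percentages:
>>> check_percent_overlap(0.5)
0.5
>>> check_percent_overlap(75)
0.75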
'''
if percent_overlap > 1:
percent_overlap *= 0.01
if percent_overlap > 1:
raise ValueError('The percent overlap value '+str(percent_overlap)+\
' is too large. Please use a value between 0 and 1 or 0 and 100.')
return percent_overlap
def separate_dependent_var(matrix):
'''Separates matrix into features and labels. Expects up to a 3D array.
Assumes the last column of the last dimension of the matrix constitutes
the dependent variable (labels), and all other columns the independent variables
(features). Additionally, it is assumed that for each block of data,
only one label is needed; therefore, just the first label is taken for
each block.
Parameters
----------
matrix : numpy.ndarray [size = (num_samples, num_frames, num_features)]
The `matrix` holds the numerical data to separate. num_features is
expected to be at least 2.
Returns
-------
X : numpy.ndarray [size = (num_samples, num_frames, num_features -1)]
A matrix holding the (assumed) independent variables
y : numpy.ndarray, numpy.int64, numpy.float64 [size = (num_samples,)]
A vector holding the labels assigned to the independent variables.
If only one value in array, just the value inside is returned
Examples
--------
>>> import numpy as np
>>> #vector
>>> separate_dependent_var(np.array([1,2,3,4]))
(array([1, 2, 3]), 4)
>>> #simple matrix
>>> matrix = np.arange(4).reshape(2,2)
>>> matrix
array([[0, 1],
[2, 3]])
>>> X, y = separate_dependent_var(matrix)
>>> X
array([[0],
[2]])
>>> y
1
>>> #more complex matrix
>>> matrix = np.arange(20).reshape((2,2,5))
>>> matrix
array([[[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9]],
<BLANKLINE>
[[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19]]])
>>> X, y = separate_dependent_var(matrix)
>>> X
array([[[ 0, 1, 2, 3],
[ 5, 6, 7, 8]],
<BLANKLINE>
[[10, 11, 12, 13],
[15, 16, 17, 18]]])
>>> y
array([ 4, 14])
'''
# get last column
if matrix.shape[-1] == 1:
raise ValueError('Expects input matrix to be size (num_samples, num_frames, ' + \
'num_features). Number of features must exceed 1 in order ' + \
'to separate into X and y arrays.')
y_step1 = np.take(matrix, -1, axis=-1)
# because the label is the same for each block of data, just need the first
# row, not all the rows, as they are the same label.
y = np.take(y_step1, 0, axis=-1)
# get features:
X = np.delete(matrix, -1, axis=-1)
return X, y
# TODO: perhaps remove - just use np.expand_dims()
# TODO: https://github.com/biopython/biopython/issues/1496
# Fix numpy array repr for Doctest.
def add_tensor(matrix):
'''Adds tensor / dimension to input ndarray (e.g. features).
Keras requires an extra dimension at some layers, which represents
the 'tensor' encapsulating the data.
Further clarification taking the example below. The input matrix has
shape (2,3,4). Think of it as 2 different events, each having
3 sets of measurements, with each of those having 4 features. So,
let's measure differences between 2 cities at 3 different times of
day. Let's take measurements at 08:00, 14:00, and 19:00 in...
Magic City and Never-ever Town. We'll measure: 1) temperature,
2) wind speed 3) light level 4) noise level.
As I best understand it, putting our measurements into a matrix
with an added dimension/tensor highlights the separate
measurements, telling the algorithm: yes, these are 4 features
from the same city, BUT they occur at different times. Or it's
just how Keras set up the code :P
Parameters
----------
matrix : numpy.ndarray
The `matrix` holds the numerical data to add a dimension to.
Returns
-------
matrix : numpy.ndarray
The `matrix` with an additional dimension.
Examples
--------
>>> import numpy as np
>>> matrix = np.arange(24).reshape((2,3,4))
>>> matrix.shape
(2, 3, 4)
>>> matrix
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]],
<BLANKLINE>
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
>>> matrix_2 = add_tensor(matrix)
>>> matrix_2.shape
(2, 3, 4, 1)
>>> matrix_2
array([[[[ 0],
[ 1],
[ 2],
[ 3]],
<BLANKLINE>
[[ 4],
[ 5],
[ 6],
[ 7]],
<BLANKLINE>
[[ 8],
[ 9],
[10],
[11]]],
<BLANKLINE>
<BLANKLINE>
[[[12],
[13],
[14],
[15]],
<BLANKLINE>
[[16],
[17],
[18],
[19]],
<BLANKLINE>
[[20],
[21],
[22],
[23]]]])
'''
if isinstance(matrix, np.ndarray) and len(matrix) > 0:
matrix = matrix.reshape(matrix.shape + (1,))
return matrix
elif isinstance(matrix, np.ndarray):
raise ValueError('Input matrix is empty.')
else:
raise TypeError('Expected type numpy.ndarray, received {}'.format(
type(matrix)))
# TODO improve / remove.. move to data module?
def scale_X_y(matrix, is_train=True, scalars=None):
'''Separates and scales data into X and y arrays. Adds dimension for keras.
Assumes the last column of the last dimension is the y or label data.
Parameters
----------
matrix : np.ndarray [size = (num_samples, num_frames, num_features)]
Matrix with X and y data
is_train : bool
Relevant for the `scalars` parameter. If the data is training
data (i.e. True), the `scalars` will be created. If the data
is test data (i.e. False), the function expects `scalars` to
be provided. (default True)
scalars : dict, optional
Dictionary with scalars to be applied to non-training data.
Returns
-------
X : np.ndarray [size = (num_samples, num_frames, num_features-1, 1)]
Scaled features with an extra dimension
y : np.ndarray [size = (num_samples, 1)]
The dependent variable (labels) with an extra dimension; the labels themselves are not scaled.
scalars : dict
The scalars either created or previously loaded.
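Examples
--------
A minimal sketch with random data standing in for features; the last
column of the last dimension holds the labels:
>>> import numpy as np
>>> matrix = np.random.rand(4, 3, 5)
>>> X, y, scalars = scale_X_y(matrix)
>>> X.shape, y.shape
((4, 3, 4, 1), (4, 1))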
'''
X, y = sp.feats.separate_dependent_var(matrix)
if is_train:
scalars = {}
elif scalars is None:
raise TypeError('If non-train data, `scalars` cannot be of type None.')
if len(X.shape) != 3:
raise ValueError('Expected 3d input, not input of shape {}.'.format(
matrix.shape))
if np.issubdtype(X.dtype, np.complexfloating):
# convert stft to power spectrum
print('\nTaking absolute value and power of complex data..'+\
'\ni.e. Removing complex values.')
X = np.abs(X)**2
for j in range(X.shape[2]):
if is_train:
scalars[j] = StandardScaler()
X[:, :, j] = scalars[j].fit_transform(X[:, :, j])
else:
X[:, :, j] = scalars[j].transform(X[:, :, j])
X[:, :, j] = preprocessing.normalize(X[:, :, j])
# Keras needs an extra dimension as a tensor / holder of data
X = sp.feats.add_tensor(X)
y = sp.feats.add_tensor(y)
return X, y, scalars
# TODO test for all these features:
def list_available_features():
return ['stft', 'powspec', 'fbank', 'mfcc', 'signal']
# TODO REMOVE context_window for next release.
# don't apply context window and such during feature extraction phase
# TODO check if `real_signal` influences change of shape or not
def get_feature_matrix_shape(sr = None, dur_sec = None, feature_type = None,
win_size_ms = None, percent_overlap = None,
fft_bins = None, num_mfcc = None, num_filters = None,
rate_of_change = False, rate_of_acceleration = False,
context_window = None, frames_per_sample = None, zeropad = True, labeled_data = False, remove_first_coefficient = False, real_signal = False, **kwargs):
'''Returns expected shapes of feature matrix depending on several parameters.
Parameters
----------
sr : int
Sample rate of the audio to be extracted.
dur_sec : int, float
The number of seconds of audio feature extraction will be applied to.
feature_type : str
Accepted features include 'signal', 'stft', 'powspec', 'fbank', 'mfcc'. Which
`feature_type` applied will influence the resulting shape of the feature matrix
shape.
win_size_ms : int or float
The size of the window the audio signal should be broken into. If `feature_type`
is set to 'signal', this is irrelevant. Otherwise will raise TypeError if set to None.
percent_overlap : float
The amount of overlap between windows. If set to 0.5, the number of overlapping
samples will be half the number of samples that make up `win_size_ms`.
fft_bins : int
The number of frequency bins to use when calculating the fast Fourier transform.
If None, the calculated `frame_length` will be used.
num_mfcc : int
If extracting 'mfcc' features, the total number of coefficients expected.
num_filters : int
If extracting 'fbank' features, the total number of mel-filters to be applied.
rate_of_change : bool
If True, the first delta will be concatenated to features extracted.
rate_of_acceleration : bool
If True, the second delta will be concatenated to features extracted.
context_window : int
The size of `context_window` or number of samples padding a central frame.
This may be useful for models training on small changes occurring in the signal, e.g. to break up the image of sound into smaller parts.
frames_per_sample : int
The previous keyword argument for segmenting audio into smaller parts.
Will be removed in future versions and available in generator functions as
`context_window`. `frames_per_sample` equals 2 * `context_window` + 1. See
`soundpy.models.dataprep.Generator`.
zeropad : bool
If True, windows and frames will be zeropadded to avoid losing any sample data.
labeled_data : bool
If True, a label will be added to the output shape of features.
remove_first_coefficient : bool
If True, the first mfcc coefficient will not be included in feature
matrix.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.feats.get_feats`. These may not be used in this
function as they may not influence the size of the feature matrix.
Returns
-------
feature_matrix_base : tuple
The base shape of the feature matrix. This is the shape that should result from
extracting the features for each audio file
feature_matrix_model : tuple
The shape relevant to training models. For example, one including space for a
context window and label.
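Examples
--------
A minimal sketch; the exact number of frames depends on
`soundpy.dsp.calc_frame_length`, so only the feature dimension is shown:
>>> base, model = get_feature_matrix_shape(sr = 22050, dur_sec = 1,
...     feature_type = 'fbank', win_size_ms = 20, percent_overlap = 0.5)
>>> base[-1] # num_filters defaults to 40
40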
'''
if sr is None:
raise TypeError('Function `soundpy.feats.get_feature_matrix_shape` expects'+\
' parameter `sr` to be of type `int`, not type None.')
if dur_sec is None:
raise TypeError('Function `soundpy.feats.get_feature_matrix_shape` expects'+\
' parameter `dur_sec` to be of type `int` or `float`, not type None.')
if win_size_ms is None:
raise TypeError('Function `soundpy.feats.get_feature_matrix_shape` expects'+\
' parameter `win_size_ms` to be of type `int` or `float`, not type None.')
if feature_type is None:
raise TypeError('Function `soundpy.feats.get_feature_matrix_shape` expected'+\
' parameter `feature_type` to be one of the following: '+\
','.join(sp.feats.list_available_features())+\
'\nInstead got None.')
total_samples = sp.dsp.calc_frame_length(dur_sec*1000, sr=sr)
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
# all we need to know if signal is feature
if 'signal' in feature_type:
total_rows_per_wav = total_samples // frame_length
num_feats = frame_length
feature_matrix_model = (
total_rows_per_wav,
num_feats)
feature_matrix_base = (
total_samples,) # currently only single channels
else:
if win_size_ms is None or percent_overlap is None:
raise TypeError('`win_size_ms` or `percent_overlap` cannot be type '+\
'None. Please set these values, e.g. `win_size_ms` = 20, `percent_overlap` = 0.5')
win_shift_ms = win_size_ms - (win_size_ms * percent_overlap)
hop_length = int(win_shift_ms * 0.001 * sr)
if fft_bins is None:
fft_bins = int(win_size_ms * sr // 1000)
# https://librosa.org/doc/latest/generated/librosa.util.frame.html#librosa.util.frame
total_rows_per_wav = int(1 + (total_samples - fft_bins)//hop_length)
if 'mfcc' in feature_type:
if num_mfcc is None:
num_feats = 40
else:
num_feats = num_mfcc
if remove_first_coefficient is True:
num_feats -= 1
elif 'fbank' in feature_type:
if num_filters is None:
num_feats = 40
else:
num_feats = num_filters
elif 'stft' in feature_type or 'powspec' in feature_type:
num_feats = fft_bins//2 + 1
else:
raise ValueError('Feature type "{}" '.format(feature_type)+\
'not understood.\nMust include one of the following: \n'+\
', '.join(list_available_features()))
if rate_of_change is True and rate_of_acceleration is True:
num_feats += 2 * num_feats
elif rate_of_change is True or rate_of_acceleration is True:
num_feats += num_feats
try:
if frames_per_sample is not None or context_window is not None:
raise DeprecationWarning('\nWARNING: In future versions, the `frames_per_sample` and '+\
'`context_window` parameters will be no longer used in feature extraction.\n'+\
' Instead features can be segmented in generator functions using the '+\
'parameter `context_window`: `soundpy.models.dataprep.Generator`.')
except DeprecationWarning as e:
print(e)
if context_window or frames_per_sample:
if context_window:
subframes = context_window * 2 + 1
else:
subframes = frames_per_sample
batches = math.ceil(total_rows_per_wav/subframes)
feature_matrix_model = (
batches,
subframes,
num_feats)
feature_matrix_base = (
batches * subframes,
num_feats)
else:
feature_matrix_model = (
total_rows_per_wav,
num_feats)
feature_matrix_base = (
total_rows_per_wav,
num_feats)
if labeled_data is True:
feature_matrix_model = feature_matrix_model[:-1] + (feature_matrix_model[-1] + 1,)
return feature_matrix_base, feature_matrix_model
def save_features_datasets(datasets_dict, datasets_path2save_dict,
context_window=None, frames_per_sample = None, labeled_data=False,
subsection_data=False, divide_factor=None,
visualize=False, vis_every_n_frames=50,
log_settings=True, decode_dict = None,
random_seed = None, **kwargs):
'''Extracts and saves audio features, sectioned into datasets, to indicated locations.
If MemoryError, the provided dataset dicts will be adjusted to allow data to be subsectioned.
Parameters
----------
datasets_dict : dict
Dictionary with keys representing datasets and values the audio files making up that dataset.
E.g. {'train':['1.wav', '2.wav', '3.wav'], 'val': ['4.wav'], 'test':['5.wav']} for unlabeled
data or {'train':[(0, '1.wav'), (1, '2.wav'), (0, '3.wav')], 'val': [(1, '4.wav')],
'test':[(0, '5.wav')]} for labeled data.
datasets_path2save_dict : dict
Dictionary with keys representing datasets and values the pathways of where extracted
features of that dataset will be saved.
E.g. {'train': './data/train.npy', 'val': './data/val.npy', 'test': './data/test.npy'}
context_window : int
The size of `context_window` or number of samples padding a central frame.
This may be useful for models training on small changes occurring in the signal, e.g. to break up the image of sound into smaller parts, to feed
to a long short-term memory network (LSTM), for example.
(Can avoid this by simply reshaping data later).
frames_per_sample : int
The previous keyword argument for segmenting audio into smaller parts.
Will be removed in future versions. This equals 2 * `context_window` + 1
labeled_data : bool
If True, expects each audiofile to be accompanied by an integer label. See example
given for `datasets_dict`.
subsection_data : bool
If you have a large dataset, you may want to divide it into subsections. See
soundpy.datasets.subsection_data. If datasets are large enough to raise a MemoryError,
this will be applied automatically.
divide_factor : int, optional
The number of subsections to divide data into. Only large enough sections will be divided.
If smaller datasets (i.e. validation and test datasets) are as large or smaller than
the new subsectioned larger dataset(s) (i.e. train), they will be left unchanged.
(defaults to 5)
visualize : bool
If True, periodic plots of the features will be saved throughout the extraction process. (default False)
vis_every_n_frames : int
How often visuals should be made: every 10 samples, every 100, etc. (default 50)
log_settings : bool
If True, a .csv file will be saved in the feature extraction directory with
most of the feature settings saved. (default True)
decode_dict : dict, optional
The dictionary to get the label given the encoded label. This is for plotting
purposes. (default None)
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.feats.get_feats`.
Returns
-------
datasets_dict : dict
The final dataset dictionary used in feature extraction. The datasets may
have been subdivided.
datasets_path2save_dict : dict
The final dataset feature pathway dict. The pathways will have been
adjusted if the datasets have been subdivided.
See Also
--------
soundpy.feats.get_feats
Extract features from audio file or audio data.
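Examples
--------
A hypothetical setup; all file and directory names below are placeholders:
>>> # datasets_dict = {'train': ['1.wav', '2.wav'], 'val': ['3.wav'], 'test': ['4.wav']}
>>> # paths_dict = {'train': './feats/train.npy', 'val': './feats/val.npy', 'test': './feats/test.npy'}
>>> # save_features_datasets(datasets_dict, paths_dict, feature_type = 'fbank', sr = 22050, dur_sec = 1)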
'''
# if dataset is large, may want to divide it into sections
if divide_factor is None:
divide_factor = 5
if subsection_data:
datasets_dict, datasets_path2save_dict = sp.datasets.section_data(
datasets_dict,
datasets_path2save_dict,
divide_factor=divide_factor)
# save where data was extracted from:
dataset_dirs = []
try:
# sr must be set. Set to default value.
if not 'sr' in kwargs or kwargs['sr'] is None:
import warnings
msg = '\nWARNING: sample rate was not set. Setting it at 22050 Hz.'
warnings.warn(msg)
kwargs['sr'] = 22050
# win_size_ms must be set. Set to default value.
if not 'win_size_ms' in kwargs or kwargs['win_size_ms'] is None:
import warnings
msg = '\nWARNING: `win_size_ms` was not set. Setting it to 20 ms'
warnings.warn(msg)
kwargs['win_size_ms'] = 20
# percent_overlap must be set. Set to default value.
if not 'percent_overlap' in kwargs or kwargs['percent_overlap'] is None:
import warnings
msg = '\nWARNING: `percent_overlap` was not set. Setting it to 0.5'
warnings.warn(msg)
kwargs['percent_overlap'] = 0.5
feat_base_shape, feat_model_shape = sp.feats.get_feature_matrix_shape(
context_window = context_window,
frames_per_sample = frames_per_sample,
labeled_data = labeled_data,
**kwargs)
# set whether or not features will include complex values:
if 'stft' in kwargs['feature_type']:
complex_vals = True
else:
complex_vals = False
total_audiofiles = 0
for key, value in datasets_dict.items():
# get parent directory of where data should be saved (i.e. for saving pics)
datapath = datasets_path2save_dict[key]
if not isinstance(datapath, pathlib.PosixPath):
datapath = pathlib.Path(datapath)
datadir = datapath.parent
# when loading a dictionary, the value is a string
if isinstance(value, str):
value = sp.utils.restore_dictvalue(value)
# len(value) is the total number of audio files
feats4model_shape = (len(value),) + feat_model_shape
feats_matrix = sp.dsp.create_empty_matrix(
feats4model_shape,
complex_vals=complex_vals)
audio_list = value.copy()
total_audiofiles += len(audio_list)
# shuffle audiofiles:
if random_seed is not None:
random.seed(random_seed)
random.shuffle(audio_list)
for j, audiofile in enumerate(audio_list):
if labeled_data:
label, audiofile = int(audiofile[0]), audiofile[1]
else:
label = None
if isinstance(audiofile, str):
audiofile = pathlib.PosixPath(audiofile)
if j == 0:
dataset_dirs.append(audiofile.parent)
feats = sp.feats.get_feats(audiofile,
**kwargs)
# zeropad or clip feats if too short or long:
feats = sp.feats.adjust_shape(
feats,
desired_shape = feat_base_shape)
# add label column to feature matrix
if labeled_data:
# create label column
label_col = np.zeros((len(feats),1)) + label
feats = np.concatenate([feats,label_col], axis=1)
feats = feats.reshape(feats4model_shape[1:])
#visualize features only every n num frames
if visualize and j % vis_every_n_frames == 0:
if labeled_data:
if decode_dict is not None:
try:
label_plot = decode_dict[label].upper()
except KeyError:
try:
label_plot = decode_dict[str(label)].upper()
except KeyError:
label_plot = label
else:
label_plot = label
else:
label_plot = audiofile.parent.stem.upper()
sp.feats.visualize_feat_extraction(
feats,
iteration = j,
dataset = key,
label = label_plot,
datadir = datadir,
subsections = True, # prepping feats 4 model results in subsections
**kwargs)
# fill in empty matrix with features from each audiofile
feats_matrix[j] = feats
sp.utils.print_progress(iteration = j,
total_iterations = len(value),
task = '{} {} feature extraction'.format(
key, kwargs['feature_type']))
# save data:
np.save(datasets_path2save_dict[key], feats_matrix)
print('\nFeatures saved at {}\n'.format(datasets_path2save_dict[key]))
if log_settings:
log_filename = datadir.joinpath('log_extraction_settings.csv')
feat_settings = dict(
dataset_dirs = dataset_dirs,
feat_base_shape = feat_base_shape,
feat_model_shape = feat_model_shape,
complex_vals = complex_vals,
context_window = context_window,
frames_per_sample = frames_per_sample,
labeled_data = labeled_data,
decode_dict = decode_dict,
visualize = visualize,
vis_every_n_frames = vis_every_n_frames,
subsection_data = subsection_data,
divide_factor = divide_factor,
total_audiofiles = total_audiofiles,
kwargs = kwargs
)
feat_settings_path = sp.utils.save_dict(
dict2save = feat_settings,
filename = log_filename,
overwrite=True)
except MemoryError as e:
print('MemoryError: ',e)
print('\nSectioning data and trying again.\n')
datasets_dict, datasets_path2save_dict = sp.datasets.section_data(
datasets_dict, datasets_path2save_dict, divide_factor=divide_factor)
datasets_dict, datasets_path2save_dict = save_features_datasets(
datasets_dict = datasets_dict,
datasets_path2save_dict = datasets_path2save_dict,
context_window = context_window,
frames_per_sample = frames_per_sample,
labeled_data = labeled_data,
subsection_data = subsection_data,
divide_factor = divide_factor,
visualize = visualize,
vis_every_n_frames = vis_every_n_frames,
log_settings = log_settings,
decode_dict = decode_dict,
**kwargs)
return datasets_dict, datasets_path2save_dict
# TODO: update / consolidate
def save_features_datasets_zipfiles(datasets_dict, datasets_path2save_dict,
extract_dir, dur_sec,
feature_type='fbank', num_feats=None, sr=22050,
win_size_ms=20, percent_overlap=0.5, n_fft = None,
frames_per_sample=None, labeled_data=False,
subsection_data=False, divide_factor=None,
visualize=False, vis_every_n_frames=50,
use_librosa=True, center=True, mode='reflect',
log_settings=True, decode_dict = None,
audiofile_lim = 10, **kwargs):
'''Extracts and saves audio features, sectioned into datasets, to indicated locations.
If MemoryError, the provided dataset dicts will be adjusted to allow data to be subsectioned.
Parameters
----------
datasets_dict : dict
Dictionary with keys representing datasets and values the audio files making up that dataset.
E.g. {'train':['1.wav', '2.wav', '3.wav'], 'val': ['4.wav'], 'test':['5.wav']} for unlabeled
data or {'train':[(0, '1.wav'), (1, '2.wav'), (0, '3.wav')], 'val': [(1, '4.wav')],
'test':[(0, '5.wav')]} for labeled data.
datasets_path2save_dict : dict
Dictionary with keys representing datasets and values the pathways of where extracted
features of that dataset will be saved.
E.g. {'train': './data/train.npy', 'val': './data/val.npy', 'test': './data/test.npy'}
feature_type : str
String including only one of the following: 'signal', 'stft', 'powspec', 'fbank', and 'mfcc'.
'signal' currently only supports mono channel data. TODO test for stereo
'powspec' and 'stft' are basically the same; 'powspec' is the 'stft' except without
complex values and squared. E.g. 'mfcc_noisy' or 'stft_train'.
sr : int
The sample rate the audio data should be loaded with.
n_fft : int
The number of frequency bins used for the Fast Fourier Transform (fft)
dur_sec : int or float
The desired duration of how long the audio data should be. This is used to calculate
size of feature data and is therefore necessary, as audio files tend to differ in length.
If audio files are longer or shorter, they will be cut or zeropadded respectively.
num_feats : int
The number of mfcc coefficients (mfcc), mel filters (fbank), or frequency bins (stft).
win_size_ms : int
The desired window size in milliseconds to process audio samples.
percent_overlap : float
The amount audio samples should overlap as each window is processed.
frames_per_sample : int, optional
If you want to section each audio file feature data into smaller frames. This might be
useful for speech related contexts. (Can avoid this by simply reshaping data later)
labeled_data : bool
If True, expects each audiofile to be accompanied by an integer label. See example
given for `datasets_dict`.
subsection_data : bool
If you have a large dataset, you may want to divide it into subsections. See
soundpy.datasets.subsection_data. If datasets are large enough to raise a MemoryError,
this will be applied automatically.
divide_factor : int, optional
The number of subsections to divide data into. Only large enough sections will be divided.
If smaller datasets (i.e. validation and test datasets) are as large or smaller than
the new subsectioned larger dataset(s) (i.e. train), they will be left unchanged.
(defaults to 5)
visualize : bool
If True, periodic plots of the features will be saved throughout the extraction process. (default False)
vis_every_n_frames : int
How often visuals should be made: every 10 samples, every 100, etc. (default 50)
use_librosa : bool
If True, librosa is used to load and extract features. As of now, no other option is
available. TODO: add other options. :P I just wanted to be clear that some elements
of this function are unique to using librosa. (default True)
center : bool
Relevant for librosa and feature extraction. (default True)
mode : str
Relevant for librosa and feature extraction. (default 'reflect')
log_settings : bool
If True, a .csv file will be saved in the feature extraction directory with
most of the feature settings saved. (default True)
decode_dict : dict, optional
The dictionary to get the label given the encoded label. This is for plotting
purposes. (default None)
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.feats.get_feats`.
Returns
-------
datasets_dict : dict
The final dataset dictionary used in feature extraction. The datasets may
have been subdivided.
datasets_path2save_dict : dict
The final dataset feature pathway dict. The pathways will have been
adjusted if the datasets have been subdivided.
See Also
--------
soundpy.feats.get_feats
Extract features from audio file or audio data.
'''
# if dataset is large, may want to divide it into sections
if divide_factor is None:
divide_factor = 5
if subsection_data:
datasets_dict, datasets_path2save_dict = sp.datasets.section_data(
datasets_dict,
datasets_path2save_dict,
divide_factor=divide_factor)
try:
# depending on which packages one uses, shape of data changes.
# for example, Librosa centers/zeropads data automatically
# TODO see which shapes result from python_speech_features
total_samples = sp.dsp.calc_frame_length(dur_sec*1000, sr=sr)
# if using Librosa:
if use_librosa:
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
win_shift_ms = win_size_ms - (win_size_ms * percent_overlap)
hop_length = int(win_shift_ms*0.001*sr)
if n_fft is None:
n_fft = frame_length
# librosa centers samples by default, slightly adjusting the total
# number of samples
if center:
y_zeros = np.zeros((total_samples,))
y_centered = np.pad(y_zeros, int(n_fft // 2), mode=mode)
total_samples = len(y_centered)
# each audio file
if 'signal' in feature_type:
# don't apply fft to signal (not sectioned into overlapping windows)
total_rows_per_wav = total_samples // frame_length
else:
# do apply fft to signal (via Librosa) - (will be sectioned into overlapping windows)
total_rows_per_wav = int(1 + (total_samples - n_fft)//hop_length)
# set defaults to num_feats if set as None:
if num_feats is None:
if 'mfcc' in feature_type or 'fbank' in feature_type:
num_feats = 40
elif 'powspec' in feature_type or 'stft' in feature_type:
num_feats = int(1+n_fft/2)
elif 'signal' in feature_type:
num_feats = frame_length
### how many samples make up one window frame?
###num_samps_frame = int(sr * win_size_ms * 0.001)
#### make divisible by 10
#### TODO: might not be necessary
###if not num_samps_frame % 10 == 0:
###num_samps_frame *= 0.1
#### num_feats is how many samples per window frame (here rounded up
#### to the nearest 10)
###num_feats = int(round(num_samps_frame, 0) * 10)
### limit in seconds how many samples
### is this necessary?
### dur_sec = num_features * frames_per_sample * batch_size / sr
else:
raise ValueError('Feature type "{}" '.format(feature_type)+\
'not understood.\nMust include one of the following: \n'+\
', '.join(list_available_features()))
# adjust shape for model
# input_shape: the input shape for the model
# desired_shape: the 2D shape of expected samples. This is used for zeropadding or
# limiting the feats to this shape. Once this shape, feats can be reshaped into input_shape
# TODO test for labeled data with frames_per_sample
if frames_per_sample is not None:
# want smaller windows, e.g. autoencoder denoiser or speech recognition
batch_size = math.ceil(total_rows_per_wav/frames_per_sample)
if labeled_data:
input_shape = (batch_size, frames_per_sample, num_feats + 1)
desired_shape = (input_shape[0] * input_shape[1],
input_shape[2]-1)
else:
input_shape = (batch_size, frames_per_sample, num_feats)
desired_shape = (input_shape[0]*input_shape[1],
input_shape[2])
else:
if labeled_data:
input_shape = (int(total_rows_per_wav), num_feats + 1)
desired_shape = (input_shape[0], input_shape[1]-1)
else:
input_shape = (int(total_rows_per_wav), num_feats)
desired_shape = input_shape
# set whether or not features will include complex values:
if 'stft' in feature_type:
complex_vals = True
else:
complex_vals = False
# limit feat_type to the basic feature extracted
# for example:
# feature_type 'powspec' is actually 'stft' but with complex info removed.
# the basic feat_type is still 'stft'
if 'mfcc' in feature_type:
feat_type = 'mfcc'
elif 'fbank' in feature_type:
feat_type = 'fbank'
elif 'stft' in feature_type:
feat_type = 'stft'
elif 'powspec' in feature_type:
feat_type = 'stft'
elif 'signal' in feature_type:
feat_type = 'signal'
else:
raise TypeError('Expected '+', '.join(list_available_features())+\
' to be in `feature_type`, not {}'.format(feature_type))
for key, value in datasets_dict.items():
# get parent directory of where data should be saved (i.e. for saving pics)
datapath = datasets_path2save_dict[key]
if not isinstance(datapath, pathlib.PosixPath):
datapath = pathlib.Path(datapath)
datadir = datapath.parent
# when loading a dictionary, the value is a string
if isinstance(value, str):
value = sp.utils.restore_dictvalue(value)
extraction_shape = (len(value) * audiofile_lim,) + input_shape
feats_matrix = sp.dsp.create_empty_matrix(
extraction_shape,
complex_vals=complex_vals)
# count empty rows (if speaker doesn't have audiofile_lim data)
empty_rows = 0
for j, zipfile in enumerate(value):
if labeled_data:
label, zipfile = int(zipfile[0]), zipfile[1]
if isinstance(zipfile, str):
zipfile = pathlib.PosixPath(zipfile)
# extract `audiofile_lim` from zipfile:
extract_dir = sp.utils.check_dir(extract_dir, make=True)
sp.files.extract(zipfile, extract_path = extract_dir)
audiolist = sp.files.collect_audiofiles(extract_dir,
recursive = True)
if audiofile_lim is not None:
for i in range(audiofile_lim):
if i == len(audiolist) and i < audiofile_lim:
print('Short number files: ', audiofile_lim - i)
empty_rows += audiofile_lim - i
break
feats = sp.feats.get_feats(audiolist[i],
sr=sr,
feature_type=feat_type,
win_size_ms=win_size_ms,
percent_overlap=percent_overlap,
num_filters=num_feats,
num_mfcc=num_feats,
dur_sec=dur_sec,
**kwargs)
# if power spectrum (remove complex values and squaring features)
if 'powspec' in feature_type:
feats = np.abs(feats)**2
if visualize:
if labeled_data:
if decode_dict is not None:
try:
label_plot = decode_dict[label].upper()
except KeyError:
try:
label_plot = decode_dict[str(label)].upper()
except KeyError:
label_plot = label
else:
label_plot = label
else:
label_plot = audiolist[i].parent.stem.upper()
# visualize features:
if 'mfcc' in feature_type or 'signal' in feature_type:
energy_scale = None
else:
energy_scale = 'power_to_db'
#visualize features only every n num frames
if j % vis_every_n_frames == 0:
save_pic_path = datadir.joinpath(
'images',key,'{}_sample{}'.format(
feature_type, j))
# make sure this directory exists
save_pic_dir = sp.utils.check_dir(save_pic_path.parent, make=True)
sp.feats.plot(feats,
feature_type = feature_type,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
energy_scale = energy_scale,
title='{} {} features: label {}'.format(
key, feature_type.upper(),
label_plot),
save_pic=visualize,
name4pic=save_pic_path)
# zeropad feats if too short:
if 'signal' in feat_type:
feats_zeropadded = np.zeros(desired_shape)
feats_zeropadded = feats_zeropadded.flatten()
if len(feats.shape) > 1:
feats_zeropadded = feats_zeropadded.reshape(feats_zeropadded.shape[0],
feats.shape[1])
if len(feats) > len(feats_zeropadded):
feats = feats[:len(feats_zeropadded)]
feats_zeropadded[:len(feats)] += feats
# reshape here for training models to avoid memory issues later
# (while training) if total samples is large
feats = feats_zeropadded.reshape(desired_shape)
feats = sp.feats.zeropad_features(
feats,
desired_shape = desired_shape,
complex_vals = complex_vals)
if labeled_data:
# create label column
label_col = np.zeros((len(feats),1)) + label
feats = np.concatenate([feats,label_col], axis=1)
feats = feats.reshape(extraction_shape[1:])
# fill in empty matrix with features from each audiofile
feats_matrix[j * audiofile_lim + i] = feats
# delete extracted data (not directories):
sp.files.delete_dir_contents(extract_dir, remove_dir = False)
sp.utils.print_progress(iteration = j,
total_iterations = len(value),
task = '{} {} feature extraction'.format(
key, feature_type))
if empty_rows > 0:
print('\nFeatures have {} empty rows. Removing them.\n'.format(empty_rows))
print('Shape before removal: ', feats_matrix.shape)
feats_matrix = feats_matrix[:-empty_rows]
print('Shape after removal: ', feats_matrix.shape)
# save data:
np.save(datasets_path2save_dict[key], feats_matrix)
print('\nFeatures saved at {}\n'.format(datasets_path2save_dict[key]))
if log_settings:
log_filename = datadir.joinpath('log_extraction_settings.csv')
feat_settings = dict(dur_sec=dur_sec,
feature_type=feature_type,
feat_type=feat_type,
complex_vals=complex_vals,
sr=sr,
num_feats=num_feats,
n_fft=n_fft,
win_size_ms=win_size_ms,
frame_length=frame_length,
percent_overlap=percent_overlap,
frames_per_sample=frames_per_sample,
labeled_data=labeled_data,
visualize=visualize,
# different for each dataset
#total_samples=total_samples,
input_shape=input_shape,
desired_shape=desired_shape,
use_librosa=use_librosa,
center=center,
mode=mode,
subsection_data=subsection_data,
divide_factor=divide_factor,
kwargs = kwargs
)
feat_settings_path = sp.utils.save_dict(
dict2save = feat_settings,
filename = log_filename,
overwrite=True)
else:
raise ValueError('Sorry, this functionality is not yet supported. '+\
'Set `use_librosa` to True.')
except MemoryError as e:
print('MemoryError: ',e)
print('\nSectioning data and trying again.\n')
datasets_dict, datasets_path2save_dict = sp.datasets.section_data(
datasets_dict, datasets_path2save_dict, divide_factor=divide_factor)
datasets_dict, datasets_path2save_dict = save_features_datasets_zipfiles(
datasets_dict = datasets_dict,
datasets_path2save_dict = datasets_path2save_dict,
extract_dir = extract_dir,
audiofile_lim = audiofile_lim,
feature_type = feature_type,
sr = sr,
n_fft = n_fft,
dur_sec = dur_sec,
num_feats = num_feats,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
use_librosa = use_librosa,
center = center,
mode = mode,
frames_per_sample = frames_per_sample,
visualize = visualize,
vis_every_n_frames = vis_every_n_frames,
labeled_data = labeled_data,
log_settings = log_settings,
decode_dict = decode_dict,
**kwargs)
return datasets_dict, datasets_path2save_dict
def prep_new_audiofeats(feats, desired_shape, input_shape):
'''Prepares new audio data to feed to a pre-trained model.
Parameters
----------
feats : np.ndarray [shape = (num_frames, num_features)]
The features to prepare for feeding to a model.
desired_shape : tuple
The expected number of samples necessary to fulfill the expected
`input_shape` for the model. The `feats` will be zeropadded or
limited to match this `desired_shape`.
input_shape : tuple
The `input_shape` the model expects a single sample of data to be.
Returns
-------
feats_reshaped : np.ndarray [shape = (`input_shape`)]
The features reshaped to what the model expects.
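Examples
--------
Features are zeropadded or clipped to `desired_shape`, then reshaped:
>>> import numpy as np
>>> feats = np.ones((10, 5))
>>> prep_new_audiofeats(feats, desired_shape = (12, 5), input_shape = (1, 12, 5, 1)).shape
(1, 12, 5, 1)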
'''
feats_reshaped = sp.feats.adjust_shape(feats, desired_shape)
# reshape to input shape with a necessary "tensor" dimension
feats_reshaped = feats_reshaped.reshape(input_shape)
return feats_reshaped
def feats2audio(feats, feature_type, sr, win_size_ms,
percent_overlap, phase=None):
'''Prepares features into audio playable format.
Parameters
----------
feats : np.ndarray [shape = (num_frames, num_feats)]
If the features are a signal,
[size = (batch_size * num_frames * num_features, 1)].
Otherwise [size = (batch_size * num_frames, num_features)].
feature_type : str
Either 'stft', 'fbank', 'signal', or 'mfcc'. For the 'signal'
feature, only mono channel is supported.
sr : int
Sampling rate that the features were extracted with
win_size_ms : int
The window size in milliseconds the features were extracted with
percent_overlap : float
The percent overlap between windows.
phase : np.ndarray [shape = (num_frames, num_feats)], optional
The original phase information of the reconstructed signal.
Returns
-------
y : np.ndarray [shape = (num_samples, )]
The reconstructed signal in samples.
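Examples
--------
A deterministic sketch for 'signal' features, plus an STFT reconstruction
with random magnitudes standing in for real features (Griffin-Lim is used
since no phase is given):
>>> import numpy as np
>>> feats2audio(np.arange(4).reshape(2, 2), 'signal', sr = 16000,
...     win_size_ms = 20, percent_overlap = 0.5)
array([0, 1, 2, 3])
>>> mag = np.random.rand(20, 161) # (num_frames, fft_bins//2+1)
>>> y = feats2audio(mag, 'stft', sr = 16000, win_size_ms = 20,
...     percent_overlap = 0.5)
>>> y.ndim
1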
'''
# (default) librosa handles data in shape (num_feats, num_frames)
# while soundpy works with data in shape (num_frames, num_feats)
if phase is not None:
try:
assert feats.shape == phase.shape
except AssertionError:
raise ValueError('Expected `feats` (shape {})'.format(feats.shape)+\
' and `phase` (shape {}) '.format(phase.shape) +\
'to have the same shape: (num_frames, num_features)')
win_shift_ms = win_size_ms - (win_size_ms * percent_overlap)
if 'signal' not in feature_type:
# Will apply Librosa package to feats. Librosa expects data to have
# shape (num_features, num_frames) not (num_frames, num_features)
feats = feats.T
if phase is not None:
phase = phase.T
if 'fbank' in feature_type:
y = librosa.feature.inverse.mel_to_audio(
feats,
sr=sr,
n_fft = int(win_size_ms*0.001*sr),
hop_length=int(win_shift_ms*0.001*sr))
elif 'mfcc' in feature_type:
# only the first 14 MFCC coefficients are used for reconstruction
feats = feats[:14,:]
y = librosa.feature.inverse.mfcc_to_audio(
feats,
sr=sr,
n_fft = int(win_size_ms*0.001*sr),
hop_length=int(win_shift_ms*0.001*sr),
n_mels=13)
elif 'stft' in feature_type or 'powspec' in feature_type:
# can use istft with phase information applied
if phase is not None:
feats = feats * phase
y = librosa.istft(
feats,
hop_length=int(win_shift_ms*0.001*sr),
win_length = int(win_size_ms*0.001*sr))
# if no phase information available:
else:
y = librosa.griffinlim(
feats,
hop_length=int(win_shift_ms*0.001*sr),
win_length = int(win_size_ms*0.001*sr))
elif 'signal' in feature_type:
y = feats.flatten()
return y
def grayscale2color(image_matrix, colorscale=3):
'''Expects grayscale image. Copies first channel into additional channels.
This is useful for pre-trained models that require features
to have rgb channels, not grayscale. Assumes the last dimension is the
color channel dimension.
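Examples
--------
>>> import numpy as np
>>> gray = np.arange(4).reshape(2, 2)
>>> color = grayscale2color(gray, colorscale = 3)
>>> color.shape
(2, 2, 3)
>>> np.array_equal(color[:,:,0], color[:,:,2])
True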
'''
if len(image_matrix.shape) == 2:
# if colorscale column not there, adds it
image_matrix = image_matrix.reshape(image_matrix.shape + (1,))
expected_shape = image_matrix.shape[:-1] + (colorscale,)
# create extra empty channels to copy gray image to it:
image_zeropadded = sp.feats.zeropad_features(image_matrix, expected_shape)
for i in range(colorscale):
if i == 0:
pass
else:
if len(image_zeropadded.shape) == 3:
image_zeropadded[:,:,i] = image_zeropadded[:,:,0]
elif len(image_zeropadded.shape) == 4:
image_zeropadded[:,:,:,i] = image_zeropadded[:,:,:,0]
elif len(image_zeropadded.shape) == 5:
image_zeropadded[:,:,:,:,i] = image_zeropadded[:,:,:,:,0]
elif len(image_zeropadded.shape) == 6:
image_zeropadded[:,:,:,:,:,i] = image_zeropadded[:,:,:,:,:,0]
else:
raise ValueError('This function expects between 2 and 6 dimensions, '\
'not {} dimensions'.format(len(image_matrix.shape)))
return image_zeropadded
if __name__ == "__main__":
import doctest
doctest.testmod()