'''The augment module includes functions related to augmenting audio data.
These functions are based on augmentation techniques described in the research
literature.

Other resources for augmentation (not included in soundpy functionality):

Ma, E. (2019). NLP Augmentation. https://github.com/makcedward/nlpaug

Park, D. S., Chan, W., Zhang, Y., Chiu, C., Zoph, B., Cubuk, E. D., & Le, Q. V.
(2019). SpecAugment: A simple data augmentation method for automatic speech
recognition. Google Brain. arxiv.org/pdf/1904.08779.pdf

Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
improving animal audio classification. Ecological Informatics, 57, 101084.
https://doi.org/10.1016/j.ecoinf.2020.101084, which applies the following
augmentations (a rough usage sketch follows the list):

1. Signal speed scaling by a random number in [0.8, 1.2] (SpeedupFactoryRange).
2. Pitch shift by a random number in [-2, 2] semitones (SemitoneShiftRange).
3. Volume increase/decrease by a random number in [-3, 3] dB (VolumeGainRange).
4. Addition of random noise in the range [0, 10] dB (SNR).
5. Time shift in the range [-0.005, 0.005] seconds (TimeShiftRange).
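
As a rough illustration, item 1 (random speed scaling in [0.8, 1.2]) could be
approximated with this module's functions. This is only a sketch; the tone and
parameter values below are illustrative, not taken from the cited papers:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> samples = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # synthetic 1-second tone
>>> perc = np.random.uniform(0, 0.2)  # speed change of up to 20 percent
>>> faster = sp.augment.speed_increase(samples, sr, perc=perc)  # rate = 1 + perc
>>> slower = sp.augment.speed_decrease(samples, sr, perc=perc)  # rate = 1 - perc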
'''
###############################################################################
import os, sys
import inspect
currentdir = os.path.dirname(os.path.abspath(
    inspect.getfile(inspect.currentframe())))
packagedir = os.path.dirname(currentdir)
sys.path.insert(0, packagedir)
import numpy as np
import math
import librosa
import pathlib
import soundpy as sp
def speed_increase(sound, sr, perc=0.15, **kwargs):
    '''Acoustic augmentation of speech: speeds up the signal using a time-stretch rate of 1 + `perc`.
    
    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for 
    improving animal audio classification. Ecological Informatics, 57, 101084. 
    https://doi.org/10.1016/j.ecoinf.2020.101084
    
    Ko, T., Peddinti, V., Povey, D., & Khudanpur, S. (2015). Audio Augmentation for 
    Speech Recognition. Interspeech. 
    
    Verhelst, W., & Roelands, M. (1993). An overlap-add technique based on 
    waveform similarity (WSOLA) for high quality time-scale modification of 
    speech. Proceedings of the International Conference on Acoustics, Speech 
    and Signal Processing (ICASSP), vol. 2, pp. 554-557.
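    
    Examples
    --------
    A minimal sketch with a synthetic tone; the values are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> sr = 16000
    >>> tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 second, 440 Hz
    >>> faster = sp.augment.speed_increase(tone, sr, perc=0.15)
    >>> len(faster) < len(tone)  # sped-up audio has fewer samples
    True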
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    # if entered 50 instead of .50, turns 50 into .50
    while perc > 1:
        perc *= .01
    rate = 1. + perc
    y_fast = librosa.effects.time_stretch(data, rate=rate)
    return y_fast 
def speed_decrease(sound, sr, perc=0.15, **kwargs):
    '''Acoustic augmentation of speech: slows down the signal using a time-stretch rate of 1 - `perc`.
    
    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for 
    improving animal audio classification. Ecological Informatics, 57, 101084. 
    https://doi.org/10.1016/j.ecoinf.2020.101084
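    
    Examples
    --------
    A minimal sketch with a synthetic tone; the values are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> sr = 16000
    >>> tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 second, 440 Hz
    >>> slower = sp.augment.speed_decrease(tone, sr, perc=0.15)
    >>> len(slower) > len(tone)  # slowed-down audio has more samples
    True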
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    # if entered 50 instead of .50, turns 50 into .50
    while perc > 1:
        perc *= .01
    rate = 1. - perc
    y_slow = librosa.effects.time_stretch(data, rate=rate)
    return y_slow 
def time_shift(sound, sr, random_seed = None, **kwargs):
    '''Acoustic augmentation of sound (likely not suitable for speech).
    
    Applies a random time shift by dividing the sound into two sections and 
    switching their order.
    
    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for 
    improving animal audio classification. Ecological Informatics, 57, 101084. 
    https://doi.org/10.1016/j.ecoinf.2020.101084
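    
    Examples
    --------
    A minimal sketch using an integer ramp so the section swap is easy to see; 
    the values are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> data = np.arange(8, dtype=np.float64)
    >>> shifted = sp.augment.time_shift(data, sr=8000, random_seed=40)
    >>> sorted(shifted) == sorted(data)  # same samples, possibly reordered
    True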
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    switched = sp.augment.shufflesound(data, sr=sr, 
                                          num_subsections = 2, 
                                          random_seed = random_seed)
    return switched 
    
def shufflesound(sound, sr, num_subsections = 2, random_seed = None, **kwargs):
    '''Acoustic augmentation of noise or background sounds.
    
    This separates the sound into `num_subsections` and pseudorandomizes
    the order.
    
    References
    ----------
    Inoue, T., Vinayavekhin, P., Wang, S., Wood, D., Munawar, A., Ko, B. J.,
    Greco, N., & Tachibana, R. (2019). Shuffling and mixing data augmentation 
    for environmental sound classification. Detection and Classification of 
    Acoustic Scenes and Events 2019. 25-26 October 2019, New York, NY, USA
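    
    Examples
    --------
    A minimal sketch using an integer ramp; the values are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> data = np.arange(9, dtype=np.float64)
    >>> shuffled = sp.augment.shufflesound(data, sr=9000, num_subsections=3, random_seed=40)
    >>> len(shuffled) == len(data)  # all samples kept, subsections reordered
    True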
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    subsection_length = len(data) // num_subsections
    order = np.arange(num_subsections)
    if random_seed is not None:
        np.random.seed(random_seed)
    np.random.shuffle(order)
    section_dict = {}
    sample = 0
    for i in range(num_subsections):
        if i == num_subsections-1:
            section = data[sample:]
        else:
            section = data[sample:sample+subsection_length]
        section_dict[i] = section
        sample += subsection_length
    # combine samples in new order:
    samples_shuffled = np.array([])
    for i in order:
        samples_shuffled = np.concatenate((samples_shuffled, section_dict[i]),axis=0)
    return samples_shuffled 
def add_white_noise(sound, sr, noise_level=0.01, snr=10, random_seed=None, **kwargs):
    '''Adds generated white noise to the signal at a specified signal-to-noise ratio.
    
    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for 
    improving animal audio classification. Ecological Informatics, 57, 101084. 
    https://doi.org/10.1016/j.ecoinf.2020.101084
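    
    Examples
    --------
    A minimal sketch with a synthetic tone; passing a list for `snr` picks one 
    of the values at random. The values here are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> sr = 16000
    >>> tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)  # 1 second, 440 Hz
    >>> noisy = sp.augment.add_white_noise(tone, sr, snr=10, random_seed=40)
    >>> noisy2 = sp.augment.add_white_noise(tone, sr, snr=[5, 10, 20], random_seed=40)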
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr)
        assert sr2 == sr
    n = sp.dsp.generate_noise(num_samples = len(data), 
                                amplitude=noise_level, 
                                random_seed=random_seed)
    if isinstance(snr, list):
        snr = np.random.choice(snr)
    sound_n, snr = sp.dsp.add_backgroundsound(data, n, sr = sr, snr=snr, **kwargs)
    return sound_n 
def harmonic_distortion(sound, sr, **kwargs):
    '''Applies harmonic distortion by passing the signal through a sine function five times.
    
    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for 
    improving animal audio classification. Ecological Informatics, 57, 101084. 
    https://doi.org/10.1016/j.ecoinf.2020.101084
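    
    Examples
    --------
    A minimal sketch with a synthetic tone; the values are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> sr = 16000
    >>> tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
    >>> distorted = sp.augment.harmonic_distortion(tone, sr)
    >>> distorted.shape == tone.shape  # shape is unchanged
    True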
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    data = 2*np.pi*data
    # pass the signal through the sine function five times
    for _ in range(5):
        data = np.sin(data)
    return data 
    
def pitch_increase(sound, sr, num_semitones = 2, **kwargs):
    '''Raises the pitch of the signal by `num_semitones` semitones.
    
    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for 
    improving animal audio classification. Ecological Informatics, 57, 101084. 
    https://doi.org/10.1016/j.ecoinf.2020.101084
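    
    Examples
    --------
    A minimal sketch with a synthetic tone; the values are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> sr = 16000
    >>> tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
    >>> higher = sp.augment.pitch_increase(tone, sr, num_semitones=2)
    >>> len(higher) == len(tone)  # pitch shifting preserves length
    True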
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    y_i = librosa.effects.pitch_shift(data, sr=sr, n_steps = num_semitones)
    return y_i 
def pitch_decrease(sound, sr, num_semitones = 2, **kwargs):
    '''Lowers the pitch of the signal by `num_semitones` semitones.
    
    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for 
    improving animal audio classification. Ecological Informatics, 57, 101084. 
    https://doi.org/10.1016/j.ecoinf.2020.101084
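    
    Examples
    --------
    A minimal sketch with a synthetic tone; the values are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> sr = 16000
    >>> tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
    >>> lower = sp.augment.pitch_decrease(tone, sr, num_semitones=2)
    >>> len(lower) == len(tone)  # pitch shifting preserves length
    True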
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    y_d = librosa.effects.pitch_shift(data, sr=sr, n_steps = -num_semitones)
    return y_d 
      
# TODO how to control output size without losing frequency data?
# basically how to scale down dimension of frequencies after warping?
# https://docs.scipy.org/doc/scipy/reference/tutorial/ndimage.html#interpolation-functions
# scikit-image resize (only powerspectrum)
# https://stackoverflow.com/questions/23918036/interpolate-whole-arrays-of-complex-numbers
def vtlp(sound, sr, a = (0.8,1.2), random_seed = None,
         oversize_factor = 16, win_size_ms = 50, percent_overlap = 0.5,
         bilinear_warp = True, real_signal = True, fft_bins = 1024, window = 'hann',
         zeropad = True, expected_shape = None, visualize = False):
    '''Applies vocal tract length perturbation (VTLP) directly to oversized DFT windows.
    
    References
    ----------
    Kim, C., Shin, M., Garg, A., & Gowda, D. (2019). Improved vocal tract length perturbation 
    for a state-of-the-art end-to-end speech recognition system. Interspeech. September 15-19, 
    Graz, Austria.
    
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for 
    improving animal audio classification. Ecological Informatics, 57, 101084. 
    https://doi.org/10.1016/j.ecoinf.2020.101084
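    
    Examples
    --------
    A minimal sketch with a synthetic tone; the warp factor is drawn at random 
    from the range `a`. The values here are illustrative only:
    
    >>> import numpy as np
    >>> import soundpy as sp
    >>> sr = 16000
    >>> tone = np.sin(2 * np.pi * 440 * np.arange(sr) / sr)
    >>> stft_warped, warp_factor = sp.augment.vtlp(tone, sr, a=(0.8, 1.2), random_seed=40)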
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr)
        assert sr2 == sr
    if random_seed is not None:
        np.random.seed(random_seed)
    if isinstance(a, tuple) or isinstance(a, list):
        vtlp_a = np.random.choice(np.arange(min(a), max(a)+.1, 0.1)  )
    elif isinstance(a, int) or isinstance(a, float):
        vtlp_a = a
    else:
        vtlp_a = None
    if not isinstance(vtlp_a, (int, float, np.int_, np.float_)):
        raise TypeError('Function `soundpy.augment.vtlp` expected a to be an int or float, or'+\
            ' a list / tuple of ints, or floats; not of type {}'.format(type(a)))
    frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
    num_overlap_samples = int(frame_length * percent_overlap)
    num_subframes = sp.dsp.calc_num_subframes(len(data),
                                                frame_length = frame_length,
                                                overlap_samples = num_overlap_samples,
                                                zeropad = zeropad)
    max_freq = sr/2.
    if expected_shape is not None:
        # expects last column to represent the number of relevant frequency bins
        #fft_bins = expected_shape[-1]
        fft_bins = (expected_shape[-1]-1) * 2 
    if fft_bins is None:
        fft_bins = int(win_size_ms * sr // 1000)
    total_rows = fft_bins * oversize_factor
    # initialize empty matrix to fill dft values into
    stft_matrix = sp.dsp.create_empty_matrix(
        (num_subframes,total_rows), complex_vals = True)
    
    section_start = 0
    window_frame = sp.dsp.create_window(window, frame_length)
    for frame in range(num_subframes):
        section = data[section_start:section_start+frame_length]
        section = sp.dsp.apply_window(section, window_frame, zeropad = zeropad)
        # apply dft to large window - increase frequency resolution during warping
        section_fft = sp.dsp.calc_fft(section, 
                                        real_signal = real_signal,
                                        fft_bins = total_rows,
                                        )
        if bilinear_warp:
            section_warped = sp.dsp.bilinear_warp(section_fft, vtlp_a)
        else:
            section_warped = sp.dsp.piecewise_linear_warp(section_fft, vtlp_a,
                                                                max_freq = max_freq)
        if real_signal:
            section_warped = section_warped[:len(section_warped)]
        else:
            section_warped = section_warped[:len(section_warped)//2 + 1]
        stft_matrix[frame][:len(section_warped)] = section_warped
        section_start += (frame_length - num_overlap_samples)
    if expected_shape is not None:
        stft_matrix = stft_matrix[:,:len(section_warped)]
        # TODO: find out how to reduce resolution of frequency
        # this technically works but is 1) slow and 2) loses lots of info
        if oversize_factor > 1:
            import skimage
            from skimage.transform import resize
            power_matrix = sp.dsp.calc_power(stft_matrix)
            stft_matrix = resize(power_matrix, expected_shape)
            import warnings
            msg = '\nWARNING: Only the power spectrum of the VTLP augmented signal'+\
                ' can be returned due to resizing the augmentation from '+\
                    '{} to {}'.format(power_matrix.shape, expected_shape)
            warnings.warn(msg)
            #for i in np.arange(0, int(np.sqrt(oversize_factor))):
                #stft_matrix = sp.feats.reduce_dim(stft_matrix, axis=1)
        # ensures matches expected_shape
        stft_matrix = sp.feats.adjust_shape(stft_matrix, expected_shape)
    else:
        stft_matrix = stft_matrix[:,:len(section_warped)]
    if visualize:
        sp.feats.plot(stft_matrix, feature_type = 'stft', subprocess=True, 
                    name4pic = 'vtlp_{}.png'.format(sp.utils.get_date()),
                    title = 'size: {}'.format(stft_matrix.shape),
                    save_pic=True)
    return stft_matrix, vtlp_a 
def get_augmentation_dict():
    '''Returns dictionary with augmentation options as keys and values set to False.
    
    Examples
    --------
    >>> import soundpy as sp
    >>> ad = sp.augment.get_augmentation_dict()
    >>> ad
    {'speed_increase': False,
    'speed_decrease': False,
    'time_shift': False,
    'shufflesound': False,
    'add_white_noise': False,
    'harmonic_distortion': False,
    'pitch_increase': False,
    'pitch_decrease': False,
    'vtlp': False}
    >>> # to set augmentation to True:
    >>> ad['add_white_noise'] = True
    >>> ad
    {'speed_increase': False,
    'speed_decrease': False,
    'time_shift': False,
    'shufflesound': False,
    'add_white_noise': True,
    'harmonic_distortion': False,
    'pitch_increase': False,
    'pitch_decrease': False,
    'vtlp': False}
    '''
    base_dict = dict([('speed_increase', False),
                      ('speed_decrease', False),
                      ('time_shift', False),
                      ('shufflesound', False),
                      ('add_white_noise', False),
                      ('harmonic_distortion', False),
                      ('pitch_increase', False),
                      ('pitch_decrease', False),
                      ('vtlp', False),
                      ])
    return base_dict 
def list_augmentations():
    '''Lists available augmentations.
    
    Examples
    --------
    >>> import soundpy as sp
    >>> print(sp.augment.list_augmentations())
    Available augmentations:
            speed_increase
            speed_decrease
            time_shift
            shufflesound
            add_white_noise
            harmonic_distortion
            pitch_increase
            pitch_decrease
            vtlp
    '''
    augmentation_dict = sp.augment.get_augmentation_dict()
    aug_list = '\t'+'\n\t'.join(str(x) for x in augmentation_dict.keys())
    augmentations = 'Available augmentations:\n '+ aug_list
    return augmentations 
    
# TODO test to see if list can be applied to all augmentations, not just 'add_white_noise'
def get_augmentation_settings_dict(augmentation):
    '''Returns default settings of base function for augmentation.
    
    Parameters
    ----------
    augmentation : str 
        The augmentation of interest.
    
    Returns
    -------
    aug_defaults : dict 
        A dictionary with the base augmentation function parameters as keys  
        and default values as values. 
    
    Examples
    --------
    >>> import soundpy as sp
    >>> d = sp.augment.get_augmentation_settings_dict('speed_decrease')
    >>> d
    {'perc': 0.15}
    >>> # can use this dictionary to apply different values for augmentation
    >>> d['perc'] = 0.1
    >>> d
    {'perc': 0.1}
    >>> # to build a dictionary with several settings:
    >>> many_settings_dict = {}
    >>> many_settings_dict['add_white_noise'] = sp.augment.get_augmentation_settings_dict('add_white_noise')
    >>> many_settings_dict['pitch_increase'] = sp.augment.get_augmentation_settings_dict('pitch_increase')
    >>> many_settings_dict
    {'add_white_noise': {'noise_level': 0.01, 'snr': 10, 'random_seed': None},
    'pitch_increase': {'num_semitones': 2}}
    >>> # change 'snr' default values to list of several values
    >>> # this would apply white noise at either 10, 15, or 20 SNR, at random
    >>> many_settings_dict['add_white_noise']['snr'] = [10, 15, 20]
    >>> # change number of semitones pitch increase is applied
    >>> many_settings_dict['pitch_increase']['num_semitones'] = 1
    >>> many_settings_dict
    {'add_white_noise': {'noise_level': 0.01,
    'snr': [10, 15, 20],
    'random_seed': None},
    'pitch_increase': {'num_semitones': 1}}
    
    Raises
    ------
    ValueError 
        If `augmentation` does not match available augmentations.
    
    See Also
    --------
    soundpy.models.dataprep.augment_features
        The above dictionary example `many_settings_dict` can be applied under the
        parameter `augment_settings_dict` to apply augmentation settings when 
        augmenting data, for example, within a generator function. See `soundpy.models.dataprep.GeneratorFeatExtraction`.
    '''
    if augmentation == 'speed_increase':
        aug_defaults = sp.utils.get_default_args(sp.augment.speed_increase)
    elif augmentation == 'speed_decrease':
        aug_defaults = sp.utils.get_default_args(sp.augment.speed_decrease)        
    elif augmentation == 'time_shift':
        aug_defaults = sp.utils.get_default_args(sp.augment.time_shift)
    elif augmentation == 'shufflesound':
        aug_defaults = sp.utils.get_default_args(sp.augment.shufflesound)
    elif augmentation == 'add_white_noise':
        aug_defaults = sp.utils.get_default_args(sp.augment.add_white_noise)
    elif augmentation == 'harmonic_distortion':
        aug_defaults = sp.utils.get_default_args(sp.augment.harmonic_distortion)
    elif augmentation == 'pitch_increase':
        aug_defaults = sp.utils.get_default_args(sp.augment.pitch_increase)
    elif augmentation == 'pitch_decrease':
        aug_defaults = sp.utils.get_default_args(sp.augment.pitch_decrease)
    elif augmentation == 'vtlp':
        aug_defaults = sp.utils.get_default_args(sp.augment.vtlp)
    else:
        raise ValueError('Received `augmentation` "{}"'.format(augmentation)+\
            ' which is not included in available augmentations:\n{}'.format(
                sp.augment.list_augmentations()))
    return aug_defaults