'''The augment module includes functions related to augmenting audio data.
These functions pull from implementations performed in research.
Other resources for augmentation (not included in soundpy functionality):
Ma, E. (2019). NLP Augmentation. https://github.com/makcedward/nlpaug
Park, D. S., Chan, W., Zhang, Y., Chiu, C., Zoph, B., Cubuk, E. D., & Le, Q. V.
(2019). Google Brain. arxiv.org/pdf/1904.08779.pdf
Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
improving animal audio classification. Ecological Informatics, 57, 101084.
https://doi.org/10.1016/j.ecoinf.2020.101084:
1. Signal speed scaling by a random number in [0.8, 1.2] (SpeedupFactoryRange).
2. Pitch shift by a random number in [−2, 2] semitones (SemitoneShiftRange).
3. Volume increase/decrease by a random number in [−3, 3] dB (VolumeGainRange).
4. Addition of random noise in the range [0, 10] dB (SNR).
5. Time shift in the range [−0.005, 0.005] seconds (TimeShiftRange).
'''
###############################################################################
import os, sys
import inspect
currentdir = os.path.dirname(os.path.abspath(
inspect.getfile(inspect.currentframe())))
packagedir = os.path.dirname(currentdir)
sys.path.insert(0, packagedir)
import numpy as np
import math
import librosa
import pathlib
import soundpy as sp
def speed_increase(sound, sr, perc=0.15, **kwargs):
    '''Speeds up the sound by `perc` percent (acoustic augmentation of speech).

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    perc : float
        The percentage to increase the speed, e.g. 0.15 speeds up by 15%.
        Whole-number percentages are normalized (e.g. 15 becomes 0.15).
    **kwargs : additional keyword arguments
        Keyword arguments for `soundpy.files.loadsound`.

    Returns
    -------
    y_fast : np.ndarray
        The sped-up audio samples.

    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
    improving animal audio classification. Ecological Informatics, 57, 101084.
    https://doi.org/10.1016/j.ecoinf.2020.101084

    Ko, T., Peddinti, V., Povey, D., & Khudanpur (2015). Audio Augmentation for
    Speech Recognition. Interspeech.

    W. Verhelst and M. Roelands, "An overlap-add technique based on
    waveform similarity (WSOLA) for high quality time-scale modification
    of speech," in Proceedings of the International Conference on
    Acoustics, Speech and Signal Processing (ICASSP), vol. 2, April
    1993, pp. 554-557.
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    # Normalize whole-number percentages, e.g. 50 -> 0.50.
    # (The previous inner `if perc <= 1: break` was redundant: the loop
    # condition already terminates as soon as perc <= 1.)
    while perc > 1:
        perc *= .01
    rate = 1. + perc
    # Pass `rate` by keyword: librosa >= 0.10 made it keyword-only, and
    # the keyword form also works on older librosa versions.
    y_fast = librosa.effects.time_stretch(data, rate=rate)
    return y_fast
def speed_decrease(sound, sr, perc=0.15, **kwargs):
    '''Slows down the sound by `perc` percent (acoustic augmentation of speech).

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    perc : float
        The percentage to decrease the speed, e.g. 0.15 slows down by 15%.
        Whole-number percentages are normalized (e.g. 15 becomes 0.15).
    **kwargs : additional keyword arguments
        Keyword arguments for `soundpy.files.loadsound`.

    Returns
    -------
    y_slow : np.ndarray
        The slowed-down audio samples.

    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
    improving animal audio classification. Ecological Informatics, 57, 101084.
    https://doi.org/10.1016/j.ecoinf.2020.101084
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    # Normalize whole-number percentages, e.g. 50 -> 0.50.
    # (The previous inner `if perc <= 1: break` was redundant: the loop
    # condition already terminates as soon as perc <= 1.)
    while perc > 1:
        perc *= .01
    rate = 1. - perc
    # Pass `rate` by keyword: librosa >= 0.10 made it keyword-only, and
    # the keyword form also works on older librosa versions.
    y_slow = librosa.effects.time_stretch(data, rate=rate)
    return y_slow
def time_shift(sound, sr, random_seed = None, **kwargs):
    '''Acoustic augmentation of sound (probably not for speech).

    Applies a random shift by cutting the sound into two sections and
    (pseudorandomly) swapping them.

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    random_seed : int, optional
        Seed controlling whether the two sections get swapped.
    **kwargs : additional keyword arguments
        Keyword arguments for `soundpy.files.loadsound`.

    Returns
    -------
    np.ndarray
        The (possibly) shifted audio samples.

    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
    improving animal audio classification. Ecological Informatics, 57, 101084.
    https://doi.org/10.1016/j.ecoinf.2020.101084
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    # Delegate to shufflesound with exactly two subsections: the result is
    # either the original order or the two halves swapped.
    return sp.augment.shufflesound(data, sr=sr,
                                   num_subsections = 2,
                                   random_seed = random_seed)
def shufflesound(sound, sr, num_subsections = 2, random_seed = None, **kwargs):
    '''Acoustic augmentation of noise or background sounds.

    Separates the sound into `num_subsections` contiguous sections and
    pseudorandomizes their order.

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    num_subsections : int
        Number of sections to cut the sound into.
    random_seed : int, optional
        Seed for the pseudorandom ordering of the sections.
    **kwargs : additional keyword arguments
        Keyword arguments for `soundpy.files.loadsound`.

    Returns
    -------
    samples_shuffled : np.ndarray
        The samples with the sections rearranged.

    References
    ----------
    Inoue, T., Vinayavekhin, P., Wang, S., Wood, D., Munawar, A., Ko, B. J.,
    Greco, N., & Tachibana, R. (2019). Shuffling and mixing data augmentation
    for environmental sound classification. Detection and Classification of
    Acoustic Scenes and Events 2019. 25-26 October 2019, New York, NY, USA
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    subsection_length = len(data) // num_subsections
    order = np.arange(num_subsections)
    if random_seed is not None:
        np.random.seed(random_seed)
    np.random.shuffle(order)
    # Slice into contiguous sections; the final section absorbs any
    # leftover samples when len(data) is not evenly divisible.
    sections = []
    sample = 0
    for i in range(num_subsections):
        if i == num_subsections - 1:
            sections.append(data[sample:])
        else:
            sections.append(data[sample:sample + subsection_length])
        sample += subsection_length
    # Concatenate once in the shuffled order: O(n) instead of repeatedly
    # concatenating onto a growing array (O(n^2)), and the input dtype is
    # preserved rather than being upcast by an empty float64 accumulator.
    samples_shuffled = np.concatenate([sections[i] for i in order], axis=0)
    return samples_shuffled
def add_white_noise(sound, sr, noise_level=0.01, snr=10, random_seed=None, **kwargs):
    '''Adds generated white noise to the sound at the given signal-to-noise ratio.

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    noise_level : float
        Amplitude of the generated noise.
    snr : int, float, or list
        The target signal-to-noise ratio in dB. If a list, one value is
        chosen from it pseudorandomly.
    random_seed : int, optional
        Seed for the noise generation.
    **kwargs : additional keyword arguments
        Keyword arguments for `soundpy.dsp.add_backgroundsound`.

    Returns
    -------
    sound_n : np.ndarray
        The audio samples with white noise added.

    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
    improving animal audio classification. Ecological Informatics, 57, 101084.
    https://doi.org/10.1016/j.ecoinf.2020.101084
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr)
        assert sr2 == sr
    noise = sp.dsp.generate_noise(num_samples = len(data),
                                  amplitude = noise_level,
                                  random_seed = random_seed)
    # A list of SNR values means: pick one at (pseudo)random for this call.
    target_snr = np.random.choice(snr) if isinstance(snr, list) else snr
    sound_n, target_snr = sp.dsp.add_backgroundsound(data, noise, sr = sr,
                                                     snr = target_snr, **kwargs)
    return sound_n
def harmonic_distortion(sound, sr, **kwargs):
    '''Applies harmonic distortion: scales by 2*pi, then applies sin five times.

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    **kwargs : additional keyword arguments
        Keyword arguments for `soundpy.files.loadsound`.

    Returns
    -------
    np.ndarray
        The distorted audio samples.

    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
    improving animal audio classification. Ecological Informatics, 57, 101084.
    https://doi.org/10.1016/j.ecoinf.2020.101084
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    distorted = 2 * np.pi * data
    for _ in range(5):
        distorted = np.sin(distorted)
    return distorted
def pitch_increase(sound, sr, num_semitones = 2, **kwargs):
    '''Raises the pitch of the sound by `num_semitones` semitones.

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    num_semitones : int
        Number of semitones to raise the pitch.
    **kwargs : additional keyword arguments
        Keyword arguments for `soundpy.files.loadsound`.

    Returns
    -------
    np.ndarray
        The pitch-raised audio samples.

    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
    improving animal audio classification. Ecological Informatics, 57, 101084.
    https://doi.org/10.1016/j.ecoinf.2020.101084
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    return librosa.effects.pitch_shift(data, sr=sr, n_steps = num_semitones)
def pitch_decrease(sound, sr, num_semitones = 2, **kwargs):
    '''Lowers the pitch of the sound by `num_semitones` semitones.

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    num_semitones : int
        Number of semitones to lower the pitch (applied as a negative shift).
    **kwargs : additional keyword arguments
        Keyword arguments for `soundpy.files.loadsound`.

    Returns
    -------
    np.ndarray
        The pitch-lowered audio samples.

    References
    ----------
    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
    improving animal audio classification. Ecological Informatics, 57, 101084.
    https://doi.org/10.1016/j.ecoinf.2020.101084
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
        assert sr2 == sr
    return librosa.effects.pitch_shift(data, sr=sr, n_steps = -num_semitones)
# TODO how to control output size without losing frequency data?
# basically how to scale down dimension of frequencies after warping?
# https://docs.scipy.org/doc/scipy/reference/tutorial/ndimage.html#interpolation-functions
# scikit-image resize (only powerspectrum)
# https://stackoverflow.com/questions/23918036/interpolate-whole-arrays-of-complex-numbers
def vtlp(sound, sr, a = (0.8,1.2), random_seed = None,
         oversize_factor = 16, win_size_ms = 50, percent_overlap = 0.5,
         bilinear_warp = True, real_signal = True, fft_bins = 1024, window = 'hann',
         zeropad = True, expected_shape = None, visualize = False):
    '''Applies vocal tract length perturbations directly to dft (oversized) windows.

    Parameters
    ----------
    sound : str, pathlib.PosixPath, or np.ndarray
        The sound to augment: either a filepath or the audio samples themselves.
    sr : int
        The sample rate of `sound`; audio files are loaded at this rate.
    a : tuple, list, int, or float
        The warp factor. If a tuple or list, a factor is chosen
        pseudorandomly from [min(a), max(a)] in steps of 0.1; an int or
        float is used as-is.
    random_seed : int, optional
        Seed for the pseudorandom warp-factor choice.
    oversize_factor : int
        Factor by which the DFT window is oversized, to increase frequency
        resolution during warping.
    win_size_ms : int or float
        Window size in milliseconds.
    percent_overlap : float
        Fraction of window overlap between frames (e.g. 0.5 for 50%).
    bilinear_warp : bool
        If True, applies `sp.dsp.bilinear_warp`; otherwise
        `sp.dsp.piecewise_linear_warp`.
    real_signal : bool
        Passed to `sp.dsp.calc_fft`; also controls how much of the warped
        spectrum is kept per frame.
    fft_bins : int, optional
        Number of FFT bins; overridden if `expected_shape` is given, and
        derived from `win_size_ms` and `sr` if None.
    window : str
        Window function name (e.g. 'hann').
    zeropad : bool
        Whether frames are zero-padded.
    expected_shape : tuple, optional
        Desired output shape; if set, the result is resized/adjusted to it.
    visualize : bool
        If True, saves a plot of the resulting matrix.

    Returns
    -------
    stft_matrix : np.ndarray
        The warped STFT matrix (or its power spectrum, if resized to
        `expected_shape` with `oversize_factor` > 1).
    vtlp_a : int or float
        The warp factor that was applied.

    References
    ----------
    Kim, C., Shin, M., Garg, A., & Gowda, D. (2019). Improved vocal tract length perturbation
    for a state-of-the-art end-to-end speech recognition system. Interspeech. September 15-19,
    Graz, Austria.

    Nanni, L., Maguolo, G., & Paci, M. (2020). Data augmentation approaches for
    improving animal audio classification. Ecological Informatics, 57, 101084.
    https://doi.org/10.1016/j.ecoinf.2020.101084
    '''
    if isinstance(sound, np.ndarray):
        data = sound
    else:
        data, sr2 = sp.loadsound(sound, sr=sr)
        assert sr2 == sr
    if random_seed is not None:
        np.random.seed(random_seed)
    # Resolve the warp factor: pick pseudorandomly from a range, or use as given.
    if isinstance(a, tuple) or isinstance(a, list):
        vtlp_a = np.random.choice(np.arange(min(a), max(a)+.1, 0.1) )
    elif isinstance(a, int) or isinstance(a, float):
        vtlp_a = a
    else:
        vtlp_a = None
    # NOTE(review): np.float_ was removed in NumPy 2.0 (np.float64 replaces
    # it) — confirm the project's supported NumPy versions.
    if isinstance(vtlp_a, int) or isinstance(vtlp_a, float) or isinstance(vtlp_a, np.int_) \
        or isinstance(vtlp_a, np.float_):
        pass
    else:
        raise TypeError('Function `soundpy.augment.vtlp` expected a to be an int or float, or'+\
            ' a list / tuple of ints, or floats; not of type {}'.format(type(a)))
    frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
    num_overlap_samples = int(frame_length * percent_overlap)
    num_subframes = sp.dsp.calc_num_subframes(len(data),
                                              frame_length = frame_length,
                                              overlap_samples = num_overlap_samples,
                                              zeropad = zeropad)
    # Nyquist frequency: upper bound used by the piecewise linear warp.
    max_freq = sr/2.
    if expected_shape is not None:
        # expects last column to represent the number of relevant frequency bins
        #fft_bins = expected_shape[-1]
        fft_bins = (expected_shape[-1]-1) * 2
    if fft_bins is None:
        fft_bins = int(win_size_ms * sr // 1000)
    total_rows = fft_bins * oversize_factor
    # initialize empty matrix to fill dft values into
    stft_matrix = sp.dsp.create_empty_matrix(
        (num_subframes,total_rows), complex_vals = True)
    section_start = 0
    window_frame = sp.dsp.create_window(window, frame_length)
    for frame in range(num_subframes):
        section = data[section_start:section_start+frame_length]
        section = sp.dsp.apply_window(section, window_frame, zeropad = zeropad)
        # apply dft to large window - increase frequency resolution during warping
        section_fft = sp.dsp.calc_fft(section,
                                      real_signal = real_signal,
                                      fft_bins = total_rows,
                                      )
        if bilinear_warp:
            section_warped = sp.dsp.bilinear_warp(section_fft, vtlp_a)
        else:
            section_warped = sp.dsp.piecewise_linear_warp(section_fft, vtlp_a,
                                                          max_freq = max_freq)
        if real_signal:
            # NOTE(review): this slice is a no-op (keeps the full length) —
            # presumably the real-signal FFT already contains only the
            # relevant bins; verify against sp.dsp.calc_fft.
            section_warped = section_warped[:len(section_warped)]
        else:
            # Keep only the first half of the spectrum plus one bin.
            section_warped = section_warped[:len(section_warped)//2 + 1]
        stft_matrix[frame][:len(section_warped)] = section_warped
        # Advance by the hop size (frame length minus the overlap).
        section_start += (frame_length - num_overlap_samples)
    # `section_warped` below refers to the last frame's warped section,
    # used only for its length (the number of relevant frequency columns).
    if expected_shape is not None:
        stft_matrix = stft_matrix[:,:len(section_warped)]
        # TODO: find out how to reduce resolution of frequency
        # this technically works but is 1) slow and 2) loses lots of info
        if oversize_factor > 1:
            import skimage
            from skimage.transform import resize
            power_matrix = sp.dsp.calc_power(stft_matrix)
            stft_matrix = resize(power_matrix, expected_shape)
            import warnings
            msg = '\nWARNING: Only the power spectrum of the VTLP augmented signal'+\
                ' can be returned due to resizing the augmentation from '+\
                    '{} to {}'.format(power_matrix.shape, expected_shape)
            warnings.warn(msg)
            #for i in np.arange(0, int(np.sqrt(oversize_factor))):
                #stft_matrix = sp.feats.reduce_dim(stft_matrix, axis=1)
            # ensures matches expected_shape
            stft_matrix = sp.feats.adjust_shape(stft_matrix, expected_shape)
    else:
        stft_matrix = stft_matrix[:,:len(section_warped)]
    if visualize:
        sp.feats.plot(stft_matrix, feature_type = 'stft', subprocess=True,
                      name4pic = 'vtlp_{}.png'.format(sp.utils.get_date()),
                      title = 'size: {}'.format(stft_matrix.shape),
                      save_pic=True)
    return stft_matrix, vtlp_a
def get_augmentation_dict():
    '''Returns dictionary with augmentation options as keys and values set to False.

    Returns
    -------
    dict
        Each available augmentation name mapped to False; set an entry to
        True to enable that augmentation.

    Examples
    --------
    >>> import soundpy as sp
    >>> ad = sp.augment.get_augmentation_dict()
    >>> ad['add_white_noise']
    False
    >>> # to set augmentation to True:
    >>> ad['add_white_noise'] = True
    >>> ad['add_white_noise']
    True
    '''
    return {
        'speed_increase': False,
        'speed_decrease': False,
        'time_shift': False,
        'shufflesound': False,
        'add_white_noise': False,
        'harmonic_distortion': False,
        'pitch_increase': False,
        'pitch_decrease': False,
        'vtlp': False,
    }
def list_augmentations():
    '''Lists available augmentations.

    Returns
    -------
    str
        A human-readable listing of the augmentation names, one per line.

    Examples
    --------
    >>> import soundpy as sp
    >>> print(sp.augment.list_augmentations())
    Available augmentations:
            speed_increase
            speed_decrease
            time_shift
            shufflesound
            add_white_noise
            harmonic_distortion
            pitch_increase
            pitch_decrease
            vtlp
    '''
    names = [str(key) for key in sp.augment.get_augmentation_dict()]
    aug_list = '\t' + '\n\t'.join(names)
    return 'Available augmentations:\n ' + aug_list
# TODO test to see if list can be applied to all augmentations, not just 'add_white_noise'
def get_augmentation_settings_dict(augmentation):
    '''Returns default settings of base function for augmentation.

    Parameters
    ----------
    augmentation : str
        The augmentation of interest.

    Returns
    -------
    aug_defaults : dict
        A dictionary with the base augmentation function parameters as keys
        and default values as values.

    Examples
    --------
    >>> import soundpy as sp
    >>> d = sp.augment.get_augmentation_settings_dict('speed_decrease')
    >>> d
    {'perc': 0.15}
    >>> # can use this dictionary to apply different values for augmentation
    >>> d['perc'] = 0.1
    >>> d
    {'perc': 0.1}

    Raises
    ------
    ValueError
        If `augmentation` does not match available augmentations.

    See Also
    --------
    soundpy.models.dataprep.augment_features
        A dictionary of such settings can be applied under the parameter
        `augment_settings_dict` when augmenting data, for example within a
        generator function. See `soundpy.models.dataprep.GeneratorFeatExtraction`.
    '''
    # A single name -> function table replaces the previous nine-branch
    # if/elif chain; adding an augmentation now only requires one entry here.
    aug_functions = {
        'speed_increase': sp.augment.speed_increase,
        'speed_decrease': sp.augment.speed_decrease,
        'time_shift': sp.augment.time_shift,
        'shufflesound': sp.augment.shufflesound,
        'add_white_noise': sp.augment.add_white_noise,
        'harmonic_distortion': sp.augment.harmonic_distortion,
        'pitch_increase': sp.augment.pitch_increase,
        'pitch_decrease': sp.augment.pitch_decrease,
        'vtlp': sp.augment.vtlp,
    }
    try:
        base_function = aug_functions[augmentation]
    except KeyError:
        # Error message typo fixed ('Receieved' -> 'Received').
        raise ValueError('Received `augmentation` "{}"'.format(augmentation) +
                         ' which is not included in available augmentations:\n{}'.format(
                             sp.augment.list_augmentations()))
    aug_defaults = sp.utils.get_default_args(base_function)
    return aug_defaults