'''Feats module includes functions related to converting audio sample data
to features for analysis, filtering, machine learning, or visualization.
'''
###############################################################################
import os, sys
import inspect
currentdir = os.path.dirname(os.path.abspath(
inspect.getfile(inspect.currentframe())))
packagedir = os.path.dirname(currentdir)
sys.path.insert(0, packagedir)
import collections
import numpy as np
import math
import random
import matplotlib
import librosa
from scipy.signal import hann, hamming
from scipy.fftpack import dct
import pathlib
from python_speech_features import fbank, mfcc, delta
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import soundpy as sp
# TODO Clean up
# stereo sound only for plotting 'signal'; NOT for frequency features.
def plot(feature_matrix, feature_type,
save_pic=False, name4pic=None, energy_scale='power_to_db',
title=None, sr=None, win_size_ms=None, percent_overlap=None,
x_label=None, y_label=None, subprocess=False, overwrite=False):
'''Visualize feature extraction: frames on the x axis, features on the y axis. Uses librosa to scale the data if an energy scale is applied.
Note: multiple channels are only supported if `feature_type` is 'signal'. For other
feature types, the plot will not work as expected.
Parameters
----------
feature_matrix : np.ndarray [shape=(num_samples,), (num_samples, num_channels), or (num_features, num_frames), dtype=np.float]
Matrix of features. If the features are not of type 'signal' and the
shape is 1D, one dimension will be added so they can be plotted with a colormesh.
feature_type : str
Options: 'signal', 'stft', 'mfcc', or 'fbank' features, or
what user would like to name the feature set.
signal: the 1D samples of sound.
STFT: short-time Fourier transform
MFCC: mel frequency cepstral coefficients.
FBANK: mel-log filterbank energies (default 'fbank').
save_pic : bool
True to save image as .png; False to just plot it. If `subprocess` is
True, `save_pic` will automatically be set to True.
name4pic : str, optional
If `save_pic` set to True, the name the image should be saved under.
energy_scale : str, optional
If features need to be adjusted, e.g. from power to decibels.
Default is 'power_to_db'.
title : str, optional
The title for the graph. If None, `feature_type` is used.
sr : int, optional
Useful in plotting the time for features.
win_size_ms : int, float, optional
Useful in plotting the time for features in the frequency domain (e.g.
STFT, FBANK, MFCC features)
percent_overlap : int, float, optional
Useful in plotting the time for features in the frequency domain (e.g.
STFT, FBANK, MFCC features)
x_label : str, optional
The label to be applied to the x axis.
y_label : str, optional
The label to be applied to the y axis.
subprocess : bool
If `subprocess` is True, matplotlib will use backend 'Agg', which only allows plots to be saved.
If `subprocess` is False, the default backend 'TkAgg' will be used, which allows plots to be
generated live as well as saved. The 'Agg' backend is useful if one wants to visualize sound
while a main process is being performed, for example, while a model is being trained.
(default False)
overwrite : bool
If False, if .png file already exists under given name, a date tag will be
added to the .png filename to avoid overwriting the file.
(default False)
Returns
-------
None
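Examples
--------
A minimal sketch, assuming `soundpy` is installed; features are first
extracted, then the plot is saved (not displayed) via the 'Agg' backend:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> noise = np.random.uniform(-1, 1, sr)  # 1 second of noise
>>> fbank = sp.feats.get_feats(noise, sr = sr, feature_type = 'fbank')
>>> sp.feats.plot(fbank, feature_type = 'fbank', sr = sr,
...               win_size_ms = 20, percent_overlap = 0.5,
...               subprocess = True, name4pic = 'fbank_noise')  # doctest: +SKIP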
'''
if not subprocess:
# can show plots
# interferes with training models though
matplotlib.use('TkAgg')
else:
# does not interfere with training models
matplotlib.use('Agg')
if save_pic is False:
import warnings
save_pic = True
if name4pic is None:
location = 'current working directory'
else:
location = name4pic
msg = 'Due to matplotlib using AGG backend, cannot display plot. '+\
'Therefore, the plot will be saved here: {}'.format(location)
warnings.warn(msg)
import matplotlib.pyplot as plt
# ensure real numbers
if feature_matrix.dtype == np.complex64 or feature_matrix.dtype == np.complex128:
feature_matrix = np.abs(feature_matrix)
# features presented via colormesh need 2D format.
if len(feature_matrix.shape) == 1:
feature_matrix = np.expand_dims(feature_matrix, axis=1)
if 'fbank' in feature_type:
axis_feature_label = 'Num Mel Filters'
elif 'mfcc' in feature_type:
axis_feature_label = 'Num Mel Freq Cepstral Coefficients'
elif 'stft' in feature_type:
axis_feature_label = 'Number of frames'
elif 'signal' in feature_type:
axis_feature_label = 'Amplitude'
else:
axis_feature_label = 'Energy'
if energy_scale is None or feature_type == 'signal':
energy_label = 'energy'
energy_scale = None
if energy_scale == 'power_to_db':
feature_matrix = librosa.power_to_db(feature_matrix)
energy_label = 'decibels'
elif energy_scale == 'db_to_power':
feature_matrix = librosa.db_to_power(feature_matrix)
energy_label = 'power'
elif energy_scale == 'amplitude_to_db':
feature_matrix = librosa.amplitude_to_db(feature_matrix)
energy_label = 'decibels'
elif energy_scale == 'db_to_amplitude':
feature_matrix = librosa.db_to_amplitude(feature_matrix)
energy_label = 'amplitude'
plt.clf()
if 'signal' not in feature_type:
x_axis_label = 'Frequency bins'
else:
x_axis_label = 'Samples over time'
if 'signal' in feature_type:
# transpose matrix if the second dimension is larger - probably
# because channels are in the first dimension; channels are expected in the second
if not feature_matrix.shape[0] > feature_matrix.shape[1]:
feature_matrix = feature_matrix.T
if sr is not None:
x_axis_label = 'Time (sec)'
dur_sec = feature_matrix.shape[0] / sr
time_sec = sp.dsp.get_time_points(dur_sec, sr)
for channel in range(feature_matrix.shape[1]):
data = feature_matrix[:,channel]
# overlay the channel data
if len(time_sec) > len(data):
time_sec = time_sec[:len(data)]
elif len(time_sec) < len(data):
data = data[:len(time_sec)]
plt.plot(time_sec, data)
else:
for channel in range(feature_matrix.shape[1]):
data = feature_matrix[:,channel]
# overlay the channel data
plt.plot(data)
x_axis_label += ' across {} channel(s)'.format(feature_matrix.shape[1])
else:
plt.pcolormesh(feature_matrix.T)
plt.colorbar(label=energy_label)
plt.xlabel(x_axis_label)
plt.ylabel(axis_feature_label)
# if feature_matrix has multiple frames, not just one
if feature_matrix.shape[1] > 1 and 'signal' not in feature_type:
if win_size_ms is not None and percent_overlap is not None:
# convert x tick frame indices to seconds using the window shift (win_size_ms scaled by percent_overlap)
plt.xlabel('Time (sec)')
locs, labels = plt.xticks()
if percent_overlap == 0:
new_labels=[str(round(i*0.001*win_size_ms,1)) for i in locs]
else:
new_labels=[str(round(i*0.001*win_size_ms*percent_overlap,1)) for i in locs]
plt.xticks(ticks=locs,labels=new_labels)
else:
plt.xlabel('Number frames')
plt.ylabel('Frequency bins')
if title is None:
plt.title('{} Features'.format(feature_type.upper()))
else:
plt.title(title)
if x_label is not None:
plt.xlabel(x_label)
if y_label is not None:
plt.ylabel(y_label)
if save_pic:
outputname = name4pic or 'visualize{}feats_{}'.format(feature_type.upper(),
sp.utils.get_date())
outputname = sp.utils.string2pathlib(outputname)
if outputname.suffix:
if outputname.suffix != '.png':
# add .png as extension
fname = outputname.name + '.png'
outputname = outputname.parent.joinpath(fname)
else:
fname = outputname.stem + '.png'
outputname = outputname.parent.joinpath(fname)
if not overwrite:
if os.path.exists(outputname):
fname = outputname.stem
fname += '_'+sp.utils.get_date()
outputname = outputname.parent.joinpath(fname+outputname.suffix)
plt.savefig(outputname)
else:
plt.show()
# tested for stereo sound
def plotsound(audiodata, feature_type='fbank', win_size_ms = 20, \
percent_overlap = 0.5, fft_bins = None, num_filters=40, num_mfcc=40, sr=None,\
save_pic=False, name4pic=None, energy_scale='power_to_db', mono=None, real_signal=False, **kwargs):
'''Visualize feature extraction depending on set parameters.
Stereo sound can be graphed. If `feature_type` is 'signal', all channels will be
graphed on the same plot. Otherwise, each channel will be plotted separately.
Parameters
----------
audiodata : str, numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data. Note: in the latter case, `sr`
must be declared.
feature_type : str
Options: 'signal', 'mfcc', or 'fbank' features.
MFCC: mel frequency cepstral
coefficients; FBANK: mel-log filterbank energies (default 'fbank')
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 20)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
num_filters : int
Number of mel-filters to be used when applying mel-scale. For
'fbank' features, 20-128 are common, with 40 being very common.
(default 40)
num_mfcc : int
Number of mel frequency cepstral coefficients. First coefficient
pertains to loudness; 2-13 frequencies relevant for speech; 13-40
for acoustic environment analysis or non-linguistic information.
Note: it is not possible to choose only 2-13 or 13-40; if `num_mfcc`
is set to 40, all 40 coefficients will be included.
(default 40).
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded. (default None)
mono : bool, optional
When loading an audiofile, True will limit number of channels to
one; False will allow more channels to be loaded. (default None,
which results in mono channel loading.)
**kwargs : additional keyword arguments
Keyword arguments for soundpy.feats.plot
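Examples
--------
A rough sketch; 'sound.wav' is a hypothetical pathway to a local wavfile:

>>> import soundpy as sp
>>> sp.feats.plotsound('sound.wav', feature_type = 'signal',
...                    save_pic = True, name4pic = 'sound_signal')  # doctest: +SKIP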
'''
percent_overlap = check_percent_overlap(percent_overlap)
if 'signal' not in feature_type:
if isinstance(audiodata, np.ndarray) and len(audiodata.shape) > 1:
for channel in range(audiodata.shape[1]):
if name4pic is None:
name4pic = '{}_channel_{}'.format(feature_type, channel+1)
else:
name4pic = sp.string2pathlib(name4pic)
name = name4pic.stem
if channel == 0:
name += '_channel{}'.format(channel+1)
else:
name = name[:-1]+'{}'.format(channel+1)
name4pic = name4pic.parent.joinpath(name+name4pic.suffix)
if 'title' not in kwargs:
kwargs['title'] = '{} features\n(channel {})'.format(
feature_type, channel+1)
else:
if channel == 0:
kwargs['title'] += '\n(channel {})'.format(channel+1)
else:
kwargs['title'] = kwargs['title'][:-2]+'{})'.format(
channel+1)
feats = sp.feats.get_feats(
audiodata[:,channel], feature_type=feature_type,
win_size_ms = win_size_ms, percent_overlap = percent_overlap,
fft_bins = fft_bins, num_filters=num_filters, num_mfcc = num_mfcc,
sr=sr, mono = mono, real_signal = real_signal)
sp.feats.plot(
feats, feature_type=feature_type, sr=sr,
save_pic = save_pic, name4pic=name4pic,
energy_scale = energy_scale,
win_size_ms = win_size_ms, percent_overlap = percent_overlap,
**kwargs)
return None
feats = sp.feats.get_feats(audiodata, feature_type=feature_type,
win_size_ms = win_size_ms, percent_overlap = percent_overlap,
fft_bins = fft_bins, num_filters=num_filters, num_mfcc = num_mfcc,
sr=sr, mono = mono, real_signal = real_signal)
sp.feats.plot(feats, feature_type=feature_type, sr=sr,
save_pic = save_pic, name4pic=name4pic, energy_scale = energy_scale,
win_size_ms = win_size_ms, percent_overlap = percent_overlap,
**kwargs)
# stereo sound with mono (True/False) works for 'signal' data
# only mono for frequency features
def get_feats(sound,
sr = None,
feature_type = 'fbank',
win_size_ms = 20,
percent_overlap = 0.5,
window = 'hann',
fft_bins = None,
num_filters = None,
num_mfcc = None,
remove_first_coefficient = False,
sinosoidal_liftering = False,
dur_sec = None,
mono = None,
rate_of_change = False,
rate_of_acceleration = False,
subtract_mean = False,
real_signal = True,
fmin = None,
fmax = None,
zeropad = True):
'''Collects raw signal data, stft, fbank, or mfcc features.
Parameters
----------
sound : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data. Note: in the latter case, `sr`
must be declared.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded. (default None)
feature_type : str
Options include 'signal', 'stft', 'powspec', 'fbank', or 'mfcc' data
(default 'fbank').
signal: energy/amplitude measurements along time
STFT: short-time Fourier transform
powspec : power spectrum (absolute value of stft, squared)
FBANK: mel-log filterbank energies
MFCC: mel frequency cepstral coefficients
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 20)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
window : str or np.ndarray [size (n_fft, )]
The window function to be applied to each window. (Default 'hann')
fft_bins : int
Number of frequency bins to apply in fast Fourier transform. (default None)
num_filters : int
Number of mel-filters to be used when applying mel-scale. For
'fbank' features, 20-128 are common, with 40 being very common. If None, will
be set to 40.
(default None)
num_mfcc : int
Number of mel frequency cepstral coefficients. First coefficient
pertains to loudness; 2-13 frequencies relevant for speech; 13-40
for acoustic environment analysis or non-linguistic information.
If None, will be set to `num_filters` or 40.
(default None).
dur_sec : float, optional
Time in seconds to limit in loading a signal. (default None)
mono: bool, optional
For loading an audiofile, True will result in only one channel of
data being loaded; False will allow additional channels be loaded.
(default None, which results in mono channel data)
rate_of_change : bool
If True, the first derivative of spectral data will be concatenated
to the features.
This is applicable for all feature types except 'signal'.
rate_of_acceleration : bool
If True, the second derivative of spectral data will be concatenated
to the features.
This is applicable for all feature types except 'signal'.
subtract_mean : bool
If True, the mean of each feature column will be subtracted from
each row. This is applicable for all feature types except 'signal'.
Returns
-------
feats : tuple (num_samples, sr) or np.ndarray [size (num_frames, num_filters) dtype=np.float or np.complex]
Feature data. If `feature_type` is 'signal', returns a tuple containing samples and sample rate. Otherwise, returns an np.ndarray of shape (num_frames, num_filters/features).
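Examples
--------
A minimal sketch using generated noise in place of a real recording:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> noise = np.random.uniform(-1, 1, sr)  # 1 second of noise
>>> fbank = sp.feats.get_feats(noise, sr = sr, feature_type = 'fbank',
...                            win_size_ms = 20, percent_overlap = 0.5)
>>> fbank.shape[1]  # default 40 mel filters
40
>>> mfcc = sp.feats.get_feats(noise, sr = sr, feature_type = 'mfcc',
...                           num_mfcc = 13)
>>> mfcc.shape[1]
13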
'''
# load data
if isinstance(sound, str) or isinstance(sound, pathlib.PosixPath):
if mono is None:
mono = True
data, sr = sp.loadsound(sound, sr = sr, dur_sec = dur_sec, mono = mono)
if mono is False and len(data.shape) > 1:
index_samples = np.argmax(data.shape)
index_channels = np.argmin(data.shape)
num_channels = data.shape[index_channels]
# transpose data to be (samples, num_channels) rather than (num_channels, samples)
if index_channels == 0:
data = data.T
# remove additional channel for 'stft', 'fbank' etc. feature
# extraction
if 'signal' not in feature_type and num_channels > 1:
import warnings
warnings.warn('Only one channel is used for {}'.format(feature_type)+\
' feature extraction. Removing extra channels.')
data = data[:,0]
else:
if sr is None:
raise ValueError('No samplerate given. Either provide '+\
'filename or appropriate samplerate.')
data, sr = sound, sr
if len(data.shape) > 1 and 'signal' not in feature_type:
print('Only one channel can be currently used for feature '+\
'extraction. Using the first channel.')
data = data[:,0]
elif len(data.shape) > 1 and mono:
data = data[:,0]
if dur_sec:
data = data[:int(sr*dur_sec)]
if isinstance(data, np.ndarray):
if not np.isfinite(data).all():
raise TypeError('NAN values found in loaded sound samples.')
else:
raise TypeError('Data is type {} and '.format(type(data))+\
'a numpy.ndarray was expected.')
# ensure percent overlap is between 0 and 1
percent_overlap = check_percent_overlap(percent_overlap)
win_shift_ms = win_size_ms - (win_size_ms * percent_overlap)
if win_shift_ms <= 0:
raise ValueError('`percent_overlap` {} is too high. '.format(percent_overlap)+\
'The signal cannot be processed with 0 or negative window shift / hop length.')
if fft_bins is None:
# base on frame length / window: larger windows --> higher freq resolution
fft_bins = int(win_size_ms * sr // 1000)
if 'stft' in feature_type or 'powspec' in feature_type:
feats = sp.feats.get_stft(
sound = data,
sr = sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
real_signal = real_signal,
fft_bins = fft_bins,
window = window,
zeropad = zeropad
)
elif 'fbank' in feature_type:
if num_filters is None:
num_filters = 40
feats = sp.feats.get_fbank(
sound = data,
sr = sr,
num_filters = num_filters,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
real_signal = real_signal,
fft_bins = fft_bins,
window = window,
zeropad = zeropad
)
elif 'mfcc' in feature_type:
if num_mfcc is None:
if num_filters is not None:
num_mfcc = num_filters
else:
num_mfcc = 40
if num_filters is None:
num_filters = 40
feats = sp.feats.get_mfcc(
sound = data,
sr = sr,
num_mfcc = num_mfcc,
remove_first_coefficient = remove_first_coefficient,
sinosoidal_liftering = sinosoidal_liftering,
num_filters = num_filters,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
real_signal = real_signal,
fft_bins = fft_bins,
window = window,
zeropad = zeropad
)
elif 'signal' in feature_type:
if data.dtype == np.complex128 or data.dtype == np.complex64:
import warnings
msg = '\nWARNING: real raw signal features are being generated '+\
'from a STFT matrix.'
warnings.warn(msg)
feats = sp.feats.feats2audio(data, feature_type = 'stft',
sr=sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap)
else:
feats = data
# TODO test difference between python_speech_features and librosa
if not 'signal' in feature_type:
if subtract_mean is True:
feats -= (np.mean(feats, axis=0) + 1e-8)
if rate_of_change is True:
#d1 = delta(feats, N=2)
d1 = librosa.feature.delta(feats.T).T
feats = np.concatenate((feats, d1), axis=1)
if rate_of_acceleration is True:
#d2 = delta(delta(feats, N=2), N=2)
d2 = librosa.feature.delta(feats.T, order=2).T
feats = np.concatenate((feats, d2), axis=1)
if 'powspec' in feature_type:
feats = sp.dsp.calc_power(feats)
return feats
def load_feat_settings(feat_settings_dict):
'''Loads feature settings into named tuple. Sets defaults if not present.
TODO: test with previous version
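Example (a rough sketch; `sp.utils.load_dict` and the filename are
assumptions, standing in for however the settings dict was saved):
>>> settings_dict = sp.utils.load_dict('feat_settings.csv')  # doctest: +SKIP
>>> feat_settings = sp.feats.load_feat_settings(settings_dict)  # doctest: +SKIP
>>> feat_settings.feature_type  # doctest: +SKIP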
'''
FeatureExtractionSettings = collections.namedtuple('FeatureExtractionSettings',
['sr', 'feature_type', 'win_size_ms',
'percent_overlap', 'window', 'dur_sec',
'num_filters','num_mfcc', 'fft_bins',
'remove_first_coefficient','sinosoidal_liftering',
'mono', 'rate_of_change', 'rate_of_acceleration',
'subtract_mean', 'real_signal', 'fmin', 'fmax',
'zeropad', 'input_shape', 'base_shape',
'num_feats'])
# newer version of soundpy: 0.1.0a3
# store get_feats kwargs under `kwargs` key
if 'kwargs' in feat_settings_dict:
kwargs = sp.utils.restore_dictvalue(feat_settings_dict['kwargs'])
feat_settings_dict.update(kwargs)
# if values saved as strings and should be a list or tuple, restore them to original type:
# see `soundpy.utils.restore_dictvalue`
sr = sp.utils.restore_dictvalue(
feat_settings_dict['sr'])
feature_type = sp.utils.restore_dictvalue(
feat_settings_dict['feature_type'])
win_size_ms = sp.utils.restore_dictvalue(
feat_settings_dict['win_size_ms'])
percent_overlap = sp.utils.restore_dictvalue(
feat_settings_dict['percent_overlap'])
try:
window = sp.utils.restore_dictvalue(feat_settings_dict['window'])
except KeyError:
# set default here...
window = 'hann'
dur_sec = sp.utils.restore_dictvalue(
feat_settings_dict['dur_sec'])
try:
num_filters = sp.utils.restore_dictvalue(feat_settings_dict['num_filters'])
except KeyError:
num_filters = None
try:
num_mfcc = sp.utils.restore_dictvalue(feat_settings_dict['num_mfcc'])
except KeyError:
num_mfcc = None
try:
# newer version soundpy 0.1.0v3
fft_bins = sp.utils.restore_dictvalue(feat_settings_dict['fft_bins'])
except KeyError:
# older version soundpy 0.1.0v2
fft_bins = sp.utils.restore_dictvalue(feat_settings_dict['n_fft'])
try:
remove_first_coefficient = sp.utils.restore_dictvalue(
feat_settings_dict['remove_first_coefficient'])
except KeyError:
remove_first_coefficient = False
try:
sinosoidal_liftering = sp.utils.restore_dictvalue(
feat_settings_dict['sinosoidal_liftering'])
except KeyError:
sinosoidal_liftering = False
try:
mono = sp.utils.restore_dictvalue(
feat_settings_dict['mono'])
except KeyError:
mono = True # default setting
try:
rate_of_change = sp.utils.restore_dictvalue(
feat_settings_dict['rate_of_change'])
except KeyError:
rate_of_change = False
try:
rate_of_acceleration = sp.utils.restore_dictvalue(
feat_settings_dict['rate_of_acceleration'])
except KeyError:
rate_of_acceleration = False
try:
subtract_mean = sp.utils.restore_dictvalue(
feat_settings_dict['subtract_mean'])
except KeyError:
subtract_mean = False
try:
real_signal = sp.utils.restore_dictvalue(
feat_settings_dict['real_signal'])
except KeyError:
real_signal = True
try:
fmin = sp.utils.restore_dictvalue(
feat_settings_dict['fmin'])
except KeyError:
fmin = None
try:
fmax = sp.utils.restore_dictvalue(
feat_settings_dict['fmax'])
except KeyError:
fmax = None
try:
zeropad = sp.utils.restore_dictvalue(
feat_settings_dict['zeropad'])
except KeyError:
zeropad = True
try:
# older version of soundpy: 0.1.0a2
input_shape = sp.utils.restore_dictvalue(
feat_settings_dict['input_shape'])
except KeyError:
# newer version of soundpy: 0.1.0a3
input_shape = sp.utils.restore_dictvalue(
feat_settings_dict['feat_model_shape'])
try:
# older version of soundpy: 0.1.0a2
base_shape = sp.utils.restore_dictvalue(
feat_settings_dict['desired_shape'])
except KeyError:
# newer version of soundpy: 0.1.0a3
try:
base_shape = sp.utils.restore_dictvalue(
feat_settings_dict['feat_base_shape'])
except KeyError:
base_shape = input_shape
try:
# older version of soundpy: 0.1.0a2
num_feats = sp.utils.restore_dictvalue(
feat_settings_dict['num_feats'])
except KeyError:
# newer version of soundpy: 0.1.0a3
num_feats = base_shape[-1]
featsettings = FeatureExtractionSettings(
sr = sr,
feature_type = feature_type,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
window = window,
dur_sec = dur_sec,
num_filters = num_filters,
num_mfcc = num_mfcc,
fft_bins = fft_bins,
remove_first_coefficient = remove_first_coefficient,
sinosoidal_liftering = sinosoidal_liftering,
mono = mono,
rate_of_change = rate_of_change,
rate_of_acceleration = rate_of_acceleration,
subtract_mean = subtract_mean,
real_signal = real_signal,
fmin = fmin,
fmax = fmax,
zeropad = zeropad,
input_shape = input_shape,
base_shape = base_shape,
num_feats = num_feats)
return featsettings
# TODO: create class instance where fund freq, vad, etc. can be saved
# allows for more control over fft bins / resolution of each iteration.
def get_stft(sound, sr=22050, win_size_ms = 50, percent_overlap = 0.5,
real_signal = False, fft_bins = 1024,
window = 'hann', zeropad = True, **kwargs):
'''Returns short-time Fourier transform matrix.
This function allows more flexibility in number of `fft_bins` and `real_signal`
settings. Additionally, this does not require the package librosa, making it
a bit easier to manipulate if desired. For an example, see
`soundpy.augment.vtlp`.
Parameters
----------
sound : np.ndarray [shape=(num_samples,) or (num_samples, num_channels)], str, or pathlib.PosixPath
If type np.ndarray, expect raw samples in mono or stereo sound. If type str or
pathlib.PosixPath, expect pathway to audio file.
sr : int
The sample rate of `sound`.
win_size_ms : int, float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
real_signal : bool
If True, only half the FFT spectrum will be used; for real-valued input there
should be no difference, as the FFT of a real signal is symmetrical. If anything,
setting `real_signal` to True may speed up computation.
fft_bins : int
Number of frequency bins to use when applying fast Fourier Transform. (default 1024)
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`.
Returns
-------
stft_matrix : np.ndarray[size=(num_frames, fft_bins)]
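Examples
--------
A minimal sketch with a generated tone:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> t = np.arange(sr) / sr
>>> tone = 0.5 * np.sin(2 * np.pi * 440 * t)  # 1 second 440 Hz tone
>>> stft = sp.feats.get_stft(tone, sr = sr, win_size_ms = 50,
...                          percent_overlap = 0.5, fft_bins = 1024)
>>> stft.shape[1]  # fft_bins // 2 + 1 frequency bins
513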
'''
if isinstance(sound, np.ndarray):
if sound.dtype == np.complex_:
import warnings
msg = '\nWARNING: data provided to `soundpy.feats.get_stft` is already'+\
' a STFT matrix. Returning original data.'
warnings.warn(msg)
return sound
data = sound
else:
data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
assert sr2 == sr
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
num_overlap_samples = int(frame_length * percent_overlap)
num_subframes = sp.dsp.calc_num_subframes(len(data),
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
if fft_bins is None:
fft_bins = int(win_size_ms * sr // 1000)
total_rows = fft_bins // 2 + 1
# if mono, only one channel; otherwise match num channels in sound signal
if sp.dsp.ismono(data):
stft_matrix = sp.dsp.create_empty_matrix(
(num_subframes, total_rows),
complex_vals = True)
else:
stft_matrix = sp.dsp.create_empty_matrix(
(num_subframes, total_rows, data.shape[1]),
complex_vals = True)
section_start = 0
window_frame = sp.dsp.create_window(window, frame_length)
for frame in range(num_subframes):
section = data[section_start:section_start+frame_length]
section = sp.dsp.apply_window(section,
window_frame,
zeropad = zeropad)
section_fft = sp.dsp.calc_fft(section,
real_signal = real_signal,
fft_bins = fft_bins,
)
stft_matrix[frame] = section_fft[:total_rows]
section_start += (frame_length - num_overlap_samples)
return stft_matrix
def get_fbank(sound, sr, num_filters, fmin=None, fmax=None, fft_bins = None, **kwargs):
'''Extract mel-filterbank energy features from audio.
Parameters
----------
sound : np.ndarray [size=(num_samples,) or (num_samples, num_features)], str, or pathlib.PosixPath
Sound in raw samples, a power spectrum, or a short-time-fourier-transform. If type string or pathlib.PosixPath, expect pathway to audio file.
sr : int
The sample rate of `sound`.
num_filters : int
The number of mel-filters to use when extracting mel-filterbank energies.
fmin : int or float, optional
The minimum frequency of interest. If None, will be set to 0. (default None)
fmax : int or float, optional
The maximum frequency of interest. If None, will be set to half of `sr`.
(default None)
fft_bins : int, optional
The number of frequency bins / fast Fourier transform bins used in calculating
the fast Fourier transform. If None, set depending on type of parameter `sound`.
If `sound` is a raw signal or audio pathway, `fft_bins` will be set to 1024;
if `sound` is a STFT or power spectrum, `fft_bins` will be set to 2 * length
of `sound` feature column, or 2 * sound.shape[1].
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.feats.get_stft`.
Returns
-------
fbank : np.ndarray [shape=(num_samples, num_filters)]
The mel-filterbank energies extracted. The number of samples depends on
the parameters applied in `soundpy.feats.get_stft`.
References
----------
Fayek, H. M. (2016). Speech Processing for Machine Learning:
Filter banks, Mel-Frequency Cepstral Coefficients (MFCCs) and What’s In-Between.
Retrieved from:
https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
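Examples
--------
A minimal sketch; raw samples are passed in, so the STFT is computed
internally (keyword arguments travel to `soundpy.feats.get_stft`):

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> noise = np.random.uniform(-1, 1, sr)
>>> fbank = sp.feats.get_fbank(noise, sr = sr, num_filters = 40,
...                            win_size_ms = 25, percent_overlap = 0.5)
>>> fbank.shape[1]
40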
'''
if isinstance(sound, np.ndarray):
if sound.dtype == np.complex64 or sound.dtype == np.complex128:
stft = True
# probably a power spectrum without complex values...
# TODO improve
elif len(sound.shape) > 1 and sound.shape[1] > sound.shape[0]:
stft = True
else:
stft = False
else:
# probably a pathway - load in get_stft
stft = False
if fmin is None:
fmin = 0
if fmax is None:
fmax = sr/2
mel_points = sp.dsp.fbank_filters(fmin, fmax, num_filters = num_filters)
hz_points = sp.dsp.mel_to_hz(mel_points)
if fft_bins is None:
if stft is True:
# assumes number of fft bins is the length of second column
# https://librosa.org/doc/latest/generated/librosa.istft.html?highlight=istf#librosa.istft
fft_bins = (sound.shape[1]-1) * 2
else:
try:
fft_bins = int(kwargs['win_size_ms'] * sr // 1000)
except KeyError:
fft_bins = 512
freq_bins = np.floor((fft_bins + 1) * hz_points / sr)
if stft:
# use number of fft columns in stft as reference
fbank = np.zeros((num_filters, sound.shape[1]))
else:
fbank = np.zeros((num_filters, int(np.floor(fft_bins / 2 + 1))))
for m in range(1, num_filters + 1):
f_m_minus = int(freq_bins[m - 1]) # left
f_m = int(freq_bins[m]) # center
f_m_plus = int(freq_bins[m + 1]) # right
for k in range(f_m_minus, f_m):
fbank[m - 1, k] = (k - freq_bins[m - 1]) / (freq_bins[m] - freq_bins[m -1])
for k in range(f_m, f_m_plus):
fbank[m - 1, k] = (freq_bins[m + 1] - k) / (freq_bins[m + 1] - freq_bins[m])
if stft:
if np.min(sound) < 0:
powspec = sp.dsp.calc_power(sound)
else:
powspec = sound
else:
sound_stft = sp.feats.get_stft(sound, sr=sr, fft_bins = fft_bins, **kwargs)
powspec = sp.dsp.calc_power(sound_stft)
filter_banks = np.dot(powspec, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # numerical stability
return filter_banks
def get_mfcc(sound, sr, num_mfcc, remove_first_coefficient=False,
sinosoidal_liftering = False, **kwargs):
'''Extracts mel-frequency cepstral coefficients from audio.
Parameters
----------
sound : np.ndarray [size=(num_samples,) or (num_samples, num_features)] or str or pathlib.PosixPath
If `sound` is a np.ndarray, expected as raw samples, a power spectrum or a
short-time Fourier transform. If string or pathlib.PosixPath, should be the pathway
to the audio file.
sr : int
The sample rate of the `sound`.
num_mfcc : int
The number of mel-frequency cepstral coefficients
remove_first_coefficient : bool
If True, the first coefficient, representing amplitude or volume of signal, is
removed. Found to sometimes improve automatic speech recognition.
(default False)
sinosoidal_liftering : bool
If True, reduces influence of higher coefficients, found to aid in handling
noise in background in automatic speech recognition. (default False)
**kwargs : additional keyword arguments
Keyword arguments for soundpy.feats.get_fbank()
References
----------
Fayek, H. M. (2016). Speech Processing for Machine Learning: Filter banks, Mel-Frequency Cepstral Coefficients (MFCCs) and What’s In-Between. Retrieved from https://haythamfayek.com/2016/04/21/speech-processing-for-machine-learning.html
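Examples
--------
A minimal sketch; keyword arguments travel to `soundpy.feats.get_fbank`:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> noise = np.random.uniform(-1, 1, sr)
>>> mfcc = sp.feats.get_mfcc(noise, sr = sr, num_mfcc = 13,
...                          num_filters = 40, win_size_ms = 25,
...                          percent_overlap = 0.5)
>>> mfcc.shape[1]
13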
'''
fbank = sp.feats.get_fbank(sound, sr=sr, **kwargs)
mfcc = dct(fbank, type=2, axis=1, norm='ortho')
if remove_first_coefficient is True:
mfcc = mfcc[:,1:num_mfcc]
else:
mfcc = mfcc[:,:num_mfcc]
return mfcc
def get_vad_stft(sound, sr=48000, win_size_ms = 50, percent_overlap = 0.5,
real_signal = False, fft_bins = 1024,
window = 'hann', use_beg_ms = 120,
extend_window_ms = 0, energy_thresh = 40,
freq_thresh = 185, sfm_thresh = 5,
zeropad = True, **kwargs):
'''Returns STFT matrix and VAD matrix. STFT matrix contains only VAD sections.
Parameters
----------
sound : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data. Note: in the latter case, `sr`
must be declared.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded. (default None)
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
real_signal : bool
If True, only half the FFT spectrum will be used; for real-valued input there
should be no difference, as the FFT of a real signal is symmetrical. If anything,
setting `real_signal` to True may speed up computation.
fft_bins : int
Number of frequency bins to use when applying fast Fourier Transform. (default 1024)
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
use_beg_ms : int
The amount of time in milliseconds to use from beginning of signal to estimate background
noise.
extend_window_ms : int
The amount of time in milliseconds to pad or extend the identified VAD segments. This
may be useful to include more speech / sound, if desired.
energy_thresh : int
The threshold to set for measuring energy for VAD in the signal. (default 40)
freq_thresh : int
The threshold to set for measuring frequency for VAD in the signal. (default 185)
sfm_thresh : int
The threshold to set for measuring spectral flatness for VAD in the signal. (default 5)
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`
Returns
-------
stft_matrix : np.ndarray [size=(num_frames_vad, fft_bins//2+1), dtype=np.complex_]
The STFT matrix frames where voice activity has been detected.
vad_matrix_extwin : np.ndarray [size=(num_frames,)]
A vector of zeros and ones indicating which frames of the full STFT matrix
contain voice activity (including any extended windows).
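Examples
--------
A rough sketch with synthetic data (real speech at 48 kHz would be more
meaningful); thresholds are left at their defaults:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//2:] = np.random.uniform(-0.5, 0.5, sr//2)  # silence, then noise
>>> stft_vad, vad_matrix = sp.feats.get_vad_stft(sig, sr = sr)  # doctest: +SKIP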
'''
# raise ValueError if percent_overlap is not supported
if percent_overlap != 0 and percent_overlap < 0.5:
raise ValueError('For this VAD function, `percent_overlap` ' +\
'set to {} is not currently supported.\n'.format(percent_overlap) +\
'Suggested to set at either 0 or 0.5')
if percent_overlap > 0.5:
import warnings
msg = '\nWarning: for this VAD function, parameter `percent_overlap` has most success '+\
'when set at 0 or 0.5'
warnings.warn(msg)
# raise warnings if sample rate lower than 44100 Hz
if sr < 44100:
import warnings
msg = '\nWarning: voice-activity-detection works best with sample '+\
'rates above 44100 Hz. Current `sr` set at {}.'.format(sr)
warnings.warn(msg)
if isinstance(sound, np.ndarray):
data = sound.copy()
else:
data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
assert sr2 == sr
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
num_overlap_samples = int(frame_length * percent_overlap)
num_subframes = sp.dsp.calc_num_subframes(len(data),
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
# set number of subframes for extending window
extwin_num_samples = sp.dsp.calc_frame_length(extend_window_ms, sr)
num_win_subframes = sp.dsp.calc_num_subframes(extwin_num_samples,
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
total_rows = fft_bins
if len(data.shape) > 1 and data.shape[1] > 1:
stereo = True
stft_matrix = sp.dsp.create_empty_matrix(
(num_subframes, total_rows, data.shape[1]),
complex_vals = True)
# stereo sound --> average out channels for measuring energy
data_vad = sp.dsp.average_channels(data)
else:
stereo = False
stft_matrix = sp.dsp.create_empty_matrix(
(num_subframes, total_rows),
complex_vals = True)
data_vad = data
vad_matrix, (sr, e, f, sfm) = sp.dsp.vad(data_vad, sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
use_beg_ms = use_beg_ms,
energy_thresh = energy_thresh,
freq_thresh = freq_thresh,
sfm_thresh = sfm_thresh)
vad_matrix_extwin = vad_matrix.copy()
# extend VAD windows where VAD indicated
if extend_window_ms > 0:
for i, row in enumerate(vad_matrix):
if row > 0:
# label samples before VAD as VAD
if i > num_win_subframes:
vad_matrix_extwin[i-num_win_subframes:i] = 1
else:
vad_matrix_extwin[:i] = 1
# label samples after VAD as VAD
if i + num_win_subframes < len(vad_matrix):
vad_matrix_extwin[i:num_win_subframes+i] = 1
else:
vad_matrix_extwin[i:] = 1
section_start = 0
extra_rows = 0
window_frame = sp.dsp.create_window(window, frame_length)
row = 0
for frame in range(num_subframes):
vad = vad_matrix_extwin[frame]
if vad > 0:
section = data[section_start:section_start+frame_length]
section = sp.dsp.apply_window(section,
window_frame,
zeropad = zeropad)
section_fft = sp.dsp.calc_fft(section,
real_signal = real_signal,
fft_bins = total_rows,
)
stft_matrix[row] = section_fft
row += 1
else:
extra_rows += 1
section_start += (frame_length - num_overlap_samples)
if extra_rows > 0:
stft_matrix = stft_matrix[:-extra_rows]
return stft_matrix[:,:fft_bins//2+1], vad_matrix_extwin
def get_stft_clipped(samples, sr, win_size_ms = 50, percent_overlap = 0.5,
extend_window_ms = 0, window = 'hann', zeropad = True, **kwargs):
'''Returns STFT matrix and VAD matrix with beginning and ending silence removed.
Parameters
----------
samples : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded.
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
extend_window_ms : int
The amount of time in milliseconds to pad or extend the identified VAD segments. This
may be useful to include more speech / sound, if desired.
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`.
Returns
-------
stft_speech : np.ndarray [size (num_frames_clipped, fft_bins//2+1)]
The STFT of the `samples` with beginning and ending silences clipped.
vad_matrix : np.ndarray [size (num_frames, )]
A vector with zeros and ones indicating which indices of the full STFT
have voice activity or not.
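Examples
--------
A rough sketch with synthetic data; if no speech is detected, an empty
list is returned in place of the STFT:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//4 : sr//2] = np.random.uniform(-0.5, 0.5, sr//4)
>>> stft_speech, vad_matrix = sp.feats.get_stft_clipped(sig, sr = sr)  # doctest: +SKIP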
'''
stft = sp.feats.get_stft(samples, sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
window = window, zeropad = zeropad)
energy = sp.dsp.get_energy(stft)
energy_mean = sp.dsp.get_energy_mean(energy)
beg_index, beg_speech_found = sp.dsp.sound_index(
energy,energy_mean,start=True)
end_index, end_speech_found = sp.dsp.sound_index(
energy,energy_mean,start=False)
vad_matrix = np.zeros(len(stft))
if beg_speech_found == False or end_speech_found == False:
import warnings
msg = '\nNo speech detected'
warnings.warn(msg)
return [], vad_matrix
if beg_index < end_index:
if extend_window_ms > 0:
extra_samples = sp.dsp.calc_frame_length(extend_window_ms, sr)
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
num_overlap_samples = int(frame_length * percent_overlap)
num_win_subframes = sp.dsp.calc_num_subframes(
extra_samples,
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
beg_index -= num_win_subframes
if beg_index < 0:
beg_index = 0
end_index += num_win_subframes
if end_index > len(vad_matrix):
end_index = len(vad_matrix)
stft_speech = stft[beg_index:end_index]
vad_matrix[beg_index:end_index] = 1
return stft_speech, vad_matrix
return [], vad_matrix
def get_vad_samples(sound, sr=None, win_size_ms = 50, percent_overlap = 0.5,
use_beg_ms = 120, extend_window_ms = 0, energy_thresh = 40,
freq_thresh = 185, sfm_thresh = 5, window = 'hann', zeropad = True,
**kwargs):
'''Returns samples and VAD matrix. Only samples where voice activity was detected are returned.
Parameters
----------
sound : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data. Note: in the latter case, `sr`
must be declared.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded. (default None)
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
use_beg_ms : int
The amount of time in milliseconds to use from beginning of signal to estimate background
noise.
extend_window_ms : int
The amount of time in milliseconds to pad or extend the identified VAD segments. This
may be useful to include more speech / sound, if desired.
energy_thresh : int
The threshold to set for measuring energy for VAD in the signal. (default 40)
freq_thresh : int
The threshold to set for measuring frequency for VAD in the signal. (default 185)
sfm_thresh : int
The threshold to set for measuring spectral flatness for VAD in the signal. (default 5)
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`
Returns
-------
samples_matrix : np.ndarray [size = (num_samples_vad, )]
The samples where voice activity was detected.
vad_matrix_extwin : np.ndarray [size = (num_frames, )]
A vector of zeros and ones indicating the frames / windows of the samples that either
had voice activity or not.
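Examples
--------
A rough sketch, mirroring `soundpy.feats.get_vad_stft` but returning
samples instead of STFT frames:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//2:] = np.random.uniform(-0.5, 0.5, sr//2)  # silence, then noise
>>> samples_vad, vad_matrix = sp.feats.get_vad_samples(sig, sr = sr)  # doctest: +SKIP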
'''
# raise error if percent_overlap is not supported
if percent_overlap != 0 and percent_overlap < 0.5:
raise ValueError('For this VAD function, `percent_overlap` ' +\
'set to {} is not currently supported.\n'.format(percent_overlap) +\
'Suggested to set at either 0 or 0.5')
if percent_overlap > 0.5:
import warnings
msg = '\nWarning: for this VAD function, parameter `percent_overlap` has most success '+\
'when set at 0 or 0.5'
warnings.warn(msg)
# raise warnings if sample rate lower than 44100 Hz
if sr < 44100:
import warnings
msg = '\nWarning: voice-activity-detection works best with sample '+\
'rates above 44100 Hz. Current `sr` set at {}.'.format(sr)
warnings.warn(msg)
if isinstance(sound, np.ndarray):
data = sound
else:
data, sr2 = sp.loadsound(sound, sr=sr, **kwargs)
assert sr2 == sr
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
num_overlap_samples = int(frame_length * percent_overlap)
num_subframes = sp.dsp.calc_num_subframes(len(data),
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
# set number of subframes for extending window
extwin_num_samples = sp.dsp.calc_frame_length(extend_window_ms, sr)
num_win_subframes = sp.dsp.calc_num_subframes(extwin_num_samples,
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = zeropad)
samples_matrix = sp.dsp.create_empty_matrix((len(data)),
complex_vals = False)
vad_matrix, (sr, e, f, sfm) = sp.dsp.vad(data, sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
use_beg_ms = use_beg_ms,
energy_thresh = energy_thresh,
freq_thresh = freq_thresh,
sfm_thresh = sfm_thresh)
vad_matrix_extwin = vad_matrix.copy()
# extend VAD windows where VAD was found
if extend_window_ms > 0:
for i, row in enumerate(vad_matrix):
if row > 0:
# label samples before VAD as VAD
if i > num_win_subframes:
vad_matrix_extwin[i-num_win_subframes:i] = 1
else:
vad_matrix_extwin[:i] = 1
# label samples after VAD as VAD
if i + num_win_subframes < len(vad_matrix):
vad_matrix_extwin[i:num_win_subframes+i] = 1
else:
vad_matrix_extwin[i:] = 1
section_start = 0
extra_rows = 0
row = 0
window_frame = sp.dsp.create_window(window, frame_length)
for frame in range(num_subframes):
vad = vad_matrix_extwin[frame]
if vad > 0:
section = data[section_start : section_start + frame_length]
if percent_overlap > 0:
# apply overlap add to signal
section_windowed = sp.dsp.apply_window(section, window_frame, zeropad = zeropad)
samples_matrix[row : row + frame_length] += section_windowed
else:
samples_matrix[row : row + frame_length] += section
row += (frame_length - num_overlap_samples)
else:
extra_rows += frame_length - num_overlap_samples
section_start += (frame_length - num_overlap_samples)
if extra_rows > 0:
samples_matrix = samples_matrix[:-extra_rows]
return samples_matrix, vad_matrix_extwin
def get_samples_clipped(samples, sr, win_size_ms = 50, percent_overlap = 0.5,
extend_window_ms = 0, window = 'hann', zeropad = True, **kwargs):
'''Returns samples and VAD matrix with beginning and ending silence removed.
Parameters
----------
samples : str or numpy.ndarray [size=(num_samples,) or (num_samples, num_channels)]
If str, wavfile (must be compatible with scipy.io.wavfile). Otherwise
the samples of the sound data.
sr : int, optional
The sample rate of the sound data or the desired sample rate of
the wavfile to be loaded.
win_size_ms : int or float
Window length in milliseconds for Fourier transform to be applied
(default 50)
percent_overlap : int or float
Amount of overlap between processing windows. For example, if `percent_overlap`
is set at 0.5, the overlap will be half that of `win_size_ms`. (default 0.5)
If an integer is provided, it will be converted to a float between 0 and 1.
extend_window_ms : int
The amount of time in milliseconds to pad or extend the identified VAD segments. This
may be useful to include more speech / sound, if desired. (default 0)
window : str
The window function to apply to each window segment. Options are 'hann' and 'hamming'.
(default 'hann')
zeropad : bool
If True, samples will be zeropadded to fill any partially filled window. If False, the
samples constituting the partially filled window will be cut off.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.files.loadsound`.
Returns
-------
samples_speech : np.ndarray [size (num_samples_clipped, )]
The `samples` with beginning and ending silences clipped.
vad_matrix : np.ndarray [size (num_samples, )]
A vector with zeros and ones indicating which samples of the full signal
have voice activity or not.
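Examples
--------
A rough sketch with synthetic data; silences at the start and end are
removed, sound in the middle is kept:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//4 : sr//2] = np.random.uniform(-0.5, 0.5, sr//4)
>>> clipped, vad_matrix = sp.feats.get_samples_clipped(sig, sr = sr)  # doctest: +SKIP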
'''
if not isinstance(samples, np.ndarray):
samples, sr = sp.loadsound(samples, sr=sr)
stft = sp.feats.get_stft(samples,sr,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
window = window, zeropad = zeropad)
energy = sp.dsp.get_energy(stft)
energy_mean = sp.dsp.get_energy_mean(energy)
beg = sp.dsp.sound_index(energy,energy_mean,start=True)
end = sp.dsp.sound_index(energy,energy_mean,start=False)
vad_matrix = np.zeros(len(samples))
if beg[1] == False or end[1] == False:
import warnings
msg = 'No speech detected'
warnings.warn(msg)
return [], vad_matrix
perc_start = beg[0]/len(energy)
perc_end = end[0]/len(energy)
sample_start = int(perc_start*len(samples))
sample_end = int(perc_end*len(samples))
if sample_start < sample_end:
if extend_window_ms > 0:
extra_frames = sp.dsp.calc_frame_length(extend_window_ms, sr)
sample_start -= extra_frames
if sample_start < 0:
sample_start = 0
sample_end += extra_frames
if sample_end > len(vad_matrix):
sample_end = len(vad_matrix)
samples_speech = samples[sample_start:sample_end]
vad_matrix[sample_start:sample_end] = 1
return samples_speech, vad_matrix
import warnings
msg = 'No speech detected'
warnings.warn(msg)
return [], vad_matrix
# have applied to stft matrix, looks good
def normalize(data, max_val=None, min_val=None):
'''Normalizes data to be between 0 and 1. Should not be applied to raw sample data.
This is useful if you have predetermined max and min values you want to normalize
new data with. It is helpful when training models on sound features (not raw samples).
Parameters
----------
data : np.ndarray [size=(num_features,) or (num_frames,num_features)]
Data to be normalized.
max_val : int or float, optional
Predetermined maximum value. If None, will use max value
from `data`.
min_val : int or float, optional
Predetermined minimum value. If None, will use min value
from `data`.
Returns
-------
normed_data : np.ndarray [size = (num_features,) or (num_frames, num_features)]
Examples
--------
>>> # using the min and max of a previous dataset:
>>> import numpy as np
>>> np.random.seed(0)
>>> input_samples = np.random.random_sample((5,))
>>> input_samples
array([0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ])
>>> np.random.seed(40)
>>> previous_samples = np.random.random_sample((5,))
>>> previous_samples
array([0.40768703, 0.05536604, 0.78853488, 0.28730518, 0.45035059])
>>> max_prev = np.max(previous_samples)
>>> min_prev = np.min(previous_samples)
>>> output_samples = normalize(input_samples, min_val = min_prev, max_val = max_prev)
>>> output_samples
array([0.67303388, 0.89996095, 0.74661839, 0.66767314, 0.50232462])
'''
if data.dtype == np.complex_:
# take power of absolute value of stft
data = np.abs(data)**2
# add epsilon to avoid division by zero error
eps = 2**-52
if max_val is None:
normed_data = (data - np.min(data)) / (np.max(data) - np.min(data) + eps)
else:
if min_val is None:
min_val = -max_val
normed_data = (data - min_val) / (max_val - min_val + eps)
return normed_data
# checked for stereo sound - works: plots each channel in separate plot
def plot_dom_freq(sound, energy_scale = 'power_to_db', title = None,
save_pic = False, name4pic = None, overwrite = False, **kwargs):
'''Plots the approximate dominant frequency over a STFT plot of a signal.
If `sound` has multiple channels, the VAD for each channel is plotted in its
own plot.
Parameters
----------
sound : np.ndarray [shape=(num_samples,) or (num_samples, num_channels)]
The sound to plot the dominant frequency of.
energy_scale : str
The scale of energy for the plot. Data in the frequency domain is likely in power
and needs to be converted to decibels. (default 'power_to_db')
title : str
The title for the plot. (default None)
**kwargs : additional keyword arguments
Keyword arguments used in both `soundpy.feats.get_stft` and `soundpy.dsp.get_pitch`.
Returns
-------
None
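Examples
--------
A minimal sketch; `sr` (required), `win_size_ms`, and `percent_overlap`
travel via **kwargs:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 16000
>>> t = np.arange(sr) / sr
>>> tone = 0.5 * np.sin(2 * np.pi * 440 * t)
>>> sp.feats.plot_dom_freq(tone, sr = sr, save_pic = True,
...                        name4pic = 'dom_freq_tone')  # doctest: +SKIP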
'''
import matplotlib.pyplot as plt
# ensure numpy array
if not isinstance(sound, np.ndarray):
raise TypeError('Function `soundpy.feats.plot_dom_freq` expects a '+\
'numpy.ndarray, not type {}.'.format(type(sound)))
# ensure sample rate is provided
if 'sr' not in kwargs:
raise ValueError('Function `soundpy.feats.plot_dom_freq` requires the sample rate'+\
' of the provided audio samples. Please provide the sample rate '+\
'under the parameter `sr`.')
# set defaults if not provided
if 'win_size_ms' not in kwargs:
kwargs['win_size_ms'] = 20
if 'percent_overlap' not in kwargs:
kwargs['percent_overlap'] = 0.5
y, sr = sound, kwargs['sr']
if len(y.shape) == 1:
# add channel column
y = y.reshape(y.shape+(1,))
elif y.shape[1] > 11:
import warnings
msg = '\nWARNING: provided `sound` data could be in the wrong format. \n'+\
'Function `soundpy.feats.plot_dom_freq` expects raw sample data. Data '+\
'provided could be a stft, fbank, mfcc matrix or some other data'+\
' format. If plot results do not appear as expected, check data.'
warnings.warn(msg)
if 'mono' in kwargs and kwargs['mono'] is True:
y = y[:,0]
y = y.reshape(y.shape+(1,))
for channel in range(y.shape[1]):
stft_matrix = sp.feats.get_stft(y[:,channel], **kwargs)
# remove complex info for plotting:
power_matrix = sp.dsp.calc_power(stft_matrix)
pitch = sp.dsp.get_pitch(sound, **kwargs)
if energy_scale == 'power_to_db':
db_matrix = librosa.power_to_db(power_matrix)
plt.pcolormesh(db_matrix.T)
# limit the y axis; otherwise y axis goes way too high
axes = plt.gca()
axes.set_ylim([0,db_matrix.shape[1]])
# 'o' markers in yellow; the color keyword overrides the format string
plt.plot(pitch, 'o', color='yellow')
if not title:
title = 'Appx Dominant Frequency'
# adjust title if more than one channel
if y.shape[1] > 1:
if channel == 0:
title += '\n(channel {})'.format(channel+1)
else:
title = title[:-2] + '{})'.format(channel+1)
plt.title(title)
if not save_pic:
plt.show()
# set up name for saving the plot, given channel number and if other files exist
else:
if name4pic is None:
name4pic = 'dom_freq'
if y.shape[1] > 1:
name4pic += '_channel{}'.format(channel+1)
name4pic = sp.utils.string2pathlib(name4pic+'.png')
else:
name4pic = sp.utils.string2pathlib(name4pic)
if y.shape[1] > 1:
if channel == 0:
name = name4pic.stem + '_channel{}'.format(channel+1)
name4pic = name4pic.parent.joinpath(name + name4pic.suffix)
else:
name = name4pic.stem[:-1] + str(channel+1)
name4pic = name4pic.parent.joinpath(name + name4pic.suffix)
if not name4pic.suffix:
name4pic = name4pic.parent.joinpath(name4pic.stem+'.png')
if not overwrite:
if os.path.exists(name4pic):
final_name = name4pic.stem + '_' + sp.utils.get_date()
final_name = name4pic.parent.joinpath(final_name+name4pic.suffix)
else:
final_name = name4pic
else:
final_name = name4pic
plt.savefig(final_name)
# checked for stereo sound - works: plots each channel in separate plot
def plot_vad(sound, energy_scale = 'power_to_db',
title = 'Voice Activity',
use_beg_ms = 120, extend_window_ms=0,
beg_end_clipped = True, save_pic = False, name4pic = None,
overwrite = False, **kwargs):
'''Plots where voice (sound) activity detected on power spectrum.
This either plots immediately or saves the plot at `name4pic`. If `sound`
has multiple channels, the VAD for each channel is plotted in its own plot.
Parameters
----------
sound : np.ndarray [shape=(num_samples,) or (num_samples, num_channels)]
The sound to plot the VAD of.
energy_scale : str
If plotting STFT or power spectrum, will plot it in decibels.
(default 'power_to_db')
title : str
The title of the plot (default 'Voice Activity')
use_beg_ms : int
The amount of noise to use at the beginning of the signal to measure VAD. This
is only applied if `beg_end_clipped` is set to False.
extend_window_ms : int
The number of milliseconds VAD should be padded. This is useful if one wants to
encompass more speech if the VAD is not including all the speech / desired sound.
However, this may capture more noise. (default 0)
beg_end_clipped : bool
If True, just the silences at the beginning and end of the sample will be cut off.
If False, VAD will be checked throughout the sample, not just at the beginning and
end. NOTE: both options have strengths and weaknesses. Checking VAD across
the entire signal (i.e. `beg_end_clipped` set to False) is sometimes unreliable,
not recognizing speech in speech-filled samples. And when set to True, some speech
sounds tend to get ignored ('s', 'x' and other fricatives).
save_pic : bool
If True, the plot will be saved rather than plotted immediately.
name4pic : str
The full pathway and filename to save the picture (as .png file). A file
extension is expected. (default None)
overwrite : bool
If False, a date tag will be added to `name4pic` if `name4pic` already exists.
(default False)
**kwargs : keyword arguments
Additional keyword arguments for `soundpy.feats.get_stft_clipped`,
`soundpy.feats.get_vad_stft`, or `soundpy.dsp.vad`.
Returns
-------
None
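Examples
--------
A rough sketch; 48 kHz is used to avoid the low sample rate warning:

>>> import numpy as np
>>> import soundpy as sp
>>> sr = 48000
>>> sig = np.zeros(sr)
>>> sig[sr//4 : sr//2] = np.random.uniform(-0.5, 0.5, sr//4)
>>> sp.feats.plot_vad(sig, sr = sr, save_pic = True,
...                   name4pic = 'vad_example')  # doctest: +SKIP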
'''
import matplotlib.pyplot as plt
# ensure numpy array
if not isinstance(sound, np.ndarray):
raise TypeError('Function `soundpy.feats.plot_vad` expects a '+\
'numpy.ndarray, not type {}.'.format(type(sound)))
# ensure sample rate is provided
if 'sr' not in kwargs:
raise ValueError('Function `soundpy.feats.plot_vad` requires sample rate'+\
' of the provided audio samples. Please provide the sample rate '+\
'under the parameter `sr`.')
else:
# ensure sr is at least 44100; otherwise raise warning
# vad does not work as well with lower sample rates
if kwargs['sr'] < 44100:
import warnings
msg = '\nWarning: VAD works best with sample rates at or above '+\
'44100 Hz. To suppress this warning, resample the audio from'+\
' {} Hz to at least 44100 Hz.'.format(kwargs['sr'])
warnings.warn(msg)
# set defaults if not in kwargs
if 'win_size_ms' not in kwargs:
kwargs['win_size_ms'] = 50
if 'percent_overlap' not in kwargs:
kwargs['percent_overlap'] = 0.5
y, sr = sound, kwargs['sr']
if len(y.shape) == 1:
# add channel column
y = y.reshape(y.shape+(1,))
elif y.shape[1] > 11:
import warnings
msg = '\nWARNING: provided `sound` data could be in the wrong format. \n'+\
'Function `soundpy.feats.plot_vad` expects raw sample data. Data '+\
'provided could be a stft, fbank, mfcc matrix or some other data'+\
' format. If plot results do not appear as expected, check data.'
warnings.warn(msg)
if 'mono' in kwargs and kwargs['mono'] is True:
y = y[:,0]
y = y.reshape(y.shape+(1,))
for channel in range(y.shape[1]):
stft_matrix = sp.feats.get_stft(y[:,channel], **kwargs)
if beg_end_clipped:
stft_vad, vad_matrix = sp.feats.get_stft_clipped(y[:,channel],
**kwargs)
else:
#vad_matrix, __ = sp.dsp.vad(y[:,channel], use_beg_ms = use_beg_ms, **kwargs)
stft_vad, vad_matrix = sp.feats.get_vad_stft(y[:,channel],
use_beg_ms = use_beg_ms,
**kwargs)
# extend window of VAD if desired
if extend_window_ms > 0:
frame_length = sp.dsp.calc_frame_length(kwargs['win_size_ms'],
kwargs['sr'])
num_overlap_samples = int(frame_length * kwargs['percent_overlap'])
# set number of subframes for extending window
extwin_num_samples = sp.dsp.calc_frame_length(extend_window_ms, kwargs['sr'])
num_win_subframes = sp.dsp.calc_num_subframes(extwin_num_samples,
frame_length = frame_length,
overlap_samples = num_overlap_samples,
zeropad = True)
vad_matrix_extwin = vad_matrix.copy()
for i, row in enumerate(vad_matrix):
if row > 0:
# label samples before VAD as VAD
if i > num_win_subframes:
vad_matrix_extwin[i-num_win_subframes:i] = 1
else:
vad_matrix_extwin[:i] = 1
# label samples after VAD as VAD
if i + num_win_subframes < len(vad_matrix):
vad_matrix_extwin[i:num_win_subframes+i] = 1
else:
vad_matrix_extwin[i:] = 1
vad_matrix = vad_matrix_extwin
# remove complex info for plotting:
power_matrix = sp.dsp.calc_power(stft_matrix)
db_matrix = librosa.power_to_db(power_matrix)
y_axis = db_matrix.shape[1]
if max(vad_matrix) > 0:
vad_matrix = sp.dsp.scalesound(vad_matrix, max_val = y_axis, min_val = 0)
plt.pcolormesh(db_matrix.T)
# limit the y axis; otherwise y axis goes way too high
axes = plt.gca()
axes.set_ylim([0,db_matrix.shape[1]])
color = 'yellow'
linestyle = ':'
plt.plot(vad_matrix, 'ro', color=color)
if not title:
title = 'Voice Activity in Signal'
if beg_end_clipped and 'clipped' not in title:
title += ' (clipped)'
# adjust title if more than one channel
if y.shape[1] > 1:
if channel == 0:
title += '\n(channel {})'.format(channel+1)
else:
title = title[:-2] + '{})'.format(channel+1)
plt.title(title)
if not save_pic:
plt.show()
# set up name for saving the plot, given channel number and if other files exist
else:
if name4pic is None:
name4pic = 'vad'
if beg_end_clipped:
name4pic += '_clipped'
if y.shape[1] > 1:
name4pic += '_channel{}'.format(channel+1)
name4pic = sp.utils.string2pathlib(name4pic+'.png')
else:
name4pic = sp.utils.string2pathlib(name4pic)
if y.shape[1] > 1:
if channel == 0:
name = name4pic.stem + '_channel{}'.format(channel+1)
name4pic = name4pic.parent.joinpath(name + name4pic.suffix)
else:
name = name4pic.stem[:-1] + str(channel+1)
name4pic = name4pic.parent.joinpath(name + name4pic.suffix)
if not name4pic.suffix:
name4pic = name4pic.parent.joinpath(name4pic.stem+'.png')
if not overwrite:
if os.path.exists(name4pic):
final_name = name4pic.stem + '_' + sp.utils.get_date()
final_name = name4pic.parent.joinpath(final_name+name4pic.suffix)
else:
final_name = name4pic
else:
final_name = name4pic
plt.savefig(final_name)
def get_change_acceleration_rate(spectro_data):
'''Gets first and second derivatives of spectral data.
This is useful particularly for speech recognition.
Parameters
----------
spectro_data : np.ndarray [shape = (num_samples, num_features)]
The spectral data (e.g. stft, fbank, or mfcc features) from which to compute deltas.
Returns
-------
delta : np.ndarray [shape = (num_samples, num_features)]
The first order derivative of spectral data. Reflects rate of change in signal.
delta_delta : np.ndarray [shape = (num_samples, num_features)]
The second order derivative of spectral data. Reflects rate of acceleration in signal.
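Examples
--------
A minimal shape check; librosa's `delta` is applied under the hood and by
default requires at least 9 frames:
>>> import numpy as np
>>> spectro_data = np.random.rand(20, 40)
>>> d, dd = get_change_acceleration_rate(spectro_data)
>>> d.shape, dd.shape
((20, 40), (20, 40))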
'''
spectro_data = spectro_data.T
#first derivative = delta (rate of change)
delta = librosa.feature.delta(spectro_data)
#second derivative = delta delta (acceleration changes)
delta_delta = librosa.feature.delta(spectro_data,order=2)
delta = delta.T
delta_delta = delta_delta.T
return delta, delta_delta
def get_mfcc_fbank(samples, feature_type='mfcc', sr=48000, win_size_ms=20,
percent_overlap=0.5, num_filters=40, num_mfcc=40,
fft_bins = None, window_function = None, zeropad = True, **kwargs):
'''Collects fbank or mfcc features via python-speech-features (rather than librosa).
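Examples
--------
A minimal sketch with random samples standing in for real audio. `fft_bins`
is set explicitly here, as not all versions of python_speech_features
calculate it automatically:
>>> import numpy as np
>>> samples = np.random.rand(48000) # 1 second at 48 kHz
>>> feats = get_mfcc_fbank(samples, feature_type='mfcc', sr=48000, fft_bins=1024)
>>> feats.shape[1] # numcep defaults to `num_mfcc` (40)
40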
'''
if samples.dtype == np.complex64 or samples.dtype == np.complex128:
raise TypeError('Function `soundpy.feats.get_mfcc_fbank` only works'+\
' with raw signals, not complex data. Received input of type {}'.format(
samples.dtype))
if window_function is None:
# default for python_speech_features:
def window_function(x): return np.ones((x,))
elif isinstance(window_function, str):
if 'hamming' in window_function:
window_function = hamming
elif 'hann' in window_function:
window_function = hann
else:
# default for python_speech_features:
def window_function(x): return np.ones((x,))
# if a callable was provided, it is used as is
if len(samples)/sr*1000 < win_size_ms:
if zeropad:
samples = sp.dsp.zeropad_sound(samples, win_size_ms * sr / 1000, sr = sr)
else:
win_size_ms = len(samples)/sr*1000
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
percent_overlap = check_percent_overlap(percent_overlap)
window_shift_ms = win_size_ms * percent_overlap
if 'fbank' in feature_type:
feats, energy = fbank(samples,
samplerate = sr,
winlen = win_size_ms * 0.001,
winstep = window_shift_ms * 0.001,
nfilt = num_filters,
nfft = fft_bins,
winfunc = window_function,
**kwargs)
elif 'mfcc' in feature_type:
feats = mfcc(samples,
samplerate = sr,
winlen = win_size_ms * 0.001,
winstep = window_shift_ms * 0.001,
nfilt = num_filters,
numcep = num_mfcc,
nfft = fft_bins,
winfunc = window_function,
**kwargs)
else:
raise ValueError('Function `soundpy.feats.get_mfcc_fbank` expects '+\
'"fbank" or "mfcc" in `feature_type`, not {}'.format(feature_type))
return feats
def zeropad_features(feats, desired_shape, complex_vals = False):
'''Applies zeropadding to a copy of feats.
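Examples
--------
>>> import numpy as np
>>> feats = np.ones((3, 4))
>>> zeropad_features(feats, desired_shape = (5, 6)).shape
(5, 6)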
'''
# to avoid UFuncTypeError:
if np.issubdtype(feats.dtype, np.complexfloating):
complex_vals = True
fts = feats.copy()
if feats.shape != desired_shape:
if complex_vals:
dtype = np.complex128
else:
dtype = np.float64
empty_matrix = np.zeros(desired_shape, dtype = dtype)
try:
if len(desired_shape) == 1:
empty_matrix[:feats.shape[0]] += feats
elif len(desired_shape) == 2:
empty_matrix[:feats.shape[0],
:feats.shape[1]] += feats
elif len(desired_shape) == 3:
empty_matrix[:feats.shape[0],
:feats.shape[1],
:feats.shape[2]] += feats
elif len(desired_shape) == 4:
empty_matrix[:feats.shape[0],
:feats.shape[1],
:feats.shape[2],
:feats.shape[3]] += feats
elif len(desired_shape) == 5:
empty_matrix[:feats.shape[0],
:feats.shape[1],
:feats.shape[2],
:feats.shape[3],
:feats.shape[4]] += feats
else:
raise TypeError('Zeropadding columns requires a matrix with '+\
'a minimum of 1 dimension and maximum of 5 dimensions.')
fts = empty_matrix
except ValueError as e:
print(e)
raise ValueError('The desired shape is smaller than the original shape.'+ \
' No zeropadding necessary.')
except IndexError as e:
print(e)
raise IndexError('The dimensions do not align. Zeropadding '+ \
'expects same number of dimensions.')
assert fts.shape == desired_shape
return fts
def reduce_num_features(feats, desired_shape):
'''Limits the number of features of a copy of feats.
This is useful if you want the features to be a certain size, for
training models for example.
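Examples
--------
Extra rows and columns are clipped off the end:
>>> import numpy as np
>>> feats = np.arange(6).reshape(3, 2)
>>> reduce_num_features(feats, desired_shape = (2, 2))
array([[0, 1],
       [2, 3]])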
'''
fts = feats.copy()
if feats.shape != desired_shape:
empty_matrix = np.zeros(desired_shape, dtype = feats.dtype)
try:
if len(desired_shape) == 1:
empty_matrix += feats[:empty_matrix.shape[0]]
elif len(desired_shape) == 2:
empty_matrix += feats[:empty_matrix.shape[0],
:empty_matrix.shape[1]]
elif len(desired_shape) == 3:
empty_matrix += feats[:empty_matrix.shape[0],
:empty_matrix.shape[1],
:empty_matrix.shape[2]]
elif len(desired_shape) == 4:
empty_matrix += feats[:empty_matrix.shape[0],
:empty_matrix.shape[1],
:empty_matrix.shape[2],
:empty_matrix.shape[3]]
elif len(desired_shape) == 5:
empty_matrix += feats[:empty_matrix.shape[0],
:empty_matrix.shape[1],
:empty_matrix.shape[2],
:empty_matrix.shape[3],
:empty_matrix.shape[4]]
else:
raise TypeError('Reducing items in columns requires a matrix with'+\
' a minimum of 1 dimension and maximum of 5 dimensions.')
fts = empty_matrix
except ValueError as e:
print(e)
raise ValueError('The desired shape is larger than the original shape.'+ \
' Perhaps try zeropadding.')
except IndexError as e:
print(e)
raise IndexError('The dimensions do not align. Reducing features '+ \
'expects the same number of dimensions.')
assert fts.shape == desired_shape
return fts
# TODO remove warning for 'operands could not be broadcast together with shapes..'
# TODO test
def adjust_shape(data, desired_shape, change_dims = None, complex_vals = None):
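'''Zeropads and/or clips a copy of `data` to match `desired_shape`.
Dimensions that are too small are zeropadded and dimensions that are too
large are clipped. If `desired_shape` has more dimensions than `data`,
dimensions of length 1 may be added at the beginning or end.
Examples
--------
The first dimension is clipped, the second zeropadded:
>>> import numpy as np
>>> data = np.ones((3, 4))
>>> adjust_shape(data, (2, 6)).shape
(2, 6)
'''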
try:
if change_dims is not None:
raise DeprecationWarning('\nWARNING: Function `soundpy.feats.adjust_shape` will not '+\
'use the parameter `change_dims` in future versions. \nIf extra dimensions '+\
'of length 1 are to be added to the `data`, this will be completed. '+\
'However extra dims of greater length are not covered in this function.')
except DeprecationWarning as e:
print(e)
try:
if complex_vals is not None:
raise DeprecationWarning('\nWARNING: Function `soundpy.feats.adjust_shape` will not '+\
'use the parameter `complex_vals` in future versions. This will be '+\
'implicitly conducted within the function using `numpy.dtype`.')
except DeprecationWarning as e:
print(e)
if len(data.shape) != len(desired_shape):
data_shape_orig = data.shape
if desired_shape[0] == 1:
if data.shape[0] != 1:
data = data.reshape((1,)+data.shape)
if desired_shape[-1] == 1:
if data.shape[-1] != 1:
data = data.reshape(data.shape + (1,))
if len(data.shape) != len(desired_shape):
raise ValueError('Currently cannot adjust data to a different number of '+\
'dimensions.\nOriginal data shape: '+str(data_shape_orig)+ \
'\nDesired shape: '+str(desired_shape))
# if complex values are in data, set complex_vals to True
if np.issubdtype(data.dtype, np.complexfloating):
complex_vals = True
else:
complex_vals = False
# attempt to zeropad data:
try:
greater_items = [i for i, x in enumerate(data.shape) if x > desired_shape[i]]
# all dimensions can be zeropadded or left alone
if len(greater_items) == 0:
data_prepped = sp.feats.zeropad_features(
data, desired_shape = desired_shape, complex_vals = complex_vals)
# not all dimensions can be zeropadded. Zeropad what can be zeropadded.
# then reduce larger dimensions
elif len(greater_items) == len(data.shape):
raise ValueError
# get out of try statement and run `reduce_num_features` in except clause
else:
temp_shape = []
for i, item in enumerate(data.shape):
if item <= desired_shape[i]:
temp_shape.append(desired_shape[i])
else:
temp_shape.append(item)
temp_shape = tuple(temp_shape)
# first zeropad the dimensions that are too small
data_prepped = sp.feats.zeropad_features(
data, desired_shape = temp_shape, complex_vals = complex_vals)
# then clip the dimensions that are too big
data_prepped = sp.feats.reduce_num_features(
data_prepped, desired_shape = desired_shape)
# if zeropadding is smaller than data.shape/features:
except ValueError:
# remove extra data/columns to match desired_shape:
data_prepped = sp.feats.reduce_num_features(data,
desired_shape = desired_shape)
return data_prepped
def reduce_dim(matrix, axis=0):
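'''Halves the length of `axis` by averaging each pair of neighboring
rows or columns; a leftover row or column is kept as is.
Examples
--------
>>> import numpy as np
>>> reduce_dim(np.arange(4).reshape(2, 2))
array([[1., 2.]])
'''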
import math
import numpy as np
if axis < 0:
axis = len(matrix.shape) + axis
if axis == 0:
new_matrix = np.zeros((math.ceil(matrix.shape[0]/2),)+matrix.shape[1:])
row = 0
for i in np.arange(0, matrix.shape[0], 2):
if i + 1 < matrix.shape[0]:
new_matrix[row] = (matrix[i] + matrix[i+1]) / 2
row += 1
else:
new_matrix[row] = matrix[i]
elif axis == 1:
new_matrix = np.zeros(matrix.shape[:1] + (math.ceil(matrix.shape[1]/2),))
col = 0
for i in np.arange(0, matrix.shape[1], 2):
if i + 1 < matrix.shape[1]:
new_matrix[:, col] = (matrix[:, i] + matrix[:, i+1]) / 2
col += 1
else:
new_matrix[:, col] = matrix[:, i]
else:
raise ValueError('Function `reduce_dim` only accepts 2D data. Axis {}'.format(axis)+\
' is out of bounds.')
return new_matrix
def featshape_new_subframe(feature_matrix_shape, new_frame_size,
zeropad = True, axis=0, include_dim_size_1=False):
'''Subdivides features from (num_frames, num_feats) to (num_subframes, new_frame_size, num_feats)
Parameters
----------
feature_matrix_shape : tuple [size=(num_frames, num_features)]
Feature matrix shape to be subdivided. Can be multidimensional.
new_frame_size : int
The number of frames each new subframe should contain.
zeropad : bool
If True, frames that don't completely fill a `new_frame_size` will be
zeropadded. Otherwise, those frames will be discarded. (default True)
axis : int
The axis where the `new_frame_size` should be applied. (default 0)
include_dim_size_1 : bool
If True, an extra dimension of length 1 will be kept even when only one
subframe results. (default False)
Returns
-------
new_shape : tuple [size=(num_subframes, new_frame_size, num_feats)]
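Examples
--------
The first value of the resulting shape is the number of subframes:
>>> featshape_new_subframe((7, 40), new_frame_size = 3)
(3, 3, 40)
>>> featshape_new_subframe((7, 40), new_frame_size = 3, zeropad = False)
(2, 3, 40)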
'''
if axis < 0:
# get the axis number if using -1 or -2, etc.
axis = len(feature_matrix_shape) + axis
original_dim_length = feature_matrix_shape[axis]
if zeropad is True:
subsection_frames = math.ceil(original_dim_length / new_frame_size)
else:
subsection_frames = original_dim_length // new_frame_size
new_shape = []
for i, ax in enumerate(feature_matrix_shape):
if i == axis:
if subsection_frames == 1 and include_dim_size_1 is False:
# don't include extra dimension if length 1
new_shape.append(new_frame_size)
else:
# number of subframes first, then the length of each subframe
new_shape.append(subsection_frames)
new_shape.append(new_frame_size)
else:
new_shape.append(ax)
new_shape = tuple(new_shape)
return new_shape
def apply_new_subframe(feature_matrix, new_frame_size, zeropad=True, axis=0):
'''Reshapes `feature_matrix` to allow for `new_frame_size`.
Note: Dimensions of `feature_matrix` must be at least 2 and can be up to 5,
returning a matrix with one additional dimension.
Parameters
----------
feature_matrix : np.ndarray [size(num_frames, num_features) ]
Expects minimum 2D, maximum 5D matrix.
new_frame_size : int
The number of frames each new subframe should contain.
axis : int
The axis to apply the `new_frame_size`. (default 0)
zeropad : bool
If True, the feature_matrix will be zeropadded to include frames that do not
fill entire frame_size, given the `new_frame_size`. If False, feature_matrix
will not include the last zeropadded frame. (default True)
Returns
-------
feats_reshaped : np.ndarray [size(num_subframes, new_frame_size, num_features)]
The `feature_matrix` returned with `axis` subdivided into 2 dimensions, the number of subframes and the other length `new_frame_size`.
Raises
------
ValueError if number of dimensions of `feature_matrix` is below 2 or exceeds 5.
Examples
--------
>>> import numpy as np
>>> matrix = np.arange(24).reshape(3,4,2)
>>> # apply new_frame_size to dimension of length 4 (i.e. axis 1)
>>> matrix_zp = apply_new_subframe(matrix, new_frame_size = 3, axis = 1)
>>> matrix_zp.shape
(3, 2, 3, 2)
>>> matrix_zp
array([[[[ 0, 1],
[ 2, 3],
[ 4, 5]],
[[ 6, 7],
[ 0, 0],
[ 0, 0]]],
[[[ 8, 9],
[10, 11],
[12, 13]],
[[14, 15],
[ 0, 0],
[ 0, 0]]],
[[[16, 17],
[18, 19],
[20, 21]],
[[22, 23],
[ 0, 0],
[ 0, 0]]]])
>>> matrix_nozp = apply_new_subframe(matrix, new_frame_size = 3, axis = 1,
... zeropad=False)
>>> matrix_nozp.shape
(3, 1, 3, 2)
>>> matrix_nozp
array([[[[ 0, 1],
[ 2, 3],
[ 4, 5]]],
[[[ 8, 9],
[10, 11],
[12, 13]]],
[[[16, 17],
[18, 19],
[20, 21]]]])
'''
if len(feature_matrix.shape) < 2 or len(feature_matrix.shape) > 5:
raise ValueError('Function `soundpy.feats.apply_new_subframe` '+\
'can only be applied to matrices between 2 and 5 dimensions.')
datatype = feature_matrix.dtype
if axis < 0:
# get the axis number if using -1 or -2, etc.
axis = len(feature_matrix.shape) + axis
new_shape = featshape_new_subframe(feature_matrix.shape,
new_frame_size = new_frame_size,
axis = axis,
zeropad = zeropad,
include_dim_size_1 = True)
total_new_samples = np.prod(new_shape)
current_samples = np.prod(feature_matrix.shape)
# zeropad or reduce feature_matrix to match number of current samples
diff = total_new_samples - current_samples
for i, item in enumerate(feature_matrix.shape):
if i != axis:
diff /= item
if zeropad is True:
if diff >= 0:
diff = math.ceil(diff)
else:
diff = int(diff)
else:
if diff >= 0:
diff = int(diff)
else:
diff = math.ceil(diff)
if axis == 0:
feature_matrix = sp.feats.adjust_shape(
feature_matrix,
((feature_matrix.shape[0] + diff,) + feature_matrix.shape[1:]))
elif axis > 0:
feature_matrix = sp.feats.adjust_shape(
feature_matrix,
(feature_matrix.shape[:axis] + (feature_matrix.shape[axis] + diff, ) + \
feature_matrix.shape[axis+1:]))
feats_reshaped = feature_matrix.reshape(new_shape)
feats_reshaped = feats_reshaped.astype(datatype)
return feats_reshaped
def check_percent_overlap(percent_overlap):
'''Ensures percent_overlap is between 0 and 1.
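Examples
--------
Values above 1 are interpreted as percentages:
>>> check_percent_overlap(0.5)
0.5
>>> check_percent_overlap(75)
0.75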
'''
if percent_overlap > 1:
percent_overlap *= 0.01
if percent_overlap > 1:
raise ValueError('The percent overlap value '+str(percent_overlap)+\
' is too large. Please use a value between 0 and 1 or 0 and 100.')
return percent_overlap
def separate_dependent_var(matrix):
'''Separates matrix into features and labels. Expects up to a 3D array.
Assumes the last column of the last dimension of the matrix constitutes
the dependent variable (labels), and all other columns the independent variables
(features). Additionally, it is assumed that for each block of data,
only one label is needed; therefore, just the first label is taken for
each block.
Parameters
----------
matrix : numpy.ndarray [size = (num_samples, num_frames, num_features)]
The `matrix` holds the numerical data to separate. num_features is
expected to be at least 2.
Returns
-------
X : numpy.ndarray [size = (num_samples, num_frames, num_features -1)]
A matrix holding the (assumed) independent variables
y : numpy.ndarray, numpy.int64, numpy.float64 [size = (num_samples,)]
A vector holding the labels assigned to the independent variables.
If only one value in array, just the value inside is returned
Examples
--------
>>> import numpy as np
>>> #vector
>>> separate_dependent_var(np.array([1,2,3,4]))
(array([1, 2, 3]), 4)
>>> #simple matrix
>>> matrix = np.arange(4).reshape(2,2)
>>> matrix
array([[0, 1],
[2, 3]])
>>> X, y = separate_dependent_var(matrix)
>>> X
array([[0],
[2]])
>>> y
1
>>> #more complex matrix
>>> matrix = np.arange(20).reshape((2,2,5))
>>> matrix
array([[[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9]],
<BLANKLINE>
[[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19]]])
>>> X, y = separate_dependent_var(matrix)
>>> X
array([[[ 0, 1, 2, 3],
[ 5, 6, 7, 8]],
<BLANKLINE>
[[10, 11, 12, 13],
[15, 16, 17, 18]]])
>>> y
array([ 4, 14])
'''
# get last column
if matrix.shape[-1] == 1:
raise ValueError('Expects input matrix to be size (num_samples, num_frames, ' + \
'num_features). Number of features must exceed 1 in order ' + \
'to separate into X and y arrays.')
y_step1 = np.take(matrix, -1, axis=-1)
# because the label is the same for each block of data, just need the first
# row, not all the rows, as they are the same label.
y = np.take(y_step1, 0, axis=-1)
# get features:
X = np.delete(matrix, -1, axis=-1)
return X, y
# TODO: perhaps remove - just use np.expand_dims()
# TODO: https://github.com/biopython/biopython/issues/1496
# Fix numpy array repr for Doctest.
def add_tensor(matrix):
'''Adds tensor / dimension to input ndarray (e.g. features).
Keras requires an extra dimension at some layers, which represents
the 'tensor' encapsulating the data.
Further clarification taking the example below. The input matrix has
shape (2,3,4). Think of it as 2 different events, each having
3 sets of measurements, with each of those having 4 features. So,
let's measure differences between 2 cities at 3 different times of
day. Let's take measurements at 08:00, 14:00, and 19:00 in...
Magic City and Never-ever Town. We'll measure: 1) temperature,
2) wind speed 3) light level 4) noise level.
As I best understand it, putting our measurements into a matrix
with an added dimension/tensor highlights the separate
measurements, telling the algorithm: yes, these are 4 features
from the same city, BUT they occur at different times. Or it's
just how Keras set up the code :P
Parameters
----------
matrix : numpy.ndarray
The `matrix` holds the numerical data to add a dimension to.
Returns
-------
matrix : numpy.ndarray
The `matrix` with an additional dimension.
Examples
--------
>>> import numpy as np
>>> matrix = np.arange(24).reshape((2,3,4))
>>> matrix.shape
(2, 3, 4)
>>> matrix
array([[[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]],
<BLANKLINE>
[[12, 13, 14, 15],
[16, 17, 18, 19],
[20, 21, 22, 23]]])
>>> matrix_2 = add_tensor(matrix)
>>> matrix_2.shape
(2, 3, 4, 1)
>>> matrix_2
array([[[[ 0],
[ 1],
[ 2],
[ 3]],
<BLANKLINE>
[[ 4],
[ 5],
[ 6],
[ 7]],
<BLANKLINE>
[[ 8],
[ 9],
[10],
[11]]],
<BLANKLINE>
<BLANKLINE>
[[[12],
[13],
[14],
[15]],
<BLANKLINE>
[[16],
[17],
[18],
[19]],
<BLANKLINE>
[[20],
[21],
[22],
[23]]]])
'''
if isinstance(matrix, np.ndarray) and len(matrix) > 0:
matrix = matrix.reshape(matrix.shape + (1,))
return matrix
elif isinstance(matrix, np.ndarray):
raise ValueError('Input matrix is empty.')
else:
raise TypeError('Expected type numpy.ndarray, received {}'.format(
type(matrix)))
# TODO improve / remove.. move to data module?
def scale_X_y(matrix, is_train=True, scalars=None):
'''Separates and scales data into X and y arrays. Adds dimension for keras.
Assumes the last column of the last dimension is the y or label data.
Parameters
----------
matrix : np.ndarray [size = (num_samples, num_frames, num_features)]
Matrix with X and y data
is_train : bool
Relevant for the `scalars` parameter. If the data is training
data (i.e. True), the `scalars` will be created. If the data
is test data (i.e. False), the function expects `scalars` to
be provided. (default True)
scalars : dict, optional
Dictionary with scalars to be applied to non-training data.
Returns
-------
X : np.ndarray [size = (num_samples, num_frames, num_features-1, 1)]
Scaled features with an extra dimension
y : np.ndarray [size = (num_samples, 1)]
The dependent variable (labels) with an extra dimension; the labels themselves are not scaled.
scalars : dict
The scalars either created or previously loaded.
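Examples
--------
A minimal sketch with random data standing in for features; the last
column of the last dimension holds the labels:
>>> import numpy as np
>>> matrix = np.random.rand(4, 3, 5)
>>> X, y, scalars = scale_X_y(matrix)
>>> X.shape, y.shape
((4, 3, 4, 1), (4, 1))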
'''
X, y = sp.feats.separate_dependent_var(matrix)
if is_train:
scalars = {}
elif scalars is None:
raise TypeError('If non-train data, `scalars` cannot be of type None.')
if len(X.shape) != 3:
raise ValueError('Expected 3d input, not input of shape {}.'.format(
matrix.shape))
if np.issubdtype(X.dtype, np.complexfloating):
# convert stft to power spectrum
print('\nTaking absolute value and power of complex data..'+\
'\ni.e. Removing complex values.')
X = np.abs(X)**2
for j in range(X.shape[2]):
if is_train:
scalars[j] = StandardScaler()
X[:, :, j] = scalars[j].fit_transform(X[:, :, j])
else:
X[:, :, j] = scalars[j].transform(X[:, :, j])
X[:, :, j] = preprocessing.normalize(X[:, :, j])
# Keras needs an extra dimension as a tensor / holder of data
X = sp.feats.add_tensor(X)
y = sp.feats.add_tensor(y)
return X, y, scalars
# TODO test for all these features:
def list_available_features():
return ['stft', 'powspec', 'fbank', 'mfcc', 'signal']
# TODO REMOVE context_window for next release.
# don't apply context window and such during feature extraction phase
# TODO check if `real_signal` influences change of shape or not
def get_feature_matrix_shape(sr = None, dur_sec = None, feature_type = None,
win_size_ms = None, percent_overlap = None,
fft_bins = None, num_mfcc = None, num_filters = None,
rate_of_change = False, rate_of_acceleration = False,
context_window = None, frames_per_sample = None, zeropad = True, labeled_data = False, remove_first_coefficient = False, real_signal = False, **kwargs):
'''Returns expected shapes of feature matrix depending on several parameters.
Parameters
----------
sr : int
Sample rate of the audio to be extracted.
dur_sec : int, float
The number of seconds of audio feature extraction will be applied to.
feature_type : str
Accepted features include 'signal', 'stft', 'powspec', 'fbank', 'mfcc'. Which
`feature_type` applied will influence the resulting shape of the feature matrix
shape.
win_size_ms : int or float
The size of the window the audio signal should be broken into. If `feature_type`
is set to 'signal', this is irrelevant. Otherwise will raise TypeError if set to None.
percent_overlap : float
The amount of overlap between windows. If set to 0.5, the number of overlapping
samples will be half the number of samples that make up `win_size_ms`.
fft_bins : int
The number of frequency bins to use when calculating the fast Fourier transform.
If None, the calculated `frame_length` will be used.
num_mfcc : int
If extracting 'mfcc' features, the total number of coefficients expected.
num_filters : int
If extracting 'fbank' features, the total number of mel-filters to be applied.
rate_of_change : bool
If True, the first delta will be concatenated to features extracted.
rate_of_acceleration : bool
If True, the second delta will be concatenated to features extracted.
context_window : int
The size of `context_window` or number of samples padding a central frame.
This may be useful for models training on small changes occurring in the signal, e.g. to break up the image of sound into smaller parts.
frames_per_sample : int
The previous keyword argument for segmenting audio into smaller parts.
Will be removed in future versions and available in generator functions as
`context_window`. `frames_per_sample` equals 2 * `context_window` + 1. See
`soundpy.models.dataprep.Generator`.
zeropad : bool
If True, windows and frames will be zeropadded to avoid losing any sample data.
labeled_data : bool
If True, a label will be added to the output shape of features.
remove_first_coefficient : bool
If True, the first mfcc coefficient will not be included in feature
matrix.
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.feats.get_feats`. These may not be used in this
function as they may not influence the size of the feature matrix.
Returns
-------
feature_matrix_base : tuple
The base shape of the feature matrix. This is the shape that should result from
extracting the features for each audio file
feature_matrix_model : tuple
The shape relevant to training models. For example, one including space for a
context window and label.
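Examples
--------
A minimal sketch; the exact number of frames depends on
`soundpy.dsp.calc_frame_length`, so only the feature dimension is shown:
>>> base, model = get_feature_matrix_shape(sr = 22050, dur_sec = 1,
...     feature_type = 'fbank', win_size_ms = 20, percent_overlap = 0.5)
>>> base[-1] # num_filters defaults to 40
40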
'''
if sr is None:
raise TypeError('Function `soundpy.feats.get_feature_matrix_shape` expects'+\
' parameter `sr` to be of type `int`, not type None.')
if dur_sec is None:
raise TypeError('Function `soundpy.feats.get_feature_matrix_shape` expects'+\
' parameter `dur_sec` to be of type `int` or `float`, not type None.')
if win_size_ms is None:
raise TypeError('Function `soundpy.feats.get_feature_matrix_shape` expects'+\
' parameter `win_size_ms` to be of type `int` or `float`, not type None.')
if feature_type is None:
raise TypeError('Function `soundpy.feats.get_feature_matrix_shape` expected'+\
' parameter `feature_type` to be one of the following: '+\
','.join(sp.feats.list_available_features())+\
'\nInstead got None.')
total_samples = sp.dsp.calc_frame_length(dur_sec*1000, sr=sr)
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
# all we need to know if signal is feature
if 'signal' in feature_type:
total_rows_per_wav = total_samples // frame_length
num_feats = frame_length
feature_matrix_model = (
total_rows_per_wav,
num_feats)
feature_matrix_base = (
total_samples,) # currently only single channels
else:
if win_size_ms is None or percent_overlap is None:
raise TypeError('`win_size_ms` or `percent_overlap` cannot be type '+\
'None. Please set these values, e.g. `win_size_ms` = 20, `percent_overlap` = 0.5')
win_shift_ms = win_size_ms - (win_size_ms * percent_overlap)
hop_length = int(win_shift_ms * 0.001 * sr)
if fft_bins is None:
fft_bins = int(win_size_ms * sr // 1000)
# https://librosa.org/doc/latest/generated/librosa.util.frame.html#librosa.util.frame
total_rows_per_wav = int(1 + (total_samples - fft_bins)//hop_length)
if 'mfcc' in feature_type:
if num_mfcc is None:
num_feats = 40
else:
num_feats = num_mfcc
if remove_first_coefficient is True:
num_feats -= 1
elif 'fbank' in feature_type:
if num_filters is None:
num_feats = 40
else:
num_feats = num_filters
elif 'stft' in feature_type or 'powspec' in feature_type:
num_feats = fft_bins//2 + 1
else:
raise ValueError('Feature type "{}" '.format(feature_type)+\
'not understood.\nMust include one of the following: \n'+\
', '.join(list_available_features()))
if rate_of_change is True and rate_of_acceleration is True:
num_feats += 2 * num_feats
elif rate_of_change is True or rate_of_acceleration is True:
num_feats += num_feats
try:
if frames_per_sample is not None or context_window is not None:
raise DeprecationWarning('\nWARNING: In future versions, the `frames_per_sample` and '+\
'`context_window` parameters will be no longer used in feature extraction.\n'+\
' Instead features can be segmented in generator functions using the '+\
'parameter `context_window`: `soundpy.models.dataprep.Generator`.')
except DeprecationWarning as e:
print(e)
if context_window or frames_per_sample:
if context_window:
subframes = context_window * 2 + 1
else:
subframes = frames_per_sample
batches = math.ceil(total_rows_per_wav/subframes)
feature_matrix_model = (
batches,
subframes,
num_feats)
feature_matrix_base = (
batches * subframes,
num_feats)
else:
feature_matrix_model = (
total_rows_per_wav,
num_feats)
feature_matrix_base = (
total_rows_per_wav,
num_feats)
if labeled_data is True:
feature_matrix_model = feature_matrix_model[:-1] + (feature_matrix_model[-1] + 1,)
return feature_matrix_base, feature_matrix_model
def save_features_datasets(datasets_dict, datasets_path2save_dict,
context_window=None, frames_per_sample = None, labeled_data=False,
subsection_data=False, divide_factor=None,
visualize=False, vis_every_n_frames=50,
log_settings=True, decode_dict = None,
random_seed = None, **kwargs):
'''Extracts and saves audio features, sectioned into datasets, to indicated locations.
If MemoryError, the provided dataset dicts will be adjusted to allow data to be subsectioned.
Parameters
----------
datasets_dict : dict
Dictionary with keys representing datasets and values the audio files making up that dataset.
E.g. {'train':['1.wav', '2.wav', '3.wav'], 'val': ['4.wav'], 'test':['5.wav']} for unlabeled
data or {'train':[(0, '1.wav'), (1, '2.wav'), (0, '3.wav')], 'val': [(1, '4.wav')],
'test':[(0, '5.wav')]} for labeled data.
datasets_path2save_dict : dict
Dictionary with keys representing datasets and values the pathways of where extracted
features of that dataset will be saved.
E.g. {'train': './data/train.npy', 'val': './data/val.npy', 'test': './data/test.npy'}
context_window : int
The size of `context_window` or number of samples padding a central frame.
This may be useful for models training on small changes occurring in the signal, e.g. to break up the image of sound into smaller parts, to feed
to a long short-term memory network (LSTM), for example.
(Can avoid this by simply reshaping data later).
frames_per_sample : int
The previous keyword argument for segmenting audio into smaller parts.
Will be removed in future versions. This equals 2 * `context_window` + 1
labeled_data : bool
If True, expects each audiofile to be accompanied by an integer label. See example
given for `datasets_dict`.
subsection_data : bool
If you have a large dataset, you may want to divide it into subsections. See
soundpy.datasets.subsection_data. If datasets are large enough to raise a MemoryError,
this will be applied automatically.
divide_factor : int, optional
The number of subsections to divide data into. Only large enough sections will be divided.
If smaller datasets (i.e. validation and test datasets) are as large or smaller than
the new subsectioned larger dataset(s) (i.e. train), they will be left unchanged.
(defaults to 5)
visualize : bool
If True, periodic plots of the features will be saved throughout the extraction process. (default False)
vis_every_n_frames : int
How often visuals should be made: every 10 samples, every 100, etc. (default 50)
log_settings : bool
If True, a .csv file will be saved in the feature extraction directory with
most of the feature settings saved. (default True)
decode_dict : dict, optional
The dictionary to get the label given the encoded label. This is for plotting
purposes. (default None)
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.feats.get_feats`.
Returns
-------
datasets_dict : dict
The final dataset dictionary used in feature extraction. The datasets may
have been subdivided.
datasets_path2save_dict : dict
The final dataset feature pathway dict. The pathways will have been
adjusted if the datasets have been subdivided.
See Also
--------
soundpy.feats.get_feats
Extract features from audio file or audio data.
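Examples
--------
A hypothetical setup; all file and directory names below are placeholders:
>>> # datasets_dict = {'train': ['1.wav', '2.wav'], 'val': ['3.wav'], 'test': ['4.wav']}
>>> # paths_dict = {'train': './feats/train.npy', 'val': './feats/val.npy', 'test': './feats/test.npy'}
>>> # save_features_datasets(datasets_dict, paths_dict, feature_type = 'fbank', sr = 22050, dur_sec = 1)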
'''
# if dataset is large, may want to divide it into sections
if divide_factor is None:
divide_factor = 5
if subsection_data:
datasets_dict, datasets_path2save_dict = sp.datasets.section_data(
datasets_dict,
datasets_path2save_dict,
divide_factor=divide_factor)
# save where data was extracted from:
dataset_dirs = []
try:
# sr must be set. Set to default value.
if not 'sr' in kwargs or kwargs['sr'] is None:
import warnings
msg = '\nWARNING: sample rate was not set. Setting it at 22050 Hz.'
warnings.warn(msg)
kwargs['sr'] = 22050
# win_size_ms must be set. Set to default value.
if not 'win_size_ms' in kwargs or kwargs['win_size_ms'] is None:
import warnings
msg = '\nWARNING: `win_size_ms` was not set. Setting it to 20 ms'
warnings.warn(msg)
kwargs['win_size_ms'] = 20
# percent_overlap must be set. Set to default value.
if not 'percent_overlap' in kwargs or kwargs['percent_overlap'] is None:
import warnings
msg = '\nWARNING: `percent_overlap` was not set. Setting it to 0.5'
warnings.warn(msg)
kwargs['percent_overlap'] = 0.5
feat_base_shape, feat_model_shape = sp.feats.get_feature_matrix_shape(
context_window = context_window,
frames_per_sample = frames_per_sample,
labeled_data = labeled_data,
**kwargs)
# set whether or not features will include complex values:
if 'stft' in kwargs['feature_type']:
complex_vals = True
else:
complex_vals = False
total_audiofiles = 0
for key, value in datasets_dict.items():
# get parent directory of where data should be saved (i.e. for saving pics)
datapath = datasets_path2save_dict[key]
if not isinstance(datapath, pathlib.PosixPath):
datapath = pathlib.Path(datapath)
datadir = datapath.parent
# when loading a dictionary, the value is a string
if isinstance(value, str):
value = sp.utils.restore_dictvalue(value)
# len(value) is the total number of audio files
feats4model_shape = (len(value),) + feat_model_shape
feats_matrix = sp.dsp.create_empty_matrix(
feats4model_shape,
complex_vals=complex_vals)
audio_list = value.copy()
total_audiofiles += len(audio_list)
# shuffle audiofiles:
if random_seed is not None:
random.seed(random_seed)
random.shuffle(audio_list)
for j, audiofile in enumerate(audio_list):
if labeled_data:
label, audiofile = int(audiofile[0]), audiofile[1]
else:
label = None
if isinstance(audiofile, str):
audiofile = pathlib.PosixPath(audiofile)
if j == 0:
dataset_dirs.append(audiofile.parent)
feats = sp.feats.get_feats(audiofile,
**kwargs)
# zeropad or clip feats if too short or long:
feats = sp.feats.adjust_shape(
feats,
desired_shape = feat_base_shape)
# add label column to feature matrix
if labeled_data:
# create label column
label_col = np.zeros((len(feats),1)) + label
feats = np.concatenate([feats,label_col], axis=1)
feats = feats.reshape(feats4model_shape[1:])
#visualize features only every n num frames
if visualize and j % vis_every_n_frames == 0:
if labeled_data:
if decode_dict is not None:
try:
label_plot = decode_dict[label].upper()
except KeyError:
try:
label_plot = decode_dict[str(label)].upper()
except KeyError:
label_plot = label
else:
label_plot = label
else:
label_plot = audiofile.parent.stem.upper()
sp.feats.visualize_feat_extraction(
feats,
iteration = j,
dataset = key,
label = label_plot,
datadir = datadir,
subsections = True, # prepping feats 4 model results in subsections
**kwargs)
# fill in empty matrix with features from each audiofile
feats_matrix[j] = feats
sp.utils.print_progress(iteration = j,
total_iterations = len(value),
task = '{} {} feature extraction'.format(
key, kwargs['feature_type']))
# save data:
np.save(datasets_path2save_dict[key], feats_matrix)
print('\nFeatures saved at {}\n'.format(datasets_path2save_dict[key]))
if log_settings:
log_filename = datadir.joinpath('log_extraction_settings.csv')
feat_settings = dict(
dataset_dirs = dataset_dirs,
feat_base_shape = feat_base_shape,
feat_model_shape = feat_model_shape,
complex_vals = complex_vals,
context_window = context_window,
frames_per_sample = frames_per_sample,
labeled_data = labeled_data,
decode_dict = decode_dict,
visualize = visualize,
vis_every_n_frames = vis_every_n_frames,
subsection_data = subsection_data,
divide_factor = divide_factor,
total_audiofiles = total_audiofiles,
kwargs = kwargs
)
feat_settings_path = sp.utils.save_dict(
dict2save = feat_settings,
filename = log_filename,
overwrite=True)
except MemoryError as e:
print('MemoryError: ',e)
print('\nSectioning data and trying again.\n')
datasets_dict, datasets_path2save_dict = sp.datasets.section_data(
datasets_dict, datasets_path2save_dict, divide_factor=divide_factor)
datasets_dict, datasets_path2save_dict = save_features_datasets(
datasets_dict = datasets_dict,
datasets_path2save_dict = datasets_path2save_dict,
context_window = context_window,
frames_per_sample = frames_per_sample,
labeled_data = labeled_data,
subsection_data = subsection_data,
divide_factor = divide_factor,
visualize = visualize,
vis_every_n_frames = vis_every_n_frames,
log_settings = log_settings,
decode_dict = decode_dict,
**kwargs)
return datasets_dict, datasets_path2save_dict
# TODO: update / consolidate
def save_features_datasets_zipfiles(datasets_dict, datasets_path2save_dict,
extract_dir, dur_sec,
feature_type='fbank', num_feats=None, sr=22050,
win_size_ms=20, percent_overlap=0.5, n_fft = None,
frames_per_sample=None, labeled_data=False,
subsection_data=False, divide_factor=None,
visualize=False, vis_every_n_frames=50,
use_librosa=True, center=True, mode='reflect',
log_settings=True, decode_dict = None,
audiofile_lim = 10, **kwargs):
'''Extracts and saves audio features, sectioned into datasets, to indicated locations.
If MemoryError, the provided dataset dicts will be adjusted to allow data to be subsectioned.
Parameters
----------
datasets_dict : dict
Dictionary with keys representing datasets and values the audio files making up that dataset.
E.g. {'train':['1.wav', '2.wav', '3.wav'], 'val': ['4.wav'], 'test':['5.wav']} for unlabeled
data or {'train':[(0, '1.wav'), (1, '2.wav'), (0, '3.wav')], 'val': [(1, '4.wav')],
'test':[(0, '5.wav')]} for labeled data.
datasets_path2save_dict : dict
Dictionary with keys representing datasets and values the pathways of where extracted
features of that dataset will be saved.
E.g. {'train': './data/train.npy', 'val': './data/val.npy', 'test': './data/test.npy'}
feature_type : str
String including only one of the following: 'signal', 'stft', 'powspec', 'fbank', and 'mfcc'.
'signal' currently only supports mono channel data. TODO test for stereo
'powspec' and 'stft' are basically the same; 'powspec' is the 'stft' except without
complex values and squared. E.g. 'mfcc_noisy' or 'stft_train'.
sr : int
The sample rate the audio data should be loaded with.
n_fft : int
The number of frequency bins used for the Fast Fourier Transform (fft)
dur_sec : int or float
The desired duration of how long the audio data should be. This is used to calculate
size of feature data and is therefore necessary, as audio files tend to differ in length.
If audio files are longer or shorter, they will be cut or zeropadded respectively.
num_feats : int
The number of mfcc coefficients (mfcc), mel filters (fbank), or frequency bins (stft).
win_size_ms : int
The desired window size in milliseconds to process audio samples.
percent_overlap : float
The amount audio samples should overlap as each window is processed.
frames_per_sample : int, optional
If you want to section each audio file feature data into smaller frames. This might be
useful for speech related contexts. (Can avoid this by simply reshaping data later)
labeled_data : bool
If True, expects each audiofile to be accompanied by an integer label. See example
given for `datasets_dict`.
subsection_data : bool
If you have a large dataset, you may want to divide it into subsections. See
soundpy.datasets.subsection_data. If datasets are large enough to raise a MemoryError,
this will be applied automatically.
divide_factor : int, optional
The number of subsections to divide data into. Only large enough sections will be divided.
If smaller datasets (i.e. validation and test datasets) are as large or smaller than
the new subsectioned larger dataset(s) (i.e. train), they will be left unchanged.
(defaults to 5)
visualize : bool
If True, periodic plots of the features will be saved throughout the extraction process. (default False)
vis_every_n_frames : int
How often visuals should be made: every 10 samples, every 100, etc. (default 50)
use_librosa : bool
If True, librosa is used to load and extract features. As of now, no other option is
available. TODO: add other options. :P I just wanted to be clear that some elements
of this function are unique to using librosa. (default True)
center : bool
Relevant for librosa and feature extraction. (default True)
mode : str
Relevant for librosa and feature extraction. (default 'reflect')
log_settings : bool
If True, a .csv file will be saved in the feature extraction directory with
most of the feature settings saved. (default True)
decode_dict : dict, optional
The dictionary to get the label given the encoded label. This is for plotting
purposes. (default None)
**kwargs : additional keyword arguments
Keyword arguments for `soundpy.feats.get_feats`.
Returns
-------
datasets_dict : dict
The final dataset dictionary used in feature extraction. The datasets may
have been subdivided.
datasets_path2save_dict : dict
The final dataset feature pathway dict. The pathways will have been
adjusted if the datasets have been subdivided.
See Also
--------
soundpy.feats.get_feats
Extract features from audio file or audio data.
'''
# if dataset is large, may want to divide it into sections
if divide_factor is None:
divide_factor = 5
if subsection_data:
datasets_dict, datasets_path2save_dict = sp.datasets.section_data(
datasets_dict,
datasets_path2save_dict,
divide_factor=divide_factor)
try:
# depending on which packages one uses, shape of data changes.
# for example, Librosa centers/zeropads data automatically
# TODO see which shapes result from python_speech_features
total_samples = sp.dsp.calc_frame_length(dur_sec*1000, sr=sr)
# if using Librosa:
if use_librosa:
frame_length = sp.dsp.calc_frame_length(win_size_ms, sr)
win_shift_ms = win_size_ms - (win_size_ms * percent_overlap)
hop_length = int(win_shift_ms*0.001*sr)
if n_fft is None:
n_fft = frame_length
# librosa centers samples by default, slightly adjusting the total
# number of samples
if center:
y_zeros = np.zeros((total_samples,))
y_centered = np.pad(y_zeros, int(n_fft // 2), mode=mode)
total_samples = len(y_centered)
# each audio file
if 'signal' in feature_type:
# don't apply fft to signal (not sectioned into overlapping windows)
total_rows_per_wav = total_samples // frame_length
else:
# do apply fft to signal (via Librosa) - (will be sectioned into overlapping windows)
total_rows_per_wav = int(1 + (total_samples - n_fft)//hop_length)
# set defaults to num_feats if set as None:
if num_feats is None:
if 'mfcc' in feature_type or 'fbank' in feature_type:
num_feats = 40
elif 'powspec' in feature_type or 'stft' in feature_type:
num_feats = int(1+n_fft/2)
elif 'signal' in feature_type:
num_feats = frame_length
### how many samples make up one window frame?
###num_samps_frame = int(sr * win_size_ms * 0.001)
#### make divisible by 10
#### TODO: might not be necessary
###if not num_samps_frame % 10 == 0:
###num_samps_frame *= 0.1
#### num_feats is how many samples per window frame (here rounded up
#### to the nearest 10)
###num_feats = int(round(num_samps_frame, 0) * 10)
### limit in seconds how many samples
### is this necessary?
### dur_sec = num_features * frames_per_sample * batch_size / sr
else:
raise ValueError('Feature type "{}" '.format(feature_type)+\
'not understood.\nMust include one of the following: \n'+\
', '.join(list_available_features()))
# adjust shape for model
# input_shape: the input shape for the model
# desired_shape: the 2D shape of expected samples. This is used for zeropadding or
# limiting the feats to this shape. Once this shape, feats can be reshaped into input_shape
# TODO test for labeled data with frames_per_sample
if frames_per_sample is not None:
# want smaller windows, e.g. autoencoder denoiser or speech recognition
batch_size = math.ceil(total_rows_per_wav/frames_per_sample)
if labeled_data:
input_shape = (batch_size, frames_per_sample, num_feats + 1)
desired_shape = (input_shape[0] * input_shape[1],
input_shape[2]-1)
else:
input_shape = (batch_size, frames_per_sample, num_feats)
desired_shape = (input_shape[0]*input_shape[1],
input_shape[2])
else:
if labeled_data:
input_shape = (int(total_rows_per_wav), num_feats + 1)
desired_shape = (input_shape[0], input_shape[1]-1)
else:
input_shape = (int(total_rows_per_wav), num_feats)
desired_shape = input_shape
# set whether or not features will include complex values:
if 'stft' in feature_type:
complex_vals = True
else:
complex_vals = False
# limit feat_type to the basic feature extracted
# for example:
# feature_type 'powspec' is actually 'stft' but with complex info removed.
# the basic feat_type is still 'stft'
if 'mfcc' in feature_type:
feat_type = 'mfcc'
elif 'fbank' in feature_type:
feat_type = 'fbank'
elif 'stft' in feature_type:
feat_type = 'stft'
elif 'powspec' in feature_type:
feat_type = 'stft'
elif 'signal' in feature_type:
feat_type = 'signal'
else:
raise TypeError('Expected '+', '.join(list_available_features())+\
' to be in `feature_type`, not {}'.format(feature_type))
for key, value in datasets_dict.items():
# get parent directory of where data should be saved (i.e. for saving pics)
datapath = datasets_path2save_dict[key]
if not isinstance(datapath, pathlib.PosixPath):
datapath = pathlib.Path(datapath)
datadir = datapath.parent
# when loading a dictionary, the value is a string
if isinstance(value, str):
value = sp.utils.restore_dictvalue(value)
extraction_shape = (len(value) * audiofile_lim,) + input_shape
feats_matrix = sp.dsp.create_empty_matrix(
extraction_shape,
complex_vals=complex_vals)
# count empty rows (if speaker doesn't have audiofile_lim data)
empty_rows = 0
for j, zipfile in enumerate(value):
if labeled_data:
label, zipfile = int(zipfile[0]), zipfile[1]
if isinstance(zipfile, str):
zipfile = pathlib.PosixPath(zipfile)
# extract `audiofile_lim` from zipfile:
extract_dir = sp.utils.check_dir(extract_dir, make=True)
sp.files.extract(zipfile, extract_path = extract_dir)
audiolist = sp.files.collect_audiofiles(extract_dir,
recursive = True)
if audiofile_lim is not None:
for i in range(audiofile_lim):
if i == len(audiolist) and i < audiofile_lim:
print('Short number files: ', audiofile_lim - i)
empty_rows += audiofile_lim - i
break
feats = sp.feats.get_feats(audiolist[i],
sr=sr,
feature_type=feat_type,
win_size_ms=win_size_ms,
percent_overlap=percent_overlap,
num_filters=num_feats,
num_mfcc=num_feats,
dur_sec=dur_sec,
**kwargs)
# if power spectrum (remove complex values and squaring features)
if 'powspec' in feature_type:
feats = np.abs(feats)**2
if visualize:
if labeled_data:
if decode_dict is not None:
try:
label_plot = decode_dict[label].upper()
except KeyError:
try:
label_plot = decode_dict[str(label)].upper()
except KeyError:
label_plot = label
else:
label_plot = label
else:
label_plot = audiolist[i].parent.stem.upper()
# visualize features:
if 'mfcc' in feature_type or 'signal' in feature_type:
energy_scale = None
else:
energy_scale = 'power_to_db'
#visualize features only every n num frames
if j % vis_every_n_frames == 0:
save_pic_path = datadir.joinpath(
'images',key,'{}_sample{}'.format(
feature_type, j))
# make sure this directory exists
save_pic_dir = sp.utils.check_dir(save_pic_path.parent, make=True)
sp.feats.plot(feats,
feature_type = feature_type,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
energy_scale = energy_scale,
title='{} {} features: label {}'.format(
key, feature_type.upper(),
label_plot),
save_pic=visualize,
name4pic=save_pic_path)
# zeropad feats if too short:
if 'signal' in feat_type:
feats_zeropadded = np.zeros(desired_shape)
feats_zeropadded = feats_zeropadded.flatten()
if len(feats.shape) > 1:
feats_zeropadded = feats_zeropadded.reshape(feats_zeropadded.shape[0],
feats.shape[1])
if len(feats) > len(feats_zeropadded):
feats = feats[:len(feats_zeropadded)]
feats_zeropadded[:len(feats)] += feats
# reshape here for training models to avoid memory issues later
# (while training) if total samples is large
feats = feats_zeropadded.reshape(desired_shape)
feats = sp.feats.zeropad_features(
feats,
desired_shape = desired_shape,
complex_vals = complex_vals)
if labeled_data:
# create label column
label_col = np.zeros((len(feats),1)) + label
feats = np.concatenate([feats,label_col], axis=1)
feats = feats.reshape(extraction_shape[1:])
# fill in empty matrix with features from each audiofile
feats_matrix[j * audiofile_lim + i] = feats
# delete extracted data (not directories):
sp.files.delete_dir_contents(extract_dir, remove_dir = False)
sp.utils.print_progress(iteration = j,
total_iterations = len(value),
task = '{} {} feature extraction'.format(
key, feature_type))
if empty_rows > 0:
print('\nFeatures have {} empty rows. Removing them.\n'.format(empty_rows))
print('Shape before removal: ', feats_matrix.shape)
feats_matrix = feats_matrix[:-empty_rows]
print('Shape after removal: ', feats_matrix.shape)
# save data:
np.save(datasets_path2save_dict[key], feats_matrix)
print('\nFeatures saved at {}\n'.format(datasets_path2save_dict[key]))
if log_settings:
log_filename = datadir.joinpath('log_extraction_settings.csv')
feat_settings = dict(dur_sec=dur_sec,
feature_type=feature_type,
feat_type=feat_type,
complex_vals=complex_vals,
sr=sr,
num_feats=num_feats,
n_fft=n_fft,
win_size_ms=win_size_ms,
frame_length=frame_length,
percent_overlap=percent_overlap,
frames_per_sample=frames_per_sample,
labeled_data=labeled_data,
visualize=visualize,
# different for each dataset
#total_samples=total_samples,
input_shape=input_shape,
desired_shape=desired_shape,
use_librosa=use_librosa,
center=center,
mode=mode,
subsection_data=subsection_data,
divide_factor=divide_factor,
kwargs = kwargs
)
feat_settings_path = sp.utils.save_dict(
dict2save = feat_settings,
filename = log_filename,
overwrite=True)
else:
raise ValueError('Sorry, this functionality is not yet supported. '+\
'Set `use_librosa` to True.')
except MemoryError as e:
print('MemoryError: ',e)
print('\nSectioning data and trying again.\n')
datasets_dict, datasets_path2save_dict = sp.datasets.section_data(
datasets_dict, datasets_path2save_dict, divide_factor=divide_factor)
datasets_dict, datasets_path2save_dict = save_features_datasets_zipfiles(
datasets_dict = datasets_dict,
datasets_path2save_dict = datasets_path2save_dict,
extract_dir = extract_dir,
audiofile_lim = audiofile_lim,
feature_type = feature_type,
sr = sr,
n_fft = n_fft,
dur_sec = dur_sec,
num_feats = num_feats,
win_size_ms = win_size_ms,
percent_overlap = percent_overlap,
use_librosa = use_librosa,
center = center,
mode = mode,
frames_per_sample = frames_per_sample,
visualize = visualize,
vis_every_n_frames = vis_every_n_frames,
labeled_data = labeled_data,
log_settings = log_settings,
decode_dict = decode_dict,
**kwargs)
return datasets_dict, datasets_path2save_dict
def prep_new_audiofeats(feats, desired_shape, input_shape):
'''Prepares new audio data to feed to a pre-trained model.
Parameters
----------
feats : np.ndarray [shape = (num_frames, num_features)]
The features to prepare for feeding to a model.
desired_shape : tuple
The expected number of samples necessary to fulfill the expected
`input_shape` for the model. The `feats` will be zeropadded or
limited to match this `desired_shape`.
input_shape : tuple
The `input_shape` the model expects a single sample of data to be.
Returns
-------
feats_reshaped : np.ndarray [shape = (`input_shape`)]
The features reshaped to what the model expects.
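Examples
--------
Features are zeropadded or clipped to `desired_shape`, then reshaped:
>>> import numpy as np
>>> feats = np.ones((10, 5))
>>> prep_new_audiofeats(feats, desired_shape = (12, 5), input_shape = (1, 12, 5, 1)).shape
(1, 12, 5, 1)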
'''
feats_reshaped = sp.feats.adjust_shape(feats, desired_shape)
# reshape to input shape with a necessary "tensor" dimension
feats_reshaped = feats_reshaped.reshape(input_shape)
return feats_reshaped
def feats2audio(feats, feature_type, sr, win_size_ms,
percent_overlap, phase=None):
'''Prepares features into audio playable format.
Parameters
----------
feats : np.ndarray [shape = (num_frames, num_feats)]
If the features are a signal,
[size = (batch_size * num_frames * num_features, 1)].
Otherwise [size = (batch_size * num_frames, num_features)].
feature_type : str
Either 'stft', 'fbank', 'signal', or 'mfcc'. For the 'signal'
feature, only mono channel is supported.
sr : int
Sampling rate that the features were extracted with
win_size_ms : int
The window size in milliseconds the features were extracted with
percent_overlap : float
The percent overlap between windows.
phase : np.ndarray [shape = (num_frames, num_feats)], optional
The original phase information of the reconstructed signal.
Returns
-------
y : np.ndarray [shape = (num_samples, )]
The reconstructed signal in samples.
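Examples
--------
A deterministic sketch for 'signal' features, plus an STFT reconstruction
with random magnitudes standing in for real features (Griffin-Lim is used
since no phase is given):
>>> import numpy as np
>>> feats2audio(np.arange(4).reshape(2, 2), 'signal', sr = 16000,
...     win_size_ms = 20, percent_overlap = 0.5)
array([0, 1, 2, 3])
>>> mag = np.random.rand(20, 161) # (num_frames, fft_bins//2+1)
>>> y = feats2audio(mag, 'stft', sr = 16000, win_size_ms = 20,
...     percent_overlap = 0.5)
>>> y.ndim
1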
'''
# (default) librosa handles data in shape (num_feats, num_frames)
# while soundpy works with data in shape (num_frames, num_feats)
if phase is not None:
try:
assert feats.shape == phase.shape
except AssertionError:
raise ValueError('Expected `feats` (shape {})'.format(feats.shape)+\
' and `phase` (shape {}) '.format(phase.shape) +\
'to have the same shape: (num_frames, num_features)')
win_shift_ms = win_size_ms - (win_size_ms * percent_overlap)
if 'signal' not in feature_type:
# Will apply Librosa package to feats. Librosa expects data to have
# shape (num_features, num_frames) not (num_frames, num_features)
feats = feats.T
if phase is not None:
phase = phase.T
if 'fbank' in feature_type:
y = librosa.feature.inverse.mel_to_audio(
feats,
sr=sr,
n_fft = int(win_size_ms*0.001*sr),
hop_length=int(win_shift_ms*0.001*sr))
elif 'mfcc' in feature_type:
# only the first 14 MFCC coefficients are used for reconstruction
feats = feats[:14,:]
y = librosa.feature.inverse.mfcc_to_audio(
feats,
sr=sr,
n_fft = int(win_size_ms*0.001*sr),
hop_length=int(win_shift_ms*0.001*sr),
n_mels=13)
elif 'stft' in feature_type or 'powspec' in feature_type:
# can use istft with phase information applied
if phase is not None:
feats = feats * phase
y = librosa.istft(
feats,
hop_length=int(win_shift_ms*0.001*sr),
win_length = int(win_size_ms*0.001*sr))
# if no phase information available:
else:
y = librosa.griffinlim(
feats,
hop_length=int(win_shift_ms*0.001*sr),
win_length = int(win_size_ms*0.001*sr))
elif 'signal' in feature_type:
y = feats.flatten()
return y
def grayscale2color(image_matrix, colorscale=3):
'''Expects grayscale image. Copies first channel into additional channels.
This is useful for pre-trained models that require features
to have rgb channels, not grayscale. Assumes the last dimension is the
color channel dimension.
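Examples
--------
>>> import numpy as np
>>> gray = np.arange(4).reshape(2, 2)
>>> color = grayscale2color(gray, colorscale = 3)
>>> color.shape
(2, 2, 3)
>>> np.array_equal(color[:,:,0], color[:,:,2])
True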
'''
if len(image_matrix.shape) == 2:
# if colorscale column not there, adds it
image_matrix = image_matrix.reshape(image_matrix.shape + (1,))
expected_shape = image_matrix.shape[:-1] + (colorscale,)
# create extra empty channels to copy gray image to it:
image_zeropadded = sp.feats.zeropad_features(image_matrix, expected_shape)
for i in range(colorscale):
if i == 0:
pass
else:
if len(image_zeropadded.shape) == 3:
image_zeropadded[:,:,i] = image_zeropadded[:,:,0]
elif len(image_zeropadded.shape) == 4:
image_zeropadded[:,:,:,i] = image_zeropadded[:,:,:,0]
elif len(image_zeropadded.shape) == 5:
image_zeropadded[:,:,:,:,i] = image_zeropadded[:,:,:,:,0]
elif len(image_zeropadded.shape) == 6:
image_zeropadded[:,:,:,:,:,i] = image_zeropadded[:,:,:,:,:,0]
else:
raise ValueError('This function expects between 2 and 6 dimensions, '\
'not {} dimensions'.format(len(image_matrix.shape)))
return image_zeropadded
if __name__ == "__main__":
import doctest
doctest.testmod()