#!/usr/bin/env python # coding: utf-8 # In[5]: get_ipython().run_line_magic('matplotlib', 'inline') # In[129]: import librosa, librosa.display import torch import matplotlib.pyplot as plt from matplotlib.patches import Rectangle as Rect import numpy as np from math import floor # # STFT Shape # The STFT divides the samples into frames (or chunks) of size `n_fft` with `hop_length` samples between each frame. So: # In[121]: n_samp = 1000 n_fft = 500 hop = 125 sig = np.arange(n_samp) frames = librosa.util.frame(sig, n_fft, hop) [f'Frame {i} is samples {c[0]}-{c[-1]}' for i,c in enumerate(frames.T)] # So for a signal of length `n_samp` we get `floor((n_samp - n_fft) / hop) + 1` frames of `n_fft` samples each: # In[123]: frames.shape, frames.shape == (n_fft, floor((n_samp - n_fft) / hop) + 1) # Each of those chunks is put through a Discrete Fourier Transform (DFT) to give `(n_fft//2)+1` frequency bins. The Fast Fourier Transform (FFT) is a particular algorithm to produce a Discrete Fourier Transform, basically everything uses the FFT (or a variant of it) so these terms are basically interchangeable. Librosa returns these as a (freq, frame) tensor, i.e. as a (freq, time) tensor. So: # In[93]: stft = librosa.core.stft(np.zeros(n_samp), n_fft, hop, center=False) # Ignore center=False for the moment, we'll come back to it stft.shape # In[94]: stft.shape == (n_fft//2+1, floor((n_samp - n_fft) / hop) + 1) # Each of those `n_fft//2 + 1` values represent the contribution of a particular frequency band in making up the overall signal (or for the STFT in making up that frame). The first value (the `+1`) is the 0Hz component. This 0Hz component is sometimes also called the DC component, or DC offset, of the signal (this goes back to when audio was processed with analog electronic equipment where DC stands for Direct Current, as opposed to Alternating Current or AC). # The other values represent the component of the signal in each of `n_fft//2` evenly spaced bins (or bands) across the sampled frequency range. The sampled frequency range is given by half the sample rate. So with a sample rate of say 16kHz, we have a frequency range of 8kHz which will be divided into `n_fft//2` bins. # In[111]: freqs = librosa.core.fft_frequencies(sr=16000, n_fft=16) freqs, len(freqs) == 16//2 + 1 # So that's indicating our 8000Hz sampling range is divided into our 9 (16//2+1) bins of 1000kHz each, centered at 0Hz, 1000Hz, 2000Hz...8000Hz. Of course our first bin can't really centered at 0Hz because there aren't any frequencies below 0Hz right? Different sources seem to vary as to whether they identify this as a special band, representing the DC component or as just another band. I'm not sure which is right. This also applies to the final band here, it is apparently centered at 8000Hz but due to the Nyquist limit we can't capture frequencies above `sample_rate//2` which is 8kHz here. So both these bands must be a little different. But for deep learning this probably doesn't matter much, our network should learn to use whatever useful information is here and we don't have to tell it what these mean so can hopefully get by fine without really being sure ourselves. # # It's worth noting that technically this isn't quite correct. In fact an FFT of length `n_fft` will return `n_fft` bins. But the second half is just a transform of the first half providing no additional information so it tends to be discarded by common implementations (`torch.stft` has the `one_sided` argument which controls this and defaults to `True`, librosa just discards it with no option). Numpy being more faithful to the maths will give an extended set of bins: # In[114]: freqs = np.fft.fftfreq(16, 1/16000) freqs, len(freqs) == 16 # I don't understand the full answer here, or what those negative frequencies numpy is suggesting are, but in case you ever see stuff about one-sided FFTs, or discarding the top half, this is what is meant. # ## Windows # One further complexity of the STFT is windows. After the signal is split into `n_fft` sized frames, the "window function" (function in the mathematical sense) is applied to each of those each of those frames. In digital processing the window is a 1D tensor of length `window_length` (defaulting to `n_fft`) and applying it is just an element-wise multiplication of the chunk by the window. In `torch.stft` the default window is simply a tensor of all ones, so it doesn't affect the chunks at all. In `librosa` it defaults to the hann window which seems to be a common choice for this. While not an expert on this, thee basic idea here seems to be that the STFT is good because it allows you to keep a given frequency resolution (`n_fft`) while increasing your temporal resolution by having `hop_length < n_fft`. But this results in a lot of duplicated frequency information between frames and as shown above the FFT collects frequencies across the entire signal (or frame in the case of STFT). So, rather than just collecting all frequencies across the entire window you can instead use a window to weight the contribution of samples such that each frame gives a higher weighting to some of those samples than to others. Thus each frame can more highly weight a unique section of the sound and you can counteract somewhat that duplication due to overlap. If you plot the hann window you can see the sort of thing that is happening: # In[104]: win_length = n_fft # == 1000 hw = librosa.filters.get_window('hann', win_length) plt.figure(figsize=(7,3)); plt.plot(hw); # Visualising multiple windows spaced `hop_length` apart we see: # In[168]: n_fft=500 n_samp=1000 win = librosa.filters.get_window('hann', n_fft) def get_wins(window, n_samp, n_fft=500, hop=125): n_ch = ((n_samp - n_fft) // hop) + 1 # Number of chunks return [np.pad(window, (ch*hop, n_samp-ch*hop), 'constant')[:n_samp] for ch in range(n_ch)] wins = get_wins(win, 1000) (fig,ax)=plt.subplots(1, 1, figsize=(6,3)) for w in wins: ax.plot(w) # Looking at this you might notice that we aren't really including the beginning and ending samples now as our window is tapering off. Our first and last chunk here will have little useful information. So, by default librosa will actually add `n_fft//2` samples of padding to the beginning and end of our signal. Plotting this and highlighting the actual samples of our input we see: # In[196]: win = librosa.filters.get_window('hann', n_fft) wins = get_wins(win, n_samp + n_fft) (fig,ax)=plt.subplots(1, 1, figsize=(7,3)) y = range(-n_fft//2,n_samp+(n_fft)//2) ax.set_xticks(range(0,n_samp+1,hop)) for w in wins: ax.plot(y, w) ax.add_patch(Rect((0,0), n_samp, 1, alpha=0.3)); # This looks better, all our samples are now nicely included. This is what that `center=False` option was in the `librosa.core.stft` call above. So with `center=True`, the number of frames for our signal is actually `floor((n_samp - n_fft + n_fft) / hop) + 1` with the `+ n_fft` being our padding, or simplifying `floor(n_samp / hop) + 1`. So: # In[108]: stft = librosa.core.stft(np.zeros(n_samp), n_fft, hop) stft.shape # In[109]: stft.shape == (n_fft//2+1, floor(n_samp / hop) + 1) # Related to the `window` argument (which recall takes a 1D tensor, or a callable to create one), is the `win_length` argument. This alows you to specify that you are providing a window that is smaller than your `n_fft` (`win_length > n_fft` will result in an error). The window tensor you provide will be padded with zeros before being applied to, i.e. elementwise multiplied, each frame). As such the `win_length` argument will not affect the STFT output shape. Setting this window smaller than our frame size (`n_fft`) means we can include less of the samples that overlap with other frames. So, mimicing the function of window length we get: # In[197]: n_fft = 500 win_length = 250 win = librosa.util.pad_center(librosa.filters.get_window('hann', win_length), n_fft) wins = get_wins(win, n_samp + n_fft) (fig,ax)=plt.subplots(1, 1, figsize=(7,3)) y = range(-n_fft//2,n_samp+(n_fft)//2) ax.set_xticks(range(0,n_samp+1,hop)) for w in wins: ax.plot(y, w) ax.add_patch(Rect((0,0), n_samp, 1, alpha=0.3)); # This discussion of window length assumes you are using the default of frames centered within the window. If you do not use this default you would need to use an appropriate window for non-centered data. You might for instance use only the second half of a centered window.