#!/usr/bin/env python
# coding: utf-8
# # Simple Sound Filtering using Discrete Fourier Transform
#
# ### Example – Waves and Acoustics
# By Jonas Tjemsland, Andreas Krogen, Håkon Ånes and Jon Andreas Støvneng.
#
# Last edited: May 20th 2016
#
# ___
# This notebook gives an introduction to programming with sounds in Python. We are using Python's [wave class](https://docs.python.org/2/library/wave.html). Check it out to see how the different functions are used! Towards the end, a couple of sound tracks are filtered using discrete Fourier transforms (DFTs) and the trigonometric least squares approximation.
#
# **NOTE: Some of the sound files might be a bit loud, and you are advised to turn down the volume before you play them.**
#
# As always, we start by importing the necessary libraries and setting common figure parameters.

# In[1]:

import numpy as np
from matplotlib import pylab as plt
import wave
import IPython
from scipy import fftpack as fft
get_ipython().run_line_magic('matplotlib', 'inline')

# Casting complex numbers to real numbers will give warnings
# because of numerical rounding errors. We therefore disable
# warning messages.
import warnings
warnings.filterwarnings('ignore')

# Set common figure parameters
newparams = {'axes.labelsize': 8, 'axes.linewidth': 1, 'savefig.dpi': 200,
             'lines.linewidth': 1, 'figure.figsize': (8, 3),
             'ytick.labelsize': 7, 'xtick.labelsize': 7,
             'ytick.major.pad': 5, 'xtick.major.pad': 5,
             'legend.fontsize': 7, 'legend.frameon': True,
             'legend.handlelength': 1.5, 'axes.titlesize': 7}
plt.rcParams.update(newparams)


# ### Programming with sound
#
# A sound is a longitudinal wave in a medium, such as air or water. It is a vibration that propagates through the medium as fluctuations of pressure and displacements of particles. For example, the A tone is a sound wave of frequency 440 Hz, created by a tuning fork, speaker, string or another device which makes the air oscillate with the given frequency. On a computer, a sound is nothing but a sequence of numbers. A pure tone can mathematically be described by
#
# $$
# s(t)=A\sin(2\pi f t),
# $$
#
# where $A$ is the amplitude, $f$ is the frequency and $t$ is time. In a computer, $s(t)$ is evaluated at discrete points in time, determined by the sampling rate. An ordinary audio CD has a sampling frequency of 44100 Hz. Sampling is discussed below.
#
# The following function creates a tone with a specific frequency, length, amplitude and sampling rate.

# In[2]:

def tone(frequency=440., length=1., amplitude=1., sampleRate=44100., soundType='int8'):
    """ Returns a sine function representing a tone with a given frequency.

    :frequency: float/int. Frequency of the tone.
    :length: float/int. Length of the tone in seconds.
    :amplitude: float/int. Amplitude of the tone.
    :sampleRate: float/int. Sampling frequency.
    :soundType: string. Type of the elements in the returned array.
    :returns: float numpy array. Sine function representing the tone.
    """
    t = np.linspace(0, length, int(length*sampleRate))
    data = amplitude*np.sin(2*np.pi*frequency*t)
    return data.astype(soundType)


# For simplicity, let us create a nostalgic 8 bit mono wav sound file. Note that 8 bit samples are stored as uint8, ranging from 0 to 255, while 16 bit samples are stored as int16, ranging from -32768 to 32767. In other words, since we are creating an audio file with a sample width of 8 bits, we need to add 128 to the samples before we write to file.
# If this is not done, the wave module will interpret the signed samples as unsigned, such that -128 becomes 128, -127 becomes 129, and so on, and the sound file may become somewhat distorted. Thanks to Øystein Hiåsen for pointing this out! If we were to use 16 bit samples we would not have to worry about this. This notebook supports 8, 16 and 32 bit samples.

# In[3]:

# Parameters that are used at the start of this notebook
sampleRate = 44100
sampwidth = 1       # In bytes. 1 for 8 bit, 2 for 16 bit and 4 for 32 bit
volumePercent = 50  # Volume percentage
nchannels = 1       # Mono. Only mono works for this notebook

# Some dependent variables
shift = 128 if sampwidth == 1 else 0  # The shift of the 8 bit samples, as explained above
soundType = 'i' + str(sampwidth)
amplitude = np.iinfo(soundType).min*volumePercent/100.


# In[4]:

# Parameters for an A tone lasting 1 second
frequency = 440
length = 1

# Calculate the sine function for the given parameters
data = tone(frequency, length, amplitude, sampleRate, soundType)

# Plot the function
plt.plot(data)
plt.xlim([0, 2000])
plt.title('Visualization of the A tone')
plt.xlabel('Sample number')

# Open a new .wav-file in "write" mode, set file parameters
# and write to file
# Shift the samples to unsigned if we use 8 bit (cast via int16 to avoid overflow)
data = (data.astype(np.int16) + shift).astype(np.uint8) if shift else data
with wave.open('Atone.wav', 'w') as file:
    file.setparams((nchannels, sampwidth, sampleRate, 0, 'NONE', ''))
    file.writeframes(data)

# Display the sound file
IPython.display.Audio('Atone.wav')


# Let us try to create a *beat* by letting two waves of different frequency interfere. The beat will then have a frequency equal to the absolute value of the difference in frequency of the two waves, $f = \left|f_2 - f_1\right|$. You can read more about beat frequencies on the great [Hyperphysics](http://hyperphysics.phy-astr.gsu.edu/hbase/sound/beat.html) website.

# In[5]:

frequency = 400
frequency2 = 408
length = 5

# Calculate the sine function for the given parameters
data = (tone(frequency, length, amplitude, sampleRate, soundType)
        + tone(frequency2, length, amplitude, sampleRate, soundType))

# Plot the function
plt.plot(data)
plt.xlim([0, 20000])
plt.title('Visualization of beat')
plt.xlabel('Sample number')

# Create sound file
data = (data.astype(np.int16) + shift).astype(np.uint8) if shift else data  # shift 8 bit samples to unsigned
with wave.open('beat.wav', 'w') as file:
    file.setparams((nchannels, sampwidth, sampleRate, 0, 'NONE', ''))
    file.writeframes(data)

IPython.display.Audio('beat.wav')


# Using these concepts, we can actually create a simple melody.

# In[6]:

# Create a "function" to translate from a given tone to its frequency
baseTone = 130.813  # The frequency of the tone C3, bass C
tones = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'Bb', 'B',
         'Ch', 'C#h', 'Dh', 'D#h', 'Eh', 'Fh', 'F#h', 'Gh', 'G#h', 'Ah', 'Bbh', 'Bh']
# And use dictionary comprehension to fill in the frequencies
notes2freq = {tones[i]: baseTone*2**(i/12) for i in range(0, len(tones))}

# The melody data saved as a list of tuples (note, duration)
l = 2.
notes = [('D',0.083*l),('D',0.083*l),('D',0.083*l),('G',0.5*l),('Dh',0.5*l),('Ch',0.083*l),
         ('B',0.083*l),('A',0.083*l),('Gh',0.5*l),('Dh',0.25*l),('Ch',0.083*l),('B',0.083*l),
         ('A',0.083*l),('Gh',0.5*l),('Dh',0.25*l),('Ch',0.083*l),('B',0.083*l),('Ch',0.083*l),
         ('A',0.5*l),('D',0.167*l),('D',0.083*l),('G',0.5*l),('Dh',0.5*l),('Ch',0.083*l),
         ('B',0.083*l),('A',0.083*l),('Gh',0.5*l),('Dh',0.25*l),('Ch',0.083*l),('B',0.083*l),
         ('A',0.083*l),('Gh',0.5*l),('Dh',0.25*l),('Ch',0.083*l),('B',0.083*l),('Ch',0.083*l),
         ('A',0.5*l),('D',0.167*l),('D',0.083*l),('E',0.375*l),('E',0.125*l),('Ch',0.125*l),
         ('B',0.125*l),('A',0.125*l),('G',0.125*l),('G',0.083*l),('A',0.083*l),('B',0.083*l),
         ('A',0.167*l),('E',0.083*l),('F#',0.25*l),('D',0.167*l),('D',0.083*l),('E',0.375*l),
         ('E',0.125*l),('Ch',0.125*l),('B',0.125*l),('A',0.125*l),('G',0.125*l),
         ('Dh',0.1875*l),('A',0.0625*l),('A',0.5*l),('D',0.167*l),('D',0.083*l),('E',0.375*l),
         ('E',0.125*l),('Ch',0.125*l),('B',0.125*l),('A',0.125*l),('G',0.125*l),('G',0.083*l),
         ('A',0.083*l),('B',0.083*l),('A',0.167*l),('E',0.083*l),('F#',0.25*l),('Dh',0.167*l),
         ('Dh',0.083*l),('Gh',0.125*l),('Fh',0.125*l),('D#h',0.125*l),('Ch',0.125*l),
         ('Bb',0.125*l),('A',0.125*l),('G',0.125*l),('Dh',0.75*l)]

# Create the file data
data = np.array([], dtype=soundType)
for note, duration in notes:
    currentFrequency = notes2freq[note]
    currentTone = tone(currentFrequency, duration, amplitude, sampleRate, soundType)
    data = np.append(data, currentTone)
data = (data.astype(np.int16) + shift).astype(np.uint8) if shift else data  # shift 8 bit samples to unsigned
with wave.open('melody.wav', 'w') as file:
    file.setparams((nchannels, sampwidth, sampleRate, 0, 'NONE', ''))
    file.writeframes(data)

IPython.display.Audio('melody.wav')


# Beautiful.
#
# ### Nyquist-Shannon sampling theorem and aliasing
#
# Before moving on, we should briefly discuss the Nyquist-Shannon sampling theorem. As mentioned earlier, when we are dealing with digital audio, we need to make a smooth sound signal discrete. This is achieved by sampling the signal at discrete points in time. The number of samples per unit time is described by the sampling frequency, and intuitively the required sampling frequency should depend strongly on the frequencies present in the signal.
#
# The Nyquist-Shannon sampling theorem says that in order to reconstruct a signal, the sampling frequency must be greater than twice the highest frequency of the signal (often called the criterion for unambiguous representation) [3]. C. E. Shannon formulated it as [3]:
#
# > If a function $f(t)$ contains no frequencies higher than $W$ Hz, it is completely determined by giving its ordinates at a series of points spaced $1/(2W)$ seconds apart.
#
# Since the human hearing range is about 20 to 20 000 Hz, the sampling frequency of 44100 Hz used on ordinary CDs is quite understandable.
#
# Aliasing describes the effect that causes different signals to become indistinguishable. For example, if the sampling frequency is $W$, the frequencies $f$ and $W-f$ are indistinguishable. This means that the highest frequency one can represent with a sampling frequency $W$ is $W/2$, often called the Nyquist frequency. Moreover, frequencies above $W/2$ will be mirrored (folded) back to frequencies below $W/2$. This is demonstrated below.
#
# In the following we create an audio file where the frequency increases stepwise from 0 Hz to the sampling rate, here chosen as 5000 Hz. What we hear is not the stepwise increase in frequency we intuitively expect, but a stepwise increase in frequency followed by a stepwise decrease.
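# Before generating the audio file, here is a minimal numerical check of the mirroring argument above (the frequencies used here are just test values): a sine of frequency $f$ and a sine of frequency $W-f$, both sampled at the rate $W$, give exactly the same samples apart from a sign flip, and are therefore indistinguishable to the ear.

# In[ ]:

# A sine at f and a sine at W - f, sampled at rate W, differ only by a sign
# (test values; W matches the sampling rate used in the next cell)
W = 5000                        # sampling frequency in Hz
f = 440                         # test frequency in Hz
t = np.arange(0, 0.1, 1/W)      # 0.1 seconds of sample times
s1 = np.sin(2*np.pi*f*t)
s2 = np.sin(2*np.pi*(W - f)*t)
print(np.allclose(s1, -s2))     # True: identical apart from the sign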
# In[7]:

sampleRate = 5000
length = .5

# Calculate the sine function for the given parameters
data = np.array([], dtype=soundType)
for frequency in np.linspace(0, sampleRate, 20):
    currentTone = tone(frequency, length, amplitude, sampleRate, soundType)
    data = np.append(data, currentTone)
data = (data.astype(np.int16) + shift).astype(np.uint8) if shift else data  # shift 8 bit samples to unsigned
with wave.open('aliasing.wav', 'w') as file:
    file.setparams((nchannels, sampwidth, sampleRate, 0, 'NONE', ''))
    file.writeframes(data)

IPython.display.Audio('aliasing.wav')


# There are ways to remove some of the aliasing (anti-aliasing), but this is not covered in this notebook.

# ### Sounds in the frequency domain
#
# If we perform the DFT of a sound track in the time domain [the amplitude $s(t)$], the result can be interpreted as a function describing $s(t)$ in the corresponding frequency domain. (This is explained in more detail in our [notebook on DFTs](https://nbviewer.jupyter.org/url/www.numfys.net/media/notebooks/mo_b2_discrete_fourier_transform.ipynb).) Thus, if the DFT is performed on the A tone, we get a peak at $f=440$ Hz. In addition, the peaks will be mirrored about half the sampling frequency (because of the Nyquist-Shannon sampling theorem). Furthermore, because of the coarse quantization of the samples (and aliasing of the resulting harmonics), we get smaller peaks at $f=(440 + 880 \cdot n)$ Hz, $n=0,1,...$. Since the tones are of finite duration, and because of numerical rounding errors, we also get some noise, in addition to a broadening of the spectrum close to the peaks. The noise and the spurious peaks become smaller if we use a larger sample width.
#
# Let us plot the A tone in the frequency domain by taking $\log \mathcal F [s(t)](f)$. We shift the two halves of the spectrum and place the origin at half the sampling frequency, which makes the filtering easier.

# In[8]:

sampleRate = 44100
length = 1
frequency = 440

plt.figure(figsize=(8, 5))
subplot = 0
for sampwidth in [1, 2, 4]:
    # Calculate the data for the given sample width
    soundType = 'i' + str(sampwidth)
    amplitude = np.iinfo(soundType).min*volumePercent/100.
    data = tone(frequency, length, amplitude, sampleRate, soundType)
    # Use the DFT to transform the data into the frequency domain
    dataFreq = np.log(np.abs(fft.fftshift(fft.fft(data))))
    # Plot the results
    subplot += 1
    plt.subplot(3, 1, subplot)
    plt.plot(np.linspace(-0.5*sampleRate, 0.5*sampleRate, len(dataFreq)), dataFreq)
    plt.xlim([-0.5*sampleRate, 0.5*sampleRate])
    plt.title('Frequency domain of the %d bit A tone' % (sampwidth*8))
    plt.xlabel('Frequency, [Hz]')
plt.tight_layout()


# From the concepts introduced so far it is easy to understand how one can filter out specific frequencies.
#
# ### Example: Filtering out specific frequencies
#
# We are now going to filter the 16 bit sound file [`spock_bad.wav`](https://www.numfys.net/media/spock_bad.wav).
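# The recipe follows directly from the previous section: transform the signal to the frequency domain, zero out a small window of bins around the unwanted frequency and around its mirror image, and transform back. Below is a minimal sketch of this idea on a synthetic two-tone signal (the sample rate and the two frequencies are made-up test values), before we apply the same procedure to the recording.

# In[ ]:

# Band-stop filtering sketch on a synthetic signal: remove the 1000 Hz
# component of a 300 Hz + 1000 Hz test signal (test values only)
fs = 8000                                  # sample rate of the test signal
t = np.arange(0, 1, 1/fs)                  # one second of sample times
signal = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*1000*t)
n = len(signal)
spectrum = fft.fftshift(fft.fft(signal))   # shifted spectrum, origin at the centre
w = 5                                      # half-width of the stop band, in bins
fBad = 1000                                # frequency to remove
i = int(n/2 + n*fBad/fs)                   # bin index of +fBad in the shifted spectrum
spectrum[i - w:i + w] = 0
i = int(n/2 - n*fBad/fs)                   # bin index of the mirrored peak at -fBad
spectrum[i - w:i + w] = 0
# Transform back (fftshift is its own inverse when the length is even)
filtered = np.real(fft.ifft(fft.fftshift(spectrum)))
print(np.allclose(filtered, np.sin(2*np.pi*300*t)))   # True: only the 300 Hz tone remains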
# In[9]:

filename = 'spock_bad'
IPython.display.Audio(filename + '.wav')


# In[10]:

with wave.open(filename + '.wav', 'rb') as file:
    data = file.readframes(-1)
    sampleRate = file.getframerate()
    sampwidth = file.getsampwidth()
soundType = 'i' + str(sampwidth)
data = np.frombuffer(data, soundType)
n = len(data)

# Perform the DFT
dataFreq = fft.fftshift(fft.fft(data))

# Plot the sound file in the time domain
plt.subplot(2, 1, 1)
plt.plot(np.linspace(0, n/sampleRate, n), data)
plt.title('Time domain of %s' % filename)
plt.xlabel('Time, [s]')
plt.xlim([0, n/sampleRate])

# Plot the sound file in the frequency domain
plt.subplot(2, 1, 2)
plt.plot(np.linspace(-0.5*sampleRate, 0.5*sampleRate, n), np.log(np.abs(dataFreq)))
plt.title('Frequency domain of %s' % filename)
plt.xlabel('Frequency, [Hz]')
plt.xlim([-0.5*sampleRate, 0.5*sampleRate])
plt.tight_layout()


# We observe peaks at frequencies around $f=\{100, 200, 1000, 5000\}$ Hz. Hence, we try to filter these out and listen to the result.

# In[11]:

w = 100  # Half-width of the intervals that are removed, in number of bins

# Filter out the frequencies in the frequency domain
for f in [100, 200, 1000, 5000]:
    i = int(n/2 + n*f/sampleRate)  # Index of the peak at +f in the shifted spectrum
    dataFreq[i - w:i + w] = 0
    i = int(n/2 - n*f/sampleRate)  # Index of the mirrored peak at -f
    dataFreq[i - w:i + w] = 0

# Transform the filtered frequency domain back to the time domain
data = fft.ifft(fft.fftshift(dataFreq))
data = data.astype(soundType)

# Save the filtered data to a new file
with wave.open(filename + '_filtered.wav', 'w') as A, \
     wave.open(filename + '.wav', 'rb') as file:
    A.setparams(file.getparams())
    A.writeframes(data.real)

# Plot the filtered sound in the time domain
plt.subplot(2, 1, 1)
plt.plot(np.linspace(0, n/sampleRate, n), data)
plt.title('Time domain of %s' % (filename + '_filtered'))
plt.xlabel('Time, [s]')
plt.xlim([0, n/sampleRate])

# Plot the filtered sound in the frequency domain
plt.subplot(2, 1, 2)
plt.plot(np.linspace(-0.5*sampleRate, 0.5*sampleRate, n), np.log(np.abs(dataFreq)))
plt.title('Frequency domain of %s' % (filename + '_filtered'))
plt.xlabel('Frequency, [Hz]')
plt.xlim([-0.5*sampleRate, 0.5*sampleRate])
plt.tight_layout()

# Play the filtered sound file
IPython.display.Audio(filename + '_filtered.wav')


# ### Example: Lowpass and highpass filtering
#
# In this example, we are going to check what an ideal lowpass filter and an ideal highpass filter do to a sound file. An ideal lowpass filter removes all frequencies above a given cutoff frequency, while an ideal highpass filter removes all frequencies below it. Real filters are usually non-ideal; they merely attenuate the frequencies above or below the cutoff value, but such filters are not discussed here. We are using the sound file [`vena.wav`](https://www.numfys.net/media/vena.wav).

# In[12]:

filename = 'vena.wav'
IPython.display.Audio(filename)


# In[13]:

def idealLowpass(data, cutoff, sampleRate=44100):
    """ Removes all frequencies above a given cutoff value of a mono sound
    file described by an array of data and a sample frequency.

    :data: float numpy array. The data of a sound file.
    :cutoff: float/int. Cutoff value.
    :sampleRate: int. The sampling frequency of the sound file.
    :returns: float numpy array. Filtered sound data.
""" n = len(data) dataFreq = fft.fftshift(fft.fft(data)) dataFreq[0 : n/2 -n*cutoff/sampleRate] = 0 dataFreq[n/2 + n*cutoff/sampleRate : len(dataFreq)] = 0 data = fft.ifft(fft.fftshift(dataFreq)) return data.real def idealHighpass(data, cutoff, sampleRate): """ Removes all frequencies below a given cutoff value of a mono sound file described by an array of data and a samle frequency. :data: float numpy array. The data of a sound file. :cutoff: float/int. Cutoff value. :sampleRate: int. The sampling frequency of the sound file. :returns: float numpy array. Filtered sound data. """ n = len(data) dataFreq = fft.fftshift(fft.fft(data)) dataFreq[n/2 - n*cutoff/sampleRate : n/2 + n*cutoff/sampleRate] = 0 data = fft.ifft(fft.fftshift(dataFreq)) return data.real # In[14]: # Read data from sound file with wave.open(filename, 'rb') as file: data = file.readframes(-1) sampleRate = file.getframerate() sampwidth = file.getsampwidth() soundType = 'i' + str(sampwidth) data = np.fromstring(data, soundType) n = len(data) # We start by applying a lowpass filter. # In[15]: # Perform the lowpass filter cutoff = 500 dataLowpass = idealLowpass(data, cutoff, sampleRate) # Save the filtered sound with wave.open('Vena_lowpass.wav', 'w') as A, \ wave.open(filename, 'rb') as file: A.setparams(file.getparams()) A.writeframes(dataLowpass.astype(soundType)) # Play the filtered sound IPython.display.Audio('Vena_lowpass.wav') # As you can hear, all the high frequencies are gone, and we are left with the bass tones. Note for example that we no longer hear the high-hat and the tambourine. # # Now, we apply the highpass filter. # In[16]: # Perform the highpass filter cutoff = 500 dataHighpass = idealHighpass(data, cutoff, sampleRate) # Save the filtered sound with wave.open('Vena_highpass.wav', 'w') as A, \ wave.open(filename, 'rb') as file: A.setparams(file.getparams()) A.writeframes(dataHighpass.astype(soundType)) # Play the filtered sound IPython.display.Audio('Vena_highpass.wav') # Now it is the other way around; all the low frequencies are gone, and we can no longer hear the bass tones. # # ### Example: Removing white noise (random noise) # # In this example, we are going to filter out white noise using a trigonometric least squares approximation. The approximation is explained in our notebook on [trigonometric interpolation](https://nbviewer.jupyter.org/url/www.numfys.net/media/notebooks/mo_curve2_trigonometric_interpolation.ipynb). # In[17]: def leastSquaresTrig(x, m, N): """ Calculates a trigonometric polynomial of degree n/2 (n even) or (n-1)/2 (n odd) that interpolates a set of n real data points. The data points can be written as (t_i,x_i), i=0,1,2,..., where t_i are equally spaced in the interval [c,d]. :x: complex or float numpy array. Data points. :c: float. Interval start. t[0]. :d: float. Interval end. t[n-1]. :returns: float numpy array. """ n = len(x) if not 0<=m<=n<=N: raise ValueError('Is 0 <= m <= n <= N?') y = fft.fft(x) yp = np.zeros(N, np.complex64) yp[0:m/2] = y[0:m/2] yp[N - m/2 + 1:N] = y[n - m/2 + 1:n] if (m % 2): yp[m/2] = y[m/2] else: yp[m/2] = np.real(y[m/2]) if m0: yp[N-m/2] = yp[m/2] return np.real(fft.ifft(yp))*N/n # We are using the sound file [`comet_noise.wav`](https://www.numfys.net/media/comet_noise.wav). 
# In[18]:

filename = 'comet_noise.wav'
IPython.display.Audio(filename)


# In[19]:

with wave.open(filename, 'rb') as file:
    data = file.readframes(-1)
    sampleRate = file.getframerate()
    sampwidth = file.getsampwidth()
soundType = 'i' + str(sampwidth)
data = np.frombuffer(data, soundType)

data = leastSquaresTrig(data, len(data)//14, len(data))

with wave.open('comet_filtered.wav', 'w') as A, \
     wave.open(filename, 'rb') as file:
    A.setparams(file.getparams())
    A.writeframes(data.astype(soundType))

IPython.display.Audio('comet_filtered.wav')


# Note that the white noise can no longer be heard, but this comes at the expense of some sound quality.

# ### References
#
# All the sound files are collected from http://tos.trekcore.com/.
#
# [1] H. P. Langtangen: "A primer on scientific programming with Python", 4th edition, p. 621-627, Springer 2014
# [2] R. Nave: "Beats", http://hyperphysics.phy-astr.gsu.edu/hbase/sound/beat.html
# [3] C. E. Shannon: "Communication in the presence of noise", Proc. Institute of Radio Engineers, vol. 37, no. 1, p. 10-21, 1949
# [4] Wikipedia: "Audio bit depth", https://en.wikipedia.org/wiki/Audio_bit_depth, 10th May 2016 [acquired: 11th May 2016]
# [5] Wikipedia: "Nyquist–Shannon sampling theorem", https://en.wikipedia.org/wiki/Nyquist%E2%80%93Shannon_sampling_theorem, 9th May 2016 [acquired: 11th May 2016]