#!/usr/bin/env python # coding: utf-8 # Source Separation with Sparsity # =============================== # # *Important:* Please read the [installation page](http://gpeyre.github.io/numerical-tours/installation_python/) for details about how to install the toolboxes. # $\newcommand{\dotp}[2]{\langle #1, #2 \rangle}$ # $\newcommand{\enscond}[2]{\lbrace #1, #2 \rbrace}$ # $\newcommand{\pd}[2]{ \frac{ \partial #1}{\partial #2} }$ # $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$ # $\newcommand{\umax}[1]{\underset{#1}{\max}\;}$ # $\newcommand{\umin}[1]{\underset{#1}{\min}\;}$ # $\newcommand{\uargmin}[1]{\underset{#1}{argmin}\;}$ # $\newcommand{\norm}[1]{\|#1\|}$ # $\newcommand{\abs}[1]{\left|#1\right|}$ # $\newcommand{\choice}[1]{ \left\{ \begin{array}{l} #1 \end{array} \right. }$ # $\newcommand{\pa}[1]{\left(#1\right)}$ # $\newcommand{\diag}[1]{{diag}\left( #1 \right)}$ # $\newcommand{\qandq}{\quad\text{and}\quad}$ # $\newcommand{\qwhereq}{\quad\text{where}\quad}$ # $\newcommand{\qifq}{ \quad \text{if} \quad }$ # $\newcommand{\qarrq}{ \quad \Longrightarrow \quad }$ # $\newcommand{\ZZ}{\mathbb{Z}}$ # $\newcommand{\CC}{\mathbb{C}}$ # $\newcommand{\RR}{\mathbb{R}}$ # $\newcommand{\EE}{\mathbb{E}}$ # $\newcommand{\Zz}{\mathcal{Z}}$ # $\newcommand{\Ww}{\mathcal{W}}$ # $\newcommand{\Vv}{\mathcal{V}}$ # $\newcommand{\Nn}{\mathcal{N}}$ # $\newcommand{\NN}{\mathcal{N}}$ # $\newcommand{\Hh}{\mathcal{H}}$ # $\newcommand{\Bb}{\mathcal{B}}$ # $\newcommand{\Ee}{\mathcal{E}}$ # $\newcommand{\Cc}{\mathcal{C}}$ # $\newcommand{\Gg}{\mathcal{G}}$ # $\newcommand{\Ss}{\mathcal{S}}$ # $\newcommand{\Pp}{\mathcal{P}}$ # $\newcommand{\Ff}{\mathcal{F}}$ # $\newcommand{\Xx}{\mathcal{X}}$ # $\newcommand{\Mm}{\mathcal{M}}$ # $\newcommand{\Ii}{\mathcal{I}}$ # $\newcommand{\Dd}{\mathcal{D}}$ # $\newcommand{\Ll}{\mathcal{L}}$ # $\newcommand{\Tt}{\mathcal{T}}$ # $\newcommand{\si}{\sigma}$ # $\newcommand{\al}{\alpha}$ # $\newcommand{\la}{\lambda}$ # $\newcommand{\ga}{\gamma}$ # $\newcommand{\Ga}{\Gamma}$ # $\newcommand{\La}{\Lambda}$ # $\newcommand{\si}{\sigma}$ # $\newcommand{\Si}{\Sigma}$ # $\newcommand{\be}{\beta}$ # $\newcommand{\de}{\delta}$ # $\newcommand{\De}{\Delta}$ # $\newcommand{\phi}{\varphi}$ # $\newcommand{\th}{\theta}$ # $\newcommand{\om}{\omega}$ # $\newcommand{\Om}{\Omega}$ # This numerical tour explore local Fourier analysis of sounds, and its # application to source separation from stereo measurements. # In[1]: from __future__ import division import numpy as np import scipy as scp import pylab as pyl import matplotlib.pyplot as plt from nt_toolbox.general import * from nt_toolbox.signal import * import warnings warnings.filterwarnings('ignore') get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('load_ext', 'autoreload') get_ipython().run_line_magic('autoreload', '2') # Sound Mixing # ------------ # We load 3 sounds and simulate a stero recording by performing a linear # blending of the sounds. # # Sound loading. # In[2]: from nt_toolbox.load_sound import * n = 1024*16 s = 3 #number of sounds p = 2 #number of micros x = np.zeros([n,3]) x[:,0] = load_sound("nt_toolbox/data/bird.wav",n) x[:,1] = load_sound("nt_toolbox/data/female.wav",n) x[:,2] = load_sound("nt_toolbox/data/male.wav",n) # Normalize the energy of the signals. # In[3]: x = x/np.tile(np.std(x,0),(n,1)) # We mix the sound using a $2\mathrm{x}3$ transformation matrix. # Here the direction are well-spaced, but you can try with more complicated # mixing matrices. # # Compute the mixing matrix # In[4]: theta = np.linspace(0, np.pi, s + 1)[:-1] theta[0] = .2 M = np.vstack((np.cos(theta), np.sin(theta))) # Compute the mixed sources. # In[5]: y = np.dot(x,np.transpose(M)) # Display of the sounds and their mix. # In[6]: plt.figure(figsize = (10,10)) for i in range(s): plt.subplot(s, 1, i+1) plt.plot(x[:, i]) plt.xlim(0,n) plt.title("Source #%i" %(i+1)) # Display of the micro output. # In[7]: plt.figure(figsize = (10,7)) for i in range(p): plt.subplot(p, 1, i+1) plt.plot(y[:, i]) plt.xlim(0,n) plt.title("Micro #%i" %(i+1)) # Local Fourier analysis of sound. # -------------------------------- # In order to perform the separation, one performs a local Fourier analysis # of the sound. The hope is that the sources will be well-separated over # the Fourier domain because the sources are sparse after a STFT. # # # # # First set up parameters for the STFT. # In[8]: w = 128 #size of the window q = w//4 #overlap of the window # Compute the STFT of the sources. # In[9]: from nt_toolbox.perform_stft import * from nt_toolbox.plot_spectrogram import * X = np.zeros([w,4*w+1,s]) Y = np.zeros([w,4*w+1,p]) for i in range(s): X[:,:,i] = perform_stft(x[:,i],w,q,n) plt.figure(figsize = (15,10)) plot_spectrogram(X[:,:,i],"Source #%i" %(i+1)) # __Exercise 1__ # # Compute the STFT of the micros, and store them into a matrix |Y|. # In[10]: run -i nt_solutions/audio_2_separation/exo1 # In[11]: ## Insert your code here. # Estimation of Mixing Direction by Clustering # -------------------------------------------- # Since the sources are quite sparse over the Fourier plane, the directions # are well estimated by looking as the direction emerging from a point # clouds of the transformed coefficients. # # # First we compute the position of the point cloud. # In[12]: mf = np.shape(Y)[0] mt = np.shape(Y)[1] P = np.reshape(Y, (mt*mf,p)) P = np.vstack((np.real(P), np.imag(P))) # Then we keep only the 5% points with largest energy. # # # Display some points in the original (spacial) domain. # # Number of displayed points. # In[13]: npts = 6000 # Display the original points. # In[14]: from numpy import random sel = random.permutation(n) sel = sel[:npts] plt.figure(figsize = (7,5)) plt.plot(y[sel,0], y[sel,1], ".", ms = 3) plt.xlim(-5,5) plt.ylim(-5,5) plt.title('Time domain') plt.show() # __Exercise 2__ # # Display some points of $P$ in the transformed (time/frequency) domain. # In[15]: run -i nt_solutions/audio_2_separation/exo2 # In[16]: ## Insert your code here. # We compute the angle associated to each point over the transformed # domain. The histogram shows the main direction of mixing. # In[17]: import math nrow = np.shape(P)[0] Theta = np.zeros(nrow) for i in range(nrow): Theta[i] = math.atan2(P[i,1],P[i,0])%np.pi # Display histogram. # In[18]: nbins = 100 t = np.linspace(np.pi/200,np.pi,nbins) hist = np.histogram(Theta,t) h = hist[0]/np.sum(hist[0]) t = t[:-1] plt.figure(figsize = (7,5)) plt.bar(t, h, width = np.pi/nbins, color = "darkblue", edgecolor = "darkblue") plt.xlim(0,np.pi) plt.ylim(0,np.max(h)) plt.show() # __Exercise 3__ # # The histogram computed from the whole set of points are not peacked # enough. To stabilize the detection of mixing direction, compute an # histogram from a reduced set of point that have the largest amplitude. # Compute the energy of each point. Extract only a small sub-set. # In[19]: run -i nt_solutions/audio_2_separation/exo3 # In[20]: ## Insert your code here. # __Exercise 4__ # # Detect the direction $M_1$ approximating the true direction $M$ by # looking at the local maxima of the histogram. First detect the set of # local maxima, and then keep only the three largest. Sort in descending order. # In[21]: run -i nt_solutions/audio_2_separation/exo4 # In[22]: ## Insert your code here. # Separation of the Sources using Clustering # ------------------------------------------ # Once the mixing direction are known, one can project the sources on the # direction. # # # We compute the projection of the coefficients Y on each estimated # direction. # In[23]: A = np.reshape(Y, (mt*mf,p)) # Compute the projection of the coefficients on the directions. # In[24]: C = abs(np.dot(np.transpose(M1),np.transpose(A))) # At each point $x$, the index $I(x)$ is the direction which creates the # largest projection. # # $I$ is the index of the closest source. # In[25]: tmp = np.max(C,0) I = np.argmax(C,0) I = np.reshape(I, (mf,mt)) + 1 # An additional denoising is achieved by removing small coefficients. # In[26]: T = .05 D = np.sqrt(np.sum(abs(Y)**2, 2)) I = I*(D > T) # We can display the segmentation of the time frequency plane. # In[27]: plt.figure(figsize = (15,10)) imageplot(I[:mf//2,:]) plt.imshow(I[:mf//2,:], cmap = plt.get_cmap("jet"), interpolation = "nearest") plt.show() # The recovered coefficients are obtained by projection. # In[28]: Proj = np.dot(np.transpose(M1),np.transpose(A)) Xr = np.zeros([w,4*w+1,s]) for i in range(s): Xr[:,:,i] = np.reshape(Proj[i,:], (mf,mt))*(I == i) # The estimated signals are obtained by inverting the STFT. # In[29]: xr = np.zeros([n,s]) for i in range(s): xr[:,i] = perform_stft(Xr[:,:,i], w, q, n) # One can display the recovered signals. # In[30]: plt.figure(figsize = (10,10)) for i in range(s): plt.subplot(s,1,i+1) plt.plot(xr[:,i]) plt.xlim(0,n) plt.title("Estimated source #%i" %(i+1)) # One can listen to the recovered sources. # In[31]: i = 1 from IPython.display import Audio Audio(x[:,i], rate = 15000) # In[32]: Audio(xr[:,i],rate = 15000)