import numpy as np
from sklearn.decomposition import PCA
from scipy.linalg import svd
import matplotlib.pyplot as plt
# Load the pre-computed inputs for the analysis.
country_correlation = np.load('Data/country_correlation.npy')
country_names = np.load('Data/country_names.npy')
# The capability dictionary is unwrapped with .item(), i.e. it is stored as a
# pickled object array. NumPy >= 1.16.3 refuses to load such files unless
# allow_pickle=True is passed explicitly.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
data = np.load('Data/country_capability_dict.npy', allow_pickle=True).item()

# Upper bound on the number of feature columns kept for the analysis.
number_of_features = 50000

# One row per dictionary entry (in the dictionary's iteration order),
# truncated to the first `number_of_features` columns.
X = np.array([data[country] for country in data])
X = X[:, 0:number_of_features]
N = X.shape[0]
M = X.shape[1]

# Centre every column on zero. Broadcasting subtracts the column means
# directly, without first building an N x M matrix of repeated means.
Y = X - X.mean(0)
# Economy-size SVD of the centred matrix Y; S holds the singular values.
U, S, V = svd(Y, full_matrices=False)

# Share of the total variance carried by each component: squared singular
# value divided by the sum of all squared singular values.
rho = np.square(S) / np.square(S).sum()

# Plot the explained-variance share against the 1-based component number.
plt.figure(figsize=(15, 7))
plt.plot(np.arange(1, rho.size + 1), rho, 'ro-')
plt.xlabel('Principal component')
plt.ylabel('Variance explained')
plt.title('Variance explained by principal components')
plt.show()
The plot shows the fraction of variance explained by each principal component, on a scale from 0 (0%) to 1 (100%).
# scipy's svd() returns the right singular vectors as rows; transpose so
# that each column of V is one principal direction.
V = np.transpose(V)
# Coordinates of every row of Y expressed in the principal-direction basis.
Z = Y @ V

# 0-based indices of the two principal components shown on the x and y axes.
i, j = 0, 1

# Scatter the projected data in the plane spanned by components i and j.
plt.figure(figsize=(8, 8))
plt.plot(Z[:, i], Z[:, j], 'go', label='Country')
plt.title('Country Capabilities: PCA')
plt.xlabel('PC{0}'.format(i + 1))
plt.ylabel('PC{0}'.format(j + 1))
plt.legend()
# Display the figure.
plt.show()
In the following graph, the data are projected onto the selected principal directions.