"""Exploratory clustering of the Samsung activity data for subject 1.

Steps, in order:

1. Download ``samsungData.rda`` through an R cell, convert it to CSV and load
   it into pandas.
2. Scatter-plot the first two columns, coloured by activity, and build a
   complete-linkage dendrogram from the first three columns.
3. Repeat with columns 9-10 (plots) and columns 9-11 (dendrogram).
4. Take the SVD of the standardized features, plot the first two left
   singular vectors, and add the column with the largest weight in the second
   right singular vector to the clustering features.
5. Run k-means with 6 centres, cross-tabulate clusters against activities and
   plot the first 10 coordinates of the centres that best match ``laying``
   and ``walk``.

The script is meant to be executed inside IPython: it calls ``get_ipython()``
to run the R cell.

NOTE(review): ``subplots``, ``gcf``, ``plot`` and ``ylabel`` are not imported
anywhere in this script; presumably it is run in IPython's pylab mode, which
injects them -- confirm.
"""
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.vq import kmeans, vq
from scipy.spatial.distance import pdist, squareform


def scale(matrix):
    """Normalize ``matrix``: subtract the mean and divide by the std (axis 0).

    Used below through ``DataFrame.apply``, i.e. on one column at a time.
    A column with zero standard deviation yields non-finite values.
    """
    return (matrix - np.mean(matrix, axis=0)) / np.std(matrix, axis=0)


def _scatter_columns_by_activity(left_col, right_col):
    """Scatter two columns (by position) against the row index, per activity.

    Reads the module-level ``numericActivity``, ``cols`` and ``samsungData``
    at call time. Returns ``(figure, (left_axes, right_axes))``.
    """
    fig, (left_ax, right_ax) = subplots(ncols=2)
    fig.set_size_inches(10, 5)
    for act, group in numericActivity:
        left_ax.scatter(group.index, group.iloc[:, left_col],
                        c=cols[act], label=act)
        right_ax.scatter(group.index, group.iloc[:, right_col],
                         c=cols[act], label=act)
    left_ax.set_ylabel(samsungData.columns[left_col])
    right_ax.set_ylabel(samsungData.columns[right_col])
    right_ax.legend(loc='lower right')
    fig.tight_layout()
    return fig, (left_ax, right_ax)


def _plot_activity_dendrogram(distances):
    """Draw a complete-linkage dendrogram of the condensed ``distances``.

    Each leaf is labelled with a run of 'O' characters whose length is the
    activity code of that row plus one (codes come from the module-level
    ``actlabels``). Returns the current figure, resized to 8x4 inches.
    """
    codes = actlabels.codes
    dendrogram(linkage(distances, method='complete'),
               color_threshold=0.3,
               leaf_label_func=lambda leaf: 'O' * (int(codes[leaf]) + 1),
               leaf_font_size=6)
    fig = gcf()
    fig.set_size_inches(8, 4)
    return fig


# --- Data acquisition ------------------------------------------------------
# The data set ships as an R workspace, so R downloads it and re-exports it
# as CSV. `rpy2.ipython` provides the %%R cell magic.
# NOTE(review): the Dropbox URL below is a legacy public-folder link and may
# no longer resolve -- confirm before relying on the download step.
get_ipython().run_line_magic('load_ext', 'rpy2.ipython')
get_ipython().run_cell_magic('R', '', '''\
download.file("https://dl.dropbox.com/u/7710864/courseraPublic/samsungData.rda",destfile="../data/samsungData.rda",method="curl")
load("../data/samsungData.rda")
write.csv(samsungData,file="../data/samsungData.csv")
''')

samsungData = pd.read_csv('../data/samsungData.csv')
# write.csv stored R's row names as an unnamed first column; drop it.
samsungData = samsungData.drop(['Unnamed: 0'], axis=1)
samsungData.columns[:12]
samsungData['activity'].value_counts()

# --- Subject 1, first two features ------------------------------------------
subj1 = samsungData[samsungData['subject'] == 1]
numericActivity = subj1.groupby('activity')

# One matplotlib colour code per activity label.
cols = {'laying': 'b', 'sitting': 'g', 'standing': 'r',
        'walk': 'c', 'walkdown': 'm', 'walkup': 'y'}

f, (ax1, ax2) = _scatter_columns_by_activity(0, 1)

# --- Hierarchical clustering on the first three features --------------------
subj1.iloc[:, :3].columns

# Integer code per row, used to label the dendrogram leaves.
actlabels = pd.Categorical(subj1['activity'])

distanceMatrix = pdist(subj1.iloc[:, :3])
f = _plot_activity_dendrogram(distanceMatrix)

# --- Same analysis on columns 9-11 ------------------------------------------
subj1.iloc[:, [9, 10]].columns

f, (ax1, ax2) = _scatter_columns_by_activity(9, 10)

distanceMatrix = pdist(subj1.iloc[:, 9:12])
f = _plot_activity_dendrogram(distanceMatrix)

# --- Singular value decomposition --------------------------------------------
# `[:, :-2]` drops the last two columns -- presumably `subject` and
# `activity`; confirm against the CSV header.
U, D, Vt = np.linalg.svd(subj1.iloc[:, :-2].apply(scale), full_matrices=False)

f, (ax1, ax2) = subplots(ncols=2)
f.set_size_inches(10, 5)
# zip() stops at the shorter sequence, so the spare colour is simply unused.
for lb, cl in zip(list(actlabels.categories), 'b g r c m y k'.split()):
    # Plain boolean mask so it can index both the pandas Index and `U`.
    idx = (subj1['activity'] == lb).values
    ax1.scatter(subj1.index[idx], U[idx, 0], c=cl, label=lb)
    ax2.scatter(subj1.index[idx], U[idx, 1], c=cl, label=lb)
ax1.set_ylabel('U[:,0]')
ax2.set_ylabel('U[:,1]')
ax2.legend(loc='lower right')
f.tight_layout()

# Trailing semicolons below only suppress the echoed return value when the
# statement is the last one in a notebook cell.
plot(Vt[1, :], 'ok');

# Position of the feature with the largest weight in the second right
# singular vector.
maxContrib = np.argmax(Vt[1, :])
maxContrib

# Columns 9-11 plus the maximum contributor found above.
distanceMatrix = pdist(subj1.take(list(range(9, 12)) + [maxContrib], axis=1))
f = _plot_activity_dendrogram(distanceMatrix)

samsungData.columns[maxContrib]

# --- K-means clustering -------------------------------------------------------
data = np.asarray(subj1.iloc[:, :-2])
centers, _ = kmeans(data, 6, iter=100)
cluster, _ = vq(data, centers)

# Rows are cluster ids, columns are activity labels.
df = pd.crosstab(cluster, subj1['activity'])
df

# idxmax() returns the row label, i.e. the cluster id that indexes `centers`,
# even when some cluster ids are missing from the table.
idmax = df['laying'].idxmax()
plot(centers[idmax, :10], 'ok')
ylabel('Cluster Center');

idmax = df['walk'].idxmax()
plot(centers[idmax, :10], 'ok')
ylabel('Cluster Center');