#!/usr/bin/env python # coding: utf-8 # # SC-1 Feature Engineering and Classification # # The first attempt at seriation classification (https://github.com/mmadsen/experiment-seriation-classification/blob/master/analysis/sc-1/sc-1-seriation-classification-analysis.ipynb) was a partial (but encouraging) success, achieving basically 80% accuracy in correctly labeling which of two regional metapopulation models an IDSS seriation solution is derived from. The "classifier" used was simply k-Nearest Neighbors, with an optimal value of k=3. # # The sole "feature" used in classification was the Euclidean distance between sorted Laplacian eigenvalue spectra for each graph, as described in the following lab note: http://goo.gl/HYvyoM # # Since I used a single feature in that first analysis, to do better than 80% we're going to have to add more features. One promising approach is to use more of the information in the Laplacian spectrum itself, instead of reducing it to a single distance metric. To that, we can then add other graph theoretic features, keeping in mind that some of them may be highly collinear with information already contained in the Laplacian. # In[17]: import numpy as np import networkx as nx import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import cPickle as pickle from copy import deepcopy get_ipython().run_line_magic('matplotlib', 'inline') plt.style.use("fivethirtyeight") sns.set() # In[18]: all_graphs = pickle.load(open("train-cont-graphs.pkl",'r')) all_labels = pickle.load(open("train-cont-labels.pkl",'r')) # The strategy, unlike our first attempt, requires a real train/test split in the dataset because we're going to fit an actual model (although a true LOO cross validation is still of course possible). But we need a train_test_split function which is able ot deal with lists of NetworkX objects. # In[19]: def train_test_split(graph_list, label_list, test_fraction=0.20): """ Randomly splits a set of graphs and labels into training and testing data sets. We need a custom function because the dataset isn't a numeric matrix, but a list of NetworkX Graph objects. """ rand_ix = np.random.random_integers(0, len(graph_list), size=int(len(graph_list) * test_fraction)) print "random indices: %s" % rand_ix test_graphs = [] test_labels = [] train_graphs = [] train_labels = [] # first copy the chosen test values, without deleting anything since that would alter the indices for ix in rand_ix: test_graphs.append(graph_list[ix]) test_labels.append(label_list[ix]) # now copy the indices that are NOT in the test index list for ix in range(0, len(graph_list)): if ix in rand_ix: continue train_graphs.append(graph_list[ix]) train_labels.append(label_list[ix]) return (train_graphs, train_labels, test_graphs, test_labels) # ## Feature Engineering ## # The goal here is to construct a standard training and test data matrix of numeric values, which will contain the sorted Laplacian eigenvalues of the graphs in each data set. One feature will thus represent the largest eigenvalue for each graph, a second feature will represent the second largest eigenvalue, and so on. # # We do not necessarily assume that all of the graphs have the same number of vertices, although if there are marked differences, we would need to handle missing data for those graphs which had many fewer eigenvalues (or restrict our slice of the spectrum to the smallest number of eigenvalues present). # In[20]: train_graphs, train_labels, test_graphs, test_labels = train_test_split(all_graphs, all_labels, test_fraction=0.10) print "train size: %s" % len(train_graphs) print "test size: %s" % len(test_graphs) # In[21]: def graphs_to_eigenvalue_matrix(graph_list, num_eigenvalues = None): """ Given a list of NetworkX graphs, returns a numeric matrix where rows represent graphs, and columns represent the reverse sorted eigenvalues of the Laplacian matrix for each graph, possibly trimmed to only use the num_eigenvalues largest values. If num_eigenvalues is unspecified, all eigenvalues are used. """ # peek at the first graph and see how many eigenvalues there are tg = graph_list[0] n = len(nx.spectrum.laplacian_spectrum(tg, weight=None)) # we either use all of the eigenvalues, or we use the smaller of # the requested number or the actual number (if it is smaller than requested) if num_eigenvalues is None: ev_used = n else: ev_used = min(n, num_eigenvalues) print "(debug) eigenvalues - test graph: %s num_eigenvalues: %s ev_used: %s" % (n, num_eigenvalues, ev_used) data_mat = np.zeros((len(graph_list),ev_used)) #print "data matrix shape: ", data_mat.shape for ix in range(0, len(graph_list)): spectrum = sorted(nx.spectrum.laplacian_spectrum(graph_list[ix], weight=None), reverse=True) data_mat[ix,:] = spectrum[0:ev_used] return data_mat # In[ ]: # ## First Classifier ## # We're going to be using a gradient boosted classifier, which has some of best accuracy of any of the standard classifier methods. Ultimately we'll figure out the best hyperparameters using cross-validation, but first we just want to see whether the approach gets us anywhere in the right ballpark -- remember, we can 80% accuracy with just eigenvalue distance, so we have to be in that neighborhood or higher to be worth the effort of switching to a more complex model. # In[22]: from sklearn.ensemble import GradientBoostingClassifier from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # In[23]: train_matrix = graphs_to_eigenvalue_matrix(train_graphs, num_eigenvalues=20) test_matrix = graphs_to_eigenvalue_matrix(test_graphs, num_eigenvalues=20) # In[24]: clf = GradientBoostingClassifier(n_estimators = 250) clf.fit(train_matrix, train_labels) # In[25]: pred_label = clf.predict(test_matrix) # In[26]: cm = confusion_matrix(test_labels, pred_label) cmdf = pd.DataFrame(cm) cmdf.columns = map(lambda x: 'predicted {}'.format(x), cmdf.columns) cmdf.index = map(lambda x: 'actual {}'.format(x), cmdf.index) print cmdf print classification_report(test_labels, pred_label) print "Accuracy on test: %0.3f" % accuracy_score(test_labels, pred_label) # **Definite** improvement over just using the eigenvalue distance, as expected. # # I did a run with all 30 eigenvalues and got the same answer as using just the 20 largest eigenvalues, presumably because the smallest 10 are very close to zero and do not vary enough between classes to be useful. But clearly, tuning this hyperparameter will be useful on the margins. # # The next step, of course, is to perform a cross validation of the hyperparameters, and write an sklearn-compliant object that makes it easy to cross-validate automatically over the graph objects in various ways, since it would be good to do random splits of the graphs, not just splits of the numeric data matrix. # A second strategy will be to see if augmenting the eigenvalue features with various graph theoretic properties helps at all. Some features, such as the mean degree of the graph, are likely to be highly redundant, since that information is fully captured by the eigenvalues of the Laplacian matrix (specifically, in its diagonal). So the trick will be to find some graph metrics which may not be fully captured by the eigenvalues. That will require more thought, but some of the work I did for the semantic Axelrod paper on graph orbits and symmetries might be useful here. # ## Finding Optimal Hyperparameters ## # In[ ]: # In[27]: from sklearn.pipeline import Pipeline from sklearn.grid_search import GridSearchCV # In[28]: pipeline = Pipeline([ ('clf', GradientBoostingClassifier()) ]) params = { 'clf__learning_rate': [5.0,2.0,1.0, 0.75, 0.5, 0.25, 0.1, 0.05, 0.01], 'clf__n_estimators': [10,25,50,100,250,500] } grid_search = GridSearchCV(pipeline, params, n_jobs = -1, verbose = 1) # In[29]: grid_search.fit(train_matrix, train_labels) # In[30]: print("Best score: %0.3f" % grid_search.best_score_) print("Best parameters:") best_params = grid_search.best_estimator_.get_params() for param in sorted(params.keys()): print("param: %s: %r" % (param, best_params[param])) # In[31]: pred_label = grid_search.predict(test_matrix) # In[32]: cm = confusion_matrix(test_labels, pred_label) cmdf = pd.DataFrame(cm) cmdf.columns = map(lambda x: 'predicted {}'.format(x), cmdf.columns) cmdf.index = map(lambda x: 'actual {}'.format(x), cmdf.index) print cmdf print classification_report(test_labels, pred_label) print "Accuracy on test: %0.3f" % accuracy_score(test_labels, pred_label) # In[ ]: