#!/usr/bin/env python
# coding: utf-8

# # Machine Learning Module
# 
# **Lecturer:** Ashish Mahabal
# **Jupyter Notebook Authors:** Ashish Mahabal & Yuhan Yao
# 
# This is a Jupyter notebook lesson taken from the GROWTH Summer School 2019. For other lessons and their accompanying lectures, please see: http://growth.caltech.edu/growth-school-2019.html
# 
# ## Objective
# Classify different classes of variable sources using (a) decision trees and (b) random forests.
# 
# ## Key steps
# - Pick variable types
# - Select training sample
# - Select method
# - Look at confusion matrix and details
# 
# ## Required dependencies
# 
# See GROWTH school webpage for detailed instructions on how to install these modules and packages. Nominally, you should be able to install the python modules with `pip install <module name>`. The external astromatic packages are easiest installed using package managers (e.g., `rpm`, `apt-get`).
# 
# ### Python modules
# * python 3
# * astropy
# * numpy
# * astroquery
# * pandas
# * matplotlib
# * pydotplus
# * IPython.display
# * sklearn
# 
# ### External packages
# None
# 
# ### Partial Credits
# Pavlos Protopapas (LSSDS notebook)

# In[1]:

# For inline plots
get_ipython().run_line_magic('matplotlib', 'inline')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io
import pydotplus
from IPython.display import Image

# Various scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# ### Define datadir and files we will use

# In[2]:

datadir = 'data'
# for the decision tree
catalog = datadir + '/CatalinaVars.tbl.gz'
lightcurves = datadir + '/CRTS_6varclasses.csv.gz'
# for the random forest
featuresfile = datadir + '/cvs_and_blazars.dat'

# ### Read the light curves

# In[3]:

lcs = pd.read_csv(lightcurves, compression='gzip', header=1, sep=',',
                  skipinitialspace=True, nrows=100000)
lcs.columns = ['ID', 'MJD', 'Mag', 'magerr', 'RA', 'Dec']
lcs.head()

# In[4]:

# Number of distinct light curves read in
len(lcs.groupby('ID'))

# ### Read catalog with class information for variables

# In[5]:

cat = pd.read_csv(catalog, compression='gzip', header=5, sep=' ',
                  skipinitialspace=True)
# The header row is offset by one column; shift the column names into place
columns = cat.columns[1:]
cat = cat[cat.columns[:-1]]
cat.columns = columns
cat.head()

# ### Get a subset of variable types, with a minimum number of observations per light curve
# ### The classes are from Drake et al. 2014 and Mahabal et al. 2017
# ### 2: EA (detached binaries), 4: RRab, 5: RRc, 6: RRd, 8: RS CVn, 13: LPV

# In[6]:

vars6 = cat[cat['Var_Type'].isin([2, 4, 5, 6, 8, 13]) & (cat['Number_Obs'] > 100)]
vars6.head()

# ### Some visualizations - these are not particularly informative, but representative of what one can try at the start of an analysis.
# In[7]:

# Class distribution of the selected variables
print(vars6.shape)
plt.hist(vars6.Var_Type)

# In[8]:

# Period vs. V magnitude
plt.plot(pd.to_numeric(vars6.Period_days).values, pd.to_numeric(vars6.V_mag).values, 'r.')

# ### Pick two classes

# In[9]:

vars2 = vars6[vars6['Var_Type'].isin([6, 13])]
vars2.head()

# ### Create a 'target' column with 0|1 for the two classes

# In[10]:

Y = vars2['Var_Type'].values
Y = np.array([1 if y == 6 else 0 for y in Y])
X = vars2.drop(columns='Var_Type').values

# In[11]:

vars2['target'] = (vars2['Var_Type'].values == 6) * 1

# In[12]:

vars2.head()

# In[13]:

Y[:10]

# In[14]:

X[:, :3]

# ### Create a training sample with 60% of the objects

# In[15]:

# Create test/train mask
itrain, itest = train_test_split(range(vars2.shape[0]), train_size=0.6)
mask = np.ones(vars2.shape[0], dtype='int')
mask[itrain] = 1
mask[itest] = 0
mask = (mask == 1)

# In[16]:

mask[:10]

# In[17]:

print("Fraction of Class 6 objects in Training:", np.mean(vars2.target[mask]), np.std(vars2.target[mask]))
print("Fraction of Class 6 objects in Testing:", np.mean(vars2.target[~mask]), np.std(vars2.target[~mask]))

# ## A bit about Decision Trees
# 
# One builds a decision tree using one or more predictors.
# 
# Here we want to classify variable astronomical sources (e.g. EA, RRab, RRc, RRd, RS CVn, LPV, etc.) using a few features we have access to, viz. V_mag, Period_days, Amplitude, Number_Obs (note that not all of these may be good or useful for our purpose).
# 
# Generically, let's call them $X_{i1}, X_{i2}, \ldots, X_{ip}$ ($i$ indexes the sources, $p$ the predictors). We also have an observed label $Y_i$ for each source.
# 
# We first assign everyone to the same class, say $\hat{Y}_i = 1$. We can calculate the squared error $Err = \sum_i (\hat{Y}_i - Y_i)^2$.
# At each step of the algorithm we consider a list of possible decisions (or splits), for example $X_{i2} > 35$, i.e. the period is greater than 35.
# For each possible decision we recalculate the prediction under that rule, for example $\hat{Y}_i = 6$ if $X_{i2} > 35$ and $\hat{Y}_i = 13$ if $X_{i2} \le 35$.
# We recalculate the error for each possible decision: $Err = \sum_i (\hat{Y}_i - Y_i)^2$.
# We choose the decision that reduces the error by the largest amount, and then keep going (if there are multiple predictors). The $Err$ term is our impurity/loss function.
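# As a small illustration of the split search just described, here is a minimal sketch (an illustrative aside, not part of the original analysis). It scans candidate thresholds on a single predictor, assigns the majority label on each side, and keeps the threshold that minimizes $Err = \sum_i (\hat{Y}_i - Y_i)^2$. The toy "period" values below are made up purely for illustration.

# In[ ]:

def best_split_threshold(x, y, thresholds):
    """Return the (threshold, error) that minimizes the squared error of a
    one-level split, predicting the majority label on each side."""
    def majority(labels):
        vals, counts = np.unique(labels, return_counts=True)
        return vals[np.argmax(counts)]

    best_t, best_err = None, np.inf
    for t in thresholds:
        left, right = y[x <= t], y[x > t]
        if len(left) == 0 or len(right) == 0:
            continue  # not a real split
        err = np.sum((majority(left) - left) ** 2) + np.sum((majority(right) - right) ** 2)
        if err < best_err:
            best_t, best_err = t, err
    return best_t, best_err

# Toy data: short "periods" belong to class 6, long ones to class 13
x_toy = np.array([0.35, 0.40, 0.55, 150.0, 220.0, 330.0])
y_toy = np.array([6, 6, 6, 13, 13, 13])
best_split_threshold(x_toy, y_toy, thresholds=x_toy)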
# ### Define a few functions we will be using

# In[18]:

def display_dt(dt):
    """Print the fitted tree in graphviz dot format."""
    dummy_io = io.StringIO()
    tree.export_graphviz(dt, out_file=dummy_io, proportion=True)
    print(dummy_io.getvalue())

# In[19]:

# This function creates images of tree models using pydotplus
# https://github.com/JWarmenhoven/ISLR-python
def print_tree(estimator, features, class_names=None, filled=True):
    dot_data = io.StringIO()
    export_graphviz(estimator, out_file=dot_data, feature_names=features,
                    proportion=True, class_names=class_names, filled=filled)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return graph

# In[20]:

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Important parameters
# indf         - input dataframe
# featurenames - vector of names of predictors
# targetname   - name of the column you want to predict (e.g. 0 or 1, 'M' or 'F',
#                'yes' or 'no')
# target1val   - the particular value you want mapped to 1 in the target
# mask         - boolean vector indicating the training set (~mask is the test set)
# reuse_split  - dictionary that contains training and testing dataframes
#                (we'll use this to test different classifiers on the same
#                train-test splits)
# score_func   - we've used accuracy as a way of scoring algorithms, but
#                this can be made more general later on
# n_folds      - number of folds for cross-validation
# n_jobs       - used for parallelization
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

def do_classify(clf, parameters, indf, featurenames, targetname, target1val,
                mask=None, reuse_split=None, score_func=None, n_folds=5, n_jobs=1):
    subdf = indf[featurenames]
    X = subdf.values
    y = (indf[targetname].values == target1val) * 1
    if mask is not None:
        print("using mask")
        Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]
    if reuse_split is not None:
        print("using reuse split")
        Xtrain, Xtest, ytrain, ytest = (reuse_split['Xtrain'], reuse_split['Xtest'],
                                        reuse_split['ytrain'], reuse_split['ytest'])
    if parameters:
        clf = cv_optimize(clf, parameters, Xtrain, ytrain, n_jobs=n_jobs,
                          n_folds=n_folds, score_func=score_func)
    clf = clf.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print("############# based on standard predict ################")
    print("Accuracy on training data: %0.2f" % (training_accuracy))
    print("Accuracy on test data: %0.2f" % (test_accuracy))
    print(confusion_matrix(ytest, clf.predict(Xtest)))
    print("########################################################")
    return clf, Xtrain, ytrain, Xtest, ytest

# ### Run the decision tree model and get a confusion matrix
# ### We will use the V magnitude and period as variables
# ### We will use the Gini index

# Let's first fit on two covariates to help us visualize what's going on. Have a look at the options on the help page for [DecisionTreeClassifier](http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html). We'll be optimizing over two options here: `max_depth` - the maximum depth of the tree, and `min_samples_leaf` - the minimum number of samples required to be at a leaf node.
# Assume the root node is $S$, and $S'$ iterates over the leaf nodes such that the union of the $S'$ is $S$. Then the Gini impurity measure is $L(S') = |S'|\left(1 - p_{S'}^2 - (1 - p_{S'})^2\right)$, where $p_{S'}$ is the fraction of $S'$ that are positive examples. (A small numerical sketch of this impurity appears after the commentary below.)

# In[21]:

clfTree1 = tree.DecisionTreeClassifier(max_depth=3, criterion='gini')

subdf = vars2[['V_mag', 'Period_days']]
X = subdf.values
y = (vars2['target'].values == 1) * 1

# TRAINING AND TESTING
Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]

# FIT THE TREE
clf = clfTree1.fit(Xtrain, ytrain)

training_accuracy = clf.score(Xtrain, ytrain)
test_accuracy = clf.score(Xtest, ytest)

print("############# based on standard predict ################")
print("Accuracy on training data: %0.2f" % (training_accuracy))
print("Accuracy on test data: %0.2f" % (test_accuracy))
print(confusion_matrix(ytest, clf.predict(Xtest)))
print("########################################################")

# ### Yeah, we got perfect classification!
# ### But there are good reasons for it. In fact, two:
# ### (1) Period is not a good variable to use for classification - if you have the period of an object, you already know a lot.
# ### (2) In this particular case, the periods for the two classes are very distinct from each other, so they are easy to separate when period is used as one of the variables.
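# As a quick numerical check on the Gini expression above, here is a small sketch (an illustrative aside, not part of the original analysis). It computes the Gini impurity of a set of 0/1 labels and the size-weighted impurity of a candidate split; splitting the training objects at their median period is just an arbitrary example threshold.

# In[ ]:

def gini(labels):
    """Gini impurity 1 - p^2 - (1 - p)^2 for an array of 0/1 labels."""
    if len(labels) == 0:
        return 0.0
    p = np.mean(labels)
    return 1.0 - p ** 2 - (1.0 - p) ** 2

def split_impurity(x, y, threshold):
    """Size-weighted impurity |S_left|*gini(S_left) + |S_right|*gini(S_right)
    for the split x <= threshold - the quantity the tree minimizes at each node."""
    left, right = y[x <= threshold], y[x > threshold]
    return len(left) * gini(left) + len(right) * gini(right)

period_train = pd.to_numeric(vars2['Period_days']).values[mask]
print("impurity with no split:          ", len(ytrain) * gini(ytrain))
print("impurity split at median period: ", split_impurity(period_train, ytrain, np.median(period_train)))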
# In[22]:

len(Xtrain), len(Xtest), len(ytrain), len(ytest)

# In[23]:

display_dt(clf)

# In[24]:

graph3 = print_tree(clf, features=['V_mag', 'Period_days'], class_names=['No', 'Yes'])
Image(graph3.create_png())

# ### Let us wrap all of the steps above in a single function

# In[25]:

def dtclassify(allclasses, class1, class2, var1, var2):
    vars2 = allclasses[allclasses['Var_Type'].isin([class1, class2])]
    vars2['target'] = (vars2['Var_Type'].values == class1) * 1

    # Create test/train mask
    itrain, itest = train_test_split(range(vars2.shape[0]), train_size=0.6)
    mask = np.ones(vars2.shape[0], dtype='int')
    mask[itrain] = 1
    mask[itest] = 0
    mask = (mask == 1)

    print("Fraction of Class", class1, "objects in Training:",
          np.mean(vars2.target[mask]), np.std(vars2.target[mask]))
    print("Fraction of Class", class1, "objects in Testing:",
          np.mean(vars2.target[~mask]), np.std(vars2.target[~mask]))

    clfTree1 = tree.DecisionTreeClassifier(max_depth=3, criterion='gini')

    subdf = vars2[[var1, var2]]
    X = subdf.values
    y = (vars2['target'].values == 1) * 1

    # TRAINING AND TESTING
    Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]

    # FIT THE TREE
    clf = clfTree1.fit(Xtrain, ytrain)
    training_accuracy = clf.score(Xtrain, ytrain)
    test_accuracy = clf.score(Xtest, ytest)
    print("############# based on standard predict ################")
    print("Accuracy on training data: %0.2f" % (training_accuracy))
    print("Accuracy on test data: %0.2f" % (test_accuracy))
    print(confusion_matrix(ytest, clf.predict(Xtest)))
    print("########################################################")
    display_dt(clf)
    return [clf, var1, var2]
    # graph3 = print_tree(clf, features=[var1, var2], class_names=['No', 'Yes'])
    # Image(graph3.create_png())

# In[26]:

# A generic function to do cross-validation via grid search
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs)
    gs.fit(X, y)
    best = gs.best_estimator_
    return best

# ### First try it again with the variables we have already used

# In[27]:

[clf, var1, var2] = dtclassify(vars6, 6, 13, 'V_mag', 'Period_days')

# In[28]:

graph3 = print_tree(clf, features=['V_mag', 'Period_days'], class_names=['No', 'Yes'])
Image(graph3.create_png())

# ### And now with other classes and variables
# ### Note that the classes are unbalanced in this case

# In[29]:

[clf, var1, var2] = dtclassify(vars6, 8, 13, 'V_mag', 'Amplitude')

# In[30]:

graph3 = print_tree(clf, features=[var1, var2], class_names=['No', 'Yes'])
Image(graph3.create_png())

# In[31]:

clf

# ### Let's look at features derived from light curves now, and use them to see if we can separate CVs and blazars
# ### This is a different dataset from the one used before

# In[32]:

features = pd.read_csv(featuresfile, header=1, sep=',', skipinitialspace=True)
features.columns = ['source_id', 'id2', 'RA', 'Dec', 'gl', 'gb', 'amplitude',
                    'beyond1std', 'fpr_mid20', 'fpr_mid35', 'fpr_mid50', 'fpr_mid65',
                    'fpr_mid80', 'linear_trend', 'max_slope', 'med_abs_dev',
                    'med_buf_range_per', 'pair_slope_trend', 'percent_amplitude',
                    'pdfp', 'skew', 'kurtosis', 'std', 'magratio', 'data_num',
                    'npeaks', 'peakstats', 'Var_Type']
features.head()

# In[33]:

[clf, var1, var2] = dtclassify(features, 0, 1, 'amplitude', 'beyond1std')

# ### That was no classification at all! All objects were declared to be in the same class.
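# When every object ends up in a single predicted class, a useful first diagnostic is the class balance: if one class dominates, a tree that always predicts the majority class can still score a deceptively high accuracy. A minimal check (an illustrative aside, not part of the original analysis):

# In[ ]:

class_counts = features['Var_Type'].value_counts()
print(class_counts)
# Accuracy of always predicting the majority class - the baseline any classifier should beat
print("Majority-class baseline accuracy: %0.2f" % (class_counts.max() / class_counts.sum()))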
# In[34]:

graph3 = print_tree(clf, features=[var1, var2], class_names=['No', 'Yes'])
Image(graph3.create_png())

# ## We will now use random forests instead of individual trees

# ## Random Forests
# 
# Random forests work by aggregating the results from a number of randomly perturbed decision trees constructed to explain the data.
# 
# ### A bit on bootstrap aggregation
# The idea of random forests arises naturally by first considering tree bagging. In tree bagging we do the following $b$ times:
# 
# 1. Take a random subsample of your data
# 2. Build a classification (or regression) tree as in the previous section
# 3. Repeat
# 
# For a new data point we can then simply run that point through all $b$ trees constructed, get all the decisions $\hat{Y}_1, \ldots, \hat{Y}_b$, and take a majority vote. This form of averaging gets rid of some of the over-fitting issues found in using just one tree. But fitting all of these trees is computationally expensive, so what else can we do?
# 
# ### From bagging to Random Forests
# Random forests are very similar to bootstrap aggregation, but, as the name suggests, some extra randomness is injected into the building of the trees. The trees built from random subsamples of the data turn out to be quite similar to one another, so the solution is simple. In a random forest we do the following $b$ times:
# 
# 1. Take a random subsample of your data
# 2. Randomly select a subset of predictors to be used in building the tree
# 3. Build a classification (or regression) tree with only those variables selected in step 2
# 4. Repeat
# 
# We take a majority vote the same as before. Have a look at the help page for the [Random Forest Classifier](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html); we'll be optimizing two options: `n_estimators` - the number of trees in the forest, and `max_features` - the number of features to consider when looking for the best split (step 2 above).

# ### Select appropriate variables
# ### Note that a couple of the variables chosen here are perhaps not good variables

# In[35]:

Xnames = list(features.columns.values[4:-1])

# In[ ]:

Xnames

# ### Like before, we will use 60% of the objects for training

# In[ ]:

clfForest = RandomForestClassifier(n_estimators=10, oob_score=True,
                                   max_features='sqrt')  # 'sqrt' is the classifier equivalent of the old 'auto' default

features['target'] = features['Var_Type']

subdf = features[Xnames]
X = subdf.values
y = (features['target'].values == 1) * 1

# Create test/train mask
itrain, itest = train_test_split(range(features.shape[0]), train_size=0.6)
mask = np.ones(features.shape[0], dtype='int')
mask[itrain] = 1
mask[itest] = 0
mask = (mask == 1)

# TRAINING AND TESTING
Xtrain, Xtest, ytrain, ytest = X[mask], X[~mask], y[mask], y[~mask]

# FIT THE FOREST
clf = clfForest.fit(Xtrain, ytrain)
print(clfForest.n_estimators)

training_accuracy = clfForest.score(Xtrain, ytrain)
test_accuracy = clfForest.score(Xtest, ytest)

print("############# based on standard predict ################")
print("Accuracy on training data: %0.2f" % (training_accuracy))
print("Accuracy on test data: %0.2f" % (test_accuracy))
print(confusion_matrix(ytest, clf.predict(Xtest)))
print("########################################################")

parameters = {"n_estimators": list(range(1, 20))}
clfForest, Xtrain, ytrain, Xtest, ytest = do_classify(clfForest, parameters, features, Xnames,
                                                      'target', 1, mask=mask, n_jobs=4,
                                                      score_func='f1')
print(clfForest.n_estimators)

# ### That was not great, but who said classification was easy?
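# One way to follow up on a mediocre result is to ask the forest which features it actually relied on. Here is a short sketch (an illustrative aside, not part of the original analysis) that ranks the features of the fitted forest by their impurity-based importance.

# In[ ]:

importances = pd.Series(clfForest.feature_importances_, index=Xnames).sort_values()
print(importances)
importances.plot(kind='barh')
plt.xlabel('impurity-based feature importance')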
# ### Try with other classes, perhaps get other features, and, oh yes, a larger training sample with less sparse light curves.
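# As one possible starting point for those experiments, the sketch below (an illustrative aside, not part of the original analysis) grids over both `n_estimators` and `max_features`, as discussed in the random forest section, reusing `do_classify` on the same train/test mask. The particular grid values are arbitrary choices.

# In[ ]:

parameters = {"n_estimators": list(range(1, 20)),
              "max_features": [2, 4, 6, 8]}
clfForest2 = RandomForestClassifier()
clfForest2, Xtrain, ytrain, Xtest, ytest = do_classify(clfForest2, parameters, features, Xnames,
                                                       'target', 1, mask=mask, n_jobs=4,
                                                       score_func='f1')
print(clfForest2.n_estimators, clfForest2.max_features)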