#!/usr/bin/env python # coding: utf-8 # # Cooling Tower Identification: MODA Analysis # #### Import dependencies # # * This was written in Python 2.7 # In[1]: import sys as sys import os import warnings import pandas as pd import numpy as np with warnings.catch_warnings(): warnings.filterwarnings("ignore") from patsy import dmatrices from sklearn.linear_model import LogisticRegression; from sklearn.cross_validation import train_test_split; from sklearn import metrics import matplotlib.pyplot as plt import seaborn as sns from matplotlib import style style.use('seaborn-notebook') get_ipython().run_line_magic('matplotlib', 'inline') np.random.seed(100) # In[2]: data_path = "../data/tower_data.csv" if os.path.isfile(data_path): tower_data = pd.read_csv(data_path) else: print 'run Tower-Identification-OSA-Data notebook first' # In[3]: ###Some helper functions def Remove_Dummy_BINS(df): ''' removes illegitimate BINs from input dataframe. ''' dummy_BINs = [1000000 , 2000000, 3000000, 4000000, 5000000] df = df[~df.BIN.isin(dummy_BINs)] return df def CleanInputData(df): ''' Performs cleaning on input tower_data, see comments below ''' exclude_BC = ['A','B','V'] # Removing single-family, 2-family, vacant land testset = df[~df['BC'].isin(exclude_BC)] testset = testset[testset.BldgArea != 0] # removing any buildings with no reported area testset = testset[testset.NumFloors != 0] # removing any buildings with no report floors return testset # ## Exploratory Analysis # # There are so many potentially relevant characteristics available in PLUTO. One very relevant factor is a building's class, which can indicate a lot about what's inside a building (and what needs to be kept cool). # # # Building Class Codes can be found in the appendix of the [PLUTO Data Dictionary](https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&uact=8&ved=0ahUKEwiNgdrMp-nWAhUFOiYKHYWoACMQFggoMAA&url=https%3A%2F%2Fwww1.nyc.gov%2Fassets%2Fplanning%2Fdownload%2Fpdf%2Fdata-maps%2Fopen-data%2Fpluto_datadictionary.pdf%3Fr%3D16v2&usg=AOvVaw1-CEPaXRsh0VS-bz9UKkb9) # In[4]: floor_bin = np.concatenate((np.arange(0,11,2) , [15,20,30,50,100]), axis=0) tower_data['FLOOR_BIN'] = pd.cut(tower_data.NumFloors,floor_bin) pivot = tower_data.pivot_table(index='FLOOR_BIN',columns='BC',values='Identified',aggfunc='sum') hmap = sns.heatmap(pivot.fillna(0), cmap="Blues"); hmap.set(xlabel='Building Class Code', ylabel='Number of Floors', alpha=0.8, title = '# of Cooling Towers'); # ## More Data Preparation # # In[5]: ## Cleaning the input data before running any models. We don't want to include people's homes who would not have ## commercial cooling units. testset = CleanInputData(tower_data) print len(testset) # #### Specify a model # # It it tempting to include every available variable, but this can lead to a model that fits the training data extremely well but does not generalize. # In[6]: # Lots of variables available, Adding too many leads to overfitting. ## Numerical Variables I_Vars = [ 'Log_BldgArea', 'NumFloors', 'OfficeArea' , 'RetailArea', 'ComArea'] #,'LotArea','ComArea','ResArea','OfficeArea','RetailArea',\ #'GarageArea','StrgeArea','FactryArea','OtherArea', 'OtherArea','NumBldgs',\ #'UnitsRes','UnitsTotal','LotFront','BldgFront','LotDepth','BldgDepth','AssessLand',\ #'ExemptLand','YearBuilt','BuiltFAR','ResidFAR','FacilFAR','CommFAR'] ## Categorial Variables C_Vars = ['BC', 'BoroCode']#,'LotType','BsmtCode' C_Vars_format = [ 'C('+ c + ')' for c in C_Vars ] s = ' + ' Deps = s.join(I_Vars) Cats = s.join(C_Vars_format) Spec = 'Identified ~ ' + Deps + ' + ' + Cats Spec_I = 'Identified ~ ' + Deps Spec_C = 'Identified ~ ' + Cats print Spec # this is the model string for logistic regression print len(testset) ## Dropping records that aren't valid for the variables in question testset.dropna(subset=C_Vars, inplace=True) testset.dropna(subset=I_Vars, inplace=True) print len(testset) # appears to be only one for c in C_Vars: testset[[c]] = testset[c].astype(str) # ### Splitting into training and testing data (30% testing) # In[7]: y, X = dmatrices(Spec,testset,return_type="dataframe") y = np.ravel(y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) ## choice to normalize variables if wanted, makes a difference with the SVM classifier normed = X_train[I_Vars] normed = (normed - normed.mean()) / normed.std() X_train.update(normed) normed = X_test[I_Vars] normed = (normed - normed.mean()) / normed.std() X_test.update(normed) # ## Testing a simple logistic regression with varying regularization # # # # In[8]: for c in np.arange(-5,5): model = LogisticRegression(C=10.0**c, penalty='l2') model.fit(X_train, y_train) predicted = model.predict(X_test) print '(inverse) regularization: ' + str(10.0**c) print metrics.classification_report(y_test,predicted ) probs = model.predict_proba(X_test) print '\n score: ' + str(metrics.roc_auc_score(y_test, probs[:, 1])) # In[9]: def ModelSummary(y_test, predicted): TP = float(metrics.confusion_matrix(y_test, predicted)[1][1] ) # true positive FP = float(metrics.confusion_matrix(y_test, predicted)[0][1] ) # false positive FN = float(metrics.confusion_matrix(y_test, predicted)[1][0] ) # false negative TN = float(metrics.confusion_matrix(y_test, predicted)[0][0] ) # true negative print 'Recall %.3f' % (TP/(TP+FN)) print 'Precision %.3f' % (TP/(TP+FP)) print '%.3f of buildings sampled have towers' % (sum(y)/len(y)) # ### Results # # * We are identifying on the order of 30% of true towers in the testing set, not ideal # * But, 65-70% of our predicted towers are true towers, not bad when compared to picking randomly (< 1%) # # In[10]: ModelSummary(y_test, predicted) # ### Another way of showing this with a ROC curve # # Aiming for higher recall (y-axis) will dramatically increase the the false-positive-rate (x-axis). The false positive problem is particularly important in the cooling towers case, because the classes are so imbalanced - less than 1% of buildings are identified with a cooling tower. # In[11]: n_classes = 2 FPR = dict() TPR = dict() roc_auc = dict() for i in range(n_classes): FPR[i], TPR[i], _ = metrics.roc_curve(y_test, probs[:, i]) roc_auc[i] = metrics.auc(FPR[i], TPR[i]) plt.figure() plt.plot(FPR[1], TPR[1], label='ROC curve (AUROC = %0.2f)' % roc_auc[1]) plt.plot([0, 1], [0, 1], 'k--') plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('ROC Curve For Tower Identification Model ') plt.legend(loc="lower right") plt.show() # ## Another classifier: support vector machine # # In[12]: from sklearn import svm # Option to give more class-weight to identified cooling towers because they are very sparse in the dataset of total buildings. ClassWeights = [1, 5, 10, 15, 20, 30] for weight in ClassWeights: svm_model = svm.LinearSVC(class_weight={1: weight, 0 : 1}) svm_model.fit(X_train, y_train) predicted = svm_model.predict(X_test) print metrics.classification_report(y_test,predicted ) # ### Support Vector Classifier Results # # * Much higher recall, finding nearly 90% of cooling towers # * We have to pay a price in precision for finding more towers, precision is only 23% # * Still much better than picking randomly # * Can tune class weights to dial-in the precision vs. recall we want # # # In[13]: ModelSummary(y_test, predicted) # # Postscript # # One obvious conclusion is the trade-off between precision and recall, which amounts to asking oneself *"how many false positives am I willing to tolerate in order to find every true positive (cooling tower)."* The answer to this kind of question is more often than not dependant on a (non-technical) operational constraint, which could stem from many factors such as the resources available or relative priorities of competing goals. # # ### Try it for yourself # # These models are very basic and a reflection of time and resource constraints during the Legionnaires' disease activation. Playing with different types of models, cross-validation, hyperparameter tuning, etc. would surely produce an improved model. If you find one, let us know! # #