#!/usr/bin/env python
# coding: utf-8
"""Bagged decision trees on the ``weather`` dataset from R's rattle package.

Dataset help:
http://artax.karlin.mff.cuni.cz/r-help/library/rattle/html/weather.html

The script reads ``weather.csv`` from the current directory, removes the
date/location/wind-direction columns plus ``WindSpeed3pm`` and ``RISK_MM``,
encodes the Yes/No columns as 1/0, drops incomplete rows, draws a depth-3
regression tree and prints the mean 10-fold cross-validated accuracy of a
bagged decision-tree classifier.

Optional tooling for the tree picture:
    pip install pydotplus
    Graphviz itself: http://www.graphviz.org/Download_linux_ubuntu.php

Further reading:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
    http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
    http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/
"""

import glob
import os
from io import StringIO

import pandas as pd

# NOTE: scikit-learn, pydotplus and IPython are imported inside the functions
# that use them, so the pandas-only helpers below stay importable (and
# testable) on machines without the modelling/plotting stack.

CSV_PATH = "weather.csv"
TARGET = "RainTomorrow"
# Index column that ends up in the CSV when it is written with row names.
INDEX_COLUMN = "Unnamed: 0"
# Columns removed before modelling (same set the original notebook deleted).
DROPPED_COLUMNS = (
    "Date",
    "Location",
    "WindDir9am",
    "WindSpeed3pm",
    "WindGustDir",
    "WindDir3pm",
    "RISK_MM",
)
YES_NO_COLUMNS = ("RainToday", "RainTomorrow")
YES_NO = {"Yes": 1, "No": 0}

NUM_FOLDS = 10
NUM_TREES = 100
SEED = 7


def find_csv_files(extension="csv"):
    """Return the names of files in the current directory ending in *extension*."""
    return glob.glob("*.{}".format(extension))


def load_weather(path=CSV_PATH):
    """Read the raw weather CSV at *path* into a DataFrame."""
    return pd.read_csv(path)


def prepare_features(frame):
    """Return a cleaned copy of *frame* ready for modelling.

    The input frame is left untouched.  Steps, in order:

    1. drop the stray index column if it is present;
    2. drop every column in ``DROPPED_COLUMNS`` (raises ``KeyError`` if one is
       missing, as the original ``del`` statements did);
    3. encode the Yes/No columns as 1/0;
    4. drop every row that still has a missing value.

    Values other than ``"Yes"``/``"No"`` in the Yes/No columns become missing
    in step 3 and are therefore removed in step 4.
    """
    cleaned = frame.drop(columns=INDEX_COLUMN, errors="ignore")
    cleaned = cleaned.drop(columns=list(DROPPED_COLUMNS))
    # Explicit mapping instead of a frame-wide ``replace``: it does not rely on
    # pandas silently downcasting object columns to numbers.  (At NumPy level
    # the same effect is a boolean-mask assignment such as ``Y[Y == "Yes"] = 1``.)
    for column in YES_NO_COLUMNS:
        cleaned[column] = cleaned[column].map(YES_NO)
    return cleaned.dropna()


def split_features_target(frame, target=TARGET):
    """Split a cleaned frame into ``(X, Y, feature_names)``.

    ``X`` holds every column except *target* (original column order) and ``Y``
    holds *target*; both are float NumPy arrays.  ``feature_names`` is a plain
    list so it can be handed straight to ``export_graphviz``.
    """
    feature_names = [name for name in frame.columns if name != target]
    X = frame[feature_names].to_numpy(dtype=float)
    Y = frame[target].to_numpy(dtype=float)
    return X, Y, feature_names


def render_regression_tree(X, Y, feature_names, max_depth=3):
    """Fit a shallow regression tree and return it as an IPython ``Image``.

    Requires scikit-learn, pydotplus, IPython and a Graphviz installation.
    The image only shows up when the return value is displayed by
    IPython/Jupyter.
    """
    import pydotplus as pydot
    from IPython.display import Image
    from sklearn import tree

    dtr = tree.DecisionTreeRegressor(max_depth=max_depth)
    dtr.fit(X, Y)

    dot_data = StringIO()
    tree.export_graphviz(dtr, out_file=dot_data, feature_names=feature_names)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png())


def cross_validate_bagging(X, Y, num_folds=NUM_FOLDS, num_trees=NUM_TREES,
                           seed=SEED):
    """Return the per-fold accuracy of a bagged decision-tree classifier.

    Folds are contiguous and unshuffled, exactly as in the original notebook.
    """
    from sklearn.ensemble import BaggingClassifier
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.tree import DecisionTreeClassifier

    # No random_state here: without shuffle=True it has no effect, and current
    # scikit-learn raises if it is supplied in that case.
    kfold = KFold(n_splits=num_folds)
    cart = DecisionTreeClassifier()
    # The base estimator is passed positionally because its keyword name was
    # renamed (base_estimator -> estimator) between scikit-learn releases.
    model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)
    return cross_val_score(model, X, Y, cv=kfold)


def main():
    """Run the full notebook workflow and return the cross-validation scores."""
    # Finding only csv files in the directory.
    print(find_csv_files())

    dataframe = load_weather()
    dataframe.info()

    dataframe = prepare_features(dataframe)
    X, Y, names = split_features_target(dataframe)

    # Built for parity with the notebook; nothing is shown outside IPython.
    render_regression_tree(X, Y, names)

    # Bagged Decision Trees for Classification
    results = cross_validate_bagging(X, Y)
    print(results.mean())
    return results


if __name__ == "__main__":
    main()