#!/usr/bin/env python
# coding: utf-8

# In[258]:


# The weather dataset loaded below ships with the R package "rattle"
# (see help at http://artax.karlin.mff.cuni.cz/r-help/library/rattle/html/weather.html)


# In[259]:


import os as os


# In[260]:


import pandas as pd


# In[261]:


# Current working directory; relative paths below are resolved against it.
os.getcwd()


# In[262]:


# Everything in the working directory.
os.listdir()


# In[263]:


# Find only the CSV files in the working directory using glob.
import glob

path = os.getcwd()
extension = 'csv'
# glob.glob() already returns a list of names relative to the working
# directory, so no chdir() to the current directory or list copy is needed.
result = glob.glob('*.{}'.format(extension))
print(result)


# In[264]:


# Load the weather data from the working directory.
dataframe = pd.read_csv("weather.csv")


# In[265]:


# First rows, as a quick sanity check of the parse.
dataframe.head()


# In[266]:


# Column dtypes and non-null counts (printed to stdout).
dataframe.info()


# In[267]:


# Drop the 'Unnamed: 0' column (presumably a row index written out with the
# CSV -- confirm against the file). `axis` is passed by keyword because
# pandas 2.0 no longer accepts it positionally.
dataframe = dataframe.drop('Unnamed: 0', axis=1)


# In[268]:


# Summary statistics of the numeric columns. Like the other bare expressions
# in this file, the value is only shown in an interactive/notebook session.
dataframe.describe()


# In[269]:


# Distinct values of the RainTomorrow column.
dataframe['RainTomorrow'].unique()


# In[270]:


# Distinct values of the RainToday column.
dataframe['RainToday'].unique()


# In[271]:


# Distinct values of the Location column.
dataframe['Location'].unique()


# In[272]:


# Distinct values of the Date column.
dataframe['Date'].unique()


# In[273]:


# Bagged Decision Trees for Classification
try:
    from sklearn import cross_validation
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; its
    # successor is sklearn.model_selection. Binding it to the old name keeps
    # this import from aborting the script on current releases. Note that
    # KFold takes different constructor arguments in the two modules.
    from sklearn import model_selection as cross_validation
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree


# In[274]:


# Column labels before pruning.
dataframe.columns


# In[275]:


# Remove, in place, the columns that are not fed to the models below.
dataframe.drop(
    ['Date', 'Location', 'WindDir9am', 'WindSpeed3pm',
     'WindGustDir', 'WindDir3pm', 'RISK_MM'],
    axis=1,
    inplace=True,
)


# In[280]:


# Recode the 'Yes'/'No' strings as 1/0 across the whole frame, then discard
# every row that still contains a missing value.
dataframe = dataframe.replace({'Yes': 1, 'No': 0}).dropna()


# In[282]:


# First rows of the cleaned frame.
dataframe.head()


# In[283]:


# Number of rows left after dropna().
len(dataframe)


# In[284]:


# Number of columns left after the deletions above.
len(dataframe.columns)


# In[285]:


# Keep the column labels; names[:-1] is later used as the feature names
# for the tree export.
names=dataframe.columns
names


# In[286]:


# Summary statistics of the cleaned frame.
dataframe.describe()


# In[287]:


# Confirms the object is still a pandas DataFrame.
type(dataframe)


# In[288]:


# NumPy array holding the values of the cleaned frame.
array = dataframe.values


# In[289]:


# Count of each RainTomorrow value. The Series method is used because the
# top-level pd.value_counts() function is deprecated in current pandas.
dataframe["RainTomorrow"].value_counts()


# In[290]:


array


# In[291]:


# Features are every column except the last; the target is the last column.
# Slicing relative to the end (instead of a hard-coded 16) keeps the split
# in step with the names[:-1] feature labels used for the tree export.
# The explicit dtypes guard against an object-dtype array, whose label
# column scikit-learn classifiers reject as an unknown label type.
X = array[:, :-1].astype(float)
Y = array[:, -1].astype(int)
num_folds = 10
num_instances = len(X)
seed = 7


# In[292]:


# Container type of the feature matrix.
type(X)


# In[293]:


# Feature matrix.
X


# In[294]:


# An alternative way to recode the values, directly on the NumPy array:
#Y[Y == "Yes"] = 1
#Y[Y == "No"] = 0
# Target vector.
Y


# In[295]:


# Depth-3 regression tree fitted on the full data set; fit() returns the
# fitted estimator itself, so construction and fitting are chained.
dtr = tree.DecisionTreeRegressor(max_depth=3).fit(X, Y)


# In[296]:


# from sklearn.metrics import roc_curve, auc


# In[297]:


#!sudo pip install pydotplus
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
# http://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/
# http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/


# In[298]:


#!pip freeze
#checking if we have the right packages


# In[299]:


#!pip install --upgrade pip


# In[300]:


#!pip install pydotplus


# In[301]:


import pydotplus as pydot

from IPython.display import Image

# sklearn.externals.six has been removed from scikit-learn; the standard
# library's StringIO provides the same in-memory text buffer.
from io import StringIO


# In[302]:


# Graphviz
#sudo add-apt-repository ppa:gviz-adm/graphviz-dev
# sudo apt-get update
# http://www.graphviz.org/Download_linux_ubuntu.php


# In[303]:


# In-memory buffer that receives the DOT description of the tree.
dot_data = StringIO()


# In[304]:


# Write the fitted tree as DOT text. Every column label except the last
# (the target) names a feature; a plain list is passed because some
# scikit-learn releases validate feature_names strictly.
tree.export_graphviz(dtr, out_file=dot_data, feature_names=list(names[:-1]))


# In[305]:


# Parse the DOT text into a pydotplus graph object.
graph = pydot.graph_from_dot_data(dot_data.getvalue())


# In[306]:


# Render the graph to PNG bytes (requires the Graphviz binaries) and wrap
# them for inline display.
Image(graph.create_png())


# In[307]:


# sklearn.model_selection supersedes the removed sklearn.cross_validation
# module. Importing the two helpers here keeps this section independent of
# the legacy `cross_validation` name.
from sklearn.model_selection import KFold, cross_val_score

# The legacy KFold(n=..., n_folds=...) call left shuffle at its default of
# False, so random_state had no effect and the folds were consecutive
# slices. The current KFold rejects random_state without shuffle, so it is
# omitted to keep the same unshuffled folds.
kfold = KFold(n_splits=num_folds)
cart = DecisionTreeClassifier()
num_trees = 100
# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2, and the positional
# form works under both names.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)


# In[308]:


model


# In[309]:


kfold


# In[310]:


# One score per fold (the estimator's default scorer); print the mean.
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


# In[311]:


# Per-fold scores.
results