Click here to Interact with this code on nbViewer
import pandas as pd
Fetch the CSV file directly from GitHub
# Load the weather dataset straight from the GitHub repository and preview it.
DATA_URL = (
    "https://raw.githubusercontent.com/ujwalnk/MachineLearning101"
    "/main/data/01%20Weather%20Data.csv"
)
dataframe = pd.read_csv(DATA_URL)
dataframe.head()
Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2008-12-01 | Albury | 13.4 | 22.9 | 0.6 | NaN | NaN | W | 44.0 | W | ... | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 | No | No |
1 | 2008-12-02 | Albury | 7.4 | 25.1 | 0.0 | NaN | NaN | WNW | 44.0 | NNW | ... | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 | No | No |
2 | 2008-12-03 | Albury | 12.9 | 25.7 | 0.0 | NaN | NaN | WSW | 46.0 | W | ... | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 | No | No |
3 | 2008-12-04 | Albury | 9.2 | 28.0 | 0.0 | NaN | NaN | NE | 24.0 | SE | ... | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 | No | No |
4 | 2008-12-05 | Albury | 17.5 | 32.3 | 1.0 | NaN | NaN | W | 41.0 | ENE | ... | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 | No | No |
5 rows × 23 columns
# Discard every row that contains at least one missing value, then
# confirm: no NaNs remain and every column has the same row count.
dataframe = dataframe.dropna(axis=0)
dataframe.isnull().sum(), dataframe.count()
(Date 0 Location 0 MinTemp 0 MaxTemp 0 Rainfall 0 Evaporation 0 Sunshine 0 WindGustDir 0 WindGustSpeed 0 WindDir9am 0 WindDir3pm 0 WindSpeed9am 0 WindSpeed3pm 0 Humidity9am 0 Humidity3pm 0 Pressure9am 0 Pressure3pm 0 Cloud9am 0 Cloud3pm 0 Temp9am 0 Temp3pm 0 RainToday 0 RainTomorrow 0 dtype: int64, Date 56420 Location 56420 MinTemp 56420 MaxTemp 56420 Rainfall 56420 Evaporation 56420 Sunshine 56420 WindGustDir 56420 WindGustSpeed 56420 WindDir9am 56420 WindDir3pm 56420 WindSpeed9am 56420 WindSpeed3pm 56420 Humidity9am 56420 Humidity3pm 56420 Pressure9am 56420 Pressure3pm 56420 Cloud9am 56420 Cloud3pm 56420 Temp9am 56420 Temp3pm 56420 RainToday 56420 RainTomorrow 56420 dtype: int64)
Drop Unnecessary Columns
dataframe = dataframe.drop("Date", axis=1)
Remove duplicate rows, sort by the target, and check the class distribution
# Deduplicate the rows, order them by the target label, and inspect
# the class balance of RainTomorrow.
dataframe = dataframe.drop_duplicates().sort_values("RainTomorrow", ascending=True)
dataframe["RainTomorrow"].value_counts()
No 43993 Yes 12427 Name: RainTomorrow, dtype: int64
from sklearn.model_selection import train_test_split

# Import label encoder
from sklearn import preprocessing

# Label-encode the target column: "No" -> 0, "Yes" -> 1 (alphabetical order).
label_encoder = preprocessing.LabelEncoder()
dataframe["RainTomorrow"] = label_encoder.fit_transform(dataframe["RainTomorrow"])

# Separate the target from the features, then one-hot encode the remaining
# categorical columns (Location, wind directions, RainToday, ...).
# NOTE: the previous chained assignment `X = dataframe = pd.get_dummies(...)`
# silently rebound `dataframe` to the feature matrix — keep `dataframe` intact.
y = dataframe["RainTomorrow"]
X = pd.get_dummies(dataframe.drop("RainTomorrow", axis=1))

# Hold out 20% of the rows for testing. `stratify=y` preserves the imbalanced
# (~78/22) class ratio in both splits; `random_state` makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
Since every row is labelled, we hold out a separate test split for evaluation
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((45136, 92), (11284, 92), (45136,), (11284,))
The shapes match, so start training the model
# Train a random forest on the training split. Importing the class under the
# lowercase alias `clf` was misleading — by convention `clf` names a fitted
# estimator instance, not the estimator class — so use the real class name.
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
RandomForestClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier()
rf_model.score(X_test, y_test)
0.8645870258773485