In [1]:
import pandas as pd
In [2]:
dataframe = pd.read_csv("JaipurFinalCleanData.csv")
In [3]:
print (dataframe.head())
         date  mean_temperature  max_temperature  min_temperature  \
0  2016-05-04                34               41               27   
1  2016-05-05                31               38               24   
2  2016-05-06                28               34               21   
3  2016-05-07                30               38               23   
4  2016-05-08                34               41               26   

   Mean_dew_pt  mean_pressure  max_humidity  min_humidity  max_dew_pt_1  \
0            6        1006.00            27             5            12   
1            7        1005.65            29             6            13   
2           11        1007.94            61            13            16   
3           13        1008.39            69            18            17   
4           10        1007.62            50             8            14   

   max_dew_pt_2  min_dew_pt_1  min_dew_pt_2  max_pressure_1  max_pressure_2  \
0            10            -2            -2            1009            1008   
1            12             0            -2            1008            1009   
2            13             6             0            1011            1008   
3            16             9             6            1011            1011   
4            17             6             9            1010            1011   

   min_pressure_1  min_pressure_2  rainfall  
0            1000            1001       0.0  
1            1001            1000       0.0  
2            1003            1001       5.0  
3            1004            1003       0.0  
4            1002            1004       0.0  
In [4]:
dataframe.dtypes
Out[4]:
date                 object
mean_temperature      int64
max_temperature       int64
min_temperature       int64
Mean_dew_pt           int64
mean_pressure       float64
max_humidity          int64
min_humidity          int64
max_dew_pt_1          int64
max_dew_pt_2          int64
min_dew_pt_1          int64
min_dew_pt_2          int64
max_pressure_1        int64
max_pressure_2        int64
min_pressure_1        int64
min_pressure_2        int64
rainfall            float64
dtype: object
In [5]:
dataframe = dataframe.drop(["max_dew_pt_2"], axis=1) 
In [6]:
dataframe.head()
Out[6]:
date mean_temperature max_temperature min_temperature Mean_dew_pt mean_pressure max_humidity min_humidity max_dew_pt_1 min_dew_pt_1 min_dew_pt_2 max_pressure_1 max_pressure_2 min_pressure_1 min_pressure_2 rainfall
0 2016-05-04 34 41 27 6 1006.00 27 5 12 -2 -2 1009 1008 1000 1001 0.0
1 2016-05-05 31 38 24 7 1005.65 29 6 13 0 -2 1008 1009 1001 1000 0.0
2 2016-05-06 28 34 21 11 1007.94 61 13 16 6 0 1011 1008 1003 1001 5.0
3 2016-05-07 30 38 23 13 1008.39 69 18 17 9 6 1011 1011 1004 1003 0.0
4 2016-05-08 34 41 26 10 1007.62 50 8 14 6 9 1010 1011 1002 1004 0.0
In [33]:
from sklearn.cluster import KMeans
import numpy as np
cols= np.column_stack((dataframe.iloc[:,1:].values))
km_res = KMeans(n_clusters=3).fit(cols) #excluding date column
In [34]:
km_res.cluster_centers_
Out[34]:
array([[  27.        ,   29.        ,   61.        , ...,   42.        ,
          37.        ,   38.        ],
       [1004.8       , 1004.73      , 1006.188     , ..., 1014.082     ,
        1013.632     , 1013.152     ],
       [  13.44444444,   13.        ,   14.88888889, ...,    8.88888889,
          10.        ,   10.55555556]])
In [35]:
km_res.fit_predict(dataframe.iloc[:,1:])
Out[35]:
array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1,
       2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1,
       2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 2, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2])
In [16]:
km_res.fit_predict(dataframe.iloc[:,1:]).shape
Out[16]:
(676,)
In [17]:
dataframe.iloc[:,1:].shape
Out[17]:
(676, 15)