# Import Pandas to read file
import pandas as pd
# Load the hourly traffic records into a DataFrame, then print a summary of
# its columns, dtypes and non-null counts.
traffic_flow = pd.read_csv(
    filepath_or_buffer='Metro_Interstate_Traffic_Volume.csv',
)
traffic_flow.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 48204 entries, 0 to 48203 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 holiday 48204 non-null object 1 temp 48204 non-null float64 2 rain_1h 48204 non-null float64 3 snow_1h 48204 non-null float64 4 clouds_all 48204 non-null int64 5 weather_main 48204 non-null object 6 weather_description 48204 non-null object 7 date_time 48204 non-null object 8 traffic_volume 48204 non-null int64 dtypes: float64(3), int64(2), object(4) memory usage: 3.3+ MB
# Import Matplotlib's pyplot for plotting; the '%matplotlib inline' magic renders the figures inside the Jupyter notebook
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of traffic volume over the whole dataset, followed by its
# summary statistics (count, mean, std, quartiles, min/max).
traffic_flow['traffic_volume'].plot(kind='hist')
traffic_flow['traffic_volume'].describe()
count 48204.000000 mean 3259.818355 std 1986.860670 min 0.000000 25% 1193.000000 50% 3380.000000 75% 4933.000000 max 7280.000000 Name: traffic_volume, dtype: float64
# Parse the timestamp strings so the .dt accessor (hour, month, ...) can be used.
traffic_flow['date_time'] = pd.to_datetime(traffic_flow['date_time'])

# Split the records by hour of day:
#   day   -> 07:00-18:59
#   night -> 19:00-06:59 (wraps around midnight)
hour_of_day = traffic_flow['date_time'].dt.hour
is_day_hour = (hour_of_day >= 7) & (hour_of_day < 19)
# The night mask must cover both sides of midnight; selecting only hours
# 19-23 would silently drop every record from 00:00 to 06:59.
is_night_hour = (hour_of_day >= 19) | (hour_of_day < 7)

# .copy() makes each subset an independent DataFrame, so columns added to it
# later do not trigger pandas' SettingWithCopyWarning.
day_time = traffic_flow[is_day_hour].copy()
night_time = traffic_flow[is_night_hour].copy()
# Preview the first five daytime records.
day_time.head(5)
holiday | temp | rain_1h | snow_1h | clouds_all | weather_main | weather_description | date_time | traffic_volume | |
---|---|---|---|---|---|---|---|---|---|
0 | None | 288.28 | 0.0 | 0.0 | 40 | Clouds | scattered clouds | 2012-10-02 09:00:00 | 5545 |
1 | None | 289.36 | 0.0 | 0.0 | 75 | Clouds | broken clouds | 2012-10-02 10:00:00 | 4516 |
2 | None | 289.58 | 0.0 | 0.0 | 90 | Clouds | overcast clouds | 2012-10-02 11:00:00 | 4767 |
3 | None | 290.13 | 0.0 | 0.0 | 90 | Clouds | overcast clouds | 2012-10-02 12:00:00 | 5026 |
4 | None | 291.14 | 0.0 | 0.0 | 75 | Clouds | broken clouds | 2012-10-02 13:00:00 | 4918 |
# Preview the first five nighttime records.
night_time.head(5)
holiday | temp | rain_1h | snow_1h | clouds_all | weather_main | weather_description | date_time | traffic_volume | |
---|---|---|---|---|---|---|---|---|---|
10 | None | 290.97 | 0.0 | 0.0 | 20 | Clouds | few clouds | 2012-10-02 19:00:00 | 3539 |
11 | None | 289.38 | 0.0 | 0.0 | 1 | Clear | sky is clear | 2012-10-02 20:00:00 | 2784 |
12 | None | 288.61 | 0.0 | 0.0 | 1 | Clear | sky is clear | 2012-10-02 21:00:00 | 2361 |
13 | None | 287.16 | 0.0 | 0.0 | 1 | Clear | sky is clear | 2012-10-02 22:00:00 | 1529 |
14 | None | 285.45 | 0.0 | 0.0 | 1 | Clear | sky is clear | 2012-10-02 23:00:00 | 963 |
# Draw the day and night traffic-volume histograms side by side, using the
# same axis limits on both panels so the two distributions are comparable.
plt.figure(figsize=(10, 10))
for position, (subset, heading) in enumerate(
        [(day_time, 'Day Time Traffic Volume'),
         (night_time, 'Night Time Traffic Volume')],
        start=1):
    plt.subplot(1, 2, position)
    plt.hist(subset['traffic_volume'])
    plt.title(heading)
    plt.xlim([0, 7500])
    plt.ylim([0, 8000])
# Summary statistics of traffic volume for the full dataset.
# NOTE(review): this repeats the earlier describe() call on the same column;
# describing day_time / night_time here may have been the intent - confirm.
traffic_flow.loc[:, 'traffic_volume'].describe()
count 48204.000000 mean 3259.818355 std 1986.860670 min 0.000000 25% 1193.000000 50% 3380.000000 75% 4933.000000 max 7280.000000 Name: traffic_volume, dtype: float64
# Average daytime traffic per calendar month.
# assign() returns a new DataFrame instead of writing into what may be a
# slice of traffic_flow, which avoids pandas' SettingWithCopyWarning.
day_time = day_time.assign(month=day_time['date_time'].dt.month)
# numeric_only=True restricts the mean to numeric columns; the text columns
# (holiday, weather_main, weather_description) cannot be averaged and make
# recent pandas versions raise a TypeError instead of dropping them.
by_month = day_time.groupby('month').mean(numeric_only=True)
by_month['traffic_volume']
<ipython-input-6-0d0913a858f7>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
month 1 4495.613727 2 4711.198394 3 4889.409560 4 4906.894305 5 4911.121609 6 4898.019566 7 4595.035744 8 4928.302035 9 4870.783145 10 4921.234922 11 4704.094319 12 4374.834566 Name: traffic_volume, dtype: float64
# Line plot of the mean daytime traffic volume (y) against the month number (x).
plt.plot(by_month.index, by_month['traffic_volume'])
plt.show()
# Average daytime traffic per day of the week.
# assign() returns a new DataFrame instead of writing into what may be a
# slice of traffic_flow, which avoids pandas' SettingWithCopyWarning.
day_time = day_time.assign(dayofweek=day_time['date_time'].dt.dayofweek)
# numeric_only=True restricts the mean to numeric columns; the text columns
# (holiday, weather_main, weather_description) cannot be averaged and make
# recent pandas versions raise a TypeError instead of dropping them.
by_dayofweek = day_time.groupby('dayofweek').mean(numeric_only=True)
by_dayofweek['traffic_volume']  # 0 is Monday, 6 is Sunday
<ipython-input-8-e42a3e605df7>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
dayofweek 0 4893.551286 1 5189.004782 2 5284.454282 3 5311.303730 4 5291.600829 5 3927.249558 6 3436.541789 Name: traffic_volume, dtype: float64
# Line plot of the mean daytime traffic volume (y) against the weekday
# number (x), where 0 is Monday and 6 is Sunday.
plt.plot(by_dayofweek.index, by_dayofweek['traffic_volume'])
plt.show()
# Average daytime traffic per hour, computed separately for business days
# and weekends. Relies on the 'dayofweek' column already present on day_time.
# assign() returns a new DataFrame instead of writing into what may be a
# slice of traffic_flow, which avoids pandas' SettingWithCopyWarning.
day_time = day_time.assign(hour=day_time['date_time'].dt.hour)

# Filter first and copy afterwards, so only the selected rows are duplicated
# rather than the whole frame.
bussiness_days = day_time[day_time['dayofweek'] <= 4].copy()  # 4 == Friday
weekend = day_time[day_time['dayofweek'] >= 5].copy()  # 5 == Saturday

# numeric_only=True restricts the mean to numeric columns; the text columns
# (holiday, weather_main, weather_description) cannot be averaged and make
# recent pandas versions raise a TypeError instead of dropping them.
by_hour_business = bussiness_days.groupby('hour').mean(numeric_only=True)
by_hour_weekend = weekend.groupby('hour').mean(numeric_only=True)

print(by_hour_business['traffic_volume'])
print(by_hour_weekend['traffic_volume'])
hour 7 6030.413559 8 5503.497970 9 4895.269257 10 4378.419118 11 4633.419470 12 4855.382143 13 4859.180473 14 5152.995778 15 5592.897768 16 6189.473647 17 5784.827133 18 4434.209431 Name: traffic_volume, dtype: float64 hour 7 1589.365894 8 2338.578073 9 3111.623917 10 3686.632302 11 4044.154955 12 4372.482883 13 4362.296564 14 4358.543796 15 4342.456881 16 4339.693805 17 4151.919929 18 3811.792279 Name: traffic_volume, dtype: float64
<ipython-input-32-634df4e6b376>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Hourly mean traffic volume, business days (left) versus weekends (right).
# Both panels span hours 7-18; note that the y-axis upper limits differ.
plt.figure(figsize=(8, 3))

# Left panel: business days.
plt.subplot(1, 2, 1)
plt.title('Business Days Traffic Volume')
plt.plot(by_hour_business['traffic_volume'])
plt.xlim(7, 18)
plt.ylim(1500, 6300)

# Right panel: weekends.
plt.subplot(1, 2, 2)
plt.title('WeekEnds Traffic Volume')
plt.plot(by_hour_weekend['traffic_volume'])
plt.xlim(7, 18)
plt.ylim(1500, 4500)
(1500.0, 4500.0)
# Pearson correlation between daytime traffic volume and temperature.
# Only numeric columns are correlated here: weather_main and
# weather_description hold text, so Series.corr() does not apply to them.
corr_temp = day_time['traffic_volume'].corr(
    day_time['temp'],
    method='pearson',
)
print("Temp vs Traffic Volume:", corr_temp)
Temp vs Traffic Volume: 0.12831656852303905
# Pearson correlation between daytime traffic volume and the snow_1h column.
corr_snow = day_time['traffic_volume'].corr(
    day_time['snow_1h'],
    method='pearson',
)
print("Snow vs Traffic Volume:", corr_snow)
Snow vs Traffic Volume: 0.001265349847175986
# Pearson correlation between daytime traffic volume and the clouds_all column.
corr_clouds = day_time['traffic_volume'].corr(
    day_time['clouds_all'],
    method='pearson',
)
print("Clouds vs Traffic Volume:", corr_clouds)
Clouds vs Traffic Volume: -0.03293234866163948
# Pearson correlation between daytime traffic volume and the rain_1h column.
corr_rain = day_time['traffic_volume'].corr(
    day_time['rain_1h'],
    method='pearson',
)
print("Rain vs Traffic Volume:", corr_rain)
Rain vs Traffic Volume: 0.0036965098350734324
# Scatter plot of daytime traffic volume (y) against temperature (x).
plt.scatter(day_time['temp'], day_time['traffic_volume'])
plt.xlabel('Temperature')
plt.ylabel('Traffic Volume')
plt.title('Traffic Flow vs. Temp')
plt.show()
# Average the numeric daytime columns per weather category, at two levels of
# detail: the weather_main column and the weather_description column.
# numeric_only=True restricts the mean to numeric columns; the remaining text
# columns cannot be averaged and make recent pandas versions raise a
# TypeError instead of dropping them.
by_weather_main = day_time.groupby('weather_main').mean(numeric_only=True)
by_weather_description = (
    day_time.groupby('weather_description').mean(numeric_only=True)
)
# Horizontal bar chart: one bar per weather_main value.
by_weather_main['traffic_volume'].plot.barh()
<matplotlib.axes._subplots.AxesSubplot at 0x7ff3c65e0610>
# Horizontal bar chart of mean daytime traffic volume, one bar per
# weather_description value.
by_weather_description['traffic_volume'].plot(kind='barh', figsize=(6, 9))
<matplotlib.axes._subplots.AxesSubplot at 0x7ff3c6254f70>