Based on data from https://www.e-control.at/ for district Mödling.
import pandas as pd
import altair as alt
URL = 'https://raw.githubusercontent.com/Datenspieler/notebooks_for_blog/master/2020-treibstoff/treibstoff.csv'
df = pd.read_csv(URL)
df.date = pd.to_datetime(df.date)
df.tail()
date | id | price | type | |
---|---|---|---|---|
7288 | 2020-07-21 00:20:02 | 218712 | 0.999 | SUP |
7289 | 2020-07-21 00:20:02 | 224519 | 1.003 | SUP |
7290 | 2020-07-21 00:20:02 | 1354761 | 1.004 | SUP |
7291 | 2020-07-21 00:20:02 | 665614 | 1.013 | SUP |
7292 | 2020-07-21 00:20:02 | 1354799 | 1.013 | SUP |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7293 entries, 0 to 7292 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 7293 non-null datetime64[ns] 1 id 7293 non-null int64 2 price 7293 non-null float64 3 type 7293 non-null object dtypes: datetime64[ns](1), float64(1), int64(1), object(1) memory usage: 228.0+ KB
print(df.date.min())
print(df.date.max())
2020-06-23 22:20:02 2020-07-21 00:20:02
df['min'] = df.groupby(['type', 'date'])[['price']].transform('min')
df['mean'] = df.groupby(['type', 'date'])[['price']].transform('mean')
Since for this analysis only interested in minimum/mean price, not what gas station is offering that price, can drop a lot of data.
df = df[['date', 'type', 'min', 'mean']].drop_duplicates()
df.head(3)
date | type | min | mean | |
---|---|---|---|---|
0 | 2020-06-23 22:20:02 | DIE | 0.949 | 0.9546 |
5 | 2020-06-23 22:20:02 | SUP | 0.999 | 1.0046 |
10 | 2020-06-23 23:20:01 | DIE | 0.949 | 0.9546 |
chart_zero = alt.Chart(df).mark_line().encode(
x='date',
y=alt.Y(alt.repeat("column"), type='quantitative'),
color='type',
strokeDash='type',
).repeat(
column=['min', 'mean']
)
chart_zero
Some movement, escepcially if we look at the mean price. We can make it look more spectacular, if we don't let the price start at 0 - great trick and bad visualization practive at the same time 😉
chart_not_zero = alt.Chart(df).mark_line().encode(
x='date',
y=alt.Y(alt.repeat("column"), type='quantitative', scale=alt.Scale(zero=False)),
color='type',
strokeDash='type',
).repeat(
column=['min', 'mean']
)
chart_not_zero
To 'correct' for long term trend, let's take the 1 week (24 x 7 measurements) average.
df1 = df.set_index(['type', 'date']).unstack('type')
df1.columns = ['_'.join(col[::-1]).strip() for col in df1.columns.values]
df1.head()
DIE_min | SUP_min | DIE_mean | SUP_mean | |
---|---|---|---|---|
date | ||||
2020-06-23 22:20:02 | 0.949 | 0.999 | 0.954600 | 1.004600 |
2020-06-23 23:20:01 | 0.949 | 0.999 | 0.954600 | 1.004600 |
2020-06-24 00:20:02 | 0.949 | 0.999 | 0.958857 | 1.008857 |
2020-06-24 01:20:02 | 0.949 | 0.999 | 0.958857 | 1.008857 |
2020-06-24 02:20:01 | 0.949 | 0.999 | 0.958857 | 1.008857 |
df_delta = df1 - df1.rolling(7*24, center = True).mean()
df_delta = df_delta * 100 #Let's switch to Cent values
df_delta.head() # of course the first entries are NaN
DIE_min | SUP_min | DIE_mean | SUP_mean | |
---|---|---|---|---|
date | ||||
2020-06-23 22:20:02 | NaN | NaN | NaN | NaN |
2020-06-23 23:20:01 | NaN | NaN | NaN | NaN |
2020-06-24 00:20:02 | NaN | NaN | NaN | NaN |
2020-06-24 01:20:02 | NaN | NaN | NaN | NaN |
2020-06-24 02:20:01 | NaN | NaN | NaN | NaN |
df_delta.tail()
DIE_min | SUP_min | DIE_mean | SUP_mean | |
---|---|---|---|---|
date | ||||
2020-07-20 20:20:01 | NaN | NaN | NaN | NaN |
2020-07-20 21:20:02 | NaN | NaN | NaN | NaN |
2020-07-20 22:20:01 | NaN | NaN | NaN | NaN |
2020-07-20 23:20:02 | NaN | NaN | NaN | NaN |
2020-07-21 00:20:02 | NaN | NaN | NaN | NaN |
df_delta.describe()
DIE_min | SUP_min | DIE_mean | SUP_mean | |
---|---|---|---|---|
count | 484.000000 | 484.000000 | 484.000000 | 484.000000 |
mean | -0.006018 | -0.055542 | -0.034468 | -0.071316 |
std | 0.523949 | 0.631228 | 0.824300 | 1.033422 |
min | -1.725000 | -1.467262 | -2.210951 | -2.151137 |
25% | -0.088690 | -0.322917 | -0.418081 | -0.688466 |
50% | 0.080952 | 0.155357 | -0.061518 | -0.036823 |
75% | 0.200000 | 0.356250 | 0.280230 | 0.390915 |
max | 0.989286 | 0.911310 | 2.885819 | 3.697436 |
df_delta = df_delta.reset_index()
df_chart = df_delta.groupby([df_delta.date.dt.day_name(), df_delta.date.dt.hour]).mean()#.stack()
df_chart.index.names = ['Day of week', 'Hour']
df_chart
DIE_min | SUP_min | DIE_mean | SUP_mean | ||
---|---|---|---|---|---|
Day of week | Hour | ||||
Friday | 0 | 0.006151 | 0.375000 | 0.063581 | 0.399897 |
1 | -0.001190 | 0.373016 | 0.059726 | 0.398581 | |
2 | -0.008532 | 0.371032 | 0.053542 | 0.397589 | |
3 | -0.015873 | 0.369048 | 0.046916 | 0.396549 | |
4 | -0.023214 | 0.367063 | 0.039693 | 0.392836 | |
... | ... | ... | ... | ... | ... |
Wednesday | 19 | 0.201984 | 0.295040 | -0.018503 | -0.056196 |
20 | 0.195833 | 0.292659 | 0.023958 | -0.058444 | |
21 | 0.189683 | 0.290278 | 0.030862 | 0.063117 | |
22 | 0.183532 | 0.287698 | -0.037788 | 0.366133 | |
23 | 0.177381 | 0.285119 | -0.044204 | 0.363513 |
168 rows × 4 columns
df_chart = df_chart.reset_index()
df_chart.describe()
Hour | DIE_min | SUP_min | DIE_mean | SUP_mean | |
---|---|---|---|---|---|
count | 168.000000 | 168.000000 | 168.000000 | 168.000000 | 168.000000 |
mean | 11.500000 | 0.014533 | -0.035066 | -0.018058 | -0.054979 |
std | 6.942881 | 0.462745 | 0.542503 | 0.747398 | 0.945292 |
min | 0.000000 | -1.290079 | -1.226190 | -1.605849 | -1.867506 |
25% | 5.750000 | -0.003026 | -0.367857 | -0.307451 | -0.750507 |
50% | 11.500000 | 0.129663 | 0.173810 | -0.071218 | 0.063262 |
75% | 17.250000 | 0.289087 | 0.359623 | 0.311333 | 0.393182 |
max | 23.000000 | 0.866667 | 0.507143 | 2.326624 | 2.936946 |
df_chart.select_dtypes('number').max() - df_chart.select_dtypes('number').min()
Hour 23.000000 DIE_min 2.156746 SUP_min 1.733333 DIE_mean 3.932473 SUP_mean 4.804451 dtype: float64
for (col, title_part) in [('DIE_min','diesel, cheapest gas station') ,
('DIE_mean', 'diesel, mean of 5 cheapest gas stations'),
('SUP_min','premium gas, cheapest gas station') ,
('SUP_mean', 'premium gas, mean of 5 cheapest gas stations')]:
chart = alt.Chart(df_chart).mark_rect().encode(
x='Hour:O',
y=alt.Y('Day of week:O', sort=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']),
color=alt.Color(col, type='quantitative', title='Cent',
scale=alt.Scale(scheme='redyellowgreen', domainMid = 0, domain = [-2, 2], reverse=True))
).properties(
title='Difference in gasoline price - ' + title_part
)
chart.save(col + '.svg')
display(chart)