import os
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.offline as py
import plotly.graph_objs as go
from pandas_gbq import read_gbq
# enable ploty jupyter inline
init_notebook_mode(connected=True)
# standard SQL table id syntax
table_id = "`{}.{}.{}`".format(
os.environ['BQ_PROJECT_ID'],
os.environ['BQ_DATASET_ID'],
os.environ['BQ_TABLE_ID']
)
def read_data(query):
df = read_gbq(query,
project_id=os.environ['BQ_PROJECT_ID'],
private_key=os.environ['BQ_PRIVATE_KEY'],
dialect="standard")
return df
# test bq connection, read 10 rows
read_data("SELECT * from {} LIMIT 10".format(table_id))
Requesting query... ok. Job ID: job_a4UyJGwvGPthf7PWJG1Ee_4LA3sM Query running... Query done. Processed: 41.6 KB Standard price: $0.00 USD Retrieving results... Got 10 rows. Total time taken 1.94 s. Finished at 2017-08-21 11:53:28.
task_id | story_id | story_createdat | interval_begin | interval_end | story_point_begin | story_point_delta | comment_count_begin | comment_count_delta | story_rank_begin | story_rank_end | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | S1ATRx_d- | 15060237 | 2017-08-20 21:00:53 | 2017-08-21 06:33:42.454 | 2017-08-21 06:33:57.456 | 61 | 0 | 32 | 0 | 23 | 23 |
1 | S1ATRx_d- | 15054752 | 2017-08-19 18:13:24 | 2017-08-21 06:33:42.454 | 2017-08-21 06:33:57.456 | 151 | 0 | 40 | 0 | 5 | 5 |
2 | S1ATRx_d- | 15058090 | 2017-08-20 12:12:55 | 2017-08-21 06:33:42.454 | 2017-08-21 06:33:57.456 | 76 | 0 | 19 | 0 | 15 | 15 |
3 | S1ATRx_d- | 15060488 | 2017-08-20 22:09:40 | 2017-08-21 06:33:42.454 | 2017-08-21 06:33:57.456 | 67 | 0 | 44 | 0 | 26 | 26 |
4 | S1ATRx_d- | 15058723 | 2017-08-20 15:16:17 | 2017-08-21 06:33:42.454 | 2017-08-21 06:33:57.456 | 224 | 0 | 24 | 0 | 17 | 17 |
5 | rJYnOWuu- | 15060998 | 2017-08-21 00:08:19 | 2017-08-21 07:16:00.662 | 2017-08-21 07:16:15.663 | 31 | 0 | 1 | 0 | 8 | 8 |
6 | rJYnOWuu- | 15058351 | 2017-08-20 13:35:09 | 2017-08-21 07:16:00.662 | 2017-08-21 07:16:15.663 | 520 | 0 | 280 | 0 | 15 | 16 |
7 | rJYnOWuu- | 15057371 | 2017-08-20 07:33:41 | 2017-08-21 07:16:00.662 | 2017-08-21 07:16:15.663 | 242 | 0 | 123 | 0 | 22 | 22 |
8 | rJYnOWuu- | 15054752 | 2017-08-19 18:13:24 | 2017-08-21 07:16:00.662 | 2017-08-21 07:16:15.663 | 152 | 0 | 40 | 0 | 5 | 5 |
9 | rJYnOWuu- | 15061356 | 2017-08-21 01:46:20 | 2017-08-21 07:16:00.662 | 2017-08-21 07:16:15.663 | 7 | 1 | 0 | 0 | 16 | 14 |
Find all entries in the last day.
query_last_24_hour = """
SELECT
*
FROM
{}
WHERE
`interval_begin` >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY)
ORDER BY
`interval_begin`;
""".format(table_id)
df = read_data(query_last_24_hour)
Requesting query... ok. Job ID: job_RBoiCT1EMJJJ_iyJV5AYiwWieMKY Query running... Query done. Processed: 46.7 KB Standard price: $0.00 USD Retrieving results... Got 526 rows. Total time taken 2.89 s. Finished at 2017-08-21 12:50:48.
Plot total story points on HN frontpage and total comment counts over time.
x=total['story_point_begin'].sum().index
traces = [
go.Scatter(
x=x,
y=total['story_point_begin'].sum(),
name="Story Points"
),
go.Scatter(
x=x,
y=total['comment_count_begin'].sum(),
yaxis="y2",
name="Comment Count"
)
]
layout = go.Layout(
title='Frontpage Story Points and Comment Count over time',
yaxis=dict(
title='Story Points'
),
yaxis2=dict(
title='Comment Count',
titlefont=dict(
color='rgb(148, 103, 189)'
),
tickfont=dict(
color='rgb(148, 103, 189)'
),
overlaying='y',
side='right'
)
)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)
Plot story points deltas per sample interval (15 seconds) over time. Unfortunately 15 seconds was too short most entries observes 0 deltas.
total = df.groupby("interval_begin")
x=total['story_point_delta'].sum().index
traces = [
go.Scatter(
x=x,
y=total['story_point_delta'].sum(),
name="Story Points Delta"
),
go.Scatter(
x=x,
y=total['comment_count_delta'].sum(),
yaxis="y2",
name="Comment Count Delta"
)
]
layout = go.Layout(
title='HN Activity per 15 Seconds',
yaxis=dict(
title='Story Points'
),
yaxis2=dict(
title='Comment Count',
titlefont=dict(
color='rgb(148, 103, 189)'
),
tickfont=dict(
color='rgb(148, 103, 189)'
),
overlaying='y',
side='right'
)
)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)
def make_trace(rank):
df_rank = df.query("story_rank_begin == {}".format(rank))
return go.Scatter(
x=df_rank['interval_begin'],
y=df_rank['story_point_begin'],
name="Story Point Rank={}".format(rank)
)
traces = [make_trace(r) for r in [1, 5, 10, 25]]
layout = go.Layout(title="Story Points by Rank over time")
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)
def timedf(x, y):
# in seconds
diff = (x.astype("int64") - y.astype("int64")) / 1000000000
# in minutes
return diff / 60
df['story_age_begin'] = timedf(df['interval_begin'], df['story_createdat'])
traces = [
go.Scatter(
y=df['comment_count_begin'],
x=df['story_age_begin'],
mode="markers",
marker=dict(
color=df['story_rank_begin']
)
)
]
layout = go.Layout(
title="Comment Count vs. Story Age",
xaxis=dict(
title='Story Age (in minutes)'
),
yaxis=dict(
title='Comment Count'
),
)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)
def make_trace(rank):
df_rank = df.query("story_rank_begin == {}".format(rank))
return go.Scatter(
x=df_rank['story_point_begin'],
y=df_rank['comment_count_begin'],
name="Rank={}".format(rank),
mode="markers"
)
traces = [make_trace(r) for r in range(1, 11)]
layout = go.Layout(
title="Comment Count vs. Story Points (by rank)",
xaxis=dict(
title='Story Points'
),
yaxis=dict(
title='Comment Count'
),
)
fig = go.Figure(data=traces, layout=layout)
py.iplot(fig)