#!/usr/bin/env python # coding: utf-8 # In[15]: import os import pandas as pd from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot import plotly.offline as py import plotly.graph_objs as go from pandas_gbq import read_gbq # In[2]: # enable ploty jupyter inline init_notebook_mode(connected=True) # In[3]: # standard SQL table id syntax table_id = "`{}.{}.{}`".format( os.environ['BQ_PROJECT_ID'], os.environ['BQ_DATASET_ID'], os.environ['BQ_TABLE_ID'] ) def read_data(query): df = read_gbq(query, project_id=os.environ['BQ_PROJECT_ID'], private_key=os.environ['BQ_PRIVATE_KEY'], dialect="standard") return df # In[4]: # test bq connection, read 10 rows read_data("SELECT * from {} LIMIT 10".format(table_id)) # ## Load Data From Simple SQL Query # # Find all entries in the last day. # In[55]: query_last_24_hour = """ SELECT * FROM {} WHERE `interval_begin` >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 1 DAY) ORDER BY `interval_begin`; """.format(table_id) df = read_data(query_last_24_hour) # Plot total story points on HN frontpage and total comment counts over time. # In[57]: x=total['story_point_begin'].sum().index traces = [ go.Scatter( x=x, y=total['story_point_begin'].sum(), name="Story Points" ), go.Scatter( x=x, y=total['comment_count_begin'].sum(), yaxis="y2", name="Comment Count" ) ] layout = go.Layout( title='Frontpage Story Points and Comment Count over time', yaxis=dict( title='Story Points' ), yaxis2=dict( title='Comment Count', titlefont=dict( color='rgb(148, 103, 189)' ), tickfont=dict( color='rgb(148, 103, 189)' ), overlaying='y', side='right' ) ) fig = go.Figure(data=traces, layout=layout) py.iplot(fig) # Plot story points deltas per sample interval (15 seconds) over time. Unfortunately 15 seconds was too short most entries observes 0 deltas. # In[56]: total = df.groupby("interval_begin") x=total['story_point_delta'].sum().index traces = [ go.Scatter( x=x, y=total['story_point_delta'].sum(), name="Story Points Delta" ), go.Scatter( x=x, y=total['comment_count_delta'].sum(), yaxis="y2", name="Comment Count Delta" ) ] layout = go.Layout( title='HN Activity per 15 Seconds', yaxis=dict( title='Story Points' ), yaxis2=dict( title='Comment Count', titlefont=dict( color='rgb(148, 103, 189)' ), tickfont=dict( color='rgb(148, 103, 189)' ), overlaying='y', side='right' ) ) fig = go.Figure(data=traces, layout=layout) py.iplot(fig) # In[65]: def make_trace(rank): df_rank = df.query("story_rank_begin == {}".format(rank)) return go.Scatter( x=df_rank['interval_begin'], y=df_rank['story_point_begin'], name="Story Point Rank={}".format(rank) ) traces = [make_trace(r) for r in [1, 5, 10, 25]] layout = go.Layout(title="Story Points by Rank over time") fig = go.Figure(data=traces, layout=layout) py.iplot(fig) # In[108]: def timedf(x, y): # in seconds diff = (x.astype("int64") - y.astype("int64")) / 1000000000 # in minutes return diff / 60 df['story_age_begin'] = timedf(df['interval_begin'], df['story_createdat']) traces = [ go.Scatter( y=df['comment_count_begin'], x=df['story_age_begin'], mode="markers", marker=dict( color=df['story_rank_begin'] ) ) ] layout = go.Layout( title="Comment Count vs. Story Age", xaxis=dict( title='Story Age (in minutes)' ), yaxis=dict( title='Comment Count' ), ) fig = go.Figure(data=traces, layout=layout) py.iplot(fig) # In[115]: def make_trace(rank): df_rank = df.query("story_rank_begin == {}".format(rank)) return go.Scatter( x=df_rank['story_point_begin'], y=df_rank['comment_count_begin'], name="Rank={}".format(rank), mode="markers" ) traces = [make_trace(r) for r in range(1, 11)] layout = go.Layout( title="Comment Count vs. Story Points (by rank)", xaxis=dict( title='Story Points' ), yaxis=dict( title='Comment Count' ), ) fig = go.Figure(data=traces, layout=layout) py.iplot(fig)