#Embed the video
from IPython.display import YouTubeVideo
YouTubeVideo('G5pdpXp-mFM')

#The pandas library allows us to work with the data a bit like it's a spreadsheet
import pandas as pd

#Let's preview the data
#This notebook is running on a Linux box, so I can use the Linux head command to preview the first line of the data file
!head -n 1 data/tweet_activity_metrics.csv

#The time column is a date-time element: we can read it in as such
tw=pd.read_csv('data/tweet_activity_metrics.csv',parse_dates=['time'])
#Preview the first few lines of the data
tw[:3]

#We can check the data type of each column
#int64 is an integer, float64 a flowing point number (essentially, one with a decimal point)
#object is a string/text element and datatime64[ns] is something recognised as a datetime.
tw.dtypes

#If we wanted to, we could create a date column from the datetime Tweet timestamps
# by applying pd.Timestamp.date to each datetime 
#To keep the result as a timey thing, we then need to map it back to a datetime element
tw['date']=tw['time'].apply(pd.Timestamp.date).apply(pd.to_datetime)

#We can also pull out just the year
tw['year']=tw['time'].apply(lambda x: x.year)

#Preview a few rows showing just the original time and the new date columns 
tw[['time','date', 'year']][:3]

#Create a new column that identifies whether or not a tweet is a reply

#This function detects a reply as a tweet beginning with an @ sign
def twTyp(t):
    if t.startswith('@'):
        return 'reply'
    return 'standard'

#We can generate the new tweet type column by applying this function to each tweet
tw['type']= tw['Tweet text'].apply(twTyp)

#Let's preview a fragment of the result just to check
tw[['Tweet text','type']][2:5]

#Total impressions
tw['impressions'].sum()

#Find the mean number of impressions - and round to an integer value
tw['impressions'].mean().astype(int)

#We can summarise the results by year:
tw.groupby('year')['impressions'].mean().astype(int)

#Let's look at the means by month, as integer values, previewing just the first three
tw.groupby('date')['impressions'].mean().astype(int)[:3]

#Let's add in month and day of week columns
tw['month']=tw['time'].apply(lambda x: x.month)
tw['dayofweek']=tw['time'].apply(lambda x: x.dayofweek) #Monday=0, Sunday=6
#Another preview
tw[['time','date', 'year','month','dayofweek']][:3]

#Group by month and year
tw.groupby(['year','month'])['impressions'].mean().astype(int)

#Group by day of week by month and year
tw.groupby(['year','month','dayofweek'])['impressions'].mean().astype(int)[:10]

#Group by tweet type, month and year
tw.groupby(['year','month','type'])['impressions'].mean().astype(int)

#pandas also supports pivot tables that have similar settings to Excel pivot tables
#Tghe default operation applied is: mean
pt=tw.pivot_table( values='impressions', index=['year','month'], columns=['type'])
#Tweak the display setting to show integers
pd.set_option("display.precision",1)
pt

#Use the pivot table construction to show stats by month and type for each day of week
#We can also add in marginal total values
tw.pivot_table( values='impressions', index=['year','month'], columns=['dayofweek','type'], margins=True)

#The previous view shows how the reply stats skew the overall (All) averages across the rows,
# though the column summaries are a bit more meangiful.
#We can make the row summaries clearer if we tease the type out as a row variable...
tw.pivot_table( values='impressions', index=['year','month','type'], columns=['dayofweek'], margins=True)

#Reset the display setting to show floats
pd.set_option("display.precision",7)