#Embed the video from IPython.display import YouTubeVideo YouTubeVideo('G5pdpXp-mFM') #The pandas library allows us to work with the data a bit like it's a spreadsheet import pandas as pd #Let's preview the data #This notebook is running on a Linux box, so I can use the Linux head command to preview the first line of the data file !head -n 1 data/tweet_activity_metrics.csv #The time column is a date-time element: we can read it in as such tw=pd.read_csv('data/tweet_activity_metrics.csv',parse_dates=['time']) #Preview the first few lines of the data tw[:3] #We can check the data type of each column #int64 is an integer, float64 a flowing point number (essentially, one with a decimal point) #object is a string/text element and datatime64[ns] is something recognised as a datetime. tw.dtypes #If we wanted to, we could create a date column from the datetime Tweet timestamps # by applying pd.Timestamp.date to each datetime #To keep the result as a timey thing, we then need to map it back to a datetime element tw['date']=tw['time'].apply(pd.Timestamp.date).apply(pd.to_datetime) #We can also pull out just the year tw['year']=tw['time'].apply(lambda x: x.year) #Preview a few rows showing just the original time and the new date columns tw[['time','date', 'year']][:3] #Create a new column that identifies whether or not a tweet is a reply #This function detects a reply as a tweet beginning with an @ sign def twTyp(t): if t.startswith('@'): return 'reply' return 'standard' #We can generate the new tweet type column by applying this function to each tweet tw['type']= tw['Tweet text'].apply(twTyp) #Let's preview a fragment of the result just to check tw[['Tweet text','type']][2:5] #Total impressions tw['impressions'].sum() #Find the mean number of impressions - and round to an integer value tw['impressions'].mean().astype(int) #We can summarise the results by year: tw.groupby('year')['impressions'].mean().astype(int) #Let's look at the means by month, as integer values, previewing just the first three tw.groupby('date')['impressions'].mean().astype(int)[:3] #Let's add in month and day of week columns tw['month']=tw['time'].apply(lambda x: x.month) tw['dayofweek']=tw['time'].apply(lambda x: x.dayofweek) #Monday=0, Sunday=6 #Another preview tw[['time','date', 'year','month','dayofweek']][:3] #Group by month and year tw.groupby(['year','month'])['impressions'].mean().astype(int) #Group by day of week by month and year tw.groupby(['year','month','dayofweek'])['impressions'].mean().astype(int)[:10] #Group by tweet type, month and year tw.groupby(['year','month','type'])['impressions'].mean().astype(int) #pandas also supports pivot tables that have similar settings to Excel pivot tables #Tghe default operation applied is: mean pt=tw.pivot_table( values='impressions', index=['year','month'], columns=['type']) #Tweak the display setting to show integers pd.set_option("display.precision",1) pt #Use the pivot table construction to show stats by month and type for each day of week #We can also add in marginal total values tw.pivot_table( values='impressions', index=['year','month'], columns=['dayofweek','type'], margins=True) #The previous view shows how the reply stats skew the overall (All) averages across the rows, # though the column summaries are a bit more meangiful. #We can make the row summaries clearer if we tease the type out as a row variable... tw.pivot_table( values='impressions', index=['year','month','type'], columns=['dayofweek'], margins=True) #Reset the display setting to show floats pd.set_option("display.precision",7)