Simple Twitter Export Analysis¶

A reworking of @mediaczar's demo shown here: https://www.youtube.com/watch?v=G5pdpXp-mFM

In [9]:

#Embed the video
from IPython.display import YouTubeVideo
YouTubeVideo('G5pdpXp-mFM')

Out[9]:

In [2]:

#The pandas library allows us to work with the data a bit like it's a spreadsheet
import pandas as pd

In [4]:

#Let's preview the data
#This notebook is running on a Linux box, so I can use the Linux head command to preview the first line of the data file
!head -n 1 data/tweet_activity_metrics.csv

"Tweet id","Tweet permalink","Tweet text","time","impressions","engagements","engagement rate","retweets","replies","favorites","user profile clicks","url clicks","hashtag clicks","detail expands","permalink clicks","embedded media clicks","app opens","app install attempts","follows","email tweet","dial phone","promoted impressions","promoted engagements","promoted engagement rate","promoted retweets","promoted replies","promoted favorites","promoted user profile clicks","promoted url clicks","promoted hashtag clicks","promoted detail expands","promoted permalink clicks","promoted embedded media clicks","promoted app opens","promoted app install attempts","promoted follows","promoted email tweet","promoted dial phone"

In [56]:

#The time column is a date-time element: we can read it in as such
tw=pd.read_csv('data/tweet_activity_metrics.csv',parse_dates=['time'])
#Preview the first few lines of the data
tw[:3]

Out[56]:

	Tweet id	Tweet permalink	Tweet text	time	impressions	engagements	engagement rate	replies	...	promoted url clicks	promoted hashtag clicks	promoted detail expands	promoted permalink clicks	promoted embedded media clicks	promoted app opens	promoted app install attempts	promoted follows	promoted email tweet	promoted dial phone
0	511803466948554752	https://twitter.com/psychemedia/status/5118034...	@stormpetrel you should do - twitter doesn’t f...	2014-09-16 09:07:00	5	0	0.000000	0	...	-	-	-	-	-	-	-	-	-	-
1	511788367768743936	https://twitter.com/psychemedia/status/5117883...	@stormpetrel Have you tried using lists or do ...	2014-09-16 08:07:00	4	3	0.750000	2	...	-	-	-	-	-	-	-	-	-	-
2	511788201787551744	https://twitter.com/psychemedia/status/5117882...	@stormpetrel in twitter, your friend choices a...	2014-09-16 08:06:00	3	2	0.666667	2	...	-	-	-	-	-	-	-	-	-	-

3 rows × 38 columns

In [57]:

#We can check the data type of each column
#int64 is an integer, float64 a flowing point number (essentially, one with a decimal point)
#object is a string/text element and datatime64[ns] is something recognised as a datetime.
tw.dtypes

Out[57]:

Tweet id                                   int64
Tweet permalink                           object
Tweet text                                object
time                              datetime64[ns]
impressions                                int64
engagements                                int64
engagement rate                          float64
retweets                                   int64
replies                                    int64
favorites                                  int64
user profile clicks                        int64
url clicks                                 int64
hashtag clicks                             int64
detail expands                             int64
permalink clicks                           int64
embedded media clicks                      int64
app opens                                  int64
app install attempts                       int64
follows                                    int64
email tweet                                int64
dial phone                                 int64
promoted impressions                      object
promoted engagements                      object
promoted engagement rate                  object
promoted retweets                         object
promoted replies                          object
promoted favorites                        object
promoted user profile clicks              object
promoted url clicks                       object
promoted hashtag clicks                   object
promoted detail expands                   object
promoted permalink clicks                 object
promoted embedded media clicks            object
promoted app opens                        object
promoted app install attempts             object
promoted follows                          object
promoted email tweet                      object
promoted dial phone                       object
dtype: object

In [60]:

#If we wanted to, we could create a date column from the datetime Tweet timestamps
# by applying pd.Timestamp.date to each datetime 
#To keep the result as a timey thing, we then need to map it back to a datetime element
tw['date']=tw['time'].apply(pd.Timestamp.date).apply(pd.to_datetime)

#We can also pull out just the year
tw['year']=tw['time'].apply(lambda x: x.year)

#Preview a few rows showing just the original time and the new date columns 
tw[['time','date', 'year']][:3]

Out[60]:

	time	date	year
0	2014-09-16 09:07:00	2014-09-16	2014
1	2014-09-16 08:07:00	2014-09-16	2014
2	2014-09-16 08:06:00	2014-09-16	2014

In [61]:

#Create a new column that identifies whether or not a tweet is a reply

#This function detects a reply as a tweet beginning with an @ sign
def twTyp(t):
    if t.startswith('@'):
        return 'reply'
    return 'standard'

#We can generate the new tweet type column by applying this function to each tweet
tw['type']= tw['Tweet text'].apply(twTyp)

#Let's preview a fragment of the result just to check
tw[['Tweet text','type']][2:5]

Out[61]:

	Tweet text	type
2	@stormpetrel in twitter, your friend choices a...	reply
3	Hmmm - an extension forced me to sign into ch...	standard
4	Whatever the latest chrome update is it;s comp...	standard

In [62]:

#Total impressions
tw['impressions'].sum()

Out[62]:

In [63]:

#Find the mean number of impressions - and round to an integer value
tw['impressions'].mean().astype(int)

Out[63]:

In [64]:

#We can summarise the results by year:
tw.groupby('year')['impressions'].mean().astype(int)

Out[64]:

year
2014    260
Name: impressions, dtype: int64

In [65]:

#Let's look at the means by month, as integer values, previewing just the first three
tw.groupby('date')['impressions'].mean().astype(int)[:3]

Out[65]:

date
2014-03-04    581
2014-03-06    285
2014-03-07    201
Name: impressions, dtype: int64

In [67]:

#Let's add in month and day of week columns
tw['month']=tw['time'].apply(lambda x: x.month)
tw['dayofweek']=tw['time'].apply(lambda x: x.dayofweek) #Monday=0, Sunday=6
#Another preview
tw[['time','date', 'year','month','dayofweek']][:3]

Out[67]:

	time	date	year	month	dayofweek
0	2014-09-16 09:07:00	2014-09-16	2014	9	1
1	2014-09-16 08:07:00	2014-09-16	2014	9	1
2	2014-09-16 08:06:00	2014-09-16	2014	9	1

In [69]:

#Group by month and year
tw.groupby(['year','month'])['impressions'].mean().astype(int)

Out[69]:

year  month
2014  3        342
      4        260
      5        273
      6        259
      7        235
      8        262
      9        196
Name: impressions, dtype: int64

In [71]:

#Group by day of week by month and year
tw.groupby(['year','month','dayofweek'])['impressions'].mean().astype(int)[:10]

Out[71]:

year  month  dayofweek
2014  3      0            312
             1            370
             2            347
             3            364
             4            335
             5            275
             6            296
      4      0            308
             1            375
             2            177
Name: impressions, dtype: int64

In [73]:

#Group by tweet type, month and year
tw.groupby(['year','month','type'])['impressions'].mean().astype(int)

Out[73]:

year  month  type    
2014  3      reply       120
             standard    587
      4      reply       106
             standard    522
      5      reply       132
             standard    557
      6      reply        71
             standard    627
      7      reply        72
             standard    615
      8      reply        59
             standard    738
      9      reply        47
             standard    546
Name: impressions, dtype: int64

In [88]:

#pandas also supports pivot tables that have similar settings to Excel pivot tables
#Tghe default operation applied is: mean
pt=tw.pivot_table( values='impressions', index=['year','month'], columns=['type'])
#Tweak the display setting to show integers
pd.set_option("display.precision",1)
pt

Out[88]:

	type	reply	standard
year	month
2014	3	121	587
	4	106	522
	5	132	557
	6	72	627
	7	73	615
	8	60	738
	9	48	546

In [90]:

#Use the pivot table construction to show stats by month and type for each day of week
#We can also add in marginal total values
tw.pivot_table( values='impressions', index=['year','month'], columns=['dayofweek','type'], margins=True)

Out[90]:

	dayofweek	0		1		2		3		4		5		6		All
	type	reply	standard	reply	standard	reply	standard	reply	standard	reply	standard	reply	standard	reply	standard
year	month
2014	3	62	604	107	635	131	554	146	639	117	578	154	499	103	329	343
	4	74	652	131	581	99	324	102	569	127	466	102	462	102	560	260
	5	103	570	83	566	104	508	85	538	101	587	403	561	190	590	274
	6	79	585	81	756	76	545	82	628	71	517	40	479	48	646	260
	7	56	540	159	603	77	614	70	641	52	734	23	460	75	584	235
	8	59	500	59	609	51	598	67	743	46	1647	71	655	101	587	263
	9	47	571	48	590	49	603	47	499	52	523	40	470	54	430	197
All		67	580	81	635	79	545	88	598	89	666	176	528	79	556	261

In [91]:

#The previous view shows how the reply stats skew the overall (All) averages across the rows,
# though the column summaries are a bit more meangiful.
#We can make the row summaries clearer if we tease the type out as a row variable...
tw.pivot_table( values='impressions', index=['year','month','type'], columns=['dayofweek'], margins=True)

Out[91]:

		dayofweek	0	1	2	3	4	5	6	All
year	month	type
2014	3	reply	62	107	131	146	117	154	103	121
	3	standard	604	635	554	639	578	499	329	587
	4	reply	74	131	99	102	127	102	102	106
	4	standard	652	581	324	569	466	462	560	522
	5	reply	103	83	104	85	101	403	190	132
	5	standard	570	566	508	538	587	561	590	557
	6	reply	79	81	76	82	71	40	48	72
	6	standard	585	756	545	628	517	479	646	627
	7	reply	56	159	77	70	52	23	75	73
	7	standard	540	603	614	641	734	460	584	615
	8	reply	59	59	51	67	46	71	101	60
	8	standard	500	609	598	743	1647	655	587	738
	9	reply	47	48	49	47	52	40	54	48
	9	standard	571	590	603	499	523	470	430	546
All			227	266	262	270	270	308	221	261

In [ ]:

#Reset the display setting to show floats
pd.set_option("display.precision",7)