import tweepy
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
Gute Referenz, der ich den meisten Code für den Datendownload verdanke: https://www.packtpub.com/big-data-and-business-intelligence/mastering-social-media-mining-python.
Falls der Code selbst ausgeführt werden soll, müssen hier die eigenen Twitter-Keys verwendet werden. Details siehe auch in der Referenz oben.
import os

# Twitter API credentials. They are taken from environment variables so that
# real keys never have to be written into the notebook; the 'XXX' placeholders
# are used whenever a variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'XXX')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'XXX')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'XXX')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'XXX')

# Build the authenticated API client used by the download loop below.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# wait_on_rate_limit=True: let tweepy wait for the rate limit to reset
# instead of raising when it is hit.
client = tweepy.API(auth, wait_on_rate_limit=True)
# Screen names of the Twitter accounts whose timelines are downloaded and compared.
users = ('derStandardat', 'KURIERat', 'ArminWolf', 'SPIEGEL_Top')
# Not in the list above (presumably left out because the accounts are inactive):
# last tweet from @diepresse_com in 2012
# last tweet from @oe24News in April 2016
Da derStandard.at das Limit der Twitter-API von 3.200 Tweets bereits mit dem 26.10.2016 erreicht, lade ich nur die Tweets von November und Dezember 2016 herunter.
# Download the November/December 2016 tweets of every account into one
# JSON-lines file per account (one raw tweet object per line).
for user in users:
    print('getting data for {}'.format(user))
    fname = "user_timeline_{}.jsonl".format(user)
    with open(fname, 'w') as f:
        reached_older_tweets = False
        # At most 16 pages of 200 tweets each are requested per account.
        for page in tweepy.Cursor(client.user_timeline, screen_name=user, count=200).pages(16):
            for status in page:
                created = status.created_at
                if created.year > 2016:
                    # Newer than the period of interest: skip it instead of
                    # aborting, otherwise nothing is saved once 2017 has begun.
                    continue
                if created.year == 2016 and created.month > 10:
                    f.write(json.dumps(status._json) + "\n")
                else:
                    # The loop relies on the timeline being delivered newest
                    # first: the first tweet before November 2016 means
                    # everything that follows is older as well.
                    reached_older_tweets = True
                    break
            if reached_older_tweets:
                # Stop paging as well, so no API calls are spent on older pages.
                break
getting data for derStandardat getting data for KURIERat getting data for ArminWolf getting data for SPIEGEL_Top
# Load the per-account JSON-lines dumps and stack them into one DataFrame.
# 'id_str' is pinned to str: with automatic type inference read_json turns the
# id strings into numbers and loses precision on the 18-digit tweet ids
# (the 'id' and 'id_str' columns then no longer agree).
tweets = []
for user in users:
    fname = "user_timeline_{}.jsonl".format(user)
    tweets.append(pd.read_json(fname, lines=True, dtype={'id_str': str}))
df = pd.concat(tweets).reset_index(drop=True)
df[:3]
contributors | coordinates | created_at | entities | extended_entities | favorite_count | favorited | geo | id | id_str | ... | quoted_status | quoted_status_id | quoted_status_id_str | retweet_count | retweeted | retweeted_status | source | text | truncated | user | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | NaN | 2016-12-31 21:30:09 | {'hashtags': [], 'urls': [{'expanded_url': 'ht... | NaN | 0 | False | NaN | 815309091266592768 | 815309091266592768 | ... | NaN | NaN | NaN | 0 | False | NaN | <a href="http://swat.io" rel="nofollow">Swat.i... | Rund 10 Prozent der Österreicher "schwänzen" h... | False | {'profile_background_tile': False, 'is_transla... |
1 | NaN | NaN | 2016-12-31 21:00:22 | {'hashtags': [], 'urls': [{'expanded_url': 'ht... | NaN | 1 | False | NaN | 815301597106491396 | 815301597106491392 | ... | NaN | NaN | NaN | 2 | False | NaN | <a href="http://swat.io" rel="nofollow">Swat.i... | Anderswo ist schon seit Stunden 2017. So feier... | False | {'profile_background_tile': False, 'is_transla... |
2 | NaN | NaN | 2016-12-31 20:30:09 | {'hashtags': [], 'urls': [{'expanded_url': 'ht... | NaN | 2 | False | NaN | 815293991151370240 | 815293991151370240 | ... | NaN | NaN | NaN | 5 | False | NaN | <a href="http://swat.io" rel="nofollow">Swat.i... | "Zweifel an menschenwürdiger Unterbringung": B... | False | {'profile_background_tile': False, 'is_transla... |
3 rows × 29 columns
# Overview of all columns: dtype and number of non-null values per column.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 9627 entries, 0 to 9626 Data columns (total 29 columns): contributors 0 non-null float64 coordinates 0 non-null float64 created_at 9627 non-null datetime64[ns] entities 9627 non-null object extended_entities 244 non-null object favorite_count 9627 non-null int64 favorited 9627 non-null bool geo 0 non-null float64 id 9627 non-null int64 id_str 9627 non-null int64 in_reply_to_screen_name 1690 non-null object in_reply_to_status_id 1673 non-null float64 in_reply_to_status_id_str 1673 non-null float64 in_reply_to_user_id 1690 non-null float64 in_reply_to_user_id_str 1690 non-null float64 is_quote_status 9627 non-null bool lang 9627 non-null object place 0 non-null float64 possibly_sensitive 7737 non-null float64 quoted_status 116 non-null object quoted_status_id 153 non-null float64 quoted_status_id_str 153 non-null float64 retweet_count 9627 non-null int64 retweeted 9627 non-null bool retweeted_status 398 non-null object source 9627 non-null object text 9627 non-null object truncated 9627 non-null bool user 9627 non-null object dtypes: bool(4), datetime64[ns](1), float64(11), int64(4), object(9) memory usage: 1.9+ MB
# List the column names of the combined tweet table.
df.columns
Index(['contributors', 'coordinates', 'created_at', 'entities', 'extended_entities', 'favorite_count', 'favorited', 'geo', 'id', 'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'place', 'possibly_sensitive', 'quoted_status', 'quoted_status_id', 'quoted_status_id_str', 'retweet_count', 'retweeted', 'retweeted_status', 'source', 'text', 'truncated', 'user'], dtype='object')
Anzeigenamen (Feld 'name') aus der Spalte 'user' extrahieren und als eigene Spalte ergänzen
# Expand the per-tweet 'user' dicts into a table with one column per profile field
# (one row per tweet, in the same order as df).
user_records = df['user'].tolist()
df_user = pd.DataFrame(user_records)
df_user.columns
Index(['contributors_enabled', 'created_at', 'default_profile', 'default_profile_image', 'description', 'entities', 'favourites_count', 'follow_request_sent', 'followers_count', 'following', 'friends_count', 'geo_enabled', 'has_extended_profile', 'id', 'id_str', 'is_translation_enabled', 'is_translator', 'lang', 'listed_count', 'location', 'name', 'notifications', 'profile_background_color', 'profile_background_image_url', 'profile_background_image_url_https', 'profile_background_tile', 'profile_banner_url', 'profile_image_url', 'profile_image_url_https', 'profile_link_color', 'profile_sidebar_border_color', 'profile_sidebar_fill_color', 'profile_text_color', 'profile_use_background_image', 'protected', 'screen_name', 'statuses_count', 'time_zone', 'translator_type', 'url', 'utc_offset', 'verified'], dtype='object')
# Number of downloaded tweets per account (df_user has one row per tweet).
df_user.name.value_counts()
derStandard.at 2981 KURIER 2583 Armin Wolf 2272 SPIEGEL ONLINE Top 1791 Name: name, dtype: int64
# Peek at the first two rows of the expanded user table.
df_user[:2]
contributors_enabled | created_at | default_profile | default_profile_image | description | entities | favourites_count | follow_request_sent | followers_count | following | ... | profile_text_color | profile_use_background_image | protected | screen_name | statuses_count | time_zone | translator_type | url | utc_offset | verified | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | Fri Mar 06 16:43:22 +0000 2009 | False | False | Aus dem Newsroom der österreichischen Tageszei... | {'description': {'urls': [{'expanded_url': 'ht... | 2884 | False | 246282 | False | ... | 000000 | True | False | derStandardat | 67690 | Vienna | none | http://t.co/0N5NEkJcaI | 3600 | True |
1 | False | Fri Mar 06 16:43:22 +0000 2009 | False | False | Aus dem Newsroom der österreichischen Tageszei... | {'description': {'urls': [{'expanded_url': 'ht... | 2884 | False | 246282 | False | ... | 000000 | True | False | derStandardat | 67690 | Vienna | none | http://t.co/0N5NEkJcaI | 3600 | True |
2 rows × 42 columns
# Attach the account display name to every tweet; join() matches rows by index.
df = df.join([df_user.name])
Wer hat wie viele Follower?
# One row per account (first occurrence of each name), ranked by follower count.
followers_per_account = df_user.drop_duplicates(subset='name')[['name', 'followers_count']]
followers_per_account.sort_values(by='followers_count', ascending=False)
name | followers_count | |
---|---|---|
5564 | Armin Wolf | 323854 |
7836 | SPIEGEL ONLINE Top | 280554 |
0 | derStandard.at | 246282 |
2981 | KURIER | 69563 |
# Order all tweets chronologically, oldest first.
df = df.sort_values(by='created_at')
df[:3]
contributors | coordinates | created_at | entities | extended_entities | favorite_count | favorited | geo | id | id_str | ... | quoted_status_id | quoted_status_id_str | retweet_count | retweeted | retweeted_status | source | text | truncated | user | name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9626 | NaN | NaN | 2016-11-01 00:27:19 | {'hashtags': [], 'urls': [{'expanded_url': 'ht... | NaN | 0 | False | NaN | 793248015691685889 | 793248015691685888 | ... | NaN | NaN | 1 | False | NaN | <a href="http://ifttt.com" rel="nofollow">IFTT... | 71 Jahre nach seinem Verschwinden: Schweden er... | False | {'profile_background_tile': False, 'is_transla... | SPIEGEL ONLINE Top |
9625 | NaN | NaN | 2016-11-01 00:42:44 | {'hashtags': [], 'urls': [{'expanded_url': 'ht... | NaN | 0 | False | NaN | 793251895787872257 | 793251895787872256 | ... | NaN | NaN | 0 | False | NaN | <a href="http://ifttt.com" rel="nofollow">IFTT... | Silvio Gazzaniga: Designer des WM-Pokals ist t... | False | {'profile_background_tile': False, 'is_transla... | SPIEGEL ONLINE Top |
9624 | NaN | NaN | 2016-11-01 05:17:19 | {'hashtags': [], 'urls': [{'expanded_url': 'ht... | NaN | 1 | False | NaN | 793320994966495234 | 793320994966495232 | ... | NaN | NaN | 2 | False | NaN | <a href="http://ifttt.com" rel="nofollow">IFTT... | +++ Der Morgen live +++: Wirtschaftsminister G... | False | {'profile_background_tile': False, 'is_transla... | SPIEGEL ONLINE Top |
3 rows × 30 columns
# Print the timestamp of the earliest downloaded tweet of every account.
for name in df.name.unique():
    earliest = df.loc[df.name == name, 'created_at'].min()
    print(name, earliest)
SPIEGEL ONLINE Top 2016-11-01 00:27:19 derStandard.at 2016-11-01 07:00:17 KURIER 2016-11-01 07:00:25 Armin Wolf 2016-11-01 12:37:55
# Number of tweets per account.
tweet_ids_by_account = df.groupby('name')['id']
df_temp = tweet_ids_by_account.count()
df_temp
name Armin Wolf 2272 KURIER 2583 SPIEGEL ONLINE Top 1791 derStandard.at 2981 Name: id, dtype: int64
# Cross-tabulate tweets per account by whether they are retweets
# (True = the tweet carries a 'retweeted_status' object).
is_retweet = df.retweeted_status.notnull()
df_temp = df.groupby([is_retweet, df.name])['id'].count().unstack()
df_temp
name | Armin Wolf | KURIER | SPIEGEL ONLINE Top | derStandard.at |
---|---|---|---|---|
retweeted_status | ||||
False | 1961.0 | 2520.0 | 1791.0 | 2957.0 |
True | 311.0 | 63.0 | NaN | 24.0 |
# Keep only the accounts' own tweets: drop every row that is a retweet.
own_tweet = df.retweeted_status.isnull()
df = df[own_tweet]
# Repeat the retweet cross-tabulation to confirm that only the False row is left.
is_retweet = df.retweeted_status.notnull()
df_temp = df.groupby([is_retweet, df.name])['id'].count().unstack()
df_temp
name | Armin Wolf | KURIER | SPIEGEL ONLINE Top | derStandard.at |
---|---|---|---|---|
retweeted_status | ||||
False | 1961 | 2520 | 1791 | 2957 |
# Total number of retweets received per account.
retweets_by_account = df.groupby('name')['retweet_count']
df_temp = retweets_by_account.sum()
df_temp
name Armin Wolf 11436 KURIER 4712 SPIEGEL ONLINE Top 3839 derStandard.at 6109 Name: retweet_count, dtype: int64
# Tweets per account and weekday; pandas numbers weekdays 0 (Monday) to 6 (Sunday),
# which is then relabelled with German abbreviations.
weekday = df.created_at.dt.weekday
df_temp = df.groupby([df.name, weekday])['id'].count().unstack()
df_temp.columns = ['Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So']
df_temp
Mo | Di | Mi | Do | Fr | Sa | So | |
---|---|---|---|---|---|---|---|
name | |||||||
Armin Wolf | 226 | 239 | 458 | 315 | 316 | 151 | 256 |
KURIER | 346 | 452 | 440 | 429 | 385 | 233 | 235 |
SPIEGEL ONLINE Top | 242 | 289 | 267 | 266 | 335 | 190 | 202 |
derStandard.at | 439 | 491 | 496 | 489 | 469 | 290 | 283 |
# Heatmap: absolute number of tweets per account and weekday.
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp, annot=True, linewidths=.5, fmt='.0f')
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Tweets')
sns.set(font_scale=1)
ax.set(xlabel='Wochentag', ylabel='Twitter Account')
[<matplotlib.text.Text at 0x297beee81d0>, <matplotlib.text.Text at 0x297bb028c18>]
# Heatmap: share of each weekday in an account's tweets, in percent
# (every row is divided by its row total).
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp.div(df_temp.sum(axis=1), axis=0) * 100, annot=True, linewidths=.5, fmt='.0f')
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Tweets in Prozent je Account')
sns.set(font_scale=1)
ax.set(xlabel='Wochentag', ylabel='Twitter Account')
plt.savefig('tweets_pro_wochentag.png')
# Tweets per account and hour of day.
# NOTE(review): 'created_at' is timezone-naive here; the Twitter API presumably
# delivers UTC, so these would be UTC hours rather than local (Vienna/Berlin)
# time - confirm before interpreting the hour axis.
hour_of_day = df.created_at.dt.hour
df_temp = df.groupby([df.name, hour_of_day])['id'].count().unstack()
df_temp[:3]
created_at | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
name | |||||||||||||||||||||
Armin Wolf | 24.0 | 15.0 | 10.0 | 15.0 | 13.0 | 34.0 | 43.0 | 58.0 | 105.0 | 150.0 | ... | 111.0 | 115.0 | 155.0 | 109.0 | 73.0 | 49.0 | 84.0 | 118.0 | 92.0 | 85.0 |
KURIER | 7.0 | 2.0 | 1.0 | 4.0 | 3.0 | 216.0 | 128.0 | 179.0 | 150.0 | 192.0 | ... | 169.0 | 127.0 | 103.0 | 145.0 | 110.0 | 96.0 | 79.0 | 20.0 | 9.0 | 7.0 |
SPIEGEL ONLINE Top | 25.0 | 16.0 | 27.0 | 23.0 | 21.0 | 51.0 | 78.0 | 104.0 | 80.0 | 96.0 | ... | 117.0 | 117.0 | 127.0 | 102.0 | 72.0 | 78.0 | 78.0 | 53.0 | 42.0 | 35.0 |
3 rows × 24 columns
# Heatmap: absolute number of tweets per account and hour of day.
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Tweets')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Twitter Account')
[<matplotlib.text.Text at 0x297c4d83198>, <matplotlib.text.Text at 0x297c4a62be0>]
# Heatmap: share of each hour of day in an account's tweets, in percent
# (every row is divided by its row total).
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp.div(df_temp.sum(axis=1), axis=0) * 100)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Tweets in Prozent je Account')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Twitter Account')
plt.savefig('tweets_pro_stunde.png')
# Armin Wolf only: tweets per weekday (rows) and hour of day (columns).
wolf_tweets = df[df.name == 'Armin Wolf']
weekday_and_hour = [wolf_tweets.created_at.dt.weekday, wolf_tweets.created_at.dt.hour]
df_temp = wolf_tweets.groupby(weekday_and_hour)['id'].count().unstack()
# Relabel weekdays 0..6 (Monday..Sunday) with German abbreviations.
df_temp.index = ['Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So']
df_temp
created_at | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Mo | 2.0 | NaN | NaN | NaN | NaN | NaN | 4.0 | 3.0 | 11.0 | 21.0 | ... | 4.0 | 12.0 | 12.0 | 7.0 | 9.0 | 1.0 | 15.0 | 28.0 | 17.0 | 25.0 |
Di | NaN | 3.0 | NaN | NaN | NaN | NaN | 4.0 | 13.0 | 18.0 | 23.0 | ... | 8.0 | 9.0 | 13.0 | 2.0 | 8.0 | 7.0 | 5.0 | 23.0 | 8.0 | 11.0 |
Mi | 11.0 | 12.0 | 7.0 | 15.0 | 5.0 | 11.0 | 11.0 | 19.0 | 30.0 | 44.0 | ... | 27.0 | 26.0 | 19.0 | 19.0 | 21.0 | 4.0 | 13.0 | 25.0 | 13.0 | 13.0 |
Do | 9.0 | NaN | 2.0 | NaN | NaN | 6.0 | 11.0 | 3.0 | 16.0 | 9.0 | ... | 15.0 | 17.0 | 40.0 | 28.0 | 4.0 | 11.0 | 18.0 | 11.0 | 23.0 | 17.0 |
Fr | NaN | NaN | 1.0 | NaN | 8.0 | 17.0 | 11.0 | 18.0 | 20.0 | 23.0 | ... | 33.0 | 18.0 | 45.0 | 13.0 | 14.0 | 8.0 | 10.0 | 2.0 | 8.0 | 4.0 |
Sa | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | 8.0 | 10.0 | ... | 16.0 | 16.0 | 12.0 | 7.0 | 3.0 | 2.0 | 5.0 | NaN | 2.0 | 5.0 |
So | 2.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | 2.0 | 20.0 | ... | 8.0 | 17.0 | 14.0 | 33.0 | 14.0 | 16.0 | 18.0 | 29.0 | 21.0 | 10.0 |
7 rows × 24 columns
# Heatmap: Armin Wolf's tweets per weekday and hour of day.
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Tweets von Armin Wolf')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Wochentag')
[<matplotlib.text.Text at 0x297c4e33278>, <matplotlib.text.Text at 0x297c4afc400>]
# All accounts: tweets per calendar day (rows: month, day) and hour (columns).
created = df.created_at.dt
month_day_hour = [created.month, created.day, created.hour]
df_temp = df.groupby(month_day_hour)['id'].count().unstack()
df_temp[:3]
created_at | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
created_at | created_at | |||||||||||||||||||||
11 | 1 | 2.0 | NaN | NaN | NaN | NaN | 2.0 | 4.0 | 21.0 | 10.0 | 8.0 | ... | 4.0 | 7.0 | 4.0 | 5.0 | 5.0 | 9.0 | 7.0 | 6.0 | 3.0 | NaN |
2 | 1.0 | NaN | 1.0 | NaN | 1.0 | 15.0 | 8.0 | 8.0 | 7.0 | 12.0 | ... | 9.0 | 7.0 | 7.0 | 5.0 | 9.0 | 5.0 | 5.0 | 8.0 | 2.0 | 1.0 | |
3 | 1.0 | 1.0 | 1.0 | NaN | 2.0 | 8.0 | 9.0 | 4.0 | 8.0 | 11.0 | ... | 10.0 | 9.0 | 8.0 | 18.0 | 8.0 | 4.0 | 6.0 | 3.0 | 6.0 | 2.0 |
3 rows × 24 columns
# Heatmap: tweets of all accounts per calendar day and hour of day.
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Tweets pro Tag')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Tag')
plt.savefig('tweets_pro_tag.png')
# Armin Wolf only: tweets per calendar day (rows: month, day) and hour (columns).
wolf_tweets = df[df.name == 'Armin Wolf']
wolf_created = wolf_tweets.created_at.dt
month_day_hour = [wolf_created.month, wolf_created.day, wolf_created.hour]
df_temp = wolf_tweets.groupby(month_day_hour)['id'].count().unstack()
df_temp[:3]
created_at | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
created_at | created_at | |||||||||||||||||||||
11 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | ... | 1.0 | 1.0 | 2.0 | NaN | 1.0 | NaN | NaN | 2.0 | NaN | NaN | |
3 | NaN | NaN | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.0 | 1.0 | NaN | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN |
3 rows × 24 columns
# Heatmap: Armin Wolf's tweets per calendar day and hour of day.
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Tweets von Armin Wolf')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Tag')
#plt.savefig('tweets_armin_wolf.svg')
[<matplotlib.text.Text at 0x297c5388208>, <matplotlib.text.Text at 0x297c535b1d0>]
# Show Armin Wolf's tweets from 9 November, hour 3 (timestamp and text only).
from_wolf = df.name == 'Armin Wolf'
created = df.created_at.dt
in_that_hour = (created.month == 11) & (created.day == 9) & (created.hour == 3)
df.loc[from_wolf & in_that_hour, ['created_at', 'text']]
created_at | text | |
---|---|---|
7664 | 2016-11-09 03:00:17 | Oje. https://t.co/jg5LpILcXl |
7663 | 2016-11-09 03:00:26 | @RalphJanik Eh. |
7662 | 2016-11-09 03:06:42 | @pitgottschalk Ernste Frage? Der ganz links. |
7661 | 2016-11-09 03:11:24 | Die jüngste NYT-Prognose: Demnach würde Clinto... |
7660 | 2016-11-09 03:14:06 | US-Präsidenten ohne Stimmenmehrheit (popular v... |
7659 | 2016-11-09 03:21:15 | @EvaGauda US-Wahlsystem googeln. Tausendfach s... |
7658 | 2016-11-09 03:26:33 | Trump gewinnt Ohio, sagt CNN. Das ist wirklich... |
7657 | 2016-11-09 03:28:39 | @StefanHaboeck Das ist die Auszählung. Beziehe... |
7656 | 2016-11-09 03:28:52 | @TOMK1988 Nicht immer. Aber häufig. |
7655 | 2016-11-09 03:31:42 | So einen Wahlabend habe ich noch nie gesehen. ... |
7654 | 2016-11-09 03:36:47 | Jetzt hat auch Nate Silver Trump als Favoriten... |
7653 | 2016-11-09 03:38:25 | Wenn das alles hält, gibt es einen Präsidenten... |
7652 | 2016-11-09 03:39:54 | @TOMK1988 1960 glaube ich. Da gewann Nixon Ohi... |
7647 | 2016-11-09 03:49:12 | Dabei hätte das nach dem Debakel von 2000 nie ... |
7643 | 2016-11-09 03:54:16 | Wenn Clinton jetzt nicht Pennsylvania + Michig... |
# All accounts: retweets received, summed per calendar day (rows) and hour (columns)
# of the original tweet's creation time.
created = df.created_at.dt
month_day_hour = [created.month, created.day, created.hour]
df_temp = df.groupby(month_day_hour)['retweet_count'].sum().unstack()
df_temp[:3]
created_at | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
created_at | created_at | |||||||||||||||||||||
11 | 1 | 1.0 | NaN | NaN | NaN | NaN | 3.0 | 7.0 | 33.0 | 15.0 | 23.0 | ... | 1.0 | 15.0 | 18.0 | 14.0 | 39.0 | 30.0 | 23.0 | 7.0 | 7.0 | NaN |
2 | 4.0 | NaN | 1.0 | NaN | 4.0 | 9.0 | 18.0 | 10.0 | 14.0 | 8.0 | ... | 12.0 | 6.0 | 24.0 | 6.0 | 10.0 | 4.0 | 9.0 | 4.0 | 5.0 | 12.0 | |
3 | 0.0 | 1.0 | 21.0 | NaN | 8.0 | 24.0 | 7.0 | 3.0 | 6.0 | 15.0 | ... | 12.0 | 7.0 | 9.0 | 16.0 | 14.0 | 6.0 | 13.0 | 4.0 | 10.0 | 155.0 |
3 rows × 24 columns
# Heatmap: retweets received per calendar day and hour of day.
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Retweets pro Tag')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Tag')
[<matplotlib.text.Text at 0x297c535f6a0>, <matplotlib.text.Text at 0x297c53cebe0>]
# Histogram of tweets by retweet count, restricted to tweets with at most 50 retweets.
sns.set(style='whitegrid')
# NOTE(review): distplot is deprecated in newer seaborn releases; check the
# binning before switching to its replacement.
ax = sns.distplot(df.loc[df.retweet_count <= 50,'retweet_count'], kde=False)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Verteilung Tweets je Retweets <= 50')
sns.set(font_scale=1)
ax.set(xlabel='Retweets', ylabel='Anzahl Tweets')
plt.savefig('retweets_hist_bis_50.png')
# Histogram of tweets by retweet count, restricted to tweets with more than 50 retweets.
sns.set(style='whitegrid')
# NOTE(review): distplot is deprecated in newer seaborn releases; check the
# binning before switching to its replacement.
ax = sns.distplot(df.loc[df.retweet_count > 50,'retweet_count'], kde=False)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Verteilung Tweets je Retweets > 50')
sns.set(font_scale=1)
ax.set(xlabel='Retweets', ylabel='Anzahl Tweets')
plt.savefig('retweets_hist_ab_50.png')
# Show the single most retweeted tweet across all accounts (all columns).
df.sort_values('retweet_count', ascending=False).iloc[0,:]
contributors NaN coordinates NaN created_at 2016-11-26 08:49:32 entities {'hashtags': [], 'urls': [], 'symbols': [], 'u... extended_entities NaN favorite_count 3007 favorited False geo NaN id 802434099990462464 id_str 802434099990462464 in_reply_to_screen_name None in_reply_to_status_id NaN in_reply_to_status_id_str NaN in_reply_to_user_id NaN in_reply_to_user_id_str NaN is_quote_status False lang de place NaN possibly_sensitive NaN quoted_status NaN quoted_status_id NaN quoted_status_id_str NaN retweet_count 1391 retweeted False retweeted_status NaN source <a href="http://twitter.com/#!/download/ipad" ... text Fidel Castros US-Präsidenten:\nEisenhower\nKen... truncated False user {'profile_background_tile': False, 'is_transla... name Armin Wolf Name: 6788, dtype: object
# Print the most retweeted tweet of every account, separated by a line of asterisks.
separator = '*' * 80
for name in df.name.unique():
    account_tweets = df[df.name == name]
    top_tweet = account_tweets.sort_values('retweet_count', ascending=False).iloc[0, :]
    print(separator)
    print(name)
    print(top_tweet)
    print()
******************************************************************************** SPIEGEL ONLINE Top contributors NaN coordinates NaN created_at 2016-11-03 23:54:03 entities {'hashtags': [], 'urls': [{'expanded_url': 'ht... extended_entities NaN favorite_count 34 favorited False geo NaN id 794326808586047491 id_str 794326808586047488 in_reply_to_screen_name NaN in_reply_to_status_id NaN in_reply_to_status_id_str NaN in_reply_to_user_id NaN in_reply_to_user_id_str NaN is_quote_status False lang de place NaN possibly_sensitive 0 quoted_status NaN quoted_status_id NaN quoted_status_id_str NaN retweet_count 152 retweeted False retweeted_status NaN source <a href="http://ifttt.com" rel="nofollow">IFTT... text Türkei: Polizei nimmt Chefs der prokurdischen ... truncated False user {'profile_background_tile': False, 'is_transla... name SPIEGEL ONLINE Top Name: 9523, dtype: object ******************************************************************************** derStandard.at contributors NaN coordinates NaN created_at 2016-12-04 18:48:50 entities {'hashtags': [{'text': 'bpw16', 'indices': [77... extended_entities {'media': [{'id_str': '805483973954699264', 'e... favorite_count 176 favorited False geo NaN id 805484020591194112 id_str 805484020591194112 in_reply_to_screen_name None in_reply_to_status_id NaN in_reply_to_status_id_str NaN in_reply_to_user_id NaN in_reply_to_user_id_str NaN is_quote_status False lang de place NaN possibly_sensitive 0 quoted_status NaN quoted_status_id NaN quoted_status_id_str NaN retweet_count 162 retweeted False retweeted_status NaN source <a href="https://about.twitter.com/products/tw... text Wer wem seine Stimme gegeben hat, hier alle De... truncated False user {'profile_background_tile': False, 'is_transla... 
name derStandard.at Name: 1257, dtype: object ******************************************************************************** KURIER contributors NaN coordinates NaN created_at 2016-11-05 08:42:36 entities {'hashtags': [], 'urls': [{'expanded_url': 'ht... extended_entities NaN favorite_count 17 favorited False geo NaN id 794822209135067136 id_str 794822209135067136 in_reply_to_screen_name None in_reply_to_status_id NaN in_reply_to_status_id_str NaN in_reply_to_user_id NaN in_reply_to_user_id_str NaN is_quote_status False lang de place NaN possibly_sensitive 0 quoted_status NaN quoted_status_id NaN quoted_status_id_str NaN retweet_count 46 retweeted False retweeted_status NaN source <a href="https://about.twitter.com/products/tw... text Keine "happy muslim arabic family" erwünscht: ... truncated False user {'profile_background_tile': False, 'is_transla... name KURIER Name: 5374, dtype: object ******************************************************************************** Armin Wolf contributors NaN coordinates NaN created_at 2016-11-26 08:49:32 entities {'hashtags': [], 'urls': [], 'symbols': [], 'u... extended_entities NaN favorite_count 3007 favorited False geo NaN id 802434099990462464 id_str 802434099990462464 in_reply_to_screen_name None in_reply_to_status_id NaN in_reply_to_status_id_str NaN in_reply_to_user_id NaN in_reply_to_user_id_str NaN is_quote_status False lang de place NaN possibly_sensitive NaN quoted_status NaN quoted_status_id NaN quoted_status_id_str NaN retweet_count 1391 retweeted False retweeted_status NaN source <a href="http://twitter.com/#!/download/ipad" ... text Fidel Castros US-Präsidenten:\nEisenhower\nKen... truncated False user {'profile_background_tile': False, 'is_transla... name Armin Wolf Name: 6788, dtype: object
# Armin Wolf only: retweets received, summed per calendar day (rows) and hour (columns).
wolf_tweets = df[df.name == 'Armin Wolf']
wolf_created = wolf_tweets.created_at.dt
month_day_hour = [wolf_created.month, wolf_created.day, wolf_created.hour]
df_temp = wolf_tweets.groupby(month_day_hour)['retweet_count'].sum().unstack()
df_temp[:3]
created_at | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
created_at | created_at | |||||||||||||||||||||
11 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 9.0 | NaN | NaN | NaN | NaN |
2 | NaN | NaN | NaN | NaN | 4.0 | NaN | NaN | NaN | NaN | NaN | ... | 0.0 | 5.0 | 13.0 | NaN | 0.0 | NaN | NaN | 0.0 | NaN | NaN | |
3 | NaN | NaN | 21.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 6.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | 8.0 | NaN |
3 rows × 24 columns
# Heatmap: retweets received by Armin Wolf's tweets per calendar day and hour of day.
sns.set(style='whitegrid')
ax = sns.heatmap(df_temp)
# The font scale is raised only while the title is created and reset right after.
sns.set(font_scale=1.5)
# Current seaborn releases no longer expose pyplot as `sns.plt`;
# use the `plt` module imported at the top of the notebook.
plt.title('Anzahl Retweets von Tweets von Armin Wolf')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Tag')
#plt.savefig('reteets_armin_wolf.svg')
def get_hashtags(entities):
    """Return the lower-cased text of every hashtag in a tweet's ``entities`` dict.

    A missing ``'hashtags'`` key is treated like an empty hashtag list, so the
    result is then an empty list.
    """
    lowered = []
    for tag in entities.get('hashtags', []):
        lowered.append(tag['text'].lower())
    return lowered
# Extract the hashtag list of every tweet, then spread each list over numbered
# columns (0, 1, 2, ...); tweets with fewer hashtags are padded with NaN.
hashtag_lists = df.entities.apply(get_hashtags)
df_temp_hashtags = hashtag_lists.apply(pd.Series)
df_temp_hashtags[7:10]
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
9619 | NaN | NaN | NaN | NaN | NaN |
2980 | putin | NaN | NaN | NaN | NaN |
5563 | NaN | NaN | NaN | NaN | NaN |
# Put the numbered hashtag columns next to the tweet columns; concat with
# axis=1 aligns the two frames on their shared index.
df_temp_hashtags2 = pd.concat([df, df_temp_hashtags], axis=1)
df_temp_hashtags2[7:10]
contributors | coordinates | created_at | entities | extended_entities | favorite_count | favorited | geo | id | id_str | ... | text | truncated | user | name | hashtags_list | 0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9619 | NaN | NaN | 2016-11-01 06:47:27 | {'hashtags': [], 'urls': [{'expanded_url': 'ht... | NaN | 1 | False | NaN | 793343678643331072 | 793343678643331072 | ... | Haushaltskrise: Saudischer Finanzminister nach... | False | {'profile_background_tile': False, 'is_transla... | SPIEGEL ONLINE Top | [] | NaN | NaN | NaN | NaN | NaN |
2980 | NaN | NaN | 2016-11-01 07:00:17 | {'hashtags': [{'text': 'Putin', 'indices': [33... | NaN | 1 | False | NaN | 793346910333575168 | 793346910333575168 | ... | Der russische Präsident Wladimir #Putin setzt ... | False | {'profile_background_tile': False, 'is_transla... | derStandard.at | [putin] | putin | NaN | NaN | NaN | NaN |
5563 | NaN | NaN | 2016-11-01 07:00:25 | {'hashtags': [], 'urls': [], 'symbols': [], 'u... | NaN | 1 | False | NaN | 793346944131293184 | 793346944131293184 | ... | Guten Morgen aus dem KURIER Newsroom. @Juergen... | False | {'profile_background_tile': False, 'is_transla... | KURIER | [] | NaN | NaN | NaN | NaN | NaN |
3 rows × 36 columns
# Reshape to long format: one row per (tweet, hashtag) pair.
# - The wide table built above is `df_temp_hashtags2`; `df_extended` is not
#   defined anywhere in this notebook.
# - Melt every hashtag column instead of a hard-coded [0, 1, 2, 3]: the wide
#   table also has a column 4, whose hashtags were silently dropped before.
hashtag_columns = list(df_temp_hashtags.columns)
df_hashtags = pd.melt(df_temp_hashtags2, id_vars=['id_str', 'name'], value_vars=hashtag_columns,
                      value_name='hashtag', var_name='hashtag_nr').dropna()
df_hashtags[:3]
id_str | name | hashtag_nr | hashtag | |
---|---|---|---|---|
8 | 793346910333575168 | derStandard.at | 0 | putin |
25 | 793354418855022592 | derStandard.at | 0 | wissenschaft |
29 | 793362107639226368 | derStandard.at | 0 | smartphone |
# The five most frequently used hashtags across all accounts.
df_hashtags.hashtag.value_counts()[:5]
bpw16 146 wirtschaft 87 berlin 63 wissenschaft 58 trump 56 Name: hashtag, dtype: int64
from wordcloud import WordCloud
# https://github.com/amueller/word_cloud
# Note: I tinkered a little with the wordcloud package - which was developed for
# English text - to keep it from treating a trailing 's' as a plural marker.
# Otherwise words such as "hau" and "Haus" would be counted together.
# Stop word list 1 from http://members.unine.ch/jacques.savoy/clef/index.html
# NOTE(review): the file is read with the platform's default encoding - confirm
# it matches the file so that umlauts survive.
with open('germanST.txt') as stopword_file:
    # The context manager closes the file handle, which was previously left open.
    STOPWORDS_GENERAL = {line.strip() for line in stopword_file.read().split('\n')}
# Add a stop word that is missing from the downloaded list.
STOPWORDS_GENERAL = STOPWORDS_GENERAL.union(set(('nämlich',)))
# Word cloud of the hashtags of all accounts together; saved as PNG and displayed.
all_hashtags = df_hashtags.hashtag.tolist()
text_hashtags = ' '.join(all_hashtags)
# random_state is fixed so that the layout is reproducible between runs.
cloud = WordCloud(max_font_size=40, relative_scaling=.5, background_color='white',
                  stopwords=STOPWORDS_GENERAL, scale=3, random_state=42)
wordcloud = cloud.generate(text_hashtags)
plt.imshow(wordcloud)
plt.axis("off")
#plt.title('Hashtags');
plt.savefig('hashtags.png', transparent=True, bbox_inches='tight', pad_inches=0)
plt.show()
# Number of hashtag uses per account (df_hashtags has one row per tweet/hashtag pair).
df_hashtags.name.value_counts()
derStandard.at 1825 KURIER 886 Armin Wolf 66 SPIEGEL ONLINE Top 1 Name: name, dtype: int64
# One hashtag word cloud per account, each saved as 'hashtags_<account name>.png'
# (spaces in the name replaced by underscores) and then displayed.
for name in df_hashtags.name.unique():
    account_hashtags = df_hashtags[df_hashtags.name == name].hashtag.tolist()
    text_hashtags_name = ' '.join(account_hashtags)
    cloud = WordCloud(max_font_size=40, relative_scaling=.5, background_color='white',
                      stopwords=STOPWORDS_GENERAL, scale=3, random_state=42)
    wordcloud = cloud.generate(text_hashtags_name)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title('Hashtags: ' + name)
    target_file = 'hashtags_' + name.replace(' ','_') + '.png'
    plt.savefig(target_file, transparent=True, bbox_inches='tight', pad_inches=0)
    plt.show()