import tweepy
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
Gute Referenz, der ich den meisten Code für den Datendownload verdanke: https://www.packtpub.com/big-data-and-business-intelligence/mastering-social-media-mining-python.
Falls der Code selbst ausgeführt werden soll, müssen hier die eigenen Twitter-Keys verwendet werden. Details siehe auch in der Referenz oben.
# Twitter API credentials. Each value is read from the environment variable
# named below when that variable is set, so real keys do not have to be written
# into this file. Without the variable the 'XXX' placeholder is used and has to
# be replaced by your own key before the download can work.
import os

consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'XXX')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'XXX')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', 'XXX')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', 'XXX')

# Build the authenticated API client used by the download loop.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
# wait_on_rate_limit=True lets tweepy wait for the rate limit to replenish
# instead of failing the request.
client = tweepy.API(auth, wait_on_rate_limit=True)

# Screen names of the accounts whose timelines are downloaded.
users = ('derStandardat', 'KURIERat', 'ArminWolf', 'SPIEGEL_Top')
# last tweet from @diepresse_com in 2012
# last tweet from @oe24News in April 2016
Da derStandard.at bereits mit dem 2016-10-26 das 3.200-Tweet-Limit der Twitter-API erreicht, lade ich nur die Tweets von November und Dezember 2016 herunter.
# Download the November/December 2016 tweets of every account into one
# JSON-lines file per account (one raw tweet object per line).
#
# The loop relies on the timeline being delivered newest first: tweets that are
# newer than the window are skipped, and the first tweet older than the window
# ends the download for that account, so no further pages are requested.
for user in users:
    print('getting data for {}'.format(user))
    fname = "user_timeline_{}.jsonl".format(user)
    with open(fname, 'w') as f:
        reached_older_tweets = False
        # At most 16 pages of 200 tweets each are requested per account.
        for page in tweepy.Cursor(client.user_timeline, screen_name=user, count=200).pages(16):
            for status in page:
                year_month = (status.created_at.year, status.created_at.month)
                if year_month > (2016, 12):
                    # Newer than the window (script run after 2016): skip it,
                    # the wanted tweets come further down the timeline.
                    continue
                if year_month < (2016, 11):
                    # Everything from here on is older than the window.
                    reached_older_tweets = True
                    break
                f.write(json.dumps(status._json) + "\n")
            if reached_older_tweets:
                break
# Read each per-account JSON-lines dump into its own DataFrame and stack them
# into one frame with a fresh 0..n-1 index.
timeline_files = ["user_timeline_{}.jsonl".format(account) for account in users]
tweets = [pd.read_json(path, lines=True) for path in timeline_files]
df = pd.concat(tweets, ignore_index=True)
# Quick look at the combined data: first rows, dtypes/non-null counts, columns.
df.head(3)
df.info()
df.columns
Username aus Spalte 'user' extrahieren und ergänzen
# Expand the per-tweet 'user' objects into a frame of their own (one row per
# tweet, in the same row order as df) and inspect it.
user_records = df['user'].tolist()
df_user = pd.DataFrame(user_records)
df_user.columns
df_user['name'].value_counts()
df_user.head(2)
# Attach the account's display name to every tweet; the join is by row index.
df = df.join([df_user['name']])
Wer hat wie viele Follower?
# Follower count per account, largest first (one row kept per display name).
one_row_per_account = df_user.drop_duplicates(subset='name')
one_row_per_account[['name', 'followers_count']].sort_values('followers_count', ascending=False)
# Order all tweets chronologically, oldest first.
df = df.sort_values('created_at')
df.head(3)
# Earliest downloaded tweet of every account.
for name in df['name'].unique():
    from_this_account = df['name'] == name
    print(name, df.loc[from_this_account, 'created_at'].min())
# Number of downloaded tweets per account (retweets still included).
df_temp = df.groupby([df['name']])['id'].count()
df_temp
# Same count, split by whether the tweet carries a 'retweeted_status' value
# (True = it does) and by account.
df_temp = df.groupby([df['retweeted_status'].notnull(), df['name']])['id'].count().unstack()
df_temp
# Keep only the tweets without a 'retweeted_status' value.
df = df[df['retweeted_status'].isnull()]
# Re-run the split as a check on the filtered frame.
df_temp = df.groupby([df['retweeted_status'].notnull(), df['name']])['id'].count().unstack()
df_temp
# Total number of retweets received per account.
df_temp = df.groupby([df['name']])['retweet_count'].sum()
df_temp
# Tweets per account (rows) and weekday (columns).
df_temp = df.groupby([df.name, df.created_at.dt.dayofweek])['id'].count().unstack()
# dt.dayofweek numbers the days 0 (Monday) to 6 (Sunday); relabel them with the
# German abbreviations. This assignment needs all seven weekdays to be present.
df_temp.columns = ['Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So']
df_temp

# Heatmap of the absolute counts.
# `plt` is used directly: `sns.plt` is not available in current seaborn.
# A fresh figure per plot keeps consecutive plots from being drawn on top of
# each other when this code runs outside separate notebook cells.
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp, annot=True, linewidths=.5, fmt='.0f')
sns.set(font_scale=1.5)
plt.title('Anzahl Tweets')
sns.set(font_scale=1)
ax.set(xlabel='Wochentag', ylabel='Twitter Account')

# Heatmap of each account's weekday share in percent (every row sums to 100).
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp.div(df_temp.sum(axis=1), axis=0) * 100, annot=True, linewidths=.5, fmt='.0f')
sns.set(font_scale=1.5)
plt.title('Anzahl Tweets in Prozent je Account')
sns.set(font_scale=1)
ax.set(xlabel='Wochentag', ylabel='Twitter Account')
plt.savefig('tweets_pro_wochentag.png')
# Tweets per account (rows) and hour of day (columns).
# NOTE(review): the hour is taken from created_at as parsed - presumably UTC,
# not local time; confirm before interpreting the plots.
df_temp = df.groupby([df.name, df.created_at.dt.hour])['id'].count().unstack()
df_temp[:3]

# Heatmap of the absolute counts.
# `plt` is used directly: `sns.plt` is not available in current seaborn.
# A fresh figure per plot keeps consecutive plots from being drawn on top of
# each other when this code runs outside separate notebook cells.
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp)
sns.set(font_scale=1.5)
plt.title('Anzahl Tweets')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Twitter Account')

# Heatmap of each account's hourly share in percent (every row sums to 100).
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp.div(df_temp.sum(axis=1), axis=0) * 100)
sns.set(font_scale=1.5)
plt.title('Anzahl Tweets in Prozent je Account')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Twitter Account')
plt.savefig('tweets_pro_stunde.png')
# Armin Wolf only: tweets per weekday (rows) and hour of day (columns).
df_temp = df[df.name == 'Armin Wolf'].groupby([df.created_at.dt.weekday,
                                               df.created_at.dt.hour])['id'].count().unstack()
# dt.weekday numbers the days 0 (Monday) to 6 (Sunday); relabel them with the
# German abbreviations. This assignment needs all seven weekdays to be present.
df_temp.index = ['Mo', 'Di', 'Mi', 'Do', 'Fr', 'Sa', 'So']
df_temp

# `plt` is used directly: `sns.plt` is not available in current seaborn.
# The fresh figure keeps this plot from being drawn onto a previous one when
# the code runs outside separate notebook cells.
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp)
sns.set(font_scale=1.5)
plt.title('Anzahl Tweets von Armin Wolf')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Wochentag')
# All accounts: tweets per calendar day (rows: month, day) and hour of day
# (columns).
df_temp = df.groupby([df.created_at.dt.month, df.created_at.dt.day,
                      df.created_at.dt.hour])['id'].count().unstack()
df_temp[:3]

# `plt` is used directly: `sns.plt` is not available in current seaborn.
# The fresh figure keeps this plot from being drawn onto a previous one when
# the code runs outside separate notebook cells.
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp)
sns.set(font_scale=1.5)
plt.title('Anzahl Tweets pro Tag')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Tag')
plt.savefig('tweets_pro_tag.png')
# Armin Wolf only: tweets per calendar day (rows: month, day) and hour of day
# (columns).
df_temp = df[df.name == 'Armin Wolf'].groupby([df.created_at.dt.month, df.created_at.dt.day,
                                               df.created_at.dt.hour])['id'].count().unstack()
df_temp[:3]

# `plt` is used directly: `sns.plt` is not available in current seaborn.
# The fresh figure keeps this plot from being drawn onto a previous one when
# the code runs outside separate notebook cells.
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp)
sns.set(font_scale=1.5)
plt.title('Anzahl Tweets von Armin Wolf')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Tag')
# Optional export: plt.savefig('tweets_armin_wolf.svg')

# Look at the individual tweets from one cell of the heatmap:
# Armin Wolf, 9 November, hour 3.
df.loc[(df.name == 'Armin Wolf') & (df.created_at.dt.month == 11) &
       (df.created_at.dt.day == 9) & (df.created_at.dt.hour == 3),
       ['created_at', 'text']]
# All accounts: retweets received, summed per calendar day (rows: month, day)
# and hour of day (columns) of the tweet that was retweeted.
df_temp = df.groupby([df.created_at.dt.month, df.created_at.dt.day,
                      df.created_at.dt.hour])['retweet_count'].sum().unstack()
df_temp[:3]

# `plt` is used directly: `sns.plt` is not available in current seaborn.
# The fresh figure keeps this plot from being drawn onto a previous one when
# the code runs outside separate notebook cells.
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp)
sns.set(font_scale=1.5)
plt.title('Anzahl Retweets pro Tag')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Tag')
# Histograms of the retweet count per tweet, split at 50 retweets so that the
# few heavily retweeted tweets do not squash the bulk of the distribution.
# `plt` is used directly: `sns.plt` is not available in current seaborn.
# A fresh figure per plot keeps the two histograms from being drawn on top of
# each other when this code runs outside separate notebook cells.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is
# kept here so the code still runs on the older releases too.

# Tweets with at most 50 retweets.
sns.set(style='whitegrid')
plt.figure()
ax = sns.distplot(df.loc[df.retweet_count <= 50, 'retweet_count'], kde=False)
sns.set(font_scale=1.5)
plt.title('Verteilung Tweets je Retweets <= 50')
sns.set(font_scale=1)
ax.set(xlabel='Retweets', ylabel='Anzahl Tweets')
plt.savefig('retweets_hist_bis_50.png')

# Tweets with more than 50 retweets.
sns.set(style='whitegrid')
plt.figure()
ax = sns.distplot(df.loc[df.retweet_count > 50, 'retweet_count'], kde=False)
sns.set(font_scale=1.5)
plt.title('Verteilung Tweets je Retweets > 50')
sns.set(font_scale=1)
ax.set(xlabel='Retweets', ylabel='Anzahl Tweets')
plt.savefig('retweets_hist_ab_50.png')
# The single most retweeted tweet across all accounts.
ranked_by_retweets = df.sort_values('retweet_count', ascending=False)
ranked_by_retweets.iloc[0]
# The most retweeted tweet of each account, printed with a separator line.
for name in df['name'].unique():
    tweets_of_account = df[df['name'] == name]
    top_tweet = tweets_of_account.sort_values('retweet_count', ascending=False).iloc[0]
    print('*' * 80)
    print(name)
    print(top_tweet)
    print()
# Armin Wolf only: retweets received, summed per calendar day (rows: month,
# day) and hour of day (columns) of the tweet that was retweeted.
df_temp = df[df.name == 'Armin Wolf'].groupby([df.created_at.dt.month, df.created_at.dt.day,
                                               df.created_at.dt.hour])['retweet_count'].sum().unstack()
df_temp[:3]

# `plt` is used directly: `sns.plt` is not available in current seaborn.
# The fresh figure keeps this plot from being drawn onto a previous one when
# the code runs outside separate notebook cells.
sns.set(style='whitegrid')
plt.figure()
ax = sns.heatmap(df_temp)
sns.set(font_scale=1.5)
plt.title('Anzahl Retweets von Tweets von Armin Wolf')
sns.set(font_scale=1)
ax.set(xlabel='Uhrzeit - Stunde', ylabel='Tag')
# Optional export: plt.savefig('reteets_armin_wolf.svg')
def get_hashtags(entities):
    """Return the lower-cased hashtag texts from a tweet's ``entities`` value.

    Args:
        entities: Mapping with an optional ``'hashtags'`` entry holding a list
            of dicts, each with the hashtag under ``'text'``. Any value that is
            not a dict (for example ``None`` or a NaN placeholder for a missing
            value) is treated as "no hashtags".

    Returns:
        list: The hashtag texts in their original order, lower-cased; an empty
        list when there are none.
    """
    if not isinstance(entities, dict):
        return []
    # `or []` also covers an explicit None stored under 'hashtags'.
    hashtags = entities.get('hashtags') or []
    return [tag['text'].lower() for tag in hashtags]
# Spread every tweet's hashtag list over numbered columns (0 = first hashtag of
# the tweet, 1 = second, ...); tweets with fewer hashtags get NaN.
df_temp_hashtags = df.entities.apply(get_hashtags).apply(pd.Series)
df_temp_hashtags[7:10]
# Put the hashtag columns next to the tweet columns (aligned on the row index).
df_temp_hashtags2 = pd.concat([df, df_temp_hashtags], axis=1)
df_temp_hashtags2[7:10]
# Reshape to one row per (tweet, hashtag) and drop the empty slots.
# The combined frame built above is melted (the former `df_extended` was never
# defined), and every hashtag column that exists is used instead of a fixed
# 0..3, which failed with fewer and lost data with more than four columns.
df_hashtags = pd.melt(df_temp_hashtags2, id_vars=['id_str', 'name'],
                      value_vars=list(df_temp_hashtags.columns),
                      value_name='hashtag', var_name='hashtag_nr').dropna()
df_hashtags[:3]
# The five most frequent hashtags.
df_hashtags.hashtag.value_counts()[:5]
from wordcloud import WordCloud
# https://github.com/amueller/word_cloud
# Caution: the wordcloud package was developed for English text. The author's
# local copy was modified so that a trailing 's' is not treated as a plural
# ending; otherwise words such as "hau" and "Haus" would be counted together.

# Stop-word list 1 from http://members.unine.ch/jacques.savoy/clef/index.html
# The file is read inside a `with` block so the handle is closed again.
with open('germanST.txt') as stopword_file:
    STOPWORDS_GENERAL = set(x.strip() for x in stopword_file.read().split('\n'))
STOPWORDS_GENERAL = STOPWORDS_GENERAL.union(set(('nämlich',)))

# One word cloud over the hashtags of all accounts.
text_hashtags = ' '.join(df_hashtags.hashtag.tolist())
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5, background_color='white',
                      stopwords=STOPWORDS_GENERAL,
                      scale=3, random_state=42).generate(text_hashtags)
# The fresh figure keeps the cloud from being drawn onto a previous plot when
# the code runs outside separate notebook cells.
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('hashtags.png', transparent=True, bbox_inches='tight', pad_inches=0)
plt.show()
# Number of hashtag rows per account.
df_hashtags['name'].value_counts()
# One word cloud per account, saved as hashtags_<name with underscores>.png.
cloud_options = dict(max_font_size=40, relative_scaling=.5, background_color='white',
                     stopwords=STOPWORDS_GENERAL, scale=3, random_state=42)
for name in df_hashtags['name'].unique():
    hashtags_of_account = df_hashtags.loc[df_hashtags['name'] == name, 'hashtag']
    text_hashtags_name = ' '.join(hashtags_of_account.tolist())
    wordcloud = WordCloud(**cloud_options).generate(text_hashtags_name)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title('Hashtags: ' + name)
    target_file = 'hashtags_{}.png'.format(name.replace(' ', '_'))
    plt.savefig(target_file, transparent=True, bbox_inches='tight', pad_inches=0)
    plt.show()