#!/usr/bin/env python
# coding: utf-8

# In[12]:

import glob, os, re, json, pickle
import pandas as pd
from classifier import *
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Unlimited column width must be None — the old -1 sentinel is deprecated
# and raises in pandas >= 1.0.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')

# # Human labeled data

# In[11]:

df_human = pd.read_json('human_labeled_data.json')
# Strip URLs from the raw tweet text. Raw strings + explicit regex=True:
# Series.str.replace defaults to regex=False in pandas >= 2.0, so without
# it these patterns would be treated as literal substrings.
df_human['text'] = df_human['text_data'].str.replace(r'https[^\s]*\s', '', regex=True)
# Drop the leading 3-digit manifesto code ("504 ..."), keeping the label name.
df_human['manifestolabel_true'] = df_human['major_label'].str.replace(r'\d\d\d ', '', regex=True)
# df_human['manifestolabel_true'] = df_human['manifestolabel_true'].replace('ignored','undefined')
# Keep only the cleaned text and the gold label; drop annotation-tool metadata.
df_human = df_human.drop(['text_data','labeled','major_label','selected','taught','labels','users','uncertainty','text_id','predicted_label'],axis=1)
# df_human.to_json('human_labeled_anonymized.json',orient='records')
df_human.sample(n=20)

# In[13]:

df_human['manifestolabel_true'].value_counts()

# # Classification performance on tweets
#
# * we want to predict manifesto codes on tweets well
# * but we only have few tweets labeled
# * we'd like to use as much training data as possible
# * but we don't want to degrade manifesto-prediction performance on tweets by adding irrelevant manifesto training data.
#
# $\rightarrow$ How many manifesto training data should be have in our training set to achieve high tweet classification performance?
# # $\Rightarrow$ **Crossvalidation with blending in manifesto data**:
# * We start with 0 manifesto data and all labeled tweets and add manifesto data
# * We evaluate on held-out tweet data

# In[16]:

# For each amount N of blended-in manifesto samples, repeat a random
# train/test split 10 times and record the weighted-average F1 on the
# held-out tweets.
mixin_manifesto = []
for N in [0, 100, 500, 1000, 5000, 10000]:
    for rep in range(10):
        df_manifesto = get_manifesto_data()
        # Hold out 20% of the human-labeled tweets for evaluation.
        tweets_train, tweets_test, labels_train, labels_test = train_test_split(
            df_human['text'], df_human['manifestolabel_true'], test_size=.2)
        # Blend N randomly drawn manifesto samples into the tweet training set.
        df_manifesto = df_manifesto.sample(n=N)
        train_text = pd.concat([df_manifesto['text'], tweets_train])
        train_labels = pd.concat([df_manifesto['manifestolabel'], labels_train])
        # Train only on classes with more than 5 samples in the training set.
        enough_samples_per_class = train_labels.value_counts() > 5
        valid = train_labels.isin(enough_samples_per_class[enough_samples_per_class == True].index)
        train_single(train_text[valid], train_labels[valid], 'tweets_and_manifesto')
        # Score the held-out tweets with the freshly trained model.
        df_test = pd.concat([tweets_test, labels_test], axis=1)
        df_test.columns = ['text', 'manifestolabel_true']
        tw = score_texts(df_test, ['tweets_and_manifesto'])
        results_tweets_and_manifesto_df = pd.DataFrame(
            classification_report(tw['manifestolabel_true'], tw['tweets_and_manifesto'],
                                  output_dict=True, zero_division=0)).T
        print(f'N={N}\n')
        print(results_tweets_and_manifesto_df[results_tweets_and_manifesto_df['f1-score'] > 0])
        mixin_manifesto.append({
            'N': N,
            'rep': rep,
            'f1': results_tweets_and_manifesto_df.loc['weighted avg', 'f1-score'],
        })
mixin_manifesto_df = pd.DataFrame(mixin_manifesto)

# # Results
#
# More than 500 manifesto samples lead to decreased classification performance on held-out tweets

# In[18]:

# Use the 'median' string aggregator instead of np.median: numpy is never
# imported explicitly in this notebook (it would only be in scope via the
# star import from `classifier`), so np.median risks a NameError.
mixin_manifesto_df.groupby('N').agg({'f1': 'median'}).plot.bar()