#!/usr/bin/env python # coding: utf-8 # ## Exploring the Kaggle Toxic Comment Classification Challenge # ## Jason S. Kessler # # Please see https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge for a link to the dataset used and an explanation of the dataset. # # This notebook makes use of Scattertext (Kessler 2017) to identify category-associated terms and visualize how they differ across toxicity categories. # # Be advised that this analysis will involve extremely strong language, graphic descriptions, and expose numerous annotation errors. # # For more information on the psychology of cursing, I'd recommend reading Jay (2000). # # See Spertus (1997) for an overview of linguistic features for toxic comment classification. # # ### References # # Jay, Timothy. Why We Curse: A neuro-psycho-social theory of speech. John Benjamins Publishing Company. 2000. https://web.stanford.edu/class/linguist1/Rdgs/jay00.pdf # # Jason S. Kessler. Scattertext: a Browser-Based Tool for Visualizing how Corpora Differ. ACL System Demonstrations. 2017. https://arxiv.org/pdf/1703.00565 # # Ellen Spertus. Smokey: automatic recognition of hostile messages. IAAI 1997. 1997. https://www.aaai.org/Papers/IAAI/1997/IAAI97-209.pdf # # ### Other notebooks # # Many notebooks are available through the competition's Kaggle Kernels page. The R Tidytext package encourages some anti-patterns, as seen in https://www.kaggle.com/madcap/toxic-exploration/notebook. 
# In[89]:
import zipfile, io, time, re
import pandas as pd
import numpy as np
import scattertext as st
import spacy
from IPython.display import IFrame
from IPython.core.display import display, HTML

display(HTML(""))

# In[26]:
# NOTE(review): 'en' shortcut loading is spaCy 1.x/2.x style; spaCy 3 requires
# an explicit pipeline name such as 'en_core_web_sm' — confirm installed version.
nlp = spacy.load('en')

# In[2]:
# Read train.csv straight out of the zip archive without extracting it to disk.
# Context manager ensures the archive handle is closed (original leaked it).
with zipfile.ZipFile('../toxic-comments/train.csv.zip') as zf:
    df = pd.read_csv(io.StringIO(zf.read('train.csv').decode('utf8')))

# ## We'll first look at offensive (e.g., all toxic categories) vs non-offensive

# In[3]:
toxic_categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
                    'identity_hate']


def get_category(row):
    """Label a row 'offensive' if any toxicity flag equals 1, else 'not-offensive'."""
    if any(row[c] == 1 for c in toxic_categories):
        return 'offensive'
    return 'not-offensive'


# Vectorized equivalent of df.apply(get_category, axis=1): a comment is
# offensive when at least one of the six toxicity flags equals 1.
df['category'] = np.where((df[toxic_categories] == 1).any(axis=1),
                          'offensive', 'not-offensive')


# In[25]:
def get_category_list(row):
    """Return the names of the toxic categories flagged (== 1) on this row."""
    return [c for c in toxic_categories if row[c] == 1]


df['category_list'] = df.apply(get_category_list, axis=1)

# In[4]:
print(df.category.value_counts())

# In[32]:
# Parsing ~160k comments with spaCy is the slow step; report elapsed seconds
# (the original left the elapsed time as a bare no-op expression).
t0 = time.time()
df['parse'] = df.comment_text.apply(nlp)
print('Parse time (s):', time.time() - t0)

# In[15]:
corpus = st.CorpusFromParsedDocuments(
    df,
    parsed_col='parse',
    category_col='category',
    feats_from_spacy_doc=st.UnigramsFromSpacyDoc()
).build()

# There are too many terms to plot on an interactive scatter chart

# In[8]:
print(len(corpus.get_terms()))

# Let's restrict the set of terms we'll examine to ~4000 by performing a grid search.
# In[11]:
# Grid search over the ClassPercentageCompactor threshold to find one that
# leaves roughly 4,000 terms.
for threshold in range(1, 20):
    n_terms = len(corpus.compact(
        st.ClassPercentageCompactor(st.OncePerDocFrequencyRanker, threshold)
    ).get_terms())
    print('Threshold:', threshold, '# terms:', n_terms)

# In[17]:
compact_corpus = corpus.compact(
    st.ClassPercentageCompactor(st.OncePerDocFrequencyRanker, 14))

# In[22]:
html = st.produce_frequency_explorer(
    compact_corpus,
    category='offensive',
    category_name='Offensive',
    not_category_name='Inoffensive',
    term_scorer=st.RankDifference(),
    metadata=df.category_list.apply(', '.join),
    grey_threshold=0,
    use_full_doc=True
)

# In[23]:
file_name = 'offensive_vs_not.html'
# Context manager closes the handle before IFrame reads the file
# (original used open(...).write(...) and leaked the handle).
with open(file_name, 'wb') as f:
    f.write(html.encode('utf-8'))
IFrame(src=file_name, width=1400, height=800)

# ### How does word usage differ among single categories

# In[42]:
# Build a long dataframe where each row is tagged with exactly one category
# label; a comment carrying several toxic flags appears once per flag.
dfs = []
single_df_columns = ['id', 'parse', 'category_list', 'single_category']
for cat in toxic_categories:
    # .copy() avoids pandas' SettingWithCopyWarning when adding the column.
    new_df = df[df[cat] == 1].copy()
    new_df['single_category'] = cat
    dfs.append(new_df[single_df_columns])
new_df = df[df[toxic_categories].sum(axis=1) == 0].copy()
new_df['single_category'] = 'not-offensive'
dfs.append(new_df[single_df_columns])
single_df = pd.concat(dfs)
del dfs

# In[43]:
single_category_corpus = st.CorpusFromParsedDocuments(
    single_df,
    parsed_col='parse',
    category_col='single_category',
    feats_from_spacy_doc=st.UnigramsFromSpacyDoc()
).build()

# In[44]:
term_freq_df = st.OncePerDocFrequencyRanker(single_category_corpus).get_ranks()

# In[45]:
# For each category, score terms by scaled F-score of that category's term
# frequencies against the pooled frequencies of every other category, then
# show the ten most and least associated terms.
for c in single_category_corpus.get_categories():
    cat_col = c + ' freq'
    other_cols = [oc for oc in term_freq_df.columns
                  if oc != cat_col and oc.endswith(' freq')]
    scores = (st.ScaledFScorePresets(beta=1, one_to_neg_one=True)
              .get_scores(term_freq_df[cat_col],
                          term_freq_df[other_cols].sum(axis=1)))
    term_freq_df['score'] = scores
    print(c)
    print('+', list(term_freq_df.sort_values(by='score', ascending=False)
                    .iloc[:10].index))
    print('-', list(term_freq_df.sort_values(by='score', ascending=True)
                    .iloc[:10].index))

# In[79]:
# Narrow to a two-category corpus: severe_toxic vs. obscene.
cats_to_remove = [c for c in single_category_corpus.get_categories()
                  if c not in ('severe_toxic', 'obscene')]
tox_obs_corpus = single_category_corpus.remove_categories(cats_to_remove)
tox_obs_corpus = tox_obs_corpus.compact(
    st.ClassPercentageCompactor(term_count=2))

# In[81]:
priors = (st.PriorFactory(single_category_corpus,
                          category='severe_toxic',
                          not_categories=['obscene'])
          .use_neutral_categories()
          .align_to_target(tox_obs_corpus)
          .get_priors())

# In[82]:
term_ranker = st.OncePerDocFrequencyRanker
term_scorer = st.LogOddsRatioInformativeDirichletPrior(
    priors, scale_type='class-size', sigma=10)
rank_df = term_ranker(tox_obs_corpus).get_ranks()
rank_df['score'] = term_scorer.get_scores(rank_df['severe_toxic freq'],
                                          rank_df['obscene freq'])
print('toxic LORIDP',
      list(rank_df.sort_values(by='score', ascending=False).iloc[:10].index))
print('obscene LORIDP',
      list(rank_df.sort_values(by='score', ascending=True).iloc[:10].index))

# In[87]:
html = st.produce_frequency_explorer(
    tox_obs_corpus,
    category='obscene',
    not_categories=['severe_toxic'],
    minimum_term_frequency=10,
    term_scorer=term_scorer,
    term_ranker=term_ranker,
    metadata=tox_obs_corpus.get_df().category_list.apply(lambda x: ', '.join(x)),
)

# In[96]:
file_name = 'sev_tox_vs_obsc_corpus.html'
# Context manager closes the handle before IFrame reads the file.
with open(file_name, 'wb') as f:
    f.write(html.encode('utf-8'))
IFrame(src=file_name, width=1400, height=800)

# In[ ]:

# ## Repeated text seems like a good toxic feature

# In[92]:
# https://www.geeksforgeeks.org/searching-for-patterns-set-2-kmp-algorithm/
def computeLPSArray(s, M, lps):
    """Fill lps in place with the KMP longest-proper-prefix-suffix table for s[:M]."""
    s_len = 0  # length of the previous longest prefix that is also a suffix
    lps[0] = 0
    i = 1
    while i < M:
        if s[i] == s[s_len]:
            s_len += 1
            lps[i] = s_len
            i += 1
        elif s_len != 0:
            # Fall back to the next shorter prefix-suffix; do not advance i.
            s_len = lps[s_len - 1]
        else:
            lps[i] = 0
            i += 1


alpha = re.compile('[a-z]')


def isRepeat(s):
    """Return True if s (keeping only its lowercase letters) consists of a
    shorter string repeated two or more times, e.g. 'abab' or 'aaaa'."""
    s = ''.join(alpha.findall(s.lower()))
    n = len(s)
    if n == 0:
        return False
    lps = [0] * n
    computeLPSArray(s, n, lps)
    # s is fully periodic iff its longest prefix-suffix covers a whole
    # number of repetitions of the base period n - lps[n-1].
    return lps[n - 1] > 0 and n % (n - lps[n - 1]) == 0


# In[93]:
df['is_repeat'] = df.comment_text.apply(isRepeat)

# In[94]:
df.category.value_counts()
df.groupby('is_repeat').apply(lambda x: x.category.value_counts())

# In[95]:
# Dump every repeated-text comment with its category labels for inspection.
repeat_rows = df[df.is_repeat][['comment_text', 'category', 'category_list']]
for _, (text, cat, cats) in repeat_rows.iterrows():
    print(cat, cats)
    print('--')
    print(text)
    print('\n\n\n')

# In[ ]:

# In[ ]: