import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# --- Notebook setup: plotting backend, auto-reload, project imports, data load ---
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the "autoreload" extension (must come before the %autoreload / %aimport magics below)
%load_ext autoreload
# Mode 1: always reload modules marked with "%aimport" before executing code
%autoreload 1
# Add the 'src' directory as one where we can import modules.
# The path is built relative to the current working directory (its parent + 'src'),
# so the notebook has to be started from a directory that sits next to 'src'.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
# Mark the project's data-loading module for auto-reload, then import the loader from it
%aimport data.read_data
from data.read_data import read_data
# The call is unpacked into two objects; `train` is used as a DataFrame below.
# NOTE(review): the meaning of `test=True` is defined in data.read_data, not visible here.
train, test = read_data(test=True)
# Sanity check: dimensions and first rows of the training set
print(train.shape)
train.head()
(60000, 6)
 | ID | review_content | review_title | review_stars | product | Target |
---|---|---|---|---|---|---|
0 | 0 | En appelant un acheteur pour demander si l'écr... | La Police s'inscrit en acheteur privé sur Pric... | 5 | 2fbb619e3606f9b7c213e858a109cda771aa2c47ce50d5... | 0 |
1 | 1 | Alors, là, on a affaire au plus grand Navet ja... | Chef D'Oeuvre Absolu en vue... | 5 | 7b56d9d378d9e999d293f301ac43d044cd7b4786d09afb... | 1 |
2 | 2 | Effet garanti sur la terrase. Ils donnent immé... | Effet garanti sur la terrase. Ils donnent immé... | 3 | 7b37bf5dcb2fafd9229897910318a7dfa11a04ca36893c... | 0 |
3 | 3 | tres bon rapport qualite prix tre pratique en ... | bon produit | 4 | 77d2dbd504b933ab3aaf7cb0cd81c22f7c3549012f4f88... | 1 |
4 | 4 | Ordinateur de bureau trés bien pour quelqu'un ... | Apple Power MAC G4 | 3 | f574512e7d2dd1dd73c7f8f804bf16f14c932c5651a01b... | 1 |
# Overview of the label:
#   left  - number of reviews per `Target` class,
#   right - number of reviews per star rating, stacked by `Target`.
fig, axr = plt.subplots(1, 2, figsize=(14, 5))
# Pass the column as `x=`: seaborn 0.11 warns on a positional vector and, from
# 0.12 on, the only positional argument of countplot is `data`.
sns.countplot(x=train['Target'], ax=axr[0])
# Rows: review_stars, columns: Target, values: number of reviews.
stars_by_target = train.groupby(['Target', 'review_stars']).size().unstack('Target')
stars_by_target.plot(kind='bar', stacked=True, ax=axr[1])
<matplotlib.axes._subplots.AxesSubplot at 0x7f44f489ea20>
# Horizontal count plot restricted to the 20 products with the most reviews
# (value_counts sorts by descending frequency, so the first 20 are the top 20).
product_column = train['product']
top_products = product_column.value_counts().iloc[:20].index
sns.countplot(y=product_column, order=top_products)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8f260fe550>
"Appear only one {}, More than one time {}".format((train.groupby('product').size() == 1).sum(),
(train.duplicated('product', keep=False).sum()))
'Appear only one 40068, More than one time 19932'
# Histogram (log-scaled y axis) of how many reviews each product received,
# drawn separately for each Target class on the same axes. The legend entry
# of each class also reports the mean number of reviews per product.
label_templates = (
    (0, '0. Mean product reviews : {:.2f}'),
    (1, '1. Mean product reviews : {:.2f}'),
)
for target_value, template in label_templates:
    reviews_per_product = train[train['Target'] == target_value].groupby('product').size()
    reviews_per_product.plot(
        kind='hist',
        logy=True,
        bins=100,
        figsize=(10, 5),
        alpha=0.5,
        label=template.format(reviews_per_product.mean()),
    )
plt.legend(title='Target')
plt.xlabel('Product reviews per product')
plt.ylabel('Number of products')
Text(0,0.5,'Number of products')
def _log_word_counts(texts):
    """Return the natural log of the whitespace-separated word count of each text.

    Missing values are dropped first. Texts without any word are dropped too:
    their count is 0 and log(0) = -inf would break the density estimate.
    (The previous inline version evaluated `len(np.NaN)` for an empty string,
    which raises TypeError.)
    """
    word_counts = texts.dropna().apply(lambda text: len(text.split()))
    return np.log(word_counts[word_counts > 0])


# Distribution of log(word count) per Target class:
#   left  - review body (`review_content`),
#   right - review title (`review_title`).
# The 'Negative' / 'Positive' legend labels map to Target 0 / 1.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; kept here to
# stay compatible with the seaborn version this notebook was written for.
fig, axr = plt.subplots(1, 2, figsize=(14, 5))
for target_value, label in ((0, 'Negative'), (1, 'Positive')):
    reviews = train[train['Target'] == target_value]
    sns.distplot(_log_word_counts(reviews['review_content']), label=label, ax=axr[0])
    sns.distplot(_log_word_counts(reviews['review_title']), label=label, ax=axr[1])
axr[0].legend()
axr[1].legend()
<matplotlib.legend.Legend at 0x7f8f090c07b8>
# Count plot of the 20 most frequent raw (uncleaned) tokens in the review titles.
plt.figure(figsize=(14, 4))
# NOTE(review): 'rewiews' is a typo for 'reviews', and the tokens come from
# `review_title`; the string is left untouched here.
plt.title('Uncleaned words from rewiews')
# Split every title on whitespace and flatten to one long Series of tokens.
# Computed once: the expand/unstack step is expensive and was previously
# evaluated twice (once for the data, once for the ordering).
title_words = train['review_title'].str.split(expand=True).unstack()
# value_counts ignores missing cells and sorts by descending frequency.
most_common_words = title_words.value_counts()[:20].index
# Keyword `x=`: from seaborn 0.12 the only positional argument of countplot is `data`.
sns.countplot(x=title_words, order=most_common_words)
<matplotlib.axes._subplots.AxesSubplot at 0x7f8f26e73240>
# Load the French stop-word list, one word per line.
# The encoding is given explicitly so accented words are read the same way on
# every platform (assumes the file is UTF-8 - TODO confirm).
with open('../data/external/fr-stopwords.txt', encoding='utf-8') as fp:
    stopwords = fp.read().splitlines()

# Review titles of each class, without missing values.
positive = train[train['Target'] == 1]['review_title'].dropna().values
negative = train[train['Target'] == 0]['review_title'].dropna().values

# Side-by-side word clouds of the titles: Target 1 on the left (white
# background), Target 0 on the right (black background).
fig, axr = plt.subplots(1, 2, figsize=(16, 13))
wcP = WordCloud(background_color="white", max_words=10000,
                stopwords=stopwords, max_font_size=40)
wcN = WordCloud(background_color="black", max_words=10000,
                stopwords=stopwords, max_font_size=40)
# Each cloud is built from all titles of its class joined into one text.
wcP.generate(" ".join(positive))
wcN.generate(" ".join(negative))
axr[0].set_title('Positive')
axr[1].set_title('Negative')
# Fixed random_state keeps the recolouring reproducible between runs.
axr[0].imshow(wcP.recolor(colormap='viridis', random_state=17), alpha=0.98)
axr[1].imshow(wcN.recolor(colormap='Pastel1_r', random_state=17), alpha=0.98)
axr[0].axis('off')
axr[1].axis('off')
(-0.5, 399.5, 199.5, -0.5)