import pandas as pd
# using Kaggle API https://github.com/Kaggle/kaggle-api
# Path to the UCI News Aggregator dataset downloaded via the Kaggle CLI.
DATA_FILE = "~/.kaggle/datasets/uciml/news-aggregator-dataset/uci-news-aggregator.csv"
# Keep a random 10% of the rows to speed up experimentation.
# NOTE(review): sample() has no random_state, so each run draws a different
# subset — the numbers shown below will not reproduce exactly.
news = pd.read_csv(DATA_FILE).sample(frac=0.1)
len(news)
42242
news.head(3)  # peek at the schema: ID, TITLE, URL, PUBLISHER, CATEGORY, STORY, HOSTNAME, TIMESTAMP
ID | TITLE | URL | PUBLISHER | CATEGORY | STORY | HOSTNAME | TIMESTAMP | |
---|---|---|---|---|---|---|---|---|
13529 | 13530 | Robotic fish designed to perform escape maneuv... | http://www.ecnmag.com/news/2014/03/robotic-fis... | ECNmag.com | t | dSmJK-WR4xv2inMKmnmxaRfd6cf1M | www.ecnmag.com | 1395059947658 |
254251 | 254697 | Faces & names: 'X-Men' climbs to $302 million ... | http://www.duluthnewstribune.com/content/faces... | Duluth News Tribune | e | d5poaO2w8Yffx6MDgPRQSF5POXCXM | www.duluthnewstribune.com | 1401174011596 |
27785 | 27786 | Which 'Divergent' Starlet Skipped Underwear fo... | http://www.cambio.com/2014/03/19/which-diverge... | Cambio | e | d55mX4D4wN3d5vMMYF9GgviF21QlM | www.cambio.com | 1395333837043 |
from sklearn.preprocessing import LabelEncoder
# Encode the CATEGORY letters as integer class ids (4 classes, matching the
# 4-column one-hot output shown further down).
encoder = LabelEncoder()
X = news['TITLE']
y = encoder.fit_transform(news['CATEGORY'])
from sklearn.model_selection import train_test_split
# Default split: 75% train / 25% test.
# NOTE(review): no random_state or stratify — the split differs on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y)
# You can also encode CATEGORY as a categorical one-hot vector.
# NOTE: the old LabelEncoder + reshape(-1, 1) detour was only needed before
# scikit-learn 0.20; OneHotEncoder now accepts string categories directly and
# orders them the same way LabelEncoder does (sorted), so this produces the
# identical array.
from sklearn.preprocessing import OneHotEncoder
OneHotEncoder().fit_transform(news[['CATEGORY']]).toarray()
array([[0., 0., 0., 1.], [0., 1., 0., 0.], [0., 1., 0., 0.], ..., [0., 1., 0., 0.], [0., 0., 1., 0.], [0., 0., 0., 1.]])
len(X_train)  # 75% of the sampled rows go to training
31681
len(X_test)  # remaining 25% held out for evaluation
10561
type(X_train)  # still a pandas Series of raw title strings
pandas.core.series.Series
X_train.head(3)  # raw headlines, not yet vectorized
155319 Facebook: Mobile Powers Growth 230004 Stocks rise ahead of Fed minutes; Dow jumps 10... 91687 GM Expected to Announce Major Investment in 20... Name: TITLE, dtype: object
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words features; min_df=3 drops tokens that appear in fewer than 3 titles.
vectorizer = CountVectorizer(min_df=3)
# fit_transform learns the vocabulary on the training set only; transform
# reuses that same vocabulary on the test set so both matrices share columns.
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
train_vectors
<31681x9886 sparse matrix of type '<class 'numpy.int64'>' with 267205 stored elements in Compressed Sparse Row format>
X_train.iloc[1]  # second training headline, for comparison with its vector below
'Stocks rise ahead of Fed minutes; Dow jumps 100 points'
train_vectors[1]  # its sparse row: 10 nonzero token counts out of 9,886 columns
<1x9886 sparse matrix of type '<class 'numpy.int64'>' with 10 stored elements in Compressed Sparse Row format>
type(train_vectors)  # scipy CSR sparse matrix
scipy.sparse.csr.csr_matrix
# term-count (bag-of-words) row — not strictly one-hot: entries are word
# counts for this title, mostly zeros
train_vectors[1].toarray()
array([[0, 0, 0, ..., 0, 0, 0]])
from sklearn.metrics import accuracy_score
# IPython extension that prints wall-clock time after each cell.
%load_ext autotime
from sklearn import tree
# Baseline 1: a single decision tree on the sparse count features.
dt = tree.DecisionTreeClassifier()
dt.fit(train_vectors, y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, presort=False, random_state=None, splitter='best')
time: 5.64 s
pred = dt.predict(test_vectors)
accuracy_score(y_test, pred, )  # ~0.81 test accuracy for the single tree
0.8092983618975476
time: 13.9 ms
from sklearn.ensemble import RandomForestClassifier
# Baseline 2: a small random forest (20 trees).
rf = RandomForestClassifier(n_estimators=20)
rf.fit(train_vectors, y_train)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini', max_depth=None, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
time: 8.03 s
pred = rf.predict(test_vectors)
accuracy_score(y_test, pred, )  # ~0.85 — the ensemble beats the single tree
0.8496354511883344
time: 247 ms
from sklearn.naive_bayes import MultinomialNB
# Baseline 3: multinomial Naive Bayes — the classic model for word-count features.
nb = MultinomialNB()
nb.fit(train_vectors, y_train)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
time: 20.6 ms
pred = nb.predict(test_vectors)
accuracy_score(y_test, pred, )  # ~0.91 — best of the three, and by far the fastest to fit
0.9051226209639238
time: 7.33 ms
%unload_ext autotime
from sklearn.pipeline import make_pipeline
# Chain raw text -> counts -> NB so the model accepts plain strings;
# LIME below needs exactly such a predict_proba(list_of_texts) callable.
c = make_pipeline(vectorizer, nb)
from lime.lime_text import LimeTextExplainer
# Map integer class ids back to the original category names for display.
explainer = LimeTextExplainer(class_names=list(encoder.classes_))
# .sample(1) picks one test headline at random to explain.
example = X_test.sample(1).iloc[0]
example
'Henry Cavill Is Still Super Handsome (But Way More Serious) in the First Official ...'
c.predict_proba([example])  # per-class probabilities for the sampled headline
array([[8.54676048e-04, 9.00036923e-01, 4.36873332e-02, 5.54210682e-02]])
%%capture
# Explain the pipeline's prediction for this headline; top_labels=2 keeps
# explanations only for the two most probable classes.
exp = explainer.explain_instance(example, c.predict_proba, top_labels=2)
exp.show_in_notebook()