from sklearn.datasets import fetch_20newsgroups
# Newsgroups used for this four-class text classification demo.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


def fetch_subset(subset):
    """Fetch one split of the 20 newsgroups data restricted to ``categories``.

    ``subset`` is passed straight through to ``fetch_20newsgroups``
    (this file calls it with ``'train'`` and ``'test'``).  Headers, footers
    and quotes are removed from every post, and the documents are shuffled
    with a fixed ``random_state`` so repeated runs see the same order.
    """
    return fetch_20newsgroups(
        subset=subset,
        categories=categories,
        shuffle=True,
        random_state=42,
        remove=('headers', 'footers', 'quotes'),
    )


train = fetch_subset('train')
test = fetch_subset('test')
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the two pipeline stages separately so that the fitted ``vec`` and
# ``clf`` objects can be handed to eli5 individually later on.
clf = LogisticRegressionCV()
vec = TfidfVectorizer()
pipeline = Pipeline(steps=[('vec', vec), ('clf', clf)])
# Left as a bare expression so the notebook still echoes the fitted pipeline.
pipeline.fit(train['data'], train['target'])
Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True, ...2', random_state=None, refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])
import eli5
from eli5 import explain_weights, explain_prediction
from eli5.formatters import format_as_html, format_as_text, format_html_styles, fields
# print(format_as_text(explain_weights(clf, vec, target_names=train['target_names'])))
# ``IPython.display`` is the public location of ``display`` and ``HTML``;
# importing them from ``IPython.core.display`` is deprecated.
from IPython.display import display, HTML


def show_html(html):
    """Render the HTML string ``html`` in the notebook output area."""
    return display(HTML(html))


def show_html_expl(expl, **kwargs):
    """Format the eli5 explanation ``expl`` as HTML and display it.

    Styles are left out of each rendered explanation
    (``include_styles=False``); they are emitted once, separately, via
    ``format_html_styles()``.  Any extra keyword arguments are forwarded
    unchanged to ``format_as_html``.
    """
    return show_html(format_as_html(expl, include_styles=False, **kwargs))


# Emit the shared styles once, since show_html_expl omits them.
show_html(format_html_styles())
eli5.show_weights(clf, vec=vec, target_names=train['target_names'], horizontal_layout=False)
y=alt.atheism top features
Weight? | Feature |
---|---|
+18.117 | atheism |
+16.558 | atheists |
+14.393 | religion |
+14.380 | bobby |
+14.325 | matthew |
+13.389 | motto |
+13.215 | atheist |
+13.010 | islam |
+12.800 | nanci |
+12.216 | enviroleague |
+12.109 | loans |
+11.672 | satan |
+11.488 | posting |
+11.173 | enlightening |
+11.108 | natural |
… 6382 more positive … | |
… 20478 more negative … | |
-11.259 | fake |
-11.526 | order |
-12.169 | christian |
-12.253 | hudson |
-18.551 | space |
y=comp.graphics top features
Weight? | Feature |
---|---|
+25.897 | graphics |
+18.957 | image |
+17.298 | computer |
+16.843 | 3d |
+16.190 | file |
+14.020 | points |
+13.269 | sgi |
+13.180 | 42 |
+12.428 | hi |
+11.835 | 3do |
+11.175 | animation |
+11.146 | using |
+10.877 | code |
+10.792 | package |
+10.681 | video |
+10.585 | screen |
+10.571 | sphere |
+10.570 | 68070 |
+10.553 | files |
… 7893 more positive … | |
… 18967 more negative … | |
-18.127 | space |
y=sci.space top features
Weight? | Feature |
---|---|
+35.983 | space |
+17.907 | orbit |
+15.269 | nasa |
+15.173 | launch |
+13.235 | spacecraft |
+12.872 | mars |
+12.369 | nick |
+12.117 | moon |
+12.064 | allen |
+11.800 | shuttle |
+11.799 | dc |
+10.934 | sci |
+10.726 | solar |
+10.716 | earth |
… 10083 more positive … | |
… 16777 more negative … | |
-10.976 | file |
-11.109 | wrong |
-11.886 | image |
-12.109 | religion |
-13.500 | god |
-18.002 | graphics |
y=talk.religion.misc top features
Weight? | Feature |
---|---|
+19.215 | christian |
+16.667 | blood |
+14.907 | fbi |
+14.185 | christians |
+12.783 | hudson |
+12.746 | order |
+12.338 | christ |
+12.126 | ekr |
+11.972 | terrorist |
+11.608 | koresh |
+11.549 | dead |
+11.185 | cult |
… 6600 more positive … | |
… 20260 more negative … | |
-11.206 | anyone |
-11.567 | could |
-11.699 | get |
-12.212 | thanks |
-12.230 | edu |
-12.319 | it |
-13.026 | atheists |
-17.289 | space |
# Explain the prediction for test document 2, then render it with
# force_weights=False and horizontal_layout=True.
expl = explain_prediction(clf, test['data'][2], vec, target_names=train['target_names'])
show_html_expl(expl, force_weights=False, horizontal_layout=True)
Explained as: linear model
y=alt.atheism (probability 0.000, score -16.171) top features
Contribution? | Feature |
---|---|
-1.394 | <BIAS> |
-14.777 | Highlighted in text (sum) |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
y=comp.graphics (probability 0.999, score 8.616) top features
Contribution? | Feature |
---|---|
+9.631 | Highlighted in text (sum) |
-1.015 | <BIAS> |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
y=sci.space (probability 0.001, score -6.824) top features
Contribution? | Feature |
---|---|
-1.016 | <BIAS> |
-5.808 | Highlighted in text (sum) |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
y=talk.religion.misc (probability 0.000, score -11.885) top features
Contribution? | Feature |
---|---|
-1.019 | <BIAS> |
-10.865 | Highlighted in text (sum) |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
dense_multitarget=True
is supported for prediction explanations too, and shows just the top prediction highlighting.
# Same document (test item 2), this time rendered with force_weights=True.
expl = explain_prediction(clf, test['data'][2], vec, target_names=train['target_names'])
show_html_expl(expl, force_weights=True)
Explained as: linear model
y=alt.atheism (probability 0.000, score -16.171) top features | y=comp.graphics (probability 0.999, score 8.616) top features | y=sci.space (probability 0.001, score -6.824) top features | y=talk.religion.misc (probability 0.000, score -11.885) top features | ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
|
|
|
y=alt.atheism (probability 0.000, score -16.171) top features
Contribution? | Feature |
---|---|
-1.394 | <BIAS> |
-14.777 | Highlighted in text (sum) |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
y=comp.graphics (probability 0.999, score 8.616) top features
Contribution? | Feature |
---|---|
+9.631 | Highlighted in text (sum) |
-1.015 | <BIAS> |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
y=sci.space (probability 0.001, score -6.824) top features
Contribution? | Feature |
---|---|
-1.016 | <BIAS> |
-5.808 | Highlighted in text (sum) |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
y=talk.religion.misc (probability 0.000, score -11.885) top features
Contribution? | Feature |
---|---|
-1.019 | <BIAS> |
-10.865 | Highlighted in text (sum) |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
We can hide weights by passing force_weights=False
(they will still be shown if it's impossible to highlight the text).
# Explain the prediction for test document 4 and render it with force_weights=False.
expl = explain_prediction(clf, test['data'][4], vec, target_names=train['target_names'])
show_html_expl(expl, force_weights=False)
Explained as: linear model
y=alt.atheism (probability 0.001, score -7.516) top features
Contribution? | Feature |
---|---|
-1.394 | <BIAS> |
-6.122 | Highlighted in text (sum) |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?
y=comp.graphics (probability 0.999, score 6.432) top features
Contribution? | Feature |
---|---|
+7.447 | Highlighted in text (sum) |
-1.015 | <BIAS> |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?
y=sci.space (probability 0.000, score -10.113) top features
Contribution? | Feature |
---|---|
-1.016 | <BIAS> |
-9.098 | Highlighted in text (sum) |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?
y=talk.religion.misc (probability 0.000, score -11.681) top features
Contribution? | Feature |
---|---|
-1.019 | <BIAS> |
-10.662 | Highlighted in text (sum) |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?
Show explanations for the winning class for the first 10 documents from the test data
import numpy as np
# Explain only the top target (top_targets=1) for each of the first ten
# test documents, rendering one explanation per document.
for doc in test['data'][:10]:
    expl = explain_prediction(clf, doc, vec, target_names=train['target_names'], top_targets=1)
    show_html_expl(expl, force_weights=False)
Explained as: linear model
y=sci.space (probability 0.979, score 5.057) top features
Contribution? | Feature |
---|---|
+6.073 | Highlighted in text (sum) |
-1.016 | <BIAS> |
trry the skywatch project in arizona.
Explained as: linear model
y=comp.graphics (probability 0.999, score 6.193) top features
Contribution? | Feature |
---|---|
+7.208 | Highlighted in text (sum) |
-1.015 | <BIAS> |
the vatican library recently made a tour of the us. can anyone help me in finding a ftp site where this collection is available.
Explained as: linear model
y=comp.graphics (probability 0.999, score 8.616) top features
Contribution? | Feature |
---|---|
+9.631 | Highlighted in text (sum) |
-1.015 | <BIAS> |
hi there, i am here looking for some help. my friend is a interior decor designer. he is from thailand. he is trying to find some graphics software on pc. any suggestion on which software to buy,where to buy and how much it costs ? he likes the most sophisticated software(the more features it has,the better)
Explained as: linear model
y=comp.graphics (probability 0.994, score 3.280) top features
Contribution? | Feature |
---|---|
+4.294 | Highlighted in text (sum) |
-1.015 | <BIAS> |
rfd request for discussion for the open telematic group otg i have proposed the forming of a consortium/task force for the promotion of naplps/jpeg, fif to openly discuss ways, method, procedures,algorythms, applications, implementation, extensions of naplps/jpeg standards. these standards should facilitate the creation of real_time online applications that make use of voice, video, telecommuting, hires graphics, conferencing, distant learning, online order entry, fax,in addition these dicussion would assist all to better understand how sgml, cals, oda, mime, oodbms, jpeg, mpeg, fractals, sql, cdrom, cdromxa, kodak photocd, tcl, v.fast, and eia/tia562, can best be incorporated and implemented to develop telematic/multimedia applications. we want to be able to support dos, unix, mac, windows, nt, os/2 platforms. it is our hope that individuals, developers, corporations, universities, r & d labs would join in in supporting such an endeavor. this would be a not_for_profit group with bylaws and charter. already many corporations have decided to support otg (open telematic group) so do not delay joining if you are a developer an rfd has been posted to form a usenet newsgroup and a faq will soon be be composed to start promulgating what is known on the subject. if you would like to be added to the maillist send email or mail to the address below. this group would publish an electronic quarterly naplps/jpeg newsletter as well as a hardcopy version. we urge all who wants to see cmcs hires based applications & the naplps/jpeg g r o w, decide to join and mutually benefit from this not-for_profit endeavor. note: telematic has been defined by mr. james martin as the marriage of voice, video, hi-res graphics, fax, ivr, music over telephone lines/lan. if you would like to get involve write to me at: img inter-multimedia group| internet: epimntl@world.std.com p.o. 
box 95901 | ed.pimentel@gisatl.fidonet.org atlanta, georgia, us | cis : 70611,3703 | fidonet : 1:133/407 | bbs : +1-404-985-1198 zyxel 14.4k
Explained as: linear model
y=comp.graphics (probability 0.999, score 6.432) top features
Contribution? | Feature |
---|---|
+7.447 | Highlighted in text (sum) |
-1.015 | <BIAS> |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?
Explained as: linear model
y=comp.graphics (probability 0.643, score 0.494) top features
Contribution? | Feature |
---|---|
+1.509 | Highlighted in text (sum) |
-1.015 | <BIAS> |
i'm also interested in such a program. but most of all i'd like to know wich program is able to convert gif or pcx to dxf !!! when i have this program, i can scan pictures and frase (or something like that !) them. this will be beyond the limit !!!
Explained as: linear model
y=comp.graphics (probability 0.412, score -0.996) top features
Contribution? | Feature |
---|---|
+0.019 | Highlighted in text (sum) |
-1.015 | <BIAS> |
or how about: "end light pollution now!!" your banner would have no effect on its subject, but my banner would.
Explained as: linear model
y=sci.space (probability 1.000, score 10.393) top features
Contribution? | Feature |
---|---|
+11.409 | Highlighted in text (sum) |
-1.016 | <BIAS> |
: while i'm sure sagan considers it sacrilegious, that wouldn't be : because of his doubtfull credibility as an astronomer. modern, : ground-based, visible light astronomy (what these proposed : orbiting billboards would upset) is already a dying field: the : opacity and distortions caused by the atmosphere itself have : driven most of the field to use radio, far infrared or space-based : telescopes. hardly. the keck telescope in hawaii has taken its first pictures; they're nearly as good as hubble for a tiny fraction of the cost. : in any case, a bright point of light passing through : the field doesn't ruin observations. if that were the case, the : thousands of existing satellites would have already done so (satelliets : might not seem so bright to the eyes, but as far as astronomy is concerned, : they are extremely bright.) i believe that this orbiting space junk will be far brighter still; more like the full moon. the moon upsets deep-sky observation all over the sky (and not just looking at it) because of scattered light. this is a known problem, but of course two weeks out of every four are ok. what happens when this billboard circles every 90 minutes? what would be a good time then? : frank crary : cu boulder
Explained as: linear model
y=alt.atheism (probability 0.991, score 8.925) top features
Contribution? | Feature |
---|---|
+10.319 | Highlighted in text (sum) |
-1.394 | <BIAS> |
not if you show that these hypothetical atheists are gullible, excitable and easily led from some concrete cause. in that case we would also have to discuss if that concrete cause, rather than atheism, was the factor that caused their subsequent behaviour.
Explained as: linear model
y=sci.space (probability 0.850, score -0.580) top features
Contribution? | Feature |
---|---|
+0.436 | Highlighted in text (sum) |
-1.016 | <BIAS> |
picture our universe floating like a log in a river. as the log floats down the river, it occasionally strikes rocks, the bank, the bottom, other logs. when this collission occurs, kinetic energy is translated into heat, the log degrades, gets scraped up, and other energy translaions occur. the distribution of damage to the log depends on the shape of the log. however, to a very small virus in a mite on the head of a termite in the center of the log, the shock waves from the collissions would appear uniformly random in direction. this is my theory for grb. they are evidence of our universe interacting with other universes! why not! makes just as much sense as the grb coming from the oort cloud! the log theory of universes can't be ruled out! of course, i'm a layman in the physics world. you physicists out there, tell me about this !!!!
Now use a vectorizer that skips stopwords
# Second pipeline: same classifier type, but the vectorizer is configured
# with stop_words='english'.
clf_stop = LogisticRegressionCV()
vec_stop = TfidfVectorizer(stop_words='english')
pipeline_stop = Pipeline(steps=[('vec', vec_stop), ('clf', clf_stop)])
# Left as a bare expression so the notebook still echoes the fitted pipeline.
pipeline_stop.fit(train['data'], train['target'])
Pipeline(steps=[('vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict', dtype=<class 'numpy.int64'>, encoding='utf-8', input='content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True, ...2', random_state=None, refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])
Words such as "the", "in", "of" are not used as features and are not highlighted
# Explain test document 4 again, this time with the stop-word-filtering
# vectorizer and the classifier trained on its features.
expl = explain_prediction(clf_stop, test['data'][4], vec_stop, target_names=train['target_names'])
show_html_expl(expl, force_weights=False)
Explained as: linear model
y=alt.atheism (probability 0.000, score -7.794) top features
Contribution? | Feature |
---|---|
-1.395 | <BIAS> |
-6.399 | Highlighted in text (sum) |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?
y=comp.graphics (probability 0.999, score 5.992) top features
Contribution? | Feature |
---|---|
+7.011 | Highlighted in text (sum) |
-1.018 | <BIAS> |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?
y=sci.space (probability 0.000, score -7.692) top features
Contribution? | Feature |
---|---|
-1.017 | <BIAS> |
-6.675 | Highlighted in text (sum) |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?
y=talk.religion.misc (probability 0.000, score -10.365) top features
Contribution? | Feature |
---|---|
-1.070 | <BIAS> |
-9.294 | Highlighted in text (sum) |
i am interested in finding 3d animation programs for the mac. i am especially interested in any programs that don't exist in a pc port and are so good that they would make me go buy a mac. do any such exist?