# GraphLab Create session: load a sample of Wikipedia abstracts for topic modeling.
import graphlab
# Render GraphLab Canvas visualizations inline in the IPython notebook.
graphlab.canvas.set_target("ipynb")
# header=False: the file has no header row, so the single text column is auto-named X1.
sf = graphlab.SFrame.read_csv("/Users/chengjun/bigdata/w15", header=False)
This non-commercial license of GraphLab Create is assigned to wangchengjun@nju.edu.cn and will expire on July 31, 2016. For commercial licensing options, visit https://dato.com/buy/.
2016-04-14 01:12:14,140 [INFO] graphlab.cython.cy_server, 176: GraphLab Create v1.8.5 started. Logging: /tmp/graphlab_server_1460567529.log
Finished parsing file /Users/chengjun/bigdata/w15
Parsing completed. Parsed 100 lines in 0.546547 secs.
------------------------------------------------------ Inferred types from first line of file as column_type_hints=[str] If parsing fails due to incorrect types, you can correct the inferred type list above and pass it to read_csv in the column_type_hints argument ------------------------------------------------------
Read 12278 lines. Lines per second: 12121.5
Finished parsing file /Users/chengjun/bigdata/w15
Parsing completed. Parsed 72269 lines in 2.23078 secs.
# Preview the SFrame (single text column X1); repr captured below.
sf
X1 |
---|
aynrand born and educated in russia rand migrated ... |
asphalt in american english asphalt or ... |
actinopterygii the actinopterygii consti ... |
altaiclanguages these language families share ... |
argon the name argon is derived from the greek ... |
augustderleth a 1938 guggenheim fellow der ... |
amateur amateurism can be seen in both a negative ... |
assemblyline an assembly line is a manufacturing ... |
astronomicalunit an astronomical unit ... |
abbess an abbess latin abbatissa feminine form ... |
# Inspect the methods available on the text SArray (note _count_words and the dict_* helpers used below).
dir(sf['X1'])
['_SArray__check_min_observations', '_SArray__construct_ctr', '__abs__', '__add__', '__and__', '__class__', '__contains__', '__delattr__', '__div__', '__doc__', '__eq__', '__floordiv__', '__format__', '__ge__', '__get_content_identifier__', '__getattribute__', '__getitem__', '__gt__', '__has_size__', '__hash__', '__init__', '__is_materialized__', '__iter__', '__le__', '__len__', '__lt__', '__materialize__', '__mod__', '__module__', '__mul__', '__ne__', '__neg__', '__new__', '__nonzero__', '__or__', '__pos__', '__pow__', '__proxy__', '__radd__', '__rdiv__', '__reduce__', '__reduce_ex__', '__repr__', '__rfloordiv__', '__rmod__', '__rmul__', '__rpow__', '__rsub__', '__rtruediv__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__sub__', '__subclasshook__', '__truediv__', '_count_ngrams', '_count_words', '_getitem_cache', '_save_as_text', 'all', 'any', 'append', 'apply', 'argmax', 'argmin', 'astype', 'clip', 'clip_lower', 'clip_upper', 'contains', 'cumulative_max', 'cumulative_mean', 'cumulative_min', 'cumulative_std', 'cumulative_sum', 'cumulative_var', 'date_range', 'datetime_to_str', 'dict_has_all_keys', 'dict_has_any_keys', 'dict_keys', 'dict_trim_by_keys', 'dict_trim_by_values', 'dict_values', 'dropna', 'dtype', 'fillna', 'filter', 'from_avro', 'from_const', 'from_sequence', 'head', 'item_length', 'max', 'mean', 'min', 'nnz', 'num_missing', 'pixel_array_to_image', 'rolling_count', 'rolling_max', 'rolling_mean', 'rolling_min', 'rolling_stdv', 'rolling_sum', 'rolling_var', 'sample', 'save', 'show', 'size', 'sketch_summary', 'sort', 'split_datetime', 'std', 'str_to_datetime', 'subslice', 'sum', 'tail', 'to_numpy', 'topk_index', 'unique', 'unpack', 'var', 'vector_slice']
# Bag-of-words: _count_words() turns each document string into a {word: count} dict.
bow = sf['X1']._count_words()
type(sf['X1'])
graphlab.data_structures.sarray.SArray
# Still an SArray — now holding one dict per document instead of a string.
type(bow)
graphlab.data_structures.sarray.SArray
# Per-document indicator (1/0, per the output below): does the word 'limited' occur in the document?
bow.dict_has_any_keys(['limited'])
dtype: int Rows: 72269 [1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ... ]
# First 20 word counts (dict values) of the first document.
bow.dict_values()[0][:20]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1]
# Attach the bag-of-words dicts as a new column of the SFrame.
sf['bow'] = bow
type(sf['bow'])
graphlab.data_structures.sarray.SArray
# One bag-of-words dict per document (72269, matching the parsed line count).
len(sf['bow'])
72269
# First five (word, count) pairs of the first document.
# NOTE: slicing .items() works because this is Python 2, where items() returns a list.
sf['bow'][0].items()[:5]
[('limited', 3), ('writings', 2), ('personally', 1), ('four', 1), ('controversial', 1)]
# TF-IDF weights per document: term frequency scaled by inverse document frequency over the corpus.
sf['tfidf'] = graphlab.text_analytics.tf_idf(sf['X1'])
# Same five words as the bow peek above, now with their tf-idf scores.
sf['tfidf'][0].items()[:5]
[('limited', 10.04705669672047), ('writings', 9.76010421134325), ('personally', 5.001941923280662), ('four', 2.1272386886969024), ('controversial', 4.375805453003677)]
# Visualize the SFrame in GraphLab Canvas, then print its first rows.
sf.show()
sf
X1 | tfidf | bow |
---|---|---|
aynrand born and educated in russia rand migrated ... |
{'limited': 10.04705669672047, ... |
{'limited': 3, 'writings': 2, ... |
asphalt in american english asphalt or ... |
{'all': 1.3891905239989626, ... |
{'all': 1, 'accadian': 1, 'similarity': 1, ... |
actinopterygii the actinopterygii consti ... |
{'andreolepis': 11.188150547181156, ... |
{'andreolepis': 1, 'all': 1, 'evolutionary': 2, ... |
altaiclanguages these language families share ... |
{'sergei': 20.031873121992916, ... |
{'sergei': 3, 'all': 6, 'todays': 1, 'chinese': ... |
argon the name argon is derived from the greek ... |
{'limited': 3.3490188989068232, ... |
{'limited': 1, 'embolism': 1, ... |
augustderleth a 1938 guggenheim fellow der ... |
{'evelyn': 6.7937013925087175, ... |
{'evelyn': 1, 'detective': 4, ... |
amateur amateurism can be seen in both a negative ... |
{'since': 1.8775124538896095, ... |
{'since': 1, 'subpar': 1, 'lack': 2, 'valuable' ... |
assemblyline an assembly line is a manufacturing ... |
{'all': 4.167571571996888, ... |
{'all': 3, 'concept': 6, 'consider': 1, 'chine ... |
astronomicalunit an astronomical unit ... |
{'precise': 5.491057060675752, 'a ... |
{'precise': 1, 'all': 2, 'chinese': 1, 'suns': 1, ... |
abbess an abbess latin abbatissa feminine form ... |
{'kildares': 11.188150547181156, ... |
{'kildares': 1, 'they': 4, 'founder': 1, ... |
# Prune rare terms: keep only (word, count) entries whose count is at least 2 per document.
docs = sf['bow'].dict_trim_by_values(2)
# exclude=True drops the listed keys, i.e. removes English stopwords from each document.
docs = docs.dict_trim_by_keys(graphlab.text_analytics.stopwords(), exclude=True)
# Fit a topic model with defaults (per the summary below: 10 topics, 10 collapsed-Gibbs iterations).
m = graphlab.topic_model.create(docs)
Learning a topic model
Number of documents 72269
Vocabulary size 171005
Running collapsed Gibbs sampling
+-----------+---------------+----------------+-----------------+
| Iteration | Elapsed Time | Tokens/Second | Est. Perplexity |
+-----------+---------------+----------------+-----------------+
| 10 | 2.48s | 8.92734e+06 | 0 |
+-----------+---------------+----------------+-----------------+
# Print the model summary: vocabulary size, hyperparameters, and accessible fields.
m
Class : TopicModel Schema ------ Vocabulary Size : 171005 Settings -------- Number of Topics : 10 alpha : 5.0 beta : 0.1 Iterations : 10 Training time : 3.4936 Verbose : False Accessible fields : m['topics'] : An SFrame containing the topics. m['vocabulary'] : An SArray containing the words in the vocabulary. Useful methods : m.get_topics() : Get the most probable words per topic. m.predict(new_docs) : Make predictions for new documents.
# Most probable words per topic (default: 5 words per topic, as the output below shows).
m.get_topics()
topic | word | score |
---|---|---|
0 | series | 0.018582602707 |
0 | time | 0.0160412461512 |
0 | played | 0.0142993990545 |
0 | back | 0.00951875933204 |
0 | game | 0.00839911774869 |
1 | war | 0.0176185833315 |
1 | film | 0.0159278169528 |
1 | group | 0.0140632734063 |
1 | party | 0.0103356107163 |
1 | year | 0.0102957274319 |
# Collapse the per-(topic, word) rows into one word list per topic:
# unstack packs each topic's (word, score) pairs into a dict, then we keep only the words.
# list(x.keys()) instead of x.keys() so the result is a list on Python 3 as well
# (on Python 2 it is identical to the original behavior).
topics = m.get_topics().unstack(['word','score'], new_column_name='topic_words')['topic_words'].apply(lambda x: list(x.keys()))
for topic in topics:
    print(topic)  # print() is valid in both Python 2 and 3 (the original `print topic` is Python 2 only)
['series', 'game', 'time', 'back', 'played'] ['club', 'city', 'work', 'season', 'league'] ['party', 'group', 'war', 'film', 'year'] ['years', 'university', 'law', 'year', 'time'] ['town', 'age', 'system', 'south', 'church'] ['school', 'de', 'river', 'family', 'century'] ['world', 'national', 'including', 'number', 'team'] ['states', 'city', 'state', 'population', 'government'] ['album', 'band', 'song', 'music', 'released'] ['land', 'game', 'army', 'local', 'area']
# Predict a topic assignment for each document.
pred = m.predict(docs)
pred.show()
# Re-predict, returning the full per-document topic probability distributions instead.
pred = m.predict(docs, output_type='probabilities')
# The model's vocabulary as an SArray of words.
m['vocabulary']
dtype: str Rows: 171005 ['duke', 'studies', 'journal', 'chris', 'research', 'matthew', 'crisis', 'financial', 'paul', '1987', 'reagan', 'traditional', 'rightwing', 'nominee', 'libertarianism', 'cato', 'chief', 'smith', 'line', 'south', 'nick', '1999', 'documentary', 'animated', 'shows', 'references', 'commentator', 'powerful', 'ethics', 'rush', 'neil', 'lives', 'cited', 'produced', 'night', 'originality', 'interest', '2007', 'individual', 'authors', 'admirer', 'married', 'club', 'library', 'essays', 'recent', '2009', 'burns', 'inspiration', 'artist', 'women', 'early', 'barbara', 'organized', 'gave', 'referred', 'company', 'personalist', 'criticism', 'john', 'reviewers', 'language', 'understanding', 'writes', 'fewer', 'attention', 'positive', 'masterful', 'review', 'times', 'critic', 'praise', 'theory', 'randian', 'importance', 'calling', 'nonfiction', 'academics', 'kant', 'philosophers', 'italian', 'remarked', 'wife', 'house', 'subject', 'scholarly', 'edward', 'system', 'influence', 'acknowledged', '100', 'branden', 'criticized', 'sacrificing', 'exist', 'selfinterest', 'rational', 'communism', 'journals', 'copies', ... ]
# Per-word topic probability vectors, aligned row-for-row with the vocabulary column.
m['topics']
topic_probabilities | vocabulary |
---|---|
[1.6417032014e-07, 1.42440301489e-07, ... |
duke |
[1.6417032014e-07, 1.42440301489e-07, ... |
studies |
[1.6417032014e-07, 1.42440301489e-07, ... |
journal |
[1.6417032014e-07, 1.42440301489e-07, ... |
chris |
[1.6417032014e-07, 1.42440301489e-07, ... |
research |
[0.000305520965781, 1.42440301489e-07, ... |
matthew |
[3.44757672294e-06, 1.42440301489e-07, ... |
crisis |
[1.6417032014e-07, 4.41564934616e-06, ... |
financial |
[1.6417032014e-07, 1.42440301489e-07, ... |
paul |
[1.6417032014e-07, 0.00033772595483, ... |
1987 |
def print_topics(m, num_words=5):
    """Print the most probable words of each topic in a fitted topic model.

    Parameters
    ----------
    m : topic model
        A fitted graphlab topic model exposing ``get_topics()``.
    num_words : int, optional
        Number of top words to show per topic. Defaults to 5, matching the
        original hard-coded behavior.
    """
    topics = m.get_topics(num_words=num_words)
    # get_topics() yields one row per (topic, word); unstack packs each
    # topic's (word, score) pairs into a single dict per topic.
    topics = topics.unstack(['word', 'score'], new_column_name='topic_words')['topic_words']
    # Keep only the words; list() so the output is a plain list on Python 3 too.
    topics = topics.apply(lambda x: list(x.keys()))
    for topic in topics:
        print(topic)  # print() works on both Python 2 and 3 (original used the py2-only statement form)
# Show the learned topics (top-5 words each).
print_topics(m)
# Warm start: fit a 20-topic model initialized from the 10 topics learned above.
m2 = graphlab.topic_model.create(docs,
num_topics=20,
initial_topics=m['topics'])
# Seed a word-topic association: pin the word 'recognition' to topic 0.
associations = graphlab.SFrame()
associations['word'] = ['recognition']
associations['topic'] = [0]
# Re-fit with the association constraint, 20 topics and 50 iterations, suppressing progress output.
m2 = graphlab.topic_model.create(docs,
num_topics=20,
num_iterations=50,
associations=associations,
verbose=False)
# Top-10 words per topic for the re-fit model, then the 5-word-per-topic summary.
m2.get_topics(num_words=10)
print_topics(m2)