#!/usr/bin/env python
# coding: utf-8

# Topic modeling of the 20 Newsgroups corpus with ktrain (notebook export).
#
# The script fetches the raw documents, trains an LDA topic model, builds the
# document-topic matrix, visualizes the result, predicts topics for a new
# document, and finally saves and restores the model.  It relies on
# `get_ipython()`, so it must be run inside IPython/Jupyter.
#
# NOTE(review): the topic ids quoted in the comments below are not consistent
# with each other (e.g. the Middle East topic is called #13 in one place and
# #27 in another) -- presumably they were copied from different runs.  Verify
# them against the output of your own run.

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
import numpy as np
import pandas as pd
# `None` means "do not truncate column contents".  The older `-1` sentinel was
# deprecated in pandas 1.0 and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)


# In[2]:


import ktrain
ktrain.__version__


# ## STEP 1: Get Raw Document Data

# In[3]:


# 20newsgroups
from sklearn.datasets import fetch_20newsgroups

# we only want to keep the body of the documents!
remove = ('headers', 'footers', 'quotes')

# fetch train and test data
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)

# compile the texts
texts = newsgroups_train.data + newsgroups_test.data

# let's also store the newsgroup category associated with each document
# we can display this information in visualizations
targets = list(newsgroups_train.target) + list(newsgroups_test.target)
categories = [newsgroups_train.target_names[target] for target in targets]


# ## STEP 2: Train an LDA Topic Model to Discover Topics
#
# The `get_topic_model` function learns a [topic model](https://en.wikipedia.org/wiki/Topic_model) using [Latent Dirichlet Allocation (LDA)](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation).

# In[4]:


get_ipython().run_cell_magic('time', '', 'tm = ktrain.text.get_topic_model(texts, n_features=10000)\n')


# We can examine the discovered topics using `print_topics`, `get_topics`, or `topics`. Here, we will use `print_topics`:

# In[5]:


tm.print_topics()


# From the above, we can immediately get a feel for what kinds of subjects are discussed within this dataset. For instance, Topic \#13 appears to be about the Middle East with labels: "*israel jews jewish israeli arab peace*".

# ## STEP 3: Compute the Document-Topic Matrix
#

# In[6]:


get_ipython().run_cell_magic('time', '', 'tm.build(texts, threshold=0.25)\n')


# Since the `build` method prunes documents based on threshold, we should prune the original data and any metadata in a similar way for consistency. This can be accomplished with the `filter` method.

# In[7]:


texts = tm.filter(texts)
categories = tm.filter(categories)


# This is useful to ensure all data and metadata are aligned with the same array indices in case we want to use them later (e.g., in visualizations).

# ## STEP 4: Inspect and Visualize Topics

# Let's list the topics by document count:

# In[8]:


tm.print_topics(show_counts=True)


# The topic with the most documents appears to be conversational questions, replies, and comments that aren't focused on a particular subject. Other topics are focused on specific domains (e.g., topic 27 with label "*jews israel jewish israeli arab muslims palestinian peace arabs land*").
#
# Notice that some topics contain only a few documents (e.g., topic \#48 about sex, marriage, and relationships). This is typically an indication that this topic is mentioned within documents that also mention other topics prominently (e.g., topics about government policy vs. individual rights).

# Let's visualize the corpus:

# In[9]:


tm.visualize_documents(doc_topics=tm.get_doctopics())


# Top-ranked document for topic \#74, which is about Christianity:

# In[10]:


print(tm.get_docs(topic_ids=[74], rank=True)[0]['text'])


# Let's visualize the "Christianity" topic (`topic_id=74`) and the "Medical" topic (`topic_id=15`)

# In[11]:


doc_topics = tm.get_doctopics(topic_ids=[15, 74])
tm.visualize_documents(doc_topics=doc_topics)


# ## STEP 5: Predicting the Topics of New Documents

# The `predict` method can predict the topic probability distribution for any arbitrary document directly from raw text:

# In[12]:


# Single definition of the example sentence; it is reused by the cells below
# (before and after the save/restore round trip) so they all score the
# exact same text.
example_doc = ('Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees '
               'the development and manufacturing of advanced rockets and spacecraft for missions '
               'to and beyond Earth orbit.')
tm.predict([example_doc])


# As expected, the highest topic probability for this sentence is from topic \#12 (third row and third column), which is about space and related things:

# In[13]:


tm.topics[np.argmax(tm.predict([example_doc]))]


# ## Saving and Restoring the Topic Model
#
# The topic model can be saved and restored as follows.
#
# **Save the Topic Model:**

# In[14]:


tm.save('/tmp/tm')


# **Restore the Topic Model and Rebuild the Document-Topic Matrix**

# In[15]:


tm = ktrain.text.load_topic_model('/tmp/tm')


# In[16]:


tm.build(texts, threshold=0.25)


# In[17]:


# Same lookup as In[13], now against the restored model.
tm.topics[np.argmax(tm.predict([example_doc]))]


# In[ ]:


# In[ ]: