#!/usr/bin/env python
# coding: utf-8

# Notebook export: train an LDA topic model over the 20 Newsgroups corpus
# with ktrain, then use it for one-class similarity scoring of documents.

# In[1]:

# IPython-only setup (auto-reload edited modules, inline matplotlib).
# NOTE(review): get_ipython() is only defined inside an IPython/Jupyter
# session; this script fails if run with plain `python`.
get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
import numpy as np
import pandas as pd
# Show full, untruncated cell contents in DataFrame displays.
# BUG FIX: max_colwidth=-1 was deprecated in pandas 1.0 and removed in 2.0
# (it raises ValueError there); None is the supported way to disable
# truncation and works on all pandas versions since 0.x.
pd.set_option('display.max_colwidth', None)

# In[2]:

import ktrain

# ## STEP 1: Get Raw Document Data

# In[3]:

# 20newsgroups: strip headers/footers/quotes so the topic model learns from
# message bodies only, then pool the train and test splits into one corpus.
from sklearn.datasets import fetch_20newsgroups
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
texts = newsgroups_train.data + newsgroups_test.data

# ## STEP 2: Train an LDA Topic Model to Discover Topics
#
# The `get_topic_model` function learns a
# [topic model](https://en.wikipedia.org/wiki/Topic_model) using
# [Latent Dirichlet Allocation (LDA)](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation).

# In[4]:

get_ipython().run_cell_magic('time', '', 'tm = ktrain.text.get_topic_model(texts, n_features=10000)\n')

# Compute the topic-probability distribution for each document. We need this
# to compute document similarity.

# In[5]:

get_ipython().run_cell_magic('time', '', 'tm.build(texts, threshold=0.25)\n')

# We can examine the discovered topics using `print_topics`, `get_topics`,
# or `topics`. Here, we will use `print_topics`:

# In[6]:

tm.print_topics()

# ## STEP 3: Select Topics of Interest
# Let's combine some technology-related documents into a set of positive
# examples of technology-focused posts. We can use these documents as seeds
# to find new documents about technology. To measure semantic similarity
# among documents, we will represent each document by its topic probability
# distribution that we computed above. We will also compile the document IDs
# for each document associated with these selected topics.
# In[9]:

# Topic IDs (as printed by tm.print_topics) chosen as technology-related
# seed topics.
tech_topics = [51, 85, 94, 22]

# ## Scoring Documents by Similarity
#
# We will score new documents based on how similar they are to the sample
# selected above using a
# [One-Class classifier](https://en.wikipedia.org/wiki/One-class_classification).

# In[10]:

# Train a one-class scorer on all documents assigned to the seed topics.
tm.train_scorer(topic_ids=tech_topics)

# We can now invoke the `scorer` method to measure the degree to which new
# documents are similar to our technology-related topics. Let's use `scorer`
# to measure the similarity of the remaining documents in the corpus. Note
# that, although we are applying the scorer to documents within the set
# corpus used to train the topic model, this is not required. Our `scorer`
# can be applied to any arbitrary set of documents.

# In[11]:

# Gather the text of every document NOT assigned to a seed topic.
other_topics = [i for i in range(tm.n_topics) if i not in tech_topics]
other_texts = [d['text'] for d in tm.get_docs(topic_ids=other_topics)]

# Let's score these documents and place into a Pandas dataframe.

# In[12]:

# score documents based on similarity
other_scores = tm.score(other_texts)

# In[13]:

# display results in Pandas dataframe; positive score => predicted inlier.
other_preds = [int(score > 0) for score in other_scores]
# sorted() consumes any iterable, so the zip needs no list() wrapper.
data = sorted(zip(other_preds, other_scores, other_texts),
              key=lambda item: item[1], reverse=True)
# BUG FIX: the original messages left the parenthesis unclosed and
# misspelled "Inliers" as "Inliners".
print('Top Inliers (or Most Similar to Our Technology-Related Topics)')
print('\t\tNumber of Predicted Inliers: %s' % sum(other_preds))
df = pd.DataFrame(data, columns=['Prediction', 'Score', 'Text'])
df.head()

# As you can see, we've found additional technology-related posts in the
# dataset.
#
# Our `scorer` assigns a score to each document, where higher scores indicate
# a higher degree of similarity to technology-related seed documents. The
# `scorer` implements a decision function to make binary decisions on
# similarity such that documents with positive scores are deemed as similar
# and negative scores are deemed dissimilar. We've used this to create a
# prediction of 1 for similar and 0 for dissimilar. This identifies 377
# documents as similar.
# The `scorer`, however, employs a One-Class classifier, which tends to be
# more strict. That is, there are likely documents with negative scores close
# to zero that are also similar. Let's look at these.

# In[14]:

df[df.Score <=0].head()

# As you can see, these documents are also similar and related to technology
# (albeit slightly different aspects of technology than that of our seed set
# of documents). Such negatively-scored documents are useful for identifying
# so-called informative examples. Since documents are sorted by score
# (descending order), we can start at the beginning of the dataframe
# containing negatively-scored documents and add documents to the positive
# class until we start seeing negative documents that are **not** related to
# technology. These informative negative examples can, then, be added to a
# negative class for training a traditional binary classifier. This process
# is referred to as
# [active learning](https://en.wikipedia.org/wiki/Active_learning_(machine_learning)).
#
# For instance, in this example, scores below -0.5 start to become
# **unrelated** to the themes covered by our technical topics.
# NOTE(review): the prose says -0.5 but the filter below uses -0.51 —
# confirm which threshold is intended.

# In[15]:

df[(df.Score<-0.51)].head()

# ## Using Keyword Searches to Construct Seed Sets
# Let's construct a set of seed documents from a keyword search instead of by
# LDA-discovered topics. Let's search all the documents for the word
# 'Christ':

# In[16]:

# Case-insensitive keyword search over the corpus held by the topic model.
results = tm.search('Christ', case_sensitive=False)

# There are 313 of them.
# In[17]:

len(results)

# Most documents in this set are about Christianity, as expected:

# In[18]:

print(results[0]['text'])

# Let's construct a positive class from these 313 documents and use them to
# find other religious documents:

# In[19]:

# train scorer from document IDs returned by keyword search
doc_ids = [doc['doc_id'] for doc in results]
tm.train_scorer(doc_ids=doc_ids)

# get text and scores of remaining documents.
# Use a set for membership tests: list lookup is O(n) per document, which
# makes the filter quadratic over the whole corpus.
seed_id_set = set(doc_ids)
other_texts = [d['text'] for d in tm.get_docs() if d['doc_id'] not in seed_id_set]
other_scores = tm.score(other_texts)

# display results in Pandas dataframe; positive score => predicted inlier.
other_preds = [int(score > 0) for score in other_scores]
data = sorted(zip(other_preds, other_scores, other_texts),
              key=lambda item: item[1], reverse=True)
# BUG FIX: the original header was copy-pasted from the technology example
# (it said "Technology-Related Topics" although this section scores
# similarity to the religion-related seed set from the 'Christ' search),
# left its parenthesis unclosed, and misspelled "Inliers" as "Inliners".
print('Top Inliers (or Most Similar to Our Religion-Related Documents)')
print('\t\tNumber of Predicted Inliers: %s' % sum(other_preds))
df = pd.DataFrame(data, columns=['Prediction', 'Score', 'Text'])
df.head(3)

# Here, we've easily found many other religion-focused documents that
# **do not** explicitly mention Christ.
#
# Notice that our One Class classifier classified 4759 documents as
# within-class. This is because many documents mentioning "Christ" may not be
# about religion or Christianity. As a result, our One-Class classifier
# predicts 1 for non-religious documents that are also similar to our
# positive set. See documents predicted as positive with low scores close to
# zero:

# In[23]:

df[df.Score > 0].tail()

# In[ ]: