#!/usr/bin/env python
# coding: utf-8

# In[1]:

get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

# In[2]:

import ktrain

# # Learning from Unlabeled Text Data
# 
# Unlabeled, unstructured text and document data abound, and it is often necessary to "make sense" of these data for various applications. Examples include:
# - *exploratory analysis of text data*: provide rich overviews of the information space to discover relevant information for which one may not have even known to look
# - *building training sets for text classification*: identify positive and negative example documents to train a [text classifier](https://en.wikipedia.org/wiki/Document_classification) in a semi-automated fashion
# - *document similarity*: measure the semantic similarity between documents or sets of documents
# - *document recommender systems*: given a specific document of interest, recommend other documents that are semantically similar to it
# 
# Each of these examples involves **learning from largely unlabeled text data**. In this notebook, we will show you how to accomplish the above with minimal coding using *ktrain*. The *ktrain* library is an open-source, augmented ML library built around Keras and scikit-learn. It can be installed with `pip3 install ktrain` and is [available on GitHub](https://github.com/amaiya/ktrain).
# 
# We will use the well-known [20-newsgroup dataset](http://qwone.com/~jason/20Newsgroups/) for this demonstration.

# ## Get Raw Document Data

# In[3]:

# 20newsgroups
from sklearn.datasets import fetch_20newsgroups

# we only want to keep the body of the documents!
remove = ('headers', 'footers', 'quotes')

# fetch train and test data
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)

# compile the texts
texts = newsgroups_train.data + newsgroups_test.data

# let's also store the newsgroup category associated with each document
# we can display this information in visualizations
targets = list(newsgroups_train.target) + list(newsgroups_test.target)
categories = [newsgroups_train.target_names[target] for target in targets]

# We are loading the targets (i.e., newsgroup categories), but we will not use them to learn a model. Rather, they simply serve as an example of how to incorporate metadata about documents into visualizations and analyses.

# ## Train an LDA Topic Model to Discover Topics
# 
# The `get_topic_model` function learns a [topic model](https://en.wikipedia.org/wiki/Topic_model) using [Latent Dirichlet Allocation (LDA)](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation).
# 
# The `n_features` argument specifies the size of the vocabulary, and the `n_topics` argument sets the number of topics (or clusters) to discover.

# In[4]:

get_ipython().run_cell_magic('time', '', 'tm = ktrain.text.get_topic_model(texts, n_topics=None, n_features=10000)\n')

# We can examine the discovered topics using `print_topics`, `get_topics`, or `topics`. Here, we will use `print_topics`:

# In[5]:

tm.print_topics()

# From the above, we can immediately get a feel for what kinds of subjects are discussed in this dataset. For instance, Topic \#13 appears to be about the Middle East with labels: "*israel jews jewish israeli arab peace*".
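# Besides printing, the topic labels are also available programmatically via the `topics` property (used throughout the rest of this notebook), which is handy for building lookup tables. A minimal sketch:

# In[ ]:

# map topic_id -> label string (e.g., 13 -> 'israel jews jewish israeli arab peace')
topic_labels = dict(enumerate(tm.topics))
topic_labels[13]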
# We can examine the word weights for this topic, where each "weight" is a pseudo-count (which can be converted to a probability by normalizing over all words in the vocabulary):

# In[6]:

tm.get_word_weights(topic_id=13, n_words=25)

# ## Computing the Document-Topic Matrix
# 
# We will now pre-compute the document-topic matrix. Each row in this matrix represents a document, and the columns represent the probability distribution over the 97 discovered topics. This allows us to easily see what kinds of topics are covered by any specific document in the original corpus.
# 
# When computing the document-topic matrix, we will also filter out documents whose maximum topic probability is less than 0.25 in order to consider only the most representative documents for each topic. This may improve the clarity of visualizations (shown later) by removing "unfocused" documents.

# In[7]:

get_ipython().run_cell_magic('time', '', 'tm.build(texts, threshold=0.25)\n')

# Since the `build` method prunes documents based on the threshold, we should prune the original data and any metadata in the same way for consistency. This can be accomplished with the `filter` method.

# In[8]:

texts = tm.filter(texts)
categories = tm.filter(categories)

# This ensures that all data and metadata remain aligned with the same array indices in case we want to use them later (e.g., in visualizations).

# Having computed the document-topic matrix, we can now easily access the topic probability distribution for any document in the corpus using `get_doctopics`. For instance, this document in the corpus is about sports:

# In[9]:

print(texts[35])

# And here is the topic probability distribution for this document:

# In[10]:

tm.get_doctopics(doc_ids=[35])

# As expected, the highest topic probability (69%) is associated with a topic about sports:

# In[11]:

tm.topics[np.argmax(tm.get_doctopics(doc_ids=[35]))]

# ## Predicting the Topics of New Documents
# The `predict` method can predict the topic probability distribution of any arbitrary document directly from raw text:

# In[12]:

tm.predict(['Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees ' +
            'the development and manufacturing of advanced rockets and spacecraft for missions ' +
            'to and beyond Earth orbit.'])

# As expected, the highest topic probability for this sentence comes from topic \#12 (third row and third column of the printed array), which is about space and related subjects:

# In[13]:

tm.topics[np.argmax(tm.predict(['Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees ' +
                                'the development and manufacturing of advanced rockets and spacecraft for missions ' +
                                'to and beyond Earth orbit.']))]

# ## Visualizing Topics
# Let's take another look at the list of discovered topics, this time sorted by document count.

# In[14]:

tm.print_topics(show_counts=True)

# The topic with the most documents appears to consist of conversational questions, replies, and comments that aren't focused on a particular subject. Other topics are focused on specific domains (e.g., topic 15 is about **medicine** with the label "*medical health disease cancer patients drug treatment*").
# 
# We can easily generate an interactive visualization of the documents under consideration using `visualize_documents`:

# In[15]:

tm.visualize_documents(doc_topics=tm.get_doctopics())

# The visualization allows you to hover over points to inspect documents. The `extra_info` argument to `visualize_documents` allows you to customize what is displayed in the hover pop-up.
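# For example, here is a minimal sketch that adds each document's newsgroup category to the pop-up (this assumes `categories` is aligned with the rows of the document-topic matrix, which the earlier call to `filter` ensured):

# In[ ]:

# show the newsgroup category of each point on hover
tm.visualize_documents(doc_topics=tm.get_doctopics(),
                       extra_info={'cat': categories})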
# ## Inspecting Topics
# The `get_docs` method allows you to retrieve document data by `doc_id` or `topic_id`. When `rank=True`, documents are sorted by relevance to the topic. This is particularly useful for inspecting the documents most relevant to each topic.
# 
# Consider Topic \#51, which appears to be about computer hardware:

# In[16]:

tm.topics[51]

# Let's examine the single most relevant document to this topic:

# In[17]:

doc = tm.get_docs(topic_ids=[51], rank=True)[0]
print('DOC_ID: %s' % (doc['doc_id']))
print('TOPIC SCORE: %s' % (doc['topic_proba']))
print('TOPIC_ID: %s' % (doc['topic_id']))
print('TEXT: %s' % (doc['text']))

# Looks right to me. Note that the `get_docs` method returns a list of dicts with the following keys:
# - `text`: the raw text of the document
# - `doc_id`: the index into the array returned by `get_doctopics`
# - `topic_id`: the index of the topic in `range(n_topics)`
# - `topic_proba`: the relevance of this document to the topic represented by `topic_id`
# 
# When `rank=True`, the dicts within each `topic_id` are sorted in descending order by the `topic_proba` score. Hence, the first item is the most relevant document to the selected topic (`topic_id=51`). If `rank=False`, results are sorted in ascending order by `doc_id` (i.e., the same order as the `texts` supplied as input to `build`).
# 
# #### A Note About `get_docs` vs. `get_sorted_docs`
# When we executed `print_topics(show_counts=True)` above, you may have noticed that topics towards the bottom of the list had only a few documents, or even one. For instance, `topic_id=48` is about *sex, marriage, and relationships* and is associated with only **one** document. This is because these counts are generated by assigning each document to the one topic to which it is most related. Does this mean there is only one document talking about *sex, marriage, and relationships*? No. Rather, it just means there is only one document that is **most** relevant to this topic over all other topics. Other documents may pertain to *sex, marriage, and relationships* but were determined to pertain more to another topic. That is, although these other documents mention *sex, marriage, and relationships*, their primary topic was determined to be something else.
# 
# When invoking `tm.get_docs(topic_ids=[48])`, this means only one document will be returned, since only a single document is primarily related to that topic. To see other documents that mention *sex, marriage, and relationships* (i.e., `topic_id=48`), we can use the `get_sorted_docs` method instead. The `get_sorted_docs` method returns **all documents** sorted by relevance to the given `topic_id`. For instance, here is the second most relevant document to *sex, marriage, and relationships*. Although it pertains to this topic, it was assigned to `topic_id=42` (a *people and governments* topic) because it discusses sex in the context of society and government.

# In[18]:

doc = tm.get_sorted_docs(topic_id=48)[1]
print('DOC_ID: %s' % (doc['doc_id']))
print('TOPIC SCORE: %s' % (doc['topic_proba']))
print('TOPIC_ID: %s' % (doc['topic_id']))
print('TEXT: %s' % doc['text'].strip())

# Next, let's examine some additional topics that appear related to the larger theme of technology.
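# One way to locate such topics is to scan the topic labels for relevant keywords — a rough sketch, assuming simple substring matching is good enough for a first pass:

# In[ ]:

# find candidate technology-related topics by scanning the topic labels
[(i, label) for i, label in enumerate(tm.topics)
 if any(word in label for word in ('window', 'software', 'program', 'encryption'))]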
# Here is a topic that appears to be about Windows software:

# In[19]:

tm.topics[85]

# In[20]:

tm.get_docs(topic_ids=[85], rank=True)[0]

# A topic about **programming**:

# In[21]:

tm.topics[94]

# In[22]:

tm.get_docs(topic_ids=[94], rank=True)[0]

# A topic about **cryptography**:

# In[23]:

tm.topics[22]

# In[24]:

tm.get_docs(topic_ids=[22], rank=True)[0]

# ## Compiling a Sample of Interesting Documents
# Let's combine these technology-related documents into a set of positive examples of technology-focused posts. We can use these documents as seeds to find new documents about technology. To measure semantic similarity among documents, we will represent each document by its topic probability distribution.

# In[25]:

tech_topics = [51, 85, 94, 22]
tech_probs = tm.get_doctopics(topic_ids=tech_topics)
doc_ids = [doc['doc_id'] for doc in tm.get_docs(topic_ids=tech_topics)]

# Let's visualize these technology-focused documents. We will also compile the original newsgroup categories for each document so that they can be included in the visualization. (This is why we invoked the `filter` method earlier.)

# In[26]:

newsgroup_categories = [categories[doc_id] for doc_id in doc_ids]
tm.visualize_documents(doc_topics=tech_probs,
                       extra_info={'cat': newsgroup_categories, 'doc_id': doc_ids})

# ## Scoring Documents by Similarity
# 
# Once you've identified a set of documents that are interesting to your use case, you may want to identify additional documents that are semantically similar to this set. Here, suppose we wanted to identify new documents related to computer technology. We can accomplish this with the `train_scorer` and `score` methods. The `train_scorer` method compiles a list of seed documents based on supplied `topic_ids` or `doc_ids`. The `score` method scores new documents based on their similarity to the seed documents. Internally, this is accomplished by training a rudimentary [One-Class classifier](https://en.wikipedia.org/wiki/One-class_classification). While this classifier can be used as is, the `score` method is also useful for compiling a training set for a traditional binary classifier.

# In[27]:

tm.train_scorer(topic_ids=tech_topics)

# We can now invoke the `score` method to measure the degree to which new documents are similar to our technology-related topics. Note that, although we are applying the scorer to documents within the same corpus used to train the topic model, this is not required. Our `scorer` can be applied to any arbitrary set of documents.
# 
# Let's retrieve the text of all documents **not** associated with our selected technology-focused topics. These documents, stored in `other_texts`, come from the original corpus, but you could also construct `other_texts` from an entirely new, unseen corpus of documents.

# In[28]:

other_topics = [i for i in range(tm.n_topics) if i not in tech_topics]
other_texts = [d['text'] for d in tm.get_docs(topic_ids=other_topics)]

# Let's score these documents and place the results into a Pandas DataFrame.
# In[29]:

# score documents based on similarity to the technology-related seed documents
other_scores = tm.score(other_texts)

# In[30]:

# display results in a Pandas DataFrame
other_preds = [int(score > 0) for score in other_scores]
data = sorted(list(zip(other_preds, other_scores, other_texts)),
              key=lambda item: item[1], reverse=True)
print('Top Inliers (or Most Similar to Our Technology-Related Topics)')
print('\t\tNumber of Predicted Inliers: %s' % sum(other_preds))
df = pd.DataFrame(data, columns=['Prediction', 'Score', 'Text'])
df.head()

# As you can see, we've found additional technology-related posts in the dataset.
# 
# Our `scorer` assigns a score to each document, where higher scores indicate a higher degree of similarity to the technology-related seed documents. The `scorer` implements a decision function to make binary decisions on similarity, such that documents with positive scores are deemed similar and documents with negative scores are deemed dissimilar. We've used this to create a prediction of 1 for similar and 0 for dissimilar. This identifies 377 documents as similar. The `scorer`, however, employs a One-Class classifier, which tends to be strict. That is, there are likely documents with negative scores close to zero that are also similar. Let's look at these.

# In[31]:

df[df.Score <= 0].head()

# As you can see, these documents are also related to technology (albeit slightly different aspects of technology than those of our seed set). Such negatively-scored documents are useful for identifying so-called informative examples. Since documents are sorted by score in descending order, we can start at the beginning of the negatively-scored portion of the DataFrame and add documents to the positive class until we start seeing negative documents that are **not** related to technology. These informative negative examples can then be added to a negative class for training a traditional binary classifier. This process is referred to as [active learning](https://en.wikipedia.org/wiki/Active_learning_(machine_learning)).
# 
# For instance, in this example, scores below roughly -0.5 start to become **unrelated** to the themes covered by our technical topics.

# In[32]:

df[(df.Score < -0.51)].head()

# ## Using Keyword Searches to Construct Seed Sets
# Let's construct a set of seed documents from a keyword search instead of from LDA-discovered topics. We'll search all the documents for the word 'Christ':

# In[33]:

results = tm.search('Christ', case_sensitive=False)

# There are 313 of them.

# In[34]:

len(results)

# Many documents in this set are about Christianity, as expected:

# In[35]:

print(results[0]['text'])

# However, since we compiled the seed set of documents based on the keyword "Christ", some documents in the set may be only loosely related to Christianity (if at all). We will see below how this impacts results.
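# As a quick sanity check, we can tally the primary topics of these search results — a sketch that assumes each result dict carries the same keys as those returned by `get_docs` (including `topic_id`):

# In[ ]:

from collections import Counter

# count how many of the search results are primarily assigned to each topic
Counter(doc['topic_id'] for doc in results).most_common(5)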
# Let's construct a positive class from these 313 documents and use them to find other religious documents:

# In[36]:

# compile doc_ids
doc_ids = [doc['doc_id'] for doc in results]

# train scorer from document IDs returned by keyword search
tm.train_scorer(doc_ids=doc_ids)

# get text and scores of remaining documents
other_texts = [d['text'] for d in tm.get_docs() if d['doc_id'] not in doc_ids]
other_scores = tm.score(other_texts)

# display results in a Pandas DataFrame
other_preds = [int(score > 0) for score in other_scores]
data = sorted(list(zip(other_preds, other_scores, other_texts)),
              key=lambda item: item[1], reverse=True)
print('Top Inliers (or Most Similar to Our Religion-Related Seed Documents)')
print('\t\tNumber of Predicted Inliers: %s' % sum(other_preds))
df = pd.DataFrame(data, columns=['Prediction', 'Score', 'Text'])
df.head(3)

# Here, we've easily found other religion-focused documents that **do not** explicitly mention Christ. Note that our One-Class classifier predicted a positive label for 4759 documents here. The reason for this large number is that some documents in the seed set containing the word 'Christ' may be only loosely related to religion or Christianity (if at all).
# 
# This document, for example, mentions 'Christ' somewhere in the post but is largely unrelated to Christianity or religion:

# In[37]:

print(texts[11311][:1024])

# When such documents are included in the seed set, documents **unrelated** to Christianity or religion are predicted with a positive label (i.e., false positives). However, higher-scoring documents **will** be related to Christianity, since documents in this dataset containing "Christ" are more likely to be about Christianity and religion. Thus, as a document similarity scorer, constructing a positive class by pulling in documents based on keywords can still work.

# ## Recommending Similar Documents
# 
# In the previous section, given a set of seed documents, we scored **new** documents based on similarity. Here, we will reverse this process. Given a **new** document, we will find (or recommend) documents from the 20-newsgroup corpus that are semantically similar to it.
# 
# We must first train the recommender. The `train_recommender` method trains a [Nearest Neighbors model](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html#sklearn.neighbors.NearestNeighbors) that can be used to perform **semantic searches** and generate **document recommendations** on your dataset.

# In[38]:

tm.train_recommender()

# Now, let's write some text about space exploration and recommend the newsgroup post most similar to this text.

# In[39]:

rawtext = """
Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees
the development and manufacturing of advanced rockets and spacecraft for missions
to and beyond Earth orbit.
"""

# Here is the top recommended 20-newsgroup post based on semantic similarity to the text above. This can be considered a **semantic text search**, as the `recommend` method returns documents that are semantically related to the supplied text.

# In[40]:

for i, doc in enumerate(tm.recommend(text=rawtext, n=1)):
    print('RESULT #%s' % (i+1))
    print('TEXT:\n\t%s' % (doc['text']))
    print('NEWSGROUP:\n\t%s' % (categories[doc['doc_id']]))
    print('TOPIC:\n\t%s' % (tm.topics[doc['topic_id']]))
    print()

# ## Saving and Restoring the Topic Model
# 
# The topic model can be saved and restored as follows.
# 
# **Save the Topic Model:**

# In[41]:

tm.save('/tmp/tm')

# **Restore the Topic Model and Rebuild the Document-Topic Matrix:**

# In[42]:

tm = ktrain.text.load_topic_model('/tmp/tm')

# In[43]:

tm.build(texts, threshold=0.25)

# Note that only the LDA topic model itself is saved; the scorer and recommender are not. So, the scorer and recommender should be retrained prior to use, as follows:

# In[44]:

tm.train_recommender()

# In[45]:

rawtext = """
Elon Musk leads Space Exploration Technologies (SpaceX), where he oversees
the development and manufacturing of advanced rockets and spacecraft for missions
to and beyond Earth orbit.
"""

# In[46]:

print(tm.recommend(text=rawtext, n=1)[0]['text'])
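# Similarly, if you plan to use the scorer after restoring the model, retrain it first. A minimal sketch, reusing the technology-related topic IDs from earlier (the example text below is hypothetical):

# In[ ]:

# retrain the one-class scorer on the technology-related topics
tm.train_scorer(topic_ids=[51, 85, 94, 22])

# positive scores indicate similarity to the technology-related seed documents
tm.score(['I just upgraded my PC with a new graphics card and more RAM.'])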