#!/usr/bin/env python
# coding: utf-8

# Notebook export: builds an LDA topic model over the 20 Newsgroups corpus
# with ktrain, trains a document recommender on top of it, and shows how to
# save and restore the topic model.
#
# NOTE: the get_ipython() calls below only exist inside an IPython/Jupyter
# session; run this file with `ipython`, not plain `python`.

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
import numpy as np
import pandas as pd
# None means "do not truncate column contents". The former sentinel -1 is
# deprecated and rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)


# In[2]:


import ktrain


# ## STEP 1: Get Raw Document Data

# In[3]:


# 20newsgroups
from sklearn.datasets import fetch_20newsgroups

# Strip headers, footers and quoted replies from every post.
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test', remove=remove)
# The train and test splits are concatenated into a single corpus.
texts = newsgroups_train.data + newsgroups_test.data


# ## STEP 2: Represent Documents as Semantically Meaningful Vectors With LDA

# In[4]:


# Executed through the %%time cell magic so the wall time is reported;
# `tm` is created by the cell source string below.
get_ipython().run_cell_magic('time', '', 'tm = ktrain.text.get_topic_model(texts, n_features=10000)\n')


# In[5]:


get_ipython().run_cell_magic('time', '', 'tm.build(texts, threshold=0.25)\n')


# ## STEP 3: Train a Document Recommender

# In[6]:


tm.train_recommender()


# ## STEP 4: Generate Recommendations
#
# Given some text, recommend documents that are semantically relevant to it.

# In[8]:


rawtext = (
    " Elon Musk leads Space Exploration Technologies (SpaceX), where he "
    "oversees the development and manufacturing of advanced rockets and "
    "spacecraft for missions to and beyond Earth orbit. "
)


# In[9]:


for rank, doc in enumerate(tm.recommend(text=rawtext, n=5), start=1):
    # Show at most the first 500 whitespace-separated tokens of each result.
    preview = " ".join(doc['text'].split()[:500])
    print('RESULT #%s' % rank)
    print('TEXT:\n\t%s' % preview)
    print()


# ### Saving and Restoring the Topic Model
#
# The topic model can be saved and restored as follows.
#
# **Save the Topic Model:**

# In[10]:


tm.save('/tmp/tm')


# **Restore the Topic Model and Rebuild the Document-Topic Matrix**

# In[11]:


tm = ktrain.text.load_topic_model('/tmp/tm')


# In[12]:


tm.build(texts, threshold=0.25)


# Note that the scorer and recommender are not saved, only the LDA topic
# model is saved. So, the scorer and recommender should be retrained prior
# to use as follows:

# In[13]:


tm.train_recommender()


# In[14]:


rawtext = (
    " Elon Musk leads Space Exploration Technologies (SpaceX), where he "
    "oversees the development and manufacturing of advanced rockets and "
    "spacecraft for missions to and beyond Earth orbit. "
)


# In[15]:


# Print the text of the single top-ranked recommendation.
print(tm.recommend(text=rawtext, n=1)[0]['text'])


# In[ ]: