#!/usr/bin/env python
# coding: utf-8

# # Simple experiment using BigARTM Python API.

# Author - **Murat Apishev** (great-mel@yandex.ru)

# BigARTM is an open library for topic modeling of text collections. It's based on the ARTM theory. The project's main site is http://bigartm.org/.
#
# An example of BigARTM Python API usage is shown below. We'll carry out a model experiment.
#
# #### Description:
# Let's learn two topic models of a text collection, ARTM and PLSA, and compare them.
#
# #### Quality measures:
# One of the important measures is the perplexity. Nevertheless, it's not the only way to check the quality of the model learning. The following quality measures are implemented in the library:
#
# - $\Phi$ (token-topic) matrix sparsity;
# - $\Theta$ (topic-doc) matrix sparsity;
# - topic kernel characteristics (purity, contrast, size);
# - perplexity;
# - topic mass;
# - the ratio of background tokens;
# - coherency of top-tokens and tokens from topic kernels.
#
# We'll use the first four. Higher values of the sparsities and of the average purity and contrast mean a more interpretable model.
#
# #### Goals of the experiment:
# We'll try to learn the ARTM model so that it obtains better values of the sparsities and kernel characteristics than PLSA, without a significant worsening of the perplexity.
#
# The main tool to control the learning process is regularization. Here's the list of currently implemented regularizers:
#
# - $\Phi$ matrix sparsing (+ semi-supervised learning);
# - $\Theta$ matrix sparsing (+ semi-supervised learning);
# - $\Phi$ matrix topics decorrelation;
# - forced $\Phi$ matrix sparsing;
# - classes balancing (Label Regularization);
# - coherency improving.
#
# We'll use the first three regularizers in the ARTM model in this experiment. ARTM without regularization corresponds to the PLSA model.
#
# #### Text collection:
# Let's use the small 'kos' collection from the UCI repository https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/. The collection has the following parameters:
#
# - 3430 documents;
# - 6906 tokens in the dictionary;
# - 467714 non-zero values in the Bag-of-Words matrix.

# First, let's import all necessary modules (make sure the BigARTM Python API is on your PYTHONPATH):

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')

import glob
import os
import matplotlib.pyplot as plt

import artm

print artm.version()

# First of all you need to prepare the input data. BigARTM uses its own format for processing documents, called batches. BigARTM has tools for creating them from Bag-of-Words UCI and Vowpal Wabbit formats (see more at http://docs.bigartm.org/en/latest/formats.html).
#
# Similarly to scikit-learn algorithms, the library's Python API represents input data with a single class called BatchVectorizer. This object takes batches or UCI / VW files as input and is used as an input parameter in all methods. If the given data is not batches, the object will create them and store them to disk.
#
# So let's create a BatchVectorizer object:

# In[2]:

batch_vectorizer = None
if len(glob.glob(os.path.join('kos', '*.batch'))) < 1:
    batch_vectorizer = artm.BatchVectorizer(data_path='', data_format='bow_uci',
                                            collection_name='kos', target_folder='kos')
else:
    batch_vectorizer = artm.BatchVectorizer(data_path='kos', data_format='batches')
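# A quick optional sanity check that the batches now exist on disk, plus a
# commented-out sketch of the same step for a collection in Vowpal Wabbit
# format (the file name 'kos_vw.txt' is hypothetical and only illustrates the
# data_format value):

print batch_vectorizer.data_path
print len(glob.glob(os.path.join('kos', '*.batch')))

# batch_vectorizer_vw = artm.BatchVectorizer(data_path='kos_vw.txt',
#                                            data_format='vowpal_wabbit',
#                                            target_folder='kos_vw_batches')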
# ARTM is the class that represents the BigARTM Python API. It allows using almost all of the library's abilities in scikit-learn style. Let's create two topic models for our experiments. The most important parameter of a model is the number of topics. Optionally, the user can define the list of regularizers and quality measures (scores) to be used in this model. This step can also be done later. Note that each model defines its own namespace for the names of regularizers and scores.

# In[3]:

dictionary = artm.Dictionary()

model_plsa = artm.ARTM(topic_names=['topic_{}'.format(i) for i in xrange(15)],
                       scores=[artm.PerplexityScore(name='PerplexityScore',
                                                    use_unigram_document_model=False,
                                                    dictionary=dictionary)],
                       cache_theta=True)

model_artm = artm.ARTM(topic_names=['topic_{}'.format(i) for i in xrange(15)],
                       scores=[artm.PerplexityScore(name='PerplexityScore',
                                                    use_unigram_document_model=False,
                                                    dictionary=dictionary)],
                       regularizers=[artm.SmoothSparseThetaRegularizer(name='SparseTheta',
                                                                       tau=-0.15)],
                       cache_theta=True)

# The next step is to initialize the models. It can be done using the dictionary. This means that:
# - the $\Phi$ matrix with the name 'pwt' will be created with the numbers of rows and columns corresponding to the numbers of tokens in the dictionary and topics in the model;
# - this matrix will be filled with random values from (0, 1) and normalized.
#
# Note that this step is optional: the model will be auto-initialized during the calls of fit_offline() / fit_online().
#
# Dictionary is a BigARTM object containing information about the collection (the vocabulary and different counters and values linked to tokens).

# In[4]:

if not os.path.isfile('kos/dictionary.dict'):
    dictionary.gather(data_path=batch_vectorizer.data_path)
    dictionary.save(dictionary_path='kos/dictionary.dict')

dictionary.load(dictionary_path='kos/dictionary.dict')

# Then the dictionary can be used to initialize the topic models:

# In[5]:

model_plsa.initialize(dictionary=dictionary)
model_artm.initialize(dictionary=dictionary)

# As was said earlier, ARTM provides the ability to use all the scores of BigARTM. Once a score has been included into a model, the model will save all its values obtained at the time of each $\Phi$ matrix update. Let's add the scores we need for our experiment (only the ones missing from the constructors):

# In[6]:

model_plsa.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model_plsa.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
model_plsa.scores.add(artm.TopicKernelScore(name='TopicKernelScore',
                                            probability_mass_threshold=0.3))

model_artm.scores.add(artm.SparsityPhiScore(name='SparsityPhiScore'))
model_artm.scores.add(artm.SparsityThetaScore(name='SparsityThetaScore'))
model_artm.scores.add(artm.TopicKernelScore(name='TopicKernelScore',
                                            probability_mass_threshold=0.3))

# Now we'll do the same thing with the regularizers for model_artm (let's set their starting regularization coefficients; these values can be changed later):

# In[7]:

model_artm.regularizers.add(artm.SmoothSparsePhiRegularizer(name='SparsePhi', tau=-0.1))
model_artm.regularizers.add(artm.DecorrelatorPhiRegularizer(name='DecorrelatorPhi', tau=1.5e+5))
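# A small aside: every regularizer added to the model can be read back and
# re-tuned by name at any moment; this is exactly how the coefficients will be
# increased later in this experiment.

print model_artm.regularizers['SparseTheta'].tau
print model_artm.regularizers['SparsePhi'].tau
print model_artm.regularizers['DecorrelatorPhi'].tau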
# Now we'll learn the models in offline mode (i.e. with one $\Phi$ matrix update per pass through the whole collection). Let's start with 15 collection passes:

# In[8]:

model_plsa.num_document_passes = 1
model_artm.num_document_passes = 1

model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15)
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=15)

# Let's check the results of the first part of the learning process by comparing the score values of both models:

# In[9]:

def print_measures(model_plsa, model_artm):
    print 'Sparsity Phi: {0:.3f} (PLSA) vs. {1:.3f} (ARTM)'.format(
        model_plsa.score_tracker['SparsityPhiScore'].last_value,
        model_artm.score_tracker['SparsityPhiScore'].last_value)

    print 'Sparsity Theta: {0:.3f} (PLSA) vs. {1:.3f} (ARTM)'.format(
        model_plsa.score_tracker['SparsityThetaScore'].last_value,
        model_artm.score_tracker['SparsityThetaScore'].last_value)

    print 'Kernel contrast: {0:.3f} (PLSA) vs. {1:.3f} (ARTM)'.format(
        model_plsa.score_tracker['TopicKernelScore'].last_average_contrast,
        model_artm.score_tracker['TopicKernelScore'].last_average_contrast)

    print 'Kernel purity: {0:.3f} (PLSA) vs. {1:.3f} (ARTM)'.format(
        model_plsa.score_tracker['TopicKernelScore'].last_average_purity,
        model_artm.score_tracker['TopicKernelScore'].last_average_purity)

    print 'Perplexity: {0:.3f} (PLSA) vs. {1:.3f} (ARTM)'.format(
        model_plsa.score_tracker['PerplexityScore'].last_value,
        model_artm.score_tracker['PerplexityScore'].last_value)

    plt.plot(xrange(model_plsa.num_phi_updates),
             model_plsa.score_tracker['PerplexityScore'].value, 'b--',
             xrange(model_artm.num_phi_updates),
             model_artm.score_tracker['PerplexityScore'].value, 'r--', linewidth=2)
    plt.xlabel('Iterations count')
    plt.ylabel('PLSA perp. (blue), ARTM perp. (red)')
    plt.grid(True)
    plt.show()

print_measures(model_plsa, model_artm)

# We can see an improvement of the sparsities and kernel measures, and the worsening of the perplexity isn't big. Let's try to increase the absolute values of the regularization coefficients:

# In[10]:

model_artm.regularizers['SparsePhi'].tau = -0.2
model_artm.regularizers['SparseTheta'].tau = -0.2
model_artm.regularizers['DecorrelatorPhi'].tau = 2.5e+5

# Besides that, let's include into each model the TopTokensScore measure, which allows us to look at the most probable tokens in each topic:

# In[11]:

model_plsa.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=6))
model_artm.scores.add(artm.TopTokensScore(name='TopTokensScore', num_tokens=6))

# We'll continue the learning process with 25 more passes through the collection, and then look at the score values:

# In[12]:

model_plsa.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=25)
model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=25)

# In[13]:

print_measures(model_plsa, model_artm)

# Let's also plot the changes of the matrix sparsities over the iterations:

# In[14]:

plt.plot(xrange(model_plsa.num_phi_updates),
         model_plsa.score_tracker['SparsityPhiScore'].value, 'b--',
         xrange(model_artm.num_phi_updates),
         model_artm.score_tracker['SparsityPhiScore'].value, 'r--', linewidth=2)
plt.xlabel('Iterations count')
plt.ylabel('PLSA Phi sp. (blue), ARTM Phi sp. (red)')
plt.grid(True)
plt.show()

plt.plot(xrange(model_plsa.num_phi_updates),
         model_plsa.score_tracker['SparsityThetaScore'].value, 'b--',
         xrange(model_artm.num_phi_updates),
         model_artm.score_tracker['SparsityThetaScore'].value, 'r--', linewidth=2)
plt.xlabel('Iterations count')
plt.ylabel('PLSA Theta sp. (blue), ARTM Theta sp. (red)')
plt.grid(True)
plt.show()
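# For reference: the plots above rely on the fact that every score keeps its
# whole history, one value per Phi matrix update, available as a plain Python
# list, so any score can be post-processed in the same way.

print len(model_artm.score_tracker['PerplexityScore'].value)  # equals model_artm.num_phi_updates
print model_artm.score_tracker['PerplexityScore'].value[-1]   # the same value as .last_value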
# It seems that the achieved result is good enough. The regularization helped us to improve all scores with only a small worsening of the perplexity. Let's look at the top tokens:

# In[15]:

for topic_name in model_plsa.topic_names:
    print topic_name + ': ',
    print model_plsa.score_tracker['TopTokensScore'].last_tokens[topic_name]

# In[16]:

for topic_name in model_artm.topic_names:
    print topic_name + ': ',
    print model_artm.score_tracker['TopTokensScore'].last_tokens[topic_name]

# We can see that the topics are approximately equal in terms of interpretability, but they are more distinct from each other in ARTM.

# Let's extract the $\Phi$ matrix as a pandas.DataFrame and print it (to do this operation with more options, use ARTM.get_phi()):

# In[17]:

print model_artm.phi_

# Let's additionally extract the $\Theta$ matrix and print it:

# In[19]:

theta_matrix = model_artm.get_theta()
print theta_matrix

# The model can be used to find $\theta_d$ vectors for new documents via the ARTM.transform() method:

# In[20]:

test_batch_vectorizer = artm.BatchVectorizer(data_format='batches',
                                             data_path='kos_test',
                                             batches=['test_docs.batch'])
test_theta_matrix = model_artm.transform(batch_vectorizer=test_batch_vectorizer)
print test_theta_matrix

# #### Conclusion
#
# The topic modeling task has an infinite set of solutions, which gives us freedom of choice. Regularizers provide an opportunity to get a result that satisfies several criteria (such as sparsity and interpretability) at the same time.
#
# This example is a demonstration; one can choose more flexible regularization strategies to get a better result. Experiments with other, bigger collections can be carried out in the same way as described above.
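# As an illustration of the "more flexible strategies" mentioned in the
# conclusion, one simple option is to change the regularization coefficients
# between short runs. The schedule below is purely illustrative and is not a
# tuned recommendation.

for tau in [-0.2, -0.25, -0.3]:
    model_artm.regularizers['SparsePhi'].tau = tau
    model_artm.regularizers['SparseTheta'].tau = tau
    model_artm.fit_offline(batch_vectorizer=batch_vectorizer, num_collection_passes=5)

print model_artm.score_tracker['PerplexityScore'].last_value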