#!/usr/bin/env python
# coding: utf-8

# ## Open Machine Learning Course mlcourse.ai. English session #1
#
# ### Author: Valentin Kovalev
#
# ## Tutorial
#
# ### Latent Dirichlet Allocation
# In this tutorial I'll use Latent Dirichlet Allocation to automatically extract the topics that characterize texts.
# Careful tuning of LDA (which is something of an art) can give a really good result on the leaderboard in Kaggle contests with text features.
# #### Import libraries

# In[ ]:

from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import common_texts

# ### Extracting topics with LDA
#
# LDA represents each document as a mixture of topics, and each topic as a distribution that emits words with certain probabilities.
#
# For each possible topic Z, we multiply the frequency of the word type W in Z by the number of other words in document D that already belong to Z. The result represents the probability that this word came from Z.
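# To make the sampling step above concrete, here is a minimal, illustrative sketch of the unnormalized
# probability it describes. This is an assumption-laden example only: gensim's LdaModel actually uses
# online variational Bayes, and the count structures (`word_topic_counts`, `doc_topic_counts`) and the
# smoothing values `alpha` and `beta` are names made up for this illustration.

# In[ ]:


def topic_weight(word, doc_id, topic, word_topic_counts, doc_topic_counts, alpha=0.1, beta=0.01):
    """Unnormalized weight of `topic` for one occurrence of `word` in document `doc_id`.

    Multiplies "how often this word type already appears in the topic" (smoothed by beta)
    by "how many other words of this document already belong to the topic" (smoothed by alpha).
    """
    word_in_topic = word_topic_counts[topic].get(word, 0) + beta
    topic_in_doc = doc_topic_counts[doc_id].get(topic, 0) + alpha
    return word_in_topic * topic_in_doc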
# ### Train an LDA model using a Gensim corpus

# #### Create a corpus from a list of texts

# In[ ]:

common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# #### Train the model on the corpus

# In[ ]:

lda = LdaModel(common_corpus, num_topics=10)

# ### We can save a model to disk, or reload a pre-trained model
# The following cells are commented out so that running the notebook does not create files on disk.

# In[ ]:

from gensim.test.utils import datapath

# #### Save the model to disk

# In[ ]:

# temp_file = datapath("model")
# lda.save(temp_file)

# #### Load a potentially pretrained model from disk

# In[ ]:

# lda = LdaModel.load(temp_file)

# ### Check the model on new, unseen documents

# #### Create a new corpus, made of previously unseen documents

# In[ ]:

other_texts = [
    ["computer", "time", "graph"],
    ["survey", "response", "eps"],
    ["human", "system", "computer"],
]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[0]
vector = lda[unseen_doc]  # get the topic probability distribution for a document

# ### Update the model by incrementally training on the new corpus

# In[ ]:

lda.update(other_corpus)
vector = lda[unseen_doc]

# #### About hyperparameters
#
# Alpha and Beta – alpha controls document-topic density and beta controls topic-word density. The higher the alpha, the more topics a document is composed of; the lower the alpha, the fewer topics per document. Likewise, a higher beta means topics are composed of a larger number of words from the corpus, while a lower beta means they are composed of few words.
#
# Number of topics – the number of topics to be extracted from the corpus. Researchers have developed approaches to choose an optimal number of topics using the Kullback-Leibler divergence score. I will not discuss this in detail, as it is quite mathematical; for understanding, one can refer to the original paper on the use of KL divergence [1].
#
# Number of topic terms – the number of terms shown for a single topic. It is generally decided by the requirements: if the problem statement is about extracting themes or concepts, a higher number is recommended; if it is about extracting features or terms, a low number is recommended.
#
# Number of iterations / passes – the maximum number of iterations the LDA algorithm is allowed for convergence.
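# As a hedged sketch of how these hyperparameters map onto gensim's `LdaModel` arguments: `alpha` is the
# document-topic prior, `eta` plays the role of beta (the topic-word prior), and `passes` / `iterations`
# control how long training runs. The specific values below are illustrative assumptions, not recommendations.

# In[ ]:

lda_tuned = LdaModel(
    common_corpus,
    id2word=common_dictionary,
    num_topics=10,  # number of topics to extract
    alpha="auto",  # document-topic density, learned from the data here
    eta="auto",  # topic-word density, i.e. "beta" in the description above
    passes=10,  # full sweeps over the corpus
    iterations=100,  # max inference iterations per document chunk
)

# The "number of topic terms" is chosen when inspecting topics rather than at training time:
lda_tuned.print_topics(num_topics=10, num_words=5)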
# ### Ok, let's check on real data

# In[ ]:

doc1 = "Sugar is bad to consume. My sister likes to have sugar, but not my father."
doc2 = "My father spends a lot of time driving my sister around to dance practice."
doc3 = "Doctors suggest that driving may cause increased stress and blood pressure."
doc4 = "Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better."
doc5 = "Health experts say that Sugar is not good for your lifestyle."

# compile documents
doc_complete = [doc1, doc2, doc3, doc4, doc5]

# #### Cleaning and preprocessing

# In[ ]:

import string

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Uncomment these two lines on the first run if the NLTK data is not yet available:
# nltk.download("stopwords")
# nltk.download("wordnet")

stop = set(stopwords.words("english"))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(doc):
    """Lowercase a document, drop stop words and punctuation, and lemmatize the remaining words."""
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized


doc_clean = [clean(doc).split() for doc in doc_complete]

# #### Preparing the Document-Term Matrix

# In[ ]:

# Importing Gensim
import gensim
from gensim import corpora

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# #### Running the LDA Model

# In[ ]:

# Creating the object for the LDA model using the gensim library
Lda = gensim.models.ldamodel.LdaModel

# Running and training the LDA model on the document-term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)

# #### Results

# In[ ]:

print(ldamodel.print_topics(num_topics=3, num_words=3))

# ### Topic Modelling for Feature Selection
# Sometimes LDA can also be used as a feature selection technique. Take, for example, a text classification problem where the training data contain documents grouped by category. Run LDA on each category's set of documents separately, then remove the topic terms that are common across the results for different categories; the remaining terms give the best features for a category.

# ### Bonus: pyLDAvis
# pyLDAvis is designed to help users interpret the topics in a topic model that has been fit to a corpus of text data. The package extracts information from a fitted LDA topic model to build an interactive web-based visualization.
#
# The visualization is intended to be used within an IPython notebook, but it can also be saved to a stand-alone HTML file for easy sharing.
# By default the topics are projected onto the 2D plane using PCoA on a distance matrix created with the Jensen-Shannon divergence between the topic-term distributions. You can pass a different multidimensional scaling function via the mds parameter. In addition to pcoa, the other provided options are tsne and mmds, which operate on the same JS-divergence distance matrix; both require scikit-learn. Here is pyLDAvis in action (using the default PCoA projection):

# In[ ]:

import pyLDAvis.gensim

vis = pyLDAvis.gensim.prepare(ldamodel, corpus=doc_term_matrix, dictionary=dictionary)
pyLDAvis.display(vis)

# ### References:
# - LDA official site.
# - LDA with Python.
# - LDA on Python guide.
# - Tests on gensim models.
# - pyLDAvis library.