#!/usr/bin/env python
# coding: utf-8

# # Chapter 10: LDA for Topic Modeling

# ## Load Newsgroups data
#
# As before, let's consider a specific set of categories:

# In[1]:


from sklearn.datasets import fetch_20newsgroups

def load_dataset(sset, cats):
    if cats == []:
        newsgroups_dset = fetch_20newsgroups(subset=sset,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    else:
        newsgroups_dset = fetch_20newsgroups(subset=sset, categories=cats,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    return newsgroups_dset


# In[2]:


categories = ["comp.windows.x", "misc.forsale", "rec.autos",
              "rec.motorcycles", "rec.sport.baseball"]
categories += ["rec.sport.hockey", "sci.crypt", "sci.med",
               "sci.space", "talk.politics.mideast"]

newsgroups_all = load_dataset('all', categories)
print(len(newsgroups_all.data))


# ## Preprocess
#
# Convert word forms to stems to get concise representations for the documents:

# In[3]:


import nltk
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")

def stem(text):
    return stemmer.stem(text)


# In[4]:


import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS as stopwords
#print(stopwords)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=4):
        if token not in stopwords:  #and len(token) > 3:
            result.append(stem(token))
    return result


# Check how each document is represented. For example, look into the very first one:

# In[5]:


doc_sample = newsgroups_all.data[0]

print('Original document: ')
print(doc_sample)

print('\n\nTokenized document: ')
words = []
for token in gensim.utils.tokenize(doc_sample):
    words.append(token)
print(words)

print('\n\nPreprocessed document: ')
print(preprocess(doc_sample))


# What do the first 10 documents look like?

# In[6]:


for i in range(0, 10):
    print(str(i) + "\t" + ", ".join(preprocess(newsgroups_all.data[i])[:10]))


# Now let's represent each document as a dictionary of relevant words. Each word (*value* in the dictionary) has a unique identifier (*key*):

# In[7]:


processed_docs = []
for i in range(0, len(newsgroups_all.data)):
    processed_docs.append(preprocess(newsgroups_all.data[i]))

print(len(processed_docs))

dictionary = gensim.corpora.Dictionary(processed_docs)
print(len(dictionary))

index = 0
for key, value in dictionary.iteritems():
    print(key, value)
    index += 1
    if index > 9:
        break


# Put some constraints on the dictionary of terms: for instance, keep up to $100,000$ words that occur in at least $10$ documents (`no_below`) and in no more than $50\%$ of the documents (`no_above`). This should help you extract the most useful terms, while still keeping a reasonable number of them.
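# Optionally, before applying the filter, you can peek at the raw document frequencies to see what these thresholds will remove. The cell below is a minimal illustrative sketch (not part of the main pipeline); it relies on the dictionary's `dfs` (documents per token) and `num_docs` attributes:

# In[ ]:


# Optional sketch: inspect document frequencies before filtering.
# `dictionary.dfs` maps each token id to the number of documents containing that token.
doc_freqs = sorted(dictionary.dfs.items(), key=lambda x: x[1], reverse=True)
total_docs = dictionary.num_docs

print("Most widespread stems (candidates for removal by no_above=0.5):")
for token_id, doc_freq in doc_freqs[:10]:
    print(f"{dictionary[token_id]}\t{doc_freq} docs ({doc_freq / total_docs:.1%})")

rare = sum(1 for doc_freq in dictionary.dfs.values() if doc_freq < 10)
print(f"\nStems in fewer than 10 documents (removed by no_below=10): {rare}")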
# In[8]:


dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=100000)
print(len(dictionary))


# Let's see how a particular document is represented in this dictionary: for example, look into the very first post, or into the 100th:

# In[9]:


bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[0]
#bow_corpus[99]


# Let's decode what each index (key) in this dictionary points to:

# In[10]:


#bow_doc = bow_corpus[99]
bow_doc = bow_corpus[0]

for i in range(len(bow_doc)):
    print(f"Key {bow_doc[i][0]} =\"{dictionary[bow_doc[i][0]]}\": occurrences={bow_doc[i][1]}")


# ## Train an LDA model

# In[11]:


# Use the dictionary mapping ids to words
id2word = dictionary

# Use the bag-of-words corpus with word frequencies
corpus = bow_corpus

# Build the LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=1000,
                                            passes=10,
                                            alpha='symmetric',
                                            iterations=100,
                                            per_word_topics=True)

for index, topic in lda_model.print_topics(-1):
    print(f"Topic: {index} \nWords: {topic}")


# ## Interpret the results
#
# What is the most representative topic in each document?

# In[12]:


def analyse_topics(ldamodel, corpus, texts):
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    # Get the main topic in each document
    for i, topic_list in enumerate(ldamodel[corpus]):
        #print(topic_list)
        topic = topic_list[0] if ldamodel.per_word_topics else topic_list
        #print(topic)
        topic = sorted(topic, key=lambda x: (x[1]), reverse=True)
        # Get the main topic, contribution (%) and keywords for each document
        for j, (topic_num, prop_topic) in enumerate(topic):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp[:5]])
                main_topic[i] = int(topic_num)
                percentage[i] = round(prop_topic, 4)
                keywords[i] = topic_keywords
                text_snippets[i] = texts[i][:8]
            else:
                break
    return main_topic, percentage, keywords, text_snippets


main_topic, percentage, keywords, text_snippets = analyse_topics(
    lda_model, bow_corpus, processed_docs)

indexes = []
rows = []
for i in range(0, 10):
    indexes.append(i)

rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet'])
for idx in indexes:
    rows.append([str(idx), f"{main_topic.get(idx)}",
                 f"{percentage.get(idx):.4f}",
                 f"{keywords.get(idx)}\n",
                 f"{text_snippets.get(idx)}"])

columns = zip(*rows)
column_widths = [max(len(item) for item in col) for col in columns]

for row in rows:
    print(''.join(' {:{width}} '.format(row[i], width=column_widths[i])
                  for i in range(0, len(row))))


# ## Explore words and topics with pyLDAvis

# In[13]:


import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis


# Note: for newer versions of `gensim`, use the following code:

# In[ ]:


#import pyLDAvis.gensim_models
#pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary=lda_model.id2word)
#vis
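# As an additional, optional check (a minimal sketch rather than part of the original walkthrough), you can score the trained model with `gensim`'s `CoherenceModel` and use it to infer the topic mixture of a previously unseen post. The example text below is made up purely for illustration:

# In[ ]:


from gensim.models import CoherenceModel

# Topic coherence ("c_v") computed over the preprocessed documents; higher is better.
coherence_model = CoherenceModel(model=lda_model, texts=processed_docs,
                                 dictionary=dictionary, coherence='c_v')
print(f"Coherence score: {coherence_model.get_coherence():.4f}")

# Infer the topic distribution for an unseen document (hypothetical example text).
new_post = "The pitcher threw a fastball and the batter hit a home run in the ninth inning."
new_bow = dictionary.doc2bow(preprocess(new_post))
for topic_num, prob in sorted(lda_model.get_document_topics(new_bow),
                              key=lambda x: x[1], reverse=True):
    print(f"Topic {topic_num}: {prob:.4f}")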