# !pip install pyLDAvis
# !pip install rake-nltk

path = '/content'

from nltk.util import ngrams
import os
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import groupby
from bs4 import BeautifulSoup
from collections import OrderedDict
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda
import pickle
import pyLDAvis
from rake_nltk import Rake
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import warnings

warnings.filterwarnings("ignore")

nltk.download('stopwords')
stopwords = list(set(stopwords.words('english')))
ps = PorterStemmer()
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

files = os.listdir(path); files

Subset_Database = pd.read_excel(os.path.join(path, 'Subset_Database.xlsx'))
Subset_Hardware = pd.read_excel(os.path.join(path, 'Subset_Hardware.xlsx'))
Subset_Inquiry = pd.read_excel(os.path.join(path, 'Subset_Inquiry.xlsx'))
Subset_Network = pd.read_excel(os.path.join(path, 'Subset_Network.xlsx'))
Subset_Software = pd.read_excel(os.path.join(path, 'Subset_Software.xlsx'))


def clean_l1(text, extended=False):
    """Lowercase, strip non-alphabetic characters, and lemmatize; optionally remove stopwords."""
    text = ' ' + text + ' '
    text = text.lower()
    text = re.sub(r'[^a-z ]', ' ', text)
    text = ' '.join(text.split())
    text = ' '.join([lemmatizer.lemmatize(w, 'v') for w in text.split()])
    text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    if extended:
        text = ' '.join([w for w in text.split() if w not in stopwords])
    return text


def plot_10_most_common_words(count_data, count_vectorizer):
    """Bar plot of the 10 most frequent terms in the document-term matrix."""
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts += t.toarray()[0]

    count_dict = zip(words, total_counts)
    count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(5, 5 / 1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()


def print_topics(model, count_vectorizer, n_top_words):
    """Print the n_top_words highest-weighted terms for each LDA topic."""
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


def topic_model(df, outfile='output'):
    """Clean the 'Short description' column, fit an LDA topic model over RAKE bigrams,
    and save a wordcloud (<outfile>.png) and a pyLDAvis visualization (<outfile>.html)."""
    # select the short-description column
    df = df[['Short description']]

    # drop nulls, clean the text, then drop duplicates created by normalization
    df = df.dropna()
    df['Short description'] = df['Short description'].apply(clean_l1)
    df = df.drop_duplicates()

    # drop records with one word or fewer
    df = df[df['Short description'].str.split().str.len().gt(1)]

    # compile the list of documents
    docs = df['Short description'].tolist()

    # keyword extraction: keep the 100 top-ranked RAKE bigrams as the candidate vocabulary
    min_len = 2
    max_len = 2
    r = Rake(min_length=min_len, max_length=max_len)
    r.extract_keywords_from_sentences(docs)
    keywords = r.get_ranked_phrases()[:100]

    def tokenize(text):
        """Return the unigrams and n-grams of `text` that appear in the RAKE keyword list."""
        words_in_order = text.split()
        tokens = list(set(words_in_order))
        for i in range(2, max_len + 1):
            # build n-grams from the original word order, not from the deduplicated set
            ngram = list(ngrams(words_in_order, i))
            ngram = [' '.join(list(x)) for x in ngram]
            tokens.extend(ngram)
        tokens = list(set(tokens) & set(keywords))
        return tokens
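    # Optional sanity check: preview a few of the RAKE-ranked bigrams that tokenize()
    # will intersect each document against, before the vectorizer builds its vocabulary.
    # print(keywords[:10])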
    # top-k most common words
    count_vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize)
    count_data = count_vectorizer.fit_transform(docs)
    # plot_10_most_common_words(count_data, count_vectorizer)

    # generate wordcloud from term frequencies
    sum_words = count_data.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    words_dict = dict(words_freq)
    wordcloud = WordCloud(background_color="white", max_words=5000,
                          contour_width=3, contour_color='steelblue')
    wordcloud.generate_from_frequencies(words_dict)
    wordcloud.to_file(outfile + '.png')

    # generate topic model
    number_topics = 5   # PARAM
    number_words = 10  # PARAM
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)
    # print("Topics found via LDA:")
    # print_topics(lda, count_vectorizer, number_words)

    # LDA visualization
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    pyLDAvis.save_html(LDAvis_prepared, outfile + '.html')


topic_model(Subset_Database, 'Subset_Database')
topic_model(Subset_Hardware, 'Subset_Hardware')
topic_model(Subset_Inquiry, 'Subset_Inquiry')
topic_model(Subset_Network, 'Subset_Network')
topic_model(Subset_Software, 'Subset_Software')
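# The same pipeline can be applied to any further category export by loading its
# spreadsheet and calling topic_model with a matching output prefix; the file name
# below is hypothetical:
# Subset_Access = pd.read_excel(os.path.join(path, 'Subset_Access.xlsx'))
# topic_model(Subset_Access, 'Subset_Access')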