# !pip install pyLDAvis
# !pip install rake-nltk

path = '/content'

from nltk.util import ngrams
import os
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import groupby
from bs4 import BeautifulSoup
from collections import OrderedDict
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda
import pickle
import pyLDAvis
from rake_nltk import Rake
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import warnings

warnings.filterwarnings("ignore")

nltk.download('stopwords')
stopwords = list(set(stopwords.words('english')))
ps = PorterStemmer()
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

files = os.listdir(path); files

Subset_Database = pd.read_excel(os.path.join(path, 'Subset_Database.xlsx'))
Subset_Hardware = pd.read_excel(os.path.join(path, 'Subset_Hardware.xlsx'))
Subset_Inquiry = pd.read_excel(os.path.join(path, 'Subset_Inquiry.xlsx'))
Subset_Network = pd.read_excel(os.path.join(path, 'Subset_Network.xlsx'))
Subset_Software = pd.read_excel(os.path.join(path, 'Subset_Software.xlsx'))


def clean_l1(text, extended=False):
    """Lowercase, strip non-alphabetic characters, and lemmatize; optionally remove stopwords."""
    text = ' ' + text + ' '
    text = text.lower()
    text = re.sub(r'[^a-z ]', ' ', text)
    text = ' '.join(text.split())
    text = ' '.join([lemmatizer.lemmatize(w, 'v') for w in text.split()])
    text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    if extended:
        text = ' '.join([w for w in text.split() if w not in stopwords])
    return text


def plot_10_most_common_words(count_data, count_vectorizer):
    """Bar plot of the 10 most frequent terms in the document-term matrix."""
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts += t.toarray()[0]

    count_dict = zip(words, total_counts)
    count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words))

    plt.figure(2, figsize=(5, 5 / 1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()


def print_topics(model, count_vectorizer, n_top_words):
    """Print the n_top_words highest-weighted terms for each LDA topic."""
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


def topic_model(df, outfile='output'):
    """Clean the 'Short description' column, fit an LDA topic model over RAKE bigrams,
    and save a wordcloud (<outfile>.png) and a pyLDAvis visualization (<outfile>.html)."""
    # select the short-description column
    df = df[['Short description']]

    # drop nulls, clean the text, then drop duplicates created by normalization
    df = df.dropna()
    df['Short description'] = df['Short description'].apply(clean_l1)
    df = df.drop_duplicates()

    # drop records with one word or fewer
    df = df[df['Short description'].str.split().str.len().gt(1)]

    # compile the list of documents
    docs = df['Short description'].tolist()

    # keyword extraction: keep the 100 top-ranked RAKE bigrams as the candidate vocabulary
    min_len = 2
    max_len = 2
    r = Rake(min_length=min_len, max_length=max_len)
    r.extract_keywords_from_sentences(docs)
    keywords = r.get_ranked_phrases()[:100]

    def tokenize(text):
        """Return the unigrams and n-grams of `text` that appear in the RAKE keyword list."""
        words_in_order = text.split()
        tokens = list(set(words_in_order))
        for i in range(2, max_len + 1):
            # build n-grams from the original word order, not from the deduplicated set
            ngram = list(ngrams(words_in_order, i))
            ngram = [' '.join(list(x)) for x in ngram]
            tokens.extend(ngram)
        tokens = list(set(tokens) & set(keywords))
        return tokens
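    # Optional sanity check: preview a few of the RAKE-ranked bigrams that tokenize()
    # will intersect each document against, before the vectorizer builds its vocabulary.
    # print(keywords[:10])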
    # top-k most common words
    count_vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize)
    count_data = count_vectorizer.fit_transform(docs)
    # plot_10_most_common_words(count_data, count_vectorizer)

    # generate wordcloud from term frequencies
    sum_words = count_data.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    words_dict = dict(words_freq)
    wordcloud = WordCloud(background_color="white", max_words=5000,
                          contour_width=3, contour_color='steelblue')
    wordcloud.generate_from_frequencies(words_dict)
    wordcloud.to_file(outfile + '.png')

    # generate topic model
    number_topics = 5   # PARAM
    number_words = 10  # PARAM
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)
    # print("Topics found via LDA:")
    # print_topics(lda, count_vectorizer, number_words)

    # LDA visualization
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    pyLDAvis.save_html(LDAvis_prepared, outfile + '.html')


topic_model(Subset_Database, 'Subset_Database')
topic_model(Subset_Hardware, 'Subset_Hardware')
topic_model(Subset_Inquiry, 'Subset_Inquiry')
topic_model(Subset_Network, 'Subset_Network')
topic_model(Subset_Software, 'Subset_Software')
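# The same pipeline can be applied to any further category export by loading its
# spreadsheet and calling topic_model with a matching output prefix; the file name
# below is hypothetical:
# Subset_Access = pd.read_excel(os.path.join(path, 'Subset_Access.xlsx'))
# topic_model(Subset_Access, 'Subset_Access')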