# !pip install pyLDAvis
# !pip install rake-nltk
path = '/content'
from nltk.util import ngrams
import os
import pandas as pd
import re
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import groupby
from bs4 import BeautifulSoup
from collections import OrderedDict
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from pyLDAvis import sklearn as sklearn_lda  # note: renamed to pyLDAvis.lda_model in pyLDAvis >= 3.4
import pickle
import pyLDAvis
from rake_nltk import Rake
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")
nltk.download('stopwords')
stopwords = list(set(stopwords.words('english')))
ps = PorterStemmer()
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
files = os.listdir(path); files
# ['incident_dump.xlsx', 'incident_Cleaned.xlsx', 'serviceNow_Model.h5', 'Subset_Database.xlsx', 'Subset_Hardware.xlsx', 'Subset_Inquiry.xlsx', 'Subset_Network.xlsx', 'Subset_Software.xlsx']
Subset_Database = pd.read_excel(os.path.join(path,'Subset_Database.xlsx'))
Subset_Hardware = pd.read_excel(os.path.join(path,'Subset_Hardware.xlsx'))
Subset_Inquiry = pd.read_excel(os.path.join(path,'Subset_Inquiry.xlsx'))
Subset_Network = pd.read_excel(os.path.join(path,'Subset_Network.xlsx'))
Subset_Software = pd.read_excel(os.path.join(path,'Subset_Software.xlsx'))
def clean_l1(text, extended=False):
    # lowercase and keep letters only
    text = ' ' + text + ' '
    text = text.lower()
    text = re.sub(r'[^a-z ]', ' ', text)
    text = ' '.join(text.split())
    # lemmatize verbs first, then nouns
    text = ' '.join([lemmatizer.lemmatize(w, 'v') for w in text.split()])
    text = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    if extended:
        # optionally drop English stopwords
        text = ' '.join([w for w in text.split() if w not in stopwords])
    return text
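# Quick sanity check of clean_l1 on a made-up ticket description (illustrative
# input, not from the incident dump):
#   clean_l1('Server-42 was restarted by the admins!')
#     -> roughly 'server be restart by the admin'
#   clean_l1('Server-42 was restarted by the admins!', extended=True)
#     -> roughly 'server restart admin'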
def plot_10_most_common_words(count_data, count_vectorizer):
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts += t.toarray()[0]
    # pair each word with its corpus-wide count and keep the top 10
    count_dict = zip(words, total_counts)
    count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words))
    plt.figure(2, figsize=(5, 5 / 1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x=x_pos, y=counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90)
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        # argsort is ascending, so take the last n_top_words indices in reverse
        print(" ".join([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
def topic_model(df, outfile='output'):
    # select the short-description column and drop nulls before cleaning
    # (clean_l1 expects a string, so missing values must go first)
    df = df[['Short description']].dropna()
    # clean
    df['Short description'] = df['Short description'].apply(clean_l1)
    # drop duplicates left after cleaning
    df = df.drop_duplicates()
    # drop records with one word or fewer
    df = df[df['Short description'].str.split().str.len().gt(1)]
    # compile the list of documents
    docs = df['Short description'].tolist()
    # keyword extraction: keep the 100 top-ranked RAKE phrases of exactly two words
    min_len = 2; max_len = 2
    r = Rake(min_length=min_len, max_length=max_len)
    r.extract_keywords_from_sentences(docs)
    keywords = r.get_ranked_phrases()[:100]
    def tokenize(text):
        tokens = text.split()
        # build n-grams from the original word order, then de-duplicate;
        # forming n-grams after a set() would scramble word adjacency
        for i in range(2, max_len + 1):
            ngram = [' '.join(g) for g in ngrams(tokens, i)]
            tokens.extend(ngram)
        # keep only the tokens that are also RAKE keywords
        tokens = list(set(tokens) & set(keywords))
        return tokens
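    # Illustrative example (made-up input): if keywords contained 'password reset'
    # and 'account lock', then tokenize('password reset request for account')
    # would return just ['password reset'].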
    # vectorize with the keyword-restricted tokenizer (top-10 plot optional)
    count_vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize)
    count_data = count_vectorizer.fit_transform(docs)
    # plot_10_most_common_words(count_data, count_vectorizer)
    # generate the word cloud from term frequencies
    sum_words = count_data.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    words_dict = dict(words_freq)
    wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3, contour_color='steelblue')
    wordcloud.generate_from_frequencies(words_dict)
    wordcloud.to_file(outfile + '.png')
    # fit the topic model
    number_topics = 5   # PARAM
    number_words = 10   # PARAM
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(count_data)
    # print("Topics found via LDA:")
    # print_topics(lda, count_vectorizer, number_words)
    # LDA visualization
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    pyLDAvis.save_html(LDAvis_prepared, outfile + '.html')
topic_model(Subset_Database, 'Subset_Database')
topic_model(Subset_Hardware, 'Subset_Hardware')
topic_model(Subset_Inquiry, 'Subset_Inquiry')
topic_model(Subset_Network, 'Subset_Network')
topic_model(Subset_Software, 'Subset_Software')
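# Each call above writes two artifacts per category: <outfile>.png (the word
# cloud) and <outfile>.html (the interactive pyLDAvis view). A sketch for
# inspecting them inline in Colab (assumes the files landed in the /content
# working directory used above):
from IPython.display import Image, HTML, display
display(Image('Subset_Database.png'))
display(HTML(open('Subset_Database.html').read()))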