Notebook

Topic Modelling

In [ ]:

# Installs requirements in case they are missing
%pip install spacy gensim pyLDAvis
import sys
import os

# Download the spaCy model with the same interpreter that runs this kernel.
# Hard-coding `python` / `python3` can hit a different environment than the one
# the packages above were installed into (virtualenv, conda, multiple Pythons),
# and the old per-platform branches silently did nothing on other platforms.
import subprocess

subprocess.run(
    [sys.executable, '-m', 'spacy', 'download', 'en_core_web_md'],
    check=False,  # like os.system before: a failed download does not raise here
)

In [ ]:

import glob
import spacy
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from IPython.display import display
import pyLDAvis

# Newer pyLDAvis releases (3.3+) ship the gensim adapter as
# `pyLDAvis.gensim_models`; older ones call it `pyLDAvis.gensim`. Import
# whichever exists and expose it under the old attribute name so the
# `pyLDAvis.gensim.prepare(...)` calls in the cells below work with both.
try:
    import pyLDAvis.gensim_models as _gensim_vis
except ImportError:
    import pyLDAvis.gensim as _gensim_vis
pyLDAvis.gensim = _gensim_vis

# Let pyLDAvis visualizations render inline in the notebook.
pyLDAvis.enable_notebook()

# spaCy pipeline shared by preprocess() (model downloaded in the install cell).
nlp = spacy.load('en_core_web_md')

In [ ]:

def preprocess(text):
    """Return the lemmas of the content words in *text*.

    The text is run through the module-level spaCy pipeline ``nlp``. A
    token's lemma is kept only when the token is neither a stop word nor
    punctuation and the lemma consists solely of alphabetic characters.
    """
    return [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct) and tok.lemma_.isalpha()
    ]

def create_lda_model(file_paths, num_topics=10, passes=15, random_state=None):
    """Train an LDA topic model on a set of plain-text files.

    Each file is read in full, reduced to lemmas with ``preprocess`` and
    treated as one document of the corpus.

    Args:
        file_paths: Iterable of paths to the text files to model.
        num_topics: Number of topics to fit (default 10).
        passes: Number of training passes over the corpus (default 15).
        random_state: Optional seed forwarded to ``LdaModel``; set it to get
            the same topics on every run. ``None`` keeps gensim's default.

    Returns:
        A ``(lda_model, corpus, dictionary)`` tuple: the trained model, the
        bag-of-words corpus and the gensim dictionary built from the files.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    # Materialise once so generators work and emptiness can be checked.
    file_paths = list(file_paths)
    if not file_paths:
        # Fail early with a clear message instead of an opaque error from
        # gensim when it is handed an empty corpus.
        raise ValueError(
            "create_lda_model() needs at least one text file, "
            "but no file paths were given."
        )

    cleaned_texts = []
    for path_book_name in file_paths:
        # Explicit encoding: the platform default (e.g. cp1252 on Windows)
        # would otherwise mis-decode or reject UTF-8 texts.
        with open(path_book_name, 'r', encoding='utf-8') as file:
            cleaned_texts.append(preprocess(file.read()))

    dictionary = corpora.Dictionary(cleaned_texts)
    corpus = [dictionary.doc2bow(text) for text in cleaned_texts]

    lda_model = LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

    return lda_model, corpus, dictionary

The Intertopic Distance Map is a visualization commonly used in topic modelling to show how topics relate to one another. In the graphs below, each topic is drawn as a bubble: the distance between two bubbles reflects how similar the topics are (closer bubbles are more similar), and the size of a bubble reflects how prevalent that topic is in the corpus. Information source: pyLDAvis documentation.

In [ ]:

# Shakespeare corpus: collect the texts, fit the LDA model, and prepare
# the pyLDAvis data for it.
shakespeare_files = glob.glob("books/Shakespeare-corpus/*.txt")
(
    shakespeare_lda_model,
    shakespeare_corpus,
    shakespeare_dictionary,
) = create_lda_model(shakespeare_files)
shakespeare_vis = pyLDAvis.gensim.prepare(
    shakespeare_lda_model,
    shakespeare_corpus,
    shakespeare_dictionary,
)

# Print a heading, then show the visualization as the cell's last expression.
print("Shakespeare Corpus Topics:")
pyLDAvis.display(shakespeare_vis)

In [ ]:

# Marlowe corpus: collect the texts, fit the LDA model, and prepare
# the pyLDAvis data for it.
marlowe_files = glob.glob("books/Marlowe-corpus/*.txt")
(
    marlowe_lda_model,
    marlowe_corpus,
    marlowe_dictionary,
) = create_lda_model(marlowe_files)
marlowe_vis = pyLDAvis.gensim.prepare(
    marlowe_lda_model,
    marlowe_corpus,
    marlowe_dictionary,
)

# Print a heading, then show the visualization as the cell's last expression.
print("Marlowe Corpus Topics:")
pyLDAvis.display(marlowe_vis)

In [ ]:

# Optional user-supplied corpus. Glob the directory once so the emptiness
# check and the model are guaranteed to see the same list of files.
custom_files = glob.glob("books/custom-corpus/*.txt")
if custom_files:
    custom_lda_model, custom_corpus, custom_dictionary = create_lda_model(custom_files)
    custom_vis = pyLDAvis.gensim.prepare(custom_lda_model, custom_corpus, custom_dictionary)

    # Inside an if-block the value is not the cell's last expression, so it
    # has to be displayed explicitly.
    print("Custom Corpus Topics:")
    display(custom_vis)
else:
    print("No text files were found in the custom-corpus directory.")