This example is based on the dataset 'A Medical History of British India', provided by the Data Foundry. It uses the trial-data version of the dataset (15.5 MB compressed). The dataset forms the first half of the Medical History of British India collection, which is itself part of the broader India Papers collection held by the National Library of Scotland.
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this example relies on:
#   - 'punkt'     : sentence/word tokenizer model used by older NLTK releases
#   - 'punkt_tab' : tokenizer data that NLTK 3.9+ loads for word_tokenize
#   - 'stopwords' : the stop word lists, including the English one
# Requesting both tokenizer packages keeps the example working on old and
# new NLTK versions.
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Read the full text of one volume. The context manager guarantees the file
# handle is closed, even if reading fails.
with open("nls-text-indiaPapers/74457530.txt", "r") as f:
    text = f.read()

# Split the raw text into word and punctuation tokens.
text_tokens = word_tokenize(text)

# Build the stop word set once, instead of re-evaluating
# stopwords.words('english') for every token; a set also makes each
# membership test O(1).
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the comparison is case-sensitive and tokens are not
# lower-cased, so capitalised stop words (e.g. "The") are presumably kept,
# as are punctuation tokens - confirm this is intended.
filtered_words = [word for word in text_tokens if word not in english_stopwords]
print(filtered_words)
from nltk.probability import FreqDist

# Count how many times each remaining token occurs, then print NLTK's text
# representation of the resulting distribution.
fdist = FreqDist(filtered_words)
print(fdist)
# Frequency distribution plot, limited to the first 30 samples of fdist.
import matplotlib.pyplot as plt

fdist.plot(30)
plt.show()