# Download the sample text file (shell command, run inside the notebook):
!wget https://raw.githubusercontent.com/cyrus723/my-first-binder/main/data/waller05302022.txt

# Open the text file:
text_file = open("waller05302022.txt", 'r')

# Read the data:
text = text_file.read()

# Let's inspect text_file itself:
print(type(text_file))
print("\n")
print(50 * '-')
print("\n")
print(text_file)

# Datatype of the data read:
print(type(text))
print("\n")

# Length of the text:
print(len(text))
print("\n")

# Print the text:
print(text)

# Prints nothing: the file pointer is already at the end of the file.
print(text_file.read())

# Rewind to the beginning and read again:
text_file.seek(0)
print(text_file.read())

# Rewind once more and iterate line by line, splitting each line into words:
text_file.seek(0)
for line in text_file:
    print(line.split())

text_file.close()

# The same read, using a context manager so the file is closed automatically:
with open("waller05302022.txt", 'r') as f:
    contents = f.read()
print(contents)

# Read the file directly from its URL with urllib:
import urllib.request

url = 'https://raw.githubusercontent.com/cyrus723/my-first-binder/main/data/waller05302022.txt'
myfile = urllib.request.urlopen(url)
for line in myfile:
    print(line)

from urllib import request

url = "https://raw.githubusercontent.com/cyrus723/my-first-binder/main/data/waller05302022.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
print(len(raw))
print(type(raw))
raw[:75]

# Tokenize the downloaded text with NLTK:
import nltk
nltk.download('punkt')

tokens = nltk.word_tokenize(raw)
print(50 * "-")
print(type(tokens))
print(50 * "-")
print(len(tokens))
print(50 * "-")
tokens[:10]

# Read the same file with the requests library:
import requests

url = 'https://raw.githubusercontent.com/cyrus723/my-first-binder/main/data/waller05302022.txt'
response = requests.get(url)
print(response.text)
print(type(response))
print(response)
print(type(response.text))
response.text

raw2 = response.text
len(raw2)

# Tokenize the text retrieved with requests:
import nltk
nltk.download('punkt')

tokens = nltk.word_tokenize(raw2)
print(50 * "-")
print(type(tokens))
print(50 * "-")
print(len(tokens))
print(50 * "-")
tokens[:10]

# Import the required libraries:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

nltk.download("popular")

# Tokenize the text by sentences:
sentences = sent_tokenize(raw2)

# How many sentences are there?
print(len(sentences))

# Print the sentences:
#print(sentences)
sentences

# Tokenize the text into words:
words = word_tokenize(raw2)

# How many words are there?
print(len(words))
print("\n")

# Print the words:
print(words)
words[:10]
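# Optional check (a minimal sketch, not part of the original walkthrough): combine
# the `sentences` list and `word_tokenize` from above to see how many word tokens
# each of the first few sentences contains. It reuses only names already defined here.
for i, sentence in enumerate(sentences[:5]):
    print(i, len(word_tokenize(sentence)), sentence[:40])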
# Import the required library:
from nltk.probability import FreqDist

# Find the word frequencies:
fdist = FreqDist(words)

# Print the 10 most common words:
fdist.most_common(10)

# Plot the graph for fdist:
import matplotlib.pyplot as plt
fdist.plot(10)

# Empty list to store words:
words_no_punc = []

# Remove punctuation marks and lowercase the remaining words:
for w in words:
    if w.isalpha():
        words_no_punc.append(w.lower())

# Print the words without punctuation marks:
print(words_no_punc)
print("\n")

# Length:
print(len(words_no_punc))

# Frequency distribution:
fdist = FreqDist(words_no_punc)
fdist.most_common(10)

# Plot the most common words on a graph:
fdist.plot(10)

from nltk.corpus import stopwords

# List of English stopwords (bound to a new name so the imported module is not shadowed):
stop_words = stopwords.words("english")
print(stop_words)

# Empty list to store clean words:
clean_words = []

for w in words_no_punc:
    if w not in stop_words:
        clean_words.append(w)

print(clean_words)
print("\n")
print(len(clean_words))

# Frequency distribution:
fdist = FreqDist(clean_words)
fdist.most_common(10)

# Plot the most common words on a graph:
fdist.plot(10)

# Library to build the word cloud:
from wordcloud import WordCloud

# Library to plot the word cloud:
import matplotlib.pyplot as plt

# Generate the word cloud from the raw text read from the local file:
wordcloud = WordCloud().generate(text)

# Plot the word cloud:
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)

# Remove the axis values:
plt.axis("off")
plt.show()
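# A small variation (a sketch, not in the original walkthrough): build the word
# cloud from the stopword-free `clean_words` list instead of the raw text, and save
# the figure to disk. The filename "wordcloud_clean.png" is just an example.
wordcloud_clean = WordCloud().generate(" ".join(clean_words))
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud_clean)
plt.axis("off")
plt.savefig("wordcloud_clean.png", bbox_inches="tight")
plt.show()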