# Download the sample text file (shell command, run inside the notebook):
!wget https://raw.githubusercontent.com/cyrus723/my-first-binder/main/data/waller05302022.txt

# Open the text file:
text_file = open("waller05302022.txt", 'r')

# Read the data:
text = text_file.read()

# Let's inspect text_file itself:
print(type(text_file))
print("\n")
print(50 * '-')
print("\n")
print(text_file)

# Datatype of the data read:
print(type(text))
print("\n")

# Length of the text:
print(len(text))
print("\n")

# Print the text:
print(text)

# Prints nothing: the file pointer is already at the end of the file.
print(text_file.read())

# Rewind to the beginning and read again:
text_file.seek(0)
print(text_file.read())

# Rewind once more and iterate line by line, splitting each line into words:
text_file.seek(0)
for line in text_file:
    print(line.split())

text_file.close()

# The same read, using a context manager so the file is closed automatically:
with open("waller05302022.txt", 'r') as f:
    contents = f.read()
print(contents)

# Read the file directly from its URL with urllib:
import urllib.request

url = 'https://raw.githubusercontent.com/cyrus723/my-first-binder/main/data/waller05302022.txt'
myfile = urllib.request.urlopen(url)
for line in myfile:
    print(line)

from urllib import request

url = "https://raw.githubusercontent.com/cyrus723/my-first-binder/main/data/waller05302022.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')
print(len(raw))
print(type(raw))
raw[:75]

# Tokenize the downloaded text with NLTK:
import nltk
nltk.download('punkt')

tokens = nltk.word_tokenize(raw)
print(50 * "-")
print(type(tokens))
print(50 * "-")
print(len(tokens))
print(50 * "-")
tokens[:10]

# Read the same file with the requests library:
import requests

url = 'https://raw.githubusercontent.com/cyrus723/my-first-binder/main/data/waller05302022.txt'
response = requests.get(url)
print(response.text)
print(type(response))
print(response)
print(type(response.text))
response.text

raw2 = response.text
len(raw2)

# Tokenize the text retrieved with requests:
import nltk
nltk.download('punkt')

tokens = nltk.word_tokenize(raw2)
print(50 * "-")
print(type(tokens))
print(50 * "-")
print(len(tokens))
print(50 * "-")
tokens[:10]

# Import the required libraries:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

nltk.download("popular")

# Tokenize the text by sentences:
sentences = sent_tokenize(raw2)

# How many sentences are there?
print(len(sentences))

# Print the sentences:
#print(sentences)
sentences

# Tokenize the text into words:
words = word_tokenize(raw2)

# How many words are there?
print(len(words))
print("\n")

# Print the words:
print(words)
words[:10]
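# Optional check (a minimal sketch, not part of the original walkthrough): combine
# the `sentences` list and `word_tokenize` from above to see how many word tokens
# each of the first few sentences contains. It reuses only names already defined here.
for i, sentence in enumerate(sentences[:5]):
    print(i, len(word_tokenize(sentence)), sentence[:40])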
# Import the required library:
from nltk.probability import FreqDist

# Find the word frequencies:
fdist = FreqDist(words)

# Print the 10 most common words:
fdist.most_common(10)

# Plot the graph for fdist:
import matplotlib.pyplot as plt
fdist.plot(10)

# Empty list to store words:
words_no_punc = []

# Remove punctuation marks and lowercase the remaining words:
for w in words:
    if w.isalpha():
        words_no_punc.append(w.lower())

# Print the words without punctuation marks:
print(words_no_punc)
print("\n")

# Length:
print(len(words_no_punc))

# Frequency distribution:
fdist = FreqDist(words_no_punc)
fdist.most_common(10)

# Plot the most common words on a graph:
fdist.plot(10)

from nltk.corpus import stopwords

# List of English stopwords (bound to a new name so the imported module is not shadowed):
stop_words = stopwords.words("english")
print(stop_words)

# Empty list to store clean words:
clean_words = []

for w in words_no_punc:
    if w not in stop_words:
        clean_words.append(w)

print(clean_words)
print("\n")
print(len(clean_words))

# Frequency distribution:
fdist = FreqDist(clean_words)
fdist.most_common(10)

# Plot the most common words on a graph:
fdist.plot(10)

# Library to build the word cloud:
from wordcloud import WordCloud

# Library to plot the word cloud:
import matplotlib.pyplot as plt

# Generate the word cloud from the raw text read from the local file:
wordcloud = WordCloud().generate(text)

# Plot the word cloud:
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud)

# Remove the axis values:
plt.axis("off")
plt.show()
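# A small variation (a sketch, not in the original walkthrough): build the word
# cloud from the stopword-free `clean_words` list instead of the raw text, and save
# the figure to disk. The filename "wordcloud_clean.png" is just an example.
wordcloud_clean = WordCloud().generate(" ".join(clean_words))
plt.figure(figsize=(12, 12))
plt.imshow(wordcloud_clean)
plt.axis("off")
plt.savefig("wordcloud_clean.png", bbox_inches="tight")
plt.show()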