# download a web page and keep a local copy of its HTML
import urllib.request

url = 'https://rambasnet.github.io/teaching.html'
localfile = 'teaching.html'
# urlretrieve writes the response body to `localfile` and returns (filename, headers)
urllib.request.urlretrieve(url, filename=localfile)
('teaching.html', <http.client.HTTPMessage at 0x111004828>)
# naive word count over the raw HTML source (markup included)
# decode explicitly as UTF-8 so the result does not depend on the platform's default encoding
with open(localfile, encoding='utf-8') as f:
    data = f.read()
# split(' ') breaks only on single spaces: newlines are not separators and
# consecutive spaces produce empty strings, all of which are counted below
words = data.split(' ')
print('There are {0} words in the file.'.format(len(words)))
There are 10165 words in the file.
# In Jupyter/IPython, a line starting with ! is run as a terminal/bash command instead of Python.
# Here it installs bs4 from PyPI, which pulls in the beautifulsoup4 package.
! pip install bs4
Requirement already satisfied: bs4 in /Users/rbasnet/miniconda3/lib/python3.7/site-packages (0.0.1) Requirement already satisfied: beautifulsoup4 in /Users/rbasnet/miniconda3/lib/python3.7/site-packages (from bs4) (4.7.1) Requirement already satisfied: soupsieve>=1.2 in /Users/rbasnet/miniconda3/lib/python3.7/site-packages (from beautifulsoup4->bs4) (1.9.1)
from bs4 import BeautifulSoup

localfile = 'teaching.html'
# decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding
with open(localfile, encoding='utf-8') as f:
    # NOTE: the 'lxml' parser needs the lxml package installed in addition to bs4
    soup = BeautifulSoup(f.read(), 'lxml')
# get_text() concatenates every text node; for this page that includes the
# contents of inline <script> elements, not just the visible text
text = soup.get_text()
print(text)
Ram Basnet | Homepage Dr. Ram Basnet Associate Professor of Computer Science Home Teaching Research Resources Contact Teaching Teaching Interests Cyber Security Python, C++, Java, Database, JavaScript Data Science Web Design and Secure Web App Development Courses Taught at CMU CSCI 106 - Web Page I 6 CSCI 110 - Beg. Prog. Python & Lab 6 CS1 - Foundation of CS 7 CS2 - Data Structures 7 CSCI 206 - Web Page II 2 CS3 - Intro to Algorithms 2 CSCI 310 - Adv. Prog. Python 7 CSCI 420 - Cyber Security 5 CSCI 465 - Net/App Security 5 CURRENT SCHEDULE Mon Tues Wed Thrs Fri 8:00 CS0WS 120 CS0-LWS 120 CS0WS 120 CS0-LWS 120 CS0WS 120 8:30 9:00 Ad PyWS 118 Off. Hr.CH 321 Ad PyWS 118 Off. Hr.CH 321 Off. Hr.WS 116 (CRL) 9:30 10:00 Off. Hr.CH 321 10:30 11:00 Net/App SecWS 205 Net/App SecWS 205 Net/App SecWS 205 11:30 12:00 12:30 1:00 CS 3CH 276 CS 3CH 276 CS 3CH 276 1:30 2:00 Off. Hr.CH 321 2:30 3:00 3:30 Home | Teaching | Research | Resources | Contact © 2019 var dt = new Date() document.getElementById("year").innerHTML = dt.getFullYear() var windowSize = window.matchMedia("(max-width: 375px)") if (windowSize.matches) { var element = document.getElementById("cmu-logo") element.setAttribute("style", "visibility: hidden;") } /* var navul = document.getElementById("navul") var alists = navul.getElementsByClassName("nav-link") for (var i = 0; i < alists.length; i++) { alists[i].addEventListener("click", function() { var current = document.getElementsByClassName("active") current[0].className = current[0].className.replace(" active", "") this.className += " active" }) } */ var hrefString = document.location.href ? 
document.location.href : document.location var url = hrefString.split("/") //replace string with location.href var navLinks = document .getElementById("navul") .getElementsByClassName("nav-item") var currentPage = url[url.length - 1] for (var i = 0; i < navLinks.length; i++) { var link = navLinks[i].getElementsByClassName("nav-link")[0] var lb = link.href.split("/") if (lb[lb.length - 1] == currentPage) { navLinks[i].className += " active" } else { navLinks[i].className = navLinks[i].className.replace(" active", "") } } ;(function(i, s, o, g, r, a, m) { i["GoogleAnalyticsObject"] = r ;(i[r] = i[r] || function() { ;(i[r].q = i[r].q || []).push(arguments) }), (i[r].l = 1 * new Date()) ;(a = s.createElement(o)), (m = s.getElementsByTagName(o)[0]) a.async = 1 a.src = g m.parentNode.insertBefore(a, m) })( window, document, "script", "//www.google-analytics.com/analytics.js", "ga" ) ga("create", "UA-46738331-1", "coloradomesa.edu") ga("send", "pageview")
# split the extracted text into lines, trimming whitespace from both ends of each one
lines = list(map(str.strip, text.splitlines()))
print(lines[:20])
['', '', 'Ram Basnet | Homepage', '', '', '', '', '', '', '', '', '', '', '', '', 'Dr. Ram Basnet', 'Associate Professor of Computer Science', '', '', '']
# flatten the lines into lowercase words by splitting on any run of whitespace
# (split() already drops surrounding whitespace, so tokens need no extra strip)
words = [token.lower() for token in ' '.join(lines).split()]
print(words[:20])
['ram', 'basnet', '|', 'homepage', 'dr.', 'ram', 'basnet', 'associate', 'professor', 'of', 'computer', 'science', 'home', 'teaching', 'research', 'resources', 'contact', 'teaching', 'teaching', 'interests']
# report how many whitespace-separated tokens the extracted page text contains
total_words = len(words)
print('There are {0} words in the file.'.format(total_words))
There are 367 words in the file.
from collections import defaultdict

# tally how often each word occurs; a missing key starts from int() == 0
hist = defaultdict(int)
for w in words:
    hist[w] = hist[w] + 1

# take the (word, count) pairs in first-seen order -- not yet sorted by frequency
listHist = list(hist.items())
print(listHist[:10])
[('ram', 2), ('basnet', 2), ('|', 5), ('homepage', 1), ('dr.', 1), ('associate', 1), ('professor', 1), ('of', 2), ('computer', 1), ('science', 2)]
# order the pairs in place from most to least frequent; negating the count
# sorts descending while the stable sort keeps tied words in their prior order
listHist.sort(key=lambda pair: -pair[1])
print(listHist[:10])
[('=', 25), ('var', 12), ('-', 11), ('{', 8), ('csci', 6), ('|', 5), ('i', 5), ('120', 5), ('off.', 5), ('}', 5)]
fileSrc = './resources/brain.jpg'
fileDst = 'brain-copy.jpg'

# binary modes ('rb' / 'wb') move the raw bytes without any text decoding
with open(fileSrc, mode='rb') as rbf:
    # load the entire source file into memory
    data = rbf.read()

with open(fileDst, mode='wb') as wbf:
    # write all of the bytes out to the copy
    wbf.write(data)
import hashlib

# read each file inside a `with` block so its handle is closed as soon as
# the bytes are in memory, rather than being left open
with open(fileSrc, 'rb') as srcFile:
    file1Contents = srcFile.read()
with open(fileDst, 'rb') as dstFile:
    file2Contents = dstFile.read()

# equal SHA-256 digests mean the copy holds the same bytes as the source
file1ChkSum = hashlib.sha256(file1Contents).hexdigest()
file2ChkSum = hashlib.sha256(file2Contents).hexdigest()
if file1ChkSum == file2ChkSum:
    print('two files checksums match!')
else:
    print('oops! two files checksums do NOT match!')
two files checksums match!
import pickle

# the even numbers from 2 through 20
alist = [2 * k for k in range(1, 11)]
print(alist)
[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# let's pickle alist: serialize it to a binary file
pickleFile = 'myPickle.pkl'
with open(pickleFile, mode='wb') as p:
    pickle.dump(alist, file=p)

# let's unpickle alist: load the object back from the file just written
with open(pickleFile, mode='rb') as p:
    blist = pickle.load(p)

# the round trip should give back an equal list
alist == blist
True