#!/usr/bin/env python
# coding: utf-8
"""Web-scraping tutorial script (exported from a Jupyter notebook).

Walks through BeautifulSoup basics (find/find_all, CSS selectors, node
navigation) against https://keithgalli.github.io/web-scraping/, then solves
the exercises: collecting social links, scraping a stats table into pandas,
filtering fun facts, downloading an image, and the "mystery word" challenge.

NOTE: bare expressions below (e.g. a lone ``first_header``) were notebook
cell outputs; in script form they evaluate and are discarded.
"""

# Imports consolidated at the top (the notebook had `import re` twice and
# `import pandas as pd` mid-file; behavior is unchanged).
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# # Importing Necessary Libraries / ## Loading our First Page

# Load the webpage content.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')
# Convert to a BeautifulSoup object. The parser is named explicitly:
# relying on bs4's auto-detection emits GuessedAtParserWarning and can
# produce different trees on machines with different parsers installed.
soup = bs(r.content, 'html.parser')
# Print out our html
print(soup.prettify())

# ## Start using BeautifulSoup to Scrape

first_header = soup.find('h2')
first_header

headers = soup.find_all('h2')
headers

# Pass in a list of elements to look for; find() returns the first match
# in document order regardless of the list's ordering.
first_header = soup.find(["h1", "h2"])
first_header

first_header = soup.find(["h2", "h1"])
first_header

headers = soup.find_all(["h1", "h2"])
headers

# You can pass in attributes to the find/find_all function
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph

# You can nest find/find_all calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

# We can search for specific strings in our find/find_all calls.
para = soup.find_all('p', string=re.compile(r'Some'))
para

head = soup.find_all('h2', string=re.compile(r'(H|h)eader'))
head

# ## Select (CSS Selector)

content = soup.select('div p')
content

# General-sibling combinator: <p> elements preceded by an <h2>.
pg = soup.select('h2 ~ p')
pg

bold = soup.select('p#paragraph-id b')
bold

# Child combinator: only <p> elements that are direct children of <body>.
paras = soup.select('body > p')
print(paras)

for para in paras:
    print(para.select("i"))

# Grab by element with a specific attribute value.
soup.select("[align=middle]")

# ## Get different properties of the HTML
# ### Getting Strings from HTML

# .string works when the tag has a single string child.
soup.find('h2').string

# If there are multiple child elements, use get_text() instead.
div = soup.find('div')
print(div.get_text())

# ### Getting Links from HTML

# Tag attributes are accessed like dictionary items.
link = soup.find('a')
link['href']

# ### Subsetting to get what you want from HTML

paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

# ## Code Navigation
# Know the terms: Parent, Sibling, Child.

soup.body.find("div").find_parents()
soup.body.find("div").find_parent()
soup.body.find("div").find_previous_siblings()
soup.body.find("div").find_previous_sibling()
soup.body.find("div").find_next_siblings()
soup.body.find("div").find_next_sibling()

# # Exercises
# ## Loading the webpage

r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")
wp = bs(r.content, 'html.parser')
print(wp.prettify())

# ## Question 1: Grab all of the social links from the web page in 4 ways
# Web page: https://keithgalli.github.io/web-scraping/webpage.html

# ### Method 1: CSS descendant selector on the <ul class="socials">.
links = wp.select('ul.socials a')
actual_links = [link['href'] for link in links]
actual_links

# ### Method 2: find() the list, then find_all() the anchors
# (find() alone returns a single tag, not a list).
ulist = wp.find('ul', attrs={'class': 'socials'})
links = ulist.find_all("a")
actual_links = [link['href'] for link in links]
actual_links

# ### Method 3: select via the <li class="social"> items.
links = wp.select("li.social a")
actual_links = [link['href'] for link in links]
actual_links

# ### Method 4: fully spelled-out descendant path.
links = wp.select("body ul li.social a")
actual_links = [link['href'] for link in links]
actual_links

# ## Scraping the MIT Hockey Stats table

table = wp.select('table.hockey-stats')[0]
# Header cells give the column names.
column_names = [th.string for th in table.find_all('th')]
# One list of cell strings per body row. The inner comprehension variable
# must NOT reuse the outer loop name (the original shadowed `tr`).
rows = []
for tr in table.find('tbody').find_all('tr'):
    cells = tr.find_all('td')
    rows.append([str(cell.get_text()).strip() for cell in cells])
df = pd.DataFrame(rows, columns=column_names)
df

# ## Grab all fun facts that contain the word 'is'

facts = wp.select('ul.fun-facts li')
# First pass: the matching text node (or None) per <li>;
# second pass: climb back to the <li> and take its full text.
facts_with_is = [fact.find(string=re.compile(r'is')) for fact in facts]
facts_with_is = [
    fact.find_parent().get_text() for fact in facts_with_is if fact
]
facts_with_is

# ## Download an Image from a web page

url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(url + "webpage.html")
webpage = bs(r.content, 'html.parser')

# Select from the page we just parsed (the original selected from the
# stale `wp` object — same content here, but a latent inconsistency).
images = webpage.select("div.row div.column img")
image_url = images[0]['src']
full_url = url + image_url

img_data = requests.get(full_url).content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)
# **Image is Downloaded**

# ## Solve the mystery challenge!

# Each linked file hides one word in <p id="secret-word">; print them all.
files = webpage.select("div.block a")
relative_files = [f['href'] for f in files]

url = "https://keithgalli.github.io/web-scraping/"
for f in relative_files:
    full_url = url + f
    page = requests.get(full_url)
    bs_page = bs(page.content, 'html.parser')
    secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
    secret_word = secret_word_element.string
    print(secret_word)