#!/usr/bin/env python
# coding: utf-8
"""Web-scraping tutorial script (exported from a Jupyter notebook).

Walks through BeautifulSoup basics (find/find_all, CSS selectors, node
navigation) against https://keithgalli.github.io/web-scraping/, then solves
the exercises: collecting social links, scraping a stats table into pandas,
filtering fun facts, downloading an image, and the "mystery word" challenge.

NOTE: bare expressions below (e.g. a lone ``first_header``) were notebook
cell outputs; in script form they evaluate and are discarded.
"""

# Imports consolidated at the top (the notebook had `import re` twice and
# `import pandas as pd` mid-file; behavior is unchanged).
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# # Importing Necessary Libraries / ## Loading our First Page

# Load the webpage content.
r = requests.get('https://keithgalli.github.io/web-scraping/example.html')
# Convert to a BeautifulSoup object. The parser is named explicitly:
# relying on bs4's auto-detection emits GuessedAtParserWarning and can
# produce different trees on machines with different parsers installed.
soup = bs(r.content, 'html.parser')
# Print out our html
print(soup.prettify())

# ## Start using BeautifulSoup to Scrape

first_header = soup.find('h2')
first_header

headers = soup.find_all('h2')
headers

# Pass in a list of elements to look for; find() returns the first match
# in document order regardless of the list's ordering.
first_header = soup.find(["h1", "h2"])
first_header

first_header = soup.find(["h2", "h1"])
first_header

headers = soup.find_all(["h1", "h2"])
headers

# You can pass in attributes to the find/find_all function
paragraph = soup.find_all('p', attrs={'id': 'paragraph-id'})
paragraph

# You can nest find/find_all calls
body = soup.find('body')
div = body.find('div')
header = div.find('h1')
header

# We can search for specific strings in our find/find_all calls.
para = soup.find_all('p', string=re.compile(r'Some'))
para

head = soup.find_all('h2', string=re.compile(r'(H|h)eader'))
head

# ## Select (CSS Selector)

content = soup.select('div p')
content

# General-sibling combinator: <p> elements preceded by an <h2>.
pg = soup.select('h2 ~ p')
pg

bold = soup.select('p#paragraph-id b')
bold

# Child combinator: only <p> elements that are direct children of <body>.
paras = soup.select('body > p')
print(paras)

for para in paras:
    print(para.select("i"))

# Grab by element with a specific attribute value.
soup.select("[align=middle]")

# ## Get different properties of the HTML
# ### Getting Strings from HTML

# .string works when the tag has a single string child.
soup.find('h2').string

# If there are multiple child elements, use get_text() instead.
div = soup.find('div')
print(div.get_text())

# ### Getting Links from HTML

# Tag attributes are accessed like dictionary items.
link = soup.find('a')
link['href']

# ### Subsetting to get what you want from HTML

paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

# ## Code Navigation
# Know the terms: Parent, Sibling, Child.

soup.body.find("div").find_parents()
soup.body.find("div").find_parent()
soup.body.find("div").find_previous_siblings()
soup.body.find("div").find_previous_sibling()
soup.body.find("div").find_next_siblings()
soup.body.find("div").find_next_sibling()

# # Exercises
# ## Loading the webpage

r = requests.get("https://keithgalli.github.io/web-scraping/webpage.html")
wp = bs(r.content, 'html.parser')
print(wp.prettify())

# ## Question 1: Grab all of the social links from the web page in 4 ways
# Web page: https://keithgalli.github.io/web-scraping/webpage.html

# ### Method 1: CSS descendant selector on the <ul class="socials">.
links = wp.select('ul.socials a')
actual_links = [link['href'] for link in links]
actual_links

# ### Method 2: find() the list, then find_all() the anchors
# (find() alone returns a single tag, not a list).
ulist = wp.find('ul', attrs={'class': 'socials'})
links = ulist.find_all("a")
actual_links = [link['href'] for link in links]
actual_links

# ### Method 3: select via the <li class="social"> items.
links = wp.select("li.social a")
actual_links = [link['href'] for link in links]
actual_links

# ### Method 4: fully spelled-out descendant path.
links = wp.select("body ul li.social a")
actual_links = [link['href'] for link in links]
actual_links

# ## Scraping the MIT Hockey Stats table

table = wp.select('table.hockey-stats')[0]
# Header cells give the column names.
column_names = [th.string for th in table.find_all('th')]
# One list of cell strings per body row. The inner comprehension variable
# must NOT reuse the outer loop name (the original shadowed `tr`).
rows = []
for tr in table.find('tbody').find_all('tr'):
    cells = tr.find_all('td')
    rows.append([str(cell.get_text()).strip() for cell in cells])
df = pd.DataFrame(rows, columns=column_names)
df

# ## Grab all fun facts that contain the word 'is'

facts = wp.select('ul.fun-facts li')
# First pass: the matching text node (or None) per <li>;
# second pass: climb back to the <li> and take its full text.
facts_with_is = [fact.find(string=re.compile(r'is')) for fact in facts]
facts_with_is = [
    fact.find_parent().get_text() for fact in facts_with_is if fact
]
facts_with_is

# ## Download an Image from a web page

url = "https://keithgalli.github.io/web-scraping/"
r = requests.get(url + "webpage.html")
webpage = bs(r.content, 'html.parser')

# Select from the page we just parsed (the original selected from the
# stale `wp` object — same content here, but a latent inconsistency).
images = webpage.select("div.row div.column img")
image_url = images[0]['src']
full_url = url + image_url

img_data = requests.get(full_url).content
with open('lake_como.jpg', 'wb') as handler:
    handler.write(img_data)
# **Image is Downloaded**

# ## Solve the mystery challenge!

# Each linked file hides one word in <p id="secret-word">; print them all.
files = webpage.select("div.block a")
relative_files = [f['href'] for f in files]

url = "https://keithgalli.github.io/web-scraping/"
for f in relative_files:
    full_url = url + f
    page = requests.get(full_url)
    bs_page = bs(page.content, 'html.parser')
    secret_word_element = bs_page.find("p", attrs={"id": "secret-word"})
    secret_word = secret_word_element.string
    print(secret_word)