#!/usr/bin/env python
# coding: utf-8

# # Webscraping 101
# - What is webscraping?
# - Packages used
# - Making a request
# - Getting and parsing the data
# - A couple of tricks to get around tricky websites

# #### What is webscraping?
# Webscraping is extracting data from a website. This could be text, numbers,
# images, urls, etc. It is mainly used as a tool for data collection and
# research purposes.

# #### Packages that are used

# In[1]:

import requests  # used for making requests to a website and getting the response
from bs4 import BeautifulSoup  # used for parsing the HTML data

# #### Making a Request

# In[2]:

# NOTE: always pass a timeout — without one, requests.get can block forever
# if the server stalls or never responds.
response = requests.get('https://google.com', timeout=10)

# In[3]:

# status code
print(response.status_code)

# #### A Couple of Common Status Codes
# - 200 (successful response from server)
# - 403 (forbidden response to server --> usually means they are doing a good
#   job of blocking or you are missing credentials)
# - 404 (not found --> usually means the page you are looking for does not
#   exist anymore)
# - Anything 500 is usually a server error
#
# You can find a full list of HTTP response codes here:
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Status

# In[4]:

# page source
print(response.text)
# url sent
print(response.url)

# Unfortunately, there's not much to scrape from Google's main page,
# so let's try Nike.

# In[5]:

response = requests.get('https://www.nike.com/w/new-mens-shoes-3n82yznik1zy7ok', timeout=10)

# In[6]:

# Bare expressions like this only display output inside a notebook;
# they are no-ops when run as a plain script.
response.status_code

# ### Parsing the webpage
# To parse the webpage we will want to use BeautifulSoup to extract
# information from the page source that we just got.

# In[7]:

# first argument is the text, second is the type of parser we want to use
soup = BeautifulSoup(response.text, 'html.parser')

# In[8]:

# CSS selector: every <div> whose class attribute is exactly
# "product-card__body" (one card per shoe on the listing page).
data = soup.select('div[class="product-card__body"]')

# In[9]:

len(data)

# In[10]:

# .text collapses each card's HTML down to its visible text content.
for card in data:
    print(card.text)

# ### Getting around Tricky Websites
# 1. Headers
# 2. Proxies
# 3. Selenium

# In[11]:

# Pretend to be a real desktop Chrome browser; many sites block the default
# "python-requests" User-Agent outright.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'}

# In[12]:

# First, request TikTok WITHOUT browser-like headers...
response = requests.get('https://www.tiktok.com/', timeout=10)

# In[13]:

# ...which typically returns a blocked/stub page.
response.text

# In[14]:

# Now the same request WITH the browser User-Agent header.
response = requests.get('https://www.tiktok.com/', headers=headers, timeout=10)

# In[15]:

response.text

# #### Proxies
# Many proxy services, such as Scraper API, provide rotating proxies that you
# can route your requests through to avoid IP-based blocking.

# #### Selenium
# This will be another video, but it allows you to automate actual Chrome
# browsers.