#!/usr/bin/env python
# coding: utf-8

# # Webscraping 101
# - What is webscraping?
# - Packages used
# - Making a request
# - Getting and parsing the data
# - A couple of tricks to get around tricky websites

# #### What is webscraping?
# Webscraping is extracting data from a website. This could be text, numbers,
# images, urls, etc. It is mainly used as a tool for data collection and
# research purposes.

# #### Packages that are used

# In[1]:

import requests  # used for making requests to a website and getting the response
from bs4 import BeautifulSoup  # used for parsing the HTML data

# #### Making a Request

# In[2]:

# NOTE: always pass a timeout — without one, requests.get can block forever
# if the server stalls or never responds.
response = requests.get('https://google.com', timeout=10)

# In[3]:

# status code
print(response.status_code)

# #### A Couple of Common Status Codes
# - 200 (successful response from server)
# - 403 (forbidden response to server --> usually means they are doing a good
#   job of blocking or you are missing credentials)
# - 404 (not found --> usually means the page you are looking for does not
#   exist anymore)
# - Anything 500 is usually a server error
#
# You can find a full list of HTTP response codes here:
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Status

# In[4]:

# page source
print(response.text)
# url sent
print(response.url)

# Unfortunately, there's not much to scrape from Google's main page,
# so let's try Nike.

# In[5]:

response = requests.get('https://www.nike.com/w/new-mens-shoes-3n82yznik1zy7ok', timeout=10)

# In[6]:

# Bare expressions like this only display output inside a notebook;
# they are no-ops when run as a plain script.
response.status_code

# ### Parsing the webpage
# To parse the webpage we will want to use BeautifulSoup to extract
# information from the page source that we just got.

# In[7]:

# first argument is the text, second is the type of parser we want to use
soup = BeautifulSoup(response.text, 'html.parser')

# In[8]:

# CSS selector: every <div> whose class attribute is exactly
# "product-card__body" (one card per shoe on the listing page).
data = soup.select('div[class="product-card__body"]')

# In[9]:

len(data)

# In[10]:

# .text collapses each card's HTML down to its visible text content.
for card in data:
    print(card.text)

# ### Getting around Tricky Websites
# 1. Headers
# 2. Proxies
# 3. Selenium

# In[11]:

# Pretend to be a real desktop Chrome browser; many sites block the default
# "python-requests" User-Agent outright.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36'}

# In[12]:

# First, request TikTok WITHOUT browser-like headers...
response = requests.get('https://www.tiktok.com/', timeout=10)

# In[13]:

# ...which typically returns a blocked/stub page.
response.text

# In[14]:

# Now the same request WITH the browser User-Agent header.
response = requests.get('https://www.tiktok.com/', headers=headers, timeout=10)

# In[15]:

response.text

# #### Proxies
# Many proxy services, such as Scraper API, provide rotating proxies that you
# can route your requests through to avoid IP-based blocking.

# #### Selenium
# This will be another video, but it allows you to automate actual Chrome
# browsers.