#!/usr/bin/env python
# coding: utf-8

# # Lesson 7—Fetching data online
# Version 1.0. Prepared by [Makzan](https://makzan.net). Updated in March 2021.

# In this series, we will use 3 lectures to learn how to fetch data online. This includes:
#
# - Finding patterns in URL
# - Open web URL
# - Downloading files in Python
# - Fetch data with API
# - **Web scraping with Requests and BeautifulSoup**
# - Web automation with Selenium
# - Converting Wikipedia tabular data into CSV

# In this lesson, we will learn to download a web page and parse the HTML to extract the data we need.
# We will use `requests` and `BeautifulSoup`. `Requests` downloads the web page HTML file and
# `BeautifulSoup` parses the HTML into a tree structure for us to access and extract data.

# ## Web Scraping
#
# 1. Querying web page
# 1. Parse the DOM tree
# 1. Get the data we want from the HTML code

# In[1]:


from bs4 import BeautifulSoup
import requests

# Download the page and print the stripped text of every <h5> element.
res = requests.get("https://news.gov.mo/home/zh-hant")
soup = BeautifulSoup(res.text, "html.parser")
for h5 in soup.select("h5"):
    print(h5.text.strip())


# ## Extra: Fetching with try-except

# In[36]:


import sys

from bs4 import BeautifulSoup
import requests

try:
    res = requests.get("https://news.gov.mo/home/zh-hant")
except requests.exceptions.RequestException:
    # RequestException is the base class of the errors raised by requests,
    # so this covers malformed URLs as well as lost connections, matching
    # what the message below promises.
    print("Error: Invalid URL or Connection Lost.")
    # sys.exit works in plain scripts (the bare exit() helper is only
    # installed by the `site` module) and a non-zero status signals failure.
    sys.exit(1)

soup = BeautifulSoup(res.text, "html.parser")
for h5 in soup.select("h5"):
    print(h5.text.strip())


# In[2]:


from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests

res = requests.get("https://news.gov.mo/home/zh-hant")
soup = BeautifulSoup(res.text, "html.parser")
for h5 in soup.select("h5")[:5]:
    print(h5.getText().strip())

    # Fetch the content
    anchor = h5.select_one("a")
    if anchor is None or not anchor.get("href"):
        # No link inside this <h5>, so there is nothing to fetch.
        print("---")
        continue

    # urljoin copes with both "/path" and "path" style hrefs, where plain
    # string concatenation would produce "https://news.gov.mo//path".
    article_res = requests.get(urljoin("https://news.gov.mo/", anchor["href"]))
    soup2 = BeautifulSoup(article_res.text, "html.parser")
    content = soup2.select_one(".asideBody p:first-of-type")
    if content is not None:
        # select_one returns None when the selector matches nothing.
        print(content.text)
    print("---")

print("Done.")


# ## Fetching Macao Daily news

# In[3]:


from bs4 import BeautifulSoup
import requests
import datetime
# Imports are repeated here so that this part of the script runs on its own.
import datetime

from bs4 import BeautifulSoup
import requests

# Build today's URL; the site path uses zero-padded month and day.
today = datetime.date.today()
year = today.year
month = str(today.month).zfill(2)
day = str(today.day).zfill(2)

res = requests.get(f"http://www.macaodaily.com/html/{year}-{month}/{day}/node_1.htm")
# Force UTF-8 decoding of the response body before reading res.text.
res.encoding = "utf-8"

# "html.parser" ships with Python; other parsers such as "lxml" must be installed separately.
soup = BeautifulSoup(res.text, "html.parser")

links = soup.select("#all_article_list a")
for link in links[:40]:
    print(link.text)

print("Finished.")


# ## ✏️ Exercise time: Lab 3
# 1. Please try to execute the code to see the program result.
# 1. Please try to change the keyword inside the code to fetch different queries.
# 1. Please try to make the code more flexible by changing the date and query into input.
# 1. Please try to save the result into a text file.
# 1. Please try to change the code to allow multiple searches until user enters "q".

# In[46]:


import calendar

from bs4 import BeautifulSoup
import requests

# Task 1: Change year and month into input
year = "2020"
month = "06"

# Only request days that exist in the chosen month (e.g. June has 30 days),
# instead of always looping up to 31.
days_in_month = calendar.monthrange(int(year), int(month))[1]

for i in range(1, days_in_month + 1):
    day = str(i).zfill(2)
    res = requests.get(f"http://www.macaodaily.com/html/{year}-{month}/{day}/node_1.htm")
    res.encoding = "utf-8"

    soup = BeautifulSoup(res.text, "html.parser")

    links = soup.select("#all_article_list a")
    for link in links:
        news_title = link.getText()
        # Task 2: Change keyword into input
        if "大灣區" in news_title:
            # Task 3: Save the result in TXT instead of printing out.
            print(f"{year}-{month}-{day}: {news_title}")

print("Finished.")


# Solution to Lab 3
#
# https://makclass.com/vimeo_players/335074765

# ## When is the next holiday?
# In[10]:


import datetime

# This cell previously relied on earlier cells for these two imports.
from bs4 import BeautifulSoup
import requests

url = f"https://www.gov.mo/zh-hant/public-holidays/year-{datetime.date.today().year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Print the text of the first "#public-holidays" element with newlines removed.
print(soup.select("#public-holidays")[0].text.replace('\n', ''))


# In[11]:


month = soup.select("#public-holidays .month")[0].text
day = soup.select("#public-holidays .day")[0].text
weekday = soup.select("#public-holidays .weekday")[0].text
description = soup.select("#next-holiday-description strong")[0].text
print(f"接下來的公眾假期:{description}, {month}{day}日{weekday}")


# ## A list of holidays in Macao

# In[4]:


import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.gov.mo/zh-hant/public-holidays/year-2020/")
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")
for row in tables[0].select("tr"):
    # Select the cells once per row; rows without enough <td> cells are
    # skipped so the indexing below cannot fail.
    cells = row.select("td")
    if len(cells) < 4:
        continue
    date = cells[1].text
    name = cells[3].text
    print(f"{date}: {name}")


# Only listing obligatory holidays

# In[13]:


import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.gov.mo/zh-hant/public-holidays/year-2020/")
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")
for row in tables[0].select("tr"):
    cells = row.select("td")
    if len(cells) < 4:
        continue
    # A row counts as obligatory when its first cell is exactly "*".
    is_obligatory = (cells[0].text == "*")
    if is_obligatory:
        date = cells[1].text
        name = cells[3].text
        print(f"{date}: {name}")


# ## Is today a government holiday?
# In[14]:


import requests
from bs4 import BeautifulSoup
import datetime

# Get today's year, month and day
today = datetime.date.today()
year = today.year
month = today.month
day = today.day
# date.weekday() counts from Monday: Monday is 0, Saturday is 5, Sunday is 6.
today_weekday = today.weekday()
today_date = f"{month}月{day}日"

# Fetch gov.mo
url = f"https://www.gov.mo/zh-hant/public-holidays/year-{year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Build a mapping from the date text (cell 1) to the holiday name (cell 3).
tables = soup.select(".table")
holidays = {}
for table in tables:
    for row in table.select("tr"):
        cells = row.select("td")
        if len(cells) < 4:
            # Rows without enough <td> cells carry no holiday entry.
            continue
        date = cells[1].text
        name = cells[3].text
        holidays[date] = name

# Query holidays
print(today_date)
if today_date in holidays:
    holiday = holidays[today_date]
    print(f"今天是公眾假期:{holiday}")
elif today_weekday == 6:
    # Sunday
    print("今天是星期日,但不是公眾假期。")
elif today_weekday == 5:
    # Saturday
    print("今天是星期六,但不是公眾假期。")
else:
    print("今天不是公眾假期。")


# Our code is getting longer now. We can group the parts of the code that fetch gov.mo into a function.
# We name it `is_macao_holiday` and it takes a date parameter.
# In[15]:


# Imported here so that the function does not depend on earlier cells.
import datetime

from bs4 import BeautifulSoup
import requests


def _fetch_macao_holidays(year):
    """Return a dict mapping date text to holiday name from the gov.mo page for *year*."""
    url = f"https://www.gov.mo/zh-hant/public-holidays/year-{year}/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    holidays = {}
    for table in soup.select(".table"):
        for row in table.select("tr"):
            cells = row.select("td")
            if len(cells) < 4:
                # Rows without enough <td> cells carry no holiday entry.
                continue
            # Cell 1 is used as the date key and cell 3 as the holiday name.
            holidays[cells[1].text] = cells[3].text
    return holidays


def _describe_date(query_date, holidays):
    """Return the message describing *query_date* given the *holidays* mapping."""
    date_key = f"{query_date.month}月{query_date.day}日"
    if date_key in holidays:
        return f"{date_key}是公眾假期:{holidays[date_key]}"

    # weekday() counts from Monday: Monday is 0, Saturday is 5, Sunday is 6.
    weekday = query_date.weekday()
    if weekday == 6:
        return f"{date_key}是星期日,但不是公眾假期。"
    if weekday == 5:
        return f"{date_key}是星期六,但不是公眾假期。"
    return f"{date_key}不是公眾假期。"


def is_macao_holiday(query_date):
    """Print whether *query_date* appears in the gov.mo public holiday tables.

    Args:
        query_date: An object with ``year``, ``month``, ``day`` attributes and
            a ``weekday()`` method, such as ``datetime.date`` or
            ``datetime.datetime``.

    The holiday page for ``query_date.year`` is downloaded on every call.
    If the date is not listed, the message says whether it falls on a
    Saturday or Sunday. Nothing is returned; the result is printed.
    """
    holidays = _fetch_macao_holidays(query_date.year)
    print(_describe_date(query_date, holidays))


# In[11]:


is_macao_holiday(datetime.date.today())


# ### Picking a date other than today
# We can use the parser in `dateutil` to parse a given date in string format into date format.

# In[12]:


# The parser submodule must be imported explicitly; `import dateutil` alone
# does not load it.
import dateutil.parser

date = dateutil.parser.parse("2020-01-01")
is_macao_holiday(date)


# In[13]:


import dateutil.parser

date = dateutil.parser.parse("2020-10-26")
is_macao_holiday(date)


# Furthermore, we can store the result in a dictionary for further querying.
# In[16]:


import requests
from bs4 import BeautifulSoup
import datetime

# Get today's year, month and day
today = datetime.date.today()
year = today.year
month = today.month
day = today.day
# date.weekday() counts from Monday: Monday is 0, Saturday is 5, Sunday is 6.
today_weekday = today.weekday()
today_date = f"{month}月{day}日"

# Fetch gov.mo
url = f"https://www.gov.mo/zh-hant/public-holidays/year-{year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Store one record per table row, keyed by the date text found in cell 1.
tables = soup.select(".table")
holidays = {}
for table in tables:
    for row in table.select("tr"):
        cells = row.select("td")
        if len(cells) < 4:
            # Rows without enough <td> cells carry no holiday entry.
            continue
        # A row counts as obligatory when its first cell is exactly "*".
        is_obligatory = (cells[0].text == "*")
        date = cells[1].text
        weekday = cells[2].text
        name = cells[3].text
        holidays[date] = {
            'date': date,
            'weekday': weekday,
            'name': name,
            'is_obligatory': is_obligatory,
        }


# The result is stored in the dictionary `holidays`.

# In[17]:


len(holidays)


# In[18]:


holidays


# In[19]:


# Query holidays
print(today_date)
if today_date in holidays:
    holiday = holidays[today_date]
    if holiday['is_obligatory']:
        print(f"今天是強制公眾假期:{holiday['name']}")
    else:
        print(f"今天是公眾假期:{holiday['name']}")
elif today_weekday == 6:
    # Sunday
    print("今天是星期日,但不是公眾假期。")
elif today_weekday == 5:
    # Saturday
    print("今天是星期六,但不是公眾假期。")
else:
    print("今天不是公眾假期。")


# ## Summary
#
# In this lesson, we learned about using BeautifulSoup to extract data from the web.