#!/usr/bin/env python
# coding: utf-8

# # Lesson 7—Fetching data online

# Version 1.0. Prepared by [Makzan](https://makzan.net). Updated at 2021 March.

# In this series, we will use 3 lectures to learn fetching data online. This includes:
# 
# - Finding patterns in URL
# - Open web URL
# - Downloading files in Python
# - Fetch data with API
# - **Web scraping with Requests and BeautifulSoup**
# - Web automation with Selenium
# - Converting Wikipedia tabular data into CSV

# In this lesson, we will learn to download a web page and parse its HTML to extract the data we need. We will use `requests` and `BeautifulSoup`. `requests` downloads the web page's HTML and `BeautifulSoup` parses that HTML into a tree structure from which we can access and extract data.

# ## Web Scraping
# 
# 1. Querying web page
# 1. Parse the DOM tree
# 1. Get the data we want from the HTML code

# In[1]:


from bs4 import BeautifulSoup
import requests


# Download the news home page and parse its HTML into a tree.
res = requests.get("https://news.gov.mo/home/zh-hant")
soup = BeautifulSoup(res.text, "html.parser")

# Print the whitespace-trimmed text of every <h5> element on the page
# (presumably the news headlines).
headings = soup.select("h5")
for title in (h5.text.strip() for h5 in headings):
    print(title)


# ## Extra: Fetching with try-except

# In[36]:


from bs4 import BeautifulSoup
import requests

# Fetch the news home page, stopping with a readable message if the request
# cannot be completed.
try:
    # Without a timeout, requests may wait forever on an unresponsive server.
    res = requests.get("https://news.gov.mo/home/zh-hant", timeout=10)
except requests.exceptions.RequestException:
    # RequestException is the base class of every error requests raises
    # (connection failures, timeouts, malformed URLs, ...), so the message
    # below is now accurate for all the cases it names.
    print("Error: Invalid URL or Connection Lost.")
    # exit() is a helper injected by the `site` module for interactive use and
    # reports success (status 0); raise SystemExit with a non-zero status.
    raise SystemExit(1)

soup = BeautifulSoup(res.text, "html.parser")

# Print the trimmed text of every <h5> element on the page.
for h5 in soup.select("h5"):
    print(h5.text.strip())


# In[2]:


from bs4 import BeautifulSoup
import requests


from urllib.parse import urljoin

# Fetch the news index, then follow the first five headlines and print the
# first paragraph of each linked article.
res = requests.get("https://news.gov.mo/home/zh-hant")
soup = BeautifulSoup(res.text, "html.parser")

for h5 in soup.select("h5")[:5]:
    print(h5.getText().strip())

    # Fetch the content
    anchor = h5.select_one("a[href]")
    if anchor is None:
        # A heading without a link has no article to follow.
        print("---")
        continue

    # urljoin copes with root-relative ("/x"), relative ("x") and absolute
    # hrefs alike; plain concatenation yields "//x" or a broken URL.
    article_url = urljoin("https://news.gov.mo/", anchor["href"])
    # Use a separate name so the index response in `res` is not overwritten.
    article_res = requests.get(article_url)
    soup2 = BeautifulSoup(article_res.text, "html.parser")
    content = soup2.select_one(".asideBody p:first-of-type")
    # select_one returns None when the article page lacks the expected node.
    if content is not None:
        print(content.text)
    print("---")

print("Done.")


# ## Fetching Macao Daily news

# In[3]:


from bs4 import BeautifulSoup
import requests
import datetime

# Split today's date into the parts used by the URL; month and day are
# zero-padded to two digits.
today = datetime.date.today()
year = today.year
month = f"{today.month:02d}"
day = f"{today.day:02d}"

front_page_url = f"http://www.macaodaily.com/html/{year}-{month}/{day}/node_1.htm"
res = requests.get(front_page_url)

# Decode the response body as UTF-8 when reading res.text.
res.encoding = "utf-8"

# "html.parser" is the parser bundled with Python, so no extra package is needed.
soup = BeautifulSoup(res.text, "html.parser")

# Print the text of at most the first 40 links inside #all_article_list.
links = soup.select("#all_article_list a")
for title in (link.text for link in links[:40]):
    print(title)

print("Finished.")


# ## ✏️ Exercise time: Lab 3

# 1. Please try to execute the code to see the program result.
# 1. Please try to change the keyword inside the code to fetch different queries.
# 1. Please try to make the code more flexible by changing the date and query into input.
# 1. Please try to save the result into a text file.
# 1. Please try to change the code to allow multiple searches until user enters "q".

# In[46]:


from bs4 import BeautifulSoup
import requests

import calendar

# Task 1: Change year and month into input
year = "2020"
month = "06"

# Visit only the days that exist in the chosen month (June has 30, not 31),
# instead of always requesting days 1-31.
days_in_month = calendar.monthrange(int(year), int(month))[1]

for i in range(1, days_in_month + 1):
    day = str(i).zfill(2)
    res = requests.get(f"http://www.macaodaily.com/html/{year}-{month}/{day}/node_1.htm")

    # Skip error responses (status >= 400): there is no article list to parse.
    if not res.ok:
        continue

    res.encoding = "utf-8"

    soup = BeautifulSoup(res.text, "html.parser")

    links = soup.select("#all_article_list a")
    for link in links:
        news_title = link.getText()

        # Task 2: Change keyword into input
        if "大灣區" in news_title:
            # Task 3: Save the result in TXT instead of printing out.
            print(f"{year}-{month}-{day}: {news_title}")

print("Finished.")


# Solution to Lab 3
# 
# https://makclass.com/vimeo_players/335074765

# ## When is the next holiday?

# In[10]:


import datetime

# Build the URL of the public-holiday page for the current year and parse it.
this_year = datetime.date.today().year
url = f"https://www.gov.mo/zh-hant/public-holidays/year-{this_year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Print the text of the first #public-holidays element with newlines removed.
banner = soup.select("#public-holidays")[0]
print(banner.text.replace('\n', ''))


# In[11]:


# Read the individual fields from the `soup` parsed in the previous cell:
# the first .month, .day and .weekday elements under #public-holidays.
month, day, weekday = (
    soup.select(f"#public-holidays .{css_class}")[0].text
    for css_class in ("month", "day", "weekday")
)
description = soup.select("#next-holiday-description strong")[0].text

print(f"接下來的公眾假期：{description}, {month}{day}日{weekday}")


# ## A list of holidays in Macao

# In[4]:


import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.gov.mo/zh-hant/public-holidays/year-2020/")
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")

# Walk the rows of the first table; rows with no <td> cells are skipped.
for row in tables[0].select("tr"):
    cells = row.select("td")
    if not cells:
        continue
    # The second cell is printed as the date and the fourth as the name.
    date, name = cells[1].text, cells[3].text
    print(f"{date}: {name}")
  

# Only listing obligatory holidays

# In[13]:


import requests
from bs4 import BeautifulSoup

response = requests.get("https://www.gov.mo/zh-hant/public-holidays/year-2020/")
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")

# Walk the rows of the first table and print only those whose first cell
# is exactly "*", which this lesson treats as the obligatory-holiday marker.
for row in tables[0].select("tr"):
    cells = row.select("td")
    if not cells:
        continue
    is_obligatory = cells[0].text == "*"
    if not is_obligatory:
        continue
    date = cells[1].text
    name = cells[3].text
    print(f"{date}: {name}")
  

# ## Is today government holiday?

# In[14]:


import requests
from bs4 import BeautifulSoup
import datetime

# Get today's year, month and day
today = datetime.date.today()
year = today.year
month = today.month
day = today.day
# date.weekday() counts from Monday: Monday is 0, Saturday is 5, Sunday is 6.
today_weekday = today.weekday()
today_date = f"{month}月{day}日"


# Fetch gov.mo
url = f"https://www.gov.mo/zh-hant/public-holidays/year-{year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")

# Maps the date text of each table row to the holiday name.
holidays = {}

for table in tables:
    for row in table.select("tr"):
        cells = row.select("td")
        if not cells:
            # Rows without <td> cells carry no holiday data.
            continue
        date = cells[1].text
        name = cells[3].text
        holidays[date] = name


# Query holidays
# NOTE(review): the lookup assumes the table writes dates as "M月D日"
# without padding or surrounding whitespace; confirm against the page.
print(today_date)
if today_date in holidays:
    holiday = holidays[today_date]
    print(f"今天是公眾假期：{holiday}")
elif today_weekday == 6:
    print("今天是星期日，但不是公眾假期。")
elif today_weekday == 5:
    print("今天是星期六，但不是公眾假期。")
else:
    print("今天不是公眾假期。")


# Our code is getting longer now. We can group the parts of the code that fetch gov.mo into a function. We name it `is_macao_holiday` and take a date parameter.

# In[15]:


def _fetch_macao_holidays(year):
    """Return a dict mapping date text to holiday name for *year* from gov.mo."""
    url = f"https://www.gov.mo/zh-hant/public-holidays/year-{year}/"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")

    holidays = {}
    for table in soup.select(".table"):
        for row in table.select("tr"):
            cells = row.select("td")
            if not cells:
                # Rows without <td> cells carry no holiday data.
                continue
            # The second cell is used as the date key, the fourth as the name.
            holidays[cells[1].text] = cells[3].text
    return holidays


def is_macao_holiday(query_date):
    """Print whether *query_date* is a public holiday in Macao.

    The holiday list for ``query_date.year`` is downloaded from gov.mo on
    every call. The result is printed, not returned.

    Args:
        query_date: An object with ``year``, ``month`` and ``day`` attributes
            and a ``weekday()`` method, such as ``datetime.date`` or
            ``datetime.datetime``.

    Returns:
        None.
    """
    holidays = _fetch_macao_holidays(query_date.year)

    # Query holidays
    # NOTE(review): the lookup assumes the table writes dates as "M月D日"
    # without padding or surrounding whitespace; confirm against the page.
    date_key = f"{query_date.month}月{query_date.day}日"
    # weekday() counts from Monday: Monday is 0, Saturday is 5, Sunday is 6.
    weekday_index = query_date.weekday()

    if date_key in holidays:
        holiday = holidays[date_key]
        print(f"{date_key}是公眾假期：{holiday}")
    elif weekday_index == 6:
        print(f"{date_key}是星期日，但不是公眾假期。")
    elif weekday_index == 5:
        print(f"{date_key}是星期六，但不是公眾假期。")
    else:
        print(f"{date_key}不是公眾假期。")


# In[11]:


# Run the holiday check for the current date.
query_date = datetime.date.today()
is_macao_holiday(query_date)


# ### Picking a date other than today

# We can use the parser in `dateutil` to parse a date given as a string into a `datetime` object.

# In[12]:


# Import the submodule explicitly: a bare `import dateutil` does not load
# `dateutil.parser` on python-dateutil releases before 2.9, so the attribute
# access below would raise AttributeError there.
import dateutil.parser

# Parse the date string and check that date.
date = dateutil.parser.parse("2020-01-01")
is_macao_holiday(date)


# In[13]:


# Import the submodule explicitly: a bare `import dateutil` does not load
# `dateutil.parser` on python-dateutil releases before 2.9, so the attribute
# access below would raise AttributeError there.
import dateutil.parser

# Parse the date string and check that date.
date = dateutil.parser.parse("2020-10-26")
is_macao_holiday(date)


# Furthermore, we can store the result in a dictionary for further querying.

# In[16]:


import requests
from bs4 import BeautifulSoup
import datetime

# Capture today's date parts, its weekday index, and the "M月D日" lookup key.
today = datetime.date.today()
year, month, day = today.year, today.month, today.day
today_weekday = today.weekday()
today_date = f"{month}月{day}日"


# Download and parse this year's public-holiday page from gov.mo.
url = f"https://www.gov.mo/zh-hant/public-holidays/year-{year}/"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

tables = soup.select(".table")

# One record per table row, keyed by the row's date text.
holidays = {}

for table in tables:
    for row in table.select("tr"):
        cells = row.select("td")
        if not cells:
            # Rows without <td> cells carry no holiday data.
            continue
        # A first cell of exactly "*" is recorded as an obligatory holiday.
        is_obligatory = cells[0].text == "*"
        date, weekday, name = (cells[i].text for i in (1, 2, 3))
        holidays[date] = dict(
            date=date,
            weekday=weekday,
            name=name,
            is_obligatory=is_obligatory,
        )


# The result is stored in dictionary `holidays`.

# In[17]:


# Number of entries collected in the `holidays` dictionary. As a bare
# expression it is only displayed when run as a notebook cell.
len(holidays)


# In[18]:


# Bare expression: a notebook cell displays the whole dictionary; when run as
# a plain script this line has no visible effect.
holidays


# In[19]:


# Query holidays
# Uses `today_date`, `today_weekday` and the `holidays` dictionary built in
# the earlier cell. date.weekday() counts from Monday: Saturday is 5 and
# Sunday is 6.
print(today_date)
if today_date in holidays:
    holiday = holidays[today_date]
    if holiday['is_obligatory']:
        print(f"今天是強制公眾假期：{holiday['name']}")
    else:
        print(f"今天是公眾假期：{holiday['name']}")
elif today_weekday == 6:
    print("今天是星期日，但不是公眾假期。")
elif today_weekday == 5:
    print("今天是星期六，但不是公眾假期。")
else:
    print("今天不是公眾假期。")


# ## Summary
# 
# In this lesson, we learned about using BeautifulSoup to extract data from the web.