#!/usr/bin/env python
# coding: utf-8

# # How many fact sheets survived the NAA website migration in 2019?

# In[1]:


import os

import pandas as pd
import requests
from bs4 import BeautifulSoup


# ## Get the most recent version of the fact sheet index from the Internet Archive
#
# First we'll load the page.

# In[2]:


# Note the 'id_' in the url to get the original page without the IA navigation.
response = requests.get(
    "https://web.archive.org/web/20190716210347id_/http://www.naa.gov.au/collection/fact-sheets/by-number/index.aspx"
)


# In[3]:


# Specify a parser explicitly so the results don't depend on what happens to be installed.
soup = BeautifulSoup(response.content, "html.parser")


# Then we'll extract the rows from the index table, skipping the header row.

# In[4]:


fs_list = soup.find("table", title="Numerical list of fact sheets").find_all("tr")[1:]


# ## Look for the fact sheets
#
# Let's loop through all the rows in the fact sheet index, extracting the fact sheet number, title, and url. Then we'll try loading each url, saving the details and the HTTP status code of the response for further exploration.

# In[5]:


fact_sheets = []
for row in fs_list:
    num = row.td.text
    fs = row.find("a")
    title = fs.text
    url = f'http://naa.gov.au{fs["href"]}'
    response = requests.get(url)
    status = response.status_code
    print(f"{title}: {status}")
    fact_sheets.append({"number": num, "title": title, "url": url, "status": status})


# ## Examine the results

# In[7]:


df = pd.DataFrame(fact_sheets)


# Let's break down the results by HTTP status code.

# In[8]:


df["status"].value_counts()


# In[11]:


# The counts are taken from the value_counts() output above: of the 266 fact
# sheets, only 56 returned a 200 response.
print(f"{(207 + 3) / (207 + 56 + 3):.2%} of fact sheets are kaput!")


# ## Which fact sheets have survived?

# In[10]:


df.loc[df["status"] == 200]


# ## Save the results as a CSV

# In[12]:


# Make sure the output directory exists before saving.
os.makedirs("data", exist_ok=True)
df.to_csv("data/fact_sheets.csv", index=False)
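

# As a cross-check (an addition, not part of the original run), we can compute the failure rate directly from the dataframe instead of hard-coding the counts, treating anything that didn't return a 200 as kaput.

# In[13]:


# The mean of a boolean series is the proportion of True values.
print(f"{(df['status'] != 200).mean():.2%} of fact sheets are kaput!")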
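

# The dead urls might still be readable in the Wayback Machine. Below is a minimal sketch (not part of the original notebook) that uses the Internet Archive's availability API to look for archived snapshots of the first few kaput fact sheets.

# In[14]:


import time

for url in df.loc[df["status"] != 200, "url"].head(5):
    api_response = requests.get(
        "http://archive.org/wayback/available", params={"url": url}, timeout=30
    )
    # 'archived_snapshots' is empty if the IA has no capture of this url.
    snapshot = api_response.json().get("archived_snapshots", {}).get("closest")
    if snapshot:
        print(f"{url} -> {snapshot['url']}")
    else:
        print(f"{url} -> no snapshot found")
    time.sleep(1)  # be polite to the API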