#!/usr/bin/env python
# coding: utf-8

# # Geospatial Data Science Applications: GEOG 4/590
#
# Feb 14, 2022
#
# Lecture 7: Data access
#
# Johnny Ryan: jryan4@uoregon.edu

# ## Content of this lecture
#
# * Web 2.0
#
# * Standard data access using APIs
#
# * What to do when an API is unavailable or insufficient?
#
# * Background for this week's lab

# In[1]:


from IPython.display import HTML
HTML('')


# ## APIs
#
# * Application programming interface
#
# * A type of **software** that provides a **standard set of protocols/functions** so that our computer can **communicate** with other computers
#
# * In contrast, a **user interface** is a type of software that connects a **computer** to a **person**

# ## APIs
#
# * Many organizations have great APIs because they want people to use their data
#
# * We have used one...
#     * `cenpy`

# In[2]:


# Install package
get_ipython().system('pip install -U dataretrieval')


# In[3]:


# Import the functions for downloading data from NWIS
import dataretrieval.nwis as nwis

# Specify the USGS site code
site = '03339000'

# Get daily values (dv)
df = nwis.get_record(sites=site, service='dv', start='2020-10-01', end='2021-09-30')
df


# In[4]:


# Simple plot
df['00060_Mean'].plot()


# ## Tips for APIs
#
# * Take a minute to make sure the package is widely used (e.g. lots of forks and stars) and up-to-date (e.g. a recent last commit)
#
# * Read the `docs`, `demos`, and `examples` and hope we find what we're looking for (sometimes they are not that comprehensive)
#
# * If you can't find what you're looking for, inspect the source code (`.py` files)

# ## APIs are sometimes not available or have limitations
#
# * Large tech companies hoard data to secure market dominance
#
# * Without access to their data, it is difficult to tell whether they are in compliance
#
# * By guarding data, they are also preventing it from being used for good (and maybe bad) causes
#
# A Facebook loophole allowed third-party apps to access not only a user's profile data but the profile data of all of their friends. Kogan copied someone else's idea and made "thisisyourdigitallife". Only around 270K people used it, but it collected data from all friends of those users (50+ million Facebook profiles).

# ## Web scraping
#
# * Also known as crawling or harvesting, this is the practice of **automatically** gathering data from the internet without the use of an **API**
#
# * Most commonly accomplished by writing a program that **queries** a web server, **requests** data (usually in the form of HTML), and **parses** that data to extract information (a minimal standard-library sketch follows the list below)

# ## Suppose a friend wanted to do this?
#
# * Some HTML basics
#
# * `requests`: standard Python library for requesting data from the web
#
# * `BeautifulSoup`: a library for pulling data out of HTML and XML files
#
# * `selenium`: a library for performing **web browser automation**
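
# Before introducing those packages, here is a minimal sketch of the query, request, and parse workflow described above using only the Python standard library (`urllib` and `html.parser`). The URL and the choice of pulling out `<h2>` headings are illustrative assumptions; the cells below do the same job more conveniently with `requests` and `bs4`.

# In[ ]:


from html.parser import HTMLParser
from urllib.request import urlopen


class HeadingParser(HTMLParser):
    """Collect the text found inside <h2> tags."""

    def __init__(self):
        super().__init__()
        self.in_heading = False
        self.headings = []

    def handle_starttag(self, tag, attrs):
        if tag == 'h2':
            self.in_heading = True

    def handle_endtag(self, tag):
        if tag == 'h2':
            self.in_heading = False

    def handle_data(self, data):
        if self.in_heading and data.strip():
            self.headings.append(data.strip())


# Query the web server and request the page (raw HTML)
with urlopen('https://en.wikipedia.org/wiki/Climate_of_Oregon') as response:
    raw_html = response.read().decode('utf-8')

# Parse the HTML to extract the information we want
parser = HeadingParser()
parser.feed(raw_html)
print(parser.headings)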

# ### `requests`

# In[2]:


# Import packages
import requests


# In[7]:


# Open a webpage
html = requests.get('https://en.wikipedia.org/wiki/Climate_of_Oregon')

# HTML
html


# ## BeautifulSoup4
#
# * Now we could write a program to **parse** this HTML code (i.e. split it into useful blocks)...
#
# * ...or we could use another package called `BeautifulSoup` (also known as `bs4`), a Python library for parsing data out of HTML and XML files

# ### Import packages

# In[9]:


# Import package
from bs4 import BeautifulSoup, SoupStrainer


# In[20]:


# Read HTML content as "soup object" and define default parser
soup = BeautifulSoup(html.text, 'html.parser')


# ### Parse HTML using `.find` and `.find_all`
#
# The `.find` and `.find_all` methods are the ones we will use most often. They can be used to filter HTML code to find a list of tags, or tags with specific attributes.

# In[24]:


# Define heading tags
heading_tags = ["h1", "h2"]

# Find heading tags in HTML code
headings = soup.find_all(heading_tags)

# Loop over every heading and print text
for tags in headings:
    print(tags.name + ' -> ' + tags.text.strip())


# In[31]:


# Find every hyperlink
links = soup.find_all('a')

# Loop over every link and print hyperlink
for link in links:
    print(link.get('href'))


# In[98]:


# Find number of images on page
len(soup.find_all('img'))


# In[104]:


# Print details of first image
print(soup.find_all('img')[0])


# In[103]:


# Find attributes of first image
print(soup.find_all('img')[0].attrs['src'])


# In[97]:


# Download image
url = 'https://' + soup.find_all('img')[0].attrs['src'][2:]
response = requests.get(url)
if response.status_code == 200:
    with open("images/test_image.jpg", 'wb') as f:
        f.write(response.content)


# In[99]:


# Import packages
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Read image
img = mpimg.imread('images/test_image.jpg')

# Plot image
plt.imshow(img)


# ## Selenium
#
# * Sometimes we want even more control...
#
# * Selenium is a package for performing **web browser automation**
#
# * We can use Selenium to enter text in search boxes, click buttons, etc. (a sketch combining Selenium with `BeautifulSoup` follows the cells below)

# In[1]:


# Install webdriver_manager: https://github.com/SergeyPirogov/webdriver_manager
get_ipython().system('pip3 install webdriver_manager')


# In[2]:


# Import packages
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager


# In[3]:


# Install Chrome webdriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Open a web browser at the following page
driver.get("https://www.google.com/maps")


# In[4]:


# Enter some text in the search box
inputElement = driver.find_element(By.ID, "searchboxinput")
inputElement.send_keys('South Sister Oregon')


# In[5]:


# Click search button
element = driver.find_element(By.ID, "searchbox-searchbutton")
element.click()
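

# Selenium and BeautifulSoup also work well together: once the browser has loaded a page (and any JavaScript has run), `driver.page_source` holds the rendered HTML, which can be parsed with `bs4` exactly as before. The cell below is a minimal sketch assuming the same Chrome/webdriver_manager setup as above and network access; the headless option, the Wikipedia URL, and the `<h1>` tag are illustrative assumptions, not part of the lecture demo.

# In[ ]:


from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Run Chrome without opening a visible browser window
options = webdriver.ChromeOptions()
options.add_argument('--headless')

# Start the driver and load a page
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.get('https://en.wikipedia.org/wiki/Climate_of_Oregon')

# Hand the rendered HTML to BeautifulSoup and parse it as usual
soup = BeautifulSoup(driver.page_source, 'html.parser')
print(soup.find('h1').text)

# Always close the browser when finished
driver.quit()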