from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
We'll illustrate the use of maps with the New York Times Covid-19 database. Recall that this is a county-level database of cumulative confirmed cases and deaths, updated daily, compiled from state and local governments and health departments across the United States.
The Times has created many visualizations that effectively communicate important information about the pandemic. Here we will construct some simple circle maps.
The data are publicly available via GitHub: https://github.com/nytimes/covid-19-data.
covid_table = Table.read_table("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
covid_table
date | county | state | fips | cases | deaths |
---|---|---|---|---|---|
2020-01-21 | Snohomish | Washington | 53061 | 1 | 0 |
2020-01-22 | Snohomish | Washington | 53061 | 1 | 0 |
2020-01-23 | Snohomish | Washington | 53061 | 1 | 0 |
2020-01-24 | Cook | Illinois | 17031 | 1 | 0 |
2020-01-24 | Snohomish | Washington | 53061 | 1 | 0 |
2020-01-25 | Orange | California | 6059 | 1 | 0 |
2020-01-25 | Cook | Illinois | 17031 | 1 | 0 |
2020-01-25 | Snohomish | Washington | 53061 | 1 | 0 |
2020-01-26 | Maricopa | Arizona | 4013 | 1 | 0 |
2020-01-26 | Los Angeles | California | 6037 | 1 | 0 |
... (1076214 rows omitted)
Now we load in a table of geographical data for counties.
county_geo = Table.read_table("https://raw.githubusercontent.com/jdlafferty/covid-19/master/data/geo-counties.csv")
county_geo
county | state | fips | lat | lon |
---|---|---|---|---|
New York City | New York | nan | 40.7146 | -74.0071 |
Westchester | New York | 36119 | 41.1191 | -73.7887 |
Nassau | New York | 36059 | 42.5164 | -73.6113 |
Suffolk | New York | 36103 | 40.9601 | -72.8343 |
Cook | Illinois | 17031 | 41.8139 | -87.6155 |
King | Washington | 53033 | 47.4325 | -121.959 |
Unknown | New Jersey | nan | 0 | 0 |
Wayne | Michigan | 26163 | 42.2852 | -83.3836 |
Los Angeles | California | 6037 | 34.0536 | -118.246 |
Bergen | New Jersey | 34003 | 40.9476 | -74.0276 |
... (1660 rows omitted)
First we will calculate the average daily new cases over the past week for a subset of states.
first_date = '2021-02-20'
# Some subsets of states to visualize:
continental_states = ['Pennsylvania', 'Arizona', 'Connecticut', 'Florida', 'Wisconsin', 'South Dakota', 'Colorado',
'New Jersey', 'California', 'Ohio', 'West Virginia', 'Oregon', 'Alabama', 'Maine', 'Vermont',
'Montana', 'Nevada', 'Washington', 'Massachusetts', 'Maryland', 'Iowa', 'North Dakota', 'Kentucky',
'Delaware', 'Mississippi', 'Tennessee', 'Virginia', 'Kansas', 'Missouri', 'Utah', 'North Carolina', 'Louisiana',
'South Carolina', 'Minnesota', 'Arkansas', 'Indiana', 'New York', 'Wyoming', 'New Mexico', 'Rhode Island',
'Michigan', 'Nebraska', 'New Hampshire', 'Georgia', 'Texas', 'Illinois', 'Oklahoma', 'Idaho']
new_england_states = ['Connecticut', 'Massachusetts', 'Vermont', 'New Hampshire', 'Rhode Island', 'Maine']
west_coast_states = ['California', 'Oregon', 'Washington']
southeast_states = ['Florida', 'Louisiana', 'Alabama', 'Mississippi', 'Georgia', 'South Carolina']
states = new_england_states
recent_data = covid_table.where('date', are.above(first_date))
recent_state_data = recent_data.where('state', are.contained_in(states))
recent_state_data
date | county | state | fips | cases | deaths |
---|---|---|---|---|---|
2021-02-21 | Fairfield | Connecticut | 9001 | 78181 | 2027 |
2021-02-21 | Hartford | Connecticut | 9003 | 68527 | 2251 |
2021-02-21 | Litchfield | Connecticut | 9005 | 10835 | 270 |
2021-02-21 | Middlesex | Connecticut | 9007 | 10113 | 337 |
2021-02-21 | New Haven | Connecticut | 9009 | 69233 | 1891 |
2021-02-21 | New London | Connecticut | 9011 | 18635 | 403 |
2021-02-21 | Tolland | Connecticut | 9013 | 7612 | 159 |
2021-02-21 | Unknown | Connecticut | nan | 937 | 9 |
2021-02-21 | Windham | Connecticut | 9015 | 9028 | 176 |
2021-02-21 | Androscoggin | Maine | 23001 | 4695 | 54 |
... (570 rows omitted)
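A small detail: the dates in this table are strings, and are.above compares them as strings. That works here because ISO-8601 dates (YYYY-MM-DD) sort chronologically:
# String order agrees with chronological order for YYYY-MM-DD dates
'2021-02-21' > '2021-02-20'
True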
A FIPS code is a unique number assigned to each county; we will use it to merge the Covid-19 data with the geo data.
In the following code, we group the data by FIPS county code and collect the cumulative case counts for each day of the past week in that county. Taking differences between consecutive days gives the daily new cases, which we then average.
# remove extra columns
data = recent_state_data.drop('date', 'county', 'state', 'deaths')
# exclude rows where the fips code is not known
data = data.where('fips', are.above(0))
# now, group by fips and form a list of the cumulative cases
data = data.group('fips', list)
# apply the difference function np.diff to get the daily new cases
data = data.with_column('new cases', data.apply(np.diff, 'cases list'))
data = data.drop('cases list')
# Now average to get the average daily new cases in each county over the past week.
# We add a small amount (.001) to avoid zeros, which the map graphics handle badly.
new_cases = Table().with_columns('fips', data['fips'],
                                 'new cases', data.apply(np.mean, 'new cases') + .001)
new_cases
fips | new cases |
---|---|
9001 | 294.572 |
9003 | 207.858 |
9005 | 38.2867 |
9007 | 39.001 |
9009 | 280.144 |
9011 | 69.7153 |
9013 | 25.001 |
9015 | 18.1439 |
23001 | 16.2867 |
23003 | 2.28671 |
... (57 rows omitted)
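To see what np.diff is doing above, here is a minimal sketch on a made-up cumulative count (the numbers are hypothetical): each entry of the result is the difference between consecutive days.
# Four days of cumulative cases -> three days of new cases
np.diff(make_array(100, 103, 110, 110))
array([3, 7, 0])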
state_geo = county_geo.where('state', are.contained_in(states)).sort('fips')
state_geo
county | state | fips | lat | lon |
---|---|---|---|---|
Fairfield | Connecticut | 9001 | 41.1663 | -73.2288 |
Hartford | Connecticut | 9003 | 41.7638 | -72.6739 |
Litchfield | Connecticut | 9005 | 41.7463 | -73.189 |
Middlesex | Connecticut | 9007 | 41.4094 | -72.529 |
New Haven | Connecticut | 9009 | 41.308 | -72.9243 |
New London | Connecticut | 9011 | 41.3565 | -72.0963 |
Tolland | Connecticut | 9013 | 41.8745 | -72.374 |
Windham | Connecticut | 9015 | 41.6999 | -72.1551 |
Androscoggin | Maine | 23001 | 44.1971 | -70.2027 |
Cumberland | Maine | 23005 | 43.8005 | -70.2545 |
... (54 rows omitted)
Now we join the table of average new cases with the geo table, so that we have a latitude and longitude for each county. Note that join keeps only counties whose fips code appears in both tables.
new_cases_geo = state_geo.join('fips', new_cases)
new_cases_geo = new_cases_geo.drop('fips')
new_cases_geo
county | state | lat | lon | new cases |
---|---|---|---|---|
Fairfield | Connecticut | 41.1663 | -73.2288 | 294.572 |
Hartford | Connecticut | 41.7638 | -72.6739 | 207.858 |
Litchfield | Connecticut | 41.7463 | -73.189 | 38.2867 |
Middlesex | Connecticut | 41.4094 | -72.529 | 39.001 |
New Haven | Connecticut | 41.308 | -72.9243 | 280.144 |
New London | Connecticut | 41.3565 | -72.0963 | 69.7153 |
Tolland | Connecticut | 41.8745 | -72.374 | 25.001 |
Windham | Connecticut | 41.6999 | -72.1551 | 18.1439 |
Androscoggin | Maine | 44.1971 | -70.2027 | 16.2867 |
Cumberland | Maine | 43.8005 | -70.2545 | 41.8581 |
... (49 rows omitted)
Finally, we can create a map that shows a circle at each county location, with area proportional to the average number of new cases over the past week.
dat = Table().with_columns('lat', new_cases_geo['lat'],
'long', new_cases_geo['lon'],
'labels', new_cases_geo['county'],
'areas', 10*new_cases_geo['new cases'],
'colors', 'red')
dat.show(10)
Circle.map_table(dat, weight=1)
lat | long | labels | areas | colors |
---|---|---|---|---|
41.1663 | -73.2288 | Fairfield | 2945.72 | red |
41.7638 | -72.6739 | Hartford | 2078.58 | red |
41.7463 | -73.189 | Litchfield | 382.867 | red |
41.4094 | -72.529 | Middlesex | 390.01 | red |
41.308 | -72.9243 | New Haven | 2801.44 | red |
41.3565 | -72.0963 | New London | 697.153 | red |
41.8745 | -72.374 | Tolland | 250.01 | red |
41.6999 | -72.1551 | Windham | 181.439 | red |
44.1971 | -70.2027 | Androscoggin | 162.867 | red |
43.8005 | -70.2545 | Cumberland | 418.581 | red |
... (49 rows omitted)
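The steps above repackage naturally into a single function, so that other state lists are easy to try. This is just a sketch: the name plot_new_cases is ours, and it assumes covid_table, county_geo, and first_date are defined as earlier.
def plot_new_cases(states):
    # Filter to recent rows for the chosen states
    recent = covid_table.where('date', are.above(first_date))
    recent = recent.where('state', are.contained_in(states))
    # Average daily new cases per county, as above
    data = recent.select('fips', 'cases').where('fips', are.above(0)).group('fips', list)
    data = data.with_column('new cases', data.apply(np.diff, 'cases list'))
    new_cases = Table().with_columns('fips', data['fips'],
                                     'new cases', data.apply(np.mean, 'new cases') + .001)
    # Join with the geo data and draw the circle map
    joined = county_geo.where('state', are.contained_in(states)).sort('fips').join('fips', new_cases)
    dat = Table().with_columns('lat', joined['lat'],
                               'long', joined['lon'],
                               'labels', joined['county'],
                               'areas', 10 * joined['new cases'],
                               'colors', 'red')
    return Circle.map_table(dat, weight=1)
# For example, map the southeast instead of New England:
plot_new_cases(southeast_states)
Joins like the one above are worth a closer look. We'll use a small example: a table of drinks sold at some cafes, and a table of discount coupons.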
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
['Milk Tea', 'Book Trader Cafe', 4],
['Espresso', "Willoughby's", 2],
['Coffee', "Willoughby's", 3],
['Espresso', "Blue State Coffee", 2]
])
drinks
Drink | Cafe | Price |
---|---|---|
Milk Tea | Book Trader Cafe | 4 |
Espresso | Willoughby's | 2 |
Coffee | Willoughby's | 3 |
Espresso | Blue State Coffee | 2 |
discounts = Table().with_columns(
'Coupon % off', make_array(5, 50, 25, 0),
'Location', make_array("Willoughby's", "Blue State Coffee", "Willoughby's", "Book Trader Cafe")
)
discounts
Coupon % off | Location |
---|---|
5 | Willoughby's |
50 | Blue State Coffee |
25 | Willoughby's |
0 | Book Trader Cafe |
# Discussion question: Generate a table with one row per cafe that
# has the name and discounted price of its cheapest discounted drink
# Link (join) drinks with discounts
combined = drinks.join('Cafe', discounts, 'Location')
combined.show()
# Compute discounted prices
discounted_prices = combined.column('Price') * (1 - combined.column('Coupon % off')/100)
discounted_drinks = combined.with_column('Discounted price', discounted_prices)
discounted_drinks
Cafe | Drink | Price | Coupon % off |
---|---|---|---|
Blue State Coffee | Espresso | 2 | 50 |
Book Trader Cafe | Milk Tea | 4 | 0 |
Willoughby's | Espresso | 2 | 5 |
Willoughby's | Espresso | 2 | 25 |
Willoughby's | Coffee | 3 | 5 |
Willoughby's | Coffee | 3 | 25 |
Cafe | Drink | Price | Coupon % off | Discounted price |
---|---|---|---|---|
Blue State Coffee | Espresso | 2 | 50 | 1 |
Book Trader Cafe | Milk Tea | 4 | 0 | 4 |
Willoughby's | Espresso | 2 | 5 | 1.9 |
Willoughby's | Espresso | 2 | 25 | 1.5 |
Willoughby's | Coffee | 3 | 5 | 2.85 |
Willoughby's | Coffee | 3 | 25 | 2.25 |
# Correct: sorting by discounted price first means distinct keeps each cafe's
# cheapest row (at Willoughby's, Espresso at 1.5 rather than Coffee at 2.25)
discounted_drinks.sort('Discounted price').sort('Cafe', distinct=True)
Cafe | Drink | Price | Coupon % off | Discounted price |
---|---|---|---|---|
Blue State Coffee | Espresso | 2 | 50 | 1 |
Book Trader Cafe | Milk Tea | 4 | 0 | 4 |
Willoughby's | Espresso | 2 | 25 | 1.5 |
# Incorrect: without sorting by 'Discounted price' first, distinct just keeps
# the first row for each cafe in the current order (Espresso at 1.9, not 1.5)
discounted_drinks.sort('Cafe', distinct=True)
Cafe | Drink | Price | Coupon % off | Discounted price |
---|---|---|---|---|
Blue State Coffee | Espresso | 2 | 50 | 1 |
Book Trader Cafe | Milk Tea | 4 | 0 | 4 |
Willoughby's | Espresso | 2 | 5 | 1.9 |
# Incorrect: min is applied to each column separately, so 'Coffee' appears
# because it is first alphabetically, and the row mixes values from different drinks
discounted_drinks.group('Cafe', min)
Cafe | Drink min | Price min | Coupon % off min | Discounted price min |
---|---|---|---|---|
Blue State Coffee | Espresso | 2 | 50 | 1 |
Book Trader Cafe | Milk Tea | 4 | 0 | 4 |
Willoughby's | Coffee | 2 | 5 | 1.5 |
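If only the price were needed, grouping a two-column projection would work, though the drink name is lost; that is why the sort/distinct approach is used above. A quick sketch:
# min of just the price per cafe is fine, but it cannot tell us which drink it was
discounted_drinks.select('Cafe', 'Discounted price').group('Cafe', min)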
Challenge yourself and try to solve these on your own before looking at the solutions!
trip0 = Table.read_table("trip.csv")
trip = Table().with_columns(
"Start", trip0.column("Start Station"),
"End", trip0.column("End Station"),
"Duration", trip0.column("Duration"))
trip.show(3)
Start | End | Duration |
---|---|---|
Harry Bridges Plaza (Ferry Building) | San Francisco Caltrain (Townsend at 4th) | 765 |
San Antonio Shopping Center | Mountain View City Hall | 1036 |
Post at Kearny | 2nd at South Park | 307 |
... (354149 rows omitted)
# Three questions:
# 1. The name of the station where the most rentals ended (assume no ties).
# 2. The number of stations for which the average duration of rentals ending
#    at that station was more than 10 minutes.
# 3. The number of stations that have more than 500 starts AND more than 500 ends.
# The name of the station where the most rentals ended (assume no ties).
# First, find end counts
# Then, find the station with the highest end count
trip.group('End').sort('count', descending=True).column(0).item(0)
'San Francisco Caltrain (Townsend at 4th)'
# The number of stations for which the average duration ending
# at that station was more than 10 minutes.
# First, find the average duration of rentals ending at each station
# Then, keep the stations above 10 minutes (durations are in seconds, so 10*60)
# Then, count them
trip.group('End', np.average).where(2, are.above(10*60)).num_rows
68
# The number of stations that have more than 500 starts
# AND more than 500 ends
# First, find the start counts
starting = trip.group('Start').relabeled('count', 'Start count').relabeled('Start', 'Station')
# Then, find the end counts
ending = trip.group('End').relabeled('count', 'End count').relabeled('End', 'Station')
# Combine them with join
starting.join('Station', ending).where('Start count', are.above(500)).where('End count', are.above(500)).num_rows
56
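Note that the join keeps only stations that appear in both tables. That is fine here: a station missing from either table has zero starts or zero ends and could not clear the 500 threshold anyway.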
3 > 1
True
type(3 > 1)
bool
3 < 1
False
True
True
3 == 3
True
# 3 = 3 would be a SyntaxError: = is assignment, == is comparison
x = 14
y = 3
x > 10
True
12 < x < 18
True
12 < x
True
x < 18
True
12 < x-y < 18
False
x > 10 and y > 5
False
pets = make_array('cat', 'dog', 'cat', 'cat', 'dog', 'rabbit')
pets
array(['cat', 'dog', 'cat', 'cat', 'dog', 'rabbit'], dtype='<U6')
pets == 'dog'
array([False, True, False, False, True, False])
0 + 1 + 0 + 0 + 1 + 0
2
sum(make_array(False, True, False, False, True, False))
2
sum(pets == 'dog')
2
np.count_nonzero(pets == 'dog')
2
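These all work because Python treats True as 1 and False as 0 in arithmetic: summing a boolean array counts its True entries, and np.count_nonzero counts them directly since True is nonzero.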
pets > 'cat'
array([False, True, False, False, True, True])
sum(pets > 'cat')
3
"cat" < "catastrophe"
True
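String comparisons are lexicographic: characters are compared one at a time, and a string that is a prefix of another sorts first. That is why 'dog' and 'rabbit' are above 'cat', and 'cat' is below 'catastrophe'.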
where
terms = Table().with_column('Semester', np.arange(1, 9))
terms
Semester |
---|
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
terms.where('Semester', are.above(6))
Semester |
---|
7 |
8 |
is_senior = are.above(6)
is_senior(4)
False
def also_is_senior(x):
return x > 6
also_is_senior(5)
False
terms.apply(also_is_senior, 'Semester')
array([False, False, False, False, False, False, True, True])
terms.where('Semester', are.above(6))
Semester |
---|
7 |
8 |
terms.where('Semester', is_senior)
Semester |
---|
7 |
8 |
terms.where('Semester', also_is_senior)
Semester |
---|
7 |
8 |
terms.where(terms.apply(also_is_senior, 'Semester'))
Semester |
---|
7 |
8 |
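All four calls above select the same rows: where accepts a predicate built with are, any function that returns True or False, or a precomputed boolean array.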
# 'Benton' is not a county in our New England data, so the result is empty
dat.where('labels', are.equal_to('Benton'))
lat | long | labels | areas | colors |
---|---|---|---|---|