In [1]:
from datascience import *
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

Maps

We'll illustrate the use of maps with the New York Times Covid-19 database. Recall that this is a county-level database of confirmed cases and deaths, updated daily, compiled from state and local governments and health departments across the United States.

The Times has created many visualizations that effectively communicate important information about the pandemic. Here we will construct some simple circle maps.

The data are publicly available on GitHub: https://github.com/nytimes/covid-19-data.

In [2]:
covid_table = Table.read_table("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv")
covid_table
Out[2]:
date county state fips cases deaths
2020-01-21 Snohomish Washington 53061 1 0
2020-01-22 Snohomish Washington 53061 1 0
2020-01-23 Snohomish Washington 53061 1 0
2020-01-24 Cook Illinois 17031 1 0
2020-01-24 Snohomish Washington 53061 1 0
2020-01-25 Orange California 6059 1 0
2020-01-25 Cook Illinois 17031 1 0
2020-01-25 Snohomish Washington 53061 1 0
2020-01-26 Maricopa Arizona 4013 1 0
2020-01-26 Los Angeles California 6037 1 0

... (1076214 rows omitted)

Now we load in a table of geographical data for counties.

In [3]:
county_geo = Table.read_table("https://raw.githubusercontent.com/jdlafferty/covid-19/master/data/geo-counties.csv") 
county_geo
Out[3]:
county state fips lat lon
New York City New York nan 40.7146 -74.0071
Westchester New York 36119 41.1191 -73.7887
Nassau New York 36059 42.5164 -73.6113
Suffolk New York 36103 40.9601 -72.8343
Cook Illinois 17031 41.8139 -87.6155
King Washington 53033 47.4325 -121.959
Unknown New Jersey nan 0 0
Wayne Michigan 26163 42.2852 -83.3836
Los Angeles California 6037 34.0536 -118.246
Bergen New Jersey 34003 40.9476 -74.0276

... (1660 rows omitted)

Process the recent data

First we will calculate the average daily new cases for a subset of states.

In [4]:
first_date = '2021-02-20'

# Some subsets of states to visualize:
continental_states = ['Pennsylvania', 'Arizona', 'Connecticut', 'Florida', 'Wisconsin', 'South Dakota', 'Colorado',
 'New Jersey', 'California', 'Ohio', 'West Virginia', 'Oregon', 'Alabama', 'Maine', 'Vermont',
 'Montana', 'Nevada', 'Washington', 'Massachusetts', 'Maryland', 'Iowa', 'North Dakota', 'Kentucky',
 'Delaware', 'Mississippi', 'Tennessee', 'Virginia', 'Kansas', 'Missouri', 'Utah', 'North Carolina', 'Louisiana',
 'South Carolina', 'Minnesota', 'Arkansas', 'Indiana', 'New York', 'Wyoming', 'New Mexico', 'Rhode Island',
 'Michigan', 'Nebraska', 'New Hampshire', 'Georgia', 'Texas', 'Illinois', 'Oklahoma', 'Idaho']

new_england_states = ['Connecticut', 'Massachusetts', 'Vermont', 'New Hampshire', 'Rhode Island', 'Maine']
west_coast_states = ['California', 'Oregon', 'Washington']
southeast_states = ['Florida', 'Louisiana', 'Alabama', 'Mississippi', 'Georgia', 'South Carolina']

states = new_england_states

# dates are ISO strings (YYYY-MM-DD), so string comparison matches chronological order
recent_data = covid_table.where('date', are.above(first_date))
recent_state_data = recent_data.where('state', are.contained_in(states))
recent_state_data
Out[4]:
date county state fips cases deaths
2021-02-21 Fairfield Connecticut 9001 78181 2027
2021-02-21 Hartford Connecticut 9003 68527 2251
2021-02-21 Litchfield Connecticut 9005 10835 270
2021-02-21 Middlesex Connecticut 9007 10113 337
2021-02-21 New Haven Connecticut 9009 69233 1891
2021-02-21 New London Connecticut 9011 18635 403
2021-02-21 Tolland Connecticut 9013 7612 159
2021-02-21 Unknown Connecticut nan 937 9
2021-02-21 Windham Connecticut 9015 9028 176
2021-02-21 Androscoggin Maine 23001 4695 54

... (570 rows omitted)

FIPS is a numeric code assigned to each US county. We will use it to merge the Covid-19 data with the geographic data.

In the following code, we group the data by FIPS county code and compute the number of new cases on each day during the past week in that county. Then we average those daily counts.

In [5]:
# remove extra columns
data = recent_state_data.drop('date', 'county', 'state', 'deaths')

# exclude cases where fips is not known
data = data.where('fips', are.above(0))

# now, group by fips and form a list of the cumulative case counts
data = data.group('fips', list)

# apply the difference function np.diff to get the new cases
data = data.with_column('new cases', data.apply(np.diff, 'cases list'))
data = data.drop('cases list')

# Now average to get the average daily new cases in each county over the past week.
# We add a small amount (.001) to avoid zeros, which the map graphics handle badly.
new_cases = Table().with_columns('fips', data['fips'], 
                                 'new cases', data.apply(np.mean, 'new cases') + .001)
new_cases
Out[5]:
fips new cases
9001 294.572
9003 207.858
9005 38.2867
9007 39.001
9009 280.144
9011 69.7153
9013 25.001
9015 18.1439
23001 16.2867
23003 2.28671

... (57 rows omitted)
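To see what the np.diff step above is doing, here is a tiny example on a made-up cumulative count series (the numbers are invented, not from the NYT data):

cumulative = make_array(100, 112, 130, 141)
np.diff(cumulative)              # array([12, 18, 11]): new cases on each day
np.mean(np.diff(cumulative))     # about 13.67, the average daily new cases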

In [6]:
state_geo = county_geo.where('state', are.contained_in(states)).sort('fips')
state_geo
Out[6]:
county state fips lat lon
Fairfield Connecticut 9001 41.1663 -73.2288
Hartford Connecticut 9003 41.7638 -72.6739
Litchfield Connecticut 9005 41.7463 -73.189
Middlesex Connecticut 9007 41.4094 -72.529
New Haven Connecticut 9009 41.308 -72.9243
New London Connecticut 9011 41.3565 -72.0963
Tolland Connecticut 9013 41.8745 -72.374
Windham Connecticut 9015 41.6999 -72.1551
Androscoggin Maine 23001 44.1971 -70.2027
Cumberland Maine 23005 43.8005 -70.2545

... (54 rows omitted)

Now we join the table of average new cases with the geo table, so that we have latitude and longitude for each county.

In [7]:
new_cases_geo = state_geo.join('fips', new_cases)
new_cases_geo = new_cases_geo.drop('fips')
new_cases_geo
Out[7]:
county state lat lon new cases
Fairfield Connecticut 41.1663 -73.2288 294.572
Hartford Connecticut 41.7638 -72.6739 207.858
Litchfield Connecticut 41.7463 -73.189 38.2867
Middlesex Connecticut 41.4094 -72.529 39.001
New Haven Connecticut 41.308 -72.9243 280.144
New London Connecticut 41.3565 -72.0963 69.7153
Tolland Connecticut 41.8745 -72.374 25.001
Windham Connecticut 41.6999 -72.1551 18.1439
Androscoggin Maine 44.1971 -70.2027 16.2867
Cumberland Maine 43.8005 -70.2545 41.8581

... (49 rows omitted)

Finally, we can create a map where we show a circle at each county location, with area proportional to the average number of new cases over the past week.

In [8]:
dat = Table().with_columns('lat', new_cases_geo['lat'], 
                           'long', new_cases_geo['lon'], 
                           'labels', new_cases_geo['county'],
                           'areas', 10*new_cases_geo['new cases'],
                           'colors', 'red')
dat.show(10)
Circle.map_table(dat, weight=1)
lat long labels areas colors
41.1663 -73.2288 Fairfield 2945.72 red
41.7638 -72.6739 Hartford 2078.58 red
41.7463 -73.189 Litchfield 382.867 red
41.4094 -72.529 Middlesex 390.01 red
41.308 -72.9243 New Haven 2801.44 red
41.3565 -72.0963 New London 697.153 red
41.8745 -72.374 Tolland 250.01 red
41.6999 -72.1551 Windham 181.439 red
44.1971 -70.2027 Androscoggin 162.867 red
43.8005 -70.2545 Cumberland 418.581 red

... (49 rows omitted)

Out[8]:
[Interactive circle map of the selected counties rendered here]
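Pins can be drawn instead of circles with the related Marker class from the datascience library; a minimal sketch using the same table (keeping only the position and label columns, since markers do not use areas):

Marker.map_table(dat.select('lat', 'long', 'labels'))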

Table examples

In [9]:
drinks = Table(['Drink', 'Cafe', 'Price']).with_rows([
    ['Milk Tea', 'Book Trader Cafe', 4],
    ['Espresso', "Willoughby's",  2],
    ['Coffee',   "Willoughby's",  3],
    ['Espresso', "Blue State Coffee",   2]
])
drinks
Out[9]:
Drink Cafe Price
Milk Tea Book Trader Cafe 4
Espresso Willoughby's 2
Coffee Willoughby's 3
Espresso Blue State Coffee 2
In [10]:
discounts = Table().with_columns(
    'Coupon % off', make_array(5, 50, 25, 0),
    'Location', make_array("Willoughby's", "Blue State Coffee", "Willoughby's", "Book Trader Cafe")
)
discounts
Out[10]:
Coupon % off Location
5 Willoughby's
50 Blue State Coffee
25 Willoughby's
0 Book Trader Cafe
In [11]:
# Discussion question:  Generate a table with one row per cafe that 
# has the name and discounted price of its cheapest discounted drink

# Link (join) drinks with discounts
combined = drinks.join('Cafe', discounts, 'Location')
combined.show()

# Compute discounted prices
discounted_prices = combined.column('Price') * (1 - combined.column('Coupon % off')/100)

discounted_drinks = combined.with_column('Discounted price', discounted_prices)
discounted_drinks
Cafe Drink Price Coupon % off
Blue State Coffee Espresso 2 50
Book Trader Cafe Milk Tea 4 0
Willoughby's Espresso 2 5
Willoughby's Espresso 2 25
Willoughby's Coffee 3 5
Willoughby's Coffee 3 25
Out[11]:
Cafe Drink Price Coupon % off Discounted price
Blue State Coffee Espresso 2 50 1
Book Trader Cafe Milk Tea 4 0 4
Willoughby's Espresso 2 5 1.9
Willoughby's Espresso 2 25 1.5
Willoughby's Coffee 3 5 2.85
Willoughby's Coffee 3 25 2.25
In [12]:
# Correct: sorting by 'Discounted price' first ensures the cheapest drink is kept
# for each cafe (Espresso at $1.50 beats Coffee at Willoughby's)
discounted_drinks.sort('Discounted price').sort('Cafe', distinct=True) 
Out[12]:
Cafe Drink Price Coupon % off Discounted price
Blue State Coffee Espresso 2 50 1
Book Trader Cafe Milk Tea 4 0 4
Willoughby's Espresso 2 25 1.5
In [13]:
# Incorrect: need to sort by 'Discounted price' first
discounted_drinks.sort('Cafe', distinct=True) 
Out[13]:
Cafe Drink Price Coupon % off Discounted price
Blue State Coffee Espresso 2 50 1
Book Trader Cafe Milk Tea 4 0 4
Willoughby's Espresso 2 5 1.9
In [14]:
# Incorrect: min is applied to each column separately, so 'Drink min' is just the
# alphabetically first drink, and the values can come from different rows
discounted_drinks.group('Cafe', min) 
Out[14]:
Cafe Drink min Price min Coupon % off min Discounted price min
Blue State Coffee Espresso 2 50 1
Book Trader Cafe Milk Tea 4 0 4
Willoughby's Coffee 2 5 1.5
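The column-wise behavior of group with min can be seen by selecting only the price column before grouping; this gives the correct minimum price per cafe but loses the drink names entirely (a quick sketch using the tables defined above). The sort-then-distinct approach above is what keeps the drink name attached to its price.

discounted_drinks.select('Cafe', 'Discounted price').group('Cafe', min)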

Sample midterm questions

Challenge yourself and try to solve these on your own before looking at the solutions!

In [15]:
trip0 = Table.read_table("trip.csv")
trip = Table().with_columns(
"Start", trip0.column("Start Station"),
"End", trip0.column("End Station"),
"Duration", trip0.column("Duration"))
trip.show(3)
Start End Duration
Harry Bridges Plaza (Ferry Building) San Francisco Caltrain (Townsend at 4th) 765
San Antonio Shopping Center Mountain View City Hall 1036
Post at Kearny 2nd at South Park 307

... (354149 rows omitted)

In [16]:
# The name of the station where the most rentals ended
# (assume no ties).
In [ ]:
 
In [17]:
# The number of stations for which the average duration ending 
# at that station was more than 10 minutes.
In [ ]:
 
In [18]:
# The number of stations that have more than 500 starts 
# AND more than 500 ends
In [ ]:
 
In [19]:
# The name of the station where the most rentals ended (assume no ties).
# First, find end counts
# Then, find the station with the highest end count
trip.group('End').sort('count', descending=True).column(0).item(0)
Out[19]:
'San Francisco Caltrain (Townsend at 4th)'
In [20]:
# The number of stations for which the average duration ending 
# at that station was more than 10 minutes.

# First, find the average duration of trips ending at each station
# Then, keep the stations where that average (column 2) is above 10 minutes (600 seconds)
# Then, count them
trip.group('End', np.average).where(2, are.above(10*60)).num_rows
Out[20]:
68
In [21]:
# The number of stations that have more than 500 starts 
# AND more than 500 ends
# First, find the start counts
starting = trip.group('Start').relabeled('count', 'Start count').relabeled('Start', 'Station')
# Then, find the end counts
ending = trip.group('End').relabeled('count', 'End count').relabeled('End', 'Station')
# Combine them with join
starting.join('Station', ending).where('Start count', are.above(500)).where('End count', are.above(500)).num_rows
Out[21]:
56
In [ ]:
 

Comparison

In [22]:
3 > 1
Out[22]:
True
In [23]:
type(3 > 1)
Out[23]:
bool
In [24]:
3 < 1
Out[24]:
False
In [25]:
True
Out[25]:
True
In [26]:
3 == 3
Out[26]:
True
In [27]:
# 3 = 3 would raise a SyntaxError: = is assignment, == tests equality
In [28]:
x = 14
y = 3
In [29]:
x > 10
Out[29]:
True
In [30]:
12 < x < 18
Out[30]:
True
In [31]:
12 < x
Out[31]:
True
In [32]:
x < 18
Out[32]:
True
In [33]:
12 < x-y < 18
Out[33]:
False
In [34]:
x > 10 and y > 5
Out[34]:
False
In [ ]:
 

Comparisons with arrays

In [35]:
pets = make_array('cat', 'dog', 'cat', 'cat', 'dog', 'rabbit')
pets
Out[35]:
array(['cat', 'dog', 'cat', 'cat', 'dog', 'rabbit'], dtype='<U6')
In [36]:
pets == 'dog'
Out[36]:
array([False,  True, False, False,  True, False])
In [37]:
0 + 1 + 0 + 0 + 1 + 0
Out[37]:
2
In [38]:
sum(make_array(False, True, False, False, True, False))
Out[38]:
2
In [39]:
sum(pets == 'dog')
Out[39]:
2
In [40]:
np.count_nonzero(pets == 'dog')
Out[40]:
2
In [41]:
pets > 'cat'
Out[41]:
array([False,  True, False, False,  True,  True])
In [42]:
sum(pets > 'cat')
Out[42]:
3
In [ ]:
 
In [43]:
"cat" < "catastrophe"
Out[43]:
True
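String comparisons like this are lexicographic (dictionary order): strings are compared character by character, and a string that is a prefix of another counts as smaller. A couple of quick examples to illustrate:

'dog' > 'cat'    # True: 'd' comes after 'c'
'Dog' > 'cat'    # False: uppercase letters sort before lowercase in Unicode order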

Predicates and advanced where

In [44]:
terms = Table().with_column('Semester', np.arange(1, 9))
terms
Out[44]:
Semester
1
2
3
4
5
6
7
8
In [45]:
terms.where('Semester', are.above(6))
Out[45]:
Semester
7
8
In [46]:
is_senior = are.above(6)
In [47]:
is_senior(4)
Out[47]:
False
In [48]:
def also_is_senior(x):
    return x > 6
In [49]:
also_is_senior(5)
Out[49]:
False
In [50]:
terms.apply(also_is_senior, 'Semester')
Out[50]:
array([False, False, False, False, False, False,  True,  True])
In [51]:
terms.where('Semester', are.above(6))
Out[51]:
Semester
7
8
In [52]:
terms.where('Semester', is_senior)
Out[52]:
Semester
7
8
In [53]:
terms.where('Semester', also_is_senior)
Out[53]:
Semester
7
8
In [54]:
terms.where(terms.apply(also_is_senior, 'Semester'))
Out[54]:
Semester
7
8
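The are module provides other predicates that can be passed to where in the same way; for example (assuming the standard datascience predicates are.between and are.equal_to):

terms.where('Semester', are.between(3, 6))   # Semesters 3, 4, 5 (lower bound included, upper excluded)
terms.where('Semester', are.equal_to(8))     # just Semester 8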
In [55]:
# no county named Benton is in the New England data, so the result is an empty table
dat.where('labels', are.equal_to('Benton'))
Out[55]:
lat long labels areas colors