This notebook visualizes projected location data with matplotlib. More info: http://geoffboeing.com/2014/08/visualizing-summer-travels/
This notebook's code if fairly old now, and not particularly efficient. For more up-to-date code for loading, projecting, mapping, and analyzing spatial data, check out: https://github.com/gboeing/urban-data-science/tree/master/19-Spatial-Analysis-and-Cartography
# magic command to display matplotlib plots inline within the ipython notebook webpage
%matplotlib inline
# import necessary modules
import pandas as pd, numpy as np, matplotlib.pyplot as plt, matplotlib.font_manager as fm
from time import time
from shapely.geometry import Polygon, Point
from geopy.distance import great_circle
from geopandas import GeoDataFrame
from descartes import PolygonPatch
# load the gps coordinate data
# the data files are encoded as utf-8: specify so to prevent matplotlib from choking on diacritics
df = pd.read_csv('data/summer-travel-gps-full.csv', encoding='utf-8')
rs = pd.read_csv('data/summer-travel-gps-dbscan.csv', encoding='utf-8')
# specify the fonts and background color for our map
title_font = fm.FontProperties(family='Bitstream Vera Sans', style='normal', size=15, weight='normal', stretch='normal')
annotation_font = fm.FontProperties(family='Bitstream Vera Sans', style='normal', size=10, weight='normal', stretch='normal')
backgroundcolor = '#e4f4ff'
# load the shapefile
all_countries = GeoDataFrame.from_file('shapefiles/world_borders/TM_WORLD_BORDERS-0.3.shp')
# the original CRS of our shapefile and point data
original_crs = all_countries.crs
# the projected CRS to convert our shapefile and point data into
target_crs = {'datum':'WGS84', 'no_defs':True, 'proj':'aea', 'lat_1':35, 'lat_2':55, 'lat_0':45, 'lon_0':10}
# change the CRS of the shapefile to the specified projected one
all_countries.to_crs(crs=target_crs, inplace=True)
# create a geometry column in our point data set for geopandas to use
rs['geometry'] = rs.apply(lambda row: Point(row['lon'], row['lat']), axis=1)
# create a new geopandas geodataframe from the point data
points = GeoDataFrame(rs)
# you must specify its original CRS to convert it to a different (projected) one later
points.crs = original_crs
points.head()
Unnamed: 0 | lat | lon | date | city | country | geometry | |
---|---|---|---|---|---|---|---|
0 | 0 | 51.478199 | -0.446081 | 05/14/2014 10:51 | Hounslow | United Kingdom | POINT (-0.4460810000000001 51.47819910000001) |
1 | 1 | 38.774422 | -9.129195 | 05/14/2014 15:25 | Lisbon | Portugal | POINT (-9.129194699999999 38.7744221) |
2 | 2 | 38.742987 | -9.147780 | 05/14/2014 16:11 | Lisbon | Portugal | POINT (-9.1477802 38.74298710000001) |
3 | 3 | 38.712325 | -9.137434 | 05/17/2014 08:09 | Lisbon | Portugal | POINT (-9.137434000000001 38.7123248) |
4 | 4 | 38.693780 | -9.206363 | 05/15/2014 14:19 | Lisbon | Portugal | POINT (-9.206363400000001 38.69377960000001) |
# convert the point data to the same projected CRS we specified earlier for our shapefile
points.to_crs(crs=target_crs, inplace=True)
# convert the projected points into discrete x and y columns for easy matplotlib scatterplotting
points['x'] = points['geometry'].map(lambda point: point.x)
points['y'] = points['geometry'].map(lambda point: point.y)
points.head()
Unnamed: 0 | lat | lon | date | city | country | geometry | x | y | |
---|---|---|---|---|---|---|---|---|---|
0 | 0 | 51.478199 | -0.446081 | 05/14/2014 10:51 | Hounslow | United Kingdom | POINT (-716600.465168938 775891.5996303948) | -7.166005e+05 | 775891.599630 |
1 | 1 | 38.774422 | -9.129195 | 05/14/2014 15:25 | Lisbon | Portugal | POINT (-1633623.174194452 -509447.4438606029) | -1.633623e+06 | -509447.443861 |
2 | 2 | 38.742987 | -9.147780 | 05/14/2014 16:11 | Lisbon | Portugal | POINT (-1635993.280910485 -512502.2805595197) | -1.635993e+06 | -512502.280560 |
3 | 3 | 38.712325 | -9.137434 | 05/17/2014 08:09 | Lisbon | Portugal | POINT (-1635916.550109638 -516047.7724266322) | -1.635917e+06 | -516047.772427 |
4 | 4 | 38.693780 | -9.206363 | 05/15/2014 14:19 | Lisbon | Portugal | POINT (-1642181.794841199 -516694.1408334156) | -1.642182e+06 | -516694.140833 |
# calculate some margin so our data doesn't go right up to the edges of the plotting figure
x_margin_width = (points.bounds['maxx'].max() - points.bounds['minx'].min()) / 10
y_margin_width = (points.bounds['maxy'].max() - points.bounds['miny'].min()) / 3
# define the coordinates at the extent of our projected point data
xlim = (points.bounds['minx'].min() - x_margin_width, points.bounds['maxx'].max() + x_margin_width)
ylim = (points.bounds['miny'].min() - y_margin_width, points.bounds['maxy'].max() + y_margin_width)
# create a rectangle from these coordinates
spatial_extent = Polygon([(xlim[0], ylim[0]),
(xlim[0], ylim[1]),
(xlim[1], ylim[1]),
(xlim[1], ylim[0])])
# select the country shapes within the spatial extents of our point data
countries = all_countries[all_countries['geometry'].intersects(spatial_extent)]
# set dimensions in inches for the plotting figure size
xdim = (xlim[1] - xlim[0]) / 400000
ydim = (ylim[1] - ylim[0]) / 400000
# get a representative point for each of the most visited cities
most_index = df['city'].value_counts().head(6).index
most = pd.DataFrame(points[points['city'].isin(most_index)])
most.drop_duplicates(subset=['city'], keep='first', inplace=True)
most
Unnamed: 0 | lat | lon | date | city | country | geometry | x | y | |
---|---|---|---|---|---|---|---|---|---|
1 | 1 | 38.774422 | -9.129195 | 05/14/2014 15:25 | Lisbon | Portugal | POINT (-1633623.174194452 -509447.4438606029) | -1.633623e+06 | -509447.443861 |
29 | 29 | 41.302957 | 2.075222 | 06/20/2014 18:59 | Barcelona | Spain | POINT (-654565.8676264901 -384972.9524976124) | -6.545659e+05 | -384972.952498 |
58 | 58 | 50.087729 | 14.660600 | 06/02/2014 12:10 | Prague | Czech Republic | POINT (329263.3934799764 583235.1809137455) | 3.292634e+05 | 583235.180914 |
85 | 85 | 48.521902 | 9.055122 | 06/09/2014 12:12 | Tübingen | Germany | POINT (-68815.19792553004 397819.2685319655) | -6.881520e+04 | 397819.268532 |
132 | 132 | 37.970125 | 23.728545 | 07/05/2014 14:20 | Athens | Greece | POINT (1192502.696872958 -690469.0375840869) | 1.192503e+06 | -690469.037584 |
134 | 134 | 40.980462 | 28.820527 | 07/06/2014 21:27 | Istanbul | Turkey | POINT (1551608.582159148 -274514.2831009594) | 1.551609e+06 | -274514.283101 |
def get_patches(countries, visited_countries):
# specify the colors for our map
facecolor = '#f7f7f7'
visited_facecolor = '#eeeeee'
edgecolor = '#cccccc'
# create a list to contain a descartes PolygonPatch object for each Polygon in the GeoDataFrame geometry column
patches = []
for _, row in countries.iterrows():
fc = visited_facecolor if row['NAME'] in visited_countries else facecolor
# if this row contains a Polygon object
if type(row['geometry']) == Polygon:
patches.append(PolygonPatch(row['geometry'], fc=fc, ec=edgecolor, zorder=0))
# else, this row contains a MultiPolygon object - this is a shapely object that contains multiple Polygon objects
# for example, countries that contain islands will have one Polygon shape for their mainland, and others for the island geometries
else:
# for each sub-Polygon object in the MultiPolygon
for polygon in row['geometry']:
patches.append(PolygonPatch(polygon, fc=fc, ec=edgecolor, zorder=0))
return patches
# get a list of visited countries so we can shade those patches a different color
visited_countries = rs['country'].unique()
countries = countries.replace('The former Yugoslav Republic of Macedonia', 'Macedonia (FYROM)')
# create a figure, axis, and set the background color
fig = plt.figure(figsize=(xdim, ydim))
ax = fig.add_subplot(111)
ax.set_axis_bgcolor(backgroundcolor)
# add each patch we extracted from the GeoDataFrame's geometry column to the axis
for patch in get_patches(countries, visited_countries):
ax.add_patch(patch)
# add the projected point data to the axis as a scatter plot
points_scatter = ax.scatter(x=points['x'], y=points['y'], c='m', alpha=0.4, s=100)
ax.set_title('Projected shapefile and GPS coordinates', fontproperties=title_font)
# set the axes limits
ax.set_xlim(xlim)
ax.set_ylim(ylim)
# remove the tickmarks as these are projected geometries, the ticks are confusing northings/eastings
ax.set_xticks([])
ax.set_yticks([])
# annotate the most visited cities on the map
for _, row in most.iterrows():
plt.annotate(row['city'],
xy=(row['x'], row['y']),
xytext=(row['x'] + 35000, row['y'] - 100000),
bbox=dict(boxstyle='round', color='k', fc='w', alpha=0.8),
xycoords='data')
plt.show()
# next we'll identify the most isolated points (or clusters of points, based on some threshold distance)
# in the projected data set
start_time = time()
# what is the distance to the nearest point that is at least *threshold* miles away?
# ie, ignore all other points within this distance when identifying the next nearest point
# this treats everything within this threshold distance as a single cluster
threshold = 20
# create two new columns in the dataframe of simplified coordinates
# nearest_point will contain the index of the row of the nearest point from the original full data set
# nearest_dist will contain the value of the distance between these two points
points['nearest_point'] = None
points['nearest_dist'] = np.inf
# for each row (aka, coordinate pair) in the data set
for label, row in rs.iterrows():
point1 = (row['lat'], row['lon'])
for label2, row2 in rs.iterrows():
# don't compare the row to itself
if(label != label2):
# calculate the great circle distance between points
point2 = (row2['lat'], row2['lon'])
dist = great_circle(point1, point2).miles
# if this row's nearest is currently null, save this point as its nearest
# or if this distance is smaller than the previous smallest, update the row
if pd.isnull(points.loc[label, 'nearest_dist']) | ((dist > threshold) & (dist < points.loc[label, 'nearest_dist'])):
points.loc[label, 'nearest_dist'] = dist
points.loc[label, 'nearest_point'] = label2
print('process took %s seconds' % round(time() - start_time, 2))
process took 10.38 seconds
# sort the points by distance to nearest, then drop duplicates of nearest_point
most_isolated = points.sort_values(by='nearest_dist', ascending=False).drop_duplicates(subset='nearest_point', keep='first')
most_isolated = most_isolated.head(5)
# plot the most isolated clusters in the data set
# create a figure, axis, and set the background color
fig = plt.figure(figsize=(xdim, ydim))
ax = fig.add_subplot(111)
ax.set_axis_bgcolor(backgroundcolor)
# add each patch we extracted from the GeoDataFrame's geometry column to the axis
for patch in get_patches(countries, visited_countries):
ax.add_patch(patch)
# add the projected point data to the axis as a scatter plot
points_scatter = ax.scatter(points['x'], points['y'], c='m', alpha=.4, s=150)
isolated_scatter = ax.scatter(most_isolated['x'], most_isolated['y'], c='r', alpha=.9, s=150)
ax.set_title('Most Isolated Clusters, and Distance to Next Nearest', fontproperties=title_font)
# set the axes limits
ax.set_xlim(xlim)
ax.set_ylim(ylim)
# remove the tickmarks as these are projected geometries, the ticks are confusing northings/eastings
ax.set_xticks([])
ax.set_yticks([])
# annotate each of the most isolated clusters with city name, and distance to next nearest point + its name
for _, row in most_isolated.iterrows():
xytext = (row['x'], row['y'] - 120000) if row['city'] != 'Prizren' else (row['x'], row['y'] + 90000)
ax.annotate(row['city'] + ', ' + str(int(row['nearest_dist'])) + ' mi. to ' + rs['city'][row['nearest_point']],
xy=(row['x'], row['y']),
xytext=xytext,
fontproperties=annotation_font,
bbox=dict(boxstyle='round', color='k', fc='w', alpha=0.7),
xycoords='data')
plt.show()