This notebook contains the code to convert raw downloaded external data into a cleaned or simplified version for tutorial purposes.
The raw data is expected to be in the `./raw` sub-directory (not included in the git repo).
%matplotlib inline
import pandas as pd
import geopandas
# Read the countries layer directly from the zipped download under ./raw
# (the "zip://" prefix makes read_file open the archive without unpacking it).
countries = geopandas.read_file("zip://./raw/original_data_ne/ne_110m_admin_0_countries.zip")
# Preview the first rows.
countries.head()
scalerank | featurecla | LABELRANK | SOVEREIGNT | SOV_A3 | ADM0_DIF | LEVEL | TYPE | ADMIN | ADM0_A3 | ... | REGION_WB | NAME_LEN | LONG_LEN | ABBREV_LEN | TINY | HOMEPART | MIN_ZOOM | MIN_LABEL | MAX_LABEL | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Admin-0 country | 3.0 | Afghanistan | AFG | 0.0 | 2.0 | Sovereign country | Afghanistan | AFG | ... | South Asia | 11.0 | 11.0 | 4.0 | -99.0 | 1.0 | 0.0 | 3.0 | 7.0 | POLYGON ((61.21081709172574 35.65007233330923,... |
1 | 1 | Admin-0 country | 3.0 | Angola | AGO | 0.0 | 2.0 | Sovereign country | Angola | AGO | ... | Sub-Saharan Africa | 6.0 | 6.0 | 4.0 | -99.0 | 1.0 | 0.0 | 3.0 | 7.0 | (POLYGON ((23.90415368011818 -11.7222815894063... |
2 | 1 | Admin-0 country | 6.0 | Albania | ALB | 0.0 | 2.0 | Sovereign country | Albania | ALB | ... | Europe & Central Asia | 7.0 | 7.0 | 4.0 | -99.0 | 1.0 | 0.0 | 5.0 | 10.0 | POLYGON ((21.0200403174764 40.84272695572588, ... |
3 | 1 | Admin-0 country | 4.0 | United Arab Emirates | ARE | 0.0 | 2.0 | Sovereign country | United Arab Emirates | ARE | ... | Middle East & North Africa | 20.0 | 20.0 | 6.0 | -99.0 | 1.0 | 0.0 | 4.0 | 9.0 | POLYGON ((51.57951867046327 24.24549713795111,... |
4 | 1 | Admin-0 country | 2.0 | Argentina | ARG | 0.0 | 2.0 | Sovereign country | Argentina | ARG | ... | Latin America & Caribbean | 9.0 | 9.0 | 4.0 | -99.0 | 1.0 | 0.0 | 2.0 | 7.0 | (POLYGON ((-66.95992000000001 -54.896810000000... |
5 rows × 72 columns
# Number of rows (country features) in the raw layer.
len(countries)
177
# Keep the country code, name, continent, population and GDP estimates plus
# the geometry; lower-case every column name and expose the ADM0_A3 code
# under the name "iso_a3".
countries_subset = (
    countries[['ADM0_A3', 'NAME', 'CONTINENT', 'POP_EST', 'GDP_MD_EST', 'geometry']]
    .rename(columns=str.lower)
    .rename(columns={'adm0_a3': 'iso_a3'})
)
countries_subset.head()
iso_a3 | name | continent | pop_est | gdp_md_est | geometry | |
---|---|---|---|---|---|---|
0 | AFG | Afghanistan | Asia | 34124811.0 | 64080.0 | POLYGON ((61.21081709172574 35.65007233330923,... |
1 | AGO | Angola | Africa | 29310273.0 | 189000.0 | (POLYGON ((23.90415368011818 -11.7222815894063... |
2 | ALB | Albania | Europe | 3047987.0 | 33900.0 | POLYGON ((21.0200403174764 40.84272695572588, ... |
3 | ARE | United Arab Emirates | Asia | 6072475.0 | 667200.0 | POLYGON ((51.57951867046327 24.24549713795111,... |
4 | ARG | Argentina | South America | 44293293.0 | 879400.0 | (POLYGON ((-66.95992000000001 -54.896810000000... |
# Write the simplified countries table as a shapefile in the working directory.
countries_subset.to_file("ne_110m_admin_0_countries.shp")
http://www.naturalearthdata.com/downloads/110m-cultural-vectors/110m-populated-places/ (simple, version 4.0.0, downloaded May 2018)
# Read the populated-places layer directly from the zipped download under ./raw.
cities = geopandas.read_file("zip://./raw/original_data_ne/ne_110m_populated_places_simple.zip")
# Preview the first rows.
cities.head()
scalerank | natscale | labelrank | featurecla | name | namepar | namealt | diffascii | nameascii | adm0cap | ... | pop_other | rank_max | rank_min | geonameid | meganame | ls_name | ls_match | checkme | min_zoom | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 8 | 10 | 3 | Admin-0 capital | Vatican City | None | None | 0 | Vatican City | 1.0 | ... | 562430 | 2 | 2 | 6691831.0 | None | Vatican City | 1 | 0 | 7.0 | POINT (12.45338654497177 41.90328217996012) |
1 | 7 | 20 | 0 | Admin-0 capital | San Marino | None | None | 0 | San Marino | 1.0 | ... | 0 | 7 | 7 | 3168070.0 | None | San Marino | 1 | 5 | 6.1 | POINT (12.44177015780014 43.936095834768) |
2 | 7 | 20 | 0 | Admin-0 capital | Vaduz | None | None | 0 | Vaduz | 1.0 | ... | 33009 | 7 | 5 | 3042030.0 | None | Vaduz | 1 | 0 | 6.7 | POINT (9.516669472907267 47.13372377429357) |
3 | 6 | 30 | 8 | Admin-0 capital alt | Lobamba | None | None | 0 | Lobamba | 0.0 | ... | 0 | 5 | 4 | 935048.0 | None | Lobamba | 1 | 5 | 6.0 | POINT (31.19999710971274 -26.46666746135247) |
4 | 6 | 30 | 8 | Admin-0 capital | Luxembourg | None | None | 0 | Luxembourg | 1.0 | ... | 106219 | 9 | 8 | 2960316.0 | None | Luxembourg | 1 | 0 | 6.0 | POINT (6.130002806227083 49.61166037912108) |
5 rows × 38 columns
# Number of rows (places) in the raw layer.
len(cities)
243
# Only the place name and its point geometry are kept for the tutorial data.
cities_subset = cities[['name', 'geometry']]
cities_subset.head()
name | geometry | |
---|---|---|
0 | Vatican City | POINT (12.45338654497177 41.90328217996012) |
1 | San Marino | POINT (12.44177015780014 43.936095834768) |
2 | Vaduz | POINT (9.516669472907267 47.13372377429357) |
3 | Lobamba | POINT (31.19999710971274 -26.46666746135247) |
4 | Luxembourg | POINT (6.130002806227083 49.61166037912108) |
# Write the simplified cities table as a shapefile in the working directory.
cities_subset.to_file("ne_110m_populated_places.shp")
http://www.naturalearthdata.com/downloads/50m-physical-vectors/50m-rivers-lake-centerlines/ (version 4.0.0, downloaded May 2018)
# Read the rivers / lake centerlines layer directly from the zipped download under ./raw.
rivers = geopandas.read_file("zip://./raw/ne_50m_rivers_lake_centerlines.zip")
# Preview the first rows.
rivers.head()
scalerank | featurecla | name | note | min_zoom | name_alt | name_en | min_label | geometry | |
---|---|---|---|---|---|---|---|---|---|
0 | 6 | Lake Centerline | Kama | None | 5.0 | None | Kama | 6.0 | LINESTRING (51.9371337598152 55.70106609892139... |
1 | 6 | River | Kama | None | 5.0 | None | Kama | 6.0 | LINESTRING (53.69384765584471 58.2063174502901... |
2 | 3 | Lake Centerline | Abay | None | 3.0 | None | Abay | 4.0 | LINESTRING (37.11301150887408 11.8549872909308... |
3 | 3 | Lake Centerline | Al Furat | None | 3.0 | None | Al Furat | 4.0 | LINESTRING (38.56119184742585 35.8626433379197... |
4 | 6 | Lake Centerline | Alabama | None | 5.0 | None | Alabama | 6.0 | (LINESTRING (-86.52176754393696 33.03211843501... |
Remove rows with missing geometry:
# Row count before filtering, for comparison with the count after it.
len(rivers)
462
# Keep only the rows whose geometry is not flagged by isna(), and renumber
# the index from 0 so it stays contiguous.
rivers = rivers[~rivers.geometry.isna()].reset_index(drop=True)
# Row count after the filter.
len(rivers)
461
Subset of the columns:
# Keep the feature class, the English name and the geometry ...
rivers_subset = rivers[['featurecla', 'name_en', 'geometry']]
# ... and expose the English name simply as "name".
rivers_subset = rivers_subset.rename(columns={'name_en': 'name'})
rivers_subset.head()
featurecla | name | geometry | |
---|---|---|---|
0 | Lake Centerline | Kama | LINESTRING (51.9371337598152 55.70106609892139... |
1 | River | Kama | LINESTRING (53.69384765584471 58.2063174502901... |
2 | Lake Centerline | Abay | LINESTRING (37.11301150887408 11.8549872909308... |
3 | Lake Centerline | Al Furat | LINESTRING (38.56119184742585 35.8626433379197... |
4 | Lake Centerline | Alabama | (LINESTRING (-86.52176754393696 33.03211843501... |
# Write the simplified rivers table as a shapefile in the working directory.
rivers_subset.to_file("ne_50m_rivers_lake_centerlines.shp")
Source: https://opendata.paris.fr/explore/dataset/quartier_paris/ (downloaded as GeoJSON file on August 20, 2018)
Administrative districts, polygon dataset
# Read the Paris districts polygon dataset from the downloaded GeoJSON file.
districts = geopandas.read_file("./raw/quartier_paris.geojson")
# Preview the first rows.
districts.head()
n_sq_qu | perimetre | objectid | longueur | c_qu | surface | n_sq_ar | c_quinsee | l_qu | c_ar | geometry | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 750000010 | 2139.625388 | 50 | 2139.535591 | 10 | 271750.323937 | 750000003 | 7510302 | Enfants-Rouges | 3 | POLYGON ((2.367101341254551 48.86162755885409,... |
1 | 750000016 | 3283.163371 | 56 | 3282.999717 | 16 | 378252.153674 | 750000004 | 7510404 | Notre-Dame | 4 | POLYGON ((2.361313701339139 48.84858030437791,... |
2 | 750000018 | 4052.729521 | 58 | 4052.473226 | 18 | 798389.398463 | 750000005 | 7510502 | Jardin-des-Plantes | 5 | POLYGON ((2.364561460891576 48.84365746114398,... |
3 | 750000025 | 3827.253353 | 7 | 3827.053421 | 25 | 826559.436780 | 750000007 | 7510701 | Saint-Thomas-d'Aquin | 7 | POLYGON ((2.322133508640103 48.84924973446431,... |
4 | 750000039 | 3245.891413 | 21 | 3245.778222 | 39 | 609034.654451 | 750000010 | 7511003 | Porte-Saint-Martin | 10 | POLYGON ((2.363917183048105 48.86754108728465,... |
# Give the name and code columns readable names, then order the districts by
# that code and renumber the index from 0.
districts = (
    districts
    .rename(columns={'l_qu': 'district_name', 'c_qu': 'id'})
    .sort_values('id')
    .reset_index(drop=True)
)
Add population data (based on pdfs downloaded from ..):
# Attach a population figure to every district. The two sources spell the
# district names differently, so both sides get a normalised helper key
# ("temp") and are joined on that key.
population = pd.read_csv("./raw/paris-population.csv")
population['temp'] = population.district_name.str.lower()
# Manual fixes for names whose lower-cased form still differs from the key
# derived from the districts layer below.
population['temp'] = population['temp'].replace({
    'javel': 'javel 15art',
    'saint avoye': 'sainte avoie',
    "saint germain l'auxerrois": "st germain l'auxerrois",
    'plaine monceau': 'plaine de monceaux',
})
# Districts key: lower-case, hyphens turned into spaces and accents stripped,
# all in a single translate pass.
districts['temp'] = districts.district_name.str.lower().str.translate(
    str.maketrans('-éèêô', ' eeeo'))
# The outer join keeps unmatched rows from either side, and `indicator` labels
# them. A plain length check can pass while some district is left without a
# population value, so unmatched keys are checked explicitly.
res = pd.merge(districts, population[['population', 'temp']], on='temp',
               how='outer', indicator=True)
unmatched = res.loc[res['_merge'] != 'both', 'temp'].tolist()
assert not unmatched, "district names without a match: {}".format(unmatched)
# Guards against duplicated keys, which would multiply rows.
assert len(res) == len(districts)
# Restore the id ordering explicitly so the row order does not depend on how
# the installed pandas version orders the result of an outer merge.
res = res.sort_values('id').reset_index(drop=True)
districts = res[['id', 'district_name', 'population', 'geometry']]
districts.head()
id | district_name | population | geometry | |
---|---|---|---|---|
0 | 1 | St-Germain-l'Auxerrois | 1672 | POLYGON ((2.344593389828428 48.85404991486192,... |
1 | 2 | Halles | 8984 | POLYGON ((2.349365804803003 48.86057567227663,... |
2 | 3 | Palais-Royal | 3195 | POLYGON ((2.339465868602756 48.86213531210705,... |
3 | 4 | Place-Vendôme | 3044 | POLYGON ((2.331944969393234 48.86491285292422,... |
4 | 5 | Gaillon | 1345 | POLYGON ((2.336320212305949 48.8679713890312, ... |
# Save the districts (now including population) as GeoJSON in the coordinates
# they were read in.
districts.to_file("processed/paris_districts.geojson", driver='GeoJSON')
# Reproject to EPSG:32631 and save a second copy. `districts` is rebound to
# the reprojected frame from here on.
# NOTE(review): this second file goes to the working directory while the one
# above goes to processed/ -- confirm that the difference is intended.
districts = districts.to_crs(epsg=32631)
districts.to_file("paris_districts_utm.geojson", driver='GeoJSON')
Source: https://opendata.paris.fr/explore/dataset/commercesparis/ (downloaded as csv file (`commercesparis.csv`) on October 30, 2018)
# Read the semicolon-separated commerces export. low_memory=False makes pandas
# infer each column's dtype from the whole file instead of chunk by chunk,
# which avoids the mixed-types DtypeWarning this file otherwise triggers.
df = pd.read_csv("./raw/commercesparis.csv", sep=';', low_memory=False)
/home/joris/miniconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2698: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False. interactivity=interactivity, compiler=compiler, result=result)
# Inspect the first record to see which fields the file provides.
df.iloc[0]
ORDRE 2379 ARRONDISSEMENT 75001 QUARTIER 3 IRIS 7.5101e+08 ILOT 7.5101e+10 NUMERO 3 LET NaN TYPE VOIE RUE LIBELLE VOIE PYRAMIDES ADRESSE COMPLETE 3 RUE PYRAMIDES CFA 7902 X 651020 Y 6.86291e+06 XY 48.864299, 2.332401 SEQUENCE 3 SITUATION Sur rue CODE ACTIVITE CH106 LIBELLE ACTIVITE Restaurant européen SURFACE 1 CC ID NaN CC NIV NaN Name: 0, dtype: object
Take subset of the restaurants:
# Restaurants are the rows whose activity code starts with "CH1"; rows with a
# missing code are excluded (na=False). Copy so that later column assignments
# do not act on a view of `df`.
restaurants = df.loc[
    df['CODE ACTIVITE'].str.startswith('CH1', na=False)
].copy()
# How many restaurants there are per activity label.
restaurants['LIBELLE ACTIVITE'].value_counts()
Restaurant traditionnel français 1947 Restaurant asiatique 1643 Restaurant européen 1178 Restaurant indien, pakistanais et Moyen Orient 394 Restaurant maghrébin 207 Restaurant africain 138 Autre restaurant du monde 107 Restaurant central et sud américain 97 Restaurant antillais 27 Name: LIBELLE ACTIVITE, dtype: int64
# Rows without coordinates cannot be turned into points: keep only those with
# an "XY" value and renumber the index from 0.
restaurants = restaurants.loc[restaurants['XY'].notnull()].reset_index(drop=True)
Translate the restaurants and rename column:
# Replace the French activity labels with English ones; a label that is not
# listed in the mapping is left unchanged.
restaurants['LIBELLE ACTIVITE'] = restaurants['LIBELLE ACTIVITE'].replace({
    'Restaurant traditionnel français': 'Traditional French restaurant',
    'Restaurant asiatique': 'Asian restaurant',
    'Restaurant européen': 'European restaurant',
    'Restaurant indien, pakistanais et Moyen Orient': 'Indian / Middle Eastern restaurant',
    'Restaurant maghrébin': 'Maghrebian restaurant',
    'Restaurant africain': 'African restaurant',
    'Autre restaurant du monde': 'Other world restaurant',
    'Restaurant central et sud américain': 'Central and South American restaurant',
    'Restaurant antillais': 'Caribbean restaurant',
})
# Expose the translated label column under the shorter name "type".
restaurants = restaurants.rename(columns={'LIBELLE ACTIVITE': 'type'})
Create GeoDataFrame
from shapely.geometry import Point

# "XY" holds the coordinates as "latitude, longitude" text, whereas a point is
# built as (x, y) = (longitude, latitude) -- hence the swapped positions.
restaurants['geometry'] = [
    Point(float(parts[1]), float(parts[0]))
    for parts in restaurants['XY'].str.split(', ')
]
# Keep only the restaurant type and its location, tagged as EPSG:4326.
restaurants = geopandas.GeoDataFrame(
    restaurants[['type', 'geometry']],
    crs={'init': 'epsg:4326'},
)
restaurants.head()
type | geometry | |
---|---|---|
0 | European restuarant | POINT (2.332401 48.864299) |
1 | Traditional French restaurant | POINT (2.331778 48.86526) |
2 | Traditional French restaurant | POINT (2.332541 48.865932) |
3 | Indian / Middle Eastern restaurant | POINT (2.332785 48.866285) |
4 | Traditional French restaurant | POINT (2.332008 48.866444) |
# Save the restaurant points as a GeoPackage under processed/.
restaurants.to_file("processed/paris_restaurants.gpkg", driver='GPKG')