#!/usr/bin/env python
# coding: utf-8

# # Tokyo Photographs

# In[1]:


from IPython.display import display_markdown

# Render the project README as the notebook's introduction.
# The context manager closes the file handle as soon as the text is read
# (the bare open(...).read() left that to the garbage collector), and the
# explicit encoding keeps decoding independent of the platform default.
with open("README.md", encoding="utf-8") as readme_file:
    display_markdown(readme_file.read(), raw=True)


# In[65]:


get_ipython().run_line_magic('matplotlib', 'inline')

import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# In[72]:


# Read the raw Tokyo table from the local data folder into a DataFrame
db = pd.read_csv(filepath_or_buffer='data/tokyo.csv')


# ## Randomly subsetting

# In[73]:


# Fix the seed so every run draws the same sequence of random numbers
np.random.seed(1234)
# Positional row numbers 0..n-1, one for each row of the table
ri = np.arange(len(db))
# Shuffle those row numbers in place
np.random.shuffle(ri)
# Keep only the rows sitting at the first 10,000 shuffled positions
subset_rows = ri[:10000]
db = db.iloc[subset_rows]


# ## Reproject XY coordinates in separate columns

# In[74]:


# Timed cell: build one point geometry per row from the longitude/latitude
# columns. The previous version called `Point`, which is never imported in
# this notebook and therefore failed with a NameError on a fresh kernel.
# `gpd.points_from_xy` needs no extra import and builds all points in one
# vectorised call; wrapping the result in a GeoSeries that carries `db`'s
# index keeps `pts` aligned with the (shuffled) rows of `db`.
get_ipython().run_cell_magic('time', '', 'pts = gpd.GeoSeries(gpd.points_from_xy(db.longitude, db.latitude), index=db.index)\n')


# In[75]:


# Attach the point geometries to the table and declare their coordinate
# reference system as EPSG:4326 (longitude/latitude).
# The authority string replaces the deprecated {'init': 'epsg:4326'} dict
# form, which current geopandas/pyproj releases warn about.
gdb = gpd.GeoDataFrame(db.assign(geometry=pts), crs="EPSG:4326")


# In[76]:


# Timed cell: reproject the geo-table to EPSG:3857 and rebind `gdb` to the result
get_ipython().run_cell_magic("time", "", "gdb = gdb.to_crs(epsg=3857)\n")


# In[77]:


# Timed cell: copy the projected point coordinates into plain `x` and `y`
# columns. The vectorised `.x` / `.y` accessors of the geometry column
# replace a row-wise `apply` that built a new pd.Series for every point.
# `xys` is still created (a DataFrame with `x` and `y` columns sharing
# `gdb`'s index) so anything that reads it afterwards keeps working.
get_ipython().run_cell_magic('time', '', "xys = pd.DataFrame({'x': gdb['geometry'].x, 'y': gdb['geometry'].y})\ngdb['x'] = xys['x']\ngdb['y'] = xys['y']\n")


# In[79]:


# Drop the geometry column and write the remaining columns to disk,
# leaving the index out of the file
attributes_only = gdb.drop(columns=['geometry'])
attributes_only.to_csv('tokyo_clean.csv', index=False)


# ---
# 
# ## Download link
# 
# {download}`[Download the *tokyo_clean.csv* file] <tokyo_clean.csv>`