#!/usr/bin/env python
# coding: utf-8

# ## Imports
# 

# In[27]:


import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
from matplotlib.path import Path 
from io import BytesIO

get_ipython().run_line_magic('matplotlib', 'inline')


# ##### Load data from the json source file in variables
# This loads the file 'trip_final.json' that is available from the Montréal Open Data Portal at: 
# http://donnees.ville.montreal.qc.ca/dataset/mtl-trajet
# The website also contains an explaination of the methodology and the contents of this and related data-sets.
# A direct link to the json zip file is at the url below:
# http://depot.ville.montreal.qc.ca/mtl-trajet-2016/trip_final_2016_json.zip
# 
# For this notebook, you will also need to have the borough shapefiles, which can be dowloaded from:
# http://donnees.ville.montreal.qc.ca/dataset/polygones-arrondissements
# 
# The .py file run in the cell below loads the data from the LIMADMIN.json file

# ##### Note:
#     - You will need to change the json_filepath variable to the location of trip_final.json and LIMADMIN.json

# In[12]:


get_ipython().run_line_magic('run', '"load_trip_data.py"')


# ## Variable reference: what you need to know

# Because the loading of the files is happening in the 'load_trip_data.py' python file, it's worth explaining some of the variables here that will be used throughout the notebook.
# 
#     - trip_final: a library that is created by loading the json file into Python. 
#     - coords_if: a 293,330-row by 4-column numpy array (293330, 4) that contains the initial [0][1] and final [2][3] coordinates for each entry, made from 'trip_final' 
#     - burrShapes: contains the coordinates from the shapefiles for the different boroughs of Montréal. Coordinates are in Latitude - Longitude (Thankfully)
#     - numIds: integer that reflects the number of trajectories in the original data set, used to iterate through trip_final
#     - ids: A vector of length numIds that contains the unique ID number associated with each recorded trajectory. This is used for comparing subsets and applying selection criteria.

# ## Process the data
# This section contains the operations to arrange the data appropriately to be visualized in the next section. This is the most time-consuming and memory-intensive part to run.

# Create a Path object that hold the shapes of the individual burroughs

# In[15]:


allPaths=Path(np.empty((1,2)))
currBurr=0
for currBurr in range(0,34):
    currBurr_array=[]
    currBurr_array=np.asarray(burrShapes['features'][currBurr]['geometry']['coordinates'][0][0])
    currBurrPath = Path(currBurr_array)
    allPaths = Path.make_compound_path(allPaths,currBurrPath)


# Create 'masks' - boolean lists of rows that match our criteria, that will be applied to the ID list to filter.

# In[16]:


# Find all the ids for trajectories that begin outside of the collection of all boroughs (on the island)
mask1 = allPaths.contains_points(coords_if[:,:2])

# Invert the selection to instead have the rows for all trajectories starting outside the collection of boroughs
mask1_inv=np.invert(mask1.astype(bool))

# Use the 'mask' to create a list of the IDs that start off the island
startIds=ids[mask1_inv.astype(bool)]

# Use the same method above to generate a list of IDs for trajectories ending on the island.
mask2 = allPaths.contains_points(coords_if[:,2:])
endIds=ids[mask2]

# Find the overlap of those two sets of IDs to find keep only the trajectories starting off and ending on the island.
mask3=np.isin(startIds, endIds, assume_unique=True)
sharedIDs=startIds[mask3]

# Initialize empty arrays to hold the initial and final points
startPts=np.empty((numIds,2))
endPts=np.empty((numIds,2))

# Iterate through the dictionary and store the start and endpoints to seperate numpy arrays.
iS=0
for nTraj in range(0,numIds):
    tempID=trip_final["features"][nTraj]["properties"]["id_trip"]
    if sum(np.isin(sharedIDs,tempID))>0:
        if trip_final["features"][nTraj]["geometry"] is not None: 
            tempArray=[]
            tempArray=np.asarray(trip_final["features"][nTraj]["geometry"]["coordinates"][0][:]) # Load data as array instead of as a list.
            startPts[iS,0:2] = tempArray[0,0:2]
            endPts[iS,0:2] = tempArray[-1,0:2]
            iS=iS+1   
     

# Load trajectories into a single numpy array for visualizing iteratively

# In[18]:


# Create a mask 
currMask=np.isin(ids, sharedIDs, assume_unique=False)  # Using ids instead of allIDs 
currIDs=ids[currMask]


# In[35]:


# Set the length cutoff based on the histogram above.
length_cutoff = 600 # Limits the maximum pairs of coordinates for each trajectory. 
nIds = np.size(currIDs)
coordMat = np.empty((nIds,length_cutoff,2)) # 3rd dimension to represent the 2 geospatial coordinates


# In[21]:


i_id=0
trim_count=0
drop_count=0 # Dropped for having length of 1.
for nTraj in range(0,numIds):    # For each element of the big list
    tempID=trip_final["features"][nTraj]["properties"]["id_trip"] # This id from the list
    if sum(np.isin(currIDs,tempID))>0:    # If this ID is on our list
        if trip_final["features"][nTraj]["geometry"] is not None:    # And it is not an empty entry
            tempArray=[]
            tempArray=np.asarray(trip_final["features"][nTraj]["geometry"]["coordinates"][0][:]) # Load data as array instead of as a list.
            if tempArray.size > 2: # Use this to filter out single points
                curr_len = tempArray.size/2
                if(curr_len < length_cutoff):
                    temp_i = int(length_cutoff-curr_len) # To make sure they all end at the same time.
                    # May need further mods to tempArray to make it fit into the matrix as intended.
                    coordMat[int(i_id), temp_i:, :] = tempArray      # Add that slice vector to the coordMat array (to the end for visualization purposes)
                    i_id=i_id+1    
                else:
                      trim_count=trim_count+1
            else: 
                drop_count=drop_count+1


# In[37]:


print(trim_count, drop_count, trim_count/nIds) 


# As we can see above, none of the trajectories are dropped for only having a single value, and only 132 are excluded for being too long (just over 1% of the starting data)

# ## Visualize and export

# ##### Test to see that the operations properly limited our selection to the intended trajectories

# In[23]:


# Set the values for all the plots here. I determined these to be the area of interest for this dataset.
xmin = -74
xmax = -73.4
ymin = 45.3
ymax = 45.8


# In[38]:


fig1, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize=(16, 8))
ax1.scatter(startPts[:,0],startPts[:,1], alpha=0.05)
ax1.axis([xmin, xmax, ymin, ymax]) # Setting the axes like this avoid the zero values in the preallocated empty array.

# fig1, (ax) = plt.subplots(1, figsize=(8, 8))
ax2.scatter(endPts[:,0],endPts[:,1], alpha=0.05)
ax2.axis([xmin, xmax, ymin, ymax]) 

ax1.set_title("Initial point of trajectory")
ax2.set_title("Final point of trajectory")


# ##### Start by plotting a static image inline to be sure the plot looks as intended:

# In[31]:


directory= '/output/' # Set the directory where you would like to export the file.

if not os.path.exists(directory):
    os.makedirs(directory)

fig = plt.figure(frameon=False) 
fig.set_size_inches(20,20)

# To make the content fill the whole figure
ax = plt.Axes(fig, [0., 0., 1., 1.])
ax.set_axis_off()

t=length_cutoff-100 # Set a single t value to slice the multidimensional array.
x1=np.copy(np.squeeze(coordMat[:,t,0])) # With the above note in mind, this may be an exception??
y1=np.copy(np.squeeze(coordMat[:,t,1]))

# Remove the nans from the array
x1 = x1[~np.isnan(x1)]
y1 = y1[~np.isnan(y1)]

# Log colormap
hb = ax.hexbin(x1, y1, gridsize=500, bins='log', cmap='inferno',extent=(xmin, xmax, ymin, ymax))

fig.add_axes(ax)
ax.axis('equal')
plt.axis('off')
ax.axis([xmin, xmax, ymin, ymax]) # Setting the axes like this avoid the zero values in the preallocated empty array.

# save figure as png
png1 = BytesIO()
filename= directory+'testImage_'+ str(t) +'.png'
fig.savefig(filename, format='png', bbox_inches='tight', pad_inches=0)


# ##### Process the coordinate matrix 'coordMat' to create a timeseries.
# Although it would be more elegant to export a gif directly, I haven't had this working to my satisfaction, so the code below exports a sequence of .png files that can be combined in another step or another program (I use Fiji). 
# Note that this is the same code as in the cell above, contained within a for loop that iterates through each element (t) of the dimension corresponding to time.

# In[32]:


directory= '/output/'

fig = plt.figure(frameon=False) 
fig.set_size_inches(20,20)

# To make the content fill the whole figure
ax = plt.Axes(fig, [0., 0., 1., 1.])
ax.set_axis_off()

for t in range(0,length_cutoff):
    ax.clear()
    x1=np.copy(np.squeeze(coordMat[:,t,0])) # With the above note in mind, this may be an exception??
    y1=np.copy(np.squeeze(coordMat[:,t,1]))

    # Remove the nans from the array
    x1 = x1[~np.isnan(x1)]
    y1 = y1[~np.isnan(y1)]

    # Log colormap
    hb = ax.hexbin(x1, y1, gridsize=500, bins='log', cmap='inferno',
                   extent=(xmin, xmax, ymin, ymax))
    
    fig.add_axes(ax)
    ax.axis('equal')
    plt.axis('off')
    ax.axis([xmin, xmax, ymin, ymax]) 
    
    # save figure as png
    png1 = BytesIO()
    filename= directory+'hexbin_trip_final_'+ str(t) +'.png'
    fig.savefig(filename, format='png', bbox_inches='tight', pad_inches=0)
 
    
# If the cell completes as it should, a heatmap of the points contained within the island of Montreal should be plotted inline above. 
# And there you go, now the individual png files can be combined to make an animation.
# 
# A few things to experiment with:
#     - Changing the colormap used: change cmap='inferno' to a different colormap that can be seen described in the matplotlib gallery here:
#             https://matplotlib.org/users/colormaps.html
#     - Changing the number of bins (gridsize=Xxx). While I settled on 500 to be able to make out the seperate streets while still pooling data together in individual tiles, I have also experimented with 100 (more pixelated, but cool looking) and 1000 (finer details, almost beving like a scatter plot)
#     - If you want a direct numerical output of the counts, remove the log (bins='log') argument from hexbin - then you'll only be able to see the densest areas, such as highways.
#     - Different plot types:
#             - If you want to see each data point individually, using a scatter plot instead of hexplot would be the way to go ( suggest setting alpha < 0.5 to add transparency to the points)
#             - Contour plots are a nice way of visualizing the group behaviour, while losing some information about individual points.
#             - You can really test any kind of 2d representation, now that the vectors x1 and y1 are being created for each value of t. A few good references for plot types can be found here:
#                     - Python graph gallery: https://python-graph-gallery.com/
#                     - Seaborn plot gallery: http://seaborn.pydata.org/examples/index.html
#                     - Bokeh plot gallery: https://bokeh.pydata.org/en/latest/docs/gallery.html

# In[ ]: