#!/usr/bin/env python
# coding: utf-8

# # Evolution of urban patterns: urban morphology as an open reproducible data science
# 
# ## Summary statistics
# 
# This is the second notebook in a sequence of three. The notebook summarise morphometric data obtained in the previous notebook on the basis of historical periods.
# 
# It requires `data/case_studies.csv` input with origins of case studies and data generated by the first notebook.
# 
# Date: May 17, 2021
# 
# ---
# 
# We start with the import of libraries used in this notebook.

# In[1]:


import pandas as pd
import seaborn as sns
from shapely.geometry import Point
import geopandas as gpd
import matplotlib.pyplot as plt
import glob
import pathlib
from palettable.wesanderson import Moonrise5_6


# Using seaborn we can specify global settings for all matplotlib-based plots.

# In[2]:


sns.set()


# We will need the original input of our case studies as we needed in the first notebook.

# In[3]:


cases = pd.read_csv("data/case_studies.csv")
cases


# It will be easier to use `case` column as an index.

# In[4]:


cases = cases.set_index("case")


# Using `glob` library, we get a list of all GeoPackages created in the first notebook.

# In[5]:


files = glob.glob("data/*gpkg")


# We need to generate a single DataFrame with all the results of all case studies combined. Let's prepare an empty DataFrame for that.

# In[6]:


data = pd.DataFrame()


# Now we loop through the list of GeoPackages and:
# 1. Read the file.
# 2. Select only those geometries within 400 m buffer around the origin. That way we minimise potential edge effect affecting the results.
# 3. Store period and case name within the DataFrame and append it together with morphometric data from GPKG to our `data` DataFrame.

# In[7]:


for f in files:
    tessellation = gpd.read_file(f, layer="tessellation")
    case = pathlib.Path(f).stem
    coords = cases.origin.loc[case]
    buffer = gpd.GeoSeries([Point(tuple(map(float, coords[1:-1].split(', '))))], crs=4326).to_crs(tessellation.crs).buffer(400)
    tessellation['period'] = cases.period.loc[case]
    tessellation['case'] = case
    casegdf = tessellation[tessellation.centroid.within(buffer.iloc[0])]
    data = data.append(casegdf.drop(columns=["uID", "nID", "mm_len", "node_start", "node_end", "nodeID", "geometry"]))


# Since `period` is an ordered categorical column, let's encode it as such.

# In[8]:


data['period'] = pd.Categorical(data['period'], categories=['pre-industrial', 'industrial', 'garden city', 'modernist', 'neo-traditional', 'informal'], ordered=True)


# Now we can sort the `data` based on the `period` in a way which follows the time.

# In[9]:


data = data.sort_values('period').reset_index(drop=True)


# For our figures, we want `paper` context, `whitegrid` style and `palettable`'s `Moonrise5_6` color palette.

# In[10]:


sns.set_context("paper")
sns.set_style("whitegrid")
sns.set_palette(Moonrise5_6.hex_colors)


# It is better to rename final labels to nicers ones, so let's prepare a dictionary for that.

# In[11]:


labels = {
    "cell_area": "Area of a tessellation cell [m]",
    "car": "Covered area ratio",
    "blg_area": "Area of a building footprint [m]",
    "wall": "Length of a perimeter wall [m]",
    "adjacency": "Building adjacency",
    "neighbour_distance": "Mean neighbor distance between buildings [m]",
    "length": "Length of a street segment [m]",
    "linearity": "Linearity of a street segment",
    "width": "Width of a street profile [m]",
    "width_deviation": "Width deviation of a street profile [m]",
    "openness": "Openness of a street profile",
    "meshedness": "Meshedness of a street network",
}


# Now we loop through all morphometric characters and create box plots for each city, once with normal y axis and once with log y axis.

# In[12]:


for ch in data.columns.drop(['case', 'period']):
    plt.figure(figsize=(12, 8))
    boxen = sns.boxplot(x="case", y=ch, hue='period', dodge=False, data=data, showfliers=False)
    boxen.set_xticklabels(boxen.get_xticklabels(), rotation=90)
    boxen.set_ylabel(labels[ch])
    boxen.set_xlabel("Case study")
    sns.despine()
    plt.legend(loc="lower left", borderaxespad=0., ncol=6, frameon=False)
    plt.savefig(f"figures/{ch}_all_normal.png", bbox_inches="tight")
    plt.close("all")
    
    plt.figure(figsize=(12, 8))
    boxen = sns.boxplot(x="case", y=ch, hue='period', dodge=False, data=data, showfliers=False)
    boxen.set_xticklabels(boxen.get_xticklabels(), rotation=90)
    boxen.set_ylabel(labels[ch])
    boxen.set_xlabel("Case study")
    boxen.set_yscale("log")
    sns.despine()
    plt.legend(loc="lower left", borderaxespad=0., ncol=6, frameon=False)
    plt.savefig(f"figures/{ch}_all_log.png", bbox_inches="tight")
    plt.close("all")


# We can also create box plots for each character grouped by period.

# In[13]:


for ch in data.columns.drop(['case', 'period']):
    plt.figure(figsize=(12, 8))
    box = sns.boxplot(x="period", y=ch,dodge=False, data=data, showfliers=False)
    box.set_ylabel(labels[ch])
    box.set_xlabel("Historical period")
    sns.despine()
    plt.savefig(f"figures/{ch}_grouped_normal.png", bbox_inches="tight")
    plt.close("all")
    
    plt.figure(figsize=(12, 8))
    box = sns.boxplot(x="period", y=ch,dodge=False, data=data, showfliers=False)
    box.set_ylabel(labels[ch])
    box.set_xlabel("Historical period")
    box.set_yscale("log")
    sns.despine()
    plt.savefig(f"figures/{ch}_grouped_log.png", bbox_inches="tight")
    plt.close("all")


# To report median values per character and period, we can create a grouped DataFrame.

# In[16]:


grouper = data.drop(columns='case').groupby('period')
medians = grouper.median()
medians = medians.rename(columns=labels)
med = medians.T.round(2)
med


# It may also be useful to report interquartile range to illustrate the spread of each distribution.

# In[17]:


iqr = (grouper.quantile(.75).values - grouper.quantile(.25))
iqr = iqr.rename(columns=labels)
iq = iqr.T.round(2)
iq


# We want to report both in the same table.

# In[20]:


med.astype('str') + ' (' + iq.astype('str') + ')'


# Following cells pick individual boxplots (same we created above) and generate combined figures used in the final paper.

# In[67]:


fig = plt.figure(figsize=(8, 7), constrained_layout=True)
gs = fig.add_gridspec(6, 2)
top = fig.add_subplot(gs[0:4, :])
left = fig.add_subplot(gs[4:, 0])
right = fig.add_subplot(gs[4:, 1])

sns.boxplot(x="case", y='neighbour_distance', hue='period', dodge=False, data=data, showfliers=False, ax=top, linewidth=.75)
top.set_xticklabels(top.get_xticklabels(), rotation=90)
top.set_ylabel(labels['neighbour_distance'])
top.set_xlabel("")

sns.boxplot(x="period", y='cell_area',dodge=False, data=data, showfliers=False, ax=left, linewidth=.75)
left.set_ylabel(labels['cell_area'])
left.set_xlabel("Historical period")
left.set_xticklabels(left.get_xticklabels(), rotation=90)
left.set_yscale("log")

sns.boxplot(x="period", y='blg_area',dodge=False, data=data, showfliers=False, ax=right, linewidth=.75)
right.set_ylabel(labels['blg_area'])
right.set_xlabel("Historical period")
right.set_xticklabels(right.get_xticklabels(), rotation=90)
right.set_yscale("log")
sns.despine()

top.legend(loc="upper left", borderaxespad=0., ncol=3, frameon=True)
plt.savefig(f"figures/results_1.png", bbox_inches="tight", dpi=300)


# In[66]:


fig = plt.figure(figsize=(8, 7), constrained_layout=True)
gs = fig.add_gridspec(6, 2)
top = fig.add_subplot(gs[0:4, :])
left = fig.add_subplot(gs[4:, 0])
right = fig.add_subplot(gs[4:, 1])

sns.boxplot(x="case", y='openness', hue='period', dodge=False, data=data, showfliers=False, ax=top, linewidth=.75)
top.set_xticklabels(top.get_xticklabels(), rotation=90)
top.set_ylabel(labels['openness'])
top.set_xlabel("")

sns.boxplot(x="period", y='car',dodge=False, data=data, showfliers=False, ax=left, linewidth=.75)
left.set_ylabel(labels['car'])
left.set_xlabel("Historical period")
left.set_xticklabels(left.get_xticklabels(), rotation=90)
left.set_yscale("log")

sns.boxplot(x="period", y='width',dodge=False, data=data, showfliers=False, ax=right, linewidth=.75)
right.set_ylabel(labels['width'])
right.set_xlabel("Historical period")
right.set_xticklabels(right.get_xticklabels(), rotation=90)
right.set_yscale("log")
sns.despine()

top.legend(loc="upper left", borderaxespad=0., ncol=3, frameon=True)
plt.savefig(f"figures/results_2.png", bbox_inches="tight", dpi=300)


# In[72]:


fig = plt.figure(figsize=(8, 3), constrained_layout=True)
gs = fig.add_gridspec(1, 2)
left = fig.add_subplot(gs[0, 0])
right = fig.add_subplot(gs[0, 1])

sns.boxplot(x="period", y='wall',dodge=False, data=data, showfliers=False, ax=left, linewidth=.75)
left.set_ylabel(labels['wall'])
left.set_xlabel("Historical period")
left.set_xticklabels(left.get_xticklabels(), rotation=90)

sns.boxplot(x="period", y='adjacency',dodge=False, data=data, showfliers=False, ax=right, linewidth=.75)
right.set_ylabel(labels['adjacency'])
right.set_xlabel("Historical period")
right.set_xticklabels(right.get_xticklabels(), rotation=90)
sns.despine()

plt.savefig(f"figures/results_3.png", bbox_inches="tight", dpi=300)


# In[75]:


fig = plt.figure(figsize=(8, 7), constrained_layout=True)
gs = fig.add_gridspec(6, 2)
top = fig.add_subplot(gs[0:4, :])
left = fig.add_subplot(gs[4:, 0])
right = fig.add_subplot(gs[4:, 1])

sns.boxplot(x="case", y='meshedness', hue='period', dodge=False, data=data, showfliers=False, ax=top, linewidth=.75)
top.set_xticklabels(top.get_xticklabels(), rotation=90)
top.set_ylabel(labels['meshedness'])
top.set_xlabel("")

sns.boxplot(x="period", y='linearity',dodge=False, data=data, showfliers=False, ax=left, linewidth=.75)
left.set_ylabel(labels['linearity'])
left.set_xlabel("Historical period")
left.set_xticklabels(left.get_xticklabels(), rotation=90)

sns.boxplot(x="period", y='width_deviation',dodge=False, data=data, showfliers=False, ax=right, linewidth=.75)
right.set_ylabel(labels['width_deviation'])
right.set_xlabel("Historical period")
right.set_xticklabels(right.get_xticklabels(), rotation=90)
sns.despine()

top.legend(loc="upper right", borderaxespad=0., ncol=3, frameon=True)
plt.savefig(f"figures/results_4.png", bbox_inches="tight", dpi=300)


# ## Statistical comparison
# 
# We also want to compare similarity of distributions statistically, not only visually. We use Kruskal–Wallis one-way analysis of variance and Pairwise Mann–Whitney *U* test for that.

# In[10]:


from scipy import stats

data = data.set_index('case')


# In[13]:


data = data.rename(columns=labels)


# ### Kruskal–Wallis one-way analysis of variance
# 
# Kruskal–Wallis one-way analysis of variance is chosen as a non-parametric equivalent of ANOVA due to skewed distributions of underlying data.

# In[14]:


kruskal = pd.DataFrame()
for col in data.columns.drop("period"):
    H, p = stats.kruskal(*[data[data.period == per][col] for per in data.period.cat.categories])
    kruskal.loc[col, 'H'] = H
    kruskal.loc[col, 'p'] = p


# In[15]:


kruskal


# The results indicate that the distributions of morphometric values obtained from samples from different historical periods cannot be considered the same.

# ### Pairwise Mann–Whitney _U_ test

# In[26]:


from itertools import product

mann_whitney_U = {}
mann_whitney_p = {}
for col in data.columns.drop("period"):
    df_u = pd.DataFrame(columns=data.period.cat.categories, index=data.period.cat.categories)
    df_p = pd.DataFrame(columns=data.period.cat.categories, index=data.period.cat.categories)
    for a, b in product(data.period.cat.categories, repeat=2):
        if pd.isna(df_u.loc[b, a]):
            U, p = stats.mannwhitneyu(data[data.period == a][col], data[data.period == b][col])
            df_u.loc[a, b] = U
            df_p.loc[a, b] = p
        else:
            df_u.loc[a, b] = df_u.loc[b, a]
            df_p.loc[a, b] = df_p.loc[b, a]
            
    mann_whitney_U[col] = df_u
    mann_whitney_p[col] = df_p


# In[27]:


for col in data.columns.drop("period"):
    if not (mann_whitney_p[col] > .05).sum().sum() == 6:
        print(f"{col}")


# In three cases, `Building adjacency`, `Linearity of a street segment`, and `Width deviation of a street profile`, Mann–Whitney _U_ is not significant (`p < 0.05`) in all pairwise cases.

# In[28]:


mann_whitney_p['Building adjacency'] > .05


# In[31]:


mann_whitney_p['Building adjacency'].round(3)


# In the case of `Building adjacency`, the distributions of values in Garden city and Modernist periods cannot be considered significantly different. 

# In[21]:


mann_whitney_p['Linearity of a street segment'] > .05


# In[32]:


mann_whitney_p['Linearity of a street segment'].round(3)


# In the case of `Linearity of a street segment`, the distributions of values in Garden city and Modernist periods cannot be considered significantly different.

# In[22]:


mann_whitney_p['Width deviation of a street profile [m]'] > .05


# In[33]:


mann_whitney_p['Width deviation of a street profile [m]'].round(3)


# In the case of `Width deviation of a street profile`, the distributions of values in Pre-industrial and Modernist periods cannot be considered significantly different.