#!/usr/bin/env python # coding: utf-8 # # Evolution of urban patterns: urban morphology as an open reproducible data science # # ## Summary statistics # # This is the second notebook in a sequence of three. The notebook summarise morphometric data obtained in the previous notebook on the basis of historical periods. # # It requires `data/case_studies.csv` input with origins of case studies and data generated by the first notebook. # # Date: May 17, 2021 # # --- # # We start with the import of libraries used in this notebook. # In[1]: import pandas as pd import seaborn as sns from shapely.geometry import Point import geopandas as gpd import matplotlib.pyplot as plt import glob import pathlib from palettable.wesanderson import Moonrise5_6 # Using seaborn we can specify global settings for all matplotlib-based plots. # In[2]: sns.set() # We will need the original input of our case studies as we needed in the first notebook. # In[3]: cases = pd.read_csv("data/case_studies.csv") cases # It will be easier to use `case` column as an index. # In[4]: cases = cases.set_index("case") # Using `glob` library, we get a list of all GeoPackages created in the first notebook. # In[5]: files = glob.glob("data/*gpkg") # We need to generate a single DataFrame with all the results of all case studies combined. Let's prepare an empty DataFrame for that. # In[6]: data = pd.DataFrame() # Now we loop through the list of GeoPackages and: # 1. Read the file. # 2. Select only those geometries within 400 m buffer around the origin. That way we minimise potential edge effect affecting the results. # 3. Store period and case name within the DataFrame and append it together with morphometric data from GPKG to our `data` DataFrame. # In[7]: for f in files: tessellation = gpd.read_file(f, layer="tessellation") case = pathlib.Path(f).stem coords = cases.origin.loc[case] buffer = gpd.GeoSeries([Point(tuple(map(float, coords[1:-1].split(', '))))], crs=4326).to_crs(tessellation.crs).buffer(400) tessellation['period'] = cases.period.loc[case] tessellation['case'] = case casegdf = tessellation[tessellation.centroid.within(buffer.iloc[0])] data = data.append(casegdf.drop(columns=["uID", "nID", "mm_len", "node_start", "node_end", "nodeID", "geometry"])) # Since `period` is an ordered categorical column, let's encode it as such. # In[8]: data['period'] = pd.Categorical(data['period'], categories=['pre-industrial', 'industrial', 'garden city', 'modernist', 'neo-traditional', 'informal'], ordered=True) # Now we can sort the `data` based on the `period` in a way which follows the time. # In[9]: data = data.sort_values('period').reset_index(drop=True) # For our figures, we want `paper` context, `whitegrid` style and `palettable`'s `Moonrise5_6` color palette. # In[10]: sns.set_context("paper") sns.set_style("whitegrid") sns.set_palette(Moonrise5_6.hex_colors) # It is better to rename final labels to nicers ones, so let's prepare a dictionary for that. # In[11]: labels = { "cell_area": "Area of a tessellation cell [m]", "car": "Covered area ratio", "blg_area": "Area of a building footprint [m]", "wall": "Length of a perimeter wall [m]", "adjacency": "Building adjacency", "neighbour_distance": "Mean neighbor distance between buildings [m]", "length": "Length of a street segment [m]", "linearity": "Linearity of a street segment", "width": "Width of a street profile [m]", "width_deviation": "Width deviation of a street profile [m]", "openness": "Openness of a street profile", "meshedness": "Meshedness of a street network", } # Now we loop through all morphometric characters and create box plots for each city, once with normal y axis and once with log y axis. # In[12]: for ch in data.columns.drop(['case', 'period']): plt.figure(figsize=(12, 8)) boxen = sns.boxplot(x="case", y=ch, hue='period', dodge=False, data=data, showfliers=False) boxen.set_xticklabels(boxen.get_xticklabels(), rotation=90) boxen.set_ylabel(labels[ch]) boxen.set_xlabel("Case study") sns.despine() plt.legend(loc="lower left", borderaxespad=0., ncol=6, frameon=False) plt.savefig(f"figures/{ch}_all_normal.png", bbox_inches="tight") plt.close("all") plt.figure(figsize=(12, 8)) boxen = sns.boxplot(x="case", y=ch, hue='period', dodge=False, data=data, showfliers=False) boxen.set_xticklabels(boxen.get_xticklabels(), rotation=90) boxen.set_ylabel(labels[ch]) boxen.set_xlabel("Case study") boxen.set_yscale("log") sns.despine() plt.legend(loc="lower left", borderaxespad=0., ncol=6, frameon=False) plt.savefig(f"figures/{ch}_all_log.png", bbox_inches="tight") plt.close("all") # We can also create box plots for each character grouped by period. # In[13]: for ch in data.columns.drop(['case', 'period']): plt.figure(figsize=(12, 8)) box = sns.boxplot(x="period", y=ch,dodge=False, data=data, showfliers=False) box.set_ylabel(labels[ch]) box.set_xlabel("Historical period") sns.despine() plt.savefig(f"figures/{ch}_grouped_normal.png", bbox_inches="tight") plt.close("all") plt.figure(figsize=(12, 8)) box = sns.boxplot(x="period", y=ch,dodge=False, data=data, showfliers=False) box.set_ylabel(labels[ch]) box.set_xlabel("Historical period") box.set_yscale("log") sns.despine() plt.savefig(f"figures/{ch}_grouped_log.png", bbox_inches="tight") plt.close("all") # To report median values per character and period, we can create a grouped DataFrame. # In[16]: grouper = data.drop(columns='case').groupby('period') medians = grouper.median() medians = medians.rename(columns=labels) med = medians.T.round(2) med # It may also be useful to report interquartile range to illustrate the spread of each distribution. # In[17]: iqr = (grouper.quantile(.75).values - grouper.quantile(.25)) iqr = iqr.rename(columns=labels) iq = iqr.T.round(2) iq # We want to report both in the same table. # In[20]: med.astype('str') + ' (' + iq.astype('str') + ')' # Following cells pick individual boxplots (same we created above) and generate combined figures used in the final paper. # In[67]: fig = plt.figure(figsize=(8, 7), constrained_layout=True) gs = fig.add_gridspec(6, 2) top = fig.add_subplot(gs[0:4, :]) left = fig.add_subplot(gs[4:, 0]) right = fig.add_subplot(gs[4:, 1]) sns.boxplot(x="case", y='neighbour_distance', hue='period', dodge=False, data=data, showfliers=False, ax=top, linewidth=.75) top.set_xticklabels(top.get_xticklabels(), rotation=90) top.set_ylabel(labels['neighbour_distance']) top.set_xlabel("") sns.boxplot(x="period", y='cell_area',dodge=False, data=data, showfliers=False, ax=left, linewidth=.75) left.set_ylabel(labels['cell_area']) left.set_xlabel("Historical period") left.set_xticklabels(left.get_xticklabels(), rotation=90) left.set_yscale("log") sns.boxplot(x="period", y='blg_area',dodge=False, data=data, showfliers=False, ax=right, linewidth=.75) right.set_ylabel(labels['blg_area']) right.set_xlabel("Historical period") right.set_xticklabels(right.get_xticklabels(), rotation=90) right.set_yscale("log") sns.despine() top.legend(loc="upper left", borderaxespad=0., ncol=3, frameon=True) plt.savefig(f"figures/results_1.png", bbox_inches="tight", dpi=300) # In[66]: fig = plt.figure(figsize=(8, 7), constrained_layout=True) gs = fig.add_gridspec(6, 2) top = fig.add_subplot(gs[0:4, :]) left = fig.add_subplot(gs[4:, 0]) right = fig.add_subplot(gs[4:, 1]) sns.boxplot(x="case", y='openness', hue='period', dodge=False, data=data, showfliers=False, ax=top, linewidth=.75) top.set_xticklabels(top.get_xticklabels(), rotation=90) top.set_ylabel(labels['openness']) top.set_xlabel("") sns.boxplot(x="period", y='car',dodge=False, data=data, showfliers=False, ax=left, linewidth=.75) left.set_ylabel(labels['car']) left.set_xlabel("Historical period") left.set_xticklabels(left.get_xticklabels(), rotation=90) left.set_yscale("log") sns.boxplot(x="period", y='width',dodge=False, data=data, showfliers=False, ax=right, linewidth=.75) right.set_ylabel(labels['width']) right.set_xlabel("Historical period") right.set_xticklabels(right.get_xticklabels(), rotation=90) right.set_yscale("log") sns.despine() top.legend(loc="upper left", borderaxespad=0., ncol=3, frameon=True) plt.savefig(f"figures/results_2.png", bbox_inches="tight", dpi=300) # In[72]: fig = plt.figure(figsize=(8, 3), constrained_layout=True) gs = fig.add_gridspec(1, 2) left = fig.add_subplot(gs[0, 0]) right = fig.add_subplot(gs[0, 1]) sns.boxplot(x="period", y='wall',dodge=False, data=data, showfliers=False, ax=left, linewidth=.75) left.set_ylabel(labels['wall']) left.set_xlabel("Historical period") left.set_xticklabels(left.get_xticklabels(), rotation=90) sns.boxplot(x="period", y='adjacency',dodge=False, data=data, showfliers=False, ax=right, linewidth=.75) right.set_ylabel(labels['adjacency']) right.set_xlabel("Historical period") right.set_xticklabels(right.get_xticklabels(), rotation=90) sns.despine() plt.savefig(f"figures/results_3.png", bbox_inches="tight", dpi=300) # In[75]: fig = plt.figure(figsize=(8, 7), constrained_layout=True) gs = fig.add_gridspec(6, 2) top = fig.add_subplot(gs[0:4, :]) left = fig.add_subplot(gs[4:, 0]) right = fig.add_subplot(gs[4:, 1]) sns.boxplot(x="case", y='meshedness', hue='period', dodge=False, data=data, showfliers=False, ax=top, linewidth=.75) top.set_xticklabels(top.get_xticklabels(), rotation=90) top.set_ylabel(labels['meshedness']) top.set_xlabel("") sns.boxplot(x="period", y='linearity',dodge=False, data=data, showfliers=False, ax=left, linewidth=.75) left.set_ylabel(labels['linearity']) left.set_xlabel("Historical period") left.set_xticklabels(left.get_xticklabels(), rotation=90) sns.boxplot(x="period", y='width_deviation',dodge=False, data=data, showfliers=False, ax=right, linewidth=.75) right.set_ylabel(labels['width_deviation']) right.set_xlabel("Historical period") right.set_xticklabels(right.get_xticklabels(), rotation=90) sns.despine() top.legend(loc="upper right", borderaxespad=0., ncol=3, frameon=True) plt.savefig(f"figures/results_4.png", bbox_inches="tight", dpi=300) # ## Statistical comparison # # We also want to compare similarity of distributions statistically, not only visually. We use Kruskal–Wallis one-way analysis of variance and Pairwise Mann–Whitney *U* test for that. # In[10]: from scipy import stats data = data.set_index('case') # In[13]: data = data.rename(columns=labels) # ### Kruskal–Wallis one-way analysis of variance # # Kruskal–Wallis one-way analysis of variance is chosen as a non-parametric equivalent of ANOVA due to skewed distributions of underlying data. # In[14]: kruskal = pd.DataFrame() for col in data.columns.drop("period"): H, p = stats.kruskal(*[data[data.period == per][col] for per in data.period.cat.categories]) kruskal.loc[col, 'H'] = H kruskal.loc[col, 'p'] = p # In[15]: kruskal # The results indicate that the distributions of morphometric values obtained from samples from different historical periods cannot be considered the same. # ### Pairwise Mann–Whitney _U_ test # In[26]: from itertools import product mann_whitney_U = {} mann_whitney_p = {} for col in data.columns.drop("period"): df_u = pd.DataFrame(columns=data.period.cat.categories, index=data.period.cat.categories) df_p = pd.DataFrame(columns=data.period.cat.categories, index=data.period.cat.categories) for a, b in product(data.period.cat.categories, repeat=2): if pd.isna(df_u.loc[b, a]): U, p = stats.mannwhitneyu(data[data.period == a][col], data[data.period == b][col]) df_u.loc[a, b] = U df_p.loc[a, b] = p else: df_u.loc[a, b] = df_u.loc[b, a] df_p.loc[a, b] = df_p.loc[b, a] mann_whitney_U[col] = df_u mann_whitney_p[col] = df_p # In[27]: for col in data.columns.drop("period"): if not (mann_whitney_p[col] > .05).sum().sum() == 6: print(f"{col}") # In three cases, `Building adjacency`, `Linearity of a street segment`, and `Width deviation of a street profile`, Mann–Whitney _U_ is not significant (`p < 0.05`) in all pairwise cases. # In[28]: mann_whitney_p['Building adjacency'] > .05 # In[31]: mann_whitney_p['Building adjacency'].round(3) # In the case of `Building adjacency`, the distributions of values in Garden city and Modernist periods cannot be considered significantly different. # In[21]: mann_whitney_p['Linearity of a street segment'] > .05 # In[32]: mann_whitney_p['Linearity of a street segment'].round(3) # In the case of `Linearity of a street segment`, the distributions of values in Garden city and Modernist periods cannot be considered significantly different. # In[22]: mann_whitney_p['Width deviation of a street profile [m]'] > .05 # In[33]: mann_whitney_p['Width deviation of a street profile [m]'].round(3) # In the case of `Width deviation of a street profile`, the distributions of values in Pre-industrial and Modernist periods cannot be considered significantly different.