from os.path import join, dirname
from os import listdir
import numpy as np
import pandas as pd
# GUI library
import panel as pn
import panel.widgets as pnw
# Chart libraries
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, Legend
from bokeh.palettes import Spectral5, Set2
from bokeh.events import SelectionGeometry
# Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler, LabelEncoder
# from umap import UMAP
#
from shapely.geometry import MultiPoint, MultiLineString, Polygon, MultiPolygon, LineString
from shapely.ops import unary_union
from shapely.ops import triangulate
# local scripts
from embedding.Rangeset import Rangeset
from embedding.ProjectionQuality import projection_quality
# Initialise Panel before any widget or pane is created.
pn.extension()
# --- Application settings (read by the plotting and GUI code below) ---
# Name of the dataset; shown in the heading above the embedding.
dataset_name = 'Betterlife'
# Number of bins handed to rangeset.compute_contours when colouring the overview.
bins = 5
# Draw a text label next to every point of the overview.
show_labels = True
# Column that provides the label text; 'index' selects the data frame index.
labels_column = 'index'
# Height of the overview plot; its width is derived from the embedding's extent.
overview_height = 700
# Number of columns of the grid that holds the small-multiple charts.
small_multiples_ncols = 4
# Width of each small-multiple chart and of the histogram shown with it.
histogram_width = 200
# Overlay a plain numpy histogram of the attribute on each histogram chart.
show_numpy_histogram = True
# Value assigned to rangeset.threshold (its meaning is defined in embedding.Rangeset).
rangeset_threshold = 3
# Read the raw export and pivot it into one row per country and one column
# per indicator, using only the rows whose INEQUALITY value is 'TOT'.
df1 = pd.read_csv('data/BLI_30102020171001105.csv')
df = (
    df1[df1.INEQUALITY == 'TOT']
    .groupby(['Country', 'Indicator'])
    .Value.sum()
    .unstack(level=-1)
)
# Missing indicator values are replaced by the mean of their column.
df = df.fillna(df.mean())
# Divide the three monetary indicators by 1000; the first one is stored under
# a shortened column name, the other two are overwritten in place.
df['Household net adj. disposable income'] = df['Household net adjusted disposable income'] / 1000
df['Household net wealth'] = df['Household net wealth'] / 1000
df['Personal earnings'] = df['Personal earnings'] / 1000
# Categorical columns listed in the loop below are replaced by integer codes
# starting at 1; the fitted encoders are kept to translate codes back to labels.
# The list is empty for this dataset.
label_encoders = {}
for var in []:
    encoder = LabelEncoder().fit(df[var])
    label_encoders[var] = encoder
    df.loc[:, var] = encoder.transform(df[var]) + 1
print(list(df))
# Attributes to be included: each one gets a range slider and can be used
# for colouring the embedding.
selected_var = ['Feeling safe walking alone at night',
'Household net adj. disposable income',
'Household net wealth',
'Labour market insecurity',
'Life satisfaction',
'Long-term unemployment rate',
'Personal earnings',
'Quality of support network',
'Self-reported health',
'Student skills',
'Voter turnout']
#selected_var = list(df)
# Maximal slider range and step size per attribute (read by init_slider_values).
# Attributes missing here fall back to the column's min/max and a step of 0.
# {'variable_name': (min,max,stepsize)}
custom_range = {'Life satisfaction': (0,10,.1),
'Feeling safe walking alone at night': (0,100,1),
'Long-term unemployment rate': (0,18,.5),
'Self-reported health': (0,100,1),
'Voter turnout': (0,100,1),
'Household net adj. disposable income': (0, 50, .5),
'Household net wealth': (0, 800, 5),
'Labour market insecurity': (0,30,.5),
'Personal earnings': (0,70, 1),
'Quality of support network': (75, 100, 1),
'Student skills': (350,550,5),
'projection quality': (0,1,0.01)}
# Initial (lower, upper) handle positions of the sliders (read by init_slider_values).
# Attributes missing here start with the handles at the full slider range.
# {'variable_name': (min,max)}
default_range = {'Life satisfaction': (5.3,7.6),
'Feeling safe walking alone at night': (36,90),
'Long-term unemployment rate': (0,8.5),
'Self-reported health': (33,88),
'Voter turnout': (47,91),
'Household net adj. disposable income': (16, 40),
'Household net wealth': (65,560),
'Labour market insecurity': (0.5,15.5),
'Personal earnings': (15,65),
'Quality of support network': (80, 98),
'Student skills': (430,530),
'projection quality': (0.65,1)}
# Which variables to use for the embedding. This is a copy, so appending
# 'projection quality' to selected_var further down does not add it here.
selected_var_embd = selected_var.copy()
# selected_var_embd = []
# Set up the embedding; the commented-out lines are alternative projections.
#embedding = PCA(n_components=2)
embedding = MDS(n_components=2, random_state=42)
#embedding = UMAP(random_state=42)
# Standardise the embedding attributes before projecting them.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[selected_var_embd])
# some projections change the original data, so we make a copy
# this can cost a lot of memory for large data
X = X_scaled.copy()
# pp holds the projected points: column 0 is plotted as x, column 1 as y.
pp = embedding.fit_transform(X)
x_range = pp[:,0].max() - pp[:,0].min()
y_range = pp[:,1].max() - pp[:,1].min()
# Keep the aspect ratio of the projected data; the overview width is capped
# at 1000 and the small-multiple height at 200.
overview_width = min(1000,int(overview_height * x_range / y_range))
histogram_height = min(200,int(histogram_width * y_range / x_range))
# Add the projection quality as an additional (auxiliary) attribute.
df['projection quality'] = projection_quality(X_scaled, pp)
selected_var += ['projection quality']
print('mean projection quality', df['projection quality'].mean())
# The rangeset object computes the contours drawn on top of the embedding.
rangeset = Rangeset(pp, df)
rangeset.threshold = rangeset_threshold
# NOTE(review): presumably marker sizes for points inside / outside a contour -
# confirm against embedding.Rangeset.
rangeset.size_inside = 5
rangeset.size_outside = 13
# Diagnostics only: edge lengths of the minimum spanning tree of the projected
# points. The results are printed and not used elsewhere in the visible script.
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.metrics import pairwise_distances
# Pairwise distances between all projected points, stored as a sparse matrix.
D = csr_matrix(pairwise_distances(pp))
MST = minimum_spanning_tree(D)
# Keep only the non-zero entries (the tree's edge lengths) as a flat array.
MST = MST[MST.nonzero()].A1
print('max edge in MST: {:.2f}'.format(MST.max()))
# Upper quartile plus 1.5 times the inter-quartile range of the edge lengths.
eps = np.quantile(MST, .75) + 1.5*(np.quantile(MST, .75) - np.quantile(MST, .25))
print('Wilkinson epsilon: {:.2f}'.format(eps))
# sorted(MST[MST.nonzero()].A[0])
# --- Overview plot of the embedding ---
TOOLS = "pan,wheel_zoom,box_zoom,box_select,lasso_select,help,reset,save"
overview = figure(
    tools=TOOLS,
    width=overview_width,
    height=overview_height,
    active_drag="lasso_select",
)

# Every projected point as a small dark marker. The renderer is named
# 'points' so that the selection callback can look it up by name.
overview.scatter(
    x=pp[:, 0],
    y=pp[:, 1],
    name='points',
    legend_label='data',
    size=3,
    color="#333333",
    line_color=None,
    muted_alpha=0,
    level='underlay',
)

# Optional text labels, taken from the index or from a data column.
if show_labels:
    if labels_column == 'index':
        labels = df.index.astype(str)
    else:
        labels = df[labels_column].astype(str)
    overview.text(
        x=pp[:, 0],
        y=pp[:, 1],
        text=labels,
        legend_label='labels',
        font_size="10pt",
        x_offset=5,
        y_offset=5,
        text_baseline="middle",
        text_align="left",
        color='#666666',
        muted_alpha=0,
        level='annotation',
    )

# Outline of the user's selection; the same data source is drawn in the
# small-multiple charts, so the outline is shared between all plots.
source_selection = ColumnDataSource({'x': [], 'y': []})
overview.patch(
    source=source_selection,
    x='x',
    y='y',
    fill_color=None,
    line_color='#4d4d4d',
    line_width=2,
    level='annotation',
)

# Legend: clicking an entry mutes the corresponding renderers.
overview.legend.visible = True
overview.legend.location = 'bottom_right'
overview.legend.click_policy = 'mute'
overview.legend.label_height = 1

# Strip all plot decoration: no outline, grid lines, axes or Bokeh logo.
overview.outline_line_color = None
overview.toolbar.logo = None
for _decoration in (overview.xgrid, overview.ygrid, overview.xaxis, overview.yaxis):
    _decoration.visible = False
# Check the embedding with the code below
# pn.Row(overview)
# Small-multiple charts are created upon request.
def _make_chart(var, df_polys, df_scatter, bounds, cnt_in, cnt_out):
    '''Build the small-multiple chart and the histogram chart of one attribute.

    Parameters
    ----------
    var: str
        column of the global data frame ``df`` that is displayed
    df_polys: table with the columns ``xs``, ``ys`` and ``color``
        contour polygons, used as the data source of the polygon glyphs
    df_scatter: pandas.DataFrame with the columns ``x``, ``y``, ``color``, ``size``
        points drawn on top of the contours; the frame is not modified
    bounds: sequence of numbers
        bin edges; bin ``i`` spans ``bounds[i]`` to ``bounds[i + 1]``
    cnt_in: sequence of numbers
        per-bin counts drawn upwards from the zero line
    cnt_out: sequence of numbers
        per-bin counts drawn downwards from the zero line

    Returns
    -------
    tuple (p, p_histo)
        the miniature embedding chart and the histogram chart
    '''
    xvals = df[var].unique()
    # Attributes with fewer than ten distinct values are drawn as categories.
    is_categorical = len(xvals) < 10
    if is_categorical:
        xvals = sorted(xvals.astype(str))

    # --- miniature of the embedding with the attribute's contours ---
    p = figure(width=histogram_width, height=histogram_height, title=var)
    # Shrink the markers in proportion to the chart height. A new frame is
    # used so that the caller's data frame keeps its original sizes.
    scatter_data = df_scatter.assign(
        size=df_scatter['size'] * histogram_height / overview_height)
    p.multi_polygons(source=df_polys, xs='xs', ys='ys', color='color', fill_alpha=0.5, level='image', line_color=None)
    p.scatter(source=scatter_data, x='x', y='y', color='color', size='size', level='overlay')
    # The selection outline is shared with the overview through its data source.
    p.patch(source=source_selection, x='x', y='y', fill_color=None, level='annotation', line_width=1, line_color='#4d4d4d')
    p.xgrid.visible = False
    p.ygrid.visible = False
    p.xaxis.visible = False
    p.yaxis.visible = False
    p.toolbar.logo = None
    p.toolbar_location = None
    p.border_fill_color = '#f0f0f0'

    # --- histogram: cnt_in above the zero line, cnt_out below it ---
    if is_categorical:
        # A categorical axis has to be declared when the figure is created.
        p_histo = figure(height=100, width=histogram_width, name='histo', x_range=xvals)
        p_histo.vbar(x=xvals, top=cnt_in, bottom=0, width=0.9, line_color='white', color=rangeset.colormap)
        p_histo.vbar(x=xvals, top=0, bottom=np.array(cnt_out)*-1, width=0.9, line_color='white', color=rangeset.colormap)
    else:
        p_histo = figure(height=100, width=histogram_width, name='histo')
        p_histo.quad(bottom=[0]*len(cnt_in), top=cnt_in, left=bounds[:-1], right=bounds[1:], line_color='white', color=rangeset.colormap)
        p_histo.quad(bottom=np.array(cnt_out)*(-1), top=[0]*len(cnt_out), left=bounds[:-1], right=bounds[1:], line_color='white', color=rangeset.colormap)
        # Values outside the binned range are marked individually with squares
        # just below the zero line, in the colour of the nearest bin.
        # scatter(marker='square') replaces the deprecated figure.square().
        df_select = df[df[var] < bounds[0]]
        p_histo.scatter(x=df_select[var], y=-.5, marker='square', color=rangeset.colormap[0])
        df_select = df[df[var] > bounds[-1]]
        p_histo.scatter(x=df_select[var], y=-.5, marker='square', color=rangeset.colormap[-1])
    p_histo.toolbar.logo = None
    p_histo.toolbar_location = None
    p_histo.xgrid.visible = False
    p_histo.xaxis.minor_tick_line_color = None
    p_histo.yaxis.minor_tick_line_color = None
    p_histo.outline_line_color = None
    p_histo.border_fill_color = '#f0f0f0'

    # Optional reference: grey histogram of the whole attribute, drawn downwards.
    if show_numpy_histogram:
        if is_categorical:
            frequencies, edges = np.histogram(df[var], bins=len(xvals))
            p_histo.vbar(x=xvals, bottom=0, width=.5, top=frequencies*-1,
                         line_color='white', color='gray', line_alpha=.5, fill_alpha=0.5)
        else:
            frequencies, edges = np.histogram(df[var])
            p_histo.quad(bottom=[0]*len(frequencies), top=frequencies*-1, left=edges[:-1], right=edges[1:],
                         line_color='white', color='gray', line_alpha=.5, fill_alpha=0.5)
    return (p, p_histo)
class MyCheckbox(pnw.Checkbox):
    '''Checkbox that remembers the attribute and the slider it belongs to.

    The callback of the checkbox reads both references from the widget that
    fired the event.
    '''

    # Name of the attribute toggled by this checkbox.
    variable = ""

    def __init__(self, variable="", slider=None, **kwds):
        '''Create the checkbox.

        Parameters
        ----------
        variable: str
            name of the attribute the checkbox stands for
        slider: widget or None
            the range slider shown next to the checkbox
        **kwds:
            forwarded unchanged to ``panel.widgets.Checkbox``
        '''
        super().__init__(**kwds)
        self.slider = slider
        self.variable = variable
def init_slider_values(var):
    '''Return the slider configuration of an attribute.

    The limits default to the minimum and maximum of the column with a step
    of 0 and are replaced by the entry in ``custom_range`` if there is one.
    The initial handle positions come from ``default_range`` and fall back to
    the slider limits.

    Parameters
    ----------
    var: str
        column of the global data frame ``df``

    Returns
    -------
    tuple (vmin, vmax, step, value)
        ``value`` is the pair of initial handle positions
    '''
    column = df[var]
    lower, upper, step = column.min(), column.max(), 0
    if var in custom_range:
        lower, upper, step = custom_range[var]
    value = default_range[var] if var in default_range else (lower, upper)
    return (lower, upper, step, value)
# Create all top-level GUI elements.
# Column of slider rows for the attributes that were used for the embedding.
ranges_embd = pn.Column()
# Column of slider rows for the remaining (auxiliary) attributes.
ranges_aux = pn.Column()
# attribute name -> row holding the attribute's checkbox [0] and range slider [1]
sliders = {}
def create_slider(var):
    '''Create the GUI row of one attribute.

    Parameters
    ----------
    var: str
        name of the attribute

    Returns
    -------
    pn.Row
        an (initially unchecked) checkbox at position 0 and the range slider
        of the attribute at position 1
    '''
    start, end, step, initial = init_slider_values(var)
    range_slider = pnw.RangeSlider(name=var, start=start, end=end, step=step, value=initial)
    toggle = MyCheckbox(name='', variable=var, value=False, width=20, slider=range_slider)
    return pn.Row(toggle, range_slider)
# One slider row per attribute, sorted into the embedding or auxiliary column.
for var in selected_var:
    sliders[var] = create_slider(var)
    container = ranges_embd if var in selected_var_embd else ranges_aux
    container.append(sliders[var])
# Rebuild the attribute list in the order of the slider rows: first the
# embedding attributes, then the auxiliary ones. Element 1 of a row is the
# range slider, whose name is the attribute name.
selected_var = [row[1].name for row in list(ranges_embd) + list(ranges_aux)]
# Colour legend: one swatch per entry of the rangeset colormap, each followed
# by its caption.
_legend_items = []
for _position, _caption in enumerate(("very low", "low", "medium", "high", "very high")):
    _legend_items.append(
        pn.pane.Str(styles={'background': rangeset.colormap[_position]}, height=30, width=20))
    _legend_items.append(_caption)
gui_colormap = pn.Row(*_legend_items, sizing_mode='stretch_width')
# Drop-down that selects the attribute used for colouring the overview.
selectColoring = pn.widgets.Select(name='', options=['None']+selected_var)
# Set up the GUI.
# The top-level row has two children:
#   layout[0]  column with the introduction, the attribute sliders next to
#              the embedding (with the colouring drop-down and colour legend
#              below it), and the data source note
#   layout[1]  grid that receives the small-multiple charts
# Callbacks address parts of this tree by position (e.g. layout[1]), so keep
# the nesting order when editing.
layout = pn.Row(pn.Column(
pn.Row(pn.pane.Markdown('''# NoLiES: The non-linear embedding surveyor\n
NoLiES augments the projected data with additional information. The following interactions are supported:\n
* **Attribute-based coloring** Chose an attribute from the drop-down menu below the embedding to display contours for multiple value ranges.
* **Selective muting**: Click on the legend to mute/hide parts of the chart. Press _labels_ to hide the labels.
* **Contour control** Change the slider range to change the contours.
* **Histograms** Select the check-box next to the slider to view the attribute's histogram.
* **Selection** Use the selection tool to outline a set of points and share this outline across plots.''', sizing_mode='stretch_width'),
margin=(0, 25,0,25)),
pn.Row(
# Left: slider rows, split into embedding and auxiliary attributes.
pn.Column(pn.pane.Markdown('''# Attributes\nEnable histograms with the checkboxes.'''),
'## Embedding',
ranges_embd,
#pn.layout.Divider(),
'## Auxiliary',
ranges_aux, margin=(0, 25, 0, 0)),
# Right: heading, overview plot, and the colouring controls below it.
pn.Column(pn.pane.Markdown('''# Embedding - '''+type(embedding).__name__+''' Dataset - '''+dataset_name, sizing_mode='stretch_width'),
overview,
pn.Row(selectColoring, gui_colormap)
),
margin=(0,25,25,25)
),
#pn.Row(sizing_mode='stretch_height'),
pn.Row(pn.pane.Markdown('''Data source: http://stats.oecd.org/Index.aspx?DataSetCode=BLI''',
width=800), sizing_mode='stretch_width', margin=(0,25,0,25))),
# Initially empty; charts are inserted and removed by the checkbox callback.
pn.GridBox(ncols=small_multiples_ncols, sizing_mode='stretch_both', margin=(220,25,0,0)),
styles={'background': '#efefef'}
)
# Adjust the order of the variables so that it reflects the order of the range sliders (we distinguish between those used for the embedding and the auxiliary ones).
# Check the GUI with the following code - this version is not interactive yet.
# NOTE(review): a bare expression only displays something in a notebook cell;
# when the file is run as a script this line has no effect.
layout
# Callbacks for slider interactions.
# visible[i] tells whether the chart of attribute selected_var[i] is shown.
visible = [False for _ in selected_var]
# attribute name -> position of the attribute in selected_var
mapping = {name: position for position, name in enumerate(selected_var)}
def onSliderChanged(event):
    '''Update the displayed contours after an attribute's range slider changed.

    Parameters
    ----------
    event: param event
        passed in by ``param.watch``; ``event.obj`` is the range slider that
        changed, its ``name`` is the attribute and its ``value`` the new range
    '''
    var = event.obj.name
    v_range = event.obj.value
    # If the changed attribute currently colours the overview, refresh it.
    # The drop-down is asked directly rather than reached by walking the
    # layout tree by position, which breaks as soon as the layout changes.
    if var == selectColoring.value:
        setColoring(var, v_range)
    # Find the small-multiple chart of the attribute (the column is named
    # after it) and replace both of its plots.
    for col in layout[1]:
        if col.name == var:
            df_polys, df_scatter, bounds, cnt_in, cnt_out = rangeset.compute_contours(
                var, v_range, bins=20 if col.name == 'groups' else 5)
            p, histo = _make_chart(var, df_polys, df_scatter, bounds, cnt_in, cnt_out)
            col[0].object = p
            col[1].object = histo
def onSliderChanged_released(event):
    '''Print the final range of a slider once the user has released it.

    The line is formatted like an entry of the ``default_range`` dictionary,
    so it can be pasted there.

    Parameters
    ----------
    event: param event
        ``event.obj`` is the range slider; its ``name`` and ``value`` are printed
    '''
    slider = event.obj
    lower, upper = slider.value[0], slider.value[1]
    print(f"'{slider.name}': ({lower},{upper})")
def onAttributeSelected(event):
    '''Show or hide the small-multiple chart of an attribute.

    Parameters
    ----------
    event: param event
        ``event.obj`` is the checkbox that was toggled; it carries the
        attribute name (``variable``) and the matching range slider (``slider``)
    '''
    checkbox = event.obj
    var = checkbox.variable
    i = mapping[var]
    # The charts in the grid follow the attribute order, so the grid position
    # equals the number of visible charts of the preceding attributes.
    grid_position = sum(visible[:i])
    if checkbox.value:
        contours = rangeset.compute_contours(var, checkbox.slider.value)
        df_polys, df_scatter, bounds, cnt_in, cnt_out = contours
        p, p_histo = _make_chart(var, df_polys, df_scatter, bounds, cnt_in, cnt_out)
        chart = pn.Column(p, pn.panel(p_histo), name=var, margin=5)
        layout[1].insert(grid_position, chart)
    else:
        layout[1].pop(grid_position)
    visible[i] = checkbox.value
# Link the widgets to their callbacks. Each row holds the checkbox at
# position 0 and the range slider at position 1.
for widget_row in sliders.values():
    checkbox, range_slider = widget_row[0], widget_row[1]
    checkbox.param.watch(onAttributeSelected, 'value')
    range_slider.param.watch(onSliderChanged, 'value')
    range_slider.param.watch(onSliderChanged_released, 'value_throttled')
# Callbacks for the rangeset selection in the overview plot.
def clearColoring():
    '''Remove the rangeset augmentation from the embedding.

    Hides the legend and hides and mutes every renderer of the overview whose
    name contains 'poly' or 'scatter'.
    '''
    overview.legend.visible = False
    for renderer in overview.renderers:
        name = renderer.name
        if name is None:
            continue
        if 'poly' in name or 'scatter' in name:
            renderer.visible = False
            renderer.muted = True
def setColoring(var, v_range=None):
    '''Compute and render the rangeset for a selected variable.

    Renderers are created the first time a label is shown and are reused
    afterwards by replacing their data.

    Parameters
    ----------
    var: str
        the selected variable
    v_range: tuple (min,max)
        the user defined value range for the rangeset
    '''
    overview.legend.visible = True
    # Only the polygons and points are needed here; bin edges and counts are unused.
    df_polys, df_scatter, _bounds, _cnt_in, _cnt_out = rangeset.compute_contours(
        var, val_range=v_range, bins=bins)
    # Hide everything drawn by a previous colouring; the renderers that are
    # still needed are switched back on below.
    for r in overview.renderers:
        if r.name is not None and ('poly' in r.name or 'scatter' in r.name):
            r.visible = False
            r.muted = True
    if len(df_polys) == 0:
        return
    for k in list(rangeset.labels.keys())[::-1]:
        label_id = rangeset.color2label(k)
        label = label_id
        # Encoded categorical attributes additionally show the original
        # category; codes start at 1, hence the -1.
        if var in label_encoders:
            label = label_id + ' ' + label_encoders[var].inverse_transform([int(label_id) - 1])[0]

        # contour polygons of this colour
        g = df_polys[df_polys.color == k]
        r = overview.select('poly ' + label)
        if len(r) > 0:
            r[0].visible = True
            r[0].muted = False
            r[0].data_source.data = dict(ColumnDataSource(g).data)
        else:
            overview.multi_polygons(source=g, xs='xs', ys='ys', name='poly ' + label, level='image',
                                    color='color', alpha=.5, legend_label=label,
                                    line_color=None, muted_color='gray', muted_alpha=.1)

        # points of this colour
        g = df_scatter[df_scatter.color == k]
        r = overview.select('scatter ' + label)
        if len(r) > 0:
            r[0].visible = True
            r[0].muted = False
            r[0].data_source.data = dict(ColumnDataSource(g).data)
        else:
            # scatter() draws circle markers by default; it replaces the
            # deprecated circle(size=...) call.
            overview.scatter(source=g, x='x', y='y', size='size', name='scatter ' + label,
                             color='color', alpha=1, legend_label=label,
                             muted_color='gray', muted_alpha=0)
def onChangeColoring(event):
    '''React to a new choice in the colouring drop-down.

    'None' removes the rangeset; any other entry renders the rangeset of that
    attribute using the current range of its slider.

    Parameters
    ----------
    event: param event
        ``event.obj`` is the drop-down whose ``value`` is the chosen entry
    '''
    choice = event.obj.value
    if choice == 'None':
        clearColoring()
        return
    # Element 1 of the attribute's row is its range slider.
    setColoring(choice, sliders[choice][1].value)
# Update the colouring of the overview whenever another entry is chosen in the drop-down.
selectColoring.param.watch( onChangeColoring, 'value' )
# User selection of data points in the overview chart.
def onSelectionChanged(event):
    '''Outline the points selected in the overview.

    The outline is written to ``source_selection``, which is drawn in the
    overview and in every small-multiple chart. It is cleared when nothing is
    selected or when no single closed outline can be derived from the
    selection.

    Parameters
    ----------
    event: bokeh.events.SelectionGeometry
        only the final event of a selection gesture is processed
    '''
    if not event.final:
        return
    selected = list(overview.select('points').data_source.selected.indices)
    sel_pp = pp[selected]
    outline = {'x': [], 'y': []}
    if len(sel_pp) > 0:
        # Triangulate the selected points and keep only the short-edged triangles.
        triangles = [polygon for polygon in triangulate(MultiPoint(sel_pp))
                     if rangeset._max_edge(polygon) < 3]
        # With fewer than three points, or when every triangle is filtered
        # out, there is no area to outline. A union made of several parts
        # (or with holes) has a multi-part boundary, which offers neither
        # parallel_offset nor coords.
        boundary = unary_union(triangles).boundary if triangles else None
        if isinstance(boundary, LineString) and not boundary.is_empty:
            shifted = boundary.parallel_offset(-0.05)
            # The offset itself can come back empty or split into parts.
            if isinstance(shifted, LineString) and not shifted.is_empty:
                xs, ys = shifted.coords.xy
                outline = {'x': xs.tolist(), 'y': ys.tolist()}
    source_selection.data = outline
# Recompute the selection outline whenever a selection is made in the overview.
overview.on_event(SelectionGeometry, onSelectionChanged)
# Mark the layout as servable (e.g. for `panel serve`), with the given title.
layout.servable('NoLies')