Notebook

In [ ]:

from os import listdir
from os.path import join, dirname

import numpy as np
import pandas as pd

# GUI library
import panel as pn
import panel.widgets as pnw

# Chart libraries
from bokeh.events import SelectionGeometry
from bokeh.models import ColumnDataSource, Legend
from bokeh.palettes import Spectral5, Set2
from bokeh.plotting import figure

# Minimum spanning tree of the projected points
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree

# Dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler, LabelEncoder
# from umap import UMAP

# Geometry helpers
from shapely.geometry import MultiPoint, MultiLineString, Polygon, MultiPolygon, LineString
from shapely.ops import unary_union
from shapely.ops import triangulate

# local scripts
from embedding.Rangeset import Rangeset
from embedding.ProjectionQuality import projection_quality

pn.extension()

Parameters¶

In [ ]:

# --- dataset ---------------------------------------------------------------
dataset_name = 'Betterlife'     # shown in the heading above the embedding

# --- rangeset / contours ---------------------------------------------------
bins = 5                        # passed as `bins` when coloring the overview
rangeset_threshold = 3          # assigned to `rangeset.threshold`

# --- point labels in the overview -------------------------------------------
show_labels = True
labels_column = 'index'         # 'index' uses the data frame index, otherwise a column name

# --- chart sizes -------------------------------------------------------------
overview_height = 700
small_multiples_ncols = 4
histogram_width = 200
show_numpy_histogram = True     # overlay a plain np.histogram of the attribute

Load data¶

In [ ]:

# raw export in long format (one row per country / indicator / inequality group)
df1 = pd.read_csv('data/BLI_30102020171001105.csv')

# keep the 'TOT' rows only and pivot to one row per country, one column per indicator
totals = df1[df1.INEQUALITY == 'TOT']
df = totals.groupby(['Country', 'Indicator']).Value.sum().unstack(level=-1)

# indicators missing for a country are replaced by the column mean
df = df.fillna(df.mean())

# express the monetary indicators in thousands; the income column is stored
# under a shortened name, the other two are rescaled in place
df['Household net adj. disposable income'] = df['Household net adjusted disposable income'] / 1000
for column in ('Household net wealth', 'Personal earnings'):
    df[column] = df[column] / 1000

In [ ]:

# Integer-encode categorical columns (codes start at 1) and remember the
# encoder per column. The list of columns is empty here, so nothing is
# encoded and `label_encoders` stays empty.
label_encoders = {}

for var in []:
    encoder = LabelEncoder()
    encoder.fit(df[var])
    label_encoders[var] = encoder
    df.loc[:, var] = encoder.transform(df[var]) + 1

Preprocessing¶

In [ ]:

# list the available attributes (column labels of the pivoted frame)
print(df.columns.tolist())

In [ ]:

# attributes to be included; one range slider is created per entry, in this order
selected_var = [
    'Feeling safe walking alone at night',
    'Household net adj. disposable income',
    'Household net wealth',
    'Labour market insecurity',
    'Life satisfaction',
    'Long-term unemployment rate',
    'Personal earnings',
    'Quality of support network',
    'Self-reported health',
    'Student skills',
    'Voter turnout',
]
#selected_var = list(df)

# maximal slider range and step size
# {'variable_name': (min, max, stepsize)}
custom_range = {
    'Life satisfaction':                    (0, 10, 0.1),
    'Feeling safe walking alone at night':  (0, 100, 1),
    'Long-term unemployment rate':          (0, 18, 0.5),
    'Self-reported health':                 (0, 100, 1),
    'Voter turnout':                        (0, 100, 1),
    'Household net adj. disposable income': (0, 50, 0.5),
    'Household net wealth':                 (0, 800, 5),
    'Labour market insecurity':             (0, 30, 0.5),
    'Personal earnings':                    (0, 70, 1),
    'Quality of support network':           (75, 100, 1),
    'Student skills':                       (350, 550, 5),
    'projection quality':                   (0, 1, 0.01),
}

# initial slider positions
# {'variable_name': (min, max)}
default_range = {
    'Life satisfaction':                    (5.3, 7.6),
    'Feeling safe walking alone at night':  (36, 90),
    'Long-term unemployment rate':          (0, 8.5),
    'Self-reported health':                 (33, 88),
    'Voter turnout':                        (47, 91),
    'Household net adj. disposable income': (16, 40),
    'Household net wealth':                 (65, 560),
    'Labour market insecurity':             (0.5, 15.5),
    'Personal earnings':                    (15, 65),
    'Quality of support network':           (80, 98),
    'Student skills':                       (430, 530),
    'projection quality':                   (0.65, 1),
}

In [ ]:

# which variables to use for the embedding (an independent copy of the list)
selected_var_embd = list(selected_var)
# selected_var_embd = []

# set up embedding; the alternatives are kept for quick switching
#embedding = PCA(n_components=2)
#embedding = UMAP(random_state=42)
embedding = MDS(n_components=2, random_state=42)

# standardise the embedding attributes
scaler = StandardScaler()
embedding_input = df[selected_var_embd]
X_scaled = scaler.fit_transform(embedding_input)

# some projections change the original data, so they are handed a copy;
# this can cost a lot of memory for large data
X = X_scaled.copy()
pp = embedding.fit_transform(X)

In [ ]:

# extent of the projection along both axes (max - min)
x_range = np.ptp(pp[:, 0])
y_range = np.ptp(pp[:, 1])

# keep the aspect ratio of the projected data, capped at 1000 px / 200 px
overview_width = min(1000, int(overview_height * x_range / y_range))
histogram_height = min(200, int(histogram_width * y_range / x_range))

In [ ]:

# add projection quality as an additional attribute; it is not part of
# `selected_var_embd`, so it ends up among the auxiliary sliders
df['projection quality'] = projection_quality(X_scaled, pp)

# Guarded append: re-running this cell must not register the attribute a
# second time (a duplicate entry would later produce a duplicate slider).
if 'projection quality' not in selected_var:
    selected_var.append('projection quality')

print('mean projection quality', df['projection quality'].mean())

In [ ]:

# Rangeset (local module) is built from the 2D projection and the data frame.
# Later cells use its compute_contours(), colormap, labels and color2label().
rangeset = Rangeset(pp, df)
rangeset.threshold = rangeset_threshold
# NOTE(review): presumably marker sizes for points inside / outside their
# contour - confirm against embedding/Rangeset.py
rangeset.size_inside = 5
rangeset.size_outside = 13

In [ ]:

# Edge lengths of the minimum spanning tree over the projected points.
# csr_matrix, minimum_spanning_tree and pairwise_distances are imported in
# the import cell at the top of the notebook.
D = csr_matrix(pairwise_distances(pp))

# `.data` is the flat array of stored edge weights. It replaces
# `MST[MST.nonzero()].A1`, which relies on sparse indexing returning an
# np.matrix (the only type that offers `.A1`).
MST = minimum_spanning_tree(D).data
print(f'max edge in MST: {MST.max():.2f}')

In [ ]:

# upper outlier fence of the MST edge lengths: Q3 + 1.5 * (Q3 - Q1)
q1, q3 = np.quantile(MST, [.25, .75])
eps = q3 + 1.5*(q3 - q1)

In [ ]:

print(f'Wilkinson epsilon: {eps:.2f}')

In [ ]:

# sorted(MST[MST.nonzero()].A[0])

GUI¶

Vis elements¶

overview chart shows a large version of the embedding

In [ ]:

TOOLS = "pan,wheel_zoom,box_zoom,box_select,lasso_select,help,reset,save"
overview = figure(
    tools=TOOLS,
    width=overview_width,
    height=overview_height,
    active_drag="lasso_select",
)

# projected coordinates of all data points
px, py = pp[:, 0], pp[:, 1]

# the raw points; the renderer is named 'points' so that the selection
# callback can look it up via overview.select('points')
overview.scatter(x=px, y=py, size=3, color="#333333", line_color=None,
                 muted_alpha=0, level='underlay', name='points',
                 legend_label='data')

# optional text label next to every point
if show_labels:
    if labels_column == 'index':
        labels = df.index.astype(str)
    else:
        labels = df[labels_column].astype(str)
    overview.text(x=px, y=py, text=labels, legend_label='labels',
                  font_size="10pt", x_offset=5, y_offset=5, muted_alpha=0,
                  text_baseline="middle", text_align="left", color='#666666', level='annotation')

# outline of the current user selection; starts empty and is shared with the
# small multiples
source_selection = ColumnDataSource({'x': [], 'y': []})
overview.patch(source=source_selection, x='x', y='y', fill_color=None,
               line_width=2, line_color='#4d4d4d', level='annotation')

# legend doubles as a mute/unmute switch for the individual layers
overview.legend.location = 'bottom_right'
overview.legend.label_height = 1
overview.legend.click_policy = 'mute'
overview.legend.visible = True

# strip all chart decoration: the embedding axes carry no meaning
overview.outline_line_color = None
for guide in (overview.xgrid, overview.ygrid, overview.xaxis, overview.yaxis):
    guide.visible = False
overview.toolbar.logo = None

In [ ]:

# Check the embedding with the code below

# pn.Row(overview)

small multiples charts are created upon request

In [ ]:

def _make_chart( var, df_polys, df_scatter, bounds, cnt_in, cnt_out ):
    '''Build one small multiple: a miniature rangeset chart plus its histogram.

    Reads the notebook globals ``df``, ``rangeset``, ``source_selection``,
    ``histogram_width``, ``histogram_height``, ``overview_height`` and
    ``show_numpy_histogram``.

    Parameters
    ----------
    var: str
        column of ``df`` that is visualised; also used as chart title
    df_polys: pandas.DataFrame
        contour polygons with the columns 'xs', 'ys' and 'color'
    df_scatter: pandas.DataFrame
        points with the columns 'x', 'y', 'color' and 'size'; the frame that
        is passed in is left unmodified
    bounds: sequence of numbers
        bin edges of the numeric histogram; consecutive entries delimit a bin
    cnt_in, cnt_out: sequence of numbers
        per-bin counts drawn above (``cnt_in``) and below (``cnt_out``) the
        zero line of the histogram

    Returns
    -------
    tuple (figure, figure)
        the miniature chart and the histogram
    '''
    categories = df[var].unique()
    # attributes with fewer than 10 distinct values get a categorical histogram
    is_categorical = len(categories) < 10
    if is_categorical:
        categories = sorted(categories.astype(str))

    p = figure(width=histogram_width, height=histogram_height, title=var)

    # Marker sizes are given for the overview chart; scale them to the small
    # chart. `assign` returns a new frame, so the caller's data is not
    # rescaled as a side effect.
    df_scatter = df_scatter.assign(size=df_scatter['size'] * histogram_height / overview_height)

    p.multi_polygons(source=df_polys, xs='xs', ys='ys', color='color', fill_alpha=0.5, level='image', line_color=None)
    p.scatter(source=df_scatter, x='x', y='y', color='color', size='size', level='overlay')

    # the selection outline is shared with the overview through one data source
    p.patch(source=source_selection, x='x', y='y', fill_color=None, level='annotation', line_width=1, line_color='#4d4d4d')

    p.xgrid.visible = False
    p.ygrid.visible = False
    p.xaxis.visible = False
    p.yaxis.visible = False
    p.toolbar.logo = None
    p.toolbar_location = None
    p.border_fill_color = '#f0f0f0'

    colors = rangeset.colormap
    if is_categorical:
        # one bar per category; counts inside go up, counts outside go down
        p_histo = figure(height=100, width=histogram_width, name='histo', x_range=categories)
        p_histo.vbar(x=categories, top=cnt_in, bottom=0, width=0.9, line_color='white', color=colors)
        p_histo.vbar(x=categories, top=0, bottom=np.array(cnt_out)*-1, width=0.9, line_color='white', color=colors)
    else:
        p_histo = figure(height=100, width=histogram_width, name='histo')
        p_histo.quad(bottom=[0]*len(cnt_in), top=cnt_in, left=bounds[:-1], right=bounds[1:], line_color='white', color=colors)
        p_histo.quad(bottom=np.array(cnt_out)*(-1), top=[0]*len(cnt_out), left=bounds[:-1], right=bounds[1:], line_color='white', color=colors)

        # values below the first / above the last bin edge are marked with
        # squares just under the zero line; scatter(marker='square') is used
        # instead of the deprecated square() method
        below = df[df[var] < bounds[0]]
        p_histo.scatter(x=below[var], y=-.5, marker='square', color=colors[0])
        above = df[df[var] > bounds[-1]]
        p_histo.scatter(x=above[var], y=-.5, marker='square', color=colors[-1])

    p_histo.toolbar.logo = None
    p_histo.toolbar_location = None
    p_histo.xgrid.visible = False
    p_histo.xaxis.minor_tick_line_color = None
    p_histo.yaxis.minor_tick_line_color = None
    p_histo.outline_line_color = None
    p_histo.border_fill_color = '#f0f0f0'

    if show_numpy_histogram:
        # plain histogram of the whole column, drawn downwards in gray
        if is_categorical:
            frequencies, edges = np.histogram(df[var], bins=len(categories))
            p_histo.vbar(x=categories, bottom=0, width=.5, top=frequencies*-1,
                         line_color='white', color='gray', line_alpha=.5, fill_alpha=0.5)
        else:
            frequencies, edges = np.histogram(df[var])
            p_histo.quad(bottom=[0]*len(frequencies), top=frequencies*-1, left=edges[:-1], right=edges[1:],
                         line_color='white', color='gray', line_alpha=.5, fill_alpha=0.5)

    return (p, p_histo)

In [ ]:

class MyCheckbox(pnw.Checkbox):
    '''Checkbox that remembers the attribute and the range slider it belongs to.'''

    # name of the attribute this checkbox toggles
    variable = ""

    def __init__(self, variable="", slider=None, **kwds):
        # the remaining keyword arguments are handled by the Panel checkbox
        super().__init__(**kwds)

        self.variable, self.slider = variable, slider
        
def init_slider_values(var):
    '''Determine bounds, step size and initial value of an attribute's range slider.

    The bounds default to the minimum and maximum found in ``df[var]`` with a
    step of 0; an entry in ``custom_range`` replaces all three. The initial
    value spans the full bounds unless ``default_range`` has an entry.

    Parameters
    ----------
    var: str
        column of the global ``df``

    Returns
    -------
    tuple (vmin, vmax, step, value)
        where ``value`` is itself a (min, max) tuple
    '''
    column = df[var]
    vmin, vmax, step = column.min(), column.max(), 0

    if var in custom_range:
        vmin, vmax, step = custom_range[var]

    value = default_range.get(var, (vmin, vmax))
    return (vmin, vmax, step, value)

Create all toplevel GUI elements

In [ ]:

# one column of slider rows per attribute group ...
ranges_embd, ranges_aux = pn.Column(), pn.Column()

# ... and a lookup of the slider rows by attribute name
sliders = {}

def create_slider(var):
    '''Create the GUI row of one attribute: a checkbox followed by a range slider.

    Parameters
    ----------
    var: str
        attribute name; used as slider name and stored on the checkbox

    Returns
    -------
    pn.Row
        element 0 is the (initially unchecked) checkbox, element 1 the slider
    '''
    start, end, step, initial = init_slider_values(var)
    range_slider = pnw.RangeSlider(name=var, start=start, end=end, step=step, value=initial)

    # the checkbox keeps a reference to its slider so callbacks can read the range
    toggle = MyCheckbox(name='', variable=var, value=False, width=20, slider=range_slider)
    return pn.Row(toggle, range_slider)
    
# create one slider row per attribute and file it under its group
for var in selected_var:
    row = create_slider(var)
    sliders[var] = row

    group = ranges_embd if var in selected_var_embd else ranges_aux
    group.append(row)

# re-order the attribute list to match the GUI: embedding attributes first,
# auxiliary ones after (element 1 of each row is the named range slider)
selected_var = [row[1].name for row in ranges_embd] + [row[1].name for row in ranges_aux]

In [ ]:

# colour legend: one swatch per rangeset colour, each followed by its caption
legend_items = []
for position, caption in enumerate(["very low", "low", "medium", "high", "very high"]):
    swatch = pn.pane.Str(styles={'background': rangeset.colormap[position]}, height=30, width=20)
    legend_items.append(swatch)
    legend_items.append(caption)
gui_colormap = pn.Row(*legend_items, sizing_mode='stretch_width')

# drop-down that selects the attribute used to colour the overview
selectColoring = pn.widgets.Select(name='', options=['None']+selected_var)


# set up the GUI
#
# layout[0] is the left column (help text, attribute sliders + embedding,
# data source note); layout[1] is the GridBox that receives the small
# multiples. Callbacks address widgets by position (for example
# layout[0][1][1][2][0] is `selectColoring` and layout[1] is the GridBox),
# so the nesting below must not be rearranged without updating them.
# NOTE(review): the help text below says "Chose an attribute"; it should
# read "Choose".
layout = pn.Row(pn.Column(
    # layout[0][0]: title and short manual
    pn.Row(pn.pane.Markdown('''# NoLiES: The non-linear embedding surveyor\n
NoLiES augments the projected data with additional information. The following interactions are supported:\n
* **Attribute-based coloring** Chose an attribute from the drop-down menu below the embedding to display contours for multiple value ranges.
* **Selective muting**: Click on the legend to mute/hide parts of the chart. Press _labels_ to hide the labels.
* **Contour control** Change the slider range to change the contours.
* **Histograms** Select the check-box next to the slider to view the attribute's histogram.
* **Selection** Use the selection tool to outline a set of points and share this outline across plots.''', sizing_mode='stretch_width'), 
           margin=(0, 25,0,25)),
    # layout[0][1]: attribute sliders (left) next to the embedding (right)
    pn.Row(
        pn.Column(pn.pane.Markdown('''# Attributes\nEnable histograms with the checkboxes.'''), 
                  '## Embedding',
                  ranges_embd,
                  #pn.layout.Divider(),
                  '## Auxiliary',
                  ranges_aux, margin=(0, 25, 0, 0)),
        # heading names the embedding class and the dataset
        pn.Column(pn.pane.Markdown('''# Embedding - '''+type(embedding).__name__+'''&nbsp;&nbsp;  Dataset - '''+dataset_name, sizing_mode='stretch_width'), 
                  overview, 
                  pn.Row(selectColoring, gui_colormap)
                 ), 
        margin=(0,25,25,25)
        ), 
    #pn.Row(sizing_mode='stretch_height'), 
    # layout[0][2]: data source note
    pn.Row(pn.pane.Markdown('''Data source: http://stats.oecd.org/Index.aspx?DataSetCode=BLI''',
          width=800), sizing_mode='stretch_width', margin=(0,25,0,25))),
    # layout[1]: grid of small multiples, filled by the checkbox callback
    pn.GridBox(ncols=small_multiples_ncols, sizing_mode='stretch_both', margin=(220,25,0,0)),
    styles={'background': '#efefef'}
    )

The order of the variables is adjusted so that it reflects the order of the range sliders: the variables used for the embedding come first, followed by the auxiliary ones.

In [ ]:

# Check the GUI with the following code - this version is not interactive yet
# (the callbacks are only attached in the cells below)

layout

Callbacks¶

Callbacks for slider interactions

In [ ]:

# one flag per attribute (in slider order): is its small multiple currently shown?
visible = [False for _ in selected_var]
# attribute name -> position in `selected_var`
mapping = {name: position for position, name in enumerate(selected_var)}

        
def onSliderChanged(event):
    '''Actions upon attribute slider change.

    Re-colors the overview if the changed attribute is the one selected in
    the coloring drop-down, and rebuilds the attribute's small multiple if
    it is currently shown.

    Parameters
    ----------
    event: param event
        delivered by ``.param.watch``; ``event.obj`` is the range slider whose
        ``name`` is the attribute and whose ``value`` is the (min, max) range
    '''

    var = event.obj.name
    v_range = event.obj.value

    # If the changed variable is currently displayed, refresh the overview.
    # The drop-down is referenced directly instead of by its position inside
    # `layout`, so rearranging the layout cannot silently break this check.
    if var == selectColoring.value:
        setColoring(var, v_range)

    # find the matching chart in the small-multiples grid and update it
    for col in layout[1]:
        if col.name == var:
            df_polys, df_scatter, bounds, cnt_in, cnt_out = rangeset.compute_contours(var, v_range, bins=20 if var == 'groups' else 5)
            p, histo = _make_chart(var, df_polys, df_scatter, bounds, cnt_in, cnt_out)
            # element 0 holds the chart, element 1 the histogram
            col[0].object = p
            col[1].object = histo

def onSliderChanged_released(event):
    '''Log the slider range once the slider is released.

    The line is printed in the ``'name': (min,max)`` form used by the
    ``default_range`` dictionary, so it can be pasted there.

    Parameters
    ----------
    event: param event
        ``event.obj`` is the range slider; its ``name`` and ``value`` are read
    '''
    slider = event.obj
    low, high = slider.value[0], slider.value[1]

    print(f"'{slider.name}': ({low},{high})")


def onAttributeSelected(event):
    '''Show or hide the small multiple of an attribute when its checkbox changes.

    The charts in the grid follow the order of the attributes in
    ``selected_var``; a chart's position is therefore the number of visible
    charts belonging to attributes that come before it.

    Parameters
    ----------
    event: param event
        ``event.obj`` is the checkbox; its ``variable``, ``value`` and
        ``slider`` attributes are read
    '''
    checkbox = event.obj
    var = checkbox.variable
    idx = mapping[var]

    grid = layout[1]
    position = sum(visible[:idx])

    if checkbox.value:
        # checked: build chart and histogram for the slider's current range
        df_polys, df_scatter, bounds, cnt_in, cnt_out = rangeset.compute_contours(var, checkbox.slider.value)
        p, p_histo = _make_chart(var, df_polys, df_scatter, bounds, cnt_in, cnt_out)
        grid.insert(position, pn.Column(p, pn.panel(p_histo), name=var, margin=5))
    else:
        # unchecked: drop the attribute's chart from the grid
        grid.pop(position)

    visible[idx] = checkbox.value

# link widgets to their callbacks; every entry is a Row(checkbox, range slider)
for row in sliders.values():
    checkbox, range_slider = row[0], row[1]
    checkbox.param.watch(onAttributeSelected, 'value')
    range_slider.param.watch(onSliderChanged, 'value')
    range_slider.param.watch(onSliderChanged_released, 'value_throttled')

Callbacks for rangeset selection in the overview plot

In [ ]:

def clearColoring():
    '''Remove rangeset augmentation from the embedding.

    Hides the legend and hides/mutes every overview renderer whose name
    contains 'poly' or 'scatter'; unnamed renderers are left alone.
    '''
    overview.legend.visible = False

    for renderer in overview.renderers:
        name = renderer.name
        if name is None:
            continue
        if 'poly' in name or 'scatter' in name:
            renderer.visible = False
            renderer.muted = True
    
def setColoring(var, v_range=None):
    '''Compute and render the rangeset for a selected variable.

    Renderers are named 'poly <label>' and 'scatter <label>'. Existing ones
    are made visible again and fed with the new data; missing ones are
    created on first use.

    Parameters
    ----------
    var: str
        the selected variable
    v_range: tuple (min,max)
        the user defined value range for the rangeset
    '''
    overview.legend.visible = True

    # only polygons and points are drawn here; bounds and counts are not needed
    df_polys, df_scatter, _bounds, _cnt_in, _cnt_out = rangeset.compute_contours(var, val_range=v_range, bins=bins)

    # hide every rangeset renderer; the ones still needed are re-enabled below
    for r in overview.renderers:
        if r.name is not None and ('poly' in r.name or 'scatter' in r.name):
            r.visible = False
            r.muted = True

    if len(df_polys) == 0:
        return

    # iterate over the colors in reverse order of `rangeset.labels`
    for k in list(rangeset.labels.keys())[::-1]:
        label_id = rangeset.color2label(k)
        label = label_id
        if var in label_encoders:
            # label-encoded variable: append the original category name
            # (the encoded values start at 1, the encoder classes at 0)
            label = label_id + ' ' + label_encoders[var].inverse_transform([int(label_id)-1])[0]

        # contour polygons of this color
        polys = df_polys[df_polys.color == k]
        existing = overview.select('poly '+label)
        if len(existing) > 0:
            existing[0].visible = True
            existing[0].muted = False
            existing[0].data_source.data = dict(ColumnDataSource(polys).data)
        else:
            overview.multi_polygons(source=polys, xs='xs', ys='ys', name='poly '+label, level='image',
                                    color='color', alpha=.5, legend_label=label,
                                    line_color=None, muted_color='gray', muted_alpha=.1)

        # points of this color; scatter() replaces the deprecated
        # circle(size=...) call and matches the other charts in this notebook
        points = df_scatter[df_scatter.color == k]
        existing = overview.select('scatter '+label)
        if len(existing) > 0:
            existing[0].visible = True
            existing[0].muted = False
            existing[0].data_source.data = dict(ColumnDataSource(points).data)
        else:
            overview.scatter(source=points, x='x', y='y', size='size', name='scatter '+label,
                             color='color', alpha=1, legend_label=label,
                             muted_color='gray', muted_alpha=0)

def onChangeColoring(event):
    '''React to a new choice in the coloring drop-down.

    'None' removes the rangeset from the overview; any other entry is
    rendered with the current range of that attribute's slider.

    Parameters
    ----------
    event: param event
        ``event.obj.value`` is the selected drop-down entry
    '''
    var = event.obj.value

    if var == 'None':
        clearColoring()
        return

    # element 1 of the attribute's GUI row is its range slider
    setColoring(var, sliders[var][1].value)
        
# run onChangeColoring whenever the drop-down's value changes
selectColoring.param.watch( onChangeColoring, 'value' )

User selection of data points in the overview chart.

In [ ]:

def onSelectionChanged(event):
    '''Outline the points selected in the overview.

    The outline is written to ``source_selection``, which feeds the patch in
    the overview and in every small multiple. Selections that do not yield
    an area (fewer than three points, collinear points, or no triangle
    passing the edge filter) clear the outline instead of raising inside the
    callback. Disconnected selections produce one outline per part,
    separated by NaN entries.

    Parameters
    ----------
    event: bokeh.events.SelectionGeometry
        only events with ``event.final`` set are processed
    '''
    if not event.final:
        return

    indices = list(overview.select('points').data_source.selected.indices)
    outline_x, outline_y = [], []

    # a triangulation needs at least three points
    if len(indices) >= 3:
        # NOTE(review): the literal 3 equals `rangeset_threshold`; confirm
        # whether the two are meant to stay in sync
        triangles = [tri for tri in triangulate(MultiPoint(pp[indices]))
                     if rangeset._max_edge(tri) < 3]

        if triangles:
            region = unary_union(triangles)
            # a union of triangles is a Polygon or, for disconnected
            # selections, a multi-part geometry
            parts = [region] if region.geom_type == 'Polygon' else list(getattr(region, 'geoms', []))

            for part in parts:
                if part.geom_type != 'Polygon' or part.is_empty:
                    continue
                # offset the exterior ring only; the original offset the
                # boundary, which is the same line for a polygon without
                # holes and cannot be offset otherwise
                ring = LineString(part.exterior.coords).parallel_offset(-0.05)
                if ring.is_empty or ring.geom_type != 'LineString':
                    continue

                xs, ys = ring.coords.xy
                if outline_x:
                    # NaN starts a new, separate area within the same patch
                    outline_x.append(float('nan'))
                    outline_y.append(float('nan'))
                outline_x.extend(xs.tolist())
                outline_y.extend(ys.tolist())

    source_selection.data = dict({'x': outline_x, 'y': outline_y})

# call onSelectionChanged for SelectionGeometry events of the overview
# (the figure offers the box_select and lasso_select tools)
overview.on_event(SelectionGeometry, onSelectionChanged)

In [ ]:

# mark the layout as servable (e.g. for `panel serve`); 'NoLies' is passed as the title
layout.servable('NoLies')

Parameters¶

Load data¶

Preprocessing¶

GUI¶

Vis elements¶

Create input widget (buttons, sliders, etc) and layout¶

Callbacks¶