This is a workflow I use often in data exploration. UMAP gives a good representation of high-dimensional data, and Plotly is helpful for creating simple interactive plots with contextual info given by colors and tooltips.
Compared to t-SNE, I've found UMAP much faster and capable of handling larger datasets.
This workflow has been extremely helpful for:
- `TfidfVectorizer` or similar from scikit-learn
- `word2vec` or `doc2vec` vectors by passing them to UMAP

This example uses the Australian athletes data set, which contains 11 numeric variables. This workflow is even more helpful on larger datasets with higher dimensionality.
Uniform Manifold Approximation and Projection (UMAP) is a dimension reduction technique that can be used for visualisation similarly to t-SNE, but also for general non-linear dimension reduction.
UMAP: Uniform Manifold Approximation and Projection
Plotly's Python graphing library makes interactive, publication-quality graphs online.
import warnings
import pandas as pd
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from statsmodels.api import datasets
from umap import UMAP
# Set up Plotly's offline mode for notebook output before any figure is drawn.
init_notebook_mode(connected=True)

# Fetch the "ais" dataset from the R package "DAAG" through statsmodels.
ais = datasets.get_rdataset(dataname="ais", package="DAAG")
# The table itself is stored under the "data" entry of the returned object.
data = ais.data
# Notebook display: preview the first rows.
data.head()
rcc | wcc | hc | hg | ferr | bmi | ssf | pcBfat | lbm | ht | wt | sex | sport | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.96 | 7.5 | 37.5 | 12.3 | 60 | 20.56 | 109.1 | 19.75 | 63.32 | 195.9 | 78.9 | f | B_Ball |
1 | 4.41 | 8.3 | 38.2 | 12.7 | 68 | 20.67 | 102.8 | 21.30 | 58.55 | 189.7 | 74.4 | f | B_Ball |
2 | 4.14 | 5.0 | 36.4 | 11.6 | 21 | 21.86 | 104.6 | 19.88 | 55.36 | 177.8 | 69.1 | f | B_Ball |
3 | 4.11 | 5.3 | 37.3 | 12.6 | 69 | 21.88 | 126.4 | 23.66 | 57.18 | 185.0 | 74.9 | f | B_Ball |
4 | 4.45 | 6.8 | 41.5 | 14.0 | 29 | 18.96 | 80.3 | 17.64 | 53.20 | 184.6 | 64.6 | f | B_Ball |
# Keep only the numeric measurement columns for the embedding. Selecting by
# include=["number"] (rather than excluding "object") keeps text columns such
# as "sex" and "sport" out even when pandas stores them under a dedicated
# string dtype instead of object.
data_numeric = data.select_dtypes(include=["number"])

# Fixed random_state so repeated runs use the same seed.
umap = UMAP(random_state=666)

# verbose numba warning with umap-learn==0.3.9
# see: https://github.com/lmcinnes/umap/issues/252
# The filter is scoped to this `with` block, so warnings raised elsewhere in
# the notebook are still shown.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    umap_data = umap.fit_transform(data_numeric)
The easiest/cleanest way to get data into plotly is to put everything you'll need (original data, UMAP values, point colorings/other metadata) into a single data frame.
# Wrap the embedding in a frame that reuses the original row index, then place
# it next to the source columns.
component_names = ["Component 1", "Component 2"]
umap_df = pd.DataFrame(data=umap_data, index=data.index, columns=component_names)
data_all = pd.concat([data, umap_df], axis="columns")

# Give each distinct value of the chosen column an integer code, numbered in
# order of first appearance, and store that code as the marker colour.
category = "sex"
levels = data_all[category].unique()
colormap = dict(zip(levels, range(len(levels))))
data_all["color"] = data_all[category].map(colormap)
# Notebook display: preview the combined frame.
data_all.head()
rcc | wcc | hc | hg | ferr | bmi | ssf | pcBfat | lbm | ht | wt | sex | sport | Component 1 | Component 2 | color | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.96 | 7.5 | 37.5 | 12.3 | 60 | 20.56 | 109.1 | 19.75 | 63.32 | 195.9 | 78.9 | f | B_Ball | 3.492826 | -0.949805 | 0 |
1 | 4.41 | 8.3 | 38.2 | 12.7 | 68 | 20.67 | 102.8 | 21.30 | 58.55 | 189.7 | 74.4 | f | B_Ball | 3.061910 | -1.036136 | 0 |
2 | 4.14 | 5.0 | 36.4 | 11.6 | 21 | 21.86 | 104.6 | 19.88 | 55.36 | 177.8 | 69.1 | f | B_Ball | 4.035181 | 0.821345 | 0 |
3 | 4.11 | 5.3 | 37.3 | 12.6 | 69 | 21.88 | 126.4 | 23.66 | 57.18 | 185.0 | 74.9 | f | B_Ball | 3.397465 | -1.485374 | 0 |
4 | 4.45 | 6.8 | 41.5 | 14.0 | 29 | 18.96 | 80.3 | 17.64 | 53.20 | 184.6 | 64.6 | f | B_Ball | 3.049068 | 1.558317 | 0 |
title = "Australian Athletes Data by Gender - UMAP"

# One hover label per row of the original data: every column rendered as
# "<b>name</b>: value", joined with HTML line breaks.
text = [
    "<br>".join(f"<b>{k}</b>: {v}" for k, v in record.items())
    for _, record in data.iterrows()
]

# Scatter of the two UMAP components; hoverinfo="text" shows only the labels
# built above, and the integer colour codes are mapped through "RdBu".
scatter = go.Scatter(
    x=data_all["Component 1"],
    y=data_all["Component 2"],
    mode="markers",
    text=text,
    hoverinfo="text",
    marker=dict(color=data_all["color"], colorscale="RdBu"),
)
fig_data = [scatter]
layout = dict(hovermode="closest", title=title)
figure = go.Figure(data=fig_data, layout=layout)
iplot(figure)