Stratified sampling

In a large dataset, a relatively small group of points might be overplotted by the dominant group. In this case, stratified sampling can help.

In [1]:
import numpy as np
import pandas as pd
from lets_plot import *

# Initialise Lets-Plot's HTML output so that the plots below render inline in the notebook.
LetsPlot.setup_html()
In [2]:
# Synthetic dataset: N points in total, of which only `small_group` carry the
# label 'A'; every remaining point carries the dominant label 'B'.
N = 5000
small_group = 3
large_group = N - small_group

# Fix the seed of NumPy's global generator so the coordinates are reproducible.
np.random.seed(123)
data = {
    # Coordinates are drawn from N(0, 1); 'x' is drawn before 'y'.
    'x': np.random.normal(0, 1, N),
    'y': np.random.normal(0, 1, N),
    # The first `small_group` labels are 'A', all the others are 'B'.
    'cond': ['A'] * small_group + ['B'] * large_group,
}
In [3]:
# Base plot reused by the cells below: 'x' against 'y', coloured by 'cond'
# ('A' is drawn in red, 'B' in green).
p = (
    ggplot(data, aes('x', 'y', color='cond'))
    + scale_color_manual(values=["red", "#1C9E77"], breaks=['A', 'B'])
)

# With every point drawn, the small group 'A' is overplotted by the dominant group 'B'.
p + geom_point(size=5, alpha=0.2)
Out[3]:
In [4]:
# Plain 'random' sampling: with this sample size and seed, group 'A' is lost altogether.
p + geom_point(
    size=5,
    sampling=sampling_random(50, seed=2),
)
Out[4]:
In [5]:
# Stratified random sampling, by contrast, keeps group 'A' represented in the sample.
p + geom_point(
    size=5,
    sampling=sampling_random_stratified(50, seed=2),
)
Out[5]: