Stratified sampling

In a large dataset, a relatively small group of points might be overplotted by the dominant group. In this case, stratified sampling can help.

In [1]:

import numpy as np
import pandas as pd
from lets_plot import *

# One-time Lets-Plot setup for notebook (HTML) output; run before any plot cell.
# NOTE(review): presumably this injects the Lets-Plot JS library into the page — confirm in the lets_plot docs.
LetsPlot.setup_html()

In [2]:

# Total number of points, split into a tiny group 'A' and a dominant group 'B'.
N = 5000
small_group = 3
large_group = N - small_group

# Fix the seed so the same point cloud is generated on every run.
np.random.seed(123)
data = {
    # x is drawn before y, so the random stream is consumed in a fixed order.
    'x': np.random.normal(0, 1, N),
    'y': np.random.normal(0, 1, N),
    # The first `small_group` points are labelled 'A'; all remaining ones are 'B'.
    'cond': ['A'] * small_group + ['B'] * large_group,
}

In [3]:

# Base plot shared by the cells below: points coloured by group,
# with 'A' mapped to red and 'B' to green.
p = (
    ggplot(data, aes('x', 'y', color='cond'))
    + scale_color_manual(values=["red", "#1C9E77"], breaks=['A', 'B'])
)

# Without sampling, the few points of the small group 'A' are overplotted
# by the dominant group 'B'.
p + geom_point(size=5, alpha=.2)

Out[3]:

In [4]:

# With plain 'random' sampling down to 50 points, the small group 'A'
# is lost altogether.
p + geom_point(
    size=5,
    sampling=sampling_random(50, seed=2),
)

Out[4]:

In [5]:

# Stratified sampling (same sample size and seed) ensures that the small
# group 'A' is still represented.
p + geom_point(
    size=5,
    sampling=sampling_random_stratified(50, seed=2),
)

Out[5]: