In a large dataset with groups, the choice of sampling method may depend on the number of groups and the group size.
In this example we consider a line plot where each line corresponds to a group.
import numpy as np
from lets_plot import *
# Configure lets-plot's HTML output once, before any plot is built.
LetsPlot.setup_html()
def data(n_per_line, n_groups):
    """Build a long-format dataset of scaled sine curves, one curve per group.

    All groups share the same ``n_per_line`` x positions, evenly spaced over
    [-pi, pi] with both ends included.  Group ``i`` holds ``sin(x) * m_i``,
    where the multipliers ``m_i`` are ``n_groups`` values evenly spaced over
    [1, 10].  When a count is 1, the single value is the start of the
    corresponding range (x = -pi, multiplier = 1).

    Args:
        n_per_line: Number of points in each line.
        n_groups: Number of lines (groups).

    Returns:
        A dict with three equally long lists (``n_per_line * n_groups``
        items each): ``x``, ``y`` and ``cond``, where ``cond`` is the group
        index as a string.

    Raises:
        ValueError: If either count is negative (raised by ``np.linspace``).
    """
    y_min, y_max = 1, 10

    # linspace returns exactly the requested number of points and includes
    # both endpoints.  A float-step arange needs an epsilon fudge to reach
    # the endpoint, and the step formula (hi - lo) / (n - 1) divides by zero
    # when n == 1.
    x_stops = np.linspace(-np.pi, np.pi, n_per_line)
    y_multiplier = np.linspace(y_min, y_max, n_groups)

    # sin(x) is the same for every group; compute it once.
    sin_x = np.sin(x_stops)

    x = []
    y = []
    c = []
    for i, multiplier in enumerate(y_multiplier):
        x.extend(x_stops)
        y.extend(sin_x * multiplier)
        c.extend([str(i)] * len(x_stops))
    return dict(x=x, y=y, cond=c)
# Base plot shared by every example below: 'x' and 'y' positions,
# with line colour mapped to the group label 'cond'.
p = ggplot(mapping=aes('x','y',color='cond'))
# Case 1: few groups, many points per line (10 lines of 1000 points).
n_per_line, n_groups = 1000, 10
dat = data(n_per_line, n_groups)
# The default 'systematic' sampling is fine in this case.
p + geom_line(data=dat)
# Case 2: many groups, few points per line (1000 lines of 30 points).
n_per_line, n_groups = 30, 1000
dat = data(n_per_line, n_groups)
# The default systematic line sampling doesn't work very well in this case.
p + geom_line(data=dat)
# Random group sampling (parameter 10) works better.
p + geom_line(data=dat, sampling=sampling_group_random(10))
# Systematic group sampling (same parameter, 10) works even better.
p + geom_line(data=dat, sampling=sampling_group_systematic(10))
# Case 3: many groups and many points per line (200 lines of 1000 points).
n_per_line, n_groups = 1000, 200
dat = data(n_per_line, n_groups)
# No explicit `sampling` argument here, for comparison with the plot below.
p + geom_line(data=dat)
# A combination of group sampling and point sampling works:
# the two samplings are chained with '+'.
p + geom_line(data=dat, sampling=sampling_group_systematic(10)+sampling_systematic(200))