In a large dataset, a relatively small group of points might be overplotted by the dominant group. In this case, stratified sampling can help.
import numpy as np
import pandas as pd
from lets_plot import *

# Initialise Lets-Plot's HTML output before any plot is built.
# NOTE(review): presumably required once per notebook session — confirm
# against the Lets-Plot documentation.
LetsPlot.setup_html()
# Build a synthetic dataset of N points in which group 'A' is tiny
# (small_group points) compared with the dominant group 'B'.
N = 5000
small_group = 3
large_group = N - small_group

# Fixed seed so the example draws the same points on every run.
np.random.seed(123)

data = dict(
    # x is drawn before y; keep this order so the seeded values stay the same.
    x=np.random.normal(0, 1, N),
    y=np.random.normal(0, 1, N),
    # The first `small_group` rows are labelled 'A', all remaining rows 'B'.
    cond=['A'] * small_group + ['B'] * large_group,
)
# Data points in group 'A' (small group) are overplotted by the dominant group 'B'.
# Base plot shared by the examples that follow: x/y positions coloured by
# `cond`, with 'A' mapped to red and 'B' to "#1C9E77".
p = (
    ggplot(data, aes('x', 'y', color='cond'))
    + scale_color_manual(values=["red", "#1C9E77"], breaks=['A', 'B'])
)

# All points, no sampling. The bare expression is the cell's result.
# NOTE(review): presumably rendered by the notebook front end — as a plain
# script this line has no visible effect.
p + geom_point(size=5, alpha=.2)
# The 'random' sampling loses the group 'A' altogether.
# Sampling is requested with n=50 and a fixed seed (2) so the outcome is
# repeatable across runs.
p + geom_point(size=5, sampling=sampling_random(50, seed=2))
# Stratified sampling ensures that group 'A' is represented.
# Same n=50 and seed as the plain random sampling example, so the two
# plots can be compared directly.
p + geom_point(size=5, sampling=sampling_random_stratified(50, seed=2))