Obtain samples from a set of 10 products using probability sampling to estimate the population mean of a particular measure of interest.

In [10]:
# Import required libraries
import numpy as np
import pandas as pd

# Seed NumPy's global generator so the simulated measurements are reproducible
np.random.seed(42)

# Size of the simulated product population
number_of_products = 10

# One id per product plus one normally distributed measurement
# (mean 10, standard deviation 0.5) rounded to 3 decimals
data = {
    'product_id': list(range(1, number_of_products + 1)),
    'measure': np.random.normal(loc=10, scale=0.5, size=number_of_products).round(3),
}

# Build the population data frame from the dictionary
df = pd.DataFrame(data)

# Population mean of the measure, kept for the final comparison
real_mean = round(df.loc[:, 'measure'].mean(), 3)
print(real_mean)

# View data frame
df
10.224
Out[10]:
product_id measure
0 1 10.248
1 2 9.931
2 3 10.324
3 4 10.762
4 5 9.883
5 6 9.883
6 7 10.790
7 8 10.384
8 9 9.765
9 10 10.271

The simple random sampling method selects random samples from a process or population such that every unit has the same probability of being selected. This is the most direct method of probability sampling.

In [12]:
# Draw a simple random sample of 4 units (rows) from the population and
# order it by product id
simple_random_sample = df.sample(4).sort_values('product_id')

# Keep the sample mean, rounded to 3 decimals, for the later comparison
simple_random_mean = round(simple_random_sample.loc[:, 'measure'].mean(), 3)
print(simple_random_mean)

# View sampled data frame
simple_random_sample
9.866
Out[12]:
product_id measure
1 2 9.931
4 5 9.883
5 6 9.883
8 9 9.765

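The systematic sampling method selects units based on a fixed sampling interval (i.e. every nth unit is selected from a given process or population). This sampling method is simpler to carry out than simple random sampling and tends to be at least as effective, provided the ordering of the units has no periodic pattern that coincides with the sampling interval.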

In [4]:
# Define systematic sampling function
def systematic_sampling(df, step, start=0):
    """Return every ``step``-th row of ``df``, beginning at position ``start``.

    Args:
        df: Population data frame; rows are taken by position, in their
            current order.
        step: Sampling interval. Must be an integer greater than or equal to 1.
        start: Position of the first selected row (defaults to 0, i.e. the
            first row). Must be a non-negative integer.

    Returns:
        A data frame with the rows at positions ``start, start + step, ...``.
        It is empty when ``start`` is beyond the last row.

    Raises:
        TypeError: If ``step`` or ``start`` is not an integer.
        ValueError: If ``step`` is smaller than 1 or ``start`` is negative.
    """
    for name, value in (('step', step), ('start', start)):
        if not isinstance(value, (int, np.integer)):
            raise TypeError(f"{name} must be an integer, got {type(value).__name__}")

    # A zero step makes np.arange fail with an unrelated ZeroDivisionError and
    # a negative one silently yields an empty sample, so reject both up front.
    if step < 1:
        raise ValueError("step must be greater than or equal to 1")
    if start < 0:
        raise ValueError("start must be non-negative")

    # Positions of the selected rows: start, start + step, start + 2 * step, ...
    indexes = np.arange(start, len(df), step=step)
    return df.iloc[indexes]
    
# Take every 3rd unit of the population as the systematic sample
systematic_sample = systematic_sampling(df, step=3)

# Keep the sample mean, rounded to 3 decimals, for the later comparison
systematic_mean = round(systematic_sample.loc[:, 'measure'].mean(), 3)

# View sampled data frame
systematic_sample
 
Out[4]:
product_id measure
0 1 10.248
3 4 10.762
6 7 10.790
9 10 10.271

The cluster sampling method divides the population into clusters of equal size n and selects every Tth cluster (in the code below, every cluster with an even id).

In [15]:
def cluster_sampling(df, number_of_clusters):
    """Select the even-numbered clusters out of equally sized clusters.

    The rows of ``df`` are split, in their current order, into
    ``number_of_clusters`` consecutive clusters of equal size labelled
    ``1..number_of_clusters``. Every row whose cluster id is even is selected.
    The labelled frame and the selected row positions are printed.

    Args:
        df: Population data frame. It is left unmodified; the ``cluster_id``
            column is added to a copy.
        number_of_clusters: Number of clusters to form. Must be positive and
            divide ``len(df)`` exactly.

    Returns:
        A data frame with the rows of the even-numbered clusters, including
        the ``cluster_id`` column, or ``None`` (after printing a message) when
        the population cannot be divided into clusters of equal size.
    """
    # Test divisibility explicitly instead of wrapping the whole body in a
    # bare ``except``, which reported every unrelated error as a sizing problem.
    if number_of_clusters <= 0 or len(df) % number_of_clusters != 0:
        print("The population cannot be divided into clusters of equal size!")
        return None

    # np.repeat needs an integer repeat count, hence floor division.
    cluster_size = len(df) // number_of_clusters
    cluster_ids = np.repeat(np.arange(1, number_of_clusters + 1), cluster_size)

    # Label a copy so the caller's data frame does not gain a column.
    clustered = df.copy()
    clustered['cluster_id'] = cluster_ids
    print(clustered)

    # Row positions belonging to clusters with an even id
    indexes = [position for position, cluster_id in enumerate(cluster_ids)
               if cluster_id % 2 == 0]
    print(indexes)

    return clustered.iloc[indexes]
    
# Split the population into 5 clusters and keep the selected ones
cluster_sample = cluster_sampling(df, number_of_clusters=5)

# Keep the sample mean, rounded to 3 decimals, for the later comparison
cluster_mean = round(cluster_sample.loc[:, 'measure'].mean(), 3)

# View sampled data frame
cluster_sample
   product_id  measure  cluster_id
0           1   10.248           1
1           2    9.931           1
2           3   10.324           2
3           4   10.762           2
4           5    9.883           3
5           6    9.883           3
6           7   10.790           4
7           8   10.384           4
8           9    9.765           5
9          10   10.271           5
[2, 3, 6, 7]
Out[15]:
product_id measure cluster_id
2 3 10.324 2
3 4 10.762 2
6 7 10.790 4
7 8 10.384 4

The stratified random sampling method divides the population into subgroups (i.e. strata) and selects random samples from each stratum, such that every unit within a stratum has the same probability of being selected.

In [6]:
# Create data dictionary: product ids, a stratum label (first half of the
# products in stratum 1, second half in stratum 2) and a fresh set of
# normally distributed measurements (mean 10, standard deviation 0.5).
# np.repeat needs an integer repeat count, so the stratum size is computed
# with floor division; true division would pass a float.
data = {'product_id':np.arange(1, number_of_products+1).tolist(),
       'product_strata':np.repeat([1,2], number_of_products//2).tolist(),
       'measure':np.round(np.random.normal(loc=10, scale=0.5, size=number_of_products),3)}

# Transform dictionary into a data frame
df = pd.DataFrame(data)

# View data frame
df
Out[6]:
product_id product_strata measure
0 1 1 8.780
1 2 1 10.302
2 3 1 9.874
3 4 1 9.918
4 5 1 9.262
5 6 2 10.743
6 7 2 9.988
7 8 2 10.178
8 9 2 10.209
9 10 2 10.416
In [7]:
# StratifiedShuffleSplit produces random splits stratified on the given labels
from sklearn.model_selection import StratifiedShuffleSplit

# A single split whose "test" side holds 4 units
split = StratifiedShuffleSplit(n_splits=1, test_size=4)

# Only the test-side positions of that one split are used as the sample
_, sample_positions = next(split.split(df, df['product_strata']))
stratified_random_sample = df.iloc[sample_positions].sort_values(by='product_id')

# View sampled data frame
stratified_random_sample

# Obtain the sample mean of the measure for each stratum
stratified_random_sample.groupby('product_strata')[['measure']].mean()
Out[7]:
measure
product_strata
1 9.327
2 10.476

Once samples have been obtained using each sampling technique, let’s compare the sample means with the population mean (which is usually unknown, but not in this case) to determine which sampling technique gives the best approximation of the population mean of the measure.

In [8]:
# Put the sample mean of each sampling method next to the real mean,
# one row per method
outcomes = pd.DataFrame(
    {'sample_mean': [simple_random_mean, systematic_mean, cluster_mean],
     'real_mean': real_mean},
    index=['Simple Random Sampling', 'Systematic Sampling', 'Cluster Sampling'],
)

# Absolute difference between the real mean and each sample mean
outcomes['abs_error'] = (outcomes['real_mean'] - outcomes['sample_mean']).abs()

# Show the methods ordered from smallest to largest absolute error
outcomes.sort_values(by='abs_error')
Out[8]:
sample_mean real_mean abs_error
Simple Random Sampling 10.316 10.224 0.092
Systematic Sampling 10.518 10.224 0.294
Cluster Sampling 10.565 10.224 0.341
In [ ]: