## Obtain samples from a set of 10 products using probability sampling to determine the population mean of a particular measure of interest¶

In [10]:
# Import required libraries
import numpy as np
import pandas as pd

# Set random seed
np.random.seed(42)

# Define total number of products
number_of_products = 10

# Create data dictionary
data = {'product_id':np.arange(1, number_of_products+1).tolist(),
'measure':np.round(np.random.normal(loc=10, scale=0.5, size=number_of_products),3)}

# Transform dictionary into a data frame
df = pd.DataFrame(data)

# Store the real mean in a separate variable
real_mean = round(df['measure'].mean(),3)
print(real_mean)
# View data frame
df

10.224

Out[10]:
product_id measure
0 1 10.248
1 2 9.931
2 3 10.324
3 4 10.762
4 5 9.883
5 6 9.883
6 7 10.790
7 8 10.384
8 9 9.765
9 10 10.271

### Simple random sampling method selects random samples from a process or population where every unit has the same probability of getting selected. This is the most direct method of probability sampling.¶

In [12]:
# Obtain simple random sample
# sample (n,axis)
simple_random_sample = df.sample(n=4).sort_values(by='product_id')

# Save the sample mean in a separate variable
simple_random_mean = round(simple_random_sample['measure'].mean(),3)
print(simple_random_mean)
# View sampled data frame
simple_random_sample

9.866

Out[12]:
product_id measure
1 2 9.931
4 5 9.883
5 6 9.883
8 9 9.765

### The systematic sampling method selects units based on a fixed sampling interval (i.e. every nth unit is selected from a given process or population). This sampling method tends to be more effective than the simple random sampling method.¶

In [4]:
# Define systematic sampling function
def systematic_sampling(df, step):

indexes = np.arange(0,len(df),step=step)
systematic_sample = df.iloc[indexes]
return systematic_sample

# Obtain a systematic sample and save it in a new variable
systematic_sample = systematic_sampling(df, 3)

# Save the sample mean in a separate variable
systematic_mean = round(systematic_sample['measure'].mean(),3)

# View sampled data frame
systematic_sample


Out[4]:
product_id measure
0 1 10.248
3 4 10.762
6 7 10.790
9 10 10.271

### The cluster sampling method divides the population in clusters of equal size n and selects clusters every Tth cluster.¶

In [15]:
def cluster_sampling(df, number_of_clusters):

try:
# Divide the units into cluster of equal size
df['cluster_id'] = np.repeat([range(1,number_of_clusters+1)],len(df)/number_of_clusters)
print(df)
# Create an empty list
indexes = []

# Append the indexes from the clusters that meet the criteria
# For this formula, clusters id must be an even number
for i in range(0,len(df)):
if df['cluster_id'].iloc[i]%2 == 0:
indexes.append(i)
print(indexes)
cluster_sample = df.iloc[indexes]
return(cluster_sample)

except:
print("The population cannot be divided into clusters of equal size!")

# Obtain a cluster sample and save it in a new variable
cluster_sample = cluster_sampling(df,5)

# Save the sample mean in a separate variable
cluster_mean = round(cluster_sample['measure'].mean(),3)

# View sampled data frame
cluster_sample

   product_id  measure  cluster_id
0           1   10.248           1
1           2    9.931           1
2           3   10.324           2
3           4   10.762           2
4           5    9.883           3
5           6    9.883           3
6           7   10.790           4
7           8   10.384           4
8           9    9.765           5
9          10   10.271           5
[2, 3, 6, 7]

Out[15]:
product_id measure cluster_id
2 3 10.324 2
3 4 10.762 2
6 7 10.790 4
7 8 10.384 4

### The stratified random sampling method divides the population in subgroups (i.e. strata) and selects random samples where every unit has the same probability of getting selected.¶

In [6]:
# Create data dictionary
data = {'product_id':np.arange(1, number_of_products+1).tolist(),
'product_strata':np.repeat([1,2], number_of_products/2).tolist(),
'measure':np.round(np.random.normal(loc=10, scale=0.5, size=number_of_products),3)}

# Transform dictionary into a data frame
df = pd.DataFrame(data)

# View data frame
df

Out[6]:
product_id product_strata measure
0 1 1 8.780
1 2 1 10.302
2 3 1 9.874
3 4 1 9.918
4 5 1 9.262
5 6 2 10.743
6 7 2 9.988
7 8 2 10.178
8 9 2 10.209
9 10 2 10.416
In [7]:
# Import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedShuffleSplit

# Set the split criteria
split = StratifiedShuffleSplit(n_splits=1, test_size=4)

# Perform data frame split
for x, y in split.split(df, df['product_strata']):
stratified_random_sample = df.iloc[y].sort_values(by='product_id')

# View sampled data frame
stratified_random_sample

# Obtain the sample mean for each group
stratified_random_sample.groupby('product_strata').mean().drop(['product_id'],axis=1)

Out[7]:
measure
product_strata
1 9.327
2 10.476

Once samples have been obtained using each sampling technique, let’s compare the samples means with the population mean (which usually is unknown, but not in this case) to determine the sampling technique that leads to the best approximation of the population measure mean.

In [8]:
# Create a dictionary with the mean outcomes for each sampling method and the real mean
outcomes = {'sample_mean':[simple_random_mean,systematic_mean,cluster_mean],
'real_mean':real_mean}

# Transform dictionary into a data frame
outcomes = pd.DataFrame(outcomes, index=['Simple Random Sampling','Systematic Sampling','Cluster Sampling'])

# Add a value corresponding to the absolute error
outcomes['abs_error'] = abs(outcomes['real_mean'] - outcomes['sample_mean'])

# Sort data frame by absolute error
outcomes.sort_values(by='abs_error')

Out[8]:
sample_mean real_mean abs_error
Simple Random Sampling 10.316 10.224 0.092
Systematic Sampling 10.518 10.224 0.294
Cluster Sampling 10.565 10.224 0.341
In [ ]: