#!/usr/bin/env python
# coding: utf-8

# In[52]:

import pandas as pd
import numpy as np
import random as rd
import seaborn as sns

rd.seed(42)

df = pd.read_feather("../datasets/attrition.feather")
df.head()


# In[28]:

print(df.shape)
df["Age"].hist()


# # Simple sampling

# In[29]:

df_simple_samp = df.sample(n=70, random_state=42)
df_simple_samp["Age"].hist()


# # Systematic sampling

# Systematic sampling has a problem: if the data has been sorted, or there is some pattern or meaning behind the row order, the resulting sample may not be representative of the whole population. The problem can be solved by shuffling the rows, but then systematic sampling is equivalent to simple random sampling.

# In[30]:

sample_size = 70
pop_size = len(df)
interval = pop_size // sample_size  # keep every interval-th row
df_sys_samp = df.iloc[::interval]
df_sys_samp["Age"].hist()


# # Proportional stratified sampling
#
# * Split the population into subgroups.
# * Use simple random sampling on every subgroup.
#
# The proportions of each category or subgroup are similar between the population and the sample.

# In[31]:

df_strat_samp = df.groupby("Department", observed=False).sample(frac=0.1, random_state=42)
df_strat_samp["Age"].hist()


# In[32]:

df["Department"].value_counts(normalize=True)


# In[33]:

df_strat_samp["Department"].value_counts(normalize=True)


# # Equal counts stratified sampling

# The sampling extracts n rows from each category.

# In[34]:

df_eq_strat_samp = df.groupby("Department", observed=False).sample(n=15, random_state=42)
df_eq_strat_samp["Age"].hist()


# In[35]:

df["Department"].value_counts(normalize=True)


# In[36]:

df_eq_strat_samp["Department"].value_counts(normalize=True)


# # Weighted random sampling

# Specify weights to adjust the relative probability of a row being sampled.

# In[37]:

df_weight = df.copy()  # work on a copy so the weight column is not added to df
condition = df_weight["Department"] == "Sales"
# weight 2 if the condition matches, 1 otherwise => twice the chance of being picked
df_weight["weight"] = np.where(condition, 2, 1)
df_weight = df_weight.sample(frac=0.1, weights="weight", random_state=42)
df_weight["Age"].hist()


# In[38]:

df.value_counts("Department", normalize=True)


# In[39]:

df_weight.value_counts("Department", normalize=True)


# # Cluster sampling
#
# * Use simple random sampling to pick some subgroups.
# * Use simple random sampling on only those subgroups.

# In[45]:

job_roles = list(df["JobRole"].unique())
job_roles_samp = rd.sample(job_roles, k=4)  # randomly pick 4 subgroups (clusters)
condition = df["JobRole"].isin(job_roles_samp)
df_filtered = df[condition].copy()  # copy to avoid SettingWithCopyWarning
df_filtered["JobRole"] = df_filtered["JobRole"].cat.remove_unused_categories()
df_clust_samp = df_filtered.groupby("JobRole").sample(n=10, random_state=42)
df_clust_samp["Age"].hist()


# # Relative error
#
# $$
# RE = 100 \times \frac{|\text{population mean} - \text{sample mean}|}{\text{population mean}}
# $$

# In[50]:

attrition_srs100 = df.sample(n=100, random_state=42)
mean_attrition_srs100 = attrition_srs100["Attrition"].mean()
rel_error_pct100 = 100 * abs(df["Attrition"].mean() - mean_attrition_srs100) / df["Attrition"].mean()
print(rel_error_pct100)


# # Bootstrapping
#
# *Sampling*: going from a population to a smaller sample.
#
# *Bootstrapping*: building up a theoretical population from the sample.
#
# **Process:**
#
# 1. Make a resample of the same size as the original sample.
# 2. Calculate the statistic of interest for this bootstrap sample.
# 3. Repeat steps 1 and 2 many times.
#
# **Bootstrap distribution mean:**
#
# * Usually close to the sample mean.
# * May not be a good estimate of the population mean (for that, a sampling distribution is used instead).
# * Bootstrapping cannot correct biases from sampling.
#
# **Standard error:** standard deviation of the statistic of interest.
#
# * Estimated standard error: the standard deviation of the bootstrap distribution for a sample statistic.
# * standard error * sqrt(sample_size) => population standard deviation (see the check after the next cell).
# * This can be a good estimate of the population standard deviation, because the spread of the bootstrap distribution doesn't suffer as much from sampling bias.

# In[54]:

df_resample = df.sample(frac=1, replace=True)  # a single resample: same size as df, with replacement
means = []
for i in range(1000):
    # resample with replacement and record the statistic of interest (the mean)
    means.append(np.mean(df.sample(frac=1, replace=True)["Age"]))
sns.histplot(means)
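
# As a quick check of the standard error relationship above (a minimal sketch, assuming the `means` list from the previous cell is still in scope and `df` plays the role of the sample): the standard deviation of the bootstrap distribution estimates the standard error of the mean, and multiplying it by sqrt(sample_size) should land near the sample standard deviation of `Age`, which in turn estimates the population standard deviation.

# In[ ]:

std_error = np.std(means, ddof=1)  # std of the bootstrap distribution ~ standard error of the mean
pop_std_estimate = std_error * np.sqrt(len(df))  # standard error * sqrt(sample_size)
print([std_error, pop_std_estimate, np.std(df["Age"], ddof=1)])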
# # Confidence intervals

# In[58]:

sns.histplot(df["Age"])


# Ways to calculate:
#
# 1. Mean plus or minus one standard deviation:

# In[55]:

mean = np.mean(df["Age"])
c1 = mean - np.std(df["Age"], ddof=1)
c2 = mean + np.std(df["Age"], ddof=1)
print([c1, c2])


# 2. Quantile method for confidence intervals:

# In[57]:

q1 = np.quantile(df["Age"], 0.025)
q2 = np.quantile(df["Age"], 0.975)
print([q1, q2])


# 3. Inverse cumulative distribution function - standard error method for confidence intervals:

# In[61]:

from scipy.stats import norm

point_estimate = np.mean(df["Age"])
# here we should use the standard error, i.e. np.std(bootstrap_distribution, ddof=1)
std_error = np.std(df["Age"], ddof=1)
lower = norm.ppf(0.025, loc=point_estimate, scale=std_error)
upper = norm.ppf(0.975, loc=point_estimate, scale=std_error)
print([lower, upper])
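
# The quantile method above runs on `df["Age"]` itself, so it brackets individual ages rather than plausible values of the mean. A minimal sketch of the same method applied to the bootstrap distribution instead (assuming the `means` list from the bootstrapping cell is still in scope):

# In[ ]:

lower_q = np.quantile(means, 0.025)
upper_q = np.quantile(means, 0.975)
print([lower_q, upper_q])  # 95% confidence interval for the mean age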
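
# And a sketch of the standard error method with the bootstrap standard error plugged in, as the comment in the previous cell suggests (again assuming `means` is in scope; `norm` is already imported above):

# In[ ]:

point_estimate_boot = np.mean(means)
std_error_boot = np.std(means, ddof=1)  # standard error from the bootstrap distribution
lower_se = norm.ppf(0.025, loc=point_estimate_boot, scale=std_error_boot)
upper_se = norm.ppf(0.975, loc=point_estimate_boot, scale=std_error_boot)
print([lower_se, upper_se])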