"""Bootstrap resampling helper with self-tests and a CSV resampling demo.

Converted from an IPython notebook: the shell escape, pylab-style bare
``arange`` calls and Python 2 ``print`` statements have been replaced with
plain Python that runs on both Python 2.7 and Python 3.
"""
import time

import numpy as np
import pandas as pd


def bootstrap_resample(X, n=None):
    """Bootstrap resample an array_like (draw entries with replacement).

    Parameters
    ----------
    X : array_like
        Data to resample.  Entries are selected by position, so a
        ``pd.Series`` is handled correctly whatever its index labels are.
        The input is not modified.
    n : int, optional
        Length of the resampled array, equal to ``len(X)`` if ``n`` is None.

    Returns
    -------
    X_resample : numpy.ndarray
        ``n`` entries of ``X`` drawn uniformly at random with replacement.
    """
    if n is None:
        n = len(X)

    # Work on a plain ndarray so that indexing below is positional.  Indexing
    # a pd.Series directly with integers is label-based and breaks as soon as
    # the index is not 0..len(X)-1.
    values = np.asarray(X)

    # Kept as floor(rand * len) rather than np.random.randint so that a given
    # np.random.seed() produces the same indices as earlier versions.
    resample_i = np.floor(np.random.rand(n) * len(values)).astype(int)

    # The result is a bare array, not a Series -- presumably so that assigning
    # it to a DataFrame column is positional rather than index-aligned.
    # TODO: write a test demonstrating why returning an array is important
    return values[resample_i]


def test_bsr_shape():
    """Check the length of the resampled array with and without ``n``."""
    # test without resampling length parameter
    X = np.arange(10000)
    X_resample = bootstrap_resample(X)
    assert X_resample.shape == (10000,), 'resampled length should be 10000'

    # test with resampling length parameter
    n = 5000
    X_resample = bootstrap_resample(X, n=n)
    assert X_resample.shape == (n,), 'resampled length should be %d' % n


def test_bsr_mean():
    """Check that the resampled mean is close to the original mean."""
    # set seed so that randomness does not lead to failed test
    np.random.seed(123456)

    X = np.arange(10000)
    X_resample = bootstrap_resample(X, 5000)
    assert abs(X_resample.mean() - X.mean()) / X.mean() < 1e-2, \
        'means should be approximately equal'


def test_bsr_on_df():
    """Check that means are close for a pd.Series with an unusual index."""
    # set seed so that randomness does not lead to failed test
    np.random.seed(123456)

    # Index labels are 0, 10, 20, ... so label-based lookup with positions
    # 0..9999 would miss most entries.
    X = pd.Series(np.arange(10000), index=np.arange(10000) * 10)
    X_resample = bootstrap_resample(X, 5000)
    print('%s %s' % (X_resample.mean(), X.mean()))
    assert abs(X_resample.mean() - X.mean()) / X.mean() < 1e-2, \
        'means should be approximately equal'


def main():
    """Run the demo, the self-tests, and resample the IA_2007 CSV file."""
    # Stand-in for the notebook's `!date` shell escape, which is not Python.
    print(time.ctime())

    X = np.arange(10000)
    X_resample = bootstrap_resample(X, n=5000)
    print('original mean: %s' % X.mean())
    print('resampled mean: %s' % X_resample.mean())

    test_bsr_shape()
    test_bsr_mean()
    test_bsr_on_df()

    # moderately large file, for a single state/year
    df = pd.read_csv('/home/j/Project/Models/network_LoS/IA_2007.csv',
                     index_col=0, low_memory=False)
    print(df.shape)

    # Every column is resampled with its own independent draw of indices.
    # Building the frame from the resampled arrays lets each column keep its
    # own dtype; the DataFrame `dtype` argument accepts only a single dtype.
    resampled = dict((col, bootstrap_resample(df[col])) for col in df.columns)
    df_resampled = pd.DataFrame(resampled, index=df.index, columns=df.columns)

    # `.ix` no longer exists in current pandas; preview the first 10 rows and
    # columns by position instead.
    print(df.iloc[:10, :10])
    print(df_resampled.iloc[:10, :10])

    print('%s %s' % (df.age.mean(), df_resampled.age.mean()))

    df_resampled.to_csv(
        '/home/j/Project/Models/network_LoS/IA_2007_resampled.csv')


if __name__ == '__main__':
    main()