!date

import numpy as np, pandas as pd

def bootstrap_resample(X, n=None):
    """ Bootstrap resample an array_like (sampling with replacement)
    Parameters
    ----------
    X : array_like
      data to resample; always indexed by position, never by label
    n : int, optional
      length of resampled array, equal to len(X) if n is None
    Results
    -------
    returns X_resample: a pandas object when X is a pandas object
    (selected rows keep their original index labels), otherwise an
    ndarray
    """
    if n is None:
        n = len(X)

    # Uniform integer positions in [0, len(X)).  The floor(rand * len)
    # form is kept so the draw sequence for a given np.random seed is
    # unchanged.
    resample_i = np.floor(np.random.rand(n) * len(X)).astype(int)

    if isinstance(X, (pd.Series, pd.DataFrame)):
        # Plain [] on a pandas object looks up index *labels*, which is
        # wrong for any index other than 0..len(X)-1; .iloc is positional.
        return X.iloc[resample_i]

    # Leave ndarrays (and subclasses) untouched; convert lists/tuples so
    # that fancy indexing works on them too.
    values = X if isinstance(X, np.ndarray) else np.asarray(X)
    return values[resample_i]

# Quick sanity check: the mean of a 5000-draw bootstrap sample should be
# close to the mean of the original 0..9999 range.
X = np.arange(10000)  # explicit np.arange; bare arange only exists under pylab
X_resample = bootstrap_resample(X, n=5000)
# %-formatting prints the same text under both Python 2 and Python 3.
print('original mean: %s' % X.mean())
print('resampled mean: %s' % X_resample.mean())

def test_bsr_shape():
    """Check the length of the resampled array with and without n."""
    # test without resampling length parameter
    X = np.arange(10000)
    X_resample = bootstrap_resample(X)
    assert X_resample.shape == (10000,), 'resampled length should be 10000'

    # test with resampling length parameter
    n = 5000
    X_resample = bootstrap_resample(X, n=n)
    assert X_resample.shape == (n,), 'resampled length should be %d' % n
test_bsr_shape()

def test_bsr_mean():
    """Check that the resampled mean is within 1% of the original mean."""
    # test that means are close
    np.random.seed(123456)  # set seed so that randomness does not lead to failed test
    X = np.arange(10000)
    X_resample = bootstrap_resample(X, 5000)
    assert abs(X_resample.mean() - X.mean()) / X.mean() < 1e-2, 'means should be approximately equal'
test_bsr_mean()

def test_bsr_on_df():
    """Check resampling of a pd.Series whose index is not 0..len-1."""
    # test that means are close for pd.Series with unusual index
    # (labels are 0, 10, 20, ... so label lookup and positional lookup differ)
    np.random.seed(123456)  # set seed so that randomness does not lead to failed test
    X = pd.Series(np.arange(10000), index=np.arange(10000)*10)

    X_resample = bootstrap_resample(X, 5000)
    # %-formatting prints the same text under both Python 2 and Python 3.
    print('%s %s' % (X_resample.mean(), X.mean()))
    assert abs(X_resample.mean() - X.mean()) / X.mean() < 1e-2, 'means should be approximately equal'

test_bsr_on_df()

def bootstrap_resample(X, n=None):
    """ Bootstrap resample an array_like (sampling with replacement)
    Parameters
    ----------
    X : array_like
      data to resample; a pd.Series is resampled by position, so its
      index labels are ignored
    n : int, optional
      length of resampled array, equal to len(X) if n is None
    Results
    -------
    returns X_resample, always a plain ndarray
    """
    if n is None:
        n = len(X)

    # Work on the underlying values so lookup is positional regardless of
    # the index; this avoids copying the Series just to rewrite its index.
    values = np.asarray(X)

    # Uniform integer positions in [0, len(X)).  The floor(rand * len)
    # form is kept so the draw sequence for a given np.random seed is
    # unchanged.
    resample_i = np.floor(np.random.rand(n) * len(X)).astype(int)

    # The result is deliberately an ndarray, not a Series: a Series would
    # carry index labels, and pandas aligns on labels when a Series is
    # assigned to a DataFrame column, which would undo the resampling.
    # TODO: write a test demonstrating why returning an array is important
    return values[resample_i]
# Re-run the Series test against the bootstrap_resample defined just above.
test_bsr_on_df()

df = pd.read_csv('/home/j/Project/Models/network_LoS/IA_2007.csv', index_col=0, low_memory=False)

# moderately large file, for a single state/year
# (bare expressions like the one below presumably exist for notebook
# display; they produce no output when run as a plain script)
df.shape

# Empty frame with the same row and column labels as df.  No dtype is
# passed: the constructor takes a single dtype, not one per column, and
# each column takes the dtype of the array assigned to it below.
df_resampled = pd.DataFrame(index=df.index, columns=df.columns)

# NOTE(review): every column gets its own independent draw, so values that
# shared a row in df are no longer paired in df_resampled; only the
# per-column distributions are preserved.  Confirm this is intended.
for col in df.columns:
    df_resampled[col] = bootstrap_resample(df[col])

# First 10 rows and columns by position (.ix has been removed from pandas).
df.iloc[:10, :10]

df_resampled.iloc[:10, :10]

df.age.mean(), df_resampled.age.mean()

df_resampled.to_csv('/home/j/Project/Models/network_LoS/IA_2007_resampled.csv')