In [ ]:
import matplotlib
from datascience import *
%matplotlib inline
import matplotlib.pyplot as plots
import numpy as np
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)


## Chebyshev's Bounds

In [ ]:
# Load the births data set. 'baby.csv' is a relative path, so it is resolved
# against the kernel's current working directory.
births = Table.read_table('baby.csv')

In [ ]:
# Show the column labels of the births table.
births.labels

In [ ]:
# Histogram every column except the one at index 5, drawing a separate plot
# per column (overlay=False).
# NOTE(review): index 5 is presumably a non-numeric column — confirm, and
# consider dropping it by label so the cell survives a column reordering.
births.drop(5).hist(overlay = False)

In [ ]:
# Mean and SD of maternal pregnancy weight; `mean` and `sd` are reused by the
# "within 3 SDs" cells that follow.
mpw = births.column('Maternal Pregnancy Weight')
mean, sd = np.mean(mpw), np.std(mpw)
mean, sd

In [ ]:
# Keep the rows whose pregnancy weight falls in [mean - 3 SD, mean + 3 SD).
lower, upper = mean - 3*sd, mean + 3*sd
within_3_SDs = births.where('Maternal Pregnancy Weight', are.between(lower, upper))

In [ ]:
# Proportion of all rows that lie within 3 SDs of the mean pregnancy weight.
within_3_SDs.num_rows/births.num_rows

In [ ]:
# Chebyshev's lower bound for z = 3: at least 1 - 1/z**2 of the values lie
# within z SDs of the mean, whatever the shape of the distribution.
1 - 1/3**2

In [ ]:
# See if Chebyshev's bounds work for different distributions.
# For every column, print the percentage of rows within z SDs of that
# column's mean, for z = 2, 3, 4, 5.

for k in births.labels:
    values = births.column(k)
    # Use column-specific names: reusing `mean`/`sd` here would overwrite the
    # pregnancy-weight statistics computed earlier and silently change the
    # result of re-running the cells that depend on them.
    col_mean = np.mean(values)
    col_sd = np.std(values)
    print()
    print(k)
    for z in np.arange(2, 6):
        chosen = births.where(k, are.between(col_mean - z*col_sd, col_mean + z*col_sd))
        proportion = chosen.num_rows/births.num_rows
        percent = round(proportion * 100, 2)
        print('Average plus or minus', z, 'SDs:', percent, '%')


## Standard Units

In [ ]:
def standard_units(x):
    """Convert the values in x to standard units.

    Each value is expressed as the number of (population) standard
    deviations it lies above or below the mean of x.

    Parameters
    ----------
    x : array-like of numbers
        A NumPy array, or any sequence NumPy can convert (list, tuple).

    Returns
    -------
    numpy.ndarray
        (x - mean(x)) / SD(x), element-wise. If every value in x is the
        same, the SD is 0 and the result is NaN (NumPy division semantics).
    """
    # asarray leaves ndarrays untouched but lets plain lists/tuples work too;
    # without it `x - np.mean(x)` raises TypeError for a Python list.
    x = np.asarray(x)
    return (x - np.mean(x))/np.std(x)

In [ ]:
# Extract the 'Maternal Age' column of births as an array.
ages = births.column('Maternal Age')

In [ ]:
# Convert the ages to standard units and display the resulting array.
ages_standard_units = standard_units(ages)
ages_standard_units

In [ ]:
# Sanity check: values in standard units should have mean ~0 and SD ~1
# (up to floating-point rounding).
np.mean(ages_standard_units), np.std(ages_standard_units)

In [ ]:
# Side-by-side table: each age in its original units and in standard units.
both = (
    Table()
    .with_column('Age in Years', ages)
    .with_column('Age in Standard Units', ages_standard_units)
)
both

In [ ]:
# Mean and SD of the ages in their original units, for comparison with the
# standard-units values.
np.mean(ages), np.std(ages)

In [ ]:
# Histogram of ages with bins of width 2 and edges at 15, 17, ..., 45.
both.hist('Age in Years', bins = np.arange(15, 46, 2))

In [ ]:
# Same data in standard units: bins of width 0.35 starting at -2.2, with the
# x-axis clipped to [-2, 3.1]. The trailing ';' suppresses the xlim repr.
# NOTE(review): 0.35 is presumably 2 years expressed in SDs, so these bins
# mirror the 2-year bins of the previous histogram — confirm.
both.hist('Age in Standard Units', bins = np.arange(-2.2, 3.4, 0.35))
plots.xlim(-2, 3.1);


## The SD and Bell Shaped Curves

In [ ]:
# Unit-width bins with edges at 56.5, 57.5, ..., so each bin is centered on a
# whole number.
births.hist('Maternal Height', bins = np.arange(56.5, 72.6, 1))

In [ ]:
# Mean and SD of maternal heights, to relate to the histogram above.
heights = births.column('Maternal Height')
np.mean(heights), np.std(heights)

In [ ]:
# Histogram of birth weights using the default bins.
births.hist('Birth Weight')

In [ ]:
# Mean and SD of birth weight.
bw = births.column('Birth Weight')
mean_w, sd_w = np.mean(bw), np.std(bw)
mean_w, sd_w


## The Normal Curve

In [ ]:
# Roulette: 38 pockets.
# A bet on red pays 1 to 1: +1 on 18 of the pockets, -1 on the other 20.
red_winnings = np.concatenate([np.ones(18), -np.ones(20)])
red = Table().with_columns('Winnings on Red', red_winnings)

In [ ]:
# Display the winnings table: one row per pocket.
red.show()

In [ ]:
# Unit-width bins with edges at -1.5, -0.5, 0.5, 1.5, i.e. centered on the
# possible outcomes -1 and +1 (the middle bin is empty).
red.hist(bins = np.arange(-1.5, 1.6, 1))

In [ ]:
18/38  # chance of winning $1 on a single bet placed on red

In [ ]:
num_bets = 100        # spins of the wheel per simulated session, each with a bet on red
num_sessions = 20000  # number of simulated sessions

# Net gain from each session of num_bets bets on red.
# Each session draws num_bets rows at random from `red` and adds up the
# winnings. Building the array in one pass avoids np.append's full copy of
# the growing array on every iteration, and np.sum replaces the slower
# built-in sum over a NumPy array.
net_gains = np.array([
    np.sum(red.sample(num_bets).column('Winnings on Red'))
    for _ in np.arange(num_sessions)
])


In [ ]:
# The histogram of net gains is bell-shaped, even though the distribution of
# a single bet's winnings is far from bell-shaped.
# Tick marks every 10 units from -45 to 35; the trailing ';' hides the repr.
Table().with_columns('Net Gain', net_gains).hist()
plots.xticks(np.arange(-45, 36, 10));

In [ ]:
# Average of the simulated net gains.
np.mean(net_gains)

In [ ]:
# SD of the simulated net gains.
np.std(net_gains)


## Central Limit Theorem and Simulating Sample Mean

In [ ]:
# Load the flight-delay table (relative path, resolved against the kernel's
# working directory) and display it.
united = Table.read_table('united_summer2015.csv')
united

In [ ]:
# Histogram of delays with bins of width 10 and edges from -20 to 290;
# delays beyond the last edge are not drawn.
united.hist('Delay', bins = np.arange(-20, 300, 10))

In [ ]:
# Population mean and SD of the delays.
delays = united.column('Delay')
mean_delay, sd_delay = np.mean(delays), np.std(delays)

mean_delay, sd_delay

In [ ]:
# Add the delays in standard units as a column, then show the table ordered
# from the longest delay down.
delay_su = standard_units(delays)
united = united.with_column('Delay in Standard Units', delay_su)
united.sort('Delay', descending=True)

In [ ]:
# Chebyshev: at least 1 - 1/3**2 (about 89%) of the delays must lie within
# 3 SDs of the mean. Compute the actual proportion for comparison.
chosen = united.where('Delay in Standard Units', are.between(-3, 3))
chosen.num_rows/united.num_rows

In [ ]:
# Redraw the population histogram of delays (same call as earlier in this
# section) as a reference for the sample-mean simulation below.
united.hist('Delay', bins = np.arange(-20, 300, 10))

In [ ]:
# Take a random sample of sample_size flights from the population and record
# its mean delay; repeat to get the empirical distribution of the sample mean.
sample_size = 400
num_samples = 10000  # number of random samples drawn

# Building the array in one pass avoids np.append's full copy of the growing
# array on every iteration.
means = np.array([
    np.mean(united.sample(sample_size).column('Delay'))
    for _ in np.arange(num_samples)
])

In [ ]:
# Empirical distribution of the sample means (20 bins), titled with the
# sample size used; the trailing ';' hides the xlabel repr.
Table().with_columns('Sample Mean', means).hist(bins = 20)
plots.title('Sample Means: Sample Size ' + str(sample_size))
plots.xlabel('Random Sample Mean');

In [ ]:


In [ ]:
# Center and spread of the empirical distribution of sample means.
np.mean(means), np.std(means)

In [ ]:
# Population mean and SD of the delays, for comparison with the values above.
mean_delay, sd_delay

In [ ]:
# Population SD divided by sqrt(sample size): the Central Limit Theorem's
# prediction for the SD of the sample mean; compare with np.std(means).
sd_delay/np.sqrt(sample_size)

In [ ]: