#!/usr/bin/env python
# coding: utf-8

# # Chapter 6: Practice Exercises with Solutions
# ---

# In[1]:


import numpy as np
from scipy.stats import norm


# **Exercise 1:** A local news outlet reported that 46% of 600 randomly sampled Kansas residents planned to set off fireworks on New Year's Eve. Determine the margin of error for the 46% point estimate using a 95% confidence level.

# In[2]:


# Exercise 1 solution: margin of error for a single sample proportion at a
# 95% confidence level (z* = 1.96), preceded by the condition checks.
sample_share, sample_size = 0.46, 600
expected_successes = sample_share*sample_size
expected_failures = (1 - sample_share)*sample_size

# Each element is one printed line; the leading '' yields the opening blank line.
condition_lines = [
    '',
    '    With a random sample, independence is satisfied. ',
    f'    The success-failure condition is also satisfied: {expected_successes} and {expected_failures} are both greater than 10.',
    '    Hence, the margin of error can be calculated as follows:',
]
print('\n'.join(condition_lines))

# ME = z* x SE, where SE = sqrt(p_hat x (1 - p_hat) / n).
standard_error = np.sqrt((0.46*0.54)/600)
margin_of_error = 1.96*standard_error
print('\n    ME = 1.96 x ((0.46 x 0.54)/600)^(1/2) =', round(margin_of_error, 2))


# ---
# **Exercise 2:** A market researcher wants to evaluate car insurance savings at a competing company. Based on past studies he is assuming that the standard deviation of savings is AUD 100. He wants to collect data such that he can get a margin of error of no more than AUD 10 at a 95% confidence level. How large of a sample should he collect?

# In[3]:


# Exercise 2 solution: smallest sample size that keeps the margin of error
# within the target at a 95% confidence level.
z_star = 1.96      # critical value used for a 95% confidence level
sigma = 100        # assumed standard deviation of savings (AUD)
max_margin = 10    # largest acceptable margin of error (AUD)

# ME = z x sigma / sqrt(n) <= max_margin  <=>  n >= (z x sigma / max_margin)^2
min_sample_size = ((z_star*sigma)/max_margin)**2
# A sample size is a whole number and rounding down would exceed the target
# margin, so always round up.
required_n = int(np.ceil(min_sample_size))

print(f'''
    ME = critical value X standard error = z x sigma/(square root of n)
    
    {max_margin} >= {z_star} x ({sigma} / n^(1/2)) ---> n >= (({z_star} x {sigma}) / {max_margin})^2 = {round(min_sample_size, 2)}
    
    He should survey at least {required_n} customers. Note that we need to round up the calculated sample size.''')


# ---
# **Exercise 3:** Some people claim that they can tell the difference between a diet soda and a regular soda in the first sip. A researcher wanting to test this claim randomly sampled 80 such people. He then filled 80 plain white cups with soda, half diet and half regular through random assignment, and asked each person to take one sip from their cup and identify the soda as diet or regular. 53 participants correctly identified the soda. Construct a hypothesis test using $\alpha = 0.05$.

# In[4]:


# Exercise 3 solution: two-sided one-proportion z-test of whether people can
# tell diet from regular soda, laid out with the
# Prepare / Check / Calculate / Conclude framework.
print('''Prepare Check Calculate Conclude Framework
''')

print('''PREPARE
The parameter of interest: The proportion of people who can tell the difference between a diet soda and 
a regular soda in the first sip.
Set up hypotheses: 
Ho: p = 0.50 (Results are equivalent to randomly guessing)
Ha: p != 0.50 (Results are different than just randomly guessing)
Significance level (alpha): 5%
''')
p0 = 0.50        # proportion of correct answers under the null (pure guessing)
p_hat = 53/80    # observed proportion of correct identifications
n = 80           # number of participants
alpha = 0.05     # significance level

# The printed arithmetic is interpolated from n and p0 so it cannot drift
# from the values actually used in the computation.
print(f'''CHECK
(1) Independence: The sample is random, therefore whether or not one person in the sample
can identify a soda correctly is independent of another.
(2) Success-failure: n*p0 = {n}*({p0:.2f}) = {round(n*p0, 1)} and n*(1-p0) = {n}*({1 - p0:.2f}) = {round(n*(1-p0), 1)} are both >= 10, 
hence the success-failure conditions are satisfied.
''')

# The standard error uses the null value p0 because this is a hypothesis test.
se = np.sqrt(p0*(1 - p0)/n)
z_stat = (p_hat - p0)/se
z_score = round(z_stat, 4)  # rounded for display only
# norm.sf is the upper-tail area (1 - cdf); it is evaluated on the unrounded
# statistic so display rounding does not leak into the p-value.
one_tail = norm.sf(np.abs(z_stat))
p_value = round(2*one_tail, 4)
print(f'''CALCULATE
The Z-score is {z_score}. The Z score's one-tail area: {round(one_tail, 4)}, yet this is a two-sided hypothesis testing. 
Thus, the p-value is twice this one-tail area: {p_value}
''')

# The reported p-value and alpha are taken from the computation above rather
# than typed in by hand.
print(f'''CONCLUDE
Because the p-value of {p_value:.2%} is smaller than alpha = {alpha:.0%}, we do REJECT the null hypothesis and 
conclude people are better than random guessing and the proportion of correctly identifying a soda for 
these people is significantly better than just by random guessing.
''')


# ---
# **Exercise 4:** According to a report on sleep deprivation by the Centers for Disease Control and Prevention, the proportion of California residents who reported insufficient rest or sleep during each of the preceding 30 days is 8.0%, while this proportion is 8.8% for Oregon residents. These data are based on simple random samples of 11,545 California and 4,691 Oregon residents. Calculate a 95% confidence interval for the difference between the proportions of California and Oregon residents who are sleep deprived and interpret it in context of the data.

# In[5]:


# Exercise 4 solution: 95% confidence interval for the difference between two
# independent sample proportions (California minus Oregon).
p_hat_CAL, nCAL = 0.08, 11545
p_hat_ORE, nORE = 0.088, 4691

# Expected counts for the success-failure check, rounded for display.
cal_yes = round(nCAL*p_hat_CAL, 1)
cal_no = round(nCAL*(1-p_hat_CAL), 1)
ore_yes = round(nORE*p_hat_ORE, 1)
ore_no = round(nORE*(1-p_hat_ORE), 1)

# One list element per printed line; the empty first and last elements give
# the leading and trailing blank lines.
condition_lines = [
    '',
    '(1) Independence: Both samples are random, and 11,545 < 10% of all California residents and 4,691 <',
    '10% of all Oregon residents, therefore how much one California resident sleeps is independent of how much',
    'another California resident sleeps and how much one Oregon resident sleeps is independent of how much',
    'another Oregon resident sleeps. In addition, the two samples are independent of each other.',
    '',
    '(2) Success-failure: ',
    f'    nCAL*p_hat_CAL = {cal_yes} and nCAL*(1-p_hat_CAL) = {cal_no} ',
    '    are both >= 10',
    f'    nORE*p_hat_ORE = {ore_yes} and nORE*(1-p_hat_ORE) = {ore_no} ',
    '    are both >= 10',
    'hence the success-failure conditions are satisfied.',
    '',
]
print('\n'.join(condition_lines))

print(
    'Since the observations are independent and the success-failure condition is met, \n'
    'p_hat_CAL - p_hat_ORE is expected to be approximately normal.\n'
    '\n'
    'A 95% confidence interval for the difference between \n'
    'the population proportions can be calculated as follows:'
)

# z* for a two-sided 95% interval, rounded to three decimals.
critical_value = round(norm.ppf(0.975), 3)
# Unpooled standard error: each group contributes its own variance term.
variance_CAL = p_hat_CAL*(1-p_hat_CAL)/nCAL
variance_ORE = p_hat_ORE*(1-p_hat_ORE)/nORE
se = np.sqrt(variance_CAL + variance_ORE)

point_estimate = 0.08-0.088
half_width = critical_value*se
interval_header = (
    '    \n'
    'The confidence interval is therefore, point estimate +/- z x SE = \n'
    '(0.08-0.088) +/- 1.96 x 0.0048 =\n'
    '  '
)
print(interval_header, round(point_estimate - half_width, 4), 'and', round(point_estimate + half_width, 4))

print(
    '\n'
    'We are 95% confident that the difference between the proportions of California and Oregon residents \n'
    'who are sleep deprived is between -1.7% and 0.15%. In other words, we are 95% confident that 1.7% less to \n'
    '0.15% more California than Oregon residents are sleep deprived.'
)


# ---
# **Exercise 5:** A survey asked 827 randomly sampled Melbourne and Sydney residents "What is your favorite sport? Football, cricket or basketball?" Below is the distribution of responses, separated based on whether the respondent is a Melbourne or Sydney resident.
# 
# |-| Melbourne | Sydney   | 
# |---                 |---   |---      |
# | Football | 154       | 132     |
# | Cricket | 180      | 126     |
# | Basketball | 104 | 131     | 
# | TOTAL | 438 | 389     | 
# 
# 
# A) What percent of Melbourne residents and what percent of Sydney residents in this sample report that their favorite sport is basketball?
# 
# B) Conduct a hypothesis test to determine if the data provide strong evidence that the proportion of Melbourne residents who
# reported their favorite sport as basketball is different than that of Sydney residents.

# In[6]:


# Exercise 5 solution: (A) sample proportions of basketball fans per city and
# (B) a two-sided, pooled two-proportion z-test comparing the two cities.
successMEL = 104   # Melbourne respondents choosing basketball
successSYD = 131   # Sydney respondents choosing basketball
nMEL = 438         # Melbourne respondents in total
nSYD = 389         # Sydney respondents in total
alpha = 0.05       # significance level

# Keep the sample proportions at full precision and round only when printing:
# rounding them to two decimals first moves the z-score from -3.16 to -3.18.
p_hat_MEL = successMEL/nMEL
p_hat_SYD = successSYD/nSYD

print('''
A)  The percentages can be calculated as follows:''')
print(f'''
Proportion of Melbourne residents who love basketball = {successMEL}/{nMEL} = {p_hat_MEL:.2f} ({p_hat_MEL:.0%})''')
print(f'''Proportion of Sydney residents who love basketball = {successSYD}/{nSYD} = {p_hat_SYD:.2f} ({p_hat_SYD:.0%})
''')

print('''B) Hypothesis Test:
''')

print('''PREPARE
Let pMEL represent the proportion of all Melbourne residents who love 'basketball', and
pSYD represent the proportion of all Sydney residents who love 'basketball'.
Set up hypotheses: 
Ho: pMEL = pSYD
Ha: pMEL != pSYD
Significance level (alpha): 5%
''')
p_pool = (successMEL+successSYD)/(nMEL+nSYD) # Pooled estimate of a proportion

# The sample sizes and pooled proportion in the printed arithmetic come from
# the variables, so the Sydney line shows its own n rather than Melbourne's.
print(f'''CHECK
(1) Independence: Both samples are random, so observations within each group are independent. 
Additionally, each sample is independent of the other.
(2) Success-failure: 
    nMEL*p_pool = {nMEL}*({p_pool:.3f}) = {round(nMEL*p_pool, 1)} and nMEL*(1-p_pool) = {nMEL}*({1 - p_pool:.3f}) = {round(nMEL*(1-p_pool), 1)} 
    are both >= 10
    nSYD*p_pool = {nSYD}*({p_pool:.3f}) = {round(nSYD*p_pool, 1)} and nSYD*(1-p_pool) = {nSYD}*({1 - p_pool:.3f}) = {round(nSYD*(1-p_pool), 1)} 
    are both >= 10
hence the success-failure conditions are satisfied.
''')

print('CALCULATE')
# Under Ho both groups share one proportion, so the pooled estimate is used
# in both variance terms of the standard error.
se = np.sqrt((p_pool*(1 - p_pool)/nMEL) + (p_pool*(1 - p_pool)/nSYD))
z_stat = (p_hat_MEL - p_hat_SYD)/se
z_score = round(z_stat, 2)  # rounded for display only
# norm.sf is the upper-tail area (1 - cdf), evaluated on the unrounded statistic.
one_tail = norm.sf(np.abs(z_stat))
p_value = round(2*one_tail, 4)
print(f'''The Z-score is {z_score}. The Z score's one-tail area: {round(one_tail, 5)}, yet this is a two-sided hypothesis testing. 
Thus, the p-value is twice this one-tail area: {p_value}
''')

# The reported p-value and alpha are taken from the computation above rather
# than typed in by hand.
print(f'''CONCLUDE
Because the p-value of {p_value:.2%} is smaller than alpha = {alpha:.0%}, we do REJECT the null hypothesis and 
we conclude that the data provide strong evidence that the proportion of Melbourne residents who reported 
their favorite sport as basketball is different than that of Sydney residents.
''')