Normal Distribution¶

Different displays of normally distributed data
Compare different samples from a normal distribution
Check for normality
Work with the cumulative distribution function (CDF)

Author: Thomas Haslwanter, Feb-2017

In [1]:

%pylab inline
import scipy.stats as stats

# seaborn is a package for the visualization of statistical data
import seaborn as sns
sns.set(style='ticks')

Populating the interactive namespace from numpy and matplotlib

Different Representations¶

In [2]:

''' Different aspects of a normal distribution'''
# Generate the data
x = r_[-10:10:0.1]
rv = stats.norm(0,1)   # random variate

x2 = r_[0:1:0.001]

ax = subplot2grid((3,2),(0,0), colspan=2)
plot(x,rv.pdf(x))
xlim([-10,10])
title('Normal Distribution - PDF')

subplot(323)
plot(x,rv.cdf(x))
xlim([-4,4])
title('CDF: cumulative distribution fct')

subplot(324)
plot(x,rv.sf(x))
xlim([-4,4])
title('SF: survival fct')

subplot(325)
plot(x2,rv.ppf(x2))
title('PPF')

subplot(326)
plot(x2,rv.isf(x2))
title('ISF')
tight_layout()
show()
    

Shifted distribution¶

In [3]:

'''PDF, scatter plot, and histogram.'''
# Generate the data
x = arange(-5,15,0.1)
# Plot a normal distribution: "Probability density functions"
myMean = 5
mySD = 2
y = normpdf(x, myMean, mySD)
# or: y = stats.norm.pdf(x, myMean, mySD)
plot(x,y)
title('Shifted Normal Distribution')

Out[3]:

<matplotlib.text.Text at 0x27b4256e198>

Random numbers with a normal distribution¶

In [4]:

numData = 500
data = stats.norm.rvs(myMean, mySD, size = numData)
plot(data, '.')
title('Normally distributed data')
show()

hist(data)
title('Histogram of normally distributed data')

Out[4]:

<matplotlib.text.Text at 0x27b4268f6d8>

Multiple normal sample distributions¶

In [5]:

'''Show multiple samples from the same distribution, and compare means.'''
# Do this 25 times, and show the histograms
numRows = 5
numData = 100
for ii in range(numRows):
    for jj in range(numRows):
        data = stats.norm.rvs(myMean, mySD, size=numData)
        subplot(numRows,numRows,numRows*ii+jj+1)
        hist(data)

        xticks([])
        yticks([])
        xlim(myMean-3*mySD, myMean+3*mySD)

tight_layout()
show()

# Check out the mean of 1000 normally distributded samples
numTrials = 1000;
numData = 100
myMeans = ones(numTrials)*nan
for ii in range(numTrials):
    data = stats.norm.rvs(myMean, mySD, size=numData)
    myMeans[ii] = mean(data)
print('The standard error of the mean, with {0} samples, is {1:5.3f}'.format(numData, std(myMeans)))

The standard error of the mean, with 100 samples, is 0.200

Normality Check¶

In [6]:

'''Check if the distribution is normal.'''
# Generate and show a distribution
numData = 100
data = stats.norm.rvs(myMean, mySD, size=numData)
hist(data)

Out[6]:

(array([  4.,   6.,  10.,   7.,  19.,  15.,  15.,  15.,   5.,   4.]),
 array([ 0.76711319,  1.55863157,  2.35014995,  3.14166832,  3.9331867 ,
         4.72470508,  5.51622346,  6.30774183,  7.09926021,  7.89077859,
         8.68229697]),
 <a list of 10 Patch objects>)

In [7]:

# Graphical test: if the data lie on a line, they are pretty much
# normally distributed
_ = stats.probplot(data, plot=plt)

In [8]:

# The scipy "normaltest" is based on D’Agostino and Pearson’s test that
# combines skew and kurtosis to produce an omnibus test of normality.
_, pVal = stats.normaltest(data)

# Or you can check for normality with Kolmogorov-Smirnov test: but this is only advisable for large sample numbers!
#_,pVal = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')

if pVal > 0.05:
    print('Data are probably normally distributed')

Data are probably normally distributed

Values from the Cumulative Distribution Function¶

In [10]:

'''Calculate an empirical cumulative distribution function, compare it with the exact one, and
find the exact point for a specific data value.'''

# Generate normally distributed random data
myMean = 5
mySD = 2
numData = 100
data = stats.norm.rvs(myMean, mySD, size=numData)

# Calculate the cumulative distribution function, CDF
numbins = 20
counts, bin_edges = histogram(data, bins=numbins, normed=True)
cdf = cumsum(counts)
cdf /= max(cdf)

# compare with the exact CDF
step(bin_edges[1:],cdf)
plot(x, stats.norm.cdf(x, myMean, mySD),'r')

# Find out the value corresponding to the x-th percentile: the
# "cumulative distribution function"
value = 2
myMean = 5
mySD = 2
cdf = stats.norm.cdf(value, myMean, mySD)
print(('With a threshold of {0:4.2f}, you get {1}% of the data'.format(value, round(cdf*100))))

# For the percentile corresponding to a certain value: 
# the "inverse cumulative distribution function" 
value = 0.025
icdf = stats.norm.isf(value, myMean, mySD)
print('To get {0}% of the data, you need a threshold of {1:4.2f}.'.format((1-value)*100, icdf))

With a threshold of 2.00, you get 7.0% of the data
To get 97.5% of the data, you need a threshold of 8.92.