In [12]:

import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:

aheights = [6*12+1]*5 + [6*12]*30 + [5*12+11]*5
bheights = [5*12]*20 + [7*12]*20

In [3]:

print 'means:',np.mean(aheights),np.mean(bheights)
print 'variances:',np.var(aheights),np.var(bheights)
print 'std deviations:',np.std(aheights),np.std(bheights)

means: 72.0 72.0
variances: 0.25 144.0
std deviations: 0.5 12.0

In [6]:

plt.figure(figsize=(7,5))
plt.hist(aheights,bins=np.arange(59.5,90))
plt.hist(bheights,bins=np.arange(59.5,90))
plt.xlabel('inches')
plt.ylabel('#people with that height')
plt.legend(['mean {}, std {}'.format(np.mean(d),np.std(d)) for d in (aheights,bheights)]);

In [15]:

from scipy.misc import comb
def probs(N,p=.5):
   return np.array([comb(N,k)*p**k*(1-p)**(N-k) for k in range(N+1)])

In class we discussed calculating the variance of $X$, where $X$ was the value of rolling a single die.
Recall $E[X] = (1+2+3+4+5+6)/6 = 7/2$,
and $E[X^2]= (1^2+2^2+3^2+4^2+5^2+6^2)/6=91/6$,
so $V[X]=E[X^2]-(E[X])^2 = 91/6-49/4=35/12=2.91\overline6$ .

We can compare that to picking random numbers from 1 through 6 using randint(1,7). 10 of these can be generated either via [randint(1,7) for t in xrange(10)], or directly via randint(1,7,size=10), e.g.,

In [9]:

from numpy.random import randint
randint(1,7,10)

Out[9]:

array([3, 2, 6, 1, 4, 6, 4, 6, 3, 2])

For large numbers of "rolls", each of the six numbers will appear roughly equal numbers of times. The np.var() function calculates the mean of the data that it's given, then sums the squared differences of the data points from the mean, and divides by the number of data points. For one million rolls of the die, the mean and variance of the distribution are close to the above theoretical values:

In [10]:

data=randint(1,7,1000000)
np.mean(data),np.var(data)

Out[10]:

(3.5007570000000001, 2.9131994269509995)

In [21]:

randint(0,2,size=10)

Out[21]:

array([0, 0, 1, 1, 0, 1, 1, 1, 0, 0])

In [22]:

hcount=[0 for i in range(11)]
experiments = 1000
for j in xrange(experiments):
    flips= randint(0,2,size=10)  #a sequence of 10 coin flips
    heads = sum(flips)   #count the number of heads
    hcount[heads]+=1    #accumulate head counts

In [17]:

plt.bar(range(11),hcount,align='center')
plt.plot(experiments*probs(10),'ro--')
plt.xticks(range(11))
plt.xlim(-.5,10.5)
plt.xlabel('# heads in ten flips')
plt.ylabel('#experiments with that #heads ('+str(experiments)+' total)')
plt.legend(['predicted','counts']);

In [18]:

zip([round(x,3) for x in experiments*probs(10)],hcount)

Out[18]:

[(0.977, 2),
 (9.766, 9),
 (43.945, 36),
 (117.188, 120),
 (205.078, 204),
 (246.094, 252),
 (205.078, 205),
 (117.188, 120),
 (43.945, 42),
 (9.766, 10),
 (0.977, 0)]

In [25]:

experiments,flips=100,10
plt.title('{} experiments of {} flips'.format(experiments,flips))
plt.xlim(-.5,10.5)
plt.xticks(range(11))
#        same thing in one line of code
plt.hist([sum(randint(0,2,size=10)) for j in xrange(experiments)], bins=np.arange(-.5,11));

In [31]:

#now switch to experiments of 100 flips
experiments = 1000
results=[sum(randint(0,2,size=100)) for j in xrange(experiments)]

In [32]:

results[:10]   #first ten results

Out[32]:

[57, 47, 55, 50, 56, 52, 45, 55, 50, 44]

In [33]:

plt.hist(results,bins=range(101))
plt.xlabel('# heads in 100 flips')
plt.ylabel('#experiments with that #heads ('+str(experiments)+' total)');

In [34]:

np.std(results),np.sqrt(100./4)  #compare Var to N*p*(1-p)

Out[34]:

(5.1462417354803689, 5.0)

In [35]:

np.mean(results)

Out[35]:

49.985999999999997

In [36]:

x=randint(1,7,size=10) #now roll a die ten times

In [37]:

print x,', sixes:', sum(x==6)

[6 5 1 3 1 4 3 5 1 1] , sixes: 1

In [38]:

x==6    # tests whether members of array are equal to 6, True=1

Out[38]:

array([ True, False, False, False, False, False, False, False, False, False], dtype=bool)

In [41]:

def sixes(rolls=100):
    rolls=randint(1,7,size=rolls) #default to 100 rolls
    return sum(rolls==6)  # return the number of sixes

experiments = 1000
results = [sixes(100) for j in xrange(experiments)]

In [42]:

plt.hist(results,bins=range(101))
plt.xlabel('# sixes in 100 rolls of die')
plt.ylabel('#experiments with that #sixes ('+str(experiments)+' total)');

In [43]:

print 'mean',np.mean(results),100./6
print 'std',np.std(results),np.sqrt(100*(1/6.)*(5/6.)) #compare to N*p(1-p)

mean 16.777 16.6666666667
std 3.73004973157 3.7267799625

Postscript 1: A local python guru reminded me that randint can produce a two dimensional array, so another way of accumulating statistics for the number of heads in 1000 experiments of 100 flips is (where .sum(1) sums along the rows):

In [46]:

experiments,flips=1000,100
plt.hist(randint(0,2,(experiments,flips)).sum(1),np.arange(flips+1));

Now consider tossing a coin 100 times, see what is the max length string of Heads or Tails

In [14]:

from numpy.random import random
import re
from collections import Counter

In [15]:

tosses= ''
for i in range(100):
    if random() >= .5:
        tosses += 'H'
    else:
        tosses += 'T'

In [16]:

tosses

Out[16]:

'HHTTHHTHHTHTTHHTHTTHTTHTHTHHTHTTHHTHHTHHTTHTTHHTHHHHTHTTTTTHHTHTTHHHHTTTTTHHTTTHTHTTHHHHTHTHTHHTTHTH'

In [17]:

re.findall('H+|T+',tosses)[:12]

Out[17]:

['HH', 'TT', 'HH', 'T', 'HH', 'T', 'H', 'TT', 'HH', 'T', 'H', 'TT']

In [18]:

map(len,re.findall('H+|T+',tosses))[:12]

Out[18]:

[2, 2, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2]

In [19]:

max(map(len,re.findall('H+|T+',tosses)))

Out[19]:

In [22]:

def maxlength():
  tosses= ''
  for i in xrange(100):
    if random() >= .5:
        tosses += 'H'
    else:
        tosses += 'T'
  return max(map(len,re.findall('H+|T+',tosses)))

In [23]:

experiments = 10000
results = [maxlength() for t in xrange(experiments)]

In [29]:

results[:10]

Out[29]:

[8, 7, 8, 5, 6, 7, 6, 7, 5, 6]

In [30]:

max(results)

Out[30]:

In [31]:

counts=Counter(results)

In [32]:

counts

Out[32]:

Counter({6: 2700, 7: 2279, 5: 1667, 8: 1442, 9: 808, 10: 427, 4: 259, 11: 220, 12: 100, 13: 48, 14: 16, 15: 14, 16: 11, 3: 4, 17: 2, 18: 2, 22: 1})

In [33]:

[(x,counts[x]) for x in sorted(counts)]

Out[33]:

[(3, 4),
 (4, 259),
 (5, 1667),
 (6, 2700),
 (7, 2279),
 (8, 1442),
 (9, 808),
 (10, 427),
 (11, 220),
 (12, 100),
 (13, 48),
 (14, 16),
 (15, 14),
 (16, 11),
 (17, 2),
 (18, 2),
 (22, 1)]

In [26]:

plt.figure(figsize=(8,6))
plt.hist(results,bins=range(25))
plt.xticks(np.arange(.5,25.5),range(25));
plt.xlabel('max length string of heads or tails')
plt.ylabel('occurred # times in '+str(experiments)+' experiments');

In [36]:

cumul=cumsum([counts[i] for i in range(max(counts),-1,-1)])[::-1]

In [29]:

print experiments,'experiments of 100 flips'
print 'length k',' '*10,'prob longest:'
print ' '*10,'at least k',' '*10, 'exactly k'
for i in range(min(counts),max(counts)+1):
  if i not in counts: counts[i] = 0
  print '{:>2}            {:<20} {:<6}'.format(i,cumul[i]/float(experiments),
                                                       counts[i]/10000.)

10000 experiments of 100 flips
length k            prob longest:
           at least k            exactly k
 3            1.0                  0.0004
 4            0.9996               0.0259
 5            0.9737               0.1667
 6            0.807                0.27  
 7            0.537                0.2279
 8            0.3091               0.1442
 9            0.1649               0.0808
10            0.0841               0.0427
11            0.0414               0.022 
12            0.0194               0.01  
13            0.0094               0.0048
14            0.0046               0.0016
15            0.003                0.0014
16            0.0016               0.0011
17            0.0005               0.0002
18            0.0003               0.0002
19            0.0001               0.0   
20            0.0001               0.0   
21            0.0001               0.0   
22            0.0001               0.0001

Postscript 2: The same local python guru pointed out a one-liner for the above statistics as well (this may be a bit obscure, but the abs(diff ( )) is 1 in the places where there's a transition from 0 to 1 or 1 to 0, the where finds the location of those places in the array, and the outer diff then finds the distances between the transition points, i.e., the length of the run of 1's or 0's):

In [38]:

hist([ max(diff( where(abs(diff(randint(0,2,100)))==1) )[0]) for j in xrange(10000)],arange(-.5,30));

Recall the "normal" probability distribution: $$\qquad\qquad{\rm gaussian}(x)=\frac{1}{\sigma\sqrt{2\pi}}{\rm e}^{\textstyle-\frac{(x-\mu)^2}{2\sigma^2}}$$

In [5]:

def gaussian(x,mu=0.,sigma=1.):
    return np.exp(-(x-mu)**2/(2.*sigma**2))/(sigma*np.sqrt(2.*np.pi))

In [6]:

x=np.arange(-5,5.1,.1)
y=gaussian(x)
plt.plot(x,y)
plt.xticks(range(-4,5))
plt.xlim(-4,4);

In [ ]: