The classic start, simple and easy in python

In [1]:
print("Hello world!")
Hello world!

Easy to use as a calculator

In [2]:

Python is a dynamic language, no need to declare variable types before assignment

In [3]:
foo = 2   # this is a comment
bar = 2
(4, 0, 4, 1, 4)

There are floats, integers, strings, lists

In [4]:
a = [2.01, 3, 'hello']   # lists can contain anything,
print(a, a[2], a[0:2], "<--- array slicing example")   # showing list/array slicing
print(a*2, "<--- multiplying a list")               # multiplying a list
a.append('goodbye')      # adding to a list
print(a, "<--- appending a list")
([2.01, 3, 'hello'], 'hello', [2.01, 3], '<--- array slicing example')
([2.01, 3, 'hello', 2.01, 3, 'hello'], '<--- multiplying a list')
([2.01, 3, 'hello', 'goodbye'], '<--- appending a list')

You can also import packages for useful code content, such as functions, constants, whatever

In [5]:
import math   # the import function
print(math.pi)   # now you've got pi!
In [6]:
from math import *   # another way to import things, but this method isn't recommended
In [7]:
from numpy import pi as anotherpi  # the recommended way to import single functions
import numpy as np                 # import a package as a new name, this is the most common

Numpy is a powerful pacakge that you'll use a lot

In [8]:
print(np.zeros(10))        # make an empty array
print(np.ones(10))         # make an array of ones
b = np.arange(10)          # make an array of integer indices
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.]
[0 1 2 3 4 5 6 7 8 9]
In [9]:
print(b*b)                  # multiplying arrays of same size
print((b*b).reshape(2,5))   # you can also reshape arrays
[ 0  1  4  9 16 25 36 49 64 81]
[[ 0  1  4  9 16]
 [25 36 49 64 81]]

numpy also has useful functions like mean, std

In [10]:
# use numpy's random number sampling from a normal distribution (which has a sigma=1)
#  to generate some fake data
testdata=np.ones(ntestpoints)*testmean + np.random.randn(ntestpoints)*teststd
themean = np.sum(testdata)/ntestpoints   # calculating the mean without numpy
stderr = np.sqrt( 1./(ntestpoints-1) * np.sum( (np.mean(testdata)-testdata)**2) )   #calculating the std error
error_on_mean = stderr/(sqrt(ntestpoints))
print "The mean: "+ str(np.mean(testdata))+", or "+str(themean)    # our mean and numpy's mean should be the same
print "The standard error: " +str(np.std(testdata,ddof=1))+", or "+str(stderr)  #note the ddof=1
print "The error on the mean: " + str(error_on_mean)
The mean: 9.86443753209, or 9.86443753209
The standard error: 1.88782447342, or 1.88782447342
The error on the mean: 0.188782447342

Plotting is inside the matplotlib.pyplot package

In [11]:
import matplotlib.pyplot as plt
%pylab inline
Populating the interactive namespace from numpy and matplotlib
/home/andrew/.local/lib/python2.7/site-packages/IPython/core/magics/ UserWarning: pylab import has clobbered these variables: ['cosh', 'ldexp', 'hypot', 'tan', 'isnan', 'log', 'fabs', 'floor', 'modf', 'sqrt', 'frexp', 'degrees', 'pi', 'log10', 'sin', 'fmod', 'copysign', 'cos', 'ceil', 'isinf', 'sinh', 'bar', 'trunc', 'expm1', 'e', 'tanh', 'radians', 'exp', 'log1p', 'gamma']
`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"
In [12]:
plt.ylabel('Number of measurements \n in each bin',fontsize=20)
plt.title('Histogram of test data',fontsize=22)
<matplotlib.lines.Line2D at 0x7fe8bedab0d0>

You can also read in data from a text file with numpy's loadtxt function

In [13]:
# Click on the menu for the graph on
# and go to "Download csv" to download this data, 
# and put its filename below

# this data has four columns
#  date, cowboys searches, patriots searches, superbowl searches

# Use numpy's recfromcsv function to read in the csv file from Google
# another option is to use np.loadtxt
# (an example of that syntax is at the end of this notebook)
# the skip_header call is to skip a few lines at the top that aren't data
# and the names allow us to now reference the records using a variable name rather than integers

timedat=np.array([np.array(int(datedat.split('-')[0])+int(datedat.split('-')[1])/12.) for datedat in goog['week']])
#this translates the "2004-01" string into fractional years as a floating point number

for datname in ['cowboys','pats','supabowl']:
plt.legend(loc='upper center',fontsize=10)
plt.ylabel('Google interest [arb. units]',fontsize=12)
plt.xlabel('Time since 0 BCE [years]',fontsize=12)
plt.title('Hand-egg sports teams and their championships \n as measured through Google search interest')

print ('how bout dem cowboys')

# put a vertical line for every patriots superbowl visit
patsup=[2004,2007,2011,2014] #years they went to the superbowl
for goodyear in patsup:  # loop over them 
how bout dem cowboys

You can define functions to operate on variables

In [16]:
# start function definitions with the "def " and the name of the function
# inside the parentheses you place the variables which squared takes in
def squared(thingtosquare):
    """This is a documentation string for the function squared, which takes
    in a number (thingtosquare) and outputs its square (thingsquared)"""
    thingsquared = thingtosquare ** 2     # simply squares the input variable, and returning it
    return thingsquared
In [17]:
In [18]:
print squared(b),help(squared)
[ 0  1  4  9 16 25 36 49 64 81]Help on function squared in module __main__:

    This is a documentation string for the function squared, which takes
    in a number (thingtosquare) and outputs its square (thingsquared)

In [19]:
# define a function to make a gaussian with input values, used later
def mygauss(x, amp, center, sigma):
    """This is an example gaussian function, which takes in x values, the amplitude (amp),
    the center x value (center) and the sigma of the Gaussian, and returns the respective y values."""
    y = amp * np.exp(-.5*((x-center)/sigma)**2)
    return y
In [20]:
npts = 40   # the number of points on the x axis
x = np.linspace(0,10,npts)    # make a series of npts linearly spaced values between 0 and 10
amp = 3.1
center = 5.1
sigma = 1.11
y = mygauss(x, amp, center, sigma)
print y
[  8.07827074e-05   2.27337292e-04   6.06524751e-04   1.53409389e-03
   3.67858601e-03   8.36248704e-03   1.80225221e-02   3.68231757e-02
   7.13267572e-02   1.30981297e-01   2.28029826e-01   3.76356649e-01
   5.88888531e-01   8.73558844e-01   1.22850464e+00   1.63789856e+00
   2.07024987e+00   2.48075627e+00   2.81819471e+00   3.03517306e+00
   3.09899937e+00   2.99975015e+00   2.75279737e+00   2.39490875e+00
   1.97528272e+00   1.54452564e+00   1.14495007e+00   8.04643554e-01
   5.36100352e-01   3.38621254e-01   2.02771961e-01   1.15113737e-01
   6.19543740e-02   3.16113001e-02   1.52910852e-02   7.01228930e-03
   3.04864536e-03   1.25654936e-03   4.90995791e-04   1.81886953e-04]
In [21]:
plt.plot(x,y,'bo', label='data points')     
plt.text(center, amp, "<-- peak is here",fontsize=16)     # places text at any x/y location on the graph
plt.xlabel('X axis',fontsize=20)
plt.ylabel('Y axis', fontsize=20)
plt.title('A gaussian plot \n with some extras!',fontsize=20)
<matplotlib.legend.Legend at 0x7fe8c1144790>

Now let's fake some measurement process by adding some errors, and fit this with a model

Simulate some data, using a Gaussian with random noise added

In [133]:
def mygauss_errors(x, amp, center, sigma, noise_amp):
    """Takes in x values, amplitude,center,sigma for a guassian, as well as a noise amplitude
    which multiplies a random normally distributed (sigma=1,center=0) noise that is added to the gaussian"""
    y = amp * np.exp(-.5*((x-center)/sigma)**2) #+ np.random.randn(len(x)) 
    yerr = y + np.random.randn(len(x)) * noise_amp
    return yerr

# make up some hidden parameters for the noisy Gaussian, we will try to discover them with optimization later
npts, minx, maxx = 15,0,10
x = linspace(minx,maxx,npts)    # making linearly spaced x values between min and max with n points
amp, center, sigma = 50.1,5.3,1.11   # made up hidden parameters
noise_amp = .05*amp   # this multiplies a unit normal distribution to simulate noise, larger means increased noise

# get the noisy y values using the above input parameters
yerr = mygauss_errors(x, amp, center, sigma, noise_amp)

## for plotting purposeses, also make a more "continuous" x variable, with much finer spacing
ycont = mygauss(xcont,amp,center,sigma) # also get the true continuous y values

plt.plot(xcont,ycont,'r',label='True values (nature)')

plt.errorbar(x,yerr,yerr=noise_amp,marker='o',linestyle='None',label='Noisy data')
plt.title('Fake data example with a noisy Gaussian')
<matplotlib.legend.Legend at 0x7fe850e63d90>

Now that we have some fake data, let's pretend we don't know the parameters and try to find them via $\chi^2$ minimization!

In [110]:
# to fit a function to some data, you will need to minimize (optimize) some statistic
# scipy has a function in it which will take in a model and data and find the best parameters
# for you using a chi-squared minimization
from scipy.optimize import curve_fit

# you can always google for more info about common functions, but python has documentation strings
# that you can access in the notebook by typing


# or also executing "curve_fit?" would pop up a window with the same information
Help on function curve_fit in module scipy.optimize.minpack:

curve_fit(f, xdata, ydata, p0=None, sigma=None, absolute_sigma=False, check_finite=True, bounds=(-inf, inf), method=None, jac=None, **kwargs)
    Use non-linear least squares to fit a function, f, to data.
    Assumes ``ydata = f(xdata, *params) + eps``
    f : callable
        The model function, f(x, ...).  It must take the independent
        variable as the first argument and the parameters to fit as
        separate remaining arguments.
    xdata : An M-length sequence or an (k,M)-shaped array
        for functions with k predictors.
        The independent variable where the data is measured.
    ydata : M-length sequence
        The dependent data --- nominally f(xdata, ...)
    p0 : None, scalar, or N-length sequence, optional
        Initial guess for the parameters.  If None, then the initial
        values will all be 1 (if the number of parameters for the function
        can be determined using introspection, otherwise a ValueError
        is raised).
    sigma : None or M-length sequence, optional
        If not None, the uncertainties in the ydata array. These are used as
        weights in the least-squares problem
        i.e. minimising ``np.sum( ((f(xdata, *popt) - ydata) / sigma)**2 )``
        If None, the uncertainties are assumed to be 1.
    absolute_sigma : bool, optional
        If False, `sigma` denotes relative weights of the data points.
        The returned covariance matrix `pcov` is based on *estimated*
        errors in the data, and is not affected by the overall
        magnitude of the values in `sigma`. Only the relative
        magnitudes of the `sigma` values matter.
        If True, `sigma` describes one standard deviation errors of
        the input data points. The estimated covariance in `pcov` is
        based on these values.
    check_finite : bool, optional
        If True, check that the input arrays do not contain nans of infs,
        and raise a ValueError if they do. Setting this parameter to
        False may silently produce nonsensical results if the input arrays
        do contain nans. Default is True.
    bounds : 2-tuple of array_like, optional
        Lower and upper bounds on independent variables. Defaults to no bounds.        
        Each element of the tuple must be either an array with the length equal
        to the number of parameters, or a scalar (in which case the bound is
        taken to be the same for all parameters.) Use ``np.inf`` with an
        appropriate sign to disable bounds on all or some parameters.
        .. versionadded:: 0.17
    method : {'lm', 'trf', 'dogbox'}, optional
        Method to use for optimization.  See `least_squares` for more details.
        Default is 'lm' for unconstrained problems and 'trf' if `bounds` are
        provided. The method 'lm' won't work when the number of observations
        is less than the number of variables, use 'trf' or 'dogbox' in this
        .. versionadded:: 0.17
    jac : callable, string or None, optional
        Function with signature ``jac(x, ...)`` which computes the Jacobian
        matrix of the model function with respect to parameters as a dense
        array_like structure. It will be scaled according to provided `sigma`.
        If None (default), the Jacobian will be estimated numerically.
        String keywords for 'trf' and 'dogbox' methods can be used to select
        a finite difference scheme, see `least_squares`.
        .. versionadded:: 0.18
        Keyword arguments passed to `leastsq` for ``method='lm'`` or
        `least_squares` otherwise.
    popt : array
        Optimal values for the parameters so that the sum of the squared error
        of ``f(xdata, *popt) - ydata`` is minimized
    pcov : 2d array
        The estimated covariance of popt. The diagonals provide the variance
        of the parameter estimate. To compute one standard deviation errors
        on the parameters use ``perr = np.sqrt(np.diag(pcov))``.
        How the `sigma` parameter affects the estimated covariance
        depends on `absolute_sigma` argument, as described above.
        If the Jacobian matrix at the solution doesn't have a full rank, then
        'lm' method returns a matrix filled with ``np.inf``, on the other hand
        'trf'  and 'dogbox' methods use Moore-Penrose pseudoinverse to compute
        the covariance matrix.
        if either `ydata` or `xdata` contain NaNs, or if incompatible options
        are used.
        if the least-squares minimization fails.
        if covariance of the parameters can not be estimated.
    See Also
    least_squares : Minimize the sum of squares of nonlinear functions.
    stats.linregress : Calculate a linear least squares regression for two sets
                       of measurements.
    With ``method='lm'``, the algorithm uses the Levenberg-Marquardt algorithm
    through `leastsq`. Note that this algorithm can only deal with
    unconstrained problems.
    Box constraints can be handled by methods 'trf' and 'dogbox'. Refer to
    the docstring of `least_squares` for more information.
    >>> import numpy as np
    >>> from scipy.optimize import curve_fit
    >>> def func(x, a, b, c):
    ...     return a * np.exp(-b * x) + c
    >>> xdata = np.linspace(0, 4, 50)
    >>> y = func(xdata, 2.5, 1.3, 0.5)
    >>> ydata = y + 0.2 * np.random.normal(size=len(xdata))
    >>> popt, pcov = curve_fit(func, xdata, ydata)
    Constrain the optimization to the region of ``0 < a < 3``, ``0 < b < 2``
    and ``0 < c < 1``:
    >>> popt, pcov = curve_fit(func, xdata, ydata, bounds=(0, [3., 2., 1.]))

In [134]:
# use the curve_fit function to find optimal parameters for the mygauss function, given the data
# (it does better with a good guess for starting parameters, though none or even bad guesses work)

print("These are the best fit amplitude, center, and sigma")
print("These are the covariances between those parameters (i.e. diagonal elements are sigma**2 for each param.)")

# these will print out the values and their errors, according to the covariance matrix
print "True amplitude, center, and sigma: " +str(amp)+", "+str(center)+', '+str(sigma)
print "Fitted amplitude, center, and sigma: " +str(params)
print "With estimated errors +/- " +str(np.diag(cov))
chi2 = 1./(len(x)-3)*np.sum((yerr-mygauss(x,*params))**2/(noise_amp)**2)
print 'Reduced chi-squared of fit: '+str(chi2)
These are the best fit amplitude, center, and sigma
[ 48.25156428   5.30305646   1.1577862 ]
These are the covariances between those parameters (i.e. diagonal elements are sigma**2 for each param.)
[[  1.51140205e+00   1.14560249e-09  -2.41771648e-02]
 [  1.14560249e-09   1.16025091e-03  -1.44802759e-11]
 [ -2.41771648e-02  -1.44802759e-11   1.16025105e-03]]
True amplitude, center, and sigma: 50.1, 5.3, 1.11
Fitted amplitude, center, and sigma: [ 48.25156428   5.30305646   1.1577862 ]
With estimated errors +/- [  1.51140205e+00   1.16025091e-03   1.16025105e-03]
Reduced chi-squared of fit: 0.461322203984
In [135]:
'[(48.251564279071438, 1.5114020525699201), (5.3030564570108174, 0.001160250911266647), (1.1577861962228788, 0.0011602510452089643)]'

Now plot all the curves together: the noisy data, the true nature, and the fit

In [136]:

plt.errorbar(x,yerr,yerr=noise_amp,marker='o',linestyle='None',label='Noisy data')

plt.plot(xcont,mygauss(xcont,amp,center,sigma),label='True data (nature)')
plt.plot(xcont,mygauss(xcont,*params),label='Fitted curve',color='r')
plt.axhline(0, color='k', linestyle='--')
plt.xlabel('X axis',fontsize=20)
plt.ylabel('Y axis', fontsize=20)
plt.title('Our result: A noisy Gaussian with fit',fontsize=20)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x7fe850d22fd0>

time is important, some time is worth keeping track

In [140]:
import time
print time.time()   #one way to keep track of time is to use the system time

Just for fun... we can repeat all the above, faking the data, and fitting it as many times as we want. Does it return reasonable model fits? This is a way of checking the measurement/modeling process for bias. Hint: lower/raise the number of points (npts) and noise amplitude (noise_amp) variables below to see how e.g. the reduced chi-squared distribution changes

In [147]:
npts, minx, maxx = 100,0,9
x = linspace(minx,maxx,npts)    # making linearly spaced x values between min and max with n points
amp, center, sigma = 50.1,5.3,1.11   # made up hidden parameters
noise_amp = 2   # this multiplies a unit normal distribution to simulate noise, larger means increased noise

# Now just repeat the experiment with all the above true parameters
# and store the resulting model parameters
nexpts=10000    # the number of times to run the expt, 10k takes a few seconds

tstart=time.time()   #mark the start time
for i in range(nexpts):  # simply loop over repeated "experiments", saving the model parameters each time
    yerr = mygauss_errors(x, amp, center, sigma, noise_amp)   # fake the data
    params,cov=curve_fit(mygauss,x,yerr,p0=[ampguess,centerguess,sigmaguess])    # fit the model
    allchi2[i]=1./(len(x)-3)*np.sum((yerr-mygauss(x,*params))**2/(noise_amp)**2)  # record chi2
    allamp[i],allcen[i],allsig[i]=params    # save parameters
    allampstd[i],allcenstd[i],allsigstd[i]=np.array([np.sqrt(cov[diag][diag]) for diag in range(3)])

print "took "+str(time.time()-tstart)+" seconds"   #current time minus start time is total time to run
took 4.98560094833 seconds

there are lots of things we could now plot... for instance, is the reduced chi2 distribution reasonable?

In [148]:
plt.title('The $\\chi^2$ of '+str(nexpts)+' repeated "measurement simulations"')
<matplotlib.lines.Line2D at 0x7fe85094e310>

looking at the distribution of model fits and residuals

In [144]:
for i in range(nplots):
    plt.plot(xcont,thefity,'b.',alpha=.01)                     #plot the best fit model for this expt
    plt.plot(xcont,(thefity-thetruey)/amp*100.,'k.',alpha=.01)  #plot the residuals

plt.ylabel('% difference')
plt.title('residuals of best fit model minus truth')
plt.plot(xcont,ycont,'k',linewidth=2,label='true nature') #plot the true nature being measured
plt.title('spread of best fit model \n in repeated experiments')
<matplotlib.legend.Legend at 0x7fe85098a250>
In [150]:
plt.suptitle('Reported parameter fitting "errors"')
# why is the std for sigma and center the same??
<matplotlib.text.Text at 0x7fe8500a1cd0>
In [117]:
#This was plotting up Xiangdong's interesting lab data for the first class

xdat = np.recfromtxt('/home/andrew/Work/Data sorted by Voltage.txt',skip_header=2,

plt.plot(xdat['n'],xdat['pvol'],label='+10 V',alpha=.7)
plt.plot(xdat['n'],xdat['mvol'],label='-10 V',alpha=.7)
plt.xlabel('Measurement #')
plt.ylabel('Voltage [$10^-3 V$]')
plt.hist(xdat['pvol'],bins=100,histtype='step',label='+10 V')
plt.hist(xdat['mvol'],bins=100,histtype='step',label='-10 V')
print("The median of +10/-10 voltage measurements is:",pvmed,nvmed)
print("The standard error is:",pvstd,nvstd)
plt.suptitle("WELCOME TO $\\rho\\hbar\\psi s 122$",fontsize=30)
('The median of +10/-10 voltage measurements is:', -6.9600000000000003e-06, 4.9099999999999996e-06)
('The standard error is:', 2.6421533574604636e-06, 2.7054565169812322e-06)
<matplotlib.text.Text at 0x7fe85196df50>
In [ ]: