#!/usr/bin/env python
# coding: utf-8

# # **Fitting and model building**

# #### **Quick intro to the following packages**
#
# - The core package `iminuit`.
# - Model building and a word on the Scikit-HEP affiliated project/package `zfit`.

# ## **`iminuit`: Python wrapper to the Minuit2 minimisation and error computation package**

#
# The `iminuit` package provides Python bindings for the [C++ Minuit2 library](https://root.cern.ch/guides/minuit2-manual) maintained at CERN, which has long been the de-facto standard minimisation engine in HEP. The package has no external dependency apart from NumPy.
#
# `iminuit` gives the user the full power of the engine internals. That can be really useful, but it often demands a certain level of expertise. It is hence no wonder that many fitting libraries build atop it, in particular in astronomy.
#
# Note: feel free to complement the introduction below with the several *great tutorials* available from the [GitHub repository](https://github.com/scikit-hep/iminuit). What is shown here is just an appetiser.

# ### **1. A very simple example**
#
# Minimisation of a function:

# In[ ]:


from iminuit import Minuit


def fcn(x, y, z):
    return (x - 2) ** 2 + (y - 3) ** 2 + (z - 4) ** 2


fcn.errordef = Minuit.LEAST_SQUARES

m = Minuit(fcn, x=0, y=0, z=0)
m.migrad()  # run optimiser


# In[ ]:


m.hesse()  # run covariance estimator
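# The `Minuit` object now holds the fit state. A minimal sketch of how to inspect it
# (attribute names as in the iminuit 2.x API; the printed numbers depend on the fit above):

# In[ ]:


print(m.values)      # fitted parameter values
print(m.errors)      # parabolic (HESSE) uncertainties
print(m.covariance)  # covariance matrix estimated by m.hesse()
print(m.fval)        # value of the cost function at the minimum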
# ### **2. A more evolved example**
#
# Let's look at a little sample of track information generated with a toy.

# # Acknowledgements
#
# This mini-tutorial is kindly provided by Hans Dembinski (TU Dortmund), with minor modifications.
# In[ ]:


import uproot

f = uproot.open("data/sample_tracks.root")
event = f["event"]


# In[ ]:


event.show()


# Legend :-):
#
# * mc_trk_len: number of true tracks in event
# * mc_trk_px: x-component of true momentum of particle (variable-length array)
# * mc_trk_py: y-component of true momentum of particle (variable-length array)
# * mc_trk_pz: z-component of true momentum of particle (variable-length array)
# * trk_len: number of reconstructed tracks in event
# * trk_px: x-component of momentum of reconstructed track (variable-length array)
# * trk_py: y-component of momentum of reconstructed track (variable-length array)
# * trk_pz: z-component of momentum of reconstructed track (variable-length array)
# * trk_imc: index of matched true particle or -1 (variable-length array)

# Get the content of branches as arrays (don't use this in large trees - you will exhaust the computer memory):

# In[ ]:


trk_len = event["trk_len"].array()
mc_trk_len = event["mc_trk_len"].array()
#trk_mom = event.arrays(["trk_px", "trk_py", "trk_pz"], library="pd")
trk_px = event["trk_px"].array(library="pd")


# In[ ]:


# first ten entries; trk_len has one entry per event
trk_len[:10]


# In[ ]:


import numpy as np

print(f"{np.sum(trk_len == 0)} events with zero tracks")


# In[ ]:


import matplotlib.pyplot as plt

plt.hist(trk_len);


# In[ ]:


# first ten entries; trk_px is a jagged array (several tracks per event)
trk_px[:10]


# In[ ]:


plt.hist(trk_px);


# #### **Fits**
#
# * Typical analysis work flow:
#   1. Pre-select data and make compact data trees
#   2. Make histograms and profiles from tree data
#   3. Fit histograms and profiles to extract physical parameters
# * Many specialised fitting tools exist for individual purposes, e.g.:
#   - [scipy.optimize.curve_fit](https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.curve_fit.html#scipy.optimize.curve_fit) (see the short sketch after this list)
#   - [RooFit](https://root.cern.ch/roofit)
# * Generic method:
#   - Select a mathematical model (PDF) which describes the data
#   - Use the maximum-likelihood method to adapt the model to the data
# * Specialised methods give fast results for some types of problems
# * The generic method allows one to do advanced things not implemented in specialised methods
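# As a taste of a specialised tool, here is a minimal sketch fitting a Gaussian shape to the
# binned `trk_px` data with `scipy.optimize.curve_fit`. The function and variable names below
# are illustrative and not part of the original tutorial; bin contents are simply treated as
# Gaussian-distributed points, which is only an approximation for low counts.

# In[ ]:


from scipy.optimize import curve_fit

# bin the data over the same range as used below
n_cf, xe_cf = np.histogram(trk_px, bins=50, range=(-2, 2))
cx_cf = 0.5 * (xe_cf[1:] + xe_cf[:-1])


def gaussian_shape(x, amplitude, mu, sigma):
    return amplitude * np.exp(-0.5 * ((x - mu) / sigma) ** 2)


popt, pcov = curve_fit(gaussian_shape, cx_cf, n_cf, p0=(n_cf.max(), 0.0, 0.5))
print("curve_fit estimates (amplitude, mu, sigma):", popt)
print("uncertainties:", np.sqrt(np.diag(pcov)))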
# Let's fit the $p_x$ distribution with a normal distribution to extract the parameters $\mu$ and $\sigma$.
#
# - To apply a maximum-likelihood method, we need a statistical model that describes the data
# - Assumption 1: the original data before histogramming is normally distributed; the pdf is $\mathcal{N}(\mu, \sigma)$ with parameters $\mu$ and $\sigma$
# - Assumption 2: the count in a histogram cell is Poisson distributed, $P(n_i; \lambda_i)$
# - The expected content of a histogram cell is $\lambda_i = N \int_{x_i}^{x_{i+1}} \mathcal{N}(\mu, \sigma) \, \text{d}x$, where $N$ is the total number of events
# - The likelihood is the joint probability of the data under the model, $L = \prod_i P(n_i; \lambda_i)$; we need to maximise it by varying the model parameters $\mu$ and $\sigma$
# - Technical step to achieve this: minimise the score $S(\mu, \sigma) = -2\ln L(\mu, \sigma) = -2\sum_i \ln P(n_i; \lambda_i)$

# In[ ]:


import boost_histogram as bh

xaxis = bh.axis.Regular(50, -2, 2)
h_px = bh.Histogram(xaxis)
h_px.fill(trk_px)


# In[ ]:


# get the binned data from before
px_axis = h_px.axes[0]
cx = px_axis.centers
dx = px_axis.widths
xe = px_axis.edges
n = h_px.view()

plt.errorbar(cx, n, n ** 0.5, dx, fmt="o", label="data");


# In[ ]:


# scipy has efficient and correct implementations for most statistical distributions
from scipy.stats import norm, poisson

n_total = np.sum(n)


def score(mu, sigma):
    cdf = norm(mu, sigma).cdf
    lambdas = n_total * (cdf(xe[1:]) - cdf(xe[:-1]))
    probs = poisson.pmf(n, lambdas)
    return -2 * np.sum(np.log(probs + 1e-100))  # avoid taking the log of zero


# In[ ]:


mus = np.linspace(-1, 1, 20)
sigmas = np.linspace(1e-10, 2, 20)
g_mu, g_sigma = np.meshgrid(mus, sigmas)

g_score = np.vectorize(score)(g_mu, g_sigma)

plt.contourf(g_mu, g_sigma, g_score)
plt.xlabel(r"$\mu$")
plt.ylabel(r"$\sigma$")
plt.colorbar();


# In[ ]:


m = Minuit(score, mu=0, sigma=1)
m.errordef = Minuit.LEAST_SQUARES
m.migrad()

mu, sigma = m.values
s_mu, s_sigma = m.errors

plt.errorbar(cx, n, n ** 0.5, dx, fmt="o", label="data");
plt.plot(cx, norm(mu, sigma).pdf(cx) * n_total * dx, label="fit")
plt.title(rf"$\mu = {mu:.3f} \pm {s_mu:.3f}, \quad \sigma = {sigma:.3f} \pm {s_sigma:.3f}$")
plt.legend();


# Check whether the fit is good:
#
# - by looking at the *pull distribution*
#   - $(n_i - \lambda_i) / \sqrt{\lambda_i}$ for Poisson-distributed data
# - by checking the $\chi^2$ value against the number of degrees of freedom
#   - Simple check: $\chi^2/n_\text{dof}$ should be about 1
#   - Better check: the chance probability $\int_{\chi^2_\text{obs}}^{\infty} P(\chi^2; n_\text{dof}) \, \text{d}\chi^2$ of obtaining a value higher than the observed one should not be too small

# In[ ]:


cdf = norm(mu, sigma).cdf
n_pred = (cdf(xe[1:]) - cdf(xe[:-1])) * n_total
n_sigma = n_pred ** 0.5  # for Poisson-distributed data
pull = (n - n_pred) / n_sigma

plt.errorbar(cx, pull, np.ones_like(pull), fmt="o")
plt.axhline(0, ls="--")

# degrees of freedom: number of fitted bins minus number of fitted parameters
n_dof = len(n) - 2  # need to subtract the two fitted parameters
chi2_obs = np.sum(pull ** 2)
print(f"chi2/ndof = {chi2_obs} / {n_dof} = {chi2_obs / n_dof}")

from scipy.stats import chi2

chance_prob = 1 - chi2(n_dof).cdf(chi2_obs)
print(f"{chance_prob}")
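# For comparison, `iminuit` also ships ready-made cost functions, so the hand-written
# `score` above can be replaced by `iminuit.cost.BinnedNLL`. A minimal sketch (the model
# name `px_cdf` is illustrative; the cost function takes the bin counts, the bin edges and
# the cdf of the assumed distribution):

# In[ ]:


from iminuit import cost


def px_cdf(xe, mu, sigma):
    return norm.cdf(xe, mu, sigma)


c_binned = cost.BinnedNLL(n, xe, px_cdf)
m_binned = Minuit(c_binned, mu=0, sigma=1)
m_binned.limits["sigma"] = (0, None)
m_binned.migrad()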
# ### **3. Towards realistic HEP data use-cases**
#
# HEP analyses typically involve far more sophisticated fitting work. It is often about **data model building** and performing **(un)binned maximum-likelihood fits** to describe experimental distributions. Moreover, fits are often performed to extract decay-mode yields on top of the model parameters.
#
# The `iminuit` package has the functionality to perform so-called *simultaneous fits*,
# where the models fitted to two or more datasets share at least one parameter.
# But it is not meant as a package providing commonly-used goodies and a "framework" for (complex) model building.
# Imagine how cumbersome it would be to verbosely describe the model shown further below for an amplitude analysis ...!

# #### **A simple unbinned maximum-likelihood fit example**
#
# (Taken straight from one of the excellent `iminuit` tutorials.)
#
# In the unbinned maximum-likelihood fit one sums the logarithm of the probability density evaluated at each data point (times -1 to turn the maximisation into a minimisation).
#
# The model density must be vectorised and normalised over the fit range. Here it is a mixture of a normal "signal" and a uniform "background", with the signal fraction $z$, the mean $\mu$ and the width $\sigma$ as parameters. A binned, extended variant of the same fit is sketched after the plot below.

# In[ ]:


from iminuit import cost
from scipy.stats import norm, uniform

xrange = -1, 1

rng = np.random.default_rng(1)
xdata = rng.normal(0, 0.1, size=400)
xdata = np.append(xdata, rng.uniform(*xrange, size=1000))


def model_pdf(x, z, mu, sigma):
    return (z * norm.pdf(x, mu, sigma)
            + (1 - z) * uniform.pdf(x, xrange[0], xrange[1] - xrange[0]))


c = cost.UnbinnedNLL(xdata, model_pdf)

m = Minuit(c, z=0.4, mu=0, sigma=0.2)
m.limits["sigma"] = (0, None)
m.limits["z"] = (0, 1)
m.limits["mu"] = (-1, 1)
m.migrad()


# In[ ]:


n, xe = np.histogram(xdata, bins=50, range=xrange)
cx = 0.5 * (xe[1:] + xe[:-1])
dx = np.diff(xe)

plt.errorbar(cx, n, n ** 0.5, fmt="ok")
xm = np.linspace(xe[0], xe[-1])
plt.plot(xm, model_pdf(xm, *[p.value for p in m.init_params]) * len(xdata) * dx[0], ls=":", label="init")
plt.plot(xm, model_pdf(xm, *m.values) * len(xdata) * dx[0], label="fit")
plt.legend();
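# The binned *extended* maximum-likelihood fit is the binned equivalent of the corresponding
# unbinned extended fit: one sums the logarithm of the Poisson probabilities of the observed
# bin counts given the predicted counts (times -1 to turn the maximisation into a minimisation).
# Instead of a density, the model provides the integral of the scaled density up to each bin
# edge, and the parameters are the signal and background yields plus the shape parameters.
# A minimal sketch with `iminuit.cost.ExtendedBinnedNLL` (the model name `model_integral` and
# the starting values are illustrative):

# In[ ]:


def model_integral(xe, s, b, mu, sigma):
    # expected counts below each bin edge: signal yield s, background yield b
    return (s * norm.cdf(xe, mu, sigma)
            + b * uniform.cdf(xe, xrange[0], xrange[1] - xrange[0]))


c_ext = cost.ExtendedBinnedNLL(n, xe, model_integral)

m_ext = Minuit(c_ext, s=300, b=1000, mu=0, sigma=0.2)
m_ext.limits["s"] = (0, None)
m_ext.limits["b"] = (0, None)
m_ext.limits["sigma"] = (0, None)
m_ext.limits["mu"] = (-1, 1)
m_ext.migrad()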
# #### **Scaling up in complexity**
#
# In spite of the example above being a *simple* maximum-likelihood fit, the code is, honestly, very verbose.
# Clearly much of the above could be simplified for the general user. And now imagine that the (realistic example!) function to fit is not a uniform+normal mixture but rather this:
#
# [Example taken from a presentation by Anton Poluektov.]
#
# The ROOT framework provides `RooFit` as a model-fitting library. The PyHEP ecosystem in particular has the `zfit` package, a model-fitting library based on TensorFlow and optimised for simple and direct manipulation of probability density functions ...
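# As a flavour of what model building looks like there, here is a minimal, hedged sketch of the
# same normal-plus-uniform model in `zfit`. The exact API may differ slightly between zfit
# versions, and the parameter names and starting values below are illustrative only.

# In[ ]:


import zfit

obs = zfit.Space("x", limits=(-1, 1))

# free parameters with starting value and allowed range
mu_z = zfit.Parameter("mu", 0.0, -1, 1)
sigma_z = zfit.Parameter("sigma", 0.2, 0.001, 1)
frac_z = zfit.Parameter("frac", 0.4, 0, 1)

# build the model directly from pdf building blocks
signal = zfit.pdf.Gauss(mu=mu_z, sigma=sigma_z, obs=obs)
background = zfit.pdf.Uniform(low=-1, high=1, obs=obs)
model = zfit.pdf.SumPDF([signal, background], fracs=frac_z)

data = zfit.Data.from_numpy(obs=obs, array=xdata)

nll = zfit.loss.UnbinnedNLL(model=model, data=data)
result = zfit.minimize.Minuit().minimize(nll)
print(result.params)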