#!/usr/bin/env python # coding: utf-8 # # Get started with physt # # This tutorial describes some basic features of physt. # In[1]: # Necessary import evil from physt import binnings, h1, h2, h3 import numpy as np import matplotlib.pyplot as plt np.random.seed(1337) # Have always the same data # ## Getting physt (to run) # # I believe you can skip this section but anyway, for the sake of completeness, the default way of installing a relatively stable version of physt is via pip: # # `pip install physt` # # Alternatively, you can download the source code from github (https://github.com/janpipek/physt). # # You will need **numpy** to use physt (required), but there are other packages (optional) that are very useful if you want to use physt at its best: **matplotlib** for plotting (or **bokeh** as a not-so-well supported alternative). # ## Your first histogram # # If you need to create a histogram, call the `histogram` (or `h1`) function with your data (like heights of people) as the first argument. The default gives a reasonable result... # In[2]: # Basic dataset heights = [160, 155, 156, 198, 177, 168, 191, 183, 184, 179, 178, 172, 173, 175, 172, 177, 176, 175, 174, 173, 174, 175, 177, 169, 168, 164, 175, 188, 178, 174, 173, 181, 185, 166, 162, 163, 171, 165, 180, 189, 166, 163, 172, 173, 174, 183, 184, 161, 162, 168, 169, 174, 176, 170, 169, 165] hist = h1(heights) # Automatically select all settings hist # ...which is an object of the Histogram1D type that holds all the bin information... # In[3]: hist.bins # All the bins # In[4]: hist.frequencies # All the frequencies # ...and provides further features and methods, like plotting for example... # In[5]: hist.plot(show_values=True); # ...or adding new values (note that this is something numpy.histogram won't do for you)... # In[6]: original = hist.copy() # Store the original data to see changes # ******* Here comes a lonely giant hist.fill(197) step1 = hist.copy() # Store the intermediate value # ******* And a bunch of relatively short people hist.fill_n([160, 160, 161, 157, 156, 159, 162]) # See how the plot changes (you can safely ignore the following 4 lines) ax = hist.plot(label="After fill_n") step1.plot(color="yellow", ax=ax, label="After fill") original.plot(color="red", ax=ax, label="Before filling") ax.legend(loc=1) # See the number of entries hist # ## Data representation # # The primary goal of physt library is to represent histograms as data objects with a set methods for easy manipulation and analysis (including mathematical operations, adaptivity, summary statistics. ...). The histogram classes used in [ROOT](https://root.cern.ch/) framework served as inspiration but not as a model to copy (though relevant methods often have same names). # # Based on its dimensionality, a histogram is an instance of one of the following classes (all inheriting from **HistogramBase**): # # * **Histogram1D** for univariate data # * **Histogram2D** for bivariate data # * **HistogramND** for data with higher dimensionality # * ...or some special dedicated class (user-provided). Currently, there is a **PolarHistogram** as an example (considered to be experimental, not API-stable). # # However, these objects are \__init\__ialized with already calculated data and therefore, you typically don't construct the yourselves but call one of the facade functions: # # * **histogram** or **h1** # * **histogram2d** or **h2** # * **histogramdd** (or **h3** for 3D case) # # These functions try to find the best binning schema, calculate bin contents and set other properties for the histograms. In principle (if not, submit a bug report), if you call a function with arguments understood by eponymous numpy functions ([histogram](http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.histogram.html), [histogram2d](http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.histogram2d.html) and [histogramdd](http://docs.scipy.org/doc/numpy-1.10.1/reference/generated/numpy.histogramdd.html)), you should receive histogram with exactly the same bin edges and bin contents. However, there are many more arguments available! # In[7]: # Back to people's parameters... heights = np.random.normal(172, 10, 100) weights = np.random.normal(70, 15, 100) iqs = np.random.normal(100, 15, 100) # In[8]: # 1D histogram h1(heights) # In[9]: # 2D histogram h2(heights, weights, [5, 7]) # In[10]: # 3D histogram h3([heights, weights, iqs]) # Simplification with respect to numpy.histogramdd # So, what do these objects contain? In principle: # # - binning schema (`_binning` or `_binnings`) # - bin contents (`frequencies`) together with errors (`errors`) # - some statistics about the data (`mean`, `variance`, `std`) # - metadata (like `name` and `axis_name` or `axis_names`) # - ... # # In the following, properties of `Histogram1D` will be described. Analogous methods and data fields do exist also for `Histogram2D` and `HistogramND`, perhaps with the name in plural. # ### Binning schema # # The structure of bins is stored in the histogram object as a hidden attribute `_binning`. This value is an instance of one of the binning classes that are all descendants of `physt.binnings.BinningBase`. You are not supposed to directly access this value because manipulating it without at the same time updating the bin contents is dangerous. # # A dedicated notebook deals with the binning specifics, here we sum at least the most important features. # # `Histogram1D` offers the following attributes to access (read-only or read-only-intended) the binning information (explicitly or implicitly stored in `_binning`): # In[11]: # Create a histogram with "reasonable" bins data = np.random.normal(0, 7, 10000) hist = h1(data, "pretty", bin_count=4) hist # In[12]: hist._binning # Just to show, don't use it # In[13]: hist.bin_count # The total number of bins # In[14]: hist.bins # Bins as array of both edges # In[15]: hist.edges # Bin edges with the same semantics as the numpy.histogram # In[16]: hist.bin_left_edges # In[17]: hist.bin_right_edges # In[18]: hist.bin_centers # Centers of the bins - useful for interpretation of histograms as scatter data # In[19]: hist.bin_widths # Widths of the bins - useful for calculating densities and also for bar plots # Just as a simple overview of binning schemas, that are provided by physt, we show the bins as produced by different schemas: # In[20]: list(binnings.binning_methods.keys()) # Show them all # These names can be used as the second parameter of the `h1` function: # In[21]: # Fixed-width h1(data, "fixed_width", bin_width=6).numpy_bins # In[22]: # Numpy-like print("Expected:", np.histogram(data, 5)[1]) print("We got:", h1(data, "numpy", bin_count=5).numpy_bins) # In[23]: # Integer - centered around integers; useful for integer data h1(data, "integer").numpy_bins # In[24]: # Exponential - positive numbers required h1(np.abs(data), "exponential").numpy_bins # We 'abs' the values # In[25]: # Quantile - each bin should have a similar statistical importance h1(data, "quantile", bin_count=5).numpy_bins # In[26]: # Pretty - as friendly to your plots as possible, you may set an approximate number of bins h1(data, "pretty").numpy_bins # ### Bin contents # The bin contents (`frequencies`) and associated errors (`errors`) are stored as numpy arrays with a shape corresponding to number of bins (in all dimensions). Again, you cannot manipulate these properties diractly (unless you break the dont-touch-the-underscore convention). # In[27]: hist = h1(data, "pretty") hist.frequencies # Errors are calculated as $\sqrt(N)$ which is the simplest expectation for independent values. If you don't accept this, you can set your errors through `_errors2` field which contains squared errors. # # Note: Filling with weights, arithmetic operations and scaling preserve correct error values under similar conditions. # In[28]: hist.errors # In[29]: # Doubling the histogram doubles the error (hist * 2).errors # ### Data types # # Internally, histogram bins can contain values in several types (`dtype` in numpy terminology). By default, this is either `np.int64` (for histograms without weights) or `np.float64` (for histograms with weight). Wherever possible, this distinction is preserved. If you try filling in with weights, if you multiply by a float constant, if you divide, ... - basically whenever this is reasonable - an integer histogram is automatically converted to a float one. # In[30]: hist = h1(data) print("Default type:", hist.dtype) hist = h1(data, weights=np.abs(data)) # Add some random weights print("Default type with weights:", hist.dtype) hist = h1(data) hist.fill(1.0, weight=.44) print("Default type after filling with weight:", hist.dtype) hist = h1(data) hist *= 2 print("Default type after multiplying by an int:", hist.dtype) hist *= 5.6 print("Default type after multiplying by a float:", hist.dtype) # In[31]: # You can specify the type in the method call hist = h1(data, dtype="int32") hist.dtype # In[32]: # You can set the type of the histogram using the attribute hist = h1(data) hist.dtype = np.int32 hist.dtype # In[33]: # Adding two histograms uses the broader range hist1 = h1(data, dtype="int64") hist2 = h1(data, dtype="float32") (hist1 + hist2).dtype # See the result! # ### Manually create histogram instances # # As mentioned, `h1` and `h2` are just facade functions. You can construct the objects directly using the constructors. The first argument accepts something that can be interpreted as binning or list of bins, second argument is an array of frequencies (numpy array or something convertible). # In[34]: from physt.histogram1d import Histogram1D from physt.histogram_nd import Histogram2D hist1 = Histogram1D([0.0, 0.2, 0.4, 0.6, 0.8, 1.0], [1, 2, 3, 4, 5]) hist2 = Histogram2D([[0, 0.5, 1], [0, 1, 2, 3]], [[0.2, 2.2, 7.3], [6, 5, 3]], axis_names=["x", "y"]) fig, axes = plt.subplots(1, 2, figsize=(10, 4)) hist1.plot(ax = axes[0]) hist2.plot(ax = axes[1]) hist1, hist2 # In[35]: # Create a physt "logo", also available as physt.examples.fist _, ax = plt.subplots(figsize=(4, 4)) widths = np.cumsum([0, 1.2, 0.2, 1, 0.1, 1, 0.1, 0.9, 0.1, 0.8]) fingers = np.asarray([4, 1, 7.5, 6, 7.6, 6, 7.5, 6, 7.2]) + 5 hist1 = Histogram1D(widths, fingers) hist1.plot(lw=0, ax=ax) ax.set_xticks([]) ax.set_yticks([]) ax.set_xlabel("physt") hist1 # ## Indexing # Supported indexing is more or less compatible with numpy arrays. # In[36]: hist.find_bin(3) # Find a proper bin for some value (0 - based indices) # In[37]: hist[3] # Return the bin (with frequency) # In[38]: hist[-3:] # Sub-histogram (as slice) # In[39]: hist[hist.frequencies > 5] # Masked array (destroys underflow & overflow information) # In[40]: hist[[1, 3, 5]] # Select some of the bins # ## Arithmetics # # With histograms, you can do basic arithmetic operations, preserving bins and usually having intuitive meaning. # In[41]: hist + hist # In[42]: hist - hist # In[43]: hist * 0.45 # In[44]: hist / 0.45 # Some of the operations are prohibited: # In[45]: try: hist * hist # Does not make sense except Exception as ex: print(repr(ex)) # In[46]: try: hist + 4 # Does not make sense except Exception as ex: print(repr(ex)) # In[47]: try: (-0.2) * hist except Exception as ex: print(repr(ex)) # Some of the above checks are dropped if you allow "free arithmetics". This you can do by: # 1. Setting the `PHYST_FREE_ARITHMETICS` environment variable to 1 (note: not any other "truthy" value) # 2. By setting `config.free_arithmetics` to True # 3. By using the context manager `config.enable_free_arithmetics()`: # In[48]: from physt.config import config with config.enable_free_arithmetics(): neg_hist = (-0.2) * hist ax = neg_hist.plot() ax.set_ylim((-800, 0)) # TODO: Rendering bug requires this neg_hist # With this relaxation, you can also use any numpy array as (right) operand for any of the operations: # In[49]: # Add some noise with config.enable_free_arithmetics(): hist_plus_array = hist + np.random.normal(800, 200, hist.shape) hist_plus_array.plot() hist_plus_array # If you need to side-step any rules completely, just use the histogram in a numpy array: # In[50]: np.asarray(hist) * np.asarray(hist) # Excercise: Reconstruct a histogram with original bins # ## Statistics # # When creating histograms, it is possible to keep simple statistics about the sampled distribution, # like mean() and std(). The behaviour was inspired by similar features in ROOT. # # **To be yet refined.** # In[51]: hist.statistics # ## Plotting # # This is currently based on matplotlib, but other tools might come later (d3.js, bokeh?) # In[52]: hist.plot(); # Basic plot # In[53]: hist.plot(density=True, errors=True, ecolor="red"); # Include errors # In[54]: hist.plot(show_stats=True, errors=True, alpha=0.3); # Show summary statistics (not fully supported yet) # In[55]: hist.plot(cumulative=True, color="yellow", lw=3, edgecolor="red"); # Use matplotlib parameters # In[56]: hist.plot(kind="scatter", s=hist.frequencies, cmap="rainbow", density=True); # Another plot type # In[57]: hist.plot(kind="step", lw=4) # In[58]: # Plot different bins using different styles axis = hist[hist.frequencies > 5].plot(label="High", alpha=0.5) hist[1:-1][hist[1:-1].frequencies <= 5].plot(ax=axis, color="green", label="Low", alpha=0.5) hist[[0, -1]].plot(ax=axis, color="red", label="Edge cases", alpha=0.5) hist.plot(kind="scatter", ax=axis, s=hist.frequencies / 10, label="Scatter") # axis.legend(); # Does not work - why? # In[59]: # Bar plot with colormap (with logarithmic scale) ax = hist.plot(cmap="Reds_r", yscale="log", show_values=True); # ## Irregular binning and densities # In[60]: figure, axes = plt.subplots(1, 3, figsize=(11, 3)) hist_irregular = h1(heights, (160, 162, 166, 167, 175, 188, 191)) hist_irregular.plot(ax=axes[0], errors=True, cmap="rainbow") hist_irregular.plot(ax=axes[1], density=True, errors=True, cmap="rainbow") hist_irregular.plot(ax=axes[2], density=True, cumulative=True, cmap="rainbow") axes[0].set_title("Absolute values") axes[1].set_title("Densities") axes[2].set_title("Cumulative"); # ## Adding new values # ### Add (fill) single values # In[61]: figure, axes = plt.subplots(1, 4, figsize=(12, 3)) hist3 = h1([], 20, range=(160, 200)) for i, ax in enumerate(axes): for height in np.random.normal(165 + 10 * i, 2.8, 10000): hist3.fill(height) hist3.plot(ax=ax); print("After {0} batches: {1}".format(i, hist3)) figure.tight_layout() # ### Add histograms with the same binning # In[62]: heights1 = h1(np.random.normal(169, 10, 100000), 50, range=(150, 200)) heights2 = h1(np.random.normal(180, 11, 100000), 50, range=(150, 200)) total = heights1 + heights2 axis = heights1.plot(label="Women", color="red", alpha=0.5) heights2.plot(label="Men", color="blue", alpha=0.5, ax=axis) total.plot(label="All", color="gray", alpha=0.5, ax=axis) axis.legend(); # ## Compatibility # # Note: Mostly, the compatibility is a trivial consequence of the object being convertible to numpy array # In[63]: # Convert to pandas DataFrames hist.to_dataframe() # In[64]: # Works on xarray import xarray as xr arr = xr.DataArray(np.random.rand(10, 50, 100)) h1(arr).plot(cmap="Reds_r", cmap_min=4744, cmap_max=5100, lw=1, edgecolor="red", show_values=True); # In[65]: # Works on pandas Series import pandas as pd series = pd.Series(heights, name="height [cm]") hist = h1(series, title="Height distribution") hist.plot() hist # In[66]: # Or hist = series.physt.h1(title="Distribution") hist.plot() hist # ## Export & import # In[67]: json = hist.to_json() # add path argument to write it to file json # In[68]: from physt.io import parse_json hist = parse_json(json) hist.plot() hist