# Necessary import evil
import physt
from physt import h1, h2, histogramdd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
np.random.seed(42)
# Sample data: two normally distributed samples of 1000 points each
x = np.random.normal(loc=100, scale=1, size=1000)
y = np.random.normal(loc=10, scale=10, size=1000)

# Build a simple 2D histogram with 8 bins along x and 4 bins along y
histogram = h2(
    x,
    y,
    [8, 4],
    name="Some histogram",
    axis_names=["x", "y"],
)
histogram
Histogram2D(bins=(8, 4), total=1000, dtype=int64)
# Frequencies are a 2D array: one row per x bin, one column per y bin
histogram.frequencies
array([[ 0, 2, 4, 0], [ 3, 26, 20, 5], [ 17, 78, 104, 10], [ 26, 163, 147, 17], [ 17, 136, 96, 17], [ 6, 41, 38, 6], [ 1, 11, 7, 0], [ 0, 1, 0, 1]])
In most cases, binning methods that apply to 1D histograms can also be used in higher dimensions. In such cases, each parameter can be either a scalar (applied to all dimensions) or a list/tuple with independent values for each dimension. This also applies to `range`, which has to be a list/tuple of tuples.
# Fixed-width binning: the tuple gives one bin width per axis (2 for x, 10 for y)
histogram = h2(
    x,
    y,
    "fixed_width",
    (2, 10),
    name="Fixed-width bins",
    axis_names=["x", "y"],
)
histogram.plot();
# Bin edges as plain numpy arrays, one array per axis
histogram.numpy_bins
[array([ 96., 98., 100., 102., 104.]), array([-20., -10., 0., 10., 20., 30., 40., 50.])]
# Quantile binning: the tuple gives the bin count per axis (3 for x, 4 for y);
# the resulting edges are not equally spaced
histogram = h2(
    x,
    y,
    "quantile",
    (3, 4),
    name="Quantile bins",
    axis_names=["x", "y"],
)
histogram.plot(cmap_min=0);
# Bin edges as plain numpy arrays, one array per axis
histogram.numpy_bins
[array([ 96.75873266, 99.54993453, 100.40825276, 103.85273149]), array([-19.40388635, 3.93758311, 10.63077132, 17.28882177, 41.93107568])]
# "human" binning with a single scalar argument shared by both axes
# NOTE(review): the 5 is presumably a target bin count, not an exact one -- confirm
histogram = h2(
    x,
    y,
    "human",
    5,
    name="Human-friendly bins",
    axis_names=["x", "y"],
)
histogram.plot();
# Bin edges as plain numpy arrays, one array per axis
histogram.numpy_bins
[array([ 96., 98., 100., 102., 104.]), array([-20., -10., 0., 10., 20., 30., 40., 50.])]
# Default is workable
ax = histogram.plot()

# Custom colormap, no colorbar
import matplotlib.cm as cm

fig, ax = plt.subplots()
ax = histogram.plot(
    ax=ax,
    cmap=cm.copper,
    show_colorbar=False,
    grid_color=cm.copper(0.5),  # grid lines use the colormap's mid-point colour
)
ax.set_title("Custom colormap");

# Use a named colormap + limit it to a range of values
fig, ax = plt.subplots()
ax = histogram.plot(
    ax=ax,
    cmap="Oranges",
    show_colorbar=True,
    cmap_min=20,
    cmap_max=100,
    show_values=True,
)
ax.set_title("Clipped colormap");

# Show labels (and hide zero bins), no grid (lw=0)
ax = histogram.plot(
    show_values=True,
    show_zero=False,
    cmap=cm.RdBu,
    format_value=float,
    lw=0,
)
Plotting histograms in this way becomes problematic with more than roughly 50x50 bins. There is an alternative, though, partially inspired by the datashader
project: plot the histogram as a bitmap, which works very fast even for very large histograms.
Note: This method does not work for histograms with irregular bins.
# A much larger sample (one million points per axis) for the map/image comparison
x = np.random.normal(loc=100, scale=1, size=1_000_000)
y = np.random.normal(loc=10, scale=10, size=1_000_000)

# Three panels side by side: "map" with 20 bins, "image" with 20 bins, "image" with 500 bins
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
h2(x, y, 20, name="20 bins - map").plot(
    "map",
    cmap="rainbow",
    lw=0,
    alpha=1,
    ax=axes[0],
    show_colorbar=False,
)
h2(x, y, 20, name="20 bins - image").plot(
    "image",
    cmap="rainbow",
    alpha=1,
    ax=axes[1],
)
h2(x, y, 500, name="500 bins - image").plot(
    "image",
    cmap="rainbow",
    alpha=1,
    ax=axes[2],
);
Note that the "image" output is equivalent to the "map" output drawn without grid lines.
Sometimes, the value range is too big to show details. Therefore, it may be of some use to transform the values by a function, e.g. logarithm.
# Same three panels as before, now with the colormap normalized logarithmically
fig, axes = plt.subplots(1, 3, figsize=(12, 4))
h2(x, y, 20, name="20 bins - map").plot(
    "map",
    alpha=1,
    lw=0,
    show_zero=False,
    cmap="rainbow",
    ax=axes[0],
    show_colorbar=False,
    cmap_normalize="log",
)
h2(x, y, 20, name="20 bins - image").plot(
    "image",
    alpha=1,
    ax=axes[1],
    cmap="rainbow",
    cmap_normalize="log",
)
h2(x, y, 500, name="500 bins - image").plot(
    "image",
    alpha=1,
    ax=axes[2],
    cmap="rainbow",
    cmap_normalize="log",
);
# Composition - show histogram overlaid with "points"
fig, ax = plt.subplots(figsize=(8, 7))
h_2 = h2(x, y, 30)
h_2.plot(
    "map",
    lw=0,
    alpha=0.9,
    cmap="Blues",
    ax=ax,
    cmap_normalize="log",
    show_zero=False,
)
# Disabled: overlay of a finer "image" histogram marking non-empty bins.
# h2(x, y, 300).plot("image", alpha=1, cmap="Greys", ax=ax, transform=lambda x: x > 0);
# Not working currently
<matplotlib.axes._subplots.AxesSubplot at 0x7f4f41f4cbe0>
By this, we mean 3D bar plots of 2D histograms (not a visual representation of 3D histograms).
# 3D bar plot of the 2D histogram, bars coloured through the "rainbow" colormap
histogram.plot("bar3d", cmap="rainbow");
# Same plot type with one fixed colour for all bars
histogram.plot("bar3d", color="red");
# Project the 2D histogram onto its "x" axis; the result is a 1D histogram
proj1 = histogram.projection("x", name="Projection to X")
# NOTE(review): errors=True presumably draws error bars -- confirm in physt plotting docs
proj1.plot(errors=True)
proj1
Histogram1D(bins=(4,), total=1000, dtype=int64)
# Project the 2D histogram onto its "y" axis; the result is a 1D histogram
proj2 = histogram.projection("y", name="Projection to Y")
# NOTE(review): errors=True presumably draws error bars -- confirm in physt plotting docs
proj2.plot(errors=True)
proj2
Histogram1D(bins=(7,), total=1000, dtype=int64)
# Create and add two histograms with adaptive binning

# First sample: heights around 180, weights around 80
height1 = np.random.normal(loc=180, scale=5, size=1000)
weight1 = np.random.normal(loc=80, scale=2, size=1000)
ad1 = h2(
    height1,
    weight1,
    "fixed_width",
    1,
    adaptive=True,
)
ad1.plot(show_zero=False)

# Second sample: heights around 160, weights around 70
height2 = np.random.normal(loc=160, scale=5, size=1000)
weight2 = np.random.normal(loc=70, scale=2, size=1000)
ad2 = h2(
    height2,
    weight2,
    "fixed_width",
    1,
    adaptive=True,
)
ad2.plot(show_zero=False)

# Sum of the two adaptive histograms, plotted as a single histogram
(ad1 + ad2).plot(show_zero=False);
Although it is not easy to visualize them, it is possible to create histograms of any dimension that behave similarly to 2D ones. Warning: be aware that the memory consumption can be significant.
# Create a 4D histogram
# Four independent uniform samples of 1000 values, stacked as the columns of a (1000, 4) array
data = np.column_stack([np.random.rand(1000) for _ in range(4)])
h4 = histogramdd(
    data,
    [3, 2, 2, 3],
    axis_names="abcd",  # one single-letter name per axis
)
h4
HistogramND(bins=(3, 2, 2, 3), total=1000, dtype=int64)
# Frequencies are now a 4D array, one array axis per histogram axis
h4.frequencies
array([[[[31, 28, 33], [21, 22, 22]], [[25, 29, 28], [29, 35, 28]]], [[[20, 25, 20], [28, 32, 31]], [[30, 28, 24], [29, 21, 27]]], [[[27, 26, 33], [21, 35, 30]], [[38, 30, 32], [25, 30, 27]]]])
# Project the 4D histogram onto axes "a" and "d" and plot the 2D result with integer labels
h4.projection("a", "d", name="4D -> 2D").plot(
    show_values=True,
    format_value=int,
    cmap_min="min",
);
# Project onto the single axis "d" and draw the 1D result as a scatter plot
h4.projection("d", name="4D -> 1D").plot(
    "scatter",
    errors=True,
);
# Load notorious example data set
iris = sns.load_dataset('iris')

# 2D histogram of sepal length vs. sepal width with "human" binning;
# the tuple passes a separate binning argument for each axis
iris_hist = physt.h2(
    iris["sepal_length"],
    iris["sepal_width"],
    "human",
    (12, 7),
    name="Iris",
)
iris_hist.plot(show_zero=False, cmap=cm.gray_r, show_values=True, format_value=int);

# NOTE(review): no axis_names are passed above, so projecting by "sepal_length"
# presumably relies on physt taking axis names from the input Series -- confirm
iris_hist.projection("sepal_length").plot();