import numpy as np
import pandas as pd
s1 = pd.Series([1, 3, 5, np.nan, 6, 8])
s1
0 1.0 1 3.0 2 5.0 3 NaN 4 6.0 5 8.0 dtype: float64
s2 = pd.Series([10, 20, 30], index=[2, 3, 4])
s2
2 10 3 20 4 30 dtype: int64
The index aligns the two datasets, comparing "apples to apples".
s1 + s2
0 NaN 1 NaN 2 15.0 3 NaN 4 36.0 5 NaN dtype: float64
df = pd.DataFrame({"a": np.arange(100), "b": np.random.random(100)})
df
a | b | |
---|---|---|
0 | 0 | 0.021700 |
1 | 1 | 0.571916 |
2 | 2 | 0.943367 |
3 | 3 | 0.762479 |
4 | 4 | 0.893559 |
... | ... | ... |
95 | 95 | 0.536255 |
96 | 96 | 0.112918 |
97 | 97 | 0.988750 |
98 | 98 | 0.374512 |
99 | 99 | 0.734572 |
100 rows × 2 columns
df.head()
a | b | |
---|---|---|
0 | 0 | 0.021700 |
1 | 1 | 0.571916 |
2 | 2 | 0.943367 |
3 | 3 | 0.762479 |
4 | 4 | 0.893559 |
df["a"]
0 0 1 1 2 2 3 3 4 4 .. 95 95 96 96 97 97 98 98 99 99 Name: a, Length: 100, dtype: int64
df["c"] = np.cos(df["a"] / 10)
df.head()
a | b | c | |
---|---|---|---|
0 | 0 | 0.021700 | 1.000000 |
1 | 1 | 0.571916 | 0.995004 |
2 | 2 | 0.943367 | 0.980067 |
3 | 3 | 0.762479 | 0.955336 |
4 | 4 | 0.893559 | 0.921061 |
masked = df[df["c"] > 0.5]
masked
a | b | c | |
---|---|---|---|
0 | 0 | 0.021700 | 1.000000 |
1 | 1 | 0.571916 | 0.995004 |
2 | 2 | 0.943367 | 0.980067 |
3 | 3 | 0.762479 | 0.955336 |
4 | 4 | 0.893559 | 0.921061 |
5 | 5 | 0.280620 | 0.877583 |
6 | 6 | 0.074905 | 0.825336 |
7 | 7 | 0.006715 | 0.764842 |
8 | 8 | 0.546949 | 0.696707 |
9 | 9 | 0.784740 | 0.621610 |
10 | 10 | 0.735269 | 0.540302 |
53 | 53 | 0.178331 | 0.554374 |
54 | 54 | 0.970279 | 0.634693 |
55 | 55 | 0.555380 | 0.708670 |
56 | 56 | 0.849476 | 0.775566 |
57 | 57 | 0.069965 | 0.834713 |
58 | 58 | 0.519418 | 0.885520 |
59 | 59 | 0.824816 | 0.927478 |
60 | 60 | 0.352184 | 0.960170 |
61 | 61 | 0.329233 | 0.983268 |
62 | 62 | 0.897445 | 0.996542 |
63 | 63 | 0.476213 | 0.999859 |
64 | 64 | 0.602563 | 0.993185 |
65 | 65 | 0.285894 | 0.976588 |
66 | 66 | 0.914200 | 0.950233 |
67 | 67 | 0.507988 | 0.914383 |
68 | 68 | 0.841618 | 0.869397 |
69 | 69 | 0.361295 | 0.815725 |
70 | 70 | 0.805875 | 0.753902 |
71 | 71 | 0.430637 | 0.684547 |
72 | 72 | 0.295865 | 0.608351 |
73 | 73 | 0.525168 | 0.526078 |
df.sum()
a 4950.000000 b 47.632534 dtype: float64
df.describe()
a | b | |
---|---|---|
count | 100.000000 | 100.000000 |
mean | 49.500000 | 0.476325 |
std | 29.011492 | 0.294647 |
min | 0.000000 | 0.006715 |
25% | 24.750000 | 0.228288 |
50% | 49.500000 | 0.470454 |
75% | 74.250000 | 0.760325 |
max | 99.000000 | 0.988750 |
binned = np.digitize(df["a"], bins=np.arange(0, 100, 10))
binned
array([ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10])
avg_per_bin = df.groupby(binned).mean()
avg_per_bin
a | b | c | |
---|---|---|---|
1 | 4.5 | 0.488695 | 0.863755 |
2 | 14.5 | 0.471454 | 0.115592 |
3 | 24.5 | 0.426544 | -0.738845 |
4 | 34.5 | 0.365475 | -0.913992 |
5 | 44.5 | 0.411681 | -0.248819 |
6 | 54.5 | 0.530656 | 0.645117 |
7 | 64.5 | 0.556863 | 0.945935 |
8 | 74.5 | 0.575423 | 0.377065 |
9 | 84.5 | 0.464790 | -0.538477 |
10 | 94.5 | 0.471672 | -0.958946 |
%matplotlib inline
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.plot(df["a"], df["c"], label="df")
ax.plot(masked["a"], masked["c"], label="masked", marker="x", linestyle="none")
ax.plot(avg_per_bin["a"], avg_per_bin["c"], label="avg_per_bin")
ax.legend()
<matplotlib.legend.Legend at 0x7ff75150ffd0>
experiments = pd.DataFrame(
{
"sample": ["Ni", "Ni", "Au", "Au", "Ag", "Ag", "Ag", "Fe", "Fe"],
"temp": [273, 253, 272, 251, 275, 250, 273, 271, 253],
"operator": ["Jane", "Jane", "Jane", "Jane", "Joe", "Joe", "Joe", "Chris", "Chris"],
"measurement": [10, 11, 10, 9, 11, 9, 10, 13, 11],
}
)
experiments
sample | temp | operator | measurement | |
---|---|---|---|---|
0 | Ni | 273 | Jane | 10 |
1 | Ni | 253 | Jane | 11 |
2 | Au | 272 | Jane | 10 |
3 | Au | 251 | Jane | 9 |
4 | Ag | 275 | Joe | 11 |
5 | Ag | 250 | Joe | 9 |
6 | Ag | 273 | Joe | 10 |
7 | Fe | 271 | Chris | 13 |
8 | Fe | 253 | Chris | 11 |
pivoted = pd.pivot_table(experiments, index=["sample", "temp"], columns=["operator"])
pivoted
measurement | ||||
---|---|---|---|---|
operator | Chris | Jane | Joe | |
sample | temp | |||
Ag | 250 | NaN | NaN | 9.0 |
273 | NaN | NaN | 10.0 | |
275 | NaN | NaN | 11.0 | |
Au | 251 | NaN | 9.0 | NaN |
272 | NaN | 10.0 | NaN | |
Fe | 253 | 11.0 | NaN | NaN |
271 | 13.0 | NaN | NaN | |
Ni | 253 | NaN | 11.0 | NaN |
273 | NaN | 10.0 | NaN |
pivoted.stack()
measurement | |||
---|---|---|---|
sample | temp | operator | |
Ag | 250 | Joe | 9.0 |
273 | Joe | 10.0 | |
275 | Joe | 11.0 | |
Au | 251 | Jane | 9.0 |
272 | Jane | 10.0 | |
Fe | 253 | Chris | 11.0 |
271 | Chris | 13.0 | |
Ni | 253 | Jane | 11.0 |
273 | Jane | 10.0 |
pivoted.unstack()
measurement | |||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
operator | Chris | Jane | Joe | ||||||||||||||||||
temp | 250 | 251 | 253 | 271 | 272 | 273 | 275 | 250 | 251 | 253 | ... | 272 | 273 | 275 | 250 | 251 | 253 | 271 | 272 | 273 | 275 |
sample | |||||||||||||||||||||
Ag | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | 9.0 | NaN | NaN | NaN | NaN | 10.0 | 11.0 |
Au | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 9.0 | NaN | ... | 10.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Fe | NaN | NaN | 11.0 | 13.0 | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Ni | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11.0 | ... | NaN | 10.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 rows × 21 columns
df.columns
Index(['a', 'b', 'c'], dtype='object')
df.index
RangeIndex(start=0, stop=100, step=1)
Column-wise access is fast; row-wise access is slow.
# This will be comparatively slow. On small tables it may not matter.
for index, row in df.iterrows():
...
Is a DataFrame 1-dimensional or 2-dimensional?
df.head()
a | b | c | |
---|---|---|---|
0 | 0 | 0.021700 | 1.000000 |
1 | 1 | 0.571916 | 0.995004 |
2 | 2 | 0.943367 | 0.980067 |
3 | 3 | 0.762479 | 0.955336 |
4 | 4 | 0.893559 | 0.921061 |
Pandas is more complex inside than numpy.
Underlying data is organized into "blocks" of like data type. Sometimes the blocks store numpy arrays; sometimes they store something more specialized.
df._data
BlockManager Items: Index(['a', 'b', 'c'], dtype='object') Axis 1: RangeIndex(start=0, stop=100, step=1) NumericBlock: slice(1, 3, 1), 2 x 100, dtype: float64 NumericBlock: slice(0, 1, 1), 1 x 100, dtype: int64
df._data.blocks
(NumericBlock: slice(1, 3, 1), 2 x 100, dtype: float64, NumericBlock: slice(0, 1, 1), 1 x 100, dtype: int64)
df._data.blocks[0].values
array([[ 0.02169961, 0.57191617, 0.94336744, 0.76247875, 0.89355944, 0.28061985, 0.07490485, 0.00671478, 0.54694905, 0.78474025, 0.73526894, 0.13130981, 0.3284698 , 0.19313359, 0.67364605, 0.46469555, 0.25197018, 0.6480102 , 0.90820986, 0.37982242, 0.50418041, 0.1394719 , 0.177409 , 0.09648807, 0.57920907, 0.12140765, 0.77264377, 0.62380269, 0.84016833, 0.4106641 , 0.36880325, 0.28877727, 0.59150967, 0.38334719, 0.286111 , 0.52722373, 0.00834346, 0.09297144, 0.15996142, 0.94769908, 0.02287614, 0.23246116, 0.90280447, 0.18027103, 0.89933323, 0.32089887, 0.89943022, 0.03315442, 0.41586305, 0.20971634, 0.75960768, 0.42505656, 0.15423074, 0.17833109, 0.97027914, 0.55537967, 0.84947565, 0.06996517, 0.51941796, 0.82481572, 0.35218439, 0.32923343, 0.89744524, 0.47621268, 0.60256269, 0.28589414, 0.91420016, 0.50798777, 0.84161808, 0.36129532, 0.80587483, 0.43063669, 0.29586512, 0.52516787, 0.7025272 , 0.92243695, 0.00694605, 0.48695912, 0.79557451, 0.7822406 , 0.21576979, 0.66088274, 0.24881061, 0.97540968, 0.76987802, 0.20054532, 0.23896566, 0.50409409, 0.0162174 , 0.81733116, 0.43694734, 0.58238885, 0.61299265, 0.32765915, 0.00972478, 0.536255 , 0.11291805, 0.98874984, 0.37451228, 0.73457169], [ 1. , 0.99500417, 0.98006658, 0.95533649, 0.92106099, 0.87758256, 0.82533561, 0.76484219, 0.69670671, 0.62160997, 0.54030231, 0.45359612, 0.36235775, 0.26749883, 0.16996714, 0.0707372 , -0.02919952, -0.12884449, -0.22720209, -0.32328957, -0.41614684, -0.5048461 , -0.58850112, -0.66627602, -0.73739372, -0.80114362, -0.85688875, -0.90407214, -0.94222234, -0.97095817, -0.9899925 , -0.99913515, -0.99829478, -0.98747977, -0.96679819, -0.93645669, -0.89675842, -0.84810003, -0.79096771, -0.7259323 , -0.65364362, -0.57482395, -0.49026082, -0.40079917, -0.30733287, -0.2107958 , -0.11215253, -0.01238866, 0.08749898, 0.18651237, 0.28366219, 0.37797774, 0.46851667, 0.55437434, 0.63469288, 0.70866977, 0.77556588, 0.83471278, 0.88551952, 0.92747843, 0.96017029, 0.98326844, 0.9965421 , 0.99985864, 0.99318492, 0.97658763, 0.95023259, 0.91438315, 0.86939749, 0.8157251 , 0.75390225, 0.68454667, 0.60835131, 0.52607752, 0.43854733, 0.34663532, 0.25125984, 0.15337386, 0.05395542, -0.04600213, -0.14550003, -0.24354415, -0.33915486, -0.43137684, -0.51928865, -0.6020119 , -0.67872005, -0.74864665, -0.81109301, -0.86543521, -0.91113026, -0.9477216 , -0.97484362, -0.99222533, -0.99969304, -0.99717216, -0.98468786, -0.96236488, -0.93042627, -0.88919115]])