Advanced pandas¶

In [ ]:

import numpy as np
import pandas as pd
# Seed NumPy's global random generator so the random examples are repeatable.
np.random.seed(12345)
import matplotlib.pyplot as plt
# Default size for every matplotlib figure.
plt.rc('figure', figsize=(10, 6))
# Remember the current row-display limit before overriding it, so it can be
# put back at the end of the notebook.
PREVIOUS_MAX_ROWS = pd.options.display.max_rows
pd.options.display.max_rows = 20
# Print NumPy floats with 4 digits of precision and without scientific
# notation for small values.
np.set_printoptions(precision=4, suppress=True)

Categorical Data¶

Background and Motivation¶

In [ ]:

import numpy as np
import pandas as pd

# A column that repeats a small set of distinct string values.
values = pd.Series(['apple', 'orange', 'apple',
                    'apple'] * 2)
values
# Distinct values, in order of first appearance.
pd.unique(values)
# Occurrences of each distinct value. The Series method is used because the
# top-level pd.value_counts function is deprecated in newer pandas releases.
values.value_counts()

In [ ]:

# Integer codes (0 or 1), one per observation.
values = pd.Series([0, 1, 0, 0] * 2)
# The two distinct names; a code is a position in this Series.
dim = pd.Series(['apple', 'orange'])
values
dim

In [ ]:

# Positional lookup: for every code in `values`, fetch the entry of `dim`
# at that position.
dim.take(values)

Categorical Type in pandas¶

In [ ]:

# Eight fruit names with a running basket id, a random integer count in
# [3, 15) and a random weight in [0, 4).
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
df = pd.DataFrame(
    {
        'fruit': fruits,
        'basket_id': np.arange(N),
        'count': np.random.randint(3, 15, size=N),
        'weight': np.random.uniform(0, 4, size=N),
    },
    # Explicit column order for display, independent of the dict's key order.
    columns=['basket_id', 'fruit', 'count', 'weight'],
)
df

In [ ]:

# Categorical copy of the 'fruit' column; df itself is not modified here.
fruit_cat = df['fruit'].astype('category')
fruit_cat

In [ ]:

# Pull out the array object backing the categorical Series and show its type.
c = fruit_cat.values
type(c)

In [ ]:

# The distinct category values ...
c.categories
# ... and, per element, the integer position of its category.
c.codes

In [ ]:

# Replace the 'fruit' column of df with its categorical version.
df['fruit'] = df['fruit'].astype('category')
df.fruit

In [ ]:

# Build a Categorical directly from a Python list; the categories are
# inferred from the values.
my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])
my_categories

In [ ]:

# Assemble a Categorical from data that is already encoded: each entry of
# `codes` is a position in `categories`.
categories = ['foo', 'bar', 'baz']
codes = [0, 1, 2, 0, 0, 1]
my_cats_2 = pd.Categorical.from_codes(codes=codes, categories=categories)
my_cats_2

In [ ]:

# Same codes and categories, but flagged as ordered: the categories then
# carry an ordering that follows their position in `categories`.
ordered_cat = pd.Categorical.from_codes(codes, categories,
                                        ordered=True)
ordered_cat

In [ ]:

# Ordered version of the previously unordered categorical.
my_cats_2.as_ordered()

Computations with Categoricals¶

In [ ]:

# Re-seed so the sample below is the same on every run.
np.random.seed(12345)
# 1000 draws from the standard normal distribution.
draws = np.random.randn(1000)
draws[:5]

In [ ]:

# Cut the draws into 4 quantile-based bins (quartiles).
bins = pd.qcut(draws, 4)
bins

In [ ]:

# Same quartile binning, with explicit names for the four bins.
bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
bins
# Integer category codes of the first ten observations.
bins.codes[:10]

In [ ]:

# Wrap the bins in a named Series; the name becomes the label of the group
# key once it is turned back into a column by reset_index().
bins = pd.Series(bins, name='quartile')
# Per-quartile count, minimum and maximum of the draws.
results = (pd.Series(draws)
           .groupby(bins)
           .agg(['count', 'min', 'max'])
           .reset_index())
results

In [ ]:

# Inspect the group-key column of the summary (its display includes the dtype).
results['quartile']

Better performance with categoricals¶

In [ ]:

# Ten million random draws ...
N = 10000000
draws = pd.Series(np.random.randn(N))
# ... and a Series of the same length cycling through four string labels.
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))

In [ ]:

# Categorical version of the same labels, for comparison with `labels`.
categories = labels.astype('category')

In [ ]:

# Compare the memory footprint of the string labels and their categorical
# form. deep=True makes pandas measure the Python string objects themselves;
# without it an object-dtype Series reports only the storage for its array
# of references, which understates the real cost of `labels`.
labels.memory_usage(deep=True)
categories.memory_usage(deep=True)

In [ ]:

# IPython %time magic: time one run of the conversion to categorical.
# The converted Series is bound to `_` and not used further here.
%time _ = labels.astype('category')

Categorical Methods¶

In [ ]:

# Eight single-letter strings (a-d, twice) and their categorical counterpart.
s = pd.Series(list('abcd') * 2)
cat_s = s.astype('category')
cat_s

In [ ]:

# The .cat accessor exposes the categorical-specific attributes of a Series:
# the per-element integer codes and the category values.
cat_s.cat.codes
cat_s.cat.categories

In [ ]:

# Declare a category set wider than the data: 'e' is a category that does
# not occur among the values of cat_s.
actual_categories = ['a', 'b', 'c', 'd', 'e']
cat_s2 = cat_s.cat.set_categories(actual_categories)
cat_s2

In [ ]:

# Counts per category, before and after widening the category set.
cat_s.value_counts()
cat_s2.value_counts()

In [ ]:

# Keep only the rows whose value is 'a' or 'b'.
cat_s3 = cat_s[cat_s.isin(['a', 'b'])]
cat_s3
# Drop the categories that no longer occur in the filtered data.
cat_s3.cat.remove_unused_categories()

Creating dummy variables for modeling¶

In [ ]:

# Categorical Series of the letters a-d, repeated twice.
cat_s = pd.Series(list('abcd') * 2).astype('category')

In [ ]:

# One indicator column per category of cat_s.
pd.get_dummies(cat_s)

Advanced GroupBy Use¶

Group Transforms and "Unwrapped" GroupBys¶

In [ ]:

# Twelve rows: keys cycle through a, b, c while the values count up from 0.0.
df = pd.DataFrame({'key': list('abc') * 4,
                   'value': np.arange(12, dtype=float)})
df

In [ ]:

# Group the rows by 'key' and keep only the 'value' column of each group.
g = df.groupby('key')['value']
# Mean of 'value' per key.
g.mean()

In [ ]:

# The function returns one scalar per group; transform places that scalar on
# every row belonging to the group.
g.transform(lambda x: x.mean())

In [ ]:

# Same computation, naming the built-in aggregation instead of passing a function.
g.transform('mean')

In [ ]:

# Here the function returns a result of the same size as each group:
# every value doubled.
g.transform(lambda x: x * 2)

In [ ]:

# Rank of each value within its own group, largest first.
g.transform(lambda x: x.rank(ascending=False))

In [ ]:

def normalize(x):
    """Return *x* shifted by its mean and scaled by its standard deviation.

    Uses whatever ``mean()`` and ``std()`` the object itself provides.
    """
    centered = x - x.mean()
    spread = x.std()
    return centered / spread

In [ ]:

# Run normalize on each group, once through transform and once through
# apply, for comparison.
g.transform(normalize)
g.apply(normalize)

In [ ]:

g.transform('mean')
# The same normalization assembled from two aggregations passed by name
# ('mean' and 'std') plus ordinary arithmetic on whole columns, instead of
# calling a Python function once per group.
normalized = (df['value'] - g.transform('mean')) / g.transform('std')
normalized

Grouped Time Resampling¶

In [ ]:

# Fifteen consecutive one-minute timestamps paired with a running integer.
N = 15
times = pd.date_range(start='2017-05-20 00:00', periods=N, freq='1min')
df = pd.DataFrame({'time': times, 'value': np.arange(N)})
df

In [ ]:

# Index by time, then count the rows falling in each 5-minute bucket.
df.set_index('time').resample('5min').count()

In [ ]:

# Every timestamp appears three times in a row while the keys cycle a, b, c,
# so each timestamp gets one row per key.
df2 = pd.DataFrame({'time': times.repeat(3),
                    'key': np.tile(['a', 'b', 'c'], N),
                    'value': np.arange(N * 3.)})
# First seven rows.
df2[:7]

In [ ]:

# Grouping key describing 5-minute time buckets. pd.TimeGrouper was
# deprecated (pandas 0.21) and later removed; pd.Grouper(freq=...) is the
# supported spelling and is accepted by groupby in the same way.
time_key = pd.Grouper(freq='5min')

In [ ]:

# With time as the index, group by 'key' together with the time-bucket key
# and sum within each (key, bucket) pair.
resampled = (df2.set_index('time')
             .groupby(['key', time_key])
             .sum())
resampled
# Move the group keys from the index back into ordinary columns.
resampled.reset_index()

Techniques for Method Chaining¶

# Step-by-step version: load, filter, add a demeaned column, then take the
# per-key standard deviation of that column.
df = load_data()
# .copy() makes df2 an independent frame, so the column assignment below
# writes to df2 itself rather than to a slice of df (which pandas flags with
# SettingWithCopyWarning).
df2 = df[df['col2'] < 0].copy()
df2['col1_demeaned'] = df2['col1'] - df2['col1'].mean()
result = df2.groupby('key').col1_demeaned.std()

# Usual non-functional way: copy the frame, then mutate the copy in place.
# (`v` is not defined in this snippet — presumably a placeholder for any
# column value.)
df2 = df.copy()
df2['k'] = v

# Functional assign way: assign returns a new DataFrame that includes
# column k, without modifying df.
df2 = df.assign(k=v)

# Demean col1 by its own mean. The original subtracted col2's mean, which
# disagrees with the step-by-step and fully chained versions of this same
# computation elsewhere in this section.
result = (df2.assign(col1_demeaned=df2.col1 - df2.col1.mean())
          .groupby('key')
          .col1_demeaned.std())

# Two-step form: the loaded frame needs a name so the filter can refer to it.
df = load_data()
df2 = df[df['col2'] < 0]

# Chained form: indexing with a callable passes the object being indexed to
# the callable, so no intermediate name is required.
# NOTE(review): this binds the filtered result to `df`, whereas the two-step
# form above binds it to `df2` — confirm that is intended.
df = (load_data()
      [lambda x: x['col2'] < 0])

# Whole computation as a single chain: filter rows with col2 < 0, add col1
# minus its mean as col1_demeaned, then take that column's standard
# deviation per key. Each lambda receives the frame produced by the
# preceding step.
result = (load_data()
          [lambda x: x.col2 < 0]
          .assign(col1_demeaned=lambda x: x.col1 - x.col1.mean())
          .groupby('key')
          .col1_demeaned.std())

The pipe Method¶

# A sequence of function calls, each fed the result of the previous one.
# (f, g, h and the v/arg names are not defined in this snippet — presumably
# placeholders.)
a = f(df, arg1=v1)
b = g(a, v2, arg3=v3)
c = h(b, arg4=v4)

# The same sequence written with pipe: obj.pipe(func, *args, **kwargs)
# calls func(obj, *args, **kwargs).
result = (df.pipe(f, arg1=v1)
          .pipe(g, v2, arg3=v3)
          .pipe(h, arg4=v4))

# Subtract from col1 its mean within each (key1, key2) group.
g = df.groupby(['key1', 'key2'])
# Select col1 on the grouped object before transforming: transforming the
# whole grouped frame yields a DataFrame of every non-key column, which
# cannot be subtracted from the single col1 Series row by row.
df['col1'] = df['col1'] - g['col1'].transform('mean')

def group_demean(df, by, cols):
    """Return a copy of *df* with each column in *cols* group-demeaned.

    Rows are grouped by *by*; for every listed column, the mean of the row's
    group is subtracted from the row's value. *df* itself is not modified.
    """
    grouped = df.groupby(by)
    demeaned = df.copy()
    for col in cols:
        group_means = grouped[col].transform('mean')
        demeaned[col] = df[col] - group_means
    return demeaned

# Keep the rows with col1 < 0, then pass the filtered frame to group_demean
# through pipe, demeaning col1 within each (key1, key2) group.
result = (df[df.col1 < 0]
          .pipe(group_demean, ['key1', 'key2'], ['col1']))

In [ ]:

# Put back the row-display limit that was saved in PREVIOUS_MAX_ROWS.
pd.options.display.max_rows = PREVIOUS_MAX_ROWS

Advanced pandas¶

Categorical Data¶

Background and Motivation¶

Categorical Type in pandas¶

Computations with Categoricals¶

Better performance with categoricals¶

Categorical Methods¶

Creating dummy variables for modeling¶

Advanced GroupBy Use¶

Group Transforms and "Unwrapped" GroupBys¶

Grouped Time Resampling¶

Techniques for Method Chaining¶

The pipe Method¶

Conclusion¶