In [1]:

import numpy as np
import pandas as pd
import itertools

1. Generate random dataframe, then convert to numpy array¶

In [2]:

df = pd.DataFrame(np.random.randn(3, 4) * 10, columns=[f"column {i}" for i in range(4)])
df

Out[2]:

	column 0	column 1	column 2	column 3
0	-2.514103	-11.355411	-7.881233	2.462776
1	-8.846382	-3.083167	-14.698746	-2.668292
2	-1.551953	0.827107	-4.770237	-17.009419

In [3]:

df.values  # this is numpy array

Out[3]:

array([[ -2.51410318, -11.35541057,  -7.8812329 ,   2.46277592],
       [ -8.84638153,  -3.08316687, -14.69874602,  -2.66829197],
       [ -1.55195347,   0.82710666,  -4.77023707, -17.00941874]])

2. Create a dataframe with multiindex names. And show how you gonna use loc on this dataframe?¶

In [4]:

group_list = [3 * ["portfolio_1"], 3 * ["portfolio_2"], 3 * ["portfolio_3"]]
flat_group = list(itertools.chain(*group_list))

In [5]:

subgroup_list = [[f"strategy_{i}" for i in range(1, 4)] for _ in range(3)]
flat_subgroup = list(itertools.chain(*subgroup_list))

In [14]:

df = pd.DataFrame(np.random.randn(9, 4))
df.index = pd.MultiIndex.from_arrays(
    [flat_group, flat_subgroup], names=("portfolio", "strategy")
)
df

Out[14]:

		0	1	2	3
portfolio	strategy
portfolio_1	strategy_1	-1.666313	-1.535478	1.192467	-0.221651
	strategy_2	-0.173095	-0.213803	0.235330	0.861566
	strategy_3	0.374859	1.990981	0.107066	-1.670760
portfolio_2	strategy_1	-0.084183	0.288425	1.052083	-0.146290
	strategy_2	0.326947	0.027266	0.222703	0.308883
	strategy_3	1.563305	1.280715	0.685003	2.202861
portfolio_3	strategy_1	0.464870	-1.077632	0.000620	1.331755
	strategy_2	-1.627832	-0.361344	1.080016	-0.599627
	strategy_3	0.348844	0.780083	-0.679279	-0.716045

In [7]:

df.loc[[("portfolio_2", "strategy_3")]]  # select with a tuple

Out[7]:

		0	1	2	3
portfolio	strategy
portfolio_2	strategy_3	-1.44679	-1.263631	0.45624	-1.349913

3.How to use Multicolumn?¶

In [8]:

top_columns = [f"PM {i}" for i in range(1, 3) for _ in range(2)]
bottom_columns = 2 * ["Opportunistic", "Niches"]

In [15]:

df.columns = pd.MultiIndex.from_arrays([top_columns, bottom_columns])
df

Out[15]:

		PM 1		PM 2
		Opportunistic	Niches	Opportunistic	Niches
portfolio	strategy
portfolio_1	strategy_1	-1.666313	-1.535478	1.192467	-0.221651
	strategy_2	-0.173095	-0.213803	0.235330	0.861566
	strategy_3	0.374859	1.990981	0.107066	-1.670760
portfolio_2	strategy_1	-0.084183	0.288425	1.052083	-0.146290
	strategy_2	0.326947	0.027266	0.222703	0.308883
	strategy_3	1.563305	1.280715	0.685003	2.202861
portfolio_3	strategy_1	0.464870	-1.077632	0.000620	1.331755
	strategy_2	-1.627832	-0.361344	1.080016	-0.599627
	strategy_3	0.348844	0.780083	-0.679279	-0.716045

In [17]:

# select PM 2 and Niche, return a series
df.loc[:, ("PM 2", "Niches")]

Out[17]:

portfolio    strategy  
portfolio_1  strategy_1   -0.221651
             strategy_2    0.861566
             strategy_3   -1.670760
portfolio_2  strategy_1   -0.146290
             strategy_2    0.308883
             strategy_3    2.202861
portfolio_3  strategy_1    1.331755
             strategy_2   -0.599627
             strategy_3   -0.716045
Name: (PM 2, Niches), dtype: float64

In [18]:

df.loc[:, ("PM 2", "Niches")].to_frame()  # return a dataframe

Out[18]:

		PM 2
		Niches
portfolio	strategy
portfolio_1	strategy_1	-0.221651
	strategy_2	0.861566
	strategy_3	-1.670760
portfolio_2	strategy_1	-0.146290
	strategy_2	0.308883
	strategy_3	2.202861
portfolio_3	strategy_1	1.331755
	strategy_2	-0.599627
	strategy_3	-0.716045

4. How to use reindex to change the order of index or columns?¶

In [11]:

# An example of reindexing
df = pd.DataFrame(
    np.random.randn(3, 4),
    columns=[f"column {i}" for i in range(4)],
    index=pd.date_range("20240101", periods=3, freq="B"),
)
df

Out[11]:

	column 0	column 1	column 2	column 3
2024-01-01	1.604649	0.923969	-0.627469	0.625340
2024-01-02	0.587526	1.938590	-1.275731	0.112374
2024-01-03	-0.082560	0.964331	1.170139	0.474678

In [12]:

index_2 = pd.date_range("20231229", periods=8)
df = df.reindex(index_2)
df

Out[12]:

	column 0	column 1	column 2	column 3
2023-12-29	NaN	NaN	NaN	NaN
2023-12-30	NaN	NaN	NaN	NaN
2023-12-31	NaN	NaN	NaN	NaN
2024-01-01	1.604649	0.923969	-0.627469	0.625340
2024-01-02	0.587526	1.938590	-1.275731	0.112374
2024-01-03	-0.082560	0.964331	1.170139	0.474678
2024-01-04	NaN	NaN	NaN	NaN
2024-01-05	NaN	NaN	NaN	NaN

In [13]:

new_columns = [f"column {i}" for i in range(-2, 5)]
df = df.reindex(columns=new_columns)
df

Out[13]:

	column -2	column -1	column 0	column 1	column 2	column 3	column 4
2023-12-29	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2023-12-30	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2023-12-31	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2024-01-01	NaN	NaN	1.604649	0.923969	-0.627469	0.625340	NaN
2024-01-02	NaN	NaN	0.587526	1.938590	-1.275731	0.112374	NaN
2024-01-03	NaN	NaN	-0.082560	0.964331	1.170139	0.474678	NaN
2024-01-04	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2024-01-05	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5. Show how you gonna use .pivot_table method? And what does arg column and index mean in this method? After pivoting, what are the index and columns type?¶

In [28]:

# demonstrate how to use .pivot() to create a dataframe

df = pd.DataFrame(
    {
        "A": ["apple", "apple", "pear", "banana"] * 3,
        "B": ["one", "two", "one", "two"] * 3,
        "C": ["small", "large", "large", "small"] * 3,
        "D": np.random.randn(12),
        "E": np.random.randn(12),
    }
); df

Out[28]:

	A	B	C	D	E
0	apple	one	small	-0.596805	1.722010
1	apple	two	large	0.009945	0.832135
2	pear	one	large	0.387427	-0.118612
3	banana	two	small	-0.361440	0.431291
4	apple	one	small	-3.021230	0.905683
5	apple	two	large	-1.515054	2.379202
6	pear	one	large	-0.536045	-1.021149
7	banana	two	small	1.498246	0.444942
8	apple	one	small	-1.286598	1.124035
9	apple	two	large	-0.481828	-0.273580
10	pear	one	large	-0.967426	-0.463726
11	banana	two	small	-0.400804	0.282053

In [27]:

df.pivot_table(index="A", columns="B", values=["D"]) # default aggregation is mean

Out[27]:

	D
B	one	two
A
apple	-0.475195	0.837590
banana	NaN	-0.678793
pear	-0.693141	NaN

In [31]:

df.pivot_table(index="A", columns="B", values=["D"], aggfunc="sum") # sum aggregation

Out[31]:

	D
B	one	two
A
apple	-4.904634	-1.986937
banana	NaN	0.736002
pear	-1.116043	NaN

In [40]:

df_pivoted = df.pivot_table(index="A", columns="B", values=["D", "E"], aggfunc="max", fill_value=0); df_pivoted

Out[40]:

	D		E
B	one	two	one	two
A
apple	-0.596805	0.009945	1.722010	2.379202
banana	0.000000	1.498246	0.000000	0.444942
pear	0.387427	0.000000	-0.118612	0.000000

In [39]:

df_pivoted.columns

Out[39]:

MultiIndex([('D', 'one'),
            ('D', 'two'),
            ('E', 'one'),
            ('E', 'two')],
           names=[None, 'B'])

In [ ]: