import pandas as pd
import janitor
import numpy as np
#https://stackoverflow.com/q/61948103/7175713
df1 = pd.DataFrame({'id': [1,1,1,2,2,3],
'value_1': [2,5,7,1,3,4]})
df2 = pd.DataFrame({'id': [1,1,1,1,2,2,2,3],
'value_2A': [0,3,7,12,0,2,3,1],
'value_2B': [1,5,9,15,1,4,6,3]})
df1
id | value_1 | |
---|---|---|
0 | 1 | 2 |
1 | 1 | 5 |
2 | 1 | 7 |
3 | 2 | 1 |
4 | 2 | 3 |
5 | 3 | 4 |
df2
id | value_2A | value_2B | |
---|---|---|---|
0 | 1 | 0 | 1 |
1 | 1 | 3 | 5 |
2 | 1 | 7 | 9 |
3 | 1 | 12 | 15 |
4 | 2 | 0 | 1 |
5 | 2 | 2 | 4 |
6 | 2 | 3 | 6 |
7 | 3 | 1 | 3 |
Join on equi and non-equi operators is possible:
df1.conditional_join(
df2,
('id', 'id', '=='),
('value_1', 'value_2A', '>='),
('value_1', 'value_2B', '<='),
sort_by_appearance = True
)
left | right | ||||
---|---|---|---|---|---|
id | value_1 | id | value_2A | value_2B | |
0 | 1 | 5 | 1 | 3 | 5 |
1 | 1 | 7 | 1 | 7 | 9 |
2 | 2 | 1 | 2 | 0 | 1 |
3 | 2 | 3 | 2 | 2 | 4 |
4 | 2 | 3 | 2 | 3 | 6 |
The default join is inner. left and right joins are supported as well:
df1.conditional_join(
df2,
('id', 'id', '=='),
('value_1', 'value_2A', '>='),
('value_1', 'value_2B', '<='),
how='left',
sort_by_appearance = True
)
left | right | ||||
---|---|---|---|---|---|
id | value_1 | id | value_2A | value_2B | |
0 | 1 | 2 | NaN | NaN | NaN |
1 | 1 | 5 | 1.0 | 3.0 | 5.0 |
2 | 1 | 7 | 1.0 | 7.0 | 9.0 |
3 | 2 | 1 | 2.0 | 0.0 | 1.0 |
4 | 2 | 3 | 2.0 | 2.0 | 4.0 |
5 | 2 | 3 | 2.0 | 3.0 | 6.0 |
6 | 3 | 4 | NaN | NaN | NaN |
df1.conditional_join(
df2,
('id', 'id', '=='),
('value_1', 'value_2A', '>='),
('value_1', 'value_2B', '<='),
how='right',
sort_by_appearance = True
)
left | right | ||||
---|---|---|---|---|---|
id | value_1 | id | value_2A | value_2B | |
0 | NaN | NaN | 1 | 0 | 1 |
1 | 1.0 | 5.0 | 1 | 3 | 5 |
2 | 1.0 | 7.0 | 1 | 7 | 9 |
3 | NaN | NaN | 1 | 12 | 15 |
4 | 2.0 | 1.0 | 2 | 0 | 1 |
5 | 2.0 | 3.0 | 2 | 2 | 4 |
6 | 2.0 | 3.0 | 2 | 3 | 6 |
7 | NaN | NaN | 3 | 1 | 3 |
Join on just the non-equi joins is also possible:
df1.conditional_join(
df2,
('value_1', 'value_2A', '>'),
('value_1', 'value_2B', '<'),
how='inner',
sort_by_appearance = True
)
left | right | ||||
---|---|---|---|---|---|
id | value_1 | id | value_2A | value_2B | |
0 | 1 | 2 | 3 | 1 | 3 |
1 | 1 | 5 | 2 | 3 | 6 |
2 | 2 | 3 | 2 | 2 | 4 |
3 | 3 | 4 | 1 | 3 | 5 |
4 | 3 | 4 | 2 | 3 | 6 |
Join on not equal -> !=
df1.conditional_join(
df2,
('id', 'id', "!=")
)
left | right | ||||
---|---|---|---|---|---|
id | value_1 | id | value_2A | value_2B | |
0 | 1 | 2 | 2 | 0 | 1 |
1 | 1 | 2 | 2 | 2 | 4 |
2 | 1 | 2 | 2 | 3 | 6 |
3 | 1 | 2 | 3 | 1 | 3 |
4 | 1 | 5 | 2 | 0 | 1 |
5 | 1 | 5 | 2 | 2 | 4 |
6 | 1 | 5 | 2 | 3 | 6 |
7 | 1 | 5 | 3 | 1 | 3 |
8 | 1 | 7 | 2 | 0 | 1 |
9 | 1 | 7 | 2 | 2 | 4 |
10 | 1 | 7 | 2 | 3 | 6 |
11 | 1 | 7 | 3 | 1 | 3 |
12 | 2 | 1 | 3 | 1 | 3 |
13 | 2 | 3 | 3 | 1 | 3 |
14 | 2 | 1 | 1 | 0 | 1 |
15 | 2 | 1 | 1 | 3 | 5 |
16 | 2 | 1 | 1 | 7 | 9 |
17 | 2 | 1 | 1 | 12 | 15 |
18 | 2 | 3 | 1 | 0 | 1 |
19 | 2 | 3 | 1 | 3 | 5 |
20 | 2 | 3 | 1 | 7 | 9 |
21 | 2 | 3 | 1 | 12 | 15 |
22 | 3 | 4 | 1 | 0 | 1 |
23 | 3 | 4 | 1 | 3 | 5 |
24 | 3 | 4 | 1 | 7 | 9 |
25 | 3 | 4 | 1 | 12 | 15 |
26 | 3 | 4 | 2 | 0 | 1 |
27 | 3 | 4 | 2 | 2 | 4 |
28 | 3 | 4 | 2 | 3 | 6 |
If the columns from both dataframes have nothing in common, a single indexed column is returned:
(df1.select_columns('value_1')
.conditional_join(
df2.select_columns('val*'),
('value_1', 'value_2A', '>'),
('value_1', 'value_2B', '<'),
)
)
value_1 | value_2A | value_2B | |
---|---|---|---|
0 | 2 | 1 | 3 |
1 | 5 | 3 | 6 |
2 | 3 | 2 | 4 |
3 | 4 | 3 | 5 |
4 | 4 | 3 | 6 |