▶️ First, run the code cell below to import unittest
, a module used for 🧭 Check Your Work sections and the autograder.
import unittest
tc = unittest.TestCase()
pandas
: Use alias pd
.numpy
: Use alias np
.### BEGIN SOLUTION
import pandas as pd
import numpy as np
### END SOLUTION
import sys
tc.assertTrue('pd' in globals(), 'Check whether you have correctly import Pandas with an alias.')
tc.assertTrue('np' in globals(), 'Check whether you have correctly import NumPy with an alias.')
For the first part, we're going to work with a small DataFrame to see how we merge two DataFrames together.
▶️ Run the code cell below to create df_employees
and df_laptops
.
df_employees = pd.DataFrame({
'emp_id': [1, 2, 3],
'name': ['Jasper', 'Gary', 'Sally'],
'laptop_id': ['A', 'B', np.nan]
})
df_laptops = pd.DataFrame({
'laptop_id': ['A', 'B', 'C', 'D'],
'model': ['Red Touchbook', 'BlueGo', 'Eco Green', 'Hackbook Pro']
})
# Used for 🧭 Check Your Work sections
df_employees_check = df_employees.copy()
df_laptops_check = df_laptops.copy()
df_join_check = df_employees_check.merge(df_laptops, on='laptop_id', how='outer')
▶️ Run the code cell below to display df_employees
.
df_employees
emp_id | name | laptop_id | |
---|---|---|---|
0 | 1 | Jasper | A |
1 | 2 | Gary | B |
2 | 3 | Sally | NaN |
▶️ Run the code cell below to display df_laptops
.
df_laptops
laptop_id | model | |
---|---|---|
0 | A | Red Touchbook |
1 | B | BlueGo |
2 | C | Eco Green |
3 | D | Hackbook Pro |
df_employees
and df_laptop
using an inner merge.df_inner
.df_inner = pd.merge(
left=...,
right=...,
on='...',
how='...'
)
emp_id | name | laptop_id | model | |
---|---|---|---|---|
0 | 1 | Jasper | A | Red Touchbook |
1 | 2 | Gary | B | BlueGo |
### BEGIN SOLUTION
df_inner = pd.merge(
left=df_employees,
right=df_laptops,
on='laptop_id',
how='inner'
)
### END SOLUTION
display(df_inner)
emp_id | name | laptop_id | model | |
---|---|---|---|---|
0 | 1 | Jasper | A | Red Touchbook |
1 | 2 | Gary | B | BlueGo |
# DO NOT CHANGE THE CODE IN THIS CELL
df_jc = df_join_check
df_sol = df_jc[df_jc['emp_id'].notna() & df_jc['laptop_id'].notna()].reset_index(drop=True)
pd.testing.assert_frame_equal(
df_inner.reset_index(drop=True),
df_sol.reset_index(drop=True),
check_dtype=False
)
'laptop_id'
and 'model'
as np.NaN (or any other null-like value).df_employees
and df_laptop
using a left merge.df_left
.emp_id | name | laptop_id | model | |
---|---|---|---|---|
0 | 1 | Jasper | A | Red Touchbook |
1 | 2 | Gary | B | BlueGo |
2 | 3 | Sally | NaN | NaN |
### BEGIN SOLUTION
df_left = pd.merge(
left=df_employees,
right=df_laptops,
on='laptop_id',
how='left'
)
### END SOLUTION
display(df_left)
emp_id | name | laptop_id | model | |
---|---|---|---|---|
0 | 1 | Jasper | A | Red Touchbook |
1 | 2 | Gary | B | BlueGo |
2 | 3 | Sally | NaN | NaN |
# DO NOT CHANGE THE CODE IN THIS CELL
df_jc = df_join_check
df_sol = df_jc[df_jc['emp_id'].notna()].reset_index(drop=True)
pd.testing.assert_frame_equal(
df_left.reset_index(drop=True),
df_sol.reset_index(drop=True),
check_dtype=False
)
'emp_id'
and 'name'
as np.NaN (or any other null-like value).df_employees
and df_laptop
using a right merge.df_right
.emp_id | name | laptop_id | model | |
---|---|---|---|---|
0 | 1 | Jasper | A | Red Touchbook |
1 | 2 | Gary | B | BlueGo |
2 | NaN | NaN | C | Eco Green |
3 | NaN | NaN | D | Hackbook Pro |
### BEGIN SOLUTION
df_right = pd.merge(
left=df_employees,
right=df_laptops,
on='laptop_id',
how='right'
)
### END SOLUTION
display(df_right)
emp_id | name | laptop_id | model | |
---|---|---|---|---|
0 | 1.0 | Jasper | A | Red Touchbook |
1 | 2.0 | Gary | B | BlueGo |
2 | NaN | NaN | C | Eco Green |
3 | NaN | NaN | D | Hackbook Pro |
# DO NOT CHANGE THE CODE IN THIS CELL
df_jc = df_join_check
df_sol = df_jc[df_jc['laptop_id'].notna()].reset_index(drop=True)
pd.testing.assert_frame_equal(
df_right.reset_index(drop=True),
df_sol.reset_index(drop=True),
check_dtype=False
)
'laptop_id'
and 'model'
as np.NaN (or any other null-like value).'emp_id'
and 'name'
as np.NaN (or any other null-like value).df_employees
and df_laptop
using an outer merge.df_outer
.emp_id | name | laptop_id | model | |
---|---|---|---|---|
0 | 1 | Jasper | A | Red Touchbook |
1 | 2 | Gary | B | BlueGo |
2 | 3 | Sally | NaN | NaN |
3 | NaN | NaN | C | Eco Green |
4 | NaN | NaN | D | Hackbook Pro |
### BEGIN SOLUTION
df_outer = pd.merge(
left=df_employees,
right=df_laptops,
on='laptop_id',
how='outer'
)
### END SOLUTION
display(df_outer)
emp_id | name | laptop_id | model | |
---|---|---|---|---|
0 | 1.0 | Jasper | A | Red Touchbook |
1 | 2.0 | Gary | B | BlueGo |
2 | 3.0 | Sally | NaN | NaN |
3 | NaN | NaN | C | Eco Green |
4 | NaN | NaN | D | Hackbook Pro |
# DO NOT CHANGE THE CODE IN THIS CELL
df_jc = df_join_check
df_sol = df_jc[df_jc['emp_id'].notna()].reset_index(drop=True)
pd.testing.assert_frame_equal(
df_left.reset_index(drop=True),
df_sol.reset_index(drop=True),
check_dtype=False
)
df_libraries = pd.DataFrame({
'name': ['ACES (Funk)', 'Grainger', 'Law', 'Main'],
'amenities': [
'Rooms,Scanner,Printer',
'Rooms,Scanner,Printer,Cafe',
'Cafe',
'Rooms,Scanner,Printer,Cafe'
],
})
# Used for 🧭 Check Your Work sections
df_libraries_check = df_libraries.copy()
▶️ Run the code cell below to display df_libraries
.
df_libraries
name | amenities | |
---|---|---|
0 | ACES (Funk) | Rooms,Scanner,Printer |
1 | Grainger | Rooms,Scanner,Printer,Cafe |
2 | Law | Cafe |
3 | Main | Rooms,Scanner,Printer,Cafe |
'name_length'
in df_libraries
.name | amenities | name_length | |
---|---|---|---|
0 | ACES (Funk) | Rooms,Scanner,Printer | 11 |
1 | Grainger | Rooms,Scanner,Printer,Cafe | 8 |
2 | Law | Cafe | 3 |
3 | Main | Rooms,Scanner,Printer,Cafe | 4 |
### BEGIN SOLUTION
df_libraries['name_length'] = df_libraries['name'].str.len()
### END SOLUTION
display(df_libraries)
name | amenities | name_length | |
---|---|---|---|
0 | ACES (Funk) | Rooms,Scanner,Printer | 11 |
1 | Grainger | Rooms,Scanner,Printer,Cafe | 8 |
2 | Law | Cafe | 3 |
3 | Main | Rooms,Scanner,Printer,Cafe | 4 |
# DO NOT CHANGE THE CODE IN THIS CELL
df_lc = df_libraries_check
df_lc['name_length'] = df_lc['name'].str.len()
pd.testing.assert_frame_equal(
df_libraries.reset_index(drop=True),
df_lc.reset_index(drop=True),
check_dtype=False
)
'name'
column in df_libraries
.name | amenities | name_length | |
---|---|---|---|
0 | ACES (FUNK) | Rooms,Scanner,Printer | 11 |
1 | GRAINGER | Rooms,Scanner,Printer,Cafe | 8 |
2 | LAW | Cafe | 3 |
3 | MAIN | Rooms,Scanner,Printer,Cafe | 4 |
### BEGIN SOLUTION
df_libraries['name'] = df_libraries['name'].str.upper()
### END SOLUTION
display(df_libraries)
name | amenities | name_length | |
---|---|---|---|
0 | ACES (FUNK) | Rooms,Scanner,Printer | 11 |
1 | GRAINGER | Rooms,Scanner,Printer,Cafe | 8 |
2 | LAW | Cafe | 3 |
3 | MAIN | Rooms,Scanner,Printer,Cafe | 4 |
# DO NOT CHANGE THE CODE IN THIS CELL
df_lc = df_libraries_check
df_lc['name_length'] = df_lc['name'].str.len()
df_lc['name'] = df_lc['name'].str.upper()
pd.testing.assert_frame_equal(
df_libraries.reset_index(drop=True),
df_lc.reset_index(drop=True),
check_dtype=False
)
'amenities'
column using the comma (,
) as a delimiter.'amenities_list'
in df_libraries
.name | amenities | name_length | amenities_list | |
---|---|---|---|---|
0 | ACES (FUNK) | Rooms,Scanner,Printer | 11 | ['Rooms', 'Scanner', 'Printer'] |
1 | GRAINGER | Rooms,Scanner,Printer,Cafe | 8 | ['Rooms', 'Scanner', 'Printer', 'Cafe'] |
2 | LAW | Cafe | 3 | ['Cafe'] |
3 | MAIN | Rooms,Scanner,Printer,Cafe | 4 | ['Rooms', 'Scanner', 'Printer', 'Cafe'] |
### BEGIN SOLUTION
df_libraries['amenities_list'] = df_libraries['amenities'].str.split(',')
### END SOLUTION
display(df_libraries)
name | amenities | name_length | amenities_list | |
---|---|---|---|---|
0 | ACES (FUNK) | Rooms,Scanner,Printer | 11 | [Rooms, Scanner, Printer] |
1 | GRAINGER | Rooms,Scanner,Printer,Cafe | 8 | [Rooms, Scanner, Printer, Cafe] |
2 | LAW | Cafe | 3 | [Cafe] |
3 | MAIN | Rooms,Scanner,Printer,Cafe | 4 | [Rooms, Scanner, Printer, Cafe] |
# DO NOT CHANGE THE CODE IN THIS CELL
df_lc = df_libraries_check
df_lc['name_length'] = df_lc['name'].str.len()
df_lc['name'] = df_lc['name'].str.upper()
df_lc['amenities_list'] = df_lc['amenities'].str.split(',')
pd.testing.assert_frame_equal(
df_libraries.reset_index(drop=True),
df_lc.reset_index(drop=True),
check_dtype=False
)