#!/usr/bin/env python
# coding: utf-8

# # Pandas Missing Values, Datetime, Aggregation, and Merging
# 
# ## Lecture Notes and in-class exercises
# 
# ▶️ First, run the code cell below to import `unittest`, a module used for **🧭 Check Your Work** sections and the autograder.

# In[1]:

import unittest
tc = unittest.TestCase()


# #### 👇 Tasks
# 
# - ✔️ Import the following Python packages.
#     1. `pandas`: Use alias `pd`.
#     2. `numpy`: Use alias `np`.

# In[2]:

### BEGIN SOLUTION
import pandas as pd
import numpy as np
### END SOLUTION


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[3]:

import sys

tc.assertTrue('pd' in globals(), 'Check whether you have correctly imported Pandas with an alias.')
tc.assertTrue('np' in globals(), 'Check whether you have correctly imported NumPy with an alias.')


# ---
# 
# ## 👉 Working with Missing Values
# 
# A dataset can commonly contain one or more missing values. Pandas provides a flexible set of methods for working with missing values, which can be represented in several ways. Here are a few common notations that denote a missing value.
# 
# - `NaN`: stands for "not a number".
# - `None`: Python's built-in type (equivalent to `null` in other programming languages)
# - `NA`: a native scalar that denotes a missing value in `pandas`
# - `NaT`: stands for "not a time".

# In[4]:

# NaN
nan_example = np.nan
print(nan_example)


# In[5]:

# None
none_example = None
print(none_example)


# In[6]:

# NA
na_example = pd.NA
print(na_example)


# In[7]:

# NaT
nat_example = pd.NaT
print(nat_example)


# ---
# 
# ### 📌 Load data
# 
# ▶️ Run the code cell below to create a new `DataFrame` named `df_people`.

# In[8]:

# DO NOT CHANGE THE CODE IN THIS CELL
df_people = pd.read_csv("https://raw.githubusercontent.com/UI-Deloitte-business-analytics-center/datasets/main/people-sample.csv")

# Used to keep a clean copy
df_people_backup = df_people.copy()

# head() displays the first 5 rows of a DataFrame
df_people.head()


# The table below describes each column in `df_people`.
# 
# | Column Name | Description |
# |-------------|-------------|
# | name        | Full name |
# | major1      | Major |
# | major2      | Second major OR minor (blank if no second major or no minor) |
# | city        | City the person is from |
# | fav_movie   | Favorite movie (blank if no movie was given) |
# | has_iphone  | Whether the person uses an iPhone |

# ---
# 
# ### 🎯 Challenge 1: People from Grayslake
# 
# #### 👇 Tasks
# 
# - ✔️ Using `df_people`, filter rows where the person is from `"Grayslake"`.
#     - Check whether the `city` column is equal to `"Grayslake"`.
#     - Store the result to a new variable named `df_grayslake`.
# - ✔️ `df_people` should remain unaltered after your code.

# In[9]:

### BEGIN SOLUTION
df_grayslake = df_people[df_people["city"] == "Grayslake"]
### END SOLUTION

df_grayslake


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.
# In[10]:

# df_people should remain unaltered
pd.testing.assert_frame_equal(df_people, df_people_backup)

pd.testing.assert_frame_equal(df_grayslake.sort_values(df_grayslake.columns.tolist()).reset_index(drop=True),
                              df_people_backup.query(f'{"cItY".lower()} == "{"GraYsLakE".capitalize()}"')
                              .sort_values(df_people_backup.columns.tolist()).reset_index(drop=True))


# ---
# 
# ### 🎯 Challenge 2: Anyone with a non-missing `major2`
# 
# #### 👇 Tasks
# 
# - ✔️ Using `df_people`, filter rows where the person has a second major or a minor.
#     - You're looking for rows where `major2` is not `NaN`.
# - ✔️ `NaN` is a special value that denotes a missing value. You must use `my_series.isna()` or `my_series.notna()` to check whether a row contains a missing value.
# - ✔️ Store the result to a new variable named `df_major2`.
# - ✔️ `df_people` should remain unaltered after your code.
# 
# #### 🚀 Hints
# 
# - `my_series.notna()` can be used to check whether each row contains a non-missing value.
# 
# ![notna](https://github.com/bdi475/images/blob/main/pandas/notna-series.png?raw=true)

# In[11]:

### BEGIN SOLUTION
df_major2 = df_people[df_people["major2"].notna()]
### END SOLUTION

df_major2


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[12]:

# df_people should remain unaltered
pd.testing.assert_frame_equal(df_people, df_people_backup)

pd.testing.assert_frame_equal(df_major2.sort_values(df_major2.columns.tolist()).reset_index(drop=True),
                              df_people_backup.query(f"major2 == major2")
                              .sort_values(df_people_backup.columns.tolist()).reset_index(drop=True))


# ---
# 
# ### 🎯 Challenge 3: Anyone without a favorite movie
# 
# #### 👇 Tasks
# 
# - ✔️ Using `df_people`, filter rows where the person's `fav_movie` is `NaN`.
# - ✔️ `NaN` is a special value that denotes a missing value. You must use `my_series.isna()` or `my_series.notna()` to compare a `Series` with `NaN`.
# - ✔️ Store the result to a new variable named `df_no_fav_movie`.
# - ✔️ `df_people` should remain unaltered after your code.
# 
# #### 🚀 Hints
# 
# - `my_series.isna()` can be used to check whether each row contains a missing value.

# In[13]:

### BEGIN SOLUTION
df_no_fav_movie = df_people[df_people["fav_movie"].isna()]
### END SOLUTION

df_no_fav_movie


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[14]:

# df_people should remain unaltered
pd.testing.assert_frame_equal(df_people, df_people_backup)

pd.testing.assert_frame_equal(df_no_fav_movie.sort_values(df_no_fav_movie.columns.tolist()).reset_index(drop=True),
                              df_people_backup.query(f"fav_movie != fav_movie")
                              .sort_values(df_people_backup.columns.tolist()).reset_index(drop=True))


# ---
# 
# ## 🗓️ Working with Datetime Values
# 
# You will often see date-looking strings in your data. A few examples are:
# 
# - `20210315`
# - `Mar 15, 2021`
# - `2020-03-15`
# - `2020/3/15`
# 
# In this part, we'll discuss how to *parse* and work with datetime values.

# ---
# 
# ### 📌 Load employees data
# 
# ▶️ Run the code cell below to create a new `DataFrame` named `df_emp`.
# In[15]:

# DO NOT CHANGE THE CODE IN THIS CELL
df_emp = pd.DataFrame({
    'emp_id': [30, 40, 10, 20],
    'name': ['Talal', 'Josh', 'Anika', 'Aishani'],
    'dept': ['Finance', 'Purchase', 'Finance', 'Purchase'],
    'office_phone': ['(217)123-4500', np.nan, np.nan, '(217)987-6600'],
    'start_date': ['2017-05-01', '2018-02-01', '2020-08-01', '2019-12-01'],
    'salary': [202000, 185000, 240000, 160500]
})

# Used for intermediate checks
df_emp_backup = df_emp.copy()

df_emp


# **Question**: What is the data type of the `start_date` column?
# 
# ▶️ Run `str(df_emp['start_date'].dtype)` below to see the data type of the `start_date` column.

# In[16]:

### BEGIN SOLUTION
str(df_emp['start_date'].dtype)
### END SOLUTION


# While `object` can refer to many different types, you can safely assume that all `object` data types you see in this course refer to strings.

# ---
# 
# ### 🎯 Challenge 4: Parse a string column as datetime
# 
# #### 👇 Tasks
# 
# - ✔️ Parse `start_date` to a `datetime` data type.
# - ✔️ Store the result to a new column named `start_date_parsed`.
# 
# #### 🚀 Hints
# 
# The code below converts the `date_str` column to a `datetime`-typed column and stores the converted result in a new column named `date_parsed`.
# 
# ```python
# my_dataframe['date_parsed'] = pd.to_datetime(my_dataframe['date_str'])
# ```

# In[17]:

### BEGIN SOLUTION
df_emp['start_date_parsed'] = pd.to_datetime(df_emp['start_date'])
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[18]:

tc.assertEqual(set(df_emp.columns), set(df_emp_backup.columns.tolist() + ['start_date_parsed']))

pd.testing.assert_series_equal(df_emp['start_date_parsed'].reset_index(drop=True),
                               pd.to_datetime(df_emp_backup['_'.join(['sTarT', 'DaTe']).lower()])
                               .reset_index(drop=True),
                               check_names=False)


# ---
# 
# ▶️ Run `str(df_emp['start_date_parsed'].dtype)` below to see the data type of the `start_date_parsed` column.

# In[19]:

### BEGIN SOLUTION
str(df_emp['start_date_parsed'].dtype)
### END SOLUTION


# ---
# 
# ### 🎯 Challenge 5: Drop `start_date` column *in-place*
# 
# We no longer need the `start_date` column. We'll work with the new `start_date_parsed` column from this point on.
# 
# #### 👇 Tasks
# 
# - ✔️ Drop the `start_date` column from `df_emp` *in-place*.
# 
# #### 🚀 Hints
# 
# The code below drops `col1` from `my_dataframe` *in-place* without creating a new variable.
# 
# ```python
# my_dataframe.drop(columns=['col1'], inplace=True)
# ```

# In[20]:

### BEGIN SOLUTION
df_emp.drop(columns=['start_date'], inplace=True)
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.
# In[21]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date'])

# Check result
tc.assertEqual(set(df_emp.columns), set(['start_date_parsed', 'salary', 'office_phone', 'dept', 'name', 'emp_id']))
pd.testing.assert_frame_equal(df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True),
                              df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))


# ---
# 
# ### 🎯 Challenge 6: Rename `start_date_parsed` to `start_date`
# 
# #### 👇 Tasks
# 
# - ✔️ Rename `start_date_parsed` to `start_date` in `df_emp` *in-place*.
# 
# #### 🚀 Hints
# 
# The code below renames the `name_before` column to `name_after` in `my_dataframe` *in-place* without creating a new variable.
# 
# ```python
# my_dataframe.rename(columns={'name_before': 'name_after'}, inplace=True)
# ```

# In[22]:

### BEGIN SOLUTION
df_emp.rename(columns={'start_date_parsed': 'start_date'}, inplace=True)
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[23]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date']).rename(columns={'start_date_parsed': 'start_date'})

# Check result
pd.testing.assert_frame_equal(df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True),
                              df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))


# ---
# 
# ### 🎯 Challenge 7: Extract year from a datetime column
# 
# #### 👇 Tasks
# 
# - ✔️ Create a new column named `start_year` in `df_emp` that contains the starting years as integers (e.g., `2017`, `2018`).
# - ✔️ Extract values from `df_emp['start_date']`.
# 
# #### 🚀 Hints
# 
# The code below extracts the year from a datetime column `my_date` and stores it in a new column named `my_year`.
# 
# ```python
# my_dataframe['my_year'] = my_dataframe['my_date'].dt.year
# ```

# In[24]:

### BEGIN SOLUTION
df_emp['start_year'] = df_emp['start_date'].dt.year
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[25]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date']).rename(columns={'start_date_parsed': 'start_date'})
df_check['_'.join(['sTarT', 'yEaR']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.year

df_emp_check = df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True)
df_check = df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)
cols_to_check = ['emp_id', 'start_year']

# Check result
pd.testing.assert_frame_equal(df_emp_check[cols_to_check], df_check[cols_to_check])


# ---
# 
# ### 🎯 Challenge 8: Extract month, day of month from a datetime column
# 
# #### 👇 Tasks
# 
# - ✔️ Create new columns named `start_month` and `start_day` in `df_emp` that contain the starting months and days of month as integers.
# - ✔️ Extract values from `df_emp['start_date']`.
# 
# #### 🚀 Hints
# 
# The code below extracts the month and day of month from a datetime column `my_date` and stores them in two new columns.
# 
# ```python
# my_dataframe['my_month'] = my_dataframe['my_date'].dt.month
# my_dataframe['my_day'] = my_dataframe['my_date'].dt.day
# ```

# In[26]:

### BEGIN SOLUTION
df_emp['start_month'] = df_emp['start_date'].dt.month
df_emp['start_day'] = df_emp['start_date'].dt.day
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[27]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date']).rename(columns={'start_date_parsed': 'start_date'})
df_check['_'.join(['sTarT', 'mOnTh']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.month
df_check['_'.join(['sTarT', 'dAy']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.day

df_emp_check = df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True)
df_check = df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)
cols_to_check = ['emp_id', 'start_month', 'start_day']

# Check result
pd.testing.assert_frame_equal(df_emp_check[cols_to_check], df_check[cols_to_check])


# ---
# 
# ### 🎯 Challenge 9: Extract quarter, weekday from a datetime column
# 
# #### 👇 Tasks
# 
# - ✔️ Create new columns named `start_quarter` and `start_weekday` in `df_emp` that contain the starting quarters and weekdays as integers.
# - ✔️ Extract values from `df_emp['start_date']`.
# 
# #### 🚀 Hints
# 
# The code below extracts the quarter and weekday from a datetime column `my_date` and stores them in two new columns.
# 
# ```python
# my_dataframe['my_quarter'] = my_dataframe['my_date'].dt.quarter
# my_dataframe['my_weekday'] = my_dataframe['my_date'].dt.weekday
# ```

# In[28]:

### BEGIN SOLUTION
df_emp['start_quarter'] = df_emp['start_date'].dt.quarter
df_emp['start_weekday'] = df_emp['start_date'].dt.weekday
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[29]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date']).rename(columns={'start_date_parsed': 'start_date'})
df_check['_'.join(['sTarT', 'qUarTer']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.quarter
df_check['_'.join(['sTarT', 'wEeKDaY']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.weekday

df_emp_check = df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True)
df_check = df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)
cols_to_check = ['emp_id', 'start_quarter', 'start_weekday']

# Check result
pd.testing.assert_frame_equal(df_emp_check[cols_to_check], df_check[cols_to_check])
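

# ---
# 
# #### 📌 Note: how `weekday` values are numbered
# 
# `Series.dt.weekday` returns the day of the week as an integer, where Monday is `0` and Sunday is `6`. If you want a readable label instead, `Series.dt.day_name()` returns the weekday name as a string. The cell below is a minimal, self-contained sketch (it builds a small illustrative `sample_dates` Series rather than modifying `df_emp`) showing both accessors.

# In[ ]:

# Two of the start dates used above: 2017-05-01 (a Monday) and 2019-12-01 (a Sunday)
sample_dates = pd.to_datetime(pd.Series(['2017-05-01', '2019-12-01']))

# Integer weekday: Monday=0, Tuesday=1, ..., Sunday=6
print(sample_dates.dt.weekday)

# Weekday name as a string (e.g., 'Monday', 'Sunday')
print(sample_dates.dt.day_name())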