#!/usr/bin/env python
# coding: utf-8

# # Pandas Missing Values, Datetime, Aggregation, and Merging
# 
# ## Lecture Notes and in-class exercises
# 
# ▶️ First, run the code cell below to import `unittest`, a module used for **🧭 Check Your Work** sections and the autograder.

# In[1]:

import unittest
tc = unittest.TestCase()


# #### 👇 Tasks
# 
# - ✔️ Import the following Python packages.
#     1. `pandas`: Use alias `pd`.
#     2. `numpy`: Use alias `np`.

# In[2]:

### BEGIN SOLUTION
import pandas as pd
import numpy as np
### END SOLUTION


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[3]:

import sys

tc.assertTrue('pd' in globals(), 'Check whether you have correctly imported Pandas with an alias.')
tc.assertTrue('np' in globals(), 'Check whether you have correctly imported NumPy with an alias.')


# ---
# 
# ## 👉 Working with Missing Values
# 
# A dataset can commonly contain one or more missing values. Pandas provides a flexible set of methods for working with missing values, which can be represented in several ways. Here are a few common notations that denote a missing value.
# 
# - `NaN`: stands for "not a number".
# - `None`: Python's built-in type (equivalent to `null` in other programming languages)
# - `NA`: a native scalar that denotes a missing value in `pandas`
# - `NaT`: stands for "not a time".

# In[4]:

# NaN
nan_example = np.nan
print(nan_example)


# In[5]:

# None
none_example = None
print(none_example)


# In[6]:

# NA
na_example = pd.NA
print(na_example)


# In[7]:

# NaT
nat_example = pd.NaT
print(nat_example)


# ---
# 
# ### 📌 Load data
# 
# ▶️ Run the code cell below to create a new `DataFrame` named `df_people`.

# In[8]:

# DO NOT CHANGE THE CODE IN THIS CELL
df_people = pd.read_csv("https://raw.githubusercontent.com/UI-Deloitte-business-analytics-center/datasets/main/people-sample.csv")

# Used to keep a clean copy
df_people_backup = df_people.copy()

# head() displays the first 5 rows of a DataFrame
df_people.head()


# The table below describes each column in `df_people`.
# 
# | Column Name | Description |
# |-------------|-------------|
# | name        | Full name |
# | major1      | Major |
# | major2      | Second major OR minor (blank if no second major or no minor) |
# | city        | City the person is from |
# | fav_movie   | Favorite movie (blank if no movie was given) |
# | has_iphone  | Whether the person uses an iPhone |

# ---
# 
# ### 🎯 Challenge 1: People from Grayslake
# 
# #### 👇 Tasks
# 
# - ✔️ Using `df_people`, filter rows where the person is from `"Grayslake"`.
#     - Check whether the `city` column is equal to `"Grayslake"`.
#     - Store the result to a new variable named `df_grayslake`.
# - ✔️ `df_people` should remain unaltered after your code.

# In[9]:

### BEGIN SOLUTION
df_grayslake = df_people[df_people["city"] == "Grayslake"]
### END SOLUTION

df_grayslake


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.
# In[10]:

# df_people should remain unaltered
pd.testing.assert_frame_equal(df_people, df_people_backup)

pd.testing.assert_frame_equal(df_grayslake.sort_values(df_grayslake.columns.tolist()).reset_index(drop=True),
                              df_people_backup.query(f'{"cItY".lower()} == "{"GraYsLakE".capitalize()}"')
                              .sort_values(df_people_backup.columns.tolist()).reset_index(drop=True))


# ---
# 
# ### 🎯 Challenge 2: Anyone with a non-missing `major2`
# 
# #### 👇 Tasks
# 
# - ✔️ Using `df_people`, filter rows where the person has a second major or a minor.
#     - You're looking for rows where `major2` is not `NaN`.
# - ✔️ `NaN` is a special value that denotes a missing value. You must use `my_series.isna()` or `my_series.notna()` to check whether a row contains a missing value.
# - ✔️ Store the result to a new variable named `df_major2`.
# - ✔️ `df_people` should remain unaltered after your code.
# 
# #### 🚀 Hints
# 
# - `my_series.notna()` can be used to check whether each row contains a non-missing value.
# 
# ![notna](https://github.com/bdi475/images/blob/main/pandas/notna-series.png?raw=true)

# In[11]:

### BEGIN SOLUTION
df_major2 = df_people[df_people["major2"].notna()]
### END SOLUTION

df_major2


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[12]:

# df_people should remain unaltered
pd.testing.assert_frame_equal(df_people, df_people_backup)

pd.testing.assert_frame_equal(df_major2.sort_values(df_major2.columns.tolist()).reset_index(drop=True),
                              df_people_backup.query(f"major2 == major2")
                              .sort_values(df_people_backup.columns.tolist()).reset_index(drop=True))


# ---
# 
# ### 🎯 Challenge 3: Anyone without a favorite movie
# 
# #### 👇 Tasks
# 
# - ✔️ Using `df_people`, filter rows where the person's `fav_movie` is `NaN`.
# - ✔️ `NaN` is a special value that denotes a missing value. You must use `my_series.isna()` or `my_series.notna()` to compare a `Series` with `NaN`.
# - ✔️ Store the result to a new variable named `df_no_fav_movie`.
# - ✔️ `df_people` should remain unaltered after your code.
# 
# #### 🚀 Hints
# 
# - `my_series.isna()` can be used to check whether each row contains a missing value.

# In[13]:

### BEGIN SOLUTION
df_no_fav_movie = df_people[df_people["fav_movie"].isna()]
### END SOLUTION

df_no_fav_movie


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[14]:

# df_people should remain unaltered
pd.testing.assert_frame_equal(df_people, df_people_backup)

pd.testing.assert_frame_equal(df_no_fav_movie.sort_values(df_no_fav_movie.columns.tolist()).reset_index(drop=True),
                              df_people_backup.query(f"fav_movie != fav_movie")
                              .sort_values(df_people_backup.columns.tolist()).reset_index(drop=True))


# ---
# 
# ## 🗓️ Working with Datetime Values
# 
# You will often see date-looking strings in your data. A few examples are:
# 
# - `20210315`
# - `Mar 15, 2021`
# - `2020-03-15`
# - `2020/3/15`
# 
# In this part, we'll discuss how to *parse* and work with datetime values.

# ---
# 
# ### 📌 Load employees data
# 
# ▶️ Run the code cell below to create a new `DataFrame` named `df_emp`.
# In[15]:

# DO NOT CHANGE THE CODE IN THIS CELL
df_emp = pd.DataFrame({
    'emp_id': [30, 40, 10, 20],
    'name': ['Talal', 'Josh', 'Anika', 'Aishani'],
    'dept': ['Finance', 'Purchase', 'Finance', 'Purchase'],
    'office_phone': ['(217)123-4500', np.nan, np.nan, '(217)987-6600'],
    'start_date': ['2017-05-01', '2018-02-01', '2020-08-01', '2019-12-01'],
    'salary': [202000, 185000, 240000, 160500]
})

# Used for intermediate checks
df_emp_backup = df_emp.copy()

df_emp


# **Question**: What is the data type of the `start_date` column?
# 
# ▶️ Run `str(df_emp['start_date'].dtype)` below to see the data type of the `start_date` column.

# In[16]:

### BEGIN SOLUTION
str(df_emp['start_date'].dtype)
### END SOLUTION


# While `object` can refer to many different types, you can safely assume that all `object` data types you see in this course refer to strings.

# ---
# 
# ### 🎯 Challenge 4: Parse a string column as datetime
# 
# #### 👇 Tasks
# 
# - ✔️ Parse `start_date` to a `datetime` data type.
# - ✔️ Store the result to a new column named `start_date_parsed`.
# 
# #### 🚀 Hints
# 
# The code below converts the `date_str` column to a `datetime`-typed column and stores the converted result in a new column named `date_parsed`.
# 
# ```python
# my_dataframe['date_parsed'] = pd.to_datetime(my_dataframe['date_str'])
# ```

# In[17]:

### BEGIN SOLUTION
df_emp['start_date_parsed'] = pd.to_datetime(df_emp['start_date'])
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[18]:

tc.assertEqual(set(df_emp.columns), set(df_emp_backup.columns.tolist() + ['start_date_parsed']))

pd.testing.assert_series_equal(df_emp['start_date_parsed'].reset_index(drop=True),
                               pd.to_datetime(df_emp_backup['_'.join(['sTarT', 'DaTe']).lower()])
                               .reset_index(drop=True),
                               check_names=False)


# ---
# 
# ▶️ Run `str(df_emp['start_date_parsed'].dtype)` below to see the data type of the `start_date_parsed` column.

# In[19]:

### BEGIN SOLUTION
str(df_emp['start_date_parsed'].dtype)
### END SOLUTION


# ---
# 
# ### 🎯 Challenge 5: Drop `start_date` column *in-place*
# 
# We no longer need the `start_date` column. We'll work with the new `start_date_parsed` column from this point on.
# 
# #### 👇 Tasks
# 
# - ✔️ Drop the `start_date` column from `df_emp` *in-place*.
# 
# #### 🚀 Hints
# 
# The code below drops `col1` from `my_dataframe` *in-place* without creating a new variable.
# 
# ```python
# my_dataframe.drop(columns=['col1'], inplace=True)
# ```

# In[20]:

### BEGIN SOLUTION
df_emp.drop(columns=['start_date'], inplace=True)
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.
# In[21]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date'])

# Check result
tc.assertEqual(set(df_emp.columns), set(['start_date_parsed', 'salary', 'office_phone', 'dept', 'name', 'emp_id']))
pd.testing.assert_frame_equal(df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True),
                              df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))


# ---
# 
# ### 🎯 Challenge 6: Rename `start_date_parsed` to `start_date`
# 
# #### 👇 Tasks
# 
# - ✔️ Rename `start_date_parsed` to `start_date` in `df_emp` *in-place*.
# 
# #### 🚀 Hints
# 
# The code below renames the `name_before` column to `name_after` in `my_dataframe` *in-place* without creating a new variable.
# 
# ```python
# my_dataframe.rename(columns={'name_before': 'name_after'}, inplace=True)
# ```

# In[22]:

### BEGIN SOLUTION
df_emp.rename(columns={'start_date_parsed': 'start_date'}, inplace=True)
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[23]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date']).rename(columns={'start_date_parsed': 'start_date'})

# Check result
pd.testing.assert_frame_equal(df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True),
                              df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))


# ---
# 
# ### 🎯 Challenge 7: Extract year from a datetime column
# 
# #### 👇 Tasks
# 
# - ✔️ Create a new column named `start_year` in `df_emp` that contains the starting years as integers (e.g., `2017`, `2018`).
# - ✔️ Extract values from `df_emp['start_date']`.
# 
# #### 🚀 Hints
# 
# The code below extracts the year from a datetime column `my_date` and stores it in a new column named `my_year`.
# 
# ```python
# my_dataframe['my_year'] = my_dataframe['my_date'].dt.year
# ```

# In[24]:

### BEGIN SOLUTION
df_emp['start_year'] = df_emp['start_date'].dt.year
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[25]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date']).rename(columns={'start_date_parsed': 'start_date'})
df_check['_'.join(['sTarT', 'yEaR']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.year

df_emp_check = df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True)
df_check = df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)
cols_to_check = ['emp_id', 'start_year']

# Check result
pd.testing.assert_frame_equal(df_emp_check[cols_to_check], df_check[cols_to_check])


# ---
# 
# ### 🎯 Challenge 8: Extract month, day of month from a datetime column
# 
# #### 👇 Tasks
# 
# - ✔️ Create new columns named `start_month` and `start_day` in `df_emp` that contain the starting months and days of month as integers.
# - ✔️ Extract values from `df_emp['start_date']`.
# 
# #### 🚀 Hints
# 
# The code below extracts the month and day of month from a datetime column `my_date` and stores them in two new columns.
# 
# ```python
# my_dataframe['my_month'] = my_dataframe['my_date'].dt.month
# my_dataframe['my_day'] = my_dataframe['my_date'].dt.day
# ```

# In[26]:

### BEGIN SOLUTION
df_emp['start_month'] = df_emp['start_date'].dt.month
df_emp['start_day'] = df_emp['start_date'].dt.day
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[27]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date']).rename(columns={'start_date_parsed': 'start_date'})
df_check['_'.join(['sTarT', 'mOnTh']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.month
df_check['_'.join(['sTarT', 'dAy']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.day

df_emp_check = df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True)
df_check = df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)
cols_to_check = ['emp_id', 'start_month', 'start_day']

# Check result
pd.testing.assert_frame_equal(df_emp_check[cols_to_check], df_check[cols_to_check])


# ---
# 
# ### 🎯 Challenge 9: Extract quarter, weekday from a datetime column
# 
# #### 👇 Tasks
# 
# - ✔️ Create new columns named `start_quarter` and `start_weekday` in `df_emp` that contain the starting quarters and weekdays as integers.
# - ✔️ Extract values from `df_emp['start_date']`.
# 
# #### 🚀 Hints
# 
# The code below extracts the quarter and weekday from a datetime column `my_date` and stores them in two new columns.
# 
# ```python
# my_dataframe['my_quarter'] = my_dataframe['my_date'].dt.quarter
# my_dataframe['my_weekday'] = my_dataframe['my_date'].dt.weekday
# ```

# In[28]:

### BEGIN SOLUTION
df_emp['start_quarter'] = df_emp['start_date'].dt.quarter
df_emp['start_weekday'] = df_emp['start_date'].dt.weekday
### END SOLUTION

df_emp


# #### 🧭 Check Your Work
# 
# - Once you're done, run the code cell below to test correctness.
# - ✔️ If the code cell runs without an error, you're good to move on.
# - ❌ If the code cell throws an error, go back and fix incorrect parts.

# In[29]:

df_check = df_emp_backup.copy()
df_check['_'.join(['sTarT', 'DaTe', 'pArSeD']).lower()] = pd.to_datetime(df_check['start_date'])
df_check = df_check.drop(columns=['start_date']).rename(columns={'start_date_parsed': 'start_date'})
df_check['_'.join(['sTarT', 'qUarTer']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.quarter
df_check['_'.join(['sTarT', 'wEeKDaY']).lower()] = df_check['_'.join(['sTarT', 'dAtE']).lower()].dt.weekday

df_emp_check = df_emp.sort_values(df_emp.columns.tolist()).reset_index(drop=True)
df_check = df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)
cols_to_check = ['emp_id', 'start_quarter', 'start_weekday']

# Check result
pd.testing.assert_frame_equal(df_emp_check[cols_to_check], df_check[cols_to_check])
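

# ---
# 
# #### 📌 Note: how `weekday` values are numbered
# 
# `Series.dt.weekday` returns the day of the week as an integer, where Monday is `0` and Sunday is `6`. If you want a readable label instead, `Series.dt.day_name()` returns the weekday name as a string. The cell below is a minimal, self-contained sketch (it builds a small illustrative `sample_dates` Series rather than modifying `df_emp`) showing both accessors.

# In[ ]:

# Two of the start dates used above: 2017-05-01 (a Monday) and 2019-12-01 (a Sunday)
sample_dates = pd.to_datetime(pd.Series(['2017-05-01', '2019-12-01']))

# Integer weekday: Monday=0, Tuesday=1, ..., Sunday=6
print(sample_dates.dt.weekday)

# Weekday name as a string (e.g., 'Monday', 'Sunday')
print(sample_dates.dt.day_name())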