#!/usr/bin/env python # coding: utf-8 # # Pandas Update Cells, More Exercises # ## Lecture Notes and in-class exercises # ▶️ First, run the code cell below to import `unittest`, a module used for **🧭 Check Your Work** sections and the autograder. # In[1]: import unittest tc = unittest.TestCase() # #### 👇 Tasks # # - ✔️ Import the following Python packages. # 1. `pandas`: Use alias `pd`. # 2. `numpy`: Use alias `np`. # In[2]: ### BEGIN SOLUTION import pandas as pd import numpy as np ### END SOLUTION # #### 🧭 Check your work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix incorrect parts. # In[3]: import sys tc.assertTrue('pd' in globals(), 'Check whether you have correctly import Pandas with an alias.') tc.assertTrue('np' in globals(), 'Check whether you have correctly import NumPy with an alias.') # --- # # ### 📌 Load companies data # # For the first part, we're going to work with a small DataFrame to see how we can read and/or update individual cells in a DataFrame. # # ▶️ Run the code cell below to create `df_companies`. # In[4]: df_companies = pd.DataFrame({ 'company_name': ['Amazon', 'Nvidia', 'Google', 'Microsoft', 'Adobe'], 'ticker': ['AMZN', 'AMD', np.nan, np.nan, 'ADBE'], 'headquarters': ['Seattle', 'Santa Clara', 'Champaign', 'Redmond', 'San Jose'] }) df_companies # --- # # ### 🎯 Exercise 1: Update Nvidia's ticker # # #### 👇 Tasks # # - ✔️ Nvidia's ticker is incorrectly listed as `'AMD'`. # - ✔️ Replace `'AMD'` with `'NVDA'`. # - ✔️ Directly update `df_companies`. # # #### 🚀 Sample Code # # ```python # my_dataframe.loc[my_dataframe['column1'] == 'John Doe Co', 'column2'] = 'new value' # ``` # In[5]: ### BEGIN SOLUTION df_companies.loc[df_companies['company_name'] == 'Nvidia', 'ticker'] = 'NVDA' ### END SOLUTION df_companies # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix incorrect parts. # In[6]: # DO NOT CHANGE THE CODE IN THIS CELL tc.assertEqual( df_companies.loc[df_companies['company_name'] == 'Nvidia', 'ticker'].iloc[0], 'NVDA' ) # --- # # ### 🎯 Exercise 2: Update Google and Microsoft's tickers # # #### 👇 Tasks # # - ✔️ Both Google and Microsoft's tickers are missing. # - ✔️ Set Google's ticker to `'GOOG'`. # - ✔️ Set Microsoft's ticker to `'MSFT'`. # - ✔️ Directly update `df_companies`. # # #### 🚀 Sample Code # # ```python # my_dataframe.loc[my_dataframe['column1'] == 'John Doe Co', 'column2'] = 'new value' # ``` # In[7]: ### BEGIN SOLUTION df_companies.loc[df_companies['company_name'] == 'Google', 'ticker'] = 'GOOG' df_companies.loc[df_companies['company_name'] == 'Microsoft', 'ticker'] = 'MSFT' ### END SOLUTION df_companies # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix incorrect parts. # In[8]: # DO NOT CHANGE THE CODE IN THIS CELL tc.assertEqual( df_companies.loc[df_companies['company_name'] == 'Google', 'ticker'].iloc[0], 'GOOG' ) tc.assertEqual( df_companies.loc[df_companies['company_name'] == 'Microsoft', 'ticker'].iloc[0], 'MSFT' ) # --- # # ### 🎯 Exercise 3: Update Google's headquarters location # # #### 👇 Tasks # # - ✔️ Google's headquarters is listed as Champaign. # - ✔️ Set Google's headquarters to `'Mountain View'`. # - ✔️ Directly update `df_companies`. # In[9]: ### BEGIN SOLUTION df_companies.loc[df_companies['company_name'] == 'Google', 'headquarters'] = 'Mountain View' ### END SOLUTION df_companies # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix incorrect parts. # In[10]: # DO NOT CHANGE THE CODE IN THIS CELL df_companies_CHECK = pd.DataFrame({'company_name': ['Amazon', 'Nvidia', 'Google', 'Microsoft', 'Adobe'], 'ticker': ['AMZN', 'NVDA', 'GOOG', 'MSFT', 'ADBE'], 'headquarters': ['Seattle', 'Santa Clara', 'Mountain View', 'Redmond', 'San Jose']}) pd.testing.assert_frame_equal( df_companies.reset_index(drop=True), df_companies_CHECK.reset_index(drop=True) ) # --- # # ### 🎯 Exercise 4: Retrieve Amazon's headquarters location # # #### 👇 Tasks # # - ✔️ Retrieve Amazon's headquarters location. # - ✔️ Store the result to a new variable named `amazon_headquarters`. # # #### 🚀 Sample Code # # ```python # retrieved_value = my_dataframe.loc[my_dataframe['column1'] == 'Amazon', 'column2'].iloc[0] # ``` # # #### 🔑 Expected Output # # ``` # Amazon's headquarters is in Seattle. # ``` # In[11]: ### BEGIN SOLUTION amazon_headquarters = df_companies.loc[df_companies['company_name'] == 'Amazon', 'headquarters'].iloc[0] ### END SOLUTION print(f"Amazon's headquarters is in {amazon_headquarters}.") # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix incorrect parts. # In[12]: # DO NOT CHANGE THE CODE IN THIS CELL tc.assertEqual(amazon_headquarters, 'Seattle') # --- # # ### 🎯 Exercise 5: Retrieve Adobe's ticker # # #### 👇 Tasks # # - ✔️ Retrieve Adobe's ticker. # - ✔️ Store the result to a new variable named `adobe_ticker`. # # #### 🚀 Sample Code # # ```python # retrieved_value = my_dataframe.loc[my_dataframe['column1'] == 'Adobe', 'column2'].iloc[0] # ``` # # #### 🔑 Expected Output # # ``` # Adobe's ticker is ADBE. # ``` # In[13]: ### BEGIN SOLUTION adobe_ticker = df_companies.loc[df_companies['company_name'] == 'Adobe', 'ticker'].iloc[0] ### END SOLUTION print(f"Adobe's ticker is {adobe_ticker}.") # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix incorrect parts. # In[14]: # DO NOT CHANGE THE CODE IN THIS CELL tc.assertEqual(adobe_ticker, 'ADBE') # --- # # ### 📌 Load data # # Now, you'll work with the Starbucks Customer Rewards Program dataset. ☕ The dataset was originally released as a part of a capstone project of a Udacity course. # # For the purpose of this course, the original dataset in JSON format has been transformed to CSV files for convenience. You can download the original JSON files [here](https://www.kaggle.com/blacktile/starbucks-app-customer-reward-program-data?select=portfolio.json). # # ▶️ Run the code cell below to read the transcript data. # In[15]: # DO NOT CHANGE THE CODE IN THIS CELL df_transcript = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/transcript.v2.csv.gz') df_transcript_backup = df_transcript.copy() display(df_transcript) # ▶️ Run the code cell below to read the customer profiles data. # In[16]: # DO NOT CHANGE THE CODE IN THIS CELL df_profiles = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/profile.csv') df_profiles_backup = df_profiles.copy() display(df_profiles) # ▶️ Run the code cell below to read the offers data. # In[17]: # DO NOT CHANGE THE CODE IN THIS CELL df_offers = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/portfolio.csv') df_offers = df_offers.copy() display(df_offers) # --- # # ### 🎯 Exercise 6: Find all `event` types in `df_transcript` # # #### 👇 Tasks # # - ✔️ Find the unique values in the `event` column as a Series. # - ✔️ A fully-working code is provided below. # # #### 🚀 Code # # ```python # event_types = df_transcript['event'].unique() # # print(event_types) # ``` # In[18]: ### BEGIN SOLUTION event_types = df_transcript['event'].unique() print(event_types) ### END SOLUTION # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix incorrect parts. # In[19]: # DO NOT CHANGE THE CODE IN THIS CELL tc.assertEqual(set(event_types), set(df_transcript_backup['event'].unique())) # --- # # ### 🎯 Exercise 7: Find all completed offers # # #### 👇 Tasks # # - ✔️ In `df_transcript` find all rows of completed offers (where the `event` column's value is `'offer completed'`). # - ✔️ Store the filtered result to a new DataFrame named `df_completed`. # - ✔️ `df_transcript` should remain unaltered. # In[20]: ### BEGIN SOLUTION df_completed = df_transcript[df_transcript['event'] == 'offer completed'] ### END SOLUTION df_completed # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix incorrect parts. # In[21]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = df_transcript.query("event == 'offer completed'") pd.testing.assert_frame_equal(df_transcript, df_transcript_backup) pd.testing.assert_frame_equal(df_completed.sort_values(df_completed.columns.tolist()).reset_index(drop=True), df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)) # --- # # ### 🎯 Exercise 8: Drop unused columns # # ▶️ Before proceeding, run the code cell below. # In[22]: # DO NOT CHANGE THE CODE BELOW df_completed = df_completed.copy() # #### 👇 Tasks # # - ✔️ Drop `event`, `time`, and `amount` columns from `df_completed` **in-place**. # - ✔️ You should directly drop the columns from `df_completed` using the `inplace=True` option. # # #### 🚀 Hints # # Use the following code as a reference. # # ```python # my_dataframe.drop(columns=["my_column1", "my_column2"], inplace=True) # ``` # # #### 🔑 Expected Output # # | | member_id | offer_id | # |------:|:---------------------------------|:---------------------------------| # | 12658 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 | # | 12672 | fe97aa22dd3e48c8b143116a8403dd52 | fafdcd668e3743c1bb461111dcafc2a4 | # | 12679 | 629fc02d56414d91bca360decdfa9288 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | # In[23]: ### BEGIN SOLUTION df_completed.drop(columns=["event", "time", "amount"], inplace=True) ### END SOLUTION df_completed.head(3) # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix any incorrect parts. # In[24]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = df_transcript.query("event == 'offer completed'")[['member_id', 'offer_id']] pd.testing.assert_frame_equal(df_completed.sort_values(df_completed.columns.tolist()).reset_index(drop=True), df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)) # --- # # ### 🎯 Exercise 9: Remove rows with missing values # # #### 👇 Tasks # # - ✔️ Update `df_profiles` so that it only contains rows where `gender` is NOT missing. # # #### 🚀 Hints # # ```python # df = df[df['some_column'].notna()] # ``` # # #### 🔑 Expected Output # # | | member_id | gender | age | became_member_on | income | # |---:|---:|---:|---:|---:|---:| # | 1 | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 | # | 3 | 78afa995795e4d85b5d9ceeca43f5fef | F | 75.0 | 2017-05-09 | 100000.0 | # | 5 | e2127556f4f64592b11af22de27a7932 | M | 68.0 | 2018-04-26 | 70000.0 | # | 8 | 389bc3fa690240e798340f5a15918d5c | M | 65.0 | 2018-02-09 | 53000.0 | # | 12 | 2eeac8d8feae4a8cad5a6af0499a211d | M | 58.0 | 2017-11-11 | 51000.0 | # In[25]: ### BEGIN SOLUTION df_profiles = df_profiles[df_profiles['gender'].notna()] ### END SOLUTION df_profiles.head(5) # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix any incorrect parts. # In[26]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = df_profiles_backup.query('gender == gender') pd.testing.assert_frame_equal(df_profiles.sort_values(df_profiles.columns.to_list()).reset_index(drop=True), df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)) # --- # # ### 🎯 Exercise 10: Merge profiles into `df_completed` # # ▶️ Run the code below to: # 1. Convert `age` and `income` columns in `df_profiles` to integer types (from float types). # 2. Print out the first three rows of `df_transactions` and `df_profiles`. # In[27]: # Convert age and income to int64 types df_profiles = df_profiles.copy() df_profiles['age'] = df_profiles['age'].astype(np.int64) df_profiles['income'] = df_profiles['income'].astype(np.int64) display(df_completed.head(3)) display(df_profiles.head(3)) # #### 👇 Tasks # # - ✔️ Merge `df_completed` and `df_profiles` using an **inner** join type. # - By using an inner join, we are intentionally removing transactions made by customers with missing gender information. # - ✔️ Place `df_completed` on the "left" side. # - ✔️ Store the merged DataFrame to `df_merged`. # # #### 🚀 Hints # # ```python # my_merged_dataframe = pd.merge( # left=left_dataframe, # right=right_dataframe, # on='shared_key_column', # how='inner' # ) # ``` # # #### 🔑 Expected Output # # | | member_id | offer_id | gender | age | became_member_on | income | # |---:|:---------------------------------|:---------------------------------|:---------|------:|:-------------------|---------:| # | 0 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 | M | 42 | 2016-01-17 | 96000 | # | 1 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2298d6c36e964ae4a3e7e9706d1fb8c2 | M | 42 | 2016-01-17 | 96000 | # | 2 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | M | 42 | 2016-01-17 | 96000 | # In[28]: ### BEGIN SOLUTION df_merged = pd.merge( left=df_completed, right=df_profiles, on='member_id', how='inner' ) ### END SOLUTION df_merged.head(3) # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix any incorrect parts. # In[29]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = df_transcript_backup.query('event == "offer completed"').merge( df_profiles_backup.query('gender == gender'), on='member_id', how='inner' ).drop(columns=['event', 'time', 'amount']) df_check['age'] = df_check['age'].astype(np.int64) df_check['income'] = df_check['income'].astype(np.int64) df_merged_backup1 = df_check.copy() pd.testing.assert_frame_equal(df_merged.sort_values(df_merged.columns.to_list()).reset_index(drop=True), df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)) # --- # # ### 🎯 Exercise 11: Merge offers into `df_merged` # #### 👇 Tasks # # - ✔️ Merge `df_offers` and `df_merged` using an **inner** join type. # - ✔️ Place `df_merged` on the "left" side. # - ✔️ Store the merged DataFrame to `df_completed_offers`. # # #### 🚀 Hints # # ```python # my_merged_dataframe = pd.merge( # left=left_dataframe, # right=right_dataframe, # on='shared_key_column', # how='inner' # ) # ``` # # #### 🔑 Expected Output # # | | member_id | offer_id | gender | age | became_member_on | income | reward | difficulty | duration | offer_type | email | mobile | social | web | # |---:|:---------------------------------|:---------------------------------|:---------|------:|:-------------------|---------:|---------:|-------------:|-----------:|:-------------|--------:|---------:|---------:|------:| # | 0 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 | M | 42 | 2016-01-17 | 96000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 | # | 1 | 629fc02d56414d91bca360decdfa9288 | 2906b810c7d4411798c6938adc9daaa5 | M | 52 | 2018-06-05 | 72000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 | # | 2 | 73ffefd41e9a4ca3ab26b2b3697c6eb7 | 2906b810c7d4411798c6938adc9daaa5 | F | 67 | 2015-09-29 | 67000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 | # In[30]: ### BEGIN SOLUTION df_completed_offers = pd.merge( left=df_merged, right=df_offers, on='offer_id', how='inner' ) ### END SOLUTION df_completed_offers.head(3) # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix any incorrect parts. # In[31]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = df_merged_backup1.merge( df_offers, on='offer_id', how='inner' ) df_merged_backup2 = df_check.copy() pd.testing.assert_frame_equal(df_completed_offers.sort_values(df_completed_offers.columns.to_list()).reset_index(drop=True), df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True)) # --- # # ### 📌 Challenge Day # # - For the rest of this lecture, your challenge is to get through these challenges by yourself! 🏀 # - At the end of the class, we'll go over them together. # - If we're short on time, I will post the solutions to these challenges # --- # # ### 🎯 Exercise 12: Find how many times each offer has been redeemed # # #### 👇 Tasks # # - ✔️ Using `df_completed_offers`, find the number of times each offer has been redeemed. # - ✔️ Store the summary DataFrame to `df_count_by_offer_id`. # - ✔️ `df_count_by_offer_id` should have two non-index columns. # - `df_count_by_offer_id.columns.to_list()` should print out `['offer_id', 'count']`. # - ✔️ Sort `df_count_by_offer_id` by `count` in descending order. # - ✔️ `df_completed_offers` should remain unaltered. # # # #### 🔑 Expected Output # # Your index column may contain different values. # # | | offer_id | count | # |---:|:---------------------------------|--------:| # | 7 | fafdcd668e3743c1bb461111dcafc2a4 | 5003 | # | 1 | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 4886 | # | 4 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | 4188 | # | 6 | f19421c1d4aa40978ebb69ca19b0e20d | 4103 | # | 2 | 2906b810c7d4411798c6938adc9daaa5 | 3911 | # | 5 | ae264e3637204a6fb9bb56bc8210ddfd | 3657 | # | 0 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | 3386 | # | 3 | 4d5c57ea9a6940dd891ad53e9dbe8da0 | 3310 | # In[32]: ### BEGIN SOLUTION df_count_by_offer_id = df_completed_offers.groupby('offer_id', as_index=False).agg({ 'member_id': 'count' }).rename(columns={ 'member_id': 'count' }).sort_values('count', ascending=False) ### END SOLUTION df_count_by_offer_id # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix any incorrect parts. # In[33]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = pd.DataFrame({'offer_id': {7: 'fafdcd668e3743c1bb461111dcafc2a4', 1: '2298d6c36e964ae4a3e7e9706d1fb8c2', 4: '9b98b8c7a33c4b65b9aebfe6a799e6d9', 6: 'f19421c1d4aa40978ebb69ca19b0e20d', 2: '2906b810c7d4411798c6938adc9daaa5', 5: 'ae264e3637204a6fb9bb56bc8210ddfd', 0: '0b1e1539f2cc45b7b9fa7c272da2e1d7', 3: '4d5c57ea9a6940dd891ad53e9dbe8da0'}, 'count': {7: 5003, 1: 4886, 4: 4188, 6: 4103, 2: 3911, 5: 3657, 0: 3386, 3: 3310}}) pd.testing.assert_frame_equal(df_count_by_offer_id.reset_index(drop=True), df_check.reset_index(drop=True)) # --- # # ### 🎯 Exercise 13: Find how many times each type of offer has been received # # #### 👇 Tasks # # - ✔️ Using `df_completed_offers`, find the number of times each offer **type** has been redeemed. # - ✔️ Store the summary DataFrame to `df_count_by_offer_type`. # - ✔️ `df_count_by_offer_type` should have two non-index columns. # - `df_count_by_offer_type.columns.to_list()` should print out `['offer_type', 'count']`. # - ✔️ Sort `df_count_by_offer_type` by `count` in descending order. # - ✔️ `df_completed_offers` should remain unaltered. # # # #### 🔑 Expected Output # # Your index column may contain different values. # # | | offer_type | count | # |---:|:-------------|--------:| # | 1 | discount | 17186 | # | 0 | bogo | 15258 | # In[34]: ### BEGIN SOLUTION df_count_by_offer_type = df_completed_offers.groupby('offer_type', as_index=False).agg({ 'member_id': 'count' }).rename(columns={ 'member_id': 'count' }).sort_values('count', ascending=False) ### END SOLUTION df_count_by_offer_type # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix any incorrect parts. # In[35]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = pd.DataFrame({'offer_type': {1: 'discount', 0: 'bogo'}, 'count': {1: 17186, 0: 15258}}) pd.testing.assert_frame_equal(df_count_by_offer_type.reset_index(drop=True), df_check.reset_index(drop=True)) # --- # # ### 🎯 Exercise 14: Find redeem count by gender # # #### 👇 Tasks # # - ✔️ Using `df_completed_offers`, find the number of times each gender has redeemed an offer. # - ✔️ Store the summary DataFrame to `df_count_by_gender`. # - ✔️ `df_count_by_offer_type` should have two non-index columns. # - `df_count_by_gender.columns.to_list()` should print out `['gender', 'count']`. # - ✔️ Sort `df_count_by_gender` by `count` in descending order. # - ✔️ `df_completed_offers` should remain unaltered. # # # #### 🔑 Expected Output # # Your index column may contain different values. # # | | gender | count | # |---:|:---------|--------:| # | 1 | M | 16466 | # | 0 | F | 15477 | # | 2 | O | 501 | # In[36]: ### BEGIN SOLUTION df_count_by_gender = df_completed_offers.groupby('gender', as_index=False).agg({ 'member_id': 'count' }).rename(columns={ 'member_id': 'count' }).sort_values('count', ascending=False) ### END SOLUTION df_count_by_gender # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix any incorrect parts. # In[37]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = pd.DataFrame({'gender': {1: 'M', 0: 'F', 2: 'O'}, 'count': {1: 16466, 0: 15477, 2: 501}}) pd.testing.assert_frame_equal(df_count_by_gender.reset_index(drop=True), df_check.reset_index(drop=True)) # --- # # ### 🎯 Exercise 15: Find redeem count by member sign-up year # # #### 👇 Tasks # # - ✔️ Using `df_completed_offers`, first find the member sign-up year using `became_member_on` column. # - ✔️ Then, find the number of completed offers by the sign-up year. # - ✔️ Store the summary DataFrame to `df_count_by_member_signup_year`. # - ✔️ `df_count_by_member_signup_year` should have two non-index columns. # - `df_count_by_member_signup_year.columns.to_list()` should print out `['member_since', 'count']`. # - ✔️ Sort `df_count_by_member_signup_year` by `year` in ascending order. # - ✔️ **You can modify `df_completed_offers` if you need to (e.g., convert a column to datetime format, extract year, etc).** # # # #### 🔑 Expected Output # # Your index column may contain different values. # # | | member_since | count | # |---:|---------------:|--------:| # | 0 | 2013 | 561 | # | 1 | 2014 | 1321 | # | 2 | 2015 | 4319 | # | 3 | 2016 | 8928 | # | 4 | 2017 | 12119 | # | 5 | 2018 | 5196 | # In[38]: ### BEGIN SOLUTION df_completed_offers['became_member_on'] = pd.to_datetime(df_completed_offers['became_member_on']) df_completed_offers['member_since'] = df_completed_offers['became_member_on'].dt.year df_count_by_member_signup_year = df_completed_offers.groupby('member_since', as_index=False).agg({ 'member_id': 'count' }).rename(columns={ 'member_id': 'count' }) ### END SOLUTION df_count_by_member_signup_year # #### 🧭 Check Your Work # # - Once you're done, run the code cell below to test correctness. # - ✔️ If the code cell runs without an error, you're good to move on. # - ❌ If the code cell throws an error, go back and fix any incorrect parts. # In[39]: # DO NOT CHANGE THE CODE IN THIS CELL df_check = pd.DataFrame({'member_since': {0: 2013, 1: 2014, 2: 2015, 3: 2016, 4: 2017, 5: 2018}, 'count': {0: 561, 1: 1321, 2: 4319, 3: 8928, 4: 12119, 5: 5196}}) pd.testing.assert_frame_equal(df_count_by_member_signup_year.reset_index(drop=True), df_check.reset_index(drop=True))