▶️ First, run the code cell below to import `unittest`, a module used for the 🧭 Check Your Work sections and the autograder.
import unittest

# Shared TestCase instance; its assert* methods power the 🧭 Check Your Work
# cells throughout this notebook.
tc = unittest.TestCase()
Import the following packages using the aliases shown:

- `pandas`: use alias `pd`
- `numpy`: use alias `np`

### BEGIN SOLUTION
import pandas as pd
import numpy as np
### END SOLUTION
import sys

# Verify that the aliases pd/np exist at module scope.
# Fix: the failure messages read "correctly import Pandas/NumPy" — corrected
# the grammar to "correctly imported" (user-facing string fix).
tc.assertTrue('pd' in globals(), 'Check whether you have correctly imported Pandas with an alias.')
tc.assertTrue('np' in globals(), 'Check whether you have correctly imported NumPy with an alias.')
For the first part, we're going to work with a small DataFrame to see how we can read and/or update individual cells in a DataFrame.

▶️ Run the code cell below to create `df_companies`.
# Build a small demo DataFrame of tech companies.
# Note: Nvidia's ticker is intentionally wrong ('AMD') and two tickers are
# intentionally missing (NaN) — later exercises fix these cells.
company_records = {
    'company_name': ['Amazon', 'Nvidia', 'Google', 'Microsoft', 'Adobe'],
    'ticker': ['AMZN', 'AMD', np.nan, np.nan, 'ADBE'],
    'headquarters': ['Seattle', 'Santa Clara', 'Champaign', 'Redmond', 'San Jose'],
}
df_companies = pd.DataFrame(company_records)
df_companies
company_name | ticker | headquarters | |
---|---|---|---|
0 | Amazon | AMZN | Seattle |
1 | Nvidia | AMD | Santa Clara |
2 | Google | NaN | Champaign |
3 | Microsoft | NaN | Redmond |
4 | Adobe | ADBE | San Jose |
### BEGIN SOLUTION
# Locate the row whose company_name is 'Nvidia' and overwrite its
# (incorrect) ticker value with 'NVDA'.
is_nvidia = df_companies['company_name'] == 'Nvidia'
df_companies.loc[is_nvidia, 'ticker'] = 'NVDA'
### END SOLUTION
df_companies
company_name | ticker | headquarters | |
---|---|---|---|
0 | Amazon | AMZN | Seattle |
1 | Nvidia | NVDA | Santa Clara |
2 | Google | NaN | Champaign |
3 | Microsoft | NaN | Redmond |
4 | Adobe | ADBE | San Jose |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: verify Nvidia's ticker was updated to 'NVDA'.
tc.assertEqual(
df_companies.loc[df_companies['company_name'] == 'Nvidia', 'ticker'].iloc[0],
'NVDA'
)
Update `df_companies` so that Google's ticker is `'GOOG'` and Microsoft's ticker is `'MSFT'`. Use the following code as a reference:

my_dataframe.loc[my_dataframe['column1'] == 'John Doe Co', 'column2'] = 'new value'
### BEGIN SOLUTION
# Fill in the two missing tickers by matching on company_name.
for company, symbol in [('Google', 'GOOG'), ('Microsoft', 'MSFT')]:
    df_companies.loc[df_companies['company_name'] == company, 'ticker'] = symbol
### END SOLUTION
df_companies
company_name | ticker | headquarters | |
---|---|---|---|
0 | Amazon | AMZN | Seattle |
1 | Nvidia | NVDA | Santa Clara |
2 | Google | GOOG | Champaign |
3 | Microsoft | MSFT | Redmond |
4 | Adobe | ADBE | San Jose |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: verify both previously-missing tickers were filled in.
tc.assertEqual(
df_companies.loc[df_companies['company_name'] == 'Google', 'ticker'].iloc[0],
'GOOG'
)
tc.assertEqual(
df_companies.loc[df_companies['company_name'] == 'Microsoft', 'ticker'].iloc[0],
'MSFT'
)
### BEGIN SOLUTION
# Google's headquarters is Mountain View, not Champaign — correct it.
google_rows = df_companies['company_name'] == 'Google'
df_companies.loc[google_rows, 'headquarters'] = 'Mountain View'
### END SOLUTION
df_companies
company_name | ticker | headquarters | |
---|---|---|---|
0 | Amazon | AMZN | Seattle |
1 | Nvidia | NVDA | Santa Clara |
2 | Google | GOOG | Mountain View |
3 | Microsoft | MSFT | Redmond |
4 | Adobe | ADBE | San Jose |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: compare the entire DataFrame against the expected final state
# (index values are ignored via reset_index).
df_companies_CHECK = pd.DataFrame({'company_name': ['Amazon', 'Nvidia', 'Google', 'Microsoft', 'Adobe'],
'ticker': ['AMZN', 'NVDA', 'GOOG', 'MSFT', 'ADBE'], 'headquarters': ['Seattle', 'Santa Clara',
'Mountain View', 'Redmond', 'San Jose']})
pd.testing.assert_frame_equal(
df_companies.reset_index(drop=True),
df_companies_CHECK.reset_index(drop=True)
)
Retrieve Amazon's headquarters from `df_companies` and store it to a new variable named `amazon_headquarters`. Use the following code as a reference:

retrieved_value = my_dataframe.loc[my_dataframe['column1'] == 'Amazon', 'column2'].iloc[0]

Expected output: Amazon's headquarters is in Seattle.
### BEGIN SOLUTION
# Select the 'headquarters' cell of the Amazon row, then pull out the
# scalar value with .iloc[0].
amazon_rows = df_companies['company_name'] == 'Amazon'
amazon_headquarters = df_companies.loc[amazon_rows, 'headquarters'].iloc[0]
### END SOLUTION
print(f"Amazon's headquarters is in {amazon_headquarters}.")
Amazon's headquarters is in Seattle.
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: amazon_headquarters must hold the scalar string 'Seattle'.
tc.assertEqual(amazon_headquarters, 'Seattle')
### BEGIN SOLUTION
# Look up Adobe's row and extract the scalar ticker value.
adobe_rows = df_companies['company_name'] == 'Adobe'
adobe_ticker = df_companies.loc[adobe_rows, 'ticker'].iloc[0]
### END SOLUTION
print(f"Adobe's ticker is {adobe_ticker}.")
Adobe's ticker is ADBE.
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: adobe_ticker must hold the scalar string 'ADBE'.
tc.assertEqual(adobe_ticker, 'ADBE')
Now, you'll work with the Starbucks Customer Rewards Program dataset. ☕ The dataset was originally released as a part of a capstone project of a Udacity course.
For the purpose of this course, the original dataset in JSON format has been transformed to CSV files for convenience. You can download the original JSON files here.
▶️ Run the code cell below to read the transcript data.
# DO NOT CHANGE THE CODE IN THIS CELL
# Load the Starbucks rewards transcript (event log) and keep an untouched
# backup copy for later autograder comparisons.
df_transcript = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/transcript.v2.csv.gz')
df_transcript_backup = df_transcript.copy()
display(df_transcript)
member_id | event | time | offer_id | amount | |
---|---|---|---|---|---|
0 | 78afa995795e4d85b5d9ceeca43f5fef | offer received | 0 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | NaN |
1 | a03223e636434f42ac4c3df47e8bac43 | offer received | 0 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | NaN |
2 | e2127556f4f64592b11af22de27a7932 | offer received | 0 | 2906b810c7d4411798c6938adc9daaa5 | NaN |
3 | 8ec6ce2a7e7949b1bf142def7d0e0586 | offer received | 0 | fafdcd668e3743c1bb461111dcafc2a4 | NaN |
4 | 68617ca6246f4fbc85e91a2a49552598 | offer received | 0 | 4d5c57ea9a6940dd891ad53e9dbe8da0 | NaN |
... | ... | ... | ... | ... | ... |
306529 | b3a1272bc9904337b331bf348c3e8c17 | transaction | 714 | NaN | 1.59 |
306530 | 68213b08d99a4ae1b0dcb72aebd9aa35 | transaction | 714 | NaN | 9.53 |
306531 | a00058cf10334a308c68e7631c529907 | transaction | 714 | NaN | 3.61 |
306532 | 76ddbd6576844afe811f1a3c0fbb5bec | transaction | 714 | NaN | 3.53 |
306533 | c02b10e8752c4d8e9b73f918558531f7 | transaction | 714 | NaN | 4.05 |
306534 rows × 5 columns
▶️ Run the code cell below to read the customer profiles data.
# DO NOT CHANGE THE CODE IN THIS CELL
# Load the customer profiles and keep an untouched backup copy for later
# autograder comparisons.
df_profiles = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/profile.csv')
df_profiles_backup = df_profiles.copy()
display(df_profiles)
member_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|
0 | 68be06ca386d4c31939f3a4f0e3dd783 | NaN | NaN | 2017-02-12 | NaN |
1 | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
2 | 38fe809add3b4fcf9315a9694bb96ff5 | NaN | NaN | 2018-07-12 | NaN |
3 | 78afa995795e4d85b5d9ceeca43f5fef | F | 75.0 | 2017-05-09 | 100000.0 |
4 | a03223e636434f42ac4c3df47e8bac43 | NaN | NaN | 2017-08-04 | NaN |
... | ... | ... | ... | ... | ... |
16995 | 6d5f3a774f3d4714ab0c092238f3a1d7 | F | 45.0 | 2018-06-04 | 54000.0 |
16996 | 2cb4f97358b841b9a9773a7aa05a9d77 | M | 61.0 | 2018-07-13 | 72000.0 |
16997 | 01d26f638c274aa0b965d24cefe3183f | M | 49.0 | 2017-01-26 | 73000.0 |
16998 | 9dc1421481194dcd9400aec7c9ae6366 | F | 83.0 | 2016-03-07 | 50000.0 |
16999 | e4052622e5ba45a8b96b59aba68cf068 | F | 62.0 | 2017-07-22 | 82000.0 |
17000 rows × 5 columns
▶️ Run the code cell below to read the offers data.
# DO NOT CHANGE THE CODE IN THIS CELL
# Load the offers (portfolio) data.
# Fix: the original did `df_offers = df_offers.copy()`, a pointless self-copy;
# every sibling loading cell creates an untouched `*_backup` frame, so do the
# same here for consistency.
df_offers = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/portfolio.csv')
df_offers_backup = df_offers.copy()
display(df_offers)
reward | difficulty | duration | offer_type | offer_id | mobile | social | web | ||
---|---|---|---|---|---|---|---|---|---|
0 | 10 | 10 | 7 | bogo | ae264e3637204a6fb9bb56bc8210ddfd | 1 | 1 | 1 | 0 |
1 | 10 | 10 | 5 | bogo | 4d5c57ea9a6940dd891ad53e9dbe8da0 | 1 | 1 | 1 | 1 |
2 | 0 | 0 | 4 | informational | 3f207df678b143eea3cee63160fa8bed | 1 | 1 | 0 | 1 |
3 | 5 | 5 | 7 | bogo | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | 1 | 1 | 0 | 1 |
4 | 5 | 20 | 10 | discount | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | 1 | 0 | 0 | 1 |
5 | 3 | 7 | 7 | discount | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 1 | 1 | 1 | 1 |
6 | 2 | 10 | 10 | discount | fafdcd668e3743c1bb461111dcafc2a4 | 1 | 1 | 1 | 1 |
7 | 0 | 0 | 3 | informational | 5a8bc65990b245e5a138643cd4eb9837 | 1 | 1 | 1 | 0 |
8 | 5 | 5 | 5 | bogo | f19421c1d4aa40978ebb69ca19b0e20d | 1 | 1 | 1 | 1 |
9 | 2 | 10 | 7 | discount | 2906b810c7d4411798c6938adc9daaa5 | 1 | 1 | 0 | 1 |
### BEGIN SOLUTION
# List the distinct event labels that appear in the transcript.
event_column = df_transcript['event']
event_types = event_column.unique()
print(event_types)
### END SOLUTION
['offer received' 'offer viewed' 'transaction' 'offer completed']
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: event_types must contain exactly the distinct events
# (order does not matter — compared as sets).
tc.assertEqual(set(event_types), set(df_transcript_backup['event'].unique()))
### BEGIN SOLUTION
# Keep only the rows that record a completed offer.
completed_mask = df_transcript['event'] == 'offer completed'
df_completed = df_transcript[completed_mask]
### END SOLUTION
df_completed
member_id | event | time | offer_id | amount | |
---|---|---|---|---|---|
12658 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | offer completed | 0 | 2906b810c7d4411798c6938adc9daaa5 | NaN |
12672 | fe97aa22dd3e48c8b143116a8403dd52 | offer completed | 0 | fafdcd668e3743c1bb461111dcafc2a4 | NaN |
12679 | 629fc02d56414d91bca360decdfa9288 | offer completed | 0 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | NaN |
12692 | 676506bad68e4161b9bbaffeb039626b | offer completed | 0 | ae264e3637204a6fb9bb56bc8210ddfd | NaN |
12697 | 8f7dd3b2afe14c078eb4f6e6fe4ba97d | offer completed | 0 | 4d5c57ea9a6940dd891ad53e9dbe8da0 | NaN |
... | ... | ... | ... | ... | ... |
306475 | 0c027f5f34dd4b9eba0a25785c611273 | offer completed | 714 | 2298d6c36e964ae4a3e7e9706d1fb8c2 | NaN |
306497 | a6f84f4e976f44508c358cc9aba6d2b3 | offer completed | 714 | 2298d6c36e964ae4a3e7e9706d1fb8c2 | NaN |
306506 | b895c57e8cd047a8872ce02aa54759d6 | offer completed | 714 | fafdcd668e3743c1bb461111dcafc2a4 | NaN |
306509 | 8431c16f8e1d440880db371a68f82dd0 | offer completed | 714 | fafdcd668e3743c1bb461111dcafc2a4 | NaN |
306527 | 24f56b5e1849462093931b164eb803b5 | offer completed | 714 | fafdcd668e3743c1bb461111dcafc2a4 | NaN |
33579 rows × 5 columns
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: rebuild the expected filter with query(), confirm the source
# DataFrame was not mutated, then compare rows order-insensitively.
df_check = df_transcript.query("event == 'offer completed'")
pd.testing.assert_frame_equal(df_transcript, df_transcript_backup)
pd.testing.assert_frame_equal(df_completed.sort_values(df_completed.columns.tolist()).reset_index(drop=True),
df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))
# DO NOT CHANGE THE CODE BELOW
# Copy so df_completed owns its data (it was sliced from df_transcript);
# this makes the upcoming in-place column drop safe.
df_completed = df_completed.copy()
Drop the `event`, `time`, and `amount` columns from `df_completed` in-place (i.e., modify `df_completed` using the `inplace=True` option). Use the following code as a reference:

my_dataframe.drop(columns=["my_column1", "my_column2"], inplace=True)
member_id | offer_id | |
---|---|---|
12658 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 |
12672 | fe97aa22dd3e48c8b143116a8403dd52 | fafdcd668e3743c1bb461111dcafc2a4 |
12679 | 629fc02d56414d91bca360decdfa9288 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 |
### BEGIN SOLUTION
# Remove the columns we no longer need, modifying df_completed in-place.
columns_to_drop = ["event", "time", "amount"]
df_completed.drop(columns=columns_to_drop, inplace=True)
### END SOLUTION
df_completed.head(3)
member_id | offer_id | |
---|---|---|
12658 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 |
12672 | fe97aa22dd3e48c8b143116a8403dd52 | fafdcd668e3743c1bb461111dcafc2a4 |
12679 | 629fc02d56414d91bca360decdfa9288 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: rebuild the expected two-column result and compare
# order-insensitively.
df_check = df_transcript.query("event == 'offer completed'")[['member_id', 'offer_id']]
pd.testing.assert_frame_equal(df_completed.sort_values(df_completed.columns.tolist()).reset_index(drop=True),
df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))
Filter `df_profiles` so that it only contains rows where `gender` is NOT missing. Use the following code as a reference:

df = df[df['some_column'].notna()]
member_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|
1 | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
3 | 78afa995795e4d85b5d9ceeca43f5fef | F | 75.0 | 2017-05-09 | 100000.0 |
5 | e2127556f4f64592b11af22de27a7932 | M | 68.0 | 2018-04-26 | 70000.0 |
8 | 389bc3fa690240e798340f5a15918d5c | M | 65.0 | 2018-02-09 | 53000.0 |
12 | 2eeac8d8feae4a8cad5a6af0499a211d | M | 58.0 | 2017-11-11 | 51000.0 |
### BEGIN SOLUTION
# Discard rows whose gender value is missing (NaN).
has_gender = ~df_profiles['gender'].isna()
df_profiles = df_profiles[has_gender]
### END SOLUTION
df_profiles.head(5)
member_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|
1 | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
3 | 78afa995795e4d85b5d9ceeca43f5fef | F | 75.0 | 2017-05-09 | 100000.0 |
5 | e2127556f4f64592b11af22de27a7932 | M | 68.0 | 2018-04-26 | 70000.0 |
8 | 389bc3fa690240e798340f5a15918d5c | M | 65.0 | 2018-02-09 | 53000.0 |
12 | 2eeac8d8feae4a8cad5a6af0499a211d | M | 58.0 | 2017-11-11 | 51000.0 |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: 'gender == gender' is False for NaN, so this query keeps only
# rows with a non-missing gender; compare order-insensitively.
df_check = df_profiles_backup.query('gender == gender')
pd.testing.assert_frame_equal(df_profiles.sort_values(df_profiles.columns.to_list()).reset_index(drop=True),
df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))
▶️ Run the code below to:

- Convert the `age` and `income` columns in `df_profiles` to integer types (from float types).
- Display previews of `df_completed` and `df_profiles`.

# Convert age and income to int64 types
# Work on a copy (the filtered frame was a slice), then cast the two
# float columns to 64-bit integers.
df_profiles = df_profiles.copy()
for numeric_column in ['age', 'income']:
    df_profiles[numeric_column] = df_profiles[numeric_column].astype(np.int64)
display(df_completed.head(3))
display(df_profiles.head(3))
member_id | offer_id | |
---|---|---|
12658 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 |
12672 | fe97aa22dd3e48c8b143116a8403dd52 | fafdcd668e3743c1bb461111dcafc2a4 |
12679 | 629fc02d56414d91bca360decdfa9288 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 |
member_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|
1 | 0610b486422d4921ae7d2bf64640c50b | F | 55 | 2017-07-15 | 112000 |
3 | 78afa995795e4d85b5d9ceeca43f5fef | F | 75 | 2017-05-09 | 100000 |
5 | e2127556f4f64592b11af22de27a7932 | M | 68 | 2018-04-26 | 70000 |
Merge `df_completed` and `df_profiles` using an inner join type, with `df_completed` on the "left" side. Store the merged result to a new variable named `df_merged`. Use the following code as a reference:

my_merged_dataframe = pd.merge(
left=left_dataframe,
right=right_dataframe,
on='shared_key_column',
how='inner'
)
member_id | offer_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|---|
0 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 | M | 42 | 2016-01-17 | 96000 |
1 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2298d6c36e964ae4a3e7e9706d1fb8c2 | M | 42 | 2016-01-17 | 96000 |
2 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | M | 42 | 2016-01-17 | 96000 |
### BEGIN SOLUTION
# Inner-join completions with member profiles on the shared member_id key;
# df_completed is the left table.
df_merged = df_completed.merge(
    df_profiles,
    on='member_id',
    how='inner',
)
### END SOLUTION
df_merged.head(3)
member_id | offer_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|---|
0 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 | M | 42 | 2016-01-17 | 96000 |
1 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2298d6c36e964ae4a3e7e9706d1fb8c2 | M | 42 | 2016-01-17 | 96000 |
2 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | M | 42 | 2016-01-17 | 96000 |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: reconstruct the expected merge from the untouched backups
# ('gender == gender' drops NaN genders), cast age/income to int64 to match,
# keep a backup of the expected frame, then compare order-insensitively.
df_check = df_transcript_backup.query('event == "offer completed"').merge(
df_profiles_backup.query('gender == gender'),
on='member_id',
how='inner'
).drop(columns=['event', 'time', 'amount'])
df_check['age'] = df_check['age'].astype(np.int64)
df_check['income'] = df_check['income'].astype(np.int64)
df_merged_backup1 = df_check.copy()
pd.testing.assert_frame_equal(df_merged.sort_values(df_merged.columns.to_list()).reset_index(drop=True),
df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))
Merge `df_merged` and `df_offers` using an inner join type, with `df_merged` on the "left" side. Store the merged result to a new variable named `df_completed_offers`. Use the following code as a reference:

my_merged_dataframe = pd.merge(
left=left_dataframe,
right=right_dataframe,
on='shared_key_column',
how='inner'
)
member_id | offer_id | gender | age | became_member_on | income | reward | difficulty | duration | offer_type | mobile | social | web | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 | M | 42 | 2016-01-17 | 96000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 |
1 | 629fc02d56414d91bca360decdfa9288 | 2906b810c7d4411798c6938adc9daaa5 | M | 52 | 2018-06-05 | 72000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 |
2 | 73ffefd41e9a4ca3ab26b2b3697c6eb7 | 2906b810c7d4411798c6938adc9daaa5 | F | 67 | 2015-09-29 | 67000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 |
### BEGIN SOLUTION
# Attach the offer details to each completed offer via the offer_id key;
# df_merged stays on the left.
df_completed_offers = df_merged.merge(
    df_offers,
    on='offer_id',
    how='inner',
)
### END SOLUTION
df_completed_offers.head(3)
member_id | offer_id | gender | age | became_member_on | income | reward | difficulty | duration | offer_type | mobile | social | web | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f | 2906b810c7d4411798c6938adc9daaa5 | M | 42 | 2016-01-17 | 96000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 |
1 | 629fc02d56414d91bca360decdfa9288 | 2906b810c7d4411798c6938adc9daaa5 | M | 52 | 2018-06-05 | 72000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 |
2 | 73ffefd41e9a4ca3ab26b2b3697c6eb7 | 2906b810c7d4411798c6938adc9daaa5 | F | 67 | 2015-09-29 | 67000 | 2 | 10 | 7 | discount | 1 | 1 | 0 | 1 |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: extend the expected frame with offer details, keep a backup,
# and compare order-insensitively.
df_check = df_merged_backup1.merge(
df_offers,
on='offer_id',
how='inner'
)
df_merged_backup2 = df_check.copy()
pd.testing.assert_frame_equal(df_completed_offers.sort_values(df_completed_offers.columns.to_list()).reset_index(drop=True),
df_check.sort_values(df_check.columns.tolist()).reset_index(drop=True))
Using `df_completed_offers`, find the number of times each offer has been redeemed.

- Store the result to a new variable named `df_count_by_offer_id`.
- `df_count_by_offer_id` should have two non-index columns — `df_count_by_offer_id.columns.to_list()` should print out `['offer_id', 'count']`.
- Sort `df_count_by_offer_id` by `count` in descending order.
- `df_completed_offers` should remain unaltered.

Your index column may contain different values.
offer_id | count | |
---|---|---|
7 | fafdcd668e3743c1bb461111dcafc2a4 | 5003 |
1 | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 4886 |
4 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | 4188 |
6 | f19421c1d4aa40978ebb69ca19b0e20d | 4103 |
2 | 2906b810c7d4411798c6938adc9daaa5 | 3911 |
5 | ae264e3637204a6fb9bb56bc8210ddfd | 3657 |
0 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | 3386 |
3 | 4d5c57ea9a6940dd891ad53e9dbe8da0 | 3310 |
### BEGIN SOLUTION
# Count redemptions per offer: group by offer_id, count rows via the
# always-present member_id column, then sort most-redeemed first.
df_count_by_offer_id = (
    df_completed_offers
    .groupby('offer_id', as_index=False)['member_id']
    .count()
    .rename(columns={'member_id': 'count'})
    .sort_values('count', ascending=False)
)
### END SOLUTION
df_count_by_offer_id
offer_id | count | |
---|---|---|
7 | fafdcd668e3743c1bb461111dcafc2a4 | 5003 |
1 | 2298d6c36e964ae4a3e7e9706d1fb8c2 | 4886 |
4 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | 4188 |
6 | f19421c1d4aa40978ebb69ca19b0e20d | 4103 |
2 | 2906b810c7d4411798c6938adc9daaa5 | 3911 |
5 | ae264e3637204a6fb9bb56bc8210ddfd | 3657 |
0 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | 3386 |
3 | 4d5c57ea9a6940dd891ad53e9dbe8da0 | 3310 |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: compare against the hard-coded expected per-offer counts
# (index values are ignored via reset_index).
df_check = pd.DataFrame({'offer_id': {7: 'fafdcd668e3743c1bb461111dcafc2a4', 1: '2298d6c36e964ae4a3e7e9706d1fb8c2',
4: '9b98b8c7a33c4b65b9aebfe6a799e6d9', 6: 'f19421c1d4aa40978ebb69ca19b0e20d', 2: '2906b810c7d4411798c6938adc9daaa5',
5: 'ae264e3637204a6fb9bb56bc8210ddfd', 0: '0b1e1539f2cc45b7b9fa7c272da2e1d7', 3: '4d5c57ea9a6940dd891ad53e9dbe8da0'},
'count': {7: 5003, 1: 4886, 4: 4188, 6: 4103, 2: 3911, 5: 3657, 0: 3386, 3: 3310}})
pd.testing.assert_frame_equal(df_count_by_offer_id.reset_index(drop=True),
df_check.reset_index(drop=True))
Using `df_completed_offers`, find the number of times each offer type has been redeemed.

- Store the result to a new variable named `df_count_by_offer_type`.
- `df_count_by_offer_type` should have two non-index columns — `df_count_by_offer_type.columns.to_list()` should print out `['offer_type', 'count']`.
- Sort `df_count_by_offer_type` by `count` in descending order.
- `df_completed_offers` should remain unaltered.

Your index column may contain different values.
offer_type | count | |
---|---|---|
1 | discount | 17186 |
0 | bogo | 15258 |
### BEGIN SOLUTION
# Count redemptions per offer type, most-redeemed first.
df_count_by_offer_type = (
    df_completed_offers
    .groupby('offer_type', as_index=False)['member_id']
    .count()
    .rename(columns={'member_id': 'count'})
    .sort_values('count', ascending=False)
)
### END SOLUTION
df_count_by_offer_type
offer_type | count | |
---|---|---|
1 | discount | 17186 |
0 | bogo | 15258 |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: compare against the hard-coded expected per-type counts.
df_check = pd.DataFrame({'offer_type': {1: 'discount', 0: 'bogo'}, 'count': {1: 17186, 0: 15258}})
pd.testing.assert_frame_equal(df_count_by_offer_type.reset_index(drop=True),
df_check.reset_index(drop=True))
Using `df_completed_offers`, find the number of times each gender has redeemed an offer.

- Store the result to a new variable named `df_count_by_gender`.
- `df_count_by_gender` should have two non-index columns — `df_count_by_gender.columns.to_list()` should print out `['gender', 'count']`.
- Sort `df_count_by_gender` by `count` in descending order.
- `df_completed_offers` should remain unaltered.

Your index column may contain different values.
gender | count | |
---|---|---|
1 | M | 16466 |
0 | F | 15477 |
2 | O | 501 |
### BEGIN SOLUTION
# Count redemptions per gender, most-frequent first.
df_count_by_gender = (
    df_completed_offers
    .groupby('gender', as_index=False)['member_id']
    .count()
    .rename(columns={'member_id': 'count'})
    .sort_values('count', ascending=False)
)
### END SOLUTION
df_count_by_gender
gender | count | |
---|---|---|
1 | M | 16466 |
0 | F | 15477 |
2 | O | 501 |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: compare against the hard-coded expected per-gender counts.
df_check = pd.DataFrame({'gender': {1: 'M', 0: 'F', 2: 'O'}, 'count': {1: 16466, 0: 15477, 2: 501}})
pd.testing.assert_frame_equal(df_count_by_gender.reset_index(drop=True),
df_check.reset_index(drop=True))
Using `df_completed_offers`, first find each member's sign-up year using the `became_member_on` column.

- Store the result to a new variable named `df_count_by_member_signup_year`.
- `df_count_by_member_signup_year` should have two non-index columns — `df_count_by_member_signup_year.columns.to_list()` should print out `['member_since', 'count']`.
- Sort `df_count_by_member_signup_year` by year in ascending order.
- Feel free to modify `df_completed_offers` if you need to (e.g., convert a column to datetime format, extract year, etc).

Your index column may contain different values.
member_since | count | |
---|---|---|
0 | 2013 | 561 |
1 | 2014 | 1321 |
2 | 2015 | 4319 |
3 | 2016 | 8928 |
4 | 2017 | 12119 |
5 | 2018 | 5196 |
### BEGIN SOLUTION
# Parse the sign-up date, extract the year into a new member_since column,
# then count redemptions per sign-up year (groupby sorts the years ascending
# by default, which satisfies the ascending-order requirement).
df_completed_offers['became_member_on'] = pd.to_datetime(df_completed_offers['became_member_on'])
df_completed_offers['member_since'] = df_completed_offers['became_member_on'].dt.year
df_count_by_member_signup_year = (
    df_completed_offers
    .groupby('member_since', as_index=False)['member_id']
    .count()
    .rename(columns={'member_id': 'count'})
)
### END SOLUTION
df_count_by_member_signup_year
member_since | count | |
---|---|---|
0 | 2013 | 561 |
1 | 2014 | 1321 |
2 | 2015 | 4319 |
3 | 2016 | 8928 |
4 | 2017 | 12119 |
5 | 2018 | 5196 |
# DO NOT CHANGE THE CODE IN THIS CELL
# Autograder: compare against the hard-coded expected per-year counts.
df_check = pd.DataFrame({'member_since': {0: 2013, 1: 2014, 2: 2015, 3: 2016, 4: 2017, 5: 2018},
'count': {0: 561, 1: 1321, 2: 4319, 3: 8928, 4: 12119, 5: 5196}})
pd.testing.assert_frame_equal(df_count_by_member_signup_year.reset_index(drop=True),
df_check.reset_index(drop=True))