Today, we will review basic SQL joins.
▶️ First, run the code cell below to import modules used for 🧭 Check Your Work sections and the autograder.
import unittest
import base64
tc = unittest.TestCase()
pandas
: Use alias pd
.numpy
: Use alias np
.sqlite3
: No alias# YOUR CODE BEGINS
import pandas as pd
import numpy as np
import sqlite3
# YOUR CODE ENDS
import sys
tc.assertTrue('pd' in globals(), 'Check whether you have correctly imported Pandas with an alias.')
tc.assertTrue('np' in globals(), 'Check whether you have correctly imported NumPy with an alias.')
tc.assertTrue('sqlite3' in globals(), 'Check whether you have correctly imported the sqlite3 package.')
Throughout this exercise, you'll work with the Starbucks Customer Rewards Program dataset. ☕ The dataset was originally released as a part of a capstone project of a Udacity course.
For the purpose of this course, the original dataset in JSON format has been transformed to CSV files for convenience. You can download the original JSON files here.
The table below describes the columns in the transactions
table.
Field | Description |
---|---|
member_id | Member ID |
event | Type of event ('offer received', 'offer completed', 'transaction') |
time | Time as continuous value |
offer_id | Offer ID |
amount | Amount of purchase or NaN if non-purchase |
We want to go further with the purchase data analysis by looking at the customers. Unfortunately, the information we currently have about the customers in each purchase is limited. We only have their unique IDs (e.g., 02c083884c7d45b39cc68e1314fec56c
, 9fa9ae8f57894cc9a3b8a9bbe0fc1b2f
).
In the next few steps, you'll read the customer profiles data and link them to df_transactions
using member_id
.
The table below describes the columns in the profiles
table.
Field | Description |
---|---|
member_id | Unique identifier for each Starbucks app member |
gender | Gender (NaN if unknown) |
age | Age (NaN if unknown) |
became_member_on | Sign-up date |
income | Annual income (NaN if unknown) |
▶️ Run the code below to populate the profiles
and transcripts
tables.
df_transactions = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/transcript.v2.csv.gz')
df_profiles = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/profile.csv')
conn = sqlite3.connect('starbucks.db')
c = conn.cursor()
tables = list(pd.read_sql_query('SELECT * FROM sqlite_master WHERE type="table";', con=conn)['tbl_name'])
tables_to_populate = ['transactions', 'profiles']
for t in tables_to_populate:
if t in tables:
c.execute(f'DELETE FROM {t}')
conn.commit()
df_transactions.to_sql(name='transactions', index=False, con=conn, if_exists='append')
df_profiles.to_sql(name='profiles', index=False, con=conn, if_exists='append')
conn.close()
# DO NOT CHANGE THE CODE IN THIS CELL
conn_checker = sqlite3.connect('starbucks.db')
tables_to_check = ['transactions', 'profiles']
# Check if table exists
for t in tables_to_check:
user_tables = list(pd.read_sql_query('SELECT * FROM sqlite_master WHERE type="table";', con=conn_checker)['tbl_name'])
tc.assertTrue(t in user_tables, f'{t} does not exist in your starbucks.db file!')
conn_checker.close()
▶️ Run the code below to select the first 5 rows from the transactions
table
# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
display(pd.read_sql_query('SELECT * FROM transactions LIMIT 5;', con=conn))
conn.close()
member_id | event | time | offer_id | amount | |
---|---|---|---|---|---|
0 | 78afa995795e4d85b5d9ceeca43f5fef | offer received | 0 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | None |
1 | a03223e636434f42ac4c3df47e8bac43 | offer received | 0 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | None |
2 | e2127556f4f64592b11af22de27a7932 | offer received | 0 | 2906b810c7d4411798c6938adc9daaa5 | None |
3 | 8ec6ce2a7e7949b1bf142def7d0e0586 | offer received | 0 | fafdcd668e3743c1bb461111dcafc2a4 | None |
4 | 68617ca6246f4fbc85e91a2a49552598 | offer received | 0 | 4d5c57ea9a6940dd891ad53e9dbe8da0 | None |
▶️ Run the code below to select the first 5 rows from the profiles
table
# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
display(pd.read_sql_query('SELECT * FROM profiles LIMIT 5;', con=conn))
conn.close()
member_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|
0 | 68be06ca386d4c31939f3a4f0e3dd783 | None | NaN | 2017-02-12 | NaN |
1 | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
2 | 38fe809add3b4fcf9315a9694bb96ff5 | None | NaN | 2018-07-12 | NaN |
3 | 78afa995795e4d85b5d9ceeca43f5fef | F | 75.0 | 2017-05-09 | 100000.0 |
4 | a03223e636434f42ac4c3df47e8bac43 | None | NaN | 2017-08-04 | NaN |
# DO NOT CHANGE THE CODE IN THIS CELL
conn_checker = sqlite3.connect('starbucks.db')
tables_to_check = ['transactions', 'profiles']
# Check if table exists
user_tables = list(pd.read_sql_query('SELECT * FROM sqlite_master WHERE type="table";', con=conn_checker)['tbl_name'])
for table_to_check in tables_to_check:
tc.assertTrue(table_to_check in user_tables, f'{table_to_check} does not exist in your NWT.db file!')
conn_checker.close()
# YOUR CODE BEGINS
query_joined1 = '''
SELECT *
FROM transactions
INNER JOIN profiles
ON transactions.member_id == profiles.member_id;
'''
# YOUR CODE ENDS
conn = sqlite3.connect('starbucks.db')
df_result = pd.read_sql_query(query_joined1, con=conn)
display(df_result)
conn.close()
member_id | event | time | offer_id | amount | member_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 78afa995795e4d85b5d9ceeca43f5fef | offer received | 0 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | NaN | 78afa995795e4d85b5d9ceeca43f5fef | F | 75.0 | 2017-05-09 | 100000.0 |
1 | a03223e636434f42ac4c3df47e8bac43 | offer received | 0 | 0b1e1539f2cc45b7b9fa7c272da2e1d7 | NaN | a03223e636434f42ac4c3df47e8bac43 | None | NaN | 2017-08-04 | NaN |
2 | e2127556f4f64592b11af22de27a7932 | offer received | 0 | 2906b810c7d4411798c6938adc9daaa5 | NaN | e2127556f4f64592b11af22de27a7932 | M | 68.0 | 2018-04-26 | 70000.0 |
3 | 8ec6ce2a7e7949b1bf142def7d0e0586 | offer received | 0 | fafdcd668e3743c1bb461111dcafc2a4 | NaN | 8ec6ce2a7e7949b1bf142def7d0e0586 | None | NaN | 2017-09-25 | NaN |
4 | 68617ca6246f4fbc85e91a2a49552598 | offer received | 0 | 4d5c57ea9a6940dd891ad53e9dbe8da0 | NaN | 68617ca6246f4fbc85e91a2a49552598 | None | NaN | 2017-10-02 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
306529 | b3a1272bc9904337b331bf348c3e8c17 | transaction | 714 | None | 1.59 | b3a1272bc9904337b331bf348c3e8c17 | M | 66.0 | 2018-01-01 | 47000.0 |
306530 | 68213b08d99a4ae1b0dcb72aebd9aa35 | transaction | 714 | None | 9.53 | 68213b08d99a4ae1b0dcb72aebd9aa35 | M | 52.0 | 2018-04-08 | 62000.0 |
306531 | a00058cf10334a308c68e7631c529907 | transaction | 714 | None | 3.61 | a00058cf10334a308c68e7631c529907 | F | 63.0 | 2013-09-22 | 52000.0 |
306532 | 76ddbd6576844afe811f1a3c0fbb5bec | transaction | 714 | None | 3.53 | 76ddbd6576844afe811f1a3c0fbb5bec | M | 57.0 | 2016-07-09 | 40000.0 |
306533 | c02b10e8752c4d8e9b73f918558531f7 | transaction | 714 | None | 4.05 | c02b10e8752c4d8e9b73f918558531f7 | None | NaN | 2015-12-11 | NaN |
306534 rows × 10 columns
# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
df_check = pd.read_sql_query(query_joined1, con=conn)
tc.assertEqual(df_result.shape, (306534, 10), 'Incorrect number of rows and/or columns')
conn.close()
profiles
into transactions
table where gender is not missing¶profiles
table into transactions
.gender
column's value in the profiles
table is NOT NULL.query_joined2
.# YOUR CODE BEGINS
query_joined2 = '''
SELECT *
FROM transactions
INNER JOIN profiles
ON transactions.member_id == profiles.member_id
WHERE profiles.gender IS NOT NULL;
'''
# YOUR CODE ENDS
conn = sqlite3.connect('starbucks.db')
df_result = pd.read_sql_query(query_joined2, con=conn)
display(df_result)
conn.close()
member_id | event | time | offer_id | amount | member_id | gender | age | became_member_on | income | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 0610b486422d4921ae7d2bf64640c50b | offer completed | 528 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | NaN | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
1 | 0610b486422d4921ae7d2bf64640c50b | offer received | 408 | 9b98b8c7a33c4b65b9aebfe6a799e6d9 | NaN | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
2 | 0610b486422d4921ae7d2bf64640c50b | offer received | 504 | 3f207df678b143eea3cee63160fa8bed | NaN | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
3 | 0610b486422d4921ae7d2bf64640c50b | transaction | 18 | None | 21.51 | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
4 | 0610b486422d4921ae7d2bf64640c50b | transaction | 144 | None | 32.28 | 0610b486422d4921ae7d2bf64640c50b | F | 55.0 | 2017-07-15 | 112000.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
272757 | e4052622e5ba45a8b96b59aba68cf068 | transaction | 84 | None | 25.19 | e4052622e5ba45a8b96b59aba68cf068 | F | 62.0 | 2017-07-22 | 82000.0 |
272758 | e4052622e5ba45a8b96b59aba68cf068 | transaction | 96 | None | 21.53 | e4052622e5ba45a8b96b59aba68cf068 | F | 62.0 | 2017-07-22 | 82000.0 |
272759 | e4052622e5ba45a8b96b59aba68cf068 | transaction | 480 | None | 30.57 | e4052622e5ba45a8b96b59aba68cf068 | F | 62.0 | 2017-07-22 | 82000.0 |
272760 | e4052622e5ba45a8b96b59aba68cf068 | transaction | 486 | None | 19.47 | e4052622e5ba45a8b96b59aba68cf068 | F | 62.0 | 2017-07-22 | 82000.0 |
272761 | e4052622e5ba45a8b96b59aba68cf068 | transaction | 690 | None | 24.71 | e4052622e5ba45a8b96b59aba68cf068 | F | 62.0 | 2017-07-22 | 82000.0 |
272762 rows × 10 columns
# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
df_check = pd.read_sql_query(query_joined2, con=conn)
tc.assertEqual(df_result.shape[0], 272762, 'Incorrect number of rows and/or columns')
conn.close()
gender
value.gender
average_amount
query_joined3
.average_amount
in descending order.gender | average_amount | |
---|---|---|
0 | F | 17.4901 |
1 | O | 14.7987 |
2 | M | 11.6066 |
# YOUR CODE BEGINS
query_joined3 = '''
SELECT gender, AVG(amount) AS average_amount
FROM transactions
INNER JOIN profiles
ON transactions.member_id == profiles.member_id
GROUP BY profiles.gender
HAVING gender IS NOT NULL
ORDER BY average_amount DESC;
'''
# YOUR CODE ENDS
conn = sqlite3.connect('starbucks.db')
df_result = pd.read_sql_query(query_joined3, con=conn)
display(df_result)
conn.close()
gender | average_amount | |
---|---|---|
0 | F | 17.490077 |
1 | O | 14.798731 |
2 | M | 11.606600 |
# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
decoded_query = base64.b64decode(b'ClNFTEVDVCBnZW5kZXIsIEFWRyhhbW91\
bnQpIEFTIGF2ZXJhZ2VfYW1vdW50CkZST00gdHJhbnNhY3Rpb25zCklOTkVSIEpPSU4\
gcHJvZmlsZXMKT04gdHJhbnNhY3Rpb25zLm1lbWJlcl9pZCA9PSBwcm9maWxlcy5tZW\
1iZXJfaWQKR1JPVVAgQlkgcHJvZmlsZXMuZ2VuZGVyCkhBVklORyBnZW5kZXIgSVMgT\
k9UIE5VTEwKT1JERVIgQlkgYXZlcmFnZV9hbW91bnQgREVTQzsK').decode()
df_check = pd.read_sql_query(decoded_query, con=conn)
pd.testing.assert_frame_equal(df_result.reset_index(drop=True),
df_check.reset_index(drop=True))