#!/usr/bin/env python
# coding: utf-8

# # SQL Joins
#
# ---
#
# ## ✨ Joining Tables
#
# Today, we will review basic SQL joins.
#
# ▶️ First, run the code cell below to import the modules used for the **🧭 Check Your Work** sections and the autograder.

# In[1]:


import unittest
import base64

tc = unittest.TestCase()


# ---
#
# ### 🎯 Pre-exercise: Import Packages
#
# #### 👇 Tasks
#
# - ✔️ Import the following Python packages.
#   1. `pandas`: Use alias `pd`.
#   2. `numpy`: Use alias `np`.
#   3. `sqlite3`: No alias.

# In[2]:


# YOUR CODE BEGINS
import pandas as pd
import numpy as np
import sqlite3
# YOUR CODE ENDS


# #### 🧭 Check your work

# In[3]:


tc.assertTrue('pd' in globals(), 'Check whether you have correctly imported Pandas with an alias.')
tc.assertTrue('np' in globals(), 'Check whether you have correctly imported NumPy with an alias.')
tc.assertTrue('sqlite3' in globals(), 'Check whether you have correctly imported the sqlite3 package.')


# ---
#
# ### 📌 Transactions
#
# Throughout this exercise, you'll work with the Starbucks Customer Rewards Program dataset. ☕ The dataset was originally released as part of a capstone project in a Udacity course.
#
# For this course, the original JSON dataset has been converted to CSV files for convenience. You can download the original JSON files [here](https://www.kaggle.com/blacktile/starbucks-app-customer-reward-program-data?select=portfolio.json).
#
# The table below describes the columns in the `transactions` table.
#
# | Field | Description |
# |---|---|
# | member_id | Member ID |
# | event | Type of event ('offer received', 'offer completed', 'transaction') |
# | time | Time as a continuous value |
# | amount | Purchase amount, or NaN for non-purchase events |
# | offer_id | Offer ID |

# ---
#
# ### 📌 Customer profiles
#
# We want to take the purchase data analysis further by looking at the customers. Unfortunately, the information we currently have about the customers in each purchase is limited. We only have their unique IDs (e.g., `02c083884c7d45b39cc68e1314fec56c`, `9fa9ae8f57894cc9a3b8a9bbe0fc1b2f`).
#
# In the next few steps, you'll read the customer profiles data and link it to `df_transactions` using `member_id`.
#
# The table below describes the columns in the `profiles` table.
#
# | Field | Description |
# |---|---|
# | member_id | Unique identifier for each Starbucks app member |
# | gender | Gender (NaN if unknown) |
# | age | Age (NaN if unknown) |
# | became_member_on | Sign-up date |
# | income | Annual income (NaN if unknown) |
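# Before building the database, here is a minimal, self-contained sketch of what joining on `member_id` will do. It uses tiny made-up frames (not the Starbucks tables) purely for illustration; the graded exercises below do not depend on this cell.

# In[ ]:


# Toy example: an inner merge in pandas is the DataFrame analogue of an SQL
# INNER JOIN -- only keys present in BOTH tables survive.
left = pd.DataFrame({'member_id': ['a1', 'b2', 'c3'], 'amount': [9.5, 3.2, 12.0]})
right = pd.DataFrame({'member_id': ['a1', 'c3', 'd4'], 'gender': ['F', 'M', 'O']})

# Only 'a1' and 'c3' appear in both frames, so the result has two rows.
display(left.merge(right, on='member_id', how='inner'))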
# ---
#
# ### 📌 Populate database tables from a CSV file
#
# ▶️ Run the code below to populate the `transactions` and `profiles` tables.

# In[4]:


df_transactions = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/transcript.v2.csv.gz')
df_profiles = pd.read_csv('https://github.com/bdi475/datasets/raw/main/starbucks-rewards/profile.csv')

conn = sqlite3.connect('starbucks.db')
c = conn.cursor()

tables = list(pd.read_sql_query('SELECT * FROM sqlite_master WHERE type="table";', con=conn)['tbl_name'])
tables_to_populate = ['transactions', 'profiles']

# Clear out any existing rows so the tables can be repopulated from scratch
for t in tables_to_populate:
    if t in tables:
        c.execute(f'DELETE FROM {t}')
        conn.commit()

df_transactions.to_sql(name='transactions', index=False, con=conn, if_exists='append')
df_profiles.to_sql(name='profiles', index=False, con=conn, if_exists='append')
conn.close()


# #### 🧭 Check your work

# In[5]:


# DO NOT CHANGE THE CODE IN THIS CELL
conn_checker = sqlite3.connect('starbucks.db')
tables_to_check = ['transactions', 'profiles']

# Check whether each table exists
for t in tables_to_check:
    user_tables = list(pd.read_sql_query('SELECT * FROM sqlite_master WHERE type="table";', con=conn_checker)['tbl_name'])
    tc.assertTrue(t in user_tables, f'{t} does not exist in your starbucks.db file!')

conn_checker.close()


# ---
#
# ### 📌 Read an SQLite Database File
#
# ▶️ Run the code below to select the first 5 rows from the `transactions` table.

# In[6]:


# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
display(pd.read_sql_query('SELECT * FROM transactions LIMIT 5;', con=conn))
conn.close()


# ▶️ Run the code below to select the first 5 rows from the `profiles` table.

# In[7]:


# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
display(pd.read_sql_query('SELECT * FROM profiles LIMIT 5;', con=conn))
conn.close()


# #### 🧭 Check your work

# In[8]:


# DO NOT CHANGE THE CODE IN THIS CELL
conn_checker = sqlite3.connect('starbucks.db')
tables_to_check = ['transactions', 'profiles']

# Check whether each table exists
user_tables = list(pd.read_sql_query('SELECT * FROM sqlite_master WHERE type="table";', con=conn_checker)['tbl_name'])

for table_to_check in tables_to_check:
    tc.assertTrue(table_to_check in user_tables, f'{table_to_check} does not exist in your starbucks.db file!')

conn_checker.close()


# ---
#
# ### 🎯 Exercise 1: Join `profiles` into the `transactions` table
#
# #### 👇 Tasks
#
# - ✔️ Write a query that joins the `profiles` table into `transactions`.
# - ✔️ Use an inner join.
# - ✔️ Select all columns.
# - ✔️ Store your query in a new variable named `query_joined1`.

# In[9]:


# YOUR CODE BEGINS
query_joined1 = '''
SELECT *
FROM transactions
INNER JOIN profiles
    ON transactions.member_id = profiles.member_id;
'''
# YOUR CODE ENDS

conn = sqlite3.connect('starbucks.db')
df_result = pd.read_sql_query(query_joined1, con=conn)
display(df_result)
conn.close()


# #### 🧭 Check your work

# In[10]:


# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
df_check = pd.read_sql_query(query_joined1, con=conn)
tc.assertEqual(df_check.shape, (306534, 10), 'Incorrect number of rows and/or columns')
conn.close()
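# Note that the joined result above has 10 columns, not 9: `SELECT *` returns the join key `member_id` from *both* tables. Below is an optional sketch (assuming the `starbucks.db` file built earlier in this notebook) of how one might qualify the columns explicitly to avoid the duplicate; it is not part of the graded exercises.

# In[ ]:


# Optional sketch: list transactions.* once, then pick only the non-key
# profile columns, so member_id appears a single time in the output.
conn = sqlite3.connect('starbucks.db')
display(pd.read_sql_query('''
SELECT transactions.*,
       profiles.gender,
       profiles.age,
       profiles.became_member_on,
       profiles.income
FROM transactions
INNER JOIN profiles
    ON transactions.member_id = profiles.member_id
LIMIT 5;
''', con=conn))
conn.close()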
# ---
#
# ### 🎯 Exercise 2: Join `profiles` into the `transactions` table where gender is not missing
#
# #### 👇 Tasks
#
# - ✔️ Write a query that joins the `profiles` table into `transactions`.
# - ✔️ Only keep rows where the `gender` column's value in the `profiles` table is NOT NULL.
# - ✔️ Use an inner join.
# - ✔️ Select all columns.
# - ✔️ Store your query in a new variable named `query_joined2`.

# In[11]:


# YOUR CODE BEGINS
query_joined2 = '''
SELECT *
FROM transactions
INNER JOIN profiles
    ON transactions.member_id = profiles.member_id
WHERE profiles.gender IS NOT NULL;
'''
# YOUR CODE ENDS

conn = sqlite3.connect('starbucks.db')
df_result = pd.read_sql_query(query_joined2, con=conn)
display(df_result)
conn.close()


# #### 🧭 Check your work

# In[12]:


# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
df_check = pd.read_sql_query(query_joined2, con=conn)
tc.assertEqual(df_check.shape[0], 272762, 'Incorrect number of rows')
conn.close()


# ---
#
# ### 🎯 Exercise 3: Average transaction amount by gender
#
# #### 👇 Tasks
#
# - ✔️ Write a query that finds the average transaction amount by gender.
# - ✔️ Only display rows with a non-missing `gender` value.
# - ✔️ Only display the following two columns:
#     - `gender`
#     - `average_amount`
# - ✔️ Store your query in a new variable named `query_joined3`.
# - ✔️ Sort the result by `average_amount` in descending order.
#
# #### 🧭 Expected Output
#
# |    | gender   | average_amount |
# |---:|:---------|---------------:|
# |  0 | F        |        17.4901 |
# |  1 | O        |        14.7987 |
# |  2 | M        |        11.6066 |

# In[13]:


# YOUR CODE BEGINS
query_joined3 = '''
SELECT gender, AVG(amount) AS average_amount
FROM transactions
INNER JOIN profiles
    ON transactions.member_id = profiles.member_id
GROUP BY profiles.gender
HAVING gender IS NOT NULL
ORDER BY average_amount DESC;
'''
# YOUR CODE ENDS

conn = sqlite3.connect('starbucks.db')
df_result = pd.read_sql_query(query_joined3, con=conn)
display(df_result)
conn.close()


# #### 🧭 Check your work

# In[14]:


# DO NOT CHANGE THE CODE IN THIS CELL
conn = sqlite3.connect('starbucks.db')
decoded_query = base64.b64decode(b'ClNFTEVDVCBnZW5kZXIsIEFWRyhhbW91\
bnQpIEFTIGF2ZXJhZ2VfYW1vdW50CkZST00gdHJhbnNhY3Rpb25zCklOTkVSIEpPSU4\
gcHJvZmlsZXMKT04gdHJhbnNhY3Rpb25zLm1lbWJlcl9pZCA9PSBwcm9maWxlcy5tZW\
1iZXJfaWQKR1JPVVAgQlkgcHJvZmlsZXMuZ2VuZGVyCkhBVklORyBnZW5kZXIgSVMgT\
k9UIE5VTEwKT1JERVIgQlkgYXZlcmFnZV9hbW91bnQgREVTQzsK').decode()

df_check = pd.read_sql_query(decoded_query, con=conn)
pd.testing.assert_frame_equal(df_result.reset_index(drop=True), df_check.reset_index(drop=True))
conn.close()
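# As a closing sanity check, the same aggregation can be reproduced in pandas. This is a minimal sketch, assuming `df_transactions` and `df_profiles` are still in memory from the populate step above; it is not part of the graded exercises.

# In[ ]:


# Pandas equivalent of query_joined3: inner-join on member_id, drop rows with
# a missing gender, then take the mean transaction amount per gender.
df_avg = (
    df_transactions.merge(df_profiles, on='member_id', how='inner')
    .dropna(subset=['gender'])
    .groupby('gender', as_index=False)['amount']
    .mean()
    .rename(columns={'amount': 'average_amount'})
    .sort_values('average_amount', ascending=False)
)
display(df_avg)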