#!/usr/bin/env python # coding: utf-8 # # Guided Project: Answering Business Questions Using SQL # As a capstone to the Intermediate SQL Course, we have been invited to make use of the chinook database to practice our SQL skills and answer some hypothetical business questions. # ### Setting Up # The first task was getting connected to the chinook database within the jupyter kernel # In[1]: get_ipython().run_cell_magic('capture', '', '%load_ext sql\n%sql sqlite:///chinook.db\n') # #### Testing Functionalities # Next we want to familiarise ourselves with the outputs of the schema within this kernel # # Here is the schema: # ![chinook database schema](https://s3.amazonaws.com/dq-content/601%2Fchinook_medium.svg "chinook database schema") # In[2]: get_ipython().run_cell_magic('sql', '', 'SELECT\n name,\n type\nFROM sqlite_master\nWHERE type IN ("table","view");\n') # ## Task One: Popular Music Genres in the USA # In this first activity we want to figure out which three of the following four artists to stock in the Chinook record store based on the most popular genres in the USA. # # | Artist Name | Genre | # | :------------: | :------------: | # |Regal| Hip-Hop| # |Red Tone|Punk| # |Meteor and the Girls|Pop| # |Slim Jim Bites|Blues| # # ### Familiarisation & Context # I always like to see not just the raw numbers but also the content of what is selling, so I pulled a list of all albums and artists with their genres in order of album track sales in the USA. # In[3]: get_ipython().run_cell_magic('sql', '', "SELECT t.name track_name\n ,ar.name artist_name\n ,al.title album_name\n ,g.name genre\n ,count(*) tracks_sold\nFROM invoice i\nLEFT JOIN invoice_line il ON i.invoice_id = il.invoice_id\nLEFT JOIN track t ON t.track_id = il.track_id\nLEFT JOIN genre g ON g.genre_id = t.genre_id\nLEFT JOIN album al ON al.album_id = t.album_id\nLEFT JOIN artist ar ON ar.artist_id = al.artist_id\nWHERE i.billing_country = 'USA'\nGROUP BY track_name, artist_name,album_name,genre\nORDER BY 5 DESC\nLIMIT 40\n") # Next I pulled the total number of tracks sold in the USA # In[4]: get_ipython().run_cell_magic('sql', '', "SELECT count(*) tracks_sold \n FROM invoice i\n LEFT JOIN invoice_line il ON i.invoice_id = il.invoice_id\n WHERE i.billing_country = 'USA'\n") # I subsequently calculated the total number of tracks sold in each genre and the percentage of total sales for each. # In[5]: get_ipython().run_cell_magic('sql', '', " SELECT g.name genre\n ,count(invoice_line_id) tracks_sold \n ,ROUND(count(*)*100.0/(SELECT count(*) \n FROM invoice i\n LEFT JOIN invoice_line il ON i.invoice_id = il.invoice_id\n WHERE i.billing_country = 'USA'),2) genre_sales_pct\nFROM invoice i\nLEFT JOIN invoice_line il ON i.invoice_id = il.invoice_id\nLEFT JOIN track t ON t.track_id = il.track_id\nLEFT JOIN genre g ON g.genre_id = t.genre_id\nWHERE i.billing_country = 'USA'\nGROUP BY genre\nORDER BY 2 DESC\n") # ### Task One Conclusion # Whilst Rock is the top genre in the USA, Punk, Pop & Blues are the best choices based on their relative popularity compare to Hip-Hop. # # Therefore the Chinook record store should stock: # # | Artist Name | Genre | # | :------------: | :------------: | # |Red Tone|Punk| # |Meteor and the Girls|Pop| # |Slim Jim Bites|Blues| # # ## Task Two: Chinook Employee Evaluation # In this next activity we want to ascertain the factors influencing the performance of the sales support staff at Chinook. # # ### Familiarisation & Context # First off I generated a table of all relevant data for employees and their customers purchases. # In[6]: get_ipython().run_cell_magic('sql', '', ' SELECT sr.*,i.*,il.*,ar.*\n FROM invoice i\n LEFT JOIN customer c ON i.customer_id = c.customer_id\n LEFT JOIN employee sr ON c.support_rep_id = sr.employee_id\n LEFT JOIN invoice_line il ON i.invoice_id = il.invoice_id\n LEFT JOIN track t ON t.track_id = il.track_id\n LEFT JOIN album al ON al.album_id = t.album_id\n LEFT JOIN artist ar ON ar.artist_id = al.artist_id\nLIMIT 50\n') # ### Analysis # I then generated a summary of the following by each employee (**bold = required**) # - **total revenue from sales** # - **total sales orders** # - total number of clients # - average tracks sold per order # - average revenue per order # - top selling artist # - date of first sale by each employee # In[7]: get_ipython().run_cell_magic('sql', '', 'WITH revenue_base AS (\n SELECT sr.*,i.*\n FROM invoice i\n LEFT JOIN customer c ON i.customer_id = c.customer_id\n LEFT JOIN employee sr ON c.support_rep_id = sr.employee_id\n )\n, detail_base AS(SELECT sr.*,i.*,il.*,ar.*\n FROM invoice i\n LEFT JOIN customer c ON i.customer_id = c.customer_id\n LEFT JOIN employee sr ON c.support_rep_id = sr.employee_id\n LEFT JOIN invoice_line il ON i.invoice_id = il.invoice_id\n LEFT JOIN track t ON t.track_id = il.track_id\n LEFT JOIN album al ON al.album_id = t.album_id\n LEFT JOIN artist ar ON ar.artist_id = al.artist_id\n )\n\nSELECT\n b.first_name||" "||b.last_name support_rep_name\n,SUM(b.total) total_revenue\n,ROUND(SUM(b.total)*100.0/(SELECT sum(b2.total) \n FROM revenue_base b2),2) revenue_pct\n,COUNT(b.invoice_id) no_purchases\n,COUNT(DISTINCT b.customer_id) no_of_clients\n,(SELECT COUNT(DISTINCT b2.invoice_line_id)/COUNT(DISTINCT b2.invoice_id)\n FROM detail_base b2\n WHERE b.employee_id=b2.employee_id) avg_tracks_per_sale\n,ROUND(SUM(b.total)/COUNT(b.invoice_id),2) avg_revenue_pp\n,(SELECT b2.name artist\n FROM detail_base b2\n WHERE b.employee_id = b2.employee_id\n GROUP BY artist\n ORDER BY MAX(unit_price) DESC\n ,COUNT(invoice_id) DESC\n ,MAX(total) DESC\n LIMIT 1) top_selling_artist\n,MIN (b.invoice_date) first_sale\nFROM revenue_base b\nGROUP BY employee_id,support_rep_name\nORDER BY 2 DESC, 4 DESC;\n\n') # #### Additional Context # To add context to the top selling artist, I also looked at the sales rate of these artists and the maximum prices of their music # I found overall that most artists music sells for ~99c, especially the most popular songs # In[8]: get_ipython().run_cell_magic('sql', '', 'SELECT DISTINCT\n ar.name artist\n ,MAX(il.unit_price) max_track_price\n ,count(il.invoice_line_id) no_tracks_sold\nFROM artist ar\nLEFT JOIN album al ON al.artist_id = ar.artist_id\nLEFT JOIN track t ON t.album_id = al.album_id\nLEFT JOIN invoice_line il ON il.track_id = t.track_id\nGROUP BY artist\nORDER BY 3 DESC\nLIMIT 30\n') # ### Summary of Findings # Overall, it seems their revenue split is proportional to the number of clients they have which seems to be a function of their tenure at the company # - Jane is the rep with the highest revenue, despite Margaret generating more sales. # - Interestingly , Jane is also selling on average one more track in each sale than her peers which puts her share of the revenue generated at 3.3% above the average. Margaret generates 4% more of the revenues than Steve, with the lowest. # - As the newest member of the team, Steve is generating lower revenue due to simply making fewer sales, as his tracks per sale and revenue per sale is equal to Margaret # ## Task Three: Sales Performance by Country # For this activity, we want to calculate data, for each country, on the: # # - total number of customers # - total value of sales # - average value of sales per customer # - average order value # # Grouping smaller markets as 'Other' and forcing that row to the bottom of the results # In[9]: get_ipython().run_cell_magic('sql', '', "WITH country_clients AS(\nSELECT c.country client_country\n ,COUNT(DISTINCT c.customer_id) no_of_customers\n ,SUM(i.total) total_revenue\n ,AVG(i.total) avg_order_value\nFROM customer c\nLEFT JOIN invoice i ON i. customer_id = c.customer_id\nGROUP BY client_country)\n\n, summary AS(\n SELECT\n (CASE WHEN no_of_customers = 1 THEN 'Other'\n ELSE client_country END) country\n ,SUM(no_of_customers) no_of_customers\n ,ROUND(SUM(total_revenue),2) total_revenue\n ,ROUND(SUM(total_revenue)/SUM(no_of_customers),2) revenue_per_client\n ,ROUND(AVG(avg_order_value),2) avg_order_value\n ,(CASE WHEN avg(no_of_customers) = 1 THEN 2\n ELSE 1 END) ranking\n FROM country_clients\n GROUP BY 1)\n\nSELECT country, no_of_customers, total_revenue, revenue_per_client,avg_order_value\nFROM summary\nORDER BY ranking ASC,3 DESC, 4 DESC;\n") # #### Notes: # I opted to arrange this query using CTEs for clarity. # - The first CTE **`country_clients`** gave the basic summmary of all countries # - The second CTE **`summary`** employs two CASE clauses, one to identify the small markets, and the other to label them for to be listed last # - The final query selects all outputs of the **`summary`** CTE with the exception of the `ranking` column which is just used as the first ORDER BY condition # ## Task Four: Album vs Collection Purchases # This activity requires identification of orders as either comprising whole albums or collections made up of many individual tracks (disregarding purchases of whole albums made by selecting all tracks) # # This is to drive the decision as to whether the Chinook store should move away from offering sales of full albums and only purchase and sell the most popular tracks for the sake of saving costs # ### Working Out Steps # # I found this really challenging, but eventually I arrived at the solution in my own style using the following: # 1. a base CTE with all the albums associated with each invoice # 2. an intermidate CTE with all the tracks on each album # 3. a secondary intermediate CTE to label the invoices # # #### Notes on the subqueries embedded in the CASE WHEN clauses # To use the EXCEPT function accurately: # - First write out the query to SELECT the following: # - a) All track_ids of tracks which are on a given invoice_id # - b) All track_ids of tracks which are on a given album # - Then ensure your references tie back relevantly to your intermediate CTEs # In[10]: get_ipython().run_cell_magic('sql', '', "WITH base AS (\n SELECT i.invoice_id\n ,al.album_id\n FROM invoice i\n LEFT JOIN invoice_line il ON i.invoice_id = il.invoice_id\n LEFT JOIN track t ON il.track_id = t.track_id\n LEFT JOIN album al ON t.album_id =al.album_id\n)\n, album_tracks AS (\n SELECT al.album_id\n ,t.track_id\n FROM track t\n LEFT JOIN album al\n ON t.album_id =al.album_id)\n\n, purchase_types AS(\n SELECT base.invoice_id\n ,(CASE WHEN (SELECT t.track_id\n FROM track t\n WHERE album_id = base.album_id\n\n EXCEPT \n\n SELECT il.track_id\n FROM invoice_line il\n WHERE il.invoice_id = base.invoice_id) IS NULL\n AND \n (SELECT il.track_id\n FROM invoice_line il\n WHERE il.invoice_id = base.invoice_id\n \n EXCEPT \n \n SELECT t.track_id\n FROM track t\n WHERE album_id = base.album_id) IS NULL\n THEN 'Album'\n ELSE 'Collection' END) purchase_type\n FROM base\n)\nSELECT \n purchase_type\n,COUNT(DISTINCT invoice_id) AS no_invoices\n,ROUND(COUNT(DISTINCT invoice_id)*100\n /(SELECT COUNT(DISTINCT invoice_id)\n FROM base),2) AS percentage\nFROM purchase_types\nGROUP BY purchase_type\nUNION\nSELECT\n'Total'\n,COUNT(DISTINCT invoice_id)\n,ROUND(COUNT(DISTINCT invoice_id)*100/(SELECT COUNT(DISTINCT invoice_id)\n FROM base),2)\nFROM base\n") # ### Task Four: Conclusion # # The majority of customers purchase custom collections of songs, however their is a non-negligable minority (almost 20%) of customers who do still bu complete albums. It would potentially be unwise to limit these clients' freedom to purchase how they like for the sake of margins as we might lose custom along the way which would be counter intuitive. # # Further analysis into the types of albums purchases and perhaps listening data if we can gather it could help make a more nuanced intervention if we still want to pursue this line of thought. # ## Extension Activities # ### Activity One: Which artist is used in the most playlists? # # The answer to this is interesting, from its wording I started off by looking at the most playlists featuring each artist. Here, we find there are lots of artists featured lightly on many playlists. But then, looking at the number of times an artist is featured on playlist, this is more intersting. We find more usefully which artists are featured heavily on but on fewer playlists. # In[11]: get_ipython().run_cell_magic('sql', '', 'WITH base AS (\nSELECT pl.*\n ,ar.*\nFROM playlist pl\nLEFT JOIN playlist_track plt ON plt.playlist_id = pl.playlist_id\nLEFT JOIN track t ON t.track_id = plt.track_id\nLEFT JOIN album al ON al.album_id = t.album_id\nLEFT JOIN artist ar ON ar.artist_id = al.artist_id)\n\nSELECT DISTINCT "name:1" artist\n ,COUNT(DISTINCT playlist_id) no_playlists\n ,COUNT(playlist_id) total_features\nFROM base\nGROUP BY artist\nORDER BY 3 DESC, 2 DESC\n') # ### Activity Two: How many tracks have been purchased vs not purchased? # # Almost half of the songs on the database have never been purchased! That's a lot! # In[12]: get_ipython().run_cell_magic('sql', '', "SELECT ever_purchased\n , COUNT (track_id) no_tracks\nFROM(\nSELECT t.track_id\n ,SUM(il.quantity) total_purchases\n ,CASE WHEN SUM(il.quantity) >0 THEN 'Yes'\n ELSE 'No' END ever_purchased\nFROM track t\nLEFT JOIN invoice_line il ON il.track_id = t.track_id\nGROUP BY t.track_id)\nGROUP BY ever_purchased\n") # ### Activity Three: Is the range of tracks in the store reflective of their sales popularity? # # Arguably not, the most popular genres are also the ones with the most supply, but if we look at avg sales per track, the top three genres are # # |Genre| # |:----------:| # |Easy Listening| # |Alternative| # |R&B/Soul| # # So I would recommend that we stock more of these. # In[13]: get_ipython().run_cell_magic('sql', '', ' SELECT g.name genre\n ,COUNT(invoice_line_id) tracks_sold \n ,COUNT(DISTINCT t.track_id) no_available\n ,ROUND(COUNT(invoice_line_id)*1.0/COUNT(DISTINCT t.track_id),2) avg_sales_per_track\n ,ROUND(count(invoice_line_id)*100.0/(SELECT count(invoice_line_id) \n FROM invoice i\n LEFT JOIN invoice_line il\n ON i.invoice_id = il.invoice_id),2) genre_sales_pct\nFROM track t\nLEFT JOIN invoice_line il ON t.track_id = il.track_id\nLEFT JOIN invoice i ON i.invoice_id = il.invoice_id\nLEFT JOIN genre g ON g.genre_id = t.genre_id\nGROUP BY genre\nORDER BY 4 DESC,5 DESC\n') # ### Activity Four: Do protected vs non-protected media types have an effect on popularity? # In[14]: get_ipython().run_cell_magic('sql', '', " SELECT CASE WHEN mt.name LIKE 'Protected%' THEN 'Protected'\n ELSE 'Non_Protected' END media_type\n ,COUNT(invoice_line_id) tracks_sold \n ,COUNT(DISTINCT t.track_id) no_available\n ,ROUND(COUNT(invoice_line_id)*1.0/COUNT(DISTINCT t.track_id),2) avg_sales_per_track\n ,ROUND(count(invoice_line_id)*100.0/(SELECT count(invoice_line_id) \n FROM invoice i\n LEFT JOIN invoice_line il\n ON i.invoice_id = il.invoice_id),2) genre_sales_pct\nFROM track t\nLEFT JOIN invoice_line il ON t.track_id = il.track_id\nLEFT JOIN invoice i ON i.invoice_id = il.invoice_id\nLEFT JOIN media_type mt ON mt.media_type_id = t.media_type_id\nGROUP BY media_type\nORDER BY 4 DESC,5 DESC\n") # ### Summary # In short, yes. Non-Protected tracks sell 9x as many as Protected tracks