Chinook Records Store Database

In this project we will be working with a modified version of a database called Chinook. The Chinook database contains information about a fictional digital music shop - a smaller version of iTunes.

The database contains information about the artists, songs, and albums from the music shop, as well as information on the shop's employees, customers, and their purchases. This information is contained in eleven tables. Here is a schema diagram for the Chinook database:

Chinook.png

In [1]:
%%capture
%load_ext sql
%sql sqlite:///chinook.db
# Connect to the database
In [2]:
# Import python libraries
import sqlite3 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
%%sql 
/*# Database overview */

SELECT name, type
  FROM sqlite_master
 WHERE type = 'table'
 * sqlite:///chinook.db
Done.
Out[3]:
name type
album table
artist table
customer table
employee table
genre table
invoice table
invoice_line table
media_type table
playlist table
playlist_track table
track table

Sales Analysis By Country

Here we will display the following data for each country where customers have bought tracks in the store:

  • total number of customers
  • total sales
  • average sales per customer
  • average order value

Because there are several countries with only one customer we will group these into a row named "Other" and keep it on the bottom of the table after sorting.

In [4]:
%%sql
/*# Display aggregate sales data per country */

WITH country_countrynew AS
    (
     SELECT country, 
            CASE
                WHEN COUNT(DISTINCT customer_id) <= 1 THEN 'Other'
                ELSE country
                END
                AS country_new
      FROM  customer
     GROUP BY country
    )

SELECT country_new AS country,
       COUNT(DISTINCT c.customer_id) AS customers,
       ROUND(SUM(i.total), 2) AS sales,
       ROUND(SUM(i.total) / COUNT(DISTINCT c.customer_id), 2) AS avg_sales_per_customer,
       ROUND(SUM(i.total) / COUNT(DISTINCT i.invoice_id), 2) AS avg_sales_per_order
FROM customer c
INNER JOIN invoice as i ON i.customer_id = c.customer_id
INNER JOIN country_countrynew AS ccn ON ccn.country = c.country
GROUP BY country_new
ORDER BY (country_new='Other'), sales DESC;
 * sqlite:///chinook.db
Done.
Out[4]:
country customers sales avg_sales_per_customer avg_sales_per_order
USA 13 1040.49 80.04 7.94
Canada 8 535.59 66.95 7.05
Brazil 5 427.68 85.54 7.01
France 5 389.07 77.81 7.78
Germany 4 334.62 83.66 8.16
Czech Republic 2 273.24 136.62 9.11
United Kingdom 3 245.52 81.84 8.77
Portugal 2 185.13 92.57 6.38
India 2 183.15 91.58 8.72
Other 15 1094.94 73.0 7.45
In [5]:
# Create variable to be passed into pd.read_sql
query_country = """WITH country_countrynew AS
    (
     SELECT country, 
            CASE
                WHEN COUNT(DISTINCT customer_id) <= 1 THEN 'Other'
                ELSE country
                END
                AS country_new
      FROM  customer
     GROUP BY country
    )

SELECT country_new AS country,
       COUNT(DISTINCT c.customer_id) AS customers,
       ROUND(SUM(i.total), 2) AS sales,
       ROUND(SUM(i.total) / COUNT(DISTINCT c.customer_id), 2) AS avg_sales_per_customer,
       ROUND(SUM(i.total) / COUNT(DISTINCT i.invoice_id), 2) AS avg_sales_per_order
FROM customer c
INNER JOIN invoice as i ON i.customer_id = c.customer_id
INNER JOIN country_countrynew AS ccn ON ccn.country = c.country
GROUP BY country_new
ORDER BY (country_new='Other'), sales DESC;"""

# Connect to database
conn = sqlite3.connect('chinook.db')

# Read into dataframe
country_stats = pd.read_sql_query(query_country, conn)

# Reset index and create list of countries to pass as x-axis labels
country_stats = country_stats.set_index('country', drop=True)
country_list = country_stats.index.tolist()
In [6]:
# Plot
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10,10))
fig.subplots_adjust(hspace=0.4)
plt.rcParams['figure.dpi'] = 460

ax1 = plt.subplot(2,2,1)
ax1 = sns.barplot(x=country_list, y='sales', data=country_stats, color="steelblue")
ax1.set_title('Sales By Country (USD)', fontsize=13)
ax1.set_ylabel('')
ax1.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")


ax2 = plt.subplot(2,2,2)
ax2 = sns.barplot(x=country_list, y='avg_sales_per_customer', data=country_stats, color="steelblue")
ax2.set_title('Average Sales Per Customer (USD)', fontsize=13)
ax2.set_ylabel('')
ax2.set_ylim([50,140])
ax2.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")

ax3 = plt.subplot(2,2,3)
ax3 = sns.barplot(x=country_list, y='avg_sales_per_order', data=country_stats, color="steelblue")
ax3.set_title('Average Sales Per Order (USD)', fontsize=13)
ax3.set_ylabel('')
ax3.set_ylim([5,10])
ax3.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")

ax4 = plt.subplot(2,2,4)
ax4 = sns.barplot(x=country_list, y='customers', data=country_stats, color="steelblue")
ax4.set_title('Number Of Customers', fontsize=13)
ax4.set_ylabel('')
ax4.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")

Observations

  • The United States has the most sales of any other country.
  • Countries in the 'Other' category have more combined sales than the United States.
  • Most customers spend between 70 and 90 dollars with the notable exception of our customers from the Czech Republic.
  • The Czech Republic has the most sales per customer and the largest sales per order. Since the country only has two customers, it is hard to conclude that customers from the Czech Republic spend more in general or if our two customers just happen to be big spenders.

Measurements of Song Popularity

Next, will look at the follwing popularity metrics:

  • genre popularity
  • song appearances in playlists
  • songs that never sell
  • Digital Rights Management (DRM) protected vs. non-protected file types

Genre Popularity

The Chinook record store has just signed a deal with a new record label, and they are considering three albums that will be added to the store from a list of four:

  • Regal - Hip-Hop
  • Red Tone - Punk
  • Meteor and the Girls - Pop
  • Slim Jim Bites - Blues

Since the record label specializes on artists from the USA we are interested in finding out which of these genres sell best in across the country.

In [7]:
%%sql
/*# Display all genres in the USA sorted by the number of tracks sold */

WITH usa_tracks AS
    (
     SELECT t.genre_id AS genre_id, il.invoice_line_id
       FROM track as t
      INNER JOIN invoice_line as il ON il.track_id=t.track_id
      INNER JOIN invoice as i ON i.invoice_id = il.invoice_id
      WHERE i.billing_country = 'USA'
    )
    
SELECT g.name, 
       COUNT(usa.genre_id) AS tracks_sold,
       ROUND(CAST(COUNT(usa.genre_id) AS FLOAT)/ (SELECT COUNT(genre_id)
                                FROM usa_tracks),2) AS percentage
FROM usa_tracks AS usa
INNER JOIN genre as g ON g.genre_id = usa.genre_id

GROUP BY g.name
ORDER BY tracks_sold DESC;
 * sqlite:///chinook.db
Done.
Out[7]:
name tracks_sold percentage
Rock 561 0.53
Alternative & Punk 130 0.12
Metal 124 0.12
R&B/Soul 53 0.05
Blues 36 0.03
Alternative 35 0.03
Pop 22 0.02
Latin 22 0.02
Hip Hop/Rap 20 0.02
Jazz 14 0.01
Easy Listening 13 0.01
Reggae 6 0.01
Electronica/Dance 5 0.0
Classical 4 0.0
Heavy Metal 3 0.0
Soundtrack 2 0.0
TV Shows 1 0.0
In [8]:
# Create variable to pass into pd.read_sql
query_genre = """ WITH usa_tracks AS
    (
     SELECT t.genre_id AS genre_id, il.invoice_line_id
       FROM track as t
      INNER JOIN invoice_line as il ON il.track_id=t.track_id
      INNER JOIN invoice as i ON i.invoice_id = il.invoice_id
      WHERE i.billing_country = 'USA'
    )
    
SELECT g.name, 
       COUNT(usa.genre_id) AS tracks_sold,
       ROUND(CAST(COUNT(usa.genre_id) AS FLOAT)/ (SELECT COUNT(genre_id)
                                FROM usa_tracks),2) AS percentage
FROM usa_tracks AS usa
INNER JOIN genre as g ON g.genre_id = usa.genre_id

GROUP BY g.name
ORDER BY tracks_sold DESC;"""

# Connect to database
conn = sqlite3.connect('chinook.db')

# Read into dataframe
genre = pd.read_sql_query(query_genre, conn)
In [9]:
# Label genres with less than 20 tracks sold to 'All Others' for pie chart
genre['name'] = genre['name'].mask(genre['tracks_sold'] <20, 'All Others')

# Group, sort, and slice for the pie chart
genre = genre.groupby('name').sum()
genre = genre.sort_values(by='tracks_sold', ascending=False)
genre = genre.iloc[:,0]
genre
Out[9]:
name
Rock                  561
Alternative & Punk    130
Metal                 124
R&B/Soul               53
All Others             48
Blues                  36
Alternative            35
Latin                  22
Pop                    22
Hip Hop/Rap            20
Name: tracks_sold, dtype: int64
In [10]:
# Get 'All Others' to bottom of series for pie chart
genre = genre.drop('All Others')
temp = pd.Series(48)
temp = temp.rename({0:'All Others'})
genre = genre.append(temp)
In [11]:
explode = (0, 0.15, 0, 0, 0, 0, 0, 0.2, 0.2, 0)

# Plot
fig, ax = plt.subplots(figsize=(14,14))
genre.plot(kind='pie', fontsize=18, explode=explode, startangle=40, labeldistance=1.05)

# Plot Aesthetics
ax.set_ylabel('')
ax.set_title('Top Selling Genres (USA)', fontsize=24)
Out[11]:
Text(0.5, 1.0, 'Top Selling Genres (USA)')