In this project we will be working with a modified version of a database called Chinook. The Chinook database contains information about a fictional digital music shop - a smaller version of iTunes.
The database contains information about the artists, songs, and albums from the music shop, as well as information on the shop's employees, customers, and their purchases. This information is contained in eleven tables. Here is a schema diagram for the Chinook database:
%%capture
%load_ext sql
%sql sqlite:///chinook.db
# Connect to the database
# Import python libraries
import sqlite3
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%%sql
/*# Database overview */
SELECT name, type
FROM sqlite_master
WHERE type = 'table'
Here we will display the following data for each country where customers have bought tracks in the store:
Because there are several countries with only one customer we will group these into a row named "Other" and keep it on the bottom of the table after sorting.
%%sql
/*# Display aggregate sales data per country */
WITH country_countrynew AS
(
SELECT country,
CASE
WHEN COUNT(DISTINCT customer_id) <= 1 THEN 'Other'
ELSE country
END
AS country_new
FROM customer
GROUP BY country
)
SELECT country_new AS country,
COUNT(DISTINCT c.customer_id) AS customers,
ROUND(SUM(i.total), 2) AS sales,
ROUND(SUM(i.total) / COUNT(DISTINCT c.customer_id), 2) AS avg_sales_per_customer,
ROUND(SUM(i.total) / COUNT(DISTINCT i.invoice_id), 2) AS avg_sales_per_order
FROM customer c
INNER JOIN invoice as i ON i.customer_id = c.customer_id
INNER JOIN country_countrynew AS ccn ON ccn.country = c.country
GROUP BY country_new
ORDER BY (country_new='Other'), sales DESC;
# Create variable to be passed into pd.read_sql
query_country = """WITH country_countrynew AS
(
SELECT country,
CASE
WHEN COUNT(DISTINCT customer_id) <= 1 THEN 'Other'
ELSE country
END
AS country_new
FROM customer
GROUP BY country
)
SELECT country_new AS country,
COUNT(DISTINCT c.customer_id) AS customers,
ROUND(SUM(i.total), 2) AS sales,
ROUND(SUM(i.total) / COUNT(DISTINCT c.customer_id), 2) AS avg_sales_per_customer,
ROUND(SUM(i.total) / COUNT(DISTINCT i.invoice_id), 2) AS avg_sales_per_order
FROM customer c
INNER JOIN invoice as i ON i.customer_id = c.customer_id
INNER JOIN country_countrynew AS ccn ON ccn.country = c.country
GROUP BY country_new
ORDER BY (country_new='Other'), sales DESC;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
country_stats = pd.read_sql_query(query_country, conn)
# Reset index and create list of countries to pass as x-axis labels
country_stats = country_stats.set_index('country', drop=True)
country_list = country_stats.index.tolist()
# Plot
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10,10))
fig.subplots_adjust(hspace=0.4)
plt.rcParams['figure.dpi'] = 460
ax1 = plt.subplot(2,2,1)
ax1 = sns.barplot(x=country_list, y='sales', data=country_stats, color="steelblue")
ax1.set_title('Sales By Country (USD)', fontsize=13)
ax1.set_ylabel('')
ax1.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
ax2 = plt.subplot(2,2,2)
ax2 = sns.barplot(x=country_list, y='avg_sales_per_customer', data=country_stats, color="steelblue")
ax2.set_title('Average Sales Per Customer (USD)', fontsize=13)
ax2.set_ylabel('')
ax2.set_ylim([50,140])
ax2.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
ax3 = plt.subplot(2,2,3)
ax3 = sns.barplot(x=country_list, y='avg_sales_per_order', data=country_stats, color="steelblue")
ax3.set_title('Average Sales Per Order (USD)', fontsize=13)
ax3.set_ylabel('')
ax3.set_ylim([5,10])
ax3.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
ax4 = plt.subplot(2,2,4)
ax4 = sns.barplot(x=country_list, y='customers', data=country_stats, color="steelblue")
ax4.set_title('Number Of Customers', fontsize=13)
ax4.set_ylabel('')
ax4.tick_params(labelsize=9)
plt.xticks(rotation = 45, ha='right')
sns.despine(left=True)
sns.set(style="whitegrid")
Next, will look at the follwing popularity metrics:
The Chinook record store has just signed a deal with a new record label, and they are considering three albums that will be added to the store from a list of four:
Since the record label specializes on artists from the USA we are interested in finding out which of these genres sell best in across the country.
%%sql
/*# Display all genres in the USA sorted by the number of tracks sold */
WITH usa_tracks AS
(
SELECT t.genre_id AS genre_id, il.invoice_line_id
FROM track as t
INNER JOIN invoice_line as il ON il.track_id=t.track_id
INNER JOIN invoice as i ON i.invoice_id = il.invoice_id
WHERE i.billing_country = 'USA'
)
SELECT g.name,
COUNT(usa.genre_id) AS tracks_sold,
ROUND(CAST(COUNT(usa.genre_id) AS FLOAT)/ (SELECT COUNT(genre_id)
FROM usa_tracks),2) AS percentage
FROM usa_tracks AS usa
INNER JOIN genre as g ON g.genre_id = usa.genre_id
GROUP BY g.name
ORDER BY tracks_sold DESC;
# Create variable to pass into pd.read_sql
query_genre = """ WITH usa_tracks AS
(
SELECT t.genre_id AS genre_id, il.invoice_line_id
FROM track as t
INNER JOIN invoice_line as il ON il.track_id=t.track_id
INNER JOIN invoice as i ON i.invoice_id = il.invoice_id
WHERE i.billing_country = 'USA'
)
SELECT g.name,
COUNT(usa.genre_id) AS tracks_sold,
ROUND(CAST(COUNT(usa.genre_id) AS FLOAT)/ (SELECT COUNT(genre_id)
FROM usa_tracks),2) AS percentage
FROM usa_tracks AS usa
INNER JOIN genre as g ON g.genre_id = usa.genre_id
GROUP BY g.name
ORDER BY tracks_sold DESC;"""
# Connect to database
conn = sqlite3.connect('chinook.db')
# Read into dataframe
genre = pd.read_sql_query(query_genre, conn)
# Label genres with less than 20 tracks sold to 'All Others' for pie chart
genre['name'] = genre['name'].mask(genre['tracks_sold'] <20, 'All Others')
# Group, sort, and slice for the pie chart
genre = genre.groupby('name').sum()
genre = genre.sort_values(by='tracks_sold', ascending=False)
genre = genre.iloc[:,0]
genre
# Get 'All Others' to bottom of series for pie chart
genre = genre.drop('All Others')
temp = pd.Series(48)
temp = temp.rename({0:'All Others'})
genre = genre.append(temp)
explode = (0, 0.15, 0, 0, 0, 0, 0, 0.2, 0.2, 0)
# Plot
fig, ax = plt.subplots(figsize=(14,14))
genre.plot(kind='pie', fontsize=18, explode=explode, startangle=40, labeldistance=1.05)
# Plot Aesthetics
ax.set_ylabel('')
ax.set_title('Top Selling Genres (USA)', fontsize=24)