%matplotlib inline
"""
The data set in this example contains 1059 songs from various countries, obtained
from the UCI Machine Learning Repository. Various features of the audio tracks have
been extracted, and each track has been tagged with the latitude and longitude of the
capital city of its country of origin.
We'll treat this as a classification problem and attempt to train a model to predict
the country of origin of each track.
The data source does not specify what the audio features actually are, saying only:
"In the 'default_features_1059_tracks.txt' file, the first 68 columns are audio
features of the track, and the last two columns are the origin of the music,
represented by latitude and longitude.
In the 'default_plus_chromatic_features_1059_tracks.txt' file, the first 116
columns are audio features of the track, and the last two columns are the
origin of the music."
"""
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.multiclass import unique_labels
import sys
#First get the data. The UCI ML Library distributes it as a zipped file;
#download the data and extract the two provided files to the 'data' folder before continuing
music_df = pd.read_csv('data/default_plus_chromatic_features_1059_tracks.txt', header=None)
music = music_df.values
#Our features are all but the last two columns
X = music[:,0:-2]
#Since feature names were not given, we'll just assign strings with an incrementing integer
names = np.linspace(start=1, stop=116, num=116, dtype='int').tolist()
for idx, name in enumerate(names):
    names[idx] = "Feature " + str(name)
#The source data said that each song is tied to the capital city of its origin country via a lat/lon pair.
#Let's treat this as a multi-class classification problem.
#Rather than reverse-geocoding, we'll just make a string out of the unique lat/lon pairs
lats = ["%.2f" % lat for lat in music_df[116]]
lons = ["%.2f" % lon for lon in music_df[117]]
song_latlons = []
for index, value in enumerate(lats):
    city_id = lats[index] + "," + lons[index]
    song_latlons.append(city_id)
unique_latlons = unique_labels(song_latlons)
city_options = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z','AA','AB','AC','AD','AE','AF','AG']
city_name_map = {}
for idx, latlon in enumerate(unique_latlons):
    city_name_map[latlon] = city_options[idx]
ylist = []
for latlon in song_latlons:
    ylist.append(city_name_map[latlon])
y = np.array(ylist)
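#Aside: sklearn's LabelEncoder (imported above) could build integer class labels directly
#from the lat/lon strings; a minimal sketch for comparison -- we keep the letter names
#because they are easier to read on the visualizer axes. y_int is not used below.
encoder = LabelEncoder()
y_int = encoder.fit_transform(song_latlons)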
#We want yellowbrick to import from this repository, and assume this notebook is in repofolder/examples/subfolder/
sys.path.append("../../")
import yellowbrick as yb
from yellowbrick.features.rankd import Rank2D
from yellowbrick.features.radviz import RadViz
from yellowbrick.features.pcoords import ParallelCoordinates
#See how well correlated the features are
visualizer = Rank2D(features = names, algorithm = 'pearson')
visualizer.fit(X, y)
visualizer.transform(X)
visualizer.poof()
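#RadViz and ParallelCoordinates were imported above but not run; a minimal sketch of how
#they could be applied to the same data, assuming the same fit/transform/poof pattern as
#Rank2D. With 116 features these plots will be dense, so this is exploratory only.
radviz = RadViz(classes=city_options, features=names)
radviz.fit(X, y)
radviz.transform(X)
radviz.poof()
pcoords = ParallelCoordinates(classes=city_options, features=names)
pcoords.fit(X, y)
pcoords.transform(X)
pcoords.poof()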
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from yellowbrick.classifier import ClassificationReport
def train_and_classification_report(model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print("prec: {}".format(metrics.precision_score(y_true=y_test, y_pred=y_predict, average="weighted")))
    print("rec: {}".format(metrics.recall_score(y_true=y_test, y_pred=y_predict, average="weighted")))
    cr_viz = ClassificationReport(model) #,classes=city_options
    cr_viz.fit(X_train, y_train)
    cr_viz.score(X_test, y_test)
    cr_viz.poof()
#Add module reloading so we can edit the yellowbrick source code and see the results here.
import importlib
importlib.reload(yb.classifier)
from yellowbrick.classifier import ClassificationReport
#This produces an IndexError: list index out of range.
train_and_classification_report(LogisticRegression())
prec: 0.430726301383904
rec: 0.4056603773584906
C:\Users\humph\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
C:\Users\humph\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1115: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-34-10397df83f0c> in <module>()
      3 from yellowbrick.classifier import ClassificationReport
      4 #This produces an IndexError: list index out of range.
----> 5 train_and_classification_report(LogisticRegression())

<ipython-input-21-cd9cb9f45f93> in train_and_classification_report(model)
     10     cr_viz = ClassificationReport(model) #,classes=city_options
     11     cr_viz.fit(X_train, y_train)
---> 12     cr_viz.score(X_test, y_test)
     13     cr_viz.poof()
     14 

C:\Users\humph\Documents\Github\yellowbrick\yellowbrick\classifier.py in score(self, X, y, **kwargs)
    133         self.scores = map(lambda s: dict(zip(self.classes_, s)), self.scores[0:3])
    134         self.scores = dict(zip(keys, self.scores))
--> 135         return self.draw(y, y_pred)
    136 
    137     def draw(self, y, y_pred):

C:\Users\humph\Documents\Github\yellowbrick\yellowbrick\classifier.py in draw(self, y, y_pred)
    158         for column in range(len(self.matrix)+1):
    159             for row in range(len(self.classes_)):
--> 160                 self.ax.text(column,row,self.matrix[row][column],va='center',ha='center')
    161 
    162         fig = plt.imshow(self.matrix, interpolation='nearest', cmap=self.cmap, vmin=0, vmax=1)

IndexError: list index out of range
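#Reading the traceback: draw() loops columns over range(len(self.matrix)+1) but indexes
#self.matrix[row][column], and score() builds three score sets (precision, recall, f1).
#If the matrix is (number of classes) rows by 3 metric columns -- an assumption based on
#the score() snippet above -- the column index overruns once there are more than two
#classes. A quick sanity check of that reading with our class count (not yellowbrick internals):
n_classes = len(unique_latlons)
print("column indices attempted: {}, metric columns per row: 3".format(n_classes + 1))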
#This demonstrates a version of the Seaborn confusion matrix heatmap we could replicate (and improve on).
def train_and_confusion_matrix(model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print("prec: {}".format(metrics.precision_score(y_true=y_test, y_pred=y_predict, average="weighted")))
    print("rec: {}".format(metrics.recall_score(y_true=y_test, y_pred=y_predict, average="weighted")))
    c_matrix = confusion_matrix(y_true=y_test, y_pred=y_predict)
    sns.heatmap(c_matrix, square=True, annot=True, cbar=False, xticklabels=city_options, yticklabels=city_options)
    plt.xlabel('predicted value')
    plt.ylabel('true value')
train_and_confusion_matrix(LogisticRegression())
prec: 0.430726301383904
rec: 0.4056603773584906
C:\Users\humph\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1115: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
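#One way to improve on the raw-count heatmap above (a minimal sketch, assuming row
#normalization is the improvement we want): scale each row so cells read as the fraction
#of each true class, which is easier to compare when class sizes differ. Passing
#labels=city_options keeps the matrix aligned with the tick labels even if a class is
#missing from the test split.
def train_and_normalized_confusion_matrix(model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    c_matrix = confusion_matrix(y_true=y_test, y_pred=y_predict, labels=city_options).astype(float)
    row_sums = c_matrix.sum(axis=1, keepdims=True)
    c_matrix = np.divide(c_matrix, row_sums, out=np.zeros_like(c_matrix), where=row_sums != 0)
    sns.heatmap(c_matrix, square=True, cbar=True, xticklabels=city_options, yticklabels=city_options)
    plt.xlabel('predicted value')
    plt.ylabel('true value')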
def train_and_class_balance(model):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
    class_balance = yb.classifier.ClassBalance(model, classes=city_options)
    class_balance.fit(X_train, y_train)
    class_balance.score(X_test, y_test)
    class_balance.poof()
train_and_class_balance(LogisticRegression())
C:\Users\humph\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1115: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)