Taking examples/examples.ipynb
as a starting point.
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys
sys.path.append("..")
sys.path.append("../..")
import numpy as np
import pandas as pd
import yellowbrick as yb
from yellowbrick.features import (ParallelCoordinates,
parallel_coordinates)
from download import download_all
## The path to the test data sets
FIXTURES = os.path.join(os.getcwd(), "data")

## Dataset loading mechanisms
# Every fixture lives at <FIXTURES>/<name>/<name>.csv, so the mapping from
# dataset name to CSV path is derived from the list of names.
datasets = {
    dataset: os.path.join(FIXTURES, dataset, dataset + ".csv")
    for dataset in ("credit", "concrete", "occupancy", "mushroom")
}
def load_data(name, download=True):
    """
    Read the named dataset's CSV file into a pandas DataFrame.

    The file location is looked up in the module-level ``datasets`` mapping.
    When the file is not on disk and ``download`` is true, ``download_all()``
    is called before reading; when ``download`` is false a ``ValueError`` is
    raised instead.
    """
    csv_path = datasets[name]
    is_missing = not os.path.exists(csv_path)

    # Caller opted out of downloading: report the missing fixture.
    if is_missing and not download:
        message = (
            "'{}' dataset has not been downloaded, "
            "use the download.py module to fetch datasets"
        ).format(name)
        raise ValueError(message)

    # Fetch the fixtures before attempting to read the file.
    if is_missing:
        download_all()

    return pd.read_csv(csv_path)
# Load the classification data set
data = load_data('occupancy')
print(len(data))
data.head()
20560
datetime | temperature | relative humidity | light | C02 | humidity | occupancy | |
---|---|---|---|---|---|---|---|
0 | 2015-02-04 17:51:00 | 23.18 | 27.2720 | 426.0 | 721.25 | 0.004793 | 1 |
1 | 2015-02-04 17:51:59 | 23.15 | 27.2675 | 429.5 | 714.00 | 0.004783 | 1 |
2 | 2015-02-04 17:53:00 | 23.15 | 27.2450 | 426.0 | 713.50 | 0.004779 | 1 |
3 | 2015-02-04 17:54:00 | 23.15 | 27.2000 | 426.0 | 708.25 | 0.004772 | 1 |
4 | 2015-02-04 17:55:00 | 23.10 | 27.2000 | 426.0 | 704.50 | 0.004757 | 1 |
# Specify the features of interest and the classes of the target
features = ["temperature", "relative humidity", "light", "C02", "humidity"]
classes = ['unoccupied', 'occupied']
# Extract the numpy arrays from the data frame
X = data.head(1000)[features]
y = data.head(1000).occupancy
# numpy inputs
visualizer = ParallelCoordinates(features=features, classes=classes)
visualizer.fit_transform_poof(X.values, y.values);
# numpy inputs, no feature labels (the `features` argument is omitted)
visualizer = ParallelCoordinates(classes=classes)
visualizer.fit_transform_poof(X.values, y.values);
# dataframe inputs
visualizer = ParallelCoordinates(classes=classes)
visualizer.fit_transform_poof(X, y);
# quick method
parallel_coordinates(X, y);
## The `normalize` argument
visualizer = ParallelCoordinates(normalize='minmax', classes=classes)
visualizer.fit_transform_poof(X, y);
visualizer = ParallelCoordinates(normalize='maxabs', classes=classes)
visualizer.fit_transform_poof(X, y);
visualizer = ParallelCoordinates(normalize='standard', classes=classes)
visualizer.fit_transform_poof(X, y);
visualizer = ParallelCoordinates(normalize='l1', classes=classes)
visualizer.fit_transform_poof(X, y);
visualizer = ParallelCoordinates(normalize='l2', classes=classes)
visualizer.fit_transform_poof(X, y);
visualizer = ParallelCoordinates(normalize='l2', classes=classes)
visualizer.fit_transform_poof(X, y);
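# should raise YellowbrickValueError
# NOTE: the traceback below shows an AttributeError instead, because the
# error message formats `self.normalize`, which has not been set at that point.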
visualizer = ParallelCoordinates(normalize='bad', classes=classes)
visualizer.fit_transform_poof(X, y);
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-17-fcf498de708c> in <module>() 1 # should raise YellowbrickValueError ----> 2 visualizer = ParallelCoordinates(normalize='bad', classes=classes) 3 visualizer.fit_transform_poof(X, y); ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/pcoords.py in __init__(self, ax, features, classes, normalize, sample, color, colormap, vlines, vlines_kwds, **kwargs) 205 raise YellowbrickValueError( 206 "'{}' is an unrecognized normalization method" --> 207 .format(self.normalize) 208 ) 209 self.sample = sample AttributeError: 'ParallelCoordinates' object has no attribute 'normalize'
# quick method
parallel_coordinates(X, y, normalize='standard');
## The `sample` argument
visualizer = ParallelCoordinates(classes=classes, sample=200)
visualizer.fit_transform_poof(X, y);
visualizer = ParallelCoordinates(classes=classes, sample=0.2)
visualizer.fit_transform_poof(X, y);
# quick method
parallel_coordinates(X, y, sample=0.2);
# should raise YellowbrickTypeError
visualizer = ParallelCoordinates(classes=classes, sample='bad')
visualizer.fit_transform_poof(X, y);
--------------------------------------------------------------------------- YellowbrickTypeError Traceback (most recent call last) <ipython-input-22-fde984a70554> in <module>() 1 # should raise YellowbrickTypeError 2 visualizer = ParallelCoordinates(classes=classes, sample='bad') ----> 3 visualizer.fit_transform_poof(X, y); ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/base.py in fit_transform_poof(self, X, y, **kwargs) 70 return the result of the transform method. 71 """ ---> 72 Xp = self.fit_transform(X, y, **kwargs) 73 self.poof(**kwargs) 74 return Xp ~/.virtualenvs/yellowbrick/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params) 495 else: 496 # fit method of arity 2 (supervised transformation) --> 497 return self.fit(X, y, **fit_params).transform(X) 498 499 ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/base.py in fit(self, X, y, **kwargs) 191 192 # Draw the instances --> 193 self.draw(X, y, **kwargs) 194 195 # Fit always returns self. ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/pcoords.py in draw(self, X, y, **kwargs) 240 else: 241 raise YellowbrickTypeError( --> 242 "`sample` parameter must be int or float" 243 ) 244 X = X[:self.n_samples, :] YellowbrickTypeError: `sample` parameter must be int or float
# should raise YellowbrickValueError
visualizer = ParallelCoordinates(classes=classes, sample=-1)
visualizer.fit_transform_poof(X, y);
--------------------------------------------------------------------------- YellowbrickValueError Traceback (most recent call last) <ipython-input-23-a8a0ae171ee4> in <module>() 1 # should raise YellowbrickValueError 2 visualizer = ParallelCoordinates(classes=classes, sample=-1) ----> 3 visualizer.fit_transform_poof(X, y); ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/base.py in fit_transform_poof(self, X, y, **kwargs) 70 return the result of the transform method. 71 """ ---> 72 Xp = self.fit_transform(X, y, **kwargs) 73 self.poof(**kwargs) 74 return Xp ~/.virtualenvs/yellowbrick/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params) 495 else: 496 # fit method of arity 2 (supervised transformation) --> 497 return self.fit(X, y, **fit_params).transform(X) 498 499 ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/base.py in fit(self, X, y, **kwargs) 191 192 # Draw the instances --> 193 self.draw(X, y, **kwargs) 194 195 # Fit always returns self. ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/pcoords.py in draw(self, X, y, **kwargs) 229 if self.sample < 1: 230 raise YellowbrickValueError( --> 231 "`sample` parameter of type `int` must be greater than 1" 232 ) 233 self.n_samples = min([self.sample, len(X)]) YellowbrickValueError: `sample` parameter of type `int` must be greater than 1
# should raise YellowbrickValueError
visualizer = ParallelCoordinates(classes=classes, sample=1.1)
visualizer.fit_transform_poof(X, y);
--------------------------------------------------------------------------- YellowbrickValueError Traceback (most recent call last) <ipython-input-24-24bc492b8f98> in <module>() 1 # should raise YellowbrickValueError 2 visualizer = ParallelCoordinates(classes=classes, sample=1.1) ----> 3 visualizer.fit_transform_poof(X, y); ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/base.py in fit_transform_poof(self, X, y, **kwargs) 70 return the result of the transform method. 71 """ ---> 72 Xp = self.fit_transform(X, y, **kwargs) 73 self.poof(**kwargs) 74 return Xp ~/.virtualenvs/yellowbrick/lib/python3.6/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params) 495 else: 496 # fit method of arity 2 (supervised transformation) --> 497 return self.fit(X, y, **fit_params).transform(X) 498 499 ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/base.py in fit(self, X, y, **kwargs) 191 192 # Draw the instances --> 193 self.draw(X, y, **kwargs) 194 195 # Fit always returns self. ~/Google Drive/projects/other/yellowbrick/yellowbrick/features/pcoords.py in draw(self, X, y, **kwargs) 235 if self.sample <= 0 or self.sample > 1: 236 raise YellowbrickValueError( --> 237 "`sample` parameter of type `float` must be between 0 and 1" 238 ) 239 self.n_samples = int(len(X) * self.sample) YellowbrickValueError: `sample` parameter of type `float` must be between 0 and 1