import addutils.toc ; addutils.toc.js(ipy_notebook=True)
import scipy.io
import numpy as np
import pandas as pd
import bokeh.plotting as bk   # needed for the Bokeh plots below (bk.figure / bk.show)
import warnings
from addutils import css_notebook
css_notebook()
bk.output_notebook()          # render Bokeh figures inline in the notebook
warnings.filterwarnings('ignore')
In scikit-learn, almost all operations are done through an estimator object. For example, a linear regression estimator can be instantiated as follows:
from sklearn import linear_model
model = linear_model.LinearRegression(fit_intercept=True, normalize=True)
print(model)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)
The values shown in parentheses are the current settings of the estimator's "hyperparameters". To learn what each specific hyperparameter does, check the documentation:
# Try: model?
Hyperparameters can also be changed after the model has been created:
model.fit_intercept = False
print(model)
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=True)
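Alternatively, every scikit-learn estimator exposes get_params() and set_params() for the same purpose; a minimal sketch:
print(model.get_params())               # all current hyperparameter values as a dict
model.set_params(fit_intercept=True)    # equivalent to assigning the attribute directly
print(model)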
Given a scikit-learn estimator object named model, the following methods are available:

- model.fit(): fit training data. For supervised learning this takes two arguments, the data and the labels: model.fit(X, y). For unsupervised learning it takes only the data: model.fit(X).
- model.predict(): predict the label of a new set of data. This accepts one argument, the new data: model.predict(X_new).
- model.predict_proba(): returns the probability of each categorical label; the label itself is returned by model.predict() (see the sketch after this list).
- model.score(): scores are between 0 and 1, with a larger score indicating a better fit.
- model.transform(): transform new data into the new basis. This accepts one argument, X_new, and returns the new representation of the data.
- model.fit_transform(): some estimators implement this method, which more efficiently performs a fit and a transform on the same input data.
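To make the role of these methods concrete, here is a minimal sketch with a classifier on a small synthetic dataset (make_classification and LogisticRegression are used only for this illustration, not elsewhere in this notebook):
from sklearn import datasets, linear_model

# Toy classification problem: 100 samples, 4 features, 2 classes
X_c, y_c = datasets.make_classification(n_samples=100, n_features=4, random_state=0)

clf = linear_model.LogisticRegression()
clf.fit(X_c, y_c)                               # supervised fit: data and labels
print(clf.predict(X_c[:3]))                     # predicted labels for the first 3 samples
print(clf.predict_proba(X_c[:3]))               # per-class probabilities for the same samples
print('Accuracy: %.2f' % clf.score(X_c, y_c))   # score between 0 and 1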
from sklearn import datasets, preprocessing, metrics
X, y = datasets.make_regression(n_samples=70, n_features=1, n_informative=1,
                                random_state=0, noise=5)
scaler = preprocessing.MinMaxScaler()
X_sc = scaler.fit_transform(X)
lin = linear_model.LinearRegression(fit_intercept=True)
lin.fit(X_sc, y)
print(lin)
print("Model coefficient: %.5f, and intercept: %.5f" % (lin.coef_, lin.intercept_))
err = metrics.mean_squared_error(lin.predict(X_sc), y)
print("Mean Squared Error: %.2f" % err)
# Plot the data and the model prediction
X_p = np.linspace(0, 1, 2)[:, np.newaxis]
y_p = lin.predict(X_p)
fig = bk.figure(title='Simple Regression',
x_axis_label='X scaled',
y_axis_label='y',
plot_width=600, plot_height=300)
fig.circle(X_sc.squeeze(), y, line_color='darkgreen', size=10,
fill_color='green', fill_alpha=0.5, legend='Measured Data')
fig.line(X_p.ravel(), y_p, line_color='blue', legend='Predicted Values')
fig.legend.location = 'bottom_right'
bk.show(fig)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
Model coefficient: 275.99550, and intercept: -145.41186
Mean Squared Error: 23.04
MATLAB data files can be read with scipy.io.loadmat: v4 (Level 1.0), v6 and v7 to 7.2 mat-files are supported. To read MATLAB 7.3 format mat-files an HDF5 Python library is required; please check the scipy documentation for more information.
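For MATLAB 7.3 files, a minimal sketch of the HDF5 route with h5py (the file name here is hypothetical, and note that MATLAB stores arrays column-major, so they come back transposed):
import h5py

# Hypothetical v7.3 file: each MATLAB variable appears as an HDF5 dataset
with h5py.File('example_data/matlab_v73_data.mat', 'r') as f:
    print(list(f.keys()))            # variable names stored in the file
    X73 = np.array(f['X']).T         # transpose back to MATLAB's orientation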
The data can be generated with the following MATLAB code:
% Generate Regression Test Data
X = [1 2 3
4 5 6
7 8 9
0 1 2] + 0.1;
y = sum(X,2);
feat_names = strvcat('Feature One', 'Feature Two', 'Feature Three');
save ('matlab_test_data_01', 'X','y', 'feat_names')
mat_data = scipy.io.loadmat('example_data/matlab_test_data_01.mat')
The variable names included in the .mat file are keys of the mat_data dictionary. In addition, the key '__header__' contains the mat-file information. Below we load the two variables into pandas DataFrames:
mat_data.keys()
dict_keys(['__header__', '__version__', '__globals__', 'X', 'y', 'feat_names'])
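For example, the mat-file metadata mentioned above can be inspected directly:
print(mat_data['__header__'])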
In the following code the .strip() method is used to remove the trailing white spaces that MATLAB adds to make all the variable names the same length:
X = pd.DataFrame(mat_data['X'], columns=[s.strip() for s in list(mat_data['feat_names'])])
y = pd.DataFrame(mat_data['y'], columns=['measured'])
print(X, '\n\n', y)
   Feature One  Feature Two  Feature Three
0          1.1          2.1            3.1
1          4.1          5.1            6.1
2          7.1          8.1            9.1
3          0.1          1.1            2.1

   measured
0       6.3
1      15.3
2      24.3
3       3.3
A typical scikit-learn dataset is a dictionary-like object that holds all the data and metadata:

- the .data field is a 2D array of shape [n_samples, n_features]
- the .target field is a 1D array of length n_samples

Scikit-learn makes available a host of datasets for testing learning algorithms:
sklearn.datasets.load_*
sklearn.datasets.fetch_*
sklearn.datasets.make_*
Try by yourself:
datasets.load_<TAB>
datasets.fetch_<TAB>
datasets.make_<TAB>
#datasets.make_
Features in the Iris dataset: sepal length (cm), sepal width (cm), petal length (cm), petal width (cm).
Target classes to predict: setosa, versicolor, virginica.
d = datasets.load_iris()
Try one of the following commands yourself, where d is the variable containing the dataset:
print(d.keys())              # Structure of the contained data
print(d.DESCR)               # A complete description of the dataset
print(d.data.shape)          # [n_samples, n_features]
print(d.target.shape)        # [n_samples,]
print(d.feature_names)
datasets.get_data_home()     # This is where the datasets are stored
print(d.keys())
print(d.target_names)
print(d.feature_names)
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
The Digits dataset contains 1797 samples with 64 features each: every feature represents the grey-scale value of one pixel of an 8x8 digit image.
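The 64 features are just the flattened 8x8 pixel grid; a quick check (a minimal sketch) confirms that digits.data and digits.images hold the same values:
digits = datasets.load_digits()
print(digits.data.shape)        # (1797, 64)  - flattened pixel values
print(digits.images.shape)      # (1797, 8, 8) - the same pixels as 8x8 images
print(np.allclose(digits.data,
                  digits.images.reshape(len(digits.images), -1)))   # True
The following code plots the first forty digits as images: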
from bokeh.palettes import Greys9
from bokeh.models.ranges import Range1d
import addutils.palette as pal
import addutils.imagegrid as ig
digits = datasets.load_digits()
# plot the digits: each image is 8x8 pixels
images = [ digits.images[i][::-1, :] for i in range(40) ]
txt = [ str(i) for i in range(10) ] * 4
fig = ig.imagegrid_figure(figure_plot_width=760, figure_plot_height=100,
figure_title=None,
images=images, grid_size=(20, 2),
text=txt, text_font_size='9pt', text_color='red',
palette=Greys9[::-1], padding=0.2)
bk.show(fig)
import seaborn as sns
cat_colors = list(map(pal.to_hex, sns.color_palette('Paired', 7)))
data, color_indices = datasets.make_blobs(n_samples=2000, n_features=2, centers=7,
center_box=(-4.0, 6.0), cluster_std=0.5)
fig = bk.figure(title=None)
fig.circle(data[:,0], data[:,1],
line_color='black', line_alpha=0.5, size=8,
fill_color=pal.linear_map(color_indices, cat_colors,
low=0, high=6))
bk.show(fig)
Visit www.add-for.com for more tutorials and updates.
This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.