%matplotlib inline
from preamble import *
import os
# The file has no headers naming the columns, so we pass header=None
# and provide the column names explicitly in "names"
adult_path = os.path.join(mglearn.datasets.DATA_PATH, "adult.data")
data = pd.read_csv(
adult_path,
header=None,
index_col=False,
names=['age', 'workclass', 'fnlwgt', 'education', 'education-num',
'marital-status', 'occupation', 'relationship', 'race', 'gender',
'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
'income']
)
# For illustration purposes, we only select some of the columns:
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']]
# IPython.display allows nice output formatting within the Jupyter notebook
display(data.head())
print(data.size)  # size counts cells: rows x columns
| | age | workclass | education | gender | hours-per-week | occupation | income |
|---|---|---|---|---|---|---|---|
| 0 | 39 | State-gov | Bachelors | Male | 40 | Adm-clerical | <=50K |
| 1 | 50 | Self-emp-not-inc | Bachelors | Male | 13 | Exec-managerial | <=50K |
| 2 | 38 | Private | HS-grad | Male | 40 | Handlers-cleaners | <=50K |
| 3 | 53 | Private | 11th | Male | 40 | Handlers-cleaners | <=50K |
| 4 | 28 | Private | Bachelors | Female | 40 | Prof-specialty | <=50K |
227927
print(data.gender.value_counts())
Male      21790
Female    10771
Name: gender, dtype: int64
print(data.workclass.value_counts())
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64
print("Original features:", list(data.columns))
print("length of features:", len(data.columns))
Original features: ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income']
length of features: 7
data_dummies = pd.get_dummies(data)
print("Features after get_dummies:\n", list(data_dummies.columns))
print("length of dummy features:", len(data_dummies.columns))
Features after get_dummies:
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov',
 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private',
 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th',
 'education_ 11th', 'education_ 12th', 'education_ 1st-4th',
 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th',
 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors',
 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters',
 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college',
 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical',
 'occupation_ Armed-Forces', 'occupation_ Craft-repair',
 'occupation_ Exec-managerial', 'occupation_ Farming-fishing',
 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct',
 'occupation_ Other-service', 'occupation_ Priv-house-serv',
 'occupation_ Prof-specialty', 'occupation_ Protective-serv',
 'occupation_ Sales', 'occupation_ Tech-support',
 'occupation_ Transport-moving', 'income_ <=50K', 'income_ >50K']
length of dummy features: 46
display(data_dummies.head(n=10))
| | age | hours-per-week | workclass_ ? | workclass_ Federal-gov | ... | occupation_ Tech-support | occupation_ Transport-moving | income_ <=50K | income_ >50K |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 39 | 40 | 0 | 0 | ... | 0 | 0 | 1 | 0 |
| 1 | 50 | 13 | 0 | 0 | ... | 0 | 0 | 1 | 0 |
| 2 | 38 | 40 | 0 | 0 | ... | 0 | 0 | 1 | 0 |
| 3 | 53 | 40 | 0 | 0 | ... | 0 | 0 | 1 | 0 |
| 4 | 28 | 40 | 0 | 0 | ... | 0 | 0 | 1 | 0 |
| 5 | 37 | 40 | 0 | 0 | ... | 0 | 0 | 1 | 0 |
| 6 | 49 | 16 | 0 | 0 | ... | 0 | 0 | 1 | 0 |
| 7 | 52 | 45 | 0 | 0 | ... | 0 | 0 | 0 | 1 |
| 8 | 31 | 50 | 0 | 0 | ... | 0 | 0 | 0 | 1 |
| 9 | 42 | 40 | 0 | 0 | ... | 0 | 0 | 0 | 1 |
10 rows × 46 columns
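Because get_dummies derives the dummy columns from whatever values happen to be present, encoding training and test data separately can yield mismatched columns. A minimal sketch of one way to keep them aligned (the toy frames and values below are made up for illustration):

import pandas as pd

# toy frames: 'Never-worked' appears only in the test data
df_train = pd.DataFrame({'workclass': ['Private', 'State-gov']})
df_test = pd.DataFrame({'workclass': ['Private', 'Never-worked']})

dummies_train = pd.get_dummies(df_train)
# align the test columns to the training columns, filling missing ones with 0
dummies_test = pd.get_dummies(df_test).reindex(columns=dummies_train.columns,
                                               fill_value=0)
print(list(dummies_train.columns))
print(list(dummies_test.columns))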
one_hot_encoded = data_dummies.loc[:, 'workclass_ ?':'workclass_ Without-pay']
display(one_hot_encoded.head(n=10))
| | workclass_ ? | workclass_ Federal-gov | workclass_ Local-gov | workclass_ Never-worked | ... | workclass_ Self-emp-inc | workclass_ Self-emp-not-inc | workclass_ State-gov | workclass_ Without-pay |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 1 | 0 |
| 1 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 5 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 6 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 7 | 0 | 0 | 0 | 0 | ... | 0 | 1 | 0 | 0 |
| 8 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
| 9 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 |
10 rows × 9 columns
print(type(one_hot_encoded.values))
print(one_hot_encoded.values[:10])
<class 'numpy.ndarray'>
[[0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 1 0 0]
 [0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0]]
# Get only the columns containing features --
# that is, all columns from 'age' to 'occupation_ Transport-moving'.
# This range contains all the features but not the target
features = data_dummies.loc[:, 'age':'occupation_ Transport-moving']
# extract NumPy arrays
X = features.values
y = data_dummies['income_ >50K'].values
print("X.shape: {} y.shape: {}".format(X.shape, y.shape))
X.shape: (32561, 44) y.shape: (32561,)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print("Test score: {:.2f}".format(logreg.score(X_test, y_test)))
Test score: 0.81
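For comparison, a sketch of the same model built with scikit-learn's own encoder instead of pandas; ColumnTransformer and string support in OneHotEncoder assume scikit-learn 0.20 or later, and the target comparison assumes the raw income values keep their leading space (as the dummy column names above suggest):

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

categorical = ['workclass', 'education', 'gender', 'occupation']
# one-hot encode the categorical columns, pass the numeric ones through
ct = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), categorical)],
    remainder='passthrough')
pipe = make_pipeline(ct, LogisticRegression())

X_adult = data.drop('income', axis=1)  # 'data' from the cells above
y_adult = data['income'] == ' >50K'
X_tr, X_te, y_tr, y_te = train_test_split(X_adult, y_adult, random_state=0)
pipe.fit(X_tr, y_tr)
print("Test score: {:.2f}".format(pipe.score(X_te, y_te)))

Encoding inside a pipeline like this also removes the risk of the train/test column mismatch illustrated earlier.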
# create a DataFrame with an integer feature and a categorical string feature
demo_df = pd.DataFrame(
{'Integer Feature': [0, 1, 2, 1],
'Categorical Feature': ['socks', 'fox', 'socks', 'box']}
)
display(demo_df)
| | Categorical Feature | Integer Feature |
|---|---|---|
| 0 | socks | 0 |
| 1 | fox | 1 |
| 2 | socks | 2 |
| 3 | box | 1 |
display(pd.get_dummies(demo_df))
| | Integer Feature | Categorical Feature_box | Categorical Feature_fox | Categorical Feature_socks |
|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 1 |
| 1 | 1 | 0 | 1 | 0 |
| 2 | 2 | 0 | 0 | 1 |
| 3 | 1 | 1 | 0 | 0 |
demo_df['Integer Feature'] = demo_df['Integer Feature'].astype(str)
display(pd.get_dummies(demo_df, columns=['Integer Feature', 'Categorical Feature']))
| | Integer Feature_0 | Integer Feature_1 | Integer Feature_2 | Categorical Feature_box | Categorical Feature_fox | Categorical Feature_socks |
|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 0 | 1 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 2 | 0 | 0 | 1 | 0 | 0 | 1 |
| 3 | 0 | 1 | 0 | 1 | 0 | 0 |
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
X, y = mglearn.datasets.make_wave(n_samples=100)
print("X.shape: {}".format(X.shape))
print("y.shape: {}".format(y.shape))
print()
for i in range(10):
print(X[i], y[i])
X.shape: (100, 1)
y.shape: (100,)

[-0.753] -0.3979485798878842
[2.704] 0.7105775485755936
[1.392] 0.41392866721449156
[0.592] -0.3483837936512941
[-2.064] -1.6020040642044855
[-2.064] -1.3135709853245343
[-2.651] -0.12426799844607195
[2.197] 1.1366058452312982
[0.607] 0.22684365004805757
[1.248] -0.10700112891754687
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
print(line.shape)
reg = DecisionTreeRegressor(min_samples_split=3).fit(X, y)
plt.plot(line, reg.predict(line), label="decision tree")
reg = LinearRegression().fit(X, y)
plt.plot(line, reg.predict(line), label="linear regression")
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")
(1000, 1)
[Figure: decision tree and linear regression fits on the wave dataset]
bins = np.linspace(-3, 3, 11)
print("bins: {}".format(bins))
bins: [-3. -2.4 -1.8 -1.2 -0.6 0. 0.6 1.2 1.8 2.4 3. ]
which_bin = np.digitize(X, bins=bins)
print("\nData points:\n", X[:5])
print("\nBin membership for data points:\n", which_bin[:5])
Data points:
 [[-0.753]
 [ 2.704]
 [ 1.392]
 [ 0.592]
 [-2.064]]

Bin membership for data points:
 [[ 4]
 [10]
 [ 8]
 [ 6]
 [ 2]]
from sklearn.preprocessing import OneHotEncoder
# transform using the OneHotEncoder; sparse=False makes transform return a
# dense NumPy array instead of a sparse matrix
encoder = OneHotEncoder(sparse=False)
encoder.fit(which_bin)
X_binned = encoder.transform(which_bin)
print(X_binned[:5])
[[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
print("X.shape: {}".format(X.shape))
print("X_binned.shape: {}".format(X_binned.shape))
X.shape: (100, 1)
X_binned.shape: (100, 10)
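Newer scikit-learn versions (0.20+) bundle the digitize-then-encode steps into a single transformer, KBinsDiscretizer; a sketch under that version assumption (note its bin edges come from the observed data range rather than the fixed [-3, 3] grid used above):

from sklearn.preprocessing import KBinsDiscretizer

kb = KBinsDiscretizer(n_bins=10, strategy='uniform', encode='onehot-dense')
X_binned_kb = kb.fit_transform(X)  # X: the wave data from above
print("X_binned_kb.shape:", X_binned_kb.shape)
print("bin edges:", kb.bin_edges_[0].round(2))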
encoder = OneHotEncoder(sparse=False)
which_bin = np.digitize(X, bins=bins)
encoder.fit(which_bin)
X_binned = encoder.transform(which_bin)
line = np.linspace(-3, 3, 1000, endpoint=False).reshape(-1, 1)
print("line.shape:", line.shape)
line_binned = encoder.transform(np.digitize(line, bins=bins))
print("line_binned.shape:", line_binned.shape)
reg = LinearRegression().fit(X_binned, y)
plt.plot(line, reg.predict(line_binned), label='linear regression binned')
reg = DecisionTreeRegressor(min_samples_split=3).fit(X_binned, y)
plt.plot(line, reg.predict(line_binned), label='decision tree binned')
plt.plot(X[:, 0], y, 'o', c='k')
plt.vlines(bins, -3, 3, linewidth=1, alpha=.2)
plt.legend(loc="best")
plt.ylabel("Regression output")
plt.xlabel("Input feature")
line.shape: (1000, 1)
line_binned.shape: (1000, 10)
[Figure: linear regression and decision tree on the binned wave data, with bin boundaries marked]
X_combined = np.hstack([X, X_binned])
print(X_combined.shape)
print(X_combined[:5])
(100, 11)
[[-0.753  0.     0.     0.     1.     0.     0.     0.     0.     0.     0.   ]
 [ 2.704  0.     0.     0.     0.     0.     0.     0.     0.     0.     1.   ]
 [ 1.392  0.     0.     0.     0.     0.     0.     0.     1.     0.     0.   ]
 [ 0.592  0.     0.     0.     0.     0.     1.     0.     0.     0.     0.   ]
 [-2.064  0.     1.     0.     0.     0.     0.     0.     0.     0.     0.   ]]
reg = LinearRegression().fit(X_combined, y)
line_combined = np.hstack([line, line_binned])
plt.plot(line, reg.predict(line_combined), label='linear regression combined')
plt.vlines(bins, -3, 3, linewidth=1, alpha=.2)
plt.legend(loc="best")
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.plot(X[:, 0], y, 'o', c='k')
[Figure: linear regression on the combined (original + binned) features]
X_product = np.hstack([X_binned, X * X_binned])
print(X_product.shape)
print(X_product[:5])
(100, 20)
[[ 0.     0.     0.     1.     0.     0.     0.     0.     0.     0.    -0.    -0.    -0.    -0.753 -0.    -0.    -0.    -0.    -0.    -0.   ]
 [ 0.     0.     0.     0.     0.     0.     0.     0.     0.     1.     0.     0.     0.     0.     0.     0.     0.     0.     0.     2.704]
 [ 0.     0.     0.     0.     0.     0.     0.     1.     0.     0.     0.     0.     0.     0.     0.     0.     0.     1.392  0.     0.   ]
 [ 0.     0.     0.     0.     0.     1.     0.     0.     0.     0.     0.     0.     0.     0.     0.     0.592  0.     0.     0.     0.   ]
 [ 0.     1.     0.     0.     0.     0.     0.     0.     0.     0.    -0.    -2.064 -0.    -0.    -0.    -0.    -0.    -0.    -0.    -0.   ]]
reg = LinearRegression().fit(X_product, y)
line_product = np.hstack([line_binned, line * line_binned])
plt.plot(line, reg.predict(line_product), label='linear regression product')
for b in bins:  # avoid shadowing the built-in name 'bin'
    plt.plot([b, b], [-3, 3], ':', c='k', linewidth=1)
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")
[Figure: linear regression with a separate slope per bin (product features)]
from sklearn.preprocessing import PolynomialFeatures
# include polynomials up to x ** 10;
# the default include_bias=True would add a feature that is constantly 1,
# so we turn it off here
poly = PolynomialFeatures(degree=10, include_bias=False)
poly.fit(X)
X_poly = poly.transform(X)
print("X.shape: {}".format(X.shape))
print("X_poly.shape: {}".format(X_poly.shape))
X.shape: (100, 1)
X_poly.shape: (100, 10)
print("Entries of X:\n{}".format(X[:5]))
print("Entries of X_poly:\n{}".format(X_poly[:5]))
Entries of X:
[[-0.753]
 [ 2.704]
 [ 1.392]
 [ 0.592]
 [-2.064]]
Entries of X_poly:
[[   -0.753     0.567    -0.427     0.321    -0.242     0.182    -0.137     0.103    -0.078     0.058]
 [    2.704     7.313    19.777    53.482   144.632   391.125  1057.714  2860.36   7735.232 20918.278]
 [    1.392     1.938     2.697     3.754     5.226     7.274    10.125    14.094    19.618    27.307]
 [    0.592     0.35      0.207     0.123     0.073     0.043     0.025     0.015     0.009     0.005]
 [   -2.064     4.26     -8.791    18.144   -37.448    77.289  -159.516   329.222  -679.478  1402.367]]
print("Polynomial feature names:\n{}".format(poly.get_feature_names()))
Polynomial feature names:
['x0', 'x0^2', 'x0^3', 'x0^4', 'x0^5', 'x0^6', 'x0^7', 'x0^8', 'x0^9', 'x0^10']
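A quick sanity check, not in the original cells: with include_bias=False, column k-1 of X_poly should simply be the input raised to the k-th power.

# verify each polynomial column against the corresponding power of x
for k in range(1, 11):
    assert np.allclose(X_poly[:, k - 1], X[:, 0] ** k)
print("all 10 polynomial columns match the corresponding powers of x")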
reg = LinearRegression().fit(X_poly, y)
line_poly = poly.transform(line)
plt.plot(line, reg.predict(line_poly), label='polynomial linear regression')
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")
[Figure: degree-10 polynomial linear regression on the wave dataset]
from sklearn.svm import SVR
for gamma in [1, 10]:
svr = SVR(gamma=gamma).fit(X, y)
plt.plot(line, svr.predict(line), label='SVR gamma={}'.format(gamma))
plt.plot(X[:, 0], y, 'o', c='k')
plt.ylabel("Regression output")
plt.xlabel("Input feature")
plt.legend(loc="best")
[Figure: SVR with gamma=1 and gamma=10 on the wave dataset]
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target, random_state=0)
# rescale data
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
poly = PolynomialFeatures(degree=2).fit(X_train_scaled)
X_train_poly = poly.transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_poly.shape: {}".format(X_train_poly.shape))
X_train.shape: (379, 13)
X_train_poly.shape: (379, 105)
print("Polynomial feature names:\n{}".format(poly.get_feature_names()))
Polynomial feature names:
['1', 'x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10', 'x11', 'x12',
 'x0^2', 'x0 x1', 'x0 x2', 'x0 x3', 'x0 x4', 'x0 x5', 'x0 x6', 'x0 x7', 'x0 x8',
 'x0 x9', 'x0 x10', 'x0 x11', 'x0 x12', 'x1^2', 'x1 x2', 'x1 x3', 'x1 x4', 'x1 x5',
 'x1 x6', 'x1 x7', 'x1 x8', 'x1 x9', 'x1 x10', 'x1 x11', 'x1 x12', 'x2^2', 'x2 x3',
 'x2 x4', 'x2 x5', 'x2 x6', 'x2 x7', 'x2 x8', 'x2 x9', 'x2 x10', 'x2 x11', 'x2 x12',
 'x3^2', 'x3 x4', 'x3 x5', 'x3 x6', 'x3 x7', 'x3 x8', 'x3 x9', 'x3 x10', 'x3 x11',
 'x3 x12', 'x4^2', 'x4 x5', 'x4 x6', 'x4 x7', 'x4 x8', 'x4 x9', 'x4 x10', 'x4 x11',
 'x4 x12', 'x5^2', 'x5 x6', 'x5 x7', 'x5 x8', 'x5 x9', 'x5 x10', 'x5 x11', 'x5 x12',
 'x6^2', 'x6 x7', 'x6 x8', 'x6 x9', 'x6 x10', 'x6 x11', 'x6 x12', 'x7^2', 'x7 x8',
 'x7 x9', 'x7 x10', 'x7 x11', 'x7 x12', 'x8^2', 'x8 x9', 'x8 x10', 'x8 x11', 'x8 x12',
 'x9^2', 'x9 x10', 'x9 x11', 'x9 x12', 'x10^2', 'x10 x11', 'x10 x12', 'x11^2',
 'x11 x12', 'x12^2']
from sklearn.linear_model import Ridge
ridge = Ridge().fit(X_train_scaled, y_train)
print("Score without interactions: {:.3f}".format(ridge.score(X_test_scaled, y_test)))
ridge = Ridge().fit(X_train_poly, y_train)
print("Score with interactions: {:.3f}".format(ridge.score(X_test_poly, y_test)))
Score without interactions: 0.621
Score with interactions: 0.753
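The scale-expand-fit sequence above can also be wrapped into a single estimator; a sketch of the same steps with a pipeline, which keeps the transformations applied consistently to any split:

from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

pipe = make_pipeline(MinMaxScaler(), PolynomialFeatures(degree=2), Ridge())
pipe.fit(X_train, y_train)  # the raw boston split from above
print("Pipeline score with interactions: {:.3f}".format(pipe.score(X_test, y_test)))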
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100).fit(X_train_scaled, y_train)
print("Score without interactions: {:.3f}".format(rf.score(X_test_scaled, y_test)))
rf = RandomForestRegressor(n_estimators=100).fit(X_train_poly, y_train)
print("Score with interactions: {:.3f}".format(rf.score(X_test_poly, y_test)))
Score without interactions: 0.819
Score with interactions: 0.769
rnd = np.random.RandomState(0)
X_org = rnd.normal(size=(1000, 3))
w = rnd.normal(size=3)
print("X_org.shape:", X_org.shape)
print("w.shape:", w.shape)
print()
X = rnd.poisson(lam=10 * np.exp(X_org))
y = np.dot(X_org, w)
print("X.shape:", X.shape)
print("y.shape:", y.shape)
print()
print("X_org[:10, 0]:", X_org[:10, 0])
print("X[:10, 0]", X[:10, 0])
print("y[:10]", y[:10])
X_org.shape: (1000, 3)
w.shape: (3,)

X.shape: (1000, 3)
y.shape: (1000,)

X_org[:10, 0]: [ 1.764  2.241  0.95   0.411  0.761  0.334  0.313  0.654  2.27  -0.187]
X[:10, 0] [ 56  81  25  20  27  18  12  21 109   7]
y[:10] [2.926 4.744 1.439 0.57  1.231 1.405 0.305 1.618 2.784 0.405]
print("Number of feature appearances:\n{}".format(np.bincount(X[:, 0])))
Number of feature appearances:
[28 38 68 48 61 59 45 56 37 40 35 34 36 26 23 26 27 21 23 23 18 21 10  9
 17  9  7 14 12  7  3  8  4  5  5  3  4  2  4  1  1  3  2  5  3  8  2  5
  2  1  2  3  3  2  2  3  3  0  1  2  1  0  0  3  1  0  0  0  1  3  0  1
  0  2  0  1  1  0  0  0  0  1  0  0  2  2  0  1  1  0  0  0  0  1  1  0
  0  0  0  0  0  0  1  0  0  0  0  0  1  1  0  0  1  0  0  0  0  0  0  0
  1  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1]
bins = np.bincount(X[:, 0])
plt.bar(range(len(bins)), bins, color='grey')
plt.ylabel("Number of appearances")
plt.xlabel("Value")
[Figure: histogram of how often each value of X[:, 0] appears]
from sklearn.linear_model import Ridge
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
score = Ridge().fit(X_train, y_train).score(X_test, y_test)
print("Test score: {:.3f}".format(score))
Test score: 0.622
X_train_log = np.log(X_train + 1)
X_test_log = np.log(X_test + 1)
plt.hist(X_train_log[:, 0], bins=25, color='gray')
plt.ylabel("Number of appearances")
plt.xlabel("Value")
plt.show()
score = Ridge().fit(X_train_log, y_train).score(X_test_log, y_test)
print("Test score: {:.3f}".format(score))
Test score: 0.875
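The log(x + 1) step can also live inside a transformer so it is applied consistently to any split; a sketch using FunctionTransformer (np.log1p computes log(1 + x), matching the manual transformation above):

from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

log_pipe = make_pipeline(FunctionTransformer(np.log1p, validate=False), Ridge())
score = log_pipe.fit(X_train, y_train).score(X_test, y_test)
print("Test score: {:.3f}".format(score))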
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
cancer = load_breast_cancer()
print("Shape of cancer data: {}".format(cancer.data.shape))
Shape of cancer data: (569, 30)
# get deterministic random numbers
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 50))
print("Shape of noise: {}".format(noise.shape))
Shape of noise: (569, 50)
# add noise features to the data
# the first 30 features are from the dataset, the next 50 are noise
X_w_noise = np.hstack([cancer.data, noise])
print("Shape of X_w_noise: {}".format(X_w_noise.shape))
Shape of X_w_noise: (569, 80)
X_train, X_test, y_train, y_test = train_test_split(X_w_noise, cancer.target, random_state=0, test_size=.5)
# use f_classif (the default) and SelectPercentile to select 50% of features
select = SelectPercentile(percentile=50)
select.fit(X_train, y_train)
# transform training set
X_train_selected = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_selected.shape: {}".format(X_train_selected.shape))
X_train.shape: (284, 80)
X_train_selected.shape: (284, 40)
mask = select.get_support()
print(mask)
# indices (below 30) of the original features that were not selected
print([x for x in np.where(mask == False)[0].tolist() if x < 30])
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Sample index")
plt.yticks(())
[ True  True  True  True  True  True  True  True  True False  True False
  True  True  True  True  True  True False False  True  True  True  True
  True  True  True  True  True  True False False False  True False  True
 False False  True False False False False  True False False  True False
 False  True False  True False False False False False False  True False
  True False False False False  True False  True False False False False
  True  True False  True False False False False]
[9, 11, 18, 19]
[Figure: mask of features chosen by SelectPercentile (black = selected)]
mask.reshape(1, -1)
array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
        False,  True, False,  True,  True,  True,  True,  True,  True,
        False, False,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True, False, False, False,  True, False,  True,
        False, False,  True, False, False, False, False,  True, False,
        False,  True, False, False,  True, False,  True, False, False,
        False, False, False, False,  True, False,  True, False, False,
        False, False,  True, False,  True, False, False, False, False,
         True,  True, False,  True, False, False, False, False]])
from sklearn.linear_model import LogisticRegression
# transform test data
X_test_selected = select.transform(X_test)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print("Score with all features: {:.3f}".format(lr.score(X_test, y_test)))
lr.fit(X_train_selected, y_train)
print("Score with only selected features: {:.3f}".format(lr.score(X_test_selected, y_test)))
Score with all features: 0.930
Score with only selected features: 0.940
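The fitted selector also exposes the univariate statistics it used; with the default f_classif these are per-feature ANOVA F-values and p-values, which can be inspected directly:

# F-values and p-values computed by f_classif during select.fit above
print("scores_.shape:", select.scores_.shape)
print("p-values of the first five original features:", select.pvalues_[:5])
print("p-values of the first five noise features:", select.pvalues_[30:35])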
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
select = SelectFromModel(
RandomForestClassifier(n_estimators=100, random_state=42),
threshold="median"
)
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_l1.shape: {}".format(X_train_l1.shape))
X_train.shape: (284, 80)
X_train_l1.shape: (284, 40)
mask = select.get_support()
print(mask)
# indices (below 30) of the original features that were not selected
print([x for x in np.where(mask == False)[0].tolist() if x < 30])
# visualize the mask. black is True, white is False
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Sample index")
plt.yticks(())
[ True  True  True  True  True  True  True  True  True  True  True False
  True  True False  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True False False  True False
 False False False False  True False False  True False  True  True False
 False False  True False False  True  True False False False False False
 False  True False False False False False False False  True False False
 False  True False False False False False False]
[11, 14]
[Figure: mask of features chosen by SelectFromModel (black = selected)]
X_test_l1 = select.transform(X_test)
score = LogisticRegression().fit(X_train_l1, y_train).score(X_test_l1, y_test)
print("Test score: {:.3f}".format(score))
Test score: 0.951
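SelectFromModel keeps the forest it fit in estimator_ and the resolved "median" cutoff in threshold_, so the basis for the selection can be inspected:

importances = select.estimator_.feature_importances_
print("threshold_: {:.4f}".format(select.threshold_))
print("features at or above the threshold:",
      np.sum(importances >= select.threshold_))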
from sklearn.feature_selection import RFE
select = RFE(
RandomForestClassifier(n_estimators=100, random_state=42),
n_features_to_select=40
)
select.fit(X_train, y_train)
X_train_rfe = select.transform(X_train)
print("X_train.shape: {}".format(X_train.shape))
print("X_train_rfe.shape: {}".format(X_train_rfe.shape))
X_train.shape: (284, 80)
X_train_rfe.shape: (284, 40)
# visualize the selected features:
mask = select.get_support()
print(mask)
# indices (below 30) of the original features that were not selected
print([x for x in np.where(mask == False)[0].tolist() if x < 30])
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("Sample index")
plt.yticks(())
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True False  True False False False False
 False False False False False False False False False False False  True
  True False  True False False  True  True False False False  True False
 False False False False False  True False  True False False False False
 False  True False False False  True False False]
[16]
[Figure: mask of features chosen by RFE (black = selected)]
X_train_rfe = select.transform(X_train)
X_test_rfe = select.transform(X_test)
score = LogisticRegression().fit(X_train_rfe, y_train).score(X_test_rfe, y_test)
print("Test score: {:.3f}".format(score))
Test score: 0.951
print("Test score: {:.3f}".format(select.score(X_test, y_test)))
Test score: 0.951
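RFE also records the order in which features were eliminated in ranking_ (1 for every kept feature, larger numbers for features dropped in earlier rounds); a quick look at the 50 noise columns added above:

print("ranking_ of the noise features:\n", select.ranking_[30:])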
citibike = mglearn.datasets.load_citibike()
print("citibike.data.shape:", citibike.shape)
print(type(citibike))
citibike.shape: (248,)
<class 'pandas.core.series.Series'>
print("Citibike data:\n{}".format(citibike.head()))
Citibike data:
starttime
2015-08-01 00:00:00     3
2015-08-01 03:00:00     0
2015-08-01 06:00:00     9
2015-08-01 09:00:00    41
2015-08-01 12:00:00    39
Freq: 3H, Name: one, dtype: int64
plt.figure(figsize=(10, 3))
xticks = pd.date_range(
start=citibike.index.min(),
end=citibike.index.max(),
freq='D'
)
print(xticks)
DatetimeIndex(['2015-08-01', '2015-08-02', '2015-08-03', '2015-08-04',
               '2015-08-05', '2015-08-06', '2015-08-07', '2015-08-08',
               '2015-08-09', '2015-08-10', '2015-08-11', '2015-08-12',
               '2015-08-13', '2015-08-14', '2015-08-15', '2015-08-16',
               '2015-08-17', '2015-08-18', '2015-08-19', '2015-08-20',
               '2015-08-21', '2015-08-22', '2015-08-23', '2015-08-24',
               '2015-08-25', '2015-08-26', '2015-08-27', '2015-08-28',
               '2015-08-29', '2015-08-30', '2015-08-31'],
              dtype='datetime64[ns]', freq='D')
week = ["Sun", "Mon", "Tue", "Wed", "Thr", "Fri", "Sat"]
xtick_name = [week[int(w)] + d for w, d in zip(xticks.strftime("%w"), xticks.strftime(" %m-%d"))]
plt.xticks(xticks, xtick_name, rotation=90, ha="left")
plt.plot(citibike, linewidth=1)
plt.xlabel("Date")
plt.ylabel("Rentals")
[Figure: Citi Bike rentals per 3-hour interval over August 2015]
# extract the target values (number of rentals)
y = citibike.values
# convert to POSIX time by dividing by 10**9
X = citibike.index.astype("int64").values.reshape(-1, 1) // 10**9
print(X[0], y[0])
print(X[1], y[1])
print(X[2], y[2])
print()
print(X.shape)
print(y.shape)
[1438387200] 3
[1438398000] 0
[1438408800] 9

(248, 1)
(248,)
# use the first 184 data points for training, the rest for testing
n_train = 184
# function to evaluate and plot a regressor on a given feature set
def eval_on_features(features, target, regressor):
    # split the given features into a training and a test set
    X_train, X_test = features[:n_train], features[n_train:]
    # also split the target array
    y_train, y_test = target[:n_train], target[n_train:]
    regressor.fit(X_train, y_train)
    print("Test-set R^2: {:.2f}".format(regressor.score(X_test, y_test)))
    y_pred = regressor.predict(X_test)
    y_pred_train = regressor.predict(X_train)
    plt.figure(figsize=(10, 3))
    # use len(features) rather than the global X so the ticks stay correct
    plt.xticks(range(0, len(features), 8), xticks.strftime("%a %m-%d"),
               rotation=90, ha="left")
    plt.plot(range(n_train), y_train, label="train")
    plt.plot(range(n_train, len(y_test) + n_train), y_test, '-', label="test")
    plt.plot(range(n_train), y_pred_train, '--', label="prediction train")
    plt.plot(range(n_train, len(y_test) + n_train), y_pred, '--',
             label="prediction test")
    plt.legend(loc=(1.01, 0))
    plt.xlabel("Date")
    plt.ylabel("Rentals")
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
eval_on_features(X, y, regressor)
Test-set R^2: -0.04
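The score is poor because every test timestamp lies after the training range, and tree-based models cannot extrapolate: all test points end up in the same leaves. A quick check (X, y, n_train and the fitted regressor from the cells above) should therefore show a single constant prediction across the whole test period:

print("distinct test-set predictions:",
      np.unique(regressor.predict(X[n_train:])))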
X_hour = citibike.index.hour.values.reshape(-1, 1)
for i in range(len(X_hour)):
print(X_hour[i], end=", ")
print()
print()
print(X_hour.shape)
print()
eval_on_features(X_hour, y, regressor)
[0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21], [0], [3], [6], [9], [12], [15], [18], [21],

(248, 1)

Test-set R^2: 0.60
In the result above, the within-day (time-of-day) pattern is predicted fairly accurately, but the weekly pattern is missed.
Remedy: add the day of the week as a second feature.
a = citibike.index.dayofweek.values.reshape(-1, 1)
b = citibike.index.hour.values.reshape(-1, 1)
print(a.shape)
print(b.shape)
print()
X_hour_week = np.hstack([citibike.index.dayofweek.values.reshape(-1, 1),
citibike.index.hour.values.reshape(-1, 1)])
for i in range(len(X_hour_week)):
print(X_hour_week[i], end=", ")
print()
print()
print(X_hour_week.shape)
print()
eval_on_features(X_hour_week, y, regressor)
(248, 1)
(248, 1)

[5 0], [5 3], [5 6], [5 9], [ 5 12], [ 5 15], [ 5 18], [ 5 21], [6 0], [6 3], [6 6], [6 9], [ 6 12], [ 6 15], [ 6 18], [ 6 21], [0 0], [0 3], [0 6], [0 9], [ 0 12], [ 0 15], [ 0 18], [ 0 21], [1 0], [1 3], [1 6], [1 9], [ 1 12], [ 1 15], [ 1 18], [ 1 21], [2 0], [2 3], [2 6], [2 9], [ 2 12], [ 2 15], [ 2 18], [ 2 21], [3 0], [3 3], [3 6], [3 9], [ 3 12], [ 3 15], [ 3 18], [ 3 21], [4 0], [4 3], [4 6], [4 9], [ 4 12], [ 4 15], [ 4 18], [ 4 21], [5 0], [5 3], [5 6], [5 9], [ 5 12], [ 5 15], [ 5 18], [ 5 21], [6 0], [6 3], [6 6], [6 9], [ 6 12], [ 6 15], [ 6 18], [ 6 21], [0 0], [0 3], [0 6], [0 9], [ 0 12], [ 0 15], [ 0 18], [ 0 21], [1 0], [1 3], [1 6], [1 9], [ 1 12], [ 1 15], [ 1 18], [ 1 21], [2 0], [2 3], [2 6], [2 9], [ 2 12], [ 2 15], [ 2 18], [ 2 21], [3 0], [3 3], [3 6], [3 9], [ 3 12], [ 3 15], [ 3 18], [ 3 21], [4 0], [4 3], [4 6], [4 9], [ 4 12], [ 4 15], [ 4 18], [ 4 21], [5 0], [5 3], [5 6], [5 9], [ 5 12], [ 5 15], [ 5 18], [ 5 21], [6 0], [6 3], [6 6], [6 9], [ 6 12], [ 6 15], [ 6 18], [ 6 21], [0 0], [0 3], [0 6], [0 9], [ 0 12], [ 0 15], [ 0 18], [ 0 21], [1 0], [1 3], [1 6], [1 9], [ 1 12], [ 1 15], [ 1 18], [ 1 21], [2 0], [2 3], [2 6], [2 9], [ 2 12], [ 2 15], [ 2 18], [ 2 21], [3 0], [3 3], [3 6], [3 9], [ 3 12], [ 3 15], [ 3 18], [ 3 21], [4 0], [4 3], [4 6], [4 9], [ 4 12], [ 4 15], [ 4 18], [ 4 21], [5 0], [5 3], [5 6], [5 9], [ 5 12], [ 5 15], [ 5 18], [ 5 21], [6 0], [6 3], [6 6], [6 9], [ 6 12], [ 6 15], [ 6 18], [ 6 21], [0 0], [0 3], [0 6], [0 9], [ 0 12], [ 0 15], [ 0 18], [ 0 21], [1 0], [1 3], [1 6], [1 9], [ 1 12], [ 1 15], [ 1 18], [ 1 21], [2 0], [2 3], [2 6], [2 9], [ 2 12], [ 2 15], [ 2 18], [ 2 21], [3 0], [3 3], [3 6], [3 9], [ 3 12], [ 3 15], [ 3 18], [ 3 21], [4 0], [4 3], [4 6], [4 9], [ 4 12], [ 4 15], [ 4 18], [ 4 21], [5 0], [5 3], [5 6], [5 9], [ 5 12], [ 5 15], [ 5 18], [ 5 21], [6 0], [6 3], [6 6], [6 9], [ 6 12], [ 6 15], [ 6 18], [ 6 21], [0 0], [0 3], [0 6], [0 9], [ 0 12], [ 0 15], [ 0 18], [ 0 21],

(248, 2)

Test-set R^2: 0.84
from sklearn.linear_model import LinearRegression
eval_on_features(X_hour_week, y, LinearRegression())
Test-set R^2: 0.13
enc = OneHotEncoder()
X_hour_week_onehot = enc.fit_transform(X_hour_week).toarray()
print(X_hour_week_onehot[0])
print(X_hour_week_onehot[1])
print(X_hour_week_onehot[2])
print()
print(X_hour_week_onehot.shape)
print()
[0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0.]

(248, 15)
eval_on_features(X_hour_week_onehot, y, Ridge())
Test-set R^2: 0.62
poly_transformer = PolynomialFeatures(degree=2, interaction_only=True,
include_bias=False)
X_hour_week_onehot_poly = poly_transformer.fit_transform(X_hour_week_onehot)
lr = Ridge()
eval_on_features(X_hour_week_onehot_poly, y, lr)
Test-set R^2: 0.85
hour = ["%02d:00" % i for i in range(0, 24, 3)]
day = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
features = day + hour
features_poly = poly_transformer.get_feature_names(features)
features_nonzero = np.array(features_poly)[lr.coef_ != 0]
coef_nonzero = lr.coef_[lr.coef_ != 0]
plt.figure(figsize=(15, 2))
plt.plot(coef_nonzero, 'o')
plt.xticks(np.arange(len(coef_nonzero)), features_nonzero, rotation=90)
plt.xlabel("Feature name")
plt.ylabel("Feature magnitude")
[Figure: nonzero coefficients of the Ridge model on the day-hour interaction features]