import os

import matplotlib.pyplot as plt
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

try:
    # sklearn >= 0.18 exposes train_test_split from model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for older releases that only ship the deprecated module
    # (scheduled for removal in sklearn 0.20).
    from sklearn.cross_validation import train_test_split
%matplotlib inline
# Record the library versions this notebook was run with.
# print(...) with a single argument prints the same text on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(sklearn.__version__)
print(pd.__version__)
0.19.1 0.20.3
/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
# Dataset directory; abspath() below resolves it against the current
# working directory.
DATA_DIR = '../data'

# The file is comma-separated, so read_csv's default separator applies.
df = pd.read_csv(os.path.abspath(os.path.join(DATA_DIR, 'day2/automobile.csv')))

# Preview the first five rows.
df.head(5)
symboling | normalized-losses | make | fuel-type | aspiration | num-of-doors | body-style | drive-wheels | engine-location | wheel-base | ... | engine-size | fuel-system | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 164 | audi | gas | std | four | sedan | fwd | front | 99.8 | ... | 109 | mpfi | 3.19 | 3.4 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
1 | 2 | 164 | audi | gas | std | four | sedan | 4wd | front | 99.4 | ... | 136 | mpfi | 3.19 | 3.4 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
2 | 1 | 158 | audi | gas | std | four | sedan | fwd | front | 105.8 | ... | 136 | mpfi | 3.19 | 3.4 | 8.5 | 110 | 5500 | 19 | 25 | 17710 |
3 | 1 | 158 | audi | gas | turbo | four | sedan | fwd | front | 105.8 | ... | 131 | mpfi | 3.13 | 3.4 | 8.3 | 140 | 5500 | 17 | 20 | 23875 |
4 | 2 | 192 | bmw | gas | std | two | sedan | rwd | front | 101.2 | ... | 108 | mpfi | 3.50 | 2.8 | 8.8 | 101 | 5800 | 23 | 29 | 16430 |
5 rows × 26 columns
# (rows, cols): this file has 159 rows and 26 columns
df.shape
(159, 26)
# datatypes
df.dtypes
symboling int64 normalized-losses int64 make object fuel-type object aspiration object num-of-doors object body-style object drive-wheels object engine-location object wheel-base float64 length float64 width float64 height float64 curb-weight int64 engine-type object num-of-cylinders object engine-size int64 fuel-system object bore float64 stroke float64 compression-ratio float64 horsepower int64 peak-rpm int64 city-mpg int64 highway-mpg int64 price int64 dtype: object
For the first experiment we will use only the numerical columns as features for prediction.
So, to summarize:
Input: Numerical Values
Output: Price
# Keep only the integer and float columns as candidate features.
numerics_dtypes = ['int64', 'float64']

# .copy() makes df_rel an independent frame, so the assignment below writes
# to df_rel itself rather than to something pandas may treat as a slice of
# df (the situation SettingWithCopyWarning warns about).
df_rel = df.select_dtypes(include=numerics_dtypes).copy()

# Make sure the target column is present in the working frame.
df_rel.loc[:, 'price'] = df.price

# Preview the first five rows.
df_rel.head(5)
symboling | normalized-losses | wheel-base | length | width | height | curb-weight | engine-size | bore | stroke | compression-ratio | horsepower | peak-rpm | city-mpg | highway-mpg | price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 164 | 99.8 | 176.6 | 66.2 | 54.3 | 2337 | 109 | 3.19 | 3.4 | 10.0 | 102 | 5500 | 24 | 30 | 13950 |
1 | 2 | 164 | 99.4 | 176.6 | 66.4 | 54.3 | 2824 | 136 | 3.19 | 3.4 | 8.0 | 115 | 5500 | 18 | 22 | 17450 |
2 | 1 | 158 | 105.8 | 192.7 | 71.4 | 55.7 | 2844 | 136 | 3.19 | 3.4 | 8.5 | 110 | 5500 | 19 | 25 | 17710 |
3 | 1 | 158 | 105.8 | 192.7 | 71.4 | 55.9 | 3086 | 131 | 3.13 | 3.4 | 8.3 | 140 | 5500 | 17 | 20 | 23875 |
4 | 2 | 192 | 101.2 | 176.8 | 64.8 | 54.3 | 2395 | 108 | 3.50 | 2.8 | 8.8 | 101 | 5800 | 23 | 29 | 16430 |
# we have only 16 columns of 26 that are numeric
df_rel.shape
(159, 16)
# Pick the target by name instead of by position, so the split into
# features and target stays correct even if 'price' is not the last column.
X = df_rel.drop('price', axis=1).values  # every column except the target
Y = df_rel['price'].values               # target: price

# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(X.shape)
print(Y.shape)
(159, 15) (159,)
# common practice is to hold out 20% - 30% of the full dataset as the test set
# (set by test_size in train_test_split())
# random_state fixes the seed of the shuffle so the split is reproducible;
# the shuffling itself is what guards against ordering bias in the data
def data_split(X, Y, test_size=0.2, random_state=10):
    """Split features and targets into train and test subsets.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    Y : array-like
        Target values, aligned row-for-row with ``X``.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. The default ``0.2`` is the value
        that used to be hard-coded here.
    random_state : int, optional
        Seed forwarded to ``train_test_split``. The default ``10`` is the
        value that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``, in that order.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, Y_train, Y_test
# Partition the full feature matrix and target vector into train/test
# subsets using data_split().
X_train, X_test, Y_train, Y_test = data_split(X, Y)
# Cell result: shapes of the train and test feature matrices.
X_train.shape, X_test.shape
((127, 15), (32, 15))
class Regression:
    """Small convenience wrapper around a ``LinearRegression`` estimator."""

    def __init__(self):
        # Each instance owns its own estimator object.
        self.regressor = LinearRegression()

    def train(self, X_train, Y_train):
        """Fit the wrapped estimator and return the result of ``fit``.

        X_train holds the training features and Y_train the matching
        targets. The returned object is whatever ``self.regressor.fit``
        returns, and is what ``predict`` expects as its ``model`` argument.
        """
        return self.regressor.fit(X_train, Y_train)

    def predict(self, model, X_test):
        """Return ``model.predict(X_test)``.

        Note that the prediction comes from the ``model`` passed in, not
        from ``self.regressor``.
        """
        fitted = model
        return fitted.predict(X_test)
# Fit the linear model on the training split only.
regress = Regression()
model = regress.train(X_train, Y_train)
# Predict on both splits; presumably these are compared afterwards to
# contrast training and held-out error (the evaluation is not shown here).
predictions_train = regress.predict(model, X_train)
predictions_test = regress.predict(model, X_test)