Imports¶

In [1]:

import pandas as pd
import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
import os
import matplotlib.pyplot as plt
%matplotlib inline

print sklearn.__version__
print pd.__version__

0.19.1
0.20.3

/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:

# Directory holding the input data, relative to the current working directory.
DATA_DIR = '../data'

# The file is comma-separated, so read_csv (default sep=',') is the direct
# call; resolve the path once so it can be inspected if loading fails.
automobile_csv = os.path.abspath(os.path.join(DATA_DIR, 'day2/automobile.csv'))
df = pd.read_csv(automobile_csv)

# Show the first five rows as a quick sanity check of the parse.
df.head(5)

Out[2]:

	symboling	normalized-losses	make	fuel-type	aspiration	num-of-doors	body-style	drive-wheels	engine-location	wheel-base	...	engine-size	fuel-system	bore	stroke	compression-ratio	horsepower	peak-rpm	city-mpg	highway-mpg	price
0	2	164	audi	gas	std	four	sedan	fwd	front	99.8	...	109	mpfi	3.19	3.4	10.0	102	5500	24	30	13950
1	2	164	audi	gas	std	four	sedan	4wd	front	99.4	...	136	mpfi	3.19	3.4	8.0	115	5500	18	22	17450
2	1	158	audi	gas	std	four	sedan	fwd	front	105.8	...	136	mpfi	3.19	3.4	8.5	110	5500	19	25	17710
3	1	158	audi	gas	turbo	four	sedan	fwd	front	105.8	...	131	mpfi	3.13	3.4	8.3	140	5500	17	20	23875
4	2	192	bmw	gas	std	two	sedan	rwd	front	101.2	...	108	mpfi	3.50	2.8	8.8	101	5800	23	29	16430

5 rows × 26 columns

In [3]:

# (rows, columns) of the loaded frame -- the output below shows 159 rows, 26 cols
df.shape

Out[3]:

(159, 26)

In [4]:

# per-column dtypes: tells the numeric (int64/float64) columns apart from the object ones
df.dtypes

Out[4]:

symboling              int64
normalized-losses      int64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower             int64
peak-rpm               int64
city-mpg               int64
highway-mpg            int64
price                  int64
dtype: object

Experiment 1¶

For the first experiment, we will use only the numerical columns as features for prediction.

So, to summarize

Input: Numerical Values
Output: Price

In [12]:

# Keep only the integer and float columns as candidate features.
numerics_dtypes = ['int64', 'float64']

# Build the working frame with .assign instead of writing through .loc into
# the result of select_dtypes: .assign returns a new frame, so nothing is set
# on a possible view of `df` (avoids SettingWithCopyWarning). A `price` column
# that is already present is overwritten; otherwise it is added as a new column.
df_rel = df.select_dtypes(include=numerics_dtypes).assign(price=df.price)
df_rel.head(5)

Out[12]:

	symboling	normalized-losses	wheel-base	length	width	height	curb-weight	engine-size	bore	stroke	compression-ratio	horsepower	peak-rpm	city-mpg	highway-mpg	price
0	2	164	99.8	176.6	66.2	54.3	2337	109	3.19	3.4	10.0	102	5500	24	30	13950
1	2	164	99.4	176.6	66.4	54.3	2824	136	3.19	3.4	8.0	115	5500	18	22	17450
2	1	158	105.8	192.7	71.4	55.7	2844	136	3.19	3.4	8.5	110	5500	19	25	17710
3	1	158	105.8	192.7	71.4	55.9	3086	131	3.13	3.4	8.3	140	5500	17	20	23875
4	2	192	101.2	176.8	64.8	54.3	2395	108	3.50	2.8	8.8	101	5800	23	29	16430

In [13]:

# only 16 of the original 26 columns are numeric (this count includes the price column)
df_rel.shape

Out[13]:

(159, 16)

Make input and output¶

In [14]:

# Select the target by name rather than by position, so the feature/target
# split stays correct even if `price` is not the last column of df_rel.
X = df_rel.drop('price', axis=1).values
Y = df_rel['price'].values

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(X.shape)
print(Y.shape)

(159, 15)
(159,)

Train/Test Split¶

In [15]:

def data_split(X, Y, test_size=0.2, random_state=10):
    """Split features and targets into train and test partitions.

    A common rule of thumb is to hold out roughly 20%-30% of the samples for
    testing. ``random_state`` seeds the split so that the same partition is
    produced on every run (reproducibility).

    Parameters
    ----------
    X : features, one row per sample.
    Y : target values aligned with the rows of ``X``.
    test_size : forwarded to ``train_test_split``; defaults to 0.2.
    random_state : seed forwarded to ``train_test_split``; defaults to 10.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, Y_train, Y_test

# Split once up front; data_split holds out 20% of the rows for testing with a fixed seed.
X_train, X_test, Y_train, Y_test = data_split(X, Y)

In [16]:

# sanity-check the split sizes: the output below shows 127 training rows and 32 test rows
X_train.shape, X_test.shape

Out[16]:

((127, 15), (32, 15))

In [17]:

class Regression:
    """Small wrapper around a ``LinearRegression`` estimator."""

    def __init__(self):
        # Start from a fresh estimator created with its default settings.
        self.regressor = LinearRegression()

    def train(self, X_train, Y_train):
        """Fit the wrapped estimator on the training data.

        Returns whatever ``fit`` returns, so the caller can pass it on to
        ``predict``.
        """
        return self.regressor.fit(X_train, Y_train)

    def predict(self, model, X_test):
        """Return ``model.predict(X_test)``.

        The predictions come from the ``model`` argument, not from
        ``self.regressor``.
        """
        predictions = model.predict(X_test)
        return predictions

In [18]:

# Fit the linear model on the training split.
regress = Regression()
model = regress.train(X_train, Y_train)
# Predict on both splits so that train and test predictions are both available.
predictions_train = regress.predict(model, X_train)
predictions_test = regress.predict(model, X_test)