#!/usr/bin/env python
# coding: utf-8

# # Multiple Regression

# Let's grab a data set of car values:

# In[2]:


import pandas as pd

df = pd.read_excel('https://admintuts.tech/wp-content/downloads/xls/cars.xls')
df.head()


# In[5]:


get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np

# Bucket mileage into 10,000-mile bins and plot the mean price in each bin:
df1 = df[['Mileage', 'Price']]
bins = np.arange(0, 50000, 10000)
groups = df1.groupby(pd.cut(df1['Mileage'], bins)).mean()
print(groups.head())
groups['Price'].plot.line()


# We can use pandas to split this matrix up into the feature vectors we're interested in, and the value we're trying to predict.
#
# Note how we are avoiding make and model; regressions don't work well with categorical values, unless you can convert them into some numerical order that makes sense somehow.
#
# Let's scale our feature data into the same range so we can easily compare the coefficients we end up with.

# In[8]:


import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

scale = StandardScaler()
X = df[['Mileage', 'Cylinder', 'Doors']].copy()  # copy so we don't mutate df
y = df['Price']

# Standardize each feature to zero mean and unit variance so the coefficients are comparable:
X[['Mileage', 'Cylinder', 'Doors']] = scale.fit_transform(X[['Mileage', 'Cylinder', 'Doors']])
print(X)

# add_constant gives the model an intercept term (B0):
est = sm.OLS(y, sm.add_constant(X)).fit()
est.summary()


# The table of coefficients above gives us the values to plug into an equation of the form:
#
# Price = B0 + B1 * Mileage + B2 * Cylinder + B3 * Doors
#
# Because the features are standardized to the same scale, we can compare the coefficients directly, and it's pretty clear that the number of cylinders matters far more than mileage or doors.
#
# Could we have figured that out earlier?

# In[4]:


y.groupby(df.Doors).mean()


# Surprisingly, more doors does not mean a higher price! (Maybe fewer doors implies a sports car in some cases?) So it's not surprising that it's a pretty useless predictor here. This is a very small data set, however, so we can't read too much meaning into it.

# In[29]:


# Predict the price of a 4-door car with 20,000 miles and an 8-cylinder engine.
# The input must be scaled with the same scaler we fit on the training data:
scaled = scale.transform([[20000, 8, 4]])
print(scaled)

# Prepend a 1 for the constant term, matching the columns of sm.add_constant(X):
predicted = est.predict(np.insert(scaled[0], 0, 1.0))
print(predicted)
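# As a quick sanity check, we can evaluate the regression equation from the coefficient table by
# hand: est.params holds B0 (the 'const' row) followed by one coefficient per feature, so the
# prediction is just a dot product with the scaled, constant-prepended input. A minimal sketch,
# reusing the variables from the cells above:

# In[ ]:


# B0 * 1 + B1 * Mileage + B2 * Cylinder + B3 * Doors, evaluated directly:
features = np.insert(scale.transform([[20000, 8, 4]])[0], 0, 1.0)
manual = float(est.params @ features)
print(manual)  # should match est.predict() above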
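# We asked above whether we could have spotted the importance of cylinders earlier. The same
# quick groupby we used for doors works for any column; a sketch using the Cylinder column
# from the cells above:

# In[ ]:


y.groupby(df.Cylinder).mean()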
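# Earlier we skipped make and model because they're categorical. If we did want to fold a
# categorical column into the regression, one common approach is one-hot encoding with
# pd.get_dummies. This is a minimal sketch, assuming the spreadsheet has a 'Make' column
# (the exact column name is an assumption, since it isn't used anywhere above):

# In[ ]:


# One 0/1 indicator column per make; drop_first drops one level to avoid collinearity
# with the constant term, and astype(float) keeps statsmodels happy with newer pandas.
make_dummies = pd.get_dummies(df['Make'], prefix='Make', drop_first=True).astype(float)
X2 = pd.concat([X, make_dummies], axis=1)
est2 = sm.OLS(y, sm.add_constant(X2)).fit()
est2.summary()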