import matplotlib.pyplot as plt import numpy as np from sklearn import datasets, linear_model import pandas as pd from pandas import DataFrame, Series from __future__ import division import seaborn as sns from sklearn.cross_validation import train_test_split sns.set(style='ticks', palette='Set2') %matplotlib inline data = pd.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data-original", delim_whitespace = True, header=None, names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model', 'origin', 'car_name']) print(data.shape) data = data.dropna() data.head() indep_vars = ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration'] dep_vars = ['mpg'] indep_data = data[indep_vars] dep_data = data[dep_vars] indep_train, indep_test, dep_train, dep_test = train_test_split(indep_data, dep_data, test_size=0.33, random_state=42) regr = linear_model.LinearRegression() regr.fit(indep_train, dep_train) print('Coefficients: {0}'.format(zip(indep_vars,np.squeeze(regr.coef_)))) regr_predict = regr.predict(indep_test) print("Residual sum of squares: %.2f" % np.mean((regr_predict - dep_test) ** 2)) data.groupby(['cylinders']).mpg.describe() pivot_table = data.pivot_table(index='cylinders', columns='acceleration', values='mpg', aggfunc=np.mean) pivot_table.head() p = plt.hist(data.mpg) plt.title("MPG") p sns.despine() sns.lmplot("mpg", "weight", data); sns.lmplot("mpg", "weight", data, order=2); sns.jointplot("mpg", "weight", data, kind="reg") sns.boxplot(data[['displacement', 'horsepower']]) sns.violinplot(data[['displacement', 'horsepower']]) g = sns.FacetGrid(data, col="cylinders") g.map(plt.hist, "mpg");