import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize,
poly)
# Q1
# (a) Load the Auto data and classify each predictor as quantitative or qualitative.
Auto = load_data('Auto')
Auto = Auto.dropna()
# Peek at the first rows to inspect the columns
Auto.head()
# Answer: Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year.
# Qualitative: origin
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | |
---|---|---|---|---|---|---|---|---|
name | ||||||||
chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
# (b) Range of each quantitative predictor, computed with min() and max() per column.
quantitative_columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
# Each column maps to a (min, max) pair; apply expands the pairs into two rows (0 = min, 1 = max).
ranges = Auto[quantitative_columns].apply(lambda col: (col.min(), col.max()))
# Display the ranges
print(ranges)
mpg cylinders displacement horsepower weight acceleration year 0 9.0 3 68.0 46 1613 8.0 70 1 46.6 8 455.0 230 5140 24.8 82
# (c) Mean and standard deviation of each quantitative predictor.
quant_data = Auto[quantitative_columns]
means = quant_data.mean()
std_devs = quant_data.std()
print(means)
print(std_devs)
mpg 23.445918 cylinders 5.471939 displacement 194.411990 horsepower 104.469388 weight 2977.584184 acceleration 15.541327 year 75.979592 dtype: float64 mpg 7.805007 cylinders 1.705783 displacement 104.644004 horsepower 38.491160 weight 849.402560 acceleration 2.758864 year 3.683737 dtype: float64
# (d) Remove the 10th through 85th observations (zero-based positions 9..84)
# and recompute range, mean, and standard deviation on what remains.
Auto_subset = Auto.drop(Auto.index[9:85])
subset_quant = Auto_subset[quantitative_columns]
means_subset = subset_quant.mean()
std_devs_subset = subset_quant.std()
ranges_subset = subset_quant.apply(lambda col: (col.min(), col.max()))
means_subset  # notebook display cell: means of the remaining observations
mpg 25.041637 cylinders 5.274021 displacement 179.373665 horsepower 98.715302 weight 2881.505338 acceleration 15.738790 year 77.508897 dtype: float64
std_devs_subset  # notebook display cell: standard deviations of the remaining observations
mpg 7.912874 cylinders 1.632155 displacement 95.512897 horsepower 33.822711 weight 792.548494 acceleration 2.570191 year 2.989403 dtype: float64
ranges_subset  # notebook display cell: (min, max) ranges of the remaining observations
mpg | cylinders | displacement | horsepower | weight | acceleration | year | |
---|---|---|---|---|---|---|---|
0 | 11.0 | 3 | 68.0 | 46 | 1755 | 9.5 | 70 |
1 | 46.6 | 8 | 455.0 | 230 | 4952 | 24.6 | 82 |
# (e) Investigate the predictors graphically: one scatterplot of each quantitative
# column against mpg, followed by a full pairwise scatter matrix.
fig, axs = subplots(3, 3, figsize=(15, 15))
flat_axes = axs.flatten()
for i, column in enumerate(quantitative_columns):
    ax = flat_axes[i]
    ax.scatter(Auto[column], Auto['mpg'])
    ax.set_xlabel(column)
    ax.set_ylabel('mpg')
# Only 7 of the 9 axes are used; hide the empty ones so the figure has no
# blank panels with stray ticks.
# NOTE(review): the first panel plots mpg against itself (a trivial 45-degree
# line) because 'mpg' is the first entry of quantitative_columns — consider
# excluding the response from the panel list.
for ax in flat_axes[len(quantitative_columns):]:
    ax.set_visible(False)
# Pairwise scatter matrix of all quantitative predictors
pd.plotting.scatter_matrix(Auto[quantitative_columns], figsize=(15, 15))
array([[<Axes: xlabel='mpg', ylabel='mpg'>, <Axes: xlabel='cylinders', ylabel='mpg'>, <Axes: xlabel='displacement', ylabel='mpg'>, <Axes: xlabel='horsepower', ylabel='mpg'>, <Axes: xlabel='weight', ylabel='mpg'>, <Axes: xlabel='acceleration', ylabel='mpg'>, <Axes: xlabel='year', ylabel='mpg'>], [<Axes: xlabel='mpg', ylabel='cylinders'>, <Axes: xlabel='cylinders', ylabel='cylinders'>, <Axes: xlabel='displacement', ylabel='cylinders'>, <Axes: xlabel='horsepower', ylabel='cylinders'>, <Axes: xlabel='weight', ylabel='cylinders'>, <Axes: xlabel='acceleration', ylabel='cylinders'>, <Axes: xlabel='year', ylabel='cylinders'>], [<Axes: xlabel='mpg', ylabel='displacement'>, <Axes: xlabel='cylinders', ylabel='displacement'>, <Axes: xlabel='displacement', ylabel='displacement'>, <Axes: xlabel='horsepower', ylabel='displacement'>, <Axes: xlabel='weight', ylabel='displacement'>, <Axes: xlabel='acceleration', ylabel='displacement'>, <Axes: xlabel='year', ylabel='displacement'>], [<Axes: xlabel='mpg', ylabel='horsepower'>, <Axes: xlabel='cylinders', ylabel='horsepower'>, <Axes: xlabel='displacement', ylabel='horsepower'>, <Axes: xlabel='horsepower', ylabel='horsepower'>, <Axes: xlabel='weight', ylabel='horsepower'>, <Axes: xlabel='acceleration', ylabel='horsepower'>, <Axes: xlabel='year', ylabel='horsepower'>], [<Axes: xlabel='mpg', ylabel='weight'>, <Axes: xlabel='cylinders', ylabel='weight'>, <Axes: xlabel='displacement', ylabel='weight'>, <Axes: xlabel='horsepower', ylabel='weight'>, <Axes: xlabel='weight', ylabel='weight'>, <Axes: xlabel='acceleration', ylabel='weight'>, <Axes: xlabel='year', ylabel='weight'>], [<Axes: xlabel='mpg', ylabel='acceleration'>, <Axes: xlabel='cylinders', ylabel='acceleration'>, <Axes: xlabel='displacement', ylabel='acceleration'>, <Axes: xlabel='horsepower', ylabel='acceleration'>, <Axes: xlabel='weight', ylabel='acceleration'>, <Axes: xlabel='acceleration', ylabel='acceleration'>, <Axes: xlabel='year', ylabel='acceleration'>], [<Axes: 
xlabel='mpg', ylabel='year'>, <Axes: xlabel='cylinders', ylabel='year'>, <Axes: xlabel='displacement', ylabel='year'>, <Axes: xlabel='horsepower', ylabel='year'>, <Axes: xlabel='weight', ylabel='year'>, <Axes: xlabel='acceleration', ylabel='year'>, <Axes: xlabel='year', ylabel='year'>]], dtype=object)
To name a few: there appears to be a strong negative relationship between mpg and each of displacement, horsepower, and weight. There also seems to be a positive linear relationship between weight and displacement, and between weight and horsepower.
(f) Based on our findings, we can conclude that displacement, horsepower, and weight are the most important predictors of mpg, because the scatterplots indicate a strong (negative) relationship between each of these predictors and mpg.
# Q3
# Load the simulated data set for the nearest-neighbors exercise.
data = pd.read_csv('KNN.csv')
0 1.170 1 1.880 2 0.343 3 2.110 4 1.650 ... 95 0.647 96 1.780 97 2.330 98 1.320 99 1.120 Name: Y, Length: 100, dtype: float64
# Estimate the regression function by nearest-neighbors averaging: for each
# evaluation point, average the responses of all training points whose x-value
# lies within tau of it.
def nearest_neighbors_avg(x_data, y_data, tau, x_vals):
    """Estimate E[Y | X = x] by averaging y over the tau-neighborhood of x.

    Parameters
    ----------
    x_data, y_data : array-like
        Training inputs and responses, aligned one-to-one. Coerced with
        np.asarray so plain lists, numpy arrays, and pandas Series all work.
    tau : float
        Neighborhood half-width: points with |x - x_val| <= tau are averaged.
    x_vals : iterable of float
        Points at which to evaluate the estimate.

    Returns
    -------
    numpy.ndarray
        Estimated values; np.nan wherever the neighborhood is empty.
    """
    x_arr = np.asarray(x_data)
    y_arr = np.asarray(y_data)
    y_estimated = []
    for x_val in x_vals:
        neighbors = y_arr[np.abs(x_arr - x_val) <= tau]
        # An empty neighborhood leaves the estimate undefined -> NaN.
        # (Original bound an unused `len_neighbors` via walrus; dropped.)
        if neighbors.size:
            y_estimated.append(neighbors.mean())
        else:
            y_estimated.append(np.nan)
    return np.array(y_estimated)
# Bandwidth (tau) values to compare
taus = [0.1, 0.3, 0.8, 2, 10]
# Dense grid of evaluation points spanning the observed X range
x_vals = np.linspace(data.X.min(), data.X.max(), 300)
# Raw observations
# NOTE(review): the response column key ' Y' carries a leading space —
# presumably it matches the CSV header exactly; confirm against KNN.csv.
plt.scatter(data['X'], data[' Y'], color='black', label='data', alpha=0.5)
# Overlay one fitted curve per bandwidth
for bandwidth in taus:
    fitted = nearest_neighbors_avg(data['X'], data[' Y'], bandwidth, x_vals)
    plt.plot(x_vals, fitted, label=f'tau = {bandwidth}')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Nearest Neighbors Averaging with Varying Tau Values')
plt.legend()
plt.show()
When the neighborhood is too small, the model will overfit and the fit will be very "unsmooth": it will just attempt to fit every single point in the training set. When the neighborhood is too large, the model will underfit: it will just tend toward the average of the training set.