import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize,
poly)
# Q1
# (a) Load the Auto data and classify each predictor as quantitative or qualitative.
Auto = load_data('Auto')
Auto = Auto.dropna()
# Peek at the first rows to inspect the columns
Auto.head()
# Answer: Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year.
# Qualitative: origin
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | |
---|---|---|---|---|---|---|---|---|
name | ||||||||
chevrolet chevelle malibu | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 |
buick skylark 320 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 |
plymouth satellite | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 |
amc rebel sst | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 |
ford torino | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 |
# (b) Range of each quantitative predictor, computed with min() and max() per column.
quantitative_columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year']
# Each column maps to a (min, max) pair; apply expands the pairs into two rows (0 = min, 1 = max).
ranges = Auto[quantitative_columns].apply(lambda col: (col.min(), col.max()))
# Display the ranges
print(ranges)
mpg cylinders displacement horsepower weight acceleration year 0 9.0 3 68.0 46 1613 8.0 70 1 46.6 8 455.0 230 5140 24.8 82
# (c) Mean and standard deviation of each quantitative predictor.
quant_data = Auto[quantitative_columns]
means = quant_data.mean()
std_devs = quant_data.std()
print(means)
print(std_devs)
mpg 23.445918 cylinders 5.471939 displacement 194.411990 horsepower 104.469388 weight 2977.584184 acceleration 15.541327 year 75.979592 dtype: float64 mpg 7.805007 cylinders 1.705783 displacement 104.644004 horsepower 38.491160 weight 849.402560 acceleration 2.758864 year 3.683737 dtype: float64
# (d) Remove the 10th through 85th observations (zero-based positions 9..84)
# and recompute range, mean, and standard deviation on what remains.
Auto_subset = Auto.drop(Auto.index[9:85])
subset_quant = Auto_subset[quantitative_columns]
means_subset = subset_quant.mean()
std_devs_subset = subset_quant.std()
ranges_subset = subset_quant.apply(lambda col: (col.min(), col.max()))
means_subset  # notebook display cell: means of the remaining observations
mpg 25.041637 cylinders 5.274021 displacement 179.373665 horsepower 98.715302 weight 2881.505338 acceleration 15.738790 year 77.508897 dtype: float64
std_devs_subset  # notebook display cell: standard deviations of the remaining observations
mpg 7.912874 cylinders 1.632155 displacement 95.512897 horsepower 33.822711 weight 792.548494 acceleration 2.570191 year 2.989403 dtype: float64
ranges_subset  # notebook display cell: (min, max) ranges of the remaining observations
mpg | cylinders | displacement | horsepower | weight | acceleration | year | |
---|---|---|---|---|---|---|---|
0 | 11.0 | 3 | 68.0 | 46 | 1755 | 9.5 | 70 |
1 | 46.6 | 8 | 455.0 | 230 | 4952 | 24.6 | 82 |
# (e) Investigate the predictors graphically: one scatterplot of each quantitative
# column against mpg, followed by a full pairwise scatter matrix.
fig, axs = subplots(3, 3, figsize=(15, 15))
flat_axes = axs.flatten()
for i, column in enumerate(quantitative_columns):
    ax = flat_axes[i]
    ax.scatter(Auto[column], Auto['mpg'])
    ax.set_xlabel(column)
    ax.set_ylabel('mpg')
# Only 7 of the 9 axes are used; hide the empty ones so the figure has no
# blank panels with stray ticks.
# NOTE(review): the first panel plots mpg against itself (a trivial 45-degree
# line) because 'mpg' is the first entry of quantitative_columns — consider
# excluding the response from the panel list.
for ax in flat_axes[len(quantitative_columns):]:
    ax.set_visible(False)
# Pairwise scatter matrix of all quantitative predictors
pd.plotting.scatter_matrix(Auto[quantitative_columns], figsize=(15, 15))
array([[<Axes: xlabel='mpg', ylabel='mpg'>, <Axes: xlabel='cylinders', ylabel='mpg'>, <Axes: xlabel='displacement', ylabel='mpg'>, <Axes: xlabel='horsepower', ylabel='mpg'>, <Axes: xlabel='weight', ylabel='mpg'>, <Axes: xlabel='acceleration', ylabel='mpg'>, <Axes: xlabel='year', ylabel='mpg'>], [<Axes: xlabel='mpg', ylabel='cylinders'>, <Axes: xlabel='cylinders', ylabel='cylinders'>, <Axes: xlabel='displacement', ylabel='cylinders'>, <Axes: xlabel='horsepower', ylabel='cylinders'>, <Axes: xlabel='weight', ylabel='cylinders'>, <Axes: xlabel='acceleration', ylabel='cylinders'>, <Axes: xlabel='year', ylabel='cylinders'>], [<Axes: xlabel='mpg', ylabel='displacement'>, <Axes: xlabel='cylinders', ylabel='displacement'>, <Axes: xlabel='displacement', ylabel='displacement'>, <Axes: xlabel='horsepower', ylabel='displacement'>, <Axes: xlabel='weight', ylabel='displacement'>, <Axes: xlabel='acceleration', ylabel='displacement'>, <Axes: xlabel='year', ylabel='displacement'>], [<Axes: xlabel='mpg', ylabel='horsepower'>, <Axes: xlabel='cylinders', ylabel='horsepower'>, <Axes: xlabel='displacement', ylabel='horsepower'>, <Axes: xlabel='horsepower', ylabel='horsepower'>, <Axes: xlabel='weight', ylabel='horsepower'>, <Axes: xlabel='acceleration', ylabel='horsepower'>, <Axes: xlabel='year', ylabel='horsepower'>], [<Axes: xlabel='mpg', ylabel='weight'>, <Axes: xlabel='cylinders', ylabel='weight'>, <Axes: xlabel='displacement', ylabel='weight'>, <Axes: xlabel='horsepower', ylabel='weight'>, <Axes: xlabel='weight', ylabel='weight'>, <Axes: xlabel='acceleration', ylabel='weight'>, <Axes: xlabel='year', ylabel='weight'>], [<Axes: xlabel='mpg', ylabel='acceleration'>, <Axes: xlabel='cylinders', ylabel='acceleration'>, <Axes: xlabel='displacement', ylabel='acceleration'>, <Axes: xlabel='horsepower', ylabel='acceleration'>, <Axes: xlabel='weight', ylabel='acceleration'>, <Axes: xlabel='acceleration', ylabel='acceleration'>, <Axes: xlabel='year', ylabel='acceleration'>], [<Axes: 
xlabel='mpg', ylabel='year'>, <Axes: xlabel='cylinders', ylabel='year'>, <Axes: xlabel='displacement', ylabel='year'>, <Axes: xlabel='horsepower', ylabel='year'>, <Axes: xlabel='weight', ylabel='year'>, <Axes: xlabel='acceleration', ylabel='year'>, <Axes: xlabel='year', ylabel='year'>]], dtype=object)
To name a few: there appears to be a strong negative relationship between mpg and each of displacement, horsepower, and weight. There also seems to be a positive linear relationship between weight and displacement, and between weight and horsepower.
(f) Based on our findings, we can conclude that displacement, horsepower, and weight are the most important predictors of mpg, because the scatterplots indicate a strong (negative) relationship between each of these predictors and mpg.
# Q3
# Load the simulated data set for the nearest-neighbors exercise.
data = pd.read_csv('KNN.csv')
0 1.170 1 1.880 2 0.343 3 2.110 4 1.650 ... 95 0.647 96 1.780 97 2.330 98 1.320 99 1.120 Name: Y, Length: 100, dtype: float64
# Estimate the regression function by nearest-neighbors averaging: for each
# evaluation point, average the responses of all training points whose x-value
# lies within tau of it.
def nearest_neighbors_avg(x_data, y_data, tau, x_vals):
    """Estimate E[Y | X = x] by averaging y over the tau-neighborhood of x.

    Parameters
    ----------
    x_data, y_data : array-like
        Training inputs and responses, aligned one-to-one. Coerced with
        np.asarray so plain lists, numpy arrays, and pandas Series all work.
    tau : float
        Neighborhood half-width: points with |x - x_val| <= tau are averaged.
    x_vals : iterable of float
        Points at which to evaluate the estimate.

    Returns
    -------
    numpy.ndarray
        Estimated values; np.nan wherever the neighborhood is empty.
    """
    x_arr = np.asarray(x_data)
    y_arr = np.asarray(y_data)
    y_estimated = []
    for x_val in x_vals:
        neighbors = y_arr[np.abs(x_arr - x_val) <= tau]
        # An empty neighborhood leaves the estimate undefined -> NaN.
        # (Original bound an unused `len_neighbors` via walrus; dropped.)
        if neighbors.size:
            y_estimated.append(neighbors.mean())
        else:
            y_estimated.append(np.nan)
    return np.array(y_estimated)
# Bandwidth (tau) values to compare
taus = [0.1, 0.3, 0.8, 2, 10]
# Dense grid of evaluation points spanning the observed X range
x_vals = np.linspace(data.X.min(), data.X.max(), 300)
# Raw observations
# NOTE(review): the response column key ' Y' carries a leading space —
# presumably it matches the CSV header exactly; confirm against KNN.csv.
plt.scatter(data['X'], data[' Y'], color='black', label='data', alpha=0.5)
# Overlay one fitted curve per bandwidth
for bandwidth in taus:
    fitted = nearest_neighbors_avg(data['X'], data[' Y'], bandwidth, x_vals)
    plt.plot(x_vals, fitted, label=f'tau = {bandwidth}')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Nearest Neighbors Averaging with Varying Tau Values')
plt.legend()
plt.show()
When the neighborhood is too small, the model will overfit and the fit will be very "unsmooth": it will just attempt to fit every single point in the training set. When the neighborhood is too large, the model will underfit: it will just tend toward the average of the training set.