#!/usr/bin/env python
# coding: utf-8

# In[26]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots


# In[4]:


from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)


# Q1

# In[6]:


Auto = load_data('Auto').dropna()


# In[9]:


# (a) Which of the predictors are quantitative, and which are qualitative?

# View the first few rows of the data
Auto.head()

# Answer: Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year.
# Qualitative: origin


# In[13]:


# (b) What is the range of each quantitative predictor? You can answer this using the
# range() function.

# Calculate the range of each quantitative predictor
quantitative_columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
                        'weight', 'acceleration', 'year']
ranges = Auto[quantitative_columns].apply(lambda x: (x.min(), x.max()))

# Display the ranges
print(ranges)


# In[14]:


# (c) What is the mean and standard deviation of each quantitative predictor?

# Calculate the mean and standard deviation of each quantitative predictor
means = Auto[quantitative_columns].mean()
std_devs = Auto[quantitative_columns].std()
print(means)
print(std_devs)


# In[15]:


# (d) Now remove the 10th through 85th observations. What is the range, mean, and standard
# deviation of each predictor in the subset of the data that remains?

# Positions 9 through 84 correspond to the 10th through 85th observations
Auto_subset = Auto.drop(Auto.index[9:85])
means_subset = Auto_subset[quantitative_columns].mean()
std_devs_subset = Auto_subset[quantitative_columns].std()
ranges_subset = Auto_subset[quantitative_columns].apply(lambda x: (x.min(), x.max()))


# In[16]:


means_subset


# In[17]:


std_devs_subset


# In[18]:


ranges_subset


# In[21]:


# (e) Using the full data set, investigate the predictors graphically, using scatterplots
# or other tools of your choice. Create some plots highlighting the relationships among
# the predictors.

# Scatter each quantitative predictor against mpg (mpg itself is excluded, since
# plotting it against itself is uninformative)
predictors = [col for col in quantitative_columns if col != 'mpg']
fig, axs = subplots(2, 3, figsize=(15, 10))
for i, column in enumerate(predictors):
    ax = axs.flatten()[i]
    ax.scatter(Auto[column], Auto['mpg'])
    ax.set_xlabel(column)
    ax.set_ylabel('mpg')


# In[22]:


# Create pairwise plots of the quantitative predictors
pd.plotting.scatter_matrix(Auto[quantitative_columns], figsize=(15, 15))

# To name a few, there appears to be a strong negative relationship between mpg and each of
# displacement, horsepower, and weight. There also seems to be a positive linear
# relationship between weight and displacement, and between weight and horsepower.

# (f) Based on these findings, displacement, horsepower, and weight are the most important
# predictors of mpg: the scatterplots show a strong (negative) relationship between each of
# these predictors and mpg.
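# As a supplementary check on (f) (not part of the original answer), the visual impression
# from the scatterplots can be quantified with the correlation of mpg against each predictor.

# In[ ]:


# Correlation of mpg with the other quantitative predictors; the large negative values
# support treating weight, displacement, and horsepower as the key predictors
Auto[quantitative_columns].corr()['mpg'].drop('mpg').sort_values()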
# Q3

# In[51]:


data = pd.read_csv('KNN.csv')
# The CSV headers contain a leading space (' Y'); strip whitespace so the columns
# can be accessed consistently as data['X'] and data['Y']
data.columns = data.columns.str.strip()


# In[52]:


# Function to estimate the regression function with nearest-neighbors averaging:
# at each query point, average the responses of all observations within
# distance tau of that point
def nearest_neighbors_avg(x_data, y_data, tau, x_vals):
    y_estimated = []
    for x_val in x_vals:
        neighbors = y_data[np.abs(x_data - x_val) <= tau]
        if len(neighbors) > 0:
            y_estimated.append(np.mean(neighbors))
        else:
            # No observation within tau of x_val, so the estimate is undefined
            y_estimated.append(np.nan)
    return np.array(y_estimated)


# Define the different tau values
taus = [0.1, 0.3, 0.8, 2, 10]
x_vals = np.linspace(data['X'].min(), data['X'].max(), 300)

# Plot the data points
plt.scatter(data['X'], data['Y'], color='black', label='data', alpha=0.5)

# Plot the fitted curve for each tau value
for tau in taus:
    y_estimated = nearest_neighbors_avg(data['X'], data['Y'], tau, x_vals)
    plt.plot(x_vals, y_estimated, label=f'tau = {tau}')

plt.xlabel('X')
plt.ylabel('Y')
plt.title('Nearest Neighbors Averaging with Varying Tau Values')
plt.legend()
plt.show()

# When the neighborhood is too small, the model overfits and the fit is very "unsmooth":
# it essentially chases every individual point in the training set. When the neighborhood
# is too large, the model underfits: in the limit it simply returns the average of the
# entire training set.
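# A minimal sketch to make the overfitting/underfitting trade-off quantitative (not part
# of the original answer): hold out a random test set, refit on the remaining points, and
# compare training and test mean squared error across the same tau values. The 70/30 split
# and the random seed are illustrative choices, not givens from the assignment.

# In[ ]:


rng = np.random.default_rng(0)
perm = rng.permutation(len(data))
n_train = int(0.7 * len(data))
train_idx, test_idx = perm[:n_train], perm[n_train:]

x_train = data['X'].to_numpy()[train_idx]
y_train = data['Y'].to_numpy()[train_idx]
x_test = data['X'].to_numpy()[test_idx]
y_test = data['Y'].to_numpy()[test_idx]

for tau in taus:
    # Fit on the training points only, then predict at the train and test locations;
    # nanmean skips query points that have no neighbor within tau
    mse_train = np.nanmean((nearest_neighbors_avg(x_train, y_train, tau, x_train) - y_train) ** 2)
    mse_test = np.nanmean((nearest_neighbors_avg(x_train, y_train, tau, x_test) - y_test) ** 2)
    print(f'tau = {tau:>4}: train MSE = {mse_train:.3f}, test MSE = {mse_test:.3f}')

# Expect small tau to give low training error but higher test error (overfitting), and
# very large tau to give similarly high error on both sets (underfitting).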