#!/usr/bin/env python
# coding: utf-8

# In[26]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import subplots


# In[4]:


from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize,
                         poly)


# Q1

# In[6]:


Auto = load_data('Auto').dropna()


# In[9]:


# (a) Which of the predictors are quantitative, and which are qualitative?

# View the first few rows of the data
Auto.head()

# Answer: Quantitative: mpg, cylinders, displacement, horsepower, weight, acceleration, year.
# Qualitative: origin


# In[13]:


# (b) What is the range of each quantitative predictor? You can answer this using the
# range() function.

# Calculate the range of each quantitative predictor
quantitative_columns = ['mpg', 'cylinders', 'displacement', 'horsepower',
                        'weight', 'acceleration', 'year']
ranges = Auto[quantitative_columns].apply(lambda x: (x.min(), x.max()))

# Display the ranges
print(ranges)


# In[14]:


# (c) What is the mean and standard deviation of each quantitative predictor?

# Calculate the mean and standard deviation of each quantitative predictor
means = Auto[quantitative_columns].mean()
std_devs = Auto[quantitative_columns].std()
print(means)
print(std_devs)


# In[15]:


# (d) Now remove the 10th through 85th observations. What is the range, mean, and standard
# deviation of each predictor in the subset of the data that remains?

# Positions 9 through 84 correspond to the 10th through 85th observations
Auto_subset = Auto.drop(Auto.index[9:85])
means_subset = Auto_subset[quantitative_columns].mean()
std_devs_subset = Auto_subset[quantitative_columns].std()
ranges_subset = Auto_subset[quantitative_columns].apply(lambda x: (x.min(), x.max()))


# In[16]:


means_subset


# In[17]:


std_devs_subset


# In[18]:


ranges_subset


# In[21]:


# (e) Using the full data set, investigate the predictors graphically, using scatterplots
# or other tools of your choice. Create some plots highlighting the relationships among
# the predictors.

# Scatter each quantitative predictor against mpg (mpg itself is excluded, since
# plotting it against itself is uninformative)
predictors = [col for col in quantitative_columns if col != 'mpg']
fig, axs = subplots(2, 3, figsize=(15, 10))
for i, column in enumerate(predictors):
    ax = axs.flatten()[i]
    ax.scatter(Auto[column], Auto['mpg'])
    ax.set_xlabel(column)
    ax.set_ylabel('mpg')


# In[22]:


# Create pairwise plots of the quantitative predictors
pd.plotting.scatter_matrix(Auto[quantitative_columns], figsize=(15, 15))

# To name a few, there appears to be a strong negative relationship between mpg and each of
# displacement, horsepower, and weight. There also seems to be a positive linear
# relationship between weight and displacement, and between weight and horsepower.

# (f) Based on these findings, displacement, horsepower, and weight are the most important
# predictors of mpg: the scatterplots show a strong (negative) relationship between each of
# these predictors and mpg.
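# As a supplementary check on (f) (not part of the original answer), the visual impression
# from the scatterplots can be quantified with the correlation of mpg against each predictor.

# In[ ]:


# Correlation of mpg with the other quantitative predictors; the large negative values
# support treating weight, displacement, and horsepower as the key predictors
Auto[quantitative_columns].corr()['mpg'].drop('mpg').sort_values()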
# Q3

# In[51]:


data = pd.read_csv('KNN.csv')
# The CSV headers contain a leading space (' Y'); strip whitespace so the columns
# can be accessed consistently as data['X'] and data['Y']
data.columns = data.columns.str.strip()


# In[52]:


# Function to estimate the regression function with nearest-neighbors averaging:
# at each query point, average the responses of all observations within
# distance tau of that point
def nearest_neighbors_avg(x_data, y_data, tau, x_vals):
    y_estimated = []
    for x_val in x_vals:
        neighbors = y_data[np.abs(x_data - x_val) <= tau]
        if len(neighbors) > 0:
            y_estimated.append(np.mean(neighbors))
        else:
            # No observation within tau of x_val, so the estimate is undefined
            y_estimated.append(np.nan)
    return np.array(y_estimated)


# Define the different tau values
taus = [0.1, 0.3, 0.8, 2, 10]
x_vals = np.linspace(data['X'].min(), data['X'].max(), 300)

# Plot the data points
plt.scatter(data['X'], data['Y'], color='black', label='data', alpha=0.5)

# Plot the fitted curve for each tau value
for tau in taus:
    y_estimated = nearest_neighbors_avg(data['X'], data['Y'], tau, x_vals)
    plt.plot(x_vals, y_estimated, label=f'tau = {tau}')

plt.xlabel('X')
plt.ylabel('Y')
plt.title('Nearest Neighbors Averaging with Varying Tau Values')
plt.legend()
plt.show()

# When the neighborhood is too small, the model overfits and the fit is very "unsmooth":
# it essentially chases every individual point in the training set. When the neighborhood
# is too large, the model underfits: in the limit it simply returns the average of the
# entire training set.
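# A minimal sketch to make the overfitting/underfitting trade-off quantitative (not part
# of the original answer): hold out a random test set, refit on the remaining points, and
# compare training and test mean squared error across the same tau values. The 70/30 split
# and the random seed are illustrative choices, not givens from the assignment.

# In[ ]:


rng = np.random.default_rng(0)
perm = rng.permutation(len(data))
n_train = int(0.7 * len(data))
train_idx, test_idx = perm[:n_train], perm[n_train:]

x_train = data['X'].to_numpy()[train_idx]
y_train = data['Y'].to_numpy()[train_idx]
x_test = data['X'].to_numpy()[test_idx]
y_test = data['Y'].to_numpy()[test_idx]

for tau in taus:
    # Fit on the training points only, then predict at the train and test locations;
    # nanmean skips query points that have no neighbor within tau
    mse_train = np.nanmean((nearest_neighbors_avg(x_train, y_train, tau, x_train) - y_train) ** 2)
    mse_test = np.nanmean((nearest_neighbors_avg(x_train, y_train, tau, x_test) - y_test) ** 2)
    print(f'tau = {tau:>4}: train MSE = {mse_train:.3f}, test MSE = {mse_test:.3f}')

# Expect small tau to give low training error but higher test error (overfitting), and
# very large tau to give similarly high error on both sets (underfitting).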