#!/usr/bin/env python
# coding: utf-8

# # Exercise 3.2 - Solution

# In[1]:


import numpy as np
import matplotlib.pyplot as plt


# In[2]:


# The code snippet below is responsible for downloading the dataset
# - for example when running via Google Colab.
#
# You can also directly download the file using the link if you work
# with a local setup (in that case, ignore the !wget).

get_ipython().system('wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv')


# In[3]:


# load all examples from the file
data = np.genfromtxt('winequality-white.csv', delimiter=";", skip_header=1)

print("data:", data.shape)

# prepare for proper training
np.random.shuffle(data)  # randomly shuffle the examples

# take the first 3000 examples for training
# (remember array slicing from last week)
X_train = data[:3000, :11]  # all features except the last column
y_train = data[:3000, 11]   # quality column

# and the remaining examples for testing
X_test = data[3000:, :11]  # all features except the last column
y_test = data[3000:, 11]   # quality column

print("First example:")
print("Features:", X_train[0])
print("Quality:", y_train[0])


# # Solution
#
# * First we want to understand the data better. Plot (`plt.hist`) the distribution of each of the features for the training data, as well as the 2D distribution (either `plt.scatter` or `plt.hist2d`) of each feature versus quality. Also calculate the correlation coefficient (`np.corrcoef`) of each feature with quality. Which feature by itself seems most predictive of the quality? (A programmatic ranking of the features is sketched in an optional cell at the end of the notebook.)

# In[4]:


features = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar",
            "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
            "pH", "sulphates", "alcohol"]

# loop over all features
for i_feat, feat in enumerate(features):
    plt.clf()
    print("Feature:", feat)
    # calculate the correlation coefficient of this feature with quality
    print("Correlation coefficient:", np.corrcoef(X_train[:, i_feat], y_train)[0, 1])

    # 1D histogram of the feature distribution
    plt.hist(X_train[:, i_feat])
    plt.xlabel(feat)
    plt.ylabel("# Wines")
    plt.show()

    # scatter plot of the feature versus quality
    plt.scatter(X_train[:, i_feat], y_train)
    plt.xlabel(feat)
    plt.ylabel("Quality")
    plt.show()


# * Calculate the linear regression weights. Numpy provides functions for matrix multiplication (`np.matmul`), matrix transposition (`.T`) and matrix inversion (`np.linalg.inv`). (An optional cross-check with a bias term, using a least-squares solver, is sketched at the end of the notebook.)

# In[5]:


# calculate the weights from the training data via the normal equation:
# w = (X^T X)^{-1} X^T y
w = np.matmul(np.matmul(np.linalg.inv(np.matmul(X_train.T, X_train)), X_train.T), y_train)

print(w.shape)
print(w)


# * Use the weights to predict the quality for the test dataset. How does your predicted quality compare with the true quality of the test data? Calculate the correlation coefficient between predicted and true quality and draw a scatter plot. (The mean squared error is computed in an optional cell at the end of the notebook.)

# In[6]:


# evaluate the linear regression model on the test set
y_pred = np.matmul(X_test, w)

print(X_test.shape, w.shape, y_pred.shape)
print(X_test[0])
print(w)
print(y_pred[0])

print("Correlation coefficient:", np.corrcoef(y_pred, y_test)[0, 1])

# scatter plot of predicted versus true quality
plt.scatter(y_pred, y_test)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
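
# * (Optional addition, not part of the original solution.) The loop in In[4] prints one correlation coefficient per feature; the short cell below ranks the features by the absolute value of that coefficient, so the "most predictive" feature can be read off directly. It reuses `features`, `X_train` and `y_train` from the cells above.

# In[7]:


# rank features by |correlation with quality| (assumes the cells above were run)
corrs = np.array([np.corrcoef(X_train[:, i], y_train)[0, 1] for i in range(len(features))])
for i in np.argsort(-np.abs(corrs)):
    print(f"{features[i]:22s} {corrs[i]:+.3f}")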
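
# * (Optional addition, not part of the original solution.) The model in In[5] has no bias term, since `X_train` contains no constant column. A minimal sketch of a cross-check: append a column of ones and solve the least-squares problem with `np.linalg.lstsq`, which avoids forming the inverse of `X^T X` explicitly and is numerically more stable.

# In[8]:


# append a constant bias column of ones to the feature matrices
X_train_b = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_test_b = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

# least-squares solve instead of the explicit normal-equation inverse
w_b, *_ = np.linalg.lstsq(X_train_b, y_train, rcond=None)

y_pred_b = np.matmul(X_test_b, w_b)
print("Correlation coefficient (with bias):", np.corrcoef(y_pred_b, y_test)[0, 1])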
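
# * (Optional addition, not part of the original solution.) Besides the correlation coefficient, the mean squared error between predicted and true quality is a common figure of merit for regression; it is computed below for the predictions `y_pred` from In[6].

# In[9]:


# mean squared error of the predictions on the test set
mse = np.mean((y_pred - y_test) ** 2)
print("Test MSE:", mse)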