#!/usr/bin/env python
# coding: utf-8

# # Exercise 3.2 - Solution

# In[1]:


import numpy as np
import matplotlib.pyplot as plt


# In[2]:


# The code snippet below is responsible for downloading the dataset
# - for example when running via Google Colab.
#
# You can also directly download the file using the link if you work
# with a local setup (in that case, ignore the !wget).

get_ipython().system('wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv')


# In[3]:


# load all examples from the file
data = np.genfromtxt('winequality-white.csv', delimiter=";", skip_header=1)

print("data:", data.shape)

# prepare for proper training
np.random.shuffle(data)  # randomly shuffle the examples

# take the first 3000 examples for training
# (remember array slicing from last week)
X_train = data[:3000, :11]  # all features except the last column
y_train = data[:3000, 11]   # quality column

# and the remaining examples for testing
X_test = data[3000:, :11]  # all features except the last column
y_test = data[3000:, 11]   # quality column

print("First example:")
print("Features:", X_train[0])
print("Quality:", y_train[0])


# # Solution
#
# * First we want to understand the data better. Plot (`plt.hist`) the distribution of each of the features for the training data, as well as the 2D distribution (either `plt.scatter` or `plt.hist2d`) of each feature versus quality. Also calculate the correlation coefficient (`np.corrcoef`) of each feature with quality. Which feature by itself seems most predictive of the quality? (A programmatic ranking of the features is sketched in an optional cell at the end of the notebook.)

# In[4]:


features = ["fixed acidity", "volatile acidity", "citric acid", "residual sugar",
            "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
            "pH", "sulphates", "alcohol"]

# loop over all features
for i_feat, feat in enumerate(features):
    plt.clf()
    print("Feature:", feat)
    # calculate the correlation coefficient of this feature with quality
    print("Correlation coefficient:", np.corrcoef(X_train[:, i_feat], y_train)[0, 1])

    # 1D histogram of the feature distribution
    plt.hist(X_train[:, i_feat])
    plt.xlabel(feat)
    plt.ylabel("# Wines")
    plt.show()

    # scatter plot of the feature versus quality
    plt.scatter(X_train[:, i_feat], y_train)
    plt.xlabel(feat)
    plt.ylabel("Quality")
    plt.show()


# * Calculate the linear regression weights. Numpy provides functions for matrix multiplication (`np.matmul`), matrix transposition (`.T`) and matrix inversion (`np.linalg.inv`). (An optional cross-check with a bias term, using a least-squares solver, is sketched at the end of the notebook.)

# In[5]:


# calculate the weights from the training data via the normal equation:
# w = (X^T X)^{-1} X^T y
w = np.matmul(np.matmul(np.linalg.inv(np.matmul(X_train.T, X_train)), X_train.T), y_train)

print(w.shape)
print(w)


# * Use the weights to predict the quality for the test dataset. How does your predicted quality compare with the true quality of the test data? Calculate the correlation coefficient between predicted and true quality and draw a scatter plot. (The mean squared error is computed in an optional cell at the end of the notebook.)

# In[6]:


# evaluate the linear regression model on the test set
y_pred = np.matmul(X_test, w)

print(X_test.shape, w.shape, y_pred.shape)
print(X_test[0])
print(w)
print(y_pred[0])

print("Correlation coefficient:", np.corrcoef(y_pred, y_test)[0, 1])

# scatter plot of predicted versus true quality
plt.scatter(y_pred, y_test)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
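
# * (Optional addition, not part of the original solution.) The loop in In[4] prints one correlation coefficient per feature; the short cell below ranks the features by the absolute value of that coefficient, so the "most predictive" feature can be read off directly. It reuses `features`, `X_train` and `y_train` from the cells above.

# In[7]:


# rank features by |correlation with quality| (assumes the cells above were run)
corrs = np.array([np.corrcoef(X_train[:, i], y_train)[0, 1] for i in range(len(features))])
for i in np.argsort(-np.abs(corrs)):
    print(f"{features[i]:22s} {corrs[i]:+.3f}")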
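
# * (Optional addition, not part of the original solution.) The model in In[5] has no bias term, since `X_train` contains no constant column. A minimal sketch of a cross-check: append a column of ones and solve the least-squares problem with `np.linalg.lstsq`, which avoids forming the inverse of `X^T X` explicitly and is numerically more stable.

# In[8]:


# append a constant bias column of ones to the feature matrices
X_train_b = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
X_test_b = np.hstack([X_test, np.ones((X_test.shape[0], 1))])

# least-squares solve instead of the explicit normal-equation inverse
w_b, *_ = np.linalg.lstsq(X_train_b, y_train, rcond=None)

y_pred_b = np.matmul(X_test_b, w_b)
print("Correlation coefficient (with bias):", np.corrcoef(y_pred_b, y_test)[0, 1])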
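
# * (Optional addition, not part of the original solution.) Besides the correlation coefficient, the mean squared error between predicted and true quality is a common figure of merit for regression; it is computed below for the predictions `y_pred` from In[6].

# In[9]:


# mean squared error of the predictions on the test set
mse = np.mean((y_pred - y_test) ** 2)
print("Test MSE:", mse)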