import numpy as np
import matplotlib.pyplot as plt
# The code snippet below is responsible for downloading the dataset
# - for example when running via Google Colab.
#
# You can also directly download the file using the link if you work
# with a local setup (in that case, ignore the !wget)
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv
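# If the !wget shell magic is not available (e.g. when running a plain Python
# script instead of a notebook), the file can also be fetched with the standard
# library. This is only a minimal sketch; it assumes the same target filename
# that np.genfromtxt expects below.
import os
import urllib.request

url = ("https://archive.ics.uci.edu/ml/machine-learning-databases/"
       "wine-quality/winequality-white.csv")
if not os.path.exists("winequality-white.csv"):
    urllib.request.urlretrieve(url, "winequality-white.csv")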
# load all examples from the file
data = np.genfromtxt('winequality-white.csv',delimiter=";",skip_header=1)
print("data:", data.shape)
# Prepare for proper training
np.random.shuffle(data) # randomly sort examples
# take the first 3000 examples for training
# (remember array slicing from last week)
X_train = data[:3000,:11] # all features except last column
y_train = data[:3000,11] # quality column
# and the remaining examples for testing
X_test = data[3000:,:11] # all features except last column
y_test = data[3000:,11] # quality column
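# Note: np.random.shuffle above gives a different train/test split on every run.
# For a reproducible split one could optionally fix the seed before shuffling,
# e.g. with an arbitrary value such as 1234:
#   np.random.seed(1234)
#   np.random.shuffle(data)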
print("First example:")
print("Features:", X_train[0])
print("Quality:", y_train[0])
Plot (plt.hist) the distribution of each of the features for the training data as well as the 2D distribution (either plt.scatter or plt.hist2d) of each feature versus quality. Also calculate the correlation coefficient (np.corrcoef) for each feature with quality. Which feature by itself seems most predictive for the quality?
features = ["fixed acidity", "volatile acidity", "citric acid",
            "residual sugar", "chlorides", "free sulfur dioxide",
            "total sulfur dioxide", "density", "pH", "sulphates", "alcohol"]
# Loop over all features
for i_feat, feat in enumerate(features):
    plt.clf()
    print("Feature:", feat)
    print("Correlation coefficient:",
          np.corrcoef(X_train[:, i_feat], y_train)[0, 1])  # Calculate correlation coefficient
    # 1D histogram of the feature
    plt.hist(X_train[:, i_feat])
    plt.xlabel(feat)
    plt.ylabel("# Wines")
    plt.show()
    # Scatter plot of feature vs. quality
    plt.scatter(X_train[:, i_feat], y_train)
    plt.xlabel(feat)
    plt.ylabel("Quality")
    plt.show()
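# The exercise text also mentions plt.hist2d as an alternative to plt.scatter.
# A minimal sketch, shown here only for the last feature of the loop ("alcohol");
# the bin counts are an arbitrary choice.
plt.clf()
plt.hist2d(X_train[:, -1], y_train, bins=[30, 7])
plt.xlabel(features[-1])
plt.ylabel("Quality")
plt.colorbar(label="# Wines")
plt.show()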
Calculate the weights of a linear regression model from the training data using the closed-form solution. The required operations are matrix multiplication (np.matmul), matrix transposition (.T) and matrix inversion (np.linalg.inv).
# Calculate weights using training data
# w = (X^T X)^{-1} X^T y
w = np.matmul(np.matmul(np.linalg.inv(np.matmul(X_train.T, X_train)), X_train.T), y_train)
print(w.shape)
print(w)
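# Optional cross-check (not part of the original exercise): np.linalg.lstsq
# solves the same least-squares problem without explicitly forming the inverse
# of X^T X, which is numerically more robust. The weights should agree closely.
w_lstsq, residuals, rank, sv = np.linalg.lstsq(X_train, y_train, rcond=None)
print("Max. difference to normal-equation solution:", np.max(np.abs(w - w_lstsq)))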
# Evaluate linear regression model
y_pred = np.matmul(X_test,w)
print(X_test.shape,w.shape,y_pred.shape)
print(X_test[0])
print(w)
print(y_pred[0])
print("Correlation coefficient:", np.corrcoef(y_pred,y_test)[0,1])
# Prepare scatter plot
plt.scatter(y_pred,y_test)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
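# Besides the correlation coefficient, the prediction error can also be
# quantified directly, e.g. with the mean squared error and mean absolute
# error on the test set (an additional check, not required by the exercise):
mse = np.mean((y_pred - y_test)**2)
mae = np.mean(np.abs(y_pred - y_test))
print("MSE:", mse)
print("MAE:", mae)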