# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive')
# TODO: Enter the folder in your Drive where you have saved this notebook's
# files, e.g. 'ml/logistic_regression'
FOLDERNAME = "ml/logistic_regression"
assert FOLDERNAME is not None, "[!] Enter the foldername."
# Now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
import numpy as np
import pandas as pd
import math
pima = pd.read_csv("diabetes.csv")
print(pima.columns)
columns = ['Glucose','SkinThickness','Outcome']
df = pd.DataFrame(pima, columns=columns)
print(df.head())
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
   Glucose  SkinThickness  Outcome
0      148             35        1
1       85             29        0
2      183              0        1
3       89             23        0
4      137             35        1
# Splitting data set into train and test
dfx = df.to_numpy()
train_data = dfx[0:500,:]
test_data = dfx[500:,:]
# train data
train_X = train_data[:,0:2]
train_y = train_data[:,2]
#test data
test_X = test_data[:,0:2]
test_y = test_data[:,2]
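# Quick sanity check of the split (an illustrative addition): diabetes.csv has
# 768 rows, so this yields 500 training and 268 test examples.
assert train_X.shape == (500, 2) and test_X.shape == (268, 2)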
def sigmoid(x):
    """A numerically stable version of the logistic sigmoid function."""
    pos_mask = x >= 0
    neg_mask = x < 0
    # Compute exp(-|x|) so the exponent is never positive and cannot overflow
    z = np.zeros_like(x)
    z[pos_mask] = np.exp(-x[pos_mask])
    z[neg_mask] = np.exp(x[neg_mask])
    # Numerator is 1 where x >= 0 and exp(x) where x < 0
    top = np.ones_like(x)
    top[neg_mask] = z[neg_mask]
    return top / (1 + z)
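# Why the masks matter (an illustrative addition): a naive 1 / (1 + np.exp(-x))
# overflows inside np.exp for large negative inputs, while the masked version
# stays finite. For example, sigmoid(np.array([-1000., 0., 1000.])) gives
# approximately [0., 0.5, 1.].
_sigmoid_demo = sigmoid(np.array([-1000.0, 0.0, 1000.0]))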
def print_every(loss, i):
    """Print the loss every 50 iterations."""
    if i % 50 == 0:
        print("Iteration:", i, "loss:", loss)
def normalize(x):
    """Standardize each feature column and prepend a bias column of ones."""
    mu = np.mean(x, axis=0)
    var = np.mean((x - mu) ** 2, axis=0)
    x = (x - mu) / np.sqrt(var)
    # Prepend a column of ones so that w[0] acts as the bias term
    ones = np.ones([x.shape[0], 1])
    x = np.append(ones, x, axis=1)
    return x, mu, var
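# normalize() returns mu and var so that the training statistics can be reused
# on new data. A minimal sketch of such a helper (an illustrative addition;
# note that later in this notebook the test set is instead re-normalized with
# its own statistics):
def normalize_with(x, mu, var):
    x = (x - mu) / np.sqrt(var)
    return np.append(np.ones([x.shape[0], 1]), x, axis=1)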
def logistic_reg_train(x, w, y, iter, lr):
    n = x.shape[0]
    loss_hist = []
    for i in range(iter):
        # Linear combination of the inputs, same as in linear regression
        h = np.dot(x, w)
        # Sigmoid function turns the scores into probabilities
        s = sigmoid(h)
        # Binary cross-entropy loss
        loss = -(1 / n) * (np.dot(y, np.log(s)) + np.dot((1 - y), np.log(1 - s)))
        print_every(loss, i)
        # Negative gradient of the loss w.r.t. w: (1/n) * X^T (y - s)
        dw = (1 / n) * np.dot((y - s), x)
        # Updating parameters (adding the negative gradient = gradient descent)
        w = w + lr * dw
        loss_hist.append(loss)
    return w, loss_hist
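# A quick finite-difference check of the update direction (an illustrative
# addition, not part of the original training loop): the numerical gradient of
# the cross-entropy loss should closely match -(1/n) * np.dot((y - s), x).
def numerical_grad(x, w, y, eps=1e-5):
    def loss_fn(w):
        n = x.shape[0]
        s = sigmoid(np.dot(x, w))
        return -(1 / n) * (np.dot(y, np.log(s)) + np.dot(1 - y, np.log(1 - s)))
    grad = np.zeros_like(w)
    for j in range(len(w)):
        w_plus, w_minus = w.copy(), w.copy()
        w_plus[j] += eps
        w_minus[j] -= eps
        # Central difference approximation of the partial derivative w.r.t. w[j]
        grad[j] = (loss_fn(w_plus) - loss_fn(w_minus)) / (2 * eps)
    return grad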
# Normalize the inputs
x_train_norm,mu_train,var_train = normalize(train_X)
# Weight initialization: random weights for the two features, bias initialized to 0
w = np.random.randn(train_X.shape[1], 1)
w = np.append(0, w)  # prepend the bias term; this also flattens w to shape (3,)
iter = 2000
lr = 0.003
optim_w, loss_hist = logistic_reg_train(x_train_norm,w,train_y,iter,lr)
Iteration: 0 loss: 0.670044768724279
Iteration: 50 loss: 0.6626523557924836
Iteration: 100 loss: 0.6556799978061332
Iteration: 150 loss: 0.6491072494003616
Iteration: 200 loss: 0.6429141523824431
Iteration: 250 loss: 0.6370812981205708
Iteration: 300 loss: 0.6315898779543568
Iteration: 350 loss: 0.6264217225187003
Iteration: 400 loss: 0.6215593309558392
Iteration: 450 loss: 0.6169858910308043
Iteration: 500 loss: 0.612685291171128
Iteration: 550 loss: 0.6086421254300672
Iteration: 600 loss: 0.6048416923305238
Iteration: 650 loss: 0.6012699884901916
Iteration: 700 loss: 0.5979136978621907
Iteration: 750 loss: 0.594760177353653
Iteration: 800 loss: 0.5917974395106935
Iteration: 850 loss: 0.5890141328844253
Iteration: 900 loss: 0.5863995206210857
Iteration: 950 loss: 0.5839434577512648
Iteration: 1000 loss: 0.5816363675895684
Iteration: 1050 loss: 0.5794692175973817
Iteration: 1100 loss: 0.5774334950079463
Iteration: 1150 loss: 0.5755211824648213
Iteration: 1200 loss: 0.5737247338818299
Iteration: 1250 loss: 0.5720370506945847
Iteration: 1300 loss: 0.5704514586403434
Iteration: 1350 loss: 0.5689616851739256
Iteration: 1400 loss: 0.5675618376023547
Iteration: 1450 loss: 0.5662463819993975
Iteration: 1500 loss: 0.565010122942904
Iteration: 1550 loss: 0.5638481841024198
Iteration: 1600 loss: 0.5627559896916551
Iteration: 1650 loss: 0.5617292467897003
Iteration: 1700 loss: 0.5607639285261273
Iteration: 1750 loss: 0.5598562581180195
Iteration: 1800 loss: 0.5590026937413304
Iteration: 1850 loss: 0.5581999142145451
Iteration: 1900 loss: 0.5574448054692442
Iteration: 1950 loss: 0.5567344477796855
def predict(x, y, w):
    h = np.dot(x, w)
    s = sigmoid(h)
    # Classify as 1 when the predicted probability is at least 0.5
    y_pred = []
    for i in range(len(s)):
        if s[i] >= 0.5:
            y_pred.append(1)
        else:
            y_pred.append(0)
    return y_pred
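# An equivalent vectorized alternative (an illustrative addition, not used
# below): thresholding the probabilities directly avoids the Python loop.
def predict_vectorized(x, w):
    return (sigmoid(np.dot(x, w)) >= 0.5).astype(int)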
# Normalize the test inputs. Note: this recomputes mu/var on the test set;
# reusing the training statistics (mu_train, var_train) would be the more standard choice.
x_test_norm, mu, var = normalize(test_X)
pred_y = predict(x_test_norm, test_y, optim_w)
pred_y = np.array(pred_y)
# Calculating the accuracy
count = 0
for i in range(len(test_y)):
    if test_y[i] == pred_y[i]:
        count += 1
print("Prediction Accuracy:", (count / len(test_y) * 100))
Prediction Accuracy: 77.61194029850746
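# The same accuracy in one vectorized expression (an illustrative addition,
# equivalent to the loop above):
vectorized_accuracy = np.mean(test_y == pred_y) * 100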
# Plot the two classes in different colors along with the learned decision boundary
import matplotlib.pyplot as plt
import seaborn as sbn
plt.figure(figsize=(8, 8), dpi=80)
sbn.scatterplot(x=test_X[:, 0][test_y == 0], y=test_X[:, 1][test_y == 0], color='green')
sbn.scatterplot(x=test_X[:, 0][test_y == 1], y=test_X[:, 1][test_y == 1], color='blue')
# Decision boundary: w0 + w1*x1_norm + w2*x2_norm = 0, solved for x2_norm and
# mapped back to the original feature units so it lines up with the scatter plot
x2_norm = -(optim_w[0] + optim_w[1] * x_train_norm[:, 1]) / optim_w[2]
decision_boundary = x2_norm * np.sqrt(var_train[1]) + mu_train[1]
plt.plot(train_X[:, 0], decision_boundary, color='red')
plt.grid()
plt.show()
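# loss_hist is collected during training but never visualized; a quick sketch
# (an illustrative addition) to check that the loss decreases smoothly:
plt.figure(figsize=(6, 4))
plt.plot(loss_hist)
plt.xlabel("Iteration")
plt.ylabel("Cross-entropy loss")
plt.grid()
plt.show()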
Try out a different pair of features: Glucose and Age
pima = pd.read_csv("diabetes.csv")
print(pima.columns)
columns = ['Glucose','Age','Outcome']
df = pd.DataFrame(pima, columns=columns)
print(df.head())
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
   Glucose  Age  Outcome
0      148   50        1
1       85   31        0
2      183   32        1
3       89   21        0
4      137   33        1
# Splitting data set into train and test
dfx = df.to_numpy()
train_data = dfx[0:500,:]
test_data = dfx[500:,:]
# train data
train_X = train_data[:,0:2]
train_y = train_data[:,2]
#test data
test_X = test_data[:,0:2]
test_y = test_data[:,2]
# Normalize the inputs
x_train_norm,mu_train,var_train = normalize(train_X)
# weights initialization
w = np.random.randn(train_X.shape[1],1)
w = np.append(0,w)
iter = 2000
lr = 0.003
optim_w, loss_hist = logistic_reg_train(x_train_norm,w,train_y,iter,lr)
Iteration: 0 loss: 0.6536393076313141
Iteration: 50 loss: 0.6492563106039907
Iteration: 100 loss: 0.6450014258419662
Iteration: 150 loss: 0.6408730876433092
Iteration: 200 loss: 0.6368696791982097
Iteration: 250 loss: 0.632989532059511
Iteration: 300 loss: 0.629230925912058
Iteration: 350 loss: 0.6255920886469993
Iteration: 400 loss: 0.6220711967451137
Iteration: 450 loss: 0.61866637597101
Iteration: 500 loss: 0.6153757023777348
Iteration: 550 loss: 0.6121972036189006
Iteration: 600 loss: 0.6091288605629931
Iteration: 650 loss: 0.6061686092020252
Iteration: 700 loss: 0.603314342844242
Iteration: 750 loss: 0.6005639145781453
Iteration: 800 loss: 0.5979151399927768
Iteration: 850 loss: 0.5953658001369553
Iteration: 900 loss: 0.5929136446980983
Iteration: 950 loss: 0.5905563953793473
Iteration: 1000 loss: 0.5882917494520337
Iteration: 1050 loss: 0.5861173834590634
Iteration: 1100 loss: 0.5840309570436123
Iteration: 1150 loss: 0.582030116876602
Iteration: 1200 loss: 0.580112500655817
Iteration: 1250 loss: 0.5782757411492054
Iteration: 1300 loss: 0.5765174702549023
Iteration: 1350 loss: 0.574835323050825
Iteration: 1400 loss: 0.5732269418072925
Iteration: 1450 loss: 0.5716899799370264
Iteration: 1500 loss: 0.5702221058580564
Iteration: 1550 loss: 0.5688210067464866
Iteration: 1600 loss: 0.5674843921577114
Iteration: 1650 loss: 0.5662099974965321
Iteration: 1700 loss: 0.5649955873186046
Iteration: 1750 loss: 0.5638389584477951
Iteration: 1800 loss: 0.5627379428962258
Iteration: 1850 loss: 0.5616904105760655
Iteration: 1900 loss: 0.560694271794407
Iteration: 1950 loss: 0.5597474795248379
x_test_norm, mu, var = normalize(test_X)
pred_y = predict(x_test_norm, test_y, optim_w)
pred_y = np.array(pred_y)
# Calculating the accuracy
count = 0
for i in range(len(test_y)):
    if test_y[i] == pred_y[i]:
        count += 1
print("Prediction Accuracy:", (count / len(test_y) * 100))
Prediction Accuracy: 79.8507462686567
# Plot the two classes in different colors along with the learned decision boundary
import matplotlib.pyplot as plt
import seaborn as sbn
plt.figure(figsize=(8, 8), dpi=80)
# Decision boundary: w0 + w1*x1_norm + w2*x2_norm = 0, solved for x2_norm and
# mapped back to the original feature units so it lines up with the scatter plot
x2_norm = -(optim_w[0] + optim_w[1] * x_train_norm[:, 1]) / optim_w[2]
decision_boundary = x2_norm * np.sqrt(var_train[1]) + mu_train[1]
plt.plot(train_X[:, 0], decision_boundary, color='blue')
sbn.scatterplot(x=test_X[:, 0][test_y == 0], y=test_X[:, 1][test_y == 0], color='green')
sbn.scatterplot(x=test_X[:, 0][test_y == 1], y=test_X[:, 1][test_y == 1], color='orange')
plt.grid()
plt.show()