#!/usr/bin/env python
# coding: utf-8

# # Chapter 15: Classifying data with logistic regression

# In[1]:

import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

# In[2]:

def plot_function(f,xmin,xmax,**kwargs):
    ts = np.linspace(xmin,xmax,1000)
    plt.plot(ts,[f(t) for t in ts],**kwargs)

def scalar_field_heatmap(f,xmin,xmax,ymin,ymax,xsteps=100,ysteps=100):
    fv = np.vectorize(f)
    X = np.linspace(xmin, xmax, xsteps)
    Y = np.linspace(ymin, ymax, ysteps)
    X, Y = np.meshgrid(X, Y)
    z = fv(X,Y)
    fig, ax = plt.subplots()
    fig.set_size_inches(7,7)
    c = ax.pcolormesh(X,Y,z, cmap='plasma')
    ax.axis([X.min(), X.max(), Y.min(), Y.max()])
    fig.colorbar(c, ax=ax)

def plot_scalar_field(f,xmin,xmax,ymin,ymax,xsteps=100,ysteps=100,c=None,cmap=cm.coolwarm,
                      alpha=1,antialiased=False,zorder=0):
    fig = plt.gcf()
    fig.set_size_inches(7,7)
    # fig.gca(projection='3d') was removed in Matplotlib 3.6; add_subplot works in its place.
    ax = fig.add_subplot(projection='3d')
    fv = np.vectorize(f)

    # Make data.
    X = np.linspace(xmin, xmax, xsteps)
    Y = np.linspace(ymin, ymax, ysteps)
    X, Y = np.meshgrid(X, Y)
    Z = fv(X,Y)

    # Plot the surface.
    surf = ax.plot_surface(X, Y, Z, cmap=cmap, color=c, alpha=alpha,
                           linewidth=0, antialiased=antialiased, zorder=zorder)

# # 15.1 Testing a classification function on real data

# In[3]:

def bmw_finder(mileage,price):
    if price > 25000:
        return 1
    else:
        return 0

# ## 15.1.1 Loading the car data

# In[4]:

from car_data import bmws, priuses

# In[5]:

all_car_data = []
for bmw in bmws:
    all_car_data.append((bmw.mileage,bmw.price,1))
for prius in priuses:
    all_car_data.append((prius.mileage,prius.price,0))

# In[6]:

all_car_data

# ## 15.1.2 Testing the classification function

# In[7]:

def test_classifier(classifier, data):
    trues = 0
    falses = 0
    for mileage, price, is_bmw in data:
        if classifier(mileage, price) == is_bmw: #1
            trues += 1
        else:
            falses += 1 #1
    return trues / (trues + falses)

# In[8]:

test_classifier(bmw_finder, all_car_data)

# ## 15.1.3 Exercises

# **Exercise:** Update the `test_classifier` function to print the number of true positives, true negatives, false positives, and false negatives. Printing these for the `bmw_finder` classifier, what can you tell about the performance of the classifier?

# In[9]:

def test_classifier(classifier, data, verbose=False): #1
    true_positives = 0 #2
    true_negatives = 0
    false_positives = 0
    false_negatives = 0
    for mileage, price, is_bmw in data:
        predicted = classifier(mileage,price)
        if predicted and is_bmw: #3
            true_positives += 1
        elif predicted:
            false_positives += 1
        elif is_bmw:
            false_negatives += 1
        else:
            true_negatives += 1
    if verbose:
        print("true positives %d" % true_positives) #4
        print("true negatives %d" % true_negatives)
        print("false positives %d" % false_positives)
        print("false negatives %d" % false_negatives)
    return (true_positives + true_negatives) / len(data) #5

# In[10]:

test_classifier(bmw_finder,all_car_data,verbose=True)
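# As a side note (not from the chapter), the same four confusion counts also determine precision and recall, which are often more informative than raw accuracy when the two classes are unbalanced. The helper `precision_recall` below is a minimal sketch of ours, built on the same `(mileage, price, is_bmw)` tuples:

# In[ ]:

def precision_recall(classifier, data):
    # Count the outcomes exactly as test_classifier does, omitting true negatives,
    # which neither metric needs.
    tp = fp = fn = 0
    for mileage, price, is_bmw in data:
        predicted = classifier(mileage, price)
        if predicted and is_bmw:
            tp += 1
        elif predicted:
            fp += 1
        elif is_bmw:
            fn += 1
    # Precision: of the cars labeled BMW, how many really were BMWs?
    # Recall: of the actual BMWs, how many did we label as BMWs?
    precision = tp / (tp + fp) if tp + fp else 0.0
    recall = tp / (tp + fn) if tp + fn else 0.0
    return precision, recall

precision_recall(bmw_finder, all_car_data)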
# **Exercise:** Find a way to update the `bmw_finder` function slightly to improve its performance, and use the `test_classifier` function to confirm that your improved function has better than 59% accuracy.

# In[11]:

def bmw_finder2(mileage,price):
    if price > 20000:
        return 1
    else:
        return 0

# In[12]:

test_classifier(bmw_finder2, all_car_data)

# # 15.2 Picturing a decision boundary

# ## 15.2.1 Picturing the space of cars

# In[13]:

def plot_data(ds):
    plt.scatter([d[0] for d in ds if d[2]==0],[d[1] for d in ds if d[2]==0],c='C1')
    plt.scatter([d[0] for d in ds if d[2]==1],[d[1] for d in ds if d[2]==1],c='C0',marker='x')
    plt.ylabel("Price ($)",fontsize=16)
    plt.xlabel("Odometer (mi)",fontsize=16)

# In[14]:

plot_data(all_car_data)

# In[15]:

plot_data(all_car_data)
plot_function(lambda x: 25000, 0, 200000, c='k')

# In[16]:

plot_data(all_car_data)
plot_function(lambda x: 21000 - 0.07 * x, 0, 200000, c='k')

# ## 15.2.3 Implementing the classification function

# In[17]:

def decision_boundary_classify(mileage,price):
    if price > 21000 - 0.07 * mileage:
        return 1
    else:
        return 0

# In[18]:

test_classifier(decision_boundary_classify, all_car_data)

# ## 15.2.4 Exercises

# **Mini-project:** What is the decision boundary of the form $p = \text{constant}$ giving the best classification accuracy on the test data set?

# In[19]:

def constant_price_classifier(cutoff_price):
    def c(x,p):
        if p > cutoff_price:
            return 1
        else:
            return 0
    return c

# In[20]:

def cutoff_accuracy(cutoff_price):
    c = constant_price_classifier(cutoff_price)
    return test_classifier(c,all_car_data)

# In[21]:

all_prices = [price for (mileage,price,is_bmw) in all_car_data]
max(all_prices,key=cutoff_accuracy)

# In[22]:

test_classifier(constant_price_classifier(17998.0), all_car_data)

# # 15.3 Framing classification as a regression problem
# 
# ## 15.3.1 Scaling the raw car data

# In[23]:

def make_scale(data):
    min_val = min(data) #1
    max_val = max(data)
    def scale(x): #2
        return (x-min_val) / (max_val - min_val)
    def unscale(y): #3
        return y * (max_val - min_val) + min_val
    return scale, unscale #4

price_scale, price_unscale = make_scale([x[1] for x in all_car_data]) #5
mileage_scale, mileage_unscale = make_scale([x[0] for x in all_car_data])

# In[24]:

scaled_car_data = [(mileage_scale(mileage), price_scale(price), is_bmw)
                   for mileage,price,is_bmw in all_car_data]

# In[25]:

plot_data(scaled_car_data)

# ## 15.3.2 Measuring BMWness of a car

# In[26]:

plot_data(scaled_car_data)
plot_function(lambda x: 0.56 - 0.35*x,0,1,c='k')

# In[27]:

scalar_field_heatmap(lambda x,p: p + 0.35*x - 0.56, 0, 1, 0, 1)
plt.ylabel('Price', fontsize=16)
plt.xlabel('Mileage', fontsize=16)
plot_function(lambda x: 0.56-0.35*x,0,1,c='k')

# ## 15.3.3 Introducing the sigmoid function

# In[28]:

from math import exp

def sigmoid(x):
    return 1 / (1+exp(-x))

# In[29]:

plot_function(sigmoid,-5,5)

# ## 15.3.4 Composing the sigmoid function with other functions

# In[30]:

def f(x,p):
    return p + 0.35 * x - 0.56

def l(x,p):
    return sigmoid(f(x,p))

# In[31]:

scalar_field_heatmap(l, 0, 1, 0, 1)
# plot_data(scaled_car_data,white=True)
plt.ylabel('Price', fontsize=16)
plt.xlabel('Mileage', fontsize=16)
plot_function(lambda x: 0.56-0.35*x,0,1,c='k')

# In[32]:

plot_scalar_field(l, -5, 5, -5, 5)

# In[33]:

plot_scalar_field(f,-5,5,-5,5)
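# A quick numeric check (ours, not part of the chapter) confirms what the plots show: composing with sigmoid leaves the decision boundary in place. On the line $p = 0.56 - 0.35x$ the raw score $f(x,p)$ is $0$, so $l(x,p) = \sigma(0) = 0.5$, with values above $0.5$ for cars priced above the line and below $0.5$ for cars priced below it:

# In[ ]:

# Sanity check (ours, not from the book): l(x, p) is 0.5 on the proposed
# decision boundary, larger above it and smaller below it.
for x in [0.0, 0.25, 0.5, 0.75, 1.0]:
    p_boundary = 0.56 - 0.35 * x
    assert abs(l(x, p_boundary) - 0.5) < 1e-12
    assert l(x, p_boundary + 0.1) > 0.5   # priced above the line: more BMW-like
    assert l(x, p_boundary - 0.1) < 0.5   # priced below the line: more Prius-like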
# ## 15.3.5 Exercises

# **Exercise:** Find a function $h(x)$ such that large positive values of $x$ cause $h(x)$ to be very close to $0$, large negative values of $x$ cause $h(x)$ to be very close to $1$, and $h(3) = 0.5$.

# In[34]:

def h(x):
    return sigmoid(3-x)

# In[35]:

plot_function(h,-5,10)

# # 15.4 Exploring possible logistic functions

# ## 15.4.1 Parametrizing logistic functions

# In[36]:

def make_logistic(a,b,c):
    def l(x,p):
        return sigmoid(a*x + b*p - c)
    return l

# ## 15.4.2 Measuring the quality of fit for a logistic function

# In[37]:

def simple_logistic_cost(a,b,c):
    l = make_logistic(a,b,c)
    errors = [abs(is_bmw-l(x,p))
              for x,p,is_bmw in scaled_car_data]
    return sum(errors)

# In[38]:

from math import log

plot_function(lambda x: -log(x),0.01,1)

# In[39]:

-log(0.01)

# In[40]:

-log(0.001)

# In[41]:

def point_cost(l,x,p,is_bmw): #1
    wrong = 1 - is_bmw
    return -log(abs(wrong - l(x,p)))

def logistic_cost(a,b,c):
    l = make_logistic(a,b,c)
    errors = [point_cost(l,x,p,is_bmw) #2
              for x,p,is_bmw in scaled_car_data]
    return sum(errors)

# ## 15.4.3 Testing different logistic functions

# In[42]:

def plot_line(acoeff,bcoeff,ccoeff,**kwargs):
    a,b,c = acoeff, bcoeff, ccoeff
    # black by default
    if 'c' not in kwargs:
        kwargs['c'] = 'k'
    if b == 0:
        plt.plot([c/a,c/a],[0,1],**kwargs)
    else:
        def y(x):
            return (c-a*x)/b
        plt.plot([0,1],[y(0),y(1)],**kwargs)

# In[43]:

plot_data(scaled_car_data)
plot_line(0.35,1,0.56)
plot_line(1,1,1)

# In[44]:

logistic_cost(0.35,1,0.56)

# In[45]:

logistic_cost(1,1,1)

# ## 15.4.4 Exercises

# **Exercise:** Implement the function `plot_line(a,b,c)` referenced in the chapter that plots the line $ax + by = c$ where $0 \leq x \leq 1$ and $0 \leq y \leq 1$.

# **Solution:** (see the implementation of `plot_line` above)

# **Mini-project:** What does the graph of $k(x,y) = \sigma(x^2 + y^2 - 1)$ look like? What does the decision boundary look like, meaning the set of points where $k(x,y) = 0.5$?

# In[46]:

def k(x,y):
    return sigmoid(x**2 + y**2 - 1)

plot_scalar_field(k,-3,3,-3,3)

# **Mini-project:** Two equations $2x + y = 1$ and $4x + 2y = 2$ define the same line, and therefore the same decision boundary. Are the logistic functions $\sigma(2x + y - 1)$ and $\sigma(4x + 2y - 2)$ the same?
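# A quick numeric comparison (ours, not from the chapter) already suggests the answer: the two functions agree on the shared line, where both give $0.5$, but $\sigma(4x + 2y - 2)$ is steeper away from it. The surface plots below make the same point visually.

# In[ ]:

g1 = lambda x,y: sigmoid(2*x + y - 1)
g2 = lambda x,y: sigmoid(4*x + 2*y - 2)
print(g1(0.5, 0.0), g2(0.5, 0.0))   # on the line 2x + y = 1: both equal 0.5
print(g1(1.0, 1.0), g2(1.0, 1.0))   # off the line: g2 is closer to 1, i.e. steeper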
# In[47]:

plot_scalar_field(lambda x,y: sigmoid(2*x+y-1),-3,3,-3,3)

# In[48]:

plot_scalar_field(lambda x,y: sigmoid(4*x+2*y-2),-3,3,-3,3)

# # 15.5 Finding the best logistic function

# ## 15.5.1 Gradient descent in three dimensions

# In[49]:

from math import sqrt

def length(v):
    return sqrt(sum([vi*vi for vi in v]))

def secant_slope(f,xmin,xmax):
    return (f(xmax) - f(xmin)) / (xmax - xmin)

def approx_derivative(f,x,dx=1e-6):
    return secant_slope(f,x-dx,x+dx)

# In[50]:

def approx_gradient(f,x0,y0,dx=1e-6):
    partial_x = approx_derivative(lambda x:f(x,y0),x0,dx=dx)
    partial_y = approx_derivative(lambda y:f(x0,y),y0,dx=dx)
    return (partial_x,partial_y)

# In[51]:

def approx_gradient3(f,x0,y0,z0,dx=1e-6):
    partial_x = approx_derivative(lambda x:f(x,y0,z0),x0,dx=dx)
    partial_y = approx_derivative(lambda y:f(x0,y,z0),y0,dx=dx)
    partial_z = approx_derivative(lambda z:f(x0,y0,z),z0,dx=dx)
    return (partial_x,partial_y,partial_z)

# In[52]:

def gradient_descent3(f,xstart,ystart,zstart,
                      tolerance=1e-6,max_steps=1000):
    x = xstart
    y = ystart
    z = zstart
    grad = approx_gradient3(f,x,y,z)
    steps = 0
    while length(grad) > tolerance and steps < max_steps:
        x -= 0.01 * grad[0]
        y -= 0.01 * grad[1]
        z -= 0.01 * grad[2]
        grad = approx_gradient3(f,x,y,z)
        steps += 1
    return x,y,z

# ## 15.5.2 Using gradient descent to find the best fit

# In[53]:

gradient_descent3(logistic_cost,1,1,1,max_steps=100)

# In[54]:

gradient_descent3(logistic_cost,1,1,1,max_steps=200)

# To make a cool graph of the decision boundary moving as gradient descent runs:

# In[55]:

plot_data(scaled_car_data)
for i in range(0,1000,100):
    a,b,c = gradient_descent3(logistic_cost,1,1,1,max_steps=i)
    plot_line(a,b,c,alpha=i/1000,c='k')

# In[56]:

gradient_descent3(logistic_cost,1,1,1,max_steps=8000)

# In[57]:

plot_data(scaled_car_data)
plot_line(0.35,1,0.56)
plot_line(3.7167003153580045, 11.422062409195114, 5.596878367305919)

# ## 15.5.3 Testing and understanding the best logistic classifier

# In[58]:

def best_logistic_classifier(x,p):
    l = make_logistic(3.7167003153580045, 11.422062409195114, 5.596878367305919)
    if l(x,p) > 0.5:
        return 1
    else:
        return 0

# In[59]:

test_classifier(best_logistic_classifier,scaled_car_data)

# In[60]:

plot_scalar_field(make_logistic(0.35,1,0.56), -2, 2, -2, 2)

# In[61]:

plot_scalar_field(make_logistic(3.7167003153580045, 11.422062409195114, 5.596878367305919), -2, 2, -2, 2)

# ## 15.5.4 Exercises

# **Exercise:** Modify the `gradient_descent3` function to print out the total number of steps taken before it returns its result. How many steps does the gradient descent take to converge for `logistic_cost`?

# In[62]:

def gradient_descent3(f,xstart,ystart,zstart,tolerance=1e-6,max_steps=1000):
    x = xstart
    y = ystart
    z = zstart
    grad = approx_gradient3(f,x,y,z)
    steps = 0
    while length(grad) > tolerance and steps < max_steps:
        x -= 0.01 * grad[0]
        y -= 0.01 * grad[1]
        z -= 0.01 * grad[2]
        grad = approx_gradient3(f,x,y,z)
        steps += 1
    print(steps)
    return x,y,z

# In[63]:

gradient_descent3(logistic_cost,1,1,1,max_steps=8000)
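# Before generalizing to $n$ dimensions in the mini-project below, it can help to sanity-check `approx_gradient3` against a function whose gradient is known exactly. A small check of ours, not from the chapter, using $f(x,y,z) = x^2 + 3y + z^3$ with gradient $(2x,\, 3,\, 3z^2)$:

# In[ ]:

# Compare the numerical gradient with the exact gradient at the point (1, 2, 3);
# the two should agree to roughly six decimal places.
numeric = approx_gradient3(lambda x,y,z: x**2 + 3*y + z**3, 1, 2, 3)
exact = (2*1, 3, 3*3**2)   # (2, 3, 27)
print(numeric, exact)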
# **Mini-project:** Write an `approx_gradient` function that can calculate the gradient of a function in any number of dimensions. Then write a `gradient_descent` function that works in any number of dimensions. To test your `gradient_descent` on an $n$-dimensional function, you can try a function like $f(x_1,x_2,\ldots,x_n) = (x_1 - 1)^2 + (x_2 - 1)^2 + \ldots + (x_n - 1)^2$, where $x_1, x_2, \ldots, x_n$ are the $n$ input variables to the function $f$. The minimum of this function should be $(1,1,\ldots,1)$, an $n$-dimensional vector with 1 in every entry.

# In[64]:

def partial_derivative(f,i,v,**kwargs):
    def cross_section(x):
        arg = [(vj if j != i else x) for j,vj in enumerate(v)]
        return f(*arg)
    return approx_derivative(cross_section, v[i], **kwargs)

# In[65]:

def approx_gradient(f,v,dx=1e-6):
    return [partial_derivative(f,i,v,dx=dx) for i in range(0,len(v))]

# In[66]:

def gradient_descent(f,vstart,tolerance=1e-6,max_steps=1000):
    v = vstart
    grad = approx_gradient(f,v)
    steps = 0
    while length(grad) > tolerance and steps < max_steps:
        v = [(vi - 0.01 * dvi) for vi,dvi in zip(v,grad)]
        grad = approx_gradient(f,v)
        steps += 1
    return v

# In[67]:

def sum_squares(*v):
    return sum([(x-1)**2 for x in v])

# In[68]:

v = [2,2,2,2,2]
gradient_descent(sum_squares,v)
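# As a closing check (ours, not part of the chapter), the general-purpose `gradient_descent` can also refit the logistic classifier: `partial_derivative` calls `f(*arg)`, and `logistic_cost` already takes its three coefficients as separate arguments, so it can be passed in directly. Starting from the same point with the same step count, it should land very close to the `gradient_descent3` result.

# In[ ]:

# Refit the logistic model with the n-dimensional gradient_descent.
a, b, c = gradient_descent(logistic_cost, [1, 1, 1], max_steps=200)
print(a, b, c)   # compare with gradient_descent3(logistic_cost,1,1,1,max_steps=200)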