#!/usr/bin/env python
# coding: utf-8

# # 05 - Logistic Regression
# by [Alejandro Correa Bahnsen](albahnsen.com/)
#
# version 0.2, May 2016
#
# ## Part of the class [Machine Learning for Risk Management](https://github.com/albahnsen/ML_RiskManagement)
#
# This notebook is licensed under a [Creative Commons Attribution-ShareAlike 3.0 Unported License](http://creativecommons.org/licenses/by-sa/3.0/deed.en_US). Special thanks go to [Kevin Markham](https://github.com/justmarkham).

# # Review: Predicting a Continuous Response

# In[1]:

import pandas as pd
import zipfile

with zipfile.ZipFile('../datasets/glass.csv.zip', 'r') as z:
    f = z.open('glass.csv')
    glass = pd.read_csv(f, sep=',', index_col=0)

glass.head()

# **Question:** Pretend that we want to predict **ri**, and our only feature is **al**. How could we do it using machine learning?
#
# **Answer:** We could frame it as a regression problem, and use a linear regression model with **al** as the only feature and **ri** as the response.
#
# **Question:** How would we **visualize** this model?
#
# **Answer:** Create a scatter plot with **al** on the x-axis and **ri** on the y-axis, and draw the line of best fit.

# In[2]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt

# In[3]:

# scatter plot using Pandas
glass.plot(kind='scatter', x='al', y='ri')

# In[4]:

# equivalent scatter plot using Matplotlib
plt.scatter(glass.al, glass.ri)
plt.xlabel('al')
plt.ylabel('ri')

# In[5]:

# fit a linear regression model
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
feature_cols = ['al']
X = glass[feature_cols]
y = glass.ri
linreg.fit(X, y)

# In[6]:

# make predictions for all values of X
glass['ri_pred'] = linreg.predict(X)
glass.head()

# In[7]:

# put the plots together
plt.scatter(glass.al, glass.ri)
plt.plot(glass.al, glass.ri_pred, color='red')
plt.xlabel('al')
plt.ylabel('ri')

# ### Refresher: interpreting linear regression coefficients
#
# Linear regression equation: $y = \beta_0 + \beta_1x$

# In[8]:

# compute prediction for al=2 using the equation
linreg.intercept_ + linreg.coef_ * 2

# In[9]:

# compute prediction for al=2 using the predict method
# (predict expects a 2-D array of samples, so we pass [[2]] rather than a bare scalar)
linreg.predict([[2]])

# In[10]:

# examine the coefficient for al
print(feature_cols, linreg.coef_)

# **Interpretation:** A 1 unit increase in 'al' is associated with a 0.0025 unit decrease in 'ri'.

# In[11]:

# increasing al by 1 (so that al=3) decreases ri by 0.0025
1.51699012 - 0.0024776063874696243

# In[12]:

# compute prediction for al=3 using the predict method
linreg.predict([[3]])

# # Predicting a Categorical Response

# In[13]:

# examine glass_type
glass.glass_type.value_counts().sort_index()

# In[14]:

# types 1, 2, 3 are window glass
# types 5, 6, 7 are household glass
glass['household'] = glass.glass_type.map({1: 0, 2: 0, 3: 0, 5: 1, 6: 1, 7: 1})
glass.head()
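# Before reframing this as a classification task, it can help to sanity-check the new binary response. The quick look below is an illustrative addition (not part of the original notebook); it only uses the `glass` DataFrame and the `household` column created above.

# In[ ]:

# how many observations fall in each household class,
# and does 'al' look different between the two classes?
print(glass.household.value_counts())
print(glass.groupby('household').al.mean())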
# Let's change our task, so that we're predicting **household** using **al**. Let's visualize the relationship to figure out how to do this:

# In[15]:

plt.scatter(glass.al, glass.household)
plt.xlabel('al')
plt.ylabel('household')

# Let's draw a **regression line**, like we did before:

# In[16]:

# fit a linear regression model and store the predictions
feature_cols = ['al']
X = glass[feature_cols]
y = glass.household
linreg.fit(X, y)
glass['household_pred'] = linreg.predict(X)

# In[17]:

# scatter plot that includes the regression line
plt.scatter(glass.al, glass.household)
plt.plot(glass.al, glass.household_pred, color='red')
plt.xlabel('al')
plt.ylabel('household')

# If **al=3**, what class do we predict for household? **1**
#
# If **al=1.5**, what class do we predict for household? **0**
#
# We predict the 0 class for **lower** values of al, and the 1 class for **higher** values of al. What's our cutoff value? Around **al=2**, because that's where the linear regression line crosses the midpoint between predicting class 0 and class 1.
#
# Therefore, we'll say that if **household_pred >= 0.5**, we predict a class of **1**; otherwise, we predict a class of **0**.
#
# ## $$h_\beta(x) = \beta_0 + \beta_1x_1 + \beta_2x_2 + ... + \beta_nx_n$$
#
# - $h_\beta(x)$ is the response
# - $\beta_0$ is the intercept
# - $\beta_1$ is the coefficient for $x_1$ (the first feature)
# - $\beta_n$ is the coefficient for $x_n$ (the nth feature)
#
# ### if $h_\beta(x) < 0.5$ then $\hat y = 0$
#
# ### if $h_\beta(x) \ge 0.5$ then $\hat y = 1$

# In[18]:

# understanding np.where
import numpy as np
nums = np.array([5, 15, 8])

# np.where returns the first value if the condition is True, and the second value if the condition is False
np.where(nums > 10, 'big', 'small')

# In[19]:

# transform household_pred to 1 or 0
glass['household_pred_class'] = np.where(glass.household_pred >= 0.5, 1, 0)
glass.head()

# In[20]:

# plot the class predictions
plt.scatter(glass.al, glass.household)
plt.plot(glass.al, glass.household_pred_class, color='red')
plt.xlabel('al')
plt.ylabel('household')

# With linear regression, $h_\beta(x)$ can be lower than 0 or higher than 1, which is counterintuitive for something we want to treat as a class probability.
#
# ## Using Logistic Regression Instead
#
# Logistic regression can do what we just did:

# In[21]:

# fit a logistic regression model and store the class predictions
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
feature_cols = ['al']
X = glass[feature_cols]
y = glass.household
logreg.fit(X, y)
glass['household_pred_class'] = logreg.predict(X)

# In[22]:

# plot the class predictions
plt.scatter(glass.al, glass.household)
plt.plot(glass.al, glass.household_pred_class, color='red')
plt.xlabel('al')
plt.ylabel('household')

# What if we wanted the **predicted probabilities** instead of just the **class predictions**, to understand how confident we are in a given prediction?

# In[23]:

# store the predicted probabilities of class 1
glass['household_pred_prob'] = logreg.predict_proba(X)[:, 1]

# In[24]:

# plot the predicted probabilities
plt.scatter(glass.al, glass.household)
plt.plot(glass.al, glass.household_pred_prob, color='red')
plt.xlabel('al')
plt.ylabel('household')

# In[25]:

# examine some example predictions
# (predict_proba expects a 2-D array of samples, so we pass [[1]] rather than a bare scalar)
print(logreg.predict_proba([[1]]))
print(logreg.predict_proba([[2]]))
print(logreg.predict_proba([[3]]))

# The first column indicates the predicted probability of **class 0**, and the second column indicates the predicted probability of **class 1**.
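# As an illustrative check (not one of the original cells): the two columns of `predict_proba` sum to 1 for every observation, and the class predictions correspond to a 0.5 cutoff on the class-1 probability. This uses the `logreg` model and feature matrix `X` defined above.

# In[ ]:

# each row of predict_proba is a probability distribution over the two classes
probs = logreg.predict_proba(X)
print(probs.sum(axis=1)[:5])          # should all be 1.0

# predict() matches thresholding the class-1 probability at 0.5
manual_pred = np.where(probs[:, 1] > 0.5, 1, 0)
print((manual_pred == logreg.predict(X)).all())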
# ## Probability, odds, e, log, log-odds
#
# $$probability = \frac {one\ outcome} {all\ outcomes}$$
#
# $$odds = \frac {one\ outcome} {all\ other\ outcomes}$$
#
# Examples:
#
# - Dice roll of 1: probability = 1/6, odds = 1/5
# - Even dice roll: probability = 3/6, odds = 3/3 = 1
# - Dice roll less than 5: probability = 4/6, odds = 4/2 = 2
#
# $$odds = \frac {probability} {1 - probability}$$
#
# $$probability = \frac {odds} {1 + odds}$$

# In[26]:

# create a table of probability versus odds
table = pd.DataFrame({'probability': [0.1, 0.2, 0.25, 0.5, 0.6, 0.8, 0.9]})
table['odds'] = table.probability / (1 - table.probability)
table

# What is **e**? It is the base rate of growth shared by all continuously growing processes:

# In[27]:

# exponential function: e^1
np.exp(1)

# What is a **(natural) log**? It gives you the time needed to reach a certain level of growth:

# In[28]:

# time needed to grow 1 unit to 2.718 units
np.log(2.718)

# It is also the **inverse** of the exponential function:

# In[29]:

np.log(np.exp(5))

# In[30]:

# add log-odds to the table
table['logodds'] = np.log(table.odds)
table

# ## What is Logistic Regression?
#
# **Linear regression:** a continuous response is modeled as a linear combination of the features:
#
# $$y = \beta_0 + \beta_1x$$
#
# **Logistic regression:** the log-odds of a categorical response being "true" (1) is modeled as a linear combination of the features:
#
# $$\log \left({p\over 1-p}\right) = \beta_0 + \beta_1x$$
#
# This is called the **logit function**.
#
# Probability is sometimes written as pi:
#
# $$\log \left({\pi\over 1-\pi}\right) = \beta_0 + \beta_1x$$
#
# The equation can be rearranged into the **logistic function**:
#
# $$\pi = \frac{e^{\beta_0 + \beta_1x}} {1 + e^{\beta_0 + \beta_1x}}$$
#
# In other words:
#
# - Logistic regression outputs the **probabilities of a specific class**
# - Those probabilities can be converted into **class predictions**
#
# The **logistic function** has some nice properties:
#
# - Takes on an "s" shape
# - Output is bounded by 0 and 1
#
# We have covered how this works for **binary classification problems** (two response classes). But what about **multi-class classification problems** (more than two response classes)?
#
# - The most common solution for classification models is **"one-vs-all"** (also known as **"one-vs-rest"**): decompose the problem into multiple binary classification problems
# - **Multinomial logistic regression** can solve this as a single problem

# ## Interpreting Logistic Regression Coefficients

# In[31]:

# plot the predicted probabilities again
plt.scatter(glass.al, glass.household)
plt.plot(glass.al, glass.household_pred_prob, color='red')
plt.xlabel('al')
plt.ylabel('household')

# In[32]:

# compute predicted log-odds for al=2 using the equation
logodds = logreg.intercept_ + logreg.coef_[0] * 2
logodds

# In[33]:

# convert log-odds to odds
odds = np.exp(logodds)
odds

# In[34]:

# convert odds to probability
prob = odds / (1 + odds)
prob

# In[35]:

# compute predicted probability for al=2 using the predict_proba method
logreg.predict_proba([[2]])[:, 1]

# In[36]:

# examine the coefficient for al
feature_cols, logreg.coef_[0]

# **Interpretation:** A 1 unit increase in 'al' is associated with a 4.18 unit increase in the log-odds of 'household'.
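# A common way to make this interpretation more tangible (an illustrative addition, not one of the original cells) is to exponentiate the coefficient: an additive change in the log-odds is a multiplicative change in the odds.

# In[ ]:

# exp(coefficient) is the odds ratio: each 1 unit increase in 'al'
# multiplies the odds of household=1 by roughly exp(4.18) ~= 65
np.exp(logreg.coef_[0])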
# In[37]:

# increasing al by 1 (so that al=3) increases the log-odds by 4.18
logodds = 0.64722323 + 4.1804038614510901
odds = np.exp(logodds)
prob = odds / (1 + odds)
prob

# In[38]:

# compute predicted probability for al=3 using the predict_proba method
logreg.predict_proba([[3]])[:, 1]

# **Bottom line:** Positive coefficients increase the log-odds of the response (and thus increase the probability), and negative coefficients decrease the log-odds of the response (and thus decrease the probability).

# In[39]:

# examine the intercept
logreg.intercept_

# **Interpretation:** For an 'al' value of 0, the log-odds of 'household' is -7.71.

# In[40]:

# convert log-odds to probability
logodds = logreg.intercept_
odds = np.exp(logodds)
prob = odds / (1 + odds)
prob

# That makes sense from the plot above, because the probability of household=1 should be very low for such a low 'al' value.
#
# ![Logistic regression beta values](images/logistic_betas.png)
#
# Changing the $\beta_0$ value shifts the curve **horizontally**, whereas changing the $\beta_1$ value changes the **slope** of the curve (see the sketch at the end of this notebook).
#
# ## Comparing Logistic Regression with Other Models
#
# Advantages of logistic regression:
#
# - Highly interpretable (if you remember how)
# - Model training and prediction are fast
# - No tuning is required (excluding regularization)
# - Features don't need scaling
# - Can perform well with a small number of observations
# - Outputs well-calibrated predicted probabilities
#
# Disadvantages of logistic regression:
#
# - Presumes a linear relationship between the features and the log-odds of the response
# - Performance is (generally) not competitive with the best supervised learning methods
# - Can't automatically learn feature interactions
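# To make the picture of $\beta_0$ and $\beta_1$ above concrete, here is a small illustrative sketch (not part of the original notebook) that plots the logistic function $\pi = \frac{e^{\beta_0 + \beta_1x}}{1 + e^{\beta_0 + \beta_1x}}$ for a few hand-picked coefficient values: changing $\beta_0$ shifts the curve horizontally, while changing $\beta_1$ changes its steepness.

# In[ ]:

# plot the logistic function for a few (beta_0, beta_1) pairs
x = np.linspace(-10, 10, 200)

def logistic(x, beta_0, beta_1):
    # the logistic (sigmoid) function from the equations above
    return np.exp(beta_0 + beta_1 * x) / (1 + np.exp(beta_0 + beta_1 * x))

for beta_0, beta_1 in [(0, 1), (2, 1), (-2, 1), (0, 3)]:
    plt.plot(x, logistic(x, beta_0, beta_1), label=r'$\beta_0$=%s, $\beta_1$=%s' % (beta_0, beta_1))
plt.xlabel('x')
plt.ylabel('predicted probability')
plt.legend()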