#!/usr/bin/env python
# coding: utf-8

# # Simple example of logistic regression with scikit-learn

# In[1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# ### Read data 
# The data are taken from the [Wikipedia article on logistic regression](https://en.wikipedia.org/wiki/Logistic_regression)

# In[2]:


# Whitespace-separated table with two columns: 1. hours studied, 2. passed (0/1)
filename = "https://www.physi.uni-heidelberg.de/~reygers/lectures/2020/smipp/exam.txt"
# The separator is a regular expression, so it is written as a raw string:
# '\s' is not a valid Python escape sequence and a plain literal triggers a
# DeprecationWarning / SyntaxWarning on recent Python versions.
df = pd.read_csv(filename, engine='python', sep=r'\s+')


# In[3]:


# Feature: hours studied, turned into a single-column 2-D array of shape
# (n_samples, 1), as expected by the estimator's fit/predict methods.
x_tmp = df['hours_studied'].to_numpy()
x = x_tmp.reshape(-1, 1)
# Target: the 0/1 "passed" flag, kept one-dimensional.
y = df['passed'].to_numpy()


# ### Fit the model

# In[37]:


from sklearn.linear_model import LogisticRegression

# Plain (unregularised) logistic regression with an intercept term.
# The string 'none' was deprecated in scikit-learn 1.2 and removed in 1.4;
# `penalty=None` is the supported way to switch regularisation off
# (requires scikit-learn >= 1.2).
clf = LogisticRegression(penalty=None, fit_intercept=True)
# Trailing semicolon suppresses the estimator repr in the notebook output.
clf.fit(x, y);


# ### Calculate predictions

# In[38]:


# Evaluate the fitted model on a fine grid of 1000 points between 0 and 6 hours.
hours_studied_tmp = np.linspace(0., 6., num=1000)
# Same single-column layout as the training feature matrix.
hours_studied = hours_studied_tmp.reshape(-1, 1)
# predict_proba yields one column per class (ordered as in clf.classes_).
y_pred = clf.predict_proba(hours_studied)


# ### Plot result

# In[39]:


# Scatter the observed outcomes, then overlay the fitted probability curve
# (second column of y_pred) on the same axes.
ax = df.plot.scatter(x='hours_studied', y='passed')
ax.plot(hours_studied, y_pred[:, 1])
# Replace the default column-name labels with descriptive ones.
ax.set_xlabel("preparation time in hours", fontsize=14)
ax.set_ylabel("probability of passing exam", fontsize=14)
# Write the figure that owns these axes to disk.
ax.figure.savefig("logistic_regression.pdf")


# In[40]:


# Display the parameter settings of the fitted estimator; as the last
# expression of the cell, the returned value is shown as the cell output.
clf.get_params()


# In[41]:


# Report the fitted slope and intercept of the logistic model.
for label, value in (
    ('Coefficient: ', clf.coef_),
    ('Intercept: ', clf.intercept_),
):
    print(label, value)