#!/usr/bin/env python
# coding: utf-8

# # CHAPTER 1: Predicting student admissions with Logistic Regression

# In[1]:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# # PART 1 - Data Handling

# In[2]:

# Importing the data with pandas
data = pd.read_csv("dataset_admissions.csv")

# In[3]:

# Showing an overview of the data
data.head()

# In[4]:

# The shape property returns a tuple representing the dimensionality of the DataFrame,
# in the format (rows, columns)
data.shape

# In[5]:

# describe() shows basic statistics (count, mean, std, percentiles, etc.) for the numeric columns
data.describe()

# In[6]:

data.std()  # std() returns the standard deviation of each numeric column
# Variance is the average of the squared deviations from the mean;
# the standard deviation is the square root of the variance.

# In[7]:

# The pandas crosstab function builds a cross-tabulation table showing the frequency
# with which certain groups of data appear.
pd.crosstab(data['admit'], data['rank'], rownames=['admitted'])

# In[9]:

data.hist(color="pink")
plt.show()

# In[10]:

dummy_rank = pd.get_dummies(data['rank'], prefix="rank")  # converts the categorical variable (1, 2, 3, 4) into binary dummy columns
dummy_rank.head()

# In[11]:

columns_to_keep = ['admit', 'gre', 'gpa']
data = data[columns_to_keep].join(dummy_rank[['rank_1', 'rank_2', 'rank_3', 'rank_4']])
data.head()

# In[12]:

# Defining X and Y
X = data.drop('admit', axis=1)  # in pandas, axis=1 refers to columns, so this drops the 'admit' column
X

# In[13]:

Y = data['admit']
Y

# # PART 2 - Data Analysis

# In[14]:

from sklearn.model_selection import train_test_split

# In[15]:

X_train, X_test, Y_train, Y_real = train_test_split(X, Y, test_size=0.2)  # 20% for testing, 80% for training

# In[16]:

from sklearn.linear_model import LogisticRegression

# In[17]:

# For small datasets, 'liblinear' is a good choice, whereas 'sag' and 'saga' are faster for large ones.
log_reg = LogisticRegression(solver='liblinear')

# In[18]:

# Training the model
log_reg.fit(X_train, Y_train)

# In[19]:

# Saving the model's predictions on the test set in a variable
y_pred = log_reg.predict(X_test)

# In[20]:

y_pred

# ##### Observation:
#
# Note that one class dominates the other: the model predicts rejection far more often than admission, which biases it towards rejecting. I will still go through Part 3 to see the results; afterwards I will apply a resampling method and train a new model.
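# To quantify the imbalance noted above, one quick check (a minimal sketch, not part of the original notebook) is the relative frequency of each class in the target column using pandas' value_counts:

# In[ ]:

# Share of each class in 'admit'; values far from 0.5 indicate class imbalance
data['admit'].value_counts(normalize=True)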
# # PART 3 - Evaluation Analysis: Performance Measurement & K-Fold

# ## Performance Measurement

# #### a) Accuracy

# In[21]:

from sklearn import metrics

# In[22]:

# Accuracy = (true positives + true negatives) / (true positives + false positives + true negatives + false negatives)
accuracy = metrics.accuracy_score(Y_real, y_pred)
accuracy

# #### b) Precision

# In[23]:

# Precision = true positives / (true positives + false positives)
precision = metrics.precision_score(Y_real, y_pred)
precision

# #### c) Recall

# In[24]:

# Recall = true positives / (true positives + false negatives)
recall = metrics.recall_score(Y_real, y_pred)
recall

# #### d) Confusion matrix

# In[25]:

import seaborn as sns

# In[26]:

confusion_matrix = metrics.confusion_matrix(Y_real, y_pred)
confusion_matrix

# In[28]:

# In this run:
# 0,0: 53 => True Negatives
# 0,1: 0  => False Positives
# 1,0: 23 => False Negatives
# 1,1: 4  => True Positives
sns.heatmap(confusion_matrix, annot=True)

# #### e) AUC - ROC curve

# In[29]:

auc = metrics.roc_auc_score(Y_real, y_pred)  # as the documentation explains, the main parameters are y_true and y_score
auc

# ## K-Fold

# In[30]:

# Now using K-Fold cross-validation; cross_val_score refits the model on each fold
# rather than reusing the single train/test split above
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kf = KFold(n_splits=5, shuffle=True)
cv_r = cross_val_score(log_reg, X, Y, cv=kf)
np.mean(cv_r)

# # CHAPTER 2: Predicting student admissions with Logistic Regression, Decision Tree, SVM and Random Forest

# #### Observation:
#
# In Chapter 1, the data showed that one class dominates the other. In such a case, the model has a hard time learning to predict the minority class. Next, I will apply a resampling method and then use Logistic Regression again. After that, I will also use three other algorithms: Decision Tree, SVM and Random Forest. Finally, I will do the Evaluation Analysis (Performance Measurement & K-Fold).

# # PART 1 - Data Handling: Resampling

# A widely adopted technique for dealing with highly imbalanced datasets is resampling. There are two main ways to perform random resampling:
# * Oversampling: duplicating samples from the minority class.
# * Undersampling: deleting samples from the majority class.
#
# Random resampling creates a new, transformed version of the data with a new class distribution. The goal is to reduce the influence of the class imbalance on the ML algorithm. Oversampling is generally preferable, since undersampling can discard important data.

# In[82]:

from sklearn.utils import resample

# In[83]:

# Counting how many admissions are in the dataset
data[data['admit'] == 1].count()

# In[84]:

# Counting how many rejections are in the dataset
data[data['admit'] == 0].count()

# In[85]:

# Creating variables to store the two classes
majority = data[data['admit'] == 0]
minority = data[data['admit'] == 1]

# In[86]:

# Applying a resampling strategy (oversampling) to obtain more balanced data
minority_upsample = resample(minority, replace=True, n_samples=273, random_state=123)

# In[87]:

new_data = pd.concat([majority, minority_upsample])
new_data.describe()

# In[94]:

new_data.hist(color="green")
plt.show()

# In[95]:

# Creating X
X = new_data.drop("admit", axis=1)
X

# In[96]:

# Creating Y
Y = new_data["admit"]
Y

# # PART 2 - Data Analysis

# In[97]:

from sklearn.model_selection import train_test_split

# In[98]:

X_train, X_test, Y_train, Y_real = train_test_split(X, Y, test_size=0.2)  # test size will be 20% and train size will be 80%
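# Because train_test_split shuffles at random, the exact scores below will vary from run to run. A minimal variation (not used in the analysis below; the variable names are illustrative only) that fixes the seed and keeps the class proportions equal in both splits via the stratify parameter:

# In[ ]:

# Reproducible, stratified 80/20 split (example only, not fed into the models below)
X_tr_s, X_te_s, Y_tr_s, Y_te_s = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
X_tr_s.shape, X_te_s.shape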
# # Part 2.1. Logistic Regression

# In[99]:

from sklearn.linear_model import LogisticRegression

# #### LIBLINEAR -- A Library for Large Linear Classification
#
# The solvers implemented in the LogisticRegression class are "liblinear", "newton-cg", "lbfgs", "sag" and "saga". According to the scikit-learn documentation, the "liblinear" solver was the default for historical reasons before version 0.22; since then, the default is the "lbfgs" algorithm.

# In[100]:

lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train, Y_train)

# In[101]:

y_pred = lr_model.predict(X_test)
y_pred

# ### Logistic Regression - Performance measurement

# #### a) Accuracy

# In[102]:

from sklearn import metrics

# In[103]:

acc_lg = metrics.accuracy_score(Y_real, y_pred)
acc_lg

# #### b) Precision

# In[104]:

pre_lg = metrics.precision_score(Y_real, y_pred)
pre_lg

# #### c) Recall

# In[105]:

rec_lg = metrics.recall_score(Y_real, y_pred)
rec_lg

# #### d) Confusion matrix

# In[106]:

cm_lg = metrics.confusion_matrix(Y_real, y_pred)
cm_lg

# In[107]:

import seaborn as sns
sns.heatmap(cm_lg, annot=True)  # plotting this model's confusion matrix

# #### e) AUC - ROC curve

# In[108]:

auc_lg = metrics.roc_auc_score(Y_real, y_pred)  # as the documentation explains, the main parameters are y_true and y_score
auc_lg

# ### Logistic Regression - KFold

# In[109]:

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# In[110]:

kf_lg = KFold(n_splits=5, shuffle=True)
cv_lg = cross_val_score(lr_model, X, Y, cv=kf_lg)
np.mean(cv_lg)

# # Part 2.2. Decision Tree

# In[111]:

from sklearn import tree
tree_model = tree.DecisionTreeClassifier(max_depth=3)

# In[112]:

tree_model.fit(X_train, Y_train)
y_pred_dt = tree_model.predict(X_test)  # predictions of the decision tree on the test set

# ### Decision Tree - Performance measurement

# #### a) Accuracy

# In[113]:

acc_dt = tree_model.score(X_test, Y_real)
acc_dt

# #### b) Precision

# In[114]:

pre_dt = metrics.precision_score(Y_real, y_pred_dt)
pre_dt

# #### c) Recall

# In[115]:

rec_dt = metrics.recall_score(Y_real, y_pred_dt)
rec_dt

# #### d) Confusion matrix

# In[116]:

cm_dt = metrics.confusion_matrix(Y_real, y_pred_dt)
cm_dt

# #### e) AUC - ROC curve

# In[117]:

auc_dt = metrics.roc_auc_score(Y_real, y_pred_dt)  # as the documentation explains, the main parameters are y_true and y_score
auc_dt

# ### Decision Tree - KFold

# In[118]:

kf_dt = KFold(n_splits=5, shuffle=True)
cv_dt = cross_val_score(tree_model, X, Y, cv=kf_dt)
np.mean(cv_dt)

# # Part 2.3. SVM - SVC

# In[119]:

from sklearn import svm

# In[120]:

model_SVC = svm.SVC(kernel="linear")
model_SVC

# In[121]:

model_SVC.fit(X_train, Y_train)

# In[122]:

Y_pred = model_SVC.predict(X_test)
Y_pred

# ### SVM - SVC - Performance measurement

# #### a) Accuracy

# In[123]:

acc_svc = metrics.accuracy_score(Y_real, Y_pred)
acc_svc

# #### b) Precision

# In[124]:

pre_svc = metrics.precision_score(Y_real, Y_pred)
pre_svc

# #### c) Recall

# In[125]:

rec_svc = metrics.recall_score(Y_real, Y_pred)
rec_svc

# #### d) Confusion matrix

# In[126]:

cm_svc = metrics.confusion_matrix(Y_real, Y_pred)
cm_svc

# #### e) AUC - ROC curve

# In[127]:

auc_svc = metrics.roc_auc_score(Y_real, Y_pred)  # as the documentation explains, the main parameters are y_true and y_score
auc_svc

# ### SVM-SVC K-Fold

# In[128]:

kf_svm = KFold(n_splits=5, shuffle=True)
cv_svm = cross_val_score(model_SVC, X, Y, cv=kf_svm)
np.mean(cv_svm)
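# A note on the AUC subsections above: roc_auc_score is being given hard 0/1 predictions, so it summarizes a single operating point rather than the full curve. A minimal sketch (not part of the original analysis) of the ROC curve for the logistic regression model, using predicted probabilities:

# In[ ]:

# Probability of the positive class from the fitted logistic regression
y_score_lr = lr_model.predict_proba(X_test)[:, 1]

# False positive rate, true positive rate and thresholds along the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(Y_real, y_score_lr)

plt.plot(fpr, tpr, label=f"AUC = {metrics.roc_auc_score(Y_real, y_score_lr):.2f}")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()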
# # Part 2.4. Random Forest

# In[129]:

from sklearn.ensemble import RandomForestClassifier

# In[130]:

model_random_forest = RandomForestClassifier().fit(X_train, Y_train)
y_pred_random_forest = model_random_forest.predict(X_test)
y_pred_random_forest

# ### Random Forest - Performance measurement

# #### a) Accuracy

# In[131]:

acc_rf = metrics.accuracy_score(Y_real, y_pred_random_forest)
acc_rf

# #### b) Precision

# In[132]:

metrics.precision_score(Y_real, y_pred_random_forest)

# #### c) Recall

# In[133]:

metrics.recall_score(Y_real, y_pred_random_forest)

# #### d) Confusion matrix

# In[134]:

metrics.confusion_matrix(Y_real, y_pred_random_forest)

# #### e) AUC - ROC curve

# In[135]:

metrics.roc_auc_score(Y_real, y_pred_random_forest)

# ### Random Forest K-Fold

# In[136]:

kf_rf = KFold(n_splits=5, shuffle=True)
cv_rf = cross_val_score(model_random_forest, X, Y, cv=kf_rf)  # cross-validating the random forest model
np.mean(cv_rf)

# # PART 3 - Evaluation Analysis: Performance Measurement & K-Fold

# In Part 2 I did the performance measurement and K-Fold of each ML model. Now I'll compare the accuracy of the different models (accuracy comparison graph).

# In[142]:

plt.title("Accuracy Comparison Graph")
plt.ylabel("Accuracy Score")
plt.xlabel("Machine Learning Algorithms - 1. Logistic Regression / 2. Decision Tree / 3. SVM-SVC / 4. Random Forest")
x = [acc_lg, acc_dt, acc_svc, acc_rf]
plt.plot([1, 2, 3, 4], x, color="black")
plt.scatter(1, acc_lg, marker="o", color="pink", label="Logistic Regression")
plt.scatter(2, acc_dt, marker="o", color="green", label="Decision Tree")
plt.scatter(3, acc_svc, marker="o", color="red", label="SVM-SVC")
plt.scatter(4, acc_rf, marker="o", color="blue", label="Random Forest")
plt.legend()
plt.show()
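# Since the four algorithms are discrete categories rather than points along a continuum, a bar chart can be easier to read than a connected line. A minimal alternative sketch (not part of the original comparison) reusing the same accuracy variables:

# In[ ]:

names = ["Logistic Regression", "Decision Tree", "SVM-SVC", "Random Forest"]
scores = [acc_lg, acc_dt, acc_svc, acc_rf]

plt.title("Accuracy Comparison Graph")
plt.ylabel("Accuracy Score")
plt.bar(names, scores, color=["pink", "green", "red", "blue"])
plt.xticks(rotation=20)
plt.show()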