#!/usr/bin/env python
# coding: utf-8

# # CHAPTER 1: Predicting student admissions with Logistic Regression

# In[1]:

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# # PART 1 - Data Handling

# In[2]:

# Importing the data with pandas
data = pd.read_csv("dataset_admissions.csv")

# In[3]:

# Showing an overview of the data
data.head()

# In[4]:

# The shape property returns a tuple representing the dimensionality of the DataFrame,
# in the format (rows, columns)
data.shape

# In[5]:

# describe() shows basic statistics (count, mean, std, percentiles, etc.) for the numeric columns
data.describe()

# In[6]:

data.std()  # std() returns the standard deviation of each numeric column
# Variance is the average of the squared deviations from the mean;
# the standard deviation is the square root of the variance.

# In[7]:

# The pandas crosstab function builds a cross-tabulation table showing the frequency
# with which certain groups of data appear.
pd.crosstab(data['admit'], data['rank'], rownames=['admitted'])

# In[9]:

data.hist(color="pink")
plt.show()

# In[10]:

dummy_rank = pd.get_dummies(data['rank'], prefix="rank")  # converts the categorical variable (1, 2, 3, 4) into binary dummy columns
dummy_rank.head()

# In[11]:

columns_to_keep = ['admit', 'gre', 'gpa']
data = data[columns_to_keep].join(dummy_rank[['rank_1', 'rank_2', 'rank_3', 'rank_4']])
data.head()

# In[12]:

# Defining X and Y
X = data.drop('admit', axis=1)  # in pandas, axis=1 refers to columns, so this drops the 'admit' column
X

# In[13]:

Y = data['admit']
Y

# # PART 2 - Data Analysis

# In[14]:

from sklearn.model_selection import train_test_split

# In[15]:

X_train, X_test, Y_train, Y_real = train_test_split(X, Y, test_size=0.2)  # 20% for testing, 80% for training

# In[16]:

from sklearn.linear_model import LogisticRegression

# In[17]:

# For small datasets, 'liblinear' is a good choice, whereas 'sag' and 'saga' are faster for large ones.
log_reg = LogisticRegression(solver='liblinear')

# In[18]:

# Training the model
log_reg.fit(X_train, Y_train)

# In[19]:

# Saving the model's predictions on the test set in a variable
y_pred = log_reg.predict(X_test)

# In[20]:

y_pred

# ##### Observation:
#
# Note that one class dominates the other: the model predicts rejection far more often than admission, which biases it towards rejecting. I will still go through Part 3 to see the results; afterwards I will apply a resampling method and train a new model.
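# To quantify the imbalance noted above, one quick check (a minimal sketch, not part of the original notebook) is the relative frequency of each class in the target column using pandas' value_counts:

# In[ ]:

# Share of each class in 'admit'; values far from 0.5 indicate class imbalance
data['admit'].value_counts(normalize=True)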
# # PART 3 - Evaluation Analysis: Performance Measurement & K-Fold

# ## Performance Measurement

# #### a) Accuracy

# In[21]:

from sklearn import metrics

# In[22]:

# Accuracy = (true positives + true negatives) / (true positives + false positives + true negatives + false negatives)
accuracy = metrics.accuracy_score(Y_real, y_pred)
accuracy

# #### b) Precision

# In[23]:

# Precision = true positives / (true positives + false positives)
precision = metrics.precision_score(Y_real, y_pred)
precision

# #### c) Recall

# In[24]:

# Recall = true positives / (true positives + false negatives)
recall = metrics.recall_score(Y_real, y_pred)
recall

# #### d) Confusion matrix

# In[25]:

import seaborn as sns

# In[26]:

confusion_matrix = metrics.confusion_matrix(Y_real, y_pred)
confusion_matrix

# In[28]:

# In this run:
# 0,0: 53 => True Negatives
# 0,1: 0  => False Positives
# 1,0: 23 => False Negatives
# 1,1: 4  => True Positives
sns.heatmap(confusion_matrix, annot=True)

# #### e) AUC - ROC curve

# In[29]:

auc = metrics.roc_auc_score(Y_real, y_pred)  # as the documentation explains, the main parameters are y_true and y_score
auc

# ## K-Fold

# In[30]:

# Now using K-Fold cross-validation; cross_val_score refits the model on each fold
# rather than reusing the single train/test split above
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kf = KFold(n_splits=5, shuffle=True)
cv_r = cross_val_score(log_reg, X, Y, cv=kf)
np.mean(cv_r)

# # CHAPTER 2: Predicting student admissions with Logistic Regression, Decision Tree, SVM and Random Forest

# #### Observation:
#
# In Chapter 1, the data showed that one class dominates the other. In such a case, the model has a hard time learning to predict the minority class. Next, I will apply a resampling method and then use Logistic Regression again. After that, I will also use three other algorithms: Decision Tree, SVM and Random Forest. Finally, I will do the Evaluation Analysis (Performance Measurement & K-Fold).

# # PART 1 - Data Handling: Resampling

# A widely adopted technique for dealing with highly imbalanced datasets is resampling. There are two main ways to perform random resampling:
# * Oversampling: duplicating samples from the minority class.
# * Undersampling: deleting samples from the majority class.
#
# Random resampling creates a new, transformed version of the data with a new class distribution. The goal is to reduce the influence of the class imbalance on the ML algorithm. Oversampling is generally preferable, since undersampling can discard important data.

# In[82]:

from sklearn.utils import resample

# In[83]:

# Counting how many admissions are in the dataset
data[data['admit'] == 1].count()

# In[84]:

# Counting how many rejections are in the dataset
data[data['admit'] == 0].count()

# In[85]:

# Creating variables to store the two classes
majority = data[data['admit'] == 0]
minority = data[data['admit'] == 1]

# In[86]:

# Applying a resampling strategy (oversampling) to obtain more balanced data
minority_upsample = resample(minority, replace=True, n_samples=273, random_state=123)

# In[87]:

new_data = pd.concat([majority, minority_upsample])
new_data.describe()

# In[94]:

new_data.hist(color="green")
plt.show()

# In[95]:

# Creating X
X = new_data.drop("admit", axis=1)
X

# In[96]:

# Creating Y
Y = new_data["admit"]
Y

# # PART 2 - Data Analysis

# In[97]:

from sklearn.model_selection import train_test_split

# In[98]:

X_train, X_test, Y_train, Y_real = train_test_split(X, Y, test_size=0.2)  # test size will be 20% and train size will be 80%
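# Because train_test_split shuffles at random, the exact scores below will vary from run to run. A minimal variation (not used in the analysis below; the variable names are illustrative only) that fixes the seed and keeps the class proportions equal in both splits via the stratify parameter:

# In[ ]:

# Reproducible, stratified 80/20 split (example only, not fed into the models below)
X_tr_s, X_te_s, Y_tr_s, Y_te_s = train_test_split(X, Y, test_size=0.2, random_state=42, stratify=Y)
X_tr_s.shape, X_te_s.shape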
# # Part 2.1. Logistic Regression

# In[99]:

from sklearn.linear_model import LogisticRegression

# #### LIBLINEAR -- A Library for Large Linear Classification
#
# The solvers implemented in the LogisticRegression class are "liblinear", "newton-cg", "lbfgs", "sag" and "saga". According to the scikit-learn documentation, the "liblinear" solver was the default for historical reasons before version 0.22; since then, the default is the "lbfgs" algorithm.

# In[100]:

lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train, Y_train)

# In[101]:

y_pred = lr_model.predict(X_test)
y_pred

# ### Logistic Regression - Performance measurement

# #### a) Accuracy

# In[102]:

from sklearn import metrics

# In[103]:

acc_lg = metrics.accuracy_score(Y_real, y_pred)
acc_lg

# #### b) Precision

# In[104]:

pre_lg = metrics.precision_score(Y_real, y_pred)
pre_lg

# #### c) Recall

# In[105]:

rec_lg = metrics.recall_score(Y_real, y_pred)
rec_lg

# #### d) Confusion matrix

# In[106]:

cm_lg = metrics.confusion_matrix(Y_real, y_pred)
cm_lg

# In[107]:

import seaborn as sns
sns.heatmap(cm_lg, annot=True)  # plotting this model's confusion matrix

# #### e) AUC - ROC curve

# In[108]:

auc_lg = metrics.roc_auc_score(Y_real, y_pred)  # as the documentation explains, the main parameters are y_true and y_score
auc_lg

# ### Logistic Regression - KFold

# In[109]:

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# In[110]:

kf_lg = KFold(n_splits=5, shuffle=True)
cv_lg = cross_val_score(lr_model, X, Y, cv=kf_lg)
np.mean(cv_lg)

# # Part 2.2. Decision Tree

# In[111]:

from sklearn import tree
tree_model = tree.DecisionTreeClassifier(max_depth=3)

# In[112]:

tree_model.fit(X_train, Y_train)
y_pred_dt = tree_model.predict(X_test)  # predictions of the decision tree on the test set

# ### Decision Tree - Performance measurement

# #### a) Accuracy

# In[113]:

acc_dt = tree_model.score(X_test, Y_real)
acc_dt

# #### b) Precision

# In[114]:

pre_dt = metrics.precision_score(Y_real, y_pred_dt)
pre_dt

# #### c) Recall

# In[115]:

rec_dt = metrics.recall_score(Y_real, y_pred_dt)
rec_dt

# #### d) Confusion matrix

# In[116]:

cm_dt = metrics.confusion_matrix(Y_real, y_pred_dt)
cm_dt

# #### e) AUC - ROC curve

# In[117]:

auc_dt = metrics.roc_auc_score(Y_real, y_pred_dt)  # as the documentation explains, the main parameters are y_true and y_score
auc_dt

# ### Decision Tree - KFold

# In[118]:

kf_dt = KFold(n_splits=5, shuffle=True)
cv_dt = cross_val_score(tree_model, X, Y, cv=kf_dt)
np.mean(cv_dt)

# # Part 2.3. SVM - SVC

# In[119]:

from sklearn import svm

# In[120]:

model_SVC = svm.SVC(kernel="linear")
model_SVC

# In[121]:

model_SVC.fit(X_train, Y_train)

# In[122]:

Y_pred = model_SVC.predict(X_test)
Y_pred

# ### SVM - SVC - Performance measurement

# #### a) Accuracy

# In[123]:

acc_svc = metrics.accuracy_score(Y_real, Y_pred)
acc_svc

# #### b) Precision

# In[124]:

pre_svc = metrics.precision_score(Y_real, Y_pred)
pre_svc

# #### c) Recall

# In[125]:

rec_svc = metrics.recall_score(Y_real, Y_pred)
rec_svc

# #### d) Confusion matrix

# In[126]:

cm_svc = metrics.confusion_matrix(Y_real, Y_pred)
cm_svc

# #### e) AUC - ROC curve

# In[127]:

auc_svc = metrics.roc_auc_score(Y_real, Y_pred)  # as the documentation explains, the main parameters are y_true and y_score
auc_svc

# ### SVM-SVC K-Fold

# In[128]:

kf_svm = KFold(n_splits=5, shuffle=True)
cv_svm = cross_val_score(model_SVC, X, Y, cv=kf_svm)
np.mean(cv_svm)
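# A note on the AUC subsections above: roc_auc_score is being given hard 0/1 predictions, so it summarizes a single operating point rather than the full curve. A minimal sketch (not part of the original analysis) of the ROC curve for the logistic regression model, using predicted probabilities:

# In[ ]:

# Probability of the positive class from the fitted logistic regression
y_score_lr = lr_model.predict_proba(X_test)[:, 1]

# False positive rate, true positive rate and thresholds along the ROC curve
fpr, tpr, thresholds = metrics.roc_curve(Y_real, y_score_lr)

plt.plot(fpr, tpr, label=f"AUC = {metrics.roc_auc_score(Y_real, y_score_lr):.2f}")
plt.plot([0, 1], [0, 1], linestyle="--", color="gray")  # chance line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.show()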
# # Part 2.4. Random Forest

# In[129]:

from sklearn.ensemble import RandomForestClassifier

# In[130]:

model_random_forest = RandomForestClassifier().fit(X_train, Y_train)
y_pred_random_forest = model_random_forest.predict(X_test)
y_pred_random_forest

# ### Random Forest - Performance measurement

# #### a) Accuracy

# In[131]:

acc_rf = metrics.accuracy_score(Y_real, y_pred_random_forest)
acc_rf

# #### b) Precision

# In[132]:

metrics.precision_score(Y_real, y_pred_random_forest)

# #### c) Recall

# In[133]:

metrics.recall_score(Y_real, y_pred_random_forest)

# #### d) Confusion matrix

# In[134]:

metrics.confusion_matrix(Y_real, y_pred_random_forest)

# #### e) AUC - ROC curve

# In[135]:

metrics.roc_auc_score(Y_real, y_pred_random_forest)

# ### Random Forest K-Fold

# In[136]:

kf_rf = KFold(n_splits=5, shuffle=True)
cv_rf = cross_val_score(model_random_forest, X, Y, cv=kf_rf)  # cross-validating the random forest model
np.mean(cv_rf)

# # PART 3 - Evaluation Analysis: Performance Measurement & K-Fold

# In Part 2 I did the performance measurement and K-Fold of each ML model. Now I'll compare the accuracy of the different models (accuracy comparison graph).

# In[142]:

plt.title("Accuracy Comparison Graph")
plt.ylabel("Accuracy Score")
plt.xlabel("Machine Learning Algorithms - 1. Logistic Regression / 2. Decision Tree / 3. SVM-SVC / 4. Random Forest")
x = [acc_lg, acc_dt, acc_svc, acc_rf]
plt.plot([1, 2, 3, 4], x, color="black")
plt.scatter(1, acc_lg, marker="o", color="pink", label="Logistic Regression")
plt.scatter(2, acc_dt, marker="o", color="green", label="Decision Tree")
plt.scatter(3, acc_svc, marker="o", color="red", label="SVM-SVC")
plt.scatter(4, acc_rf, marker="o", color="blue", label="Random Forest")
plt.legend()
plt.show()
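# Since the four algorithms are discrete categories rather than points along a continuum, a bar chart can be easier to read than a connected line. A minimal alternative sketch (not part of the original comparison) reusing the same accuracy variables:

# In[ ]:

names = ["Logistic Regression", "Decision Tree", "SVM-SVC", "Random Forest"]
scores = [acc_lg, acc_dt, acc_svc, acc_rf]

plt.title("Accuracy Comparison Graph")
plt.ylabel("Accuracy Score")
plt.bar(names, scores, color=["pink", "green", "red", "blue"])
plt.xticks(rotation=20)
plt.show()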