#!/usr/bin/env python
# coding: utf-8
# Notebook export: predict the MBTI-style cognitive function ("func") of survey
# respondents from their LIWC text-analysis features using XGBoost.

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    RepeatedStratifiedKFold,
)
# NOTE: duplicate `classification_report` import removed.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import umap.plot
from imblearn.under_sampling import RandomUnderSampler

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and REMOVED in 1.2,
# so the original import breaks on current sklearn. Keep the name available on
# both old and new versions with a thin shim over the replacement API.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:  # scikit-learn >= 1.2
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y, ax=None):
        """Shim matching the subset of the removed sklearn API used below."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y, ax=ax)


# In[101]: load the enriched survey dataframe.
df = pd.read_pickle("dataframe_survey_2018-01-23_enriched.pickle")
df

# In[102]: target — 4 cognitive functions, without their attitudinal
# directions introversion/extraversion.
df.func

# In[144]: the 64 LIWC feature columns used as predictors.
liwc_cols = ["negate", "ppron", "nonfl", "i", "relativ", "percept", "quant",
             "affect", "shehe", "achieve", "bio", "leisure", "conj", "motion",
             "posemo", "adverb", "home", "future", "negemo", "number", "inhib",
             "humans", "pronoun", "excl", "space", "tentat", "see", "past",
             "anx", "family", "present", "health", "verb", "certain", "anger",
             "preps", "swear", "ingest", "discrep", "friend", "relig", "time",
             "cause", "article", "body", "social", "assent", "work", "sexual",
             "insight", "ipron", "filler", "death", "funct", "sad", "you",
             "cogmech", "auxverb", "they", "incl", "money", "feel", "we",
             "hear"]

# .copy() avoids pandas' SettingWithCopyWarning when adding the target column
# to a column-slice of df.
data = df[liwc_cols].copy()
data["y"] = df.func
data = data.dropna()
data

# In[145]: split features/target.
print(len(data.columns))
# BUG FIX: the original used `data.iloc[:, 0:63]`, which silently dropped the
# 64th LIWC feature ("hear", positional index 63). Select by column name so the
# selection cannot drift out of sync with the feature list.
y = data[["y"]]          # kept as a one-column DataFrame, as in the original
X = data[liwc_cols]

# In[146]: fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2045
)

# In[159]-[170]: quick shape / class-balance inspection cells.
X_test.values.shape
y_test.values.ravel().shape
y_train.y.value_counts()
type(y_train.iloc[:, 0])

# In[171]: baseline model on the imbalanced data; pass a 1-D Series as the
# target to avoid shape warnings.
model = xgb.XGBClassifier()
model.fit(X_train, y_train.iloc[:, 0])

# In[175]: predictions for the held-out set (evaluated in the next section).
y_pred = model.predict(X_test)

# In[176]:
# `plot_confusion_matrix` was removed in scikit-learn 1.2; use the
# replacement API directly (local import keeps this section self-contained).
from sklearn.metrics import ConfusionMatrixDisplay

# In[176]-[178]: evaluate the baseline (imbalanced) model.
accuracy = accuracy_score(y_test, y_pred)
accuracy

fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, ax=ax)
plt.show()


# # With balanced classes

# In[179]: undersample every class except the minority so all classes have
# equal support; fixed seed for reproducibility.
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=1)
data_balanced, balanced_y = rus.fit_resample(data, data['y'])

# In[180]-[182]: sanity-check the balanced class counts.
data_balanced
balanced_y.value_counts()
data_balanced.y.value_counts()

# In[183]: split features/target on the balanced frame.
print(len(data.columns))
# BUG FIX: `iloc[:, 0:63]` dropped the last LIWC feature ("hear"); select by
# column name instead (liwc_cols is defined in the feature-selection cell).
y = data_balanced[["y"]]
X = data_balanced[liwc_cols]

# In[184]: same seed as the imbalanced experiment for comparability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2045
)

# In[185]-[186]: confirm both splits are (roughly) balanced.
y_train.y.value_counts()
y_test.y.value_counts()

# In[188]: retrain on the balanced data (raw ndarray features, 1-D target).
model = xgb.XGBClassifier()
model.fit(X_train.values, y_train.iloc[:, 0])

# In[194]-[199]: evaluate the balanced model.
X_test
y_pred = model.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)
accuracy

fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay.from_estimator(model, X_test.values, y_test, ax=ax)
plt.show()

print(classification_report(y_test, y_pred))


# # With reduced dimensionality

# In[200]: embed the balanced features into 2-D with UMAP.
# NOTE(review): UMAP is fitted on the FULL feature matrix before the
# train/test split, so the test embedding leaks information from the training
# rows — acceptable for visualization, optimistic for the classifier below.
# Fit a single UMAP model and reuse it for both plotting and the embedding;
# the original fitted two separate models on identical data, doubling the work.
mapper = umap.UMAP().fit(X)
embedding = mapper.transform(X)

# In[201]-[203]: inspect and plot the embedding, colored by class.
embedding.shape
y.values.ravel()
umap.plot.points(mapper, labels=y.values.ravel())

# In[204]: split the 2-D embedding with the usual seed.
X_train, X_test, y_train, y_test = train_test_split(
    embedding, y, test_size=0.2, random_state=2045
)

# In[205]-[206]:
embedding.shape
y.values.shape

# In[207]: classifier on the 2-D embedding; 1-D target avoids the
# one-column-DataFrame shape warning the original triggered here.
model = xgb.XGBClassifier()
model.fit(X_train, y_train.iloc[:, 0])
y_pred = model.predict(X_test)
y_pred

# In[208]-[211]: evaluate the reduced-dimensionality model.
X_train.shape
X_test.shape

fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, ax=ax)
plt.show()

print(classification_report(y_test, y_pred))