#!/usr/bin/env python
# coding: utf-8
# Notebook export: predict the MBTI-style cognitive function ("func") of survey
# respondents from their LIWC text-analysis features using XGBoost.

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    RepeatedStratifiedKFold,
)
# NOTE: duplicate `classification_report` import removed.
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import umap.plot
from imblearn.under_sampling import RandomUnderSampler

# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and REMOVED in 1.2,
# so the original import breaks on current sklearn. Keep the name available on
# both old and new versions with a thin shim over the replacement API.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:  # scikit-learn >= 1.2
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y, ax=None):
        """Shim matching the subset of the removed sklearn API used below."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y, ax=ax)


# In[101]: load the enriched survey dataframe.
df = pd.read_pickle("dataframe_survey_2018-01-23_enriched.pickle")
df

# In[102]: target — 4 cognitive functions, without their attitudinal
# directions introversion/extraversion.
df.func

# In[144]: the 64 LIWC feature columns used as predictors.
liwc_cols = ["negate", "ppron", "nonfl", "i", "relativ", "percept", "quant",
             "affect", "shehe", "achieve", "bio", "leisure", "conj", "motion",
             "posemo", "adverb", "home", "future", "negemo", "number", "inhib",
             "humans", "pronoun", "excl", "space", "tentat", "see", "past",
             "anx", "family", "present", "health", "verb", "certain", "anger",
             "preps", "swear", "ingest", "discrep", "friend", "relig", "time",
             "cause", "article", "body", "social", "assent", "work", "sexual",
             "insight", "ipron", "filler", "death", "funct", "sad", "you",
             "cogmech", "auxverb", "they", "incl", "money", "feel", "we",
             "hear"]

# .copy() avoids pandas' SettingWithCopyWarning when adding the target column
# to a column-slice of df.
data = df[liwc_cols].copy()
data["y"] = df.func
data = data.dropna()
data

# In[145]: split features/target.
print(len(data.columns))
# BUG FIX: the original used `data.iloc[:, 0:63]`, which silently dropped the
# 64th LIWC feature ("hear", positional index 63). Select by column name so the
# selection cannot drift out of sync with the feature list.
y = data[["y"]]          # kept as a one-column DataFrame, as in the original
X = data[liwc_cols]

# In[146]: fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2045
)

# In[159]-[170]: quick shape / class-balance inspection cells.
X_test.values.shape
y_test.values.ravel().shape
y_train.y.value_counts()
type(y_train.iloc[:, 0])

# In[171]: baseline model on the imbalanced data; pass a 1-D Series as the
# target to avoid shape warnings.
model = xgb.XGBClassifier()
model.fit(X_train, y_train.iloc[:, 0])

# In[175]: predictions for the held-out set (evaluated in the next section).
y_pred = model.predict(X_test)

# In[176]:
# `plot_confusion_matrix` was removed in scikit-learn 1.2; use the
# replacement API directly (local import keeps this section self-contained).
from sklearn.metrics import ConfusionMatrixDisplay

# In[176]-[178]: evaluate the baseline (imbalanced) model.
accuracy = accuracy_score(y_test, y_pred)
accuracy

fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, ax=ax)
plt.show()


# # With balanced classes

# In[179]: undersample every class except the minority so all classes have
# equal support; fixed seed for reproducibility.
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=1)
data_balanced, balanced_y = rus.fit_resample(data, data['y'])

# In[180]-[182]: sanity-check the balanced class counts.
data_balanced
balanced_y.value_counts()
data_balanced.y.value_counts()

# In[183]: split features/target on the balanced frame.
print(len(data.columns))
# BUG FIX: `iloc[:, 0:63]` dropped the last LIWC feature ("hear"); select by
# column name instead (liwc_cols is defined in the feature-selection cell).
y = data_balanced[["y"]]
X = data_balanced[liwc_cols]

# In[184]: same seed as the imbalanced experiment for comparability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2045
)

# In[185]-[186]: confirm both splits are (roughly) balanced.
y_train.y.value_counts()
y_test.y.value_counts()

# In[188]: retrain on the balanced data (raw ndarray features, 1-D target).
model = xgb.XGBClassifier()
model.fit(X_train.values, y_train.iloc[:, 0])

# In[194]-[199]: evaluate the balanced model.
X_test
y_pred = model.predict(X_test.values)

accuracy = accuracy_score(y_test, y_pred)
accuracy

fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay.from_estimator(model, X_test.values, y_test, ax=ax)
plt.show()

print(classification_report(y_test, y_pred))


# # With reduced dimensionality

# In[200]: embed the balanced features into 2-D with UMAP.
# NOTE(review): UMAP is fitted on the FULL feature matrix before the
# train/test split, so the test embedding leaks information from the training
# rows — acceptable for visualization, optimistic for the classifier below.
# Fit a single UMAP model and reuse it for both plotting and the embedding;
# the original fitted two separate models on identical data, doubling the work.
mapper = umap.UMAP().fit(X)
embedding = mapper.transform(X)

# In[201]-[203]: inspect and plot the embedding, colored by class.
embedding.shape
y.values.ravel()
umap.plot.points(mapper, labels=y.values.ravel())

# In[204]: split the 2-D embedding with the usual seed.
X_train, X_test, y_train, y_test = train_test_split(
    embedding, y, test_size=0.2, random_state=2045
)

# In[205]-[206]:
embedding.shape
y.values.shape

# In[207]: classifier on the 2-D embedding; 1-D target avoids the
# one-column-DataFrame shape warning the original triggered here.
model = xgb.XGBClassifier()
model.fit(X_train, y_train.iloc[:, 0])
y_pred = model.predict(X_test)
y_pred

# In[208]-[211]: evaluate the reduced-dimensionality model.
X_train.shape
X_test.shape

fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, ax=ax)
plt.show()

print(classification_report(y_test, y_pred))