import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix  # removed in scikit-learn >= 1.2 (use ConfusionMatrixDisplay.from_estimator there)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import umap.plot
from imblearn.under_sampling import RandomUnderSampler
df = pd.read_pickle("dataframe_survey_2018-01-23_enriched.pickle")
df
  | url | typealyzer | actual | e | s | t | sntf_s | sntf_n | sntf_t | sntf_f | ... | sad | you | cogmech | auxverb | they | incl | money | feel | we | hear
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | http://adropofcolour.tumblr.com | ISFP | INFJ | 0.291281 | 0.787844 | 0.460961 | 0.663515 | 0.178565 | 0.069282 | 0.088638 | ... | 0.000000 | 0.019704 | 0.098522 | 0.147783 | 0.000000 | 0.039409 | 0.009852 | 0.019704 | 0.044335 | 0.009852 |
2 | http://godheadcomplex.tumblr.com | ESFP | INFP | 0.883579 | 0.951693 | 0.238407 | 0.855921 | 0.046931 | 0.021850 | 0.075297 | ... | 0.000000 | 0.017513 | 0.201401 | 0.084063 | 0.001751 | 0.056042 | 0.007005 | 0.017513 | 0.047285 | 0.003503 |
3 | http://chaotikaeon2.tumblr.com | INTJ | INTP | 0.332444 | 0.357863 | 0.591322 | 0.147668 | 0.252326 | 0.339831 | 0.260175 | ... | 0.003283 | 0.014540 | 0.181989 | 0.114916 | 0.000938 | 0.071295 | 0.010319 | 0.008912 | 0.054409 | 0.014540 |
5 | http://perpetually-in-transit.blogspot.com | ESFP | ENFJ | 0.944394 | 0.943192 | 0.105527 | 0.778825 | 0.051134 | 0.017299 | 0.152742 | ... | 0.002497 | 0.018727 | 0.207241 | 0.104869 | 0.002497 | 0.049938 | 0.014981 | 0.011236 | 0.041199 | 0.017478 |
10 | http://museofmystery.wordpress.com/2012/08/29/... | ISTP | INFP | 0.073352 | 0.850472 | 0.608812 | 0.628322 | 0.112762 | 0.149270 | 0.109646 | ... | 0.001031 | 0.005155 | 0.215464 | 0.122680 | 0.005155 | 0.043299 | 0.019588 | 0.002062 | 0.021649 | 0.012371 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
25432 | http://pistoche.tumblr.com | ESFP | INTJ | 0.685653 | 0.969891 | 0.480241 | 0.960824 | 0.029758 | 0.004220 | 0.005199 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25433 | http://lokh.tumblr.com | ISTP | INTP | 0.201637 | 0.553602 | 0.662618 | 0.468074 | 0.374926 | 0.099968 | 0.057033 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25435 | http://readerdye.tumblr.com | ISTP | INFP | 0.375704 | 0.756593 | 0.740688 | 0.697536 | 0.229456 | 0.051684 | 0.021324 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25436 | http://loveisart.tumblr.com | ISTP | ENFP | 0.002516 | 0.848823 | 0.661502 | 0.584138 | 0.118812 | 0.192779 | 0.104271 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25437 | http://angelalll.tumblr.com | ESTP | INFP | 0.814616 | 0.652280 | 0.832608 | 0.518149 | 0.281291 | 0.163392 | 0.037168 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
22919 rows × 115 columns
df.func  # the four cognitive functions (n/s/t/f) without their attitudinal direction (introverted/extraverted)
1        f
2        t
3        f
5        f
10       t
        ..
25432    f
25433    f
25435    t
25436    f
25437    n
Name: func, Length: 22919, dtype: object
liwc_cols = ["negate","ppron","nonfl","i","relativ","percept","quant","affect","shehe","achieve","bio","leisure","conj","motion","posemo","adverb","home","future","negemo","number","inhib","humans","pronoun","excl","space","tentat","see","past","anx","family","present","health","verb","certain","anger","preps","swear","ingest","discrep","friend","relig","time","cause","article","body","social","assent","work","sexual","insight","ipron","filler","death","funct","sad","you","cogmech","auxverb","they","incl","money","feel","we","hear"]
data = df[liwc_cols].copy()  # .copy() so adding the target column below does not trigger SettingWithCopyWarning
data["y"] = df.func
data = data.dropna()
data
  | negate | ppron | nonfl | i | relativ | percept | quant | affect | shehe | achieve | ... | you | cogmech | auxverb | they | incl | money | feel | we | hear | y
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.034483 | 0.428571 | 0.049261 | 0.334975 | 0.197044 | 0.039409 | 0.024631 | 0.088670 | 0.029557 | 0.024631 | ... | 0.019704 | 0.098522 | 0.147783 | 0.000000 | 0.039409 | 0.009852 | 0.019704 | 0.044335 | 0.009852 | f |
2 | 0.040280 | 0.416813 | 0.063047 | 0.285464 | 0.316988 | 0.031524 | 0.029772 | 0.082312 | 0.064799 | 0.012259 | ... | 0.017513 | 0.201401 | 0.084063 | 0.001751 | 0.056042 | 0.007005 | 0.017513 | 0.047285 | 0.003503 | t |
3 | 0.017824 | 0.439962 | 0.068949 | 0.277674 | 0.353189 | 0.040807 | 0.031895 | 0.090994 | 0.092402 | 0.018293 | ... | 0.014540 | 0.181989 | 0.114916 | 0.000938 | 0.071295 | 0.010319 | 0.008912 | 0.054409 | 0.014540 | f |
5 | 0.038702 | 0.377029 | 0.049938 | 0.233458 | 0.223471 | 0.046192 | 0.041199 | 0.072409 | 0.081149 | 0.024969 | ... | 0.018727 | 0.207241 | 0.104869 | 0.002497 | 0.049938 | 0.014981 | 0.011236 | 0.041199 | 0.017478 | f |
10 | 0.014433 | 0.479381 | 0.091753 | 0.364948 | 0.319588 | 0.026804 | 0.047423 | 0.102062 | 0.082474 | 0.030928 | ... | 0.005155 | 0.215464 | 0.122680 | 0.005155 | 0.043299 | 0.019588 | 0.002062 | 0.021649 | 0.012371 | t |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
22914 | 0.091241 | 0.390511 | 0.040146 | 0.244526 | 0.306569 | 0.025547 | 0.021898 | 0.065693 | 0.025547 | 0.021898 | ... | 0.040146 | 0.244526 | 0.069343 | 0.003650 | 0.091241 | 0.007299 | 0.000000 | 0.076642 | 0.007299 | t |
22915 | 0.020309 | 0.508530 | 0.053615 | 0.351340 | 0.287571 | 0.049553 | 0.045491 | 0.119415 | 0.084890 | 0.017059 | ... | 0.019090 | 0.211210 | 0.150690 | 0.012998 | 0.045085 | 0.016247 | 0.016653 | 0.040211 | 0.010154 | n |
22916 | 0.029458 | 0.482904 | 0.061021 | 0.307207 | 0.262493 | 0.039979 | 0.028406 | 0.110994 | 0.124671 | 0.025776 | ... | 0.017885 | 0.197791 | 0.162020 | 0.006312 | 0.042083 | 0.010521 | 0.013677 | 0.026828 | 0.010521 | n |
22917 | 0.113971 | 0.139706 | 0.025735 | 0.084559 | 0.136029 | 0.022059 | 0.018382 | 0.040441 | 0.025735 | 0.000000 | ... | 0.014706 | 0.113971 | 0.069853 | 0.007353 | 0.011029 | 0.000000 | 0.007353 | 0.007353 | 0.000000 | f |
22918 | 0.003861 | 0.455598 | 0.057915 | 0.333977 | 0.322394 | 0.019305 | 0.036680 | 0.073359 | 0.075290 | 0.017375 | ... | 0.027027 | 0.133205 | 0.113900 | 0.003861 | 0.055985 | 0.003861 | 0.003861 | 0.015444 | 0.011583 | n |
20819 rows × 65 columns
print(len(data.columns))
y = data.iloc[:,[64]]   # last column: the target "y"
X = data.iloc[:,0:63]   # note: columns 0-62 only, i.e. 63 of the 64 LIWC features ("hear" is left out)
65
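As noted in the comment above, `iloc[:, 0:63]` keeps only 63 of the 64 LIWC columns. A name-based selection avoids the off-by-one; a minimal sketch using the `liwc_cols` list defined earlier:

```python
# hypothetical alternative: select by column name instead of position,
# which keeps all 64 LIWC features and is robust to column reordering
X = data[liwc_cols]
y = data["y"]
```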
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2045)
X_test.values.shape
(4164, 63)
y_test.values.ravel().shape
(4164,)
y_train.y.value_counts()
n    6883
f    4489
t    3278
s    2005
Name: y, dtype: int64
type(y_train.iloc[:,0])
pandas.core.series.Series
model = xgb.XGBClassifier()
model.fit(X_train, y_train.iloc[:,0])
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy
0.3547070124879923
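For context, roughly 0.355 accuracy should be compared against a naive baseline that always predicts the most frequent function (`n`). A minimal sketch using scikit-learn's DummyClassifier on the same split (not part of the original run):

```python
from sklearn.dummy import DummyClassifier

# baseline: always predict the most frequent class seen in y_train
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train.iloc[:, 0])
print(accuracy_score(y_test, dummy.predict(X_test)))
```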
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test, y_test, ax=ax)
plt.show()
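`plot_confusion_matrix` was removed in scikit-learn 1.2; on current versions the equivalent is `ConfusionMatrixDisplay.from_estimator` — a sketch producing the same plot:

```python
from sklearn.metrics import ConfusionMatrixDisplay

fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, ax=ax)
plt.show()
```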
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=1)
data_balanced, balanced_y = rus.fit_resample(data, data['y'])
data_balanced
  | negate | ppron | nonfl | i | relativ | percept | quant | affect | shehe | achieve | ... | you | cogmech | auxverb | they | incl | money | feel | we | hear | y
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.050562 | 0.252809 | 0.033708 | 0.188202 | 0.160112 | 0.025281 | 0.016854 | 0.070225 | 0.030899 | 0.008427 | ... | 0.002809 | 0.115169 | 0.115169 | 0.002809 | 0.025281 | 0.005618 | 0.016854 | 0.028090 | 0.002809 | f |
1 | 0.038462 | 0.378205 | 0.054487 | 0.272436 | 0.285256 | 0.054487 | 0.038462 | 0.108974 | 0.057692 | 0.006410 | ... | 0.012821 | 0.227564 | 0.108974 | 0.000000 | 0.083333 | 0.003205 | 0.019231 | 0.035256 | 0.022436 | f |
2 | 0.018786 | 0.552023 | 0.046243 | 0.341040 | 0.339595 | 0.052023 | 0.066474 | 0.114162 | 0.135838 | 0.026012 | ... | 0.027457 | 0.225434 | 0.167630 | 0.020231 | 0.049133 | 0.010116 | 0.018786 | 0.027457 | 0.020231 | f |
3 | 0.039113 | 0.413299 | 0.061278 | 0.320730 | 0.251630 | 0.053455 | 0.032595 | 0.067797 | 0.053455 | 0.002608 | ... | 0.010430 | 0.170795 | 0.088657 | 0.001304 | 0.045632 | 0.005215 | 0.013038 | 0.027379 | 0.007823 | f |
4 | 0.026490 | 0.394702 | 0.079470 | 0.270199 | 0.241060 | 0.046358 | 0.027815 | 0.092715 | 0.070199 | 0.023841 | ... | 0.027815 | 0.143046 | 0.092715 | 0.000000 | 0.018543 | 0.006623 | 0.010596 | 0.026490 | 0.025166 | f |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10031 | 0.027867 | 0.454984 | 0.065380 | 0.274920 | 0.310289 | 0.032154 | 0.029475 | 0.088424 | 0.138800 | 0.012326 | ... | 0.013398 | 0.190782 | 0.150054 | 0.003751 | 0.043944 | 0.008039 | 0.007503 | 0.024116 | 0.013934 | t |
10032 | 0.052083 | 0.322917 | 0.050000 | 0.229167 | 0.237500 | 0.018750 | 0.006250 | 0.093750 | 0.041667 | 0.010417 | ... | 0.008333 | 0.129167 | 0.079167 | 0.002083 | 0.012500 | 0.002083 | 0.006250 | 0.041667 | 0.004167 | t |
10033 | 0.128889 | 0.306667 | 0.017778 | 0.213333 | 0.284444 | 0.022222 | 0.013333 | 0.075556 | 0.022222 | 0.017778 | ... | 0.026667 | 0.155556 | 0.053333 | 0.000000 | 0.031111 | 0.008889 | 0.008889 | 0.044444 | 0.000000 | t |
10034 | 0.048276 | 0.439655 | 0.047414 | 0.289655 | 0.238793 | 0.046552 | 0.056034 | 0.106034 | 0.076724 | 0.015517 | ... | 0.020690 | 0.199138 | 0.129310 | 0.000862 | 0.052586 | 0.008621 | 0.012069 | 0.051724 | 0.022414 | t |
10035 | 0.060134 | 0.443207 | 0.044543 | 0.293987 | 0.360802 | 0.020045 | 0.042316 | 0.122494 | 0.073497 | 0.026726 | ... | 0.022272 | 0.200445 | 0.113586 | 0.006682 | 0.069042 | 0.002227 | 0.011136 | 0.046771 | 0.004454 | t |
10036 rows × 65 columns
balanced_y.value_counts()
t    2509
n    2509
f    2509
s    2509
Name: y, dtype: int64
data_balanced.y.value_counts()
t    2509
n    2509
f    2509
s    2509
Name: y, dtype: int64
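Random undersampling discards roughly half of the rows (20,819 → 10,036). An alternative that keeps every sample is to reweight classes during training; a sketch using scikit-learn's `compute_sample_weight` with the original (imbalanced) split — an option, not what is done below:

```python
from sklearn.utils.class_weight import compute_sample_weight

# each training row is weighted inversely to its class frequency
weights = compute_sample_weight("balanced", y_train.iloc[:, 0])

weighted_model = xgb.XGBClassifier()
weighted_model.fit(X_train, y_train.iloc[:, 0], sample_weight=weights)
```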
print(len(data.columns))
y = data_balanced.iloc[:,[64]]
X = data_balanced.iloc[:,0:63]  # same 63-of-64 column slice as above
65
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2045)
y_train.y.value_counts()
s    2033
t    2017
f    1994
n    1984
Name: y, dtype: int64
y_test.y.value_counts()
n    525
f    515
t    492
s    476
Name: y, dtype: int64
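The split is not stratified, so the per-class counts drift slightly (e.g. 2033 `s` vs 1984 `n` in the training set). Passing `stratify` keeps the proportions exact; a sketch:

```python
# stratified split: each class keeps the same share in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2045, stratify=y["y"]
)
```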
model = xgb.XGBClassifier()
model.fit(X_train.values, y_train.iloc[:,0])
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
X_test
  | negate | ppron | nonfl | i | relativ | percept | quant | affect | shehe | achieve | ... | funct | sad | you | cogmech | auxverb | they | incl | money | feel | we
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7343 | 0.020151 | 0.458438 | 0.076826 | 0.332494 | 0.328715 | 0.030227 | 0.036524 | 0.095718 | 0.044081 | 0.028967 | ... | 1.358942 | 0.006297 | 0.021411 | 0.152393 | 0.119647 | 0.010076 | 0.035264 | 0.013854 | 0.003778 | 0.050378 |
2127 | 0.034139 | 0.470128 | 0.034139 | 0.301565 | 0.281650 | 0.046230 | 0.044808 | 0.088193 | 0.076814 | 0.014936 | ... | 1.494310 | 0.002845 | 0.013514 | 0.256046 | 0.148649 | 0.008535 | 0.047653 | 0.009957 | 0.014936 | 0.069701 |
2225 | 0.030955 | 0.484260 | 0.073977 | 0.351522 | 0.317419 | 0.049318 | 0.037775 | 0.107030 | 0.076600 | 0.031480 | ... | 1.606506 | 0.003148 | 0.021511 | 0.178909 | 0.136411 | 0.006821 | 0.038300 | 0.013641 | 0.018363 | 0.027807 |
5473 | 0.067358 | 0.393782 | 0.058722 | 0.260794 | 0.303972 | 0.034542 | 0.058722 | 0.074266 | 0.072539 | 0.010363 | ... | 1.298791 | 0.003454 | 0.005181 | 0.127807 | 0.100173 | 0.001727 | 0.032815 | 0.015544 | 0.013817 | 0.053541 |
1515 | 0.037975 | 0.379747 | 0.042194 | 0.299578 | 0.248945 | 0.059072 | 0.046414 | 0.033755 | 0.063291 | 0.016878 | ... | 1.059072 | 0.000000 | 0.004219 | 0.168776 | 0.067511 | 0.008439 | 0.025316 | 0.021097 | 0.021097 | 0.004219 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
550 | 0.053942 | 0.448133 | 0.049793 | 0.319502 | 0.242739 | 0.020747 | 0.024896 | 0.080913 | 0.068465 | 0.006224 | ... | 1.307054 | 0.014523 | 0.020747 | 0.155602 | 0.105809 | 0.008299 | 0.037344 | 0.020747 | 0.006224 | 0.031120 |
3477 | 0.025516 | 0.520656 | 0.102066 | 0.375456 | 0.284933 | 0.060753 | 0.034629 | 0.116039 | 0.078372 | 0.027339 | ... | 1.597813 | 0.007290 | 0.029162 | 0.157959 | 0.156744 | 0.002430 | 0.033414 | 0.019441 | 0.026124 | 0.035237 |
3042 | 0.010163 | 0.443089 | 0.052846 | 0.239837 | 0.241870 | 0.034553 | 0.038618 | 0.083333 | 0.095528 | 0.010163 | ... | 1.355691 | 0.008130 | 0.026423 | 0.172764 | 0.095528 | 0.012195 | 0.069106 | 0.000000 | 0.006098 | 0.069106 |
9734 | 0.045028 | 0.360225 | 0.061914 | 0.250469 | 0.275797 | 0.055347 | 0.034709 | 0.110694 | 0.059099 | 0.014071 | ... | 1.252345 | 0.016886 | 0.009381 | 0.214822 | 0.119137 | 0.000938 | 0.045966 | 0.024390 | 0.037523 | 0.040338 |
9261 | 0.034934 | 0.425036 | 0.055313 | 0.216885 | 0.229985 | 0.039301 | 0.024745 | 0.090247 | 0.085881 | 0.011645 | ... | 1.199418 | 0.001456 | 0.080058 | 0.177584 | 0.109170 | 0.001456 | 0.046579 | 0.008734 | 0.013100 | 0.040757 |
2008 rows × 63 columns
y_pred = model.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred)
accuracy
0.24651394422310757
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test.values, y_test, ax=ax)
plt.show()
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           f       0.27      0.27      0.27       515
           n       0.26      0.25      0.25       525
           s       0.24      0.26      0.25       476
           t       0.21      0.21      0.21       492

    accuracy                           0.25      2008
   macro avg       0.25      0.25      0.25      2008
weighted avg       0.25      0.25      0.25      2008
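With four balanced classes, chance-level accuracy is 0.25, so a single split scoring ~0.246 is hard to distinguish from guessing. A cross-validated estimate is less noisy; a sketch using the already-imported `cross_val_score` and `RepeatedStratifiedKFold` on the balanced data:

```python
# repeated stratified CV on the balanced feature matrix X and target y
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scores = cross_val_score(
    xgb.XGBClassifier(), X.values, y.values.ravel(),
    scoring="accuracy", cv=cv, n_jobs=-1,
)
print("accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))
```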
reducer = umap.UMAP()
mapper = umap.UMAP().fit(X) # for plotting
embedding = reducer.fit_transform(X)
embedding.shape
(10036, 2)
y.values.ravel()
array(['f', 'f', 'f', ..., 't', 't', 't'], dtype=object)
umap.plot.points(mapper, labels=y.values.ravel())
<AxesSubplot:>
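UMAP can also use the labels to shape the embedding (supervised mode) by passing a target to `fit`. A sketch, with the label strings converted to integer codes — an alternative to the unsupervised embedding used below, not part of the original run:

```python
# supervised UMAP: points of the same class are pulled together in the embedding
codes = pd.factorize(y.values.ravel())[0]   # "f"/"n"/"s"/"t" -> 0..3
sup_mapper = umap.UMAP(random_state=42).fit(X, y=codes)
umap.plot.points(sup_mapper, labels=y.values.ravel())
```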
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2, random_state=2045)
embedding.shape
(10036, 2)
y.values.shape
(10036, 1)
model = xgb.XGBClassifier()
model.fit(X_train, y_train.values.ravel())  # pass a 1-d target to avoid DataConversionWarning
y_pred = model.predict(X_test)
y_pred
array(['n', 't', 'f', ..., 's', 's', 's'], dtype=object)
X_train.shape
(8028, 2)
X_test.shape
(2008, 2)
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test, y_test, ax=ax)
plt.show()
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           f       0.26      0.23      0.24       515
           n       0.23      0.23      0.23       525
           s       0.21      0.24      0.23       476
           t       0.24      0.24      0.24       492

    accuracy                           0.23      2008
   macro avg       0.23      0.23      0.23      2008
weighted avg       0.24      0.23      0.23      2008
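One caveat: the 2-D embedding was fitted on all of `X` before the split, so the held-out rows influenced the projection they are later evaluated on. To avoid that leakage, UMAP would be fitted on the training rows only and the test rows projected with `transform`; a sketch:

```python
# fit the reducer on training rows only, then project the held-out rows
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=2045)

reducer = umap.UMAP(random_state=42).fit(X_tr)
emb_tr = reducer.embedding_           # embedding of the training rows
emb_te = reducer.transform(X_te)      # unseen rows mapped into the same space

clf = xgb.XGBClassifier()
clf.fit(emb_tr, y_tr.values.ravel())
print(accuracy_score(y_te, clf.predict(emb_te)))
```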