import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix  # removed in scikit-learn >= 1.2 (use ConfusionMatrixDisplay.from_estimator there)
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import umap
import umap.plot
from imblearn.under_sampling import RandomUnderSampler
df = pd.read_pickle("dataframe_survey_2018-01-23_enriched.pickle")
df
  | url | typealyzer | actual | e | s | t | sntf_s | sntf_n | sntf_t | sntf_f | ... | sad | you | cogmech | auxverb | they | incl | money | feel | we | hear
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | http://adropofcolour.tumblr.com | ISFP | INFJ | 0.291281 | 0.787844 | 0.460961 | 0.663515 | 0.178565 | 0.069282 | 0.088638 | ... | 0.000000 | 0.019704 | 0.098522 | 0.147783 | 0.000000 | 0.039409 | 0.009852 | 0.019704 | 0.044335 | 0.009852 |
2 | http://godheadcomplex.tumblr.com | ESFP | INFP | 0.883579 | 0.951693 | 0.238407 | 0.855921 | 0.046931 | 0.021850 | 0.075297 | ... | 0.000000 | 0.017513 | 0.201401 | 0.084063 | 0.001751 | 0.056042 | 0.007005 | 0.017513 | 0.047285 | 0.003503 |
3 | http://chaotikaeon2.tumblr.com | INTJ | INTP | 0.332444 | 0.357863 | 0.591322 | 0.147668 | 0.252326 | 0.339831 | 0.260175 | ... | 0.003283 | 0.014540 | 0.181989 | 0.114916 | 0.000938 | 0.071295 | 0.010319 | 0.008912 | 0.054409 | 0.014540 |
5 | http://perpetually-in-transit.blogspot.com | ESFP | ENFJ | 0.944394 | 0.943192 | 0.105527 | 0.778825 | 0.051134 | 0.017299 | 0.152742 | ... | 0.002497 | 0.018727 | 0.207241 | 0.104869 | 0.002497 | 0.049938 | 0.014981 | 0.011236 | 0.041199 | 0.017478 |
10 | http://museofmystery.wordpress.com/2012/08/29/... | ISTP | INFP | 0.073352 | 0.850472 | 0.608812 | 0.628322 | 0.112762 | 0.149270 | 0.109646 | ... | 0.001031 | 0.005155 | 0.215464 | 0.122680 | 0.005155 | 0.043299 | 0.019588 | 0.002062 | 0.021649 | 0.012371 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
25432 | http://pistoche.tumblr.com | ESFP | INTJ | 0.685653 | 0.969891 | 0.480241 | 0.960824 | 0.029758 | 0.004220 | 0.005199 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25433 | http://lokh.tumblr.com | ISTP | INTP | 0.201637 | 0.553602 | 0.662618 | 0.468074 | 0.374926 | 0.099968 | 0.057033 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25435 | http://readerdye.tumblr.com | ISTP | INFP | 0.375704 | 0.756593 | 0.740688 | 0.697536 | 0.229456 | 0.051684 | 0.021324 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25436 | http://loveisart.tumblr.com | ISTP | ENFP | 0.002516 | 0.848823 | 0.661502 | 0.584138 | 0.118812 | 0.192779 | 0.104271 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25437 | http://angelalll.tumblr.com | ESTP | INFP | 0.814616 | 0.652280 | 0.832608 | 0.518149 | 0.281291 | 0.163392 | 0.037168 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
22919 rows × 115 columns
df.func  # the four cognitive functions (n/s/t/f) without their attitudinal direction (introverted/extraverted)
1        f
2        t
3        f
5        f
10       t
        ..
25432    f
25433    f
25435    t
25436    f
25437    n
Name: func, Length: 22919, dtype: object
liwc_cols = ["negate","ppron","nonfl","i","relativ","percept","quant","affect","shehe","achieve","bio","leisure","conj","motion","posemo","adverb","home","future","negemo","number","inhib","humans","pronoun","excl","space","tentat","see","past","anx","family","present","health","verb","certain","anger","preps","swear","ingest","discrep","friend","relig","time","cause","article","body","social","assent","work","sexual","insight","ipron","filler","death","funct","sad","you","cogmech","auxverb","they","incl","money","feel","we","hear"]
data = df[liwc_cols].copy()  # .copy() so adding the target column below does not trigger SettingWithCopyWarning
data["y"] = df.func
data = data.dropna()
data
  | negate | ppron | nonfl | i | relativ | percept | quant | affect | shehe | achieve | ... | you | cogmech | auxverb | they | incl | money | feel | we | hear | y
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 0.034483 | 0.428571 | 0.049261 | 0.334975 | 0.197044 | 0.039409 | 0.024631 | 0.088670 | 0.029557 | 0.024631 | ... | 0.019704 | 0.098522 | 0.147783 | 0.000000 | 0.039409 | 0.009852 | 0.019704 | 0.044335 | 0.009852 | f |
2 | 0.040280 | 0.416813 | 0.063047 | 0.285464 | 0.316988 | 0.031524 | 0.029772 | 0.082312 | 0.064799 | 0.012259 | ... | 0.017513 | 0.201401 | 0.084063 | 0.001751 | 0.056042 | 0.007005 | 0.017513 | 0.047285 | 0.003503 | t |
3 | 0.017824 | 0.439962 | 0.068949 | 0.277674 | 0.353189 | 0.040807 | 0.031895 | 0.090994 | 0.092402 | 0.018293 | ... | 0.014540 | 0.181989 | 0.114916 | 0.000938 | 0.071295 | 0.010319 | 0.008912 | 0.054409 | 0.014540 | f |
5 | 0.038702 | 0.377029 | 0.049938 | 0.233458 | 0.223471 | 0.046192 | 0.041199 | 0.072409 | 0.081149 | 0.024969 | ... | 0.018727 | 0.207241 | 0.104869 | 0.002497 | 0.049938 | 0.014981 | 0.011236 | 0.041199 | 0.017478 | f |
10 | 0.014433 | 0.479381 | 0.091753 | 0.364948 | 0.319588 | 0.026804 | 0.047423 | 0.102062 | 0.082474 | 0.030928 | ... | 0.005155 | 0.215464 | 0.122680 | 0.005155 | 0.043299 | 0.019588 | 0.002062 | 0.021649 | 0.012371 | t |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
22914 | 0.091241 | 0.390511 | 0.040146 | 0.244526 | 0.306569 | 0.025547 | 0.021898 | 0.065693 | 0.025547 | 0.021898 | ... | 0.040146 | 0.244526 | 0.069343 | 0.003650 | 0.091241 | 0.007299 | 0.000000 | 0.076642 | 0.007299 | t |
22915 | 0.020309 | 0.508530 | 0.053615 | 0.351340 | 0.287571 | 0.049553 | 0.045491 | 0.119415 | 0.084890 | 0.017059 | ... | 0.019090 | 0.211210 | 0.150690 | 0.012998 | 0.045085 | 0.016247 | 0.016653 | 0.040211 | 0.010154 | n |
22916 | 0.029458 | 0.482904 | 0.061021 | 0.307207 | 0.262493 | 0.039979 | 0.028406 | 0.110994 | 0.124671 | 0.025776 | ... | 0.017885 | 0.197791 | 0.162020 | 0.006312 | 0.042083 | 0.010521 | 0.013677 | 0.026828 | 0.010521 | n |
22917 | 0.113971 | 0.139706 | 0.025735 | 0.084559 | 0.136029 | 0.022059 | 0.018382 | 0.040441 | 0.025735 | 0.000000 | ... | 0.014706 | 0.113971 | 0.069853 | 0.007353 | 0.011029 | 0.000000 | 0.007353 | 0.007353 | 0.000000 | f |
22918 | 0.003861 | 0.455598 | 0.057915 | 0.333977 | 0.322394 | 0.019305 | 0.036680 | 0.073359 | 0.075290 | 0.017375 | ... | 0.027027 | 0.133205 | 0.113900 | 0.003861 | 0.055985 | 0.003861 | 0.003861 | 0.015444 | 0.011583 | n |
20819 rows × 65 columns
print(len(data.columns))
y = data.iloc[:,[64]]   # last column: the target "y"
X = data.iloc[:,0:63]   # note: columns 0-62 only, i.e. 63 of the 64 LIWC features ("hear" is left out)
65
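As noted in the comment above, `iloc[:, 0:63]` keeps only 63 of the 64 LIWC columns. A name-based selection avoids the off-by-one; a minimal sketch using the `liwc_cols` list defined earlier:

```python
# hypothetical alternative: select by column name instead of position,
# which keeps all 64 LIWC features and is robust to column reordering
X = data[liwc_cols]
y = data["y"]
```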
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2045)
X_test.values.shape
(4164, 63)
y_test.values.ravel().shape
(4164,)
y_train.y.value_counts()
n    6883
f    4489
t    3278
s    2005
Name: y, dtype: int64
type(y_train.iloc[:,0])
pandas.core.series.Series
model = xgb.XGBClassifier()
model.fit(X_train, y_train.iloc[:,0])
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy
0.3547070124879923
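For context, roughly 0.355 accuracy should be compared against a naive baseline that always predicts the most frequent function (`n`). A minimal sketch using scikit-learn's DummyClassifier on the same split (not part of the original run):

```python
from sklearn.dummy import DummyClassifier

# baseline: always predict the most frequent class seen in y_train
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train.iloc[:, 0])
print(accuracy_score(y_test, dummy.predict(X_test)))
```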
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test, y_test, ax=ax)
plt.show()
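`plot_confusion_matrix` was removed in scikit-learn 1.2; on current versions the equivalent is `ConfusionMatrixDisplay.from_estimator` — a sketch producing the same plot:

```python
from sklearn.metrics import ConfusionMatrixDisplay

fig, ax = plt.subplots(figsize=(12, 12))
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test, ax=ax)
plt.show()
```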
rus = RandomUnderSampler(sampling_strategy='not minority', random_state=1)
data_balanced, balanced_y = rus.fit_resample(data, data['y'])
data_balanced
  | negate | ppron | nonfl | i | relativ | percept | quant | affect | shehe | achieve | ... | you | cogmech | auxverb | they | incl | money | feel | we | hear | y
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.050562 | 0.252809 | 0.033708 | 0.188202 | 0.160112 | 0.025281 | 0.016854 | 0.070225 | 0.030899 | 0.008427 | ... | 0.002809 | 0.115169 | 0.115169 | 0.002809 | 0.025281 | 0.005618 | 0.016854 | 0.028090 | 0.002809 | f |
1 | 0.038462 | 0.378205 | 0.054487 | 0.272436 | 0.285256 | 0.054487 | 0.038462 | 0.108974 | 0.057692 | 0.006410 | ... | 0.012821 | 0.227564 | 0.108974 | 0.000000 | 0.083333 | 0.003205 | 0.019231 | 0.035256 | 0.022436 | f |
2 | 0.018786 | 0.552023 | 0.046243 | 0.341040 | 0.339595 | 0.052023 | 0.066474 | 0.114162 | 0.135838 | 0.026012 | ... | 0.027457 | 0.225434 | 0.167630 | 0.020231 | 0.049133 | 0.010116 | 0.018786 | 0.027457 | 0.020231 | f |
3 | 0.039113 | 0.413299 | 0.061278 | 0.320730 | 0.251630 | 0.053455 | 0.032595 | 0.067797 | 0.053455 | 0.002608 | ... | 0.010430 | 0.170795 | 0.088657 | 0.001304 | 0.045632 | 0.005215 | 0.013038 | 0.027379 | 0.007823 | f |
4 | 0.026490 | 0.394702 | 0.079470 | 0.270199 | 0.241060 | 0.046358 | 0.027815 | 0.092715 | 0.070199 | 0.023841 | ... | 0.027815 | 0.143046 | 0.092715 | 0.000000 | 0.018543 | 0.006623 | 0.010596 | 0.026490 | 0.025166 | f |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10031 | 0.027867 | 0.454984 | 0.065380 | 0.274920 | 0.310289 | 0.032154 | 0.029475 | 0.088424 | 0.138800 | 0.012326 | ... | 0.013398 | 0.190782 | 0.150054 | 0.003751 | 0.043944 | 0.008039 | 0.007503 | 0.024116 | 0.013934 | t |
10032 | 0.052083 | 0.322917 | 0.050000 | 0.229167 | 0.237500 | 0.018750 | 0.006250 | 0.093750 | 0.041667 | 0.010417 | ... | 0.008333 | 0.129167 | 0.079167 | 0.002083 | 0.012500 | 0.002083 | 0.006250 | 0.041667 | 0.004167 | t |
10033 | 0.128889 | 0.306667 | 0.017778 | 0.213333 | 0.284444 | 0.022222 | 0.013333 | 0.075556 | 0.022222 | 0.017778 | ... | 0.026667 | 0.155556 | 0.053333 | 0.000000 | 0.031111 | 0.008889 | 0.008889 | 0.044444 | 0.000000 | t |
10034 | 0.048276 | 0.439655 | 0.047414 | 0.289655 | 0.238793 | 0.046552 | 0.056034 | 0.106034 | 0.076724 | 0.015517 | ... | 0.020690 | 0.199138 | 0.129310 | 0.000862 | 0.052586 | 0.008621 | 0.012069 | 0.051724 | 0.022414 | t |
10035 | 0.060134 | 0.443207 | 0.044543 | 0.293987 | 0.360802 | 0.020045 | 0.042316 | 0.122494 | 0.073497 | 0.026726 | ... | 0.022272 | 0.200445 | 0.113586 | 0.006682 | 0.069042 | 0.002227 | 0.011136 | 0.046771 | 0.004454 | t |
10036 rows × 65 columns
balanced_y.value_counts()
t    2509
n    2509
f    2509
s    2509
Name: y, dtype: int64
data_balanced.y.value_counts()
t    2509
n    2509
f    2509
s    2509
Name: y, dtype: int64
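Random undersampling discards roughly half of the rows (20,819 → 10,036). An alternative that keeps every sample is to reweight classes during training; a sketch using scikit-learn's `compute_sample_weight` with the original (imbalanced) split — an option, not what is done below:

```python
from sklearn.utils.class_weight import compute_sample_weight

# each training row is weighted inversely to its class frequency
weights = compute_sample_weight("balanced", y_train.iloc[:, 0])

weighted_model = xgb.XGBClassifier()
weighted_model.fit(X_train, y_train.iloc[:, 0], sample_weight=weights)
```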
print(len(data.columns))
y = data_balanced.iloc[:,[64]]
X = data_balanced.iloc[:,0:63]  # same 63-of-64 column slice as above
65
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2045)
y_train.y.value_counts()
s    2033
t    2017
f    1994
n    1984
Name: y, dtype: int64
y_test.y.value_counts()
n    525
f    515
t    492
s    476
Name: y, dtype: int64
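The split is not stratified, so the per-class counts drift slightly (e.g. 2033 `s` vs 1984 `n` in the training set). Passing `stratify` keeps the proportions exact; a sketch:

```python
# stratified split: each class keeps the same share in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2045, stratify=y["y"]
)
```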
model = xgb.XGBClassifier()
model.fit(X_train.values, y_train.iloc[:,0])
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)
X_test
  | negate | ppron | nonfl | i | relativ | percept | quant | affect | shehe | achieve | ... | funct | sad | you | cogmech | auxverb | they | incl | money | feel | we
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7343 | 0.020151 | 0.458438 | 0.076826 | 0.332494 | 0.328715 | 0.030227 | 0.036524 | 0.095718 | 0.044081 | 0.028967 | ... | 1.358942 | 0.006297 | 0.021411 | 0.152393 | 0.119647 | 0.010076 | 0.035264 | 0.013854 | 0.003778 | 0.050378 |
2127 | 0.034139 | 0.470128 | 0.034139 | 0.301565 | 0.281650 | 0.046230 | 0.044808 | 0.088193 | 0.076814 | 0.014936 | ... | 1.494310 | 0.002845 | 0.013514 | 0.256046 | 0.148649 | 0.008535 | 0.047653 | 0.009957 | 0.014936 | 0.069701 |
2225 | 0.030955 | 0.484260 | 0.073977 | 0.351522 | 0.317419 | 0.049318 | 0.037775 | 0.107030 | 0.076600 | 0.031480 | ... | 1.606506 | 0.003148 | 0.021511 | 0.178909 | 0.136411 | 0.006821 | 0.038300 | 0.013641 | 0.018363 | 0.027807 |
5473 | 0.067358 | 0.393782 | 0.058722 | 0.260794 | 0.303972 | 0.034542 | 0.058722 | 0.074266 | 0.072539 | 0.010363 | ... | 1.298791 | 0.003454 | 0.005181 | 0.127807 | 0.100173 | 0.001727 | 0.032815 | 0.015544 | 0.013817 | 0.053541 |
1515 | 0.037975 | 0.379747 | 0.042194 | 0.299578 | 0.248945 | 0.059072 | 0.046414 | 0.033755 | 0.063291 | 0.016878 | ... | 1.059072 | 0.000000 | 0.004219 | 0.168776 | 0.067511 | 0.008439 | 0.025316 | 0.021097 | 0.021097 | 0.004219 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
550 | 0.053942 | 0.448133 | 0.049793 | 0.319502 | 0.242739 | 0.020747 | 0.024896 | 0.080913 | 0.068465 | 0.006224 | ... | 1.307054 | 0.014523 | 0.020747 | 0.155602 | 0.105809 | 0.008299 | 0.037344 | 0.020747 | 0.006224 | 0.031120 |
3477 | 0.025516 | 0.520656 | 0.102066 | 0.375456 | 0.284933 | 0.060753 | 0.034629 | 0.116039 | 0.078372 | 0.027339 | ... | 1.597813 | 0.007290 | 0.029162 | 0.157959 | 0.156744 | 0.002430 | 0.033414 | 0.019441 | 0.026124 | 0.035237 |
3042 | 0.010163 | 0.443089 | 0.052846 | 0.239837 | 0.241870 | 0.034553 | 0.038618 | 0.083333 | 0.095528 | 0.010163 | ... | 1.355691 | 0.008130 | 0.026423 | 0.172764 | 0.095528 | 0.012195 | 0.069106 | 0.000000 | 0.006098 | 0.069106 |
9734 | 0.045028 | 0.360225 | 0.061914 | 0.250469 | 0.275797 | 0.055347 | 0.034709 | 0.110694 | 0.059099 | 0.014071 | ... | 1.252345 | 0.016886 | 0.009381 | 0.214822 | 0.119137 | 0.000938 | 0.045966 | 0.024390 | 0.037523 | 0.040338 |
9261 | 0.034934 | 0.425036 | 0.055313 | 0.216885 | 0.229985 | 0.039301 | 0.024745 | 0.090247 | 0.085881 | 0.011645 | ... | 1.199418 | 0.001456 | 0.080058 | 0.177584 | 0.109170 | 0.001456 | 0.046579 | 0.008734 | 0.013100 | 0.040757 |
2008 rows × 63 columns
y_pred = model.predict(X_test.values)
accuracy = accuracy_score(y_test, y_pred)
accuracy
0.24651394422310757
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test.values, y_test, ax=ax)
plt.show()
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           f       0.27      0.27      0.27       515
           n       0.26      0.25      0.25       525
           s       0.24      0.26      0.25       476
           t       0.21      0.21      0.21       492

    accuracy                           0.25      2008
   macro avg       0.25      0.25      0.25      2008
weighted avg       0.25      0.25      0.25      2008
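With four balanced classes, chance-level accuracy is 0.25, so a single split scoring ~0.246 is hard to distinguish from guessing. A cross-validated estimate is less noisy; a sketch using the already-imported `cross_val_score` and `RepeatedStratifiedKFold` on the balanced data:

```python
# repeated stratified CV on the balanced feature matrix X and target y
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
scores = cross_val_score(
    xgb.XGBClassifier(), X.values, y.values.ravel(),
    scoring="accuracy", cv=cv, n_jobs=-1,
)
print("accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))
```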
reducer = umap.UMAP()
mapper = umap.UMAP().fit(X) # for plotting
embedding = reducer.fit_transform(X)
embedding.shape
(10036, 2)
y.values.ravel()
array(['f', 'f', 'f', ..., 't', 't', 't'], dtype=object)
umap.plot.points(mapper, labels=y.values.ravel())
<AxesSubplot:>
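UMAP can also use the labels to shape the embedding (supervised mode) by passing a target to `fit`. A sketch, with the label strings converted to integer codes — an alternative to the unsupervised embedding used below, not part of the original run:

```python
# supervised UMAP: points of the same class are pulled together in the embedding
codes = pd.factorize(y.values.ravel())[0]   # "f"/"n"/"s"/"t" -> 0..3
sup_mapper = umap.UMAP(random_state=42).fit(X, y=codes)
umap.plot.points(sup_mapper, labels=y.values.ravel())
```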
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2, random_state=2045)
embedding.shape
(10036, 2)
y.values.shape
(10036, 1)
model = xgb.XGBClassifier()
model.fit(X_train, y_train.values.ravel())  # pass a 1-d target to avoid DataConversionWarning
y_pred = model.predict(X_test)
y_pred
array(['n', 't', 'f', ..., 's', 's', 's'], dtype=object)
X_train.shape
(8028, 2)
X_test.shape
(2008, 2)
fig, ax = plt.subplots(figsize=(12, 12))
plot_confusion_matrix(model, X_test, y_test, ax=ax)
plt.show()
print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           f       0.26      0.23      0.24       515
           n       0.23      0.23      0.23       525
           s       0.21      0.24      0.23       476
           t       0.24      0.24      0.24       492

    accuracy                           0.23      2008
   macro avg       0.23      0.23      0.23      2008
weighted avg       0.24      0.23      0.23      2008
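One caveat: the 2-D embedding was fitted on all of `X` before the split, so the held-out rows influenced the projection they are later evaluated on. To avoid that leakage, UMAP would be fitted on the training rows only and the test rows projected with `transform`; a sketch:

```python
# fit the reducer on training rows only, then project the held-out rows
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=2045)

reducer = umap.UMAP(random_state=42).fit(X_tr)
emb_tr = reducer.embedding_           # embedding of the training rows
emb_te = reducer.transform(X_te)      # unseen rows mapped into the same space

clf = xgb.XGBClassifier()
clf.fit(emb_tr, y_tr.values.ravel())
print(accuracy_score(y_te, clf.predict(emb_te)))
```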