ls -lrt lezhin_dataset_v2_training.tsv
-rw-r--r-- 1 donglyeolsin staff 239197437 8 13 12:06 lezhin_dataset_v2_training.tsv
9~15번째 열 중 하나라도 0이 아닌 값을 가지는지 여부가 정답 레이블인 0번째 열의 값과 모든 행에서 일치하는지 확인해 보았습니다. 결과는 False로, 일치하지 않는 행이 존재합니다.
import pandas as pd

# Load the tab-separated training set. header=None means the file's first
# line is treated as data and columns are labelled 0, 1, 2, ...
df = pd.read_table('lezhin_dataset_v2_training.tsv', header=None)

# Row-wise check: does "any of columns 9-15 is truthy (non-zero)" agree with
# column 0 on every row?  The comparison is reduced with the vectorized
# Series.all() instead of the builtin all(), which would iterate over the
# Series element by element in Python.
print((df[0] == df[[9, 10, 11, 12, 13, 14, 15]].any(axis=1)).all())
False
데이터 열끼리 상관관계를 다음처럼 히트맵으로 출력하였습니다.
%matplotlib inline
import seaborn as sns
corr = df.corr()
sns.heatmap(corr,
xticklabels=corr.columns.values,
yticklabels=corr.columns.values)
<matplotlib.axes._subplots.AxesSubplot at 0x11aa51198>
서로 다른 열 사이의 상관관계 값을 정렬하여 가장 높은 10개를 출력하였습니다. (같은 열끼리의 쌍은 제외하였으며, 각 쌍은 (a, b)와 (b, a)로 두 번씩 나타납니다.)
# Approach taken from:
# https://stackoverflow.com/questions/17778394/list-highest-correlation-pairs-from-a-large-correlation-matrix-in-pandas
# https://stackoverflow.com/questions/18624039/pandas-reset-index-on-series-to-remove-multiindex

# Flatten the correlation matrix into a Series indexed by (column, column).
s = corr.unstack()

# Sort ascending, discard NaN entries, then turn the two index levels into
# ordinary columns ('level_0', 'level_1'); the correlation value ends up in
# column 0.
so = (
    s.sort_values(kind="quicksort")
    .dropna()
    .reset_index()
)

# Show the 10 largest correlations between two different columns.
so.loc[so['level_0'].ne(so['level_1'])].tail(10)
level_0 | level_1 | 0 | |
---|---|---|---|
25696 | 154 | 160 | 0.860670 |
25697 | 160 | 154 | 0.860670 |
25698 | 124 | 110 | 0.883358 |
25699 | 110 | 124 | 0.883358 |
25700 | 159 | 160 | 0.901885 |
25701 | 160 | 159 | 0.901885 |
25702 | 165 | 153 | 0.918212 |
25703 | 153 | 165 | 0.918212 |
25704 | 166 | 1 | 0.986777 |
25705 | 1 | 166 | 0.986777 |
정답 레이블(0번째 열)과 다른 열 사이의 상관관계 값을 정렬하여 가장 높은 값 10개를 출력하였습니다. 자기 자신과의 상관관계(1.000000)를 제외하면 가장 높은 값은 110번째 열과의 0.240389입니다.
# Last 10 rows (highest correlations, since `so` is sorted ascending) whose
# first column of the pair is column 0; the pair (0, 0) is included.
so.loc[so['level_0'].eq(0)].tail(10)
level_0 | level_1 | 0 | |
---|---|---|---|
23015 | 0 | 50 | 0.170594 |
23087 | 0 | 46 | 0.173585 |
23153 | 0 | 11 | 0.175697 |
23176 | 0 | 165 | 0.176384 |
23451 | 0 | 61 | 0.186519 |
23722 | 0 | 4 | 0.199042 |
23767 | 0 | 27 | 0.201936 |
24134 | 0 | 124 | 0.222300 |
24381 | 0 | 110 | 0.240389 |
25708 | 0 | 0 | 1.000000 |
# Report how many columns the full data frame has.
print("전체 데이터 컬럼 개수 : %d" % df.shape[1])
전체 데이터 컬럼 개수 : 167
# Report how many rows the full data frame has.
print("전체 데이터 개수 : %d" % df.shape[0])
전체 데이터 개수 : 650965
학습에 사용할 데이터를 다음처럼 추출하였습니다.
# Build the training frame: remove the columns at positions 6 and 7, then
# remove every remaining column that contains at least one missing value.
df_1 = (
    df.drop(df.columns[[6, 7]], axis="columns")
    .dropna(axis="columns")
)
# Display the resulting frame as the cell output.
df_1
0 | 1 | 2 | 3 | 4 | 5 | 8 | 10 | 11 | 12 | ... | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 11 | 0 | ... | 19 | 1 | 19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | ... | 19 | 1 | 19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | ... | 19 | 1 | 19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 34 | 0 | ... | 19 | 1 | 19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 19 | 0 | 18 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
5 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 3 | 2 | 0 | ... | 19 | 0 | 18 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
6 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 5 | 2 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
7 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
8 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
9 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 5 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
10 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
11 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 42 | 1 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
12 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 10 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
13 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 37 | 13 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
14 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 11 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
15 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 34 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
16 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 4 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
17 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 6 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
18 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 31 | 0 | ... | 14 | 0 | 10 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
19 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 14 | 0 | 10 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
20 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 14 | 0 | 10 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
21 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 6 | 0 | 0 | ... | 14 | 0 | 10 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
22 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 2 | 0 | ... | 16 | 1 | 16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
23 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 16 | 1 | 16 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
24 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 44 | 0 | 0 | ... | 17 | 0 | 16 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
25 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | ... | 6 | 1 | 6 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
26 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 16 | 0 | 0 | ... | 6 | 1 | 6 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
27 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 11 | 12 | 0 | ... | 6 | 1 | 6 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
28 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | ... | 19 | 0 | 17 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
29 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 4 | 0 | 0 | ... | 18 | 0 | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
650935 | 1 | 1 | 0 | 0 | 0 | 96 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 7 | 13 | 0 | 0 | 0 | 0 | 0 | 0 |
650936 | 1 | 1 | 0 | 0 | 0 | 96 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 7 | 13 | 0 | 0 | 0 | 0 | 0 | 0 |
650937 | 1 | 1 | 0 | 0 | 0 | 96 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
650938 | 1 | 1 | 0 | 0 | 0 | 96 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 10 | 9 | 0 | 0 | 0 | 0 | 0 | 0 |
650939 | 1 | 1 | 0 | 0 | 0 | 96 | 1 | 0 | 14 | 0 | ... | 19 | 1 | 19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
650940 | 1 | 1 | 0 | 0 | 0 | 96 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
650941 | 1 | 1 | 0 | 0 | 0 | 96 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
650942 | 1 | 1 | 0 | 0 | 0 | 97 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 18 | 2 | 0 | 0 | 0 | 0 | 0 | 0 |
650943 | 1 | 1 | 0 | 0 | 0 | 97 | 1 | 0 | 0 | 16 | ... | 19 | 0 | 16 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
650944 | 1 | 1 | 0 | 0 | 0 | 97 | 1 | 14 | 7 | 0 | ... | 19 | 0 | 16 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
650945 | 1 | 1 | 0 | 0 | 0 | 97 | 1 | 11 | 0 | 17 | ... | 19 | 0 | 17 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
650946 | 1 | 1 | 0 | 0 | 0 | 97 | 0 | 0 | 0 | 0 | ... | 14 | 0 | 8 | 8 | 0 | 0 | 0 | 0 | 0 | 0 |
650947 | 1 | 1 | 0 | 0 | 0 | 97 | 0 | 0 | 0 | 0 | ... | 19 | 0 | 19 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
650948 | 1 | 1 | 0 | 0 | 0 | 97 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 11 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
650949 | 1 | 1 | 0 | 0 | 0 | 97 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 11 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
650950 | 1 | 1 | 0 | 0 | 0 | 97 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 11 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
650951 | 1 | 1 | 0 | 0 | 0 | 98 | 1 | 0 | 0 | 0 | ... | 18 | 0 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
650952 | 1 | 1 | 0 | 0 | 0 | 98 | 1 | 0 | 0 | 0 | ... | 18 | 0 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
650953 | 1 | 1 | 0 | 0 | 0 | 98 | 1 | 0 | 0 | 0 | ... | 18 | 0 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 |
650954 | 1 | 1 | 0 | 0 | 0 | 98 | 1 | 0 | 0 | 0 | ... | 17 | 0 | 12 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
650955 | 1 | 1 | 0 | 0 | 0 | 98 | 1 | 0 | 0 | 0 | ... | 17 | 0 | 12 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
650956 | 1 | 1 | 0 | 0 | 0 | 98 | 1 | 0 | 0 | 0 | ... | 17 | 0 | 12 | 6 | 0 | 0 | 0 | 0 | 0 | 0 |
650957 | 1 | 1 | 0 | 0 | 0 | 98 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 11 | 9 | 0 | 0 | 0 | 0 | 0 | 0 |
650958 | 1 | 1 | 0 | 0 | 0 | 99 | 1 | 0 | 10 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
650959 | 1 | 1 | 0 | 0 | 0 | 99 | 1 | 0 | 10 | 0 | ... | 19 | 0 | 15 | 4 | 0 | 0 | 0 | 0 | 0 | 0 |
650960 | 1 | 1 | 0 | 0 | 0 | 99 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 13 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
650961 | 1 | 1 | 0 | 0 | 0 | 99 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 11 | 9 | 0 | 0 | 0 | 0 | 0 | 0 |
650962 | 1 | 1 | 0 | 0 | 0 | 99 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 11 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
650963 | 1 | 1 | 0 | 0 | 0 | 99 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 11 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
650964 | 1 | 1 | 0 | 0 | 0 | 99 | 1 | 0 | 0 | 0 | ... | 19 | 0 | 11 | 7 | 0 | 0 | 0 | 0 | 0 | 0 |
650965 rows × 148 columns
# Report the size of the training frame.
print("학습할 데이터 컬럼 개수 : %d" % df_1.shape[1])  # number of columns
print("학습할 데이터 개수 : %d" % df_1.shape[0])  # number of rows
학습할 데이터 컬럼 개수 : 148 학습할 데이터 개수 : 650965