#!/usr/bin/env python
# coding: utf-8

# # 밀집 신경망을 사용한 감성 분류기

# 이 노트북에서 IMDB 영화 리뷰를 감성에 따라 분류하는 밀집 신경망을 만듭니다.

# [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/rickiepark/dl-illustrated/blob/master/notebooks/11-2.dense_sentiment_classifier.ipynb)

# #### 라이브러리를 적재합니다.

# In[1]:


from tensorflow import keras
from tensorflow.keras.datasets import imdb # new! 
from tensorflow.keras.preprocessing.sequence import pad_sequences #new!
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.layers import Embedding # new!
from tensorflow.keras.callbacks import ModelCheckpoint # new! 
import os # new! 
from sklearn.metrics import roc_auc_score, roc_curve # new!
import pandas as pd
import matplotlib.pyplot as plt # new!
get_ipython().run_line_magic('matplotlib', 'inline')


# #### 하이퍼파라미터 셋팅

# In[2]:


# Directory used for the model checkpoint files.
output_dir = 'model_output/dense'

# Training schedule.
epochs, batch_size = 4, 128

# Vector-space embedding.
n_dim = 64
# Vocabulary size and number of top words to skip follow Maas et al. (2011);
# these values may not be optimal.
n_unique_words, n_words_to_skip = 5000, 50
max_review_length = 100
# The same mode ('pre') is used for both padding and truncation.
trunc_type = 'pre'
pad_type = trunc_type

# Network architecture.
n_dense = 64
dropout = 0.5


# #### 데이터를 적재합니다.

# 이 데이터셋에서
# 
# * [케라스 텍스트 유틸리티](http://keras-ko.kr/api/preprocessing/text/)는 빠르게 자연어를 전처리하고 인덱스로 변환합니다.
# * `keras.preprocessing.text.Tokenizer` 클래스는 필요한 모든 것을 한 줄로 처리할 수 있습니다.
#     * 단어나 문자로 토큰화하기
#     * `num_words`: 고유한 최대 토큰
#     * 구두점 삭제
#     * 소문자로 변경
#     * 단어를 정수 인덱스로 변경하기

# In[3]:


# Load the IMDB reviews as sequences of integer word indices, passing the
# vocabulary cap and the number of top words to skip from the hyperparameters.
(x_train, y_train), (x_valid, y_valid) = imdb.load_data(
    skip_top=n_words_to_skip,
    num_words=n_unique_words,
)


# In[4]:


# Reserved indices: 0 = padding, 1 = start-of-review marker, 2 = unknown word.
# NOTE(review): the original note called 3 the most common word, but the
# word_index rebuilt later in this file offsets every rank by 3, which
# suggests real words begin at 4 -- confirm against the Keras imdb docs.
x_train[:6]


# In[5]:


# Print the token count of each of the first six reviews (before padding).
for x in x_train[:6]:
    print(len(x))


# In[6]:


# Labels for the same six reviews.
y_train[:6]


# In[7]:


# Sizes of the training and validation splits.
(len(x_train), len(x_valid))


# #### 인덱스에서 단어 복원하기

# In[8]:


# Shift every word's index up by 3 so that 0, 1 and 2 are free for the
# special tokens, then register those tokens.
word_index = {
    word: rank + 3
    for word, rank in keras.datasets.imdb.get_word_index().items()
}
word_index.update({"PAD": 0, "START": 1, "UNK": 2})


# In[9]:


word_index


# In[10]:


# Inverse mapping: integer index -> word, used to decode reviews.
index_word = {index: word for word, index in word_index.items()}


# In[11]:


# First training review as raw integer indices.
x_train[0]


# In[12]:


# The same review decoded to words; indices replaced by 2 at load time
# decode to "UNK".
' '.join(index_word[token_id] for token_id in x_train[0])


# In[13]:


# Reload the dataset with load_data's default arguments; only the reviews
# are kept (labels go to the throwaway name `_`).
(all_x_train, _), (all_x_valid, _) = imdb.load_data()


# In[14]:


# First training review decoded from the default-argument load.
' '.join(index_word[token_id] for token_id in all_x_train[0])


# #### 데이터 전처리

# In[15]:


# Pad/truncate both splits with identical settings; 0 is the padding value
# (it decodes to "PAD" via index_word).
x_train, x_valid = [
    pad_sequences(
        reviews,
        maxlen=max_review_length,
        padding=pad_type,
        truncating=trunc_type,
        value=0,
    )
    for reviews in (x_train, x_valid)
]


# In[16]:


x_train[:6]


# In[17]:


# Print row lengths after padding (expected to equal max_review_length).
for x in x_train[:6]:
    print(len(x))


# In[18]:


# Decode the first padded review.
' '.join(index_word[token_id] for token_id in x_train[0])


# In[19]:


# Decode the sixth padded review.
' '.join(index_word[token_id] for token_id in x_train[5])


# #### 신경망 만들기

# In[20]:


# Dense sentiment classifier: embedding -> flatten -> one hidden dense layer
# with dropout -> single sigmoid output unit.
model = Sequential([
    Embedding(n_unique_words, n_dim, input_length=max_review_length),
    Flatten(),
    Dense(n_dense, activation='relu'),
    Dropout(dropout),
    # Optional second hidden block, left disabled:
    # Dense(n_dense, activation='relu'),
    # Dropout(dropout),
    # A single sigmoid unit is mathematically equivalent to a two-class softmax.
    Dense(1, activation='sigmoid'),
])


# In[21]:


model.summary()  # note how many parameters there are


# In[22]:


# Embedding layer: dimensions and parameter count.
(n_dim, n_unique_words, n_unique_words * n_dim)


# In[23]:


# ...Flatten(): size of the flattened output.
(max_review_length, n_dim, max_review_length * n_dim)


# In[24]:


# ...Dense: weights plus biases.
(n_dense, (n_dim * max_review_length) * n_dense + n_dense)


# In[25]:


# ...and the output unit: n_dense weights plus one bias.
1 + n_dense


# #### 모델을 설정합니다.

# In[26]:


# Binary cross-entropy pairs with the model's single sigmoid output unit.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# In[27]:


# Checkpoint callback; the {epoch:02d} placeholder in the file name is filled
# in with the epoch number when a checkpoint is written.
modelcheckpoint = ModelCheckpoint(filepath=output_dir+
                                  "/weights.{epoch:02d}.hdf5")


# In[28]:


# Create the checkpoint directory if needed. exist_ok=True replaces the
# separate os.path.exists() test, which was race-prone (the directory could
# appear between the check and the creation).
os.makedirs(output_dir, exist_ok=True)


# #### 훈련!

# In[29]:


# Train the model, evaluating on the validation split and invoking the
# checkpoint callback.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_valid, y_valid),
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[modelcheckpoint],
    verbose=1,
)


# #### 평가

# In[30]:


# Restore the weights saved for epoch 2; checkpoint file names are NOT
# zero-indexed, so "02" is the second epoch.
model.load_weights(output_dir + "/weights.02.hdf5")


# In[31]:


# Predicted scores for every validation review.
y_hat = model.predict(x=x_valid)


# In[32]:


len(y_hat)


# In[33]:


# Score for the first validation review...
y_hat[0]


# In[34]:


# ...and its true label.
y_valid[0]


# In[35]:


# Distribution of predicted scores, with the 0.5 threshold marked in orange.
plt.hist(y_hat)
_ = plt.axvline(0.5, color='orange')


# In[36]:


# ROC AUC on the validation split, expressed as a percentage.
pct_auc = 100.0 * roc_auc_score(y_valid, y_hat)


# In[37]:


# Rounded to two decimal places for display.
format(pct_auc, "0.2f")


# In[38]:


# Each row of y_hat holds one score (the model ends in a single output unit),
# so take element 0 of every row to get a flat list. A comprehension replaces
# the manual append loop and avoids leaking a loop variable at module scope.
float_y_hat = [pred[0] for pred in y_hat]


# In[39]:


# Pair each predicted score with its true label for side-by-side inspection.
ydf = pd.DataFrame(list(zip(float_y_hat, y_valid)), columns=['y_hat', 'y'])


# In[40]:


ydf.head(10)


# In[41]:


# Decode validation review 0 from the default-argument load.
' '.join(index_word[token_id] for token_id in all_x_valid[0])


# In[42]:


# Decode validation review 6.
' '.join(index_word[token_id] for token_id in all_x_valid[6])


# In[43]:


# Reviews whose true label is 0 but whose predicted score exceeds 0.9.
ydf[(ydf['y'] == 0) & (ydf['y_hat'] > 0.9)].head(10)


# In[44]:


# Decode validation review 386 (presumably one of the rows surfaced by the
# filter above -- verify against the filter output).
' '.join(index_word[token_id] for token_id in all_x_valid[386])


# In[45]:


# Reviews whose true label is 1 but whose predicted score is below 0.1.
ydf[(ydf['y'] == 1) & (ydf['y_hat'] < 0.1)].head(10)


# In[46]:


# Decode validation review 224 (presumably one of the rows surfaced by the
# filter above -- verify against the filter output).
' '.join(index_word[token_id] for token_id in all_x_valid[224])