#!/usr/bin/env python
# coding: utf-8

# # Simple version Random Forest 생성

# (1) Generate a *Sklearn moons* dataset of 10,000 samples, then draw 1,000 training subsets from its training split, each containing 100 randomly selected samples.

# (2) 먼저 트리분류기를 학습한 후 테스트 셋에서 최종 성능을 확인합니다.

# (3) 각 테스트 샘플에 대해 1,000개의 결정 트리 예측을 만들고 다수로 나온 예측만 취합니다. 그러면 테스트 세트에 대한 **다수결 예측(majority vote prediction)** 이 생성됩니다.

# (4) 테스트 세트에서 이 예측을 평가합니다.

# ## 1. 데이터 로딩 

# In[1]:


from sklearn.datasets import make_moons

# Synthetic two-class "moons" dataset: 10,000 noisy points, with a fixed
# seed so that every run produces the same data.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)


# ## 2. 학습 및 테스트 세트 구분

# In[2]:


from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# In[3]:


# Report the array shape of the training and test feature matrices.
for label, features in (('학습 셋 :', X_train), ('테스트 셋 :', X_test)):
    print(label, features.shape)


# In[4]:


# Peek at the first five training feature rows. Printed explicitly because a
# bare expression shows nothing when this file is run as a script.
print(X_train[:5])


# In[5]:


# Peek at the first five training labels. Printed explicitly because a bare
# expression shows nothing when this file is run as a script.
print(y_train[:5])


# ## 3. Hyperparameter 검색

# In[6]:


from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; the grid search evaluates every combination.
# NOTE(review): a tree with at most 7 leaves cannot be deeper than 6 levels,
# so the max_depth candidates 10, 15 and 20 are equivalent to one another —
# consider trimming the grid.
params = dict(
    max_leaf_nodes=[2, 3, 4, 5, 6, 7],
    min_samples_split=[2, 3, 4],
    max_depth=[3, 5, 10, 15, 20],
)

# Seeded base estimator so that repeated searches give the same trees.
base_tree = DecisionTreeClassifier(random_state=42)

# 3-fold cross-validated search, parallelized (n_jobs=-1) with progress output.
grid_search_cv = GridSearchCV(
    estimator=base_tree,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)


# In[7]:


# Run the cross-validated hyperparameter search on the training split.
grid_search_cv.fit(X=X_train, y=y_train)


# In[8]:


# Show the estimator with the best-performing hyperparameters. Printed
# explicitly because a bare expression shows nothing when run as a script.
print(grid_search_cv.best_estimator_)


# ## 4. 단일 트리 성능 체크

# In[9]:


from sklearn.metrics import accuracy_score

# Baseline: accuracy of the single tuned tree on the held-out test split.
y_pred = grid_search_cv.predict(X_test)
single_tree_accuracy = accuracy_score(y_test, y_pred)
# Printed explicitly so the score is visible when this file runs as a script.
print('단일 트리 정확도 :', single_tree_accuracy)


# ## 5. 랜덤 포레스트 모델을 생성하기 위하여 학습셋 샘플들 생성

# In[10]:


from sklearn.model_selection import ShuffleSplit

# Ensemble size (1000 trees in total) and the number of training samples
# each tree is fitted on.
n_trees = 1000
n_instances = 100

# Will hold one (X_subset, y_subset) pair per tree.
mini_sets = []

# Everything outside the n_instances drawn samples goes to the "test" half of
# each split; only the "train" half is used further down.
n_unused = len(X_train) - n_instances
rs = ShuffleSplit(n_splits=n_trees, test_size=n_unused, random_state=42)


# In[11]:


# Check the number of splits (one training subset per tree).
# get_n_splits() reports the count directly, so there is no need to
# materialize all the index arrays just to count them.
print('분할 개수 :', rs.get_n_splits())


# In[12]:


# Only the first split is inspected, so pull just that one from the generator
# instead of building the full list of splits twice.
first_train_idx, first_unused_idx = next(rs.split(X_train))
# The "train" half is the subset a tree is fitted on.
print('학습 세트 :', len(first_train_idx))
# The "test" half of each split is not used.
print('테스트 세트 :', len(first_unused_idx))


# In[13]:


# Materialize one (features, labels) training subset per split; the second
# index array of every split is ignored.
mini_sets.extend(
    (X_train[subset_idx], y_train[subset_idx])
    for subset_idx, _ in rs.split(X_train)
)


# ## 6. 1000개의 개별 모델 학습

# In[14]:


import numpy as np
from sklearn.base import clone

# One fresh copy of the tuned tree per training subset (n_trees in total).
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

accuracy_scores = []

# Fit each tree on its own subset and score it individually on the test split.
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    # Train the individual model.
    tree.fit(X_mini_train, y_mini_train)
    # Predict with the trained individual model and record its accuracy.
    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Mean test accuracy over the individual trees. Printed explicitly so the
# value is visible when this file runs as a script.
mean_tree_accuracy = np.mean(accuracy_scores)
print('개별 트리 평균 정확도 :', mean_tree_accuracy)


# 단일 트리를 이용했을 때, 7000개의 샘플에서 학습을 하였기 때문에, 100개의 샘플에서 학습을 했을 때보다 정확도가 높다.
# 
# 100개의 샘플에서 학습한, 개별 학습기 1000개의 성능 평균값이 낮은 것을 확인 할 수 있다.

# ## 7. 1000개의 개별 모델에서 예측값을 얻고 앙상블

# In[15]:


# Prediction matrix: one row per tree, one column per test sample.
Y_pred = np.empty((n_trees, len(X_test)), dtype=np.uint8)

# Fill each row in place with the corresponding tree's test-set predictions.
for vote_row, tree in zip(Y_pred, forest):
    vote_row[:] = tree.predict(X_test)


# In[16]:


from scipy.stats import mode

# Majority vote: for every test sample (column of Y_pred) take the most
# frequent predicted label, together with how many trees voted for it.
# NOTE(review): depending on the SciPy version the result arrays may keep a
# leading axis of length 1; the later reshape([-1]) copes with both shapes.
vote_result = mode(Y_pred, axis=0)
y_pred_majority_votes = vote_result.mode
n_votes = vote_result.count


# In[17]:


# Show the majority-vote predictions. Printed explicitly because a bare
# expression shows nothing when this file is run as a script.
print(y_pred_majority_votes)


# In[18]:


# Accuracy of the majority-vote ensemble on the test split; reshape([-1])
# flattens the vote array to 1-D before comparing it with y_test.
ensemble_accuracy = accuracy_score(y_test, y_pred_majority_votes.reshape([-1]))
# Printed explicitly so the final result is visible when run as a script.
print('다수결 앙상블 정확도 :', ensemble_accuracy)