#!/usr/bin/env python
# coding: utf-8

# # Machine Learning Textbook, 3rd Edition

# # Chapter 14 - Going Deeper: The Mechanics of TensorFlow (2/3)

# **Use the links below to view this notebook in the Jupyter Notebook Viewer (nbviewer.jupyter.org) or to run it in Google Colab (colab.research.google.com).**
# 
# <table class="tfo-notebook-buttons" align="left">
#   <td>
#     <a target="_blank" href="https://nbviewer.org/github/rickiepark/python-machine-learning-book-3rd-edition/blob/master/ch14/ch14_part2.ipynb"><img src="https://jupyter.org/assets/share.png" width="60" />주피터 노트북 뷰어로 보기</a>
#   </td>
#   <td>
#     <a target="_blank" href="https://colab.research.google.com/github/rickiepark/python-machine-learning-book-3rd-edition/blob/master/ch14/ch14_part2.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />구글 코랩(Colab)에서 실행하기</a>
#   </td>
# </table>

# ### Table of Contents

# - TensorFlow Estimators
#   - Working with feature columns
#   - Machine learning with pre-made Estimators

# In[ ]:


import numpy as np
import tensorflow as tf
import pandas as pd

from IPython.display import Image


# In[ ]:


tf.__version__


# ## TensorFlow Estimators
# 
# ##### Steps for using pre-made Estimators
# 
#  * **Step 1:** Define an input function for loading the data
#  * **Step 2:** Define feature columns to bridge the Estimator and the data
#  * **Step 3:** Instantiate an Estimator, or convert a Keras model into an Estimator (see the sketch below)
#  * **Step 4:** Use the Estimator: `train()`, `evaluate()`, `predict()`
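
# A minimal sketch of the Keras-conversion path from **Step 3**, using a made-up single-input model (the architecture here is only an illustrative assumption, not part of this notebook's workflow):

# In[ ]:


# Hypothetical example: wrap a compiled Keras model as an Estimator.
keras_model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation='relu', input_shape=(5,)),
    tf.keras.layers.Dense(1)])
keras_model.compile(optimizer='adam', loss='mse')

keras_estimator = tf.keras.estimator.model_to_estimator(keras_model=keras_model)
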

# In[ ]:


tf.random.set_seed(1)
np.random.seed(1)


# ### Working with Feature Columns
# 
# 
#  * Definition: https://developers.google.com/machine-learning/glossary/#feature_columns
#  * Documentation: https://www.tensorflow.org/api_docs/python/tf/feature_column

# In[ ]:


Image(url='https://git.io/JL56E', width=700)
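
# As a quick, self-contained illustration (a toy example assumed here, not from the book): a feature column plus `tf.keras.layers.DenseFeatures` turns a dict of raw feature tensors into a single dense input tensor, which is exactly the bridge the figure above depicts.

# In[ ]:


# Toy example: apply a numeric feature column to a dict of raw tensors.
toy_column = tf.feature_column.numeric_column(key='x')
dense_features = tf.keras.layers.DenseFeatures([toy_column])
print(dense_features({'x': tf.constant([[1.0], [2.0]])}))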


# In[ ]:


dataset_path = tf.keras.utils.get_file(
    origin="https://archive.ics.uci.edu/static/public/9/auto+mpg.zip",
    extract=True)
# get_file returns the path of the downloaded archive; the extracted
# auto-mpg.data file is placed alongside it, so swap the file name.
dataset_path = dataset_path.replace('auto+mpg.zip', 'auto-mpg.data')

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
                'Weight', 'Acceleration', 'ModelYear', 'Origin']

df = pd.read_csv(dataset_path, names=column_names,
                 na_values="?", comment='\t',
                 sep=" ", skipinitialspace=True)

df.tail()


# In[ ]:


print(df.isna().sum())

df = df.dropna()
df = df.reset_index(drop=True)
df.tail()


# In[ ]:


import sklearn
import sklearn.model_selection


df_train, df_test = sklearn.model_selection.train_test_split(df, train_size=0.8)
train_stats = df_train.describe().transpose()
train_stats


# In[ ]:


numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    # standardize with training-set statistics (applied to both splits)
    df_train_norm.loc[:, col_name] = (df_train_norm.loc[:, col_name] - mean)/std
    df_test_norm.loc[:, col_name] = (df_test_norm.loc[:, col_name] - mean)/std

df_train_norm.tail()


# #### Numeric columns

# In[ ]:


numeric_features = []

for col_name in numeric_column_names:
    numeric_features.append(tf.feature_column.numeric_column(key=col_name))

numeric_features


# In[ ]:


feature_year = tf.feature_column.numeric_column(key="ModelYear")

bucketized_features = []

# three boundaries split ModelYear into four one-hot buckets:
# (-inf, 73), [73, 76), [76, 79), [79, inf)
bucketized_features.append(tf.feature_column.bucketized_column(
    source_column=feature_year,
    boundaries=[73, 76, 79]))

print(bucketized_features)


# In[ ]:


feature_origin = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Origin',
    vocabulary_list=[1, 2, 3])

categorical_indicator_features = []
# indicator_column one-hot encodes the categorical Origin feature
categorical_indicator_features.append(tf.feature_column.indicator_column(feature_origin))

print(categorical_indicator_features)
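
# For categorical features with a much larger vocabulary, a one-hot indicator column becomes wasteful; an embedding column is the usual alternative. Shown only as an illustration, it is not used in the model below:

# In[ ]:


# Hypothetical alternative: learn a dense 2-dimensional vector per category.
embedding_feature = tf.feature_column.embedding_column(
    categorical_column=feature_origin, dimension=2)
print(embedding_feature)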


# ### Machine Learning with Pre-made Estimators

# In[ ]:


def train_input_fn(df_train, batch_size=8):
    df = df_train.copy()
    train_x, train_y = df, df.pop('MPG')
    dataset = tf.data.Dataset.from_tensor_slices((dict(train_x), train_y))

    # shuffle, repeat, and batch the examples
    return dataset.shuffle(1000).repeat().batch(batch_size)

## inspect a single batch
ds = train_input_fn(df_train_norm)
batch = next(iter(ds))
print('Keys:', batch[0].keys())
print('ModelYear:', batch[0]['ModelYear'])


# In[ ]:


all_feature_columns = (numeric_features +
                       bucketized_features +
                       categorical_indicator_features)

print(all_feature_columns)


# In[ ]:


regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    model_dir='models/autompg-dnnregressor/')


# In[ ]:


EPOCHS = 1000
BATCH_SIZE = 8
total_steps = EPOCHS * int(np.ceil(len(df_train) / BATCH_SIZE))
print('Training steps:', total_steps)

regressor.train(
    input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE),
    steps=total_steps)


# In[ ]:


reloaded_regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    warm_start_from='models/autompg-dnnregressor/',
    model_dir='models/autompg-dnnregressor/')


# In[ ]:


def eval_input_fn(df_test, batch_size=8):
    df = df_test.copy()
    test_x, test_y = df, df.pop('MPG')
    dataset = tf.data.Dataset.from_tensor_slices((dict(test_x), test_y))

    return dataset.batch(batch_size)

eval_results = reloaded_regressor.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

for key in eval_results:
    print('{:15s} {}'.format(key, eval_results[key]))

print('Average loss {:.4f}'.format(eval_results['average_loss']))
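
# Since `DNNRegressor` trains with a mean-squared-error loss by default (an assumption if the loss was customized), the square root of `average_loss` is an RMSE in the original MPG units:

# In[ ]:


# RMSE, assuming average_loss is the mean squared error
print('RMSE: {:.4f}'.format(np.sqrt(eval_results['average_loss'])))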


# In[ ]:


pred_res = regressor.predict(input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

print(next(iter(pred_res)))
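
# `predict()` returns a generator of dicts, one per example; a small usage sketch to collect all predictions into a NumPy array:

# In[ ]:


# each yielded item is a dict holding a 'predictions' array of shape (1,)
pred_res = regressor.predict(input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))
y_pred = np.array([item['predictions'][0] for item in pred_res])
print(y_pred[:5])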


# #### Boosted Tree Regressor

# In[ ]:


boosted_tree = tf.estimator.BoostedTreesRegressor(
    feature_columns=all_feature_columns,
    n_batches_per_layer=20,
    n_trees=200)

# training stops automatically once all n_trees have been built
boosted_tree.train(
    input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE))

eval_results = boosted_tree.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

print(eval_results)

print('Average loss {:.4f}'.format(eval_results['average_loss']))