#!/usr/bin/env python
# coding: utf-8

# # Python Machine Learning, 3rd Edition
# # Chapter 14 - Going Deeper: The Mechanics of TensorFlow (2/3)

# **You can view this notebook in the Jupyter notebook viewer (nbviewer.jupyter.org) or run it in Google Colab (colab.research.google.com) via the links below.**
#
# <table class="tfo-notebook-buttons" align="left">
#   <td>
#     <a target="_blank" href="https://nbviewer.org/github/rickiepark/python-machine-learning-book-3rd-edition/blob/master/ch14/ch14_part2.ipynb"><img src="https://jupyter.org/assets/share.png" width="60" />View in Jupyter notebook viewer</a>
#   </td>
#   <td>
#     <a target="_blank" href="https://colab.research.google.com/github/rickiepark/python-machine-learning-book-3rd-edition/blob/master/ch14/ch14_part2.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
#   </td>
# </table>

# ### Table of Contents
#
# - TensorFlow Estimators
# - Working with feature columns
# - Machine learning with pre-made Estimators

# In[ ]:

import numpy as np
import tensorflow as tf
import pandas as pd
from IPython.display import Image

# In[ ]:

tf.__version__

# ## TensorFlow Estimators
#
# ##### Steps for using pre-made Estimators
#
# * **Step 1:** Define an input function for loading the data
# * **Step 2:** Define feature columns to bridge the Estimator and the data
# * **Step 3:** Instantiate an Estimator (or convert a Keras model into an Estimator)
# * **Step 4:** Use the Estimator: train(), evaluate(), predict()

# In[ ]:

tf.random.set_seed(1)
np.random.seed(1)

# ### Working with feature columns
#
# * Definition: https://developers.google.com/machine-learning/glossary/#feature_columns
# * Documentation: https://www.tensorflow.org/api_docs/python/tf/feature_column

# In[ ]:

Image(url='https://git.io/JL56E', width=700)

# In[ ]:

# Download the Auto MPG dataset and point the path at the extracted data file.
dataset_path = tf.keras.utils.get_file(
    origin="https://archive.ics.uci.edu/static/public/9/auto+mpg.zip",
    extract=True)

dataset_path = dataset_path.replace('auto+mpg.zip', 'auto-mpg.data')

column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower',
                'Weight', 'Acceleration', 'ModelYear', 'Origin']

df = pd.read_csv(dataset_path, names=column_names,
                 na_values='?', comment='\t',
                 sep=' ', skipinitialspace=True)

df.tail()

# In[ ]:

# Drop the rows with missing values and reset the index.
print(df.isna().sum())

df = df.dropna()
df = df.reset_index(drop=True)
df.tail()

# In[ ]:

import sklearn
import sklearn.model_selection

df_train, df_test = sklearn.model_selection.train_test_split(df, train_size=0.8)
train_stats = df_train.describe().transpose()
train_stats

# In[ ]:

numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower',
                        'Weight', 'Acceleration']

df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

# Standardize the numeric columns using the training-set statistics.
for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    df_train_norm.loc[:, col_name] = (df_train_norm.loc[:, col_name] - mean) / std
    df_test_norm.loc[:, col_name] = (df_test_norm.loc[:, col_name] - mean) / std

df_train_norm.tail()

# #### Numeric columns

# In[ ]:

numeric_features = []
for col_name in numeric_column_names:
    numeric_features.append(
        tf.feature_column.numeric_column(key=col_name))

numeric_features

# #### Bucketized columns

# In[ ]:

feature_year = tf.feature_column.numeric_column(key='ModelYear')

# The three boundaries partition ModelYear into four buckets.
bucketized_features = []
bucketized_features.append(
    tf.feature_column.bucketized_column(
        source_column=feature_year,
        boundaries=[73, 76, 79]))

print(bucketized_features)

# #### Categorical indicator columns

# In[ ]:

feature_origin = tf.feature_column.categorical_column_with_vocabulary_list(
    key='Origin',
    vocabulary_list=[1, 2, 3])

categorical_indicator_features = []
categorical_indicator_features.append(
    tf.feature_column.indicator_column(feature_origin))

print(categorical_indicator_features)
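# Before wiring these columns into an Estimator, it can help to see what they
# actually produce. The next cell is a minimal sketch with made-up input
# values: a `tf.keras.layers.DenseFeatures` layer applies feature columns to a
# raw batch so the encoded output can be inspected directly.

# In[ ]:

demo_batch = {'ModelYear': tf.constant([71., 75., 82.]),
              'Origin': tf.constant([3, 1, 2], dtype=tf.int64)}

demo_layer = tf.keras.layers.DenseFeatures(
    bucketized_features + categorical_indicator_features)

# Each output row concatenates the one-hot bucket for ModelYear
# (4 buckets from the 3 boundaries) with the 3-element indicator
# encoding of Origin.
print(demo_layer(demo_batch))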
# ### Machine learning with pre-made Estimators

# In[ ]:

def train_input_fn(df_train, batch_size=8):
    df = df_train.copy()
    train_x, train_y = df, df.pop('MPG')
    dataset = tf.data.Dataset.from_tensor_slices((dict(train_x), train_y))

    # shuffle, repeat, and batch the examples
    return dataset.shuffle(1000).repeat().batch(batch_size)

## inspection
ds = train_input_fn(df_train_norm)
batch = next(iter(ds))
print('Keys:', batch[0].keys())
print('ModelYear:', batch[0]['ModelYear'])

# In[ ]:

all_feature_columns = (numeric_features +
                       bucketized_features +
                       categorical_indicator_features)

print(all_feature_columns)

# In[ ]:

regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    model_dir='models/autompg-dnnregressor/')

# In[ ]:

EPOCHS = 1000
BATCH_SIZE = 8
total_steps = EPOCHS * int(np.ceil(len(df_train) / BATCH_SIZE))
print('Training steps:', total_steps)

regressor.train(
    input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE),
    steps=total_steps)

# In[ ]:

# Restore the trained model from its checkpoint directory via warm start.
reloaded_regressor = tf.estimator.DNNRegressor(
    feature_columns=all_feature_columns,
    hidden_units=[32, 10],
    warm_start_from='models/autompg-dnnregressor/',
    model_dir='models/autompg-dnnregressor/')

# In[ ]:

def eval_input_fn(df_test, batch_size=8):
    df = df_test.copy()
    test_x, test_y = df, df.pop('MPG')
    dataset = tf.data.Dataset.from_tensor_slices((dict(test_x), test_y))
    return dataset.batch(batch_size)

eval_results = reloaded_regressor.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

for key in eval_results:
    print('{:15s} {}'.format(key, eval_results[key]))

print('Average loss {:.4f}'.format(eval_results['average_loss']))

# In[ ]:

pred_res = regressor.predict(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

print(next(iter(pred_res)))

# #### Boosted Tree Regressor

# In[ ]:

boosted_tree = tf.estimator.BoostedTreesRegressor(
    feature_columns=all_feature_columns,
    n_batches_per_layer=20,
    n_trees=200)

# Training stops once n_trees trees have been built, so no `steps`
# argument is needed even though the input dataset repeats forever.
boosted_tree.train(
    input_fn=lambda: train_input_fn(df_train_norm, batch_size=BATCH_SIZE))

eval_results = boosted_tree.evaluate(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))
print(eval_results)

print('Average loss {:.4f}'.format(eval_results['average_loss']))
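# The boosted-tree estimator exposes the same `predict()` API as
# `DNNRegressor`. As a minimal sketch (assuming the same
# `{'predictions': ...}` output format printed above), we can compare its
# first few predictions against the true MPG values:

# In[ ]:

bt_pred_res = boosted_tree.predict(
    input_fn=lambda: eval_input_fn(df_test_norm, batch_size=8))

# eval_input_fn does not shuffle, so predictions line up with the test rows.
for true_mpg, pred in zip(df_test_norm['MPG'].head(5), bt_pred_res):
    print('True MPG: {:5.1f}   Predicted: {:5.1f}'.format(
        true_mpg, pred['predictions'][0]))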