#!/usr/bin/env python
# coding: utf-8

# In[1]:


get_ipython().run_line_magic('reload_ext', 'autoreload')
get_ipython().run_line_magic('autoreload', '2')
get_ipython().run_line_magic('matplotlib', 'inline')
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# In[2]:


import numpy as np
import tensorflow as tf
print("Tensorflow version " + tf.__version__)


# # Using *ktrain* to Facilitate a Normal TensorFlow Workflow
# 
# This example notebook illustrates how *ktrain* can be used in a **minimally invasive** way within a normal TensorFlow workflow. We store our datasets as `tf.data.Dataset` objects and build our own `tf.keras` model, following the example of TensorFlow's [Keras MNIST TPU.ipynb](https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/keras_mnist_tpu.ipynb#scrollTo=cCpkS9C_H7Tl). We then use **ktrain** as a lightweight wrapper around our model and data to estimate a learning rate, train the model, inspect the model, and make predictions.

# ## Detect Hardware: CPU vs. GPU vs. TPU

# In[3]:


# Detect hardware
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
except ValueError:
    tpu = None
gpus = tf.config.experimental.list_logical_devices("GPU")

# Select the appropriate distribution strategy
if tpu:
    tf.tpu.experimental.initialize_tpu_system(tpu)
    # Going back and forth between TPU and host is expensive.
    # Better to run 128 batches on the TPU before reporting back.
    strategy = tf.distribute.experimental.TPUStrategy(tpu, steps_per_run=128)
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
elif len(gpus) > 1:
    strategy = tf.distribute.MirroredStrategy([gpu.name for gpu in gpus])
    print('Running on multiple GPUs ', [gpu.name for gpu in gpus])
elif len(gpus) == 1:
    strategy = tf.distribute.get_strategy()  # default strategy that works on CPU and single GPU
    print('Running on single GPU ', gpus[0].name)
else:
    strategy = tf.distribute.get_strategy()  # default strategy that works on CPU and single GPU
    print('Running on CPU')
print("Number of accelerators: ", strategy.num_replicas_in_sync)


# ## Prepare Training and Validation Data as `tf.data.Dataset` Objects
# 
# Download the dataset files from [LeCun's website](http://yann.lecun.com/exdb/mnist/).

# In[4]:


BATCH_SIZE = 64 * strategy.num_replicas_in_sync  # Global batch size.

training_images_file   = 'data/mnist_lecun/train-images-idx3-ubyte'
training_labels_file   = 'data/mnist_lecun/train-labels-idx1-ubyte'
validation_images_file = 'data/mnist_lecun/t10k-images-idx3-ubyte'
validation_labels_file = 'data/mnist_lecun/t10k-labels-idx1-ubyte'


# Note that, if training on a TPU, these should be set as follows:
# 
# ```python
# training_images_file   = 'gs://mnist-public/train-images-idx3-ubyte'
# training_labels_file   = 'gs://mnist-public/train-labels-idx1-ubyte'
# validation_images_file = 'gs://mnist-public/t10k-images-idx3-ubyte'
# validation_labels_file = 'gs://mnist-public/t10k-labels-idx1-ubyte'
# ```
# 
# You may need to authenticate:
# ```python
# IS_COLAB_BACKEND = 'COLAB_GPU' in os.environ  # this is always set on Colab; the value is 0 or 1 depending on GPU presence
# if IS_COLAB_BACKEND:
#     from google.colab import auth
#     # Authenticates the Colab machine and also the TPU using your
#     # credentials so that they can access your private GCS buckets.
#     auth.authenticate_user()
# ```
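# If you are running locally (not on a TPU), the four files referenced above can be downloaded and
# uncompressed with something like the sketch below. This is a minimal sketch, assuming the files are
# still hosted on LeCun's site under the standard gzipped names and that you want them in
# `data/mnist_lecun/`:
# 
# ```python
# import gzip, os, shutil, urllib.request
# 
# base_url = 'http://yann.lecun.com/exdb/mnist/'  # assumed still reachable
# names = ['train-images-idx3-ubyte', 'train-labels-idx1-ubyte',
#          't10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte']
# os.makedirs('data/mnist_lecun', exist_ok=True)
# for name in names:
#     gz_path = os.path.join('data/mnist_lecun', name + '.gz')
#     urllib.request.urlretrieve(base_url + name + '.gz', gz_path)  # download the gzipped IDX file
#     with gzip.open(gz_path, 'rb') as f_in, \
#          open(os.path.join('data/mnist_lecun', name), 'wb') as f_out:
#         shutil.copyfileobj(f_in, f_out)                           # write the uncompressed file
# ```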
# In[5]:


def read_label(tf_bytestring):
    label = tf.io.decode_raw(tf_bytestring, tf.uint8)
    label = tf.reshape(label, [])
    label = tf.one_hot(label, 10)
    return label

def read_image(tf_bytestring):
    image = tf.io.decode_raw(tf_bytestring, tf.uint8)
    image = tf.cast(image, tf.float32) / 255.0
    image = tf.reshape(image, [28*28])
    return image

def load_dataset(image_file, label_file):
    imagedataset = tf.data.FixedLengthRecordDataset(image_file, 28*28, header_bytes=16)
    imagedataset = imagedataset.map(read_image, num_parallel_calls=16)
    labelsdataset = tf.data.FixedLengthRecordDataset(label_file, 1, header_bytes=8)
    labelsdataset = labelsdataset.map(read_label, num_parallel_calls=16)
    dataset = tf.data.Dataset.zip((imagedataset, labelsdataset))
    return dataset

def get_training_dataset(image_file, label_file, batch_size):
    dataset = load_dataset(image_file, label_file)
    dataset = dataset.cache()  # this small dataset can be entirely cached in RAM
    dataset = dataset.shuffle(5000, reshuffle_each_iteration=True)
    dataset = dataset.repeat()  # mandatory for Keras for now
    dataset = dataset.batch(batch_size, drop_remainder=True)  # drop_remainder is important on TPU: batch size must be fixed
    dataset = dataset.prefetch(-1)  # fetch next batches while training on the current one (-1: autotune prefetch buffer size)
    return dataset

def get_validation_dataset(image_file, label_file):
    dataset = load_dataset(image_file, label_file)
    dataset = dataset.cache()  # this small dataset can be entirely cached in RAM
    dataset = dataset.batch(10000, drop_remainder=True)  # 10000 items in eval dataset, all in one batch
    dataset = dataset.repeat()  # mandatory for Keras for now
    return dataset

def load_label_dataset(label_file):
    labelsdataset = tf.data.FixedLengthRecordDataset(label_file, 1, header_bytes=8)
    labelsdataset = labelsdataset.map(read_label, num_parallel_calls=16)
    return labelsdataset

# instantiate the datasets
training_dataset = get_training_dataset(training_images_file, training_labels_file, BATCH_SIZE)
validation_dataset = get_validation_dataset(validation_images_file, validation_labels_file)

# extract ground-truth labels as NumPy arrays (used by ktrain below)
training_labels = np.vstack(list(load_label_dataset(training_labels_file).as_numpy_iterator()))
validation_labels = np.vstack(list(load_label_dataset(validation_labels_file).as_numpy_iterator()))
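# As an optional sanity check, you can pull a single batch from the training set and confirm that the
# tensor shapes match what the model below expects (a quick sketch; shapes assume the `BATCH_SIZE`
# defined above):
# 
# ```python
# x_batch, y_batch = next(iter(training_dataset))
# print(x_batch.shape)            # (BATCH_SIZE, 784): flattened 28x28 images
# print(y_batch.shape)            # (BATCH_SIZE, 10): one-hot labels
# print(validation_labels.shape)  # (10000, 10)
# ```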
# ## Build a Model

# In[6]:


# This model trains to 99.4% accuracy in 10 epochs (with a batch size of 64)
def make_model():
    model = tf.keras.Sequential(
      [
        tf.keras.layers.Reshape(input_shape=(28*28,), target_shape=(28, 28, 1), name="image"),

        tf.keras.layers.Conv2D(filters=12, kernel_size=3, padding='same', use_bias=False),  # no bias necessary before batch norm
        tf.keras.layers.BatchNormalization(scale=False, center=True),  # no batch norm scaling necessary before "relu"
        tf.keras.layers.Activation('relu'),  # activation after batch norm

        tf.keras.layers.Conv2D(filters=24, kernel_size=6, padding='same', use_bias=False, strides=2),
        tf.keras.layers.BatchNormalization(scale=False, center=True),
        tf.keras.layers.Activation('relu'),

        tf.keras.layers.Conv2D(filters=32, kernel_size=6, padding='same', use_bias=False, strides=2),
        tf.keras.layers.BatchNormalization(scale=False, center=True),
        tf.keras.layers.Activation('relu'),

        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(200, use_bias=False),
        tf.keras.layers.BatchNormalization(scale=False, center=True),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Dropout(0.4),  # dropout on the dense layer only
        tf.keras.layers.Dense(10, activation='softmax')
      ])

    model.compile(optimizer='adam',  # learning rate will be set by LearningRateScheduler
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

with strategy.scope():
    model = make_model()

# set up learning rate decay [FROM ORIGINAL EXAMPLE BUT NOT USED]
# NOT NEEDED: we will use ktrain to find an LR and decay the learning rate during training
LEARNING_RATE = 0.01
LEARNING_RATE_EXP_DECAY = 0.6 if strategy.num_replicas_in_sync == 1 else 0.7
lr_decay = tf.keras.callbacks.LearningRateScheduler(
    lambda epoch: LEARNING_RATE * LEARNING_RATE_EXP_DECAY**epoch, verbose=True)


# ## Use *ktrain* With Our Model and Data

# ### Wrap the `tf.data.Dataset` Objects in a `ktrain.TFDataset` and Create a `Learner`

# In[7]:


import ktrain

trn = ktrain.TFDataset(training_dataset, n=training_labels.shape[0], y=training_labels)
val = ktrain.TFDataset(validation_dataset, n=validation_labels.shape[0], y=validation_labels)
learner = ktrain.get_learner(model, train_data=trn, val_data=val)


# ### Find a Learning Rate

# In[8]:


learner.lr_find(show_plot=True)


# ## Train the Model Using a Cosine Annealing LR Schedule

# In[9]:


learner.fit(5e-3, 1, cycle_len=10, checkpoint_folder='/tmp/mymodel')


# In[10]:


# cosine-annealed LR schedule
learner.plot('lr')


# In[11]:


# training vs. validation loss
learner.plot('loss')


# ### Inspect the Model
# 
# #### Evaluate as Normal

# In[12]:


learner.model.evaluate(validation_dataset, steps=1)


# #### Validation Metrics

# In[14]:


learner.validate(class_names=list(map(str, range(10))))


# #### View Top Losses

# In[11]:


learner.view_top_losses(n=1)


# ### Making Predictions

# In[26]:


preds = learner.predict(val)
preds = np.argmax(preds, axis=1)
actual = learner.ground_truth(val)
actual = np.argmax(actual, axis=1)


# In[27]:


import pandas as pd
df = pd.DataFrame(list(zip(preds, actual)), columns=['Predicted', 'Actual'])
df.head()


# ## Save and Reload the Model

# In[13]:


learner.save_model('/tmp/my_tf_model')


# In[14]:


learner.load_model('/tmp/my_tf_model')


# In[15]:


learner.model.evaluate(validation_dataset, steps=1)


# In[ ]:
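# Because *ktrain* is only a lightweight wrapper, the reloaded `learner.model` is still an ordinary
# `tf.keras` model and can be used directly, outside of *ktrain*. For example (a minimal sketch;
# recall that the validation dataset was configured above to yield a single 10,000-example batch):
# 
# ```python
# x_val, y_val = next(iter(validation_dataset))
# probs = learner.model.predict(x_val)
# print('predicted:', probs.argmax(axis=1)[:10])
# print('actual:   ', y_val.numpy().argmax(axis=1)[:10])
# ```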