#!/usr/bin/env python
# coding: utf-8

# # Deep Learning in Python
# ## Session 02 - Keras Advanced Concepts
#
# - *Course*: Big Data and Language Technologies
# - *Date*: 11.04.2022
#
# This session covers a few more advanced concepts around Deep Learning in Python with Keras. We will build upon the ideas from the last session and learn how to customize the workflow in more detail. We will also learn how to solve some of the problems we faced during the last session.

# ## Setup

# In[1]:

import tensorflow as tf
import numpy as np

# ## Loading Data

# This time, we will simply use a wrapper provided by Keras to load the IMDB dataset that we explored in the last session. For reference, see the [API docs](https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/load_data).

# In[2]:

INDEX_FROM = 3
NUM_WORDS = 1000

(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(
    num_words=NUM_WORDS, index_from=INDEX_FROM)

# Note that this already provides us with a train-test split.
#
# The dataset is already built from word indices instead of word strings. For transforming text from and to indices using the word index, see [this example](https://www.tensorflow.org/api_docs/python/tf/keras/datasets/imdb/get_word_index#example).
#
# **Exercise**: Explore the first 3 samples of `X_train` by converting them back to strings. What is going wrong? Why?

# ### Naive solution

# In[3]:

word_index = tf.keras.datasets.imdb.get_word_index()
inverted_word_index = {v: k for k, v in word_index.items()}

for i in range(3):
    decoded_sequence = " ".join(inverted_word_index[ind] for ind in X_train[i])
    print("label:", ["negative", "positive"][y_train[i]])
    print(decoded_sequence)
    print()

# ### Better solution that accounts for the special tokens we implicitly defined while loading the dataset

# In[4]:

word_index = tf.keras.datasets.imdb.get_word_index()
# Shift all indices by INDEX_FROM and map the reserved indices to special tokens
# (the token names are placeholders: 0 is padding, 1 the start marker,
# 2 out-of-vocabulary words, 3 unused).
inverted_word_index = {v + INDEX_FROM: k for k, v in word_index.items()}
inverted_word_index |= {0: "<PAD>", 1: "<START>", 2: "<UNK>", 3: "<UNUSED>"}

for i in range(3):
    decoded_sequence = " ".join(inverted_word_index[ind] for ind in X_train[i])
    print("label:", ["negative", "positive"][y_train[i]])
    print(decoded_sequence)
    print()

# ## `tf.data.Dataset`
#
# Using `tf.data.Dataset`, we can represent very large datasets (this will become very important later in the semester). TensorFlow handles many of the features necessary for that internally.
#
# **Exercise**: Use `tf.data.Dataset.from_generator` ([docs](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator)) to convert our ndarray-based dataset to a `tf.data.Dataset`. Provide an `output_signature=(X, y)` (you will also have to make the generator yield this format).
#
# Using `tf.data.Dataset.from_tensor_slices` would be difficult here because the data is not padded yet (a sketch of this alternative follows at the end of this section).

# In[5]:

def gen(Xs, ys):
    # Yield (sequence, label) pairs one at a time.
    for X, y in zip(Xs, ys):
        yield (X, y)

output_signature = (tf.TensorSpec(shape=[None], dtype=tf.int32),
                    tf.TensorSpec(shape=[], dtype=tf.int32))

train_ds = tf.data.Dataset.from_generator(lambda: gen(X_train, y_train), output_signature=output_signature)
test_ds = tf.data.Dataset.from_generator(lambda: gen(X_test, y_test), output_signature=output_signature)

# Converting the data back to numpy is easy:

# In[6]:

next(train_ds.as_numpy_iterator())
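# For comparison with the `from_tensor_slices` remark above, here is a rough sketch (not part of the original exercise) of how the same data could be turned into a `tf.data.Dataset` by padding it to a fixed length up front. The `maxlen` of 200 and the post-padding/truncating are arbitrary choices for illustration; the generator-based approach above avoids committing to a fixed length this early.

# In[ ]:

# Hypothetical alternative: pad all reviews to 200 tokens, then slice the dense arrays.
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(
    X_train, maxlen=200, padding="post", truncating="post")
padded_train_ds = tf.data.Dataset.from_tensor_slices((X_train_padded, y_train))
print(next(padded_train_ds.as_numpy_iterator()))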
# ## Dataset persistence
#
# TensorFlow makes it quite easy to save and load a `tf.data.Dataset`.
#
# ### Using `tf.data.experimental.save` and `load`
#
# `tf.data.experimental.save` ([docs](https://tensorflow.google.cn/api_docs/python/tf/data/experimental/save)) and `load` ([docs](https://tensorflow.google.cn/api_docs/python/tf/data/experimental/load)) can be used to persist a Dataset to storage. This will create multiple files (shards).
#
# **Exercise**: Save our dataset to storage and load it back.

# In[7]:

tf.data.experimental.save(train_ds, path="imdb_train")
tf.data.experimental.save(test_ds, path="imdb_test")

train_ds = tf.data.experimental.load(path="imdb_train")
test_ds = tf.data.experimental.load(path="imdb_test")

# Let's test it again:

# In[8]:

next(train_ds.as_numpy_iterator())

# Note: The [TFRecord format](https://www.tensorflow.org/tutorials/load_data/tfrecord) is the traditional way to store serialized data, which might also save memory.

# ## `map` and `filter`
#
# Using `map` ([docs](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map)) and `filter` ([docs](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#filter)) on a `tf.data.Dataset` is very convenient, as the supplied functions are applied lazily, on demand.
#
# It is recommended to use the `tf.function` decorator ([docs](https://www.tensorflow.org/api_docs/python/tf/function)) to improve performance where possible.
#
# **Exercise**: From `train_ds`, filter out all reviews shorter than 100 tokens.

# In[9]:

@tf.function
def filter_func(X, y):
    # Keep only reviews with at least 100 tokens.
    return tf.shape(X)[0] >= 100

train_ds = train_ds.filter(filter_func)

# ### \* Bonus: `flat_map`
#
# `map` allows us to modify Dataset samples 1-to-1. If we want to split certain samples into a varying number of samples, we can use `flat_map` ([docs](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#flat_map)).
#
# **Exercise**: Use `flat_map` on `train_ds` to split long reviews into reviews of 100 tokens each.

# In[10]:

@tf.function
def map_func(X, y):
    # Split each review into as many full 100-token chunks as possible
    # (the remainder is dropped) and repeat the label once per chunk.
    size = tf.shape(X)[0]
    r = tf.range(0, (size // 100) * 100, 1)
    r = tf.reshape(r, [size // 100, 100])
    Xs = tf.gather(X, r)
    ys = tf.repeat(y, size // 100)
    return tf.data.Dataset.from_tensor_slices((Xs, ys))

train_ds = train_ds.flat_map(map_func)

# In[11]:

it = train_ds.as_numpy_iterator()
for i in range(3):
    print(next(it))

# ## Batch, shuffle, repeat
#
# To make our dataset usable for training, we need to batch it (split it up into batches), repeat it (so we can train for multiple epochs), and shuffle it (so the samples are not presented in the same order every epoch).
#
# In this task, you will learn that the order of these operations indeed matters!
#
# Let's create a dummy dataset:

# In[12]:

DUMMY_DS_SIZE = 30
dummy_ds = tf.data.Dataset.range(DUMMY_DS_SIZE)

DUMMY_BATCHSIZE = 10
DUMMY_BUFFERSIZE = 2 * 10

# **Exercise**: Roll the dice to determine the order in which you will implement shuffle, batch and repeat. Try to spot flaws in the results by inspecting 5 epochs.

# In[13]:

print(np.random.choice(["Shuffle, repeat, batch", "Repeat, shuffle, batch", "Batch, shuffle, repeat"]))

# In[14]:

new_dummy_ds = dummy_ds.shuffle(DUMMY_BUFFERSIZE).repeat().batch(DUMMY_BATCHSIZE)
for epoch in range(5):
    for batch in new_dummy_ds.take(DUMMY_DS_SIZE // DUMMY_BATCHSIZE).as_numpy_iterator():
        print(batch)
    print()

# Observation: Despite being a bit unintuitive (repeating after shuffling?!), this is indeed the correct order.
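# As a quick sanity check (a sketch, not part of the original exercise): with `shuffle().repeat().batch()`, one epoch's worth of batches should contain every element exactly once, just in a different order each time.

# In[ ]:

check_ds = dummy_ds.shuffle(DUMMY_BUFFERSIZE).repeat().batch(DUMMY_BATCHSIZE)
for epoch in range(3):
    # Collect one epoch's worth of batches and verify it is a permutation of 0..DUMMY_DS_SIZE-1.
    seen = np.concatenate(list(check_ds.take(DUMMY_DS_SIZE // DUMMY_BATCHSIZE).as_numpy_iterator()))
    print(np.array_equal(np.sort(seen), np.arange(DUMMY_DS_SIZE)))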
# In[15]:

new_dummy_ds = dummy_ds.repeat().shuffle(DUMMY_BUFFERSIZE).batch(DUMMY_BATCHSIZE)
for epoch in range(5):
    for batch in new_dummy_ds.take(DUMMY_DS_SIZE // DUMMY_BATCHSIZE).as_numpy_iterator():
        print(batch)
    print()

# Observation: Some samples occur multiple times within an epoch (and sometimes even within a batch), which means others are missed.

# In[16]:

new_dummy_ds = dummy_ds.batch(DUMMY_BATCHSIZE).shuffle(DUMMY_BUFFERSIZE).repeat()
for epoch in range(5):
    for batch in new_dummy_ds.take(DUMMY_DS_SIZE // DUMMY_BATCHSIZE).as_numpy_iterator():
        print(batch)
    print()

# Observation: This only shuffles whole batches, so `batch` should be called last.

# ### Applying what we found out

# **Exercise**: Shuffle, repeat and batch (using `padded_batch`) our `train_ds`.

# In[17]:

BATCHSIZE = 64
BUFFERSIZE = 2 * 64

train_ds = train_ds.shuffle(BUFFERSIZE).repeat().padded_batch(BATCHSIZE)

# ## Custom Layers
#
# Keras allows you to define custom layers. This is useful for:
# 1. Combining multiple pre-defined layers into a single custom layer
# 2. Defining the layer weights explicitly
# 3. Modifying gradients
#
# ### "Custom" dense layer
#
# **Exercise**: Re-implement a dense layer as a subclass of the `tf.keras.layers.Layer` class ([docs](https://keras.io/api/layers/base_layer/)).

# In[18]:

class CustomDenseLayer(tf.keras.layers.Layer):
    def __init__(self, units=32):
        super().__init__()
        self.units = units

    def build(self, input_shape):
        # Create the kernel and bias once the input dimensionality is known.
        w_init = tf.random_normal_initializer()
        self.w = tf.Variable(
            initial_value=w_init(shape=(input_shape[-1], self.units), dtype='float32'),
            trainable=True)
        b_init = tf.zeros_initializer()
        self.b = tf.Variable(
            initial_value=b_init(shape=(self.units,), dtype='float32'),
            trainable=True)

    def call(self, inputs):
        return tf.matmul(inputs, self.w) + self.b

# ### \* Bonus: "Custom" dropout layer
#
# **Exercise**: Re-implement a dropout layer as a subclass of the `Layer` class ([docs](https://keras.io/api/layers/base_layer/)).

# In[73]:

class CustomDropoutLayer(tf.keras.layers.Layer):
    def __init__(self, rate):
        super().__init__()
        self.rate = rate

    def call(self, inputs, training=True):
        #if training is None:
        #    training = tf.keras.backend.learning_phase()
        # Dropout is only applied during training.
        if not training:
            return inputs
        # Zero out each element with probability `rate` and rescale the
        # survivors by 1 / (1 - rate) (inverted dropout).
        random = tf.where(
            tf.random.uniform(tf.shape(inputs), minval=0, maxval=1, dtype=tf.float32) < self.rate,
            tf.zeros_like(inputs),
            tf.ones_like(inputs))
        return inputs * random / (1 - self.rate)
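# To close the loop, here is a minimal sketch of how the custom layers and the prepared `train_ds` could be wired together. The embedding size, layer sizes, the use of `GlobalAveragePooling1D`, and the 100 steps per epoch are arbitrary choices for illustration; because `train_ds` is repeated indefinitely, `model.fit` needs an explicit `steps_per_epoch`.

# In[ ]:

model = tf.keras.Sequential([
    # Map word indices to dense vectors and average them over the sequence.
    tf.keras.layers.Embedding(NUM_WORDS, 32),
    tf.keras.layers.GlobalAveragePooling1D(),
    # Our hand-rolled layers from above.
    CustomDenseLayer(16),
    tf.keras.layers.ReLU(),
    CustomDropoutLayer(0.5),
    CustomDenseLayer(1),
])

model.compile(optimizer="adam",
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # threshold=0.0 because the model outputs logits, not probabilities
              metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0)])

model.fit(train_ds, steps_per_epoch=100, epochs=2)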