import sys
import warnings

import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import ParameterGrid

sys.path.append('/home/jovyan/work/d2l_solutions/notebooks/exercises/d2l_utils/')
import d2l
from torchsummary import summary

warnings.filterwarnings("ignore")
class LSTM(d2l.RNN):
    def __init__(self, num_inputs, num_hiddens, num_layers=1, dropout=0):
        d2l.Module.__init__(self)
        self.save_hyperparameters()
        self.rnn = nn.LSTM(num_inputs, num_hiddens, num_layers=num_layers,
                           dropout=dropout)

    def forward(self, inputs, H_C=None):
        print(inputs.shape)  # debug: inspect the (num_steps, batch_size, vocab_size) input
        return self.rnn(inputs, H_C)
def stat_val(model, data):
    """Return the validation perplexity: exp of the mean cross-entropy loss."""
    losses = []
    for batch in iter(data.get_dataloader(False)):
        losses.append(model.validation_step(batch, plot_flag=False).detach().numpy())
    return np.exp(np.mean(losses))
def experiment(data_class=d2l.TimeMachine, num_steps=32, num_hiddens=32, lr=1):
    """Train an LSTM language model and return its validation perplexity."""
    data = data_class(batch_size=1024, num_steps=num_steps)
    lstm = LSTM(num_inputs=len(data.vocab), num_hiddens=num_hiddens)
    model = d2l.RNNLM(lstm, vocab_size=len(data.vocab), lr=lr)
    trainer = d2l.Trainer(max_epochs=100, gradient_clip_val=1)  # , num_gpus=1
    trainer.fit(model, data)
    return stat_val(model, data)
data = d2l.TimeMachine(batch_size=1024, num_steps=32)
lstm = LSTM(num_inputs=len(data.vocab), num_hiddens=32, num_layers=2)
model = d2l.RNNLM(lstm, vocab_size=len(data.vocab), lr=1)
trainer = d2l.Trainer(max_epochs=100, gradient_clip_val=1) #, num_gpus=1
trainer.fit(model, data)
(Output: the debug print in forward reports an input shape of torch.Size([32, 1024, 28]) at each call; the run was stopped manually, ending in a KeyboardInterrupt traceback during validation.)
param_grid = {'num_steps': [8, 16, 32, 64, 128],
              'num_hiddens': [8, 16, 32, 64, 128],
              'lr': [0.01, 0.1, 1, 10]}
param_grid_obj = ParameterGrid(param_grid)
ppls = []
for params in param_grid_obj:
    ppl = experiment(**params)
    ppls.append(ppl)
    print(params, ppl)
class WordTimeMachine(d2l.TimeMachine):
    # Tokenize at the word level instead of the default character level.
    def _tokenize(self, text):
        return text.split(' ')

experiment(data_class=WordTimeMachine)
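For reference, a small sketch (assuming the `vocab` attribute of `d2l.TimeMachine` used above; the exact counts depend on d2l's Vocab settings) comparing the vocabulary sizes produced by character-level and word-level tokenization. The word-level vocabulary is far larger, so the model's input and output layers grow accordingly:

char_data = d2l.TimeMachine(batch_size=1024, num_steps=32)
word_data = WordTimeMachine(batch_size=1024, num_steps=32)
print('char-level vocab size:', len(char_data.vocab))
print('word-level vocab size:', len(word_data.vocab))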
The hidden state of an LSTM cell is the output passed both to the next layer and to the next time step, so it needs a consistent, bounded range of values that downstream layers and cells can process easily. Applying tanh to the cell state keeps the hidden state between -1 and 1, a common range for activations in neural networks. Because tanh is nonlinear, it also adds expressiveness to the hidden state, helping the network learn more complex patterns. Its derivative is simple to compute and, since tanh is zero-centred with relatively large gradients around zero, it suffers less from vanishing gradients than the sigmoid.
Some sources suggest that the tanh on the hidden state is not strictly necessary and can be replaced by another function or even omitted, but whether that works may depend on the specific task and data being modelled. In general, tanh for the hidden state is a reasonable default that has been widely used and shown to work well in many applications.
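As a quick illustration (a minimal sketch with hand-picked gate values, not part of the exercise code above), the cell state can drift far outside [-1, 1] when the forget gate stays close to 1, while the hidden state h = o * tanh(c) remains bounded:

c = torch.zeros(1)
for t in range(20):
    i = torch.tensor([0.9])        # input gate, nearly open
    f = torch.tensor([0.95])       # forget gate, retains most of the old state
    o = torch.tensor([1.0])        # output gate, fully open
    c_tilde = torch.tensor([0.8])  # candidate cell state
    c = f * c + i * c_tilde        # cell state accumulates well beyond 1
    h = o * torch.tanh(c)          # hidden state stays in (-1, 1)
print(float(c), float(h))          # c is roughly 9.2, h is essentially 1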
class Data(d2l.DataModule):
    def __init__(self, batch_size=16, T=1000, num_train=600, tau=4, randn=0.2):
        self.save_hyperparameters()
        self.time = torch.arange(1, T + 1, dtype=torch.float32)
        self.x = torch.sin(0.01 * self.time) + torch.randn(T) * randn

    def get_dataloader(self, train):
        # Each example is a window of tau past values; the label is the next value.
        features = [self.x[i:self.T - self.tau + i] for i in range(self.tau)]
        self.features = torch.stack(features, 1)
        self.labels = self.x[self.tau:].reshape((-1, 1))
        i = slice(0, self.num_train) if train else slice(self.num_train, None)
        return self.get_tensorloader([self.features, self.labels], train, i)
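As a quick sanity check (a sketch; the exact shapes depend on T and tau), the features stack tau lagged copies of the series and the labels hold the next value, giving T - tau examples:

toy_data = Data(T=1000, tau=4)
toy_data.get_dataloader(train=True)   # builds toy_data.features and toy_data.labels
print(toy_data.features.shape)        # torch.Size([996, 4])
print(toy_data.labels.shape)          # torch.Size([996, 1])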
class RNNAutoRegression(d2l.LinearRegression):  #@save
    """An RNN-based autoregressive model implemented with high-level APIs."""
    def __init__(self, rnn, lr=0.01, tau=4, plot_flag=True, emb_len=8):
        super().__init__(lr=lr)
        self.save_hyperparameters()
        self.init_params()

    def init_params(self):
        # Project the RNN outputs down to a single predicted value.
        self.linear = nn.LazyLinear(1)

    def forward(self, X, state=None):
        rnn_outputs, _ = self.rnn(X, state)
        return self.linear(rnn_outputs)
tau = 4
data = Data(tau=tau)
lstm = LSTM(num_inputs=tau, num_hiddens=8)
model = RNNAutoRegression(rnn=lstm, lr=0.01)
trainer = d2l.Trainer(max_epochs=5)
trainer.fit(model, data)
onestep_preds = model(data.features).detach().numpy()
d2l.plot(data.time[data.tau:], [data.labels, onestep_preds], 'time', 'x',
legend=['labels', '1-step preds'], figsize=(6, 3))