#| include: false
! [ -e /content ] && pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

#| include: false
from fastbook import *
from fastai.text.all import *

path = untar_data(URLs.HUMAN_NUMBERS)

#| include: false
Path.BASE_PATH = path

path.ls()

lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
lines

text = ' . '.join([l.strip() for l in lines])
text[:100]

tokens = text.split(' ')
tokens[:10]

vocab = L(*tokens).unique()
vocab

word2idx = {w:i for i,w in enumerate(vocab)}
nums = L(word2idx[i] for i in tokens)
nums

# every three tokens as input, the following token as the target
L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))

seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))
seqs

bs = 64
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=bs, shuffle=False)

class LMModel1(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)

    def forward(self, x):
        # the same embedding, hidden, and output layers are reused for every position
        h = F.relu(self.h_h(self.i_h(x[:,0])))
        h = h + self.i_h(x[:,1])
        h = F.relu(self.h_h(h))
        h = h + self.i_h(x[:,2])
        h = F.relu(self.h_h(h))
        return self.h_o(h)

learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

# naive baseline: always predict the most common token in the validation set
n,counts = 0,torch.zeros(len(vocab))
for x,y in dls.valid:
    n += y.shape[0]
    for i in range_of(vocab): counts[i] += (y==i).long().sum()
idx = torch.argmax(counts)
idx, vocab[idx.item()], counts[idx].item()/n

class LMModel2(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)

    def forward(self, x):
        h = 0
        for i in range(3):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
        return self.h_o(h)

learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

class LMModel3(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0

    def forward(self, x):
        for i in range(3):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
        out = self.h_o(self.h)
        # truncated backpropagation through time: keep the value, drop the gradient history
        self.h = self.h.detach()
        return out

    def reset(self): self.h = 0

m = len(seqs)//bs
m,bs,len(seqs)

def group_chunks(ds, bs):
    # rearrange the dataset so that sample i of one batch is continued by sample i of the next batch
    m = len(ds) // bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i + m*j] for j in range(bs))
    return new_ds

cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False)

learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1]))
         for i in range(0,len(nums)-sl-1,sl))
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

[L(vocab[o] for o in s) for s in seqs[0]]
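As a quick sanity check (an added sketch, not part of the original notebook, using only the `seqs` and `sl` already defined above): each target sequence should be its input sequence shifted along by one token, since the targets start at index `i+1` while the inputs start at `i`.

xb, yb = seqs[0]
# the token to predict at position i is the input token that appears at position i+1
assert (xb[1:] == yb[:-1]).all()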
class LMModel4(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0

    def forward(self, x):
        outs = []
        for i in range(sl):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        # predict after every token: the output has shape (bs, sl, vocab_sz)
        return torch.stack(outs, dim=1)

    def reset(self): self.h = 0

def loss_func(inp, targ):
    # flatten the batch and sequence dimensions so standard cross entropy applies
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_func,
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

class LMModel5(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = torch.zeros(n_layers, bs, n_hidden)

    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(res)

    def reset(self): self.h.zero_()

learn = Learner(dls, LMModel5(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(),
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

class LSTMCell(Module):
    def __init__(self, ni, nh):
        self.forget_gate = nn.Linear(ni + nh, nh)
        self.input_gate  = nn.Linear(ni + nh, nh)
        self.cell_gate   = nn.Linear(ni + nh, nh)
        self.output_gate = nn.Linear(ni + nh, nh)

    def forward(self, input, state):
        h,c = state
        h = torch.cat([h, input], dim=1)
        forget = torch.sigmoid(self.forget_gate(h))
        c = c * forget
        inp = torch.sigmoid(self.input_gate(h))
        cell = torch.tanh(self.cell_gate(h))
        c = c + inp * cell
        out = torch.sigmoid(self.output_gate(h))
        h = out * torch.tanh(c)
        return h, (h,c)

# refactored version of the same cell
class LSTMCell(Module):
    def __init__(self, ni, nh):
        self.ih = nn.Linear(ni, 4*nh)
        self.hh = nn.Linear(nh, 4*nh)

    def forward(self, input, state):
        h,c = state
        # One big multiplication for all the gates is better than 4 smaller ones
        gates = (self.ih(input) + self.hh(h)).chunk(4, 1)
        ingate,forgetgate,outgate = map(torch.sigmoid, gates[:3])
        cellgate = gates[3].tanh()
        c = (forgetgate*c) + (ingate*cellgate)
        h = outgate * c.tanh()
        return h, (h,c)

t = torch.arange(0,10); t

t.chunk(2)

class LMModel6(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        res,h = self.rnn(self.i_h(x), self.h)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)

    def reset(self):
        for h in self.h: h.zero_()

learn = Learner(dls, LMModel6(len(vocab), 64, 2), loss_func=CrossEntropyLossFlat(),
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-2)

class Dropout(Module):
    def __init__(self, p): self.p = p

    def forward(self, x):
        if not self.training: return x
        mask = x.new(*x.shape).bernoulli_(1-self.p)
        # rescale the surviving activations so their expected value is unchanged
        return x * mask.div_(1-self.p)

class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight  # weight tying: output layer reuses the embedding weights
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        raw,h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        self.h = [h_.detach() for h_ in h]
        # also return the raw and dropped-out activations for the AR/TAR regularizers
        return self.h_o(out),raw,out

    def reset(self):
        for h in self.h: h.zero_()

learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                loss_func=CrossEntropyLossFlat(), metrics=accuracy,
                cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])

# TextLearner adds the ModelResetter and RNNRegularizer callbacks for us
learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.4),
                    loss_func=CrossEntropyLossFlat(), metrics=accuracy)

learn.fit_one_cycle(15, 1e-2, wd=0.1)
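One detail worth verifying (an added sketch, not part of the original notebook): the assignment `self.h_o.weight = self.i_h.weight` in `LMModel7` ties the output projection to the input embedding, so both layers share a single parameter tensor. `model7` below is just a throwaway instance used for the check.

model7 = LMModel7(len(vocab), 64, 2, 0.4)
# the Linear layer and the Embedding hold the very same Parameter object,
# so the optimizer updates one tensor that both layers read from
assert model7.h_o.weight is model7.i_h.weight
assert model7.h_o.weight.shape == (len(vocab), 64)

Because the two layers share one tensor, the tied model has fewer parameters than an untied one, which helps on a dataset this small.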