# Inspect the training data (~1.1 million characters)
with open('data/input.txt', 'r', encoding='utf-8') as f:
text = f.read()
print(f"length of dataset in characters: {len(text):,}", )
print(text[:50])
length of dataset in characters: 1,115,394
First Citizen: Before we proceed any further, hear
# Build the token vocabulary (GPT uses Byte Pair Encoding (BPE); here we tokenize at the character level)
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Unique Token : {''.join(chars)}\nVocab Size : {vocab_size}")
Unique Token : !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz Vocab Size : 65
# Encoding and decoding with integer indices
stoi = { ch:i for i,ch in enumerate(chars) } # token dict {token: index}
itos = { i:ch for i,ch in enumerate(chars) } # token dict {index: token}
encode = lambda s: [stoi[c] for c in s] # encoder: string -> integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: integers -> string
print(encode("hello world Python"))
print(decode(encode("hello world Python")))
[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42, 1, 28, 63, 58, 46, 53, 52]
hello world Python
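For contrast with the character-level tokenizer above, production GPTs use the Byte Pair Encoding mentioned earlier: a subword vocabulary that is much larger but produces much shorter sequences. A minimal sketch, assuming the tiktoken package is installed (pip install tiktoken):
import tiktoken  # OpenAI's BPE tokenizer (assumption: available in the environment)
enc = tiktoken.get_encoding("gpt2")
print(enc.n_vocab)                                    # 50257 subword tokens vs. our 65 characters
print(enc.encode("hello world Python"))               # far fewer integers than the character encoding above
print(enc.decode(enc.encode("hello world Python")))   # round-trips back to the original string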
# Check CUDA (GPU) availability
import torch
print(torch.cuda.is_available())
if torch.cuda.is_available(): dev = "cuda:0"
else: dev = "cpu"
device = torch.device(dev)
torch.zeros(2,4).to(device)
True
tensor([[0., 0., 0., 0.], [0., 0., 0., 0.]], device='cuda:0')
# Convert the text dataset into a torch.Tensor
# data[:100] : a preview of how the text looks to the GPT after encoding
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100])
torch.Size([1115394]) torch.int64 tensor([18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59])
# Split into training and validation data
# The first 90% becomes `train_data`
# The remaining 10% becomes `val_data`
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]
block_size = 8
train_data[:block_size+1]
tensor([18, 47, 56, 57, 58, 1, 15, 47, 58])
# x : the current chunk of tokens (inputs)
# y : the same chunk shifted by one position (the token that follows each input)
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
context = x[:t+1]
target = y[t]
print(f"when input is {context} the target: {target}")
when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58, 1]) the target: 15
when input is tensor([18, 47, 56, 57, 58, 1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58, 1, 15, 47]) the target: 58
# Reshape the data into batches that apply batch_size and block_size
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences to process in parallel
block_size = 8 # maximum context length for predictions
def get_batch(split_source: str):
    r"""Build `input` x `target` batches of the right shape.
    split_source : selects the data source, 'train' or validation."""
    data = train_data if split_source == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y
# Tensor dataset with batch_size and block_size applied (model input)
xb, yb = get_batch('train')
print(f'inputs {xb.shape} : {xb}')
print(f'targets {yb.shape} : {yb}')
inputs torch.Size([4, 8]) : tensor([[24, 43, 58, 5, 57, 1, 46, 43], [44, 53, 56, 1, 58, 46, 39, 58], [52, 58, 1, 58, 46, 39, 58, 1], [25, 17, 27, 10, 0, 21, 1, 54]])
targets torch.Size([4, 8]) : tensor([[43, 58, 5, 57, 1, 46, 43, 39], [53, 56, 1, 58, 46, 39, 58, 1], [58, 1, 58, 46, 39, 58, 1, 46], [17, 27, 10, 0, 21, 1, 54, 39]])
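The targets are simply the inputs shifted one position to the left; a quick check of that invariant, reusing xb and yb from above:
# yb[b, t] is the token that follows xb[b, t], so the two tensors overlap by block_size-1 columns
print(torch.equal(xb[:, 1:], yb[:, :-1]))  # expected: True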
# Walk through every (context, target) pair in the batch
for b in range(batch_size): # batch dimension
for t in range(block_size): # time dimension
context = xb[b, :t+1]
target = yb[b,t]
print(f"when input is {context.tolist()} the target: {target}")
when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53, 56, 1, 58, 46] the target: 39
when input is [44, 53, 56, 1, 58, 46, 39] the target: 58
when input is [44, 53, 56, 1, 58, 46, 39, 58] the target: 1
when input is [52] the target: 58
when input is [52, 58] the target: 1
when input is [52, 58, 1] the target: 58
when input is [52, 58, 1, 58] the target: 46
when input is [52, 58, 1, 58, 46] the target: 39
when input is [52, 58, 1, 58, 46, 39] the target: 58
when input is [52, 58, 1, 58, 46, 39, 58] the target: 1
when input is [52, 58, 1, 58, 46, 39, 58, 1] the target: 46
when input is [25] the target: 17
when input is [25, 17] the target: 27
when input is [25, 17, 27] the target: 10
when input is [25, 17, 27, 10] the target: 0
when input is [25, 17, 27, 10, 0] the target: 21
when input is [25, 17, 27, 10, 0, 21] the target: 1
when input is [25, 17, 27, 10, 0, 21, 1] the target: 54
when input is [25, 17, 27, 10, 0, 21, 1, 54] the target: 39
# The unit of data the Transformer will train on
print(xb)
tensor([[24, 43, 58, 5, 57, 1, 46, 43], [44, 53, 56, 1, 58, 46, 39, 58], [52, 58, 1, 58, 46, 39, 58, 1], [25, 17, 27, 10, 0, 21, 1, 54]])
# The `xb` data above is the input the model consumes during training
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        r"""Lookup table / embedding table (the tensor is used directly, no one-hot encoding)
        Lookup table : an index table of dense vectors learned during training
        Embedding table : fetches the token vectors needed for training straight from the lookup table"""
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        r"""Forward-pass graph
        B : batch size
        T : sequence length (time)
        C : dimensionality of the embedding (vocab_size for this bigram model)
        loss : cross entropy between logits and targets
        logits : unnormalized real-valued scores, turned into probabilities later by softmax"""
        logits = self.token_embedding_table(idx) # (B,T,C)
        # when targets is None (e.g. during generation) there is no loss to compute
        if targets is None:
            loss = None
        # otherwise reshape logits and targets from (B,T,C) so cross entropy can consume them
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # predicted scores
            targets = targets.view(B*T)  # ground-truth tokens
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        r"""Autoregressive sampling: extend idx by max_new_tokens tokens."""
        # idx is a (B, T) array of token indices in the current context
        for _ in range(max_new_tokens):
            logits, loss = self(idx) # get the predictions
            # keep only the prediction for the last position in the sequence
            # logits : torch.Size([1, 1~100, 65]) => torch.Size([1, 65])
            logits = logits[:, -1, :] # becomes (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C) : normalize the scores into probabilities
            # .multinomial : samples from a multinomial distribution
            #   params : a tensor of weights (need not be normalized), number of samples
            #   return : the indices of the sampled values
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx
# torch.zeros((1, 1), dtype=torch.long) : [[0]]
model = BigramLanguageModel(vocab_size)
logits, loss = model(xb, yb)
print(f"Logits Shape : {logits.shape}\nLoss : {loss}")
print(decode(model.generate(torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))
Logits Shape : torch.Size([32, 65])
Loss : 4.878634929656982
SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp wnYWmnxKWWev-tDqXErVKLgJ
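A quick sanity check on that initial loss: a model that guessed uniformly over the 65-character vocabulary would score a cross entropy of -ln(1/65) ≈ 4.17, so ~4.88 means the random initialization is only slightly worse than uniform. A minimal check:
import math
# cross entropy of a uniform distribution over 65 tokens
print(-math.log(1/65))  # ≈ 4.174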
# Optimization loop in PyTorch
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
batch_size = 32
for steps in range(100): # training loop
    xb, yb = get_batch('train') # sample a batch of training data
    logits, loss = model(xb, yb) # evaluate the loss
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(f"Loss : {loss.item()}")
print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0].tolist()))
Loss : 4.65630578994751 oTo.JUZ!!zqe! xBP qbs$Gy'AcOmrLwwt p$x;Seh-onQbfM?OjKbn'NwUAW -Np3fkz$FVwAUEa-wzWC -wQo-R!v -Mj?,SPiTyZ;o-opr$mOiPJEYD-CfigkzD3p3?zvS;ADz;.y?o,ivCuC'zqHxcVT cHA rT'Fd,SBMZyOslg!NXeF$sBe,juUzLq?w-wzP-h ERjjxlgJzPbHxf$ q,q,KCDCU fqBOQT SV&CW:xSVwZv'DG'NSPypDhKStKzC -$hslxIVzoivnp ,ethA:NCCGoi tN!ljjP3fwJMwNelgUzzPGJlgihJ!d?q.d pSPYgCuCJrIFtb jQXg pA.P LP,SPJi DBcuBM:CixjJ$Jzkq,OLf3KLQLMGph$O 3DfiPHnXKuHMlyjxEiyZib3FaHV-oJa!zoc'XSP :CKGUhd?lgCOF$;;DTHZMlvvcmZAm;:iv'MMgO&Ywbc;BLCUd&vZINLIzkuTGZa D.?
# The mathematical trick in self-attention
# toy example illustrating how matrix multiplication can be used for a "weighted aggregation"
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3)) # keep the lower triangle (including the diagonal), zero out the rest
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print(f'a={a}\nb={b}\nc={c}')
a=tensor([[1.0000, 0.0000, 0.0000], [0.5000, 0.5000, 0.0000], [0.3333, 0.3333, 0.3333]])
b=tensor([[2., 7.], [6., 4.], [6., 5.]])
c=tensor([[2.0000, 7.0000], [4.0000, 5.5000], [4.6667, 5.3333]])
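Each row of c is a running average of the rows of b: the second row is 0.5·[2, 7] + 0.5·[6, 4] = [4.0, 5.5], and the third row averages all three rows of b, giving [4.6667, 5.3333]. The lower-triangular, row-normalized matrix a is what makes each output position depend only on the current and earlier rows of b.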
# consider the following toy example:
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape
torch.Size([4, 8, 2])
# We want
# :: x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
for t in range(T):
xprev = x[b,:t+1] # (t,C)
xbow[b,t] = torch.mean(xprev, 0)
# version 2: using matrix multiply for a weighted aggregation
weight = torch.tril(torch.ones(T, T))
weight = weight / weight.sum(1, keepdim=True)
xbow2 = weight @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2) # .allclose : checks that two tensors are element-wise equal within a tolerance
True
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
weight = torch.zeros((T,T))
weight = weight.masked_fill(tril == 0, float('-inf'))
weight = F.softmax(weight, dim=-1)
xbow3 = weight @ x
torch.allclose(xbow, xbow3)
True
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)
# let's see a single Head perform self-attention
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x) # (B, T, 16)
q = query(x) # (B, T, 16)
weight = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)
tril = torch.tril(torch.ones(T, T))
# weight = torch.zeros((T,T))
weight = weight.masked_fill(tril == 0, float('-inf'))
weight = F.softmax(weight, dim=-1)
v = value(x)
out = weight @ v # out = weight @ x
out.shape
torch.Size([4, 8, 16])
weight[0]
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000], [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000], [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000], [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000], [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]], grad_fn=<SelectBackward0>)
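Each row of this attention matrix is a probability distribution over the positions that token is allowed to attend to, so every row sums to 1; a quick check, reusing `weight` from above:
print(weight[0].sum(dim=-1))  # expected: a length-T tensor of ones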
Notes:
- In an "encoder" attention block, just delete the single line that does the masking with tril, allowing all tokens to communicate. The block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "Scaled" attention additionally divides wei by 1/sqrt(head_size). This makes it so that when the inputs Q, K are unit variance, wei will be unit variance too, and Softmax will stay diffuse and not saturate too much. Illustration below:
k = torch.randn(B, T, head_size)
q = torch.randn(B, T, head_size)
weight = q @ k.transpose(-2, -1) * head_size**-0.5
k.var()
tensor(1.0449)
q.var()
tensor(1.0700)
weight.var()
tensor(1.0918)
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5]), dim=-1)
tensor([0.1925, 0.1426, 0.2351, 0.1426, 0.2872])
torch.softmax(torch.tensor([0.1, -0.2, 0.3, -0.2, 0.5])*8, dim=-1) # gets too peaky, converges to one-hot
tensor([0.0326, 0.0030, 0.1615, 0.0030, 0.8000])
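Without the 1/sqrt(head_size) factor, the variance of the raw scores grows roughly in proportion to head_size, which pushes softmax toward the peaky regime shown just above; a minimal check reusing `k` and `q` from the illustration:
weight_unscaled = q @ k.transpose(-2, -1)  # same scores without the 1/sqrt(head_size) scaling
print(weight_unscaled.var())               # on the order of head_size (~16), versus ~1 when scaled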
class LayerNorm1d: # (used to be BatchNorm1d)

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass
        xmean = x.mean(1, keepdim=True) # per-example mean over the feature dimension
        xvar = x.var(1, keepdim=True) # per-example variance over the feature dimension
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]
torch.manual_seed(1337)
module = LayerNorm1d(100)
x = torch.randn(32, 100) # batch size 32 of 100-dimensional vectors
x = module(x)
x.shape
torch.Size([32, 100])
# mean,std of one feature across all batch inputs
x[:,0].mean(), x[:,0].std()
(tensor(0.1469), tensor(0.8803))
# mean,std of a single input from the batch, of its features
x[0,:].mean(), x[0,:].std()
(tensor(-9.5367e-09), tensor(1.0000))
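The final model below uses the built-in nn.LayerNorm instead of this hand-rolled class; a minimal sketch of the correspondence (note the class above uses the unbiased variance while nn.LayerNorm uses the biased one, so the outputs agree only approximately):
ln = nn.LayerNorm(100)        # weight initialized to ones, bias to zeros, like gamma/beta above
x_ref = torch.randn(32, 100)  # hypothetical batch for the comparison
print((module(x_ref) - ln(x_ref)).abs().max())  # small difference, but not exactly zero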
# French to English translation example:
# <--------- ENCODE ------------------><--------------- DECODE ----------------->
# les réseaux de neurones sont géniaux! <START> neural networks are awesome!<END>
Full finished code, for reference. You may want to refer directly to the git repo instead though.
import torch
import torch.nn as nn
from torch.nn import functional as F
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# ------------
torch.manual_seed(1337)
with open('data/input.txt', 'r', encoding='utf-8') as f:
text = f.read()
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]
# data loading
def get_batch(split):
# generate a small batch of data of inputs x and targets y
data = train_data if split == 'train' else val_data
ix = torch.randint(len(data) - block_size, (batch_size,))
x = torch.stack([data[i:i+block_size] for i in ix])
y = torch.stack([data[i+1:i+block_size+1] for i in ix])
x, y = x.to(device), y.to(device)
return x, y
@torch.no_grad()
def estimate_loss():
out = {}
model.eval()
for split in ['train', 'val']:
losses = torch.zeros(eval_iters)
for k in range(eval_iters):
X, Y = get_batch(split)
logits, loss = model(X, Y)
losses[k] = loss.item()
out[split] = losses.mean()
model.train()
return out
class Head(nn.Module):
""" one head of self-attention """
def __init__(self, head_size):
super().__init__()
self.key = nn.Linear(n_embd, head_size, bias=False)
self.query = nn.Linear(n_embd, head_size, bias=False)
self.value = nn.Linear(n_embd, head_size, bias=False)
self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
self.dropout = nn.Dropout(dropout)
def forward(self, x):
B,T,C = x.shape
k = self.key(x) # (B,T,C)
q = self.query(x) # (B,T,C)
# compute attention scores ("affinities")
wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
wei = F.softmax(wei, dim=-1) # (B, T, T)
wei = self.dropout(wei)
# perform the weighted aggregation of the values
v = self.value(x) # (B,T,C)
out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
return out
class MultiHeadAttention(nn.Module):
""" multiple heads of self-attention in parallel """
def __init__(self, num_heads, head_size):
super().__init__()
self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
self.proj = nn.Linear(n_embd, n_embd)
self.dropout = nn.Dropout(dropout)
def forward(self, x):
out = torch.cat([h(x) for h in self.heads], dim=-1)
out = self.dropout(self.proj(out))
return out
class FeedFoward(nn.Module):
""" a simple linear layer followed by a non-linearity """
def __init__(self, n_embd):
super().__init__()
self.net = nn.Sequential(
nn.Linear(n_embd, 4 * n_embd),
nn.ReLU(),
nn.Linear(4 * n_embd, n_embd),
nn.Dropout(dropout),
)
def forward(self, x):
return self.net(x)
class Block(nn.Module):
""" Transformer block: communication followed by computation """
def __init__(self, n_embd, n_head):
# n_embd: embedding dimension, n_head: the number of heads we'd like
super().__init__()
head_size = n_embd // n_head
self.sa = MultiHeadAttention(n_head, head_size)
self.ffwd = FeedFoward(n_embd)
self.ln1 = nn.LayerNorm(n_embd)
self.ln2 = nn.LayerNorm(n_embd)
def forward(self, x):
x = x + self.sa(self.ln1(x))
x = x + self.ffwd(self.ln2(x))
return x
# super simple bigram model
class BigramLanguageModel(nn.Module):
def __init__(self):
super().__init__()
# each token directly reads off the logits for the next token from a lookup table
self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
self.position_embedding_table = nn.Embedding(block_size, n_embd)
self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
self.ln_f = nn.LayerNorm(n_embd) # final layer norm
self.lm_head = nn.Linear(n_embd, vocab_size)
def forward(self, idx, targets=None):
B, T = idx.shape
# idx and targets are both (B,T) tensor of integers
tok_emb = self.token_embedding_table(idx) # (B,T,C)
pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
x = tok_emb + pos_emb # (B,T,C)
x = self.blocks(x) # (B,T,C)
x = self.ln_f(x) # (B,T,C)
logits = self.lm_head(x) # (B,T,vocab_size)
if targets is None:
loss = None
else:
B, T, C = logits.shape
logits = logits.view(B*T, C)
targets = targets.view(B*T)
loss = F.cross_entropy(logits, targets)
return logits, loss
def generate(self, idx, max_new_tokens):
# idx is (B, T) array of indices in the current context
for _ in range(max_new_tokens):
# crop idx to the last block_size tokens
idx_cond = idx[:, -block_size:]
# get the predictions
logits, loss = self(idx_cond)
# focus only on the last time step
logits = logits[:, -1, :] # becomes (B, C)
# apply softmax to get probabilities
probs = F.softmax(logits, dim=-1) # (B, C)
# sample from the distribution
idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
# append sampled index to the running sequence
idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
return idx
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for iter in range(max_iters):
# every once in a while evaluate the loss on train and val sets
if iter % eval_interval == 0 or iter == max_iters - 1:
losses = estimate_loss()
print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
# sample a batch of data
xb, yb = get_batch('train')
# evaluate the loss
logits, loss = model(xb, yb)
optimizer.zero_grad(set_to_none=True)
loss.backward()
optimizer.step()
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))
0.209729 M parameters step 0: train loss 4.4116, val loss 4.4022 step 500: train loss 2.3140, val loss 2.3142 step 1000: train loss 2.1010, val loss 2.1277 step 1500: train loss 1.9655, val loss 2.0312 step 2000: train loss 1.8810, val loss 1.9706 step 2500: train loss 1.8207, val loss 1.9455 step 3000: train loss 1.7736, val loss 1.9146 step 3500: train loss 1.7465, val loss 1.8915 step 4000: train loss 1.7220, val loss 1.8657 step 4500: train loss 1.6937, val loss 1.8466 step 4999: train loss 1.6669, val loss 1.8288 And the Romioc'd and is not the air. MENENIUS: Say thy let latands: and his usquit them hedsland, ever crome. Whalt womzolour Yours, to fignicior milend lincees is ensengmin; Stirmail ov the do; Wall may is wand! All indience. His me nevery at princess, why. Here, she radoness the hopery would that To Willond do evicks the most rived in him he poor ower; the day of thrusban son; igres my montey. MENENIUS: Marry, I have a fair and tooble, Warwas Warward for hith couragaranny it-bus Infort carey, nevers, being. And you haste mesbragant, good noble the men. CORIOLANUS: What thy well or Of my counfends should sold. And as he laceing thouged Perberenors, and pass Leords in meeding. GLOUDERGARETET: Rigares: What's, shech nower, it duch in, soxet as and bawaret against uncesta k sleved unlancond the walk not a veartch again, And I shall pairst in now thy your tark? CALEOLILLord, The shall all mytanty what that thy must perverelike, on so han thy abadelwatch thus cout in farulty, know our her and bruny hears, Well Henring. Shen the vart, sping itake, and that had his well: For we minds and end byingin in soverignt: How wown, acconstor Cagaiding menIners. FLORIZE: Was you comented knight lament tell tend sirges. Cangitor have even I discoves sticly's her where! That brack! And fortucteous beake on of throught? Second Narge: A faill. LARIO: Hor doth head your counts begragey of vonzing Ritome, Whath boele his made him nevant om Threer, Ay, deleveget: as yeed these arewholer, every sleal on you; The plower sages spairess thouka-latecter: On! thene of whereford That shold breeposced being him knew you No have it 't! Where madrishn. Prower-was I was that becourmah, Or morastion; hereives shall blay not, That be flower for thy banicry,-- JULIO: Art, livess make here two peresect as Talk'd and the corrown'd of islift he punservoudgo hold cause; 't: And fannury, thy poor and now wonsent, But hathose a fath, Jury Ricarriol: Inkin shown'd for a hands his our had nide! MENE