from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
text = "Tokenization is surprisingly important!"
# See the subword tokens (strings)
tokens = tokenizer.tokenize(text)
print(tokens)  # ['Token', 'ization', 'Ġis', 'Ġsurprisingly', 'Ġimportant', '!']
# Ġ = leading space → byte-level BPE encodes spaces as part of tokens
# Encode: text → integer IDs
ids = tokenizer.encode(text)
print(ids) # [30642, 1634, 318, 11242, 1593, 0]
# Full tokenization (returns tensors + attention mask for batching)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS so padding works
inputs = tokenizer(text, return_tensors="pt", padding=True)
print(inputs.input_ids) # Token IDs as tensor
print(inputs.attention_mask) # 1 = real token, 0 = padding
# Decode: IDs → text (lossless roundtrip)
print(tokenizer.decode(ids)) # "Tokenization is surprisingly important!"
print(f"Vocabulary size: {len(tokenizer)}") # 50257from transformers import AutoTokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Numbers are tokenized inconsistently
for n in ["127", "128", "129", "1000", "10000"]:
    toks = tokenizer.tokenize(n)
    print(f"{n:>6} → {toks} ({len(toks)} tokens)")
# 127 → ['127'] (1 token)
# 128 → ['128'] (1 token)
# 129 → ['12', '9'] (2 tokens!) → different structure
# 1000 → ['1000'] (1 token)
# 10000 → ['100', '00'] (2 tokens)
# Multilingual cost inequality
en = "The cat sat on the mat."
zh = "猫坐在垫子上。"  # Same meaning in Chinese
print(f"English: {len(tokenizer.encode(en))} tokens")
print(f"Chinese: {len(tokenizer.encode(zh))} tokens")  # 2-3× more!
# These artifacts explain many real model failures
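# Follow-up sketch (added illustration; the example sentences are the ones above):
# a rough tokens-per-character ratio makes the cost inequality concrete, since the
# same tokenizer spends noticeably more tokens per character on non-English text.
for lang, sentence in [("English", en), ("Chinese", zh)]:
    n_tok = len(tokenizer.encode(sentence))
    print(f"{lang}: {n_tok} tokens / {len(sentence)} chars = {n_tok / len(sentence):.2f} tokens per char")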
import torch
import torch.nn as nn
class InputPipeline(nn.Module):
    """Token + positional embeddings for a GPT-style model.
    GPT-OSS replaces pos_embed with RoPE."""
    def __init__(self, vocab_size=200000, max_seq_len=4096, embed_dim=4096):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(max_seq_len, embed_dim)  # → RoPE later
        self.dropout = nn.Dropout(0.1)

    def forward(self, token_ids):  # (batch_size, seq_len)
        B, T = token_ids.shape
        tok_emb = self.token_embed(token_ids)  # (B, T, embed_dim)
        positions = torch.arange(T, device=token_ids.device)  # [0, 1, ..., T-1]
        pos_emb = self.pos_embed(positions)  # (T, embed_dim)
        x = self.dropout(tok_emb + pos_emb)  # (B, T, embed_dim)
        return x  # → feeds into the first transformer layer
# Parameter count
pipeline = InputPipeline()
print(f"Embedding parameters: {sum(p.numel() for p in pipeline.parameters()):,}")
# ~836M parameters just for embeddings + positions!
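# Where that number comes from (arithmetic check, added for clarity):
#   token embeddings: vocab_size  * embed_dim = 200_000 * 4_096 = 819,200,000
#   position table:   max_seq_len * embed_dim =   4_096 * 4_096 =  16,777,216
expected = 200_000 * 4_096 + 4_096 * 4_096
print(f"Expected: {expected:,}")  # 835,977,216, matching the count above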
import torch
import torch.nn as nn
import torch.nn.functional as F
# After the transformer layers, we have hidden states
# hidden: (batch, seq_len, embed_dim)
# (hidden, token_ids, embed_dim and vocab_size are assumed to be defined already,
#  e.g. by the InputPipeline + transformer stack above)
# --- LM HEAD: Project to vocabulary ---
lm_head = nn.Linear(embed_dim, vocab_size, bias=False)
logits = lm_head(hidden) # (batch, seq_len, vocab_size)
# --- TRAINING: Compute loss ---
# Position t's logits predict token at position t+1, so we shift:
shift_logits = logits[:, :-1, :].contiguous() # (batch, T-1, vocab_size)
shift_labels = token_ids[:, 1:].contiguous() # (batch, T-1)
loss = F.cross_entropy(
    shift_logits.view(-1, vocab_size),  # Flatten to (batch*(T-1), vocab_size)
    shift_labels.view(-1)               # Flatten to (batch*(T-1),)
)  # Scalar → this is what we backpropagate!
# --- INFERENCE: Generate next token ---
next_logits = logits[:, -1, :] # (batch, vocab_size)
scaled = next_logits / 0.7 # Temperature scaling
probs = F.softmax(scaled, dim=-1) # Probability distribution
next_token = torch.multinomial(probs, 1)  # Sample one token
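# Optional sketch (added aside; sample_top_k is a hypothetical helper, not from the
# original): top-k sampling keeps only the k most likely tokens before sampling,
# a common companion to the temperature scaling shown above.
def sample_top_k(next_logits, k=50, temperature=0.7):
    # next_logits: (batch, vocab_size), the final position's logits
    topk_vals, topk_idx = torch.topk(next_logits / temperature, k, dim=-1)
    probs_k = F.softmax(topk_vals, dim=-1)  # renormalize over the k survivors
    choice = torch.multinomial(probs_k, 1)  # (batch, 1) index into the top-k
    return topk_idx.gather(-1, choice)      # map back to vocabulary IDs

demo_logits = torch.randn(2, 50257)         # stand-in logits for a batch of 2
print(sample_top_k(demo_logits).shape)      # torch.Size([2, 1])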
import torch
import torch.nn.functional as F
import math
def compute_loss_and_perplexity(logits, labels):
    """Standard language modeling loss computation."""
    # logits: (batch, seq_len, vocab_size)
    # labels: (batch, seq_len) → the true next tokens
    loss = F.cross_entropy(
        logits.view(-1, logits.size(-1)),  # Flatten: (batch*seq_len, vocab_size)
        labels.view(-1),                   # Flatten: (batch*seq_len,)
        ignore_index=-100                  # Ignore padding positions
    )
    perplexity = math.exp(loss.item())  # PPL = e^(cross-entropy)
    return loss, perplexity
# Example: inside the training loop (model, optimizer and dataloader defined elsewhere)
for batch in dataloader:
    token_ids = batch["input_ids"]     # (batch, seq_len)
    logits = model(token_ids[:, :-1])  # Predict from all but last
    labels = token_ids[:, 1:]          # True next tokens
    loss, ppl = compute_loss_and_perplexity(logits, labels)
    print(f"Loss: {loss.item():.4f}, Perplexity: {ppl:.2f}")
    loss.backward()  # Backpropagate!
    optimizer.step()
    optimizer.zero_grad()
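# Sanity check worth keeping in mind (added aside): an untrained model is roughly
# uniform over the vocabulary, so the loss should start near ln(vocab_size) and
# the perplexity near vocab_size itself.
vocab_size = 200_000
print(f"Untrained loss ≈ ln({vocab_size:,}) ≈ {math.log(vocab_size):.2f}")  # ≈ 12.21
print(f"Untrained perplexity ≈ {vocab_size:,}")                             # PPL = e^loss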
import torch
import torch.nn as nn
import torch.nn.functional as F
class GPTModel(nn.Module):
    def __init__(self, vocab_size=200000, max_seq_len=4096,
                 embed_dim=4096, n_layers=32, n_heads=32):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(max_seq_len, embed_dim)  # Will → RoPE
        self.dropout = nn.Dropout(0.1)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, n_heads)  # From last lecture!
            for _ in range(n_layers)
        ])
        self.norm = nn.LayerNorm(embed_dim)  # Will → RMSNorm
        self.lm_head = nn.Linear(embed_dim, vocab_size, bias=False)
        self.lm_head.weight = self.token_embed.weight  # Weight tying!

    def forward(self, token_ids, targets=None):
        B, T = token_ids.shape
        tok_emb = self.token_embed(token_ids)
        pos_emb = self.pos_embed(torch.arange(T, device=token_ids.device))
        x = self.dropout(tok_emb + pos_emb)
        for layer in self.layers:
            x = layer(x)  # N transformer blocks
        x = self.norm(x)  # Final normalization
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)), targets.view(-1)
            )
        return logits, loss
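# Quick shape check (added illustration): TransformerBlock comes from the previous
# lecture; the stand-in below is NOT the real block, it only lets a tiny GPTModel
# run end-to-end so the logits/loss shapes can be inspected.
class TransformerBlock(nn.Module):  # placeholder for the shape check only
    def __init__(self, embed_dim, n_heads):  # n_heads unused in this placeholder
        super().__init__()
        self.ff = nn.Linear(embed_dim, embed_dim)
    def forward(self, x):
        return x + self.ff(x)

tiny = GPTModel(vocab_size=1000, max_seq_len=64, embed_dim=32, n_layers=2, n_heads=4)
ids = torch.randint(0, 1000, (2, 16))  # (batch=2, seq_len=16)
logits, loss = tiny(ids, targets=ids)  # targets here only exercise the loss path
print(logits.shape, loss.item())       # torch.Size([2, 16, 1000]) and a scalar loss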
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
# --- Step 1: Tokenize ---
text = "The future of AI is"
inputs = tokenizer(text, return_tensors="pt")
print(f"Tokens: {tokenizer.tokenize(text)}") # ['The', 'Δ future', 'Δ of', 'Δ AI', 'Δ is']
print(f"IDs: {inputs.input_ids}") # tensor([[464, 2003, 286, 9552, 318]])
# --- Step 2: Forward pass → inspect logits ---
with torch.no_grad():
    logits = model(**inputs).logits  # (1, 5, 50257)
# See what the model predicts after each token
for i in range(logits.shape[1]):
    prefix = tokenizer.decode(inputs.input_ids[0, :i+1])
    top_pred = tokenizer.decode(logits[0, i].argmax())
    print(f" After '{prefix}' → predicts '{top_pred}'")
# --- Step 3: Generate ---
out = model.generate(inputs.input_ids, max_new_tokens=30, temperature=0.7, top_p=0.95,
                     do_sample=True)
print(f"\nGenerated: {tokenizer.decode(out[0])}")from tokenizers import Tokenizer, models, trainers, pre_tokenizers
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
# Step 1 of building GPT: train your own tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
trainer = trainers.BpeTrainer(
    vocab_size=50000,                      # Target vocabulary size
    special_tokens=["<|endoftext|>",       # End of sequence
                    "<|pad|>",             # Padding
                    "<|begin_of_text|>"],  # Beginning of sequence
    min_frequency=2,                       # Minimum pair frequency to merge
)
# Train on your corpus → this learns the merge rules
tokenizer.train(files=["corpus_part1.txt", "corpus_part2.txt"], trainer=trainer)
tokenizer.save("gpt-oss-tokenizer.json")
# Test it
output = tokenizer.encode("Hello, GPT-OSS!")
print(f"Tokens: {output.tokens}")
print(f"IDs: {output.ids}")
# This tokenizer is trained ONCE, then used for ALL model training and inference
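# Possible next step (sketch, assuming the transformers library is also available):
# wrap the trained tokenizer so it exposes the same API as the GPT-2 examples above.
from transformers import PreTrainedTokenizerFast

hf_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="gpt-oss-tokenizer.json",
    eos_token="<|endoftext|>",
    pad_token="<|pad|>",
    bos_token="<|begin_of_text|>",
)
print(hf_tokenizer("Hello, GPT-OSS!", return_tensors="pt").input_ids)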