import torch
import torch.nn as nn
import torch.nn.functional as F
def self_attention(x, W_q, W_k, W_v):
    """Single-head self-attention. x: (batch, seq_len, d_model)"""
    Q = x @ W_q  # Queries
    K = x @ W_k  # Keys
    V = x @ W_v  # Values
    d_k = K.shape[-1]
    scores = (Q @ K.transpose(-2, -1)) / (d_k ** 0.5)  # Scaled dot-product scores
    attn_weights = F.softmax(scores, dim=-1)           # Normalize over key positions
    output = attn_weights @ V                          # Weighted sum of values
    return output, attn_weights
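
A quick usage sketch (the sizes and random weights below are illustrative, not from the original):

batch, seq_len, d_model = 2, 5, 64          # arbitrary example sizes
x = torch.randn(batch, seq_len, d_model)
W_q = torch.randn(d_model, d_model)
W_k = torch.randn(d_model, d_model)
W_v = torch.randn(d_model, d_model)
out, weights = self_attention(x, W_q, W_k, W_v)
# out: (2, 5, 64); weights: (2, 5, 5), each row sums to 1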

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def forward(self, x):
        B, L, D = x.shape
        # Project and reshape to (B, num_heads, L, d_k)
        Q = self.W_q(x).view(B, L, self.num_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(B, L, self.num_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(B, L, self.num_heads, self.d_k).transpose(1, 2)
        # Scaled dot-product attention per head
        scores = (Q @ K.transpose(-2, -1)) / (self.d_k ** 0.5)
        attn = F.softmax(scores, dim=-1)
        # Concatenate heads and apply the output projection
        out = (attn @ V).transpose(1, 2).reshape(B, L, D)
        return self.W_o(out)
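
A shape check for the from-scratch module (illustrative sizes):

mha_scratch = MultiHeadAttention(d_model=512, num_heads=8)
x = torch.randn(2, 10, 512)      # batch=2, seq=10
y = mha_scratch(x)               # (2, 10, 512): per-head outputs concatenated, then projected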
# Create the layer
mha = nn.MultiheadAttention(
embed_dim=512, # Model dimension
num_heads=8, # Number of attention heads
dropout=0.1, # Dropout rate
batch_first=True # Input shape: (batch, seq, dim)
)
# Forward pass (self-attention: Q=K=V)
x = torch.randn(2, 10, 512) # batch=2, seq=10
output, attn_weights = mha(x, x, x)
# output: (2, 10, 512)
# attn_weights: (2, 10, 10)# FROM SCRATCH (what you implement in assignments)
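
If sequences in the batch are padded, the built-in layer also accepts a key_padding_mask so attention ignores pad positions (a sketch; the lengths here are made up):

lengths = torch.tensor([10, 7])                            # true lengths of the 2 sequences
pad_mask = torch.arange(10)[None, :] >= lengths[:, None]   # (2, 10) bool, True = ignore
output, attn_weights = mha(x, x, x, key_padding_mask=pad_mask)
# attn_weights for the second sequence put zero weight on positions 7-9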

# FROM SCRATCH (what you implement in assignments)
class SelfAttention(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

    def forward(self, x):
        Q, K, V = self.W_q(x), self.W_k(x), self.W_v(x)
        scores = Q @ K.transpose(-2, -1) / (Q.size(-1) ** 0.5)
        return F.softmax(scores, dim=-1) @ V

# BUILT-IN (what you use in production)
self.attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)
output, _ = self.attn(x, x, x)  # Same computation, but an optimized implementation

class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super().__init__()
        self.attn = MultiHeadAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Linear(d_ff, d_model)
        )
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Pre-norm style: normalize before each sublayer, add the residual after
        x = x + self.attn(self.ln1(x))
        x = x + self.ffn(self.ln2(x))
        return x
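
A quick sanity check on the block (illustrative sizes):

block = TransformerBlock(d_model=512, num_heads=8, d_ff=2048)
x = torch.randn(2, 10, 512)
y = block(x)   # (2, 10, 512): residual connections keep the shape unchanged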

def causal_attention(x, W_q, W_k, W_v):
    """Self-attention where position i can only attend to positions <= i."""
    Q, K, V = x @ W_q, x @ W_k, x @ W_v
    d_k = K.shape[-1]
    scores = (Q @ K.transpose(-2, -1)) / (d_k ** 0.5)
    # Create causal mask: True above the diagonal marks "future" positions
    seq_len = x.shape[1]
    mask = torch.triu(torch.ones(seq_len, seq_len, device=x.device), diagonal=1).bool()
    scores = scores.masked_fill(mask, float('-inf'))
    attn = F.softmax(scores, dim=-1)
    return attn @ V
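
A small check of the causal version (illustrative sizes and random weights): perturbing a later token should not change earlier outputs.

x = torch.randn(1, 4, 32)
W_q, W_k, W_v = (torch.randn(32, 32) for _ in range(3))
out1 = causal_attention(x, W_q, W_k, W_v)
x2 = x.clone()
x2[:, -1] = torch.randn(32)                          # perturb only the last position
out2 = causal_attention(x2, W_q, W_k, W_v)
print(torch.allclose(out1[:, :-1], out2[:, :-1]))    # True: earlier positions are unaffected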

class SimpleTransformer(nn.Module):
    def __init__(self, vocab_size, d_model=512, nhead=8, num_layers=6):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos_enc = PositionalEncoding(d_model)  # You implement this!
        # Use built-in TransformerEncoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, batch_first=True
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x = self.pos_enc(x)
        x = self.transformer(x)
        return self.fc_out(x)
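
PositionalEncoding is referenced above but left as an exercise; a standard sinusoidal version (one possible implementation, not necessarily the intended one) plus a usage sketch with made-up sizes:

import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(position * div_term)   # even dimensions
        pe[:, 1::2] = torch.cos(position * div_term)   # odd dimensions
        self.register_buffer('pe', pe.unsqueeze(0))    # (1, max_len, d_model)

    def forward(self, x):
        # x: (batch, seq_len, d_model); add the encoding for the first seq_len positions
        return x + self.pe[:, :x.size(1)]

model = SimpleTransformer(vocab_size=10000)
tokens = torch.randint(0, 10000, (2, 10))   # batch of token ids
logits = model(tokens)                       # (2, 10, 10000)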