πŸ–¨οΈ Printing Instructions: Press Ctrl/Cmd + P and select "Save as PDF".
1

NLP & Tokenization

From Raw Text to Transformer-Ready Tensors


Where We Are


Part 1: The Text-to-Tensor Pipeline


The Complete Pipeline at a Glance


Part 2: Tokenization


What Is a Token?


Why Subwords Win


Byte-Pair Encoding (BPE) β€” The Algorithm


BPE: Worked Example
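To make the merge loop concrete, here is a tiny character-level BPE sketch on the classic low/lower/newest/widest toy corpus (an illustration of the algorithm only, not the byte-level implementation production tokenizers use):

python
from collections import Counter

# Toy corpus: each word is a tuple of symbols (characters) with a frequency
corpus = {("l","o","w"): 5, ("l","o","w","e","r"): 2,
          ("n","e","w","e","s","t"): 6, ("w","i","d","e","s","t"): 3}

def count_pairs(corpus):
    """Count adjacent symbol pairs, weighted by word frequency."""
    pairs = Counter()
    for word, freq in corpus.items():
        for a, b in zip(word, word[1:]):
            pairs[(a, b)] += freq
    return pairs

def merge_pair(corpus, pair):
    """Rewrite every word, fusing each occurrence of `pair` into one symbol."""
    merged = {}
    for word, freq in corpus.items():
        out, i = [], 0
        while i < len(word):
            if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
                out.append(word[i] + word[i + 1])
                i += 2
            else:
                out.append(word[i])
                i += 1
        merged[tuple(out)] = freq
    return merged

for step in range(5):                    # Learn 5 merge rules
    pairs = count_pairs(corpus)
    best = max(pairs, key=pairs.get)     # Most frequent adjacent pair
    corpus = merge_pair(corpus, best)
    print(f"Merge {step + 1}: {best}")
# First merges: ('e', 's'), then ('es', 't'). Frequent fragments become single tokens.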


Byte-Level BPE β€” What LLMs Actually Use
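A quick experiment to see the byte-level property in action (exact token strings vary by tokenizer; the point is that nothing ever maps to an unknown token, because every string is first converted to UTF-8 bytes and all 256 byte values are in the base vocabulary):

python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # GPT-2 uses byte-level BPE

for s in ["hello", "hÃ©llo", "こんにけは", "🦜"]:
    toks = tokenizer.tokenize(s)
    print(f"{s!r:>12} β†’ {len(toks)} token(s): {toks}")
# ASCII text stays compact; accented characters, CJK text, and emoji fall back to
# multi-byte pieces, so they cost more tokens, but never produce <unk>.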


Vocabulary Size β€” A Critical Hyperparameter
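One way to feel the trade-off: the token embedding matrix (and an untied LM head) grows linearly with the vocabulary. A back-of-the-envelope sketch with illustrative sizes, not any specific model:

python
# Rough parameter cost of the vocabulary for a model width of 4096
d_model = 4096
for vocab_size in [32_000, 50_257, 100_000, 200_000]:
    embed_params = vocab_size * d_model     # Token embedding matrix W_E
    print(f"vocab {vocab_size:>7,} β†’ {embed_params / 1e6:8.1f}M embedding params "
          f"({2 * embed_params / 1e6:8.1f}M if the LM head is untied)")
# Larger vocab: shorter token sequences per text, but a bigger embedding table
# and a more expensive output softmax.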


Special Tokens
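A quick way to inspect which special tokens a pretrained tokenizer actually defines, shown here for GPT-2, whose only built-in special token is <|endoftext|>:

python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
print(tokenizer.special_tokens_map)                 # bos/eos/unk all map to <|endoftext|>
print(tokenizer.eos_token, tokenizer.eos_token_id)  # <|endoftext|> 50256
print(tokenizer.pad_token)                          # None: GPT-2 defines no padding token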


Tokenizer Implementations


Tokenization Artifacts β€” Why This Matters for Model Behavior


Working with Tokenizers in Practice

python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

text = "Tokenization is surprisingly important!"

# See the subword tokens (strings)
tokens = tokenizer.tokenize(text)
print(tokens)  # ['Token', 'ization', 'Δ is', 'Δ surprisingly', 'Δ important', '!']
# Δ  = leading space β€” byte-level BPE encodes spaces as part of tokens

# Encode: text β†’ integer IDs
ids = tokenizer.encode(text)
print(ids)  # [30642, 1634, 318, 11242, 1593, 0]

# Full tokenization (returns tensors + attention mask for batching)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 defines no pad token, so reuse <|endoftext|>
inputs = tokenizer(text, return_tensors="pt", padding=True)
print(inputs.input_ids)       # Token IDs as tensor
print(inputs.attention_mask)  # 1 = real token, 0 = padding

# Decode: IDs β†’ text (lossless roundtrip)
print(tokenizer.decode(ids))  # "Tokenization is surprisingly important!"

print(f"Vocabulary size: {len(tokenizer)}")  # 50257

Exploring Tokenization Artifacts

python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Numbers are tokenized inconsistently
for n in ["127", "128", "129", "1000", "10000"]:
    toks = tokenizer.tokenize(n)
    print(f"{n:>6} β†’ {toks}  ({len(toks)} tokens)")
# 127   β†’ ['127']          (1 token)
# 128   β†’ ['128']          (1 token)
# 129   β†’ ['12', '9']      (2 tokens!)  ← different structure
# 1000  β†’ ['1000']         (1 token)
# 10000 β†’ ['100', '00']    (2 tokens)

# Multilingual cost inequality
en = "The cat sat on the mat."
zh = "ηŒ«εεœ¨εž«ε­δΈŠγ€‚"  # Same meaning in Chinese
print(f"English: {len(tokenizer.encode(en))} tokens")
print(f"Chinese: {len(tokenizer.encode(zh))} tokens")  # 2-3Γ— more!

# These artifacts explain many real model failures

Part 3: Embeddings β€” From IDs to Vectors


Token Embeddings


The Embedding Matrix ($W_E$)
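A minimal illustration of what $W_E$ does in code: an nn.Embedding is a learnable lookup table, and a token ID simply selects one of its rows (toy sizes, for illustration only):

python
import torch
import torch.nn as nn

vocab_size, embed_dim = 10, 4                 # Toy sizes
embed = nn.Embedding(vocab_size, embed_dim)   # W_E has shape (vocab_size, embed_dim)

token_ids = torch.tensor([3, 7, 3])           # A "sentence" of three token IDs
vectors = embed(token_ids)                    # (3, embed_dim): rows 3, 7, 3 of W_E
print(torch.allclose(vectors[0], embed.weight[3]))  # True: lookup = row selection
print(torch.allclose(vectors[0], vectors[2]))       # True: same ID, same (static) vector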


Positional Information
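As one concrete scheme, here is the fixed sinusoidal encoding from the original Transformer (shown for reference; the code in this deck uses learned position embeddings, and GPT-OSS uses RoPE):

python
import math
import torch

def sinusoidal_positions(max_len: int, dim: int) -> torch.Tensor:
    """PE[pos, 2i] = sin(pos / 10000^(2i/dim)), PE[pos, 2i+1] = cos(pos / 10000^(2i/dim))."""
    pos = torch.arange(max_len, dtype=torch.float32).unsqueeze(1)  # (max_len, 1)
    div = torch.exp(torch.arange(0, dim, 2, dtype=torch.float32) * (-math.log(10000.0) / dim))
    pe = torch.zeros(max_len, dim)
    pe[:, 0::2] = torch.sin(pos * div)   # Even dimensions
    pe[:, 1::2] = torch.cos(pos * div)   # Odd dimensions
    return pe                            # (max_len, dim), added to the token embeddings

pe = sinusoidal_positions(max_len=4096, dim=128)
print(pe.shape)  # torch.Size([4096, 128]): every position gets a unique, smooth signature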


Static vs Contextual Representations


The Input Pipeline in PyTorch

python
import torch
import torch.nn as nn

class InputPipeline(nn.Module):
    """Token + positional embeddings for a GPT-style model.
    GPT-OSS replaces pos_embed with RoPE."""
    def __init__(self, vocab_size=200000, max_seq_len=4096, embed_dim=4096):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(max_seq_len, embed_dim)  # β†’ RoPE later
        self.dropout = nn.Dropout(0.1)

    def forward(self, token_ids):  # (batch_size, seq_len)
        B, T = token_ids.shape
        tok_emb = self.token_embed(token_ids)                   # (B, T, embed_dim)
        positions = torch.arange(T, device=token_ids.device)    # [0, 1, ..., T-1]
        pos_emb = self.pos_embed(positions)                     # (T, embed_dim)
        x = self.dropout(tok_emb + pos_emb)                     # (B, T, embed_dim)
        return x  # β†’ feeds into the first transformer layer

# Parameter count
pipeline = InputPipeline()
print(f"Embedding parameters: {sum(p.numel() for p in pipeline.parameters()):,}")
# ~835M parameters just for embeddings + positions!

Part 4: The Output Pipeline β€” From Vectors to Text


The Language Modeling Head


Logits β†’ Probabilities
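For reference, the conversion applied at every generation step: given logits $z \in \mathbb{R}^{|V|}$ and temperature $T$,

$$p_i = \frac{\exp(z_i / T)}{\sum_{j=1}^{|V|} \exp(z_j / T)}$$

With $T = 1$ this is plain softmax; lower $T$ sharpens the distribution toward the top logits, higher $T$ flattens it toward uniform.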


Decoding Strategies (Inference Time)
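A minimal sketch of how temperature, top-k, and top-p combine for one decoding step (a simplified illustration, not HuggingFace's exact implementation):

python
import torch
import torch.nn.functional as F

def sample_next_token(logits, temperature=0.7, top_k=50, top_p=0.95):
    """logits: (vocab_size,) for a single position. Returns one sampled token ID."""
    logits = logits / temperature                        # Sharpen or flatten

    # Top-k: drop everything below the k-th highest logit
    kth_value = torch.topk(logits, top_k).values[-1]
    logits = logits.masked_fill(logits < kth_value, float("-inf"))

    # Top-p (nucleus): keep the smallest set of tokens with cumulative prob >= p
    probs = F.softmax(logits, dim=-1)
    sorted_probs, sorted_idx = torch.sort(probs, descending=True)
    cumulative = torch.cumsum(sorted_probs, dim=-1)
    remove = cumulative > top_p
    remove[1:] = remove[:-1].clone()   # Shift so the token crossing the threshold stays
    remove[0] = False                  # Always keep the single most likely token
    probs[sorted_idx[remove]] = 0.0
    probs = probs / probs.sum()        # Renormalize what is left

    return torch.multinomial(probs, num_samples=1)

fake_logits = torch.randn(50257)       # Stand-in for one row of real model logits
print(sample_next_token(fake_logits).item())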


Autoregressive Generation Loop
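A minimal version of the loop, written against the GPTModel skeleton shown later in this deck (which returns a (logits, loss) tuple); real inference adds a KV cache, stop conditions, and a context-length check:

python
import torch

@torch.no_grad()
def generate(model, token_ids, max_new_tokens=20, temperature=0.7):
    """token_ids: (batch, seq_len) prompt IDs. Appends one sampled token per step."""
    for _ in range(max_new_tokens):
        logits, _ = model(token_ids)                           # (batch, T, vocab_size)
        next_logits = logits[:, -1, :] / temperature           # Only the last position matters
        probs = torch.softmax(next_logits, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)   # (batch, 1)
        token_ids = torch.cat([token_ids, next_token], dim=1)  # Feed it back in and repeat
    return token_ids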


The Output Pipeline in PyTorch

python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Example dimensions and placeholder tensors so this snippet runs standalone
batch, seq_len, embed_dim, vocab_size = 2, 16, 768, 50257
hidden = torch.randn(batch, seq_len, embed_dim)             # Output of the transformer layers
token_ids = torch.randint(0, vocab_size, (batch, seq_len))  # The input token IDs

# --- LM HEAD: Project to vocabulary ---
lm_head = nn.Linear(embed_dim, vocab_size, bias=False)
logits = lm_head(hidden)  # (batch, seq_len, vocab_size)

# --- TRAINING: Compute loss ---
# Position t's logits predict token at position t+1, so we shift:
shift_logits = logits[:, :-1, :].contiguous()   # (batch, T-1, vocab_size)
shift_labels = token_ids[:, 1:].contiguous()     # (batch, T-1)
loss = F.cross_entropy(
    shift_logits.view(-1, vocab_size),  # Flatten to (batch*(T-1), vocab_size)
    shift_labels.view(-1)               # Flatten to (batch*(T-1),)
)  # Scalar β€” this is what we backpropagate!

# --- INFERENCE: Generate next token ---
next_logits = logits[:, -1, :]            # (batch, vocab_size)
scaled = next_logits / 0.7               # Temperature scaling
probs = F.softmax(scaled, dim=-1)         # Probability distribution
next_token = torch.multinomial(probs, 1)  # Sample one token

Part 5: Loss Functions and Metrics


The Training Objective: Next-Token Prediction
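For reference, the objective in one formula: given a training sequence $x_1, \dots, x_T$, minimize the average negative log-probability the model assigns to each true next token,

$$\mathcal{L}(\theta) = -\frac{1}{T} \sum_{t=1}^{T} \log p_\theta(x_t \mid x_{<t})$$

which is exactly the per-token cross-entropy of the next slide, averaged over positions.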


Cross-Entropy Loss


Teacher Forcing: Why Training Is So Efficient
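A tiny illustration of the idea: because the inputs at every position are the ground-truth tokens (not the model's own earlier guesses), all next-token predictions are trained in parallel from a single forward pass. The sentence below is a made-up toy example:

python
tokens = ["The", "cat", "sat", "on", "the", "mat"]
inputs, targets = tokens[:-1], tokens[1:]   # Shift by one: input at t predicts token t+1
for t, target in enumerate(targets):
    print(f"position {t}: given {inputs[:t + 1]} β†’ predict {target!r}")
# All of these prediction problems are solved simultaneously in one forward pass.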


Perplexity β€” The Key LLM Metric
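For reference: with the average cross-entropy loss $\mathcal{L}$ measured in nats,

$$\text{PPL} = e^{\mathcal{L}}$$

so a loss of 3.0 corresponds to a perplexity of $e^{3.0} \approx 20$: on average, the model is as uncertain as if it were choosing uniformly among about 20 equally likely next tokens.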


Other Metrics You'll See


Computing Loss and Perplexity

python
import torch
import torch.nn.functional as F
import math

def compute_loss_and_perplexity(logits, labels):
    """Standard language modeling loss computation."""
    # logits: (batch, seq_len, vocab_size)
    # labels: (batch, seq_len) β€” the true next tokens
    loss = F.cross_entropy(
        logits.reshape(-1, logits.size(-1)),  # Flatten: (batch*seq_len, vocab_size)
        labels.reshape(-1),                   # Flatten: (batch*seq_len,); reshape handles non-contiguous slices
        ignore_index=-100                     # Ignore padding positions
    )
    perplexity = math.exp(loss.item())      # PPL = e^(cross-entropy)
    return loss, perplexity

# Example: inside the training loop (assumes model, dataloader, and optimizer are already set up)
for batch in dataloader:
    token_ids = batch["input_ids"]                 # (batch, seq_len)
    logits = model(token_ids[:, :-1])               # Predict from all but last
    labels = token_ids[:, 1:]                       # True next tokens
    loss, ppl = compute_loss_and_perplexity(logits, labels)
    print(f"Loss: {loss.item():.4f}, Perplexity: {ppl:.2f}")
    loss.backward()                                 # Backpropagate!
    optimizer.step()
    optimizer.zero_grad()

Part 6: The Full Architecture β€” End to End


Why Decoder-Only?
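Mechanically, what makes decoder-only generation work is the causal attention mask: position $t$ may attend only to positions $\le t$, so every position is trained to predict its next token using only the past. A minimal look at the mask:

python
import torch

T = 5
mask = torch.tril(torch.ones(T, T, dtype=torch.long))  # 1 = may attend, 0 = masked out
print(mask)
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0],
#         [1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
# Inside attention, masked positions get their scores set to -inf before the softmax.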


End-to-End Walkthrough


Simplified GPT β€” The Skeleton You'll Expand Into GPT-OSS

python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GPTModel(nn.Module):
    def __init__(self, vocab_size=200000, max_seq_len=4096,
                 embed_dim=4096, n_layers=32, n_heads=32):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, embed_dim)
        self.pos_embed = nn.Embedding(max_seq_len, embed_dim)  # Will β†’ RoPE
        self.dropout = nn.Dropout(0.1)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, n_heads)  # From last lecture!
            for _ in range(n_layers)
        ])
        self.norm = nn.LayerNorm(embed_dim)                    # Will β†’ RMSNorm
        self.lm_head = nn.Linear(embed_dim, vocab_size, bias=False)
        self.lm_head.weight = self.token_embed.weight          # Weight tying!

    def forward(self, token_ids, targets=None):
        B, T = token_ids.shape
        tok_emb = self.token_embed(token_ids)
        pos_emb = self.pos_embed(torch.arange(T, device=token_ids.device))
        x = self.dropout(tok_emb + pos_emb)
        for layer in self.layers:
            x = layer(x)             # N transformer blocks
        x = self.norm(x)             # Final normalization
        logits = self.lm_head(x)     # (B, T, vocab_size)

        loss = None
        if targets is not None:
            loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)), targets.view(-1)
            )
        return logits, loss

What Changes from the Original Transformer


Part 7: HuggingFace in Practice


HuggingFace β€” Your Practical Toolkit


End-to-End: Tokenize β†’ Forward β†’ Inspect β†’ Generate

python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# --- Step 1: Tokenize ---
text = "The future of AI is"
inputs = tokenizer(text, return_tensors="pt")
print(f"Tokens: {tokenizer.tokenize(text)}")  # ['The', 'Δ future', 'Δ of', 'Δ AI', 'Δ is']
print(f"IDs:    {inputs.input_ids}")           # tensor([[464, 2003, 286, 9552, 318]])

# --- Step 2: Forward pass β†’ inspect logits ---
with torch.no_grad():
    logits = model(**inputs).logits            # (1, 5, 50257)

# See what the model predicts after each token
for i in range(logits.shape[1]):
    prefix = tokenizer.decode(inputs.input_ids[0, :i+1])
    top_pred = tokenizer.decode(logits[0, i].argmax())
    print(f"  After '{prefix}' β†’ predicts '{top_pred}'")

# --- Step 3: Generate ---
out = model.generate(**inputs, max_new_tokens=30, temperature=0.7, top_p=0.95,
                     do_sample=True, pad_token_id=tokenizer.eos_token_id)
print(f"\nGenerated: {tokenizer.decode(out[0])}")

Training a BPE Tokenizer from Scratch

python
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, decoders

# Step 1 of building GPT: train your own tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel()    # So decode() reverses the byte-level mapping

trainer = trainers.BpeTrainer(
    vocab_size=50000,                       # Target vocabulary size
    special_tokens=["<|endoftext|>",        # End of sequence
                    "<|pad|>",              # Padding
                    "<|begin_of_text|>"],   # Beginning of sequence
    min_frequency=2,                        # Minimum pair frequency to merge
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),  # Seed with all 256 byte symbols
)

# Train on your corpus β€” this learns the merge rules
tokenizer.train(files=["corpus_part1.txt", "corpus_part2.txt"], trainer=trainer)
tokenizer.save("gpt-oss-tokenizer.json")

# Test it
output = tokenizer.encode("Hello, GPT-OSS!")
print(f"Tokens: {output.tokens}")
print(f"IDs:    {output.ids}")
# This tokenizer is trained ONCE, then used for ALL model training and inference

Summary


What You Now Understand


The Road to GPT-OSS


Interactive Demos


Supplementary Resources