import torch
from PIL import Image
import torchvision.transforms as T
# Load an image
img = Image.open('cat.jpg') # PIL image; as a NumPy array its shape is (H, W, C)
# Convert to tensor
transform = T.ToTensor() # Converts to (C, H, W), scales to [0, 1]
img_tensor = transform(img)
print(img_tensor.shape) # e.g. torch.Size([3, 224, 224]) for a 224×224 RGB image

# Normalize with ImageNet statistics (expected by pre-trained models)
transform = T.Compose([
    T.ToTensor(), # [0, 255] → [0, 1]
    T.Normalize(
        mean=[0.485, 0.456, 0.406], # ImageNet means
        std=[0.229, 0.224, 0.225] # ImageNet stds
    )
])

import torch.nn as nn
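The printed shape above assumes the image is already 224×224. Pipelines for ImageNet-pretrained models usually resize and center-crop first; a minimal sketch of that fuller pipeline:

# Typical inference preprocessing for ImageNet-pretrained models
transform = T.Compose([
    T.Resize(256), # shorter side → 256
    T.CenterCrop(224), # 224×224 center crop
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225])
])
img_tensor = transform(Image.open('cat.jpg').convert('RGB'))
print(img_tensor.shape) # torch.Size([3, 224, 224]) regardless of input size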
# Convolution layer
conv = nn.Conv2d(
    in_channels=3, # RGB input
    out_channels=16, # 16 output feature maps
    kernel_size=3, # 3×3 filter
    stride=1, # Move the filter by 1 pixel at a time
    padding=1 # Add a 1-pixel border to preserve spatial size
)
# Input: (batch, 3, 32, 32)
# Output: (batch, 16, 32, 32)

# Max pooling: 2×2 window, stride 2
pool = nn.MaxPool2d(kernel_size=2, stride=2)
# Input: (batch, 16, 32, 32)
# Output: (batch, 16, 16, 16) (halves each spatial dimension)

# Adaptive pooling: fixed output size regardless of the input size
adaptive_pool = nn.AdaptiveAvgPool2d((1, 1)) # Global average pooling
# Output: (batch, channels, 1, 1)
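The spatial output size of a convolution is floor((in + 2*padding - kernel_size) / stride) + 1, which is why padding=1 with a 3×3 kernel preserves the size. A quick check with the layers defined above:

x = torch.randn(1, 3, 32, 32) # dummy batch of one 32×32 RGB image
print(conv(x).shape) # torch.Size([1, 16, 32, 32]): (32 + 2*1 - 3) // 1 + 1 = 32
print(pool(conv(x)).shape) # torch.Size([1, 16, 16, 16])
print(adaptive_pool(conv(x)).shape) # torch.Size([1, 16, 1, 1])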
# Common pattern: Conv → BatchNorm → ReLU
nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1),
    nn.BatchNorm2d(64),
    nn.ReLU(inplace=True)
)
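Because this trio recurs so often, many codebases wrap it in a small factory function. conv_block below is a hypothetical helper (not a torchvision API), shown as a sketch:

def conv_block(in_ch, out_ch):
    # Hypothetical helper: bundles the Conv → BatchNorm → ReLU pattern
    return nn.Sequential(
        nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
        nn.BatchNorm2d(out_ch),
        nn.ReLU(inplace=True)
    )

block = conv_block(3, 64) # equivalent to the Sequential above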
class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        # Feature extractor
        self.features = nn.Sequential(
            nn.Conv2d(3, 32, 3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # 32→16
            nn.Conv2d(32, 64, 3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # 16→8
            nn.Conv2d(64, 128, 3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # 8→4
        )
        # Classifier
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(128 * 4 * 4, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 10)
        )

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x)
        return x
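A dummy forward pass is a cheap way to confirm that the 128 * 4 * 4 flatten size matches the feature extractor's output; a sketch assuming 32×32 inputs (as the layer comments do):

model = SimpleCNN()
x = torch.randn(2, 3, 32, 32) # batch of two CIFAR-sized images
print(model(x).shape) # torch.Size([2, 10]): one logit per class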
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
# Define transforms
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
# Load CIFAR-10
train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform
)
test_dataset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform
)
# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
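It is worth peeking at one batch to confirm the loader's shapes before training; a quick check:

images, labels = next(iter(train_loader))
print(images.shape) # torch.Size([64, 3, 32, 32])
print(labels.shape) # torch.Size([64]) with class indices 0–9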
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
for epoch in range(10):
    model.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')
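The loop above only trains; a matching evaluation pass (a minimal sketch using the test_loader defined earlier) switches to eval mode and disables gradients:

model.eval()
correct = total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        preds = model(images).argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f'Test accuracy: {correct / total:.2%}')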
# For training: augment, then normalize
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.RandomCrop(32, padding=4),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
# For testing: NO augmentation, just normalize
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
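To use these, pass each split its own transform when building the datasets (re-creating the CIFAR-10 datasets from above; a sketch):

train_dataset = datasets.CIFAR10(
    root='./data', train=True, download=True, transform=train_transform
)
test_dataset = datasets.CIFAR10(
    root='./data', train=False, download=True, transform=test_transform
)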
from torchvision import models

# Load a pre-trained ResNet18
model = models.resnet18(pretrained=True)
# Freeze all layers (don't update during training)
for param in model.parameters():
    param.requires_grad = False
# Replace final layer for our task (e.g., 10 classes)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10)
# Only the new fc layer will be trained
model = model.to(device)
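A quick sanity check that freezing worked is to count the trainable parameters; for ResNet18 with 10 classes, only the new fc layer's 512*10 + 10 = 5,130 weights should remain:

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Trainable parameters: {trainable:,}') # 5,130: just the new fc layer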
# Unfreeze the last residual block and the classifier for fine-tuning
for name, param in model.named_parameters():
    if 'layer4' in name or 'fc' in name:
        param.requires_grad = True

# Use a smaller learning rate for fine-tuning
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

from torchvision import models

# Classic architectures
resnet = models.resnet50(pretrained=True)
vgg = models.vgg16(pretrained=True)

# Modern architectures
efficientnet = models.efficientnet_b0(pretrained=True)
convnext = models.convnext_tiny(pretrained=True)

# Vision Transformers (ViT)
vit = models.vit_b_16(pretrained=True)
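Note that pretrained=True is deprecated since torchvision 0.13 in favor of explicit weights enums; the equivalent modern call looks like this:

from torchvision.models import resnet18, ResNet18_Weights
model = resnet18(weights=ResNet18_Weights.DEFAULT) # replaces pretrained=True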