计算机视觉基础:图像处理与CNN原理 引言 计算机视觉是AI领域最成功的方向之一。从图像分类到目标检测,从图像分割到人脸识别,CNN(卷积神经网络)彻底改变了视觉任务。本文将深入讲解计算机视觉的核心概念、CNN工作原理和实际应用。 一、图像基础 1.1 数字图像表示 1.2 基础操作 二、卷积神经网络(CNN) 2.1 卷积操作 2.2 池化层 2.3 经典CNN架构 LeNet-5(1998) AlexNet(2012) VGG(2014) 三、现代CNN架构 3.1 ResNet(残差网络) 3.2 EfficientNet(复合缩放) 四、数据增强 五、实战案例 5.1 图像分类 5.
计算机视觉是AI领域最成功的方向之一。从图像分类到目标检测,从图像分割到人脸识别,CNN(卷积神经网络)彻底改变了视觉任务。本文将深入讲解计算机视觉的核心概念、CNN工作原理和实际应用。
图像 = 高度 × 宽度 × 通道数
# 灰度图: [H, W]
# 彩色图(RGB): [H, W, 3]
# 批量: [Batch, H, W, 3]
import cv2
import numpy as np
import matplotlib.pyplot as plt

# Basic image operations with OpenCV.

# Read the image; OpenCV stores the channels in BGR order.
img = cv2.imread('image.jpg')
if img is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # instead of raising, so fail early with a clear message.
    raise FileNotFoundError("cannot read 'image.jpg'")
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # convert BGR to RGB

# The array is laid out as [H, W, C]; the rotation below needs both sizes.
height, width = img.shape[:2]

# Resize to 224 x 224.
resized = cv2.resize(img, (224, 224))

# Crop rows 100..299 and columns 200..399.
cropped = img[100:300, 200:400]

# Rotate by 45 degrees around the image centre, keeping the canvas size.
M = cv2.getRotationMatrix2D(center=(width // 2, height // 2), angle=45, scale=1.0)
rotated = cv2.warpAffine(img, M, (width, height))

# Filtering: Gaussian blur and a 3x3 sharpening kernel.
blur = cv2.GaussianBlur(img, (5, 5), 0)
sharpen_kernel = np.array([[-1, -1, -1],
                           [-1, 9, -1],
                           [-1, -1, -1]])
sharpen = cv2.filter2D(img, -1, sharpen_kernel)

# Edge detection (Canny with thresholds 100 and 200).
edges = cv2.Canny(img, 100, 200)
import torch
import torch.nn as nn


def conv2d(image, kernel, stride=1, padding=0):
    """Naive reference implementation of a 2D convolution.

    Slides every filter over the (zero-padded) image and sums the
    element-wise product at each position.

    Args:
        image: input tensor of shape [C_in, H, W].
        kernel: filter tensor of shape [C_out, C_in, kH, kW].
        stride: step of the sliding window on both spatial axes.
        padding: number of zeros added on each side of both spatial axes.

    Returns:
        Tensor of shape [C_out, H_out, W_out] with the dtype and device of
        ``image``, where ``H_out = (H + 2*padding - kH) // stride + 1`` and
        likewise for ``W_out``.

    Raises:
        ValueError: if the channel counts of ``image`` and ``kernel`` differ.
    """
    C_in, H, W = image.shape
    C_out, kernel_C_in, kH, kW = kernel.shape
    if kernel_C_in != C_in:
        # Without this check a single-channel kernel would silently
        # broadcast over a multi-channel image.
        raise ValueError(
            f"kernel expects {kernel_C_in} input channels, image has {C_in}"
        )

    # Zero-pad the two spatial axes (the last two dimensions).
    if padding > 0:
        image = torch.nn.functional.pad(image, (padding, padding, padding, padding))

    # Output size is computed from the unpadded H and W.
    H_out = (H + 2 * padding - kH) // stride + 1
    W_out = (W + 2 * padding - kW) // stride + 1
    # new_zeros keeps the result on the same device/dtype as the input.
    output = image.new_zeros((C_out, H_out, W_out))

    # Convolution: one dot product per (filter, row, column).
    for c_out in range(C_out):
        for h in range(H_out):
            for w in range(W_out):
                h_start, w_start = h * stride, w * stride
                patch = image[:, h_start:h_start + kH, w_start:w_start + kW]
                output[c_out, h, w] = torch.sum(patch * kernel[c_out])

    return output


# The same operation with PyTorch's built-in convolution layer.
conv = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
image = torch.randn(1, 3, 224, 224)  # example input batch [Batch, C, H, W]
output = conv(image)
# Pooling layers, demonstrated on an example activation [Batch, C, H, W].
feature_map = torch.randn(1, 64, 32, 32)

# Max pooling: 2x2 window with stride 2 halves both spatial dimensions.
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
output = maxpool(feature_map)

# Average pooling with the same window and stride.
avgpool = nn.AvgPool2d(kernel_size=2, stride=2)
output = avgpool(feature_map)

# Adaptive pooling: the output size is fixed regardless of the input size.
adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))
output = adaptive_pool(feature_map)  # any spatial size to [Batch, C, 1, 1]
LeNet-5(1998)
import torch.nn.functional as F  # functional ops used in forward()


class LeNet5(nn.Module):
    """LeNet-5 style classifier: two conv/pool stages and three linear layers.

    The first linear layer takes 16*5*5 features, which is what the two
    unpadded 5x5 convolutions and 2x2 poolings produce for a single-channel
    32x32 input. The network outputs 10 logits per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=0)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Map a batch [N, 1, 32, 32] to logits [N, 10]."""
        x = F.max_pool2d(F.relu(self.conv1(x)), 2)
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten per sample. Keeping the batch dimension explicit makes a
        # wrongly sized input fail at fc1 instead of silently being
        # re-split into a different number of rows.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
AlexNet(2012)
class AlexNet(nn.Module):
    """AlexNet-style image classifier for 3-channel inputs.

    Args:
        num_classes: size of the final linear layer (number of logits).

    The convolutional stack yields 256 feature maps. For 224x224 inputs
    these are already 6x6; an adaptive average pooling step resizes them to
    6x6 for any other input size so the classifier's 256*6*6 input always
    matches. The pooling layer has no parameters, so existing state dicts
    still load.
    """

    def __init__(self, num_classes=1000):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        # Identity when the feature maps are already 6x6.
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Map a batch [N, 3, H, W] to logits [N, num_classes]."""
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)  # flatten to [N, 256*6*6]
        return self.classifier(x)
VGG(2014)
class VGG(nn.Module):
    """VGG-style classifier: a convolutional feature stack plus three linear layers.

    Args:
        features: module producing 512-channel feature maps, for example the
            result of ``make_vgg_layers()``.
        num_classes: size of the final linear layer (number of logits).

    Feature maps are adaptively average-pooled to 7x7 so the classifier's
    512*7*7 input matches for any input resolution. For 224x224 inputs the
    default feature stack already yields 7x7, so the pooling is an identity.
    """

    def __init__(self, features, num_classes=1000):
        super().__init__()
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        """Map a batch [N, 3, H, W] to logits [N, num_classes]."""
        x = self.features(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        return self.classifier(x)


# Default layout: integers are conv output channels, 'M' is a 2x2 max-pool.
_VGG16_CFG = (64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M',
              512, 512, 512, 'M', 512, 512, 512, 'M')


def make_vgg_layers(cfg=None):
    """Build the convolutional feature stack of a VGG network.

    Args:
        cfg: sequence describing the stack; an integer adds a 3x3 convolution
            (padding 1) with that many output channels followed by ReLU, and
            the string ``'M'`` adds a 2x2 max-pool with stride 2. Defaults to
            the 13-convolution layout originally hard-coded here.

    Returns:
        ``nn.Sequential`` taking 3-channel input.
    """
    if cfg is None:
        cfg = _VGG16_CFG

    layers = []
    in_channels = 3
    for v in cfg:
        if v == 'M':
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        else:
            layers.append(nn.Conv2d(in_channels, v, kernel_size=3, padding=1))
            layers.append(nn.ReLU(inplace=True))
            in_channels = v
    return nn.Sequential(*layers)
import torch.nn.functional as F  # functional ops used in the forward passes


class ResidualBlock(nn.Module):
    """Basic residual block: two 3x3 conv + batch-norm layers plus a shortcut.

    Args:
        in_channels: channels of the block input.
        out_channels: channels of the block output.
        stride: stride of the first convolution (and of the shortcut
            projection when one is needed).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut connection: identity by default (an empty Sequential);
        # a 1x1 projection when the main path changes the tensor shape.
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        """Return relu(main_path(x) + shortcut(x))."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += self.shortcut(x)  # residual connection
        return F.relu(out)


class ResNet(nn.Module):
    """ResNet built from four stages of residual blocks.

    Args:
        block: block class, called as ``block(in_channels, out_channels, stride)``.
        num_blocks: number of blocks in each of the four stages.
        num_classes: number of output logits.

    The stem is a single 3x3 stride-1 convolution on 3-channel input, and
    the stages use 64, 128, 256 and 512 channels.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        self.in_channels = 64

        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.layer1 = self.make_layer(block, 64, num_blocks[0], stride=1)
        self.layer2 = self.make_layer(block, 128, num_blocks[1], stride=2)
        self.layer3 = self.make_layer(block, 256, num_blocks[2], stride=2)
        self.layer4 = self.make_layer(block, 512, num_blocks[3], stride=2)
        self.linear = nn.Linear(512, num_classes)

    def make_layer(self, block, out_channels, num_blocks, stride):
        """Build one stage; only its first block changes channels or stride."""
        layers = [block(self.in_channels, out_channels, stride)]
        # Later blocks, and the next stage, start from the new width.
        self.in_channels = out_channels
        for _ in range(1, num_blocks):
            layers.append(block(out_channels, out_channels, stride=1))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch [N, 3, H, W] to logits [N, num_classes]."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.layer1(out)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        # Global average pooling makes the head independent of input size.
        out = F.adaptive_avg_pool2d(out, (1, 1))
        out = out.view(out.size(0), -1)
        return self.linear(out)


def ResNet18(num_classes=10):
    """Return a ResNet with two ResidualBlocks in each of its four stages."""
    return ResNet(ResidualBlock, [2, 2, 2, 2], num_classes)
from torchvision.models import EfficientNet_V2_S_Weights, efficientnet_v2_s

# Number of target classes for fine-tuning; adjust to your dataset.
num_classes = 10

# Load the pretrained model. `weights=` replaces the deprecated
# `pretrained=True`; IMAGENET1K_V1 is the checkpoint that flag selected.
model = efficientnet_v2_s(weights=EfficientNet_V2_S_Weights.IMAGENET1K_V1)

# Fine-tuning: swap the final linear layer for one with `num_classes` outputs.
model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)
from torchvision import transforms

# Per-channel normalisation statistics shared by both pipelines, so that
# training and evaluation cannot drift apart.
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Training pipeline: random augmentation, then tensor conversion and
# normalisation.
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.RandomAffine(0, shear=10),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
])

# Evaluation pipeline: no augmentation, only a deterministic resize and
# centre crop.
test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
])
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR10

# One constant drives both the loop length and the cosine schedule.
NUM_EPOCHS = 200

# Use the GPU when there is one; fall back to the CPU instead of crashing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the data.
train_dataset = CIFAR10(root='./data', train=True, download=True, transform=train_transform)
test_dataset = CIFAR10(root='./data', train=False, download=True, transform=test_transform)

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Model, loss function, optimizer and learning-rate schedule.
model = ResNet18(num_classes=10).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS)


def train(epoch):
    """Run one training epoch and print running loss/accuracy every 100 batches.

    Args:
        epoch: epoch index, used only in the progress message.
    """
    model.train()
    train_loss = 0
    correct = 0
    total = 0

    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

        if batch_idx % 100 == 0:
            print(f'Epoch: {epoch} | Batch: {batch_idx} | Loss: {train_loss/(batch_idx+1):.4f} | Acc: {100.*correct/total:.2f}%')

    # Stepped once per epoch: T_max equals the number of epochs.
    scheduler.step()


def test():
    """Evaluate on the test set, print the accuracy and return it in percent."""
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)

            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()

    acc = 100.*correct/total
    print(f'Test Acc: {acc:.2f}%')
    return acc


# Training: evaluate after every epoch.
for epoch in range(NUM_EPOCHS):
    train(epoch)
    test()
import torchvision.models as models

# Number of target classes for the new head; adjust to your dataset.
num_classes = 10

# Load the pretrained model. `weights=` replaces the deprecated
# `pretrained=True`; IMAGENET1K_V1 is the checkpoint that flag selected.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)

# Freeze the feature-extraction layers.
for param in model.parameters():
    param.requires_grad = False

# Replace the final classification layer. A freshly created layer has
# requires_grad=True, so it stays trainable.
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes)

# Option A: train only the classification layer.
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

# Option B: fine-tune all layers with a smaller learning rate.
# NOTE: run as written, this overrides option A; keep only the one you want.
for param in model.parameters():
    param.requires_grad = True
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# Feature-map visualisation
def visualize_feature_maps(model, image, layer_name):
    """Plot up to 16 feature maps produced by one named layer.

    Args:
        model: module whose forward pass is run on ``image``.
        image: input passed to ``model`` unchanged.
        layer_name: key of the layer in ``model.named_modules()``.

    Raises:
        KeyError: if ``layer_name`` is not a module of ``model``.
    """
    activation = {}

    def hook(module, inputs, output):
        activation[layer_name] = output.detach()

    # Register the hook on the requested layer.
    layer = dict(model.named_modules())[layer_name]
    hook_handle = layer.register_forward_hook(hook)
    try:
        # Forward pass; the hook records the layer output.
        model(image)
    finally:
        # Always detach the hook, even when the forward pass raises.
        hook_handle.remove()

    # Visualise the maps of the first sample in the batch.
    feature_maps = activation[layer_name][0]
    num_shown = min(feature_maps.shape[0], 16)

    # squeeze=False keeps `axes` two-dimensional even for a single map.
    _, axes = plt.subplots(1, num_shown, figsize=(20, 2), squeeze=False)
    for i in range(num_shown):
        axes[0, i].imshow(feature_maps[i].cpu(), cmap='viridis')
        axes[0, i].axis('off')
    plt.show()


# Grad-CAM (class activation mapping)
class GradCAM:
    """Grad-CAM heat maps for one layer of a model.

    Hooks on ``target_layer`` record its output and the gradient of the
    class score with respect to that output.

    Args:
        model: network to explain.
        target_layer: module inside ``model`` whose activations are used.
    """

    def __init__(self, model, target_layer):
        self.model = model
        self.target_layer = target_layer
        self.gradients = None
        self.activations = None
        self._handles = []  # hook handles, kept so remove_hooks() can detach them

        self.hook_forward()
        self.hook_backward()

    def hook_forward(self):
        """Record the target layer's output on every forward pass."""
        def forward_hook(module, inputs, output):
            self.activations = output.detach()

        self._handles.append(self.target_layer.register_forward_hook(forward_hook))

    def hook_backward(self):
        """Record the gradient flowing into the target layer's output.

        Uses a tensor hook attached during the forward pass instead of the
        deprecated ``Module.register_backward_hook``.
        """
        def save_gradient(grad):
            self.gradients = grad.detach()

        def attach_gradient_hook(module, inputs, output):
            # Tensor hooks can only be registered on tensors that require
            # grad (not the case under torch.no_grad()).
            if output.requires_grad:
                output.register_hook(save_gradient)

        self._handles.append(
            self.target_layer.register_forward_hook(attach_gradient_hook)
        )

    def remove_hooks(self):
        """Detach every hook this object registered on the target layer."""
        for handle in self._handles:
            handle.remove()
        self._handles.clear()

    def generate_cam(self, input_tensor, target_class=None):
        """Compute the Grad-CAM map for the first sample of ``input_tensor``.

        Args:
            input_tensor: batch of shape [N, C, H, W]; only sample 0 is used.
            target_class: index of the class to explain; defaults to the
                class with the highest score for sample 0.

        Returns:
            2D numpy array of shape [H, W] scaled to the range [0, 1].
        """
        import cv2  # local import: only needed for the final resize

        # Forward pass
        output = self.model(input_tensor)
        if target_class is None:
            target_class = int(output[0].argmax())

        # Backward pass from the selected class score
        self.model.zero_grad()
        output[0][target_class].backward()

        # Channel weights: spatial mean of the gradients
        gradients = self.gradients[0]
        activations = self.activations[0]
        weights = torch.mean(gradients, dim=(1, 2), keepdim=True)

        # Weighted sum over channels, keeping positive evidence only
        cam = torch.sum(weights * activations, dim=0).cpu().numpy()
        cam = np.maximum(cam, 0)

        # cv2.resize expects (width, height), i.e. (shape[3], shape[2]).
        height, width = input_tensor.shape[2], input_tensor.shape[3]
        cam = cv2.resize(cam, (width, height))

        # Min-max normalise; a constant map becomes all zeros, not NaN.
        low = np.min(cam)
        span = np.max(cam) - low
        cam = (cam - low) / (span if span > 0 else 1.0)

        return cam
计算机视觉的核心要点: