1.1 Transformers架构概览 — Transformers架构基础

本节导读:通过本节学习,读者将全面理解Transformers的完整架构组成、核心设计思路以及与传统模型的区别,为后续深入学习奠定坚实基础。

学习目标:
- 掌握Transformers的完整架构组成和各组件功能
- 理解自注意力机制在序列处理中的核心作用
- 了解编码器-解码器架构的设计思想
- 掌握位置编码的重要性和实现原理
- 理解残差连接和层归一化的技术价值

核心概念:Transformers整体架构

Transformers是一种基于自注意力机制的神经网络架构,与传统RNN和LSTM不同,它能够并行处理整个序列,有效解决了长距离依赖问题。
本节导读:通过本节学习,读者将全面理解Transformers的完整架构组成、核心设计思路以及与传统模型的区别,为后续深入学习奠定坚实基础。
Transformers是一种基于自注意力机制的神经网络架构,与传统RNN和LSTM不同,它能够并行处理整个序列,有效解决了长距离依赖问题。其核心架构包括:自注意力机制(多头注意力)、编码器-解码器结构、位置编码、前馈网络,以及残差连接与层归一化。
自注意力机制是Transformers的核心,它能够计算序列中每个元素与其他元素的相关性,为每个元素生成一个动态的上下文表示。
# 基础依赖
torch==2.1.0
transformers==4.35.2
numpy==1.24.3
matplotlib==3.7.2
seaborn==0.12.2
# Create and activate an isolated environment
conda create -n transformers-tutorial python=3.9
conda activate transformers-tutorial

# Install PyTorch from the CUDA 11.8 wheel index.
# Versions are pinned so the environment matches the tutorial's dependency list;
# torchvision 0.16.0 and torchaudio 2.1.0 are the releases paired with torch 2.1.0.
pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118

# Install the Transformers library
pip install transformers==4.35.2

# Install helper libraries (pinned to the same versions as the dependency list)
pip install numpy==1.24.3 matplotlib==3.7.2 seaborn==0.12.2 tqdm
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel

# Report the installed PyTorch build and, when a CUDA device is usable,
# the CUDA version and the name of the GPU.
print(f"PyTorch版本: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA可用: {cuda_ready}")
if cuda_ready:
    print(f"CUDA版本: {torch.version.cuda}")
    print(f"GPU设备: {torch.cuda.get_device_name()}")
# Smoke test: confirm the installation works by loading a pretrained model.
# Any failure is reported on stdout instead of aborting the script.
try:
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    print(f"成功加载模型: {model_name}")

    # Parameter counts: all parameters, then only those that require gradients
    total_params = sum(p.numel() for p in model.parameters())
    print(f"模型参数数量: {total_params:,}")
    trainable_params = sum(
        p.numel() for p in model.parameters() if p.requires_grad
    )
    print(f"可训练参数数量: {trainable_params:,}")

    # Dump the module tree
    print("\n模型结构:")
    print(model)
except Exception as err:
    print(f"模型加载失败: {err}")
def positional_encoding(max_seq_len, d_model):
    """
    Build the sinusoidal positional-encoding matrix.

    For every even column index k (0, 2, 4, ...) the frequency is
    exp(-k * ln(10000) / d_model); column k holds sin(pos * freq) and
    column k + 1 holds cos(pos * freq).

    Args:
        max_seq_len: maximum sequence length (number of rows)
        d_model: model dimension (number of columns); odd values are supported

    Returns:
        Positional-encoding tensor of shape [max_seq_len, d_model]
    """
    position = torch.arange(max_seq_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2, dtype=torch.float) * (-np.log(10000.0) / d_model)
    )
    angles = position * div_term  # [max_seq_len, ceil(d_model / 2)]

    pe = torch.zeros(max_seq_len, d_model)
    pe[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one fewer odd column than even columns,
    # so drop the surplus frequency to keep the shapes aligned.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe


# Visualize the positional encoding
max_seq_len = 100
d_model = 512
pe_matrix = positional_encoding(max_seq_len, d_model)

plt.figure(figsize=(12, 8))
plt.imshow(pe_matrix, cmap='viridis', aspect='auto')
plt.xlabel('维度')
plt.ylabel('位置')
plt.title('位置编码可视化')
plt.colorbar()
plt.show()
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Args:
        d_model: feature dimension of the input and of the output
        num_heads: number of attention heads; must divide d_model evenly
    """

    def __init__(self, d_model, num_heads=8):
        super(SelfAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        # d_model must split evenly across the attention heads
        assert self.head_dim * num_heads == d_model

        # Learned projections for queries, keys, values and the merged output
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Attend every position of ``x`` to every position of ``x``.

        Args:
            x: input tensor of shape [batch, seq_len, d_model]
            mask: optional tensor broadcastable to
                [batch, num_heads, seq_len, seq_len]; positions where
                ``mask == 0`` are excluded from attention

        Returns:
            Tuple ``(output, attention_weights)`` with shapes
            [batch, seq_len, d_model] and [batch, num_heads, seq_len, seq_len].
        """
        batch_size = x.size(0)

        # Project, then split into heads: [batch, num_heads, seq_len, head_dim]
        Q = self.q_linear(x).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.k_linear(x).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.v_linear(x).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product scores: [batch, num_heads, seq_len, seq_len]
        scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.head_dim)

        # Apply the mask (if any)
        if mask is not None:
            # Fill with the most negative value the score dtype can represent.
            # A hard-coded -1e9 is out of range for float16 and makes
            # masked_fill raise when the module runs in half precision.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # Softmax over the key axis gives the attention weights
        attention_weights = torch.softmax(scores, dim=-1)

        # Weighted sum of the values
        context = torch.matmul(attention_weights, V)

        # Merge the heads back into [batch, seq_len, d_model]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        # Final output projection
        output = self.out_linear(context)

        return output, attention_weights


# Exercise the self-attention layer
def test_self_attention():
    """Run SelfAttention on random input and print the resulting shapes."""
    d_model = 512
    seq_len = 10
    batch_size = 32

    # Random input
    x = torch.randn(batch_size, seq_len, d_model)

    # Build the self-attention layer
    attention = SelfAttention(d_model, num_heads=8)

    # Forward pass
    output, attention_weights = attention(x)

    print(f"输入形状: {x.shape}")
    print(f"输出形状: {output.shape}")
    print(f"注意力权重形状: {attention_weights.shape}")
    print(f"注意力权重总和: {attention_weights.sum(dim=-1)}")

    return output, attention_weights


# Run the test
test_self_attention()
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention and a feed-forward network.

    Each sub-layer is wrapped as dropout -> residual add -> layer normalization.

    Args:
        d_model: feature dimension
        num_heads: number of attention heads
        d_ff: hidden width of the feed-forward network
        dropout: dropout probability applied to each sub-layer output
    """

    def __init__(self, d_model, num_heads, d_ff=2048, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = SelfAttention(d_model, num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the layer to ``x``; ``mask`` is forwarded to the attention."""
        # Self-attention sub-layer
        attn_output, _ = self.self_attn(x, mask)
        x = x + self.dropout(attn_output)  # residual connection
        x = self.norm1(x)

        # Feed-forward sub-layer
        ffn_output = self.ffn(x)
        x = x + self.dropout(ffn_output)  # residual connection
        x = self.norm2(x)

        return x


class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder layers.

    Args:
        vocab_size: number of rows in the embedding table
        d_model: feature dimension
        num_heads: number of attention heads per layer
        num_layers: number of stacked EncoderLayer modules
        d_ff: hidden width of each feed-forward network
        max_seq_len: number of positions in the positional-encoding table
        dropout: dropout probability used inside every layer
    """

    def __init__(self, vocab_size, d_model, num_heads, num_layers, d_ff=2048, max_seq_len=512, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        self.d_model = d_model

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Register the table as a buffer so .to(device) / .cuda() move it with
        # the module; a plain tensor attribute would stay on the CPU and break
        # the addition in forward(). persistent=False keeps it out of
        # state_dict(), so checkpoint keys are the same as without the buffer.
        self.register_buffer(
            "positional_encoding",
            positional_encoding(max_seq_len, d_model),
            persistent=False,
        )

        # Stack of encoder layers
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.norm = nn.LayerNorm(d_model)

    def forward(self, x, mask=None):
        """Encode a batch of token ids.

        Args:
            x: integer tensor of token ids, shape [batch, seq_len]
            mask: optional attention mask forwarded to every layer

        Returns:
            Tensor of shape [batch, seq_len, d_model].
        """
        # Embedding (scaled by sqrt(d_model)) plus positional encoding
        seq_len = x.size(1)
        x = self.embedding(x) * np.sqrt(self.d_model)
        x = x + self.positional_encoding[:seq_len, :]

        # Run through the encoder layers
        for layer in self.layers:
            x = layer(x, mask)

        # Final normalization
        x = self.norm(x)

        return x


# Exercise the encoder
def test_encoder():
    """Run TransformerEncoder on random token ids and print the shapes."""
    vocab_size = 10000
    d_model = 512
    num_heads = 8
    num_layers = 6
    max_seq_len = 128
    batch_size = 32
    seq_len = 50

    # Build the encoder
    encoder = TransformerEncoder(vocab_size, d_model, num_heads, num_layers, max_seq_len=max_seq_len)

    # Build the input
    x = torch.randint(0, vocab_size, (batch_size, seq_len))

    # Forward pass
    output = encoder(x)

    print(f"编码器输入形状: {x.shape}")
    print(f"编码器输出形状: {output.shape}")
    print(f"编码器参数数量: {sum(p.numel() for p in encoder.parameters()):,}")

    return output


# Run the test
test_encoder()
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Tuple


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a sequence of embeddings.

    Args:
        d_model: embedding dimension (odd values are supported)
        max_len: number of positions to precompute
        batch_first: layout of the tensor passed to ``forward``. False
            (default) expects [seq_len, batch, d_model]; True expects
            [batch, seq_len, d_model].
    """

    def __init__(self, d_model: int, max_len: int = 5000, batch_first: bool = False):
        super(PositionalEncoding, self).__init__()
        self.batch_first = batch_first

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer odd column than even columns,
        # so drop the surplus frequency to keep the shapes aligned.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Stored as [max_len, 1, d_model]
        pe = pe.unsqueeze(0).transpose(0, 1)

        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` with the encoding for its leading positions added."""
        if self.batch_first:
            # [seq_len, 1, d_model] -> [1, seq_len, d_model], broadcast over batch
            return x + self.pe[:x.size(1)].transpose(0, 1)
        return x + self.pe[:x.size(0), :]


class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with dropout on the weights.

    Args:
        d_model: feature dimension of the inputs and of the output
        num_heads: number of attention heads; must divide d_model evenly
        dropout: dropout probability applied to the attention weights
    """

    def __init__(self, d_model: int, num_heads: int = 8, dropout: float = 0.1):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads

        assert self.head_dim * num_heads == d_model, "d_model必须被num_heads整除"

        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

        self.dropout = nn.Dropout(dropout)
        self.scale = np.sqrt(self.head_dim)

    def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor,
                mask: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute attention of ``query`` over ``key`` / ``value``.

        Args:
            query, key, value: tensors of shape [batch, length, d_model]
            mask: optional tensor broadcastable to
                [batch, num_heads, query_len, key_len]; positions where
                ``mask == 0`` are excluded from attention

        Returns:
            Tuple ``(output, attention_weights)``; the weights are returned
            after dropout has been applied to them.
        """
        batch_size = query.size(0)

        # Linear projections
        Q = self.q_linear(query)
        K = self.k_linear(key)
        V = self.v_linear(value)

        # Split into heads: [batch, num_heads, length, head_dim]
        Q = Q.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        K = K.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        V = V.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product scores
        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale

        # Apply the mask
        if mask is not None:
            # Fill with the most negative value the score dtype can represent.
            # A hard-coded -1e9 is out of range for float16 and makes
            # masked_fill raise when the module runs in half precision.
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # Attention weights
        attention_weights = torch.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)

        # Weighted sum of the values
        context = torch.matmul(attention_weights, V)

        # Merge the heads back into [batch, length, d_model]
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)

        # Output projection
        output = self.out_linear(context)

        return output, attention_weights


class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model: int, d_ff: int = 2048, dropout: float = 0.1):
        super(FeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.linear2(self.dropout(self.relu(self.linear1(x))))


class TransformerEncoderLayer(nn.Module):
    """One encoder layer: self-attention and a feed-forward network.

    Each sub-layer is wrapped as dropout -> residual add -> layer normalization.
    """

    def __init__(self, d_model: int, num_heads: int = 8, d_ff: int = 2048, dropout: float = 0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = FeedForward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        # Self-attention sub-layer
        attn_output, _ = self.self_attn(x, x, x, mask)
        x = x + self.dropout(attn_output)
        x = self.norm1(x)

        # Feed-forward sub-layer
        ffn_output = self.ffn(x)
        x = x + self.dropout(ffn_output)
        x = self.norm2(x)

        return x


class TransformerEncoder(nn.Module):
    """Token embedding + positional encoding + a stack of encoder layers.

    Args:
        vocab_size: number of rows in the embedding table
        d_model: feature dimension
        num_heads: number of attention heads per layer
        num_layers: number of stacked encoder layers
        d_ff: hidden width of each feed-forward network
        max_len: number of positions in the positional-encoding table
        dropout: dropout probability used inside every layer
    """

    def __init__(self, vocab_size: int, d_model: int = 512, num_heads: int = 8,
                 num_layers: int = 6, d_ff: int = 2048, max_len: int = 5000,
                 dropout: float = 0.1):
        super(TransformerEncoder, self).__init__()
        self.d_model = d_model

        # Embedding layer; the encoder works on [batch, seq_len, d_model]
        # tensors, so the positional encoding is built batch-first.
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_len, batch_first=True)

        # Stack of encoder layers
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
            for _ in range(num_layers)
        ])

        self.norm = nn.LayerNorm(d_model)

    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        """Encode token ids of shape [batch, seq_len] into [batch, seq_len, d_model]."""
        # Embedding (scaled by sqrt(d_model)) plus positional encoding.
        # The positional encoding is a module and must be called, not sliced.
        x = self.embedding(x) * np.sqrt(self.d_model)
        x = self.positional_encoding(x)

        # Run through the encoder layers
        for layer in self.layers:
            x = layer(x, mask)

        # Final normalization
        x = self.norm(x)

        return x


def create_mask(seq: torch.Tensor) -> torch.Tensor:
    """Create a padding mask.

    Token id 0 is treated as padding. For ``seq`` of shape [batch, seq_len]
    the result is a boolean tensor of shape [batch, 1, 1, seq_len] that is
    True at non-padding positions.
    """
    return (seq != 0).unsqueeze(1).unsqueeze(2)


def demonstrate_transformer():
    """Demonstrate the Transformer encoder on random token ids.

    Returns:
        Tuple ``(encoder, output)``.
    """
    # Settings
    vocab_size = 10000
    d_model = 512
    num_heads = 8
    num_layers = 6
    max_seq_len = 128
    batch_size = 32
    seq_len = 50

    # Build the encoder (the constructor's keyword is max_len)
    encoder = TransformerEncoder(vocab_size, d_model, num_heads, num_layers, max_len=max_seq_len)

    # Build the input sequence
    x = torch.randint(0, vocab_size, (batch_size, seq_len))

    # Build the mask
    mask = create_mask(x)

    # Forward pass
    with torch.no_grad():
        output = encoder(x, mask)

    print("="*60)
    print("Transformers编码器演示")
    print("="*60)
    print(f"输入序列形状: {x.shape}")
    print(f"输入序列范围: {x.min().item()} - {x.max().item()}")
    print(f"掩码形状: {mask.shape}")
    print(f"输出序列形状: {output.shape}")
    print(f"编码器参数数量: {sum(p.numel() for p in encoder.parameters()):,}")
    print(f"可训练参数数量: {sum(p.numel() for p in encoder.parameters() if p.requires_grad):,}")

    # Output statistics
    print(f"\n输出统计:")
    print(f"输出均值: {output.mean():.4f}")
    print(f"输出标准差: {output.std():.4f}")
    print(f"输出最小值: {output.min():.4f}")
    print(f"输出最大值: {output.max():.4f}")

    # Show the features at one position
    sample_idx = 0
    token_idx = 10
    print(f"\n样本 {sample_idx} 中第 {token_idx} 个token的输出特征:")
    print(f"输出向量 (前20维): {output[sample_idx, token_idx, :20]}")

    return encoder, output


if __name__ == "__main__":
    # Run the demonstration
    encoder, output = demonstrate_transformer()
A:Transformers通过自注意力机制能够直接计算序列中任意两个位置之间的关联,不受距离限制。每个Token都能够关注到序列中的所有其他Token,从而有效捕获长距离依赖关系。相比之下,RNN和LSTM需要通过多个时间步才能传递远距离信息,容易出现梯度消失问题。
A:位置编码为序列中的每个位置生成一个独特的向量,使得模型能够区分相同词汇在不同位置的含义。由于自注意力机制本身不包含位置信息,位置编码提供了相对位置和绝对位置的信息,帮助理解词语在序列中的排列顺序。常见的实现包括正弦编码和可学习编码。
A:多头注意力机制允许模型同时关注序列中的不同位置和不同的表示子空间。每个注意力头可以学习不同的表示模式,从而捕获更丰富的语义信息。这种设计类似于人类在不同层面上理解文本(如语法结构、语义关系、上下文关联等),提高了模型的表达能力。
A:残差连接(Residual Connection)通过将输入直接连接到层输出,解决了深层网络中的梯度消失问题,允许模型训练更深的网络。层归一化(Layer Normalization)通过对每个样本的特征进行归一化,稳定了训练过程,加快了收敛速度,并提高了模型的泛化能力。
A:编码器负责将输入序列转换为连续的表示向量,包含多个相同的编码器层,每层包含自注意力和前馈网络。解码器在编码器输出基础上生成目标序列,包含自注意力层、编码器-解码器注意力层和前馈网络。解码器的编码器-解码器注意力层使其能够关注输入序列的相关部分,而自注意力层确保目标序列内部的一致性。
本节全面介绍了Transformers架构的核心组成和设计思想。通过本节的学习,读者应该掌握了:Transformers的整体架构组成与各组件功能、自注意力机制在序列处理中的核心作用、编码器-解码器架构的设计思想、位置编码的原理与实现,以及残差连接和层归一化的技术价值。
下一节将深入探讨自注意力机制的数学原理和实现细节,帮助读者更深入地理解这一核心组件。
关键词:Transformers, 架构概览, 自注意力, 位置编码, 编码器-解码器, 残差连接, 层归一化
难度:基础
预计阅读:60分钟