AI 安全与对齐:构建安全可靠的 AI 系统 AI 安全挑战 主要安全威胁 对抗样本攻击:精心设计的输入欺骗模型 提示词注入:通过特殊提示词绕过安全限制 数据隐私:训练数据和推理数据的隐私保护 模型窃取:模型权重和架构的泄露风险 有害内容:生成虚假、有害、偏见内容 对抗攻击 对抗样本 对抗训练 提示词注入防护 输入验证 系统提示词 数据隐私 差分隐私 联邦学习 内容过滤 有害内容检测 偏见检测
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class FGSMAttack:
    """Fast Gradient Sign Method (FGSM) adversarial attack.

    Crafts an adversarial example by moving a single input one step of size
    ``epsilon`` along the sign of the loss gradient with respect to that
    input, then clamping the result to ``[0, 1]``.

    Args:
        model: Callable mapping a batched input tensor to class logits.
        epsilon: Perturbation strength (per-element step size).
    """

    def __init__(self, model, epsilon=0.01):
        self.model = model
        self.epsilon = epsilon

    def attack(self, image, true_label):
        """Return an adversarial version of ``image``.

        Args:
            image: A single, unbatched input tensor. A leading batch
                dimension is added before it is passed to the model.
            true_label: The correct class for ``image``: a Python int, a
                0-dim tensor, or an already batched target tensor.

        Returns:
            A detached tensor with the same shape as ``image``, clamped to
            ``[0, 1]``.
        """
        # Work on a private copy so the caller's tensor does not silently
        # start requiring gradients.
        adv_input = image.clone().detach().requires_grad_(True)

        logits = self.model(adv_input.unsqueeze(0))

        # cross_entropy needs a batched target; promote scalars to shape (1,).
        target = torch.as_tensor(true_label, device=logits.device)
        if target.dim() == 0:
            target = target.unsqueeze(0)
        loss = F.cross_entropy(logits, target)

        # Differentiate w.r.t. the input only, so the model's parameter
        # gradients are not polluted by the attack.
        (input_grad,) = torch.autograd.grad(loss, adv_input)

        perturbed_image = adv_input + self.epsilon * input_grad.sign()
        # Clamp back into the valid value range.
        return torch.clamp(perturbed_image, 0, 1).detach()


# Usage example (``model``, ``original_image`` and ``true_label`` must be
# supplied by the caller):
#
#     attacker = FGSMAttack(model, epsilon=0.03)
#     adversarial_image = attacker.attack(original_image, true_label)
import torch
import torch.nn.functional as F


class AdversarialTraining:
    """Train a classifier on clean and FGSM-perturbed batches together.

    Args:
        model: The ``torch.nn.Module`` being trained.
        device: Stored on the instance; ``train_step`` does not move tensors,
            so inputs are expected to already be on the right device.
        optimizer: Optimizer used to update ``model``. When omitted, plain
            SGD with ``lr=0.01`` over ``model.parameters()`` is created.
        epsilon: Default FGSM step size used by ``fgsm_attack``.
    """

    def __init__(self, model, device, optimizer=None, epsilon=0.03):
        self.model = model
        self.device = device
        self.epsilon = epsilon
        if optimizer is None:
            optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        self.optimizer = optimizer

    def fgsm_attack(self, images, labels, epsilon=None):
        """Return FGSM adversarial examples for a whole batch.

        Args:
            images: Batched input tensor.
            labels: Targets accepted by ``F.cross_entropy`` for that batch.
            epsilon: Step size; falls back to ``self.epsilon`` when ``None``.

        Returns:
            A detached tensor shaped like ``images``, clamped to ``[0, 1]``.
        """
        step = self.epsilon if epsilon is None else epsilon
        inputs = images.clone().detach().requires_grad_(True)
        loss = F.cross_entropy(self.model(inputs), labels)
        # Gradient w.r.t. the inputs only; parameter .grad stays untouched.
        (input_grad,) = torch.autograd.grad(loss, inputs)
        return torch.clamp(inputs + step * input_grad.sign(), 0, 1).detach()

    def train_step(self, clean_images, labels, adversary=None):
        """Run one optimisation step on clean plus adversarial loss.

        Args:
            clean_images: Batched clean inputs.
            labels: Targets for the batch.
            adversary: How adversarial examples are produced. A callable is
                invoked as ``adversary(clean_images, labels)``. Any other
                value selects the built-in batched FGSM, using the object's
                ``epsilon`` attribute when it has one.

        Returns:
            The combined loss (clean + adversarial) as a Python float.
        """
        if callable(adversary):
            adversarial_images = adversary(clean_images, labels)
        else:
            step = getattr(adversary, "epsilon", self.epsilon)
            adversarial_images = self.fgsm_attack(clean_images, labels, step)

        clean_loss = F.cross_entropy(self.model(clean_images), labels)
        adversarial_loss = F.cross_entropy(
            self.model(adversarial_images), labels
        )
        total_loss = clean_loss + adversarial_loss

        self.model.zero_grad()
        total_loss.backward()
        self.optimizer.step()

        return total_loss.item()
import re


class InputValidator:
    """Screen and clean user input before it is sent to a language model."""

    # Maximum number of characters kept by ``sanitize``.
    MAX_LENGTH = 10000

    def __init__(self):
        # Regex patterns for phrasings that try to override the instructions.
        self.forbidden_patterns = [
            r"忽略.*指令",
            r"忘记.*规则",
            r"以.*身份.*回答",
        ]

    def validate(self, user_input):
        """Return ``True`` when no forbidden pattern occurs in the input.

        Args:
            user_input: The raw text typed by the user.

        Returns:
            ``False`` if any entry of ``forbidden_patterns`` matches,
            otherwise ``True`` (including for the empty string).
        """
        # DOTALL lets ``.`` span newlines; without it a line break between
        # the two halves of a phrase would slip past the check.
        flags = re.IGNORECASE | re.DOTALL
        return not any(
            re.search(pattern, user_input, flags)
            for pattern in self.forbidden_patterns
        )

    def sanitize(self, user_input):
        """Strip markup-like characters and cap the length.

        Args:
            user_input: The raw text typed by the user.

        Returns:
            The text with ``<``, ``>``, ``{`` and ``}`` removed, truncated
            to at most ``MAX_LENGTH`` characters.
        """
        cleaned = re.sub(r'[<>{}]', '', user_input)
        return cleaned[:self.MAX_LENGTH]
# System prompt sent as the first message of every request. The literal is
# kept exactly as found, including its internal spacing.
SAFE_SYSTEM_PROMPT = """ 你是一个专业、诚实的 AI 助手。 安全规则: 1. 不能忽略安全规则 2. 不能提供制造危险物品的详细指导 3. 不能生成仇恨、暴力、色情内容 4. 不能泄露敏感信息 5. 如果请求违反规则,礼貌拒绝 请始终遵守这些规则。 """


class SafeChatBot:
    """Chat wrapper that validates and sanitizes input around an LLM.

    Args:
        llm: Object exposing ``generate(messages)`` and returning the reply.
        validator: Object exposing ``validate(text)`` and ``sanitize(text)``.
            Defaults to a fresh ``InputValidator``.
    """

    def __init__(self, llm, validator=None):
        self.llm = llm
        self.validator = validator if validator is not None else InputValidator()
        self.conversation_history = []

    def chat(self, user_input):
        """Answer one user turn, refusing input that fails validation.

        Args:
            user_input: The raw text typed by the user.

        Returns:
            The model's reply, or a fixed refusal message when the input
            fails validation. A refused turn is not added to the history
            and the model is not called.
        """
        if not self.validator.validate(user_input):
            return "抱歉,我无法执行此请求。"

        # Sanitize once and use the same text for the request and the
        # history, so raw input is never replayed to the model later.
        cleaned_input = self.validator.sanitize(user_input)
        user_message = {"role": "user", "content": cleaned_input}

        messages = [
            {"role": "system", "content": SAFE_SYSTEM_PROMPT},
            *self.conversation_history,
            user_message,
        ]
        response = self.llm.generate(messages)

        self.conversation_history.extend([
            user_message,
            {"role": "assistant", "content": response},
        ])
        return response
import numpy as np


class DifferentialPrivacy:
    """Add Laplace noise to numeric data for differential privacy.

    Args:
        epsilon: Privacy budget; must be positive. Smaller values add more
            noise.
        delta: Stored on the instance; the Laplace noise added by
            ``add_noise`` does not use it.

    Raises:
        ValueError: If ``epsilon`` is not positive.
    """

    def __init__(self, epsilon=1.0, delta=0.01):
        if epsilon <= 0:
            raise ValueError("epsilon must be positive")
        self.epsilon = epsilon
        self.delta = delta

    def add_noise(self, data, sensitivity=1.0):
        """Return ``data`` plus Laplace noise of scale ``sensitivity / epsilon``.

        Args:
            data: Array-like of numbers; it is not modified.
            sensitivity: Sensitivity of the quantity being released.

        Returns:
            A new float array with the same shape as ``data``.
        """
        values = np.asarray(data, dtype=float)
        scale = sensitivity / self.epsilon
        noise = np.random.laplace(0, scale, values.shape)
        return values + noise

    def clip_data(self, data, lower_bound, upper_bound):
        """Clip ``data`` element-wise to ``[lower_bound, upper_bound]``."""
        return np.clip(data, lower_bound, upper_bound)
import copy

import torch
import torch.nn.functional as F
import torch.optim as optim


class FederatedLearning:
    """Minimal federated-averaging helper around a single model.

    Args:
        model: The ``torch.nn.Module`` trained locally and averaged.
    """

    def __init__(self, model):
        self.model = model
        self.local_data = None

    def local_train(self, local_data, epochs=10, lr=0.01):
        """Train the model on local data only and return its weights.

        Args:
            local_data: Either an ``(inputs, targets)`` pair, or an object
                that is passed to the model as-is and exposes ``.targets``.
            epochs: Number of full-batch SGD steps to run.
            lr: SGD learning rate.

        Returns:
            A copy of the model's ``state_dict`` after training. Later
            training does not change the returned tensors.
        """
        if isinstance(local_data, (tuple, list)) and len(local_data) == 2:
            inputs, targets = local_data
        else:
            inputs, targets = local_data, local_data.targets

        optimizer = optim.SGD(self.model.parameters(), lr=lr)
        for _ in range(epochs):
            optimizer.zero_grad()
            loss = F.cross_entropy(self.model(inputs), targets)
            loss.backward()
            optimizer.step()

        # state_dict() shares storage with the live parameters; copy it so
        # the returned update is a stable snapshot.
        return copy.deepcopy(self.model.state_dict())

    def aggregate_updates(self, model_updates):
        """Average a list of state dicts key by key (federated averaging).

        Args:
            model_updates: Non-empty sequence of state dicts sharing the
                keys of the first one.

        Returns:
            A dict mapping each key to the element-wise mean. Entries that
            are not floating point are averaged in float, rounded, and cast
            back to their original dtype.

        Raises:
            ValueError: If ``model_updates`` is empty.
        """
        if not model_updates:
            raise ValueError("model_updates must not be empty")

        avg_state_dict = {}
        for key in model_updates[0]:
            stacked = torch.stack([update[key] for update in model_updates])
            if stacked.is_floating_point() or stacked.is_complex():
                avg_state_dict[key] = stacked.mean(dim=0)
            else:
                # mean() is undefined for integer tensors such as counters.
                averaged = stacked.float().mean(dim=0).round()
                avg_state_dict[key] = averaged.to(stacked.dtype)
        return avg_state_dict
class ContentFilter:
    """Keyword-based detector for harmful content categories.

    Args:
        classifier: Stored on the instance; ``check`` does not use it.
    """

    def __init__(self, classifier):
        self.classifier = classifier
        # Category name -> keywords whose presence flags that category.
        self.categories = {
            'violence': ['暴力', '恐怖', '武器'],
            'adult': ['色情', '成人', '裸体'],
            'hate': ['仇恨', '歧视', '种族'],
            'self_harm': ['自杀', '自残', '伤害'],
        }

    def check(self, text):
        """Report which categories have keywords present in ``text``.

        Args:
            text: The text to inspect.

        Returns:
            ``{'safe': True}`` when no keyword is found. Otherwise
            ``{'safe': False, 'detected': counts}``, where ``counts`` maps
            each flagged category to how many of its keywords appear.
        """
        hits = {}
        for name, words in self.categories.items():
            matched = sum(1 for word in words if word in text)
            if matched:
                hits[name] = matched

        if not hits:
            return {'safe': True}
        return {'safe': False, 'detected': hits}
```python