# 多模态RAG系统:文本与图像的联合检索

## 技术原理

传统RAG系统主要处理文本数据,但在实际应用中,信息往往以多模态形式存在(文本、图像、视频、音频)。多模态RAG通过联合编码和跨模态检索,实现更智能的信息检索和生成。

### 多模态RAG的挑战

- 模态对齐:不同模态的语义空间需要统一
- 特征融合:如何有效结合文本和图像特征
- 跨模态检索:文本查图像、图像查文本的双向检索
- 计算效率:多模态编码计算成本高

### 技术栈选择

- 文本编码:BGE、E5 等在 MTEB 排行榜上领先的模型
- 图像编码:CLIP、VisualBERT、LayoutLM
- 多模态融合:BLIP-2、Flamingo、LLaVA
- 向量数据库:Milvus、Qdrant、Weaviate(支持多模态)

## CLIP多模态嵌入
传统RAG系统主要处理文本数据,但在实际应用中,信息往往以多模态形式存在(文本、图像、视频、音频)。多模态RAG通过联合编码和跨模态检索,实现更智能的信息检索和生成。
CLIP(Contrastive Language-Image Pre-training)是OpenAI提出的视觉-语言模型,通过对比学习将图像和文本映射到同一特征空间,因此文本向量和图像向量可以直接用余弦相似度进行比较。
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel


class CLIPEncoder:
    """Embed text and images with a CLIP model.

    Wraps a Hugging Face ``CLIPModel`` and its ``CLIPProcessor``. The model
    is switched to eval mode and every forward pass runs under
    ``torch.no_grad()``.
    """

    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()

    @staticmethod
    def _load_image(image):
        """Return a PIL image for *image*.

        *image* may already be a ``PIL.Image.Image`` (returned unchanged) or
        anything ``Image.open`` accepts, i.e. a path or a binary file object.
        """
        if isinstance(image, Image.Image):
            return image
        with Image.open(image) as opened:
            # convert() decodes the pixels into a new image, so the
            # underlying file can be closed when the ``with`` block exits.
            return opened.convert("RGB")

    def encode_text(self, text):
        """Encode one string and return its embedding as a 1-D NumPy array."""
        inputs = self.processor(
            text=[text],
            return_tensors="pt",
            padding=True,
            # Without truncation, text longer than the model's text context
            # (e.g. a full PDF page) makes the forward pass fail.
            truncation=True,
        )
        with torch.no_grad():
            text_features = self.model.get_text_features(**inputs)
        return text_features.cpu().numpy()[0]

    def encode_image(self, image_path):
        """Encode one image and return its embedding as a 1-D NumPy array.

        Args:
            image_path: A file path, a binary file object, or an in-memory
                ``PIL.Image.Image``.
        """
        image = self._load_image(image_path)
        inputs = self.processor(images=image, return_tensors="pt")
        with torch.no_grad():
            image_features = self.model.get_image_features(**inputs)
        return image_features.cpu().numpy()[0]

    def encode_batch(self, texts=None, images=None):
        """Encode a batch of texts or a batch of images.

        Each modality is run through the processor on its own, so the text
        head never receives image tensors and vice versa.

        Args:
            texts: Optional list of strings.
            images: Optional list of paths, file objects or PIL images.

        Returns:
            A 2-D NumPy array with one row per input. When both arguments are
            given, only the text embeddings are returned (texts take
            priority); call the method a second time for the images.

        Raises:
            ValueError: If neither ``texts`` nor ``images`` contains anything.
        """
        if texts:
            inputs = self.processor(
                text=list(texts),
                return_tensors="pt",
                padding=True,
                truncation=True,
            )
            with torch.no_grad():
                return self.model.get_text_features(**inputs).cpu().numpy()
        if images:
            loaded = [self._load_image(img) for img in images]
            inputs = self.processor(images=loaded, return_tensors="pt")
            with torch.no_grad():
                return self.model.get_image_features(**inputs).cpu().numpy()
        raise ValueError("encode_batch() needs a non-empty 'texts' or 'images'")


# Usage example
encoder = CLIPEncoder()
text_embedding = encoder.encode_text("一只猫坐在沙发上")
image_embedding = encoder.encode_image("cat.jpg")

# Compute the text-image similarity
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity([text_embedding], [image_embedding])[0][0]
print(f"Text-Image Similarity: {similarity}")
处理PDF中的文本和图像内容:
import fitz  # PyMuPDF
from PIL import Image
import io
import numpy as np


class MultimodalDocumentProcessor:
    """Extract text and embedded images from a PDF and embed each of them.

    Every processed item is appended to ``self.chunks`` as a dict with the
    keys ``type``, ``content``, ``embedding``, ``page`` and ``metadata``
    (image items additionally carry ``image_data``).
    """

    def __init__(self, image_encoder, text_encoder):
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.chunks = []

    def _encode_text(self, text):
        """Embed *text* with whichever text method the encoder provides.

        ``encode_text`` is preferred; ``encode`` is kept as a fallback for
        encoders that only expose that name.
        """
        encode = getattr(self.text_encoder, "encode_text", None)
        if encode is None:
            encode = self.text_encoder.encode
        return encode(text)

    def process_pdf(self, pdf_path):
        """Process one PDF, collecting a chunk per page text and per image.

        Args:
            pdf_path: Path of the PDF to open.

        Returns:
            ``self.chunks``, i.e. every chunk collected by this instance so
            far, including chunks from earlier calls.
        """
        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc):
                # Page text: one chunk per page, skipped when blank.
                text = page.get_text()
                if text.strip():
                    text_embedding = self._encode_text(text)
                    self.chunks.append({
                        'type': 'text',
                        'content': text,
                        'embedding': text_embedding,
                        'page': page_num,
                        'metadata': {'source': pdf_path}
                    })

                # Embedded images: one chunk per image on the page.
                image_list = page.get_images(full=True)
                for img_index, img in enumerate(image_list):
                    xref = img[0]
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]

                    # NOTE: the image encoder is handed an in-memory PIL
                    # image here, not a file path.
                    image = Image.open(io.BytesIO(image_bytes))
                    image_embedding = self.image_encoder.encode_image(image)

                    self.chunks.append({
                        'type': 'image',
                        'content': f"image_{page_num}_{img_index}",
                        'embedding': image_embedding,
                        'page': page_num,
                        'image_data': image_bytes,
                        'metadata': {'source': pdf_path, 'format': base_image['ext']}
                    })
        finally:
            # Release the document even when extraction or encoding fails.
            doc.close()
        return self.chunks

    def chunk_text(self, text, chunk_size=500, overlap=50):
        """Split *text* into overlapping chunks of whitespace-separated words.

        Args:
            text: Text to split.
            chunk_size: Maximum number of words per chunk.
            overlap: Number of words shared by consecutive chunks.

        Returns:
            A list of chunk strings; empty when *text* has no words.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not in ``[0, chunk_size)``; such values would give a
                non-advancing window.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if not 0 <= overlap < chunk_size:
            raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

        # NOTE(review): str.split() only splits on whitespace, so text
        # written without spaces stays a single "word" - confirm that is
        # acceptable for the documents being ingested.
        words = text.split()
        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # Once a window reaches the last word, any further window would
            # only repeat words already covered by this one.
            if start + chunk_size >= len(words):
                break
        return chunks
使用Qdrant存储和检索多模态向量:
import uuid

from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue


class MultimodalVectorStore:
    """Store and query text/image embeddings in one Qdrant collection.

    Args:
        url: Passed to ``QdrantClient`` as ``url``.
        port: Passed to ``QdrantClient`` as ``port``.
        text_encoder: Optional object with ``encode_text(text)``; only
            needed by ``hybrid_search``.
        image_encoder: Optional object with ``encode_image(image)``; only
            needed by ``hybrid_search``.
        collection_name: Name of the collection to create or reuse.
        vector_size: Dimensionality used when the collection is created.
    """

    def __init__(self, url="localhost", port=6333, text_encoder=None,
                 image_encoder=None, collection_name="multimodal_docs",
                 vector_size=512):
        self.client = QdrantClient(url=url, port=port)
        self.collection_name = collection_name
        self.vector_size = vector_size
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder
        self._create_collection()

    def _create_collection(self):
        """Create the collection with cosine distance if it is missing."""
        collections = self.client.get_collections().collections
        if not any(c.name == self.collection_name for c in collections):
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=self.vector_size, distance=Distance.COSINE)
            )

    @staticmethod
    def _point_id(chunk):
        """Derive a stable UUID string from a chunk's identifying fields.

        Sequential ids restart at 0 on every call, so a second ingest would
        overwrite the first. Hashing source/page/type/content keeps ids
        distinct across documents while re-ingesting the same chunk still
        replaces its earlier copy.
        """
        metadata = chunk['metadata']
        source = metadata.get('source', '') if isinstance(metadata, dict) else ''
        identity = f"{source}|{chunk['page']}|{chunk['type']}|{chunk['content']}"
        return str(uuid.uuid5(uuid.NAMESPACE_URL, identity))

    def insert_chunks(self, chunks):
        """Upsert *chunks* into the collection.

        Each chunk is a dict with ``embedding`` (an object providing
        ``tolist()``), ``content``, ``type``, ``page`` and ``metadata``.
        Nothing is sent when *chunks* is empty.
        """
        points = [
            PointStruct(
                id=self._point_id(chunk),
                vector=chunk['embedding'].tolist(),
                payload={
                    'content': chunk['content'],
                    'type': chunk['type'],
                    'page': chunk['page'],
                    'metadata': chunk['metadata']
                }
            )
            for chunk in chunks
        ]
        if not points:
            return
        self.client.upsert(
            collection_name=self.collection_name,
            points=points
        )

    def search(self, query_embedding, limit=5, modality_filter=None):
        """Return the nearest stored items to *query_embedding*.

        Args:
            query_embedding: Query vector providing ``tolist()``.
            limit: Maximum number of hits.
            modality_filter: When set, only items whose payload ``type``
                equals this value are considered.

        Returns:
            A list of dicts with ``content``, ``type``, ``score`` and
            ``metadata``.
        """
        query_filter = None
        if modality_filter:
            query_filter = Filter(
                must=[FieldCondition(key="type", match=MatchValue(value=modality_filter))]
            )

        # NOTE(review): newer qdrant-client releases may deprecate
        # ``search`` in favour of ``query_points`` - confirm against the
        # installed version.
        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=query_embedding.tolist(),
            limit=limit,
            query_filter=query_filter
        )

        return [
            {
                'content': hit.payload['content'],
                'type': hit.payload['type'],
                'score': hit.score,
                'metadata': hit.payload['metadata']
            }
            for hit in results
        ]

    def hybrid_search(self, query_text, query_image=None, text_weight=0.5,
                      image_weight=0.5, limit=5):
        """Search text items by text and image items by image, then merge.

        Scores are multiplied by the modality weight; items that share the
        same ``content`` key have their weighted scores added.

        Args:
            query_text: Text query; skipped when falsy.
            query_image: Image query; skipped when falsy.
            text_weight: Multiplier for text-hit scores.
            image_weight: Multiplier for image-hit scores.
            limit: Hits fetched per modality and results returned.

        Returns:
            Up to *limit* result dicts ordered by merged score, best first.

        Raises:
            RuntimeError: If a query needs an encoder that was not passed to
                the constructor.
        """
        text_results = []
        image_results = []

        # Text query against text items
        if query_text:
            if self.text_encoder is None:
                raise RuntimeError("hybrid_search with query_text requires a text_encoder")
            text_embedding = self.text_encoder.encode_text(query_text)
            text_results = self.search(text_embedding, limit=limit, modality_filter='text')

        # Image query against image items
        if query_image:
            if self.image_encoder is None:
                raise RuntimeError("hybrid_search with query_image requires an image_encoder")
            image_embedding = self.image_encoder.encode_image(query_image)
            image_results = self.search(image_embedding, limit=limit, modality_filter='image')

        # Merge the weighted scores, keyed by content
        merged = {}
        for result in text_results:
            key = result['content']
            merged[key] = {'result': result, 'score': result['score'] * text_weight}

        for result in image_results:
            key = result['content']
            if key in merged:
                merged[key]['score'] += result['score'] * image_weight
            else:
                merged[key] = {'result': result, 'score': result['score'] * image_weight}

        # Best merged score first
        sorted_results = sorted(merged.values(), key=lambda x: x['score'], reverse=True)
        return [item['result'] for item in sorted_results[:limit]]
构建能理解文本和图像的AI Agent:
import openai
from typing import List, Dict, Any


class MultimodalRAGAgent:
    """Answer questions from retrieved text and image chunks.

    Args:
        vector_store: Object with ``search(embedding, limit=..., modality_filter=...)``.
        image_encoder: Object with ``encode_image(image)``.
        text_encoder: Object with ``encode_text(text)``.
        api_key: OpenAI API key.
        model: Chat model name used to generate answers.
    """

    def __init__(self, vector_store, image_encoder, text_encoder, api_key,
                 model="gpt-4-vision-preview"):
        self.vector_store = vector_store
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.model = model
        self._api_key = api_key
        openai.api_key = api_key

    def query(self, query_text: str, query_image=None, top_k=3):
        """Retrieve context for the query and generate an answer.

        When *query_image* is given, retrieval uses the image embedding;
        otherwise it uses the text embedding. The answer is always generated
        from *query_text* plus the retrieved context.
        """
        if query_image:
            query_embedding = self.image_encoder.encode_image(query_image)
        else:
            query_embedding = self.text_encoder.encode_text(query_text)
        results = self.vector_store.search(query_embedding, limit=top_k)

        context = self._build_context(results)
        response = self._generate_answer(query_text, context)

        return response

    def _build_context(self, results: List[Dict]) -> str:
        """Render retrieved items as text, one paragraph per item.

        Text items contribute their content; image items contribute a
        bracketed placeholder with the image identifier. Items of any other
        type are ignored.
        """
        context_parts = []

        for result in results:
            if result['type'] == 'text':
                context_parts.append(f"文本片段: {result['content']}")
            elif result['type'] == 'image':
                context_parts.append(f"[图像: {result['content']}]")

        return "\n\n".join(context_parts)

    def _generate_answer(self, query: str, context: str) -> str:
        """Ask the chat model to answer *query* using *context*."""
        messages = [
            {
                "role": "system",
                "content": "你是一个智能助手,根据提供的多模态上下文回答用户问题。"
            },
            {
                "role": "user",
                "content": f"问题: {query}\n\n上下文:\n{context}"
            }
        ]

        if hasattr(openai, "OpenAI"):
            # openai>=1.0: the module-level ChatCompletion API is gone and
            # requests go through a client object.
            client = openai.OpenAI(api_key=self._api_key)
            response = client.chat.completions.create(
                model=self.model,
                messages=messages,
                max_tokens=500
            )
        else:
            # Legacy openai<1.0 interface.
            response = openai.ChatCompletion.create(
                model=self.model,
                messages=messages,
                max_tokens=500
            )

        return response.choices[0].message.content

    def visualize_query(self, query_text: str, limit=5):
        """Find images matching a text query, print them and return them.

        Args:
            query_text: Text describing the wanted image.
            limit: Maximum number of image hits.

        Returns:
            The list of hits returned by the vector store.
        """
        query_embedding = self.text_encoder.encode_text(query_text)
        results = self.vector_store.search(query_embedding, modality_filter='image', limit=limit)

        print(f"Query: {query_text}")
        print(f"Found {len(results)} relevant images:")
        for i, result in enumerate(results, 1):
            print(f"{i}. {result['content']} (score: {result['score']:.4f})")

        return results
class ProductManualQA:
    """Question answering over product manuals that contain text and images.

    Args:
        api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
            environment variable is used, falling back to the placeholder
            ``"your-api-key"``.
    """

    def __init__(self, api_key=None):
        import os

        if api_key is None:
            api_key = os.environ.get("OPENAI_API_KEY", "your-api-key")

        self.encoder = CLIPEncoder()
        self.vector_store = MultimodalVectorStore()
        # The same CLIP encoder serves as both image and text encoder.
        self.agent = MultimodalRAGAgent(
            self.vector_store,
            self.encoder,
            self.encoder,
            api_key=api_key
        )

    def ingest_manual(self, pdf_path: str):
        """Extract, embed and store the contents of one PDF manual."""
        processor = MultimodalDocumentProcessor(self.encoder, self.encoder)
        chunks = processor.process_pdf(pdf_path)
        self.vector_store.insert_chunks(chunks)

    def ask(self, question: str, reference_image=None):
        """Answer *question*, optionally retrieving by *reference_image*."""
        response = self.agent.query(question, reference_image)
        return response

    def visualize_component(self, description: str):
        """Look up images matching *description*, print and return them."""
        results = self.agent.visualize_query(description)

        for result in results:
            # A real application would render the image here.
            print(f"Found component image: {result['content']}")

        return results


# Usage example
qa_system = ProductManualQA()

# Ingest the product manual
qa_system.ingest_manual("product_manual.pdf")

# Text-only question
answer = qa_system.ask("如何安装设备?")
print(f"Answer: {answer}")

# Question with an image
# The user uploads a photo of one part of the device
# answer = qa_system.ask("这是什么部分?", reference_image="component.jpg")
from collections import OrderedDict
from functools import lru_cache
import hashlib


class CachedEncoder:
    """Wrap an encoder and memoize its single-text embeddings.

    Args:
        encoder: Object providing ``encode_text``, ``encode_image`` and
            ``encode_batch``.
        max_size: Maximum number of cached embeddings. ``None`` (default)
            keeps every entry; otherwise the least recently used entry is
            evicted once the limit is exceeded.

    Raises:
        ValueError: If ``max_size`` is negative.
    """

    def __init__(self, encoder, max_size=None):
        if max_size is not None and max_size < 0:
            raise ValueError("max_size must be None or a non-negative integer")
        self.encoder = encoder
        self.max_size = max_size
        # Ordered oldest-to-newest so eviction can pop from the front.
        self.cache = OrderedDict()

    def encode_text(self, text):
        """Return the embedding of *text*, encoding it only on a cache miss.

        A hit returns the very object stored on the first call.
        """
        # Keys are MD5 digests so long texts are not kept alive as dict keys.
        cache_key = hashlib.md5(text.encode()).hexdigest()
        try:
            embedding = self.cache[cache_key]
        except KeyError:
            embedding = self.encoder.encode_text(text)
            self.cache[cache_key] = embedding
            if self.max_size is not None:
                while len(self.cache) > self.max_size:
                    self.cache.popitem(last=False)
        else:
            # Mark the entry as most recently used.
            self.cache.move_to_end(cache_key)
        return embedding

    def encode_image(self, image):
        """Encode *image* with the wrapped encoder (not cached)."""
        return self.encoder.encode_image(image)

    def encode_batch(self, texts):
        """Encode *texts* in one call to the wrapped encoder (not cached)."""
        return self.encoder.encode_batch(texts=texts)
import asyncio
import aiohttp


async def async_search(vector_store, query_embeddings):
    """Run one search per embedding concurrently.

    If *vector_store* provides ``async_search(embedding, session)``, it is
    called with a shared ``aiohttp.ClientSession``. Otherwise the store's
    blocking ``search(embedding)`` is run on the event loop's default
    executor, so stores without an async API still work.

    Args:
        vector_store: Store exposing ``async_search`` or ``search``.
        query_embeddings: Iterable of query vectors.

    Returns:
        A list of search results in the same order as *query_embeddings*.
    """
    native_search = getattr(vector_store, "async_search", None)

    if native_search is not None:
        async with aiohttp.ClientSession() as session:
            tasks = [
                native_search(embedding, session)
                for embedding in query_embeddings
            ]
            results = await asyncio.gather(*tasks)
        return results

    # NOTE(review): this runs vector_store.search from worker threads -
    # confirm the underlying client tolerates concurrent calls.
    loop = asyncio.get_running_loop()
    tasks = [
        loop.run_in_executor(None, vector_store.search, embedding)
        for embedding in query_embeddings
    ]
    results = await asyncio.gather(*tasks)
    return results
多模态RAG系统通过结合文本和图像的检索能力,提供了更丰富的信息访问方式。从产品说明书智能问答到医疗影像分析,多模态RAG都有广泛的应用前景。关键在于选择合适的编码模型、设计高效的检索策略,并优化系统的整体性能。