多模态RAG系统：文本与图像的联合检索

文档摘要

多模态RAG系统：文本与图像的联合检索。技术原理：传统RAG系统主要处理文本数据，但在实际应用中，信息往往以多模态形式存在（文本、图像、视频、音频）。多模态RAG通过联合编码和跨模态检索，实现更智能的信息检索和生成。多模态RAG的挑战：模态对齐（不同模态的语义空间需要统一）；特征融合（如何有效结合文本和图像特征）；跨模态检索（文本查图像、图像查文本的双向检索）；计算效率（多模态编码计算成本高）。技术栈选择：文本编码——BGE、E5等MTEB排行榜领先模型；图像编码——CLIP、VisualBERT、LayoutLM；多模态融合——BLIP-2、Flamingo、LLaVA；向量数据库——Milvus、Qdrant、Weaviate（支持多模态）。CLIP多模态嵌入：CLIP（Contrastive Language-Image Pre-training）……

多模态RAG系统：文本与图像的联合检索

技术原理

传统RAG系统主要处理文本数据，但在实际应用中，信息往往以多模态形式存在（文本、图像、视频、音频）。多模态RAG通过联合编码和跨模态检索，实现更智能的信息检索和生成。

多模态RAG的挑战

模态对齐：不同模态的语义空间需要统一
特征融合：如何有效结合文本和图像特征
跨模态检索：文本查图像、图像查文本的双向检索
计算效率：多模态编码计算成本高

技术栈选择

文本编码：BGE、E5等MTEB排行榜领先模型
图像编码：CLIP、VisualBERT、LayoutLM
多模态融合：BLIP-2、Flamingo、LLaVA
向量数据库：Milvus、Qdrant、Weaviate（支持多模态）

CLIP多模态嵌入

CLIP（Contrastive Language-Image Pre-training）是OpenAI的视觉-语言模型，将图像和文本映射到同一特征空间。


import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

class CLIPEncoder:
    """Encode text and images into CLIP's shared embedding space.

    Wraps a Hugging Face ``CLIPModel`` and its ``CLIPProcessor``. The model is
    put in eval mode and every forward pass runs under ``torch.no_grad()``.
    """

    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        """Load the pretrained model and processor identified by ``model_name``."""
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()

    @staticmethod
    def _load_image(image):
        """Return a loaded image for ``image``.

        A path (``str`` or any ``os.PathLike``) is opened with PIL; any other
        value is assumed to be an already-loaded image and returned unchanged,
        so callers may pass either a file path or a ``PIL.Image``.
        """
        if isinstance(image, str) or hasattr(image, "__fspath__"):
            with Image.open(image) as opened:
                # convert() reads the pixel data before the file handle is
                # closed, so the returned image no longer depends on the file.
                return opened.convert("RGB")
        return image

    def _embed_texts(self, texts):
        """Return a 2-D numpy array with one embedding row per text."""
        inputs = self.processor(
            text=list(texts),
            return_tensors="pt",
            padding=True,
            # Without truncation, text longer than the tokenizer's maximum
            # length makes the text model fail.
            truncation=True,
        )
        with torch.no_grad():
            features = self.model.get_text_features(**inputs)
        return features.cpu().numpy()

    def _embed_images(self, images):
        """Return a 2-D numpy array with one embedding row per image."""
        loaded = [self._load_image(image) for image in images]
        inputs = self.processor(images=loaded, return_tensors="pt")
        with torch.no_grad():
            features = self.model.get_image_features(**inputs)
        return features.cpu().numpy()

    def encode_text(self, text):
        """Encode a single string and return its embedding as a 1-D array."""
        return self._embed_texts([text])[0]

    def encode_image(self, image_path):
        """Encode a single image and return its embedding as a 1-D array.

        ``image_path`` may be a file path or an already-loaded PIL image.
        """
        return self._embed_images([image_path])[0]

    def encode_batch(self, texts=None, images=None):
        """Encode several texts and/or images in one forward pass per modality.

        Args:
            texts: Optional list of strings.
            images: Optional list of file paths or loaded PIL images.

        Returns:
            The text embeddings when only ``texts`` is given, the image
            embeddings when only ``images`` is given, or a
            ``(text_embeddings, image_embeddings)`` tuple when both are given.

        Raises:
            ValueError: If neither ``texts`` nor ``images`` contains any item.
        """
        if not texts and not images:
            raise ValueError("encode_batch requires at least one text or image")

        # Each modality is processed on its own so that the text tower never
        # receives pixel inputs and the vision tower never receives token ids.
        text_features = self._embed_texts(texts) if texts else None
        image_features = self._embed_images(images) if images else None

        if text_features is not None and image_features is not None:
            return text_features, image_features
        return text_features if text_features is not None else image_features

# Usage example: embed one sentence and one image file with the same encoder.
encoder = CLIPEncoder()

text_embedding = encoder.encode_text("一只猫坐在沙发上")
image_embedding = encoder.encode_image("cat.jpg")

# Compute the cosine similarity between the text and image embeddings.
# cosine_similarity takes 2-D inputs, hence the single-element lists.
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity([text_embedding], [image_embedding])[0][0]
print(f"Text-Image Similarity: {similarity}")

多模态文档处理

处理PDF中的文本和图像内容：


import fitz  # PyMuPDF
from PIL import Image
import io
import numpy as np

class MultimodalDocumentProcessor:
    """Extract text and embedded images from PDFs and embed each piece.

    Every extracted piece is appended to ``self.chunks`` as a dict with the
    keys ``type``, ``content``, ``embedding``, ``page`` and ``metadata``
    (image chunks additionally carry ``image_data``).
    """

    def __init__(self, image_encoder, text_encoder):
        """Store the encoders and start with an empty chunk list.

        Args:
            image_encoder: Object exposing ``encode_image(image)``.
            text_encoder: Object exposing ``encode_text(text)`` (preferred) or
                ``encode(text)``.
        """
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.chunks = []

    def _encode_text(self, text):
        """Embed ``text`` with whichever text method the encoder provides."""
        # The other components in this file call ``encode_text``; ``encode``
        # is kept as a fallback for encoders that only expose that name.
        encode = getattr(self.text_encoder, "encode_text", None)
        if encode is None:
            encode = self.text_encoder.encode
        return encode(text)

    def process_pdf(self, pdf_path):
        """Extract and embed the text and images of every page of a PDF.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            ``self.chunks``, i.e. every chunk collected by this processor so
            far, including chunks from earlier ``process_pdf`` calls.
        """
        doc = fitz.open(pdf_path)
        try:
            for page_num, page in enumerate(doc):
                # Page text becomes a single text chunk.
                text = page.get_text()
                if text.strip():
                    self.chunks.append({
                        'type': 'text',
                        'content': text,
                        'embedding': self._encode_text(text),
                        'page': page_num,
                        'metadata': {'source': pdf_path}
                    })

                # Each embedded image becomes its own image chunk.
                for img_index, img in enumerate(page.get_images(full=True)):
                    xref = img[0]  # first tuple element is the image's xref
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]

                    image = Image.open(io.BytesIO(image_bytes))
                    image_embedding = self.image_encoder.encode_image(image)

                    self.chunks.append({
                        'type': 'image',
                        'content': f"image_{page_num}_{img_index}",
                        'embedding': image_embedding,
                        'page': page_num,
                        'image_data': image_bytes,
                        'metadata': {'source': pdf_path, 'format': base_image['ext']}
                    })
        finally:
            # Close the document even when extraction or encoding fails.
            doc.close()
        return self.chunks

    def chunk_text(self, text, chunk_size=500, overlap=50):
        """Split ``text`` into overlapping chunks of whitespace-separated words.

        Args:
            text: Text to split.
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words shared by consecutive chunks; must be
                smaller than ``chunk_size``.

        Returns:
            A list of chunk strings (empty when ``text`` has no words).

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``overlap`` is
                not smaller than ``chunk_size``.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be positive")
        if overlap >= chunk_size:
            raise ValueError("overlap must be smaller than chunk_size")

        words = text.split()
        step = chunk_size - overlap
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            # Once a chunk reaches the last word, any further window would be
            # fully contained in this one, so stop.
            if start + chunk_size >= len(words):
                break
        return chunks

多模态向量数据库

使用Qdrant存储和检索多模态向量：


from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue

class MultimodalVectorStore:
    """Store and query text/image embeddings in a single Qdrant collection."""

    def __init__(self, url="localhost", port=6333, text_encoder=None,
                 image_encoder=None, collection_name="multimodal_docs",
                 vector_size=512):
        """Connect to Qdrant and make sure the collection exists.

        Args:
            url: Qdrant location passed to ``QdrantClient``.
            port: Qdrant port passed to ``QdrantClient``.
            text_encoder: Optional object with ``encode_text(text)``; only
                needed by ``hybrid_search``.
            image_encoder: Optional object with ``encode_image(image)``; only
                needed by ``hybrid_search``.
            collection_name: Name of the collection to use.
            vector_size: Dimension of the stored vectors; must match the
                encoder output.
        """
        self.client = QdrantClient(url=url, port=port)
        self.text_encoder = text_encoder
        self.image_encoder = image_encoder
        self.collection_name = collection_name
        self.vector_size = vector_size
        self._create_collection()

    @staticmethod
    def _as_list(embedding):
        """Convert an embedding (numpy array or plain sequence) to a list."""
        return embedding.tolist() if hasattr(embedding, "tolist") else list(embedding)

    def _create_collection(self):
        """Create the collection with cosine distance if it does not exist."""
        collections = self.client.get_collections().collections
        if not any(c.name == self.collection_name for c in collections):
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(size=self.vector_size, distance=Distance.COSINE)
            )

    def insert_chunks(self, chunks):
        """Upsert chunk dicts as points.

        Each chunk must provide ``embedding``, ``content``, ``type``, ``page``
        and ``metadata``. Nothing is sent when ``chunks`` is empty.
        """
        import uuid

        points = [
            PointStruct(
                # A fresh UUID per point: positional ids (0..n-1) would make a
                # later call overwrite the points written by an earlier one.
                id=str(uuid.uuid4()),
                vector=self._as_list(chunk['embedding']),
                payload={
                    'content': chunk['content'],
                    'type': chunk['type'],
                    'page': chunk['page'],
                    'metadata': chunk['metadata']
                }
            )
            for chunk in chunks
        ]
        if not points:
            return

        self.client.upsert(
            collection_name=self.collection_name,
            points=points
        )

    def search(self, query_embedding, limit=5, modality_filter=None):
        """Run a vector search, optionally restricted to one modality.

        Args:
            query_embedding: Query vector (numpy array or sequence).
            limit: Maximum number of hits.
            modality_filter: When set, only points whose ``type`` payload
                equals this value are returned.

        Returns:
            A list of dicts with ``content``, ``type``, ``score`` and
            ``metadata``, in the order returned by the client.
        """
        query_filter = None
        if modality_filter:
            query_filter = Filter(
                must=[FieldCondition(key="type", match=MatchValue(value=modality_filter))]
            )

        results = self.client.search(
            collection_name=self.collection_name,
            query_vector=self._as_list(query_embedding),
            limit=limit,
            query_filter=query_filter
        )

        return [
            {
                'content': hit.payload['content'],
                'type': hit.payload['type'],
                'score': hit.score,
                'metadata': hit.payload['metadata']
            }
            for hit in results
        ]

    def hybrid_search(self, query_text, query_image=None, text_weight=0.5,
                      image_weight=0.5, limit=5):
        """Search text chunks by text and image chunks by image, then fuse.

        Scores are multiplied by the modality weight; results sharing the same
        ``content`` have their weighted scores added together.

        Args:
            query_text: Text query; skipped when empty or ``None``.
            query_image: Optional image query (whatever the image encoder
                accepts); skipped when ``None``.
            text_weight: Weight applied to text hit scores.
            image_weight: Weight applied to image hit scores.
            limit: Maximum number of hits per modality and of fused results.

        Returns:
            Up to ``limit`` result dicts, sorted by fused score, best first.

        Raises:
            RuntimeError: If a query is given for a modality whose encoder was
                not configured on this store.
        """
        text_results = []
        image_results = []

        if query_text:
            text_encoder = getattr(self, "text_encoder", None)
            if text_encoder is None:
                raise RuntimeError("hybrid_search needs a text_encoder for text queries")
            text_embedding = text_encoder.encode_text(query_text)
            text_results = self.search(text_embedding, limit=limit, modality_filter='text')

        # Identity check: image objects need not define a meaningful truth value.
        if query_image is not None:
            image_encoder = getattr(self, "image_encoder", None)
            if image_encoder is None:
                raise RuntimeError("hybrid_search needs an image_encoder for image queries")
            image_embedding = image_encoder.encode_image(query_image)
            image_results = self.search(image_embedding, limit=limit, modality_filter='image')

        # Fuse both result lists, keyed by chunk content.
        merged = {}
        for weight, results in ((text_weight, text_results), (image_weight, image_results)):
            for result in results:
                key = result['content']
                if key in merged:
                    merged[key]['score'] += result['score'] * weight
                else:
                    merged[key] = {'result': result, 'score': result['score'] * weight}

        ranked = sorted(merged.values(), key=lambda item: item['score'], reverse=True)
        return [item['result'] for item in ranked[:limit]]

多模态Agent实现

构建能理解文本和图像的AI Agent：


import openai
from typing import List, Dict, Any

class MultimodalRAGAgent:
    """Answer questions with retrieved text/image chunks as LLM context."""

    def __init__(self, vector_store, image_encoder, text_encoder, api_key,
                 model="gpt-4-vision-preview"):
        """Store collaborators and configure the OpenAI API key.

        Args:
            vector_store: Object with ``search(embedding, limit=...,
                modality_filter=...)`` returning result dicts.
            image_encoder: Object with ``encode_image(image)``.
            text_encoder: Object with ``encode_text(text)``.
            api_key: OpenAI API key; also assigned to ``openai.api_key``.
            model: Chat model name used to generate answers.
        """
        self.vector_store = vector_store
        self.image_encoder = image_encoder
        self.text_encoder = text_encoder
        self.api_key = api_key
        self.model = model
        self._client = None
        openai.api_key = api_key

    def query(self, query_text: str, query_image=None, top_k=3):
        """Retrieve context for the query and generate an answer.

        When ``query_image`` is given, retrieval uses the image embedding;
        otherwise it uses the embedding of ``query_text``. The question sent
        to the model is always ``query_text``.

        Returns:
            The generated answer text.
        """
        # Identity check: array-like images raise on a plain truthiness test.
        if query_image is not None:
            query_embedding = self.image_encoder.encode_image(query_image)
        else:
            query_embedding = self.text_encoder.encode_text(query_text)
        results = self.vector_store.search(query_embedding, limit=top_k)

        context = self._build_context(results)
        return self._generate_answer(query_text, context)

    def _build_context(self, results: List[Dict]) -> str:
        """Render search results as a text context.

        Text results contribute their content; image results contribute only a
        placeholder with the image identifier. Other types are ignored.
        """
        context_parts = []
        for result in results:
            if result['type'] == 'text':
                context_parts.append(f"文本片段: {result['content']}")
            elif result['type'] == 'image':
                context_parts.append(f"[图像: {result['content']}]")

        return "\n\n".join(context_parts)

    def _generate_answer(self, query: str, context: str) -> str:
        """Ask the chat model to answer ``query`` given ``context``.

        Works with both the openai>=1.0 client interface and the legacy
        ``openai.ChatCompletion`` interface of earlier releases.
        """
        messages = [
            {
                "role": "system",
                "content": "你是一个智能助手，根据提供的多模态上下文回答用户问题。"
            },
            {
                "role": "user",
                "content": f"问题: {query}\n\n上下文:\n{context}"
            }
        ]
        request = {
            "model": getattr(self, "model", "gpt-4-vision-preview"),
            "messages": messages,
            "max_tokens": 500,
        }

        if hasattr(openai, "OpenAI"):
            # openai>=1.0: ChatCompletion was removed; use a client instance.
            if getattr(self, "_client", None) is None:
                self._client = openai.OpenAI(api_key=getattr(self, "api_key", None))
            response = self._client.chat.completions.create(**request)
        else:
            response = openai.ChatCompletion.create(**request)

        return response.choices[0].message.content

    def visualize_query(self, query_text: str):
        """Search image chunks with a text query and print the hits.

        Returns:
            The list of image result dicts (at most five).
        """
        query_embedding = self.text_encoder.encode_text(query_text)
        results = self.vector_store.search(query_embedding, modality_filter='image', limit=5)

        print(f"Query: {query_text}")
        print(f"Found {len(results)} relevant images:")
        for i, result in enumerate(results, 1):
            print(f"{i}. {result['content']} (score: {result['score']:.4f})")

        return results

实战案例：产品说明书智能问答


class ProductManualQA:
    """Question answering over product manuals using one shared CLIP encoder."""

    def __init__(self, api_key=None):
        """Build the encoder, vector store and agent.

        Args:
            api_key: OpenAI API key. When omitted, the ``OPENAI_API_KEY``
                environment variable is used, falling back to the placeholder
                ``"your-api-key"``.
        """
        import os

        if api_key is None:
            api_key = os.environ.get("OPENAI_API_KEY", "your-api-key")

        self.encoder = CLIPEncoder()
        self.vector_store = MultimodalVectorStore()
        # The store's hybrid_search reads its encoders from these attributes.
        self.vector_store.text_encoder = self.encoder
        self.vector_store.image_encoder = self.encoder
        self.agent = MultimodalRAGAgent(
            self.vector_store,
            self.encoder,
            self.encoder,
            api_key=api_key
        )

    def ingest_manual(self, pdf_path: str):
        """Extract, embed and store the text and images of a PDF manual."""
        processor = MultimodalDocumentProcessor(self.encoder, self.encoder)
        chunks = processor.process_pdf(pdf_path)
        self.vector_store.insert_chunks(chunks)

    def ask(self, question: str, reference_image=None):
        """Answer ``question``, optionally retrieving by ``reference_image``."""
        return self.agent.query(question, reference_image)

    def visualize_component(self, description: str):
        """Find component images matching ``description`` and print them.

        Returns:
            The image result dicts returned by the agent.
        """
        results = self.agent.visualize_query(description)

        # A real application would render the images here; this only prints
        # their identifiers.
        for result in results:
            print(f"Found component image: {result['content']}")

        return results

# Usage example. NOTE: these statements run at import time; constructing the
# QA system builds the CLIP encoder, the vector store and the agent.
qa_system = ProductManualQA()

# Ingest a product manual.
qa_system.ingest_manual("product_manual.pdf")

# Text-only question answering.
answer = qa_system.ask("如何安装设备？")
print(f"Answer: {answer}")

# Question answering combined with an image:
# the user uploads a photo of one part of the device.
# answer = qa_system.ask("这是什么部分？", reference_image="component.jpg")

性能优化

批量编码与缓存


from functools import lru_cache
import hashlib

class CachedEncoder:
    """Wrap an encoder and memoize its single-text embeddings.

    Attributes not defined here (for example ``encode_image``) are forwarded
    to the wrapped encoder, so the wrapper can be used in its place.
    """

    def __init__(self, encoder, max_size=None):
        """Wrap ``encoder``.

        Args:
            encoder: Object with ``encode_text(text)`` and
                ``encode_batch(texts=...)``.
            max_size: Maximum number of cached embeddings. ``None`` (the
                default) leaves the cache unbounded; otherwise the least
                recently used entry is evicted when the limit is exceeded.

        Raises:
            ValueError: If ``max_size`` is given but smaller than 1.
        """
        if max_size is not None and max_size < 1:
            raise ValueError("max_size must be at least 1 or None")
        self.encoder = encoder
        self.max_size = max_size
        self.cache = {}

    def __getattr__(self, name):
        """Forward unknown attributes to the wrapped encoder."""
        # __getattr__ only runs for missing attributes; refusing "encoder"
        # itself prevents infinite recursion before __init__ has run.
        if name == "encoder":
            raise AttributeError(name)
        return getattr(self.encoder, name)

    def encode_text(self, text):
        """Return the embedding of ``text``, computing it at most once.

        The cached object itself is returned, so callers must not mutate it.
        """
        cache_key = hashlib.md5(text.encode()).hexdigest()

        if cache_key in self.cache:
            # Re-insert to mark the entry as most recently used (dicts keep
            # insertion order).
            embedding = self.cache.pop(cache_key)
            self.cache[cache_key] = embedding
            return embedding

        embedding = self.encoder.encode_text(text)
        self.cache[cache_key] = embedding

        max_size = getattr(self, "max_size", None)
        if max_size is not None:
            while len(self.cache) > max_size:
                # The first key is the least recently used one.
                del self.cache[next(iter(self.cache))]
        return embedding

    def encode_batch(self, texts):
        """Encode ``texts`` in one batch via the wrapped encoder (uncached)."""
        return self.encoder.encode_batch(texts=texts)

异步检索


import asyncio
import aiohttp

async def async_search(vector_store, query_embeddings):
    """Run one search per query embedding concurrently.

    If ``vector_store`` provides ``async_search(embedding, session)``, it is
    awaited for every embedding with a shared ``aiohttp`` session. Otherwise
    the store's blocking ``search(embedding)`` is run in the event loop's
    default executor, so stores that only offer a synchronous ``search`` can
    be used as well.

    Args:
        vector_store: Store exposing ``async_search`` or ``search``.
        query_embeddings: Iterable of query embeddings.

    Returns:
        A list with one search result per embedding, in input order.
    """
    native_search = getattr(vector_store, "async_search", None)
    if native_search is not None:
        async with aiohttp.ClientSession() as session:
            tasks = [native_search(embedding, session) for embedding in query_embeddings]
            return await asyncio.gather(*tasks)

    # NOTE(review): this assumes vector_store.search may be called from
    # several worker threads at once — confirm for the client in use.
    loop = asyncio.get_running_loop()
    tasks = [
        loop.run_in_executor(None, vector_store.search, embedding)
        for embedding in query_embeddings
    ]
    return await asyncio.gather(*tasks)

多模态RAG系统通过结合文本和图像的检索能力，提供了更丰富的信息访问方式。从产品说明书智能问答到医疗影像分析，多模态RAG都有广泛的应用前景。关键在于选择合适的编码模型、设计高效的检索策略，并优化系统的整体性能。