GPT-4o Realtime实战:构建毫秒级实时语音AI系统 引言 OpenAI的GPT-4o Realtime API开启了实时语音交互的新纪元,将端到端延迟降低到毫秒级。本文将深入讲解如何构建一个实时语音AI系统,涵盖流式ASR、LLM处理、流式TTS以及延迟优化技巧。 一、实时语音交互技术栈 1.1 端到端架构 1.2 延迟目标 组件 | 延迟目标 | 说明 ASR | <200ms | 语音转文本 LLM | <300ms | 文本生成 TTS | <200ms | 文本转语音 总计 | <700ms | 端到端 二、流式ASR(自动语音识别) 2.1 Whisper Streaming 2.2 实时语音活动检测(VAD) 三、GPT-4o Realtime API使用 3.
OpenAI的GPT-4o Realtime API开启了实时语音交互的新纪元,将端到端延迟降低到毫秒级。本文将深入讲解如何构建一个实时语音AI系统,涵盖流式ASR、LLM处理、流式TTS以及延迟优化技巧。
用户语音 (Audio) ↓ 流式ASR (Whisper Streaming) ↓ 文本 (Text) ↓ LLM (GPT-4o Realtime) ↓ 文本 (Text) ↓ 流式TTS (VITS Streaming) ↓ 合成语音 (Audio)
| 组件 | 延迟目标 | 说明 |
|---|---|---|
| ASR | <200ms | 语音转文本 |
| LLM | <300ms | 文本生成 |
| TTS | <200ms | 文本转语音 |
| 总计 | <700ms | 端到端 |
import queue

import numpy as np
import whisper

# Whisper model shared by every call; loaded once when the module is imported.
model = whisper.load_model("base")

# Buffer of decoded audio chunks that are waiting to be transcribed.
audio_queue = queue.Queue()


def process_audio_stream(audio_chunk):
    """Buffer one raw audio chunk and transcribe once enough have piled up.

    Args:
        audio_chunk: Bytes-like buffer that is reinterpreted as float32
            samples.

    Returns:
        The "text" field of the Whisper result once at least 10 chunks are
        queued, otherwise None.
    """
    audio_queue.put(np.frombuffer(audio_chunk, dtype=np.float32))

    # Not enough audio buffered yet; the caller keeps feeding chunks.
    if audio_queue.qsize() < 10:
        return None

    # Drain everything currently queued and transcribe it in one pass.
    pending = []
    while not audio_queue.empty():
        pending.append(audio_queue.get())
    return model.transcribe(np.concatenate(pending))["text"]
from silero_vad_utils import get_speech_timestamps


def detect_voice_activity(audio_chunk):
    """Report whether the given audio chunk contains any speech.

    Returns:
        True when get_speech_timestamps returns at least one entry for the
        chunk, False otherwise.
    """
    speech_segments = get_speech_timestamps(audio_chunk)
    has_speech = len(speech_segments) != 0
    return has_speech
import asyncio
import json

import websockets


async def gpt4o_realtime(prompt: str):
    """Send a prompt over the realtime WebSocket and yield streamed text.

    This is an async generator: each "text" frame received from the server
    yields its "content" value, and a "done" frame ends the stream.

    Args:
        prompt: Text sent as the "content" of a single "message" frame.

    Yields:
        The "content" of every frame whose "type" is "text".

    Raises:
        websockets.exceptions.ConnectionClosed: If the connection closes
            before a "done" frame arrives (propagated from ``ws.recv()``).
    """
    # NOTE(review): no auth headers or model parameter are sent, and the
    # "message"/"text"/"done" frame schema looks simplified -- confirm both
    # against the current Realtime API reference before production use.
    uri = "wss://api.openai.com/v1/realtime"
    async with websockets.connect(uri) as ws:
        await ws.send(json.dumps({
            "type": "message",
            "content": prompt,
        }))

        # Consume frames until the server signals completion.
        while True:
            data = json.loads(await ws.recv())
            # .get() so frames without a "type" are skipped instead of
            # raising KeyError mid-stream.
            frame_type = data.get("type")
            if frame_type == "text":
                yield data["content"]
            elif frame_type == "done":
                break
import asyncio
import json

import websockets


async def realtime_conversation():
    """Run an endless voice conversation loop over the realtime WebSocket.

    Each turn captures user audio, runs ASR on it, sends the recognized text
    to the model and speaks every streamed text fragment as it arrives.
    The loop never returns on its own; it ends when an awaited call raises
    (for example when the connection closes).
    """
    async with websockets.connect("wss://api.openai.com/v1/realtime") as ws:
        # Session initialisation.
        await ws.send(json.dumps({"type": "init", "config": {"mode": "voice"}}))

        loop = asyncio.get_running_loop()

        while True:
            # 1. Capture the next chunk of user audio.
            audio_chunk = await get_user_audio()

            # 2. ASR. process_audio_stream is a plain (blocking) function
            #    that returns str or None, so it cannot be awaited directly;
            #    run it in the default executor to keep the loop responsive.
            text = await loop.run_in_executor(
                None, process_audio_stream, audio_chunk
            )
            if not text:
                # Not enough audio buffered for a transcript yet.
                continue

            # 3. Forward the recognized text to the model.
            await ws.send(json.dumps({
                "type": "message",
                "content": text,
            }))

            # 4. Speak each streamed fragment until the turn is done.
            while True:
                data = json.loads(await ws.recv())
                frame_type = data.get("type")
                if frame_type == "text":
                    await stream_tts(data["content"])
                elif frame_type == "done":
                    break
from TTS.bin import synthesize
import sounddevice as sd


async def stream_tts(text: str):
    """Synthesize and play the text in consecutive 50-character pieces.

    Args:
        text: Text to speak. An empty string plays nothing.
    """
    # NOTE(review): sd.wait() blocks until playback finishes, so this
    # coroutine holds the event loop while audio plays -- confirm that is
    # acceptable. Also verify that `synthesize` as imported above is
    # callable with a single text argument.
    chars_per_piece = 50  # characters synthesized per playback step
    offset = 0
    while offset < len(text):
        piece = text[offset:offset + chars_per_piece]
        offset += chars_per_piece

        # Synthesize this piece, then play it to completion before the next.
        sd.play(synthesize(piece), samplerate=22050)
        sd.wait()
import wave

import sounddevice as sd
from TTS.api import TTS

tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")

# WAV sample width in bytes -> sounddevice raw-stream dtype.
_WAV_SAMPLE_DTYPES = {1: "uint8", 2: "int16", 3: "int24", 4: "int32"}


async def stream_tts_fast(text: str):
    """Synthesize text to "output.wav" and play it back in small blocks.

    The file is parsed with the ``wave`` module so that only PCM frames
    (not the RIFF header) reach the audio device, and the sample rate,
    channel count and sample width are taken from the file instead of
    being hard-coded.

    Args:
        text: Text to synthesize.

    Raises:
        ValueError: If the generated WAV uses an unsupported sample width.
    """
    # NOTE(review): XTTS models normally also need a reference speaker and a
    # language argument -- confirm this call works with the installed TTS
    # version.
    tts.tts_to_file(text=text, file_path="output.wav")

    with wave.open("output.wav", "rb") as wav:
        sample_width = wav.getsampwidth()
        try:
            dtype = _WAV_SAMPLE_DTYPES[sample_width]
        except KeyError:
            raise ValueError(
                f"unsupported WAV sample width: {sample_width} bytes"
            ) from None

        # One continuous output stream avoids the gaps and clicks caused by
        # starting a fresh playback for every block.
        # NOTE(review): stream.write() blocks while the device buffer is
        # full, so this coroutine holds the event loop during playback.
        with sd.RawOutputStream(
            samplerate=wav.getframerate(),
            channels=wav.getnchannels(),
            dtype=dtype,
        ) as stream:
            for frames in iter(lambda: wav.readframes(1024), b""):
                stream.write(frames)
import asyncio

from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import StreamingResponse

# NOTE(review): if the application object is created elsewhere, import it
# instead of constructing a second one here.
app = FastAPI()


@app.websocket("/realtime")
async def realtime_endpoint(websocket: WebSocket):
    """Serve one realtime voice session over a WebSocket.

    For every binary audio message received, runs ASR -> LLM -> TTS in
    sequence and sends the synthesized audio back as a binary message.
    The loop ends when the client disconnects.

    Args:
        websocket: The incoming WebSocket connection.
    """
    await websocket.accept()

    try:
        while True:
            # Receive one audio message from the client.
            audio_data = await websocket.receive_bytes()

            # ASR -> LLM -> TTS, strictly in order for this message.
            text = await process_audio(audio_data)
            response = await llm_generate(text)
            audio_output = await synthesize(response)

            # Return the synthesized audio.
            await websocket.send_bytes(audio_output)
    except WebSocketDisconnect:
        # Raised by receive_bytes() once the client goes away.
        print("Client disconnected")
import asyncio

import websockets


async def realtime_client():
    """Stream microphone audio to the server and play back its replies.

    Opens a WebSocket to the local realtime endpoint and runs a sender and a
    receiver concurrently until both finish.
    """
    uri = "ws://localhost:8000/realtime"

    async with websockets.connect(uri) as ws:

        async def audio_sender():
            """Forward microphone chunks to the server as binary frames."""
            microphone = AudioRecorder()
            chunks = iter(microphone.stream())
            loop = asyncio.get_running_loop()
            exhausted = object()  # sentinel marking the end of the stream

            while True:
                # microphone.stream() is iterated synchronously; pulling each
                # chunk in the default executor keeps the event loop free so
                # audio_receiver can run while we wait for audio.
                chunk = await loop.run_in_executor(
                    None, next, chunks, exhausted
                )
                if chunk is exhausted:
                    break
                # websockets connections expose send(); bytes payloads go
                # out as binary frames.
                await ws.send(chunk)

        async def audio_receiver():
            """Play every binary frame the server sends back."""
            async for message in ws:
                if isinstance(message, bytes):
                    play_audio(message)

        # Run both directions concurrently.
        await asyncio.gather(audio_sender(), audio_receiver())
async def parallel_pipeline(audio_stream):
    """Chain ASR, LLM and TTS, yielding audio as soon as text is available.

    Args:
        audio_stream: Input handed to process_asr.

    Yields:
        One synthesized audio chunk per text chunk produced by stream_llm.
    """
    # Stage 1: ASR runs as a task and is awaited before anything else starts.
    transcript = await asyncio.create_task(process_asr(audio_stream))

    # Stage 2: start LLM generation in the background.
    generation = asyncio.create_task(process_llm(transcript))

    # Stage 3: synthesize each text chunk as it streams out of the LLM.
    async for piece in stream_llm(generation):
        yield await synthesize(piece)
class BatchProcessor:
    """Collect items and hand them to process_batch in groups.

    Attributes are set in __init__:
        batch: Items accumulated since the last flush.
        batch_size: Number of items that triggers an automatic flush.
        timeout: timeout_ms converted to seconds. It is stored only; this
            class does not schedule timed flushes itself.
    """

    def __init__(self, batch_size=4, timeout_ms=50):
        self.batch = []
        self.batch_size = batch_size
        self.timeout = timeout_ms / 1000

    async def add(self, item):
        """Queue an item, flushing when the batch is full.

        Returns:
            The result of the flush if this item filled the batch,
            otherwise None.
        """
        self.batch.append(item)
        if len(self.batch) >= self.batch_size:
            return await self.flush()
        return None

    async def flush(self):
        """Process all queued items and return process_batch's result.

        Returns None without calling process_batch when nothing is queued.
        """
        if not self.batch:
            return None

        # Detach the pending items BEFORE awaiting: anything added while
        # process_batch is running must stay queued for the next flush
        # instead of being wiped by a clear() after the await.
        pending = self.batch[:]
        self.batch.clear()
        try:
            return await process_batch(pending)
        except BaseException:
            # Keep the items (in their original order) so a later flush can
            # retry them, as happened when the list was only cleared on
            # success.
            self.batch[:0] = pending
            raise
import functools

# Supported languages (presumably mapped to the Whisper model size to use).
LANGUAGES = {
    "zh": "base",  # Chinese
    "en": "base",  # English
    "es": "base",  # Spanish
    "fr": "base",  # French
    "ja": "base",  # Japanese
}


@functools.lru_cache(maxsize=1)
def _load_language_model():
    """Load the Whisper "base" model once and reuse it for later calls."""
    import whisper

    return whisper.load_model("base")


def detect_language(audio_chunk):
    """Return the language Whisper reports for the given audio.

    The model is loaded on first use and cached, instead of being reloaded
    on every call.

    Args:
        audio_chunk: Audio input passed straight to ``model.transcribe``.

    Returns:
        The "language" field of the transcription result.
    """
    result = _load_language_model().transcribe(audio_chunk)
    return result["language"]
import functools

# Short codes accepted here that XTTS spells differently.
_XTTS_LANGUAGE_ALIASES = {"zh": "zh-cn"}


@functools.lru_cache(maxsize=1)
def _load_xtts_model():
    """Load the XTTS v2 model once and reuse it for later calls."""
    from TTS.api import TTS

    return TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")


def synthesize_multilingual(text: str, language: str = "zh"):
    """Synthesize text in the given language and write it to "output.wav".

    The model is loaded on first use and cached, instead of being reloaded
    on every call.

    Args:
        text: Text to synthesize.
        language: Language code. "zh" is translated to the "zh-cn" code
            that XTTS expects; other codes are passed through unchanged.
    """
    # NOTE(review): XTTS usually also requires a reference speaker
    # (speaker_wav or speaker) -- confirm against the installed TTS version.
    _load_xtts_model().tts_to_file(
        text=text,
        file_path="output.wav",
        language=_XTTS_LANGUAGE_ALIASES.get(language, language),
    )
# Kubernetes Deployment for the realtime voice AI WebSocket server.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: realtime-voice-ai
spec:
  # Three identical pods are kept running.
  replicas: 3
  selector:
    matchLabels:
      app: realtime-voice-ai
  template:
    metadata:
      labels:
        # Must match spec.selector.matchLabels above.
        app: realtime-voice-ai
    spec:
      containers:
        - name: websocket-server
          image: realtime-voice-ai:latest
          ports:
            # Port the WebSocket server listens on inside the container.
            - containerPort: 8000
          resources:
            limits:
              # Each pod is limited to one NVIDIA GPU.
              nvidia.com/gpu: 1
import time

from prometheus_client import Counter, Histogram

# Metric definitions.
request_counter = Counter('realtime_requests_total', 'Total requests')
latency_histogram = Histogram('realtime_latency_seconds', 'Request latency')


async def monitored_process(audio_data):
    """Run process_realtime on the audio and record request metrics.

    The request counter and the latency histogram are updated whether the
    processing succeeds or raises, so failed requests are still counted.

    Args:
        audio_data: Input forwarded unchanged to process_realtime.

    Returns:
        Whatever process_realtime returns.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    try:
        return await process_realtime(audio_data)
    finally:
        request_counter.inc()
        latency_histogram.observe(time.perf_counter() - start_time)
构建毫秒级实时语音AI系统需要精心设计整个流水线,从ASR、LLM到TTS,每个环节都需要优化。通过使用流式处理、并行化、批处理等技术,并结合WebSocket实时通信,可以实现真正的实时语音交互体验。GPT-4o Realtime API为这一目标提供了强大支持,让开发者能够快速构建高质量的实时语音应用。
"