# DroneMind/voicellmcloud/examples/cloud_voice_client.py
# Snapshot: 2026-04-14 10:08:41 +08:00 (320 lines, 9.8 KiB, Python)
#
# NOTE: The repository web viewer flagged this file as containing ambiguous
# Unicode characters, i.e. characters that might be confused with other
# characters. If that is not intentional, review the string literals below.

"""
香橙派客户端 WebSocket 适配器
用于替换现有的本地 LLM + TTS 调用
"""
from __future__ import annotations
import asyncio
import json
import time
from typing import Optional, Callable
import websockets
import numpy as np
import sounddevice as sd
from loguru import logger
class CloudVoiceClient:
    """Cloud voice client: the Orange Pi side WebSocket adapter.

    The client opens one WebSocket session, sends recognised text upstream as
    ``turn.text`` messages, and collects the ``dialog_result`` JSON plus the
    binary audio frames the server sends back for that turn.

    Usage::

        client = CloudVoiceClient(
            server_url="ws://192.168.1.100:8765/v1/voice/session",
            auth_token="your-token",
            device_id="drone-001",
        )
        async with client:
            result = await client.send_text("起飞然后在前方十米悬停")
            print(result.routing)  # "flight_intent"
            print(result.flight_intent["summary"])
            # TTS audio: result.audio_data (int16 numpy array, or None)

    ``connect()`` is a coroutine, so it cannot be used directly in an
    ``async with`` statement; enter the client object itself instead, or call
    ``await client.connect()`` / ``await client.disconnect()`` explicitly.
    """

    def __init__(
        self,
        server_url: str,
        auth_token: str,
        device_id: str,
        sample_rate: int = 24000,
    ):
        """Store the connection settings; no network I/O happens here.

        Args:
            server_url: WebSocket URL of the voice session endpoint.
            auth_token: Token sent in the ``session.start`` message.
            device_id: Device identifier sent in the ``session.start`` message.
            sample_rate: Sample rate in Hz, advertised to the server as the
                playback rate and used when playing audio locally.
        """
        self.server_url = server_url
        self.auth_token = auth_token
        self.device_id = device_id
        self.sample_rate = sample_rate
        self._ws = None
        self._session_id = None
        self._connected = False
        # Per-client counter that keeps turn ids unique within one second.
        self._turn_seq = 0

    def _next_turn_id(self) -> str:
        """Return a turn id that is unique for the lifetime of this client."""
        self._turn_seq += 1
        return f"turn-{int(time.time())}-{self._turn_seq}"

    @staticmethod
    def _decode_pcm(chunks) -> np.ndarray:
        """Join raw audio frames and view them as little-endian int16 samples.

        A trailing odd byte is dropped, because ``np.frombuffer`` rejects a
        buffer whose size is not a multiple of the 2-byte sample size.
        """
        pcm = b"".join(chunks)
        if len(pcm) % 2:
            pcm = pcm[:-1]
        # "<i2" matches the "pcm_s16le" codec requested in session.start.
        return np.frombuffer(pcm, dtype="<i2")

    async def connect(self) -> "CloudVoiceClient":
        """Open the WebSocket and perform the session.start / session.ready handshake.

        Returns:
            This client, so it can be used as the ``async with`` target.

        Raises:
            Exception: Whatever the transport or the handshake raised. The
                socket is closed and the client is left disconnected.
        """
        try:
            self._ws = await websockets.connect(self.server_url)
            self._session_id = f"session-{int(time.time())}"
            logger.info(f"WebSocket 连接成功: {self.server_url}")
            await self._send_session_start()
            await self._receive_session_ready()
        except Exception as e:
            logger.error(f"连接失败: {e}")
            # Do not leak a half-open socket when the handshake fails.
            ws, self._ws = self._ws, None
            self._connected = False
            if ws is not None:
                try:
                    await ws.close()
                except Exception as close_err:
                    logger.debug(f"关闭失败的连接时出错(已忽略): {close_err}")
            raise
        # Only report "connected" once the server has confirmed the session.
        self._connected = True
        return self

    async def disconnect(self):
        """Send ``session.end`` (best effort) and close the WebSocket.

        Safe to call when the client was never connected or is already
        closed. Send and close failures are logged and ignored.
        """
        ws, self._ws = self._ws, None
        if ws is not None:
            try:
                await ws.send(json.dumps({
                    "type": "session.end",
                    "proto_version": "1.0",
                    "session_id": self._session_id,
                }))
            except Exception as e:
                # The peer may already be gone; closing is still attempted.
                logger.debug(f"发送 session.end 失败(已忽略): {e}")
            try:
                await ws.close()
            except Exception as e:
                logger.debug(f"关闭 WebSocket 失败(已忽略): {e}")
        self._connected = False
        logger.info("WebSocket 连接已关闭")

    async def __aenter__(self):
        """Connect on entering ``async with client:`` and return the client."""
        return await self.connect()

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Disconnect on leaving the ``async with`` block; exceptions propagate."""
        await self.disconnect()

    async def _send_session_start(self):
        """Send the ``session.start`` message describing this client."""
        msg = {
            "type": "session.start",
            "proto_version": "1.0",
            "transport_profile": "text_uplink",
            "session_id": self._session_id,
            "auth_token": self.auth_token,
            "client": {
                "device_id": self.device_id,
                "locale": "zh-CN",
                "capabilities": {
                    "playback_sample_rate_hz": self.sample_rate,
                    "prefer_tts_codec": "pcm_s16le",
                },
                "protocol": {"dialog_result": "cloud_voice_dialog_v1"},
            },
        }
        await self._ws.send(json.dumps(msg, ensure_ascii=False))
        logger.debug("→ session.start")

    async def _receive_session_ready(self):
        """Wait for the next message and require it to be ``session.ready``.

        Raises:
            Exception: If the first message has any other ``type``.
        """
        msg = await self._ws.recv()
        data = json.loads(msg)
        if data.get("type") != "session.ready":
            raise Exception(f"期望 session.ready收到: {data}")
        logger.info("← session.ready - 服务端就绪")

    async def send_text(self, text: str) -> "CloudVoiceResult":
        """Send one text turn and collect the server's response.

        Binary messages are accumulated as audio. JSON messages are handled
        by ``type``: ``dialog_result`` fills in the routing and reply,
        ``turn.complete`` ends the turn, and ``error`` aborts it. Other
        message types are ignored.

        Args:
            text: The user's input text.

        Returns:
            A CloudVoiceResult. ``audio_data`` stays ``None`` when no audio
            frames were received.

        Raises:
            RuntimeError: If the client is not connected.
            Exception: If the server sends an ``error`` message.
        """
        if not self._connected:
            raise RuntimeError("未连接,请先调用 connect()")
        turn_id = self._next_turn_id()
        await self._ws.send(json.dumps({
            "type": "turn.text",
            "proto_version": "1.0",
            "transport_profile": "text_uplink",
            "turn_id": turn_id,
            "text": text,
            "is_final": True,
            "source": "device_stt",
        }, ensure_ascii=False))
        logger.debug(f"→ turn.text: {text}")

        result = CloudVoiceResult(turn_id=turn_id)
        audio_chunks = []
        while True:
            msg = await self._ws.recv()
            if isinstance(msg, bytes):
                # Binary frames carry the audio payload.
                audio_chunks.append(msg)
                continue
            data = json.loads(msg)
            msg_type = data.get("type")
            if msg_type == "dialog_result":
                result.parse_dialog_result(data)
            elif msg_type == "turn.complete":
                # "or {}" also covers an explicit null, so the log line
                # below can always call .get().
                result.metrics = data.get("metrics") or {}
                if audio_chunks:
                    result.audio_data = self._decode_pcm(audio_chunks)
                logger.info(
                    f"回合完成: LLM={result.metrics.get('llm_ms')}ms, "
                    f"TTS={result.metrics.get('tts_first_byte_ms')}ms"
                )
                break
            elif msg_type == "error":
                raise Exception(
                    f"服务端错误: {data.get('code')} - {data.get('message')}"
                )
        return result

    async def play_audio(self, audio_data: Optional[np.ndarray]):
        """Play int16 audio through sounddevice without blocking the event loop.

        Args:
            audio_data: int16 numpy array. ``None`` or an empty array is
                logged and skipped.

        Raises:
            Exception: Whatever the conversion or playback raised, after
                logging it.
        """
        if audio_data is None or len(audio_data) == 0:
            logger.warning("没有可播放的音频数据")
            return
        try:
            audio_float = audio_data.astype(np.float32) / 32768.0
            # Blocking playback runs in a worker thread so the event loop
            # stays free for other tasks while the audio is playing.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                None,
                lambda: sd.play(
                    audio_float,
                    samplerate=self.sample_rate,
                    blocking=True,
                ),
            )
            logger.info(f"音频播放完成: {len(audio_data)} samples")
        except Exception as e:
            logger.error(f"音频播放失败: {e}")
            raise
class CloudVoiceResult:
    """Result of one cloud voice turn: routing, reply payload, audio and metrics."""

    def __init__(self, turn_id: str):
        """Create an empty result for the turn identified by ``turn_id``."""
        self.turn_id = turn_id
        # Intent-recognition outcome.
        self.routing: Optional[str] = None  # "flight_intent" or "chitchat"
        self.flight_intent: Optional[dict] = None
        self.chat_reply: Optional[str] = None
        # TTS audio.
        self.audio_data: Optional[np.ndarray] = None  # int16 numpy array
        # Performance metrics.
        self.metrics: dict = {}
        # Raw dialog_result message, kept for inspection.
        self.raw_data: Optional[dict] = None

    def parse_dialog_result(self, data: dict):
        """Populate routing and the matching payload from a ``dialog_result`` message.

        Both payload fields are cleared first, so a message with an unknown
        routing, or a second parse on the same object, never leaves stale
        data from an earlier message behind.
        """
        self.raw_data = data
        self.routing = data.get("routing")
        self.flight_intent = None
        self.chat_reply = None
        if self.routing == "flight_intent":
            self.flight_intent = data.get("flight_intent")
        elif self.routing == "chitchat":
            self.chat_reply = data.get("chat_reply")

    def is_flight_intent(self) -> bool:
        """Return True when the turn was routed as a flight-control intent."""
        return self.routing == "flight_intent"

    def get_tts_text(self) -> str:
        """Return the text to speak for this turn.

        Falls back to the acknowledgement "收到" when no usable text is
        available, including a flight intent whose summary is missing, null
        or empty.
        """
        if self.is_flight_intent() and self.flight_intent:
            return self.flight_intent.get("summary") or "收到"
        if self.routing == "chitchat" and self.chat_reply:
            return self.chat_reply
        return "收到"

    def get_flight_actions(self) -> list:
        """Return the flight-control action list, or ``[]`` when there is none."""
        if self.is_flight_intent() and self.flight_intent:
            # "or []" also covers an explicit null "actions" value.
            return self.flight_intent.get("actions") or []
        return []

    def __repr__(self):
        """Return a short summary with the routing and the audio sample count."""
        return (
            f"CloudVoiceResult(routing={self.routing}, "
            f"audio={len(self.audio_data) if self.audio_data is not None else 0} samples)"
        )
# ==================== 使用示例 ====================
async def example_usage():
    """Demo: run one chit-chat turn and one flight-command turn against the server.

    Raises:
        Exception: Any connection, protocol or playback error, after logging it.
    """
    # NOTE(review): the URL and token below are example values; do not keep a
    # real token in source control.
    client = CloudVoiceClient(
        server_url="ws://192.168.1.100:8765/v1/voice/session",
        auth_token="drone-voice-cloud-token-2024",
        device_id="drone-001",
    )
    try:
        # connect() is a coroutine and cannot be an ``async with`` target;
        # entering the client itself connects and later disconnects.
        async with client:
            # Chit-chat turn.
            print("\n=== 闲聊测试 ===")
            result = await client.send_text("你好,今天天气怎么样?")
            print(f"路由: {result.routing}")
            print(f"回复: {result.chat_reply}")
            # audio_data stays None when the server sent no audio frames.
            if result.audio_data is not None:
                await client.play_audio(result.audio_data)

            # Flight-command turn.
            print("\n=== 飞控指令测试 ===")
            result = await client.send_text("起飞然后在前方十米悬停")
            print(f"路由: {result.routing}")
            # flight_intent is None unless the turn was routed as a flight
            # intent, so it must not be subscripted directly.
            flight_intent = result.flight_intent or {}
            print(f"摘要: {flight_intent.get('summary')}")
            print(f"动作: {result.get_flight_actions()}")
            if result.audio_data is not None:
                await client.play_audio(result.audio_data)
            # The flight actions could be forwarded to the flight controller
            # here, one action at a time.
    except Exception as e:
        logger.error(f"测试失败: {e}")
        raise
if __name__ == "__main__":
    # Script entry point: run the demo coroutine to completion.
    demo = example_usage()
    asyncio.run(demo)