DroneMind/voice_drone/core/cloud_voice_client.py

"""
云端语音 WebSocket 客户端：会话 `session.start.transport_profile` 固定为 pcm_asr_uplink。

- 主路径：`turn.audio.start` → 若干 `turn.audio.chunk`（每条仅文本 JSON，含 `pcm_base64`）→ `turn.audio.end`；**禁止**用 WebSocket binary 上发 PCM（与 Starlette receive 语义一致）。
- 辅助：`run_turn` 发 `turn.text`（如同句快路径仅有文本）；`run_tts_synthesize` 仅 TTS。
- `asr.partial` 仅调试展示，不参与机端状态机。

文档：`docs/CLOUD_VOICE_SESSION_SCHEME_v1.md`，`docs/CLOUD_VOICE_PROTOCOL_pcm_asr_uplink_v1.md`。
"""

from __future__ import annotations

import base64
import json
import os
import threading
import time
import uuid
from typing import Any

import numpy as np

from voice_drone.core.cloud_dialog_v1 import CLOUD_VOICE_DIALOG_V1
from voice_drone.logging_ import get_logger

logger = get_logger("voice_drone.cloud_voice")

_CLOUD_PROTO = "1.0"
TRANSPORT_PCM_ASR_UPLINK = "pcm_asr_uplink"


def _merge_session_client(
    device_id: str,
    *,
    session_client_extensions: dict[str, Any] | None,
) -> dict[str, Any]:
    """Build the ``client`` object carried by ``session.start``.

    The result starts from the device id, a fixed ``zh-CN`` locale and the
    playback capabilities, then receives the caller-supplied extension keys.
    Extensions can never override ``device_id``, ``locale``, ``capabilities``
    or ``protocol``; ``None`` values and an empty ``extras`` dict are dropped.
    The ``protocol`` entry is always written last.
    """
    reserved_keys = ("device_id", "locale", "capabilities", "protocol")

    def _is_forwardable(key: Any, value: Any) -> bool:
        if value is None or key in reserved_keys:
            return False
        # An empty ``extras`` dict carries no information, so it is not sent.
        return not (key == "extras" and isinstance(value, dict) and len(value) == 0)

    capabilities = {
        "playback_sample_rate_hz": 24000,
        "prefer_tts_codec": "pcm_s16le",
    }
    client: dict[str, Any] = {
        "device_id": device_id,
        "locale": "zh-CN",
        "capabilities": capabilities,
    }
    extensions = session_client_extensions or {}
    client.update(
        {
            key: value
            for key, value in extensions.items()
            if _is_forwardable(key, value)
        }
    )
    client["protocol"] = {"dialog_result": CLOUD_VOICE_DIALOG_V1}
    return client


def _transient_ws_exc(exc: BaseException) -> bool:
    """Return True when *exc* can be recovered by reconnecting and resending the turn.

    Covers a peer that already closed the socket, network jitter and timeouts.

    Args:
        exc: The exception raised by a WebSocket send/recv call.

    Returns:
        True for broken-pipe / reset / aborted connection errors, for an
        ``OSError`` whose errno is EPIPE, ECONNRESET or ETIMEDOUT, and for
        websocket-client's connection-closed and timeout exceptions;
        False for everything else.
    """
    import errno  # noqa: PLC0415

    if isinstance(
        exc,
        (BrokenPipeError, ConnectionResetError, ConnectionAbortedError),
    ):
        return True
    # Use the platform's symbolic errno values: the numeric literals
    # 32/104/110 only mean EPIPE/ECONNRESET/ETIMEDOUT on Linux.
    if isinstance(exc, OSError) and exc.errno in (
        errno.EPIPE,
        errno.ECONNRESET,
        errno.ETIMEDOUT,
    ):
        return True

    # Imported lazily, and only after the builtin checks, so that plain
    # socket errors are classified without touching websocket-client.
    import websocket as _websocket  # noqa: PLC0415

    return isinstance(
        exc,
        (
            _websocket.WebSocketConnectionClosedException,
            _websocket.WebSocketTimeoutException,
        ),
    )


def _merge_tts_pcm_chunks(
    chunk_entries: list[tuple[int | None, int, bytes]],
) -> bytes:
    """按 seq 升序拼接；无 seq 时按到达顺序。chunk_entries: (seq|None, arrival, pcm)。"""
    if not chunk_entries:
        return b""
    if all(s is not None for s, _, _ in chunk_entries):
        ordered = sorted(chunk_entries, key=lambda x: (x[0], x[1]))
        seqs = [x[0] for x in ordered]
        for a, b in zip(seqs, seqs[1:]):
            if b != a + 1:
                logger.warning("TTS seq 不连续（仍按序拼接）: %s → %s", a, b)
                break
        return b"".join(x[2] for x in ordered)
    return b"".join(x[2] for x in sorted(chunk_entries, key=lambda x: x[1]))


class CloudVoiceError(RuntimeError):
    """云端返回 error 消息或协议不符合预期。"""

    def __init__(self, message: str, *, code: str | None = None, retryable: bool = False):
        super().__init__(message)
        self.code = code
        self.retryable = retryable


class CloudVoiceClient:
    """连接 ws://…/v1/voice/session；session 为 pcm_asr_uplink，含 run_turn_audio / run_turn / tts.synthesize。"""

    def __init__(
        self,
        *,
        server_url: str,
        auth_token: str,
        device_id: str,
        recv_timeout: float = 120.0,
        session_client_extensions: dict[str, Any] | None = None,
    ) -> None:
        """Store the connection settings; no network I/O happens here.

        Args:
            server_url: WebSocket URL of the voice session endpoint
                (surrounding whitespace is stripped).
            auth_token: Bearer token (surrounding whitespace is stripped).
            device_id: Device identifier; a falsy value falls back to
                ``"drone-001"``.
            recv_timeout: Socket timeout in seconds, coerced to ``float``.
            session_client_extensions: Optional extra keys for the
                ``session.start`` client object; a shallow copy is kept.
        """
        self.server_url = server_url.strip()
        self.auth_token = auth_token.strip()
        effective_device_id = device_id if device_id else "drone-001"
        self.device_id = effective_device_id.strip()
        self.recv_timeout = float(recv_timeout)
        extensions = session_client_extensions or {}
        self._session_client_extensions: dict[str, Any] = dict(extensions)
        self._transport_profile: str = TRANSPORT_PCM_ASR_UPLINK
        # Connection state; the public methods touch it only under ``_lock``.
        self._ws: Any = None
        self._session_id: str | None = None
        self._lock = threading.Lock()

    @property
    def connected(self) -> bool:
        with self._lock:
            return self._ws is not None

    def close(self) -> None:
        with self._lock:
            self._close_nolock()

    def _close_nolock(self) -> None:
        if self._ws is None:
            self._session_id = None
            return
        try:
            if self._session_id:
                try:
                    self._ws.send(
                        json.dumps(
                            {
                                "type": "session.end",
                                "proto_version": _CLOUD_PROTO,
                                "session_id": self._session_id,
                            },
                            ensure_ascii=False,
                        )
                    )
                except Exception:  # noqa: BLE001
                    pass
        finally:
            try:
                self._ws.close()
            except Exception:  # noqa: BLE001
                pass
            self._ws = None
            self._session_id = None

    def connect(self) -> None:
        """Open the WebSocket, send ``session.start`` and wait for ``session.ready``."""
        self._lock.acquire()
        try:
            self._connect_nolock()
        finally:
            self._lock.release()

    def _connect_nolock(self) -> None:
        """(Re)establish the session; the caller is expected to hold ``self._lock``.

        Closes any existing connection, opens a new WebSocket with the bearer
        token, sends ``session.start`` with a fresh session id and waits for
        the first reply, which must be ``session.ready``.  On any failure the
        socket is closed and the state reset before the exception propagates.

        Raises:
            CloudVoiceError: If the first reply is a binary frame, is not a
                JSON object, is a server ``error`` message (its ``code`` and
                ``retryable`` flag are preserved so callers can retry), or
                has any type other than ``session.ready``.
            Exception: Errors raised by websocket-client are re-raised as is.
        """
        import websocket  # websocket-client

        self._close_nolock()
        hdr = [f"Authorization: Bearer {self.auth_token}"]
        try:
            self._ws = websocket.create_connection(
                self.server_url,
                header=hdr,
                timeout=self.recv_timeout,
            )
            self._ws.settimeout(self.recv_timeout)
            self._session_id = str(uuid.uuid4())
            client_payload = _merge_session_client(
                self.device_id,
                session_client_extensions=self._session_client_extensions,
            )
            if self._session_client_extensions:
                logger.info(
                    "session.start 已附加 client 扩展键: %s",
                    sorted(self._session_client_extensions.keys()),
                )
            start = {
                "type": "session.start",
                "proto_version": _CLOUD_PROTO,
                "transport_profile": self._transport_profile,
                "session_id": self._session_id,
                "auth_token": self.auth_token,
                "client": client_payload,
            }
            self._ws.send(json.dumps(start, ensure_ascii=False))
            raw = self._ws.recv()
            if isinstance(raw, bytes):
                raise CloudVoiceError("session.ready 期望 JSON 文本帧，收到二进制")
            # Report malformed replies as protocol errors, the same way the
            # turn receive loops do, instead of leaking a raw decode error.
            try:
                data = json.loads(raw)
            except (TypeError, ValueError) as e:
                head = str(raw)[:200].replace("\n", "\\n")
                raise CloudVoiceError(
                    f"session.ready 不是合法 JSON: {e}; 前 {len(head)} 字符: {head!r}",
                    code="INVALID_MESSAGE",
                ) from e
            if not isinstance(data, dict):
                raise CloudVoiceError(
                    f"session.ready 期望 JSON 对象，实际为 {type(data).__name__}",
                    code="INVALID_MESSAGE",
                )
            if data.get("type") == "error":
                # Keep the server's own code / retryable flag so the retry
                # loops can act on a rejected session.start.
                code = str(data.get("code") or "INTERNAL")
                raise CloudVoiceError(
                    data.get("message") or code,
                    code=code,
                    retryable=bool(data.get("retryable")),
                )
            if data.get("type") != "session.ready":
                raise CloudVoiceError(
                    f"期望 session.ready，收到: {data.get('type')!r}",
                    code="INVALID_MESSAGE",
                )
            logger.info("云端会话已就绪 session_id=%s", self._session_id)
        except Exception:
            self._close_nolock()
            raise

    def ensure_connected(self) -> None:
        with self._lock:
            if self._ws is None:
                self._connect_nolock()

    def _execute_turn_nolock(self, t: str) -> dict[str, Any]:
        """Send one ``turn.text`` and collect every frame belonging to that turn.

        The caller is expected to hold ``self._lock`` with an open connection.
        Frames are read until ``turn.complete``.  Binary frames are collected
        as TTS PCM, ``llm.text_delta`` fragments are accumulated, and
        ``dialog_result`` supplies the fields of the returned dict.

        Args:
            t: User text for this turn (sent with ``is_final`` set).

        Returns:
            A dict with the ``dialog_result`` fields ``protocol``,
            ``routing``, ``flight_intent``, ``confirm``, ``chat_reply`` and
            ``user_input``; the merged TTS audio as an int16 array under
            ``pcm``; its ``sample_rate_hz``; the ``metrics`` of
            ``turn.complete``; and the concatenated ``llm_stream_text``.

        Raises:
            CloudVoiceError: If there is no connection, sending fails with a
                non-transient error, the connection closes while receiving
                (code ``DISCONNECTED``, retryable), a text frame is not a
                JSON object, ``dialog_result`` / ``turn.complete`` carry a
                different ``turn_id``, the server sends ``error``, or no
                ``dialog_result`` arrived before ``turn.complete``.
            Exception: Transient send errors and any other receive error
                propagate unchanged.
        """
        import websocket  # websocket-client

        ws = self._ws
        if ws is None:
            raise CloudVoiceError("WebSocket 未连接")

        turn_id = str(uuid.uuid4())
        turn_msg = {
            "type": "turn.text",
            "proto_version": _CLOUD_PROTO,
            "transport_profile": self._transport_profile,
            "turn_id": turn_id,
            "text": t,
            "is_final": True,
            "source": "device_stt",
        }
        try:
            ws.send(json.dumps(turn_msg, ensure_ascii=False))
        except Exception as e:
            if _transient_ws_exc(e):
                raise
            raise CloudVoiceError(f"发送 turn.text 失败: {e}", code="INTERNAL") from e
        logger.debug("→ turn.text turn_id=%s", turn_id)

        # Read the debug flag once per turn rather than once per delta frame.
        print_stream = os.environ.get("ROCKET_PRINT_LLM_STREAM", "").lower() in (
            "1",
            "true",
            "yes",
        )

        expecting_binary = False
        _pending_tts_seq: int | None = None
        pcm_entries: list[tuple[int | None, int, bytes]] = []
        _pcm_arrival = 0
        llm_stream_parts: list[str] = []
        dialog: dict[str, Any] | None = None
        metrics: dict[str, Any] = {}
        sample_rate_hz = 24000

        while True:
            # Only a closed connection is translated; every other receive
            # error propagates unchanged to the caller.
            try:
                msg = ws.recv()
            except websocket.WebSocketConnectionClosedException as e:
                raise CloudVoiceError(
                    f"连接已断开: {e}",
                    code="DISCONNECTED",
                    retryable=True,
                ) from e

            if isinstance(msg, bytes):
                if expecting_binary:
                    expecting_binary = False
                else:
                    logger.warning("收到未预期的二进制帧，仍作为 TTS 数据处理")
                # The seq comes from the preceding tts_audio_chunk header, if any.
                pcm_entries.append((_pending_tts_seq, _pcm_arrival, msg))
                _pcm_arrival += 1
                _pending_tts_seq = None
                continue

            if not isinstance(msg, str):
                raise CloudVoiceError(
                    f"期望文本帧为 str，实际为 {type(msg).__name__}",
                    code="INVALID_MESSAGE",
                )
            text_frame = msg.strip()
            if not text_frame:
                logger.debug("跳过空 WebSocket 文本帧")
                continue
            try:
                data = json.loads(text_frame)
            except json.JSONDecodeError as e:
                head = text_frame[:200].replace("\n", "\\n")
                raise CloudVoiceError(
                    f"服务端文本帧不是合法 JSON: {e}; 前 {len(head)} 字符: {head!r}",
                    code="INVALID_MESSAGE",
                ) from e
            if not isinstance(data, dict):
                # Valid JSON that is not an object (list, number, ...) would
                # otherwise fail with AttributeError on ``data.get``.
                raise CloudVoiceError(
                    f"服务端文本帧不是 JSON 对象，实际为 {type(data).__name__}",
                    code="INVALID_MESSAGE",
                )
            mtype = data.get("type")

            if mtype == "asr.partial":
                # Display/debug only; it never drives the device state.
                logger.debug("← asr.partial（机端不参与状态跳转）")
                continue

            if mtype == "llm.text_delta":
                if data.get("turn_id") != turn_id:
                    logger.debug(
                        "llm.text_delta turn_id 与当前不一致，忽略 type=%s",
                        mtype,
                    )
                    continue
                raw_d = data.get("delta")
                delta = "" if raw_d is None else str(raw_d)
                llm_stream_parts.append(delta)
                if print_stream:
                    print(delta, end="", flush=True)
                    if data.get("done"):
                        print(flush=True)
                logger.debug(
                    "← llm.text_delta done=%s delta_chars=%s",
                    data.get("done"),
                    len(delta),
                )
                continue

            if mtype == "tts_audio_chunk":
                # JSON header announcing the binary PCM frame that follows.
                _pending_tts_seq = None
                if data.get("turn_id") != turn_id:
                    logger.warning("tts_audio_chunk turn_id 与当前不一致，仍消费后续二进制")
                else:
                    try:
                        sample_rate_hz = int(
                            data.get("sample_rate_hz") or sample_rate_hz
                        )
                    except (TypeError, ValueError):
                        pass
                    _s = data.get("seq")
                    try:
                        if _s is not None:
                            _pending_tts_seq = int(_s)
                    except (TypeError, ValueError):
                        _pending_tts_seq = None
                    if data.get("is_final"):
                        logger.debug("← tts_audio_chunk is_final=true seq=%s", _s)
                expecting_binary = True
                continue

            if mtype == "dialog_result":
                if data.get("turn_id") != turn_id:
                    raise CloudVoiceError(
                        "dialog_result turn_id 不匹配", code="INVALID_MESSAGE"
                    )
                dialog = data
                logger.info(
                    "← dialog_result routing=%s", data.get("routing")
                )
                continue

            if mtype == "turn.complete":
                if data.get("turn_id") != turn_id:
                    raise CloudVoiceError(
                        "turn.complete turn_id 不匹配", code="INVALID_MESSAGE"
                    )
                metrics = data.get("metrics") or {}
                break

            if mtype == "error":
                code = str(data.get("code") or "INTERNAL")
                raise CloudVoiceError(
                    data.get("message") or code,
                    code=code,
                    retryable=bool(data.get("retryable")),
                )

            logger.debug("忽略服务端消息 type=%s", mtype)

        if dialog is None:
            raise CloudVoiceError("未收到 dialog_result", code="INVALID_MESSAGE")

        full_pcm = _merge_tts_pcm_chunks(pcm_entries)
        if len(full_pcm) % 2:
            # np.frombuffer rejects a byte count that is not a multiple of
            # the int16 item size; drop the dangling byte instead of crashing.
            logger.warning(
                "云端 TTS PCM 总长 %s 字节不是 2 的整数倍，丢弃末尾 1 字节",
                len(full_pcm),
            )
            full_pcm = full_pcm[:-1]
        pcm = (
            np.frombuffer(full_pcm, dtype=np.int16).copy()
            if full_pcm
            else np.array([], dtype=np.int16)
        )
        # ``any`` instead of max(abs(...)): abs() of int16 -32768 overflows,
        # which could make a non-silent buffer look all-zero.
        if pcm.size > 0 and not pcm.any():
            logger.warning(
                "云端 TTS 已收齐二进制总长 %s 字节（≈%s 个 s16 采样），但全为 0x00，"
                "属于服务端发出的静音占位或未写入合成结果；机端无法通过重采样/扬声器修复。"
                "请在服务端对同一次 synthesize 写 WAV 核对非零采样，并确认 WS 先发 tts_audio_chunk JSON、"
                "再发 raw PCM 帧、且未把 JSON/base64 误当 binary 发出。",
                len(full_pcm),
                pcm.size,
            )
            if os.environ.get("ROCKET_CLOUD_PCM_HEX", "").strip().lower() in (
                "1",
                "true",
                "yes",
            ):
                head = full_pcm[:64]
                logger.warning(
                    "ROCKET_CLOUD_PCM_HEX: 前 %s 字节 hex=%s",
                    len(head),
                    head.hex(),
                )

        llm_stream_text = "".join(llm_stream_parts)
        return {
            "protocol": dialog.get("protocol"),
            "routing": dialog.get("routing"),
            "flight_intent": dialog.get("flight_intent"),
            "confirm": dialog.get("confirm"),
            "chat_reply": dialog.get("chat_reply"),
            "user_input": dialog.get("user_input"),
            "pcm": pcm,
            "sample_rate_hz": sample_rate_hz,
            "metrics": metrics,
            "llm_stream_text": llm_stream_text,
        }

    def _execute_turn_audio_nolock(
        self, pcm_int16: np.ndarray, sample_rate_hz: int
    ) -> dict[str, Any]:
        """Upload one utterance as base64 text frames and collect the turn's reply.

        Sends ``turn.audio.start``, then one ``turn.audio.chunk`` (JSON text
        with ``pcm_base64``) per slice, then ``turn.audio.end``; PCM is never
        sent as a binary frame.  The reply is read until ``turn.complete``.
        The caller is expected to hold ``self._lock`` with an open connection.

        Args:
            pcm_int16: Audio samples; converted to a flat int16 array.
            sample_rate_hz: Sample rate announced in ``turn.audio.start``.

        Returns:
            A dict with the ``dialog_result`` fields ``protocol``,
            ``routing``, ``flight_intent``, ``confirm``, ``chat_reply`` and
            ``user_input``; the merged TTS audio as an int16 array under
            ``pcm``; its ``sample_rate_hz``; the ``metrics`` of
            ``turn.complete``; and the concatenated ``llm_stream_text``.

        Raises:
            CloudVoiceError: If there is no connection, the PCM is empty,
                sending fails with a non-transient error, the connection
                closes while receiving (code ``DISCONNECTED``, retryable), a
                text frame is not a JSON object, ``dialog_result`` /
                ``turn.complete`` carry a different ``turn_id``, the server
                sends ``error``, or no ``dialog_result`` arrived.
            Exception: Transient send errors and any other receive error
                propagate unchanged.
        """
        import websocket  # websocket-client

        ws = self._ws
        if ws is None:
            raise CloudVoiceError("WebSocket 未连接")

        pcm_int16 = np.asarray(pcm_int16, dtype=np.int16).reshape(-1)
        if pcm_int16.size == 0:
            raise CloudVoiceError("turn.audio PCM 为空")

        # Widen before abs(): abs() of int16 -32768 overflows back to -32768
        # and would under-report the peak of a clipped signal.
        pcm_mx = int(np.max(np.abs(pcm_int16.astype(np.int32))))
        pcm_rms = float(np.sqrt(np.mean(pcm_int16.astype(np.float64) ** 2)))
        dur_sec = float(pcm_int16.size) / max(1, int(sample_rate_hz))
        logger.info(
            "turn.audio 上行: samples=%s sr_hz=%s dur≈%.2fs abs_max=%s rms=%.1f dtype=int16",
            pcm_int16.size,
            int(sample_rate_hz),
            dur_sec,
            pcm_mx,
            pcm_rms,
        )
        if pcm_mx == 0:
            logger.warning(
                "turn.audio 上行波形全零，云端 ASR 通常会判无有效语音（请查麦/切段/VAD 是否误交静音）"
            )
        elif pcm_mx < 200:
            logger.warning(
                "turn.audio 上行幅值极小 abs_max=%s（仍发送）；若云端反复无识别请调 AGC/VAD/麦增益",
                pcm_mx,
            )

        turn_id = str(uuid.uuid4())
        start = {
            "type": "turn.audio.start",
            "proto_version": _CLOUD_PROTO,
            "transport_profile": self._transport_profile,
            "turn_id": turn_id,
            "sample_rate_hz": int(sample_rate_hz),
            "codec": "pcm_s16le",
            "channels": 1,
        }
        raw = pcm_int16.tobytes()

        # Raw bytes per chunk before base64 encoding, clamped to 2 KiB..256 KiB.
        try:
            raw_chunk = int(os.environ.get("ROCKET_CLOUD_AUDIO_CHUNK_BYTES", "8192"))
        except ValueError:
            raw_chunk = 8192
        raw_chunk = max(2048, min(256 * 1024, raw_chunk))

        n_chunks = 0
        try:
            ws.send(json.dumps(start, ensure_ascii=False))
            for i in range(0, len(raw), raw_chunk):
                piece = raw[i : i + raw_chunk]
                chunk_msg = {
                    "type": "turn.audio.chunk",
                    "proto_version": _CLOUD_PROTO,
                    "transport_profile": self._transport_profile,
                    "turn_id": turn_id,
                    "pcm_base64": base64.b64encode(piece).decode("ascii"),
                }
                ws.send(json.dumps(chunk_msg, ensure_ascii=False))
                n_chunks += 1
            end = {
                "type": "turn.audio.end",
                "proto_version": _CLOUD_PROTO,
                "transport_profile": self._transport_profile,
                "turn_id": turn_id,
            }
            ws.send(json.dumps(end, ensure_ascii=False))
        except Exception as e:
            if _transient_ws_exc(e):
                raise
            raise CloudVoiceError(f"发送 turn.audio 失败: {e}", code="INTERNAL") from e
        logger.debug(
            "→ turn.audio start/%s chunk(s)/end turn_id=%s samples=%s",
            n_chunks,
            turn_id,
            pcm_int16.size,
        )

        # Read the debug flag once per turn rather than once per delta frame.
        print_stream = os.environ.get("ROCKET_PRINT_LLM_STREAM", "").lower() in (
            "1",
            "true",
            "yes",
        )

        expecting_binary = False
        _pending_tts_seq: int | None = None
        pcm_entries: list[tuple[int | None, int, bytes]] = []
        _pcm_arrival = 0
        llm_stream_parts: list[str] = []
        dialog: dict[str, Any] | None = None
        metrics: dict[str, Any] = {}
        out_sr = 24000

        while True:
            # Only a closed connection is translated; every other receive
            # error propagates unchanged to the caller.
            try:
                msg = ws.recv()
            except websocket.WebSocketConnectionClosedException as e:
                raise CloudVoiceError(
                    f"连接已断开: {e}",
                    code="DISCONNECTED",
                    retryable=True,
                ) from e

            if isinstance(msg, bytes):
                if expecting_binary:
                    expecting_binary = False
                else:
                    logger.warning("收到未预期的二进制帧，仍作为 TTS 数据处理")
                # The seq comes from the preceding tts_audio_chunk header, if any.
                pcm_entries.append((_pending_tts_seq, _pcm_arrival, msg))
                _pcm_arrival += 1
                _pending_tts_seq = None
                continue

            if not isinstance(msg, str):
                raise CloudVoiceError(
                    f"期望文本帧为 str，实际为 {type(msg).__name__}",
                    code="INVALID_MESSAGE",
                )
            text_frame = msg.strip()
            if not text_frame:
                logger.debug("跳过空 WebSocket 文本帧")
                continue
            try:
                data = json.loads(text_frame)
            except json.JSONDecodeError as e:
                head = text_frame[:200].replace("\n", "\\n")
                raise CloudVoiceError(
                    f"服务端文本帧不是合法 JSON: {e}; 前 {len(head)} 字符: {head!r}",
                    code="INVALID_MESSAGE",
                ) from e
            if not isinstance(data, dict):
                # Valid JSON that is not an object (list, number, ...) would
                # otherwise fail with AttributeError on ``data.get``.
                raise CloudVoiceError(
                    f"服务端文本帧不是 JSON 对象，实际为 {type(data).__name__}",
                    code="INVALID_MESSAGE",
                )
            mtype = data.get("type")

            if mtype == "asr.partial":
                # Display/debug only; it never drives the device state.
                logger.debug("← asr.partial（机端不参与状态跳转）")
                continue

            if mtype == "llm.text_delta":
                if data.get("turn_id") != turn_id:
                    logger.debug(
                        "llm.text_delta turn_id 与当前不一致，忽略 type=%s",
                        mtype,
                    )
                    continue
                raw_d = data.get("delta")
                delta = "" if raw_d is None else str(raw_d)
                llm_stream_parts.append(delta)
                if print_stream:
                    print(delta, end="", flush=True)
                    if data.get("done"):
                        print(flush=True)
                logger.debug(
                    "← llm.text_delta done=%s delta_chars=%s",
                    data.get("done"),
                    len(delta),
                )
                continue

            if mtype == "tts_audio_chunk":
                # JSON header announcing the binary PCM frame that follows.
                _pending_tts_seq = None
                if data.get("turn_id") != turn_id:
                    logger.warning("tts_audio_chunk turn_id 与当前不一致，仍消费后续二进制")
                else:
                    try:
                        out_sr = int(data.get("sample_rate_hz") or out_sr)
                    except (TypeError, ValueError):
                        pass
                    _s = data.get("seq")
                    try:
                        if _s is not None:
                            _pending_tts_seq = int(_s)
                    except (TypeError, ValueError):
                        _pending_tts_seq = None
                    if data.get("is_final"):
                        logger.debug("← tts_audio_chunk is_final=true seq=%s", _s)
                expecting_binary = True
                continue

            if mtype == "dialog_result":
                if data.get("turn_id") != turn_id:
                    raise CloudVoiceError(
                        "dialog_result turn_id 不匹配", code="INVALID_MESSAGE"
                    )
                dialog = data
                logger.info(
                    "← dialog_result routing=%s", data.get("routing")
                )
                continue

            if mtype == "turn.complete":
                if data.get("turn_id") != turn_id:
                    raise CloudVoiceError(
                        "turn.complete turn_id 不匹配", code="INVALID_MESSAGE"
                    )
                metrics = data.get("metrics") or {}
                break

            if mtype == "error":
                code = str(data.get("code") or "INTERNAL")
                raise CloudVoiceError(
                    data.get("message") or code,
                    code=code,
                    retryable=bool(data.get("retryable")),
                )

            logger.debug("忽略服务端消息 type=%s", mtype)

        if dialog is None:
            raise CloudVoiceError("未收到 dialog_result", code="INVALID_MESSAGE")

        full_pcm = _merge_tts_pcm_chunks(pcm_entries)
        if len(full_pcm) % 2:
            # np.frombuffer rejects a byte count that is not a multiple of
            # the int16 item size; drop the dangling byte instead of crashing.
            logger.warning(
                "云端 TTS PCM 总长 %s 字节不是 2 的整数倍，丢弃末尾 1 字节",
                len(full_pcm),
            )
            full_pcm = full_pcm[:-1]
        out_pcm = (
            np.frombuffer(full_pcm, dtype=np.int16).copy()
            if full_pcm
            else np.array([], dtype=np.int16)
        )
        # ``any`` instead of max(abs(...)): abs() of int16 -32768 overflows,
        # which could make a non-silent buffer look all-zero.
        if out_pcm.size > 0 and not out_pcm.any():
            logger.warning(
                "云端 TTS 已收齐但全零采样，请核对服务端。",
            )

        llm_stream_text = "".join(llm_stream_parts)
        return {
            "protocol": dialog.get("protocol"),
            "routing": dialog.get("routing"),
            "flight_intent": dialog.get("flight_intent"),
            "confirm": dialog.get("confirm"),
            "chat_reply": dialog.get("chat_reply"),
            "user_input": dialog.get("user_input"),
            "pcm": out_pcm,
            "sample_rate_hz": out_sr,
            "metrics": metrics,
            "llm_stream_text": llm_stream_text,
        }

    def run_turn_audio(
        self, pcm_int16: np.ndarray, sample_rate_hz: int
    ) -> dict[str, Any]:
        """上行一轮麦克风 PCM：chunk 均为含 pcm_base64 的文本 JSON；收齐 dialog_result + TTS + turn.complete。"""
        try:
            raw_attempts = int(os.environ.get("ROCKET_CLOUD_TURN_RETRIES", "3"))
        except ValueError:
            raw_attempts = 3
        attempts = max(1, raw_attempts)
        try:
            delay = float(os.environ.get("ROCKET_CLOUD_TURN_RETRY_DELAY_SEC", "0.35"))
        except ValueError:
            delay = 0.35
        delay = max(0.0, delay)

        for attempt in range(attempts):
            with self._lock:
                try:
                    if self._ws is None:
                        self._connect_nolock()
                    return self._execute_turn_audio_nolock(pcm_int16, sample_rate_hz)
                except CloudVoiceError as e:
                    retry = bool(e.retryable) or e.code == "DISCONNECTED"
                    if retry and attempt < attempts - 1:
                        logger.warning(
                            "turn.audio 可恢复错误，重连重试 (%s/%s): %s",
                            attempt + 1,
                            attempts,
                            e,
                        )
                        self._close_nolock()
                        if delay:
                            time.sleep(delay)
                        continue
                    raise
                except Exception as e:
                    if _transient_ws_exc(e) and attempt < attempts - 1:
                        logger.warning(
                            "turn.audio WebSocket 瞬断，重连重试 (%s/%s): %s",
                            attempt + 1,
                            attempts,
                            e,
                        )
                        self._close_nolock()
                        if delay:
                            time.sleep(delay)
                        continue
                    raise

        raise CloudVoiceError("run_turn_audio 未执行", code="INTERNAL")

    def _execute_tts_synthesize_nolock(self, text: str) -> dict[str, Any]:
        """Send ``tts.synthesize`` and collect the audio of that request.

        The caller is expected to hold ``self._lock`` with an open connection.
        Frames are read until ``turn.complete``; only ``tts_audio_chunk``
        headers and their binary PCM frames contribute to the result, while
        ``asr.partial``, ``llm.text_delta`` and ``dialog_result`` are ignored.

        Args:
            text: Text to synthesize.

        Returns:
            A dict with the merged audio as an int16 array under ``pcm``,
            its ``sample_rate_hz`` and the ``metrics`` of ``turn.complete``.

        Raises:
            CloudVoiceError: If there is no connection, sending fails with a
                non-transient error, the connection closes while receiving
                (code ``DISCONNECTED``, retryable), a text frame is not a
                JSON object, ``turn.complete`` carries a different
                ``turn_id``, or the server sends ``error``.
            Exception: Transient send errors and any other receive error
                propagate unchanged.
        """
        import websocket  # websocket-client

        ws = self._ws
        if ws is None:
            raise CloudVoiceError("WebSocket 未连接")

        turn_id = str(uuid.uuid4())
        synth_msg = {
            "type": "tts.synthesize",
            "proto_version": _CLOUD_PROTO,
            "transport_profile": self._transport_profile,
            "turn_id": turn_id,
            "text": text,
        }
        try:
            ws.send(json.dumps(synth_msg, ensure_ascii=False))
        except Exception as e:
            if _transient_ws_exc(e):
                raise
            raise CloudVoiceError(f"发送 tts.synthesize 失败: {e}", code="INTERNAL") from e
        logger.debug("→ tts.synthesize turn_id=%s", turn_id)

        expecting_binary = False
        _pending_tts_seq: int | None = None
        pcm_entries: list[tuple[int | None, int, bytes]] = []
        _pcm_arrival = 0
        metrics: dict[str, Any] = {}
        sample_rate_hz = 24000

        while True:
            # Only a closed connection is translated; every other receive
            # error propagates unchanged to the caller.
            try:
                msg = ws.recv()
            except websocket.WebSocketConnectionClosedException as e:
                raise CloudVoiceError(
                    f"连接已断开: {e}",
                    code="DISCONNECTED",
                    retryable=True,
                ) from e

            if isinstance(msg, bytes):
                if expecting_binary:
                    expecting_binary = False
                else:
                    logger.warning("收到未预期的二进制帧，仍作为 TTS 数据处理")
                # The seq comes from the preceding tts_audio_chunk header, if any.
                pcm_entries.append((_pending_tts_seq, _pcm_arrival, msg))
                _pcm_arrival += 1
                _pending_tts_seq = None
                continue

            if not isinstance(msg, str):
                raise CloudVoiceError(
                    f"期望文本帧为 str，实际为 {type(msg).__name__}",
                    code="INVALID_MESSAGE",
                )
            text_frame = msg.strip()
            if not text_frame:
                logger.debug("跳过空 WebSocket 文本帧")
                continue
            try:
                data = json.loads(text_frame)
            except json.JSONDecodeError as e:
                head = text_frame[:200].replace("\n", "\\n")
                raise CloudVoiceError(
                    f"服务端文本帧不是合法 JSON: {e}; 前 {len(head)} 字符: {head!r}",
                    code="INVALID_MESSAGE",
                ) from e
            if not isinstance(data, dict):
                # Valid JSON that is not an object (list, number, ...) would
                # otherwise fail with AttributeError on ``data.get``.
                raise CloudVoiceError(
                    f"服务端文本帧不是 JSON 对象，实际为 {type(data).__name__}",
                    code="INVALID_MESSAGE",
                )
            mtype = data.get("type")

            if mtype == "asr.partial":
                logger.debug("← asr.partial（tts 轮次，忽略）")
                continue

            if mtype == "llm.text_delta":
                # Text deltas are not collected here; a foreign turn_id is
                # only noted in the debug log.
                if data.get("turn_id") != turn_id:
                    logger.debug(
                        "llm.text_delta turn_id 与当前 tts 不一致，忽略",
                    )
                continue

            if mtype == "tts_audio_chunk":
                # JSON header announcing the binary PCM frame that follows.
                _pending_tts_seq = None
                if data.get("turn_id") != turn_id:
                    logger.warning(
                        "tts_audio_chunk turn_id 与 tts.synthesize 不一致，仍消费后续二进制",
                    )
                else:
                    try:
                        sample_rate_hz = int(
                            data.get("sample_rate_hz") or sample_rate_hz
                        )
                    except (TypeError, ValueError):
                        pass
                    _s = data.get("seq")
                    try:
                        if _s is not None:
                            _pending_tts_seq = int(_s)
                    except (TypeError, ValueError):
                        _pending_tts_seq = None
                    if data.get("is_final"):
                        logger.debug("← tts_audio_chunk is_final=true seq=%s", _s)
                expecting_binary = True
                continue

            if mtype == "dialog_result":
                logger.debug("tts.synthesize 收到 dialog_result（非预期），忽略")
                continue

            if mtype == "turn.complete":
                if data.get("turn_id") != turn_id:
                    raise CloudVoiceError(
                        "turn.complete turn_id 不匹配", code="INVALID_MESSAGE"
                    )
                metrics = data.get("metrics") or {}
                break

            if mtype == "error":
                code = str(data.get("code") or "INTERNAL")
                raise CloudVoiceError(
                    data.get("message") or code,
                    code=code,
                    retryable=bool(data.get("retryable")),
                )

            logger.debug("忽略服务端消息 type=%s", mtype)

        full_pcm = _merge_tts_pcm_chunks(pcm_entries)
        if len(full_pcm) % 2:
            # np.frombuffer rejects a byte count that is not a multiple of
            # the int16 item size; drop the dangling byte instead of crashing.
            logger.warning(
                "云端 TTS PCM 总长 %s 字节不是 2 的整数倍，丢弃末尾 1 字节",
                len(full_pcm),
            )
            full_pcm = full_pcm[:-1]
        pcm = (
            np.frombuffer(full_pcm, dtype=np.int16).copy()
            if full_pcm
            else np.array([], dtype=np.int16)
        )
        # ``any`` instead of max(abs(...)): abs() of int16 -32768 overflows,
        # which could make a non-silent buffer look all-zero.
        if pcm.size > 0 and not pcm.any():
            logger.warning(
                "tts.synthesize 收齐 PCM 但全零（服务端静音占位）；总长 %s 字节",
                len(full_pcm),
            )

        return {
            "pcm": pcm,
            "sample_rate_hz": sample_rate_hz,
            "metrics": metrics,
        }

    def run_tts_synthesize(self, text: str) -> dict[str, Any]:
        """
        Send ``tts.synthesize`` and collect the TTS chunks plus ``turn.complete``.

        No ``dialog_result`` is expected for this request. The call shares the
        connection with ``run_turn``; ``self._lock`` is held for the whole of
        each attempt, including the back-off sleep before a retry.

        Retry policy (same knobs as ``run_turn``):

        - ``ROCKET_CLOUD_TURN_RETRIES``: total number of attempts (default 3,
          clamped to at least 1; unparsable values fall back to 3).
        - ``ROCKET_CLOUD_TURN_RETRY_DELAY_SEC``: pause between attempts
          (default 0.35, clamped to >= 0; unparsable values fall back to 0.35).

        An error is treated as recoverable when it is a ``CloudVoiceError``
        flagged ``retryable`` or carrying code ``DISCONNECTED``, or when
        ``_transient_ws_exc`` accepts it. Every recoverable failure closes the
        connection, including the last one, so the next call reconnects
        instead of reusing a dead socket.

        Args:
            text: Text to synthesize; surrounding whitespace is stripped.

        Returns:
            Whatever ``_execute_tts_synthesize_nolock`` returns for the request.

        Raises:
            CloudVoiceError: If ``text`` is empty after stripping, or when the
                request fails with a non-recoverable error or on the last attempt.
            Exception: Any other exception from the connection is re-raised
                when it is not transient or when attempts are exhausted.
        """
        t = (text or "").strip()
        if not t:
            raise CloudVoiceError("tts.synthesize text 不能为空")

        try:
            attempts = max(1, int(os.environ.get("ROCKET_CLOUD_TURN_RETRIES", "3")))
        except ValueError:
            attempts = 3
        try:
            delay = max(
                0.0, float(os.environ.get("ROCKET_CLOUD_TURN_RETRY_DELAY_SEC", "0.35"))
            )
        except ValueError:
            delay = 0.35

        last_attempt = attempts - 1
        for attempt in range(attempts):
            with self._lock:
                try:
                    if self._ws is None:
                        self._connect_nolock()
                    return self._execute_tts_synthesize_nolock(t)
                except CloudVoiceError as e:
                    if not (e.retryable or e.code == "DISCONNECTED"):
                        raise
                    if attempt >= last_attempt:
                        # Out of attempts: still drop the broken connection so
                        # the next call starts with a fresh session.
                        self._close_nolock()
                        raise
                    logger.warning(
                        "tts.synthesize 可恢复错误，将重连并重试 (%s/%s): %s",
                        attempt + 1,
                        attempts,
                        e,
                    )
                    self._close_nolock()
                except Exception as e:
                    if not _transient_ws_exc(e):
                        raise
                    if attempt >= last_attempt:
                        # Same reasoning as above: never leave a dead socket
                        # behind in self._ws.
                        self._close_nolock()
                        raise
                    logger.warning(
                        "tts.synthesize WebSocket 瞬断，重连并重试 (%s/%s): %s",
                        attempt + 1,
                        attempts,
                        e,
                    )
                    self._close_nolock()
                # Only reached when a recoverable error was handled above and
                # another attempt remains; back off before reconnecting.
                if delay:
                    time.sleep(delay)

        # Defensive: the loop always returns or raises because attempts >= 1.
        raise CloudVoiceError("run_tts_synthesize 未执行", code="INTERNAL")

    def run_turn(self, text: str) -> dict[str, Any]:
        """
        Send one round of user text and collect the complete server response.

        The turn is executed by ``_execute_turn_nolock``. Per the protocol
        notes carried by this method, the downlink may stream: TTS chunks and
        ``llm.text_delta`` can arrive before ``dialog_result``, but flight
        control and the final reply text are taken from ``dialog_result``.

        If the connection drops mid-turn (peer closed TCP, broken pipe on
        ping/pong, ...), the connection is closed, the session is re-opened
        and the turn is re-sent. ``self._lock`` is held for the whole of each
        attempt, including the back-off sleep before a retry.

        Retry policy:

        - ``ROCKET_CLOUD_TURN_RETRIES``: total number of attempts (default 3,
          clamped to at least 1; unparsable values fall back to 3).
        - ``ROCKET_CLOUD_TURN_RETRY_DELAY_SEC``: pause between attempts
          (default 0.35, clamped to >= 0; unparsable values fall back to 0.35).

        An error is treated as recoverable when it is a ``CloudVoiceError``
        flagged ``retryable`` or carrying code ``DISCONNECTED``, or when
        ``_transient_ws_exc`` accepts it. Every recoverable failure closes the
        connection, including the last one, so the next call reconnects
        instead of reusing a dead socket.

        Args:
            text: User utterance; surrounding whitespace is stripped.

        Returns:
            The dict produced by ``_execute_turn_nolock``. As documented for
            this method it carries: routing, flight_intent, chat_reply,
            user_input, pcm, sample_rate_hz, metrics and llm_stream_text
            (concatenated ``llm.text_delta``, optional, for debugging/UI).

        Raises:
            CloudVoiceError: If ``text`` is empty after stripping, or when the
                turn fails with a non-recoverable error or on the last attempt.
            Exception: Any other exception from the connection is re-raised
                when it is not transient or when attempts are exhausted.
        """
        t = (text or "").strip()
        if not t:
            raise CloudVoiceError("turn.text 不能为空")

        try:
            attempts = max(1, int(os.environ.get("ROCKET_CLOUD_TURN_RETRIES", "3")))
        except ValueError:
            attempts = 3
        try:
            delay = max(
                0.0, float(os.environ.get("ROCKET_CLOUD_TURN_RETRY_DELAY_SEC", "0.35"))
            )
        except ValueError:
            delay = 0.35

        last_attempt = attempts - 1
        for attempt in range(attempts):
            with self._lock:
                try:
                    if self._ws is None:
                        self._connect_nolock()
                    return self._execute_turn_nolock(t)
                except CloudVoiceError as e:
                    if not (e.retryable or e.code == "DISCONNECTED"):
                        raise
                    if attempt >= last_attempt:
                        # Out of attempts: still drop the broken connection so
                        # the next call starts with a fresh session.
                        self._close_nolock()
                        raise
                    logger.warning(
                        "云端回合可恢复错误，将重连并重试 (%s/%s): %s",
                        attempt + 1,
                        attempts,
                        e,
                    )
                    self._close_nolock()
                except Exception as e:
                    if not _transient_ws_exc(e):
                        raise
                    if attempt >= last_attempt:
                        # Same reasoning as above: never leave a dead socket
                        # behind in self._ws.
                        self._close_nolock()
                        raise
                    logger.warning(
                        "云端 WebSocket 瞬断（如对端先关、PONG 写失败），"
                        "重连并重发 turn (%s/%s): %s",
                        attempt + 1,
                        attempts,
                        e,
                    )
                    self._close_nolock()
                # Only reached when a recoverable error was handled above and
                # another attempt remains; back off before reconnecting.
                if delay:
                    time.sleep(delay)

        # Defensive: the loop always returns or raises because attempts >= 1.
        raise CloudVoiceError("run_turn 未执行", code="INTERNAL")