DroneMind/voice_drone/main_app.py
2026-04-14 09:54:26 +08:00

2272 lines
96 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

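# Real-time voice detection: wake with "无人机" → TTS greeting "你好,我在呢" → capture one spoken command (mic then muted) → LLM reply spoken via Kokoro → back to listening for the wake word only.
# Optional: when assistant.local_keyword_takeoff_enabled or ROCKET_LOCAL_KEYWORD_TAKEOFF=1 is set, "无人机 + a takeoff word from keywords.yaml" runs the local offboard script + WAV prompts (off by default);
# all other commands go to the cloud/local LLM → flight_intent etc. (the on-board sequence is executed only when ROCKET_CLOUD_EXECUTE_FLIGHT=1).
# Environment variables: ROCKET_LLM_GGUF, ROCKET_LLM_MAX_TOKENS (default 256), ROCKET_LLM_CTX (default 4096; try 2048 to save memory / gain a little speed),
# ROCKET_LLM_N_THREADS (llama.cpp thread count; on RK3588 try 6-8), ROCKET_LLM_N_GPU_LAYERS (>0 when CUDA/Vulkan is available), ROCKET_LLM_N_BATCH,
# ROCKET_TTS_ORT_INTRA_OP_THREADS / ROCKET_TTS_ORT_INTER_OP_THREADS (Kokoro ONNX Runtime threads),
# ROCKET_CHAT_IDLE_SEC (legacy placeholder; the context is reset every round), ROCKET_TTS_DEVICE (same as qwen15b_chat --tts-device);
# ROCKET_INPUT_HW=2,0 matches the card,device pair from `arecord -l`; ROCKET_INPUT_DEVICE_INDEX, ROCKET_INPUT_DEVICE_NAME.
# Recording: by default, interactively list `arecord -l` + PyAudio devices and choose; --input-index / ROCKET_INPUT_DEVICE_INDEX skips the prompt; --non-interactive uses input_device_index from yaml (may be null for auto-detection).
# ROCKET_LLM_DISABLE=1 disables the dialog.
# ROCKET_LLM_STREAM=0 disables streaming output (infer the whole reply, then a single TTS pass; handy for side-by-side debugging).
# ROCKET_STREAM_TTS_CHUNK_CHARS: during streamed chit-chat, force a split at this length when there is no sentence-final punctuation (default 64; too small sounds choppy).
# Cloud voice (see voice_drone_assistant/clientguide.md): ROCKET_CLOUD_VOICE=1 or cloud_voice.enabled;
# ROCKET_CLOUD_WS_URL, ROCKET_CLOUD_AUTH_TOKEN, ROCKET_CLOUD_DEVICE_ID; ROCKET_CLOUD_FALLBACK_LOCAL=0 disables the local fallback.
# Cloud sessions always use pcm_asr_uplink: VAD segmentation → turn.audio.* → Fun-ASR; the same-sentence fast path may still use turn.text.
# Chit-chat "no speech" timeout: listen_silence_timeout_sec (default 5); after the beep, time accumulates only while RMS < energy_vad_rms_low and no speech is in progress; once reached, the timeout MSG is played - it is not a fixed 5 s of wall-clock time.
# Cue tone length: segment_cue_duration_ms / ROCKET_CLOUD_SEGMENT_CUE_MS.
# Local strings spoken via server-side TTS: cloud_voice.remote_tts_for_local (default true) or ROCKET_CLOUD_REMOTE_TTS=1; set 0 to use Kokoro.
# PX4 context YAML: cloud_voice.px4_context_file or ROCKET_CLOUD_PX4_CONTEXT_FILE; merged into session.start.client.
# STT serialization: ROCKET_STT_QUEUE_MAX (default 1) limits the number of queued VAD segments; while the previous segment is still being recognized a new one may be dropped because the queue is full - see the log.
# Keep feeding STT during the greeting: ROCKET_VAD_STT_DURING_GREETING=1 (by default nothing is queued to STT, which reduces useless recognition and backlog while the TTS greeting plays).
# Short beep right after the wake word is recognized: ROCKET_WAKE_ACK_BEEP=0 turns it off; when ROCKET_WAKE_ACK_BEEP_SEC/HZ/GAIN are unset the ROCKET_WAKE_BEEP_* values are used (the duration defaults to 0.72x of that, i.e. slightly shorter).
# Short beep after the wake greeting has played: ROCKET_WAKE_PROMPT_BEEP=0 turns it off; ROCKET_WAKE_BEEP_SEC / ROCKET_WAKE_BEEP_HZ / ROCKET_WAKE_BEEP_GAIN are tunable.
# Resuming the PyAudio mic stream after TTS: ROCKET_MIC_RESTART_SETTLE_MS (default 150) gives codecs such as the ES8388 time to settle; if capture is still silent, try recognizer.ack_pause_mic_for_playback=false in yaml.
# Cloud flight-control JSON: by default it is only logged and the server TTS is played (see docs/llmcon.md); to execute on-board flight control set ROCKET_CLOUD_EXECUTE_FLIGHT=1.
# Option 1 (cloud → this program → ROS companion bridge): additionally set ROCKET_FLIGHT_INTENT_ROS_BRIDGE=1 to only publish the JSON to a ROS topic, bypassing Socket/offboard;
# ROCKET_FLIGHT_BRIDGE_TOPIC (default /input), ROCKET_FLIGHT_BRIDGE_SETUP (default: source /opt/ros/noetic/setup.bash),
# ROCKET_FLIGHT_BRIDGE_WAIT_SUB (default 2, wait for subscribers; 0 publishes as soon as possible).
# Local spoken-keyword takeoff (keywords.yaml takeoff → offboard): off by default; system.yaml assistant.local_keyword_takeoff_enabled; ROCKET_LOCAL_KEYWORD_TAKEOFF=1/true/yes takes precedence and enables it.
# Wake greeting "你好,我在呢": prefer playing a WAV (ROCKET_WAKE_GREETING_WAV or assets/tts_cache/wake_greeting.wav);
# if the file is missing it is generated automatically during preload / the first announcement (scipy is required to write it); it can also be created manually with `python scripts/generate_wake_greeting_wav.py`.
# By default Qwen GGUF + Kokoro ONNX are preloaded after start-up; set ROCKET_SKIP_MODEL_PRELOAD=1 or pass --no-preload to load them on the first dialog instead.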
from __future__ import annotations
import argparse
import enum
import json
import os
import queue
import shlex
import signal
import subprocess
import sys
import tempfile
import threading
import time
import wave
from pathlib import Path
import numpy as np
# Project root: the parent of the directory that contains this file.
_PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Put the project root on sys.path so the `voice_drone` imports that follow can resolve.
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))
# Best effort: use the project root as the working directory; a failing chdir is ignored.
try:
    os.chdir(_PROJECT_ROOT)
except OSError:
    pass
from voice_drone.core.portaudio_env import fix_ld_path_for_portaudio
# NOTE(review): presumably adjusts the dynamic-library search path so PortAudio can be
# loaded - confirm in voice_drone.core.portaudio_env.
fix_ld_path_for_portaudio()
# Always shown on the console: recognition results / submitted speech segments /
# wake-word misses (independent of the logging level).
# On the on-board microphone Silero often fails to cut segments, so energy (RMS) VAD is
# the default; a full system can use ROCKET_ENERGY_VAD=0 + vad_backend: silero.
# setdefault keeps any value the user already exported.
os.environ.setdefault("ROCKET_ENERGY_VAD", "1")
os.environ.setdefault("ROCKET_PRINT_STT", "1")
os.environ.setdefault("ROCKET_PRINT_VAD", "1")
from voice_drone.core.command import Command
from voice_drone.core.cloud_dialog_v1 import (
CLOUD_VOICE_DIALOG_V1,
MSG_CANCELLED,
MSG_CONFIRM_EXECUTING,
MSG_CONFIRM_TIMEOUT,
MSG_PROMPT_LISTEN_TIMEOUT,
match_phrase_list,
normalize_phrase_text,
parse_confirm_dict,
)
from voice_drone.core.flight_intent import (
ActionGoto,
ActionHold,
ActionHover,
ActionLand,
ActionReturnHome,
ActionTakeoff,
ActionWait,
goto_action_to_command,
parse_flight_intent_dict,
)
from voice_drone.core.configuration import (
SYSTEM_ASSISTANT_CONFIG,
SYSTEM_CLOUD_VOICE_CONFIG,
SYSTEM_CLOUD_VOICE_PX4_CONTEXT,
)
from voice_drone.core.qwen_intent_chat import (
FLIGHT_INTENT_CHAT_SYSTEM,
default_qwen_gguf_path,
load_llama_qwen,
parse_flight_intent_reply,
)
from voice_drone.core.recognizer import VoiceCommandRecognizer
from voice_drone.core.streaming_llm_tts import force_soft_split, take_completed_sentences
from voice_drone.logging_ import get_logger
# Module-wide logger.
logger = get_logger("voice_drone_assistant")
# First element of the 3-tuples that carry cloud PCM through the playback queue.
_CLOUD_PCM_TAG = "__cloud_pcm__"
# Short tone played right after a wake-word hit (queued; played by the main thread
# in _drain_llm_playback_queue).
_WAKE_HIT_BEEP_TAG = "__wake_hit_beep__"
# Cloud capture: VAD segmentation → end-of-segment cue → turn.audio.*;
# command-queue entries for such turns are tuples starting with this marker.
_PCM_TURN_MARKER = "__pcm_turn__"
# Playback-queue tag for the end-of-segment cue tone.
_SEGMENT_END_CUE_TAG = "__segment_end_cue__"
# Playback-queue tag for the beep that re-opens listening after a chit-chat reply.
_CHITCHAT_REPROMPT_BEEP_TAG = "__chitchat_reprompt_beep__"
# Greeting spoken after the wake word.
_WAKE_GREETING = "你好,我在呢"
# Default location of the cached greeting WAV.
_TTS_CACHE_DIR = _PROJECT_ROOT / "assets" / "tts_cache"
_WAKE_GREETING_WAV = _TTS_CACHE_DIR / "wake_greeting.wav"
def _resolve_wake_greeting_wav() -> Path:
raw = os.environ.get("ROCKET_WAKE_GREETING_WAV", "").strip()
return Path(raw).expanduser() if raw else _WAKE_GREETING_WAV
# Directory of the voice_drone core package; the takeoff prompt WAVs live next to the code.
_CORE_DIR = _PROJECT_ROOT / "voice_drone" / "core"
# Prompt WAVs for the local keyword-takeoff flow (file names are part of the path; do not rename).
_TAKEOFF_ACK_WAV = _CORE_DIR / "好的收到,开始起飞.wav"
_TAKEOFF_DONE_WAV = _CORE_DIR / "任务执行完成,开始返航降落.wav"
# Shell script used for the PX4 offboard run.
_OFFBOARD_SCRIPT = _PROJECT_ROOT / "scripts" / "run_px4_offboard_one_terminal.sh"
def _play_wav_blocking(path: Path) -> None:
    """Play a single 16-bit PCM WAV file through PyAudio and block until it finishes.

    The original note says this mirrors src/play_wav.py.

    Raises:
        ValueError: if the file's sample width is not 2 bytes.
    """
    import pyaudio

    with wave.open(str(path), "rb") as wav_file:
        channels = wav_file.getnchannels()
        sample_width = wav_file.getsampwidth()
        frame_rate = wav_file.getframerate()
        frame_count = wav_file.getnframes()
        if sample_width != 2:
            raise ValueError(f"仅支持 16-bit PCM: {path}")
        pcm = wav_file.readframes(frame_count)

    frames_per_write = 1024
    audio = pyaudio.PyAudio()
    try:
        out = audio.open(
            format=audio.get_format_from_width(sample_width),
            channels=channels,
            rate=frame_rate,
            output=True,
            frames_per_buffer=frames_per_write,
        )
        out.start_stream()
        try:
            # Feed the device one buffer (frames * bytes-per-frame) at a time.
            bytes_per_write = frames_per_write * sample_width * channels
            for offset in range(0, len(pcm), bytes_per_write):
                out.write(pcm[offset : offset + bytes_per_write])
        finally:
            out.stop_stream()
            out.close()
    finally:
        audio.terminate()
def _synthesize_ready_beep(
sample_rate: int = 24000,
*,
duration_sec: float = 0.11,
frequency_hz: float = 988.0,
amplitude: float = 0.22,
) -> np.ndarray:
"""正弦短鸣 + 淡入淡出,作唤醒后「可以说话」提示。"""
n = max(8, int(sample_rate * duration_sec))
x = np.arange(n, dtype=np.float32)
w = np.sin(2.0 * np.pi * frequency_hz * x / float(sample_rate)).astype(np.float32)
fade = max(2, min(n // 3, int(0.006 * sample_rate)))
ramp = np.linspace(0.0, 1.0, fade, dtype=np.float32)
w[:fade] *= ramp
w[-fade:] *= ramp[::-1]
return np.clip(w * np.float32(amplitude), -1.0, 1.0)
def _terminate_process_group(proc: subprocess.Popen) -> None:
if proc.poll() is not None:
return
try:
os.killpg(proc.pid, signal.SIGTERM)
except ProcessLookupError:
return
except Exception as e: # noqa: BLE001
logger.warning("SIGTERM offboard 进程组失败: %s", e)
try:
proc.wait(timeout=10)
except subprocess.TimeoutExpired:
try:
os.killpg(proc.pid, signal.SIGKILL)
except Exception as e: # noqa: BLE001
logger.warning("SIGKILL offboard 进程组失败: %s", e)
class _WakeFlowPhase(enum.IntEnum):
    """Phases of one wake-word interaction; stored as plain ints in ``_wake_phase``."""

    # Standby: a new wake cycle may only start from here.
    IDLE = 0
    # Wake word accepted; the greeting has been queued and is awaited.
    GREETING_WAIT = 1
    # Greeting finished; listening for the user's command.
    ONE_SHOT_LISTEN = 2
    # A command is being processed; speech segments arriving now are dropped.
    LLM_BUSY = 3
    # NOTE(review): presumably waiting for a spoken flight confirmation - not exercised
    # in the visible part of the file; confirm against the rest of the module.
    FLIGHT_CONFIRM_LISTEN = 4
class TakeoffPrintRecognizer(VoiceCommandRecognizer):
"""待机IDLE仅识别含唤醒词的句子唤醒后多轮对话在 ONE_SHOT_LISTEN 等阶段不要求句内唤醒词。
云端会话为 pcm_asr_uplink滴声后整句 PCM 上云 Fun-ASR结束一轮回到 IDLE 再要唤醒词。"""
def __init__(self, *, skip_model_preload: bool = False) -> None:
    """Initialise wake-flow state and read tunables from the environment / YAML config.

    Malformed numeric settings fall back to their defaults instead of aborting start-up.

    Args:
        skip_model_preload: OR-ed with ROCKET_SKIP_MODEL_PRELOAD=1/true/yes and stored
            in ``self._skip_model_preload``.
    """

    def _env_number(name, default, cast):
        """Parse env var *name* with *cast*; return *default* when unset or malformed."""
        raw = os.environ.get(name)
        if raw is None:
            return default
        try:
            return cast(raw.strip())
        except ValueError:
            logger.warning("Ignoring invalid %s=%r; using default %r", name, raw, default)
            return default

    def _first_float(default, *candidates):
        """Return float() of the first truthy candidate; *default* if none or not numeric."""
        chosen = next((c for c in candidates if c), default)
        try:
            return float(chosen)
        except (TypeError, ValueError):
            return default

    truthy = ("1", "true", "yes")
    falsy = ("0", "false", "no")

    super().__init__(auto_connect_socket=False)
    self.ack_tts_enabled = False
    self._audio_play_lock = threading.Lock()
    self._offboard_proc_lock = threading.Lock()
    self._active_offboard_proc: subprocess.Popen | None = None
    self._takeoff_side_task_busy = threading.Lock()
    self._model_warm_lock = threading.Lock()
    # Streamed chit-chat enqueues many sentence/chunk items; a small queue would drop segments.
    self._llm_playback_queue: queue.Queue[str] = queue.Queue(maxsize=64)
    self._chat_session_lock = threading.Lock()
    self._chat_session_until: float = 0.0
    self._llm_messages: list = []
    self._llm = None
    self._llm_tts_engine = None
    self._llm_model_path = Path(
        os.environ.get(
            "ROCKET_LLM_GGUF",
            str(default_qwen_gguf_path(_PROJECT_ROOT)),
        )
    )
    self._chat_idle_sec = _env_number("ROCKET_CHAT_IDLE_SEC", 120.0, float)
    self._llm_max_tokens = _env_number("ROCKET_LLM_MAX_TOKENS", 256, int)
    self._llm_ctx = _env_number("ROCKET_LLM_CTX", 4096, int)
    self._llm_tts_max_chars = _env_number("ROCKET_LLM_TTS_MAX_CHARS", 800, int)
    self._llm_stream_enabled = (
        os.environ.get("ROCKET_LLM_STREAM", "1").lower() not in falsy
    )
    self._stream_tts_chunk_chars = max(
        16,
        _env_number("ROCKET_STREAM_TTS_CHUNK_CHARS", 64, int),
    )
    self._llm_disabled = os.environ.get("ROCKET_LLM_DISABLE", "").lower() in truthy

    # The env var, when set, overrides the YAML switch for local keyword takeoff.
    _kw_raw = os.environ.get("ROCKET_LOCAL_KEYWORD_TAKEOFF", "").strip()
    if _kw_raw:
        self._local_keyword_takeoff_enabled = _kw_raw.lower() in truthy
    else:
        _ac = (
            SYSTEM_ASSISTANT_CONFIG
            if isinstance(SYSTEM_ASSISTANT_CONFIG, dict)
            else {}
        )
        self._local_keyword_takeoff_enabled = bool(
            _ac.get("local_keyword_takeoff_enabled", False)
        )

    self._skip_model_preload = skip_model_preload or os.environ.get(
        "ROCKET_SKIP_MODEL_PRELOAD", ""
    ).lower() in truthy

    cv = SYSTEM_CLOUD_VOICE_CONFIG if isinstance(SYSTEM_CLOUD_VOICE_CONFIG, dict) else {}
    env_cloud = os.environ.get("ROCKET_CLOUD_VOICE", "").lower() in truthy
    self._cloud_voice_enabled = bool(env_cloud or cv.get("enabled"))
    self._cloud_fallback_local = os.environ.get(
        "ROCKET_CLOUD_FALLBACK_LOCAL", ""
    ).lower() not in falsy and bool(cv.get("fallback_to_local", True))

    # The wake word is enforced by the command thread only in IDLE; in ONE_SHOT_LISTEN the
    # whole sentence is uplinked or processed directly, with no wake word required in it.
    self._listen_silence_timeout_sec = max(
        0.5,
        _first_float(
            5.0,
            os.environ.get("ROCKET_PROMPT_LISTEN_TIMEOUT_SEC"),
            cv.get("listen_silence_timeout_sec"),
        ),
    )
    self._post_cue_mic_mute_ms = max(
        0.0,
        min(
            2000.0,
            _first_float(
                200.0,
                os.environ.get("ROCKET_POST_CUE_MIC_MUTE_MS"),
                cv.get("post_cue_mic_mute_ms"),
            ),
        ),
    )
    # ROCKET_CLOUD_SEGMENT_CUE_MS is the name given in the file header; the name that
    # was actually read before (ROCKET_SEGMENT_CUE_DURATION_MS) keeps precedence.
    self._segment_cue_duration_ms = max(
        20.0,
        min(
            500.0,
            _first_float(
                120.0,
                os.environ.get("ROCKET_SEGMENT_CUE_DURATION_MS"),
                os.environ.get("ROCKET_CLOUD_SEGMENT_CUE_MS"),
                cv.get("segment_cue_duration_ms"),
            ),
        ),
    )

    ws_url = (os.environ.get("ROCKET_CLOUD_WS_URL") or cv.get("server_url") or "").strip()
    auth_tok = (
        os.environ.get("ROCKET_CLOUD_AUTH_TOKEN") or cv.get("auth_token") or ""
    ).strip()
    dev_id = (
        os.environ.get("ROCKET_CLOUD_DEVICE_ID") or cv.get("device_id") or "drone-001"
    ).strip()
    self._cloud_client = None
    self._cloud_remote_tts_for_local = False
    if self._cloud_voice_enabled:
        if ws_url and auth_tok:
            from voice_drone.core.cloud_voice_client import CloudVoiceClient

            self._cloud_client = CloudVoiceClient(
                server_url=ws_url,
                auth_token=auth_tok,
                device_id=dev_id,
                recv_timeout=_first_float(120.0, cv.get("timeout")),
                session_client_extensions=dict(SYSTEM_CLOUD_VOICE_PX4_CONTEXT)
                if SYSTEM_CLOUD_VOICE_PX4_CONTEXT
                else None,
            )
            # Env var wins over YAML for routing local strings through server-side TTS.
            _env_rt = os.environ.get("ROCKET_CLOUD_REMOTE_TTS", "").strip().lower()
            if _env_rt in falsy:
                self._cloud_remote_tts_for_local = False
            elif _env_rt in truthy:
                self._cloud_remote_tts_for_local = True
            else:
                self._cloud_remote_tts_for_local = bool(
                    cv.get("remote_tts_for_local", True)
                )
            print(
                f"[云端] 已启用 WebSocket 对话: {ws_url} device_id={dev_id}",
                flush=True,
            )
            if self._cloud_remote_tts_for_local:
                print(
                    "[云端] 本地文案播报将走 tts.synthesize失败回退 Kokoro",
                    flush=True,
                )
            print(
                f"[云端] Fun-ASR 上行 turn.audio.*;仅待机时说唤醒词;"
                f"滴声后累计静默 {self._listen_silence_timeout_sec:.1f}s低于 yaml energy_vad_rms_low 才计);"
                f"断句提示 {self._segment_cue_duration_ms:.0f}ms、消抖 {self._post_cue_mic_mute_ms:.0f}ms。",
                flush=True,
            )
        else:
            logger.warning("cloud_voice 已启用但缺少 server_url/auth_token将使用本地 LLM")
            self._cloud_voice_enabled = False

    self._wake_flow_lock = threading.Lock()
    self._wake_phase: int = int(_WakeFlowPhase.IDLE)
    self._greeting_done = threading.Event()
    self._playback_batch_is_greeting = False
    self._pending_finish_wake_cycle_after_tts = False
    self._pending_flight_confirm_after_tts = False
    self._pending_flight_confirm: dict | None = None
    self._flight_confirm_timer: threading.Timer | None = None
    self._flight_confirm_timer_lock = threading.Lock()
    self._staged_one_shot_after_greeting: str | None = None
    self._mic_op_queue: queue.Queue[str] = queue.Queue(maxsize=8)
    # By default only one segment may wait in the STT queue; ROCKET_STT_QUEUE_MAX
    # (clamped to 1..16) allows a little queuing.
    _raw_sq = os.environ.get("ROCKET_STT_QUEUE_MAX", "1").strip()
    try:
        _stn = max(1, min(16, int(_raw_sq)))
    except ValueError:
        _stn = 1
    self.stt_queue = queue.Queue(maxsize=_stn)
    # PROMPT_LISTEN (v1 §4): silence is accumulated while RMS stays below the threshold;
    # it is not a fixed wall-clock delay after the beep.
    self._prompt_listen_watch_armed: bool = False
    self._prompt_silence_accum_sec: float = 0.0
    self._segment_cue_done = threading.Event()
    self._pending_chitchat_reprompt_after_tts = False
    if self._cloud_client is not None:
        # Hooks are installed only when a cloud client exists.
        self._vad_speech_start_hook = self._on_vad_speech_start_prompt_listen
        self._after_processed_audio_chunk = self._tick_prompt_listen_silence_accum
def _cancel_prompt_listen_timer(self) -> None:
"""停止「滴声后静默监听」累计(飞控/结束唤醒/起 PCM 上行前等)。"""
self._prompt_listen_watch_armed = False
self._prompt_silence_accum_sec = 0.0
def _arm_prompt_listen_timeout(self) -> None:
"""滴声后进 PROMPT_LISTEN仅在麦克持续低于 energy_vad_rms_low 时累加,超时再播 MSG。"""
if self._cloud_client is None:
return
with self._wake_flow_lock:
if self._wake_phase != int(_WakeFlowPhase.ONE_SHOT_LISTEN):
return
self._prompt_silence_accum_sec = 0.0
self._prompt_listen_watch_armed = True
logger.debug(
"PROMPT_LISTEN: 已启用 RMS 累计静默 %.1fs低于 rms_low 才计时;说话或 rms≥low 清零)",
self._listen_silence_timeout_sec,
)
def _on_prompt_listen_timeout(self) -> None:
    """Handle an expired silence watch: queue the timeout prompt and end the cycle after TTS."""
    with self._wake_flow_lock:
        still_listening = self._wake_phase == int(_WakeFlowPhase.ONE_SHOT_LISTEN)
    if not still_listening:
        return
    self._prompt_listen_watch_armed, self._prompt_silence_accum_sec = False, 0.0
    logger.info(
        "[会话] 滴声后持续静默 ≥%.1fs未截句播超时提示并回待机",
        self._listen_silence_timeout_sec,
    )
    self._enqueue_llm_speak(MSG_PROMPT_LISTEN_TIMEOUT)
    # The playback drain finishes the wake cycle once the prompt has been spoken.
    self._pending_finish_wake_cycle_after_tts = True
def _tick_prompt_listen_silence_accum(self, processed_chunk: np.ndarray) -> None:
if not self._prompt_listen_watch_armed or self._cloud_client is None:
return
with self._wake_flow_lock:
if self._wake_phase != int(_WakeFlowPhase.ONE_SHOT_LISTEN):
return
rms = self._int16_chunk_rms(processed_chunk)
dt = float(len(processed_chunk)) / float(self.audio_capture.sample_rate)
speaking = (
self._ev_speaking
if self._use_energy_vad
else self.vad.is_speaking
)
if speaking or rms >= self._energy_rms_low:
self._prompt_silence_accum_sec = 0.0
return
self._prompt_silence_accum_sec += dt
if self._prompt_silence_accum_sec >= self._listen_silence_timeout_sec:
try:
self._on_prompt_listen_timeout()
except Exception as e: # noqa: BLE001
logger.error("PROMPT_LISTEN 静默超时处理异常: %s", e, exc_info=True)
def _on_vad_speech_start_prompt_listen(self) -> None:
"""VAD 判「开始说话」时清零静默累计v1 §4与 RMS≥rms_low 并行)。"""
if self._cloud_client is None:
return
with self._wake_flow_lock:
if self._wake_phase != int(_WakeFlowPhase.ONE_SHOT_LISTEN):
return
self._prompt_silence_accum_sec = 0.0
def _submit_concatenated_speech_to_stt(self) -> None:
    """Route a finished VAD segment according to the current wake phase.

    * GREETING_WAIT (unless ROCKET_VAD_STT_DURING_GREETING is on) and LLM_BUSY:
      the buffered speech is discarded.
    * ONE_SHOT_LISTEN with a cloud client: segments of at least 0.5 s are queued on
      ``command_queue`` as ``(_PCM_TURN_MARKER, pcm, sample_rate)``; shorter ones are dropped.
    * Otherwise the base-class implementation runs.
    """
    truthy = ("1", "true", "yes")
    verbose_vad = os.environ.get("ROCKET_PRINT_VAD", "").lower() in truthy
    allow_greeting_stt = os.environ.get(
        "ROCKET_VAD_STT_DURING_GREETING", ""
    ).lower() in truthy
    with self._wake_flow_lock:
        phase = self._wake_phase

    if phase == int(_WakeFlowPhase.GREETING_WAIT) and not allow_greeting_stt:
        with self.speech_buffer_lock:
            self.speech_buffer.clear()
        if verbose_vad:
            print(
                "[VAD] 问候播放中,本段不送 STT说完问候后再说指令"
                "若需在问候同时识别请设 ROCKET_VAD_STT_DURING_GREETING=1",
                flush=True,
            )
        return

    if phase == int(_WakeFlowPhase.LLM_BUSY):
        with self.speech_buffer_lock:
            self.speech_buffer.clear()
        if verbose_vad:
            print(
                "[VAD] 大模型/云端处理中,本段不送 STT请等本轮播报结束后再说",
                flush=True,
            )
        return

    if self._cloud_client is not None and phase == int(_WakeFlowPhase.ONE_SHOT_LISTEN):
        # Take the segment under the same lock the other branches use, so the buffer
        # is not read and cleared while another holder of the lock is touching it.
        with self.speech_buffer_lock:
            if len(self.speech_buffer) == 0:
                return
            speech_audio = np.concatenate(self.speech_buffer)
            self.speech_buffer.clear()

        sample_rate = self.audio_capture.sample_rate
        min_samples = int(sample_rate * 0.5)
        if len(speech_audio) >= min_samples:
            try:
                self.command_queue.put(
                    (
                        _PCM_TURN_MARKER,
                        speech_audio.copy(),
                        int(sample_rate),
                    ),
                    block=False,
                )
                if verbose_vad:
                    print(
                        f"[VAD] turn.audio 已排队,{len(speech_audio)} 采样点"
                        f"(≈{len(speech_audio) / float(sample_rate):.2f}s",
                        flush=True,
                    )
            except queue.Full:
                logger.warning("命令队列已满,跳过 PCM 上行")
        elif verbose_vad:
            print(
                f"[VAD] 语音段太短已丢弃({len(speech_audio)} < {min_samples} 采样)",
                flush=True,
            )
        return

    super()._submit_concatenated_speech_to_stt()
def _llm_tts_output_device(self) -> str | int | None:
raw = os.environ.get("ROCKET_TTS_DEVICE", "").strip()
if raw.isdigit():
return int(raw)
if raw:
return raw
return None
def _before_audio_iteration(self) -> None:
    """Per-iteration hook: apply queued mic requests, run the base hook, then play queued audio."""
    # Mic stop/start requests from other threads are applied first.
    self._drain_mic_ops()
    super()._before_audio_iteration()
    # Queued playback items are played last.
    self._drain_llm_playback_queue()
def _drain_mic_ops(self) -> None:
"""主线程:执行命令线程请求的麦克风流 stop/start。"""
while True:
try:
op = self._mic_op_queue.get_nowait()
except queue.Empty:
break
try:
if op == "stop":
if self.audio_capture.stream is not None:
self.audio_capture.stop_stream()
elif op == "start" and self.running:
if self.audio_capture.stream is None:
self.audio_capture.start_stream()
self.vad.reset()
with self.speech_buffer_lock:
self.speech_buffer.clear()
self.pre_speech_buffer.clear()
except Exception as e: # noqa: BLE001
logger.warning("麦克风流控制失败 (%r): %s", op, e)
def _finish_wake_cycle(self) -> None:
    """End the current wake cycle: cancel timers, clear pending flags, go back to IDLE."""
    self._cancel_prompt_listen_timer()
    self._cancel_flight_confirm_timer()
    with self._flight_confirm_timer_lock:
        self._pending_flight_confirm = None
    self._pending_flight_confirm_after_tts = False
    self._pending_finish_wake_cycle_after_tts = False
    idle = int(_WakeFlowPhase.IDLE)
    with self._wake_flow_lock:
        self._wake_phase = idle
    # Each wake cycle starts with an empty conversation.
    self._reset_llm_history()
    print("[唤醒] 本轮结束。请说「无人机」再次唤醒。", flush=True)
def _reset_llm_history(self) -> None:
with self._chat_session_lock:
self._llm_messages.clear()
self._chat_session_until = 0.0
def _flush_llm_playback_queue_silent(self) -> None:
"""丢弃 LLM 播报队列(无日志);新一轮唤醒前清空,避免与问候语或上一轮残段叠播。"""
while True:
try:
self._llm_playback_queue.get_nowait()
except queue.Empty:
break
def _prepare_wake_session_resources(self) -> None:
"""新一轮唤醒:清空对话状态、播报队列与待 STT 段(问候/快路径共用)。"""
self._reset_llm_history()
self._flush_llm_playback_queue_silent()
self.discard_pending_stt_segments()
def _recover_from_cloud_failure(
self,
user_msg: str,
*,
finish_wake_after_tts: bool,
idle_speak: str,
) -> None:
"""云端 run_turn 失败后:按需回退本地 LLM 或播一句占位。"""
if self._cloud_fallback_local:
print("[云端] 回退本地 LLM…", flush=True)
self._handle_llm_turn_local(user_msg, finish_wake_after_tts=finish_wake_after_tts)
return
self._enqueue_llm_speak(idle_speak)
if finish_wake_after_tts:
self._pending_finish_wake_cycle_after_tts = True
def _begin_wake_cycle(self, staged_followup: str | None) -> None:
    """Wake word hit: queue the ack beep and greeting, then continue on a helper thread.

    Ignored (with a log line) unless the phase is IDLE.

    Args:
        staged_followup: Optional command text to process right after the greeting.
    """
    with self._wake_flow_lock:
        current = self._wake_phase
        accepted = current == int(_WakeFlowPhase.IDLE)
        if accepted:
            self._wake_phase = int(_WakeFlowPhase.GREETING_WAIT)
        else:
            logger.info(
                "唤醒忽略:当前非 IDLEphase=%s),不重复排队问候",
                _WakeFlowPhase(current).name,
            )
    if not accepted:
        return

    self._prepare_wake_session_resources()
    followup = (staged_followup or "").strip()
    self._staged_one_shot_after_greeting = followup or None
    self._greeting_done.clear()
    # Marks the next playback batch as the greeting.
    self._playback_batch_is_greeting = True
    self._enqueue_wake_word_ack_beep()
    self._enqueue_llm_speak(_WAKE_GREETING)
    waiter = threading.Thread(
        target=self._after_greeting_pipeline,
        daemon=True,
        name="wake-after-greeting",
    )
    waiter.start()
def _wake_fast_path_process_follow(self, follow: str) -> bool:
"""同一句已含唤醒词+指令时:跳过问候与滴声,清队列后直接 _process_one_shot_command。"""
follow = (follow or "").strip()
if not follow:
return False
with self._wake_flow_lock:
if self._wake_phase != int(_WakeFlowPhase.IDLE):
logger.info(
"唤醒连带指令忽略:当前非 IDLEphase=%s",
_WakeFlowPhase(self._wake_phase).name,
)
return False
self._wake_phase = int(_WakeFlowPhase.LLM_BUSY)
self._prepare_wake_session_resources()
self._staged_one_shot_after_greeting = None
self._enqueue_wake_word_ack_beep()
logger.info("唤醒含指令,跳过问候与提示音,直接处理: %s", follow[:120])
self._process_one_shot_command(follow)
return True
def _after_greeting_pipeline(self) -> None:
    """Helper-thread body: wait for the greeting, then process the staged command or listen."""
    greeted = self._greeting_done.wait(timeout=120)
    if not greeted:
        logger.error("问候语播放超时,回到 IDLE")
        self._finish_wake_cycle()
        return
    self._greeting_done.clear()

    # Take the staged follow-up (if any) exactly once.
    with self._wake_flow_lock:
        staged = self._staged_one_shot_after_greeting
        self._staged_one_shot_after_greeting = None

    if staged is None:
        with self._wake_flow_lock:
            self._wake_phase = int(_WakeFlowPhase.ONE_SHOT_LISTEN)
        print("[唤醒] 请说您的指令(一句)。", flush=True)
        self._arm_prompt_listen_timeout()
        return

    with self._wake_flow_lock:
        self._wake_phase = int(_WakeFlowPhase.LLM_BUSY)
    self._process_one_shot_command(staged)
def _process_one_shot_command(self, raw: str) -> None:
"""已关麦或准备关麦:处理一句指令(起飞 / LLM结束后再切回 IDLE。"""
user_msg = (raw or "").strip()
if not user_msg:
self._finish_wake_cycle()
return
iw, _ = self.wake_word_detector.detect(user_msg)
if iw:
user_msg = (
self.wake_word_detector.extract_command_text(user_msg) or user_msg
).strip()
if not user_msg:
self._finish_wake_cycle()
return
print(f"[指令] {user_msg}", flush=True)
try:
self._mic_op_queue.put_nowait("stop")
except queue.Full:
pass
time.sleep(0.12)
_, params = self.text_preprocessor.preprocess_fast(user_msg)
if (
self._local_keyword_takeoff_enabled
and params.command_keyword == "takeoff"
):
threading.Thread(
target=self._run_takeoff_offboard_and_wavs,
daemon=True,
).start()
self._finish_wake_cycle()
try:
self._mic_op_queue.put_nowait("start")
except queue.Full:
pass
return
if self._llm_disabled and not self._cloud_voice_enabled:
print("[LLM] 已禁用ROCKET_LLM_DISABLE", flush=True)
self._finish_wake_cycle()
try:
self._mic_op_queue.put_nowait("start")
except queue.Full:
pass
return
self._handle_llm_turn(
user_msg, finish_wake_after_tts=(self._cloud_client is None)
)
@staticmethod
def _flight_payload_requests_takeoff(payload: dict) -> bool:
for a in payload.get("actions") or []:
if isinstance(a, dict) and a.get("type") == "takeoff":
return True
return False
def _enqueue_llm_speak(self, line: str) -> None:
t = (line or "").strip()
if not t:
return
try:
self._llm_playback_queue.put(t, block=False)
except queue.Full:
logger.warning("LLM 播报队列已满,跳过: %s", t[:40])
def _ensure_llm(self):
if self._llm is not None:
return self._llm
with self._model_warm_lock:
if self._llm is not None:
return self._llm
if not self._llm_model_path.is_file():
logger.error("未找到 GGUF: %s", self._llm_model_path)
return None
logger.info("正在加载 LLM: %s", self._llm_model_path)
print("[LLM] 正在加载 QwenGGUF", flush=True)
self._llm = load_llama_qwen(self._llm_model_path, n_ctx=self._llm_ctx)
if self._llm is None:
logger.error("llama-cpp-python 未安装或加载失败")
else:
print("[LLM] Qwen 已载入。", flush=True)
return self._llm
def _ensure_llm_tts(self):
if self._llm_tts_engine is not None:
return self._llm_tts_engine
with self._model_warm_lock:
if self._llm_tts_engine is not None:
return self._llm_tts_engine
from voice_drone.core.tts import KokoroOnnxTTS
print("[LLM] 正在加载 Kokoro TTSONNX", flush=True)
self._llm_tts_engine = KokoroOnnxTTS()
print("[LLM] Kokoro 已载入。", flush=True)
return self._llm_tts_engine
def _preload_llm_and_tts_if_enabled(self) -> None:
    """Warm up models after start-up so the first dialog / announcement is not delayed.

    Cloud mode: the local Qwen model is never preloaded. The greeting WAV may be
    generated, and Kokoro is preloaded unless local strings are spoken by the server
    or preloading was skipped.
    Local mode: Qwen, Kokoro and the greeting WAV are prepared unless the LLM is
    disabled, preloading was skipped, or the GGUF file is missing.
    Failures are reported and left for a retry on first use.
    """
    if self._cloud_voice_enabled:
        print(
            "[云端] 跳过本地 Qwen 预加载;对话 TTS 以云端 PCM 为主。",
            flush=True,
        )
        # Best effort: create the cached greeting WAV when it is missing and Kokoro
        # is the engine that would speak local strings.
        try:
            p = _resolve_wake_greeting_wav()
            if not p.is_file():
                if (
                    not self._llm_disabled
                    and not self._cloud_remote_tts_for_local
                ):
                    self._ensure_wake_greeting_wav_on_disk()
        except Exception as e:  # noqa: BLE001
            logger.debug("云端模式下预热问候 WAV 跳过: %s", e)
        if self._cloud_remote_tts_for_local:
            print(
                "[云端] 本地字符串播报由 tts.synthesize 提供,跳过 Kokoro 预加载"
                "(失败时会临场加载 Kokoro",
                flush=True,
            )
            return
        # Flight-confirm timeout/cancel prompts and the cloud fallback still use the
        # local Kokoro engine; load it once at start-up so those prompts do not pay
        # for a cold model start (several seconds of stall).
        if self._skip_model_preload:
            print(
                "[云端] 已跳过 Kokoro 预加载(--no-preload / ROCKET_SKIP_MODEL_PRELOAD"
                "首次本地提示时再加载。",
                flush=True,
            )
        else:
            t0 = time.monotonic()
            try:
                print(
                    "[LLM] 云端模式:预加载 Kokoro确认超时/取消等本地语音)…",
                    flush=True,
                )
                self._ensure_llm_tts()
            except Exception as e:  # noqa: BLE001
                logger.warning(
                    "云端模式 Kokoro 预加载失败(将在首次本地播报时重试): %s",
                    e,
                    exc_info=True,
                )
                print(f"[LLM] Kokoro 预加载失败: {e}", flush=True)
            else:
                dt = time.monotonic() - t0
                print(f"[LLM] Kokoro 预加载完成(约 {dt:.1f}s", flush=True)
        return
    # Local mode from here on.
    if self._llm_disabled or self._skip_model_preload:
        # Only announce the skip when the LLM itself is still enabled.
        if self._skip_model_preload and not self._llm_disabled:
            print(
                "[LLM] 已跳过预加载(--no-preload 或 ROCKET_SKIP_MODEL_PRELOAD将在首次使用时加载。",
                flush=True,
            )
        return
    if not self._llm_model_path.is_file():
        print(
            f"[LLM] 未找到 GGUF跳过预加载: {self._llm_model_path}",
            flush=True,
        )
        return
    print(
        "[LLM] 预加载 Qwen + Kokoro数十秒属正常完成后的首轮对话会快很多",
        flush=True,
    )
    t0 = time.monotonic()
    try:
        # Stop quietly when the LLM could not be loaded; _ensure_llm logs the reason.
        if self._ensure_llm() is None:
            return
        self._ensure_llm_tts()
        self._ensure_wake_greeting_wav_on_disk()
    except Exception as e:  # noqa: BLE001
        logger.warning("预加载模型失败(将在首次使用时重试): %s", e, exc_info=True)
        print(f"[LLM] 预加载失败: {e}", flush=True)
        return
    dt = time.monotonic() - t0
    print(f"[LLM] 预加载完成(耗时约 {dt:.1f}s", flush=True)
def _ensure_wake_greeting_wav_on_disk(self) -> Path:
    """Make sure the cached greeting WAV exists, synthesizing it with Kokoro if needed.

    The audio is written to a sibling temporary file and renamed into place, so a
    failed or interrupted synthesis cannot leave a truncated file that would later be
    mistaken for a valid cache entry.

    Returns:
        The greeting WAV path. The file may still be missing when the cache directory
        could not be created or synthesis failed; both cases are logged.
    """
    p = _resolve_wake_greeting_wav()
    if p.is_file():
        return p
    try:
        p.parent.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logger.warning("无法创建问候缓存目录 %s: %s", p.parent, e)
        return p

    # Keep the original suffix on the temp name in case the writer picks the format from it.
    tmp = p.with_name(f".{p.stem}.partial{p.suffix}")
    try:
        tts = self._ensure_llm_tts()
        tts.synthesize_to_file(_WAKE_GREETING, str(tmp))
        os.replace(tmp, p)
        logger.info("已自动生成唤醒问候缓存(此后只播此文件): %s", p)
        print(f"[TTS] 已写入问候缓存,下次起不再合成: {p}", flush=True)
    except Exception as e:  # noqa: BLE001
        logger.warning(
            "自动生成问候 WAV 失败(需 scipy 写盘;将本次仍用实时合成): %s",
            e,
            exc_info=True,
        )
        try:
            tmp.unlink()
        except OSError:
            # Nothing to clean up (or cleanup not possible); the failure is already logged.
            pass
    return p
def _play_wake_ready_beep(self, output_device: object | None) -> None:
    """Play the short "you may speak now" beep after the greeting.

    Disabled by ROCKET_WAKE_PROMPT_BEEP=0/false/no. Duration, pitch and gain come from
    ROCKET_WAKE_BEEP_SEC / _HZ / _GAIN; duration and gain are clamped.
    """
    from voice_drone.core.tts import play_tts_audio

    switch = os.environ.get("ROCKET_WAKE_PROMPT_BEEP", "1").lower()
    if switch in ("0", "false", "no"):
        return

    def _env_float(name: str, default_text: str, fallback: float) -> float:
        """float() of env var *name* (default *default_text*); *fallback* if malformed."""
        try:
            return float(os.environ.get(name, default_text))
        except ValueError:
            return fallback

    sample_rate = 24000
    duration = max(0.04, min(0.25, _env_float("ROCKET_WAKE_BEEP_SEC", "0.11", 0.11)))
    pitch = _env_float("ROCKET_WAKE_BEEP_HZ", "988", 988.0)
    gain = max(0.05, min(0.45, _env_float("ROCKET_WAKE_BEEP_GAIN", "0.22", 0.22)))

    tone = _synthesize_ready_beep(
        sample_rate, duration_sec=duration, frequency_hz=pitch, amplitude=gain
    )
    try:
        play_tts_audio(tone, sample_rate, output_device=output_device)
        print("[唤醒] 提示音已播,请说指令。", flush=True)
    except Exception as exc:  # noqa: BLE001
        logger.debug("唤醒提示音播放跳过: %s", exc)
def _enqueue_wake_word_ack_beep(self) -> None:
"""唤醒词命中后立即排队一声短鸣,主线程播报(与云 TTS 同队列,不阻塞命令线程)。"""
if os.environ.get("ROCKET_WAKE_ACK_BEEP", "1").lower() in (
"0",
"false",
"no",
):
return
try:
self._llm_playback_queue.put_nowait(_WAKE_HIT_BEEP_TAG)
except queue.Full:
logger.warning("播报队列已满,跳过唤醒确认短音")
def _play_wake_word_hit_beep(self, output_device: object | None) -> None:
    """Play the beep that acknowledges a wake-word hit.

    Disabled by ROCKET_WAKE_ACK_BEEP=0/false/no. Each of ROCKET_WAKE_ACK_BEEP_SEC / _HZ /
    _GAIN falls back to the matching ROCKET_WAKE_BEEP_* value when unset; the inherited
    duration is scaled by 0.72.
    """
    from voice_drone.core.tts import play_tts_audio

    switch = os.environ.get("ROCKET_WAKE_ACK_BEEP", "1").lower()
    if switch in ("0", "false", "no"):
        return

    sample_rate = 24000

    try:
        own_sec = os.environ.get("ROCKET_WAKE_ACK_BEEP_SEC", "").strip()
        duration = (
            float(own_sec)
            if own_sec
            else float(os.environ.get("ROCKET_WAKE_BEEP_SEC", "0.11")) * 0.72
        )
    except ValueError:
        duration = 0.08
    duration = max(0.04, min(0.25, duration))

    try:
        own_hz = os.environ.get("ROCKET_WAKE_ACK_BEEP_HZ", "").strip()
        if own_hz:
            pitch = float(own_hz)
        else:
            pitch = float(os.environ.get("ROCKET_WAKE_BEEP_HZ", "988"))
    except ValueError:
        pitch = 1100.0

    try:
        own_gain = os.environ.get("ROCKET_WAKE_ACK_BEEP_GAIN", "").strip()
        if own_gain:
            gain = float(own_gain)
        else:
            gain = float(os.environ.get("ROCKET_WAKE_BEEP_GAIN", "0.22"))
    except ValueError:
        gain = 0.22
    gain = max(0.05, min(0.45, gain))

    tone = _synthesize_ready_beep(
        sample_rate, duration_sec=duration, frequency_hz=pitch, amplitude=gain
    )
    try:
        play_tts_audio(tone, sample_rate, output_device=output_device)
    except Exception as exc:  # noqa: BLE001
        logger.debug("唤醒确认短音播放失败: %s", exc)
        return
    print("[唤醒] 确认短音已播。", flush=True)
def _try_play_line_via_cloud_tts(self, s: str, dev: object | None) -> bool:
"""docs/API.md §3.3 tts.synthesize成功播放返回 True否则 False调用方回退 Kokoro"""
if not self._cloud_remote_tts_for_local or self._cloud_client is None:
return False
txt = (s or "").strip()
if not txt:
return False
from voice_drone.core.cloud_voice_client import CloudVoiceError
from voice_drone.core.tts import play_tts_audio
t0 = time.monotonic()
try:
out = self._cloud_client.run_tts_synthesize(txt)
except CloudVoiceError as e:
logger.warning("云端 tts.synthesize 失败: %s", e)
return False
except Exception as e: # noqa: BLE001
logger.warning("云端 tts.synthesize 异常: %s", e, exc_info=True)
return False
pcm = out.get("pcm")
try:
sr = int(out.get("sample_rate_hz") or 24000)
except (TypeError, ValueError):
sr = 24000
if pcm is None or np.asarray(pcm).size == 0:
logger.warning("云端 tts.synthesize 返回空 PCM")
return False
pcm_i16 = np.asarray(pcm, dtype=np.int16).reshape(-1)
logger.info(
"云端 tts.synthesize: samples=%s int16_max_abs=%s elapsed=%.3fs",
pcm_i16.size,
int(np.max(np.abs(pcm_i16))),
time.monotonic() - t0,
)
audio_f32 = pcm_i16.astype(np.float32) / 32768.0
try:
play_tts_audio(audio_f32, sr, output_device=dev)
except Exception as e: # noqa: BLE001
logger.warning("播放云端 tts.synthesize 结果失败: %s", e, exc_info=True)
return False
return True
def _play_segment_end_cue(self, dev: object | None) -> None:
    """Play the very short end-of-segment cue tone (1420 Hz); playback errors are only logged."""
    from voice_drone.core.tts import play_tts_audio

    sample_rate = 24000
    # Configured length in ms, converted to seconds and clamped to 0.02-0.5 s.
    duration = max(0.02, min(0.5, self._segment_cue_duration_ms / 1000.0))
    tone = _synthesize_ready_beep(
        sample_rate,
        duration_sec=duration,
        frequency_hz=1420.0,
        amplitude=0.18,
    )
    try:
        play_tts_audio(tone, sample_rate, output_device=dev)
    except Exception as exc:  # noqa: BLE001
        logger.debug("断句提示音: %s", exc)
def _play_chitchat_reprompt_beep(self, dev: object | None) -> None:
"""闲聊 TTS 播完后再滴一声,进入下一轮 PROMPT_LISTEN。"""
self._play_wake_word_hit_beep(dev)
def _handle_pcm_uplink_turn(self, pcm: np.ndarray, sample_rate_hz: int) -> None:
    """Handle one finished speech segment in cloud mode: cue tone, short mute, then uplink.

    Ignored unless the phase is ONE_SHOT_LISTEN. If the cue cannot be queued or is not
    played within 15 s, the turn is abandoned: the mic is restarted and the silence
    watch is re-armed so the session does not stay in ONE_SHOT_LISTEN without a timeout.

    Args:
        pcm: Audio samples of the segment.
        sample_rate_hz: Sample rate of *pcm*.
    """

    def _request_mic(op: str) -> None:
        """Queue a mic "stop"/"start" for the main thread; log instead of silently dropping."""
        try:
            self._mic_op_queue.put_nowait(op)
        except queue.Full:
            logger.warning("mic op queue full; dropped %r request", op)

    def _abandon_turn() -> None:
        """Undo the preparations made for this turn (mic stopped, silence watch cancelled)."""
        _request_mic("start")
        self._arm_prompt_listen_timeout()

    with self._wake_flow_lock:
        if self._wake_phase != int(_WakeFlowPhase.ONE_SHOT_LISTEN):
            logger.debug("PCM 上行忽略:当前非 PROMPT_LISTEN")
            return
    self._cancel_prompt_listen_timer()
    _request_mic("stop")

    # The main thread sets this event once it has played the cue.
    self._segment_cue_done.clear()
    try:
        self._llm_playback_queue.put_nowait(_SEGMENT_END_CUE_TAG)
    except queue.Full:
        logger.error("播报队列满,无法播断句提示")
        _abandon_turn()
        return
    if not self._segment_cue_done.wait(timeout=15.0):
        logger.error("断句提示音同步超时")
        _abandon_turn()
        return

    # Short pause after the cue before the uplink starts.
    time.sleep(self._post_cue_mic_mute_ms / 1000.0)
    with self._wake_flow_lock:
        self._wake_phase = int(_WakeFlowPhase.LLM_BUSY)
    self._handle_llm_turn_cloud_pcm(
        pcm, sample_rate_hz, finish_wake_after_tts=False
    )
def _drain_llm_playback_queue(self, recover_mic: bool = True) -> None:
    """Play everything currently queued for playback, then advance the wake flow.

    Queue items are either marker tags (wake beep, segment-end cue,
    chit-chat re-prompt beep), ``(_CLOUD_PCM_TAG, pcm, sample_rate)``
    tuples, or text lines to synthesize. The microphone is paused for the
    whole batch when ``self.ack_pause_mic_for_playback`` is set.

    Args:
        recover_mic: When True, restart the microphone stream after the
            batch if this call stopped it.
    """
    from voice_drone.core.tts import play_tts_audio, play_wav_path
    # NOTE(review): annotated as list[str], but tuple items are handled below.
    lines: list[str] = []
    # Take a snapshot of the queue without blocking.
    while True:
        try:
            lines.append(self._llm_playback_queue.get_nowait())
        except queue.Empty:
            break
    if not lines:
        # With streamed, segmented TTS the last drain may empty the queue before
        # _finalize_llm_turn sets _pending_finish_wake_cycle_after_tts; finish the
        # wake cycle here in that case.
        # Note: the flight-confirm window must be entered in the finally block
        # below, after the batch containing this turn's cloud TTS was played.
        # Do not act on _pending_flight_confirm_after_tts here: the main thread
        # could run an empty drain before the PCM is enqueued, begin the confirm
        # window early and clear the flag, after which the command thread would
        # set _pending_finish_wake_cycle at its end.
        if self._pending_finish_wake_cycle_after_tts:
            self._pending_finish_wake_cycle_after_tts = False
            self._finish_wake_cycle()
        return
    # Consume the one-shot "this batch is the greeting" marker.
    greeting_batch = self._playback_batch_is_greeting
    self._playback_batch_is_greeting = False
    mic_stopped = False
    if self.ack_pause_mic_for_playback:
        # Drop queued STT segments once more before muting: VAD may have
        # submitted more between wake-up and this drain.
        self.discard_pending_stt_segments()
        try:
            self.audio_capture.stop_stream()
            mic_stopped = True
        except Exception as e:  # noqa: BLE001
            logger.warning("暂停麦克风失败: %s", e)
    try:
        # The local TTS engine is created lazily, only if a line needs it.
        tts = None
        dev = self._llm_tts_output_device()
        for line in lines:
            if line == _WAKE_HIT_BEEP_TAG:
                self._play_wake_word_hit_beep(dev)
                continue
            if line == _SEGMENT_END_CUE_TAG:
                self._play_segment_end_cue(dev)
                # Unblocks _handle_pcm_uplink_turn, which waits on this event.
                self._segment_cue_done.set()
                continue
            if line == _CHITCHAT_REPROMPT_BEEP_TAG:
                self._play_chitchat_reprompt_beep(dev)
                self._arm_prompt_listen_timeout()
                continue
            if (
                isinstance(line, tuple)
                and len(line) == 3
                and line[0] == _CLOUD_PCM_TAG
            ):
                # Cloud-synthesized audio: int16 PCM plus its sample rate.
                _, pcm_i16, sr_cloud = line
                try:
                    pcm_i16 = np.asarray(pcm_i16, dtype=np.int16).reshape(-1)
                    if pcm_i16.size == 0:
                        continue
                    dbg_max = int(np.max(np.abs(pcm_i16)))
                    logger.info(
                        "云端 PCM 解码: samples=%s int16_max_abs=%s (若 max_abs=0 则为全零或"
                        "协议/端序与云端不一致;请在服务端导出同段 WAV 对比)",
                        pcm_i16.size,
                        dbg_max,
                    )
                    # Scale int16 samples to float32 in [-1, 1) for playback.
                    audio_f32 = pcm_i16.astype(np.float32) / 32768.0
                    t_play0 = time.monotonic()
                    play_tts_audio(
                        audio_f32, int(sr_cloud), output_device=dev
                    )
                    print(
                        f"[计时] 云端 TTS 播放 {time.monotonic() - t_play0:.3f}s "
                        f"{pcm_i16.size / int(sr_cloud):.2f}s 音频)",
                        flush=True,
                    )
                    print("[LLM] 已播报。", flush=True)
                except Exception as e:  # noqa: BLE001
                    logger.warning("云端 PCM 播放失败: %s", e, exc_info=True)
                continue
            # Anything else is treated as a text line to be spoken.
            s = (line or "").strip()
            if not s:
                continue
            try:
                if s == _WAKE_GREETING:
                    # Greeting: try cloud TTS, then the WAV on disk, then local TTS.
                    t_w0 = time.monotonic()
                    cloud_ok = self._try_play_line_via_cloud_tts(s, dev)
                    if not cloud_ok:
                        greet_wav = self._ensure_wake_greeting_wav_on_disk()
                        if greet_wav.is_file():
                            play_wav_path(greet_wav, output_device=dev)
                            print(
                                f"[计时] TTS 预生成问候 WAV 播完,耗时 "
                                f"{time.monotonic() - t_w0:.3f}s",
                                flush=True,
                            )
                        else:
                            if tts is None:
                                tts = self._ensure_llm_tts()
                            logger.info("TTS: 开始合成并播放: %r", s)
                            t_syn0 = time.monotonic()
                            audio, sr = tts.synthesize(s)
                            t_syn1 = time.monotonic()
                            play_tts_audio(audio, sr, output_device=dev)
                            t_play1 = time.monotonic()
                            print(
                                f"[计时] TTS 合成 {t_syn1 - t_syn0:.3f}s"
                                f"播放 {t_play1 - t_syn1:.3f}s"
                                f"(本段合计 {t_play1 - t_syn0:.3f}s",
                                flush=True,
                            )
                            logger.info("TTS: 播放完成")
                    else:
                        print(
                            f"[计时] 云端 tts.synthesize 问候,耗时 "
                            f"{time.monotonic() - t_w0:.3f}s",
                            flush=True,
                        )
                    if greeting_batch:
                        self._play_wake_ready_beep(dev)
                else:
                    # Regular reply line: cloud TTS first, local TTS as fallback.
                    t_line0 = time.monotonic()
                    cloud_ok = self._try_play_line_via_cloud_tts(s, dev)
                    if not cloud_ok:
                        if tts is None:
                            tts = self._ensure_llm_tts()
                        logger.info("TTS: 开始合成并播放: %r", s)
                        t_syn0 = time.monotonic()
                        audio, sr = tts.synthesize(s)
                        t_syn1 = time.monotonic()
                        play_tts_audio(audio, sr, output_device=dev)
                        t_play1 = time.monotonic()
                        print(
                            f"[计时] TTS 合成 {t_syn1 - t_syn0:.3f}s"
                            f"播放 {t_play1 - t_syn1:.3f}s"
                            f"(本段合计 {t_play1 - t_syn0:.3f}s",
                            flush=True,
                        )
                        logger.info("TTS: 播放完成")
                    else:
                        print(
                            f"[计时] 云端 tts.synthesize 本段合计 "
                            f"{time.monotonic() - t_line0:.3f}s",
                            flush=True,
                        )
                    print("[LLM] 已播报。", flush=True)
            except Exception as e:  # noqa: BLE001
                # One failed line must not prevent the rest of the batch.
                logger.warning("LLM 播报失败: %s", e, exc_info=True)
    finally:
        if mic_stopped and recover_mic:
            try:
                self.audio_capture.start_stream()
                # Settle delay after restarting the mic, configurable via
                # ROCKET_MIC_RESTART_SETTLE_MS and clamped to 0-2000 ms.
                try:
                    settle_ms = float(
                        os.environ.get("ROCKET_MIC_RESTART_SETTLE_MS", "150")
                    )
                except ValueError:
                    settle_ms = 150.0
                settle_ms = max(0.0, min(2000.0, settle_ms))
                if settle_ms > 0:
                    time.sleep(settle_ms / 1000.0)
                try:
                    self.audio_preprocessor.reset()
                except Exception as e:  # noqa: BLE001
                    logger.debug("audio_preprocessor.reset: %s", e)
                # Reset VAD and both speech buffers after the restart.
                self.vad.reset()
                with self.speech_buffer_lock:
                    self.speech_buffer.clear()
                    self.pre_speech_buffer.clear()
            except Exception as e:  # noqa: BLE001
                logger.error("麦克风恢复失败: %s", e)
        if greeting_batch:
            self._greeting_done.set()
        # Post-playback transition: confirm window, then chit-chat re-prompt,
        # then finishing the wake cycle, in that priority order.
        if self._pending_flight_confirm_after_tts:
            self._pending_flight_confirm_after_tts = False
            self._begin_flight_confirm_listen()
        elif self._pending_chitchat_reprompt_after_tts:
            self._pending_chitchat_reprompt_after_tts = False
            with self._wake_flow_lock:
                self._wake_phase = int(_WakeFlowPhase.ONE_SHOT_LISTEN)
            try:
                self._llm_playback_queue.put_nowait(_CHITCHAT_REPROMPT_BEEP_TAG)
            except queue.Full:
                logger.warning("播报队列已满,跳过闲聊再滴声")
        elif self._pending_finish_wake_cycle_after_tts:
            self._pending_finish_wake_cycle_after_tts = False
            self._finish_wake_cycle()
def _discard_llm_playback_queue(self) -> None:
"""退出时丢弃未播完的大模型 TTS避免 stop() 里 speak_text/sounddevice 长时间阻塞导致 Ctrl+C 无法结束进程。"""
dropped = 0
while True:
try:
self._llm_playback_queue.get_nowait()
dropped += 1
except queue.Empty:
break
if dropped:
logger.info("退出:已丢弃 %s 条待播 LLM 语音", dropped)
@staticmethod
def _chunk_delta_text(chunk: object) -> str:
if not isinstance(chunk, dict):
return ""
choices = chunk.get("choices") or []
if not choices:
return ""
c0 = choices[0]
d = c0.get("delta") if isinstance(c0, dict) else None
if not isinstance(d, dict):
d = c0.get("message") if isinstance(c0, dict) else None
if not isinstance(d, dict):
return ""
raw = d.get("content")
return raw if isinstance(raw, str) else ""
def _enqueue_segment_capped(self, seg: str, budget: int) -> int:
seg = (seg or "").strip()
if not seg or budget <= 0:
return budget
if len(seg) <= budget:
self._enqueue_llm_speak(seg)
return budget - len(seg)
self._enqueue_llm_speak(seg[: max(0, budget - 1)] + "")
return 0
def _finalize_llm_turn(
self,
reply: str,
finish_wake_after_tts: bool,
*,
streamed_chat: bool,
) -> None:
if not reply:
self._enqueue_llm_speak("我没听清,请再说一遍。")
if finish_wake_after_tts:
self._pending_finish_wake_cycle_after_tts = True
return
mode, payload = parse_flight_intent_reply(reply)
with self._chat_session_lock:
self._llm_messages.append({"role": "assistant", "content": reply})
print(f"[LLM] 判定={mode}", flush=True)
print(f"[LLM] 原文: {reply[:500]}{'' if len(reply) > 500 else ''}", flush=True)
if streamed_chat:
if payload is not None and self._flight_payload_requests_takeoff(payload):
threading.Thread(
target=self._run_takeoff_offboard_and_wavs,
daemon=True,
).start()
if finish_wake_after_tts:
self._pending_finish_wake_cycle_after_tts = True
return
if payload is not None:
to_say = str(payload.get("summary") or "好的。").strip()
if self._flight_payload_requests_takeoff(payload):
threading.Thread(
target=self._run_takeoff_offboard_and_wavs,
daemon=True,
).start()
else:
to_say = reply.strip()
if len(to_say) > self._llm_tts_max_chars:
to_say = to_say[: self._llm_tts_max_chars] + ""
self._enqueue_llm_speak(to_say)
if finish_wake_after_tts:
self._pending_finish_wake_cycle_after_tts = True
def _enqueue_cloud_pcm_playback(
self, pcm_int16: np.ndarray, sample_rate_hz: int
) -> None:
if pcm_int16 is None or np.asarray(pcm_int16).size == 0:
return
try:
self._llm_playback_queue.put(
(_CLOUD_PCM_TAG, np.asarray(pcm_int16, dtype=np.int16), int(sample_rate_hz)),
block=False,
)
except queue.Full:
logger.warning("LLM 播报队列已满,跳过云端 PCM")
def _send_socket_command(self, cmd: Command) -> bool:
    """Send one command over the socket client, with its built-in retries.

    Args:
        cmd: Command to send; ``fill_defaults()`` is applied to it first.

    Returns:
        True when the socket client reports successful delivery, else False.
    """
    cmd.fill_defaults()
    delivered = self.socket_client.send_command_with_retry(cmd)
    if not delivered:
        logger.warning("Socket 未送达(已达 max_retries: %s", cmd.command)
        return False
    logger.info("✅ Socket 已发送: %s", cmd.command)
    return True
def _publish_flight_intent_to_ros_bridge(self, flight: dict) -> None:
    """Validate a flight_intent and publish it to ROS through a one-shot subprocess.

    The intent is written to a temporary JSON file that is handed to
    ``voice_drone.tools.publish_flight_intent_ros_once`` running under
    ``bash -lc``. The ROS setup command, topic and subscriber wait time are
    read from ``ROCKET_FLIGHT_BRIDGE_SETUP``, ``ROCKET_FLIGHT_BRIDGE_TOPIC``
    and ``ROCKET_FLIGHT_BRIDGE_WAIT_SUB``.

    Args:
        flight: The flight_intent dictionary to publish.
    """
    validated, problems = parse_flight_intent_dict(flight)
    if problems or validated is None:
        logger.warning("[飞控-ROS桥] flight_intent 校验失败,未发布: %s", problems)
        return

    env = os.environ
    ros_setup = env.get(
        "ROCKET_FLIGHT_BRIDGE_SETUP", "source /opt/ros/noetic/setup.bash"
    ).strip()
    ros_topic = env.get("ROCKET_FLIGHT_BRIDGE_TOPIC", "/input").strip() or "/input"
    try:
        wait_sub = float(env.get("ROCKET_FLIGHT_BRIDGE_WAIT_SUB", "2").strip())
    except ValueError:
        wait_sub = 2.0

    project_root = str(_PROJECT_ROOT)
    payload = json.dumps(flight, ensure_ascii=False)
    fd, tmp_path = tempfile.mkstemp(prefix="flight_intent_", suffix=".json", text=True)
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as fh:
            fh.write(payload)
    except OSError:
        # Best-effort cleanup of both the descriptor and the file.
        for cleanup, target in ((os.close, fd), (os.unlink, tmp_path)):
            try:
                cleanup(target)
            except OSError:
                pass
        logger.warning("[飞控-ROS桥] 无法写入临时 JSON")
        return

    # PYTHONPATH must be appended to: setting it to the project root alone
    # would override the dist-packages path injected by the ROS setup script,
    # and rospy could no longer be found.
    quoted_root = shlex.quote(project_root)
    shell_cmd = (
        f"{ros_setup} && cd {quoted_root} && "
        f"export PYTHONPATH={quoted_root}:$PYTHONPATH && "
        "python3 -m voice_drone.tools.publish_flight_intent_ros_once "
        f"--topic {shlex.quote(ros_topic)} --wait-subscribers {wait_sub} "
        f"{shlex.quote(tmp_path)}"
    )
    try:
        completed = subprocess.run(
            ["bash", "-lc", shell_cmd],
            capture_output=True,
            text=True,
            timeout=60,
        )
    except subprocess.TimeoutExpired:
        logger.warning("[飞控-ROS桥] 子进程超时(>60s")
        return
    except OSError as exc:
        logger.warning("[飞控-ROS桥] 无法启动 bash: %s", exc)
        return
    finally:
        # The temporary file is removed whatever the subprocess outcome.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass

    if completed.returncode == 0:
        logger.info("[飞控-ROS桥] 已发布至 %s", ros_topic)
        return
    logger.warning(
        "[飞控-ROS桥] 发布失败 code=%s stderr=%s",
        completed.returncode,
        (completed.stderr or "").strip()[:800],
    )
def _run_cloud_flight_intent_sequence(self, flight: dict) -> None:
    """Execute a cloud flight_intent step by step (meant for a background thread).

    The intent is validated first. A takeoff step runs the complete offboard
    routine before the following steps (hover / wait / land ...) continue,
    so actions after takeoff are not lost. Other steps are sent as socket
    commands.

    Args:
        flight: The flight_intent dictionary received from the cloud.
    """
    import math

    parsed, errors = parse_flight_intent_dict(flight)
    # Guard against a missing parse result as well, as the ROS bridge
    # publisher does; dereferencing ``parsed`` below would otherwise fail.
    if errors or parsed is None:
        logger.warning("[飞控] flight_intent 校验失败: %s", errors)
        return
    tid = (parsed.trace_id or "").strip() or "-"
    logger.info("[飞控] 开始执行序列 trace_id=%s steps=%d", tid, len(parsed.actions))
    for step, action in enumerate(parsed.actions):
        if isinstance(action, ActionTakeoff):
            alt = action.args.relative_altitude_m
            if alt is not None:
                logger.info(
                    "[飞控] takeoff 请求相对高度 %.2fm当前 offboard 脚本是否使用该参数请自行扩展)",
                    alt,
                )
            self._run_takeoff_offboard_and_wavs()
        elif isinstance(action, ActionLand):
            cmd = Command.create("land", self._get_next_sequence_id())
            self._send_socket_command(cmd)
        elif isinstance(action, ActionReturnHome):
            cmd = Command.create("return_home", self._get_next_sequence_id())
            self._send_socket_command(cmd)
        elif isinstance(action, (ActionHover, ActionHold)):
            cmd = Command.create("hover", self._get_next_sequence_id())
            self._send_socket_command(cmd)
        elif isinstance(action, ActionGoto):
            cmd, err = goto_action_to_command(action, self._get_next_sequence_id())
            if err:
                logger.warning("[飞控] step %d goto: %s", step, err)
                continue
            if cmd is not None:
                self._send_socket_command(cmd)
        elif isinstance(action, ActionWait):
            raw_seconds = action.args.seconds
            try:
                sec = float(raw_seconds)
            except (TypeError, ValueError):
                sec = float("nan")
            # time.sleep() raises on negative, NaN or infinite values; an
            # exception here would abort every remaining step (e.g. a later
            # land), so an invalid wait is skipped instead.
            if not math.isfinite(sec) or sec < 0:
                logger.warning(
                    "[飞控] step %d wait 秒数无效 (%r), 已跳过", step, raw_seconds
                )
                continue
            logger.info("[飞控] step %d wait %.2fs", step, sec)
            time.sleep(sec)
        else:
            logger.warning("[飞控] step %d 未处理的动作类型: %r", step, action)
def _cancel_flight_confirm_timer(self) -> None:
with self._flight_confirm_timer_lock:
t = self._flight_confirm_timer
self._flight_confirm_timer = None
if t is not None:
try:
t.cancel()
except Exception: # noqa: BLE001
pass
def _begin_flight_confirm_listen(self) -> None:
    """Open the spoken confirmation window after the cloud TTS was played.

    Without a pending intent the wake cycle is finished right away.
    Otherwise the wake phase becomes ``FLIGHT_CONFIRM_LISTEN`` and a daemon
    timer is armed that calls ``_on_flight_confirm_timeout`` after the
    intent's ``timeout_sec``.
    """
    self._cancel_prompt_listen_timer()
    with self._flight_confirm_timer_lock:
        pending = self._pending_flight_confirm
        if pending is None:
            logger.warning("[飞控] 无待确认意图,跳过确认窗")
            self._finish_wake_cycle()
            return
        confirm_cfg = pending["confirm"]
        window_sec = float(confirm_cfg["timeout_sec"])
        confirm_words = confirm_cfg["confirm_phrases"]
        cancel_words = confirm_cfg["cancel_phrases"]

    # Drop any timer left over from an earlier window before arming a new one.
    self._cancel_flight_confirm_timer()
    with self._wake_flow_lock:
        self._wake_phase = int(_WakeFlowPhase.FLIGHT_CONFIRM_LISTEN)
    print(
        f"[飞控] 请口头确认 {confirm_words!r} 或取消 {cancel_words!r}"
        f"超时 {window_sec:.0f}s。",
        flush=True,
    )

    def _fire() -> None:
        # Runs on the timer thread; errors are logged rather than raised.
        try:
            self._on_flight_confirm_timeout()
        except Exception as exc:  # noqa: BLE001
            logger.error("确认窗超时处理异常: %s", exc, exc_info=True)

    with self._flight_confirm_timer_lock:
        timer = threading.Timer(window_sec, _fire)
        self._flight_confirm_timer = timer
        timer.daemon = True
        timer.start()
def _on_flight_confirm_timeout(self) -> None:
with self._flight_confirm_timer_lock:
if self._pending_flight_confirm is None:
return
self._pending_flight_confirm = None
self._flight_confirm_timer = None
logger.info("[飞控] 确认窗超时")
self._enqueue_llm_speak(MSG_CONFIRM_TIMEOUT)
self._pending_finish_wake_cycle_after_tts = True
def _handle_flight_confirm_text(self, raw: str) -> None:
    """Interpret an utterance heard while the confirmation window is open.

    A cancel phrase wins over a confirm phrase. On either match the pending
    intent and its timer are cleared; on confirm the flight execution is
    started. Utterances matching neither list are logged and ignored.

    Args:
        raw: Recognized text of the utterance.
    """
    utter = (raw or "").strip()
    if not utter:
        return
    norm = normalize_phrase_text(utter)
    print(f"[飞控-确认窗] {utter!r}", flush=True)

    with self._flight_confirm_timer_lock:
        pending = self._pending_flight_confirm
        if pending is None:
            return
        phrases = pending["confirm"]
        cancel_hit = match_phrase_list(norm, phrases["cancel_phrases"])
        confirm_hit = match_phrase_list(norm, phrases["confirm_phrases"])
        if not cancel_hit and not confirm_hit:
            logger.info("[飞控] 确认窗未命中短语,忽略: %s", utter[:80])
            return
        cancelled = bool(cancel_hit)
        # The intent to execute is only needed on the confirm path.
        confirmed_flight = None if cancelled else pending["flight"]
        self._pending_flight_confirm = None
        timer, self._flight_confirm_timer = self._flight_confirm_timer, None

    # Cancel the timeout timer outside the lock.
    if timer is not None:
        try:
            timer.cancel()
        except Exception:  # noqa: BLE001
            pass

    if cancelled:
        logger.info("[飞控] 用户取消待执行意图")
        self._enqueue_llm_speak(MSG_CANCELLED)
        self._pending_finish_wake_cycle_after_tts = True
        return
    if confirmed_flight is not None:
        logger.info("[飞控] 用户已确认,开始执行 flight_intent")
        self._start_cloud_flight_execution(confirmed_flight)
        self._enqueue_llm_speak(MSG_CONFIRM_EXECUTING)
        self._pending_finish_wake_cycle_after_tts = True
def _start_cloud_flight_execution(self, fi: dict) -> None:
"""ROCKET_CLOUD_EXECUTE_FLIGHT 已通过校验后,起线程执行。"""
if os.environ.get("ROCKET_CLOUD_EXECUTE_FLIGHT", "").lower() not in (
"1",
"true",
"yes",
):
return
if os.environ.get("ROCKET_FLIGHT_INTENT_ROS_BRIDGE", "").lower() in (
"1",
"true",
"yes",
):
threading.Thread(
target=self._publish_flight_intent_to_ros_bridge,
args=(fi,),
daemon=True,
).start()
else:
threading.Thread(
target=self._run_cloud_flight_intent_sequence,
args=(fi,),
daemon=True,
).start()
def _handle_llm_turn(
self, user_msg: str, *, finish_wake_after_tts: bool = False
) -> None:
if self._cloud_voice_enabled and self._cloud_client is not None:
self._handle_llm_turn_cloud(user_msg, finish_wake_after_tts=finish_wake_after_tts)
return
self._handle_llm_turn_local(user_msg, finish_wake_after_tts=finish_wake_after_tts)
def _apply_cloud_dialog_result(
    self,
    result: dict,
    *,
    finish_wake_after_tts: bool,
) -> None:
    """Act on one cloud dialog result: routing, audio playback, next wake state.

    Reads ``protocol``, ``routing``, ``flight_intent``, ``confirm``, ``pcm``,
    ``sample_rate_hz`` and ``chat_reply`` from ``result``.

    Args:
        result: Result dictionary returned by the cloud client for one turn.
        finish_wake_after_tts: Request to finish the wake cycle once the
            queued speech has been played.
    """
    proto = result.get("protocol")
    routing = result.get("routing")
    fi = result.get("flight_intent")
    confirm_raw = result.get("confirm")
    # Set when a spoken confirmation window was scheduled for after the TTS.
    scheduled_flight_confirm = False
    if routing == "flight_intent" and isinstance(fi, dict) and fi.get("is_flight_intent"):
        summary = str(fi.get("summary") or "好的。").strip()
        actions = fi.get("actions") or []
        print(f"[LLM] 判定=飞控意图(云端) summary={summary!r}", flush=True)
        print(f"[LLM] actions={actions!r}", flush=True)
        if proto != CLOUD_VOICE_DIALOG_V1:
            logger.error(
                "[云端] flight_intent 须 protocol=%r,收到 %r;按 v1 拒执行飞控",
                CLOUD_VOICE_DIALOG_V1,
                proto,
            )
        cd = parse_confirm_dict(confirm_raw)
        if cd is None:
            logger.error("[云端] flight_intent 须带合法 confirm 对象v1拒执行飞控")
        exec_enabled = os.environ.get("ROCKET_CLOUD_EXECUTE_FLIGHT", "").lower() in (
            "1",
            "true",
            "yes",
        )
        # Flight execution needs all three: the switch, the v1 protocol and
        # a parsed confirm object.
        if (
            exec_enabled
            and proto == CLOUD_VOICE_DIALOG_V1
            and cd is not None
        ):
            if cd["required"]:
                # Store the intent; the confirm window opens after the TTS.
                scheduled_flight_confirm = True
                with self._flight_confirm_timer_lock:
                    self._pending_flight_confirm = {"flight": fi, "confirm": cd}
                self._pending_flight_confirm_after_tts = True
                logger.info(
                    "[云端] flight_intent 待口头确认pending_id=%s"
                    "播完 TTS 后听确认/超时",
                    cd.get("pending_id"),
                )
            else:
                logger.info(
                    "[云端] flight_intent confirm.required=false将直接执行若已开执行开关"
                )
                self._start_cloud_flight_execution(fi)
        elif exec_enabled and (
            proto != CLOUD_VOICE_DIALOG_V1 or cd is None
        ):
            logger.warning(
                "[云端] 协议或 confirm 不完整,本轮不执行飞控(仍播 TTS"
            )
        else:
            logger.info(
                "[云端] flight_intent 已下发(未设 ROCKET_CLOUD_EXECUTE_FLIGHT仅播报"
            )
    elif routing == "chitchat":
        if proto != CLOUD_VOICE_DIALOG_V1:
            logger.warning(
                "[云端] chitchat 期望 protocol=%r,实际=%r",
                CLOUD_VOICE_DIALOG_V1,
                proto,
            )
        cr = (result.get("chat_reply") or "").strip()
        print(f"[LLM] 判定=闲聊(云端) reply={cr[:200]!r}", flush=True)
    else:
        # Also reached for routing == "flight_intent" when the payload is
        # not a dict or lacks a truthy is_flight_intent.
        logger.warning("未知 routing: %s", routing)
    pcm = result.get("pcm")
    sr = int(result.get("sample_rate_hz") or 24000)
    # Prefer the cloud audio; otherwise speak a text fallback.
    if pcm is not None and np.asarray(pcm).size > 0:
        self._enqueue_cloud_pcm_playback(np.asarray(pcm, dtype=np.int16), sr)
    elif self._cloud_fallback_local:
        if routing == "flight_intent" and isinstance(fi, dict):
            fallback_txt = str(fi.get("summary") or "好的。").strip()
        else:
            fallback_txt = (result.get("chat_reply") or "好的。").strip()
        if fallback_txt:
            self._enqueue_llm_speak(fallback_txt)
    else:
        self._enqueue_llm_speak("未收到云端语音。")
    # Decide what happens after playback. Once chit-chat and a scheduled
    # confirm window are excluded, every remaining branch sets the same
    # finish flag.
    if routing == "chitchat":
        self._pending_chitchat_reprompt_after_tts = True
    elif scheduled_flight_confirm:
        pass
    elif finish_wake_after_tts and not scheduled_flight_confirm:
        self._pending_finish_wake_cycle_after_tts = True
    elif routing == "flight_intent" and not scheduled_flight_confirm:
        self._pending_finish_wake_cycle_after_tts = True
    elif routing not in ("chitchat", "flight_intent"):
        self._pending_finish_wake_cycle_after_tts = True
def _handle_llm_turn_cloud(
    self, user_msg: str, *, finish_wake_after_tts: bool = False
) -> None:
    """Run one text turn against the cloud dialog service and apply the result.

    On any failure the error is printed and logged and
    ``_recover_from_cloud_failure`` takes over with a spoken message.

    Args:
        user_msg: Recognized user text for this turn.
        finish_wake_after_tts: Passed on to the result / recovery handling.
    """
    from voice_drone.core.cloud_voice_client import CloudVoiceError

    assert self._cloud_client is not None
    started = time.monotonic()
    try:
        result = self._cloud_client.run_turn(user_msg)
    except CloudVoiceError as exc:
        print(f"[云端] 失败: {exc} (code={exc.code!r})", flush=True)
        logger.error("云端对话失败: %s", exc, exc_info=True)
        self._recover_from_cloud_failure(
            user_msg,
            finish_wake_after_tts=finish_wake_after_tts,
            idle_speak="云端服务不可用,请稍后再试。",
        )
        return
    except Exception as exc:  # noqa: BLE001
        print(f"[云端] 异常: {exc}", flush=True)
        logger.error("云端对话异常: %s", exc, exc_info=True)
        self._recover_from_cloud_failure(
            user_msg,
            finish_wake_after_tts=finish_wake_after_tts,
            idle_speak="网络异常,请稍后再试。",
        )
        return

    elapsed = time.monotonic() - started
    metrics = result.get("metrics") or {}
    llm_ms = metrics.get("llm_ms")
    first_byte_ms = metrics.get("tts_first_byte_ms")
    timing = (
        f"[计时] 云端一轮(turn.text) {elapsed:.3f}s "
        f"(llm_ms={llm_ms!r}, "
        f"tts_first_byte_ms={first_byte_ms!r})"
    )
    print(timing, flush=True)
    self._apply_cloud_dialog_result(result, finish_wake_after_tts=finish_wake_after_tts)
def _handle_llm_turn_cloud_pcm(
    self,
    pcm_i16: np.ndarray,
    sample_rate_hz: int,
    *,
    finish_wake_after_tts: bool = False,
) -> None:
    """Run one audio turn (PCM uplink) against the cloud and apply the result.

    On failure ``_recover_from_cloud_failure`` is called with an empty user
    message and ``finish_wake_after_tts=True``, whatever the argument
    passed to this method.

    Args:
        pcm_i16: Captured audio for the turn.
        sample_rate_hz: Sample rate of ``pcm_i16``.
        finish_wake_after_tts: Passed on when the result is applied.
    """
    from voice_drone.core.cloud_voice_client import CloudVoiceError

    assert self._cloud_client is not None
    started = time.monotonic()
    try:
        result = self._cloud_client.run_turn_audio(pcm_i16, int(sample_rate_hz))
    except CloudVoiceError as exc:
        print(f"[云端] turn.audio 失败: {exc} (code={exc.code!r})", flush=True)
        logger.error("云端 turn.audio 失败: %s", exc, exc_info=True)
        self._recover_from_cloud_failure(
            "",
            finish_wake_after_tts=True,
            idle_speak="云端语音识别失败,请稍后再试。",
        )
        return
    except Exception as exc:  # noqa: BLE001
        print(f"[云端] turn.audio 异常: {exc}", flush=True)
        logger.error("云端 turn.audio 异常: %s", exc, exc_info=True)
        self._recover_from_cloud_failure(
            "",
            finish_wake_after_tts=True,
            idle_speak="网络异常,请稍后再试。",
        )
        return

    elapsed = time.monotonic() - started
    metrics = result.get("metrics") or {}
    llm_ms = metrics.get("llm_ms")
    first_byte_ms = metrics.get("tts_first_byte_ms")
    timing = (
        f"[计时] 云端一轮(turn.audio) {elapsed:.3f}s "
        f"(llm_ms={llm_ms!r}, "
        f"tts_first_byte_ms={first_byte_ms!r})"
    )
    print(timing, flush=True)
    self._apply_cloud_dialog_result(result, finish_wake_after_tts=finish_wake_after_tts)
def _handle_llm_turn_local(
    self, user_msg: str, *, finish_wake_after_tts: bool = False
) -> None:
    """Run one turn on the local LLM, optionally streaming speech as text arrives.

    The conversation is reset to the system prompt plus ``user_msg`` for
    every turn. In streaming mode a reply whose first non-blank character
    is ``{`` is treated as JSON and not spoken while streaming; any other
    reply is cut into segments and queued for speech as it is generated,
    within a total budget of ``self._llm_tts_max_chars`` characters.

    Args:
        user_msg: Recognized user text for this turn.
        finish_wake_after_tts: When True, flag the wake cycle to finish once
            queued speech has been played.
    """
    llm = self._ensure_llm()
    if llm is None:
        self._enqueue_llm_speak(
            "大模型未就绪。请确认已下载 GGUF或设置环境变量 ROCKET_LLM_GGUF 指向模型文件。"
        )
        if finish_wake_after_tts:
            self._pending_finish_wake_cycle_after_tts = True
        return
    with self._chat_session_lock:
        # No history is carried over between turns.
        self._llm_messages = [
            {"role": "system", "content": FLIGHT_INTENT_CHAT_SYSTEM},
            {"role": "user", "content": user_msg},
        ]
        messages_snapshot = list(self._llm_messages)
    if not self._llm_stream_enabled:
        # Non-streaming: one blocking completion, then a single TTS pass.
        t_llm0 = time.monotonic()
        try:
            out = llm.create_chat_completion(
                messages=messages_snapshot,
                max_tokens=self._llm_max_tokens,
            )
        except Exception as e:  # noqa: BLE001
            dt_llm = time.monotonic() - t_llm0
            print(f"[计时] LLM 推理 {dt_llm:.3f}s失败", flush=True)
            logger.error("LLM 推理失败: %s", e, exc_info=True)
            with self._chat_session_lock:
                # Remove the unanswered user message again.
                if self._llm_messages and self._llm_messages[-1].get("role") == "user":
                    self._llm_messages.pop()
            self._enqueue_llm_speak("推理出错,请稍后再说。")
            if finish_wake_after_tts:
                self._pending_finish_wake_cycle_after_tts = True
            return
        dt_llm = time.monotonic() - t_llm0
        print(f"[计时] LLM 推理 {dt_llm:.3f}s", flush=True)
        reply = (
            (out.get("choices") or [{}])[0].get("message") or {}
        ).get("content", "").strip()
        self._finalize_llm_turn(
            reply, finish_wake_after_tts, streamed_chat=False
        )
        return
    t_llm0 = time.monotonic()
    try:
        stream = llm.create_chat_completion(
            messages=messages_snapshot,
            max_tokens=self._llm_max_tokens,
            stream=True,
        )
    except Exception as e:  # noqa: BLE001
        dt_llm = time.monotonic() - t_llm0
        print(f"[计时] LLM 推理 {dt_llm:.3f}s失败", flush=True)
        logger.error("LLM 推理失败: %s", e, exc_info=True)
        with self._chat_session_lock:
            if self._llm_messages and self._llm_messages[-1].get("role") == "user":
                self._llm_messages.pop()
        self._enqueue_llm_speak("推理出错,请稍后再说。")
        if finish_wake_after_tts:
            self._pending_finish_wake_cycle_after_tts = True
        return
    full_reply = ""
    # Text received but not yet cut into a spoken segment.
    pending = ""
    # Characters that may still be queued for speech in this turn.
    tts_budget = self._llm_tts_max_chars
    # "json" or "chat", decided by the first non-blank character.
    route: str | None = None
    try:
        for chunk in stream:
            content = self._chunk_delta_text(chunk)
            if not content:
                continue
            full_reply += content
            if route is None:
                lead = full_reply.lstrip()
                if lead:
                    route = "json" if lead[0] == "{" else "chat"
            # Only chat replies are spoken while streaming, and only while
            # budget remains.
            if route != "chat" or tts_budget <= 0:
                continue
            pending += content
            while tts_budget > 0 and pending:
                # First try to cut complete sentences off the pending text.
                segs, pending = take_completed_sentences(pending)
                if segs:
                    for seg in segs:
                        tts_budget = self._enqueue_segment_capped(seg, tts_budget)
                        if tts_budget <= 0:
                            break
                    continue
                # No complete sentence: try a forced split driven by
                # self._stream_tts_chunk_chars.
                forced, pending = force_soft_split(
                    pending, self._stream_tts_chunk_chars
                )
                if not forced:
                    break
                for seg in forced:
                    tts_budget = self._enqueue_segment_capped(seg, tts_budget)
                    if tts_budget <= 0:
                        break
    except Exception as e:  # noqa: BLE001
        dt_llm = time.monotonic() - t_llm0
        print(f"[计时] LLM 推理 {dt_llm:.3f}s失败", flush=True)
        logger.error("LLM 流式推理失败: %s", e, exc_info=True)
        with self._chat_session_lock:
            if self._llm_messages and self._llm_messages[-1].get("role") == "user":
                self._llm_messages.pop()
        self._enqueue_llm_speak("推理出错,请稍后再说。")
        if finish_wake_after_tts:
            self._pending_finish_wake_cycle_after_tts = True
        return
    dt_llm = time.monotonic() - t_llm0
    print(f"[计时] LLM 推理 {dt_llm:.3f}s", flush=True)
    reply = full_reply.strip()
    # Flush whatever chat text is left after the stream ended.
    if route == "chat" and tts_budget > 0:
        tail = pending.strip()
        if tail:
            self._enqueue_segment_capped(tail, tts_budget)
    self._finalize_llm_turn(
        reply, finish_wake_after_tts, streamed_chat=(route == "chat")
    )
def start(self) -> None:
    """Start the worker threads, preload models, open the mic and print usage hints.

    Does nothing except log a warning when already running. If opening the
    microphone stream fails, both worker threads are asked to stop and
    joined before the exception is re-raised.
    """
    if self.running:
        logger.warning("识别器已在运营")
        return
    self.running = True
    self.stt_thread = threading.Thread(target=self._stt_worker_thread, daemon=True)
    self.stt_thread.start()
    self.command_thread = threading.Thread(
        target=self._takeoff_only_command_worker, daemon=True
    )
    self.command_thread.start()
    # Preload before opening the mic: otherwise the PortAudio callback keeps
    # filling audio_queue while the main thread has not yet entered
    # process_audio_stream; the queue (only 10 blocks by default) fills up
    # quickly and triggers the "audio queue full, dropping block" message.
    logger.info("voice_drone_assistant: 准备预加载模型(若启用)…")
    self._preload_llm_and_tts_if_enabled()
    try:
        self.audio_capture.start_stream()
    except BaseException:
        # Roll back: stop both workers with a None sentinel, then re-raise.
        self.running = False
        try:
            self.stt_queue.put(None, timeout=0.5)
        except Exception:  # noqa: BLE001
            pass
        try:
            self.command_queue.put(None, timeout=0.5)
        except Exception:  # noqa: BLE001
            pass
        if self.stt_thread is not None:
            self.stt_thread.join(timeout=2.0)
        if self.command_thread is not None:
            self.command_thread.join(timeout=2.0)
        raise
    if self._cloud_voice_enabled:
        logger.info(
            "voice_drone_assistant: 已启动(对话走云端 WebSocketTTS 为云端 PCM飞控见 Socket/offboard"
        )
    else:
        logger.info(
            "voice_drone_assistant: 已启动(无试飞控 Socket大模型答复走 Kokoro TTS"
        )
    # Heuristic: treat a /usr/... libasound.so in LD_PRELOAD as "system
    # ALSA is in use"; otherwise print a recommendation.
    ld = os.environ.get("LD_PRELOAD", "")
    sys_asound = "libasound.so" in ld and "/usr/" in ld
    if not sys_asound:
        print(
            "\n⚠ 建议用系统 ALSA 启动conda 下否则常无声或 VAD 不触发):\n"
            " bash with_system_alsa.sh python main.py\n",
            flush=True,
        )
    # Pick the usage hint matching the active LLM / cloud configuration.
    if self._llm_disabled and not self._cloud_voice_enabled:
        if self._local_keyword_takeoff_enabled:
            llm_hint = "已 ROCKET_LLM_DISABLE=1除 keywords.yaml 中 takeoff 关键词外,其它指令仅打印,不调大模型。\n"
        else:
            llm_hint = (
                "已 ROCKET_LLM_DISABLE=1 且未启用本地口令起飞assistant.local_keyword_takeoff_enabled / "
                "ROCKET_LOCAL_KEYWORD_TAKEOFF指令仅打印不调大模型。\n"
            )
    elif self._cloud_voice_enabled:
        if self._local_keyword_takeoff_enabled:
            llm_hint = "已启用云端对话:非 takeoff 关键词指令经 WebSocket 上云,播报为云端 TTS 流。\n"
        else:
            llm_hint = "已启用云端对话:指令经 WebSocket 上云,播报为云端 TTS 流(本地口令起飞已关闭)。\n"
    else:
        llm_hint = (
            "说「无人机」唤醒后会先播报问候,再听您说一句(不必再带唤醒词);说完后关麦推理,答句播完后再说「"
            f"{self.wake_word_detector.primary}」开始下一轮。非起飞指令走大模型("
            "飞控相关→JSON否则闲聊\n"
        )
    # Banner describing whether the local keyword takeoff path is enabled.
    if self._local_keyword_takeoff_enabled:
        takeoff_banner = (
            "\n本地口令起飞已开启:说「无人机」+ keywords.yaml 里 takeoff 词(如「起飞演示」)→ 播提示音、"
            "启动 scripts/run_px4_offboard_one_terminal.sh串口真机、再播返航提示并结束脚本。\n"
        )
    else:
        takeoff_banner = (
            "\n本地口令起飞已关闭(飞控请用云端 flight_intent / ROS 桥等);"
            "若需恢复 keywords.yaml takeoff → offboard设 assistant.local_keyword_takeoff_enabled: true 或 "
            "ROCKET_LOCAL_KEYWORD_TAKEOFF=1。\n"
        )
    print(
        f"{takeoff_banner}"
        f"{llm_hint}"
        "标记说明:[VAD] 已截段送 STT[STT] 识别文字;[唤醒] 是否含唤醒词;[LLM] 对话与播报。\n"
        "录音已在启动时选好;扬声器可设 ROCKET_TTS_DEVICE。建议bash with_system_alsa.sh python …\n"
        "Ctrl+C 退出。\n",
        flush=True,
    )
def _play_wav_serialized(self, path: Path) -> None:
if not path.is_file():
logger.warning("WAV 文件不存在,跳过播放: %s", path)
return
with self._audio_play_lock:
try:
_play_wav_blocking(path)
except Exception as e: # noqa: BLE001
logger.warning("播放 WAV 失败 %s: %s", path, e, exc_info=True)
def _run_takeoff_offboard_and_wavs(self) -> None:
    """Run the offboard script together with the two takeoff prompts.

    Meant to run on its own thread: start the offboard script, play the
    first WAV, wait 10 s after it finishes, play the second WAV, then
    terminate the script's process group. A repeated trigger while one run
    is in progress is ignored.
    """
    if not _OFFBOARD_SCRIPT.is_file():
        logger.error("未找到 offboard 脚本: %s", _OFFBOARD_SCRIPT)
        return
    # Non-blocking acquire: only one takeoff side task may run at a time.
    acquired = self._takeoff_side_task_busy.acquire(blocking=False)
    if not acquired:
        logger.warning("起飞联动已在执行,忽略重复触发")
        return
    proc: subprocess.Popen | None = None
    try:
        log_path = Path(
            os.environ.get("ROCKET_OFFBOARD_LOG", "/tmp/rocket_drone_offboard_script.log")
        ).expanduser()
        log_f = open(log_path, "ab", buffering=0)
        try:
            # start_new_session=True so that the whole process group can be
            # terminated later.
            proc = subprocess.Popen(
                [
                    "bash",
                    str(_OFFBOARD_SCRIPT),
                    "/dev/ttyACM0",
                    "921600",
                    "20",
                ],
                cwd=str(_PROJECT_ROOT),
                stdout=log_f,
                stderr=subprocess.STDOUT,
                start_new_session=True,
            )
        except Exception as e:  # noqa: BLE001
            logger.error("启动 run_px4_offboard_one_terminal.sh 失败: %s", e, exc_info=True)
            return
        finally:
            # Close this process's handle to the log file in every case.
            log_f.close()
        # Publish the handle so that stop() can terminate the script.
        with self._offboard_proc_lock:
            self._active_offboard_proc = proc
        time.sleep(0.5)
        early_rc = proc.poll()
        if early_rc is not None:
            # NOTE(review): an early exit is only logged; the prompts below
            # are still played. Confirm this is intended.
            logger.error(
                "offboard 一键脚本已立即结束 (exit=%s),未持续运行。日志: %s (常见原因:找不到 "
                "px4_ctrl_offboard_demo.py、ROS 环境、或串口未连)",
                early_rc,
                log_path,
            )
        logger.info(
            "已启动 offboard 一键脚本 (pid=%s),并播放起飞提示音;脚本输出见 %s",
            proc.pid,
            log_path,
        )
        self._play_wav_serialized(_TAKEOFF_ACK_WAV)
        time.sleep(10.0)
        self._play_wav_serialized(_TAKEOFF_DONE_WAV)
    finally:
        # Runs on every exit path after the lock was taken, including errors.
        if proc is not None:
            logger.info("第二段 WAV 已播完,终止 offboard 脚本进程组 (pid=%s)", proc.pid)
            _terminate_process_group(proc)
        with self._offboard_proc_lock:
            if self._active_offboard_proc is proc:
                self._active_offboard_proc = None
        self._takeoff_side_task_busy.release()
def _takeoff_only_command_worker(self) -> None:
    """Command worker loop driving the wake-word flow.

    Takes items from ``self.command_queue`` until ``self.running`` is
    cleared or a ``None`` sentinel arrives. Depending on the wake phase an
    item is dropped, treated as a confirmation answer, processed as the
    one-shot command, or checked for the wake word. A wake word followed by
    a command in the same utterance takes the fast path; a bare wake word
    starts a normal wake cycle.
    """
    logger.info("唤醒流程命令线程已启动")
    while self.running:
        try:
            # Short timeout so that the loop re-checks self.running regularly.
            text = self.command_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        except Exception as e:  # noqa: BLE001
            logger.error(f"命令处理线程错误: {e}", exc_info=True)
            continue
        try:
            if text is None:
                # Stop sentinel.
                break
            try:
                # A (marker, pcm, sample_rate) tuple is a raw audio turn
                # for the cloud uplink.
                if (
                    isinstance(text, tuple)
                    and len(text) == 3
                    and text[0] == _PCM_TURN_MARKER
                ):
                    self._handle_pcm_uplink_turn(text[1], int(text[2]))
                    continue
                with self._wake_flow_lock:
                    phase = self._wake_phase
                # Text arriving while busy or during the greeting is dropped.
                if phase == int(_WakeFlowPhase.LLM_BUSY):
                    continue
                if phase == int(_WakeFlowPhase.GREETING_WAIT):
                    continue
                if phase == int(_WakeFlowPhase.FLIGHT_CONFIRM_LISTEN):
                    self._handle_flight_confirm_text(text)
                    continue
                if phase == int(_WakeFlowPhase.ONE_SHOT_LISTEN):
                    with self._wake_flow_lock:
                        self._wake_phase = int(_WakeFlowPhase.LLM_BUSY)
                    self._process_one_shot_command(text)
                    continue
                # Any other phase: only react to the wake word.
                is_wake, matched = self.wake_word_detector.detect(text)
                if not is_wake:
                    logger.debug("未检测到唤醒词,忽略: %s", text)
                    if os.environ.get("ROCKET_PRINT_STT", "").lower() in (
                        "1",
                        "true",
                        "yes",
                    ):
                        print(
                            f"[唤醒] 未命中「{self.wake_word_detector.primary}」,原文: {text!r}",
                            flush=True,
                        )
                    continue
                logger.info("唤醒词命中: %s", matched)
                command_text = self.wake_word_detector.extract_command_text(text)
                follow = (command_text or "").strip()
                if follow:
                    # Wake word plus command in one utterance: fast path.
                    # NOTE(review): both outcomes continue, so the return
                    # value is not used here.
                    if not self._wake_fast_path_process_follow(follow):
                        continue
                    continue
                self._begin_wake_cycle(None)
            except Exception as e:  # noqa: BLE001
                logger.error("命令处理失败: %s", e, exc_info=True)
        finally:
            # Exactly one task_done() per dequeued item, also on break/continue.
            self.command_queue.task_done()
    logger.info("唤醒流程命令线程已停止")
def stop(self) -> None:
    """Stop recognition and release resources.

    Cancels the timers, clears any pending flight confirmation, asks both
    worker threads to stop and joins them briefly, drops unplayed speech,
    terminates a still-running offboard script, closes the microphone
    stream and the cloud client, and disconnects the socket client if it is
    connected. Does nothing when not running.
    """
    if not self.running:
        return
    self.running = False
    self._cancel_prompt_listen_timer()
    self._cancel_flight_confirm_timer()
    with self._flight_confirm_timer_lock:
        self._pending_flight_confirm = None
    self._pending_flight_confirm_after_tts = False
    # Deliver the stop sentinels with a bounded wait, as start() does on its
    # failure path: an unbounded put() on a full bounded queue would hang
    # shutdown.
    for worker, work_queue in (
        (self.stt_thread, self.stt_queue),
        (self.command_thread, self.command_queue),
    ):
        if worker is None:
            continue
        try:
            work_queue.put(None, timeout=0.5)
        except queue.Full:
            logger.warning("退出: 停止信号入队超时 (队列已满)")
    if self.stt_thread is not None:
        self.stt_thread.join(timeout=2.0)
    if self.command_thread is not None:
        self.command_thread.join(timeout=2.0)
    # No speech is played from this thread: it could block for seconds to
    # tens of seconds, and repeated Ctrl+C would still not end the process.
    self._discard_llm_playback_queue()
    with self._offboard_proc_lock:
        op = self._active_offboard_proc
        self._active_offboard_proc = None
    if op is not None and op.poll() is None:
        logger.info("主程序退出:终止仍在运行的 offboard 脚本")
        _terminate_process_group(op)
    try:
        self.audio_capture.stop_stream()
    except KeyboardInterrupt:
        logger.info("关闭麦克风流时中断,跳过")
    except Exception as e:  # noqa: BLE001
        logger.warning("关闭麦克风流失败: %s", e)
    if self._cloud_client is not None:
        try:
            self._cloud_client.close()
        except Exception as e:  # noqa: BLE001
            logger.debug("关闭云端 WebSocket: %s", e)
    if self.socket_client.connected:
        self.socket_client.disconnect()
    logger.info("voice_drone_assistant 已停止")
    print("\n已退出。", flush=True)
def main() -> None:
    """Command-line entry point: choose the input device, then run the assistant.

    The recording device index comes from ``--input-index`` or, failing
    that, from ``ROCKET_INPUT_DEVICE_INDEX``. Without an index the device
    is chosen interactively unless ``--non-interactive`` or
    ``ROCKET_NON_INTERACTIVE`` is set, in which case the setting from
    system.yaml is used.
    """
    ap = argparse.ArgumentParser(
        description="无人机语音:唤醒 → 问候 → 一句指令 → 起飞或 LLM 播报 → 再唤醒"
    )
    ap.add_argument(
        "--input-index",
        "-I",
        type=int,
        default=None,
        help="跳过交互菜单,直接指定 PyAudio 录音设备索引与启动时「PyAudio_index=」一致)。",
    )
    ap.add_argument(
        "--non-interactive",
        action="store_true",
        help="不选设备:用 system.yaml 的 audio.input_device_index为 null 时自动枚举默认可录音设备)。",
    )
    ap.add_argument(
        "--no-preload",
        action="store_true",
        help="不预加载 Qwen/Kokoro缩短启动时间首轮对话与首次播报会变慢",
    )
    args = ap.parse_args()
    non_inter = args.non_interactive or os.environ.get(
        "ROCKET_NON_INTERACTIVE", ""
    ).lower() in ("1", "true", "yes")
    idx = args.input_index
    if idx is None:
        raw_ix = os.environ.get("ROCKET_INPUT_DEVICE_INDEX", "").strip()
        if raw_ix:
            # Let int() decide what counts as an integer: str.isdigit() also
            # accepts characters such as superscript digits that int()
            # rejects, which used to crash here. Invalid values are now
            # reported instead of being ignored silently.
            try:
                idx = int(raw_ix)
            except ValueError:
                logger.warning(
                    "ROCKET_INPUT_DEVICE_INDEX=%r 不是有效整数, 已忽略", raw_ix
                )
    if idx is not None:
        from voice_drone.core.mic_device_select import apply_input_device_index_only
        apply_input_device_index_only(idx)
        logger.info("录音设备: PyAudio 索引 %sCLI/环境变量)", idx)
    elif not non_inter:
        from voice_drone.core.mic_device_select import (
            apply_input_device_index_only,
            prompt_for_input_device_index,
        )
        chosen = prompt_for_input_device_index()
        apply_input_device_index_only(chosen)
    else:
        logger.info(
            "非交互模式:使用 system.yaml 的 audio.input_device_indexnull=自动探测)"
        )
    app = TakeoffPrintRecognizer(skip_model_preload=args.no_preload)
    try:
        app.run()
    except KeyboardInterrupt:
        logger.info("用户中断")
    finally:
        # Make sure resources are released even when run() exits abnormally.
        if app.running:
            app.stop()
# Run the CLI entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()