""" 高性能实时语音识别与命令生成系统 整合所有模块,实现从语音检测到命令发送的完整流程: 1. 音频采集(高性能模式) 2. 音频预处理(降噪+AGC) 3. VAD语音活动检测 4. STT语音识别 5. 文本预处理(纠错+参数提取) 6. 命令生成 7. Socket发送 性能优化: - 多线程异步处理 - 非阻塞音频采集 - LRU缓存优化 - 低延迟设计 """ import math import numpy as np import os import random import threading import queue import time from typing import Callable, Dict, List, Optional, Tuple, TYPE_CHECKING from voice_drone.core.audio import AudioCapture, AudioPreprocessor from voice_drone.core.vad import VAD from voice_drone.core.stt import STT from voice_drone.core.text_preprocessor import TextPreprocessor, get_preprocessor from voice_drone.core.command import Command from voice_drone.core.scoket_client import SocketClient from voice_drone.core.configuration import ( SYSTEM_AUDIO_CONFIG, SYSTEM_RECOGNIZER_CONFIG, SYSTEM_SOCKET_SERVER_CONFIG, ) from voice_drone.core.tts_ack_cache import ( compute_ack_pcm_fingerprint, load_cached_phrases, persist_phrases, ) from voice_drone.core.wake_word import WakeWordDetector, get_wake_word_detector from voice_drone.logging_ import get_logger if TYPE_CHECKING: from voice_drone.core.tts import KokoroOnnxTTS logger = get_logger("recognizer") class VoiceCommandRecognizer: """ 高性能实时语音命令识别器 完整的语音转命令系统,包括: - 音频采集和预处理 - 语音活动检测 - 语音识别 - 文本预处理和参数提取 - 命令生成 - Socket发送 """ def __init__(self, auto_connect_socket: bool = True): """ 初始化语音命令识别器 Args: auto_connect_socket: 是否自动连接Socket服务器 """ logger.info("初始化语音命令识别系统...") # 初始化各模块 self.audio_capture = AudioCapture() self.audio_preprocessor = AudioPreprocessor() self.vad = VAD() self.stt = STT() self.text_preprocessor = get_preprocessor() # 使用全局单例 self.wake_word_detector = get_wake_word_detector() # 使用全局单例 # Socket客户端 self.socket_client = SocketClient(SYSTEM_SOCKET_SERVER_CONFIG) self.auto_connect_socket = auto_connect_socket if self.auto_connect_socket: if not self.socket_client.connect(): logger.warning("Socket连接失败,将在发送命令时重试") # 语音段缓冲区 self.speech_buffer: list = [] # 存储语音音频块 self.speech_buffer_lock = threading.Lock() # 预缓冲区:保存语音检测前一小段音频,避免丢失开头 # 例如:pre_speech_max_seconds = 0.8 表示保留最近约 0.8 秒音频 self.pre_speech_buffer: list = [] # 存储最近的静音/背景音块 # 从系统配置读取(确保类型正确:YAML 可能把数值当字符串) self.pre_speech_max_seconds: float = float( SYSTEM_RECOGNIZER_CONFIG.get("pre_speech_max_seconds", 0.8) ) self.pre_speech_max_chunks: Optional[int] = None # 根据采样率和chunk大小动态计算 # 命令发送成功后的 TTS 反馈(懒加载 Kokoro,避免拖慢启动) self.ack_tts_enabled = bool(SYSTEM_RECOGNIZER_CONFIG.get("ack_tts_enabled", True)) self.ack_tts_text = str(SYSTEM_RECOGNIZER_CONFIG.get("ack_tts_text", "好的收到")).strip() self.ack_tts_phrases: Dict[str, List[str]] = self._normalize_ack_tts_phrases( SYSTEM_RECOGNIZER_CONFIG.get("ack_tts_phrases") ) # True:仅 ack_tts_phrases 中出现的命令会播报,且每次随机一句;False:全局 ack_tts_text(所有成功命令同一应答) self._ack_mode_phrases: bool = bool(self.ack_tts_phrases) self.ack_tts_prewarm = bool(SYSTEM_RECOGNIZER_CONFIG.get("ack_tts_prewarm", True)) self.ack_tts_prewarm_blocking = bool( SYSTEM_RECOGNIZER_CONFIG.get("ack_tts_prewarm_blocking", True) ) self.ack_pause_mic_for_playback = bool( SYSTEM_RECOGNIZER_CONFIG.get("ack_pause_mic_for_playback", True) ) self.ack_tts_disk_cache = bool( SYSTEM_RECOGNIZER_CONFIG.get("ack_tts_disk_cache", True) ) self._tts_engine: Optional["KokoroOnnxTTS"] = None # 阻塞预加载时缓存波形:全局单句 _tts_ack_pcm,或按命令随机模式下的 _tts_phrase_pcm_cache(每句一条) self._tts_ack_pcm: Optional[Tuple[np.ndarray, int]] = None self._tts_phrase_pcm_cache: Dict[str, Tuple[np.ndarray, int]] = {} self._tts_lock = threading.Lock() # 命令线程只入队,主线程 process_audio_stream 中统一播放(避免 Windows 下后台线程 sd.play 无声) self._ack_playback_queue: queue.Queue = queue.Queue(maxsize=8) # STT识别线程和队列 self.stt_queue = queue.Queue(maxsize=5) # STT识别队列 self.stt_thread: Optional[threading.Thread] = None # 命令处理线程和队列 self.command_queue = queue.Queue(maxsize=10) # 命令处理队列 self.command_thread: Optional[threading.Thread] = None # 运行状态 self.running = False # 命令序列号(用于去重和顺序保证) self.sequence_id = 0 self.sequence_lock = threading.Lock() logger.info( f"应答TTS配置: enabled={self.ack_tts_enabled}, " f"mode={'按命令随机短语' if self._ack_mode_phrases else '全局固定文案'}, " f"prewarm_blocking={self.ack_tts_prewarm_blocking}, " f"pause_mic={self.ack_pause_mic_for_playback}, " f"disk_cache={self.ack_tts_disk_cache}" ) if self._ack_mode_phrases: logger.info(f" 仅播报命令: {list(self.ack_tts_phrases.keys())}") # VAD 后端:silero(默认)或 energy(按块 RMS,Silero 在部分板载麦上长期无段时使用) _ev_env = os.environ.get("ROCKET_ENERGY_VAD", "").lower() in ( "1", "true", "yes", ) _yaml_backend = str( SYSTEM_RECOGNIZER_CONFIG.get("vad_backend", "silero") ).lower() self._use_energy_vad: bool = _ev_env or _yaml_backend == "energy" self._energy_rms_high: float = float( SYSTEM_RECOGNIZER_CONFIG.get("energy_vad_rms_high", 280) ) self._energy_rms_low: float = float( SYSTEM_RECOGNIZER_CONFIG.get("energy_vad_rms_low", 150) ) self._energy_start_chunks: int = int( SYSTEM_RECOGNIZER_CONFIG.get("energy_vad_start_chunks", 4) ) self._energy_end_chunks: int = int( SYSTEM_RECOGNIZER_CONFIG.get("energy_vad_end_chunks", 15) ) # 高噪底/AGC 下 RMS 几乎不低于 energy_vad_rms_low 时,用「相对本段峰值」辅助判停 self._energy_end_peak_ratio: float = float( SYSTEM_RECOGNIZER_CONFIG.get("energy_vad_end_peak_ratio", 0.88) ) # 说话过程中对 utt 峰值每块乘衰减再与当前 rms 取 max,避免前几个字特响导致后半句一直被判「相对衰减」而误切段 self._energy_utt_peak_decay: float = float( SYSTEM_RECOGNIZER_CONFIG.get("energy_vad_utt_peak_decay", 0.988) ) self._energy_utt_peak_decay = max(0.95, min(0.9999, self._energy_utt_peak_decay)) self._ev_speaking: bool = False self._ev_high_run: int = 0 self._ev_low_run: int = 0 self._ev_rms_peak: float = 0.0 self._ev_last_diag_time: float = 0.0 self._ev_utt_peak: float = 0.0 # 可选:能量 VAD 刚进入「正在说话」时回调(用于机端 PROMPT_LISTEN 计时清零等) self._vad_speech_start_hook: Optional[Callable[[], None]] = None _trail_raw = SYSTEM_RECOGNIZER_CONFIG.get("trailing_silence_seconds") if _trail_raw is not None: _trail = float(_trail_raw) if _trail > 0: fs = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024)) sr = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000)) if fs > 0 and sr > 0: n_end = max(1, int(math.ceil(_trail * sr / fs))) self._energy_end_chunks = n_end self.vad.silence_end_frames = n_end logger.info( "VAD 句尾切段:trailing_silence_seconds=%.2f → 连续静音块数=%d " "(每块≈%.0fms,Silero 与 energy 共用)", _trail, n_end, 1000.0 * fs / sr, ) if self._use_energy_vad: logger.info( "VAD 后端: energy(RMS)" f" high={self._energy_rms_high} low={self._energy_rms_low} " f"start_chunks={self._energy_start_chunks} end_chunks={self._energy_end_chunks}" f" end_peak_ratio={self._energy_end_peak_ratio}" f" utt_peak_decay={self._energy_utt_peak_decay}" ) logger.info("语音命令识别系统初始化完成") @staticmethod def _normalize_ack_tts_phrases(raw) -> Dict[str, List[str]]: """YAML: ack_tts_phrases: { takeoff: [\"...\", ...], ... }""" result: Dict[str, List[str]] = {} if not isinstance(raw, dict): return result for k, v in raw.items(): key = str(k).strip() if not key: continue if isinstance(v, list): phrases = [str(x).strip() for x in v if str(x).strip()] elif isinstance(v, str) and v.strip(): phrases = [v.strip()] else: phrases = [] if phrases: result[key] = phrases return result def _has_ack_tts_content(self) -> bool: if self._ack_mode_phrases: return any(bool(v) for v in self.ack_tts_phrases.values()) return bool(self.ack_tts_text) def _pick_ack_phrase(self, command_name: str) -> Optional[str]: if self._ack_mode_phrases: phrases = self.ack_tts_phrases.get(command_name) if not phrases: return None return random.choice(phrases) return self.ack_tts_text or None def _get_cached_pcm_for_phrase(self, phrase: str) -> Optional[Tuple[np.ndarray, int]]: """若启动阶段已预合成该句,则返回缓存,播报时不再跑 ONNX(低延迟)。""" if self._ack_mode_phrases: return self._tts_phrase_pcm_cache.get(phrase) if self._tts_ack_pcm is not None: return self._tts_ack_pcm return None def _ensure_tts_engine(self) -> "KokoroOnnxTTS": """懒加载 Kokoro(双检锁,避免多线程重复加载)。""" from voice_drone.core.tts import KokoroOnnxTTS if self._tts_engine is not None: return self._tts_engine with self._tts_lock: if self._tts_engine is None: logger.info("TTS: 正在加载 Kokoro 模型(首次约需十余秒)…") self._tts_engine = KokoroOnnxTTS() logger.info("TTS: Kokoro 模型加载完成") assert self._tts_engine is not None return self._tts_engine def _enqueue_ack_playback(self, command_name: str) -> None: """ 命令已成功发出后,将待播音频交给主线程队列。 不在此线程直接调用 sounddevice:Windows 上后台线程常出现播放完全无声。 """ if not self.ack_tts_enabled: return phrase = self._pick_ack_phrase(command_name) if not phrase: return try: cached = self._get_cached_pcm_for_phrase(phrase) if cached is not None: audio, sr = cached self._ack_playback_queue.put(("pcm", audio.copy(), sr), block=False) logger.info( f"命令已发送,已排队语音应答(主线程播放,预缓存): {phrase!r}" ) print(f"[TTS] 已排队语音应答(主线程播放,预缓存): {phrase!r}", flush=True) else: self._ack_playback_queue.put(("synth", phrase), block=False) logger.info( f"命令已发送,已排队语音应答(主线程合成+播放,无缓存,可能有数秒延迟): {phrase!r}" ) print( f"[TTS] 已排队语音应答(主线程合成+播放,无缓存): {phrase!r}", flush=True, ) except queue.Full: logger.warning("应答语音播放队列已满,跳过本次") def _before_audio_iteration(self) -> None: """主循环每轮开头(主线程):子类可扩展以播放其它排队 TTS。""" self._drain_ack_playback_queue() def _drain_ack_playback_queue(self, recover_mic: bool = True) -> None: """在主线程中播放队列中的应答(与麦克风采集同进程、同主循环线程)。 Args: recover_mic: 播完后是否恢复麦克风;退出 shutdown 时应为 False,避免与 stop() 中关流冲突。 """ from voice_drone.core.tts import play_tts_audio, speak_text items: list = [] while True: try: items.append(self._ack_playback_queue.get_nowait()) except queue.Empty: break if not items: return mic_stopped = False if self.ack_pause_mic_for_playback: try: logger.info( "TTS: 已暂停麦克风采集以便扬声器播放(避免 Windows 下输入/输出同时开无声)" ) self.audio_capture.stop_stream() mic_stopped = True except Exception as e: logger.warning(f"暂停麦克风失败,将尝试直接播放: {e}") try: for item in items: try: kind = item[0] if kind == "pcm": _, audio, sr = item logger.info("TTS: 主线程播放应答(预缓存波形)") play_tts_audio(audio, sr) logger.info("TTS: 播放完成") elif kind == "synth": logger.info("TTS: 主线程合成并播放应答(无预缓存)") tts = self._ensure_tts_engine() text = item[1] if len(item) >= 2 else (self.ack_tts_text or "") speak_text(text, tts=tts) except Exception as e: logger.warning(f"应答语音播放失败: {e}", exc_info=True) finally: if mic_stopped and recover_mic: try: self.audio_capture.start_stream() try: self.audio_preprocessor.reset() except Exception as e: # noqa: BLE001 logger.debug("audio_preprocessor.reset: %s", e) # TTS 暂停期间若未凑齐「尾静音」帧,VAD 会一直保持 is_speaking=True; # 恢复后 detect_speech_start 会直接放弃,表现为「能恢复采集但再也不识别」。 self.vad.reset() with self.speech_buffer_lock: self.speech_buffer.clear() self.pre_speech_buffer.clear() logger.info("TTS: 麦克风采集已恢复(已重置 VAD 与语音缓冲)") except Exception as e: logger.error(f"麦克风采集恢复失败,请重启程序: {e}", exc_info=True) def _prewarm_tts_async(self) -> None: """后台预加载 TTS(仅当未使用阻塞预加载时)。""" if not self.ack_tts_enabled or not self._has_ack_tts_content() or not self.ack_tts_prewarm: return def _run() -> None: try: self._ensure_tts_engine() if self._ack_mode_phrases: logger.warning( "TTS: 当前为「按命令随机短语」且未使用阻塞预加载," "各句首次播报可能仍有数秒延迟;若需低延迟请将 ack_tts_prewarm_blocking 设为 true。" ) except Exception as e: logger.warning(f"TTS 预加载失败(将在首次播报时重试): {e}", exc_info=True) threading.Thread(target=_run, daemon=True, name="tts-prewarm").start() def _prewarm_tts_blocking(self) -> None: """启动时准备应答 PCM:优先读磁盘缓存(文案与 TTS 配置未变则跳过合成);必要时加载 Kokoro 并合成。""" if not self.ack_tts_enabled or not self._has_ack_tts_content() or not self.ack_tts_prewarm: return use_disk = self.ack_tts_disk_cache logger.info("TTS: 正在准备语音反馈(磁盘缓存 / 合成)…") print("正在加载语音反馈…") try: if self._ack_mode_phrases: self._tts_phrase_pcm_cache.clear() seen: set = set() unique: List[str] = [] for lst in self.ack_tts_phrases.values(): for t in lst: p = str(t).strip() if p and p not in seen: seen.add(p) unique.append(p) if not unique: return fingerprint = compute_ack_pcm_fingerprint(unique, mode_phrases=True) missing = list(unique) if use_disk: loaded, missing = load_cached_phrases(unique, fingerprint) for ph, pcm in loaded.items(): self._tts_phrase_pcm_cache[ph] = pcm if not missing: self._tts_ack_pcm = None logger.info( "TTS: 已从磁盘加载全部应答波形(%d 句),跳过 Kokoro 加载与合成", len(unique), ) print("语音反馈已就绪(本地缓存),可以开始说话下指令。") return self._ensure_tts_engine() assert self._tts_engine is not None need = [p for p in unique if p not in self._tts_phrase_pcm_cache] for j, phrase in enumerate(need, start=1): logger.info( f"TTS: 合成应答句 {j}/{len(need)}: {phrase!r}" ) audio, sr = self._tts_engine.synthesize(phrase) self._tts_phrase_pcm_cache[phrase] = (audio, sr) self._tts_ack_pcm = None if use_disk: persist_phrases(fingerprint, dict(self._tts_phrase_pcm_cache)) logger.info( "TTS: 语音反馈已就绪(随机应答已缓存,播报低延迟)" ) print("语音反馈引擎已就绪,可以开始说话下指令。") else: text = (self.ack_tts_text or "").strip() if not text: return fingerprint = compute_ack_pcm_fingerprint( [], global_text=text, mode_phrases=False ) missing = [text] if use_disk: loaded, missing = load_cached_phrases([text], fingerprint) if text in loaded: self._tts_ack_pcm = loaded[text] if not missing: logger.info( "TTS: 已从磁盘加载全局应答波形,跳过 Kokoro 加载与合成" ) print("语音反馈已就绪(本地缓存),可以开始说话下指令。") return self._ensure_tts_engine() assert self._tts_engine is not None audio, sr = self._tts_engine.synthesize(text) self._tts_ack_pcm = (audio, sr) if use_disk: persist_phrases(fingerprint, {text: self._tts_ack_pcm}) logger.info( "TTS: 语音反馈引擎已就绪;已缓存应答语音,命令成功后将快速播报" ) print("语音反馈引擎已就绪,可以开始说话下指令。") except Exception as e: logger.warning( f"TTS: 启动阶段预加载失败,命令成功后可能延迟或无语音反馈: {e}", exc_info=True, ) @staticmethod def _init_sounddevice_output_probe() -> None: """在主线程探测默认输出设备;应答播报必须在主线程调用 sd.play。""" try: from voice_drone.core.tts import log_sounddevice_output_devices log_sounddevice_output_devices() import sounddevice as sd # type: ignore from voice_drone.core.tts import _sounddevice_default_output_index out_idx = _sounddevice_default_output_index() if out_idx is not None and int(out_idx) >= 0: info = sd.query_devices(int(out_idx)) logger.info( f"sounddevice 默认输出设备: {info.get('name', '?')} (index={out_idx})" ) sd.check_output_settings(samplerate=24000, channels=1, dtype="float32") # 预解析 tts.output_device,启动日志中可见实际用于播放的设备 from voice_drone.core.tts import get_playback_output_device_id get_playback_output_device_id() except Exception as e: logger.warning(f"sounddevice 输出设备探测失败,可能导致无法播音: {e}") def _get_next_sequence_id(self) -> int: """获取下一个命令序列号""" with self.sequence_lock: self.sequence_id += 1 return self.sequence_id @staticmethod def _int16_chunk_rms(chunk: np.ndarray) -> float: if chunk.size == 0: return 0.0 return float(np.sqrt(np.mean(chunk.astype(np.float64) ** 2))) def _submit_concatenated_speech_to_stt(self) -> None: """在持有 speech_buffer_lock 时调用:合并 speech_buffer 并送 STT,然后清空。""" if len(self.speech_buffer) == 0: return speech_audio = np.concatenate(self.speech_buffer) self.speech_buffer.clear() min_samples = int(self.audio_capture.sample_rate * 0.5) if len(speech_audio) >= min_samples: try: self.stt_queue.put(speech_audio.copy(), block=False) logger.debug( f"提交语音段到STT队列,长度: {len(speech_audio)} 采样点" ) if os.environ.get("ROCKET_PRINT_VAD", "").lower() in ( "1", "true", "yes", ): print( f"[VAD] 已送 STT,{len(speech_audio)} 采样点(≈{len(speech_audio) / float(self.audio_capture.sample_rate):.2f}s)", flush=True, ) except queue.Full: logger.warning("STT队列已满,跳过本次识别") elif os.environ.get("ROCKET_PRINT_VAD", "").lower() in ( "1", "true", "yes", ): print( f"[VAD] 语音段太短已丢弃({len(speech_audio)} < {min_samples} 采样)", flush=True, ) def _energy_vad_on_chunk(self, processed_chunk: np.ndarray) -> None: rms = self._int16_chunk_rms(processed_chunk) _vad_diag = os.environ.get("ROCKET_PRINT_VAD", "").lower() in ( "1", "true", "yes", ) if _vad_diag: self._ev_rms_peak = max(self._ev_rms_peak, rms) now = time.monotonic() if now - self._ev_last_diag_time >= 3.0: print( f"[VAD] energy 诊断:近 3s 块 RMS 峰值≈{self._ev_rms_peak:.0f} " f"(high={self._energy_rms_high} low={self._energy_rms_low})", flush=True, ) self._ev_rms_peak = 0.0 self._ev_last_diag_time = now if not self._ev_speaking: if rms >= self._energy_rms_high: self._ev_high_run += 1 else: self._ev_high_run = 0 if self._ev_high_run >= self._energy_start_chunks: self._ev_speaking = True self._ev_high_run = 0 self._ev_low_run = 0 self._ev_utt_peak = rms hook = self._vad_speech_start_hook if hook is not None: try: hook() except Exception as e: # noqa: BLE001 logger.debug("vad_speech_start_hook: %s", e, exc_info=True) with self.speech_buffer_lock: if self.pre_speech_buffer: self.speech_buffer = list(self.pre_speech_buffer) else: self.speech_buffer.clear() self.speech_buffer.append(processed_chunk) logger.debug( "energy VAD: 开始收集语音段(含预缓冲约 %.2f s)", self.pre_speech_max_seconds, ) return with self.speech_buffer_lock: self.speech_buffer.append(processed_chunk) self._ev_utt_peak = max(rms, self._ev_utt_peak * self._energy_utt_peak_decay) below_abs = rms <= self._energy_rms_low below_rel = ( self._energy_end_peak_ratio > 0 and self._ev_utt_peak >= self._energy_rms_high and rms <= self._ev_utt_peak * self._energy_end_peak_ratio ) if below_abs or below_rel: self._ev_low_run += 1 else: self._ev_low_run = 0 if self._ev_low_run >= self._energy_end_chunks: self._ev_speaking = False self._ev_low_run = 0 self._ev_utt_peak = 0.0 with self.speech_buffer_lock: self._submit_concatenated_speech_to_stt() self._reset_agc_after_utterance_end() logger.debug("energy VAD: 语音段结束,已提交") def _reset_agc_after_utterance_end(self) -> None: """VAD 句尾:清 AGC 滑窗,避免巨响后 RMS 卡死。""" try: self.audio_preprocessor.reset_agc_state() except AttributeError: pass def discard_pending_stt_segments(self) -> int: """丢弃尚未被 STT 线程取走的整句,避免唤醒/播 TTS 关麦后仍识别旧段。""" n = 0 while True: try: self.stt_queue.get_nowait() self.stt_queue.task_done() n += 1 except queue.Empty: break if n: logger.info( "已丢弃 %s 条待 STT 的语音段(流程切换,避免与播 TTS 重叠)", n, ) return n def _stt_worker_thread(self): """STT识别工作线程(异步处理,不阻塞主流程)""" logger.info("STT识别线程已启动") while self.running: try: audio_data = self.stt_queue.get(timeout=0.1) except queue.Empty: continue except Exception as e: logger.error(f"STT工作线程错误: {e}", exc_info=True) continue try: if audio_data is None: break try: text = self.stt.invoke_numpy(audio_data) if os.environ.get("ROCKET_PRINT_STT", "").lower() in ( "1", "true", "yes", ): print( f"[STT] {text!r}" if (text and text.strip()) else "[STT] <空或不识别>", flush=True, ) if text and text.strip(): logger.info(f"🎤 STT识别结果: {text}") try: self.command_queue.put(text, block=False) logger.debug(f"文本已提交到命令处理队列: {text}") except queue.Full: logger.warning("命令处理队列已满,跳过本次识别结果") except Exception as e: logger.error(f"STT识别失败: {e}", exc_info=True) finally: self.stt_queue.task_done() logger.info("STT识别线程已停止") def _command_worker_thread(self): """命令处理工作线程(文本预处理+命令生成+Socket发送)""" logger.info("命令处理线程已启动") while self.running: try: text = self.command_queue.get(timeout=0.1) except queue.Empty: continue except Exception as e: logger.error(f"命令处理线程错误: {e}", exc_info=True) continue try: if text is None: break try: # 1. 检测唤醒词 is_wake, matched_wake_word = self.wake_word_detector.detect(text) if not is_wake: logger.debug(f"未检测到唤醒词,忽略文本: {text}") continue logger.info(f"🔔 检测到唤醒词: {matched_wake_word}") # 2. 提取命令文本(移除唤醒词) command_text = self.wake_word_detector.extract_command_text(text) if not command_text or not command_text.strip(): logger.warning(f"唤醒词后无命令内容: {text}") continue logger.debug(f"提取的命令文本: {command_text}") # 3. 文本预处理(快速模式,不进行分词) normalized_text, params = self.text_preprocessor.preprocess_fast(command_text) logger.debug(f"文本预处理结果:") logger.debug(f" 规范化文本: {normalized_text}") logger.debug(f" 命令关键词: {params.command_keyword}") logger.debug(f" 距离: {params.distance} 米") logger.debug(f" 速度: {params.speed} 米/秒") logger.debug(f" 时间: {params.duration} 秒") # 4. 检查是否识别到命令关键词 if not params.command_keyword: logger.warning(f"未识别到有效命令关键词: {normalized_text}") continue # 5. 生成命令 sequence_id = self._get_next_sequence_id() command = Command.create( command=params.command_keyword, sequence_id=sequence_id, distance=params.distance, speed=params.speed, duration=params.duration ) logger.info(f"📝 生成命令: {command.command}") logger.debug(f"命令详情: {command.to_dict()}") # 6. 发送命令到Socket服务器 if self.socket_client.send_command_with_retry(command): logger.info(f"✅ 命令已发送: {command.command} (序列号: {sequence_id})") self._enqueue_ack_playback(command.command) else: logger.warning( "命令未送达(已达 max_retries): %s (序列号: %s)", command.command, sequence_id, ) except Exception as e: logger.error(f"命令处理失败: {e}", exc_info=True) finally: self.command_queue.task_done() logger.info("命令处理线程已停止") def start(self): """启动语音命令识别系统""" if self.running: logger.warning("语音命令识别系统已在运行") return # 先完成阻塞式 TTS 预加载,再开麦与识别线程,避免用户在预加载期间下指令导致无波形缓存、播报延迟 print("[TTS] 探测扬声器并预加载应答语音(可能需十余秒,请勿说话)…", flush=True) self._init_sounddevice_output_probe() if self.ack_tts_enabled and self._has_ack_tts_content() and self.ack_tts_prewarm: if self.ack_tts_prewarm_blocking: self._prewarm_tts_blocking() else: print( "[TTS] 已跳过启动预加载(ack_tts_enabled/应答文案/ack_tts_prewarm)", flush=True, ) self.running = True # 启动STT识别线程 self.stt_thread = threading.Thread(target=self._stt_worker_thread, daemon=True) self.stt_thread.start() # 启动命令处理线程 self.command_thread = threading.Thread(target=self._command_worker_thread, daemon=True) self.command_thread.start() # 启动音频采集 self.audio_capture.start_stream() if self.ack_tts_enabled and self._has_ack_tts_content() and self.ack_tts_prewarm: if not self.ack_tts_prewarm_blocking: self._prewarm_tts_async() logger.info("语音命令识别系统已启动") print("\n" + "=" * 70) print("🎙️ 高性能实时语音命令识别系统已启动") print("=" * 70) print("💡 功能说明:") print(" - 系统会自动检测语音并识别") print(f" - 🔔 唤醒词: {self.wake_word_detector.primary}") print(" - 只有包含唤醒词的语音才会被处理") print(" - 识别结果会自动转换为无人机控制命令") print(" - 命令会自动发送到Socket服务器") print(" - 按 Ctrl+C 退出") print("=" * 70 + "\n") def stop(self): """停止语音命令识别系统""" if not self.running: return self.running = False # 先通知工作线程结束,再播放尚未 drain 的应答(避免 Ctrl+C 时主循环未跑下一轮导致无声) if self.stt_thread is not None: self.stt_queue.put(None) if self.command_thread is not None: self.command_queue.put(None) if self.stt_thread is not None: self.stt_thread.join(timeout=2.0) if self.command_thread is not None: self.command_thread.join(timeout=2.0) if self.ack_tts_enabled: try: self._drain_ack_playback_queue(recover_mic=False) except Exception as e: logger.warning(f"退出前播放应答失败: {e}", exc_info=True) self.audio_capture.stop_stream() # 断开Socket连接 if self.socket_client.connected: self.socket_client.disconnect() logger.info("语音命令识别系统已停止") print("\n语音命令识别系统已停止") def process_audio_stream(self): """ 处理音频流(主循环) 高性能实时处理流程: 1. 采集音频块(非阻塞) 2. 预处理(降噪+AGC) 3. VAD检测语音开始/结束 4. 收集语音段 5. 异步STT识别(不阻塞主流程) """ try: while self.running: # 0. 主线程播放命令应答(必须在采集循环线程中执行 sd.play,见 tts.play_tts_audio 说明) self._before_audio_iteration() # 1. 采集音频块(非阻塞,高性能模式) chunk = self.audio_capture.read_chunk_numpy(timeout=0.1) if chunk is None: continue # 2. 音频预处理(降噪+AGC) processed_chunk = self.audio_preprocessor.process(chunk) # 初始化预缓冲区的最大块数(只需计算一次) if self.pre_speech_max_chunks is None: # 每个chunk包含的采样点数 samples_per_chunk = processed_chunk.shape[0] if samples_per_chunk > 0: # 0.8 秒需要的chunk数量 = 预缓冲秒数 * 采样率 / 每块采样数 chunks = int( self.pre_speech_max_seconds * self.audio_capture.sample_rate / samples_per_chunk ) # 至少保留 1 块,避免被算成 0 self.pre_speech_max_chunks = max(chunks, 1) else: self.pre_speech_max_chunks = 1 # 将当前块加入预缓冲区(环形缓冲) # 注意:预缓冲区保存的是“最近的一段音频”,无论当下是否在说话 self.pre_speech_buffer.append(processed_chunk) if ( self.pre_speech_max_chunks is not None and len(self.pre_speech_buffer) > self.pre_speech_max_chunks ): # 超出最大长度时,丢弃最早的块 self.pre_speech_buffer.pop(0) # 3. VAD:Silero 或能量(RMS)分段 if self._use_energy_vad: self._energy_vad_on_chunk(processed_chunk) else: chunk_bytes = processed_chunk.tobytes() if self.vad.detect_speech_start(chunk_bytes): hook = self._vad_speech_start_hook if hook is not None: try: hook() except Exception as e: # noqa: BLE001 logger.debug( "vad_speech_start_hook: %s", e, exc_info=True ) with self.speech_buffer_lock: if self.pre_speech_buffer: self.speech_buffer = list(self.pre_speech_buffer) else: self.speech_buffer.clear() self.speech_buffer.append(processed_chunk) logger.debug( "检测到语音开始,使用预缓冲音频(约 %.2f 秒)作为前缀,开始收集语音段", self.pre_speech_max_seconds, ) elif self.vad.is_speaking: with self.speech_buffer_lock: self.speech_buffer.append(processed_chunk) if self.vad.detect_speech_end(chunk_bytes): with self.speech_buffer_lock: self._submit_concatenated_speech_to_stt() self._reset_agc_after_utterance_end() logger.debug("检测到语音结束,提交识别") hook = getattr(self, "_after_processed_audio_chunk", None) if hook is not None: try: hook(processed_chunk) except Exception as e: # noqa: BLE001 logger.debug( "after_processed_audio_chunk: %s", e, exc_info=True ) except KeyboardInterrupt: logger.info("用户中断") except Exception as e: logger.error(f"处理音频流时发生错误: {e}", exc_info=True) raise def run(self): """运行语音命令识别系统(完整流程)""" try: self.start() self.process_audio_stream() finally: self.stop() if __name__ == "__main__": # 测试代码 recognizer = VoiceCommandRecognizer() recognizer.run()