"""
Audio capture module - optimized version.

Input device (microphone) selection (simplified):

- If ``audio.input_device_index`` in system.yaml is an integer, only that
  PyAudio index is tried (if it cannot be opened, startup fails and the
  device list is printed).
- If it is null, the system default input is tried first, then every device
  reporting ``maxInputChannels > 0``, then any remaining enumerable device.

``rocket_drone_audio`` can select a device interactively at startup and write
``input_device_index`` (see ``src.core.mic_device_select``).
"""

from voice_drone.core.portaudio_env import fix_ld_path_for_portaudio

# Must run before ``import pyaudio`` (presumably adjusts the library search
# path so the right PortAudio is loaded - confirm in portaudio_env).
fix_ld_path_for_portaudio()

import re
import pyaudio
import numpy as np
import queue
import threading
from typing import List, Optional, Tuple
from voice_drone.core.configuration import SYSTEM_AUDIO_CONFIG
from voice_drone.logging_ import get_logger

logger = get_logger("audio.capture.optimized")


class AudioCaptureOptimized:
    """
    Non-blocking audio capture built on PyAudio callback mode.

    The PortAudio callback pushes raw byte chunks into a bounded queue;
    consumers pull them with :meth:`read_chunk` / :meth:`read_chunk_numpy`.
    """

    def __init__(self):
        """Read the audio settings from ``SYSTEM_AUDIO_CONFIG`` and create PyAudio."""
        # Values read from YAML may arrive as strings, so coerce explicitly.
        self.sample_rate = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000))
        self.channels = int(SYSTEM_AUDIO_CONFIG.get("channels", 1))
        self.chunk_size = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024))
        self.sample_width = int(SYSTEM_AUDIO_CONFIG.get("sample_width", 2))

        # High-performance mode settings.
        self.buffer_queue_size = int(SYSTEM_AUDIO_CONFIG.get("buffer_queue_size", 10))
        self._prefer_stereo_capture = bool(
            SYSTEM_AUDIO_CONFIG.get("prefer_stereo_capture", True)
        )

        # None / empty string means "no fixed device, probe candidates".
        raw_idx = SYSTEM_AUDIO_CONFIG.get("input_device_index", None)
        self._input_device_index_cfg: Optional[int] = (
            int(raw_idx)
            if raw_idx is not None and str(raw_idx).strip() != ""
            else None
        )

        # Sample rates to try when opening the device, in order of preference.
        tr = SYSTEM_AUDIO_CONFIG.get("audio_open_try_rates")
        if tr:
            raw_rates: List[int] = [int(x) for x in tr if x is not None]
        else:
            raw_rates = [self.sample_rate, 48000, 44100, 32000]
        # Order-preserving de-duplication.
        self._open_try_rates: List[int] = list(dict.fromkeys(raw_rates))

        # ``channels`` is the logical channel count handed to VAD/STT (mono);
        # ``_pa_channels`` is the channel count PortAudio actually opened.
        self._pa_channels = self.channels
        self._stereo_downmix = False
        self._pa_open_sample_rate: int = self.sample_rate

        self.audio = pyaudio.PyAudio()
        self.format = self.audio.get_format_from_width(self.sample_width)

        # Bounded queue between the PortAudio callback and the consumer.
        self.audio_queue = queue.Queue(maxsize=self.buffer_queue_size)
        self.stream: Optional[pyaudio.Stream] = None

        logger.info(
            f"优化版音频采集器初始化成功: "
            f"采样率={self.sample_rate}Hz, "
            f"块大小={self.chunk_size}, "
            f"使用回调模式+队列缓冲"
        )

    def _device_hw_tuple_in_name(self, dev_name: str) -> Optional[Tuple[int, int]]:
        """Return ``(card, device)`` parsed from a ``(hw:X,Y)`` name suffix, else None."""
        m = re.search(r"\(hw:(\d+),\s*(\d+)\)", dev_name)
        if not m:
            return None
        return int(m.group(1)), int(m.group(2))

    def _ordered_input_candidates(self) -> Tuple[List[int], List[int]]:
        """
        Build the ordered device indices to try.

        Returns:
            ``(preferred, fallback)``. With a configured index, ``preferred``
            holds only that index and ``fallback`` is empty. Otherwise
            ``preferred`` is the default input followed by every device with
            ``maxInputChannels > 0``; ``fallback`` is every other device whose
            info can be queried.
        """
        preferred: List[int] = []
        seen = set()

        def add(idx: Optional[int]) -> None:
            if idx is None:
                return
            ii = int(idx)
            if ii in seen:
                return
            seen.add(ii)
            preferred.append(ii)

        # Integer index configured: open only that device (consistent with
        # the interactive selection / CLI writing the same setting).
        if self._input_device_index_cfg is not None:
            add(self._input_device_index_cfg)
            return preferred, []

        try:
            add(int(self.audio.get_default_input_device_info()["index"]))
        except Exception:
            # Best effort: no default input device is not fatal here.
            pass

        for i in range(self.audio.get_device_count()):
            try:
                inf = self.audio.get_device_info_by_index(i)
                if int(inf.get("maxInputChannels", 0)) > 0:
                    add(i)
            except Exception:
                continue

        fallback: List[int] = []
        for i in range(self.audio.get_device_count()):
            if i in seen:
                continue
            try:
                self.audio.get_device_info_by_index(i)
            except Exception:
                continue
            fallback.append(i)
        return preferred, fallback

    def _channel_plan(self, max_in: int, dev_name: str) -> List[Tuple[int, bool]]:
        """
        Decide which channel layouts to try on a device.

        Args:
            max_in: ``maxInputChannels`` reported for the device.
            dev_name: Device name, used only for logging.

        Returns:
            Ordered ``(pa_channels, stereo_downmix)`` pairs to attempt.
        """
        ch = self.channels
        # A non-mono logical channel count is opened as-is, never downmixed.
        if ch != 1:
            return [(ch, False)]
        if max_in <= 0:
            logger.warning(
                "设备 %s 报告 maxInputChannels=%s,将尝试 mono / stereo",
                dev_name or "?",
                max_in,
            )
            return [(1, False), (2, True)]
        if max_in == 1:
            return [(1, False)]
        if self._prefer_stereo_capture:
            return [(2, True), (1, False)]
        return [(1, False)]

    def _frames_per_buffer_for_rate(self, pa_rate: int) -> int:
        """Scale ``chunk_size`` to ``pa_rate`` so a buffer covers the same duration (min 128)."""
        if pa_rate <= 0:
            pa_rate = self.sample_rate
        return max(128, int(round(self.chunk_size * pa_rate / self.sample_rate)))

    @staticmethod
    def _resample_linear_int16(
        x: np.ndarray, sr_in: int, sr_out: int
    ) -> np.ndarray:
        """
        Resample ``x`` from ``sr_in`` to ``sr_out`` by linear interpolation.

        Returns ``x`` unchanged when the rates match or ``x`` is empty;
        otherwise a new int16 array, rounded and clipped to the int16 range.
        """
        if sr_in == sr_out or x.size == 0:
            return x
        n_out = max(1, int(round(x.size * (sr_out / sr_in))))
        t_in = np.arange(x.size, dtype=np.float64)
        t_out = np.linspace(0.0, float(x.size - 1), n_out, dtype=np.float64)
        y = np.interp(t_out, t_in, x.astype(np.float32))
        return np.clip(np.round(y), -32768, 32767).astype(np.int16)

    def _try_open_on_device(self, input_device_index: int) -> bool:
        """
        Try every channel layout and sample rate on one device.

        On success ``self.stream`` is open and started, and ``_pa_channels``,
        ``_stereo_downmix`` and ``_pa_open_sample_rate`` describe how it was
        opened.

        Returns:
            True if a stream was started, False if every combination failed
            or the device info could not be queried.
        """
        try:
            dev = self.audio.get_device_info_by_index(input_device_index)
        except Exception:
            return False
        max_in = int(dev.get("maxInputChannels", 0))
        dev_name = str(dev.get("name", ""))

        # An explicitly configured device that reports 0 input channels is
        # still probed as stereo when stereo capture is preferred.
        if (
            max_in <= 0
            and self._input_device_index_cfg is not None
            and int(input_device_index) == int(self._input_device_index_cfg)
            and self._prefer_stereo_capture
            and self.channels == 1
        ):
            max_in = 2
            logger.warning(
                "设备 %s 上报 maxInputChannels=0,假定 2 通道以尝试 ES8388 立体声采集",
                input_device_index,
            )

        plan = self._channel_plan(max_in, dev_name)
        hw_t = self._device_hw_tuple_in_name(dev_name)

        for pa_ch, stereo_dm in plan:
            self._pa_channels = pa_ch
            self._stereo_downmix = stereo_dm
            if stereo_dm and pa_ch == 2:
                logger.info(
                    "输入按立体声打开并下混 mono(%s index=%s)",
                    dev_name,
                    input_device_index,
                )
            for rate in self._open_try_rates:
                fpb = self._frames_per_buffer_for_rate(int(rate))
                try:
                    self.stream = self.audio.open(
                        format=self.format,
                        channels=self._pa_channels,
                        rate=int(rate),
                        input=True,
                        input_device_index=input_device_index,
                        frames_per_buffer=fpb,
                        stream_callback=self._audio_callback,
                        start=False,
                    )
                    self.stream.start_stream()
                    self._pa_open_sample_rate = int(rate)
                    extra = (
                        f" hw=card{hw_t[0]}dev{hw_t[1]}" if hw_t else ""
                    )
                    if self._pa_open_sample_rate != self.sample_rate:
                        logger.warning(
                            "输入实际 %s Hz,将重采样为 %s Hz 供 VAD/STT",
                            self._pa_open_sample_rate,
                            self.sample_rate,
                        )
                    logger.info(
                        "音频流启动成功 index=%s name=%r PA_ch=%s PA_rate=%s 逻辑rate=%s%s",
                        input_device_index,
                        dev_name,
                        self._pa_channels,
                        self._pa_open_sample_rate,
                        self.sample_rate,
                        extra,
                    )
                    return True
                except Exception as e:
                    # open() may have succeeded while start_stream() failed;
                    # release the half-open stream before the next attempt.
                    if self.stream is not None:
                        try:
                            self.stream.close()
                        except Exception:
                            pass
                        self.stream = None
                    logger.warning(
                        "打开失败 index=%s ch=%s rate=%s: %s",
                        input_device_index,
                        pa_ch,
                        rate,
                        e,
                    )
        return False

    def _audio_callback(self, in_data, frame_count, time_info, status):
        """
        PortAudio stream callback: enqueue the chunk without blocking.

        When the queue is full the new chunk is dropped (and a warning is
        logged) so the callback never stalls.
        """
        if status:
            logger.warning(f"音频流状态: {status}")

        try:
            self.audio_queue.put(in_data, block=False)
        except queue.Full:
            logger.warning("音频队列已满,丢弃数据块")

        return (None, pyaudio.paContinue)

    def _log_input_devices_for_user(self) -> None:
        """List every PortAudio device (including in=0 ones) to the log and stdout."""
        n_dev = self.audio.get_device_count()
        if n_dev <= 0:
            print(
                "[audio] PyAudio get_device_count()=0,多为 ALSA/PortAudio 未初始化;"
                "请用 bash with_system_alsa.sh python … 启动。",
                flush=True,
            )
            logger.error("PyAudio 枚举不到任何设备")
            return
        lines: List[str] = []
        for i in range(n_dev):
            try:
                inf = self.audio.get_device_info_by_index(i)
                mic = int(inf.get("maxInputChannels", 0))
                outc = int(inf.get("maxOutputChannels", 0))
                name = str(inf.get("name", "?"))
                mark = " <- 可录音" if mic > 0 else ""
                lines.append(f" [{i}] in={mic} out={outc} {name}{mark}")
            except Exception:
                continue
        msg = "\n".join(lines)
        logger.error("PortAudio 设备列表:\n%s", msg)
        print(
            "[audio] PortAudio 设备列表(in>0 才可作输入;若板载显示 in=0 仍可用 probe 试采):\n"
            + msg,
            flush=True,
        )

    def start_stream(self) -> None:
        """
        Start the capture stream in callback mode.

        Does nothing if a stream is already open.

        Raises:
            OSError: if there is no candidate device, or none can be opened.
        """
        if self.stream is not None:
            return

        preferred, fallback = self._ordered_input_candidates()
        to_try: List[int] = preferred + fallback
        if not to_try:
            print(
                "[audio] 无任何输入候选。请检查 PortAudio/ALSA(建议:bash with_system_alsa.sh python …)。",
                flush=True,
            )
            self._log_input_devices_for_user()
            raise OSError("未找到任何 PyAudio 输入候选设备")

        for input_device_index in to_try:
            if self._try_open_on_device(input_device_index):
                return

        # With a configured index the candidate list is never empty, so the
        # "configured index is unusable" hint has to be emitted here, where
        # the failure actually surfaces.
        if self._input_device_index_cfg is not None:
            logger.error(
                "已配置 input_device_index=%s 但无效或不可打开",
                self._input_device_index_cfg,
            )
            print(
                f"[audio] 当前配置的 PyAudio 索引 {self._input_device_index_cfg} 不可用,"
                "请改 system.yaml 或重新运行交互选设备。",
                flush=True,
            )
        logger.error("启动音频流失败:全部候选设备无法打开")
        self._log_input_devices_for_user()
        raise OSError("启动音频流失败:全部候选设备无法打开")

    def stop_stream(self) -> None:
        """
        Stop and close the stream, then drop any queued audio.

        The stream handle is always released, even if PortAudio raises, so a
        later :meth:`start_stream` can reopen the device. Errors are logged,
        not raised.
        """
        if self.stream is None:
            return

        stream = self.stream
        self.stream = None
        stopped_cleanly = True
        try:
            try:
                stream.stop_stream()
            finally:
                stream.close()
        except Exception as e:
            stopped_cleanly = False
            logger.error(f"停止音频流失败: {e}")

        self._pa_open_sample_rate = self.sample_rate
        # Drain whatever the callback queued before the stream stopped.
        while not self.audio_queue.empty():
            try:
                self.audio_queue.get_nowait()
            except queue.Empty:
                break
        if stopped_cleanly:
            logger.info("音频流已停止")

    def read_chunk(self, timeout: float = 0.1) -> Optional[bytes]:
        """
        Read one raw audio chunk from the queue.

        Args:
            timeout: Maximum time to wait, in seconds.

        Returns:
            The chunk as bytes, or None if no stream is open or the wait
            timed out.
        """
        if self.stream is None:
            return None

        try:
            return self.audio_queue.get(timeout=timeout)
        except queue.Empty:
            return None

    def read_chunk_numpy(self, timeout: float = 0.1) -> Optional[np.ndarray]:
        """
        Read one audio chunk as a NumPy array.

        Returns:
            The samples, or None if no data was available or the chunk holds
            less than one complete frame.
        """
        data = self.read_chunk(timeout)
        if data is None:
            return None

        # Trim a trailing partial frame so the buffer reshapes cleanly.
        sample_size = self._pa_channels * self.sample_width
        if len(data) % sample_size != 0:
            aligned_len = (len(data) // sample_size) * sample_size
            if aligned_len == 0:
                return None
            data = data[:aligned_len]

        # NOTE(review): everything from the dtype string to the end of this
        # method was lost in the extracted source and is reconstructed from
        # the surrounding code (the _stereo_downmix flag, the resample
        # warning, _resample_linear_int16). Confirm the dtype and the downmix
        # formula against the repository.
        mono = np.frombuffer(data, dtype="<i2")
        if self._stereo_downmix and self._pa_channels == 2:
            stereo = mono.reshape(-1, 2).astype(np.int32)
            mono = ((stereo[:, 0] + stereo[:, 1]) // 2).astype(np.int16)
        if self._pa_open_sample_rate != self.sample_rate:
            mono = self._resample_linear_int16(
                mono, self._pa_open_sample_rate, self.sample_rate
            )
        return mono

    # NOTE(review): the context-manager methods were lost in the extracted
    # source. The ``with AudioCapture() as capture`` demo at the bottom of
    # the file requires them; whether the original also terminated PyAudio
    # on exit cannot be told from here - confirm.
    def __enter__(self):
        """Start the stream and return the capture object."""
        self.start_stream()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the stream; exceptions from the ``with`` body propagate."""
        self.stop_stream()


# NOTE(review): the class header, ``__init__`` and the ``update`` signature
# were lost in the extracted source. The name comes from the call site
# ``IncrementalRMS(window_size=...)`` and the attributes from the surviving
# method bodies; the default window size and buffer dtype are assumptions.
class IncrementalRMS:
    """
    Sliding-window RMS with O(1) per-sample updates.

    Keeps the last ``window_size`` samples in a circular buffer together with
    the running sum of their squares.
    """

    def __init__(self, window_size: int = 1024):
        """
        Args:
            window_size: Number of samples in the sliding window (at least 1).
        """
        self.window_size = max(1, int(window_size))
        self.buffer = np.zeros(self.window_size, dtype=np.float64)
        self.sum_sq = 0.0
        self.idx = 0  # position of the oldest sample once the window is full
        self.count = 0  # number of valid samples, capped at window_size

    def _rms(self) -> float:
        """Return the RMS of the current window (0.0 when it is empty)."""
        if self.count == 0:
            return 0.0
        # Clamp at 0: incremental add/subtract can drift slightly negative.
        return float(np.sqrt(max(self.sum_sq, 0.0) / self.count))

    def update(self, sample: float) -> float:
        """
        Push one sample into the window.

        Args:
            sample: The new sample value.

        Returns:
            The current RMS value.
        """
        value = float(sample)
        if self.count < self.window_size:
            # Fill phase: the window is still growing.
            self.buffer[self.count] = value
            self.sum_sq += value * value
            self.count += 1
        else:
            # Sliding phase: replace the oldest sample.
            oldest = float(self.buffer[self.idx])
            self.sum_sq += value * value - oldest * oldest
            self.buffer[self.idx] = value
            self.idx = (self.idx + 1) % self.window_size
        return self._rms()

    def update_batch(self, samples: np.ndarray) -> float:
        """
        Push a batch of samples into the window.

        The window is rebuilt in one vectorized step and the sum of squares
        is recomputed exactly, which also discards accumulated rounding
        drift. An empty batch leaves the state untouched.

        Args:
            samples: Array of new samples, oldest first.

        Returns:
            The current RMS value (0.0 if no sample has ever been pushed).
        """
        batch = np.asarray(samples, dtype=np.float64).ravel()
        if batch.size:
            if self.count < self.window_size:
                history = self.buffer[: self.count]
            else:
                # Rotate so the oldest sample comes first.
                history = np.roll(self.buffer, -self.idx)
            window = np.concatenate((history, batch))[-self.window_size:]
            self.count = int(window.size)
            self.buffer[: self.count] = window
            self.idx = 0
            self.sum_sq = float(np.dot(window, window))
        return self._rms()

    def reset(self):
        """Clear the window and all running state."""
        self.buffer.fill(0.0)
        self.sum_sq = 0.0
        self.idx = 0
        self.count = 0


class LightweightNoiseReduction:
    """
    Lightweight noise reduction: a stateful first-order high-pass filter.

    Only the high-pass stage is implemented here; filter state is carried
    across calls so consecutive chunks are filtered as one continuous signal.
    """

    def __init__(self, sample_rate: int = 16000, cutoff: float = 80.0):
        """
        Args:
            sample_rate: Sample rate in Hz.
            cutoff: High-pass cutoff frequency in Hz.
        """
        self.sample_rate = sample_rate
        self.cutoff = cutoff

        # y[n] = alpha * (y[n-1] + x[n] - x[n-1]), i.e.
        # H(z) = alpha * (1 - z^-1) / (1 - alpha * z^-1).
        self.alpha = np.exp(-2.0 * np.pi * cutoff / sample_rate)
        self.prev_input = 0.0
        self.prev_output = 0.0

    def process(self, audio: np.ndarray) -> np.ndarray:
        """
        High-pass filter one chunk of audio.

        Args:
            audio: 1-D audio samples (converted to float32 if needed;
                expected range [-1, 1]).

        Returns:
            The filtered audio as a float32 array of the same length.
        """
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        # Run the recurrence on plain Python floats: far cheaper than
        # indexing NumPy scalars one sample at a time.
        alpha = float(self.alpha)
        prev_in = float(self.prev_input)
        prev_out = float(self.prev_output)
        samples = audio.tolist()
        filtered = [0.0] * len(samples)
        for i, x in enumerate(samples):
            prev_out = alpha * (prev_out + x - prev_in)
            prev_in = x
            filtered[i] = prev_out

        self.prev_input = prev_in
        self.prev_output = prev_out
        return np.asarray(filtered, dtype=np.float32)

    def reset(self):
        """Reset the filter state."""
        self.prev_input = 0.0
        self.prev_output = 0.0


class AudioPreprocessorOptimized:
    """
    Audio preprocessor: optional high-pass noise reduction followed by
    optional automatic gain control (AGC) driven by a sliding-window RMS.
    """

    def __init__(self, enable_noise_reduction: Optional[bool] = None, enable_agc: Optional[bool] = None):
        """
        Args:
            enable_noise_reduction: Enable the high-pass stage; None reads
                ``noise_reduce`` from the config (default True).
            enable_agc: Enable AGC; None reads ``agc`` from the config
                (default True).
        """
        if enable_noise_reduction is None:
            enable_noise_reduction = SYSTEM_AUDIO_CONFIG.get("noise_reduce", True)
        if enable_agc is None:
            enable_agc = SYSTEM_AUDIO_CONFIG.get("agc", True)

        self.enable_noise_reduction = enable_noise_reduction
        self.enable_agc = enable_agc
        self.sample_rate = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000))

        # AGC parameters (values read from YAML may arrive as strings).
        self.agc_target_db = float(SYSTEM_AUDIO_CONFIG.get("agc_target_db", -20.0))
        self.agc_gain_min = float(SYSTEM_AUDIO_CONFIG.get("agc_gain_min", 0.1))
        self.agc_gain_max = float(SYSTEM_AUDIO_CONFIG.get("agc_gain_max", 10.0))
        self.agc_rms_threshold = float(SYSTEM_AUDIO_CONFIG.get("agc_rms_threshold", 1e-6))
        self._agc_alpha = float(SYSTEM_AUDIO_CONFIG.get("agc_smoothing_alpha", 0.1))
        self._agc_alpha = max(0.02, min(0.95, self._agc_alpha))
        # A larger coefficient is used when the gain has to rise (quiet input
        # or after a loud burst) so it does not stay pinned near agc_gain_min.
        self._agc_release_alpha = float(
            SYSTEM_AUDIO_CONFIG.get("agc_release_alpha", 0.45)
        )
        self._agc_release_alpha = max(self._agc_alpha, min(0.95, self._agc_release_alpha))

        if enable_noise_reduction:
            self.noise_reducer = LightweightNoiseReduction(
                sample_rate=self.sample_rate,
                cutoff=80.0,
            )
        else:
            self.noise_reducer = None

        # Defined even with AGC disabled so the attribute always exists.
        self.current_gain = 1.0
        if enable_agc:
            window_size = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024))
            self.rms_calculator = IncrementalRMS(window_size=window_size)
        else:
            self.rms_calculator = None

        logger.info(
            f"优化版音频预处理器初始化完成: "
            f"降噪={'启用(轻量级)' if enable_noise_reduction else '禁用'}, "
            f"自动增益控制={'启用(增量)' if enable_agc else '禁用'}"
        )

    def reset(self) -> None:
        """
        Reset the high-pass filter and AGC state.

        Call this after capture was paused and ``start_stream`` is called
        again, so state from the paused period (mic stopped / TTS playing)
        does not leak into the new stream.
        """
        if self.noise_reducer is not None:
            self.noise_reducer.reset()
        if self.rms_calculator is not None:
            self.rms_calculator.reset()
        if self.enable_agc:
            self.current_gain = 1.0

    def reset_agc_state(self) -> None:
        """
        Clear the RMS window and reset the gain to 1.

        Intended for the end of an utterance or when recovery is needed, so a
        short loud burst does not keep the gain pinned near ``agc_gain_min``.
        """
        if not self.enable_agc or self.rms_calculator is None:
            return
        self.rms_calculator.reset()
        self.current_gain = 1.0

    def reduce_noise(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Apply the lightweight noise reduction.

        Args:
            audio_data: Audio samples (int16 or float32).

        Returns:
            The filtered audio: int16 for int16 input, float32 otherwise.
            The input is returned unchanged when noise reduction is disabled.
        """
        if not self.enable_noise_reduction or self.noise_reducer is None:
            return audio_data

        is_int16 = audio_data.dtype == np.int16
        if is_int16:
            audio_float = audio_data.astype(np.float32) / 32768.0
        else:
            audio_float = audio_data.astype(np.float32)

        reduced = self.noise_reducer.process(audio_float)

        if is_int16:
            # Clip before the cast: filter overshoot beyond [-1, 1) would
            # otherwise wrap around in int16.
            reduced = np.clip(reduced * 32768.0, -32768, 32767).astype(np.int16)

        return reduced

    def automatic_gain_control(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Apply automatic gain control using the sliding-window RMS.

        Args:
            audio_data: Audio samples (int16 or float32).

        Returns:
            The gain-adjusted audio: int16 for int16 input, float32
            otherwise. The input is returned unchanged when AGC is disabled
            or the RMS is below ``agc_rms_threshold``.
        """
        if not self.enable_agc or self.rms_calculator is None:
            return audio_data

        is_int16 = audio_data.dtype == np.int16
        if is_int16:
            audio_float = audio_data.astype(np.float32) / 32768.0
        else:
            audio_float = audio_data.astype(np.float32)

        rms = self.rms_calculator.update_batch(audio_float)

        # Near-silence: leave the audio alone (also avoids log10(0)).
        if rms < self.agc_rms_threshold or rms <= 0.0:
            return audio_data

        current_db = 20 * np.log10(rms)
        gain_db = self.agc_target_db - current_db
        gain_linear = 10 ** (gain_db / 20.0)
        gain_linear = np.clip(gain_linear, self.agc_gain_min, self.agc_gain_max)

        # Lower the gain slowly, but recover faster once the target gain is
        # clearly above the current one.
        if gain_linear > self.current_gain * 1.08:
            alpha = self._agc_release_alpha
        else:
            alpha = self._agc_alpha
        self.current_gain = alpha * gain_linear + (1 - alpha) * self.current_gain

        adjusted = audio_float * self.current_gain
        adjusted = np.clip(adjusted, -1.0, 1.0)

        if is_int16:
            # +1.0 maps to 32768, which does not fit in int16; clip first.
            adjusted = np.clip(adjusted * 32768.0, -32768, 32767).astype(np.int16)

        return adjusted

    def process(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Run the full preprocessing chain on a copy of the input.

        Args:
            audio_data: Audio samples (NumPy array).

        Returns:
            The preprocessed audio.
        """
        processed = audio_data.copy()

        if self.enable_noise_reduction:
            processed = self.reduce_noise(processed)

        if self.enable_agc:
            processed = self.automatic_gain_control(processed)

        return processed


# Backward-compatible aliases (keep the public API stable).
AudioCapture = AudioCaptureOptimized
AudioPreprocessor = AudioPreprocessorOptimized


# Usage example
if __name__ == "__main__":
    with AudioCapture() as capture:
        preprocessor = AudioPreprocessor()

        for i in range(10):
            chunk = capture.read_chunk_numpy(timeout=0.1)
            if chunk is not None:
                processed = preprocessor.process(chunk)
                print(f"处理了 {len(processed)} 个采样点")