# DroneMind/voice_drone/core/audio.py

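"""
Audio capture module - optimized version.

Input device (microphone) selection (simplified):
- If audio.input_device_index in system.yaml is an integer: only that PyAudio index is tried
  (if it cannot be opened, startup fails and the device list is printed).
- If it is null: the system default input is tried first, then every device reporting
  maxInputChannels > 0, and finally any remaining device that can still be enumerated.
  rocket_drone_audio can let the user pick a device interactively at startup and write it to
  input_device_index (see src.core.mic_device_select).
"""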
from voice_drone.core.portaudio_env import fix_ld_path_for_portaudio

# NOTE(review): this call sits before `import pyaudio` on purpose; judging by its name it
# presumably adjusts the library search path so PortAudio can be loaded -- confirm in
# portaudio_env before reordering these lines.
fix_ld_path_for_portaudio()

import re
import pyaudio
import numpy as np
import queue
import threading
from typing import List, Optional, Tuple
from voice_drone.core.configuration import SYSTEM_AUDIO_CONFIG
from voice_drone.logging_ import get_logger

# Module-level logger used by the capture and preprocessing classes below.
logger = get_logger("audio.capture.optimized")


class AudioCaptureOptimized:
    """
    Callback-driven microphone capture with a bounded hand-off queue.

    PortAudio hands raw blocks to `_audio_callback`, which enqueues them without
    blocking; consumers fetch them through `read_chunk` / `read_chunk_numpy`.
    The device may be opened in stereo or at a sample rate other than the
    configured one; `read_chunk_numpy` then downmixes and resamples so callers
    receive samples at `sample_rate`.
    """

    def __init__(self):
        """
        Read the capture settings from SYSTEM_AUDIO_CONFIG and create the PyAudio
        instance. No stream is opened here; call `start_stream` for that.
        """
        # Values read from YAML may be strings, so coerce them explicitly.
        self.sample_rate = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000))
        self.channels = int(SYSTEM_AUDIO_CONFIG.get("channels", 1))
        self.chunk_size = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024))
        self.sample_width = int(SYSTEM_AUDIO_CONFIG.get("sample_width", 2))

        # Capacity of the callback -> consumer queue; blocks are dropped when it is full.
        self.buffer_queue_size = int(SYSTEM_AUDIO_CONFIG.get("buffer_queue_size", 10))
        self._prefer_stereo_capture = bool(
            SYSTEM_AUDIO_CONFIG.get("prefer_stereo_capture", True)
        )
        # None or a blank value means that no fixed device index is configured.
        raw_idx = SYSTEM_AUDIO_CONFIG.get("input_device_index", None)
        self._input_device_index_cfg: Optional[int] = (
            int(raw_idx) if raw_idx is not None and str(raw_idx).strip() != "" else None
        )

        # Sample rates to try when opening a device, in order and without duplicates.
        tr = SYSTEM_AUDIO_CONFIG.get("audio_open_try_rates")
        if tr:
            raw_rates: List[int] = [int(x) for x in tr if x is not None]
        else:
            raw_rates = [self.sample_rate, 48000, 44100, 32000]
        self._open_try_rates: List[int] = list(dict.fromkeys(raw_rates))

        # `channels` is the logical channel count handed to VAD/STT;
        # `_pa_channels` is the channel count PortAudio actually opened.
        self._pa_channels = self.channels
        self._stereo_downmix = False
        self._pa_open_sample_rate: int = self.sample_rate

        self.audio = pyaudio.PyAudio()
        self.format = self.audio.get_format_from_width(self.sample_width)

        # Bounded queue that decouples the PortAudio callback from the consumer.
        self.audio_queue = queue.Queue(maxsize=self.buffer_queue_size)
        self.stream: Optional[pyaudio.Stream] = None

        logger.info(
            f"优化版音频采集器初始化成功: "
            f"采样率={self.sample_rate}Hz, "
            f"块大小={self.chunk_size}, "
            f"使用回调模式+队列缓冲"
        )

    def _device_hw_tuple_in_name(self, dev_name: str) -> Optional[Tuple[int, int]]:
        """Extract `(card, device)` from a name containing "(hw:C,D)", or return None."""
        m = re.search(r"\(hw:(\d+),\s*(\d+)\)", dev_name)
        if not m:
            return None
        return int(m.group(1)), int(m.group(2))

    def _ordered_input_candidates(self) -> Tuple[List[int], List[int]]:
        """
        Build the device indices to try, split into `(preferred, fallback)`.

        With a configured index, `preferred` holds only that index and `fallback`
        is empty. Otherwise `preferred` is the default input followed by every
        device reporting input channels, and `fallback` is every other device
        whose info can still be queried.
        """
        preferred: List[int] = []
        seen: set[int] = set()

        def add(idx: Optional[int]) -> None:
            if idx is None:
                return
            ii = int(idx)
            if ii in seen:
                return
            seen.add(ii)
            preferred.append(ii)

        # A configured integer index is authoritative: open that device only
        # (matches the interactive selection / CLI-written value).
        if self._input_device_index_cfg is not None:
            add(self._input_device_index_cfg)
            return preferred, []

        try:
            add(int(self.audio.get_default_input_device_info()["index"]))
        except Exception:
            # No default input available; rely on the enumeration below.
            pass
        for i in range(self.audio.get_device_count()):
            try:
                inf = self.audio.get_device_info_by_index(i)
                if int(inf.get("maxInputChannels", 0)) > 0:
                    add(i)
            except Exception:
                continue

        # Devices not selected above are kept as a last resort.
        fallback: List[int] = []
        for i in range(self.audio.get_device_count()):
            if i in seen:
                continue
            try:
                self.audio.get_device_info_by_index(i)
            except Exception:
                continue
            fallback.append(i)
        return preferred, fallback

    def _channel_plan(self, max_in: int, dev_name: str) -> List[Tuple[int, bool]]:
        """
        Return the `(portaudio_channels, downmix_to_mono)` combinations to try.

        Args:
            max_in: maxInputChannels reported for the device.
            dev_name: device name, used only in the warning message.
        """
        ch = self.channels
        pref = self._prefer_stereo_capture
        if ch != 1:
            # Non-mono logical capture: open exactly what was configured.
            return [(ch, False)]
        if max_in <= 0:
            logger.warning(
                "设备 %s 报告 maxInputChannels=%s，将尝试 mono / stereo",
                dev_name or "?",
                max_in,
            )
            return [(1, False), (2, True)]
        if max_in == 1:
            return [(1, False)]
        if pref:
            return [(2, True), (1, False)]
        return [(1, False)]

    def _frames_per_buffer_for_rate(self, pa_rate: int) -> int:
        """Scale `chunk_size` to `pa_rate` so a block spans the same duration (minimum 128 frames)."""
        if pa_rate <= 0:
            pa_rate = self.sample_rate
        return max(128, int(round(self.chunk_size * pa_rate / self.sample_rate)))

    @staticmethod
    def _resample_linear_int16(
        x: np.ndarray, sr_in: int, sr_out: int
    ) -> np.ndarray:
        """
        Resample `x` from `sr_in` to `sr_out` by linear interpolation.

        Returns `x` itself when the rates match or `x` is empty; otherwise a new
        int16 array, rounded and clipped to the int16 range.
        """
        if sr_in == sr_out or x.size == 0:
            return x
        n_out = max(1, int(round(x.size * (sr_out / sr_in))))
        t_in = np.arange(x.size, dtype=np.float64)
        t_out = np.linspace(0.0, float(x.size - 1), n_out, dtype=np.float64)
        y = np.interp(t_out, t_in, x.astype(np.float32))
        return np.clip(np.round(y), -32768, 32767).astype(np.int16)

    def _try_open_on_device(self, input_device_index: int) -> bool:
        """
        Try every channel plan and sample rate on one device.

        Returns:
            True once a stream has been opened and started (`self.stream` is set),
            False when the device info cannot be read or every attempt failed.
        """
        try:
            dev = self.audio.get_device_info_by_index(input_device_index)
        except Exception:
            return False
        max_in = int(dev.get("maxInputChannels", 0))
        dev_name = str(dev.get("name", ""))
        # A zero-channel report from the explicitly configured device is overridden
        # to 2 when stereo capture is preferred for a mono logical stream.
        if (
            max_in <= 0
            and self._input_device_index_cfg is not None
            and int(input_device_index) == int(self._input_device_index_cfg)
            and self._prefer_stereo_capture
            and self.channels == 1
        ):
            max_in = 2
            logger.warning(
                "设备 %s 上报 maxInputChannels=0，假定 2 通道以尝试 ES8388 立体声采集",
                input_device_index,
            )
        plan = self._channel_plan(max_in, dev_name)
        hw_t = self._device_hw_tuple_in_name(dev_name)

        for pa_ch, stereo_dm in plan:
            self._pa_channels = pa_ch
            self._stereo_downmix = stereo_dm
            if stereo_dm and pa_ch == 2:
                logger.info(
                    "输入按立体声打开并下混 mono（%s index=%s）",
                    dev_name,
                    input_device_index,
                )
            for rate in self._open_try_rates:
                fpb = self._frames_per_buffer_for_rate(int(rate))
                try:
                    self.stream = self.audio.open(
                        format=self.format,
                        channels=self._pa_channels,
                        rate=int(rate),
                        input=True,
                        input_device_index=input_device_index,
                        frames_per_buffer=fpb,
                        stream_callback=self._audio_callback,
                        start=False,
                    )
                    self.stream.start_stream()
                    self._pa_open_sample_rate = int(rate)
                    extra = (
                        f" hw=card{hw_t[0]}dev{hw_t[1]}" if hw_t else ""
                    )
                    if self._pa_open_sample_rate != self.sample_rate:
                        logger.warning(
                            "输入实际 %s Hz，将重采样为 %s Hz 供 VAD/STT",
                            self._pa_open_sample_rate,
                            self.sample_rate,
                        )
                    logger.info(
                        "音频流启动成功 index=%s name=%r PA_ch=%s PA_rate=%s 逻辑rate=%s%s",
                        input_device_index,
                        dev_name,
                        self._pa_channels,
                        self._pa_open_sample_rate,
                        self.sample_rate,
                        extra,
                    )
                    return True
                except Exception as e:
                    # Release a stream that opened but failed to start before retrying.
                    if self.stream is not None:
                        try:
                            self.stream.close()
                        except Exception:
                            pass
                        self.stream = None
                    logger.warning(
                        "打开失败 index=%s ch=%s rate=%s: %s",
                        input_device_index,
                        pa_ch,
                        rate,
                        e,
                    )
        return False

    def _audio_callback(self, in_data, frame_count, time_info, status):
        """
        PortAudio stream callback: enqueue the block without blocking.

        A block is dropped (with a warning) when the queue is full.
        """
        if status:
            logger.warning(f"音频流状态: {status}")

        try:
            self.audio_queue.put(in_data, block=False)
        except queue.Full:
            logger.warning("音频队列已满,丢弃数据块")

        return (None, pyaudio.paContinue)

    def _log_input_devices_for_user(self) -> None:
        """List every PortAudio device (including in_ch=0) so the user can pick an input index."""
        n_dev = self.audio.get_device_count()
        if n_dev <= 0:
            print(
                "[audio] PyAudio get_device_count()=0，多为 ALSA/PortAudio 未初始化；"
                "请用 bash with_system_alsa.sh python … 启动。",
                flush=True,
            )
            logger.error("PyAudio 枚举不到任何设备")
            return
        lines: List[str] = []
        for i in range(n_dev):
            try:
                inf = self.audio.get_device_info_by_index(i)
                mic = int(inf.get("maxInputChannels", 0))
                outc = int(inf.get("maxOutputChannels", 0))
                name = str(inf.get("name", "?"))
                mark = "  <- 可录音" if mic > 0 else ""
                lines.append(f"  [{i}] in={mic} out={outc} {name}{mark}")
            except Exception:
                continue
        msg = "\n".join(lines)
        logger.error("PortAudio 设备列表：\n%s", msg)
        print(
            "[audio] PortAudio 设备列表（in>0 才可作输入；若板载显示 in=0 仍可用 probe 试采）：\n"
            + msg,
            flush=True,
        )

    def start_stream(self) -> None:
        """
        Open and start the capture stream in callback mode.

        Does nothing when a stream is already open.

        Raises:
            OSError: when there is no candidate device, or none could be opened.
        """
        if self.stream is not None:
            return

        preferred, fallback = self._ordered_input_candidates()
        to_try: List[int] = preferred + fallback
        if not to_try:
            # Only reachable without a configured index (a configured index is
            # always a candidate), i.e. no device could be enumerated at all.
            print(
                "[audio] 无任何输入候选。请检查 PortAudio/ALSA（建议：bash with_system_alsa.sh python …）。",
                flush=True,
            )
            raise OSError("未找到任何 PyAudio 输入候选设备")

        for input_device_index in to_try:
            if self._try_open_on_device(input_device_index):
                return

        logger.error("启动音频流失败：全部候选设备无法打开")
        if self._input_device_index_cfg is not None:
            # The configured index was the only candidate: tell the user how to fix it.
            logger.error(
                "已配置 input_device_index=%s 但无效或不可打开",
                self._input_device_index_cfg,
            )
            print(
                f"[audio] 当前配置的 PyAudio 索引 {self._input_device_index_cfg} 不可用，"
                "请改 system.yaml 或重新运行交互选设备。",
                flush=True,
            )
        self._log_input_devices_for_user()
        raise OSError("启动音频流失败：全部候选设备无法打开")

    def stop_stream(self) -> None:
        """
        Stop and close the stream, then discard queued audio.

        The stream is always closed and detached, even when stopping it raises,
        so that `start_stream` can open a new one afterwards. Errors are logged
        rather than raised.
        """
        stream = self.stream
        if stream is None:
            return

        # Detach first: whatever happens below, the capture must stay restartable.
        self.stream = None
        self._pa_open_sample_rate = self.sample_rate

        failure: Optional[Exception] = None
        try:
            stream.stop_stream()
        except Exception as e:
            failure = e
        try:
            stream.close()
        except Exception as e:
            if failure is None:
                failure = e

        # Drop blocks captured before the stop so a restart begins with fresh audio.
        while True:
            try:
                self.audio_queue.get_nowait()
            except queue.Empty:
                break

        if failure is None:
            logger.info("音频流已停止")
        else:
            logger.error(f"停止音频流失败: {failure}")

    def read_chunk(self, timeout: float = 0.1) -> Optional[bytes]:
        """
        Fetch the next raw audio block from the queue.

        Args:
            timeout: maximum time to wait for a block, in seconds.

        Returns:
            The raw bytes, or None when no stream is open or the wait timed out.
        """
        if self.stream is None:
            return None

        try:
            return self.audio_queue.get(timeout=timeout)
        except queue.Empty:
            return None

    def read_chunk_numpy(self, timeout: float = 0.1) -> Optional[np.ndarray]:
        """
        Fetch the next block as a numpy array of little-endian int16 samples.

        Trailing bytes that do not form a whole frame are discarded. Stereo input
        opened for downmixing is averaged to mono, and the result is resampled to
        `sample_rate` when the device runs at another rate.

        Returns:
            The samples, or None when nothing was read or no whole frame remained.
        """
        data = self.read_chunk(timeout)
        if data is None:
            return None

        sample_size = self._pa_channels * self.sample_width
        if len(data) % sample_size != 0:
            aligned_len = (len(data) // sample_size) * sample_size
            if aligned_len == 0:
                return None
            data = data[:aligned_len]

        mono = np.frombuffer(data, dtype="<i2")
        if self._stereo_downmix and self._pa_channels == 2:
            n = mono.size // 2
            if n == 0:
                return None
            # Widen to int32 so adding the two channels cannot overflow.
            s = mono[: n * 2].reshape(n, 2).astype(np.int32)
            mono = ((s[:, 0] + s[:, 1]) // 2).astype(np.int16)
        if self._pa_open_sample_rate != self.sample_rate:
            mono = self._resample_linear_int16(
                mono, self._pa_open_sample_rate, self.sample_rate
            )
        return mono

    def __enter__(self):
        """Start the stream and return the capture object."""
        self.start_stream()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the stream and terminate the PyAudio instance."""
        self.stop_stream()
        self.audio.terminate()


class IncrementalRMS:
    """
    Sliding-window RMS estimator backed by a fixed-size ring buffer.

    Used by the AGC so that the RMS does not have to be derived from scratch by
    the caller. `buffer` holds the most recent samples (as float32), `count` is
    the number of valid samples (at most `window_size`), `idx` is the position
    of the oldest sample once the window is full, and `sum_sq` is the sum of
    squares of the valid samples.
    """

    def __init__(self, window_size: int = 1024):
        """
        Args:
            window_size: number of samples in the sliding window (must be >= 1).

        Raises:
            ValueError: if `window_size` is smaller than 1.
        """
        window_size = int(window_size)
        if window_size < 1:
            raise ValueError("window_size must be >= 1")
        self.window_size = window_size
        self.buffer = np.zeros(window_size, dtype=np.float32)
        self.sum_sq = 0.0
        self.idx = 0
        self.count = 0

    def _recompute_sum_sq(self) -> float:
        """Return the exact sum of squares of the valid samples, computed in float64."""
        window = self.buffer[: self.count].astype(np.float64)
        return float(np.dot(window, window))

    def _current_rms(self) -> float:
        """Return the RMS of the valid samples, or 0.0 when no sample was seen yet."""
        if self.count == 0:
            return 0.0
        return float(np.sqrt(self.sum_sq / self.count))

    def update(self, sample: float) -> float:
        """
        Push one sample and return the RMS of the current window.

        Args:
            sample: the new sample value.

        Returns:
            The current RMS value.
        """
        if self.count < self.window_size:
            # Fill phase: nothing is evicted yet.
            pos = self.count
            evicted_sq = 0.0
            self.count += 1
        else:
            # Sliding phase: the oldest sample is replaced.
            pos = self.idx
            oldest = float(self.buffer[pos])
            evicted_sq = oldest * oldest
            self.idx = (pos + 1) % self.window_size

        self.buffer[pos] = sample
        # Square the value as stored (float32-rounded) so that exactly the same
        # amount is subtracted again when this sample leaves the window.
        stored = float(self.buffer[pos])
        total = self.sum_sq - evicted_sq + stored * stored
        if not np.isfinite(total):
            # A NaN/inf sample passed through the window; rebuild from the buffer.
            total = self._recompute_sum_sq()
        elif total < 0.0:
            # Rounding drift must never produce a negative sum (sqrt would be NaN).
            total = 0.0
        self.sum_sq = total
        return self._current_rms()

    def update_batch(self, samples: np.ndarray) -> float:
        """
        Push a block of samples and return the RMS of the current window.

        The block is written into the ring buffer with array operations and the
        sum of squares is then recomputed from the buffer, so no rounding error
        accumulates across calls. The resulting `buffer`, `idx` and `count` are
        the same as after pushing the samples one by one with `update`.

        Args:
            samples: 1-D array of samples; an empty array leaves the state unchanged.

        Returns:
            The current RMS value (0.0 if no sample has ever been pushed).
        """
        block = np.asarray(samples, dtype=np.float32).ravel()
        n = block.size
        if n == 0:
            return self._current_rms()

        size = self.window_size
        start = self.count if self.count < size else self.idx
        if n >= size:
            # Only the last `size` samples survive; place the oldest of them at
            # the position the write pointer ends up on.
            self.idx = (start + n) % size
            self.buffer[:] = np.roll(block[-size:], self.idx)
            self.count = size
        else:
            end = start + n
            if end <= size:
                self.buffer[start:end] = block
            else:
                head = size - start
                self.buffer[start:] = block[:head]
                self.buffer[: end - size] = block[head:]
            if self.count + n >= size:
                self.idx = end % size
                self.count = size
            else:
                self.count += n

        self.sum_sq = self._recompute_sum_sq()
        return self._current_rms()

    def reset(self):
        """Clear the window and all accumulated state."""
        self.buffer.fill(0.0)
        self.sum_sq = 0.0
        self.idx = 0
        self.count = 0


class LightweightNoiseReduction:
    """
    Lightweight noise reduction based on a first-order recursive high-pass filter.

    The filter state (previous input and previous output sample) is kept on the
    instance, so consecutive calls to `process` continue one and the same
    signal; call `reset` to start over.
    """

    def __init__(self, sample_rate: int = 16000, cutoff: float = 80.0):
        """
        Args:
            sample_rate: sample rate of the audio in Hz.
            cutoff: high-pass cutoff frequency in Hz.
        """
        self.sample_rate = sample_rate
        self.cutoff = cutoff

        # Coefficient of the recurrence y[n] = alpha * (y[n-1] + x[n] - x[n-1]).
        self.alpha = np.exp(-2.0 * np.pi * cutoff / sample_rate)
        self.reset()

    def process(self, audio: np.ndarray) -> np.ndarray:
        """
        High-pass filter one block of audio.

        Args:
            audio: audio samples; anything that is not float32 is converted first.

        Returns:
            A new float32 array with the filtered samples.
        """
        samples = audio if audio.dtype == np.float32 else audio.astype(np.float32)

        filtered = np.zeros_like(samples)
        for pos, current in enumerate(samples):
            filtered[pos] = self.alpha * (self.prev_output + current - self.prev_input)
            # Carry the state forward so the next sample (or call) continues the signal.
            self.prev_input = current
            self.prev_output = filtered[pos]

        return filtered

    def reset(self):
        """Reset the filter state to silence."""
        self.prev_input = 0.0
        self.prev_output = 0.0


class AudioPreprocessorOptimized:
    """
    Audio preprocessor: optional lightweight noise reduction followed by optional AGC.

    Noise reduction is delegated to `LightweightNoiseReduction` and the level
    measurement for the AGC to `IncrementalRMS`. int16 input is converted to
    float32 in [-1, 1) for processing and converted back with rounding and
    saturation, so peaks can never wrap around.
    """

    def __init__(self, enable_noise_reduction: Optional[bool] = None,
                 enable_agc: Optional[bool] = None):
        """
        Args:
            enable_noise_reduction: enable the high-pass stage; None reads
                "noise_reduce" from SYSTEM_AUDIO_CONFIG (default True).
            enable_agc: enable automatic gain control; None reads "agc" from
                SYSTEM_AUDIO_CONFIG (default True).
        """
        if enable_noise_reduction is None:
            enable_noise_reduction = SYSTEM_AUDIO_CONFIG.get("noise_reduce", True)
        if enable_agc is None:
            enable_agc = SYSTEM_AUDIO_CONFIG.get("agc", True)

        self.enable_noise_reduction = enable_noise_reduction
        self.enable_agc = enable_agc
        self.sample_rate = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000))

        # AGC parameters; coerced because values read from YAML may be strings.
        self.agc_target_db = float(SYSTEM_AUDIO_CONFIG.get("agc_target_db", -20.0))
        self.agc_gain_min = float(SYSTEM_AUDIO_CONFIG.get("agc_gain_min", 0.1))
        self.agc_gain_max = float(SYSTEM_AUDIO_CONFIG.get("agc_gain_max", 10.0))
        self.agc_rms_threshold = float(SYSTEM_AUDIO_CONFIG.get("agc_rms_threshold", 1e-6))
        self._agc_alpha = float(SYSTEM_AUDIO_CONFIG.get("agc_smoothing_alpha", 0.1))
        self._agc_alpha = max(0.02, min(0.95, self._agc_alpha))
        # A larger coefficient is used when the gain has to rise (quiet speech, or
        # after a loud burst) so it does not stay pinned near agc_gain_min for long.
        self._agc_release_alpha = float(
            SYSTEM_AUDIO_CONFIG.get("agc_release_alpha", 0.45)
        )
        self._agc_release_alpha = max(self._agc_alpha, min(0.95, self._agc_release_alpha))

        if enable_noise_reduction:
            self.noise_reducer = LightweightNoiseReduction(
                sample_rate=self.sample_rate,
                cutoff=80.0
            )
        else:
            self.noise_reducer = None

        # Smoothed gain applied by the AGC; defined even when the AGC is disabled.
        self.current_gain = 1.0
        if enable_agc:
            window_size = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024))
            self.rms_calculator = IncrementalRMS(window_size=window_size)
        else:
            self.rms_calculator = None

        logger.info(
            f"优化版音频预处理器初始化完成: "
            f"降噪={'启用(轻量级)' if enable_noise_reduction else '禁用'}, "
            f"自动增益控制={'启用(增量)' if enable_agc else '禁用'}"
        )

    @staticmethod
    def _float_to_int16(audio_float: np.ndarray) -> np.ndarray:
        """Convert float audio in [-1, 1] to int16, rounding and saturating at the int16 limits."""
        scaled = np.rint(audio_float * 32768.0)
        # +1.0 maps to 32768, which does not fit in int16: saturate instead of wrapping.
        return np.clip(scaled, -32768.0, 32767.0).astype(np.int16)

    def reset(self) -> None:
        """
        Reset the high-pass filter and the AGC state.

        Intended to be called after capture was paused and `start_stream` was
        called again, so that state from the paused period (e.g. TTS playback)
        is not carried over to the new stream.
        """
        if self.noise_reducer is not None:
            self.noise_reducer.reset()
        if self.rms_calculator is not None:
            self.rms_calculator.reset()
        if self.enable_agc:
            self.current_gain = 1.0

    def reset_agc_state(self) -> None:
        """
        Clear the RMS window and set the gain back to 1.

        Intended to be called at the end of an utterance or when recovery is
        needed, so a short loud noise does not keep `current_gain` pinned near
        `agc_gain_min` while the window still holds its energy.
        """
        if not self.enable_agc or self.rms_calculator is None:
            return
        self.rms_calculator.reset()
        self.current_gain = 1.0

    def reduce_noise(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Apply the lightweight noise reduction.

        Args:
            audio_data: audio samples (int16 or float).

        Returns:
            The filtered audio: int16 for int16 input, float32 otherwise. The
            input is returned unchanged when noise reduction is disabled.
        """
        if not self.enable_noise_reduction or self.noise_reducer is None:
            return audio_data

        is_int16 = audio_data.dtype == np.int16
        if is_int16:
            audio_float = audio_data.astype(np.float32) / 32768.0
        else:
            audio_float = audio_data.astype(np.float32)

        reduced = self.noise_reducer.process(audio_float)

        if is_int16:
            # The filter can overshoot full scale on sharp transitions, so the
            # conversion back has to saturate.
            reduced = self._float_to_int16(reduced)

        return reduced

    def automatic_gain_control(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Apply automatic gain control driven by the sliding-window RMS.

        Args:
            audio_data: audio samples (int16 or float).

        Returns:
            The gain-adjusted audio: int16 for int16 input, float32 otherwise.
            The input is returned unchanged when the AGC is disabled, or when the
            measured RMS is below `agc_rms_threshold` or not finite.
        """
        if not self.enable_agc or self.rms_calculator is None:
            return audio_data

        is_int16 = audio_data.dtype == np.int16
        if is_int16:
            audio_float = audio_data.astype(np.float32) / 32768.0
        else:
            audio_float = audio_data.astype(np.float32)

        rms = self.rms_calculator.update_batch(audio_float)

        # A NaN/inf level would poison the smoothed gain for good, so skip this block.
        if not np.isfinite(rms) or rms < self.agc_rms_threshold:
            return audio_data

        current_db = 20 * np.log10(rms)
        gain_db = self.agc_target_db - current_db
        gain_linear = float(
            np.clip(10 ** (gain_db / 20.0), self.agc_gain_min, self.agc_gain_max)
        )

        # Lowering the gain uses the small alpha; recovering (target clearly above
        # the current gain) uses the faster release alpha.
        if gain_linear > self.current_gain * 1.08:
            alpha = self._agc_release_alpha
        else:
            alpha = self._agc_alpha
        # Kept as a Python float so float32 audio is not promoted when multiplied.
        self.current_gain = float(alpha * gain_linear + (1 - alpha) * self.current_gain)

        adjusted = np.clip(audio_float * self.current_gain, -1.0, 1.0)

        if is_int16:
            adjusted = self._float_to_int16(adjusted)

        return adjusted

    def process(self, audio_data: np.ndarray) -> np.ndarray:
        """
        Run the complete preprocessing chain on a copy of the input.

        Args:
            audio_data: audio samples (numpy array).

        Returns:
            The preprocessed audio.
        """
        processed = audio_data.copy()

        if self.enable_noise_reduction:
            processed = self.reduce_noise(processed)

        if self.enable_agc:
            processed = self.automatic_gain_control(processed)

        return processed


# Backward-compatible aliases (keep the public API consistent).
AudioCapture = AudioCaptureOptimized
AudioPreprocessor = AudioPreprocessorOptimized


# Usage example: poll ten times for a chunk, preprocess each one and report its length.
if __name__ == "__main__":
    with AudioCapture() as mic:
        pipeline = AudioPreprocessor()

        for _ in range(10):
            block = mic.read_chunk_numpy(timeout=0.1)
            if block is None:
                # Nothing arrived within the timeout; poll again.
                continue
            processed = pipeline.process(block)
            print(f"处理了 {len(processed)} 个采样点")