2026-04-14 09:54:26 +08:00

715 lines
25 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
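Audio capture module (optimized version).

Input device (microphone) selection, simplified:
- If audio.input_device_index in system.yaml is an integer: only that PyAudio index is tried
  (if it cannot be opened, startup fails and the available devices are listed).
- If it is null: the system default input is tried first, then every device with
  maxInputChannels > 0, and finally any remaining devices as a fallback.

rocket_drone_audio can select a device interactively at startup and write the choice to
input_device_index; see src.core.mic_device_select.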
"""
from voice_drone.core.portaudio_env import fix_ld_path_for_portaudio
fix_ld_path_for_portaudio()
import re
import pyaudio
import numpy as np
import queue
import threading
from typing import List, Optional, Tuple
from voice_drone.core.configuration import SYSTEM_AUDIO_CONFIG
from voice_drone.logging_ import get_logger
logger = get_logger("audio.capture.optimized")
class AudioCaptureOptimized:
    """Microphone capture using a PyAudio callback stream feeding a bounded queue.

    The PortAudio callback pushes raw byte buffers into ``audio_queue``; consumers
    pull them with :meth:`read_chunk` / :meth:`read_chunk_numpy`.  When the queue
    is full the incoming buffer is dropped (with a warning) instead of blocking.

    Device selection (see :meth:`_ordered_input_candidates`):

    - ``input_device_index`` configured: only that PyAudio index is tried.
    - otherwise: the default input device, then every device reporting
      ``maxInputChannels > 0``, then all remaining devices as a fallback.

    The stream may be opened in stereo and/or at a different sample rate than
    the configured one; :meth:`read_chunk_numpy` then downmixes to mono and
    resamples back to ``sample_rate``.
    """

    def __init__(self):
        """Read capture settings from ``SYSTEM_AUDIO_CONFIG`` and create the PyAudio host."""
        # Values read from YAML may arrive as strings, so coerce every number.
        self.sample_rate = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000))
        self.channels = int(SYSTEM_AUDIO_CONFIG.get("channels", 1))
        self.chunk_size = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024))
        self.sample_width = int(SYSTEM_AUDIO_CONFIG.get("sample_width", 2))
        # Maximum number of callback buffers kept before new ones are dropped.
        self.buffer_queue_size = int(SYSTEM_AUDIO_CONFIG.get("buffer_queue_size", 10))
        self._prefer_stereo_capture = bool(
            SYSTEM_AUDIO_CONFIG.get("prefer_stereo_capture", True)
        )

        # None / empty string means "no explicit device configured".
        raw_idx = SYSTEM_AUDIO_CONFIG.get("input_device_index", None)
        self._input_device_index_cfg: Optional[int] = (
            int(raw_idx) if raw_idx is not None and str(raw_idx).strip() != "" else None
        )

        # Sample rates to try when opening, in order, without duplicates.
        tr = SYSTEM_AUDIO_CONFIG.get("audio_open_try_rates")
        if tr:
            raw_rates: List[int] = [int(x) for x in tr if x is not None]
        else:
            raw_rates = [self.sample_rate, 48000, 44100, 32000]
        self._open_try_rates: List[int] = list(dict.fromkeys(raw_rates))

        # ``channels`` is the logical channel count handed to consumers;
        # ``_pa_channels`` is the channel count PortAudio actually opened.
        self._pa_channels = self.channels
        self._stereo_downmix = False
        self._pa_open_sample_rate: int = self.sample_rate

        self.audio = pyaudio.PyAudio()
        self.format = self.audio.get_format_from_width(self.sample_width)
        # Bounded queue filled by the callback and drained by the readers.
        self.audio_queue = queue.Queue(maxsize=self.buffer_queue_size)
        self.stream: Optional[pyaudio.Stream] = None
        logger.info(
            f"优化版音频采集器初始化成功: "
            f"采样率={self.sample_rate}Hz, "
            f"块大小={self.chunk_size}, "
            f"使用回调模式+队列缓冲"
        )

    def _device_hw_tuple_in_name(self, dev_name: str) -> Optional[Tuple[int, int]]:
        """Extract ``(card, device)`` from a ``(hw:C,D)`` marker in a device name.

        Returns:
            The two integers, or ``None`` when the name has no such marker.
        """
        m = re.search(r"\(hw:(\d+),\s*(\d+)\)", dev_name)
        if not m:
            return None
        return int(m.group(1)), int(m.group(2))

    def _ordered_input_candidates(self) -> Tuple[List[int], List[int]]:
        """Build the ordered device indices to try.

        Returns:
            ``(preferred, fallback)``.  With a configured index, ``preferred``
            holds just that index and ``fallback`` is empty.  Otherwise
            ``preferred`` is the default input followed by every device with
            ``maxInputChannels > 0`` and ``fallback`` is every other device
            whose info can be queried.
        """
        preferred: List[int] = []
        seen = set()

        def add(idx: Optional[int]) -> None:
            # Append once, preserving first-seen order.
            if idx is None:
                return
            ii = int(idx)
            if ii in seen:
                return
            seen.add(ii)
            preferred.append(ii)

        # A configured integer index means: open exactly that device.
        if self._input_device_index_cfg is not None:
            add(self._input_device_index_cfg)
            return preferred, []

        try:
            add(int(self.audio.get_default_input_device_info()["index"]))
        except Exception:
            # Best effort: the default-input query failed, so rely on enumeration.
            pass

        device_count = self.audio.get_device_count()
        for i in range(device_count):
            try:
                inf = self.audio.get_device_info_by_index(i)
                if int(inf.get("maxInputChannels", 0)) > 0:
                    add(i)
            except Exception:
                continue

        fallback: List[int] = []
        for i in range(device_count):
            if i in seen:
                continue
            try:
                self.audio.get_device_info_by_index(i)
            except Exception:
                continue
            fallback.append(i)
        return preferred, fallback

    def _channel_plan(self, max_in: int, dev_name: str) -> List[Tuple[int, bool]]:
        """Return the ``(pa_channels, stereo_downmix)`` combinations to try, in order.

        Args:
            max_in: ``maxInputChannels`` reported for the device.
            dev_name: Device name, used only in the warning message.
        """
        ch = self.channels
        if ch != 1:
            # Non-mono logical output: open exactly what was asked for.
            return [(ch, False)]
        if max_in <= 0:
            logger.warning(
                "设备 %s 报告 maxInputChannels=%s,将尝试 mono / stereo",
                dev_name or "?",
                max_in,
            )
            return [(1, False), (2, True)]
        if max_in == 1:
            return [(1, False)]
        if self._prefer_stereo_capture:
            return [(2, True), (1, False)]
        return [(1, False)]

    def _frames_per_buffer_for_rate(self, pa_rate: int) -> int:
        """Scale ``chunk_size`` to ``pa_rate`` so each buffer spans the same duration.

        A non-positive ``pa_rate`` is treated as the configured sample rate.
        The result is never smaller than 128 frames.
        """
        if pa_rate <= 0:
            pa_rate = self.sample_rate
        return max(128, int(round(self.chunk_size * pa_rate / self.sample_rate)))

    @staticmethod
    def _resample_linear_int16(
        x: np.ndarray, sr_in: int, sr_out: int
    ) -> np.ndarray:
        """Resample ``x`` from ``sr_in`` to ``sr_out`` by linear interpolation.

        Returns ``x`` itself when the rates match or ``x`` is empty; otherwise
        a new int16 array of ``round(len(x) * sr_out / sr_in)`` samples (at
        least one), rounded and clipped to the int16 range.
        """
        if sr_in == sr_out or x.size == 0:
            return x
        n_out = max(1, int(round(x.size * (sr_out / sr_in))))
        t_in = np.arange(x.size, dtype=np.float64)
        t_out = np.linspace(0.0, float(x.size - 1), n_out, dtype=np.float64)
        y = np.interp(t_out, t_in, x.astype(np.float32))
        return np.clip(np.round(y), -32768, 32767).astype(np.int16)

    def _try_open_on_device(self, input_device_index: int) -> bool:
        """Try every channel plan and sample rate on one device.

        On success ``self.stream`` is started and ``_pa_channels``,
        ``_stereo_downmix`` and ``_pa_open_sample_rate`` describe how it was
        opened.

        Returns:
            True when a stream was opened and started, False otherwise.
        """
        try:
            dev = self.audio.get_device_info_by_index(input_device_index)
        except Exception:
            return False
        max_in = int(dev.get("maxInputChannels", 0))
        dev_name = str(dev.get("name", ""))
        if (
            max_in <= 0
            and self._input_device_index_cfg is not None
            and int(input_device_index) == int(self._input_device_index_cfg)
            and self._prefer_stereo_capture
            and self.channels == 1
        ):
            # The explicitly configured device reports no input channels:
            # assume two so that a stereo open is attempted first.
            max_in = 2
            logger.warning(
                "设备 %s 上报 maxInputChannels=0假定 2 通道以尝试 ES8388 立体声采集",
                input_device_index,
            )
        plan = self._channel_plan(max_in, dev_name)
        hw_t = self._device_hw_tuple_in_name(dev_name)
        for pa_ch, stereo_dm in plan:
            self._pa_channels = pa_ch
            self._stereo_downmix = stereo_dm
            if stereo_dm and pa_ch == 2:
                logger.info(
                    "输入按立体声打开并下混 mono%s index=%s",
                    dev_name,
                    input_device_index,
                )
            for rate in self._open_try_rates:
                fpb = self._frames_per_buffer_for_rate(int(rate))
                try:
                    self.stream = self.audio.open(
                        format=self.format,
                        channels=self._pa_channels,
                        rate=int(rate),
                        input=True,
                        input_device_index=input_device_index,
                        frames_per_buffer=fpb,
                        stream_callback=self._audio_callback,
                        start=False,
                    )
                    self.stream.start_stream()
                    self._pa_open_sample_rate = int(rate)
                    extra = (
                        f" hw=card{hw_t[0]}dev{hw_t[1]}" if hw_t else ""
                    )
                    if self._pa_open_sample_rate != self.sample_rate:
                        logger.warning(
                            "输入实际 %s Hz将重采样为 %s Hz 供 VAD/STT",
                            self._pa_open_sample_rate,
                            self.sample_rate,
                        )
                    logger.info(
                        "音频流启动成功 index=%s name=%r PA_ch=%s PA_rate=%s 逻辑rate=%s%s",
                        input_device_index,
                        dev_name,
                        self._pa_channels,
                        self._pa_open_sample_rate,
                        self.sample_rate,
                        extra,
                    )
                    return True
                except Exception as e:
                    # The stream may have opened but failed to start: release it.
                    if self.stream is not None:
                        try:
                            self.stream.close()
                        except Exception:
                            pass
                        self.stream = None
                    logger.warning(
                        "打开失败 index=%s ch=%s rate=%s: %s",
                        input_device_index,
                        pa_ch,
                        rate,
                        e,
                    )
        return False

    def _audio_callback(self, in_data, frame_count, time_info, status):
        """PortAudio stream callback: enqueue the buffer without blocking.

        A buffer arriving while the queue is full is dropped with a warning.
        Always asks PortAudio to continue.
        """
        if status:
            logger.warning(f"音频流状态: {status}")
        try:
            self.audio_queue.put(in_data, block=False)
        except queue.Full:
            logger.warning("音频队列已满,丢弃数据块")
        return (None, pyaudio.paContinue)

    def _log_input_devices_for_user(self) -> None:
        """Print and log every PortAudio device (including those with 0 input channels)."""
        n_dev = self.audio.get_device_count()
        if n_dev <= 0:
            print(
                "[audio] PyAudio get_device_count()=0多为 ALSA/PortAudio 未初始化;"
                "请用 bash with_system_alsa.sh python … 启动。",
                flush=True,
            )
            logger.error("PyAudio 枚举不到任何设备")
            return
        lines: List[str] = []
        for i in range(n_dev):
            try:
                inf = self.audio.get_device_info_by_index(i)
                mic = int(inf.get("maxInputChannels", 0))
                outc = int(inf.get("maxOutputChannels", 0))
                name = str(inf.get("name", "?"))
                mark = " <- 可录音" if mic > 0 else ""
                lines.append(f" [{i}] in={mic} out={outc} {name}{mark}")
            except Exception:
                continue
        msg = "\n".join(lines)
        logger.error("PortAudio 设备列表:\n%s", msg)
        print(
            "[audio] PortAudio 设备列表in>0 才可作输入;若板载显示 in=0 仍可用 probe 试采):\n"
            + msg,
            flush=True,
        )

    def _report_configured_index_unusable(self) -> None:
        """Tell the user the configured ``input_device_index`` could not be used (no-op if unset)."""
        if self._input_device_index_cfg is None:
            return
        logger.error(
            "已配置 input_device_index=%s 但无效或不可打开",
            self._input_device_index_cfg,
        )
        print(
            f"[audio] 当前配置的 PyAudio 索引 {self._input_device_index_cfg} 不可用,"
            "请改 system.yaml 或重新运行交互选设备。",
            flush=True,
        )

    def _drain_queue(self) -> None:
        """Discard every buffer currently waiting in ``audio_queue``."""
        while True:
            try:
                self.audio_queue.get_nowait()
            except queue.Empty:
                return

    def start_stream(self) -> None:
        """Open and start the capture stream on the first candidate device that works.

        Does nothing when a stream is already open.

        Raises:
            OSError: when there is no candidate device or none can be opened.
        """
        if self.stream is not None:
            return
        preferred, fallback = self._ordered_input_candidates()
        to_try: List[int] = preferred + fallback
        if not to_try:
            print(
                "[audio] 无任何输入候选。请检查 PortAudio/ALSA建议bash with_system_alsa.sh python …)。",
                flush=True,
            )
            self._report_configured_index_unusable()
            self._log_input_devices_for_user()
            raise OSError("未找到任何 PyAudio 输入候选设备")
        for input_device_index in to_try:
            if self._try_open_on_device(input_device_index):
                return
        logger.error("启动音频流失败:全部候选设备无法打开")
        # A configured index always yields a non-empty candidate list, so this
        # is the only place the "configured index unusable" hint can be reached.
        self._report_configured_index_unusable()
        self._log_input_devices_for_user()
        raise OSError("启动音频流失败:全部候选设备无法打开")

    def stop_stream(self) -> None:
        """Stop and close the stream, then clear pending audio.

        Errors from PortAudio are logged, not raised.  The stream handle is
        always released and cleared, even when stopping fails, so that a later
        :meth:`start_stream` can reopen the device.
        """
        stream = self.stream
        if stream is None:
            return
        error: Optional[Exception] = None
        try:
            try:
                stream.stop_stream()
            finally:
                # Close even if stopping raised, otherwise the device stays held.
                stream.close()
        except Exception as e:
            error = e
        finally:
            self.stream = None
            self._pa_open_sample_rate = self.sample_rate
            self._drain_queue()
        if error is None:
            logger.info("音频流已停止")
        else:
            logger.error(f"停止音频流失败: {error}")

    def read_chunk(self, timeout: float = 0.1) -> Optional[bytes]:
        """Return the next raw buffer from the queue.

        Args:
            timeout: Maximum time to wait, in seconds.

        Returns:
            The raw bytes, or ``None`` when no stream is open or the wait timed out.
        """
        if self.stream is None:
            return None
        try:
            return self.audio_queue.get(timeout=timeout)
        except queue.Empty:
            return None

    def read_chunk_numpy(self, timeout: float = 0.1) -> Optional[np.ndarray]:
        """Return the next buffer as little-endian int16 samples.

        Trailing bytes that do not form a whole frame are discarded.  When the
        stream was opened in stereo for downmixing, the two channels are
        averaged; when it was opened at another rate, the result is resampled
        to ``sample_rate``.

        Returns:
            The sample array, or ``None`` when nothing usable was read.
        """
        data = self.read_chunk(timeout)
        if data is None:
            return None
        sample_size = self._pa_channels * self.sample_width
        if len(data) % sample_size != 0:
            aligned_len = (len(data) // sample_size) * sample_size
            if aligned_len == 0:
                return None
            data = data[:aligned_len]
        mono = np.frombuffer(data, dtype="<i2")
        if self._stereo_downmix and self._pa_channels == 2:
            n = mono.size // 2
            if n == 0:
                return None
            # Widen to int32 so the L+R sum cannot overflow before halving.
            s = mono[: n * 2].reshape(n, 2).astype(np.int32)
            mono = ((s[:, 0] + s[:, 1]) // 2).astype(np.int16)
        if self._pa_open_sample_rate != self.sample_rate:
            mono = self._resample_linear_int16(
                mono, self._pa_open_sample_rate, self.sample_rate
            )
        return mono

    def __enter__(self):
        """Start the stream and return ``self``."""
        self.start_stream()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the stream and terminate the PyAudio host."""
        try:
            self.stop_stream()
        finally:
            self.audio.terminate()
class IncrementalRMS:
    """Sliding-window RMS over the most recent ``window_size`` samples.

    Samples are kept in a float32 ring buffer.  While the window is filling,
    data occupies ``buffer[:count]`` and ``idx`` stays 0; once full, ``idx``
    points at the oldest sample, which is the next one to be overwritten.
    ``sum_sq`` holds the sum of squares of the samples currently in the window.
    """

    def __init__(self, window_size: int = 1024):
        """
        Args:
            window_size: Number of samples in the sliding window.
        """
        self.window_size = window_size
        self.buffer = np.zeros(window_size, dtype=np.float32)
        self.sum_sq = 0.0
        self.idx = 0
        self.count = 0

    def _current_rms(self) -> float:
        """Return the RMS of the samples currently in the window (0.0 when empty)."""
        n = min(self.count, self.window_size)
        if n <= 0:
            return 0.0
        # Guard against a tiny negative sum caused by rounding.
        return float(np.sqrt(max(self.sum_sq, 0.0) / n))

    def update(self, sample: float) -> float:
        """Push one sample and return the updated RMS.

        Args:
            sample: New sample value.

        Returns:
            RMS over the samples currently in the window.
        """
        if self.count < self.window_size:
            # Fill phase: append at the next free slot.
            slot = self.count
            self.buffer[slot] = sample
            stored = float(self.buffer[slot])
            self.sum_sq += stored * stored
            self.count += 1
        else:
            # Slide phase: replace the oldest sample.
            slot = self.idx
            old = float(self.buffer[slot])
            self.buffer[slot] = sample
            stored = float(self.buffer[slot])
            # Add and subtract the *stored* (float32-rounded) values so the
            # running sum stays consistent with the buffer and cannot drift
            # below zero, which would make the square root NaN.
            self.sum_sq = max(0.0, self.sum_sq - old * old + stored * stored)
            self.idx = (slot + 1) % self.window_size
        return self._current_rms()

    def update_batch(self, samples: np.ndarray) -> float:
        """Push a batch of samples and return the updated RMS.

        The ring buffer ends in the same state as calling :meth:`update` once
        per sample, but the write is vectorized and ``sum_sq`` is recomputed
        from the buffer, so no rounding error accumulates across batches.

        Args:
            samples: Array of new sample values (flattened if multi-dimensional).

        Returns:
            RMS over the samples currently in the window; the current value is
            returned unchanged for an empty batch (0.0 if nothing was pushed yet).
        """
        batch = np.asarray(samples, dtype=np.float32).ravel()
        n = batch.size
        w = self.window_size
        if n == 0 or w <= 0:
            return self._current_rms()
        if n >= w:
            # The batch alone fills the window: keep only its tail.
            self.buffer[:] = batch[n - w:]
            self.idx = 0
            self.count = w
        else:
            start = self.count if self.count < w else self.idx
            end = start + n
            if end <= w:
                self.buffer[start:end] = batch
            else:
                # Wrap around the end of the ring buffer.
                head = w - start
                self.buffer[start:] = batch[:head]
                self.buffer[: end - w] = batch[head:]
            self.count = min(w, self.count + n)
            self.idx = end % w if self.count == w else 0
        window = self.buffer[: self.count].astype(np.float64)
        self.sum_sq = float(np.dot(window, window))
        return self._current_rms()

    def reset(self):
        """Clear the window and all running state."""
        self.buffer.fill(0.0)
        self.sum_sq = 0.0
        self.idx = 0
        self.count = 0
class LightweightNoiseReduction:
    """Stateful first-order high-pass filter used as a cheap noise reducer.

    Each output sample follows the recurrence
    ``y[n] = alpha * (y[n-1] + x[n] - x[n-1])`` with
    ``alpha = exp(-2*pi*cutoff/sample_rate)``.  The last input and output
    are remembered between calls so consecutive chunks filter seamlessly.
    """

    def __init__(self, sample_rate: int = 16000, cutoff: float = 80.0):
        """
        Args:
            sample_rate: Sample rate of the audio, in Hz.
            cutoff: High-pass cutoff frequency, in Hz.
        """
        self.sample_rate = sample_rate
        self.cutoff = cutoff
        # Feedback coefficient of the one-pole high-pass recurrence.
        self.alpha = np.exp(-2.0 * np.pi * cutoff / sample_rate)
        self.prev_input = 0.0
        self.prev_output = 0.0

    def process(self, audio: np.ndarray) -> np.ndarray:
        """High-pass filter one chunk, continuing from the previous chunk's state.

        Args:
            audio: Samples to filter; converted to float32 when needed.

        Returns:
            A new float32 array with the filtered samples.
        """
        samples = audio if audio.dtype == np.float32 else audio.astype(np.float32)
        filtered = np.zeros_like(samples)
        coeff = self.alpha
        last_in, last_out = self.prev_input, self.prev_output
        for pos, current in enumerate(samples):
            filtered[pos] = coeff * (last_out + current - last_in)
            last_in = current
            # Carry the value as stored in the output array (float32).
            last_out = filtered[pos]
        self.prev_input, self.prev_output = last_in, last_out
        return filtered

    def reset(self):
        """Forget the remembered input/output so the next chunk starts fresh."""
        self.prev_input = 0.0
        self.prev_output = 0.0
class AudioPreprocessorOptimized:
    """Audio pre-processing pipeline: optional noise reduction followed by optional AGC.

    Noise reduction delegates to ``LightweightNoiseReduction``; automatic gain
    control uses an ``IncrementalRMS`` level estimate and a smoothed gain that
    reacts faster when the gain has to rise than when it has to fall.
    int16 input is processed in float and converted back with saturation.
    """

    def __init__(self, enable_noise_reduction: Optional[bool] = None,
                 enable_agc: Optional[bool] = None):
        """Build the pipeline.

        Args:
            enable_noise_reduction: Enable the noise reducer; ``None`` reads
                ``noise_reduce`` from ``SYSTEM_AUDIO_CONFIG`` (default True).
            enable_agc: Enable AGC; ``None`` reads ``agc`` from
                ``SYSTEM_AUDIO_CONFIG`` (default True).
        """
        if enable_noise_reduction is None:
            enable_noise_reduction = SYSTEM_AUDIO_CONFIG.get("noise_reduce", True)
        if enable_agc is None:
            enable_agc = SYSTEM_AUDIO_CONFIG.get("agc", True)
        self.enable_noise_reduction = enable_noise_reduction
        self.enable_agc = enable_agc
        self.sample_rate = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000))

        # AGC parameters; values read from YAML may be strings, so coerce them.
        self.agc_target_db = float(SYSTEM_AUDIO_CONFIG.get("agc_target_db", -20.0))
        self.agc_gain_min = float(SYSTEM_AUDIO_CONFIG.get("agc_gain_min", 0.1))
        self.agc_gain_max = float(SYSTEM_AUDIO_CONFIG.get("agc_gain_max", 10.0))
        self.agc_rms_threshold = float(SYSTEM_AUDIO_CONFIG.get("agc_rms_threshold", 1e-6))
        self._agc_alpha = float(SYSTEM_AUDIO_CONFIG.get("agc_smoothing_alpha", 0.1))
        self._agc_alpha = max(0.02, min(0.95, self._agc_alpha))
        # A larger coefficient is used when the gain must rise (quiet input,
        # or right after a loud burst) so it does not stay pinned near
        # agc_gain_min for long.
        self._agc_release_alpha = float(
            SYSTEM_AUDIO_CONFIG.get("agc_release_alpha", 0.45)
        )
        self._agc_release_alpha = max(self._agc_alpha, min(0.95, self._agc_release_alpha))

        if enable_noise_reduction:
            self.noise_reducer = LightweightNoiseReduction(
                sample_rate=self.sample_rate,
                cutoff=80.0,
            )
        else:
            self.noise_reducer = None

        # Smoothed AGC gain; always defined so the attribute exists even when
        # AGC is disabled.
        self.current_gain = 1.0
        if enable_agc:
            window_size = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024))
            self.rms_calculator = IncrementalRMS(window_size=window_size)
        else:
            self.rms_calculator = None

        logger.info(
            f"优化版音频预处理器初始化完成: "
            f"降噪={'启用(轻量级)' if enable_noise_reduction else '禁用'}, "
            f"自动增益控制={'启用(增量)' if enable_agc else '禁用'}"
        )

    @staticmethod
    def _float_to_int16(audio_float: np.ndarray) -> np.ndarray:
        """Convert float samples (nominally in [-1, 1]) to int16 with saturation.

        Scaling +1.0 by 32768 gives 32768, which does not fit in int16; without
        the clip, positive peaks would wrap around to large negative values.
        """
        scaled = np.asarray(audio_float, dtype=np.float32) * 32768.0
        return np.clip(scaled, -32768.0, 32767.0).astype(np.int16)

    def reset(self) -> None:
        """Reset the high-pass filter and the AGC state.

        Intended to be called after capture was paused and restarted, so state
        from before the pause is not carried into the new stream.
        """
        if self.noise_reducer is not None:
            self.noise_reducer.reset()
        if self.rms_calculator is not None:
            self.rms_calculator.reset()
        if self.enable_agc:
            self.current_gain = 1.0

    def reset_agc_state(self) -> None:
        """Clear the RMS window and set the gain back to 1 (no-op when AGC is off).

        Intended to be called at the end of an utterance or when recovery is
        needed, so a short loud burst does not keep the gain suppressed.
        """
        if not self.enable_agc or self.rms_calculator is None:
            return
        self.rms_calculator.reset()
        self.current_gain = 1.0

    def reduce_noise(self, audio_data: np.ndarray) -> np.ndarray:
        """Apply the noise reducer.

        Args:
            audio_data: Samples as int16, or float assumed to be in [-1, 1].

        Returns:
            Filtered samples: int16 (saturated) for int16 input, float32
            otherwise.  The input is returned unchanged when noise reduction
            is disabled.
        """
        if not self.enable_noise_reduction or self.noise_reducer is None:
            return audio_data
        is_int16 = audio_data.dtype == np.int16
        if is_int16:
            audio_float = audio_data.astype(np.float32) / 32768.0
        else:
            audio_float = audio_data.astype(np.float32)
        reduced = self.noise_reducer.process(audio_float)
        if is_int16:
            # The filter can overshoot beyond full scale, so saturate.
            reduced = self._float_to_int16(reduced)
        return reduced

    def automatic_gain_control(self, audio_data: np.ndarray) -> np.ndarray:
        """Apply automatic gain control driven by the sliding-window RMS.

        Args:
            audio_data: Samples as int16, or float assumed to be in [-1, 1].

        Returns:
            Gain-adjusted samples clipped to full scale: int16 for int16 input,
            float32 otherwise.  The input is returned unchanged when AGC is
            disabled or the measured RMS is below ``agc_rms_threshold`` or not
            finite.
        """
        if not self.enable_agc or self.rms_calculator is None:
            return audio_data
        is_int16 = audio_data.dtype == np.int16
        if is_int16:
            audio_float = audio_data.astype(np.float32) / 32768.0
        else:
            audio_float = audio_data.astype(np.float32)

        rms = self.rms_calculator.update_batch(audio_float)
        # A non-finite RMS would turn the smoothed gain into NaN permanently.
        if not np.isfinite(rms) or rms < self.agc_rms_threshold:
            return audio_data

        current_db = 20 * np.log10(rms)
        gain_db = self.agc_target_db - current_db
        gain_linear = 10 ** (gain_db / 20.0)
        gain_linear = np.clip(gain_linear, self.agc_gain_min, self.agc_gain_max)
        # Lowering the gain uses the slower alpha; raising it noticeably above
        # the current gain uses the faster release alpha.
        if gain_linear > self.current_gain * 1.08:
            alpha = self._agc_release_alpha
        else:
            alpha = self._agc_alpha
        # Keep the gain a plain float so float32 audio stays float32.
        self.current_gain = float(alpha * gain_linear + (1 - alpha) * self.current_gain)

        adjusted = np.clip(audio_float * self.current_gain, -1.0, 1.0)
        if is_int16:
            adjusted = self._float_to_int16(adjusted)
        return adjusted

    def process(self, audio_data: np.ndarray) -> np.ndarray:
        """Run the full pipeline on a copy of the input.

        Args:
            audio_data: Input samples (numpy array).

        Returns:
            The processed samples; the input array is never modified.
        """
        processed = audio_data.copy()
        if self.enable_noise_reduction:
            processed = self.reduce_noise(processed)
        if self.enable_agc:
            processed = self.automatic_gain_control(processed)
        return processed
# Backward-compatible aliases so existing callers can keep the old class names.
AudioCapture = AudioCaptureOptimized
AudioPreprocessor = AudioPreprocessorOptimized

# Usage example: capture up to ten chunks and run each through the preprocessor.
if __name__ == "__main__":
    with AudioCapture() as capture:
        preprocessor = AudioPreprocessor()
        for _ in range(10):
            chunk = capture.read_chunk_numpy(timeout=0.1)
            if chunk is None:
                # Nothing arrived within the timeout; try again.
                continue
            processed = preprocessor.process(chunk)
            print(f"处理了 {len(processed)} 个采样点")