715 lines
25 KiB
Python
715 lines
25 KiB
Python
"""
|
||
音频采集模块 - 优化版本
|
||
|
||
输入设备(麦克风)选择(已简化):
|
||
- 若 system.yaml 中 audio.input_device_index 为整数:只尝试该 PyAudio 索引(无则启动失败并列设备)。
|
||
- 若为 null:依次尝试系统默认输入、所有 maxInputChannels>0 的设备。
|
||
rocket_drone_audio 启动时可交互选择并写入 input_device_index(见 src.core.mic_device_select)。
|
||
"""
|
||
# NOTE(review): fix_ld_path_for_portaudio() is deliberately invoked BEFORE
# `import pyaudio` below. Presumably it adjusts the dynamic-library search path
# so the PortAudio shared library can be found when pyaudio loads -- confirm in
# voice_drone.core.portaudio_env. Do not reorder these statements.
from voice_drone.core.portaudio_env import fix_ld_path_for_portaudio

fix_ld_path_for_portaudio()

import re
import pyaudio
import numpy as np
import queue
import threading
from typing import List, Optional, Tuple
from voice_drone.core.configuration import SYSTEM_AUDIO_CONFIG
from voice_drone.logging_ import get_logger

# Module-level logger used by the capture and preprocessing classes below.
logger = get_logger("audio.capture.optimized")
|
||
|
||
|
||
class AudioCaptureOptimized:
    """Non-blocking microphone capture: PyAudio callback stream + bounded queue.

    The PortAudio callback pushes raw byte chunks into ``audio_queue``;
    consumers pull them with :meth:`read_chunk` / :meth:`read_chunk_numpy`.
    The device may be opened with a different channel count or sample rate
    than the logical (configured) one; :meth:`read_chunk_numpy` then downmixes
    stereo to mono and linearly resamples to ``sample_rate``.

    Can be used as a context manager: entering starts the stream, leaving
    stops it and terminates the PyAudio instance.
    """

    def __init__(self):
        """Load settings from ``SYSTEM_AUDIO_CONFIG`` and create the PyAudio handle.

        No stream is opened here; call :meth:`start_stream` (or use ``with``).
        """
        # Values loaded from YAML may arrive as strings, so coerce explicitly.
        self.sample_rate = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000))
        self.channels = int(SYSTEM_AUDIO_CONFIG.get("channels", 1))
        self.chunk_size = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024))
        self.sample_width = int(SYSTEM_AUDIO_CONFIG.get("sample_width", 2))

        # High-throughput settings.
        self.buffer_queue_size = int(SYSTEM_AUDIO_CONFIG.get("buffer_queue_size", 10))
        self._prefer_stereo_capture = bool(
            SYSTEM_AUDIO_CONFIG.get("prefer_stereo_capture", True)
        )
        raw_idx = SYSTEM_AUDIO_CONFIG.get("input_device_index", None)
        # None (or an empty string) means "no explicit device configured".
        self._input_device_index_cfg: Optional[int] = (
            int(raw_idx) if raw_idx is not None and str(raw_idx).strip() != "" else None
        )

        tr = SYSTEM_AUDIO_CONFIG.get("audio_open_try_rates")
        if tr:
            raw_rates: List[int] = [int(x) for x in tr if x is not None]
        else:
            raw_rates = [self.sample_rate, 48000, 44100, 32000]
        # dict.fromkeys drops duplicates while keeping first-seen order.
        self._open_try_rates: List[int] = list(dict.fromkeys(raw_rates))

        # ``channels`` is the logical (mono) layout handed to VAD/STT;
        # ``_pa_channels`` is what PortAudio actually opened.
        self._pa_channels = self.channels
        self._stereo_downmix = False
        self._pa_open_sample_rate: int = self.sample_rate

        self.audio = pyaudio.PyAudio()
        self.format = self.audio.get_format_from_width(self.sample_width)

        # Bounded queue filled by the PortAudio callback (never blocks there).
        self.audio_queue = queue.Queue(maxsize=self.buffer_queue_size)
        self.stream: Optional[pyaudio.Stream] = None

        logger.info(
            f"优化版音频采集器初始化成功: "
            f"采样率={self.sample_rate}Hz, "
            f"块大小={self.chunk_size}, "
            f"使用回调模式+队列缓冲"
        )

    def _device_hw_tuple_in_name(self, dev_name: str) -> Optional[Tuple[int, int]]:
        """Extract ``(card, device)`` from a name containing ``(hw:X,Y)``, else None."""
        m = re.search(r"\(hw:(\d+),\s*(\d+)\)", dev_name)
        if not m:
            return None
        return int(m.group(1)), int(m.group(2))

    def _ordered_input_candidates(self) -> Tuple[List[int], List[int]]:
        """Return ``(preferred, fallback)`` PyAudio device indices to try, in order.

        * Explicit index configured: ``([index], [])`` -- only that device.
        * Otherwise ``preferred`` is the default input device followed by every
          device reporting ``maxInputChannels > 0``; ``fallback`` is every other
          device whose info can still be queried.
        """
        preferred: List[int] = []
        seen = set()

        def add(idx: Optional[int]) -> None:
            if idx is None:
                return
            ii = int(idx)
            if ii in seen:
                return
            seen.add(ii)
            preferred.append(ii)

        # A configured integer index means: open exactly that device.
        if self._input_device_index_cfg is not None:
            add(self._input_device_index_cfg)
            return preferred, []

        try:
            add(int(self.audio.get_default_input_device_info()["index"]))
        except Exception:
            # No usable default input; rely on enumeration below.
            pass

        device_count = self.audio.get_device_count()
        for i in range(device_count):
            try:
                inf = self.audio.get_device_info_by_index(i)
                if int(inf.get("maxInputChannels", 0)) > 0:
                    add(i)
            except Exception:
                continue

        fallback: List[int] = []
        for i in range(device_count):
            if i in seen:
                continue
            try:
                self.audio.get_device_info_by_index(i)
            except Exception:
                continue
            fallback.append(i)
        return preferred, fallback

    def _channel_plan(self, max_in: int, dev_name: str) -> List[Tuple[int, bool]]:
        """Return the ``(pa_channels, stereo_downmix)`` combinations to attempt.

        Args:
            max_in: ``maxInputChannels`` reported for the device.
            dev_name: Device name, used only for logging.
        """
        ch = self.channels
        if ch != 1:
            # Non-mono logical layout: open exactly as configured, no downmix.
            return [(ch, False)]
        if max_in <= 0:
            logger.warning(
                "设备 %s 报告 maxInputChannels=%s,将尝试 mono / stereo",
                dev_name or "?",
                max_in,
            )
            return [(1, False), (2, True)]
        if max_in == 1:
            return [(1, False)]
        if self._prefer_stereo_capture:
            # Try stereo + downmix first, then plain mono.
            return [(2, True), (1, False)]
        return [(1, False)]

    def _frames_per_buffer_for_rate(self, pa_rate: int) -> int:
        """Scale ``chunk_size`` to the device rate (minimum 128 frames)."""
        if pa_rate <= 0:
            pa_rate = self.sample_rate
        return max(128, int(round(self.chunk_size * pa_rate / self.sample_rate)))

    @staticmethod
    def _resample_linear_int16(
        x: np.ndarray, sr_in: int, sr_out: int
    ) -> np.ndarray:
        """Linearly resample a 1-D int16 signal from ``sr_in`` to ``sr_out``.

        Returns ``x`` unchanged when the rates match or the input is empty.
        """
        if sr_in == sr_out or x.size == 0:
            return x
        n_out = max(1, int(round(x.size * (sr_out / sr_in))))
        t_in = np.arange(x.size, dtype=np.float64)
        t_out = np.linspace(0.0, float(x.size - 1), n_out, dtype=np.float64)
        y = np.interp(t_out, t_in, x.astype(np.float32))
        return np.clip(np.round(y), -32768, 32767).astype(np.int16)

    def _try_open_on_device(self, input_device_index: int) -> bool:
        """Try every channel-plan / sample-rate combination on one device.

        On success the stream is running, ``_pa_channels``, ``_stereo_downmix``
        and ``_pa_open_sample_rate`` describe how it was opened, and True is
        returned. On failure ``self.stream`` is None, the channel state is
        restored to the logical defaults, and False is returned.
        """
        try:
            dev = self.audio.get_device_info_by_index(input_device_index)
        except Exception:
            return False
        max_in = int(dev.get("maxInputChannels", 0))
        dev_name = str(dev.get("name", ""))
        if (
            max_in <= 0
            and self._input_device_index_cfg is not None
            and int(input_device_index) == int(self._input_device_index_cfg)
            and self._prefer_stereo_capture
            and self.channels == 1
        ):
            # The explicitly configured device reports no input channels;
            # assume 2 so that a stereo open is still attempted.
            max_in = 2
            logger.warning(
                "设备 %s 上报 maxInputChannels=0,假定 2 通道以尝试 ES8388 立体声采集",
                input_device_index,
            )
        plan = self._channel_plan(max_in, dev_name)
        hw_t = self._device_hw_tuple_in_name(dev_name)

        for pa_ch, stereo_dm in plan:
            self._pa_channels = pa_ch
            self._stereo_downmix = stereo_dm
            if stereo_dm and pa_ch == 2:
                logger.info(
                    "输入按立体声打开并下混 mono(%s index=%s)",
                    dev_name,
                    input_device_index,
                )
            for rate in self._open_try_rates:
                fpb = self._frames_per_buffer_for_rate(int(rate))
                try:
                    self.stream = self.audio.open(
                        format=self.format,
                        channels=self._pa_channels,
                        rate=int(rate),
                        input=True,
                        input_device_index=input_device_index,
                        frames_per_buffer=fpb,
                        stream_callback=self._audio_callback,
                        start=False,
                    )
                    self.stream.start_stream()
                    self._pa_open_sample_rate = int(rate)
                    extra = (
                        f" hw=card{hw_t[0]}dev{hw_t[1]}" if hw_t else ""
                    )
                    if self._pa_open_sample_rate != self.sample_rate:
                        logger.warning(
                            "输入实际 %s Hz,将重采样为 %s Hz 供 VAD/STT",
                            self._pa_open_sample_rate,
                            self.sample_rate,
                        )
                    logger.info(
                        "音频流启动成功 index=%s name=%r PA_ch=%s PA_rate=%s 逻辑rate=%s%s",
                        input_device_index,
                        dev_name,
                        self._pa_channels,
                        self._pa_open_sample_rate,
                        self.sample_rate,
                        extra,
                    )
                    return True
                except Exception as e:
                    # open() may have succeeded while start_stream() failed:
                    # close the half-opened stream before the next attempt.
                    if self.stream is not None:
                        try:
                            self.stream.close()
                        except Exception:
                            pass
                        self.stream = None
                    logger.warning(
                        "打开失败 index=%s ch=%s rate=%s: %s",
                        input_device_index,
                        pa_ch,
                        rate,
                        e,
                    )

        # Nothing worked: do not leave the last attempted layout behind.
        self._pa_channels = self.channels
        self._stereo_downmix = False
        return False

    def _audio_callback(self, in_data, frame_count, time_info, status):
        """PortAudio callback: enqueue the chunk without blocking.

        When the queue is full the chunk is dropped (and a warning logged).
        """
        if status:
            logger.warning(f"音频流状态: {status}")

        try:
            self.audio_queue.put(in_data, block=False)
        except queue.Full:
            logger.warning("音频队列已满,丢弃数据块")

        return (None, pyaudio.paContinue)

    def _log_input_devices_for_user(self) -> None:
        """Print and log every PortAudio device (including ones with 0 input channels)."""
        n_dev = self.audio.get_device_count()
        if n_dev <= 0:
            print(
                "[audio] PyAudio get_device_count()=0,多为 ALSA/PortAudio 未初始化;"
                "请用 bash with_system_alsa.sh python … 启动。",
                flush=True,
            )
            logger.error("PyAudio 枚举不到任何设备")
            return
        lines: List[str] = []
        for i in range(n_dev):
            try:
                inf = self.audio.get_device_info_by_index(i)
                mic = int(inf.get("maxInputChannels", 0))
                outc = int(inf.get("maxOutputChannels", 0))
                name = str(inf.get("name", "?"))
                mark = " <- 可录音" if mic > 0 else ""
                lines.append(f" [{i}] in={mic} out={outc} {name}{mark}")
            except Exception:
                continue
        msg = "\n".join(lines)
        logger.error("PortAudio 设备列表:\n%s", msg)
        print(
            "[audio] PortAudio 设备列表(in>0 才可作输入;若板载显示 in=0 仍可用 probe 试采):\n"
            + msg,
            flush=True,
        )

    def _report_configured_index_unusable(self) -> None:
        """Tell the user that the explicitly configured device index cannot be used."""
        cfg = self._input_device_index_cfg
        if cfg is None:
            return
        logger.error(
            "已配置 input_device_index=%s 但无效或不可打开",
            cfg,
        )
        print(
            f"[audio] 当前配置的 PyAudio 索引 {cfg} 不可用,"
            "请改 system.yaml 或重新运行交互选设备。",
            flush=True,
        )

    def start_stream(self) -> None:
        """Open and start the callback stream on the first device that works.

        Does nothing when a stream is already open.

        Raises:
            OSError: no candidate device exists, or none could be opened.
        """
        if self.stream is not None:
            return

        preferred, fallback = self._ordered_input_candidates()
        to_try: List[int] = preferred + fallback
        if not to_try:
            print(
                "[audio] 无任何输入候选。请检查 PortAudio/ALSA(建议:bash with_system_alsa.sh python …)。",
                flush=True,
            )
            self._report_configured_index_unusable()
            self._log_input_devices_for_user()
            raise OSError("未找到任何 PyAudio 输入候选设备")

        for input_device_index in to_try:
            if self._try_open_on_device(input_device_index):
                return

        # With a configured index the candidate list is never empty, so this is
        # the path actually taken when that device cannot be opened; the hint
        # about fixing the configuration has to be emitted here.
        self._report_configured_index_unusable()
        logger.error("启动音频流失败:全部候选设备无法打开")
        self._log_input_devices_for_user()
        raise OSError("启动音频流失败:全部候选设备无法打开")

    def stop_stream(self) -> None:
        """Stop and close the stream, then drop any queued chunks.

        Errors from PortAudio are logged, never raised. The instance always
        ends up in the "no stream" state so that :meth:`start_stream` can be
        called again, even when stopping or closing failed.
        """
        stream = self.stream
        if stream is None:
            return

        try:
            try:
                stream.stop_stream()
            finally:
                # Always attempt to release the device, even if stop failed.
                stream.close()
            logger.info("音频流已停止")
        except Exception as e:
            logger.error(f"停止音频流失败: {e}")
        finally:
            self.stream = None
            self._pa_open_sample_rate = self.sample_rate
            # Discard stale audio so a restarted stream begins clean.
            while True:
                try:
                    self.audio_queue.get_nowait()
                except queue.Empty:
                    break

    def read_chunk(self, timeout: float = 0.1) -> Optional[bytes]:
        """Return the next raw chunk from the queue.

        Args:
            timeout: Maximum time to wait, in seconds.

        Returns:
            The raw bytes, or None when no stream is open or the wait timed out.
        """
        if self.stream is None:
            return None

        try:
            return self.audio_queue.get(timeout=timeout)
        except queue.Empty:
            return None

    def read_chunk_numpy(self, timeout: float = 0.1) -> Optional[np.ndarray]:
        """Return the next chunk as an int16 array at the logical sample rate.

        Trailing bytes that do not form a whole frame are discarded. Stereo
        input opened for downmixing is averaged to mono, and the result is
        resampled when the device rate differs from ``sample_rate``.

        Returns:
            The samples, or None when nothing (usable) was available.
        """
        data = self.read_chunk(timeout)
        if data is None:
            return None

        sample_size = self._pa_channels * self.sample_width
        if len(data) % sample_size != 0:
            aligned_len = (len(data) // sample_size) * sample_size
            if aligned_len == 0:
                return None
            data = data[:aligned_len]

        # NOTE(review): decoding is hard-wired to little-endian 16-bit samples,
        # i.e. it assumes sample_width == 2 -- confirm no other width is configured.
        mono = np.frombuffer(data, dtype="<i2")
        if self._stereo_downmix and self._pa_channels == 2:
            n = mono.size // 2
            if n == 0:
                return None
            # Average in int32 so L + R cannot overflow int16.
            s = mono[: n * 2].reshape(n, 2).astype(np.int32)
            mono = ((s[:, 0] + s[:, 1]) // 2).astype(np.int16)
        if self._pa_open_sample_rate != self.sample_rate:
            mono = self._resample_linear_int16(
                mono, self._pa_open_sample_rate, self.sample_rate
            )
        return mono

    def __enter__(self):
        """Start the stream and return ``self``."""
        self.start_stream()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Stop the stream and terminate the PyAudio instance."""
        self.stop_stream()
        self.audio.terminate()
|
||
|
||
|
||
class IncrementalRMS:
    """Sliding-window RMS tracker backed by a fixed-size ring buffer.

    Used by the AGC so that the level estimate spans the most recent
    ``window_size`` samples. Until the window has been filled once, the RMS is
    taken over the samples received so far.

    Attributes:
        window_size: Number of samples in the window.
        buffer: float32 ring buffer holding the current window.
        sum_sq: Sum of squares of the samples currently in the window.
        idx: Position of the oldest sample once the window is full.
        count: Number of valid samples in the buffer (at most ``window_size``).
    """

    def __init__(self, window_size: int = 1024):
        """Create an empty window.

        Args:
            window_size: Window length in samples; must be at least 1.

        Raises:
            ValueError: ``window_size`` is smaller than 1.
        """
        window_size = int(window_size)
        if window_size < 1:
            raise ValueError("window_size must be >= 1")
        self.window_size = window_size
        self.buffer = np.zeros(window_size, dtype=np.float32)
        self.sum_sq = 0.0
        self.idx = 0
        self.count = 0

    def _current_rms(self) -> float:
        """Return the RMS of the samples currently held (0.0 when empty)."""
        filled = min(self.count, self.window_size)
        if filled <= 0:
            return 0.0
        # Rounding in the running sum can leave a tiny negative value; clamp it
        # so the square root never yields NaN.
        return float(np.sqrt(max(self.sum_sq, 0.0) / filled))

    def update(self, sample: float) -> float:
        """Push one sample and return the updated RMS.

        Args:
            sample: New sample value.

        Returns:
            RMS over the current window contents.
        """
        if self.count < self.window_size:
            # Fill phase: append at the next free slot.
            pos = self.count
            evicted = 0.0
            self.count += 1
        else:
            # Sliding phase: overwrite the oldest sample.
            pos = self.idx
            evicted = float(self.buffer[pos])
            self.idx = (self.idx + 1) % self.window_size
        self.buffer[pos] = sample
        # Square the value as stored (float32-rounded) so that what is added
        # now is exactly what gets subtracted when the sample is evicted.
        stored = float(self.buffer[pos])
        self.sum_sq += stored * stored - evicted * evicted
        return self._current_rms()

    def update_batch(self, samples: np.ndarray) -> float:
        """Push a batch of samples (oldest first) and return the updated RMS.

        The window ends up holding the same samples as repeated calls to
        :meth:`update` would leave, but the work is vectorised and the sum of
        squares is recomputed from the window, which also removes any
        accumulated rounding drift.

        Args:
            samples: Array-like of sample values; flattened before use.

        Returns:
            RMS over the current window contents (0.0 if no sample was ever
            received).
        """
        batch = np.asarray(samples, dtype=np.float32).ravel()
        n = batch.size
        if n == 0:
            return self._current_rms()

        w = self.window_size
        if n >= w:
            # Only the newest ``w`` samples can remain in the window.
            self.buffer[:] = batch[n - w:]
            self.idx = 0
            self.count = w
        else:
            start = self.count if self.count < w else self.idx
            self.buffer[(start + np.arange(n)) % w] = batch
            total = self.count + n
            if self.count >= w:
                self.idx = (self.idx + n) % w
            elif total > w:
                # The batch filled the window and wrapped past its start.
                self.idx = total - w
            self.count = min(total, w)

        window = self.buffer[: self.count].astype(np.float64)
        self.sum_sq = float(np.dot(window, window))
        return self._current_rms()

    def reset(self):
        """Clear the window and all running state."""
        self.buffer.fill(0.0)
        self.sum_sq = 0.0
        self.idx = 0
        self.count = 0
|
||
|
||
|
||
class LightweightNoiseReduction:
    """Stateful first-order high-pass filter used as a cheap noise reducer.

    Only a high-pass filter is implemented (no spectral subtraction). The
    difference equation is::

        y[n] = alpha * (y[n-1] + x[n] - x[n-1])
        alpha = exp(-2 * pi * cutoff / sample_rate)

    i.e. ``H(z) = alpha * (1 - z^-1) / (1 - alpha * z^-1)``. The last input and
    output samples are kept between calls so consecutive chunks are filtered
    as one continuous signal.
    """

    def __init__(self, sample_rate: int = 16000, cutoff: float = 80.0):
        """Configure the filter.

        Args:
            sample_rate: Sampling rate in Hz; must be positive.
            cutoff: High-pass cutoff frequency in Hz.

        Raises:
            ValueError: ``sample_rate`` is not positive.
        """
        if sample_rate <= 0:
            raise ValueError("sample_rate must be positive")
        self.sample_rate = sample_rate
        self.cutoff = cutoff

        self.alpha = float(np.exp(-2.0 * np.pi * cutoff / sample_rate))
        self.prev_input = 0.0
        self.prev_output = 0.0

    def process(self, audio: np.ndarray) -> np.ndarray:
        """High-pass filter one chunk, continuing from the previous chunk.

        Args:
            audio: Samples to filter; converted to float32 when necessary.
                The caller documents the range as [-1, 1].

        Returns:
            A new float32 array of the same shape as ``audio``.
        """
        if audio.dtype != np.float32:
            audio = audio.astype(np.float32)

        alpha = self.alpha
        scalar_state = np.ndim(self.prev_input) == 0 and np.ndim(self.prev_output) == 0
        if audio.ndim == 1 and scalar_state:
            # Common case (a mono chunk): run the recursion on plain Python
            # floats, which avoids per-sample NumPy scalar indexing.
            x_prev = float(self.prev_input)
            y_prev = float(self.prev_output)
            filtered = []
            for x in audio.tolist():
                y_prev = alpha * (y_prev + x - x_prev)
                x_prev = x
                filtered.append(y_prev)
            self.prev_input = x_prev
            self.prev_output = y_prev
            return np.asarray(filtered, dtype=np.float32)

        # Generic path: iterate along the first axis, element-wise per row.
        output = np.zeros_like(audio)
        for i in range(len(audio)):
            output[i] = alpha * (self.prev_output + audio[i] - self.prev_input)
            self.prev_input = audio[i]
            self.prev_output = output[i]
        return output

    def reset(self):
        """Forget the previous input/output samples."""
        self.prev_input = 0.0
        self.prev_output = 0.0
|
||
|
||
|
||
class AudioPreprocessorOptimized:
    """Audio preprocessing pipeline: optional high-pass noise reduction, then AGC.

    * Noise reduction delegates to ``LightweightNoiseReduction``.
    * AGC estimates the level with ``IncrementalRMS`` and smooths the gain
      with separate coefficients for lowering and for raising it.

    Both stages accept int16 or floating-point arrays and return the same
    kind of data they were given.
    """

    def __init__(self, enable_noise_reduction: Optional[bool] = None,
                 enable_agc: Optional[bool] = None):
        """Build the pipeline.

        Args:
            enable_noise_reduction: Enable the high-pass stage; None reads
                ``noise_reduce`` from ``SYSTEM_AUDIO_CONFIG`` (default True).
            enable_agc: Enable automatic gain control; None reads ``agc``
                from ``SYSTEM_AUDIO_CONFIG`` (default True).
        """
        if enable_noise_reduction is None:
            enable_noise_reduction = SYSTEM_AUDIO_CONFIG.get("noise_reduce", True)
        if enable_agc is None:
            enable_agc = SYSTEM_AUDIO_CONFIG.get("agc", True)

        self.enable_noise_reduction = enable_noise_reduction
        self.enable_agc = enable_agc
        self.sample_rate = int(SYSTEM_AUDIO_CONFIG.get("sample_rate", 16000))

        # AGC parameters (values loaded from YAML may be strings, so coerce).
        self.agc_target_db = float(SYSTEM_AUDIO_CONFIG.get("agc_target_db", -20.0))
        self.agc_gain_min = float(SYSTEM_AUDIO_CONFIG.get("agc_gain_min", 0.1))
        self.agc_gain_max = float(SYSTEM_AUDIO_CONFIG.get("agc_gain_max", 10.0))
        self.agc_rms_threshold = float(SYSTEM_AUDIO_CONFIG.get("agc_rms_threshold", 1e-6))
        self._agc_alpha = float(SYSTEM_AUDIO_CONFIG.get("agc_smoothing_alpha", 0.1))
        self._agc_alpha = max(0.02, min(0.95, self._agc_alpha))
        # A larger coefficient is used when the gain has to rise (quiet speech,
        # or right after a loud burst) so it does not stay pinned near
        # agc_gain_min for long.
        self._agc_release_alpha = float(
            SYSTEM_AUDIO_CONFIG.get("agc_release_alpha", 0.45)
        )
        self._agc_release_alpha = max(self._agc_alpha, min(0.95, self._agc_release_alpha))

        if enable_noise_reduction:
            self.noise_reducer = LightweightNoiseReduction(
                sample_rate=self.sample_rate,
                cutoff=80.0,
            )
        else:
            self.noise_reducer = None

        # Defined unconditionally so the attribute exists even with AGC off.
        self.current_gain = 1.0
        if enable_agc:
            window_size = int(SYSTEM_AUDIO_CONFIG.get("frame_size", 1024))
            self.rms_calculator = IncrementalRMS(window_size=window_size)
        else:
            self.rms_calculator = None

        logger.info(
            f"优化版音频预处理器初始化完成: "
            f"降噪={'启用(轻量级)' if enable_noise_reduction else '禁用'}, "
            f"自动增益控制={'启用(增量)' if enable_agc else '禁用'}"
        )

    @staticmethod
    def _float_to_int16(audio_float: np.ndarray) -> np.ndarray:
        """Scale [-1, 1] float audio to int16, saturating instead of wrapping.

        ``1.0 * 32768`` is one past the int16 maximum, and filtered audio can
        exceed the nominal range, so the scaled values are clipped before the
        cast.
        """
        return np.clip(audio_float * 32768.0, -32768.0, 32767.0).astype(np.int16)

    def reset(self) -> None:
        """Reset the high-pass filter and the AGC state.

        Intended to be called after capture was paused and restarted, so that
        state gathered before the pause is not carried over to the new stream.
        """
        if self.noise_reducer is not None:
            self.noise_reducer.reset()
        if self.rms_calculator is not None:
            self.rms_calculator.reset()
        if self.enable_agc:
            self.current_gain = 1.0

    def reset_agc_state(self) -> None:
        """Clear the RMS window and set the gain back to 1.

        Intended for the end of an utterance or when recovery is needed, so a
        short loud burst does not keep the gain low afterwards. No-op when
        AGC is disabled.
        """
        if not self.enable_agc or self.rms_calculator is None:
            return
        self.rms_calculator.reset()
        self.current_gain = 1.0

    def reduce_noise(self, audio_data: np.ndarray) -> np.ndarray:
        """Apply the high-pass noise reducer.

        Args:
            audio_data: int16 samples, or floating-point samples in [-1, 1].

        Returns:
            Filtered audio: int16 (saturated) for int16 input, float32
            otherwise. The input is returned unchanged when the stage is
            disabled.
        """
        if not self.enable_noise_reduction or self.noise_reducer is None:
            return audio_data

        is_int16 = audio_data.dtype == np.int16
        if is_int16:
            audio_float = audio_data.astype(np.float32) / 32768.0
        else:
            audio_float = audio_data.astype(np.float32)

        reduced = self.noise_reducer.process(audio_float)

        if is_int16:
            reduced = self._float_to_int16(reduced)
        return reduced

    def automatic_gain_control(self, audio_data: np.ndarray) -> np.ndarray:
        """Apply smoothed automatic gain control.

        The target gain is derived from the sliding-window RMS, limited to
        ``[agc_gain_min, agc_gain_max]`` and blended into ``current_gain``.
        Output samples are limited to full scale.

        Args:
            audio_data: int16 samples, or floating-point samples in [-1, 1].

        Returns:
            Gain-adjusted audio: int16 for int16 input, float32 otherwise.
            The input is returned unchanged when AGC is disabled or the RMS is
            below ``agc_rms_threshold``.
        """
        if not self.enable_agc or self.rms_calculator is None:
            return audio_data

        is_int16 = audio_data.dtype == np.int16
        if is_int16:
            audio_float = audio_data.astype(np.float32) / 32768.0
        else:
            audio_float = audio_data.astype(np.float32)

        rms = self.rms_calculator.update_batch(audio_float)

        # Effectively silent: leave the chunk (and the gain) untouched.
        if rms < self.agc_rms_threshold:
            return audio_data

        current_db = 20 * np.log10(rms)
        gain_db = self.agc_target_db - current_db
        gain_linear = 10 ** (gain_db / 20.0)
        gain_linear = float(np.clip(gain_linear, self.agc_gain_min, self.agc_gain_max))

        # Lowering the gain uses the slow coefficient; raising it noticeably
        # above the current value uses the faster release coefficient.
        if gain_linear > self.current_gain * 1.08:
            alpha = self._agc_release_alpha
        else:
            alpha = self._agc_alpha
        # Kept as a plain float so multiplying does not upcast float32 audio.
        self.current_gain = float(alpha * gain_linear + (1 - alpha) * self.current_gain)

        adjusted = np.clip(audio_float * self.current_gain, -1.0, 1.0)

        if is_int16:
            adjusted = self._float_to_int16(adjusted)
        return adjusted

    def process(self, audio_data: np.ndarray) -> np.ndarray:
        """Run the enabled stages in order: noise reduction, then AGC.

        Args:
            audio_data: Input samples; the array itself is not modified.

        Returns:
            The processed audio.
        """
        processed = audio_data.copy()

        if self.enable_noise_reduction:
            processed = self.reduce_noise(processed)

        if self.enable_agc:
            processed = self.automatic_gain_control(processed)

        return processed
|
||
|
||
|
||
# Backward-compatible aliases: callers may keep using the original class names.
AudioCapture = AudioCaptureOptimized
AudioPreprocessor = AudioPreprocessorOptimized


# Manual smoke test: capture up to ten chunks and run them through the preprocessor.
if __name__ == "__main__":
    with AudioCapture() as capture:
        preprocessor = AudioPreprocessor()

        for _attempt in range(10):
            chunk = capture.read_chunk_numpy(timeout=0.1)
            if chunk is None:
                # Nothing arrived within the timeout; try again.
                continue
            processed = preprocessor.process(chunk)
            print(f"处理了 {len(processed)} 个采样点")
|