// Listing metadata: 490 lines, 21 KiB, C++
#include "audio_processor.h"
|
||
#include <esp_log.h>
|
||
#include <cmath>
|
||
#include <algorithm>
|
||
|
||
// Event-group bit that is set while the processor is started and cleared
// while it is stopped.
#define PROCESSOR_RUNNING (1 << 0)

// Tag passed to the ESP_LOGx macros throughout this file.
static const char* const TAG = "AudioProcessor";
|
||
|
||
AudioProcessor::AudioProcessor()
|
||
: afe_data_(nullptr), adaptive_enabled_(true) {
|
||
event_group_ = xEventGroupCreate();
|
||
}
|
||
|
||
// 初始化音频处理器
|
||
void AudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
|
||
codec_ = codec;
|
||
int ref_num = codec_->input_reference() ? 1 : 0;
|
||
|
||
std::string input_format;
|
||
for (int i = 0; i < codec_->input_channels() - ref_num; i++) {
|
||
input_format.push_back('M');
|
||
}
|
||
for (int i = 0; i < ref_num; i++) {
|
||
input_format.push_back('R');
|
||
}
|
||
|
||
srmodel_list_t *models = esp_srmodel_init("model");
|
||
char* ns_model_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL);
|
||
|
||
afe_config_t* afe_config = afe_config_init(input_format.c_str(), NULL, AFE_TYPE_VC, AFE_MODE_HIGH_PERF);
|
||
if (realtime_chat) {
|
||
// 实时模式:启用AEC,但关闭VAD
|
||
afe_config->aec_init = true;
|
||
afe_config->aec_mode = AEC_MODE_VOIP_HIGH_PERF; // 高性能AEC模式
|
||
|
||
// 关闭VAD
|
||
afe_config->vad_init = false;
|
||
|
||
ESP_LOGI(TAG, "Realtime mode: AEC enabled, VAD disabled");
|
||
} else {
|
||
// 非实时模式:可根据需求配置
|
||
afe_config->aec_init = false;
|
||
afe_config->vad_init = false; // 关闭VAD
|
||
|
||
ESP_LOGI(TAG, "Non-realtime mode: VAD disabled");
|
||
}
|
||
|
||
// 启用噪声抑制 - 与官方项目保持一致
|
||
afe_config->ns_init = true;
|
||
afe_config->ns_model_name = ns_model_name;
|
||
afe_config->afe_ns_mode = AFE_NS_MODE_NET;
|
||
|
||
// 启用AGC并设置为WAKENET模式 - 与官方项目保持一致
|
||
afe_config->agc_init = true;
|
||
afe_config->agc_mode = AFE_AGC_MODE_WAKENET; // 使用WAKENET模式的AGC
|
||
afe_config->agc_target_level_dbfs = -3; // 设置目标电平为-3dBFS(与官方默认一致)
|
||
|
||
// 其他配置保持不变
|
||
afe_config->afe_perferred_core = 1;
|
||
afe_config->afe_perferred_priority = 5;
|
||
afe_config->memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_INTERNAL;
|
||
|
||
ESP_LOGI(TAG, "AFE配置: 格式=%s 通道=%d 参考=%d", input_format.c_str(), codec_->input_channels(), codec_->input_reference() ? 1 : 0);
|
||
ESP_LOGI(TAG, "AFE配置: AEC=%s VAD=disabled AGC=enabled 核心=%d 优先级=%d",
|
||
realtime_chat ? "enabled" : "disabled", 1, 5);
|
||
|
||
afe_iface_ = esp_afe_handle_from_config(afe_config);
|
||
afe_data_ = afe_iface_->create_from_config(afe_config);
|
||
|
||
// 创建任务部分保持不变
|
||
xTaskCreate([](void* arg) {
|
||
auto this_ = (AudioProcessor*)arg;
|
||
this_->AudioProcessorTask();
|
||
vTaskDelete(NULL);
|
||
}, "audio_communication", 4096, this, 3, NULL);
|
||
}
|
||
|
||
// Destroys the AFE instance (when one was created) and then the event group.
// NOTE(review): the task spawned by Initialize() is not stopped here and it
// uses both afe_data_ and event_group_ — confirm the object is never destroyed
// while that task is alive.
AudioProcessor::~AudioProcessor() {
    const bool has_afe_instance = (afe_data_ != nullptr);
    if (has_afe_instance) {
        afe_iface_->destroy(afe_data_);
    }

    vEventGroupDelete(event_group_);
}
|
||
|
||
void AudioProcessor::Feed(const std::vector<int16_t>& data) {
|
||
if (afe_data_ != nullptr) {
|
||
afe_iface_->feed(afe_data_, (int16_t*)data.data());
|
||
}
|
||
}
|
||
|
||
// Marks the processor as running by setting PROCESSOR_RUNNING in the event
// group; the worker task waits on this bit before fetching audio.
void AudioProcessor::Start() {
    (void)xEventGroupSetBits(event_group_, PROCESSOR_RUNNING);
}
|
||
|
||
// Marks the processor as stopped by clearing PROCESSOR_RUNNING; the worker
// task discards any result it fetches while the bit is clear.
void AudioProcessor::Stop() {
    (void)xEventGroupClearBits(event_group_, PROCESSOR_RUNNING);
}
|
||
|
||
bool AudioProcessor::IsRunning() {
|
||
return (xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING) != 0;
|
||
}
|
||
// 输出回调函数,用于将处理后的音频数据发送到外部
|
||
void AudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
|
||
output_callback_ = callback;
|
||
}
|
||
|
||
// Registers the callback invoked by the worker task when the echo-aware VAD
// path decides that speech started (true) or ended (false). The callback is
// moved into place to avoid copying the std::function.
void AudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
    vad_state_change_callback_ = std::move(callback);
}
|
||
|
||
// Registers the callback invoked by the worker task when the raw AFE VAD state
// switches between speech (true) and silence (false), without echo filtering.
// The callback is moved into place to avoid copying the std::function.
void AudioProcessor::OnSimpleVadStateChange(std::function<void(bool speaking)> callback) {
    simple_vad_state_change_callback_ = std::move(callback);
}
|
||
|
||
size_t AudioProcessor::GetFeedSize() {
|
||
if (afe_iface_ != nullptr && afe_data_ != nullptr) {
|
||
return afe_iface_->get_feed_chunksize(afe_data_);
|
||
}
|
||
return 0;
|
||
}
|
||
|
||
void AudioProcessor::AudioProcessorTask() {
|
||
auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
|
||
auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
|
||
ESP_LOGI(TAG, "Audio communication task started, feed size: %d fetch size: %d",
|
||
feed_size, fetch_size);
|
||
|
||
while (true) {
|
||
xEventGroupWaitBits(event_group_, PROCESSOR_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);
|
||
|
||
auto res = afe_iface_->fetch_with_delay(afe_data_, portMAX_DELAY);
|
||
if ((xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING) == 0) {
|
||
continue;
|
||
}
|
||
if (res == nullptr || res->ret_value == ESP_FAIL) {
|
||
if (res != nullptr) {
|
||
ESP_LOGI(TAG, "Error code: %d", res->ret_value);
|
||
}
|
||
continue;
|
||
}
|
||
|
||
// 🎯 简单VAD处理:用于普通业务(触摸忽略、LED状态等)
|
||
if (simple_vad_state_change_callback_) {
|
||
// 参考chumo4_yuan的简单实现:直接使用ESP-ADF的VAD结果
|
||
static bool simple_is_speaking = false;
|
||
if (res->vad_state == VAD_SPEECH && !simple_is_speaking) {
|
||
simple_is_speaking = true;
|
||
simple_vad_state_change_callback_(true);
|
||
} else if (res->vad_state == VAD_SILENCE && simple_is_speaking) {
|
||
simple_is_speaking = false;
|
||
simple_vad_state_change_callback_(false);
|
||
}
|
||
}
|
||
|
||
// 🔊 复杂VAD处理:小智AI官方语音打断方案(仅在语音打断功能启用时使用)
|
||
if (vad_state_change_callback_) {
|
||
// 核心逻辑:检测VAD状态变化,区分人声和回声
|
||
bool human_voice_detected = (res->vad_state == VAD_SPEECH);
|
||
|
||
if (human_voice_detected && !is_speaking_) {
|
||
// 语音开始:使用增强的回声感知评估,区分真实人声和设备回声
|
||
if (EvaluateSpeechWithEchoAwareness(res)) {
|
||
is_speaking_ = true;
|
||
ESP_LOGI(TAG, "VAD: Human voice detected (echo-aware filtering)");
|
||
// ESP_LOGI(TAG, "🗣️ 检测到人声(已通过回声过滤)");
|
||
vad_state_change_callback_(true);
|
||
} else {
|
||
ESP_LOGV(TAG, "VAD: Voice rejected (likely device echo)");
|
||
ESP_LOGV(TAG, "🔇 声音被判定为设备回声,已忽略");
|
||
}
|
||
} else if (!human_voice_detected && is_speaking_) {
|
||
// 语音结束:VAD检测到静音
|
||
is_speaking_ = false;
|
||
ESP_LOGI(TAG, "VAD: Human voice ended");
|
||
ESP_LOGI(TAG, "🛑 人声结束(进入静音)");
|
||
vad_state_change_callback_(false);
|
||
}
|
||
}
|
||
|
||
if (output_callback_) {
|
||
// 确保音频数据在正确的内存区域分配,避免PSRAM/内部内存混乱
|
||
size_t sample_count = res->data_size / sizeof(int16_t);
|
||
std::vector<int16_t> audio_data;
|
||
audio_data.reserve(sample_count);
|
||
|
||
// 逐个复制数据,确保使用标准内存分配器
|
||
int16_t* src_data = (int16_t*)res->data;
|
||
for (size_t i = 0; i < sample_count; i++) {
|
||
audio_data.push_back(src_data[i]);
|
||
}
|
||
|
||
output_callback_(std::move(audio_data));
|
||
}
|
||
}
|
||
}
|
||
|
||
// 回声感知VAD优化方法实现
|
||
void AudioProcessor::SetEchoAwareParams(const EchoAwareVadParams& params) {
|
||
echo_params_ = params;
|
||
ESP_LOGI(TAG, "Echo-aware VAD params updated: snr_threshold=%.2f, min_silence=%dms, cooldown=%dms",
|
||
params.snr_threshold, params.min_silence_ms, params.interrupt_cooldown_ms);
|
||
}
|
||
|
||
void AudioProcessor::SetSpeakerVolume(float volume) {
|
||
current_speaker_volume_ = volume;
|
||
|
||
// 🎯 触发自适应噪声抑制更新
|
||
if (adaptive_enabled_ && echo_params_.adaptive_noise_suppression) {
|
||
AdaptSuppressionLevel();
|
||
}
|
||
|
||
ESP_LOGV(TAG, "Speaker volume updated: %.2f, adaptive suppression: %.2f",
|
||
volume, adaptive_state_.dynamic_suppression_level);
|
||
}
|
||
|
||
bool AudioProcessor::IsEchoSuppressed() const {
|
||
return aec_converged_;
|
||
}
|
||
|
||
// Decides whether a frame the AFE flagged as speech should be reported as
// human voice rather than device echo.
//
// fetch_result  Result of an AFE fetch; may be null.
//
// Returns false when the result is null, not ESP_OK, not flagged VAD_SPEECH,
// or (in adaptive mode) carries fewer than two samples. Returns true directly
// when echo_params_.adaptive_threshold is off. Otherwise every one of the
// energy, peak, second-half-energy, variation, continuity and playback-time
// checks must pass.
//
// The continuity and playback-time state is kept in function-local statics, so
// it is shared by every AudioProcessor instance.
bool AudioProcessor::EvaluateSpeechWithEchoAwareness(afe_fetch_result_t* fetch_result) {
    if (!fetch_result || fetch_result->ret_value != ESP_OK) {
        return false;
    }
    if (fetch_result->vad_state != VAD_SPEECH) {
        return false;
    }
    if (!echo_params_.adaptive_threshold) {
        // Non-adaptive mode: trust the VAD result as-is.
        return true;
    }

    int16_t* audio_data = (int16_t*)fetch_result->data;
    if (audio_data == nullptr || fetch_result->data_size <= 0) {
        return false;
    }
    const size_t sample_count = static_cast<size_t>(fetch_result->data_size) / sizeof(int16_t);
    // The statistics below divide by sample_count, sample_count / 2 and
    // sample_count - 1; with fewer than two samples those are zero or wrap
    // around (which previously drove the variation loop out of bounds).
    if (sample_count < 2) {
        return false;
    }

    // Mean squared amplitude and peak amplitude of the frame.
    float energy = 0.0f;
    float peak_amplitude = 0.0f;
    for (size_t i = 0; i < sample_count; i++) {
        const float sample = std::fabs(static_cast<float>(audio_data[i]));
        energy += sample * sample;
        peak_amplitude = std::max(peak_amplitude, sample);
    }
    energy = energy / sample_count;

    // Refresh the adaptive state first so the suppression level used below
    // reflects this frame.
    UpdateAdaptiveNoiseState(audio_data, sample_count);
    const float adaptive_suppression =
        (adaptive_enabled_ && echo_params_.adaptive_noise_suppression)
            ? adaptive_state_.dynamic_suppression_level
            : 1.0f;

    // Thresholds scale with speaker volume and the adaptive suppression level.
    const bool speaker_active = (current_speaker_volume_ > 0.0001f);
    const float volume_factor = 1.0f + current_speaker_volume_ * 500.0f;
    const float adaptive_threshold = echo_params_.snr_threshold * volume_factor * adaptive_suppression;
    float energy_threshold = adaptive_threshold * 10000000000.0f;
    float peak_threshold = 500000.0f * volume_factor;
    if (speaker_active) {
        // While anything is playing, raise both thresholds drastically.
        energy_threshold *= 1000.0f;
        peak_threshold *= 500.0f;
    }
    const bool energy_check = (energy > energy_threshold);
    // NOTE(review): |int16_t| never exceeds 32768, so for any non-negative
    // speaker volume peak_threshold (>= 500000) cannot be reached and the
    // adaptive path always returns false — confirm this is intended.
    const bool peak_check = (peak_amplitude > peak_threshold);

    // Mean squared amplitude of the second half of the frame.
    // NOTE(review): the original comment calls this a frequency-domain check,
    // but it is computed on time-domain samples — confirm the intent.
    float high_freq_energy = 0.0f;
    for (size_t i = sample_count / 2; i < sample_count; i++) {
        const float sample = std::fabs(static_cast<float>(audio_data[i]));
        high_freq_energy += sample * sample;
    }
    high_freq_energy = high_freq_energy / (sample_count / 2);

    const float high_freq_ratio = (energy > 0) ? (high_freq_energy / energy) : 0.0f;
    float freq_threshold = 1.2f * volume_factor;
    if (speaker_active) {
        freq_threshold *= 50.0f;
    }
    const bool freq_check = (high_freq_ratio > freq_threshold);

    // Mean squared difference between consecutive absolute amplitudes; the
    // check requires the signal to vary more than the threshold.
    float variance = 0.0f;
    for (size_t i = 1; i < sample_count; i++) {
        const float diff = std::fabs(static_cast<float>(audio_data[i])) -
                           std::fabs(static_cast<float>(audio_data[i - 1]));
        variance += diff * diff;
    }
    variance = variance / (sample_count - 1);
    float variance_threshold = 10000000000.0f / volume_factor;
    if (speaker_active) {
        variance_threshold *= 100.0f;
    }
    const bool stability_check = (variance > variance_threshold);

    // Continuity: ten consecutive frames must each show a large change in both
    // energy and second-half ratio relative to the previous frame.
    static float prev_energy = 0.0f;
    static float prev_high_freq_ratio = 0.0f;
    static int consistent_frames = 0;

    // std::fabs keeps these in floating point; an unqualified abs() may bind
    // to int abs(int) and truncate sub-1.0 ratio differences to zero.
    const float energy_change = std::fabs(energy - prev_energy) / (prev_energy + 1.0f);
    const float freq_change = std::fabs(high_freq_ratio - prev_high_freq_ratio);

    const bool frame_continuity = (energy_change > 1.2f && freq_change > 0.5f);
    if (frame_continuity) {
        consistent_frames++;
    } else {
        consistent_frames = 0;
    }
    const bool continuity_check = (consistent_frames >= 10);

    prev_energy = energy;
    prev_high_freq_ratio = high_freq_ratio;

    // Playback-time protection: reject for 30 s after the last frame evaluated
    // while the speaker volume was above 0.01 (the timestamp starts at the
    // first call of this function).
    static auto last_volume_update = std::chrono::steady_clock::now();
    const auto now = std::chrono::steady_clock::now();
    if (current_speaker_volume_ > 0.01f) {
        last_volume_update = now;
    }
    const auto time_since_playback =
        std::chrono::duration_cast<std::chrono::milliseconds>(now - last_volume_update);
    const bool recent_playback_protection = (time_since_playback.count() < 30000);

    const bool final_result = energy_check && peak_check && freq_check && stability_check &&
                              continuity_check && !recent_playback_protection;

    // Only log acceptances to keep the log quiet.
    if (final_result) {
        ESP_LOGI(TAG, "🎯 HUMAN VOICE DETECTED: duration=%.0fms, vol=%.3f, adaptive=%.1f%s",
                 (float)time_since_playback.count(), current_speaker_volume_, adaptive_suppression,
                 adaptive_state_.high_interference_mode ? "[HIGH_INTERFERENCE]" : "");
    }

    return final_result;
}
|
||
|
||
// 🎯 自适应噪声抑制核心算法实现
|
||
void AudioProcessor::UpdateAdaptiveNoiseState(const int16_t* audio_data, size_t sample_count) {
|
||
if (!adaptive_enabled_ || !echo_params_.adaptive_noise_suppression) {
|
||
return;
|
||
}
|
||
|
||
// 计算当前回声强度
|
||
float echo_strength = CalculateEchoStrength(audio_data, sample_count);
|
||
adaptive_state_.current_echo_strength = echo_strength;
|
||
|
||
// 估算距离因子 (基于回声强度和音量)
|
||
adaptive_state_.estimated_distance_factor = EstimateDistanceFactor(echo_strength, current_speaker_volume_);
|
||
|
||
// 更新环境噪声基线
|
||
if (current_speaker_volume_ < 0.01f) { // 扬声器几乎静音时更新基线
|
||
float current_noise = 0.0f;
|
||
for (size_t i = 0; i < sample_count; i++) {
|
||
current_noise += abs(audio_data[i]);
|
||
}
|
||
current_noise /= sample_count;
|
||
|
||
// 指数移动平均更新噪声基线
|
||
adaptive_state_.noise_baseline = adaptive_state_.noise_baseline * 0.95f + current_noise * 0.05f;
|
||
}
|
||
|
||
// 自适应调整抑制级别
|
||
AdaptSuppressionLevel();
|
||
|
||
adaptive_state_.last_adaptation_time = std::chrono::steady_clock::now();
|
||
}
|
||
|
||
float AudioProcessor::CalculateEchoStrength(const int16_t* audio_data, size_t sample_count) {
|
||
if (current_speaker_volume_ < 0.001f) {
|
||
return 0.0f; // 扬声器静音,无回声
|
||
}
|
||
|
||
// 计算音频能量
|
||
float energy = 0.0f;
|
||
float peak = 0.0f;
|
||
for (size_t i = 0; i < sample_count; i++) {
|
||
float sample = abs(audio_data[i]);
|
||
energy += sample * sample;
|
||
if (sample > peak) peak = sample;
|
||
}
|
||
energy = std::sqrt(energy / sample_count);
|
||
|
||
// 🔊 回声强度 = 能量 × 峰值比 × 音量影响
|
||
float peak_ratio = (energy > 0) ? (peak / energy) : 0.0f;
|
||
|
||
// 🎯 关键洞察:回声具有特征性的能量分布模式
|
||
// 真实人声:能量分布更均匀,峰值比较低
|
||
// 设备回声:能量集中,峰值比较高
|
||
float echo_indicator = peak_ratio * current_speaker_volume_;
|
||
|
||
return echo_indicator;
|
||
}
|
||
|
||
float AudioProcessor::EstimateDistanceFactor(float echo_strength, float volume) {
|
||
if (volume < 0.001f) {
|
||
return 1.0f; // 静音时认为距离无关紧要
|
||
}
|
||
|
||
// 🎯 基于物理原理的距离估算:
|
||
// 回声强度 ∝ 音量² / 距离²
|
||
// 距离因子 = 1 / (1 + echo_strength * volume_sensitivity)
|
||
// 值越小表示越近,值越大表示越远
|
||
|
||
float normalized_echo = echo_strength / (volume + 0.001f); // 归一化回声
|
||
float distance_factor = 1.0f / (1.0f + normalized_echo * echo_params_.volume_sensitivity);
|
||
|
||
// 🔊 约束距离因子范围 [0.1, 1.0]
|
||
distance_factor = std::max(0.1f, std::min(1.0f, distance_factor));
|
||
|
||
// 🔕 注释掉过于频繁的距离估算日志
|
||
// ESP_LOGD(TAG, "🎯 Distance estimation: echo=%.3f, vol=%.3f, factor=%.3f",
|
||
// echo_strength, volume, distance_factor);
|
||
|
||
return distance_factor;
|
||
}
|
||
|
||
void AudioProcessor::AdaptSuppressionLevel() {
|
||
if (!adaptive_enabled_ || !echo_params_.adaptive_noise_suppression) {
|
||
adaptive_state_.dynamic_suppression_level = 1.0f;
|
||
return;
|
||
}
|
||
|
||
// 🎯 自适应抑制级别计算
|
||
// 基础抑制级别
|
||
float base_level = echo_params_.noise_suppression_base;
|
||
|
||
// 🔊 音量影响:音量越大,抑制越强
|
||
float volume_multiplier = 1.0f + current_speaker_volume_ * echo_params_.volume_sensitivity;
|
||
|
||
// 📏 距离影响:距离越近,抑制越强
|
||
float distance_multiplier = 1.0f / (adaptive_state_.estimated_distance_factor + 0.1f);
|
||
|
||
// 🌊 回声强度影响:回声越强,抑制越强
|
||
float echo_multiplier = 1.0f + adaptive_state_.current_echo_strength * 2.0f;
|
||
|
||
// 🎯 综合计算动态抑制级别
|
||
adaptive_state_.dynamic_suppression_level = base_level * volume_multiplier * distance_multiplier * echo_multiplier;
|
||
|
||
// 📊 高干扰模式判断
|
||
bool was_high_interference = adaptive_state_.high_interference_mode;
|
||
adaptive_state_.high_interference_mode = (
|
||
current_speaker_volume_ > 0.3f && // 高音量
|
||
adaptive_state_.estimated_distance_factor < 0.5f && // 近距离
|
||
adaptive_state_.current_echo_strength > echo_params_.echo_detection_threshold // 强回声
|
||
);
|
||
|
||
// 🚨 高干扰模式额外保护
|
||
if (adaptive_state_.high_interference_mode) {
|
||
adaptive_state_.dynamic_suppression_level *= 5.0f; // 高干扰时5倍抑制
|
||
|
||
if (!was_high_interference) {
|
||
ESP_LOGW(TAG, "🔴 Entering HIGH INTERFERENCE mode - vol=%.2f, dist=%.2f, echo=%.3f",
|
||
current_speaker_volume_, adaptive_state_.estimated_distance_factor,
|
||
adaptive_state_.current_echo_strength);
|
||
}
|
||
} else if (was_high_interference) {
|
||
ESP_LOGI(TAG, "🟢 Exiting high interference mode - returning to adaptive suppression");
|
||
}
|
||
|
||
// 📏 限制抑制级别范围 [1.0, 100.0]
|
||
adaptive_state_.dynamic_suppression_level = std::max(1.0f, std::min(100.0f, adaptive_state_.dynamic_suppression_level));
|
||
|
||
// 🔕 注释掉过于频繁的自适应抑制日志
|
||
// ESP_LOGD(TAG, "🎯 Adaptive suppression: vol=%.2f, dist=%.2f, echo=%.3f → level=%.1f %s",
|
||
// current_speaker_volume_, adaptive_state_.estimated_distance_factor,
|
||
// adaptive_state_.current_echo_strength, adaptive_state_.dynamic_suppression_level,
|
||
// adaptive_state_.high_interference_mode ? "[HIGH_INTERFERENCE]" : "");
|
||
}
|
||
|
||
// Returns a copy of the current adaptive noise-suppression state.
AdaptiveNoiseState AudioProcessor::GetAdaptiveState() const {
    AdaptiveNoiseState snapshot = adaptive_state_;
    return snapshot;
}