#include "audio_processor.h"

// NOTE(review): the three header names that followed the project header were
// unreadable in the reviewed text. The list below covers every standard /
// ESP-IDF name this file uses directly; confirm it against the original.
#include <esp_log.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <functional>
#include <string>
#include <utility>
#include <vector>

// Event-group bit that is set while the processor is allowed to emit output.
#define PROCESSOR_RUNNING 0x01

static const char* TAG = "AudioProcessor";

// Creates the event group used to gate the worker task. The AFE itself is not
// created until Initialize() is called.
AudioProcessor::AudioProcessor()
    : afe_data_(nullptr),
      adaptive_enabled_(true) {
    // GetFeedSize() compares afe_iface_ against nullptr, so it must not be
    // left indeterminate when Initialize() has not been called yet.
    afe_iface_ = nullptr;
    event_group_ = xEventGroupCreate();
}

// Configures the ESP-SR audio front end for the given codec and starts the
// worker task that drains it.
//
// codec          Audio codec; its input channel count and reference-channel
//                flag determine the AFE input format string.
// realtime_chat  true  -> AEC enabled, VAD_MODE_3, 500 ms min-noise window.
//                false -> AEC disabled, VAD_MODE_0, 100 ms min-noise window.
//
// On any AFE creation failure an error is logged and no task is started;
// afe_data_ then stays null, so Feed()/GetFeedSize() become no-ops.
void AudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
    codec_ = codec;
    int ref_num = codec_->input_reference() ? 1 : 0;

    // One 'M' per microphone channel followed by one 'R' per reference channel.
    std::string input_format;
    for (int i = 0; i < codec_->input_channels() - ref_num; i++) {
        input_format.push_back('M');
    }
    for (int i = 0; i < ref_num; i++) {
        input_format.push_back('R');
    }

    // NOTE(review): neither `models` nor `afe_config` is released in this
    // file; confirm whether ESP-SR expects the caller to free them.
    srmodel_list_t* models = esp_srmodel_init("model");
    char* ns_model_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, nullptr);

    afe_config_t* afe_config =
        afe_config_init(input_format.c_str(), nullptr, AFE_TYPE_VC, AFE_MODE_HIGH_PERF);
    if (afe_config == nullptr) {
        ESP_LOGE(TAG, "afe_config_init failed for input format '%s'", input_format.c_str());
        return;
    }

    if (realtime_chat) {
        // Realtime mode: AEC plus VAD, tuned for voice interruption.
        afe_config->aec_init = true;
        afe_config->aec_mode = AEC_MODE_VOIP_HIGH_PERF;  // high-performance AEC mode
        // Strict VAD settings to reduce false triggers.
        afe_config->vad_init = true;
        afe_config->vad_mode = VAD_MODE_3;
        afe_config->vad_min_noise_ms = 500;
        ESP_LOGI(TAG, "Realtime mode: AEC + Strict VAD enabled for voice interrupt (xiaozhi optimized)");
    } else {
        // Non-realtime mode: AEC off, standard VAD.
        afe_config->aec_init = false;
        afe_config->vad_init = true;
        afe_config->vad_mode = VAD_MODE_0;
        afe_config->vad_min_noise_ms = 100;
        ESP_LOGI(TAG, "Non-realtime mode: Standard VAD enabled");
    }

    // NOTE(review): ns_model_name is used even when esp_srmodel_filter()
    // returned null; confirm AFE_NS_MODE_NET tolerates a missing model.
    afe_config->ns_init = true;
    afe_config->ns_model_name = ns_model_name;
    afe_config->afe_ns_mode = AFE_NS_MODE_NET;
    afe_config->agc_init = false;
    afe_config->memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_INTERNAL;

    // Core pinning and priority for the AFE's internal task. (The original
    // assigned these twice; only the final values are kept.)
    afe_config->afe_perferred_core = 1;
    afe_config->afe_perferred_priority = 5;

    ESP_LOGI(TAG, "AFE configuration: AEC=%s, VAD=%s, core=%d, priority=%d",
             afe_config->aec_init ? "enabled" : "disabled",
             afe_config->vad_init ? "enabled" : "disabled",
             static_cast<int>(afe_config->afe_perferred_core),
             static_cast<int>(afe_config->afe_perferred_priority));

    afe_iface_ = esp_afe_handle_from_config(afe_config);
    if (afe_iface_ == nullptr) {
        ESP_LOGE(TAG, "esp_afe_handle_from_config returned null");
        return;
    }

    afe_data_ = afe_iface_->create_from_config(afe_config);
    if (afe_data_ == nullptr) {
        ESP_LOGE(TAG, "AFE create_from_config returned null");
        return;
    }

    // The task body loops forever, so vTaskDelete() is only a safety net.
    const BaseType_t created = xTaskCreate(
        [](void* arg) {
            auto* self = static_cast<AudioProcessor*>(arg);
            self->AudioProcessorTask();
            vTaskDelete(nullptr);
        },
        "audio_communication", 4096, this, 3, nullptr);
    if (created != pdPASS) {
        ESP_LOGE(TAG, "Failed to create audio_communication task");
    }
}

// Destroys the AFE instance (if one was created) and the event group.
// NOTE(review): the worker task started by Initialize() is not stopped here
// and keeps using afe_data_/event_group_; confirm instances are never
// destroyed while that task is alive.
AudioProcessor::~AudioProcessor() {
    if (afe_data_ != nullptr) {
        afe_iface_->destroy(afe_data_);
    }
    vEventGroupDelete(event_group_);
}

// Pushes one chunk of interleaved PCM samples into the AFE. Does nothing when
// the AFE has not been created or when `data` is empty.
// NOTE(review): the chunk length is not validated against GetFeedSize();
// presumably the caller sizes it correctly.
void AudioProcessor::Feed(const std::vector<int16_t>& data) {
    if (afe_data_ == nullptr || data.empty()) {
        return;
    }
    afe_iface_->feed(afe_data_, const_cast<int16_t*>(data.data()));
}

// Sets the running bit so the worker task starts emitting results.
void AudioProcessor::Start() {
    xEventGroupSetBits(event_group_, PROCESSOR_RUNNING);
}

// Clears the running bit; the worker task discards results while cleared.
void AudioProcessor::Stop() {
    xEventGroupClearBits(event_group_, PROCESSOR_RUNNING);
}

// Returns true while the running bit is set.
bool AudioProcessor::IsRunning() {
    return (xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING) != 0;
}

// Registers the callback that receives each processed audio chunk.
void AudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
    output_callback_ = std::move(callback);
}

// Registers the callback for the echo-aware ("complex") VAD transitions.
void AudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
    vad_state_change_callback_ = std::move(callback);
}

// Registers the callback for raw ("simple") VAD transitions.
void AudioProcessor::OnSimpleVadStateChange(std::function<void(bool speaking)> callback) {
    simple_vad_state_change_callback_ = std::move(callback);
}

// Returns the AFE feed chunk size, or 0 when the AFE has not been created.
size_t AudioProcessor::GetFeedSize() {
    if (afe_iface_ != nullptr && afe_data_ != nullptr) {
        return afe_iface_->get_feed_chunksize(afe_data_);
    }
    return 0;
}

// Worker loop: waits for the running bit, fetches one AFE result, dispatches
// VAD transitions to the registered callbacks and forwards the audio data.
// Never returns.
void AudioProcessor::AudioProcessorTask() {
    auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
    auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
    ESP_LOGI(TAG, "Audio communication task started, feed size: %d fetch size: %d",
             feed_size, fetch_size);

    while (true) {
        xEventGroupWaitBits(event_group_, PROCESSOR_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);

        auto res = afe_iface_->fetch_with_delay(afe_data_, portMAX_DELAY);

        // Stop() may have been called while the fetch was blocking; drop the result.
        if ((xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING) == 0) {
            continue;
        }
        if (res == nullptr || res->ret_value == ESP_FAIL) {
            if (res != nullptr) {
                ESP_LOGI(TAG, "Error code: %d", res->ret_value);
            }
            continue;
        }

        // Simple VAD: report the AFE's VAD state transitions as-is.
        if (simple_vad_state_change_callback_) {
            // Function-local static: shared by every AudioProcessor instance.
            static bool simple_is_speaking = false;
            if (res->vad_state == VAD_SPEECH && !simple_is_speaking) {
                simple_is_speaking = true;
                simple_vad_state_change_callback_(true);
            } else if (res->vad_state == VAD_SILENCE && simple_is_speaking) {
                simple_is_speaking = false;
                simple_vad_state_change_callback_(false);
            }
        }

        // Complex VAD: speech start must additionally pass the echo-aware
        // evaluation; speech end follows the AFE's VAD state directly.
        if (vad_state_change_callback_) {
            const bool human_voice_detected = (res->vad_state == VAD_SPEECH);
            if (human_voice_detected && !is_speaking_) {
                if (EvaluateSpeechWithEchoAwareness(res)) {
                    is_speaking_ = true;
                    ESP_LOGI(TAG, "VAD: Human voice detected (echo-aware filtering)");
                    vad_state_change_callback_(true);
                } else {
                    ESP_LOGV(TAG, "VAD: Voice rejected (likely device echo)");
                }
            } else if (!human_voice_detected && is_speaking_) {
                is_speaking_ = false;
                ESP_LOGI(TAG, "VAD: Human voice ended");
                vad_state_change_callback_(false);
            }
        }

        if (output_callback_) {
            // Copy the samples out of the AFE-owned buffer into a vector that
            // uses the default allocator, then hand ownership to the callback.
            const auto* samples = reinterpret_cast<const int16_t*>(res->data);
            const size_t sample_count =
                (samples != nullptr && res->data_size > 0)
                    ? static_cast<size_t>(res->data_size) / sizeof(int16_t)
                    : 0;
            output_callback_(std::vector<int16_t>(samples, samples + sample_count));
        }
    }
}

// Replaces the echo-aware VAD parameters and logs the new values.
void AudioProcessor::SetEchoAwareParams(const EchoAwareVadParams& params) {
    echo_params_ = params;
    ESP_LOGI(TAG, "Echo-aware VAD params updated: snr_threshold=%.2f, min_silence=%dms, cooldown=%dms",
             params.snr_threshold, params.min_silence_ms, params.interrupt_cooldown_ms);
}

// Records the current speaker volume and, when adaptive suppression is
// enabled, recomputes the dynamic suppression level.
// NOTE(review): current_speaker_volume_ is also read by the worker task with
// no synchronisation visible in this file.
void AudioProcessor::SetSpeakerVolume(float volume) {
    current_speaker_volume_ = volume;

    if (adaptive_enabled_ && echo_params_.adaptive_noise_suppression) {
        AdaptSuppressionLevel();
    }

    ESP_LOGV(TAG, "Speaker volume updated: %.2f, adaptive suppression: %.2f",
             volume, adaptive_state_.dynamic_suppression_level);
}

// Returns the aec_converged_ flag.
bool AudioProcessor::IsEchoSuppressed() const {
    return aec_converged_;
}

// Decides whether a VAD_SPEECH frame should be treated as real human speech.
//
// Returns false for a null/failed fetch result or a non-speech frame, and true
// immediately when echo_params_.adaptive_threshold is disabled. Otherwise the
// frame must pass energy, peak, "high-frequency", variance and 10-frame
// continuity checks, and must be outside a 30 s window after the last time the
// speaker volume was above 0.01.
//
// NOTE(review): the peak threshold is at least 500000 for any non-negative
// speaker volume, while int16 sample magnitudes cannot exceed 32768, so with
// adaptive_threshold enabled this function cannot return true. The original
// comments describe the thresholds as deliberately extreme; confirm intended.
//
// The prev_*/consistent_frames/last_volume_update state is function-local
// static and therefore shared by every AudioProcessor instance.
bool AudioProcessor::EvaluateSpeechWithEchoAwareness(afe_fetch_result_t* fetch_result) {
    if (fetch_result == nullptr || fetch_result->ret_value != ESP_OK) {
        return false;
    }
    if (fetch_result->vad_state != VAD_SPEECH) {
        return false;
    }
    if (!echo_params_.adaptive_threshold) {
        // Non-adaptive mode: trust the VAD result directly.
        return true;
    }

    static float prev_energy = 0.0f;
    static float prev_high_freq_ratio = 0.0f;
    static int consistent_frames = 0;
    static auto last_volume_update = std::chrono::steady_clock::now();

    const auto* audio_data = reinterpret_cast<const int16_t*>(fetch_result->data);
    const size_t sample_count =
        fetch_result->data_size > 0
            ? static_cast<size_t>(fetch_result->data_size) / sizeof(int16_t)
            : 0;

    // Fewer than two samples would divide by zero below and store NaN in the
    // static history; treat such frames as "not speech".
    if (audio_data == nullptr || sample_count < 2) {
        consistent_frames = 0;
        return false;
    }

    // Mean square amplitude and peak magnitude of the chunk.
    float energy = 0.0f;
    float peak_amplitude = 0.0f;
    for (size_t i = 0; i < sample_count; i++) {
        const float magnitude = std::fabs(static_cast<float>(audio_data[i]));
        energy += magnitude * magnitude;
        peak_amplitude = std::max(peak_amplitude, magnitude);
    }
    energy /= static_cast<float>(sample_count);

    // Refresh the adaptive state first so the suppression level read below is current.
    UpdateAdaptiveNoiseState(audio_data, sample_count);
    const float adaptive_suppression =
        (adaptive_enabled_ && echo_params_.adaptive_noise_suppression)
            ? adaptive_state_.dynamic_suppression_level
            : 1.0f;

    const bool speaker_active = current_speaker_volume_ > 0.0001f;
    const float volume_factor = 1.0f + current_speaker_volume_ * 500.0f;

    // Energy and peak thresholds; both are raised further while the speaker is active.
    float energy_threshold =
        echo_params_.snr_threshold * volume_factor * adaptive_suppression * 10000000000.0f;
    float peak_threshold = 500000.0f * volume_factor;
    if (speaker_active) {
        energy_threshold *= 1000.0f;
        peak_threshold *= 500.0f;
    }
    const bool energy_check = (energy > energy_threshold);
    const bool peak_check = (peak_amplitude > peak_threshold);

    // "High-frequency" check. NOTE(review): this is the mean square of the
    // second half of the chunk in the time domain, not a spectral measure.
    const size_t half_start = sample_count / 2;
    float high_freq_energy = 0.0f;
    for (size_t i = half_start; i < sample_count; i++) {
        const float magnitude = std::fabs(static_cast<float>(audio_data[i]));
        high_freq_energy += magnitude * magnitude;
    }
    // Divide by the number of samples actually summed (differs from
    // sample_count / 2 when sample_count is odd).
    high_freq_energy /= static_cast<float>(sample_count - half_start);

    const float high_freq_ratio = (energy > 0) ? (high_freq_energy / energy) : 0.0f;
    float freq_threshold = 1.2f * volume_factor;
    if (speaker_active) {
        freq_threshold *= 50.0f;
    }
    const bool freq_check = (high_freq_ratio > freq_threshold);

    // Variability check: mean squared difference between the magnitudes of
    // consecutive samples.
    float variance = 0.0f;
    for (size_t i = 1; i < sample_count; i++) {
        const float diff = static_cast<float>(std::abs(static_cast<int>(audio_data[i])) -
                                              std::abs(static_cast<int>(audio_data[i - 1])));
        variance += diff * diff;
    }
    variance /= static_cast<float>(sample_count - 1);

    float variance_threshold = 10000000000.0f / volume_factor;
    if (speaker_active) {
        variance_threshold *= 100.0f;
    }
    const bool stability_check = (variance > variance_threshold);

    // Continuity check: both the relative energy change and the ratio change
    // must exceed their limits for 10 consecutive frames. std::fabs is used
    // because an unqualified abs() may resolve to the int overload and
    // truncate these float values.
    const float energy_change = std::fabs(energy - prev_energy) / (prev_energy + 1.0f);
    const float freq_change = std::fabs(high_freq_ratio - prev_high_freq_ratio);
    const bool frame_continuity = (energy_change > 1.2f && freq_change > 0.5f);
    if (frame_continuity) {
        consistent_frames++;
    } else {
        consistent_frames = 0;
    }
    const bool continuity_check = (consistent_frames >= 10);
    prev_energy = energy;
    prev_high_freq_ratio = high_freq_ratio;

    // Playback protection: reject everything for 30 s after the speaker volume
    // was last seen above 0.01.
    const auto now = std::chrono::steady_clock::now();
    if (current_speaker_volume_ > 0.01f) {
        last_volume_update = now;
    }
    const auto time_since_playback =
        std::chrono::duration_cast<std::chrono::milliseconds>(now - last_volume_update);
    const bool recent_playback_protection = (time_since_playback.count() < 30000);

    const bool final_result = energy_check && peak_check && freq_check && stability_check &&
                              continuity_check && !recent_playback_protection;

    // Only log positive results to keep the log quiet.
    if (final_result) {
        ESP_LOGI(TAG, "🎯 HUMAN VOICE DETECTED: duration=%.0fms, vol=%.3f, adaptive=%.1f%s",
                 (float)time_since_playback.count(), current_speaker_volume_, adaptive_suppression,
                 adaptive_state_.high_interference_mode ? "[HIGH_INTERFERENCE]" : "");
    }
    return final_result;
}

// Updates adaptive_state_ from one chunk of samples: echo strength, estimated
// distance factor, the noise baseline (only while the speaker volume is below
// 0.01) and the dynamic suppression level.
// No-op when adaptive suppression is disabled or the chunk is empty.
void AudioProcessor::UpdateAdaptiveNoiseState(const int16_t* audio_data, size_t sample_count) {
    if (!adaptive_enabled_ || !echo_params_.adaptive_noise_suppression) {
        return;
    }
    // An empty chunk would divide by zero and write NaN into the running baseline.
    if (audio_data == nullptr || sample_count == 0) {
        return;
    }

    const float echo_strength = CalculateEchoStrength(audio_data, sample_count);
    adaptive_state_.current_echo_strength = echo_strength;

    adaptive_state_.estimated_distance_factor =
        EstimateDistanceFactor(echo_strength, current_speaker_volume_);

    // Track the ambient noise floor only while the speaker is nearly silent.
    if (current_speaker_volume_ < 0.01f) {
        float current_noise = 0.0f;
        for (size_t i = 0; i < sample_count; i++) {
            current_noise += static_cast<float>(std::abs(static_cast<int>(audio_data[i])));
        }
        current_noise /= static_cast<float>(sample_count);

        // Exponential moving average with a 0.05 weight on the new chunk.
        adaptive_state_.noise_baseline =
            adaptive_state_.noise_baseline * 0.95f + current_noise * 0.05f;
    }

    AdaptSuppressionLevel();
    adaptive_state_.last_adaptation_time = std::chrono::steady_clock::now();
}

// Returns (peak / RMS) * current speaker volume for the chunk, or 0 when the
// speaker volume is below 0.001 or the chunk is empty.
float AudioProcessor::CalculateEchoStrength(const int16_t* audio_data, size_t sample_count) {
    if (current_speaker_volume_ < 0.001f) {
        return 0.0f;  // speaker effectively silent: no echo
    }
    if (audio_data == nullptr || sample_count == 0) {
        return 0.0f;  // avoids 0/0 -> NaN below
    }

    float sum_squares = 0.0f;
    float peak = 0.0f;
    for (size_t i = 0; i < sample_count; i++) {
        const float magnitude = static_cast<float>(std::abs(static_cast<int>(audio_data[i])));
        sum_squares += magnitude * magnitude;
        if (magnitude > peak) {
            peak = magnitude;
        }
    }
    const float rms = std::sqrt(sum_squares / static_cast<float>(sample_count));

    // The author's heuristic: echo is assumed to have a higher peak-to-RMS
    // ratio than human speech.
    const float peak_ratio = (rms > 0) ? (peak / rms) : 0.0f;
    return peak_ratio * current_speaker_volume_;
}

// Maps echo strength and volume to a factor clamped to [0.1, 1.0]:
// 1 / (1 + (echo_strength / (volume + 0.001)) * volume_sensitivity).
// Returns 1.0 when volume is below 0.001.
float AudioProcessor::EstimateDistanceFactor(float echo_strength, float volume) {
    if (volume < 0.001f) {
        return 1.0f;  // silent: distance is treated as irrelevant
    }

    const float normalized_echo = echo_strength / (volume + 0.001f);
    float distance_factor =
        1.0f / (1.0f + normalized_echo * echo_params_.volume_sensitivity);

    distance_factor = std::max(0.1f, std::min(1.0f, distance_factor));

    // Per-frame debug log intentionally left disabled (too noisy):
    // ESP_LOGD(TAG, "🎯 Distance estimation: echo=%.3f, vol=%.3f, factor=%.3f",
    //          echo_strength, volume, distance_factor);
    return distance_factor;
}

// Recomputes adaptive_state_.dynamic_suppression_level and the
// high-interference flag from the current volume, distance factor and echo
// strength. The level is clamped to [1.0, 100.0]; it is forced to 1.0 when
// adaptive suppression is disabled.
void AudioProcessor::AdaptSuppressionLevel() {
    if (!adaptive_enabled_ || !echo_params_.adaptive_noise_suppression) {
        adaptive_state_.dynamic_suppression_level = 1.0f;
        return;
    }

    const float base_level = echo_params_.noise_suppression_base;
    // Louder speaker -> stronger suppression.
    const float volume_multiplier =
        1.0f + current_speaker_volume_ * echo_params_.volume_sensitivity;
    // Smaller distance factor -> stronger suppression.
    const float distance_multiplier =
        1.0f / (adaptive_state_.estimated_distance_factor + 0.1f);
    // Stronger echo -> stronger suppression.
    const float echo_multiplier = 1.0f + adaptive_state_.current_echo_strength * 2.0f;

    adaptive_state_.dynamic_suppression_level =
        base_level * volume_multiplier * distance_multiplier * echo_multiplier;

    // High-interference mode: loud speaker, small distance factor, strong echo.
    const bool was_high_interference = adaptive_state_.high_interference_mode;
    adaptive_state_.high_interference_mode =
        (current_speaker_volume_ > 0.3f &&
         adaptive_state_.estimated_distance_factor < 0.5f &&
         adaptive_state_.current_echo_strength > echo_params_.echo_detection_threshold);

    if (adaptive_state_.high_interference_mode) {
        adaptive_state_.dynamic_suppression_level *= 5.0f;
        if (!was_high_interference) {
            ESP_LOGW(TAG, "🔴 Entering HIGH INTERFERENCE mode - vol=%.2f, dist=%.2f, echo=%.3f",
                     current_speaker_volume_, adaptive_state_.estimated_distance_factor,
                     adaptive_state_.current_echo_strength);
        }
    } else if (was_high_interference) {
        ESP_LOGI(TAG, "🟢 Exiting high interference mode - returning to adaptive suppression");
    }

    adaptive_state_.dynamic_suppression_level =
        std::max(1.0f, std::min(100.0f, adaptive_state_.dynamic_suppression_level));

    // Per-update debug log intentionally left disabled (too noisy):
    // ESP_LOGD(TAG, "🎯 Adaptive suppression: vol=%.2f, dist=%.2f, echo=%.3f → level=%.1f %s",
    //          current_speaker_volume_, adaptive_state_.estimated_distance_factor,
    //          adaptive_state_.current_echo_strength, adaptive_state_.dynamic_suppression_level,
    //          adaptive_state_.high_interference_mode ? "[HIGH_INTERFERENCE]" : "");
}

// Returns a copy of the current adaptive noise-suppression state.
AdaptiveNoiseState AudioProcessor::GetAdaptiveState() const {
    return adaptive_state_;
}