toy-hardware/main/audio_processing/audio_processor.cc

485 lines
20 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "audio_processor.h"
#include <esp_log.h>
#include <cmath>
#include <algorithm>
// Event-group bit that is set while processing is enabled; Start() sets it,
// Stop() clears it, and the processing task blocks until it is set.
#define PROCESSOR_RUNNING 0x01
// Tag passed to the ESP_LOG* macros for every message emitted from this file.
static const char* TAG = "AudioProcessor";
AudioProcessor::AudioProcessor()
: afe_data_(nullptr), adaptive_enabled_(true) {
event_group_ = xEventGroupCreate();
}
/// Configures the ESP-SR audio front end (AFE) for the given codec and spawns
/// the FreeRTOS task that runs AudioProcessorTask().
///
/// @param codec          Codec whose input channel count and reference-channel
///                       flag define the AFE input format string.
/// @param realtime_chat  true  -> AEC enabled, VAD_MODE_3, vad_min_noise_ms = 500;
///                       false -> AEC disabled, VAD_MODE_0, vad_min_noise_ms = 100.
///
/// If the codec is null, or the AFE configuration / handle / instance cannot be
/// created, the failure is logged and no task is started.
void AudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
    if (codec == nullptr) {
        ESP_LOGE(TAG, "Initialize called with a null codec");
        return;
    }
    codec_ = codec;

    // Input format: one 'M' per microphone channel, then one 'R' per reference channel.
    const int ref_num = codec_->input_reference() ? 1 : 0;
    const int mic_num = codec_->input_channels() - ref_num;
    std::string input_format;
    for (int i = 0; i < mic_num; i++) {
        input_format.push_back('M');
    }
    for (int i = 0; i < ref_num; i++) {
        input_format.push_back('R');
    }

    srmodel_list_t* models = esp_srmodel_init("model");
    char* ns_model_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, nullptr);

    afe_config_t* afe_config =
        afe_config_init(input_format.c_str(), nullptr, AFE_TYPE_VC, AFE_MODE_HIGH_PERF);
    if (afe_config == nullptr) {
        ESP_LOGE(TAG, "Failed to create AFE configuration");
        return;
    }

    if (realtime_chat) {
        // Realtime mode: AEC plus VAD so the user can interrupt playback by voice.
        afe_config->aec_init = true;
        afe_config->aec_mode = AEC_MODE_VOIP_HIGH_PERF;
        // Strictest VAD mode and a longer minimum-noise window to reduce false triggers.
        afe_config->vad_init = true;
        afe_config->vad_mode = VAD_MODE_3;
        afe_config->vad_min_noise_ms = 500;
        ESP_LOGI(TAG, "Realtime mode: AEC + Strict VAD enabled for voice interrupt (xiaozhi optimized)");
    } else {
        // Non-realtime mode: AEC off, standard VAD.
        afe_config->aec_init = false;
        afe_config->vad_init = true;
        afe_config->vad_mode = VAD_MODE_0;
        afe_config->vad_min_noise_ms = 100;
        ESP_LOGI(TAG, "Non-realtime mode: Standard VAD enabled");
    }

    // Neural-network noise suppression, no AGC.
    afe_config->ns_init = true;
    afe_config->ns_model_name = ns_model_name;
    afe_config->afe_ns_mode = AFE_NS_MODE_NET;
    afe_config->agc_init = false;
    afe_config->memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_INTERNAL;

    // Pin the AFE task to core 1 at priority 5. (The original assigned these
    // fields twice; only the final values are kept.)
    afe_config->afe_perferred_core = 1;
    afe_config->afe_perferred_priority = 5;

    // Log what was actually configured rather than hard-coded literals.
    ESP_LOGI(TAG, "AFE configuration: AEC=%s, VAD=%s, core=%d, priority=%d",
             afe_config->aec_init ? "enabled" : "disabled",
             afe_config->vad_init ? "enabled" : "disabled",
             static_cast<int>(afe_config->afe_perferred_core),
             static_cast<int>(afe_config->afe_perferred_priority));

    afe_iface_ = esp_afe_handle_from_config(afe_config);
    if (afe_iface_ == nullptr) {
        ESP_LOGE(TAG, "Failed to obtain AFE interface handle");
        return;
    }
    afe_data_ = afe_iface_->create_from_config(afe_config);
    if (afe_data_ == nullptr) {
        // The processing task dereferences afe_data_ immediately, so do not start it.
        ESP_LOGE(TAG, "Failed to create AFE instance");
        return;
    }

    // The task body never returns in practice; vTaskDelete is a safety net.
    const BaseType_t created = xTaskCreate([](void* arg) {
        auto* self = static_cast<AudioProcessor*>(arg);
        self->AudioProcessorTask();
        vTaskDelete(nullptr);
    }, "audio_communication", 4096, this, 3, nullptr);
    if (created != pdPASS) {
        ESP_LOGE(TAG, "Failed to create audio processing task");
    }
}
/// Releases the AFE instance (when one was created) and then the event group.
AudioProcessor::~AudioProcessor() {
    const bool has_afe_instance = (afe_data_ != nullptr);
    if (has_afe_instance) {
        afe_iface_->destroy(afe_data_);
    }
    // NOTE(review): the task created in Initialize() is not stopped here; confirm
    // the object is never destroyed while that task can still run.
    vEventGroupDelete(event_group_);
}
/// Hands one chunk of raw input samples to the AFE.
///
/// Does nothing when no AFE instance exists or when `data` is empty (an empty
/// vector has no readable samples behind data()).
///
/// @param data  Input samples. NOTE(review): the AFE presumably reads a fixed
///              number of samples per call (see GetFeedSize()); the caller is
///              responsible for supplying at least that many — confirm.
void AudioProcessor::Feed(const std::vector<int16_t>& data) {
    if (afe_data_ == nullptr || data.empty()) {
        return;
    }
    // const_cast makes the constness drop explicit; needed only if this AFE
    // version declares feed() with a non-const sample pointer.
    afe_iface_->feed(afe_data_, const_cast<int16_t*>(data.data()));
}
/// Enables processing by setting the PROCESSOR_RUNNING bit the task waits on.
void AudioProcessor::Start() {
    static_cast<void>(xEventGroupSetBits(event_group_, PROCESSOR_RUNNING));
}
/// Disables processing by clearing the PROCESSOR_RUNNING bit; the task then
/// discards fetched frames and blocks until Start() is called again.
void AudioProcessor::Stop() {
    static_cast<void>(xEventGroupClearBits(event_group_, PROCESSOR_RUNNING));
}
/// @return true while the PROCESSOR_RUNNING bit is set in the event group.
bool AudioProcessor::IsRunning() {
    const auto current_bits = xEventGroupGetBits(event_group_);
    const bool running = (current_bits & PROCESSOR_RUNNING) != 0;
    return running;
}
/// Registers the handler that receives each processed audio frame.
///
/// @param callback  Invoked from the processing task with a vector of samples
///                  that the handler takes ownership of.
void AudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
    // The parameter is already our own copy; move it instead of copying again.
    output_callback_ = std::move(callback);
}
/// Registers the handler for the echo-aware VAD path.
///
/// @param callback  Invoked from the processing task with true when speech is
///                  accepted as started and false when it ends.
void AudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
    // The parameter is already our own copy; move it instead of copying again.
    vad_state_change_callback_ = std::move(callback);
}
/// Registers the handler for the plain VAD path, which forwards the AFE's VAD
/// state transitions without any echo filtering.
///
/// @param callback  Invoked from the processing task with true on a
///                  silence-to-speech transition and false on the reverse.
void AudioProcessor::OnSimpleVadStateChange(std::function<void(bool speaking)> callback) {
    // The parameter is already our own copy; move it instead of copying again.
    simple_vad_state_change_callback_ = std::move(callback);
}
/// @return The AFE's feed chunk size, or 0 when the AFE has not been created.
size_t AudioProcessor::GetFeedSize() {
    if (afe_iface_ == nullptr || afe_data_ == nullptr) {
        return 0;
    }
    return afe_iface_->get_feed_chunksize(afe_data_);
}
/// Body of the processing task: loops forever, fetching processed frames from
/// the AFE and dispatching them to the registered callbacks.
///
/// Per iteration:
///   1. Block until PROCESSOR_RUNNING is set.
///   2. Fetch one result from the AFE; drop it if the processor was stopped
///      in the meantime or the fetch failed.
///   3. Simple VAD path: report raw speech/silence transitions.
///   4. Echo-aware VAD path: report speech start only if
///      EvaluateSpeechWithEchoAwareness() accepts the frame.
///   5. Hand a copy of the frame's samples to the output callback.
void AudioProcessor::AudioProcessorTask() {
    const auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
    const auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
    ESP_LOGI(TAG, "Audio communication task started, feed size: %d fetch size: %d",
             feed_size, fetch_size);

    while (true) {
        xEventGroupWaitBits(event_group_, PROCESSOR_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);

        auto res = afe_iface_->fetch_with_delay(afe_data_, portMAX_DELAY);

        // Stop() may have been called while we were blocked in the fetch.
        if ((xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING) == 0) {
            continue;
        }
        if (res == nullptr || res->ret_value == ESP_FAIL) {
            if (res != nullptr) {
                ESP_LOGI(TAG, "Error code: %d", res->ret_value);
            }
            continue;
        }

        // Simple VAD path (general use such as touch-ignore or LED state):
        // forward the AFE's own VAD transitions unchanged.
        if (simple_vad_state_change_callback_) {
            // NOTE(review): function-local static, so this state is shared by
            // every AudioProcessor instance.
            static bool simple_is_speaking = false;
            if (res->vad_state == VAD_SPEECH && !simple_is_speaking) {
                simple_is_speaking = true;
                simple_vad_state_change_callback_(true);
            } else if (res->vad_state == VAD_SILENCE && simple_is_speaking) {
                simple_is_speaking = false;
                simple_vad_state_change_callback_(false);
            }
        }

        // Echo-aware VAD path (voice interruption): a speech start is reported
        // only if the frame passes the echo-aware evaluation.
        if (vad_state_change_callback_) {
            const bool human_voice_detected = (res->vad_state == VAD_SPEECH);
            if (human_voice_detected && !is_speaking_) {
                if (EvaluateSpeechWithEchoAwareness(res)) {
                    is_speaking_ = true;
                    ESP_LOGI(TAG, "VAD: Human voice detected (echo-aware filtering)");
                    vad_state_change_callback_(true);
                } else {
                    ESP_LOGV(TAG, "VAD: Voice rejected (likely device echo)");
                }
            } else if (!human_voice_detected && is_speaking_) {
                // Speech ended: the VAD no longer reports speech.
                is_speaking_ = false;
                ESP_LOGI(TAG, "VAD: Human voice ended");
                vad_state_change_callback_(false);
            }
        }

        if (output_callback_) {
            // Copy the frame out of the AFE-owned buffer in one step. The range
            // constructor uses the same default allocator as the per-sample
            // push_back loop it replaces, so the vector's storage is unchanged.
            const auto* samples = static_cast<const int16_t*>(res->data);
            const size_t sample_count = res->data_size / sizeof(int16_t);
            output_callback_(std::vector<int16_t>(samples, samples + sample_count));
        }
    }
}
// ---- Echo-aware VAD tuning ----

/// Replaces the echo-aware VAD parameter set and logs the key values.
///
/// @param params  New parameters; copied into the processor.
void AudioProcessor::SetEchoAwareParams(const EchoAwareVadParams& params) {
    echo_params_ = params;
    ESP_LOGI(TAG,
             "Echo-aware VAD params updated: snr_threshold=%.2f, min_silence=%dms, cooldown=%dms",
             params.snr_threshold,
             params.min_silence_ms,
             params.interrupt_cooldown_ms);
}
void AudioProcessor::SetSpeakerVolume(float volume) {
current_speaker_volume_ = volume;
// 🎯 触发自适应噪声抑制更新
if (adaptive_enabled_ && echo_params_.adaptive_noise_suppression) {
AdaptSuppressionLevel();
}
ESP_LOGV(TAG, "Speaker volume updated: %.2f, adaptive suppression: %.2f",
volume, adaptive_state_.dynamic_suppression_level);
}
bool AudioProcessor::IsEchoSuppressed() const {
return aec_converged_;
}
/// Decides whether a frame that the AFE's VAD flagged as speech should be
/// reported as genuine human speech rather than the device's own echo.
///
/// Returns false when the result is null, failed, not flagged as speech, or
/// holds fewer than two samples. When echo_params_.adaptive_threshold is off
/// the VAD verdict is trusted as-is. Otherwise ALL of the following checks
/// must pass: mean energy, peak amplitude, "high-frequency" ratio, variation,
/// 10-frame continuity, and no playback within the last 30 s.
///
/// NOTE(review): the thresholds are intentionally extreme. For non-negative
/// speaker volume the peak threshold is at least 500000, while an int16 sample
/// magnitude cannot exceed 32768, so the peak check (and therefore this
/// function in adaptive mode) cannot currently succeed — confirm this is the
/// intended way to disable voice interruption.
///
/// NOTE(review): the frame-to-frame history is kept in function-local statics,
/// so it is shared by all instances and is not synchronised.
///
/// @param fetch_result  AFE fetch result for the current frame; may be null.
/// @return true if the frame is accepted as human speech.
bool AudioProcessor::EvaluateSpeechWithEchoAwareness(afe_fetch_result_t* fetch_result) {
    if (fetch_result == nullptr || fetch_result->ret_value != ESP_OK) {
        return false;
    }
    // Only frames the AFE VAD already classified as speech are considered.
    if (fetch_result->vad_state != VAD_SPEECH) {
        return false;
    }
    // Non-adaptive mode: trust the VAD result directly.
    if (!echo_params_.adaptive_threshold) {
        return true;
    }

    const auto* audio_data = static_cast<const int16_t*>(fetch_result->data);
    const size_t sample_count = fetch_result->data_size / sizeof(int16_t);
    // The statistics below divide by sample_count, sample_count / 2 and
    // sample_count - 1; a shorter frame would feed NaN/inf into the persistent
    // state, so such frames are rejected up front.
    if (audio_data == nullptr || sample_count < 2) {
        return false;
    }

    // Mean energy and peak magnitude of the frame.
    float energy = 0.0f;
    float peak_amplitude = 0.0f;
    for (size_t i = 0; i < sample_count; i++) {
        const float sample = static_cast<float>(std::abs(static_cast<int>(audio_data[i])));
        energy += sample * sample;
        if (sample > peak_amplitude) {
            peak_amplitude = sample;
        }
    }
    energy = energy / sample_count;

    // Refresh the adaptive state first, then read the suppression level it produced.
    UpdateAdaptiveNoiseState(audio_data, sample_count);
    const float adaptive_suppression =
        adaptive_enabled_ && echo_params_.adaptive_noise_suppression
            ? adaptive_state_.dynamic_suppression_level
            : 1.0f;

    // Thresholds scale with speaker volume and the adaptive suppression level.
    const bool speaker_active = current_speaker_volume_ > 0.0001f;
    const float volume_factor = 1.0f + current_speaker_volume_ * 500.0f;
    const float scaled_snr_threshold =
        echo_params_.snr_threshold * volume_factor * adaptive_suppression;
    float energy_threshold = scaled_snr_threshold * 10000000000.0f;
    float peak_threshold = 500000.0f * volume_factor;
    if (speaker_active) {
        // Any audible playback raises both thresholds much further.
        energy_threshold *= 1000.0f;
        peak_threshold *= 500.0f;
    }
    const bool energy_check = (energy > energy_threshold);
    const bool peak_check = (peak_amplitude > peak_threshold);

    // "High-frequency" ratio. NOTE(review): this is the mean energy of the
    // second half of the frame in the time domain, not a spectral measure.
    float high_freq_energy = 0.0f;
    for (size_t i = sample_count / 2; i < sample_count; i++) {
        const float sample = static_cast<float>(std::abs(static_cast<int>(audio_data[i])));
        high_freq_energy += sample * sample;
    }
    high_freq_energy = high_freq_energy / (sample_count / 2);
    const float high_freq_ratio = (energy > 0) ? (high_freq_energy / energy) : 0.0f;
    float freq_threshold = 1.2f * volume_factor;
    if (speaker_active) {
        freq_threshold *= 50.0f;
    }
    const bool freq_check = (high_freq_ratio > freq_threshold);

    // Variation check: mean squared difference of successive sample magnitudes.
    float variance = 0.0f;
    for (size_t i = 1; i < sample_count; i++) {
        const float diff = static_cast<float>(std::abs(static_cast<int>(audio_data[i])) -
                                              std::abs(static_cast<int>(audio_data[i - 1])));
        variance += diff * diff;
    }
    variance = variance / (sample_count - 1);
    float variance_threshold = 10000000000.0f / volume_factor;
    if (speaker_active) {
        variance_threshold *= 100.0f;
    }
    const bool stability_check = (variance > variance_threshold);

    // Continuity check: 10 consecutive frames must each show a large change in
    // both energy and ratio relative to the previous frame.
    static float prev_energy = 0.0f;
    static float prev_high_freq_ratio = 0.0f;
    static int consistent_frames = 0;
    // std::fabs keeps these in floating point; an unqualified abs() can resolve
    // to the C int overload and truncate fractional differences to zero.
    const float energy_change = std::fabs(energy - prev_energy) / (prev_energy + 1.0f);
    const float freq_change = std::fabs(high_freq_ratio - prev_high_freq_ratio);
    const bool frame_continuity = (energy_change > 1.2f && freq_change > 0.5f);
    if (frame_continuity) {
        consistent_frames++;
    } else {
        consistent_frames = 0;
    }
    const bool continuity_check = (consistent_frames >= 10);
    prev_energy = energy;
    prev_high_freq_ratio = high_freq_ratio;

    // Playback protection: reject for 30 s after the speaker volume was last
    // seen above 0.01. The timestamp starts at the first call, so the first
    // 30 s after that call are protected as well.
    static auto last_volume_update = std::chrono::steady_clock::now();
    const auto now = std::chrono::steady_clock::now();
    if (current_speaker_volume_ > 0.01f) {
        last_volume_update = now;
    }
    const auto time_since_playback =
        std::chrono::duration_cast<std::chrono::milliseconds>(now - last_volume_update);
    const bool recent_playback_protection = (time_since_playback.count() < 30000);

    const bool final_result = energy_check && peak_check && freq_check && stability_check &&
                              continuity_check && !recent_playback_protection;

    // Only successful detections are logged; per-frame detail would be too chatty.
    if (final_result) {
        ESP_LOGI(TAG, "🎯 HUMAN VOICE DETECTED: duration=%.0fms, vol=%.3f, adaptive=%.1f%s",
                 static_cast<float>(time_since_playback.count()), current_speaker_volume_,
                 adaptive_suppression,
                 adaptive_state_.high_interference_mode ? "[HIGH_INTERFERENCE]" : "");
    }
    return final_result;
}
// ---- Adaptive noise suppression ----

/// Updates the adaptive noise state from one frame of audio: echo strength,
/// estimated distance factor, ambient-noise baseline (only while the speaker
/// volume is below 0.01), and the resulting suppression level.
///
/// Does nothing when adaptive suppression is disabled, or when the frame is
/// null or empty (an empty frame would compute 0/0 and permanently poison the
/// moving-average noise baseline with NaN).
///
/// @param audio_data    Frame samples.
/// @param sample_count  Number of samples in `audio_data`.
void AudioProcessor::UpdateAdaptiveNoiseState(const int16_t* audio_data, size_t sample_count) {
    if (!adaptive_enabled_ || !echo_params_.adaptive_noise_suppression) {
        return;
    }
    if (audio_data == nullptr || sample_count == 0) {
        return;
    }

    // Current echo strength.
    const float echo_strength = CalculateEchoStrength(audio_data, sample_count);
    adaptive_state_.current_echo_strength = echo_strength;

    // Distance factor derived from echo strength and speaker volume.
    adaptive_state_.estimated_distance_factor =
        EstimateDistanceFactor(echo_strength, current_speaker_volume_);

    // Track ambient noise only while the speaker is (almost) silent.
    if (current_speaker_volume_ < 0.01f) {
        float magnitude_sum = 0.0f;
        for (size_t i = 0; i < sample_count; i++) {
            magnitude_sum += std::abs(static_cast<int>(audio_data[i]));
        }
        const float current_noise = magnitude_sum / sample_count;
        // Exponential moving average: 95% history, 5% current frame.
        adaptive_state_.noise_baseline =
            adaptive_state_.noise_baseline * 0.95f + current_noise * 0.05f;
    }

    AdaptSuppressionLevel();
    adaptive_state_.last_adaptation_time = std::chrono::steady_clock::now();
}
/// Computes an echo indicator for one frame: (peak magnitude / RMS) scaled by
/// the current speaker volume. Returns 0 while the speaker volume is below 0.001.
///
/// The author's heuristic is that device echo concentrates its energy (higher
/// peak-to-RMS ratio) whereas human speech is spread more evenly (lower ratio).
///
/// @param audio_data    Frame samples.
/// @param sample_count  Number of samples in `audio_data`.
float AudioProcessor::CalculateEchoStrength(const int16_t* audio_data, size_t sample_count) {
    const bool speaker_silent = current_speaker_volume_ < 0.001f;
    if (speaker_silent) {
        return 0.0f;  // No playback, hence no echo.
    }

    float sum_of_squares = 0.0f;
    float max_magnitude = 0.0f;
    const int16_t* const frame_end = audio_data + sample_count;
    for (const int16_t* cursor = audio_data; cursor != frame_end; ++cursor) {
        const float magnitude = std::abs(static_cast<int>(*cursor));
        sum_of_squares += magnitude * magnitude;
        max_magnitude = std::max(max_magnitude, magnitude);
    }

    const float rms = std::sqrt(sum_of_squares / sample_count);

    float peak_to_rms = 0.0f;
    if (rms > 0) {
        peak_to_rms = max_magnitude / rms;
    }
    return peak_to_rms * current_speaker_volume_;
}
/// Maps echo strength and speaker volume to a distance factor in [0.1, 1.0];
/// smaller means closer, larger means farther.
///
/// factor = 1 / (1 + (echo_strength / (volume + 0.001)) * volume_sensitivity),
/// then limited to the range above. Returns 1.0 when `volume` is below 0.001.
///
/// @param echo_strength  Echo indicator for the current frame.
/// @param volume         Current speaker volume.
float AudioProcessor::EstimateDistanceFactor(float echo_strength, float volume) {
    if (volume < 0.001f) {
        return 1.0f;  // Speaker silent: distance is treated as irrelevant.
    }

    // Echo normalised by volume; the 0.001 offset keeps the divisor away from zero.
    const float echo_per_volume = echo_strength / (volume + 0.001f);
    const float raw_factor = 1.0f / (1.0f + echo_per_volume * echo_params_.volume_sensitivity);

    // Limit to [0.1, 1.0]. Per-call debug logging is deliberately omitted here
    // because it was too frequent.
    const float upper_limited = std::min(1.0f, raw_factor);
    return std::max(0.1f, upper_limited);
}
void AudioProcessor::AdaptSuppressionLevel() {
if (!adaptive_enabled_ || !echo_params_.adaptive_noise_suppression) {
adaptive_state_.dynamic_suppression_level = 1.0f;
return;
}
// 🎯 自适应抑制级别计算
// 基础抑制级别
float base_level = echo_params_.noise_suppression_base;
// 🔊 音量影响:音量越大,抑制越强
float volume_multiplier = 1.0f + current_speaker_volume_ * echo_params_.volume_sensitivity;
// 📏 距离影响:距离越近,抑制越强
float distance_multiplier = 1.0f / (adaptive_state_.estimated_distance_factor + 0.1f);
// 🌊 回声强度影响:回声越强,抑制越强
float echo_multiplier = 1.0f + adaptive_state_.current_echo_strength * 2.0f;
// 🎯 综合计算动态抑制级别
adaptive_state_.dynamic_suppression_level = base_level * volume_multiplier * distance_multiplier * echo_multiplier;
// 📊 高干扰模式判断
bool was_high_interference = adaptive_state_.high_interference_mode;
adaptive_state_.high_interference_mode = (
current_speaker_volume_ > 0.3f && // 高音量
adaptive_state_.estimated_distance_factor < 0.5f && // 近距离
adaptive_state_.current_echo_strength > echo_params_.echo_detection_threshold // 强回声
);
// 🚨 高干扰模式额外保护
if (adaptive_state_.high_interference_mode) {
adaptive_state_.dynamic_suppression_level *= 5.0f; // 高干扰时5倍抑制
if (!was_high_interference) {
ESP_LOGW(TAG, "🔴 Entering HIGH INTERFERENCE mode - vol=%.2f, dist=%.2f, echo=%.3f",
current_speaker_volume_, adaptive_state_.estimated_distance_factor,
adaptive_state_.current_echo_strength);
}
} else if (was_high_interference) {
ESP_LOGI(TAG, "🟢 Exiting high interference mode - returning to adaptive suppression");
}
// 📏 限制抑制级别范围 [1.0, 100.0]
adaptive_state_.dynamic_suppression_level = std::max(1.0f, std::min(100.0f, adaptive_state_.dynamic_suppression_level));
// 🔕 注释掉过于频繁的自适应抑制日志
// ESP_LOGD(TAG, "🎯 Adaptive suppression: vol=%.2f, dist=%.2f, echo=%.3f → level=%.1f %s",
// current_speaker_volume_, adaptive_state_.estimated_distance_factor,
// adaptive_state_.current_echo_strength, adaptive_state_.dynamic_suppression_level,
// adaptive_state_.high_interference_mode ? "[HIGH_INTERFERENCE]" : "");
}
/// @return A copy of the current adaptive noise-suppression state.
AdaptiveNoiseState AudioProcessor::GetAdaptiveState() const {
    AdaptiveNoiseState snapshot = adaptive_state_;
    return snapshot;
}