// toy-hardware/main/audio_processing/audio_processor.cc

#include "audio_processor.h"

#include <esp_log.h>

#include <algorithm>
#include <cmath>
#include <utility>

// Event-group bit that marks the processor as active: set by Start(), cleared
// by Stop(), and awaited by the audio task before it fetches a frame.
#define PROCESSOR_RUNNING 0x01

// Log tag passed to the ESP_LOGx macros throughout this file.
static const char* TAG = "AudioProcessor";

AudioProcessor::AudioProcessor()
    : afe_data_(nullptr), adaptive_enabled_(true) {
    event_group_ = xEventGroupCreate();
}

// Configures the ESP-SR audio front end (AFE) for `codec` and spawns the task
// that drains processed audio.
//
// codec          Codec whose channel layout defines the AFE input format; the
//                pointer is stored in codec_.
// realtime_chat  true  -> AEC (VoIP high-perf mode) plus VAD mode 3 with a
//                         500 ms minimum-noise window (voice interrupt).
//                false -> AEC off, VAD mode 0 with a 100 ms window.
//
// If any AFE setup step fails an error is logged and no task is started;
// afe_data_ is then null, so Feed() is a no-op and GetFeedSize() returns 0.
//
// NOTE(review): afe_config is not released here — confirm whether ESP-SR
// allows freeing it once create_from_config() has returned.
void AudioProcessor::Initialize(AudioCodec* codec, bool realtime_chat) {
    codec_ = codec;
    const int ref_num = codec_->input_reference() ? 1 : 0;

    // AFE input layout: one 'M' per microphone channel followed by one 'R'
    // per reference (playback loopback) channel.
    std::string input_format;
    for (int i = 0; i < codec_->input_channels() - ref_num; i++) {
        input_format.push_back('M');
    }
    for (int i = 0; i < ref_num; i++) {
        input_format.push_back('R');
    }

    srmodel_list_t* models = esp_srmodel_init("model");
    char* ns_model_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, nullptr);

    afe_config_t* afe_config = afe_config_init(input_format.c_str(), nullptr, AFE_TYPE_VC, AFE_MODE_HIGH_PERF);
    if (afe_config == nullptr) {
        ESP_LOGE(TAG, "Failed to create AFE config for input format \"%s\"", input_format.c_str());
        return;
    }

    if (realtime_chat) {
        // Realtime mode: AEC + VAD tuned for voice interrupt, following the
        // Xiaozhi AI reference setup.
        afe_config->aec_init = true;
        afe_config->aec_mode = AEC_MODE_VOIP_HIGH_PERF;  // high-performance AEC

        // Strict VAD settings to cut down on false triggers.
        afe_config->vad_init = true;
        afe_config->vad_mode = VAD_MODE_3;   // strictest mode per the original author's note
        afe_config->vad_min_noise_ms = 500;  // longer silence window (500 ms)

        ESP_LOGI(TAG, "Realtime mode: AEC + Strict VAD enabled for voice interrupt (xiaozhi optimized)");
    } else {
        // Non-realtime mode: no AEC, standard VAD.
        afe_config->aec_init = false;
        afe_config->vad_init = true;
        afe_config->vad_mode = VAD_MODE_0;
        afe_config->vad_min_noise_ms = 100;

        ESP_LOGI(TAG, "Non-realtime mode: Standard VAD enabled");
    }

    // Neural noise suppression needs a model; without one, asking for
    // AFE_NS_MODE_NET with a null model name is invalid, so turn NS off.
    if (ns_model_name != nullptr) {
        afe_config->ns_init = true;
        afe_config->ns_model_name = ns_model_name;
        afe_config->afe_ns_mode = AFE_NS_MODE_NET;
    } else {
        ESP_LOGW(TAG, "No NSNet model found, noise suppression disabled");
        afe_config->ns_init = false;
    }
    afe_config->agc_init = false;
    afe_config->memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_INTERNAL;

    // Pin the AFE worker to core 1 at priority 5 to keep audio processing
    // responsive. (The earlier priority-1 assignment was dead code: it was
    // always overwritten by this one.)
    afe_config->afe_perferred_core = 1;
    afe_config->afe_perferred_priority = 5;

    // Report what was actually configured rather than hard-coded values.
    ESP_LOGI(TAG, "AFE configuration: AEC=%s, VAD=%s, core=%d, priority=%d",
             afe_config->aec_init ? "enabled" : "disabled",
             afe_config->vad_init ? "enabled" : "disabled",
             static_cast<int>(afe_config->afe_perferred_core),
             static_cast<int>(afe_config->afe_perferred_priority));

    afe_iface_ = esp_afe_handle_from_config(afe_config);
    if (afe_iface_ == nullptr) {
        ESP_LOGE(TAG, "Failed to obtain AFE interface");
        return;
    }

    afe_data_ = afe_iface_->create_from_config(afe_config);
    if (afe_data_ == nullptr) {
        ESP_LOGE(TAG, "Failed to create AFE instance");
        return;
    }

    // The task only starts once the AFE exists, because AudioProcessorTask()
    // dereferences afe_iface_/afe_data_ unconditionally.
    const auto created = xTaskCreate([](void* arg) {
        auto* self = static_cast<AudioProcessor*>(arg);
        self->AudioProcessorTask();
        vTaskDelete(nullptr);
    }, "audio_communication", 4096, this, 3, nullptr);
    if (created != pdPASS) {
        ESP_LOGE(TAG, "Failed to create audio_communication task");
    }
}

// Destroys the AFE instance (when one was created) and the event group.
//
// NOTE(review): the task spawned by Initialize() loops forever and is not
// stopped here, so it would keep using this object after destruction —
// presumably the processor lives for the whole program; confirm.
AudioProcessor::~AudioProcessor() {
    if (afe_data_) {
        afe_iface_->destroy(afe_data_);
    }

    vEventGroupDelete(event_group_);
}

// Hands one chunk of captured samples to the AFE. Calls made before the AFE
// instance exists are silently ignored.
//
// NOTE(review): the length of `data` is not checked — presumably callers size
// it from GetFeedSize(); confirm against the call sites.
void AudioProcessor::Feed(const std::vector<int16_t>& data) {
    if (afe_data_ == nullptr) {
        return;
    }

    // The cast only strips const from the pointer handed to the C API.
    afe_iface_->feed(afe_data_, const_cast<int16_t*>(data.data()));
}

// Sets PROCESSOR_RUNNING, which releases the audio task from its wait so it
// starts fetching and delivering frames.
void AudioProcessor::Start() {
    xEventGroupSetBits(event_group_, PROCESSOR_RUNNING);
}

// Clears PROCESSOR_RUNNING; the audio task discards any frame it fetches
// while the bit is clear and then blocks until Start() is called again.
void AudioProcessor::Stop() {
    xEventGroupClearBits(event_group_, PROCESSOR_RUNNING);
}

// Reports whether PROCESSOR_RUNNING is currently set, i.e. Start() was called
// more recently than Stop().
bool AudioProcessor::IsRunning() {
    const auto bits = xEventGroupGetBits(event_group_);
    const bool running = (bits & PROCESSOR_RUNNING) != 0;
    return running;
}

// Registers the sink that receives every processed audio chunk fetched by the
// audio task. The task skips delivery while no (or an empty) callback is set.
void AudioProcessor::OnOutput(std::function<void(std::vector<int16_t>&& data)> callback) {
    // `callback` is a by-value sink parameter: move it into place instead of
    // copying the std::function (and its captured state) a second time.
    output_callback_ = std::move(callback);
}

// Registers the listener for the echo-aware VAD: the audio task calls it with
// true when a speech frame passes EvaluateSpeechWithEchoAwareness() and with
// false when the VAD subsequently stops reporting speech.
void AudioProcessor::OnVadStateChange(std::function<void(bool speaking)> callback) {
    // `callback` is a by-value sink parameter: move it into place instead of
    // copying the std::function (and its captured state) a second time.
    vad_state_change_callback_ = std::move(callback);
}

// Registers the listener for the plain VAD: the audio task calls it on every
// speech <-> silence transition reported by the AFE, with no echo filtering.
void AudioProcessor::OnSimpleVadStateChange(std::function<void(bool speaking)> callback) {
    // `callback` is a by-value sink parameter: move it into place instead of
    // copying the std::function (and its captured state) a second time.
    simple_vad_state_change_callback_ = std::move(callback);
}

// Returns the chunk size the AFE reports for feed(), or 0 while the AFE has
// not been created. (Unit as defined by ESP-SR's get_feed_chunksize.)
size_t AudioProcessor::GetFeedSize() {
    const bool afe_ready = (afe_iface_ != nullptr) && (afe_data_ != nullptr);
    if (!afe_ready) {
        return 0;
    }

    return afe_iface_->get_feed_chunksize(afe_data_);
}

// Body of the "audio_communication" task created by Initialize(). It blocks
// until PROCESSOR_RUNNING is set, fetches processed frames from the AFE,
// reports VAD transitions to the registered listeners and forwards the audio
// to the output callback. The loop never exits.
void AudioProcessor::AudioProcessorTask() {
    auto fetch_size = afe_iface_->get_fetch_chunksize(afe_data_);
    auto feed_size = afe_iface_->get_feed_chunksize(afe_data_);
    ESP_LOGI(TAG, "Audio communication task started, feed size: %d fetch size: %d",
        feed_size, fetch_size);

    // Last state reported through the simple VAD listener. Kept as a local of
    // this never-returning task so each processor tracks its own state; a
    // function-local static would be shared by every instance.
    bool simple_is_speaking = false;

    while (true) {
        // Sleep until Start() sets the running bit (the bit is not cleared on exit).
        xEventGroupWaitBits(event_group_, PROCESSOR_RUNNING, pdFALSE, pdTRUE, portMAX_DELAY);

        auto res = afe_iface_->fetch_with_delay(afe_data_, portMAX_DELAY);

        // Stop() may have been called while we were blocked in fetch: drop the frame.
        if ((xEventGroupGetBits(event_group_) & PROCESSOR_RUNNING) == 0) {
            continue;
        }
        if (res == nullptr || res->ret_value == ESP_FAIL) {
            if (res != nullptr) {
                ESP_LOGI(TAG, "Error code: %d", res->ret_value);
            }
            continue;
        }

        // Simple VAD: forwards the AFE's own speech/silence verdict unchanged.
        // Intended for ordinary features (ignoring touch input, LED state, ...).
        if (simple_vad_state_change_callback_) {
            if (res->vad_state == VAD_SPEECH && !simple_is_speaking) {
                simple_is_speaking = true;
                simple_vad_state_change_callback_(true);
            } else if (res->vad_state == VAD_SILENCE && simple_is_speaking) {
                simple_is_speaking = false;
                simple_vad_state_change_callback_(false);
            }
        }

        // Echo-aware VAD: used for voice interrupt; a speech onset is reported
        // only if the frame also passes the echo heuristics.
        if (vad_state_change_callback_) {
            const bool human_voice_detected = (res->vad_state == VAD_SPEECH);

            if (human_voice_detected && !is_speaking_) {
                // Speech onset: separate real voice from the device's own echo.
                if (EvaluateSpeechWithEchoAwareness(res)) {
                    is_speaking_ = true;
                    ESP_LOGI(TAG, "VAD: Human voice detected (echo-aware filtering)");
                    vad_state_change_callback_(true);
                } else {
                    ESP_LOGV(TAG, "VAD: Voice rejected (likely device echo)");
                }
            } else if (!human_voice_detected && is_speaking_) {
                // Speech end: the VAD no longer reports speech.
                is_speaking_ = false;
                ESP_LOGI(TAG, "VAD: Human voice ended");
                vad_state_change_callback_(false);
            }
        }

        if (output_callback_) {
            // Copy the frame out of the AFE-owned buffer into a vector the
            // callback can take ownership of. One range construction performs
            // the same copy as per-sample push_back, through the same allocator.
            const size_t sample_count = res->data_size / sizeof(int16_t);
            const int16_t* samples = reinterpret_cast<const int16_t*>(res->data);
            std::vector<int16_t> audio_data(samples, samples + sample_count);

            output_callback_(std::move(audio_data));
        }
    }
}

// Echo-aware VAD tuning.
//
// Replaces the parameter set read by the echo-aware VAD and the adaptive
// suppression logic, then logs the SNR threshold, minimum silence and
// interrupt cooldown that were supplied.
void AudioProcessor::SetEchoAwareParams(const EchoAwareVadParams& params) {
    echo_params_ = params;
    ESP_LOGI(TAG, "Echo-aware VAD params updated: snr_threshold=%.2f, min_silence=%dms, cooldown=%dms",
             params.snr_threshold, params.min_silence_ms, params.interrupt_cooldown_ms);
}

void AudioProcessor::SetSpeakerVolume(float volume) {
    current_speaker_volume_ = volume;

    // 🎯 触发自适应噪声抑制更新
    if (adaptive_enabled_ && echo_params_.adaptive_noise_suppression) {
        AdaptSuppressionLevel();
    }

    ESP_LOGV(TAG, "Speaker volume updated: %.2f, adaptive suppression: %.2f",
             volume, adaptive_state_.dynamic_suppression_level);
}

// Returns the aec_converged_ flag.
// NOTE(review): nothing in this file ever writes aec_converged_ — confirm
// where (or whether) it is updated.
bool AudioProcessor::IsEchoSuppressed() const {
    return aec_converged_;
}

// Decides whether a frame the AFE flagged as speech should count as a real
// human voice rather than the device's own playback echo.
//
// fetch_result  Frame returned by the AFE fetch call; may be null.
//
// Returns false when the frame is missing or failed, is not VAD_SPEECH, or
// holds fewer than two samples. Returns true unconditionally when
// echo_params_.adaptive_threshold is off (the VAD verdict is trusted as-is).
// Otherwise returns true only if every heuristic below passes.
//
// NOTE(review): the heuristics are tuned so aggressively that the adaptive
// path looks unable to accept any frame: 16-bit samples never exceed 32768 in
// magnitude, while the peak threshold starts at 500000 and only grows for a
// non-negative speaker volume. This appears deliberate ("ultra-aggressive"
// tuning in the original comments) — confirm before relying on voice interrupt.
//
// NOTE(review): the frame history lives in function-local statics and is
// therefore shared by all AudioProcessor instances; making it per-instance
// requires new members in the header.
bool AudioProcessor::EvaluateSpeechWithEchoAwareness(afe_fetch_result_t* fetch_result) {
    if (!fetch_result || fetch_result->ret_value != ESP_OK) {
        return false;
    }

    // Only frames the AFE's own VAD classified as speech are examined.
    if (fetch_result->vad_state != VAD_SPEECH) {
        return false;
    }

    // Non-adaptive mode: trust the VAD verdict directly.
    if (!echo_params_.adaptive_threshold) {
        return true;
    }

    const int16_t* audio_data = reinterpret_cast<const int16_t*>(fetch_result->data);
    const size_t sample_count = fetch_result->data_size / sizeof(int16_t);

    // The statistics below divide by sample_count, sample_count / 2 and
    // sample_count - 1. Shorter frames would yield NaN/inf (and an unsigned
    // underflow), so reject them up front.
    if (audio_data == nullptr || sample_count < 2) {
        return false;
    }

    // Mean energy and peak magnitude of the chunk.
    float energy = 0.0f;
    float peak_amplitude = 0.0f;
    for (size_t i = 0; i < sample_count; i++) {
        const float sample = std::fabs(static_cast<float>(audio_data[i]));
        energy += sample * sample;
        peak_amplitude = std::max(peak_amplitude, sample);
    }
    energy /= static_cast<float>(sample_count);

    // Refresh the adaptive state first so the suppression level read below
    // reflects this frame.
    UpdateAdaptiveNoiseState(audio_data, sample_count);

    const float adaptive_suppression =
        (adaptive_enabled_ && echo_params_.adaptive_noise_suppression)
            ? adaptive_state_.dynamic_suppression_level
            : 1.0f;

    // Thresholds grow with speaker volume and with the adaptive suppression level.
    const float volume_factor = 1.0f + current_speaker_volume_ * 500.0f;
    const float adaptive_threshold = echo_params_.snr_threshold * volume_factor * adaptive_suppression;
    float energy_threshold = adaptive_threshold * 10000000000.0f;
    float peak_threshold = 500000.0f * volume_factor;

    // Any audible playback raises the bar much further.
    const bool speaker_active = current_speaker_volume_ > 0.0001f;
    if (speaker_active) {
        energy_threshold *= 1000.0f;
        peak_threshold *= 500.0f;
    }
    const bool energy_check = (energy > energy_threshold);
    const bool peak_check = (peak_amplitude > peak_threshold);

    // "High-frequency" proxy: mean energy of the second half of the chunk
    // relative to the whole chunk. This is a time-domain measure, not a
    // spectral analysis.
    float high_freq_energy = 0.0f;
    for (size_t i = sample_count / 2; i < sample_count; i++) {
        const float sample = std::fabs(static_cast<float>(audio_data[i]));
        high_freq_energy += sample * sample;
    }
    high_freq_energy /= static_cast<float>(sample_count / 2);

    const float high_freq_ratio = (energy > 0) ? (high_freq_energy / energy) : 0.0f;
    float freq_threshold = 1.2f * volume_factor;
    if (speaker_active) {
        freq_threshold *= 50.0f;
    }
    const bool freq_check = (high_freq_ratio > freq_threshold);

    // Variability check: mean squared change in magnitude between neighbouring
    // samples (the heuristic assumes voice fluctuates more than echo).
    float variance = 0.0f;
    for (size_t i = 1; i < sample_count; i++) {
        const float diff = std::fabs(static_cast<float>(audio_data[i])) -
                           std::fabs(static_cast<float>(audio_data[i - 1]));
        variance += diff * diff;
    }
    variance /= static_cast<float>(sample_count - 1);
    float variance_threshold = 10000000000.0f / volume_factor;
    if (speaker_active) {
        variance_threshold *= 100.0f;
    }
    const bool stability_check = (variance > variance_threshold);

    // Continuity check: energy and ratio must both keep changing strongly for
    // 10 consecutive frames.
    static float prev_energy = 0.0f;
    static float prev_high_freq_ratio = 0.0f;
    static int consistent_frames = 0;

    // std::fabs is required here: an unqualified abs() may resolve to the C
    // int overload and truncate these float differences.
    const float energy_change = std::fabs(energy - prev_energy) / (prev_energy + 1.0f);
    const float freq_change = std::fabs(high_freq_ratio - prev_high_freq_ratio);

    const bool frame_continuity = (energy_change > 1.2f && freq_change > 0.5f);
    if (frame_continuity) {
        consistent_frames++;
    } else {
        consistent_frames = 0;
    }
    const bool continuity_check = (consistent_frames >= 10);

    prev_energy = energy;
    prev_high_freq_ratio = high_freq_ratio;

    // Playback protection: reject everything for 30 s after the speaker was
    // last audible (volume > 0.01). The timestamp is initialised on the first
    // call, so the first 30 s after that call are protected as well.
    static auto last_volume_update = std::chrono::steady_clock::now();
    const auto now = std::chrono::steady_clock::now();
    if (current_speaker_volume_ > 0.01f) {
        last_volume_update = now;
    }
    const auto time_since_playback =
        std::chrono::duration_cast<std::chrono::milliseconds>(now - last_volume_update);
    const bool recent_playback_protection = (time_since_playback.count() < 30000);

    // Every check must pass.
    const bool final_result = energy_check && peak_check && freq_check && stability_check &&
                              continuity_check && !recent_playback_protection;

    // Log only on acceptance; per-frame evaluation logging is too chatty.
    if (final_result) {
        ESP_LOGI(TAG, "🎯 HUMAN VOICE DETECTED: duration=%.0fms, vol=%.3f, adaptive=%.1f%s",
                 (float)time_since_playback.count(), current_speaker_volume_, adaptive_suppression,
                 adaptive_state_.high_interference_mode ? "[HIGH_INTERFERENCE]" : "");
    }

    return final_result;
}

// Adaptive noise-suppression core.
//
// Updates adaptive_state_ from one chunk of samples: echo strength, estimated
// distance factor, ambient-noise baseline (only while the speaker is nearly
// silent) and finally the dynamic suppression level. Does nothing when
// adaptive suppression is disabled or the chunk is empty.
//
// audio_data    Samples of the current chunk.
// sample_count  Number of samples in audio_data.
void AudioProcessor::UpdateAdaptiveNoiseState(const int16_t* audio_data, size_t sample_count) {
    if (!adaptive_enabled_ || !echo_params_.adaptive_noise_suppression) {
        return;
    }

    // An empty chunk would make the mean below 0/0 = NaN, and the moving
    // average would then keep noise_baseline at NaN from that point on.
    if (audio_data == nullptr || sample_count == 0) {
        return;
    }

    // Current echo strength.
    float echo_strength = CalculateEchoStrength(audio_data, sample_count);
    adaptive_state_.current_echo_strength = echo_strength;

    // Distance factor derived from echo strength and speaker volume.
    adaptive_state_.estimated_distance_factor = EstimateDistanceFactor(echo_strength, current_speaker_volume_);

    // The ambient-noise baseline is only updated while the speaker is (almost)
    // silent, so playback does not leak into it.
    if (current_speaker_volume_ < 0.01f) {
        float current_noise = 0.0f;
        for (size_t i = 0; i < sample_count; i++) {
            current_noise += abs(audio_data[i]);
        }
        current_noise /= sample_count;

        // Exponential moving average: 95 % history, 5 % current chunk.
        adaptive_state_.noise_baseline = adaptive_state_.noise_baseline * 0.95f + current_noise * 0.05f;
    }

    // Recompute the suppression level from the refreshed state.
    AdaptSuppressionLevel();

    adaptive_state_.last_adaptation_time = std::chrono::steady_clock::now();
}

// Produces the echo indicator for one chunk: the chunk's crest factor
// (peak magnitude divided by RMS) scaled by the current speaker volume.
// Returns 0 while the speaker is effectively silent (volume < 0.001), since
// there is then no playback to echo.
//
// The heuristic, per the original author: device echo concentrates its energy
// (high peak-to-RMS ratio), while real voice is spread more evenly.
float AudioProcessor::CalculateEchoStrength(const int16_t* audio_data, size_t sample_count) {
    const bool speaker_silent = current_speaker_volume_ < 0.001f;
    if (speaker_silent) {
        return 0.0f;
    }

    // Accumulate the sum of squares and track the largest magnitude.
    float sum_squares = 0.0f;
    float peak = 0.0f;
    for (size_t idx = 0; idx < sample_count; idx++) {
        const float magnitude = abs(audio_data[idx]);
        sum_squares += magnitude * magnitude;
        peak = std::max(peak, magnitude);
    }
    const float rms = std::sqrt(sum_squares / sample_count);

    // Crest factor; defined as 0 for an all-zero (or empty) chunk.
    float crest_factor = 0.0f;
    if (rms > 0) {
        crest_factor = peak / rms;
    }

    return crest_factor * current_speaker_volume_;
}

// Maps echo strength and speaker volume to a distance factor in [0.1, 1.0].
// Per the original design notes, smaller values mean a closer echo path and
// larger values a more distant one.
//
//   factor = 1 / (1 + (echo_strength / (volume + 0.001)) * volume_sensitivity)
//
// Returns 1.0 while the speaker is effectively silent (volume < 0.001), where
// distance is treated as irrelevant. Per-call debug logging is intentionally
// omitted because it would be too chatty.
float AudioProcessor::EstimateDistanceFactor(float echo_strength, float volume) {
    if (volume < 0.001f) {
        return 1.0f;
    }

    // Normalise the echo by volume; the small offset keeps the divisor away from zero.
    const float echo_per_volume = echo_strength / (volume + 0.001f);
    const float raw_factor = 1.0f / (1.0f + echo_per_volume * echo_params_.volume_sensitivity);

    // Clamp to [0.1, 1.0]: cap at the top first, then floor.
    const float capped = std::min(1.0f, raw_factor);
    return std::max(0.1f, capped);
}

void AudioProcessor::AdaptSuppressionLevel() {
    if (!adaptive_enabled_ || !echo_params_.adaptive_noise_suppression) {
        adaptive_state_.dynamic_suppression_level = 1.0f;
        return;
    }

    // 🎯 自适应抑制级别计算
    // 基础抑制级别
    float base_level = echo_params_.noise_suppression_base;

    // 🔊 音量影响：音量越大，抑制越强
    float volume_multiplier = 1.0f + current_speaker_volume_ * echo_params_.volume_sensitivity;

    // 📏 距离影响：距离越近，抑制越强
    float distance_multiplier = 1.0f / (adaptive_state_.estimated_distance_factor + 0.1f);

    // 🌊 回声强度影响：回声越强，抑制越强
    float echo_multiplier = 1.0f + adaptive_state_.current_echo_strength * 2.0f;

    // 🎯 综合计算动态抑制级别
    adaptive_state_.dynamic_suppression_level = base_level * volume_multiplier * distance_multiplier * echo_multiplier;

    // 📊 高干扰模式判断
    bool was_high_interference = adaptive_state_.high_interference_mode;
    adaptive_state_.high_interference_mode = (
        current_speaker_volume_ > 0.3f &&  // 高音量
        adaptive_state_.estimated_distance_factor < 0.5f &&  // 近距离
        adaptive_state_.current_echo_strength > echo_params_.echo_detection_threshold  // 强回声
    );

    // 🚨 高干扰模式额外保护
    if (adaptive_state_.high_interference_mode) {
        adaptive_state_.dynamic_suppression_level *= 5.0f; // 高干扰时5倍抑制

        if (!was_high_interference) {
            ESP_LOGW(TAG, "🔴 Entering HIGH INTERFERENCE mode - vol=%.2f, dist=%.2f, echo=%.3f",
                     current_speaker_volume_, adaptive_state_.estimated_distance_factor,
                     adaptive_state_.current_echo_strength);
        }
    } else if (was_high_interference) {
        ESP_LOGI(TAG, "🟢 Exiting high interference mode - returning to adaptive suppression");
    }

    // 📏 限制抑制级别范围 [1.0, 100.0]
    adaptive_state_.dynamic_suppression_level = std::max(1.0f, std::min(100.0f, adaptive_state_.dynamic_suppression_level));

    // 🔕 注释掉过于频繁的自适应抑制日志
    // ESP_LOGD(TAG, "🎯 Adaptive suppression: vol=%.2f, dist=%.2f, echo=%.3f → level=%.1f %s",
    //          current_speaker_volume_, adaptive_state_.estimated_distance_factor,
    //          adaptive_state_.current_echo_strength, adaptive_state_.dynamic_suppression_level,
    //          adaptive_state_.high_interference_mode ? "[HIGH_INTERFERENCE]" : "");
}

// Returns a copy of the current adaptive noise-suppression state.
AdaptiveNoiseState AudioProcessor::GetAdaptiveState() const {
    return adaptive_state_;
}