feat(rtc): 偶发连接失败完整修复 (A+B+C 三件套)
实测根因 (DIAG 埋点确认): 火山 RTC SDK 启动时一次性申请大量 lwIP socket fd,
默认 CONFIG_LWIP_MAX_SOCKETS=10 不够 SDK 分配, 触发 SocketConnection-Lite.c:191
bind local ip failed → ICE 协商失败 → wait connect bits=0x0 超时.
实测对比:
修复前: 冷启动 RTC join 30+ 秒超时 × 3 次失败
修复后: 冷启动 RTC join 1.6 秒成功, 软退出 + 唤醒重连 2.3 秒成功 ✅
修复内容:
[A] sdkconfig: CONFIG_LWIP_MAX_SOCKETS=10 → 20
根治 lwIP socket fd 不足. 16 是临界值, 20 留 25% 余量应对 burst 场景
(HTTP 重试 / DNS 查询 / NTP 同步并发). 代价: +6 fd × ~200B = 1.2 KB RAM (忽略).
[B] application.h/cc + volc_rtc_protocol.h/cc: 失败 3 次后销毁 + 重建 engine
新增 VolcRtcProtocol::ForceRebuildEngine() public 方法.
OpenAudioChannel 连续失败 3 次时调用 (application.cc:566-573):
- 销毁 rtc_handle_ + reset SDK 内部状态污染
- 等待 2 秒让 lwIP 释放残留 socket fd (TIME_WAIT)
- 触发 Phase 6 重建路径 (rtc_handle_=nullptr → Start())
应对 A 修复后仍可能出现的 SDK 内部状态错乱 (e.g. ICE Agent 异常).
本次实测未触发 (A 已解决主要问题), 但保留作为兜底防御.
[C] volc_rtc_protocol.cc: DIAG_RTC_BIND_ENABLE 一键开关诊断埋点
在 join_room 前/后 + ForceRebuildEngine 前/后打印:
- lwIP socket fd 使用量 (sockets=N/MAX)
- heap free + psram free
- WiFi rssi
- 失败时的 errno + strerror
验证完成后改 0 关闭, 编译器消除 #if 块, 零运行时开销.
文件改动:
sdkconfig | LWIP_MAX_SOCKETS 10→20
main/application.h | +audio_channel_retry_count_
main/application.cc | +重试计数 + static_cast → ForceRebuildEngine 调用
main/protocols/volc_rtc_protocol.h | +ForceRebuildEngine() 声明
main/protocols/volc_rtc_protocol.cc | +DIAG 埋点 + diag_count_used_sockets() + ForceRebuildEngine()
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
3e709577f5
commit
70f0cdd07a
@ -555,10 +555,25 @@ void Application::ToggleChatState() {
|
|||||||
Board::GetInstance().SetPowerSaveMode(false);// 关闭低功耗模式
|
Board::GetInstance().SetPowerSaveMode(false);// 关闭低功耗模式
|
||||||
if (!protocol_->OpenAudioChannel()) {
|
if (!protocol_->OpenAudioChannel()) {
|
||||||
auto ac = Board::GetInstance().GetAudioCodec();
|
auto ac = Board::GetInstance().GetAudioCodec();
|
||||||
ESP_LOGW(TAG, "打开音频通道失败,将在2秒后重试");
|
audio_channel_retry_count_++;
|
||||||
|
ESP_LOGW(TAG, "打开音频通道失败 (第 %d 次), 将在2秒后重试", audio_channel_retry_count_);
|
||||||
if (ac) {
|
if (ac) {
|
||||||
ESP_LOGW(TAG, "Diag: codec out_channels=%d in_channels=%d out_sr=%d in_sr=%d", ac->output_channels(), ac->input_channels(), ac->output_sample_rate(), ac->input_sample_rate());
|
ESP_LOGW(TAG, "Diag: codec out_channels=%d in_channels=%d out_sr=%d in_sr=%d", ac->output_channels(), ac->input_channels(), ac->output_sample_rate(), ac->input_sample_rate());
|
||||||
}
|
}
|
||||||
|
// 方案 B: 连续失败 3 次后销毁 + 重建 RTC engine
|
||||||
|
// 原因: SDK 内部状态污染 (lwIP socket fd 残留 / 内部缓存错乱) 单纯重试无效,
|
||||||
|
// 必须重建 engine 清理. 触发 Phase 6 的 rtc_handle_=nullptr → Start() 重建路径
|
||||||
|
if (audio_channel_retry_count_ >= 3) {
|
||||||
|
ESP_LOGW(TAG, "🔄 连续失败 3 次, 触发 RTC engine 重建 (清理 SDK 状态)");
|
||||||
|
// protocol_ 是基类 unique_ptr<Protocol>, 需 dynamic_cast 到 VolcRtcProtocol
|
||||||
|
// ESP-IDF 默认 -fno-rtti, 不能用 dynamic_cast.
|
||||||
|
// protocol_ 在 Init 时只赋值为 VolcRtcProtocol (line 932), 用 static_cast 安全
|
||||||
|
auto* volc_rtc = static_cast<VolcRtcProtocol*>(protocol_.get());
|
||||||
|
if (volc_rtc) {
|
||||||
|
volc_rtc->ForceRebuildEngine();
|
||||||
|
}
|
||||||
|
audio_channel_retry_count_ = 0; // 重置计数, 重建后从 0 开始计
|
||||||
|
}
|
||||||
SetDeviceState(kDeviceStateIdle);
|
SetDeviceState(kDeviceStateIdle);
|
||||||
Schedule([this]() {
|
Schedule([this]() {
|
||||||
vTaskDelay(pdMS_TO_TICKS(2000));
|
vTaskDelay(pdMS_TO_TICKS(2000));
|
||||||
@ -567,6 +582,8 @@ void Application::ToggleChatState() {
|
|||||||
});
|
});
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
// 连接成功重置重试计数
|
||||||
|
audio_channel_retry_count_ = 0;
|
||||||
|
|
||||||
listening_mode_ = kListeningModeRealtime;// 设置监听模式为实时监听
|
listening_mode_ = kListeningModeRealtime;// 设置监听模式为实时监听
|
||||||
SetDeviceState(kDeviceStateDialog);// 设置设备状态为对话模式
|
SetDeviceState(kDeviceStateDialog);// 设置设备状态为对话模式
|
||||||
|
|||||||
@ -167,6 +167,7 @@ private:
|
|||||||
std::atomic<bool> https_playback_active_{false};// HTTPS音频播放进行中标志
|
std::atomic<bool> https_playback_active_{false};// HTTPS音频播放进行中标志
|
||||||
std::atomic<bool> https_playback_abort_{false};// HTTPS音频播放中止标志
|
std::atomic<bool> https_playback_abort_{false};// HTTPS音频播放中止标志
|
||||||
std::atomic<int> post_abort_debug_frames_{0};// HTTPS中止后诊断日志计数(追踪前N帧音频)
|
std::atomic<int> post_abort_debug_frames_{0};// HTTPS中止后诊断日志计数(追踪前N帧音频)
|
||||||
|
int audio_channel_retry_count_ = 0;// RTC 偶发连接失败重试计数 (方案 B: 失败 3 次后销毁 + 重建 engine)
|
||||||
bool aborted_ = false;
|
bool aborted_ = false;
|
||||||
bool voice_detected_ = false;
|
bool voice_detected_ = false;
|
||||||
bool audio_paused_ = false; // 音频暂停状态标志
|
bool audio_paused_ = false; // 音频暂停状态标志
|
||||||
|
|||||||
@ -21,6 +21,29 @@
|
|||||||
|
|
||||||
static const char* TAG = "VolcRtcProtocol";
|
static const char* TAG = "VolcRtcProtocol";
|
||||||
|
|
||||||
|
// ============================================================
|
||||||
|
// 方案 C: RTC bind 失败诊断埋点 (一键关闭, 零运行时开销)
|
||||||
|
// 验证完成后改 0 关闭, 编译器消除 #if 块, 不占 Flash/CPU
|
||||||
|
// 排查 "Cache.c:273 status=0x9 + SocketConnection-Lite.c:191 bind failed" 偶发问题
|
||||||
|
// ============================================================
|
||||||
|
#ifndef DIAG_RTC_BIND_ENABLE
|
||||||
|
#define DIAG_RTC_BIND_ENABLE 1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if DIAG_RTC_BIND_ENABLE
|
||||||
|
#include "esp_wifi.h"
|
||||||
|
#include "lwip/sockets.h" // LWIP_SOCKET_OFFSET
|
||||||
|
// 统计当前 lwIP socket fd 使用量 (在 LWIP_SOCKET_OFFSET 偏移之上扫描)
|
||||||
|
static int diag_count_used_sockets(void) {
|
||||||
|
int used = 0;
|
||||||
|
for (int fd = LWIP_SOCKET_OFFSET; fd < LWIP_SOCKET_OFFSET + CONFIG_LWIP_MAX_SOCKETS; fd++) {
|
||||||
|
struct stat st;
|
||||||
|
if (fstat(fd, &st) == 0) used++;
|
||||||
|
}
|
||||||
|
return used;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
VolcRtcProtocol::VolcRtcProtocol() {
|
VolcRtcProtocol::VolcRtcProtocol() {
|
||||||
event_group_handle_ = xEventGroupCreate();
|
event_group_handle_ = xEventGroupCreate();
|
||||||
}
|
}
|
||||||
@ -364,6 +387,18 @@ bool VolcRtcProtocol::OpenAudioChannel() {
|
|||||||
xEventGroupClearBits(event_group_handle_, 0x1 | 0x2);
|
xEventGroupClearBits(event_group_handle_, 0x1 | 0x2);
|
||||||
// 新增:extra_params 用于传递额外的AgentConfig配置参数
|
// 新增:extra_params 用于传递额外的AgentConfig配置参数
|
||||||
ESP_LOGI(TAG, "Join RTC: handle=%p bot=%s iot_ready=%d free_heap=%u", rtc_handle_, CONFIG_VOLC_BOT_ID, (int)iot_ready_, (unsigned)heap_caps_get_free_size(MALLOC_CAP_DEFAULT));
|
ESP_LOGI(TAG, "Join RTC: handle=%p bot=%s iot_ready=%d free_heap=%u", rtc_handle_, CONFIG_VOLC_BOT_ID, (int)iot_ready_, (unsigned)heap_caps_get_free_size(MALLOC_CAP_DEFAULT));
|
||||||
|
#if DIAG_RTC_BIND_ENABLE
|
||||||
|
{
|
||||||
|
int sockets_used = diag_count_used_sockets();
|
||||||
|
wifi_ap_record_t ap_info = {};
|
||||||
|
int rssi = (esp_wifi_sta_get_ap_info(&ap_info) == ESP_OK) ? ap_info.rssi : -127;
|
||||||
|
ESP_LOGW("DIAG-RTC", "Pre-Join: sockets=%d/%d heap=%u psram=%u rssi=%d",
|
||||||
|
sockets_used, CONFIG_LWIP_MAX_SOCKETS,
|
||||||
|
(unsigned)heap_caps_get_free_size(MALLOC_CAP_DEFAULT),
|
||||||
|
(unsigned)heap_caps_get_free_size(MALLOC_CAP_SPIRAM),
|
||||||
|
rssi);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
int ret = volc_rtc_start(rtc_handle_, CONFIG_VOLC_BOT_ID, &iot_info_, extra_params_.empty() ? NULL : extra_params_.c_str());
|
int ret = volc_rtc_start(rtc_handle_, CONFIG_VOLC_BOT_ID, &iot_info_, extra_params_.empty() ? NULL : extra_params_.c_str());
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
ESP_LOGE(TAG, "RTC启动失败:%d", ret);// RTC启动失败:%d
|
ESP_LOGE(TAG, "RTC启动失败:%d", ret);// RTC启动失败:%d
|
||||||
@ -375,6 +410,16 @@ bool VolcRtcProtocol::OpenAudioChannel() {
|
|||||||
if ((bits & 0x1) == 0) {
|
if ((bits & 0x1) == 0) {
|
||||||
ESP_LOGE(TAG, "RTC连接超时");// RTC连接超时
|
ESP_LOGE(TAG, "RTC连接超时");// RTC连接超时
|
||||||
ESP_LOGW(TAG, "Diag: check Wi-Fi, SNTP time sync, IoT creds, RTC server availability");// 诊断:检查Wi-Fi、SNTP时间同步、IoT凭证、RTC服务器可用性
|
ESP_LOGW(TAG, "Diag: check Wi-Fi, SNTP time sync, IoT creds, RTC server availability");// 诊断:检查Wi-Fi、SNTP时间同步、IoT凭证、RTC服务器可用性
|
||||||
|
#if DIAG_RTC_BIND_ENABLE
|
||||||
|
{
|
||||||
|
int sockets_used = diag_count_used_sockets();
|
||||||
|
ESP_LOGW("DIAG-RTC", "Post-Fail: sockets=%d/%d heap=%u psram=%u errno=%d(%s)",
|
||||||
|
sockets_used, CONFIG_LWIP_MAX_SOCKETS,
|
||||||
|
(unsigned)heap_caps_get_free_size(MALLOC_CAP_DEFAULT),
|
||||||
|
(unsigned)heap_caps_get_free_size(MALLOC_CAP_SPIRAM),
|
||||||
|
errno, strerror(errno));
|
||||||
|
}
|
||||||
|
#endif
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Do not block audio readiness on remote user join; enable subscribe immediately
|
// Do not block audio readiness on remote user join; enable subscribe immediately
|
||||||
@ -452,6 +497,38 @@ void VolcRtcProtocol::LeaveRoom(bool notify_closed) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 方案 B: 强制销毁并重建 RTC engine
|
||||||
|
// 用途: OpenAudioChannel 连续失败 N 次后调用, 清理 SDK 内部错乱状态
|
||||||
|
// 实现: 销毁 rtc_handle_ + 触发 Phase 6 重建路径
|
||||||
|
// 下次 OpenAudioChannel 看到 rtc_handle_=nullptr → Start() 异步重建
|
||||||
|
void VolcRtcProtocol::ForceRebuildEngine() {
|
||||||
|
ESP_LOGW(TAG, "🔄 ForceRebuildEngine: 销毁 RTC engine 以清理 SDK 状态");
|
||||||
|
#if DIAG_RTC_BIND_ENABLE
|
||||||
|
ESP_LOGW("DIAG-RTC", "Pre-Rebuild: sockets=%d/%d heap=%u",
|
||||||
|
diag_count_used_sockets(), CONFIG_LWIP_MAX_SOCKETS,
|
||||||
|
(unsigned)heap_caps_get_free_size(MALLOC_CAP_DEFAULT));
|
||||||
|
#endif
|
||||||
|
if (rtc_handle_) {
|
||||||
|
if (is_connected_) {
|
||||||
|
volc_rtc_stop(rtc_handle_);
|
||||||
|
is_connected_ = false;
|
||||||
|
}
|
||||||
|
volc_rtc_destroy(rtc_handle_);
|
||||||
|
rtc_handle_ = nullptr;
|
||||||
|
}
|
||||||
|
is_audio_channel_opened_ = false;
|
||||||
|
downlink_is_pcm_ = false;
|
||||||
|
first_downlink_logged_ = false;
|
||||||
|
// 等 2 秒让 lwIP 释放残留 socket fd (TIME_WAIT 状态)
|
||||||
|
vTaskDelay(pdMS_TO_TICKS(2000));
|
||||||
|
#if DIAG_RTC_BIND_ENABLE
|
||||||
|
ESP_LOGW("DIAG-RTC", "Post-Rebuild-Wait: sockets=%d/%d heap=%u",
|
||||||
|
diag_count_used_sockets(), CONFIG_LWIP_MAX_SOCKETS,
|
||||||
|
(unsigned)heap_caps_get_free_size(MALLOC_CAP_DEFAULT));
|
||||||
|
#endif
|
||||||
|
ESP_LOGI(TAG, "🔄 engine 已销毁, 下次 OpenAudioChannel 触发 Phase 6 重建");
|
||||||
|
}
|
||||||
|
|
||||||
// 🔊 检查音频通道是否已打开
|
// 🔊 检查音频通道是否已打开
|
||||||
bool VolcRtcProtocol::IsAudioChannelOpened() const {
|
bool VolcRtcProtocol::IsAudioChannelOpened() const {
|
||||||
return is_audio_channel_opened_;
|
return is_audio_channel_opened_;
|
||||||
|
|||||||
@ -26,6 +26,11 @@ public:
|
|||||||
// 与 CloseAudioChannel 区别:CloseAudioChannel 只停媒体流,房间仍占用
|
// 与 CloseAudioChannel 区别:CloseAudioChannel 只停媒体流,房间仍占用
|
||||||
void LeaveRoom(bool notify_closed = true) override;
|
void LeaveRoom(bool notify_closed = true) override;
|
||||||
|
|
||||||
|
// 方案 B: 强制销毁并重建 RTC engine. 当 OpenAudioChannel 连续失败 N 次时调用,
|
||||||
|
// 清理 SDK 内部错乱状态 (如 lwIP socket fd 残留 / 内部缓存污染),
|
||||||
|
// 触发 Phase 6 的 rtc_handle_=nullptr → Start() 重建路径
|
||||||
|
void ForceRebuildEngine();
|
||||||
|
|
||||||
bool IsAudioChannelOpened() const override;// 🔊 检查音频通道是否已打开
|
bool IsAudioChannelOpened() const override;// 🔊 检查音频通道是否已打开
|
||||||
void SendAbortSpeaking(AbortReason reason) override;// 🔊 发送中止通话请求
|
void SendAbortSpeaking(AbortReason reason) override;// 🔊 发送中止通话请求
|
||||||
void SendStartListening(ListeningMode mode) override;// 🔊 发送开始监听请求
|
void SendStartListening(ListeningMode mode) override;// 🔊 发送开始监听请求
|
||||||
|
|||||||
@ -2106,7 +2106,7 @@ CONFIG_LWIP_DNS_SUPPORT_MDNS_QUERIES=y
|
|||||||
CONFIG_LWIP_TIMERS_ONDEMAND=y
|
CONFIG_LWIP_TIMERS_ONDEMAND=y
|
||||||
CONFIG_LWIP_ND6=y
|
CONFIG_LWIP_ND6=y
|
||||||
# CONFIG_LWIP_FORCE_ROUTER_FORWARDING is not set
|
# CONFIG_LWIP_FORCE_ROUTER_FORWARDING is not set
|
||||||
CONFIG_LWIP_MAX_SOCKETS=10
|
CONFIG_LWIP_MAX_SOCKETS=20
|
||||||
# CONFIG_LWIP_USE_ONLY_LWIP_SELECT is not set
|
# CONFIG_LWIP_USE_ONLY_LWIP_SELECT is not set
|
||||||
# CONFIG_LWIP_SO_LINGER is not set
|
# CONFIG_LWIP_SO_LINGER is not set
|
||||||
CONFIG_LWIP_SO_REUSE=y
|
CONFIG_LWIP_SO_REUSE=y
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user