核心变更: - face_tracker.cc: YUYV→YVYU 序列修正(byte[1]=V, byte[3]=U), 基于 JPEG Dump 诊断工具验证 OV3660 FORMAT_CTRL00=0x61 实际是 YVYU - face_tracker.cc: 启动时 base64 打印一帧 JPEG 到串口,用于肉眼验证 - config.h: XCLK 20MHz→10MHz,给飞线信号完整性 2x 裕度 - scripts/auto_capture_jpeg.py: 自动串口抓帧工具(DTR/RTS 复位 + base64 解码) - scripts/extract_jpeg_from_log.py: 从日志离线提取 JPEG - Coglet项目分析与开发指南.md: 新增"六点六"章节,汇总 Phase 01 主要矛盾(画面可辨识≠模型可识别)、YUV→RGB 色偏三层原因、 esp-dl 模型输入分布敏感性、延迟分析、三方案对比、方案 B 突破口 - docs/: 新增 2 篇 OV3660 相关 CSDN 参考资料 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
323 lines
14 KiB
C++
323 lines
14 KiB
C++
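// [T05/T06] Face-tracking task.
// The full implementation is compiled only for ESP32-S3 with CONFIG_XIAOZHI_ENABLE_FACE_TRACKING=y.
// Every other configuration compiles three empty functions so that linking still succeeds.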
|
||
|
||
#include "face_tracker.h"
|
||
#include "sdkconfig.h"
|
||
|
||
#if defined(CONFIG_XIAOZHI_ENABLE_FACE_TRACKING) && defined(CONFIG_IDF_TARGET_ESP32S3)
|
||
|
||
#include "human_face_detect.hpp"
|
||
#include "dl_image_define.hpp"
|
||
#include "dl_detect_define.hpp"
|
||
#include "board.h"
|
||
#include "esp32_camera.h"
|
||
#include "display/lvgl_display/jpg/image_to_jpeg.h"
|
||
#include <linux/videodev2.h>
|
||
|
||
#include <esp_heap_caps.h>
|
||
#include <esp_log.h>
|
||
#include <esp_timer.h>
|
||
#include <freertos/FreeRTOS.h>
|
||
#include <freertos/task.h>
|
||
#include <list>
|
||
#include <new>
|
||
#include <cstring>
|
||
|
||
static const char* TAG = "FaceTracker";
|
||
static TaskHandle_t s_handle = nullptr;
|
||
static volatile bool s_stop = false;
|
||
static float s_last_fps = 0.0f;
|
||
|
||
// T06: uart_send_face 由 T07 在 uart_component.{h,cc} 中提供
|
||
// 此处用前向声明 + 弱符号,让 T07 完成前 face_tracker.cc 仍能通过编译
|
||
// T07 完成后该弱符号被真实实现覆盖,无需改动本文件
|
||
extern "C" __attribute__((weak)) void uart_send_face(int x_offset, int y_offset);
|
||
|
||
// YVYU → RGB888 手动转换(OV3660 FORMAT_CTRL00=0x61 实际输出 Y V Y U 序列)
|
||
// 每 4 字节 YVYU 生成 2 像素 6 字节 RGB888
|
||
// 公式(BT.601 JFIF):R = Y + 1.402*(V-128); G = Y - 0.344*(U-128) - 0.714*(V-128); B = Y + 1.772*(U-128)
|
||
// [2026-04-21 修正] 之前按 YUYV (Y U Y V) 读取导致色彩偏绿紫,JPEG dump 测试证实
|
||
// sensor 实际是 YVYU sequence,byte[1]=V, byte[3]=U(顺序反了)
|
||
static inline void yuyv_to_rgb888_line(const uint8_t* yuyv, uint8_t* rgb, int pixels) {
|
||
for (int i = 0; i < pixels; i += 2) {
|
||
int y1 = yuyv[0];
|
||
int v = yuyv[1] - 128; // 修正:byte[1] = V(原本误当 U)
|
||
int y2 = yuyv[2];
|
||
int u = yuyv[3] - 128; // 修正:byte[3] = U(原本误当 V)
|
||
yuyv += 4;
|
||
// 像素 1
|
||
int r1 = y1 + (359 * v) / 256;
|
||
int g1 = y1 - (88 * u + 183 * v) / 256;
|
||
int b1 = y1 + (454 * u) / 256;
|
||
// 像素 2
|
||
int r2 = y2 + (359 * v) / 256;
|
||
int g2 = y2 - (88 * u + 183 * v) / 256;
|
||
int b2 = y2 + (454 * u) / 256;
|
||
*rgb++ = (uint8_t)(r1 < 0 ? 0 : r1 > 255 ? 255 : r1);
|
||
*rgb++ = (uint8_t)(g1 < 0 ? 0 : g1 > 255 ? 255 : g1);
|
||
*rgb++ = (uint8_t)(b1 < 0 ? 0 : b1 > 255 ? 255 : b1);
|
||
*rgb++ = (uint8_t)(r2 < 0 ? 0 : r2 > 255 ? 255 : r2);
|
||
*rgb++ = (uint8_t)(g2 < 0 ? 0 : g2 > 255 ? 255 : g2);
|
||
*rgb++ = (uint8_t)(b2 < 0 ? 0 : b2 > 255 ? 255 : b2);
|
||
}
|
||
}
|
||
|
||
static void face_tracker_task(void* arg) {
|
||
(void)arg;
|
||
// 等待摄像头 ISP 预热 + 视频流启动稳定
|
||
vTaskDelay(pdMS_TO_TICKS(500));
|
||
|
||
ESP_LOGI(TAG, "face_tracker task started on core %d", xPortGetCoreID());
|
||
|
||
// [2026-04-20 重大修复] 分配 PSRAM RGB888 缓冲区,手动 YUYV→RGB888 转换
|
||
// 绕过 esp-dl ImagePreprocessor 的 YUYV 路径(疑似产生固定激活 bug)
|
||
// 240*240*3 = 172800 字节,PSRAM 8MB 完全够
|
||
constexpr size_t RGB_SIZE = 240 * 240 * 3;
|
||
uint8_t* rgb_buf = (uint8_t*)heap_caps_malloc(RGB_SIZE, MALLOC_CAP_SPIRAM);
|
||
if (!rgb_buf) {
|
||
ESP_LOGE(TAG, "分配 RGB888 缓冲失败");
|
||
vTaskDelete(NULL);
|
||
return;
|
||
}
|
||
ESP_LOGI(TAG, "RGB888 转换缓冲已分配 %u bytes", (unsigned)RGB_SIZE);
|
||
|
||
// 构造检测器:默认 model_type 由 CONFIG_DEFAULT_HUMAN_FACE_DETECT_MODEL 决定
|
||
// lazy_load=true(默认)以减少启动期内存瞬时占用
|
||
auto* detector = new(std::nothrow) HumanFaceDetect();
|
||
if (!detector) {
|
||
ESP_LOGE(TAG, "HumanFaceDetect 构造失败(PSRAM 不足?)");
|
||
multi_heap_info_t info;
|
||
heap_caps_get_info(&info, MALLOC_CAP_SPIRAM);
|
||
ESP_LOGE(TAG, "PSRAM free=%u total_allocated=%u",
|
||
(unsigned)info.total_free_bytes,
|
||
(unsigned)info.total_allocated_bytes);
|
||
s_handle = nullptr;
|
||
vTaskDelete(NULL);
|
||
return;
|
||
}
|
||
|
||
// 一次性打印启动时 PSRAM 占用供诊断(RESEARCH R2 风险跟踪)
|
||
{
|
||
multi_heap_info_t info;
|
||
heap_caps_get_info(&info, MALLOC_CAP_SPIRAM);
|
||
ESP_LOGI(TAG, "PSRAM after detector init: free=%u allocated=%u",
|
||
(unsigned)info.total_free_bytes,
|
||
(unsigned)info.total_allocated_bytes);
|
||
}
|
||
|
||
// [2026-04-21 诊断结论] 多格式 JPEG dump 测试确认:sensor 实际输出 YUYV packed 格式
|
||
// - frame_YUYV.jpg 画面清晰(能看到戴眼镜人脸 + 背景),只是色彩偏绿紫
|
||
// - frame_RGB565.jpg / UYVY / YUV422P 全是彩色马赛克
|
||
// - 色偏原因:FORMAT_CTRL00=0x61 的 bit[3:0]=1 在 YUV 模式下是 YVYU sequence
|
||
// (实际字节序 Y V Y U,不是标准 YUYV 的 Y U Y V)
|
||
// → yuyv_to_rgb888_line 要按 YVYU 读取:byte[1]=V, byte[3]=U
|
||
// 保留 JPEG dump 用于拍照验证(先确认摄像头正常再跑人脸识别)
|
||
// [2026-04-22] sensor 切到硬件 JPEG 模式(CONFIG_CAMERA_OV3660_DVP_JPEG_1280X720_12FPS)
|
||
// sensor 内部已做完 YUV→RGB→JPEG 全流程色彩处理,输出标准 JPEG 字节流
|
||
// 我们不再需要 image_to_jpeg 二次编码,直接把 f.data 透传即可
|
||
{
|
||
vTaskDelay(pdMS_TO_TICKS(2000)); // JPEG 模式分辨率 1280x720,sensor 需要更长曝光稳定时间
|
||
auto* cam = dynamic_cast<Esp32Camera*>(Board::GetInstance().GetCamera());
|
||
Esp32Camera::FrameRef f;
|
||
if (cam && cam->CaptureForDetection(&f) && f.data && f.len > 0) {
|
||
const uint8_t* jpg = (const uint8_t*)f.data;
|
||
size_t jpg_len = f.len;
|
||
ESP_LOGI(TAG, "===JPEG_DUMP_BEGIN fmt=SENSOR_JPEG size=%u w=%u h=%u===",
|
||
(unsigned)jpg_len, f.width, f.height);
|
||
static const char b64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
||
char line[128]; size_t lp = 0;
|
||
for (size_t i = 0; i < jpg_len; i += 3) {
|
||
uint32_t v = ((uint32_t)jpg[i] << 16);
|
||
if (i + 1 < jpg_len) v |= ((uint32_t)jpg[i+1] << 8);
|
||
if (i + 2 < jpg_len) v |= jpg[i+2];
|
||
line[lp++] = b64[(v >> 18) & 0x3F];
|
||
line[lp++] = b64[(v >> 12) & 0x3F];
|
||
line[lp++] = (i + 1 < jpg_len) ? b64[(v >> 6) & 0x3F] : '=';
|
||
line[lp++] = (i + 2 < jpg_len) ? b64[v & 0x3F] : '=';
|
||
if (lp >= 72) { line[lp] = 0; printf("%s\n", line); lp = 0; }
|
||
}
|
||
if (lp > 0) { line[lp] = 0; printf("%s\n", line); }
|
||
ESP_LOGI(TAG, "===JPEG_DUMP_END===");
|
||
cam->ReleaseDetectionFrame(f);
|
||
}
|
||
}
|
||
|
||
// 按 Kconfig 配置的 FPS 计算节拍
|
||
const TickType_t period = pdMS_TO_TICKS(1000 / CONFIG_XIAOZHI_FACE_TRACKING_FPS);
|
||
TickType_t last_wake = xTaskGetTickCount();
|
||
int hit = 0, miss = 0;
|
||
int64_t last_report_us = esp_timer_get_time();
|
||
// 实时日志限频:每秒最多 1 条(INFO 级别便于排查)
|
||
int64_t last_detail_log_us = 0;
|
||
int miss_streak = 0; // 连续 miss 计数
|
||
|
||
while (!s_stop) {
|
||
vTaskDelayUntil(&last_wake, period);
|
||
|
||
auto* cam = dynamic_cast<Esp32Camera*>(Board::GetInstance().GetCamera());
|
||
if (!cam) {
|
||
continue;
|
||
}
|
||
|
||
Esp32Camera::FrameRef f;
|
||
if (!cam->CaptureForDetection(&f)) {
|
||
// [T04 策略] 拿不到 mutex(MCP 拍照中)或 DQBUF 失败 → 正常跳帧
|
||
continue;
|
||
}
|
||
|
||
// [Bug 1 诊断] 首次进入循环时,打印前 32 字节 + 中心像素 + 统计,判断数据性质
|
||
// 全零 → 摄像头无数据;规律 → 字节序/格式问题;随机 → 正常但模型看不懂
|
||
static bool debug_dumped = false;
|
||
if (!debug_dumped && f.data && f.len >= 32) {
|
||
debug_dumped = true;
|
||
const uint8_t* d = (const uint8_t*)f.data;
|
||
ESP_LOGI(TAG, "frame debug: size=%u w=%u h=%u len=%u",
|
||
(unsigned)f.width * f.height * 2, f.width, f.height, (unsigned)f.len);
|
||
// 打印左上角 16 字节 + 中心附近 16 字节
|
||
size_t center = (f.width * (f.height / 2) + f.width / 2) * 2;
|
||
if (center + 16 <= f.len) {
|
||
ESP_LOGI(TAG, "top-left 16B: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x",
|
||
d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9],d[10],d[11],d[12],d[13],d[14],d[15]);
|
||
ESP_LOGI(TAG, "center 16B: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x",
|
||
d[center],d[center+1],d[center+2],d[center+3],d[center+4],d[center+5],d[center+6],d[center+7],
|
||
d[center+8],d[center+9],d[center+10],d[center+11],d[center+12],d[center+13],d[center+14],d[center+15]);
|
||
}
|
||
// 统计:零字节比例(判断摄像头是否真有数据)
|
||
size_t zero_cnt = 0;
|
||
for (size_t i = 0; i < f.len; i++) if (d[i] == 0) zero_cnt++;
|
||
ESP_LOGI(TAG, "zero bytes: %u / %u (%.1f%%)",
|
||
(unsigned)zero_cnt, (unsigned)f.len, 100.0f * zero_cnt / f.len);
|
||
}
|
||
|
||
// [2026-04-20 重大修复] 手动 YUYV → RGB888 转换,绕过 esp-dl 预处理黑盒
|
||
// 以前:img.pix_type = YUYV,让 ImagePreprocessor 内部做 YUV→RGB,但它产生固定激活
|
||
// 现在:先转成 RGB888 喂给模型,pix_type 标 RGB888,消除预处理不确定性
|
||
{
|
||
const uint8_t* src = (const uint8_t*)f.data;
|
||
uint8_t* dst = rgb_buf;
|
||
for (uint16_t row = 0; row < f.height; row++) {
|
||
yuyv_to_rgb888_line(src, dst, f.width);
|
||
src += f.width * 2; // YUYV 每像素 2 字节
|
||
dst += f.width * 3; // RGB888 每像素 3 字节
|
||
}
|
||
}
|
||
|
||
dl::image::img_t img{};
|
||
img.data = (void*)rgb_buf;
|
||
img.width = f.width;
|
||
img.height = f.height;
|
||
img.pix_type = dl::image::DL_IMAGE_PIX_TYPE_RGB888;
|
||
|
||
int64_t t0 = esp_timer_get_time();
|
||
auto& results = detector->run(img);
|
||
int64_t t1 = esp_timer_get_time();
|
||
|
||
// 立即归还 V4L2 缓冲,避免 face_track 占用时间长
|
||
cam->ReleaseDetectionFrame(f);
|
||
|
||
int64_t now_us = esp_timer_get_time();
|
||
if (results.empty()) {
|
||
miss++;
|
||
miss_streak++;
|
||
// 连续 3 秒无人脸时提示一次(按默认 FPS=10 折算 ~30 帧)
|
||
if (miss_streak == CONFIG_XIAOZHI_FACE_TRACKING_FPS * 3) {
|
||
ESP_LOGI(TAG, "no face detected in last 3s");
|
||
}
|
||
} else {
|
||
hit++;
|
||
miss_streak = 0;
|
||
// PLAN 未明确排序策略,esp-dl 内部 nms 后 list 顺序不稳定
|
||
// 为健壮性,挑 score 最高的那个(避免多脸时摇摆)
|
||
const dl::detect::result_t* best = nullptr;
|
||
for (const auto& r : results) {
|
||
if (best == nullptr || r.score > best->score) {
|
||
best = &r;
|
||
}
|
||
}
|
||
// box: [left_up_x, left_up_y, right_down_x, right_down_y]
|
||
int cx = (best->box[0] + best->box[2]) / 2;
|
||
int cy = (best->box[1] + best->box[3]) / 2;
|
||
// 坐标映射(RESEARCH Pitfall 7):严格保持 cx * 224 / width - 112
|
||
// 对齐 RP2040 端 deadzone=20 / x_adj_factor=10 的基准
|
||
int x_offset = (f.width > 0) ? (cx * 224 / f.width - 112) : 0;
|
||
int y_offset = (f.height > 0) ? (cy * 224 / f.height - 112) : 0;
|
||
|
||
// T07 完成后,uart_send_face 弱符号会被真实实现覆盖
|
||
if (uart_send_face != nullptr) {
|
||
uart_send_face(x_offset, y_offset);
|
||
}
|
||
// INFO 级别实时日志,限频每秒 1 条避免刷屏
|
||
// 修复:%lld 在 nano newlib 下输出异常,改为 %lu + uint32(infer<2s 安全)
|
||
if (now_us - last_detail_log_us > 1000000LL) {
|
||
ESP_LOGI(TAG, "face: score=%.2f box=[%d,%d,%d,%d] offset=(%+d,%+d) infer=%lums",
|
||
best->score,
|
||
best->box[0], best->box[1], best->box[2], best->box[3],
|
||
x_offset, y_offset,
|
||
(unsigned long)((t1 - t0) / 1000));
|
||
last_detail_log_us = now_us;
|
||
}
|
||
// 高频详细日志保留为 LOGD(需 idf.py monitor 按 Ctrl+T Y 切换为 DEBUG)
|
||
ESP_LOGD(TAG, "face score=%.2f offset=(%d,%d) infer=%luus",
|
||
best->score, x_offset, y_offset, (unsigned long)(t1 - t0));
|
||
}
|
||
|
||
// 每 10 秒汇报一次统计(加保底避免除零)
|
||
int64_t now = esp_timer_get_time();
|
||
if (now - last_report_us > 10000000LL) {
|
||
float elapsed_s = (now - last_report_us) / 1e6f;
|
||
if (elapsed_s > 0.1f) {
|
||
s_last_fps = (hit + miss) / elapsed_s;
|
||
ESP_LOGI(TAG, "face stats: hit=%d miss=%d fps=%.1f",
|
||
hit, miss, s_last_fps);
|
||
}
|
||
hit = miss = 0;
|
||
last_report_us = now;
|
||
}
|
||
}
|
||
|
||
delete detector;
|
||
if (rgb_buf) {
|
||
heap_caps_free(rgb_buf);
|
||
}
|
||
ESP_LOGI(TAG, "face_tracker task exiting");
|
||
s_handle = nullptr;
|
||
vTaskDelete(NULL);
|
||
}
|
||
|
||
extern "C" void face_tracker_start(void) {
|
||
if (s_handle != nullptr) {
|
||
ESP_LOGW(TAG, "face_tracker already running, ignore start");
|
||
return;
|
||
}
|
||
s_stop = false;
|
||
// [2026-04-20 修复 WDT 崩溃] 原绑 Core 0 + 优先级 2 会导致:
|
||
// esp-dl 推理占 150ms → 同核的 RMT LED 驱动拿不到 spinlock 超过 300ms →
|
||
// 触发 Interrupt WDT → SetDeviceState 切换时点 LED 崩溃。
|
||
// 改绑到 Core 1(WiFi/RMT/LED 在 Core 0,音频在 Core 1 但只 speaking 时重载)。
|
||
// 栈 8KB:给 esp-dl 推理留充足空间
|
||
BaseType_t ok = xTaskCreatePinnedToCore(
|
||
face_tracker_task, "face_track",
|
||
8 * 1024, nullptr, 2, &s_handle, 1);
|
||
if (ok != pdPASS) {
|
||
ESP_LOGE(TAG, "xTaskCreatePinnedToCore failed");
|
||
s_handle = nullptr;
|
||
}
|
||
}
|
||
|
||
extern "C" void face_tracker_stop(void) {
|
||
s_stop = true;
|
||
}
|
||
|
||
extern "C" float face_tracker_get_fps(void) {
|
||
return s_last_fps;
|
||
}
|
||
|
||
#else // 非 S3 或功能未启用:提供空壳,保证链接通过
|
||
|
||
extern "C" void face_tracker_start(void) {}
|
||
extern "C" void face_tracker_stop(void) {}
|
||
extern "C" float face_tracker_get_fps(void) { return 0.0f; }
|
||
|
||
#endif // CONFIG_XIAOZHI_ENABLE_FACE_TRACKING && CONFIG_IDF_TARGET_ESP32S3
|