CogletESP-camera-version/main/face_tracker.cc
Rdzleo f1c2bfce93 Phase 01 JPEG Dump 诊断 + YVYU 修正 + 矛盾分析汇总
核心变更:
- face_tracker.cc: YUYV→YVYU 序列修正(byte[1]=V, byte[3]=U),
  基于 JPEG Dump 诊断工具验证 OV3660 FORMAT_CTRL00=0x61 实际是 YVYU
- face_tracker.cc: 启动时 base64 打印一帧 JPEG 到串口,用于肉眼验证
- config.h: XCLK 20MHz→10MHz,给飞线信号完整性 2x 裕度
- scripts/auto_capture_jpeg.py: 自动串口抓帧工具(DTR/RTS 复位 + base64 解码)
- scripts/extract_jpeg_from_log.py: 从日志离线提取 JPEG
- Coglet项目分析与开发指南.md: 新增"六点六"章节,汇总 Phase 01
  主要矛盾(画面可辨识≠模型可识别)、YUV→RGB 色偏三层原因、
  esp-dl 模型输入分布敏感性、延迟分析、三方案对比、方案 B 突破口
- docs/: 新增 2 篇 OV3660 相关 CSDN 参考资料

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 11:01:02 +08:00

323 lines
14 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// [T05/T06] 人脸追踪任务
// 只有 ESP32-S3 + CONFIG_XIAOZHI_ENABLE_FACE_TRACKING=y 才编译完整实现
// 其他情况编译 3 个空函数,保证链接通过
#include "face_tracker.h"
#include "sdkconfig.h"
#if defined(CONFIG_XIAOZHI_ENABLE_FACE_TRACKING) && defined(CONFIG_IDF_TARGET_ESP32S3)
#include "human_face_detect.hpp"
#include "dl_image_define.hpp"
#include "dl_detect_define.hpp"
#include "board.h"
#include "esp32_camera.h"
#include "display/lvgl_display/jpg/image_to_jpeg.h"
#include <linux/videodev2.h>
#include <esp_heap_caps.h>
#include <esp_log.h>
#include <esp_timer.h>
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include <list>
#include <new>
#include <cstring>
// Logging tag for this translation unit.
static const char* TAG = "FaceTracker";
// Handle of the running tracker task; nullptr when no task is running.
static TaskHandle_t s_handle = nullptr;
// Cooperative stop flag: set by face_tracker_stop(), polled by the task loop.
static volatile bool s_stop = false;
// Last measured detection-loop FPS, refreshed by the periodic stats report.
static float s_last_fps = 0.0f;
// T06: uart_send_face is provided by T07 in uart_component.{h,cc}.
// Forward-declare it here as a weak symbol so face_tracker.cc compiles before
// T07 lands; once T07's real implementation exists it overrides this weak
// symbol and this file needs no change.
extern "C" __attribute__((weak)) void uart_send_face(int x_offset, int y_offset);
// YVYU -> RGB888 manual conversion (the OV3660 with FORMAT_CTRL00=0x61
// actually emits the byte sequence Y V Y U). Each 4-byte group produces two
// RGB888 pixels (6 output bytes) that share one chroma pair.
// Fixed-point BT.601 (JFIF full range):
//   R = Y + 1.402*(V-128); G = Y - 0.344*(U-128) - 0.714*(V-128); B = Y + 1.772*(U-128)
// [2026-04-21 fix] Reading the stream as standard YUYV (Y U Y V) produced a
// green/purple color cast; a JPEG-dump test confirmed the sensor order is
// YVYU (byte[1]=V, byte[3]=U) — the chroma bytes were swapped.
static inline void yuyv_to_rgb888_line(const uint8_t* yuyv, uint8_t* rgb, int pixels) {
    // Saturate an intermediate value into the 0..255 byte range.
    const auto to_u8 = [](int value) -> uint8_t {
        if (value < 0) return 0;
        if (value > 255) return 255;
        return (uint8_t)value;
    };
    for (int px = 0; px < pixels; px += 2) {
        const uint8_t* src = yuyv + 2 * px;
        const int luma0 = src[0];
        const int cr = src[1] - 128; // byte[1] = V (previously mistaken for U)
        const int luma1 = src[2];
        const int cb = src[3] - 128; // byte[3] = U (previously mistaken for V)
        // The chroma contributions are identical for both pixels of the pair,
        // so compute each fixed-point offset once.
        const int r_off = (359 * cr) / 256;           // 1.402 * 256 ~= 359
        const int g_off = (88 * cb + 183 * cr) / 256; // 0.344*256 ~= 88, 0.714*256 ~= 183
        const int b_off = (454 * cb) / 256;           // 1.772 * 256 ~= 454
        uint8_t* dst = rgb + 3 * px;
        dst[0] = to_u8(luma0 + r_off);
        dst[1] = to_u8(luma0 - g_off);
        dst[2] = to_u8(luma0 + b_off);
        dst[3] = to_u8(luma1 + r_off);
        dst[4] = to_u8(luma1 - g_off);
        dst[5] = to_u8(luma1 + b_off);
    }
}
static void face_tracker_task(void* arg) {
(void)arg;
// 等待摄像头 ISP 预热 + 视频流启动稳定
vTaskDelay(pdMS_TO_TICKS(500));
ESP_LOGI(TAG, "face_tracker task started on core %d", xPortGetCoreID());
// [2026-04-20 重大修复] 分配 PSRAM RGB888 缓冲区,手动 YUYV→RGB888 转换
// 绕过 esp-dl ImagePreprocessor 的 YUYV 路径(疑似产生固定激活 bug
// 240*240*3 = 172800 字节PSRAM 8MB 完全够
constexpr size_t RGB_SIZE = 240 * 240 * 3;
uint8_t* rgb_buf = (uint8_t*)heap_caps_malloc(RGB_SIZE, MALLOC_CAP_SPIRAM);
if (!rgb_buf) {
ESP_LOGE(TAG, "分配 RGB888 缓冲失败");
vTaskDelete(NULL);
return;
}
ESP_LOGI(TAG, "RGB888 转换缓冲已分配 %u bytes", (unsigned)RGB_SIZE);
// 构造检测器:默认 model_type 由 CONFIG_DEFAULT_HUMAN_FACE_DETECT_MODEL 决定
// lazy_load=true默认以减少启动期内存瞬时占用
auto* detector = new(std::nothrow) HumanFaceDetect();
if (!detector) {
ESP_LOGE(TAG, "HumanFaceDetect 构造失败PSRAM 不足?)");
multi_heap_info_t info;
heap_caps_get_info(&info, MALLOC_CAP_SPIRAM);
ESP_LOGE(TAG, "PSRAM free=%u total_allocated=%u",
(unsigned)info.total_free_bytes,
(unsigned)info.total_allocated_bytes);
s_handle = nullptr;
vTaskDelete(NULL);
return;
}
// 一次性打印启动时 PSRAM 占用供诊断RESEARCH R2 风险跟踪)
{
multi_heap_info_t info;
heap_caps_get_info(&info, MALLOC_CAP_SPIRAM);
ESP_LOGI(TAG, "PSRAM after detector init: free=%u allocated=%u",
(unsigned)info.total_free_bytes,
(unsigned)info.total_allocated_bytes);
}
// [2026-04-21 诊断结论] 多格式 JPEG dump 测试确认sensor 实际输出 YUYV packed 格式
// - frame_YUYV.jpg 画面清晰(能看到戴眼镜人脸 + 背景),只是色彩偏绿紫
// - frame_RGB565.jpg / UYVY / YUV422P 全是彩色马赛克
// - 色偏原因FORMAT_CTRL00=0x61 的 bit[3:0]=1 在 YUV 模式下是 YVYU sequence
// (实际字节序 Y V Y U不是标准 YUYV 的 Y U Y V
// → yuyv_to_rgb888_line 要按 YVYU 读取byte[1]=V, byte[3]=U
// 保留 JPEG dump 用于拍照验证(先确认摄像头正常再跑人脸识别)
// [2026-04-22] sensor 切到硬件 JPEG 模式CONFIG_CAMERA_OV3660_DVP_JPEG_1280X720_12FPS
// sensor 内部已做完 YUV→RGB→JPEG 全流程色彩处理,输出标准 JPEG 字节流
// 我们不再需要 image_to_jpeg 二次编码,直接把 f.data 透传即可
{
vTaskDelay(pdMS_TO_TICKS(2000)); // JPEG 模式分辨率 1280x720sensor 需要更长曝光稳定时间
auto* cam = dynamic_cast<Esp32Camera*>(Board::GetInstance().GetCamera());
Esp32Camera::FrameRef f;
if (cam && cam->CaptureForDetection(&f) && f.data && f.len > 0) {
const uint8_t* jpg = (const uint8_t*)f.data;
size_t jpg_len = f.len;
ESP_LOGI(TAG, "===JPEG_DUMP_BEGIN fmt=SENSOR_JPEG size=%u w=%u h=%u===",
(unsigned)jpg_len, f.width, f.height);
static const char b64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
char line[128]; size_t lp = 0;
for (size_t i = 0; i < jpg_len; i += 3) {
uint32_t v = ((uint32_t)jpg[i] << 16);
if (i + 1 < jpg_len) v |= ((uint32_t)jpg[i+1] << 8);
if (i + 2 < jpg_len) v |= jpg[i+2];
line[lp++] = b64[(v >> 18) & 0x3F];
line[lp++] = b64[(v >> 12) & 0x3F];
line[lp++] = (i + 1 < jpg_len) ? b64[(v >> 6) & 0x3F] : '=';
line[lp++] = (i + 2 < jpg_len) ? b64[v & 0x3F] : '=';
if (lp >= 72) { line[lp] = 0; printf("%s\n", line); lp = 0; }
}
if (lp > 0) { line[lp] = 0; printf("%s\n", line); }
ESP_LOGI(TAG, "===JPEG_DUMP_END===");
cam->ReleaseDetectionFrame(f);
}
}
// 按 Kconfig 配置的 FPS 计算节拍
const TickType_t period = pdMS_TO_TICKS(1000 / CONFIG_XIAOZHI_FACE_TRACKING_FPS);
TickType_t last_wake = xTaskGetTickCount();
int hit = 0, miss = 0;
int64_t last_report_us = esp_timer_get_time();
// 实时日志限频:每秒最多 1 条INFO 级别便于排查)
int64_t last_detail_log_us = 0;
int miss_streak = 0; // 连续 miss 计数
while (!s_stop) {
vTaskDelayUntil(&last_wake, period);
auto* cam = dynamic_cast<Esp32Camera*>(Board::GetInstance().GetCamera());
if (!cam) {
continue;
}
Esp32Camera::FrameRef f;
if (!cam->CaptureForDetection(&f)) {
// [T04 策略] 拿不到 mutexMCP 拍照中)或 DQBUF 失败 → 正常跳帧
continue;
}
// [Bug 1 诊断] 首次进入循环时,打印前 32 字节 + 中心像素 + 统计,判断数据性质
// 全零 → 摄像头无数据;规律 → 字节序/格式问题;随机 → 正常但模型看不懂
static bool debug_dumped = false;
if (!debug_dumped && f.data && f.len >= 32) {
debug_dumped = true;
const uint8_t* d = (const uint8_t*)f.data;
ESP_LOGI(TAG, "frame debug: size=%u w=%u h=%u len=%u",
(unsigned)f.width * f.height * 2, f.width, f.height, (unsigned)f.len);
// 打印左上角 16 字节 + 中心附近 16 字节
size_t center = (f.width * (f.height / 2) + f.width / 2) * 2;
if (center + 16 <= f.len) {
ESP_LOGI(TAG, "top-left 16B: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x",
d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9],d[10],d[11],d[12],d[13],d[14],d[15]);
ESP_LOGI(TAG, "center 16B: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x",
d[center],d[center+1],d[center+2],d[center+3],d[center+4],d[center+5],d[center+6],d[center+7],
d[center+8],d[center+9],d[center+10],d[center+11],d[center+12],d[center+13],d[center+14],d[center+15]);
}
// 统计:零字节比例(判断摄像头是否真有数据)
size_t zero_cnt = 0;
for (size_t i = 0; i < f.len; i++) if (d[i] == 0) zero_cnt++;
ESP_LOGI(TAG, "zero bytes: %u / %u (%.1f%%)",
(unsigned)zero_cnt, (unsigned)f.len, 100.0f * zero_cnt / f.len);
}
// [2026-04-20 重大修复] 手动 YUYV → RGB888 转换,绕过 esp-dl 预处理黑盒
// 以前img.pix_type = YUYV让 ImagePreprocessor 内部做 YUV→RGB但它产生固定激活
// 现在:先转成 RGB888 喂给模型pix_type 标 RGB888消除预处理不确定性
{
const uint8_t* src = (const uint8_t*)f.data;
uint8_t* dst = rgb_buf;
for (uint16_t row = 0; row < f.height; row++) {
yuyv_to_rgb888_line(src, dst, f.width);
src += f.width * 2; // YUYV 每像素 2 字节
dst += f.width * 3; // RGB888 每像素 3 字节
}
}
dl::image::img_t img{};
img.data = (void*)rgb_buf;
img.width = f.width;
img.height = f.height;
img.pix_type = dl::image::DL_IMAGE_PIX_TYPE_RGB888;
int64_t t0 = esp_timer_get_time();
auto& results = detector->run(img);
int64_t t1 = esp_timer_get_time();
// 立即归还 V4L2 缓冲,避免 face_track 占用时间长
cam->ReleaseDetectionFrame(f);
int64_t now_us = esp_timer_get_time();
if (results.empty()) {
miss++;
miss_streak++;
// 连续 3 秒无人脸时提示一次(按默认 FPS=10 折算 ~30 帧)
if (miss_streak == CONFIG_XIAOZHI_FACE_TRACKING_FPS * 3) {
ESP_LOGI(TAG, "no face detected in last 3s");
}
} else {
hit++;
miss_streak = 0;
// PLAN 未明确排序策略esp-dl 内部 nms 后 list 顺序不稳定
// 为健壮性,挑 score 最高的那个(避免多脸时摇摆)
const dl::detect::result_t* best = nullptr;
for (const auto& r : results) {
if (best == nullptr || r.score > best->score) {
best = &r;
}
}
// box: [left_up_x, left_up_y, right_down_x, right_down_y]
int cx = (best->box[0] + best->box[2]) / 2;
int cy = (best->box[1] + best->box[3]) / 2;
// 坐标映射RESEARCH Pitfall 7严格保持 cx * 224 / width - 112
// 对齐 RP2040 端 deadzone=20 / x_adj_factor=10 的基准
int x_offset = (f.width > 0) ? (cx * 224 / f.width - 112) : 0;
int y_offset = (f.height > 0) ? (cy * 224 / f.height - 112) : 0;
// T07 完成后uart_send_face 弱符号会被真实实现覆盖
if (uart_send_face != nullptr) {
uart_send_face(x_offset, y_offset);
}
// INFO 级别实时日志,限频每秒 1 条避免刷屏
// 修复:%lld 在 nano newlib 下输出异常,改为 %lu + uint32infer<2s 安全)
if (now_us - last_detail_log_us > 1000000LL) {
ESP_LOGI(TAG, "face: score=%.2f box=[%d,%d,%d,%d] offset=(%+d,%+d) infer=%lums",
best->score,
best->box[0], best->box[1], best->box[2], best->box[3],
x_offset, y_offset,
(unsigned long)((t1 - t0) / 1000));
last_detail_log_us = now_us;
}
// 高频详细日志保留为 LOGD需 idf.py monitor 按 Ctrl+T Y 切换为 DEBUG
ESP_LOGD(TAG, "face score=%.2f offset=(%d,%d) infer=%luus",
best->score, x_offset, y_offset, (unsigned long)(t1 - t0));
}
// 每 10 秒汇报一次统计(加保底避免除零)
int64_t now = esp_timer_get_time();
if (now - last_report_us > 10000000LL) {
float elapsed_s = (now - last_report_us) / 1e6f;
if (elapsed_s > 0.1f) {
s_last_fps = (hit + miss) / elapsed_s;
ESP_LOGI(TAG, "face stats: hit=%d miss=%d fps=%.1f",
hit, miss, s_last_fps);
}
hit = miss = 0;
last_report_us = now;
}
}
delete detector;
if (rgb_buf) {
heap_caps_free(rgb_buf);
}
ESP_LOGI(TAG, "face_tracker task exiting");
s_handle = nullptr;
vTaskDelete(NULL);
}
extern "C" void face_tracker_start(void) {
if (s_handle != nullptr) {
ESP_LOGW(TAG, "face_tracker already running, ignore start");
return;
}
s_stop = false;
// [2026-04-20 修复 WDT 崩溃] 原绑 Core 0 + 优先级 2 会导致:
// esp-dl 推理占 150ms → 同核的 RMT LED 驱动拿不到 spinlock 超过 300ms →
// 触发 Interrupt WDT → SetDeviceState 切换时点 LED 崩溃。
// 改绑到 Core 1WiFi/RMT/LED 在 Core 0音频在 Core 1 但只 speaking 时重载)。
// 栈 8KB给 esp-dl 推理留充足空间
BaseType_t ok = xTaskCreatePinnedToCore(
face_tracker_task, "face_track",
8 * 1024, nullptr, 2, &s_handle, 1);
if (ok != pdPASS) {
ESP_LOGE(TAG, "xTaskCreatePinnedToCore failed");
s_handle = nullptr;
}
}
// Request a cooperative shutdown: the task loop polls s_stop each tick, then
// frees its resources and deletes itself. Returns immediately — it does not
// wait for the task to exit.
extern "C" void face_tracker_stop(void) {
s_stop = true;
}
// Return the most recent detection-loop FPS (refreshed by the tracker task's
// ~10 s stats report; 0.0f until the first report or when not running).
extern "C" float face_tracker_get_fps(void) {
return s_last_fps;
}
#else // 非 S3 或功能未启用:提供空壳,保证链接通过
// No-op stubs: keep other translation units linking on non-S3 targets or
// when face tracking is disabled in Kconfig.
extern "C" void face_tracker_start(void) {}
extern "C" void face_tracker_stop(void) {}
extern "C" float face_tracker_get_fps(void) { return 0.0f; }
#endif // CONFIG_XIAOZHI_ENABLE_FACE_TRACKING && CONFIG_IDF_TARGET_ESP32S3