CogletESP-camera-version/main/face_tracker.cc
Rdzleo f1c2bfce93 Phase 01 JPEG Dump 诊断 + YVYU 修正 + 矛盾分析汇总
核心变更:
- face_tracker.cc: YUYV→YVYU 序列修正(byte[1]=V, byte[3]=U),
  基于 JPEG Dump 诊断工具验证 OV3660 FORMAT_CTRL00=0x61 实际是 YVYU
- face_tracker.cc: 启动时 base64 打印一帧 JPEG 到串口,用于肉眼验证
- config.h: XCLK 20MHz→10MHz,给飞线信号完整性 2x 裕度
- scripts/auto_capture_jpeg.py: 自动串口抓帧工具(DTR/RTS 复位 + base64 解码)
- scripts/extract_jpeg_from_log.py: 从日志离线提取 JPEG
- Coglet项目分析与开发指南.md: 新增"六点六"章节,汇总 Phase 01
  主要矛盾(画面可辨识≠模型可识别)、YUV→RGB 色偏三层原因、
  esp-dl 模型输入分布敏感性、延迟分析、三方案对比、方案 B 突破口
- docs/: 新增 2 篇 OV3660 相关 CSDN 参考资料

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-22 11:01:02 +08:00

323 lines
14 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// [T05/T06] 人脸追踪任务
// 只有 ESP32-S3 + CONFIG_XIAOZHI_ENABLE_FACE_TRACKING=y 才编译完整实现
// 其他情况编译 3 个空函数,保证链接通过
#include "face_tracker.h"
#include "sdkconfig.h"
#if defined(CONFIG_XIAOZHI_ENABLE_FACE_TRACKING) && defined(CONFIG_IDF_TARGET_ESP32S3)
#include "human_face_detect.hpp"
#include "dl_image_define.hpp"
#include "dl_detect_define.hpp"
#include "board.h"
#include "esp32_camera.h"
#include "display/lvgl_display/jpg/image_to_jpeg.h"
#include <linux/videodev2.h>
#include <esp_heap_caps.h>
#include <esp_log.h>
#include <esp_timer.h>
#include <freertos/FreeRTOS.h>
#include <freertos/task.h>
#include <list>
#include <new>
#include <cstring>
// Logging tag for this translation unit.
static const char* TAG = "FaceTracker";
// Handle of the running tracker task; nullptr when no task is running.
static TaskHandle_t s_handle = nullptr;
// Cooperative stop flag: set by face_tracker_stop(), polled by the task loop.
static volatile bool s_stop = false;
// Last measured detection-loop FPS, refreshed by the periodic stats report.
static float s_last_fps = 0.0f;
// T06: uart_send_face is provided by T07 in uart_component.{h,cc}.
// Forward-declare it here as a weak symbol so face_tracker.cc compiles before
// T07 lands; once T07's real implementation exists it overrides this weak
// symbol and this file needs no change.
extern "C" __attribute__((weak)) void uart_send_face(int x_offset, int y_offset);
// YVYU -> RGB888 manual conversion (the OV3660 with FORMAT_CTRL00=0x61
// actually emits the byte sequence Y V Y U). Each 4-byte group produces two
// RGB888 pixels (6 output bytes) that share one chroma pair.
// Fixed-point BT.601 (JFIF full range):
//   R = Y + 1.402*(V-128); G = Y - 0.344*(U-128) - 0.714*(V-128); B = Y + 1.772*(U-128)
// [2026-04-21 fix] Reading the stream as standard YUYV (Y U Y V) produced a
// green/purple color cast; a JPEG-dump test confirmed the sensor order is
// YVYU (byte[1]=V, byte[3]=U) — the chroma bytes were swapped.
static inline void yuyv_to_rgb888_line(const uint8_t* yuyv, uint8_t* rgb, int pixels) {
    // Saturate an intermediate value into the 0..255 byte range.
    const auto to_u8 = [](int value) -> uint8_t {
        if (value < 0) return 0;
        if (value > 255) return 255;
        return (uint8_t)value;
    };
    for (int px = 0; px < pixels; px += 2) {
        const uint8_t* src = yuyv + 2 * px;
        const int luma0 = src[0];
        const int cr = src[1] - 128; // byte[1] = V (previously mistaken for U)
        const int luma1 = src[2];
        const int cb = src[3] - 128; // byte[3] = U (previously mistaken for V)
        // The chroma contributions are identical for both pixels of the pair,
        // so compute each fixed-point offset once.
        const int r_off = (359 * cr) / 256;           // 1.402 * 256 ~= 359
        const int g_off = (88 * cb + 183 * cr) / 256; // 0.344*256 ~= 88, 0.714*256 ~= 183
        const int b_off = (454 * cb) / 256;           // 1.772 * 256 ~= 454
        uint8_t* dst = rgb + 3 * px;
        dst[0] = to_u8(luma0 + r_off);
        dst[1] = to_u8(luma0 - g_off);
        dst[2] = to_u8(luma0 + b_off);
        dst[3] = to_u8(luma1 + r_off);
        dst[4] = to_u8(luma1 - g_off);
        dst[5] = to_u8(luma1 + b_off);
    }
}
static void face_tracker_task(void* arg) {
(void)arg;
// 等待摄像头 ISP 预热 + 视频流启动稳定
vTaskDelay(pdMS_TO_TICKS(500));
ESP_LOGI(TAG, "face_tracker task started on core %d", xPortGetCoreID());
// [2026-04-20 重大修复] 分配 PSRAM RGB888 缓冲区,手动 YUYV→RGB888 转换
// 绕过 esp-dl ImagePreprocessor 的 YUYV 路径(疑似产生固定激活 bug
// 240*240*3 = 172800 字节PSRAM 8MB 完全够
constexpr size_t RGB_SIZE = 240 * 240 * 3;
uint8_t* rgb_buf = (uint8_t*)heap_caps_malloc(RGB_SIZE, MALLOC_CAP_SPIRAM);
if (!rgb_buf) {
ESP_LOGE(TAG, "分配 RGB888 缓冲失败");
vTaskDelete(NULL);
return;
}
ESP_LOGI(TAG, "RGB888 转换缓冲已分配 %u bytes", (unsigned)RGB_SIZE);
// 构造检测器:默认 model_type 由 CONFIG_DEFAULT_HUMAN_FACE_DETECT_MODEL 决定
// lazy_load=true默认以减少启动期内存瞬时占用
auto* detector = new(std::nothrow) HumanFaceDetect();
if (!detector) {
ESP_LOGE(TAG, "HumanFaceDetect 构造失败PSRAM 不足?)");
multi_heap_info_t info;
heap_caps_get_info(&info, MALLOC_CAP_SPIRAM);
ESP_LOGE(TAG, "PSRAM free=%u total_allocated=%u",
(unsigned)info.total_free_bytes,
(unsigned)info.total_allocated_bytes);
s_handle = nullptr;
vTaskDelete(NULL);
return;
}
// 一次性打印启动时 PSRAM 占用供诊断RESEARCH R2 风险跟踪)
{
multi_heap_info_t info;
heap_caps_get_info(&info, MALLOC_CAP_SPIRAM);
ESP_LOGI(TAG, "PSRAM after detector init: free=%u allocated=%u",
(unsigned)info.total_free_bytes,
(unsigned)info.total_allocated_bytes);
}
// [2026-04-21 诊断结论] 多格式 JPEG dump 测试确认sensor 实际输出 YUYV packed 格式
// - frame_YUYV.jpg 画面清晰(能看到戴眼镜人脸 + 背景),只是色彩偏绿紫
// - frame_RGB565.jpg / UYVY / YUV422P 全是彩色马赛克
// - 色偏原因FORMAT_CTRL00=0x61 的 bit[3:0]=1 在 YUV 模式下是 YVYU sequence
// (实际字节序 Y V Y U不是标准 YUYV 的 Y U Y V
// → yuyv_to_rgb888_line 要按 YVYU 读取byte[1]=V, byte[3]=U
// 保留 JPEG dump 用于拍照验证(先确认摄像头正常再跑人脸识别)
// [2026-04-22] sensor 切到硬件 JPEG 模式CONFIG_CAMERA_OV3660_DVP_JPEG_1280X720_12FPS
// sensor 内部已做完 YUV→RGB→JPEG 全流程色彩处理,输出标准 JPEG 字节流
// 我们不再需要 image_to_jpeg 二次编码,直接把 f.data 透传即可
{
vTaskDelay(pdMS_TO_TICKS(2000)); // JPEG 模式分辨率 1280x720sensor 需要更长曝光稳定时间
auto* cam = dynamic_cast<Esp32Camera*>(Board::GetInstance().GetCamera());
Esp32Camera::FrameRef f;
if (cam && cam->CaptureForDetection(&f) && f.data && f.len > 0) {
const uint8_t* jpg = (const uint8_t*)f.data;
size_t jpg_len = f.len;
ESP_LOGI(TAG, "===JPEG_DUMP_BEGIN fmt=SENSOR_JPEG size=%u w=%u h=%u===",
(unsigned)jpg_len, f.width, f.height);
static const char b64[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
char line[128]; size_t lp = 0;
for (size_t i = 0; i < jpg_len; i += 3) {
uint32_t v = ((uint32_t)jpg[i] << 16);
if (i + 1 < jpg_len) v |= ((uint32_t)jpg[i+1] << 8);
if (i + 2 < jpg_len) v |= jpg[i+2];
line[lp++] = b64[(v >> 18) & 0x3F];
line[lp++] = b64[(v >> 12) & 0x3F];
line[lp++] = (i + 1 < jpg_len) ? b64[(v >> 6) & 0x3F] : '=';
line[lp++] = (i + 2 < jpg_len) ? b64[v & 0x3F] : '=';
if (lp >= 72) { line[lp] = 0; printf("%s\n", line); lp = 0; }
}
if (lp > 0) { line[lp] = 0; printf("%s\n", line); }
ESP_LOGI(TAG, "===JPEG_DUMP_END===");
cam->ReleaseDetectionFrame(f);
}
}
// 按 Kconfig 配置的 FPS 计算节拍
const TickType_t period = pdMS_TO_TICKS(1000 / CONFIG_XIAOZHI_FACE_TRACKING_FPS);
TickType_t last_wake = xTaskGetTickCount();
int hit = 0, miss = 0;
int64_t last_report_us = esp_timer_get_time();
// 实时日志限频:每秒最多 1 条INFO 级别便于排查)
int64_t last_detail_log_us = 0;
int miss_streak = 0; // 连续 miss 计数
while (!s_stop) {
vTaskDelayUntil(&last_wake, period);
auto* cam = dynamic_cast<Esp32Camera*>(Board::GetInstance().GetCamera());
if (!cam) {
continue;
}
Esp32Camera::FrameRef f;
if (!cam->CaptureForDetection(&f)) {
// [T04 策略] 拿不到 mutexMCP 拍照中)或 DQBUF 失败 → 正常跳帧
continue;
}
// [Bug 1 诊断] 首次进入循环时,打印前 32 字节 + 中心像素 + 统计,判断数据性质
// 全零 → 摄像头无数据;规律 → 字节序/格式问题;随机 → 正常但模型看不懂
static bool debug_dumped = false;
if (!debug_dumped && f.data && f.len >= 32) {
debug_dumped = true;
const uint8_t* d = (const uint8_t*)f.data;
ESP_LOGI(TAG, "frame debug: size=%u w=%u h=%u len=%u",
(unsigned)f.width * f.height * 2, f.width, f.height, (unsigned)f.len);
// 打印左上角 16 字节 + 中心附近 16 字节
size_t center = (f.width * (f.height / 2) + f.width / 2) * 2;
if (center + 16 <= f.len) {
ESP_LOGI(TAG, "top-left 16B: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x",
d[0],d[1],d[2],d[3],d[4],d[5],d[6],d[7],d[8],d[9],d[10],d[11],d[12],d[13],d[14],d[15]);
ESP_LOGI(TAG, "center 16B: %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x %02x",
d[center],d[center+1],d[center+2],d[center+3],d[center+4],d[center+5],d[center+6],d[center+7],
d[center+8],d[center+9],d[center+10],d[center+11],d[center+12],d[center+13],d[center+14],d[center+15]);
}
// 统计:零字节比例(判断摄像头是否真有数据)
size_t zero_cnt = 0;
for (size_t i = 0; i < f.len; i++) if (d[i] == 0) zero_cnt++;
ESP_LOGI(TAG, "zero bytes: %u / %u (%.1f%%)",
(unsigned)zero_cnt, (unsigned)f.len, 100.0f * zero_cnt / f.len);
}
// [2026-04-20 重大修复] 手动 YUYV → RGB888 转换,绕过 esp-dl 预处理黑盒
// 以前img.pix_type = YUYV让 ImagePreprocessor 内部做 YUV→RGB但它产生固定激活
// 现在:先转成 RGB888 喂给模型pix_type 标 RGB888消除预处理不确定性
{
const uint8_t* src = (const uint8_t*)f.data;
uint8_t* dst = rgb_buf;
for (uint16_t row = 0; row < f.height; row++) {
yuyv_to_rgb888_line(src, dst, f.width);
src += f.width * 2; // YUYV 每像素 2 字节
dst += f.width * 3; // RGB888 每像素 3 字节
}
}
dl::image::img_t img{};
img.data = (void*)rgb_buf;
img.width = f.width;
img.height = f.height;
img.pix_type = dl::image::DL_IMAGE_PIX_TYPE_RGB888;
int64_t t0 = esp_timer_get_time();
auto& results = detector->run(img);
int64_t t1 = esp_timer_get_time();
// 立即归还 V4L2 缓冲,避免 face_track 占用时间长
cam->ReleaseDetectionFrame(f);
int64_t now_us = esp_timer_get_time();
if (results.empty()) {
miss++;
miss_streak++;
// 连续 3 秒无人脸时提示一次(按默认 FPS=10 折算 ~30 帧)
if (miss_streak == CONFIG_XIAOZHI_FACE_TRACKING_FPS * 3) {
ESP_LOGI(TAG, "no face detected in last 3s");
}
} else {
hit++;
miss_streak = 0;
// PLAN 未明确排序策略esp-dl 内部 nms 后 list 顺序不稳定
// 为健壮性,挑 score 最高的那个(避免多脸时摇摆)
const dl::detect::result_t* best = nullptr;
for (const auto& r : results) {
if (best == nullptr || r.score > best->score) {
best = &r;
}
}
// box: [left_up_x, left_up_y, right_down_x, right_down_y]
int cx = (best->box[0] + best->box[2]) / 2;
int cy = (best->box[1] + best->box[3]) / 2;
// 坐标映射RESEARCH Pitfall 7严格保持 cx * 224 / width - 112
// 对齐 RP2040 端 deadzone=20 / x_adj_factor=10 的基准
int x_offset = (f.width > 0) ? (cx * 224 / f.width - 112) : 0;
int y_offset = (f.height > 0) ? (cy * 224 / f.height - 112) : 0;
// T07 完成后uart_send_face 弱符号会被真实实现覆盖
if (uart_send_face != nullptr) {
uart_send_face(x_offset, y_offset);
}
// INFO 级别实时日志,限频每秒 1 条避免刷屏
// 修复:%lld 在 nano newlib 下输出异常,改为 %lu + uint32infer<2s 安全)
if (now_us - last_detail_log_us > 1000000LL) {
ESP_LOGI(TAG, "face: score=%.2f box=[%d,%d,%d,%d] offset=(%+d,%+d) infer=%lums",
best->score,
best->box[0], best->box[1], best->box[2], best->box[3],
x_offset, y_offset,
(unsigned long)((t1 - t0) / 1000));
last_detail_log_us = now_us;
}
// 高频详细日志保留为 LOGD需 idf.py monitor 按 Ctrl+T Y 切换为 DEBUG
ESP_LOGD(TAG, "face score=%.2f offset=(%d,%d) infer=%luus",
best->score, x_offset, y_offset, (unsigned long)(t1 - t0));
}
// 每 10 秒汇报一次统计(加保底避免除零)
int64_t now = esp_timer_get_time();
if (now - last_report_us > 10000000LL) {
float elapsed_s = (now - last_report_us) / 1e6f;
if (elapsed_s > 0.1f) {
s_last_fps = (hit + miss) / elapsed_s;
ESP_LOGI(TAG, "face stats: hit=%d miss=%d fps=%.1f",
hit, miss, s_last_fps);
}
hit = miss = 0;
last_report_us = now;
}
}
delete detector;
if (rgb_buf) {
heap_caps_free(rgb_buf);
}
ESP_LOGI(TAG, "face_tracker task exiting");
s_handle = nullptr;
vTaskDelete(NULL);
}
extern "C" void face_tracker_start(void) {
if (s_handle != nullptr) {
ESP_LOGW(TAG, "face_tracker already running, ignore start");
return;
}
s_stop = false;
// [2026-04-20 修复 WDT 崩溃] 原绑 Core 0 + 优先级 2 会导致:
// esp-dl 推理占 150ms → 同核的 RMT LED 驱动拿不到 spinlock 超过 300ms →
// 触发 Interrupt WDT → SetDeviceState 切换时点 LED 崩溃。
// 改绑到 Core 1WiFi/RMT/LED 在 Core 0音频在 Core 1 但只 speaking 时重载)。
// 栈 8KB给 esp-dl 推理留充足空间
BaseType_t ok = xTaskCreatePinnedToCore(
face_tracker_task, "face_track",
8 * 1024, nullptr, 2, &s_handle, 1);
if (ok != pdPASS) {
ESP_LOGE(TAG, "xTaskCreatePinnedToCore failed");
s_handle = nullptr;
}
}
// Request a cooperative shutdown: the task loop polls s_stop each tick, then
// frees its resources and deletes itself. Returns immediately — it does not
// wait for the task to exit.
extern "C" void face_tracker_stop(void) {
s_stop = true;
}
// Return the most recent detection-loop FPS (refreshed by the tracker task's
// ~10 s stats report; 0.0f until the first report or when not running).
extern "C" float face_tracker_get_fps(void) {
return s_last_fps;
}
#else // 非 S3 或功能未启用:提供空壳,保证链接通过
// No-op stubs: keep other translation units linking on non-S3 targets or
// when face tracking is disabled in Kconfig.
extern "C" void face_tracker_start(void) {}
extern "C" void face_tracker_stop(void) {}
extern "C" float face_tracker_get_fps(void) { return 0.0f; }
#endif // CONFIG_XIAOZHI_ENABLE_FACE_TRACKING && CONFIG_IDF_TARGET_ESP32S3