Rdzleo fc07d3806d Phase 01 调试迭代: OV3660 人脸检测集成 + 23 条踩坑经验汇总
## 代码变更

### main/application.cc
修复 T01 probe 日志 %lld 格式 bug(改用 %lu + unsigned long)

### main/boards/common/esp32_camera.cc
- 修复 DVP V4L2 单 buffer 导致 DMA 饥饿:req.count 从 1 改为 2
- 修复 [T01] Probe 日志 elapsed=ldus 显示问题(同上格式修复)

### main/face_tracker.cc
多轮迭代:
- 新增 frame debug 诊断日志(打印 top-left/center 16B + zero_bytes 统计)
- pix_type 尝试路径:YUYV → RGB565LE → RGB565BE → YUYV → RGB888(手动转换)
- 手动实现 BT.601 公式 YUYV→RGB888 转换,绕过 ImagePreprocessor 黑盒
- face_tracker 任务从 Core 0 切换到 Core 1,避让 RMT/LED 死锁
- 新增 INFO 级限频日志(每秒 1 条 face 检测记录)
- 修复推理时长日志 %lld 格式 bug
- 连续 3 秒无人脸时打印 no face detected

### main/idf_component.yml
esp_video 升级 1.3.1 → ~1.4.1(手动 patch 修 xclk_freq bug)

### partitions/v2/16m.csv
OTA 分区扩容:3.94MB → 5MB,assets 缩到 5.875MB,支持 4.23MB 固件

### docs/phase-01-face-tracking/PROGRESS.md
更新 Phase 01 执行日志,记录实机调试细节

## 文档更新

### Coglet项目分析与开发指南.md 新增第六点五节

完整记录本轮调试的 23 个踩坑,分为:
1. 编译/配置类(5 个):板级重置、依赖冲突、bootloader 缓存、%lld 格式、xclk_freq bug
2. 摄像头数据链路(5 个):sensor driver 启用、V4L2 buffer 数量、分区扩容、镜头保护膜、光照
3. esp-dl 人脸检测(3 个):MSR letterbox 伪影、ESPDET OOD 默认输出、字节序判断
4. 任务调度(3 个):WDT 崩溃、GDMA ISR 崩溃、弱符号链接
5. RP2040 端(4 个):idle 回中、坐标累加撞限位、mpremote 阻塞、两分支代码差异
6. 硬件(3 个):飞线验证、360° 舵机误用、烧录生效验证

附调试方法论 6 条 + 未解决遗留问题 3 条

## 已解决问题

-  ESP-IDF 编译链路(依赖/分区/格式)
-  ESP32 + RP2040 端到端协议(face:x,y UART)
-  WDT 崩溃(face_tracker 切到 Core 1)
-  RP2040 眼球回中机制(idle 时回正)
-  V4L2 双 buffer(DMA 数据更新正常)

## 遗留问题(待解决)

-  face 检测 box 固定伪激活(无论 pix_type / 画面内容 / 模型选择)
-  GDMA ISR 每 ~30s 触发 InstrFetchProhibited 崩溃
- ⚠️ 端到端验收:眼球未真正跟随人脸

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-20 18:22:15 +08:00

1144 lines
45 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <unistd.h>
#include <errno.h>
#include <esp_heap_caps.h>
#include <cstdio>
#include <cstring>
#include "esp_imgfx_color_convert.h"
#include "esp_video_device.h"
#include "esp_video_init.h"
#include "linux/videodev2.h"
#include "board.h"
#include "display.h"
#include "esp32_camera.h"
#include "esp_jpeg_common.h"
#include "jpg/image_to_jpeg.h"
#include "jpg/jpeg_to_image.h"
#include "lvgl_display.h"
#include "mcp_server.h"
#include "system_info.h"
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#undef LOG_LOCAL_LEVEL
#define LOG_LOCAL_LEVEL MAX(CONFIG_LOG_DEFAULT_LEVEL, ESP_LOG_DEBUG)
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#include <esp_log.h> // should be after LOCAL_LOG_LEVEL definition
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
#ifdef CONFIG_IDF_TARGET_ESP32P4
#include "driver/ppa.h"
#if defined(CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE_90)
#define IMAGE_ROTATION_ANGLE (PPA_SRM_ROTATION_ANGLE_270)
#elif defined(CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE_270)
#define IMAGE_ROTATION_ANGLE (PPA_SRM_ROTATION_ANGLE_90)
#else
#error "CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE is not set"
#endif // angle
#else // target
#include "esp_imgfx_rotate.h"
#if defined(CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE_90)
#define IMAGE_ROTATION_ANGLE (90)
#elif defined(CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE_270)
#define IMAGE_ROTATION_ANGLE (270)
#else
#error "CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE is not set"
#endif // angle
#endif // target
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
#define TAG "Esp32Camera"
#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
#warning \
"CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER or CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP is enabled, which may cause image corruption in YUV422 format!"
#endif
#if CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#define CAM_PRINT_FOURCC(pixelformat) \
char fourcc[5]; \
fourcc[0] = pixelformat & 0xFF; \
fourcc[1] = (pixelformat >> 8) & 0xFF; \
fourcc[2] = (pixelformat >> 16) & 0xFF; \
fourcc[3] = (pixelformat >> 24) & 0xFF; \
fourcc[4] = '\0'; \
ESP_LOGD(TAG, "FOURCC: '%c%c%c%c'", fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
// for compatibility with old esp_video version
#ifndef MAP_FAILED
#define MAP_FAILED nullptr
#endif
__attribute__((weak)) esp_err_t esp_video_deinit(void) {
return ESP_ERR_NOT_SUPPORTED;
}
// end of for compatibility with old esp_video version
static void log_available_video_devices() {
for (int i = 0; i < 50; i++) {
char path[16];
snprintf(path, sizeof(path), "/dev/video%d", i);
int fd = open(path, O_RDONLY);
if (fd >= 0) {
ESP_LOGD(TAG, "found video device: %s", path);
close(fd);
}
}
}
#else
#define CAM_PRINT_FOURCC(pixelformat) (void)0;
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
Esp32Camera::Esp32Camera(const esp_video_init_config_t& config) {
// [T04] 创建采集互斥锁face_track 用 10ms timeoutMCP 拍照用 portMAX_DELAY
capture_mutex_ = xSemaphoreCreateMutex();
if (capture_mutex_ == nullptr) {
ESP_LOGE(TAG, "xSemaphoreCreateMutex failed");
return;
}
if (esp_video_init(&config) != ESP_OK) {
ESP_LOGE(TAG, "esp_video_init failed");
return;
}
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
esp_log_level_set(TAG, ESP_LOG_DEBUG);
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
const char* video_device_name = nullptr;
if (false) { /* 用于构建 else if */
}
#if CONFIG_ESP_VIDEO_ENABLE_MIPI_CSI_VIDEO_DEVICE
else if (config.csi != nullptr) {
video_device_name = ESP_VIDEO_MIPI_CSI_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_DVP_VIDEO_DEVICE
else if (config.dvp != nullptr) {
video_device_name = ESP_VIDEO_DVP_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_HW_JPEG_VIDEO_DEVICE
else if (config.jpeg != nullptr) {
video_device_name = ESP_VIDEO_JPEG_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_SPI_VIDEO_DEVICE
else if (config.spi != nullptr) {
video_device_name = ESP_VIDEO_SPI_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_USB_UVC_VIDEO_DEVICE
else if (config.usb_uvc != nullptr) {
video_device_name = ESP_VIDEO_USB_UVC_DEVICE_NAME(0);
}
#endif
if (video_device_name == nullptr) {
ESP_LOGE(TAG, "no video device is enabled");
return;
}
video_fd_ = open(video_device_name, O_RDWR);
if (video_fd_ < 0) {
ESP_LOGE(TAG, "open %s failed, errno=%d(%s)", video_device_name, errno, strerror(errno));
#if CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
log_available_video_devices();
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
return;
}
struct v4l2_capability cap = {};
if (ioctl(video_fd_, VIDIOC_QUERYCAP, &cap) != 0) {
ESP_LOGE(TAG, "VIDIOC_QUERYCAP failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
return;
}
ESP_LOGD(
TAG,
"VIDIOC_QUERYCAP: driver=%s, card=%s, bus_info=%s, version=0x%08lx, capabilities=0x%08lx, device_caps=0x%08lx",
cap.driver, cap.card, cap.bus_info, cap.version, cap.capabilities, cap.device_caps);
struct v4l2_format format = {};
format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (ioctl(video_fd_, VIDIOC_G_FMT, &format) != 0) {
ESP_LOGE(TAG, "VIDIOC_G_FMT failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
return;
}
ESP_LOGD(TAG, "VIDIOC_G_FMT: pixelformat=0x%08lx, width=%ld, height=%ld", format.fmt.pix.pixelformat,
format.fmt.pix.width, format.fmt.pix.height);
CAM_PRINT_FOURCC(format.fmt.pix.pixelformat);
struct v4l2_format setformat = {};
setformat.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
sensor_width_ = format.fmt.pix.width;
sensor_height_ = format.fmt.pix.height;
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
setformat.fmt.pix.width = format.fmt.pix.width;
setformat.fmt.pix.height = format.fmt.pix.height;
struct v4l2_fmtdesc fmtdesc = {};
fmtdesc.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
fmtdesc.index = 0;
uint32_t best_fmt = 0;
int best_rank = 1 << 30; // large number
// 注: 当前版本 esp_video 中 YUV422P 实际输出为 YUYV。
#if defined(CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE) && defined(CONFIG_SOC_PPA_SUPPORTED)
auto get_rank = [](uint32_t fmt) -> int {
switch (fmt) {
case V4L2_PIX_FMT_RGB24:
return 0;
case V4L2_PIX_FMT_RGB565:
return 1;
#ifdef CONFIG_XIAOZHI_ENABLE_HARDWARE_JPEG_ENCODER
case V4L2_PIX_FMT_YUV420: // 软件 JPEG 编码器不支持 YUV420 格式
return 2;
#endif // CONFIG_XIAOZHI_ENABLE_HARDWARE_JPEG_ENCODER
case V4L2_PIX_FMT_GREY:
case V4L2_PIX_FMT_YUV422P:
default:
return 1 << 29; // unsupported
}
};
#else
auto get_rank = [](uint32_t fmt) -> int {
switch (fmt) {
case V4L2_PIX_FMT_YUV422P:
return 10;
case V4L2_PIX_FMT_RGB565:
return 11;
case V4L2_PIX_FMT_RGB24:
return 12;
#ifdef CONFIG_XIAOZHI_ENABLE_HARDWARE_JPEG_ENCODER
case V4L2_PIX_FMT_YUV420:
return 13;
#endif // CONFIG_XIAOZHI_ENABLE_HARDWARE_JPEG_ENCODER
#ifdef CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
case V4L2_PIX_FMT_JPEG:
return 5;
#endif // CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
case V4L2_PIX_FMT_GREY:
return 20;
default:
return 1 << 29; // unsupported
}
};
#endif
while (ioctl(video_fd_, VIDIOC_ENUM_FMT, &fmtdesc) == 0) {
ESP_LOGD(TAG, "VIDIOC_ENUM_FMT: pixelformat=0x%08lx, description=%s", fmtdesc.pixelformat, fmtdesc.description);
CAM_PRINT_FOURCC(fmtdesc.pixelformat);
int rank = get_rank(fmtdesc.pixelformat);
if (rank < best_rank) {
best_rank = rank;
best_fmt = fmtdesc.pixelformat;
}
fmtdesc.index++;
}
if (best_rank < (1 << 29)) {
setformat.fmt.pix.pixelformat = best_fmt;
sensor_format_ = best_fmt;
}
if (!setformat.fmt.pix.pixelformat) {
ESP_LOGE(TAG, "no supported pixel format found");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
ESP_LOGD(TAG, "selected pixel format: 0x%08lx", setformat.fmt.pix.pixelformat);
if (ioctl(video_fd_, VIDIOC_S_FMT, &setformat) != 0) {
ESP_LOGE(TAG, "VIDIOC_S_FMT failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
frame_.width = setformat.fmt.pix.height;
frame_.height = setformat.fmt.pix.width;
#else
frame_.width = setformat.fmt.pix.width;
frame_.height = setformat.fmt.pix.height;
#endif
// 申请缓冲并mmap
// [2026-04-20 修复] 原 DVP 单 buffer 导致 face_tracker 每次 DQBUF 拿到
// 的都是同一帧陈旧数据DMA 无 buffer 可写),模型输出"固定伪激活"。
// 改为双 buffer 让 DMA 始终有空 buffer 可写,保证每次 DQBUF 取到新帧。
struct v4l2_requestbuffers req = {};
req.count = 2;
req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
req.memory = V4L2_MEMORY_MMAP;
if (ioctl(video_fd_, VIDIOC_REQBUFS, &req) != 0) {
ESP_LOGE(TAG, "VIDIOC_REQBUFS failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
mmap_buffers_.resize(req.count);
for (uint32_t i = 0; i < req.count; i++) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
buf.index = i;
if (ioctl(video_fd_, VIDIOC_QUERYBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QUERYBUF failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
void* start = mmap(NULL, buf.length, PROT_READ | PROT_WRITE, MAP_SHARED, video_fd_, buf.m.offset);
if (start == MAP_FAILED) {
ESP_LOGE(TAG, "mmap failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
mmap_buffers_[i].start = start;
mmap_buffers_[i].length = buf.length;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
}
int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (ioctl(video_fd_, VIDIOC_STREAMON, &type) != 0) {
ESP_LOGE(TAG, "VIDIOC_STREAMON failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
#ifdef CONFIG_ESP_VIDEO_ENABLE_ISP_VIDEO_DEVICE
// 当启用 ISP 时ISP 需要一些照片来初始化参数因此开启后后台拍摄5s照片并丢弃
xTaskCreate(
[](void* arg) {
Esp32Camera* self = static_cast<Esp32Camera*>(arg);
uint16_t capture_count = 0;
TickType_t start = xTaskGetTickCount();
TickType_t duration = 5000 / portTICK_PERIOD_MS; // 5s
while ((xTaskGetTickCount() - start) < duration) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
if (ioctl(self->video_fd_, VIDIOC_DQBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_DQBUF failed during init");
vTaskDelay(10 / portTICK_PERIOD_MS);
continue;
}
if (ioctl(self->video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed during init");
}
capture_count++;
}
ESP_LOGI(TAG, "Camera init success, captured %d frames in %dms", capture_count,
(xTaskGetTickCount() - start) * portTICK_PERIOD_MS);
self->streaming_on_ = true;
vTaskDelete(NULL);
},
"CameraInitTask", 4096, this, 5, nullptr);
#else
ESP_LOGI(TAG, "Camera init success");
streaming_on_ = true;
#endif // CONFIG_ESP_VIDEO_ENABLE_ISP_VIDEO_DEVICE
}
Esp32Camera::~Esp32Camera() {
if (streaming_on_ && video_fd_ >= 0) {
int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
ioctl(video_fd_, VIDIOC_STREAMOFF, &type);
}
for (auto& b : mmap_buffers_) {
if (b.start && b.length) {
munmap(b.start, b.length);
}
}
if (video_fd_ >= 0) {
close(video_fd_);
video_fd_ = -1;
}
sensor_format_ = 0;
// [T04] 释放采集互斥锁
if (capture_mutex_ != nullptr) {
vSemaphoreDelete(capture_mutex_);
capture_mutex_ = nullptr;
}
esp_video_deinit();
}
void Esp32Camera::SetExplainUrl(const std::string& url, const std::string& token) {
explain_url_ = url;
explain_token_ = token;
}
// [T01] 最小化 V4L2 DQBUF/QBUF 探测
// 只做一次 VIDIOC_DQBUF + VIDIOC_QBUF不分配 PSRAM不做格式转换/编码
// 用途:验证 OV3660 + esp_video 底层采集链路(针对 xiaozhi issue #1588 定位)
bool Esp32Camera::ProbeFrameCapture(int64_t* elapsed_us) {
if (!streaming_on_ || video_fd_ < 0) {
ESP_LOGE(TAG, "[T01] Probe 失败streaming 未启动或 video_fd 无效");
return false;
}
int64_t t0 = esp_timer_get_time();
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
// 只做一次 DQBUF验证 V4L2 能获取帧
if (ioctl(video_fd_, VIDIOC_DQBUF, &buf) != 0) {
ESP_LOGE(TAG, "[T01] Probe 失败VIDIOC_DQBUF 返回错误 errno=%d", errno);
return false;
}
size_t bytes_used = buf.bytesused;
// 立即归还,避免占用缓冲
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "[T01] Probe 失败VIDIOC_QBUF 归还失败 errno=%d", errno);
return false;
}
int64_t t1 = esp_timer_get_time();
if (elapsed_us) *elapsed_us = t1 - t0;
// 修复ESP-IDF nano newlib printf 不完全支持 %lld改为 %lu + 强制 uint32
// probe 时间 < 100ms << uint32 上限(~71 分钟),完全安全
ESP_LOGI(TAG, "[T01] Probe 成功bytesused=%u elapsed=%luus",
(unsigned)bytes_used, (unsigned long)(t1 - t0));
return true;
}
// [T04] 人脸检测用帧采集10ms 超短 timeout 拿不到 mutex 即跳帧
// 语义MCP Capture() 可能耗时 500-3000msJPEG 编码+HTTPface_track 不能死等
// 人脸检测允许丢帧,拍照不允许丢
bool Esp32Camera::CaptureForDetection(FrameRef* out) {
if (!streaming_on_ || video_fd_ < 0 || !out || capture_mutex_ == nullptr) {
return false;
}
// 超短 timeout拿不到锁就让上层跳过这一帧
if (xSemaphoreTake(capture_mutex_, pdMS_TO_TICKS(10)) != pdTRUE) {
return false;
}
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
if (ioctl(video_fd_, VIDIOC_DQBUF, &buf) != 0) {
xSemaphoreGive(capture_mutex_);
return false;
}
out->data = (const uint8_t*)mmap_buffers_[buf.index].start;
out->len = buf.bytesused;
out->width = frame_.width;
out->height = frame_.height;
out->format = sensor_format_;
out->buf_index = buf.index;
// 注意:不在此处解锁!由 ReleaseDetectionFrame 配对解锁
return true;
}
// [T04] 归还人脸检测帧:配对 CaptureForDetection
// 内部执行 VIDIOC_QBUF 归还缓冲,并释放 capture_mutex_
bool Esp32Camera::ReleaseDetectionFrame(const FrameRef& ref) {
if (video_fd_ < 0) {
if (capture_mutex_ != nullptr) xSemaphoreGive(capture_mutex_);
return false;
}
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
buf.index = ref.buf_index;
int ret = ioctl(video_fd_, VIDIOC_QBUF, &buf);
if (capture_mutex_ != nullptr) xSemaphoreGive(capture_mutex_);
return ret == 0;
}
bool Esp32Camera::Capture() {
if (encoder_thread_.joinable()) {
encoder_thread_.join();
}
if (!streaming_on_ || video_fd_ < 0) {
return false;
}
// [T04] MCP 拍照用 portMAX_DELAY拍照不允许丢可以等 face_track 的一次推理完成
// 使用 RAII guard 确保函数任何 return 路径都释放锁
struct CaptureLockGuard {
SemaphoreHandle_t mtx;
explicit CaptureLockGuard(SemaphoreHandle_t m) : mtx(m) {
if (mtx) xSemaphoreTake(mtx, portMAX_DELAY);
}
~CaptureLockGuard() {
if (mtx) xSemaphoreGive(mtx);
}
} _cap_lock(capture_mutex_);
for (int i = 0; i < 3; i++) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
if (ioctl(video_fd_, VIDIOC_DQBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_DQBUF failed");
return false;
}
if (i == 2) {
// 保存帧副本到PSRAM
if (frame_.data) {
heap_caps_free(frame_.data);
frame_.data = nullptr;
frame_.format = 0;
}
frame_.len = buf.bytesused;
frame_.data = (uint8_t*)heap_caps_malloc(frame_.len, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (!frame_.data) {
ESP_LOGE(TAG, "alloc frame copy failed: need allocate %lu bytes", buf.bytesused);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
ESP_LOGW(TAG, "mmap_buffers_[buf.index].length = %d, sensor_width = %d, sensor_height = %d",
mmap_buffers_[buf.index].length, sensor_width_, sensor_height_);
#else
ESP_LOGW(TAG, "mmap_buffers_[buf.index].length = %d, frame.width = %d, frame.height = %d",
mmap_buffers_[buf.index].length, frame_.width, frame_.height);
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
ESP_LOG_BUFFER_HEXDUMP(TAG, mmap_buffers_[buf.index].start, MIN(mmap_buffers_[buf.index].length, 256),
ESP_LOG_DEBUG);
switch (sensor_format_) {
case V4L2_PIX_FMT_RGB565:
case V4L2_PIX_FMT_RGB24:
case V4L2_PIX_FMT_YUYV:
case V4L2_PIX_FMT_YUV420:
case V4L2_PIX_FMT_GREY:
#ifdef CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
case V4L2_PIX_FMT_JPEG:
#endif // CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
{
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t count = (size_t)mmap_buffers_[buf.index].length / 2;
for (size_t i = 0; i < count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
}
#else
memcpy(frame_.data, mmap_buffers_[buf.index].start,
MIN(mmap_buffers_[buf.index].length, frame_.len));
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
frame_.format = sensor_format_;
break;
case V4L2_PIX_FMT_YUV422P: {
// 这个格式是 422 YUYV不是 planer
frame_.format = V4L2_PIX_FMT_YUYV;
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
{
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t count = (size_t)mmap_buffers_[buf.index].length / 2;
for (size_t i = 0; i < count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
}
#else
memcpy(frame_.data, mmap_buffers_[buf.index].start,
MIN(mmap_buffers_[buf.index].length, frame_.len));
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
break;
}
case V4L2_PIX_FMT_RGB565X: {
// 大端序的 RGB565 需要转换为小端序
// 目前 esp_video 的大小端都会返回格式为 RGB565不会返回格式为 RGB565X此 case 用于未来版本兼容
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t pixel_count = (size_t)frame_.width * (size_t)frame_.height;
for (size_t i = 0; i < pixel_count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
frame_.format = V4L2_PIX_FMT_RGB565;
break;
}
default:
ESP_LOGE(TAG, "unsupported sensor format: 0x%08lx", sensor_format_);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
#ifndef CONFIG_SOC_PPA_SUPPORTED
uint8_t* rotate_dst =
(uint8_t*)heap_caps_aligned_alloc(64, frame_.len, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (rotate_dst == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for rotate image");
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
uint8_t* rotate_src = (uint8_t*)frame_.data;
esp_imgfx_rotate_cfg_t rotate_cfg = {
.in_res =
{
.width = static_cast<int16_t>(sensor_width_),
.height = static_cast<int16_t>(sensor_height_),
},
.degree = IMAGE_ROTATION_ANGLE,
};
switch (frame_.format) {
case V4L2_PIX_FMT_RGB565:
rotate_cfg.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB565_LE;
break;
case V4L2_PIX_FMT_YUYV:
rotate_cfg.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB565_LE;
break;
case V4L2_PIX_FMT_GREY:
rotate_cfg.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_Y;
break;
case V4L2_PIX_FMT_RGB24:
rotate_cfg.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB888;
break;
default:
ESP_LOGE(TAG, "unsupported sensor format: 0x%08lx", sensor_format_);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_rotate_handle_t rotate_handle = nullptr;
esp_imgfx_err_t imgfx_err = esp_imgfx_rotate_open(&rotate_cfg, &rotate_handle);
if (imgfx_err != ESP_IMGFX_ERR_OK || rotate_handle == nullptr) {
ESP_LOGE(TAG, "esp_imgfx_rotate_create failed");
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_data_t rotate_input_data = {
.data = rotate_src,
.data_len = frame_.len,
};
esp_imgfx_data_t rotate_output_data = {
.data = rotate_dst,
.data_len = frame_.len,
};
imgfx_err = esp_imgfx_rotate_process(rotate_handle, &rotate_input_data, &rotate_output_data);
if (imgfx_err != ESP_IMGFX_ERR_OK) {
ESP_LOGE(TAG, "esp_imgfx_rotate_process failed");
heap_caps_free(rotate_dst);
rotate_dst = nullptr;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
esp_imgfx_rotate_close(rotate_handle);
rotate_handle = nullptr;
return false;
}
frame_.data = rotate_dst;
heap_caps_free(rotate_src);
rotate_src = nullptr;
esp_imgfx_rotate_close(rotate_handle);
rotate_handle = nullptr;
#else // CONFIG_SOC_PPA_SUPPORTED
uint8_t* rotate_src = nullptr;
ppa_srm_color_mode_t ppa_color_mode;
switch (frame_.format) {
case V4L2_PIX_FMT_RGB565:
rotate_src = (uint8_t*)frame_.data;
ppa_color_mode = PPA_SRM_COLOR_MODE_RGB565;
break;
case V4L2_PIX_FMT_RGB24:
rotate_src = (uint8_t*)frame_.data;
ppa_color_mode = PPA_SRM_COLOR_MODE_RGB888;
break;
case V4L2_PIX_FMT_YUYV: {
ESP_LOGW(TAG, "YUYV format is not supported for PPA rotation, using software conversion to RGB888");
rotate_src = (uint8_t*)heap_caps_malloc(frame_.width * frame_.height * 3,
MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (rotate_src == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for rotate image");
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_color_convert_cfg_t convert_cfg = {
.in_res = {.width = static_cast<int16_t>(frame_.width),
.height = static_cast<int16_t>(frame_.height)},
.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_YUYV,
.out_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB888,
};
esp_imgfx_color_convert_handle_t convert_handle = nullptr;
esp_imgfx_err_t err = esp_imgfx_color_convert_open(&convert_cfg, &convert_handle);
if (err != ESP_IMGFX_ERR_OK || convert_handle == nullptr) {
ESP_LOGE(TAG, "esp_imgfx_color_convert_open failed");
heap_caps_free(rotate_src);
rotate_src = nullptr;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_data_t convert_input_data = {
.data = frame_.data,
.data_len = frame_.len,
};
esp_imgfx_data_t convert_output_data = {
.data = rotate_src,
.data_len = static_cast<uint32_t>(frame_.width * frame_.height * 3),
};
err = esp_imgfx_color_convert_process(convert_handle, &convert_input_data, &convert_output_data);
if (err != ESP_IMGFX_ERR_OK) {
ESP_LOGE(TAG, "esp_imgfx_color_convert_process failed");
heap_caps_free(rotate_src);
rotate_src = nullptr;
esp_imgfx_color_convert_close(convert_handle);
convert_handle = nullptr;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_color_convert_close(convert_handle);
convert_handle = nullptr;
ppa_color_mode = PPA_SRM_COLOR_MODE_RGB888;
heap_caps_free(frame_.data);
frame_.data = rotate_src;
frame_.len = frame_.width * frame_.height * 3;
break;
}
default:
ESP_LOGE(TAG, "unsupported sensor format for PPA rotation: 0x%08lx", sensor_format_);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
uint8_t* rotate_dst = (uint8_t*)heap_caps_malloc(
frame_.width * frame_.height * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT | MALLOC_CAP_CACHE_ALIGNED);
if (rotate_dst == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for rotate image");
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
ppa_client_handle_t ppa_client = nullptr;
ppa_client_config_t client_cfg = {
.oper_type = PPA_OPERATION_SRM,
.max_pending_trans_num = 1,
};
esp_err_t err = ppa_register_client(&client_cfg, &ppa_client);
if (err != ESP_OK || ppa_client == nullptr) {
ESP_LOGE(TAG, "ppa_register_client failed: %d", (int)err);
heap_caps_free(rotate_dst);
rotate_dst = nullptr;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
ppa_srm_rotation_angle_t ppa_angle = IMAGE_ROTATION_ANGLE;
ppa_srm_oper_config_t srm_cfg = {};
srm_cfg.in.buffer = (void*)rotate_src;
srm_cfg.in.pic_w = sensor_width_;
srm_cfg.in.pic_h = sensor_height_;
srm_cfg.in.block_w = sensor_width_;
srm_cfg.in.block_h = sensor_height_;
srm_cfg.in.block_offset_x = 0;
srm_cfg.in.block_offset_y = 0;
srm_cfg.in.srm_cm = ppa_color_mode;
srm_cfg.out.buffer = (void*)rotate_dst;
srm_cfg.out.buffer_size = frame_.len;
srm_cfg.out.pic_w = frame_.width;
srm_cfg.out.pic_h = frame_.height;
srm_cfg.out.block_offset_x = 0;
srm_cfg.out.block_offset_y = 0;
srm_cfg.out.srm_cm = PPA_SRM_COLOR_MODE_RGB565;
// 等比例缩放 1.0
srm_cfg.scale_x = 1.0f;
srm_cfg.scale_y = 1.0f;
srm_cfg.rotation_angle = ppa_angle;
srm_cfg.mode = PPA_TRANS_MODE_BLOCKING;
srm_cfg.user_data = nullptr;
err = ppa_do_scale_rotate_mirror(ppa_client, &srm_cfg);
if (err != ESP_OK) {
ESP_LOGE(TAG, "ppa_do_scale_rotate_mirror failed: %d", (int)err);
heap_caps_free(rotate_dst);
rotate_dst = nullptr;
(void)ppa_unregister_client(ppa_client);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
(void)ppa_unregister_client(ppa_client);
frame_.data = rotate_dst;
frame_.len = frame_.width * frame_.height * 2;
frame_.format = V4L2_PIX_FMT_RGB565;
heap_caps_free(rotate_src);
rotate_src = nullptr;
#endif // CONFIG_SOC_PPA_SUPPORTED
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
}
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed");
}
}
// 显示预览图片
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
if (display != nullptr) {
if (!frame_.data) {
ESP_LOGE(TAG, "frame.data is null");
return false;
}
uint16_t w = frame_.width;
uint16_t h = frame_.height;
size_t lvgl_image_size = frame_.len;
size_t stride = ((w * 2) + 3) & ~3; // 4字节对齐
lv_color_format_t color_format = LV_COLOR_FORMAT_RGB565;
uint8_t* data = nullptr;
switch (frame_.format) {
// LVGL 显示 YUV 系的图像似乎都有问题,暂时转换为 RGB565 显示
case V4L2_PIX_FMT_YUYV:
case V4L2_PIX_FMT_YUV420:
case V4L2_PIX_FMT_RGB24: {
color_format = LV_COLOR_FORMAT_RGB565;
data = (uint8_t*)heap_caps_malloc(w * h * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for preview image");
return false;
}
esp_imgfx_color_convert_cfg_t convert_cfg = {
.in_res = {.width = static_cast<int16_t>(frame_.width),
.height = static_cast<int16_t>(frame_.height)},
.in_pixel_fmt = static_cast<esp_imgfx_pixel_fmt_t>(frame_.format),
.out_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB565_LE,
.color_space_std = ESP_IMGFX_COLOR_SPACE_STD_BT601,
};
esp_imgfx_color_convert_handle_t convert_handle = nullptr;
esp_imgfx_err_t err = esp_imgfx_color_convert_open(&convert_cfg, &convert_handle);
if (err != ESP_IMGFX_ERR_OK || convert_handle == nullptr) {
ESP_LOGE(TAG, "esp_imgfx_color_convert_open failed");
heap_caps_free(data);
data = nullptr;
return false;
}
esp_imgfx_data_t convert_input_data = {
.data = frame_.data,
.data_len = frame_.len,
};
esp_imgfx_data_t convert_output_data = {
.data = data,
.data_len = static_cast<uint32_t>(w * h * 2),
};
err = esp_imgfx_color_convert_process(convert_handle, &convert_input_data, &convert_output_data);
if (err != ESP_IMGFX_ERR_OK) {
ESP_LOGE(TAG, "esp_imgfx_color_convert_process failed");
heap_caps_free(data);
data = nullptr;
esp_imgfx_color_convert_close(convert_handle);
convert_handle = nullptr;
return false;
}
esp_imgfx_color_convert_close(convert_handle);
convert_handle = nullptr;
lvgl_image_size = w * h * 2;
break;
}
case V4L2_PIX_FMT_RGB565:
data = (uint8_t*)heap_caps_malloc(w * h * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for preview image");
return false;
}
memcpy(data, frame_.data, frame_.len);
lvgl_image_size = frame_.len; // fallthrough 时兼顾 YUYV 与 RGB565
break;
#ifdef CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
case V4L2_PIX_FMT_JPEG: {
uint8_t* out_data = nullptr; // out data is allocated by jpeg_to_image
size_t out_len = 0;
size_t out_width = 0;
size_t out_height = 0;
size_t out_stride = 0;
esp_err_t ret =
jpeg_to_image(frame_.data, frame_.len, &out_data, &out_len, &out_width, &out_height, &out_stride);
if (ret != ESP_OK) {
ESP_LOGE(TAG, "Failed to decode JPEG image: %d (%s)", (int)ret, esp_err_to_name(ret));
if (out_data) {
heap_caps_free(out_data);
out_data = nullptr;
}
return false;
}
data = out_data;
w = out_width;
h = out_height;
lvgl_image_size = out_len;
stride = out_stride;
break;
}
#endif
default:
ESP_LOGE(TAG, "unsupported frame format: 0x%08lx", frame_.format);
return false;
}
auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
display->SetPreviewImage(std::move(image));
}
return true;
}
bool Esp32Camera::SetHMirror(bool enabled) {
if (video_fd_ < 0)
return false;
struct v4l2_ext_controls ctrls = {};
struct v4l2_ext_control ctrl = {};
ctrl.id = V4L2_CID_HFLIP;
ctrl.value = enabled ? 1 : 0;
ctrls.ctrl_class = V4L2_CTRL_CLASS_USER;
ctrls.count = 1;
ctrls.controls = &ctrl;
if (ioctl(video_fd_, VIDIOC_S_EXT_CTRLS, &ctrls) != 0) {
ESP_LOGE(TAG, "set HFLIP failed");
return false;
}
return true;
}
bool Esp32Camera::SetVFlip(bool enabled) {
if (video_fd_ < 0)
return false;
struct v4l2_ext_controls ctrls = {};
struct v4l2_ext_control ctrl = {};
ctrl.id = V4L2_CID_VFLIP;
ctrl.value = enabled ? 1 : 0;
ctrls.ctrl_class = V4L2_CTRL_CLASS_USER;
ctrls.count = 1;
ctrls.controls = &ctrl;
if (ioctl(video_fd_, VIDIOC_S_EXT_CTRLS, &ctrls) != 0) {
ESP_LOGE(TAG, "set VFLIP failed");
return false;
}
return true;
}
/**
* @brief 将摄像头捕获的图像发送到远程服务器进行AI分析和解释
*
* 该函数将当前摄像头缓冲区中的图像编码为JPEG格式并通过HTTP POST请求
* 以multipart/form-data的形式发送到指定的解释服务器。服务器将根据提供的
* 问题对图像进行AI分析并返回结果。
*
* 实现特点:
* - 使用独立线程编码JPEG与主线程分离
* - 采用分块传输编码(chunked transfer encoding)优化内存使用
* - 通过队列机制实现编码线程和发送线程的数据同步
* - 支持设备ID、客户端ID和认证令牌的HTTP头部配置
*
* @param question 要向AI提出的关于图像的问题将作为表单字段发送
* @return std::string 服务器返回的JSON格式响应字符串
* 成功时包含AI分析结果失败时包含错误信息
* 格式示例:{"success": true, "result": "分析结果"}
* {"success": false, "message": "错误信息"}
*
* @note 调用此函数前必须先调用SetExplainUrl()设置服务器URL
* @note 函数会等待之前的编码线程完成后再开始新的处理
* @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息
*/
std::string Esp32Camera::Explain(const std::string& question) {
if (explain_url_.empty()) {
throw std::runtime_error("Image explain URL or token is not set");
}
// 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data
QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
if (jpeg_queue == nullptr) {
ESP_LOGE(TAG, "Failed to create JPEG queue");
throw std::runtime_error("Failed to create JPEG queue");
}
// We spawn a thread to encode the image to JPEG using optimized encoder (cost about 500ms and 8KB SRAM)
encoder_thread_ = std::thread([this, jpeg_queue]() {
uint16_t w = frame_.width ? frame_.width : 320;
uint16_t h = frame_.height ? frame_.height : 240;
v4l2_pix_fmt_t enc_fmt = frame_.format;
bool ok = image_to_jpeg_cb(
frame_.data, frame_.len, w, h, enc_fmt, 80,
[](void* arg, size_t index, const void* data, size_t len) -> size_t {
auto jpeg_queue = static_cast<QueueHandle_t>(arg);
JpegChunk chunk = {.data = nullptr, .len = len};
if (index == 0 && data != nullptr && len > 0) {
chunk.data = (uint8_t*)heap_caps_aligned_alloc(16, len, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (chunk.data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate %zu bytes for JPEG chunk", len);
chunk.len = 0;
} else {
memcpy(chunk.data, data, len);
}
} else {
chunk.len = 0; // Sentinel or error
}
xQueueSend(jpeg_queue, &chunk, portMAX_DELAY);
return len;
},
jpeg_queue);
if (!ok) {
JpegChunk chunk = {.data = nullptr, .len = 0};
xQueueSend(jpeg_queue, &chunk, portMAX_DELAY);
}
});
auto network = Board::GetInstance().GetNetwork();
auto http = network->CreateHttp(3);
// 构造multipart/form-data请求体
std::string boundary = "----ESP32_CAMERA_BOUNDARY";
// 配置HTTP客户端使用分块传输编码
http->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
http->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
if (!explain_token_.empty()) {
http->SetHeader("Authorization", "Bearer " + explain_token_);
}
http->SetHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
http->SetHeader("Transfer-Encoding", "chunked");
if (!http->Open("POST", explain_url_)) {
ESP_LOGE(TAG, "Failed to connect to explain URL");
// Clear the queue
encoder_thread_.join();
JpegChunk chunk;
while (xQueueReceive(jpeg_queue, &chunk, portMAX_DELAY) == pdPASS) {
if (chunk.data != nullptr) {
heap_caps_free(chunk.data);
} else {
break;
}
}
vQueueDelete(jpeg_queue);
throw std::runtime_error("Failed to connect to explain URL");
}
{
// 第一块question字段
std::string question_field;
question_field += "--" + boundary + "\r\n";
question_field += "Content-Disposition: form-data; name=\"question\"\r\n";
question_field += "\r\n";
question_field += question + "\r\n";
http->Write(question_field.c_str(), question_field.size());
}
{
// 第二块:文件字段头部
std::string file_header;
file_header += "--" + boundary + "\r\n";
file_header += "Content-Disposition: form-data; name=\"file\"; filename=\"camera.jpg\"\r\n";
file_header += "Content-Type: image/jpeg\r\n";
file_header += "\r\n";
http->Write(file_header.c_str(), file_header.size());
}
// 第三块JPEG数据
size_t total_sent = 0;
bool saw_terminator = false;
while (true) {
JpegChunk chunk;
if (xQueueReceive(jpeg_queue, &chunk, portMAX_DELAY) != pdPASS) {
ESP_LOGE(TAG, "Failed to receive JPEG chunk");
break;
}
if (chunk.data == nullptr) {
saw_terminator = true;
break; // The last chunk
}
http->Write((const char*)chunk.data, chunk.len);
total_sent += chunk.len;
heap_caps_free(chunk.data);
}
// Wait for the encoder thread to finish
encoder_thread_.join();
// 清理队列
vQueueDelete(jpeg_queue);
if (!saw_terminator || total_sent == 0) {
ESP_LOGE(TAG, "JPEG encoder failed or produced empty output");
throw std::runtime_error("Failed to encode image to JPEG");
}
{
// 第四块multipart尾部
std::string multipart_footer;
multipart_footer += "\r\n--" + boundary + "--\r\n";
http->Write(multipart_footer.c_str(), multipart_footer.size());
}
// 结束块
http->Write("", 0);
if (http->GetStatusCode() != 200) {
ESP_LOGE(TAG, "Failed to upload photo, status code: %d", http->GetStatusCode());
throw std::runtime_error("Failed to upload photo");
}
std::string result = http->ReadAll();
http->Close();
// Get remain task stack size
size_t remain_stack_size = uxTaskGetStackHighWaterMark(nullptr);
ESP_LOGI(TAG, "Explain image size=%d bytes, compressed size=%d, remain stack size=%d, question=%s\n%s",
(int)frame_.len, (int)total_sent, (int)remain_stack_size, question.c_str(), result.c_str());
return result;
}