Rdzleo e95d0c414e Phase 01 批次 1-3: 单摄像头人脸追踪基础设施
实现 ESP32-S3 上单摄像头人脸追踪的核心代码骨架,替代 Grove Vision AI V2
模块,通过 UART 发送人脸坐标驱动 RP2040 控制的眼球/YAW 舵机。

## 规划文档(docs/phase-01-face-tracking/)

- GOAL.md       Phase 目标与 5 大成功标准
- RESEARCH.md   esp-dl v3.2/3.3 + human_face_detect 0.4.1 技术调研
- PLAN.md       15 个原子任务的执行计划(T01-T15)
- PLAN_CHECK.md 计划审查报告(PASS_WITH_NOTES)
- PROGRESS.md   执行进度追踪(批次 1-3 已完成)

## 批次 1:依赖与开关(T01-T03)

- main/idf_component.yml
  新增 esp-dl ~3.3.0 + human_face_detect 0.4.1(仅 S3/P4)
  esp-sr 从 ~2.2.0 升级到 ~2.3.1,解决 esp-dsp 1.6/1.7 版本冲突
- main/Kconfig.projbuild
  新增 CONFIG_XIAOZHI_ENABLE_FACE_TRACKING 开关(默认 y,depends on S3)
  新增 CONFIG_XIAOZHI_FACE_TRACKING_FPS_CHOICE(5/10/15)
- main/boards/common/esp32_camera.{h,cc}
  新增 ProbeFrameCapture() 最小 V4L2 DQBUF/QBUF 探针(T01)
- main/application.cc
  Start() 末尾调用 probe 验证摄像头硬件链路

## 批次 2:人脸检测核心(T04-T06)

- main/boards/common/esp32_camera.{h,cc}
  新增 FrameRef 结构体 + CaptureForDetection/ReleaseDetectionFrame
  双超时 mutex 策略:face_tracker 10ms timeout 跳帧,Capture() RAII guard
- main/face_tracker.{h,cc}(新建)
  Core 0 / 优先级 2 / 栈 8KB 独立任务
  集成 esp-dl HumanFaceDetect 推理
  坐标归一化 cx*224/W-112,匹配 RP2040 pixel_centre=112
  多人脸遍历挑 score 最高,避免多脸时眼球摇摆
  三重保护:Kconfig depends on S3 + 源文件 #if 守卫 + CMake 条件排除
- main/CMakeLists.txt
  非 S3 目标从 SOURCES 移除 face_tracker.cc

## 批次 3:UART 协议扩展(T07)

- main/uart_component.{h,cc}
  新增 uart_send_face(x,y) 发送 face:x,y\r\n 协议
  extern "C" 链接名配合 face_tracker 的弱符号声明
  全局 TX mutex 保护所有 UART 写入,防并发帧交织
  uart_send_string 同步加锁保持一致性

## 编译验证

idf.py build 通过,固件 2.51MB / 剩余 1.46MB (36% free)
当前 face_tracker 未被 application 激活(留到 T11),
UART/摄像头现有功能零影响。

## 未完成(下次继续)

- T01 硬件 probe 实机验证
- T08-T10 RP2040 端 parse_face + facetrack 双数据源改造
- T11-T15 application 接入 + 端到端联调 + 性能调优 + 最终验收

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-17 18:24:27 +08:00

1139 lines
44 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <unistd.h>
#include <errno.h>
#include <esp_heap_caps.h>
#include <cstdio>
#include <cstring>
#include "esp_imgfx_color_convert.h"
#include "esp_video_device.h"
#include "esp_video_init.h"
#include "linux/videodev2.h"
#include "board.h"
#include "display.h"
#include "esp32_camera.h"
#include "esp_jpeg_common.h"
#include "jpg/image_to_jpeg.h"
#include "jpg/jpeg_to_image.h"
#include "lvgl_display.h"
#include "mcp_server.h"
#include "system_info.h"
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#undef LOG_LOCAL_LEVEL
#define LOG_LOCAL_LEVEL MAX(CONFIG_LOG_DEFAULT_LEVEL, ESP_LOG_DEBUG)
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#include <esp_log.h> // should be after LOCAL_LOG_LEVEL definition
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
#ifdef CONFIG_IDF_TARGET_ESP32P4
#include "driver/ppa.h"
#if defined(CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE_90)
#define IMAGE_ROTATION_ANGLE (PPA_SRM_ROTATION_ANGLE_270)
#elif defined(CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE_270)
#define IMAGE_ROTATION_ANGLE (PPA_SRM_ROTATION_ANGLE_90)
#else
#error "CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE is not set"
#endif // angle
#else // target
#include "esp_imgfx_rotate.h"
#if defined(CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE_90)
#define IMAGE_ROTATION_ANGLE (90)
#elif defined(CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE_270)
#define IMAGE_ROTATION_ANGLE (270)
#else
#error "CONFIG_XIAOZHI_CAMERA_IMAGE_ROTATION_ANGLE is not set"
#endif // angle
#endif // target
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
#define TAG "Esp32Camera"
#if defined(CONFIG_CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER) || defined(CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP)
#warning \
"CAMERA_SENSOR_SWAP_PIXEL_BYTE_ORDER or CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP is enabled, which may cause image corruption in YUV422 format!"
#endif
#if CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
#define CAM_PRINT_FOURCC(pixelformat) \
char fourcc[5]; \
fourcc[0] = pixelformat & 0xFF; \
fourcc[1] = (pixelformat >> 8) & 0xFF; \
fourcc[2] = (pixelformat >> 16) & 0xFF; \
fourcc[3] = (pixelformat >> 24) & 0xFF; \
fourcc[4] = '\0'; \
ESP_LOGD(TAG, "FOURCC: '%c%c%c%c'", fourcc[0], fourcc[1], fourcc[2], fourcc[3]);
// for compatibility with old esp_video version
#ifndef MAP_FAILED
#define MAP_FAILED nullptr
#endif
__attribute__((weak)) esp_err_t esp_video_deinit(void) {
return ESP_ERR_NOT_SUPPORTED;
}
// end of for compatibility with old esp_video version
static void log_available_video_devices() {
for (int i = 0; i < 50; i++) {
char path[16];
snprintf(path, sizeof(path), "/dev/video%d", i);
int fd = open(path, O_RDONLY);
if (fd >= 0) {
ESP_LOGD(TAG, "found video device: %s", path);
close(fd);
}
}
}
#else
#define CAM_PRINT_FOURCC(pixelformat) (void)0;
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
Esp32Camera::Esp32Camera(const esp_video_init_config_t& config) {
// [T04] 创建采集互斥锁face_track 用 10ms timeoutMCP 拍照用 portMAX_DELAY
capture_mutex_ = xSemaphoreCreateMutex();
if (capture_mutex_ == nullptr) {
ESP_LOGE(TAG, "xSemaphoreCreateMutex failed");
return;
}
if (esp_video_init(&config) != ESP_OK) {
ESP_LOGE(TAG, "esp_video_init failed");
return;
}
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
esp_log_level_set(TAG, ESP_LOG_DEBUG);
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
const char* video_device_name = nullptr;
if (false) { /* 用于构建 else if */
}
#if CONFIG_ESP_VIDEO_ENABLE_MIPI_CSI_VIDEO_DEVICE
else if (config.csi != nullptr) {
video_device_name = ESP_VIDEO_MIPI_CSI_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_DVP_VIDEO_DEVICE
else if (config.dvp != nullptr) {
video_device_name = ESP_VIDEO_DVP_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_HW_JPEG_VIDEO_DEVICE
else if (config.jpeg != nullptr) {
video_device_name = ESP_VIDEO_JPEG_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_SPI_VIDEO_DEVICE
else if (config.spi != nullptr) {
video_device_name = ESP_VIDEO_SPI_DEVICE_NAME;
}
#endif
#if CONFIG_ESP_VIDEO_ENABLE_USB_UVC_VIDEO_DEVICE
else if (config.usb_uvc != nullptr) {
video_device_name = ESP_VIDEO_USB_UVC_DEVICE_NAME(0);
}
#endif
if (video_device_name == nullptr) {
ESP_LOGE(TAG, "no video device is enabled");
return;
}
video_fd_ = open(video_device_name, O_RDWR);
if (video_fd_ < 0) {
ESP_LOGE(TAG, "open %s failed, errno=%d(%s)", video_device_name, errno, strerror(errno));
#if CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
log_available_video_devices();
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_DEBUG_MODE
return;
}
struct v4l2_capability cap = {};
if (ioctl(video_fd_, VIDIOC_QUERYCAP, &cap) != 0) {
ESP_LOGE(TAG, "VIDIOC_QUERYCAP failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
return;
}
ESP_LOGD(
TAG,
"VIDIOC_QUERYCAP: driver=%s, card=%s, bus_info=%s, version=0x%08lx, capabilities=0x%08lx, device_caps=0x%08lx",
cap.driver, cap.card, cap.bus_info, cap.version, cap.capabilities, cap.device_caps);
struct v4l2_format format = {};
format.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (ioctl(video_fd_, VIDIOC_G_FMT, &format) != 0) {
ESP_LOGE(TAG, "VIDIOC_G_FMT failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
return;
}
ESP_LOGD(TAG, "VIDIOC_G_FMT: pixelformat=0x%08lx, width=%ld, height=%ld", format.fmt.pix.pixelformat,
format.fmt.pix.width, format.fmt.pix.height);
CAM_PRINT_FOURCC(format.fmt.pix.pixelformat);
struct v4l2_format setformat = {};
setformat.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
sensor_width_ = format.fmt.pix.width;
sensor_height_ = format.fmt.pix.height;
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
setformat.fmt.pix.width = format.fmt.pix.width;
setformat.fmt.pix.height = format.fmt.pix.height;
struct v4l2_fmtdesc fmtdesc = {};
fmtdesc.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
fmtdesc.index = 0;
uint32_t best_fmt = 0;
int best_rank = 1 << 30; // large number
// 注: 当前版本 esp_video 中 YUV422P 实际输出为 YUYV。
#if defined(CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE) && defined(CONFIG_SOC_PPA_SUPPORTED)
auto get_rank = [](uint32_t fmt) -> int {
switch (fmt) {
case V4L2_PIX_FMT_RGB24:
return 0;
case V4L2_PIX_FMT_RGB565:
return 1;
#ifdef CONFIG_XIAOZHI_ENABLE_HARDWARE_JPEG_ENCODER
case V4L2_PIX_FMT_YUV420: // 软件 JPEG 编码器不支持 YUV420 格式
return 2;
#endif // CONFIG_XIAOZHI_ENABLE_HARDWARE_JPEG_ENCODER
case V4L2_PIX_FMT_GREY:
case V4L2_PIX_FMT_YUV422P:
default:
return 1 << 29; // unsupported
}
};
#else
auto get_rank = [](uint32_t fmt) -> int {
switch (fmt) {
case V4L2_PIX_FMT_YUV422P:
return 10;
case V4L2_PIX_FMT_RGB565:
return 11;
case V4L2_PIX_FMT_RGB24:
return 12;
#ifdef CONFIG_XIAOZHI_ENABLE_HARDWARE_JPEG_ENCODER
case V4L2_PIX_FMT_YUV420:
return 13;
#endif // CONFIG_XIAOZHI_ENABLE_HARDWARE_JPEG_ENCODER
#ifdef CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
case V4L2_PIX_FMT_JPEG:
return 5;
#endif // CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
case V4L2_PIX_FMT_GREY:
return 20;
default:
return 1 << 29; // unsupported
}
};
#endif
while (ioctl(video_fd_, VIDIOC_ENUM_FMT, &fmtdesc) == 0) {
ESP_LOGD(TAG, "VIDIOC_ENUM_FMT: pixelformat=0x%08lx, description=%s", fmtdesc.pixelformat, fmtdesc.description);
CAM_PRINT_FOURCC(fmtdesc.pixelformat);
int rank = get_rank(fmtdesc.pixelformat);
if (rank < best_rank) {
best_rank = rank;
best_fmt = fmtdesc.pixelformat;
}
fmtdesc.index++;
}
if (best_rank < (1 << 29)) {
setformat.fmt.pix.pixelformat = best_fmt;
sensor_format_ = best_fmt;
}
if (!setformat.fmt.pix.pixelformat) {
ESP_LOGE(TAG, "no supported pixel format found");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
ESP_LOGD(TAG, "selected pixel format: 0x%08lx", setformat.fmt.pix.pixelformat);
if (ioctl(video_fd_, VIDIOC_S_FMT, &setformat) != 0) {
ESP_LOGE(TAG, "VIDIOC_S_FMT failed, errno=%d(%s)", errno, strerror(errno));
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
frame_.width = setformat.fmt.pix.height;
frame_.height = setformat.fmt.pix.width;
#else
frame_.width = setformat.fmt.pix.width;
frame_.height = setformat.fmt.pix.height;
#endif
// 申请缓冲并mmap
struct v4l2_requestbuffers req = {};
req.count = strcmp(video_device_name, ESP_VIDEO_MIPI_CSI_DEVICE_NAME) == 0 ? 2 : 1;
req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
req.memory = V4L2_MEMORY_MMAP;
if (ioctl(video_fd_, VIDIOC_REQBUFS, &req) != 0) {
ESP_LOGE(TAG, "VIDIOC_REQBUFS failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
mmap_buffers_.resize(req.count);
for (uint32_t i = 0; i < req.count; i++) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
buf.index = i;
if (ioctl(video_fd_, VIDIOC_QUERYBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QUERYBUF failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
void* start = mmap(NULL, buf.length, PROT_READ | PROT_WRITE, MAP_SHARED, video_fd_, buf.m.offset);
if (start == MAP_FAILED) {
ESP_LOGE(TAG, "mmap failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
mmap_buffers_[i].start = start;
mmap_buffers_[i].length = buf.length;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
}
int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
if (ioctl(video_fd_, VIDIOC_STREAMON, &type) != 0) {
ESP_LOGE(TAG, "VIDIOC_STREAMON failed");
close(video_fd_);
video_fd_ = -1;
sensor_format_ = 0;
return;
}
#ifdef CONFIG_ESP_VIDEO_ENABLE_ISP_VIDEO_DEVICE
// 当启用 ISP 时ISP 需要一些照片来初始化参数因此开启后后台拍摄5s照片并丢弃
xTaskCreate(
[](void* arg) {
Esp32Camera* self = static_cast<Esp32Camera*>(arg);
uint16_t capture_count = 0;
TickType_t start = xTaskGetTickCount();
TickType_t duration = 5000 / portTICK_PERIOD_MS; // 5s
while ((xTaskGetTickCount() - start) < duration) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
if (ioctl(self->video_fd_, VIDIOC_DQBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_DQBUF failed during init");
vTaskDelay(10 / portTICK_PERIOD_MS);
continue;
}
if (ioctl(self->video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed during init");
}
capture_count++;
}
ESP_LOGI(TAG, "Camera init success, captured %d frames in %dms", capture_count,
(xTaskGetTickCount() - start) * portTICK_PERIOD_MS);
self->streaming_on_ = true;
vTaskDelete(NULL);
},
"CameraInitTask", 4096, this, 5, nullptr);
#else
ESP_LOGI(TAG, "Camera init success");
streaming_on_ = true;
#endif // CONFIG_ESP_VIDEO_ENABLE_ISP_VIDEO_DEVICE
}
Esp32Camera::~Esp32Camera() {
if (streaming_on_ && video_fd_ >= 0) {
int type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
ioctl(video_fd_, VIDIOC_STREAMOFF, &type);
}
for (auto& b : mmap_buffers_) {
if (b.start && b.length) {
munmap(b.start, b.length);
}
}
if (video_fd_ >= 0) {
close(video_fd_);
video_fd_ = -1;
}
sensor_format_ = 0;
// [T04] 释放采集互斥锁
if (capture_mutex_ != nullptr) {
vSemaphoreDelete(capture_mutex_);
capture_mutex_ = nullptr;
}
esp_video_deinit();
}
void Esp32Camera::SetExplainUrl(const std::string& url, const std::string& token) {
explain_url_ = url;
explain_token_ = token;
}
// [T01] 最小化 V4L2 DQBUF/QBUF 探测
// 只做一次 VIDIOC_DQBUF + VIDIOC_QBUF不分配 PSRAM不做格式转换/编码
// 用途:验证 OV3660 + esp_video 底层采集链路(针对 xiaozhi issue #1588 定位)
bool Esp32Camera::ProbeFrameCapture(int64_t* elapsed_us) {
if (!streaming_on_ || video_fd_ < 0) {
ESP_LOGE(TAG, "[T01] Probe 失败streaming 未启动或 video_fd 无效");
return false;
}
int64_t t0 = esp_timer_get_time();
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
// 只做一次 DQBUF验证 V4L2 能获取帧
if (ioctl(video_fd_, VIDIOC_DQBUF, &buf) != 0) {
ESP_LOGE(TAG, "[T01] Probe 失败VIDIOC_DQBUF 返回错误 errno=%d", errno);
return false;
}
size_t bytes_used = buf.bytesused;
// 立即归还,避免占用缓冲
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "[T01] Probe 失败VIDIOC_QBUF 归还失败 errno=%d", errno);
return false;
}
int64_t t1 = esp_timer_get_time();
if (elapsed_us) *elapsed_us = t1 - t0;
ESP_LOGI(TAG, "[T01] Probe 成功bytesused=%u elapsed=%lldus",
(unsigned)bytes_used, (long long)(t1 - t0));
return true;
}
// [T04] 人脸检测用帧采集10ms 超短 timeout 拿不到 mutex 即跳帧
// 语义MCP Capture() 可能耗时 500-3000msJPEG 编码+HTTPface_track 不能死等
// 人脸检测允许丢帧,拍照不允许丢
bool Esp32Camera::CaptureForDetection(FrameRef* out) {
if (!streaming_on_ || video_fd_ < 0 || !out || capture_mutex_ == nullptr) {
return false;
}
// 超短 timeout拿不到锁就让上层跳过这一帧
if (xSemaphoreTake(capture_mutex_, pdMS_TO_TICKS(10)) != pdTRUE) {
return false;
}
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
if (ioctl(video_fd_, VIDIOC_DQBUF, &buf) != 0) {
xSemaphoreGive(capture_mutex_);
return false;
}
out->data = (const uint8_t*)mmap_buffers_[buf.index].start;
out->len = buf.bytesused;
out->width = frame_.width;
out->height = frame_.height;
out->format = sensor_format_;
out->buf_index = buf.index;
// 注意:不在此处解锁!由 ReleaseDetectionFrame 配对解锁
return true;
}
// [T04] 归还人脸检测帧:配对 CaptureForDetection
// 内部执行 VIDIOC_QBUF 归还缓冲,并释放 capture_mutex_
bool Esp32Camera::ReleaseDetectionFrame(const FrameRef& ref) {
if (video_fd_ < 0) {
if (capture_mutex_ != nullptr) xSemaphoreGive(capture_mutex_);
return false;
}
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
buf.index = ref.buf_index;
int ret = ioctl(video_fd_, VIDIOC_QBUF, &buf);
if (capture_mutex_ != nullptr) xSemaphoreGive(capture_mutex_);
return ret == 0;
}
bool Esp32Camera::Capture() {
if (encoder_thread_.joinable()) {
encoder_thread_.join();
}
if (!streaming_on_ || video_fd_ < 0) {
return false;
}
// [T04] MCP 拍照用 portMAX_DELAY拍照不允许丢可以等 face_track 的一次推理完成
// 使用 RAII guard 确保函数任何 return 路径都释放锁
struct CaptureLockGuard {
SemaphoreHandle_t mtx;
explicit CaptureLockGuard(SemaphoreHandle_t m) : mtx(m) {
if (mtx) xSemaphoreTake(mtx, portMAX_DELAY);
}
~CaptureLockGuard() {
if (mtx) xSemaphoreGive(mtx);
}
} _cap_lock(capture_mutex_);
for (int i = 0; i < 3; i++) {
struct v4l2_buffer buf = {};
buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
buf.memory = V4L2_MEMORY_MMAP;
if (ioctl(video_fd_, VIDIOC_DQBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_DQBUF failed");
return false;
}
if (i == 2) {
// 保存帧副本到PSRAM
if (frame_.data) {
heap_caps_free(frame_.data);
frame_.data = nullptr;
frame_.format = 0;
}
frame_.len = buf.bytesused;
frame_.data = (uint8_t*)heap_caps_malloc(frame_.len, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (!frame_.data) {
ESP_LOGE(TAG, "alloc frame copy failed: need allocate %lu bytes", buf.bytesused);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
ESP_LOGW(TAG, "mmap_buffers_[buf.index].length = %d, sensor_width = %d, sensor_height = %d",
mmap_buffers_[buf.index].length, sensor_width_, sensor_height_);
#else
ESP_LOGW(TAG, "mmap_buffers_[buf.index].length = %d, frame.width = %d, frame.height = %d",
mmap_buffers_[buf.index].length, frame_.width, frame_.height);
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
ESP_LOG_BUFFER_HEXDUMP(TAG, mmap_buffers_[buf.index].start, MIN(mmap_buffers_[buf.index].length, 256),
ESP_LOG_DEBUG);
switch (sensor_format_) {
case V4L2_PIX_FMT_RGB565:
case V4L2_PIX_FMT_RGB24:
case V4L2_PIX_FMT_YUYV:
case V4L2_PIX_FMT_YUV420:
case V4L2_PIX_FMT_GREY:
#ifdef CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
case V4L2_PIX_FMT_JPEG:
#endif // CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
{
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t count = (size_t)mmap_buffers_[buf.index].length / 2;
for (size_t i = 0; i < count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
}
#else
memcpy(frame_.data, mmap_buffers_[buf.index].start,
MIN(mmap_buffers_[buf.index].length, frame_.len));
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
frame_.format = sensor_format_;
break;
case V4L2_PIX_FMT_YUV422P: {
// 这个格式是 422 YUYV不是 planer
frame_.format = V4L2_PIX_FMT_YUYV;
#ifdef CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
{
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t count = (size_t)mmap_buffers_[buf.index].length / 2;
for (size_t i = 0; i < count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
}
#else
memcpy(frame_.data, mmap_buffers_[buf.index].start,
MIN(mmap_buffers_[buf.index].length, frame_.len));
#endif // CONFIG_XIAOZHI_ENABLE_CAMERA_ENDIANNESS_SWAP
break;
}
case V4L2_PIX_FMT_RGB565X: {
// 大端序的 RGB565 需要转换为小端序
// 目前 esp_video 的大小端都会返回格式为 RGB565不会返回格式为 RGB565X此 case 用于未来版本兼容
auto src16 = (uint16_t*)mmap_buffers_[buf.index].start;
auto dst16 = (uint16_t*)frame_.data;
size_t pixel_count = (size_t)frame_.width * (size_t)frame_.height;
for (size_t i = 0; i < pixel_count; i++) {
dst16[i] = __builtin_bswap16(src16[i]);
}
frame_.format = V4L2_PIX_FMT_RGB565;
break;
}
default:
ESP_LOGE(TAG, "unsupported sensor format: 0x%08lx", sensor_format_);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
#ifdef CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
#ifndef CONFIG_SOC_PPA_SUPPORTED
uint8_t* rotate_dst =
(uint8_t*)heap_caps_aligned_alloc(64, frame_.len, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (rotate_dst == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for rotate image");
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
uint8_t* rotate_src = (uint8_t*)frame_.data;
esp_imgfx_rotate_cfg_t rotate_cfg = {
.in_res =
{
.width = static_cast<int16_t>(sensor_width_),
.height = static_cast<int16_t>(sensor_height_),
},
.degree = IMAGE_ROTATION_ANGLE,
};
switch (frame_.format) {
case V4L2_PIX_FMT_RGB565:
rotate_cfg.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB565_LE;
break;
case V4L2_PIX_FMT_YUYV:
rotate_cfg.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB565_LE;
break;
case V4L2_PIX_FMT_GREY:
rotate_cfg.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_Y;
break;
case V4L2_PIX_FMT_RGB24:
rotate_cfg.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB888;
break;
default:
ESP_LOGE(TAG, "unsupported sensor format: 0x%08lx", sensor_format_);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_rotate_handle_t rotate_handle = nullptr;
esp_imgfx_err_t imgfx_err = esp_imgfx_rotate_open(&rotate_cfg, &rotate_handle);
if (imgfx_err != ESP_IMGFX_ERR_OK || rotate_handle == nullptr) {
ESP_LOGE(TAG, "esp_imgfx_rotate_create failed");
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_data_t rotate_input_data = {
.data = rotate_src,
.data_len = frame_.len,
};
esp_imgfx_data_t rotate_output_data = {
.data = rotate_dst,
.data_len = frame_.len,
};
imgfx_err = esp_imgfx_rotate_process(rotate_handle, &rotate_input_data, &rotate_output_data);
if (imgfx_err != ESP_IMGFX_ERR_OK) {
ESP_LOGE(TAG, "esp_imgfx_rotate_process failed");
heap_caps_free(rotate_dst);
rotate_dst = nullptr;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
esp_imgfx_rotate_close(rotate_handle);
rotate_handle = nullptr;
return false;
}
frame_.data = rotate_dst;
heap_caps_free(rotate_src);
rotate_src = nullptr;
esp_imgfx_rotate_close(rotate_handle);
rotate_handle = nullptr;
#else // CONFIG_SOC_PPA_SUPPORTED
uint8_t* rotate_src = nullptr;
ppa_srm_color_mode_t ppa_color_mode;
switch (frame_.format) {
case V4L2_PIX_FMT_RGB565:
rotate_src = (uint8_t*)frame_.data;
ppa_color_mode = PPA_SRM_COLOR_MODE_RGB565;
break;
case V4L2_PIX_FMT_RGB24:
rotate_src = (uint8_t*)frame_.data;
ppa_color_mode = PPA_SRM_COLOR_MODE_RGB888;
break;
case V4L2_PIX_FMT_YUYV: {
ESP_LOGW(TAG, "YUYV format is not supported for PPA rotation, using software conversion to RGB888");
rotate_src = (uint8_t*)heap_caps_malloc(frame_.width * frame_.height * 3,
MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (rotate_src == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for rotate image");
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_color_convert_cfg_t convert_cfg = {
.in_res = {.width = static_cast<int16_t>(frame_.width),
.height = static_cast<int16_t>(frame_.height)},
.in_pixel_fmt = ESP_IMGFX_PIXEL_FMT_YUYV,
.out_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB888,
};
esp_imgfx_color_convert_handle_t convert_handle = nullptr;
esp_imgfx_err_t err = esp_imgfx_color_convert_open(&convert_cfg, &convert_handle);
if (err != ESP_IMGFX_ERR_OK || convert_handle == nullptr) {
ESP_LOGE(TAG, "esp_imgfx_color_convert_open failed");
heap_caps_free(rotate_src);
rotate_src = nullptr;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_data_t convert_input_data = {
.data = frame_.data,
.data_len = frame_.len,
};
esp_imgfx_data_t convert_output_data = {
.data = rotate_src,
.data_len = static_cast<uint32_t>(frame_.width * frame_.height * 3),
};
err = esp_imgfx_color_convert_process(convert_handle, &convert_input_data, &convert_output_data);
if (err != ESP_IMGFX_ERR_OK) {
ESP_LOGE(TAG, "esp_imgfx_color_convert_process failed");
heap_caps_free(rotate_src);
rotate_src = nullptr;
esp_imgfx_color_convert_close(convert_handle);
convert_handle = nullptr;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
esp_imgfx_color_convert_close(convert_handle);
convert_handle = nullptr;
ppa_color_mode = PPA_SRM_COLOR_MODE_RGB888;
heap_caps_free(frame_.data);
frame_.data = rotate_src;
frame_.len = frame_.width * frame_.height * 3;
break;
}
default:
ESP_LOGE(TAG, "unsupported sensor format for PPA rotation: 0x%08lx", sensor_format_);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
uint8_t* rotate_dst = (uint8_t*)heap_caps_malloc(
frame_.width * frame_.height * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT | MALLOC_CAP_CACHE_ALIGNED);
if (rotate_dst == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for rotate image");
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
ppa_client_handle_t ppa_client = nullptr;
ppa_client_config_t client_cfg = {
.oper_type = PPA_OPERATION_SRM,
.max_pending_trans_num = 1,
};
esp_err_t err = ppa_register_client(&client_cfg, &ppa_client);
if (err != ESP_OK || ppa_client == nullptr) {
ESP_LOGE(TAG, "ppa_register_client failed: %d", (int)err);
heap_caps_free(rotate_dst);
rotate_dst = nullptr;
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
ppa_srm_rotation_angle_t ppa_angle = IMAGE_ROTATION_ANGLE;
ppa_srm_oper_config_t srm_cfg = {};
srm_cfg.in.buffer = (void*)rotate_src;
srm_cfg.in.pic_w = sensor_width_;
srm_cfg.in.pic_h = sensor_height_;
srm_cfg.in.block_w = sensor_width_;
srm_cfg.in.block_h = sensor_height_;
srm_cfg.in.block_offset_x = 0;
srm_cfg.in.block_offset_y = 0;
srm_cfg.in.srm_cm = ppa_color_mode;
srm_cfg.out.buffer = (void*)rotate_dst;
srm_cfg.out.buffer_size = frame_.len;
srm_cfg.out.pic_w = frame_.width;
srm_cfg.out.pic_h = frame_.height;
srm_cfg.out.block_offset_x = 0;
srm_cfg.out.block_offset_y = 0;
srm_cfg.out.srm_cm = PPA_SRM_COLOR_MODE_RGB565;
// 等比例缩放 1.0
srm_cfg.scale_x = 1.0f;
srm_cfg.scale_y = 1.0f;
srm_cfg.rotation_angle = ppa_angle;
srm_cfg.mode = PPA_TRANS_MODE_BLOCKING;
srm_cfg.user_data = nullptr;
err = ppa_do_scale_rotate_mirror(ppa_client, &srm_cfg);
if (err != ESP_OK) {
ESP_LOGE(TAG, "ppa_do_scale_rotate_mirror failed: %d", (int)err);
heap_caps_free(rotate_dst);
rotate_dst = nullptr;
(void)ppa_unregister_client(ppa_client);
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "Cleanup: VIDIOC_QBUF failed");
}
return false;
}
(void)ppa_unregister_client(ppa_client);
frame_.data = rotate_dst;
frame_.len = frame_.width * frame_.height * 2;
frame_.format = V4L2_PIX_FMT_RGB565;
heap_caps_free(rotate_src);
rotate_src = nullptr;
#endif // CONFIG_SOC_PPA_SUPPORTED
#endif // CONFIG_XIAOZHI_ENABLE_ROTATE_CAMERA_IMAGE
}
if (ioctl(video_fd_, VIDIOC_QBUF, &buf) != 0) {
ESP_LOGE(TAG, "VIDIOC_QBUF failed");
}
}
// 显示预览图片
auto display = dynamic_cast<LvglDisplay*>(Board::GetInstance().GetDisplay());
if (display != nullptr) {
if (!frame_.data) {
ESP_LOGE(TAG, "frame.data is null");
return false;
}
uint16_t w = frame_.width;
uint16_t h = frame_.height;
size_t lvgl_image_size = frame_.len;
size_t stride = ((w * 2) + 3) & ~3; // 4字节对齐
lv_color_format_t color_format = LV_COLOR_FORMAT_RGB565;
uint8_t* data = nullptr;
switch (frame_.format) {
// LVGL 显示 YUV 系的图像似乎都有问题,暂时转换为 RGB565 显示
case V4L2_PIX_FMT_YUYV:
case V4L2_PIX_FMT_YUV420:
case V4L2_PIX_FMT_RGB24: {
color_format = LV_COLOR_FORMAT_RGB565;
data = (uint8_t*)heap_caps_malloc(w * h * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for preview image");
return false;
}
esp_imgfx_color_convert_cfg_t convert_cfg = {
.in_res = {.width = static_cast<int16_t>(frame_.width),
.height = static_cast<int16_t>(frame_.height)},
.in_pixel_fmt = static_cast<esp_imgfx_pixel_fmt_t>(frame_.format),
.out_pixel_fmt = ESP_IMGFX_PIXEL_FMT_RGB565_LE,
.color_space_std = ESP_IMGFX_COLOR_SPACE_STD_BT601,
};
esp_imgfx_color_convert_handle_t convert_handle = nullptr;
esp_imgfx_err_t err = esp_imgfx_color_convert_open(&convert_cfg, &convert_handle);
if (err != ESP_IMGFX_ERR_OK || convert_handle == nullptr) {
ESP_LOGE(TAG, "esp_imgfx_color_convert_open failed");
heap_caps_free(data);
data = nullptr;
return false;
}
esp_imgfx_data_t convert_input_data = {
.data = frame_.data,
.data_len = frame_.len,
};
esp_imgfx_data_t convert_output_data = {
.data = data,
.data_len = static_cast<uint32_t>(w * h * 2),
};
err = esp_imgfx_color_convert_process(convert_handle, &convert_input_data, &convert_output_data);
if (err != ESP_IMGFX_ERR_OK) {
ESP_LOGE(TAG, "esp_imgfx_color_convert_process failed");
heap_caps_free(data);
data = nullptr;
esp_imgfx_color_convert_close(convert_handle);
convert_handle = nullptr;
return false;
}
esp_imgfx_color_convert_close(convert_handle);
convert_handle = nullptr;
lvgl_image_size = w * h * 2;
break;
}
case V4L2_PIX_FMT_RGB565:
data = (uint8_t*)heap_caps_malloc(w * h * 2, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate memory for preview image");
return false;
}
memcpy(data, frame_.data, frame_.len);
lvgl_image_size = frame_.len; // fallthrough 时兼顾 YUYV 与 RGB565
break;
#ifdef CONFIG_XIAOZHI_CAMERA_ALLOW_JPEG_INPUT
case V4L2_PIX_FMT_JPEG: {
uint8_t* out_data = nullptr; // out data is allocated by jpeg_to_image
size_t out_len = 0;
size_t out_width = 0;
size_t out_height = 0;
size_t out_stride = 0;
esp_err_t ret =
jpeg_to_image(frame_.data, frame_.len, &out_data, &out_len, &out_width, &out_height, &out_stride);
if (ret != ESP_OK) {
ESP_LOGE(TAG, "Failed to decode JPEG image: %d (%s)", (int)ret, esp_err_to_name(ret));
if (out_data) {
heap_caps_free(out_data);
out_data = nullptr;
}
return false;
}
data = out_data;
w = out_width;
h = out_height;
lvgl_image_size = out_len;
stride = out_stride;
break;
}
#endif
default:
ESP_LOGE(TAG, "unsupported frame format: 0x%08lx", frame_.format);
return false;
}
auto image = std::make_unique<LvglAllocatedImage>(data, lvgl_image_size, w, h, stride, color_format);
display->SetPreviewImage(std::move(image));
}
return true;
}
bool Esp32Camera::SetHMirror(bool enabled) {
if (video_fd_ < 0)
return false;
struct v4l2_ext_controls ctrls = {};
struct v4l2_ext_control ctrl = {};
ctrl.id = V4L2_CID_HFLIP;
ctrl.value = enabled ? 1 : 0;
ctrls.ctrl_class = V4L2_CTRL_CLASS_USER;
ctrls.count = 1;
ctrls.controls = &ctrl;
if (ioctl(video_fd_, VIDIOC_S_EXT_CTRLS, &ctrls) != 0) {
ESP_LOGE(TAG, "set HFLIP failed");
return false;
}
return true;
}
bool Esp32Camera::SetVFlip(bool enabled) {
if (video_fd_ < 0)
return false;
struct v4l2_ext_controls ctrls = {};
struct v4l2_ext_control ctrl = {};
ctrl.id = V4L2_CID_VFLIP;
ctrl.value = enabled ? 1 : 0;
ctrls.ctrl_class = V4L2_CTRL_CLASS_USER;
ctrls.count = 1;
ctrls.controls = &ctrl;
if (ioctl(video_fd_, VIDIOC_S_EXT_CTRLS, &ctrls) != 0) {
ESP_LOGE(TAG, "set VFLIP failed");
return false;
}
return true;
}
/**
* @brief 将摄像头捕获的图像发送到远程服务器进行AI分析和解释
*
* 该函数将当前摄像头缓冲区中的图像编码为JPEG格式并通过HTTP POST请求
* 以multipart/form-data的形式发送到指定的解释服务器。服务器将根据提供的
* 问题对图像进行AI分析并返回结果。
*
* 实现特点:
* - 使用独立线程编码JPEG与主线程分离
* - 采用分块传输编码(chunked transfer encoding)优化内存使用
* - 通过队列机制实现编码线程和发送线程的数据同步
* - 支持设备ID、客户端ID和认证令牌的HTTP头部配置
*
* @param question 要向AI提出的关于图像的问题将作为表单字段发送
* @return std::string 服务器返回的JSON格式响应字符串
* 成功时包含AI分析结果失败时包含错误信息
* 格式示例:{"success": true, "result": "分析结果"}
* {"success": false, "message": "错误信息"}
*
* @note 调用此函数前必须先调用SetExplainUrl()设置服务器URL
* @note 函数会等待之前的编码线程完成后再开始新的处理
* @warning 如果摄像头缓冲区为空或网络连接失败,将返回错误信息
*/
std::string Esp32Camera::Explain(const std::string& question) {
if (explain_url_.empty()) {
throw std::runtime_error("Image explain URL or token is not set");
}
// 创建局部的 JPEG 队列, 40 entries is about to store 512 * 40 = 20480 bytes of JPEG data
QueueHandle_t jpeg_queue = xQueueCreate(40, sizeof(JpegChunk));
if (jpeg_queue == nullptr) {
ESP_LOGE(TAG, "Failed to create JPEG queue");
throw std::runtime_error("Failed to create JPEG queue");
}
// We spawn a thread to encode the image to JPEG using optimized encoder (cost about 500ms and 8KB SRAM)
encoder_thread_ = std::thread([this, jpeg_queue]() {
uint16_t w = frame_.width ? frame_.width : 320;
uint16_t h = frame_.height ? frame_.height : 240;
v4l2_pix_fmt_t enc_fmt = frame_.format;
bool ok = image_to_jpeg_cb(
frame_.data, frame_.len, w, h, enc_fmt, 80,
[](void* arg, size_t index, const void* data, size_t len) -> size_t {
auto jpeg_queue = static_cast<QueueHandle_t>(arg);
JpegChunk chunk = {.data = nullptr, .len = len};
if (index == 0 && data != nullptr && len > 0) {
chunk.data = (uint8_t*)heap_caps_aligned_alloc(16, len, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
if (chunk.data == nullptr) {
ESP_LOGE(TAG, "Failed to allocate %zu bytes for JPEG chunk", len);
chunk.len = 0;
} else {
memcpy(chunk.data, data, len);
}
} else {
chunk.len = 0; // Sentinel or error
}
xQueueSend(jpeg_queue, &chunk, portMAX_DELAY);
return len;
},
jpeg_queue);
if (!ok) {
JpegChunk chunk = {.data = nullptr, .len = 0};
xQueueSend(jpeg_queue, &chunk, portMAX_DELAY);
}
});
auto network = Board::GetInstance().GetNetwork();
auto http = network->CreateHttp(3);
// 构造multipart/form-data请求体
std::string boundary = "----ESP32_CAMERA_BOUNDARY";
// 配置HTTP客户端使用分块传输编码
http->SetHeader("Device-Id", SystemInfo::GetMacAddress().c_str());
http->SetHeader("Client-Id", Board::GetInstance().GetUuid().c_str());
if (!explain_token_.empty()) {
http->SetHeader("Authorization", "Bearer " + explain_token_);
}
http->SetHeader("Content-Type", "multipart/form-data; boundary=" + boundary);
http->SetHeader("Transfer-Encoding", "chunked");
if (!http->Open("POST", explain_url_)) {
ESP_LOGE(TAG, "Failed to connect to explain URL");
// Clear the queue
encoder_thread_.join();
JpegChunk chunk;
while (xQueueReceive(jpeg_queue, &chunk, portMAX_DELAY) == pdPASS) {
if (chunk.data != nullptr) {
heap_caps_free(chunk.data);
} else {
break;
}
}
vQueueDelete(jpeg_queue);
throw std::runtime_error("Failed to connect to explain URL");
}
{
// 第一块question字段
std::string question_field;
question_field += "--" + boundary + "\r\n";
question_field += "Content-Disposition: form-data; name=\"question\"\r\n";
question_field += "\r\n";
question_field += question + "\r\n";
http->Write(question_field.c_str(), question_field.size());
}
{
// 第二块:文件字段头部
std::string file_header;
file_header += "--" + boundary + "\r\n";
file_header += "Content-Disposition: form-data; name=\"file\"; filename=\"camera.jpg\"\r\n";
file_header += "Content-Type: image/jpeg\r\n";
file_header += "\r\n";
http->Write(file_header.c_str(), file_header.size());
}
// 第三块JPEG数据
size_t total_sent = 0;
bool saw_terminator = false;
while (true) {
JpegChunk chunk;
if (xQueueReceive(jpeg_queue, &chunk, portMAX_DELAY) != pdPASS) {
ESP_LOGE(TAG, "Failed to receive JPEG chunk");
break;
}
if (chunk.data == nullptr) {
saw_terminator = true;
break; // The last chunk
}
http->Write((const char*)chunk.data, chunk.len);
total_sent += chunk.len;
heap_caps_free(chunk.data);
}
// Wait for the encoder thread to finish
encoder_thread_.join();
// 清理队列
vQueueDelete(jpeg_queue);
if (!saw_terminator || total_sent == 0) {
ESP_LOGE(TAG, "JPEG encoder failed or produced empty output");
throw std::runtime_error("Failed to encode image to JPEG");
}
{
// 第四块multipart尾部
std::string multipart_footer;
multipart_footer += "\r\n--" + boundary + "--\r\n";
http->Write(multipart_footer.c_str(), multipart_footer.size());
}
// 结束块
http->Write("", 0);
if (http->GetStatusCode() != 200) {
ESP_LOGE(TAG, "Failed to upload photo, status code: %d", http->GetStatusCode());
throw std::runtime_error("Failed to upload photo");
}
std::string result = http->ReadAll();
http->Close();
// Get remain task stack size
size_t remain_stack_size = uxTaskGetStackHighWaterMark(nullptr);
ESP_LOGI(TAG, "Explain image size=%d bytes, compressed size=%d, remain stack size=%d, question=%s\n%s",
(int)frame_.len, (int)total_sent, (int)remain_stack_size, question.c_str(), result.c_str());
return result;
}