139 lines
3.9 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#ifndef GBK_ENCODING_H
#define GBK_ENCODING_H
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "gbk_map.h" // 引入映射表
#include <esp_log.h>
#define GBK_UTIL_TAG "GBK_ENCODING"
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Convert a GBK-encoded string to UTF-8.
*
* @param gbk_str Input GBK-encoded string
* @param utf8_buf Output buffer that receives the UTF-8 string
* @param buf_size Size of the output buffer
* @return size_t Length of the converted string, or 0 on failure
*
* NOTE(review): further down, this header also defines a `static inline int
* gbk_to_utf8(const char*, char*, int)` whose third argument is the input
* length. The two declarations have conflicting types and linkage; confirm
* which one is intended and rename or remove the other.
*/
size_t gbk_to_utf8(const char* gbk_str, char* utf8_buf, size_t buf_size);
/**
* @brief Get the buffer size required to convert a GBK string to UTF-8.
*
* @param gbk_str Input GBK-encoded string
* @return size_t Required size of the UTF-8 buffer
*/
size_t gbk_to_utf8_buffer_size(const char* gbk_str);
/**
* @brief Convert a GBK-encoded string to UTF-8 into newly allocated memory.
*
* @param gbk_str Input GBK-encoded string
* @return char* Newly allocated UTF-8 string; the caller must free() it after use
*/
char* gbk_to_utf8_alloc(const char* gbk_str);
/**
* @brief Initialize the GBK encoding conversion table.
* This function loads the encoding conversion table into memory.
*/
void gbk_encoding_init(void);
// GBK-to-Unicode mapping table.
// NOTE(review): only eight placeholder entries are present here, and none of
// the functions visible in this header read this array. Presumably the real
// table is meant to come from gbk_map.h -- confirm. Being `static` in a
// header, every translation unit that includes it gets its own copy.
static const uint16_t gbk_to_unicode_map[] = {
0x4E02, 0x4E04, 0x4E05, 0x4E06, 0x4E0F, 0x4E12, 0x4E17, 0x4E1F,
// ... the complete mapping table goes here
};
// Convert one GBK character (lead byte `ch`, trail byte `cl`) to a 16-bit
// Unicode value using a linear formula per GBK region.
//
// ch: lead (high) byte. If ch <= 0x7F it is returned unchanged and cl is ignored.
// cl: trail (low) byte; valid trail bytes are 0x40-0x7E and 0x80-0xFE.
//
// Returns the computed code point, or 0x3F ('?') after logging a warning when
// the byte pair falls in none of the handled regions.
//
// NOTE(review): GBK code points are not laid out in Unicode order (for
// example GBK 0xB0A1 is U+554A, while the formula below yields U+4E00), so
// these linear formulas are only a placeholder approximation. A table lookup
// is presumably what is ultimately intended -- confirm.
static inline uint16_t gbk_to_unicode(uint8_t ch, uint8_t cl) {
    if (ch <= 0x7F) {
        return ch; // ASCII character
    }
    // Lead byte must be in the GBK range.
    if (ch >= 0x81 && ch <= 0xFE) {
        // Trail byte must be 0x40-0x7E or 0x80-0xFE (0x7F is never valid).
        if ((cl >= 0x40 && cl <= 0x7E) || (cl >= 0x80 && cl <= 0xFE)) {
            uint32_t gbk = ((uint32_t)ch << 8) | cl;
            // GBK-1 region (0xB0A1-0xF7FE): lead 0xB0-0xF7, trail 0xA1-0xFE.
            // Both bytes are range-checked individually: testing only the
            // combined 16-bit value would admit trail bytes below 0xA1
            // (e.g. 0xB140), making (cl - 0xA1) negative and wrapping the
            // result to a code point below 0x4E00.
            if (ch >= 0xB0 && ch <= 0xF7 && cl >= 0xA1) {
                uint32_t offset = (uint32_t)(ch - 0xB0) * 94 + (uint32_t)(cl - 0xA1);
                return (uint16_t)(0x4E00 + offset); // basic Han ideograph area
            }
            // GBK-2 region (0x8140-0xA0FE): 190 trail values per row, with
            // the 0x7F gap skipped for trail bytes >= 0x80.
            if (gbk >= 0x8140 && gbk <= 0xA0FE) {
                uint32_t offset = (uint32_t)(ch - 0x81) * 190
                                + (uint32_t)(cl - (cl >= 0x80 ? 0x41 : 0x40));
                return (uint16_t)(0x3000 + offset); // symbol area
            }
            // GBK-3 region (0xAA40-0xFEA0).
            // NOTE(review): unlike GBK-2, the 0x7F gap is not skipped here,
            // so trail 0xA0 of one row collides with trail 0x40 of the next.
            // Left unchanged pending a real table.
            if (gbk >= 0xAA40 && gbk <= 0xFEA0) {
                uint32_t offset = (uint32_t)(ch - 0xAA) * 96 + (uint32_t)(cl - 0x40);
                return (uint16_t)(0x4E00 + 6768 + offset); // extended Han ideograph area
            }
        }
    }
    ESP_LOGW(GBK_UTIL_TAG, "未找到映射的GBK编码: 0x%04X [高字节:0x%02X, 低字节:0x%02X]",
             (ch << 8) | cl, ch, cl);
    return 0x3F; // Unicode code point of '?'
}
// Encode a 16-bit Unicode value as UTF-8.
//
// uni:  value to encode (0x0000-0xFFFF).
// utf8: destination; must have room for up to 3 bytes. No terminator is written.
//
// Returns the number of bytes written: 1 for values up to 0x7F, 2 for values
// up to 0x7FF, otherwise 3.
static inline int unicode_to_utf8(uint16_t uni, uint8_t *utf8) {
    // Lead-byte prefix indexed by sequence length (index 0 unused).
    static const uint8_t lead_prefix[4] = { 0x00, 0x00, 0xC0, 0xE0 };

    const int count = (uni <= 0x7F) ? 1 : (uni <= 0x7FF) ? 2 : 3;
    uint16_t bits = uni;

    // Fill continuation bytes from last to first, 6 payload bits each.
    for (int k = count - 1; k > 0; k--) {
        utf8[k] = (uint8_t)(0x80 | (bits & 0x3F));
        bits = (uint16_t)(bits >> 6);
    }

    // Whatever remains (7, 5 or 4 bits) goes into the lead byte.
    utf8[0] = (uint8_t)(lead_prefix[count] | bits);
    return count;
}
// Convert `len` bytes of GBK text to a NUL-terminated UTF-8 string.
//
// gbk:  input bytes; exactly `len` bytes are examined (no terminator needed).
// utf8: output buffer. Its size is NOT checked: each 2-byte GBK character can
//       produce up to 3 UTF-8 bytes, plus one byte for the terminator.
// len:  number of input bytes.
//
// Returns the number of UTF-8 bytes written, excluding the terminator. A
// lead byte (> 0x7F) with no following byte ends the conversion early and is
// dropped.
//
// NOTE(review): this header also declares
// `size_t gbk_to_utf8(const char*, char*, size_t)` near the top; that
// prototype and this static inline definition conflict -- confirm which one
// is intended.
static inline int gbk_to_utf8(const char* gbk, char* utf8, int len) {
    const uint8_t *src = (const uint8_t *)gbk;
    uint8_t *dst = (uint8_t *)utf8;
    int written = 0;
    int pos = 0;

    while (pos < len) {
        const uint8_t lead = src[pos];

        if (lead > 0x7F) {
            // Two-byte GBK character: needs a trail byte.
            if (pos + 1 >= len) {
                break;
            }
            const uint16_t code = gbk_to_unicode(lead, src[pos + 1]);
            written += unicode_to_utf8(code, dst + written);
            pos += 2;
            continue;
        }

        // ASCII passes through unchanged.
        dst[written++] = lead;
        pos++;
    }

    utf8[written] = '\0';
    return written;
}
// Convert a NUL-terminated GBK file name to UTF-8 without overrunning the
// output buffer.
//
// filename:      NUL-terminated GBK input. NULL yields an empty output string.
// utf8_filename: output buffer; always NUL-terminated on return when
//                max_len > 0.
// max_len:       treated as the total size of utf8_filename in bytes,
//                including the terminator (NOTE(review): confirm callers pass
//                the buffer size here). Nothing is written when max_len <= 0
//                or utf8_filename is NULL.
//
// Output is truncated at a character boundary when it does not fit: a UTF-8
// sequence is either written completely or not at all. The loop is done here
// rather than through gbk_to_utf8() because that helper has no output bound.
static inline void process_filename(const char* filename, char* utf8_filename, int max_len) {
    if (utf8_filename == NULL || max_len <= 0) {
        return;
    }
    utf8_filename[0] = '\0';
    if (filename == NULL) {
        return;
    }

    const size_t in_len = strlen(filename);
    const size_t capacity = (size_t)max_len - 1; // reserve room for the terminator
    size_t out = 0;
    size_t in = 0;

    while (in < in_len) {
        const uint8_t ch = (uint8_t)filename[in];
        uint8_t encoded[3];
        size_t count;

        if (ch <= 0x7F) {
            // ASCII passes through unchanged.
            encoded[0] = ch;
            count = 1;
            in += 1;
        } else {
            // Two-byte GBK character; a dangling lead byte ends the conversion.
            if (in + 1 >= in_len) {
                break;
            }
            const uint16_t code = gbk_to_unicode(ch, (uint8_t)filename[in + 1]);
            count = (size_t)unicode_to_utf8(code, encoded);
            in += 2;
        }

        if (count > capacity - out) {
            break; // next character would not fit
        }
        memcpy(&utf8_filename[out], encoded, count);
        out += count;
    }

    utf8_filename[out] = '\0';
}
#ifdef __cplusplus
}
#endif
#endif /* GBK_ENCODING_H */