video-flow-toon/src/utils/cleanNovel.ts
2026-03-19 23:08:52 +08:00

162 lines
5.8 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import * as z from "zod";
import { ModelMessage, Output } from "ai";
import { o_novel } from "@/types/database";
import ai from "@/utils/ai";
import u from "@/utils";
/** A single plot unit ("event") extracted from the novel text by the model. */
export interface EventType {
/** Event name. */
name: string;
/** Description of how the event unfolds (cause, course, outcome, scene, characters). */
detail: string;
/** Chapter range covered by the event, e.g. "1-3章". */
chapter: string;
}
/** Shape the model's JSON response is parsed into: the list of extracted events. */
export interface Novel {
/** Events extracted from one batch of chapters. */
event: EventType[];
}
/**
 * Splits an ordered chapter list into consecutive windows.
 *
 * The first group holds the first `windowSize` items. Every later group holds the next
 * `windowSize` items, prefixed with up to `overlap` of the items that immediately precede
 * them, so each batch carries some context from the previous one.
 *
 * @param chapters   Ordered items to group.
 * @param windowSize Number of new items per group (default 5).
 * @param overlap    Number of preceding items repeated at the start of every group after
 *                   the first (default 1). `0` produces disjoint groups.
 * @returns The groups in order; an empty array when `chapters` is empty, `windowSize < 1`
 *          or `overlap < 0`.
 */
function getChapterGroups<T>(chapters: T[], windowSize: number = 5, overlap: number = 1): T[][] {
  const groups: T[][] = [];
  if (windowSize < 1 || overlap < 0) return groups;

  for (let start = 0; start < chapters.length; start += windowSize) {
    // Read the overlap straight from `chapters` instead of `previousGroup.slice(-overlap)`:
    // `slice(-0)` is `slice(0)`, which would copy the whole previous group when overlap is 0.
    const contextStart = Math.max(0, start - overlap);
    groups.push(chapters.slice(contextStart, start + windowSize));
  }
  return groups;
}
/** System prompt telling the model how to split the supplied chapters into plot units. */
const EVENT_EXTRACT_SYSTEM_PROMPT = `
你是专业剧本结构分析师,负责将用户提供的章节文本拆分为标准情节单元。请严格遵循以下规则执行。
━━━━━━━━━━━━━━━━━━━━━━━━━━━━
【情节单元拆分规则】
━━━━━━━━━━━━━━━━━━━━━━━━━━━━
▍拆分粒度
- 连续多个章节若属同一戏剧动作,可合并为 1 个情节单元,禁止过细拆分;
- 每个情节单元的 detail 字数控制在 100-200 字。
▍每个情节单元包含以下三个字段
- chapter事件覆盖的章节范围如"1-3章"),每个章节只能归属一个事件;
- name事件名称须具体描述实际戏剧动作禁止使用"XXX踏上征程""命运转折"等笼统标题;
- detail事件过程详情包含时间、地点、涉及人物、起因、经过、结果。
━━━━━━━━━━━━━━━━━━━━━━━━━━━━
【执行规则】
━━━━━━━━━━━━━━━━━━━━━━━━━━━━
✅ 必须执行
1. 所有章节按剧情顺序逐一覆盖,不得遗漏;
2. 标注为【上文衔接章节】的内容仅作上下文理解使用,禁止为其生成任何情节单元。
🚫 禁止出现
- 笼统事件名称;
- 单个章节拆分为多个情节单元;
- 遗漏任何章节。
`;

/** Structured-output schema for one batch; built once instead of on every loop iteration. */
const eventExtractSchema = z.object({
  event: z.array(
    z
      .object({
        chapter: z
          .string()
          .describe(
            "事件覆盖的章节如1-3章、4-6章章节划分必须连续每个章节范围只能属于一个事件。事件分割不可过细——避免只描述琐碎、日常细节的微小事件。",
          ),
        name: z.string().describe("事件名称"),
        detail: z.string().describe("事件过程详情(包括起因、经过、结果、场景、人物等)"),
      })
      .describe("事件必须在100-200字说明起因经过结果不可将单一章节或细小场景独立成事件"),
  ),
});

/**
 * Extracts plot units ("events") from novel chapters.
 *
 * Chapters are batched with `getChapterGroups`; each batch is sent to the "eventExtractAi"
 * text model together with a fixed system prompt, and the events returned for every batch
 * are concatenated in order.
 */
class CleanNovel {
  /** Number of new chapters sent per batch. */
  windowSize: number;
  /** Number of preceding chapters repeated at the start of each later batch as context. */
  overlap: number;

  /**
   * @param windowSize Chapters per batch (default 5).
   * @param overlap    Context chapters carried over from the previous batch (default 1).
   */
  constructor(windowSize: number = 5, overlap: number = 1) {
    this.windowSize = windowSize;
    this.overlap = overlap;
  }

  /**
   * Runs event extraction over all chapters, one batch at a time.
   *
   * @param allChapters Chapters to process, in reading order.
   * @param projectId   Project id passed to `u.task` when recording each batch.
   * @returns All events returned by the model, in batch order.
   * @throws Rethrows any error from the model call or from parsing its response, after
   *         marking the batch's task record as failed and logging the error.
   */
  async start(allChapters: o_novel[], projectId: number): Promise<EventType[]> {
    const groups = getChapterGroups(allChapters, this.windowSize, this.overlap);
    const totalEvent: EventType[] = [];
    const textAi = u.Ai.Text("eventExtractAi");

    try {
      for (const [gi, group] of groups.entries()) {
        // Batch `gi` introduces the chapters starting at `gi * windowSize`; whatever precedes
        // them inside the group is carried-over context. Deriving the count from the group
        // itself stays correct when `overlap` is 0 or exceeds `windowSize`.
        const newCount = Math.min(this.windowSize, allChapters.length - gi * this.windowSize);
        const overlapCount = Math.max(0, group.length - newCount);

        const cleanText = group.map(
          (chapter, index): ModelMessage => ({
            role: "user",
            // Context chapters are labelled so the model does not emit events for them.
            content:
              index < overlapCount
                ? `【上文衔接章节,仅供上下文参考,禁止为本章生成情节单元】\n第${chapter.chapterIndex}章:\n\n${chapter.chapterData}`
                : `${chapter.chapterIndex}章:\n\n${chapter.chapterData}`,
          }),
        );

        const taskRecord = await u.task(projectId, "事件提取", "gpt-4.1", {
          describe: "根据小说原文,提取情节单元",
          content: cleanText,
        });

        let parsed: Novel | null;
        try {
          const resData = await textAi.invoke({
            messages: [{ role: "system", content: EVENT_EXTRACT_SYSTEM_PROMPT }, ...cleanText],
            output: Output.object({ schema: eventExtractSchema }),
          });
          // Parse inside the guarded section so a malformed response is recorded as a failed
          // task instead of being marked successful and then throwing.
          parsed = JSON.parse(resData.text);
        } catch (e) {
          taskRecord(-1, u.error(e).message);
          throw e;
        }
        taskRecord(1);

        // Tolerate a response without a usable `event` list by contributing nothing.
        const newEvents = parsed?.event;
        if (Array.isArray(newEvents)) {
          for (const item of newEvents) {
            totalEvent.push({ ...item });
          }
        }
      }
    } catch (e) {
      console.error(e);
      throw e;
    }
    return totalEvent;
  }
}
export default CleanNovel;