]*(?:class|id)=["'][^"']*(?:result|b_algo|vrwrap|news-box|result-op)[^"']*["'][\s\S]*?<\/div>/gi),
];
for (const block of blocks) {
const rawBlock = block[0];
const blockText = decodePercentText(rawBlock)
.replace(/<[^>]+>/g, " ")
.replace(/\s+/g, " ")
.trim();
if (keyword && keywordMatchScore(blockText, keyword) <= 0) continue;
if (platform && !platformEvidenceMatches(blockText, platform)) continue;
for (const match of [
...rawBlock.matchAll(/
\s*([^<\s]+)\s*<\/link>/gi),
...rawBlock.matchAll(/\bhref\s*=\s*["']([^"']+)["']/gi),
]) {
const url = normalizeUrl(match[1], baseUrl);
if (!url || seen.has(url)) continue;
if (scoreUrl(url, configForBridge(platform), keyword) > 0) continue;
if (isSearchOrEngineUrl(url)) continue;
seen.add(url);
results.push({ url, evidence: blockText });
}
}
return results;
}
/**
 * Looks up the search configuration for a platform.
 *
 * @param {string} platform - Key into `SEARCH_CONFIGS`.
 * @returns {{allowHosts: Array, includePaths: Array, excludePaths: Array}}
 *   The configured entry, or a fresh empty configuration when the platform
 *   has no (truthy) entry.
 */
function configForBridge(platform) {
  const configured = SEARCH_CONFIGS[platform];
  if (configured) {
    return configured;
  }
  // Unknown platform: an empty config, which makes scoreUrl reject every URL.
  return { allowHosts: [], includePaths: [], excludePaths: [] };
}
/**
 * Tells whether a URL points at one of the search engines themselves
 * (bing, baidu, sogou, google) rather than at a result.
 *
 * The engine name must be a complete hostname label, so `www.bing.com` and
 * `google.com.hk` match while unrelated hosts that merely end in the same
 * letters (for example `climbing.com`) do not.
 *
 * @param {string} url - Absolute URL to classify.
 * @returns {boolean} `true` for engine hosts and for anything that cannot be
 *   parsed as a URL (unparseable input is treated as unusable).
 */
function isSearchOrEngineUrl(url) {
  let host;
  try {
    host = new URL(url).hostname.toLowerCase();
  } catch {
    return true;
  }
  // Anchor on a label boundary (start of host or a dot) to avoid substring hits.
  return /(?:^|\.)(?:bing|baidu|sogou|google)\./.test(host);
}
/**
 * Checks whether a piece of result text mentions the given platform.
 *
 * Both the text and each platform term go through `normalizeSearchText`
 * before the substring comparison.
 *
 * @param {string} text - Evidence text taken from a search result.
 * @param {string} platform - One of `tencent`, `youku`, `iqiyi`, `mgtv`.
 * @returns {boolean} `true` when any term for the platform occurs in the
 *   text, and also when the platform has no terms registered.
 */
function platformEvidenceMatches(text, platform) {
  const haystack = normalizeSearchText(text);
  const termsByPlatform = {
    tencent: ["腾讯视频", "腾讯", "小企鹅乐园", "企鹅乐园", "vqq"],
    youku: ["优酷", "youku"],
    iqiyi: ["爱奇艺", "iqiyi"],
    mgtv: ["芒果tv", "芒果", "mgtv"],
  };
  const expectedTerms = termsByPlatform[platform] || [];

  // Nothing to check against: accept the text as-is.
  if (expectedTerms.length === 0) {
    return true;
  }
  for (const term of expectedTerms) {
    if (haystack.includes(normalizeSearchText(term))) {
      return true;
    }
  }
  return false;
}
/**
 * Collects scored URL candidates from result-like blocks of a search page.
 *
 * Every block matched in `decoded` is reduced to plain text (percent-decoded,
 * tags stripped, whitespace collapsed) and kept only when
 * `keywordMatchScore` for that text is positive. URLs found inside a kept
 * block are normalized against `baseUrl`, scored with `scoreUrl`, and emitted
 * with a fixed +80 bonus on top of that score.
 *
 * No de-duplication happens here: the same URL can be emitted several times,
 * since the URL patterns below overlap.
 *
 * @param {string} decoded - Page markup to scan.
 * @param {string} baseUrl - Base handed to `normalizeUrl` for relative links.
 * @param {object} config - Passed through to `scoreUrl`.
 * @param {string} keyword - Search keyword used for block and URL scoring.
 * @returns {Array<{url: string, score: number, evidence: string}>}
 */
function extractStructuredSearchCandidates(decoded, baseUrl, config, keyword) {
const results = [];
const blocks = [
...decoded.matchAll(/
- /gi),
...decoded.matchAll(/
]*(?:class|id)=["'][^"']*(?:result|b_algo|vrwrap|news-box|result-op)[^"']*["'][\s\S]*?<\/div>/gi),
];
for (const block of blocks) {
const rawBlock = block[0];
// Plain-text view of the block; used for keyword matching and as evidence.
const blockText = decodePercentText(rawBlock)
.replace(/<[^>]+>/g, " ")
.replace(/\s+/g, " ")
.trim();
// Skip blocks whose text does not match the keyword at all.
if (keywordMatchScore(blockText, keyword) <= 0) continue;
// URL sources, in order: the first pattern, href attributes, any quoted
// absolute or protocol-relative URL, and bare URLs on the known video hosts.
const urls = [
...rawBlock.matchAll(/\s*([^<\s]+)\s*<\/link>/gi),
...rawBlock.matchAll(/\bhref\s*=\s*["']([^"']+)["']/gi),
...rawBlock.matchAll(/["']((?:https?:)?\/\/[^"']+)["']/gi),
...rawBlock.matchAll(/\b((?:https?:)?\/\/(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi),
];
for (const match of urls) {
const url = normalizeUrl(match[1], baseUrl);
if (!url) continue;
const score = scoreUrl(url, config, keyword);
// scoreUrl returns 0 for URLs it rejects.
if (score <= 0) continue;
results.push({
url,
// Fixed bonus for URLs found inside a keyword-matching block.
score: score + 80,
evidence: blockText,
});
}
}
return results;
}
/**
 * Re-ranks the leading candidates using the title of each candidate page.
 *
 * Only the first 8 candidates are considered. Each page title is fetched in
 * turn; a candidate is dropped when its title conflicts with the keyword or
 * when its evidence and title together score 0 against the keyword.
 *
 * @param {string} platform - Passed to `fetchPageTitle`.
 * @param {Array<{url: string, evidence?: string, score?: number}>} candidates
 * @param {string} keyword - Search keyword.
 * @param {AbortSignal} [signal] - Forwarded to `fetchPageTitle`.
 * @returns {Promise<Array<object>>} Up to 10 candidates, highest score first,
 *   each extended with `pageTitle`, `keywordScore` and the combined `score`.
 */
async function rankCandidates(platform, candidates, keyword, signal) {
  const ranked = [];
  for (const candidate of candidates.slice(0, 8)) {
    const pageTitle = await fetchPageTitle(platform, candidate.url, signal);
    if (titleConflictsWithKeyword(pageTitle, keyword)) continue;

    const keywordScore = keywordMatchScore(`${candidate.evidence} ${pageTitle}`, keyword);
    if (keywordScore <= 0) continue;

    // Candidates may arrive without a numeric score; count that as 0 so the
    // total is never NaN and the comparator below stays well-defined.
    const baseScore = Number.isFinite(candidate.score) ? candidate.score : 0;
    ranked.push({
      ...candidate,
      pageTitle,
      keywordScore,
      score: baseScore + keywordScore,
    });
  }
  return ranked
    .sort((a, b) => b.score - a.score)
    .slice(0, 10);
}
/**
 * Fetches a page and returns the text of its title, whitespace-collapsed.
 *
 * The request uses the headers from `getRequestHeaders(platform)` and follows
 * redirects. The response status is not checked; whatever body comes back is
 * searched for a title.
 *
 * @param {string} platform - Used to pick request headers.
 * @param {string} url - Page to fetch.
 * @param {AbortSignal} [signal] - Passed to `fetchSignal`.
 * @returns {Promise<string>} The title, or "" when none is found or when the
 *   fetch or body read fails for any reason.
 */
async function fetchPageTitle(platform, url, signal) {
try {
const response = await fetch(url, {
headers: getRequestHeaders(platform),
redirect: "follow",
// NOTE(review): 6_000 is presumably a timeout in milliseconds - confirm against fetchSignal.
signal: fetchSignal(signal, 6_000),
});
const html = await response.text();
return decodeEscapedText(html.match(/
]*>([\s\S]*?)<\/title>/i)?.[1] || "")
.replace(/\s+/g, " ")
.trim();
} catch {
// Best-effort lookup: every failure is reported as "no title".
return "";
}
}
/**
 * Reports whether any candidate reaches the "strong" score threshold of 180.
 *
 * @param {Array<{score: number}>} candidates
 * @returns {boolean}
 */
function hasStrongCandidate(candidates) {
  const strongScore = 180;
  for (const candidate of candidates) {
    if (candidate.score >= strongScore) {
      return true;
    }
  }
  return false;
}
/**
 * Decides whether a fetched page title contradicts the search keyword.
 *
 * @param {string} pageTitle - Title text; may be empty or missing.
 * @param {string} keyword - Search keyword.
 * @returns {boolean} `true` only when a non-blank title scores exactly 0
 *   against the keyword. A blank or missing title never counts as a conflict.
 */
export function titleConflictsWithKeyword(pageTitle, keyword) {
  const trimmedTitle = String(pageTitle || "").trim();
  return trimmedTitle !== "" && keywordMatchScore(trimmedTitle, keyword) === 0;
}
/**
 * Detects a block/verification page returned in place of search results.
 *
 * @param {string} html - Response body.
 * @returns {boolean} `true` when the body contains any known block marker
 *   (case-insensitive).
 */
function isBlockedSearchPage(html) {
  const blockMarkers = /_____tmd_____|x5secdata|captcha|验证码|安全验证|人机验证|访问过于频繁|请求过于频繁/i;
  return blockMarkers.test(html);
}
/**
 * Scores how well a piece of text matches a keyword.
 *
 * The score is the sum of:
 *   - 220 when the whole normalized keyword occurs in the normalized text,
 *   - 180 when every keyword token occurs in it,
 *   - 45 per token that occurs in it.
 *
 * @param {string} text - Text to search in.
 * @param {string} keyword - Search keyword.
 * @returns {number} 0 when the normalized text is empty or the keyword yields
 *   no tokens; otherwise the sum described above (which may also be 0).
 */
function keywordMatchScore(text, keyword) {
  const normalizedText = normalizeSearchText(text);
  const terms = keywordTokens(keyword);
  if (!normalizedText || terms.length === 0) {
    return 0;
  }

  const wholeKeyword = normalizeSearchText(keyword);
  const phraseBonus = normalizedText.includes(wholeKeyword) ? 220 : 0;

  let hits = 0;
  for (const term of terms) {
    if (normalizedText.includes(term)) {
      hits += 1;
    }
  }
  const allTermsBonus = hits === terms.length ? 180 : 0;

  return phraseBonus + allTermsBonus + hits * 45;
}
/**
 * Splits a keyword into distinct, normalized search tokens.
 *
 * The keyword is split on whitespace, colon (ASCII and full-width), hyphen,
 * underscore and slash. Each piece is passed through `normalizeSearchText`;
 * pieces shorter than two characters after normalization are discarded.
 *
 * @param {string} keyword - Search keyword.
 * @returns {string[]} Unique tokens in first-seen order.
 */
function keywordTokens(keyword) {
  // \uFF1A is the full-width colon; it is written as an escape so it cannot
  // be confused with the ASCII ":" next to it.
  const pieces = String(keyword).split(/[\s:\uFF1A\-_/]+/);
  const tokens = pieces
    .map((piece) => normalizeSearchText(piece))
    .filter((token) => token.length >= 2);
  return [...new Set(tokens)];
}
/**
 * Normalizes text for loose substring comparison.
 *
 * The value is percent-decoded and lower-cased, then stripped of whitespace,
 * hyphens, underscores, slashes, colons, and brackets, covering both the
 * ASCII and the full-width forms of parentheses and colon.
 *
 * @param {*} value - Text to normalize; falsy values become "".
 * @returns {string}
 */
function normalizeSearchText(value) {
  // \uFF08 \uFF09 are full-width parentheses and \uFF1A the full-width colon;
  // they are written as escapes so they stay distinct from their ASCII twins.
  const ignorable = /[《》【】[\]()\uFF08\uFF09:\uFF1A\s\-_/]+/g;
  return decodePercentText(String(value || ""))
    .toLowerCase()
    .replace(ignorable, "");
}
/**
 * Scores a URL as a potential video page for one platform.
 *
 * A URL is rejected (score 0) when it cannot be parsed, its host is not one
 * of `config.allowHosts` (or a subdomain of one), its decoded path matches an
 * `excludePaths` pattern or no `includePaths` pattern, the path is a bare
 * `/a_` or `/v_`, it is youku's `/video` without an `s` parameter, it
 * contains an ellipsis, or it ends in an image/script/style extension.
 *
 * Accepted URLs start at 80 and gain +20 for `/a_`, `/show_page/`,
 * `/x/cover/` or `/b/` paths, +5 for `/v_`, `/v_show/` or `/x/page/` paths,
 * and +5 when the URL contains the keyword (raw or URI-encoded).
 *
 * @param {string} url - Absolute URL.
 * @param {{allowHosts: string[], includePaths: RegExp[], excludePaths: RegExp[]}} config
 * @param {string} keyword - Search keyword.
 * @returns {number} 0 when rejected, otherwise a value from 80 to 110.
 */
function scoreUrl(url, config, keyword) {
  let target;
  try {
    target = new URL(url);
  } catch {
    return 0;
  }
  const hostname = target.hostname.toLowerCase();
  const decodedPath = safeDecodeURIComponent(target.pathname);

  const hostAllowed = config.allowHosts.some(
    (allowedHost) => hostname === allowedHost || hostname.endsWith(`.${allowedHost}`),
  );
  if (!hostAllowed) return 0;

  const pathExcluded = config.excludePaths.some((pattern) => pattern.test(decodedPath));
  if (pathExcluded) return 0;

  // A prefix with nothing after it is not a real page.
  if (/^\/(?:a_|v_)\/?$/.test(decodedPath)) return 0;

  const isBareYoukuVideo =
    hostname.includes("youku.com") && decodedPath === "/video" && !target.searchParams.get("s");
  if (isBareYoukuVideo) return 0;

  const pathIncluded = config.includePaths.some((pattern) => pattern.test(decodedPath));
  if (!pathIncluded) return 0;

  const earnsLargeBonus = [/\/a_/, /\/show_page\//, /\/x\/cover\//, /\/b\//].some((pattern) =>
    pattern.test(decodedPath),
  );
  const earnsSmallBonus = [/\/v_/, /\/v_show\//, /\/x\/page\//].some((pattern) =>
    pattern.test(decodedPath),
  );
  const mentionsKeyword = url.includes(encodeURIComponent(keyword)) || url.includes(keyword);

  // Truncated links and static assets are never usable, whatever they scored.
  const isTruncated = url.includes("...") || url.includes("%E2%80%A6");
  const isStaticAsset = /\.(jpg|jpeg|png|gif|webp|css|js|ico|svg)$/i.test(decodedPath);
  if (isTruncated || isStaticAsset) return 0;

  return 80 + (earnsLargeBonus ? 20 : 0) + (earnsSmallBonus ? 5 : 0) + (mentionsKeyword ? 5 : 0);
}
/**
 * Merges several candidate lists into one, keeping the best entry per URL.
 *
 * Entries without a `url` (including null/undefined entries) are ignored.
 * When a URL appears more than once, the entry with the higher score wins;
 * on a tie the earlier one is kept. A missing or non-finite `score` counts
 * as 0 for both the duplicate check and the final ordering, so unscored
 * entries sort last instead of disturbing the sort.
 *
 * @param {...Array<{url: string, score?: number}>} groups - Candidate lists.
 * @returns {Array<object>} Up to 10 of the original candidate objects,
 *   highest score first.
 */
function mergeCandidates(...groups) {
  const scoreOf = (candidate) => (Number.isFinite(candidate.score) ? candidate.score : 0);

  const bestByUrl = new Map();
  for (const candidate of groups.flat()) {
    if (!candidate?.url) continue;
    const previous = bestByUrl.get(candidate.url);
    if (!previous || scoreOf(candidate) > scoreOf(previous)) {
      bestByUrl.set(candidate.url, candidate);
    }
  }
  return [...bestByUrl.values()]
    .sort((a, b) => scoreOf(b) - scoreOf(a))
    .slice(0, 10);
}
/**
 * Turns a raw link taken from a result page into a clean absolute URL.
 *
 * Steps: decode escapes, reject `javascript:` and fragment-only links, add
 * `https://` to scheme-less links that begin with a known video host, resolve
 * against `baseUrl`, then either return the target unwrapped from a redirect
 * link or apply `normalizeTencentPath` and `cleanupUrl` to the link itself.
 *
 * @param {string} rawUrl - Link text as found in the page.
 * @param {string} baseUrl - Base for resolving relative links.
 * @returns {string} The normalized URL, or "" when the link is empty,
 *   unusable, or cannot be parsed.
 */
function normalizeUrl(rawUrl, baseUrl) {
  if (!rawUrl) return "";
  const trimmed = decodeEscapedText(rawUrl.trim());
  // Scheme names are case-insensitive, so "JavaScript:" must be rejected too.
  if (/^javascript:/i.test(trimmed) || trimmed.startsWith("#")) return "";

  const hasAuthority = /^(?:https?:)?\/\//i.test(trimmed);
  const startsWithKnownHost =
    /^(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\//i.test(trimmed);
  const absolute = !hasAuthority && startsWithKnownHost ? `https://${trimmed}` : trimmed;

  try {
    const parsed = new URL(absolute, baseUrl);
    const unwrapped = decodeWrappedTarget(parsed);
    if (unwrapped) {
      // The unwrapped target may be protocol-relative ("//host/path"), so it
      // is resolved against the wrapping link rather than parsed on its own.
      // It is returned without further cleanup.
      return new URL(unwrapped, parsed).toString();
    }
    normalizeTencentPath(parsed);
    return cleanupUrl(parsed);
  } catch {
    // Unparseable links are reported as "no URL".
    return "";
  }
}
/**
 * Appends `.html` to extension-less `v.qq.com` cover paths, in place.
 *
 * Only URLs whose host is exactly `v.qq.com` and whose path starts with
 * `/x/cover/` are touched; everything else is left unchanged.
 *
 * @param {URL} parsed - URL object to adjust (mutated).
 * @returns {void}
 */
function normalizeTencentPath(parsed) {
  const isCoverPage = parsed.hostname === "v.qq.com" && /^\/x\/cover\//.test(parsed.pathname);
  if (isCoverPage && !pathExtension(parsed.pathname)) {
    parsed.pathname = `${parsed.pathname}.html`;
  }
}
/**
 * Reports whether a path ends in a file extension.
 *
 * Despite the name, this returns a boolean and not the extension itself.
 *
 * @param {string} pathname - URL path.
 * @returns {boolean} `true` when the path ends with a dot followed by one or
 *   more ASCII letters or digits.
 */
function pathExtension(pathname) {
  const trailingExtension = /\.[a-z0-9]+$/i;
  return trailingExtension.test(pathname);
}
/**
 * Extracts the real destination from a search-engine redirect link.
 *
 * Bing links carry the target base64url-encoded in the `u` parameter,
 * optionally prefixed with `a1`. For any host, the parameters `url`, `u`,
 * `target`, `to`, `redirect` and `jump` are then checked in that order for a
 * percent-encoded target.
 *
 * A value is only accepted when it looks like a URL (`http://`, `https://` or
 * protocol-relative `//`). A Bing `u` value that does not decode to one is
 * ignored, and the generic parameters are tried instead.
 *
 * @param {URL} parsed - The link to inspect.
 * @returns {string} The wrapped target, or "" when the link wraps nothing.
 */
function decodeWrappedTarget(parsed) {
  const looksLikeUrl = (candidate) => /^https?:\/\//i.test(candidate) || /^\/\//.test(candidate);

  const { hostname } = parsed;
  if (hostname === "bing.com" || hostname.endsWith(".bing.com")) {
    const encoded = parsed.searchParams.get("u");
    if (encoded) {
      try {
        const payload = encoded.startsWith("a1") ? encoded.slice(2) : encoded;
        const target = Buffer.from(payload, "base64url").toString("utf8");
        if (looksLikeUrl(target)) return target;
      } catch {
        // Best-effort: if the runtime cannot decode base64url, fall through
        // to the generic parameters below.
      }
    }
  }

  for (const key of ["url", "u", "target", "to", "redirect", "jump"]) {
    const value = parsed.searchParams.get(key);
    if (!value) continue;
    const decoded = decodePercentText(value);
    if (looksLikeUrl(decoded)) return decoded;
  }
  return "";
}
/**
 * Strips the fragment and tracking/search parameters from a URL, in place.
 *
 * A parameter is removed when its name starts (case-insensitively) with one
 * of: ptag, from, fromvsogou, query, wd, q, src, source, utm_, spm, cxid.
 * The match is a prefix match, so for example `quality` is removed as well.
 *
 * @param {URL} parsed - URL object to clean (mutated).
 * @returns {string} The cleaned URL as a string.
 */
function cleanupUrl(parsed) {
  const noisyKey = /^(ptag|from|fromvsogou|query|wd|q|src|source|utm_|spm|cxid)/i;
  parsed.hash = "";
  // Snapshot the keys first so deleting does not disturb the iteration.
  const keysToDrop = Array.from(parsed.searchParams.keys()).filter((key) => noisyKey.test(key));
  keysToDrop.forEach((key) => parsed.searchParams.delete(key));
  return parsed.toString();
}
/**
 * Undoes JavaScript-style escaping in scraped text, then decodes HTML
 * entities via `decodeHtmlEntities`.
 *
 * Handled escapes, in order: `\uXXXX`, `\xXX`, and `\/`.
 *
 * @param {*} value - Text to decode; coerced with `String`.
 * @returns {string}
 */
function decodeEscapedText(value) {
  const charFromHex = (_escape, hex) => String.fromCharCode(Number.parseInt(hex, 16));
  const unescaped = String(value)
    .replace(/\\u([0-9a-f]{4})/gi, charFromHex)
    .replace(/\\x([0-9a-f]{2})/gi, charFromHex)
    .replace(/\\\//g, "/");
  return decodeHtmlEntities(unescaped);
}
/**
 * Decodes HTML character references in a string.
 *
 * Supported forms:
 *   - the named references nbsp, quot, apos, amp, lt and gt (lower-case);
 *     nbsp becomes a regular space,
 *   - decimal numeric references,
 *   - hexadecimal numeric references.
 *
 * Decoding happens in a single pass, so text produced by one replacement is
 * never decoded a second time. Unknown names and out-of-range code points
 * are left untouched.
 *
 * @param {*} value - Text to decode; coerced with `String`.
 * @returns {string}
 */
function decodeHtmlEntities(value) {
  const namedReferences = new Map([
    ["nbsp", " "],
    ["quot", "\""],
    ["apos", "'"],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
  ]);
  const maxCodePoint = 0x10ffff;

  return String(value).replace(
    /&(?:#(\d+)|#x([0-9a-f]+)|([a-z]+));/gi,
    (reference, decimal, hex, name) => {
      if (name !== undefined) {
        // Lookup is case-sensitive: only the lower-case names are decoded.
        return namedReferences.get(name) ?? reference;
      }
      const codePoint =
        decimal !== undefined ? Number.parseInt(decimal, 10) : Number.parseInt(hex, 16);
      // String.fromCodePoint throws above U+10FFFF; keep such text verbatim.
      return codePoint <= maxCodePoint ? String.fromCodePoint(codePoint) : reference;
    },
  );
}
/**
 * Cuts a readable excerpt out of markup around a given position.
 *
 * The excerpt spans `padding` characters on each side of `index` (clamped to
 * the text bounds). It is percent-decoded, tags are replaced with spaces, and
 * whitespace is collapsed and trimmed.
 *
 * @param {string} text - Source markup.
 * @param {number} index - Centre position of the excerpt.
 * @param {number} [padding=160] - Characters to keep on each side.
 * @returns {string}
 */
function cleanSnippet(text, index, padding = 160) {
  const from = Math.max(0, index - padding);
  const to = Math.min(text.length, index + padding);
  const excerpt = decodePercentText(text.slice(from, to));
  const withoutTags = excerpt.replace(/<[^>]+>/g, " ");
  return withoutTags.replace(/\s+/g, " ").trim();
}
/**
 * Decodes percent-encoded runs embedded in arbitrary text.
 *
 * Each maximal run of consecutive `%XX` sequences is decoded as one unit with
 * `decodeURIComponent`. A run that is not valid encoded UTF-8 is left exactly
 * as it was; text outside the runs is never touched.
 *
 * @param {*} value - Text to decode; coerced with `String`.
 * @returns {string}
 */
function decodePercentText(value) {
  const percentRun = /%[0-9a-f]{2}(?:%[0-9a-f]{2})*/gi;
  const decodeRun = (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      // Malformed sequence: keep the original characters.
      return run;
    }
  };
  return String(value).replace(percentRun, decodeRun);
}
/**
 * `decodeURIComponent` that never throws.
 *
 * @param {*} value - Encoded text.
 * @returns {string} The decoded text, or, when decoding fails, the input
 *   converted to a string (falsy input becomes "").
 */
function safeDecodeURIComponent(value) {
  let decoded;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Malformed escape sequence: hand back the input unchanged.
    decoded = String(value || "");
  }
  return decoded;
}