kaikai_test/src/search.js
2026-05-14 18:53:53 +08:00

1142 lines
35 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import crypto from "node:crypto";
import { getRequestHeaders } from "./sites.js";
/** Per-request timeout (ms) for the native search page fetched by findProgramPage. */
const SEARCH_TIMEOUT_MS = 6_000;
/** Overall budget (ms) after which findProgramPageQuick aborts the whole search. */
const QUICK_SEARCH_TIMEOUT_MS = 6_000;

/**
 * Per-platform search configuration.
 *
 * - searchUrl:       builds the platform's native search page URL.
 * - preferFallback:  when truthy, findProgramPage skips the native search page fetch.
 * - siteSearchUrls:  builders for on-site search URLs tried by findFromSiteSearches.
 * - allowHosts:      hosts (or their subdomains) a candidate URL must belong to.
 * - includePaths:    a candidate path must match at least one of these patterns.
 * - excludePaths:    a candidate path matching any of these patterns is rejected.
 * - fallbackQueries: query builders sent to external engines, tried in order.
 *
 * Every value is declared exactly once here; there are no later overrides.
 */
const SEARCH_CONFIGS = {
  tencent: {
    searchUrl: (keyword) => `https://v.qq.com/x/search/?q=${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://v.qq.com/x/search/?q=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["v.qq.com"],
    includePaths: [/\/x\/cover\//, /\/x\/page\//],
    excludePaths: [/\/x\/search\//, /\/search/],
    fallbackQueries: [
      (keyword) => `site:v.qq.com/x/cover ${keyword} 腾讯视频`,
      (keyword) => `site:v.qq.com/x/page ${keyword} 腾讯视频`,
      (keyword) => `${keyword} 腾讯视频 少儿`,
      (keyword) => `${keyword} 小企鹅乐园`,
      (keyword) => `${keyword} 腾讯视频`,
    ],
  },
  youku: {
    searchUrl: (keyword) => `https://so.youku.com/search_video/q_${encodeURIComponent(keyword)}`,
    preferFallback: true,
    siteSearchUrls: [
      (keyword) => `https://so.youku.com/search_video/q_${encodeURIComponent(keyword)}`,
      (keyword) => `https://www.youku.com/search_video?keyword=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["v.youku.com", "www.youku.com", "youku.com"],
    includePaths: [/\/v_show\//, /^\/video$/, /\/show_page\//],
    excludePaths: [/\/search/],
    fallbackQueries: [
      (keyword) => `site:v.youku.com/v_show ${keyword}`,
      (keyword) => `site:v.youku.com/video ${keyword}`,
      (keyword) => `site:www.youku.com/show_page ${keyword}`,
      (keyword) => `site:youku.com ${keyword} youku`,
      (keyword) => `site:v.youku.com ${keyword} 优酷`,
      (keyword) => `site:youku.com/show_page ${keyword} 优酷`,
      (keyword) => `${keyword} 优酷`,
    ],
  },
  iqiyi: {
    searchUrl: (keyword) => `https://so.iqiyi.com/so/q_${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://so.iqiyi.com/so/q_${encodeURIComponent(keyword)}`,
      (keyword) => `https://www.iqiyi.com/search?keyword=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["www.iqiyi.com"],
    includePaths: [/\/v_/, /\/a_/],
    excludePaths: [/\/so(?:\/|$)/, /\/search/],
    fallbackQueries: [
      (keyword) => `${keyword} 爱奇艺`,
      (keyword) => `site:www.iqiyi.com/a_ ${keyword} 爱奇艺 热度`,
      (keyword) => `site:www.iqiyi.com/v_ ${keyword} 爱奇艺`,
    ],
  },
  mgtv: {
    searchUrl: (keyword) => `https://so.mgtv.com/so?k=${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://so.mgtv.com/so?k=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["www.mgtv.com", "mgtv.com"],
    includePaths: [/\/b\//, /\/h\//, /\/l\//],
    excludePaths: [/\/so/],
    fallbackQueries: [
      (keyword) => `site:www.mgtv.com/h ${keyword} 芒果TV`,
      (keyword) => `site:www.mgtv.com/b ${keyword} 芒果TV`,
      (keyword) => `${keyword} 芒果TV`,
    ],
  },
};
/**
 * Locate the program page for `keyword` on `platform`.
 *
 * Pipeline: native search page (unless the config prefers fallbacks), then
 * on-site searches, then platform-specific lookups (Tencent search API,
 * iQiyi via DuckDuckGo), then external engines. Each later stage only runs
 * while no candidate is "strong" according to hasStrongCandidate.
 *
 * @param {string} platform - Key of SEARCH_CONFIGS.
 * @param {string} keyword - Program title to look for.
 * @param {{ signal?: AbortSignal }} [options] - Optional abort signal shared by all requests.
 * @returns {Promise<object>} Result with platform, keyword, url, status
 *   ("ok" | "no_match" | "error"), error, candidates and, on non-error paths, searchUrl.
 */
export async function findProgramPage(platform, keyword, options = {}) {
  // Shared shape of every error result returned by this function.
  const failure = (message) => ({
    platform,
    keyword,
    url: "",
    status: "error",
    error: message,
    candidates: [],
  });

  const config = SEARCH_CONFIGS[platform];
  if (!config) {
    return failure(`unsupported platform: ${platform}`);
  }

  try {
    let keywordAliases;
    if (platform === "youku") {
      keywordAliases = await youkuHomeSearchKeywords(keyword, options.signal);
    } else if (platform === "iqiyi") {
      keywordAliases = iqiyiSearchKeywords(keyword);
    } else {
      keywordAliases = [keyword];
    }

    const searchUrl = config.searchUrl(keyword);
    const skipNativeSearch = Boolean(config.preferFallback);
    let html = "";
    let blockedSearch = skipNativeSearch;
    let responseOk = true;

    if (!skipNativeSearch) {
      const response = await fetch(searchUrl, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(options.signal, SEARCH_TIMEOUT_MS),
      });
      html = await response.text();
      blockedSearch = response.status === 403 || response.status === 429 || isBlockedSearchPage(html);
      responseOk = response.ok;
    }

    if (!responseOk && !blockedSearch) {
      return failure("search HTTP error");
    }

    let candidates = [];
    if (!blockedSearch) {
      const rawCandidates = await candidateUrlsFromHtml(platform, html, searchUrl, config, keyword, options.signal);
      candidates = await rankCandidates(platform, rawCandidates, keyword, options.signal);
    }
    let matchedSearchUrl = searchUrl;

    // Escalating lookups; each one is skipped once a strong candidate exists.
    const stages = [
      {
        enabled: Boolean(config.siteSearchUrls?.length),
        run: () => findFromSiteSearches(platform, config, keywordAliases, options.signal),
      },
      {
        enabled: platform === "tencent",
        run: () => findFromTencentStationSearch(config, keywordAliases, options.signal),
      },
      {
        enabled: platform === "iqiyi",
        run: () => findIqiyiFromDuckDuckGo(config, keywordAliases, options.signal),
      },
      {
        enabled: true,
        run: () => findFromFallbackSearch(platform, config, keywordAliases, options.signal),
      },
    ];
    for (const stage of stages) {
      if (!stage.enabled || hasStrongCandidate(candidates)) continue;
      const outcome = await stage.run();
      candidates = mergeCandidates(candidates, outcome.candidates);
      matchedSearchUrl = outcome.searchUrl || matchedSearchUrl;
    }

    const [best] = candidates;
    let errorText = "";
    if (!best) {
      errorText = blockedSearch
        ? "search page requires verification"
        : "no program page found from search page";
    }
    return {
      platform,
      keyword,
      url: best?.url || "",
      status: best ? "ok" : "no_match",
      error: errorText,
      candidates,
      searchUrl: matchedSearchUrl,
    };
  } catch (error) {
    return failure(error.message);
  }
}
/**
 * Run findProgramPage under an overall time budget of QUICK_SEARCH_TIMEOUT_MS.
 *
 * findProgramPage handles its own errors and returns a result object, so an
 * elapsed budget normally surfaces as a "no_match" or a generic abort message
 * rather than as an exception. This wrapper therefore inspects the abort flag
 * after the call and reports the timeout explicitly whenever the budget fired
 * and no match was produced.
 *
 * @param {string} platform - Key of the platform configuration.
 * @param {string} keyword - Program title to look for.
 * @returns {Promise<object>} The findProgramPage result, or an error result
 *   whose message is `quick search timeout <ms>ms` when the budget elapsed.
 */
export async function findProgramPageQuick(platform, keyword) {
  const controller = new AbortController();
  const timeoutMessage = `quick search timeout ${QUICK_SEARCH_TIMEOUT_MS}ms`;
  const timer = setTimeout(() => controller.abort(), QUICK_SEARCH_TIMEOUT_MS);
  try {
    const result = await findProgramPage(platform, keyword, { signal: controller.signal });
    // A match found before the budget elapsed is still a valid answer; only
    // relabel results that failed to produce one.
    if (controller.signal.aborted && result?.status !== "ok") {
      return {
        platform,
        keyword,
        url: "",
        status: "error",
        error: timeoutMessage,
        candidates: [],
        searchUrl: result?.searchUrl || "",
      };
    }
    return result;
  } catch (error) {
    return {
      platform,
      keyword,
      url: "",
      status: "error",
      error: controller.signal.aborted ? timeoutMessage : error.message,
      candidates: [],
      searchUrl: "",
    };
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Build the abort signal for a single request.
 *
 * @param {AbortSignal} [parentSignal] - Optional caller-supplied signal.
 * @param {number} timeoutMs - Per-request timeout in milliseconds.
 * @returns {AbortSignal} A timeout signal, combined with `parentSignal` when given.
 */
function fetchSignal(parentSignal, timeoutMs) {
  const deadline = AbortSignal.timeout(timeoutMs);
  if (!parentSignal) {
    return deadline;
  }
  return AbortSignal.any([parentSignal, deadline]);
}
/**
 * Try each on-site search URL for each keyword alias.
 *
 * Returns immediately on the first strong candidate list; otherwise returns
 * the largest candidate list seen together with the URL that produced it.
 * Failed, non-OK or verification-blocked requests are skipped.
 *
 * @returns {Promise<{candidates: object[], searchUrl: string}>}
 */
async function findFromSiteSearches(platform, config, keywords, signal) {
  // Fetch and rank one search page; null means "nothing usable from this URL".
  const probe = async (searchUrl, keyword) => {
    try {
      const response = await fetch(searchUrl, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(signal, 8_000),
      });
      if (!response.ok) return null;
      const html = await response.text();
      if (isBlockedSearchPage(html)) return null;
      const urls = await candidateUrlsFromHtml(platform, html, searchUrl, config, keyword, signal);
      return await rankCandidates(platform, urls, keyword, signal);
    } catch {
      return null;
    }
  };

  let best = { candidates: [], searchUrl: "" };
  for (const keyword of uniqueKeywords(keywords)) {
    for (const buildUrl of config.siteSearchUrls || []) {
      const searchUrl = buildUrl(keyword);
      const candidates = await probe(searchUrl, keyword);
      if (!candidates) continue;
      if (hasStrongCandidate(candidates)) {
        return { candidates, searchUrl };
      }
      if (candidates.length > best.candidates.length) {
        best = { candidates, searchUrl };
      }
    }
  }
  return best;
}
// Tencent Video search API endpoints. findFromTencentStationSearch POSTs the
// same JSON payload to each of them, in this order, for every keyword.
const TENCENT_SEARCH_API_URLS = [
"https://pbaccess.video.qq.com/trpc.videosearch.mobile_search.MultiTerminalSearch/MbSearch?vversion_platform=2",
"https://pbaccess.video.qq.com/trpc.videosearch.mobile_search.HttpMobileRecall/MbSearchHttp",
];
/**
 * Query the Tencent search API endpoints for each keyword alias.
 *
 * Returns on the first strong candidate list; otherwise returns the largest
 * candidate list seen and the endpoint that produced it. Requests that fail,
 * return a non-OK status, or yield unparsable JSON are skipped.
 *
 * @returns {Promise<{candidates: object[], searchUrl: string}>}
 */
async function findFromTencentStationSearch(config, keywords, signal) {
  // Query one endpoint; null means the endpoint gave nothing usable.
  const queryEndpoint = async (searchUrl, keyword) => {
    try {
      const response = await fetch(searchUrl, {
        method: "POST",
        headers: getTencentSearchApiHeaders(keyword),
        body: JSON.stringify(buildTencentSearchPayload(keyword)),
        redirect: "follow",
        signal: fetchSignal(signal, 8_000),
      });
      if (!response.ok) return null;
      const json = await response.json();
      const extracted = extractTencentSearchCandidates(json, keyword, config);
      return await rankCandidates("tencent", extracted, keyword, signal);
    } catch {
      return null;
    }
  };

  let best = { candidates: [], searchUrl: "" };
  for (const keyword of uniqueKeywords(keywords)) {
    for (const searchUrl of TENCENT_SEARCH_API_URLS) {
      const candidates = await queryEndpoint(searchUrl, keyword);
      if (!candidates) continue;
      if (hasStrongCandidate(candidates)) {
        return { candidates, searchUrl };
      }
      if (candidates.length > best.candidates.length) {
        best = { candidates, searchUrl };
      }
    }
  }
  return best;
}
/**
 * Headers for a Tencent search API request: the generic Tencent request
 * headers overlaid with JSON content negotiation, the v.qq.com origin and a
 * referer pointing at the search page for `keyword`.
 */
function getTencentSearchApiHeaders(keyword) {
  const baseHeaders = getRequestHeaders("tencent");
  const apiHeaders = {
    accept: "application/json, text/plain, */*",
    "content-type": "application/json",
    origin: "https://v.qq.com",
    referer: SEARCH_CONFIGS.tencent.searchUrl(keyword),
  };
  return { ...baseHeaders, ...apiHeaders };
}
/**
 * Build the JSON body posted to the Tencent search API for `keyword`.
 * Every field except `query` and the freshly generated `uuid` is a fixed value.
 */
function buildTencentSearchPayload(keyword) {
  const extraInfo = {
    isNewMarkLabel: "0",
    multi_terminal_pc: "1",
    themeType: "0",
    sugRelatedIds: "{}",
    appVersion: "",
    frontVersion: "26041606",
  };
  const featureList = [
    "DEFAULT_FEFEATURE",
    "PC_SHORT_VIDEOS_WATERFALL",
    "PC_WANT_EPISODE_V2",
    "PC_WANT_EPISODE",
  ];
  const paging = { pagenum: 0, pagesize: 20 };

  return {
    query: keyword,
    ...paging,
    queryFrom: 0,
    filterValue: "",
    sceneId: 21,
    searchDatakey: "",
    transInfo: "",
    isneedQc: true,
    preQid: "",
    adClientInfo: "",
    extraInfo,
    version: "26022601",
    clientType: 1,
    uuid: crypto.randomUUID(),
    retry: 0,
    featureList,
  };
}
/**
 * Search external engines using the platform's fallback queries.
 *
 * Iterates keyword alias, then query builder, then engine. Returns on the
 * first strong candidate list; otherwise returns the largest candidate list
 * seen and the engine URL that produced it. Failed or non-OK requests are skipped.
 *
 * @returns {Promise<{candidates: object[], searchUrl: string}>}
 */
async function findFromFallbackSearch(platform, config, keywords, signal) {
  // Lazily enumerate every (keyword, engine) attempt in the original nesting order.
  function* attempts() {
    for (const keyword of uniqueKeywords(keywords)) {
      for (const buildQuery of config.fallbackQueries || []) {
        const query = buildQuery(keyword);
        for (const engine of fallbackSearchUrls(query)) {
          yield { keyword, engine };
        }
      }
    }
  }

  let best = { candidates: [], searchUrl: "" };
  for (const { keyword, engine } of attempts()) {
    try {
      const response = await fetch(engine.url, {
        headers: {
          ...getRequestHeaders(""),
          referer: engine.referer,
        },
        redirect: "follow",
        signal: fetchSignal(signal, 8_000),
      });
      if (!response.ok) continue;
      const html = await response.text();
      const urls = await candidateUrlsFromHtml(platform, html, engine.url, config, keyword, signal);
      const candidates = await rankCandidates(platform, urls, keyword, signal);
      if (hasStrongCandidate(candidates)) {
        return { candidates, searchUrl: engine.url };
      }
      if (candidates.length > best.candidates.length) {
        best = { candidates, searchUrl: engine.url };
      }
    } catch {
      continue;
    }
  }
  return best;
}
/**
 * Look for iQiyi pages through DuckDuckGo's HTML endpoint.
 *
 * Unlike the generic ranking, candidates are scored from the surrounding
 * snippet only (no page-title fetch). A result is returned only when it
 * contains a strong candidate; otherwise the result is empty.
 *
 * @returns {Promise<{candidates: object[], searchUrl: string}>}
 */
async function findIqiyiFromDuckDuckGo(config, keywords, signal) {
  const byScoreDesc = (left, right) => right.score - left.score;

  for (const keyword of uniqueKeywords(keywords)) {
    const query = `${keyword} 爱奇艺`;
    const searchUrl = `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
    try {
      const response = await fetch(searchUrl, {
        headers: {
          ...getRequestHeaders(""),
          referer: "https://duckduckgo.com/",
        },
        redirect: "follow",
        signal: fetchSignal(signal, 8_000),
      });
      if (!response.ok) continue;
      const html = await response.text();

      const scored = [];
      for (const candidate of extractCandidateUrls(html, searchUrl, config, keyword)) {
        const keywordScore = keywordMatchScore(candidate.evidence, keyword);
        if (keywordScore > 0) {
          scored.push({
            ...candidate,
            keywordScore,
            score: candidate.score + keywordScore,
          });
        }
      }
      const candidates = scored.sort(byScoreDesc).slice(0, 10);
      if (hasStrongCandidate(candidates)) {
        return { candidates, searchUrl };
      }
    } catch {
      continue;
    }
  }
  return { candidates: [], searchUrl: "" };
}
/**
 * Expand `keyword` with suggestion texts from Youku's search API.
 *
 * The lookup is best-effort: any failure leaves just the original keyword.
 *
 * @returns {Promise<string[]>} Up to five distinct keywords, original first.
 */
async function youkuHomeSearchKeywords(keyword, signal) {
  // Property order is preserved because the object is serialized and signed.
  const request = {
    pg: "1",
    pz: "12",
    searchFrom: "home",
    utdId: "XlQcF5xQrCcCAWoLKdGqIOhS",
    ykPid: "",
    sdkver: 314,
    pcKuFlixMode: 1,
    appScene: "kubox",
    appCaller: "pc",
    s: "pc",
    device: "pc",
    platform: "pc",
    keyword,
  };

  let suggestions = [];
  try {
    const json = await fetchYoukuMtopSearch(request, signal);
    suggestions = extractYoukuSuggestionTexts(json);
  } catch {
    // Suggestions are optional; fall back to the bare keyword.
  }
  return uniqueKeywords([keyword, ...suggestions]).slice(0, 5);
}
/**
 * Call Youku's `mtop.youku.soku.yksearch` API.
 *
 * A first request with an empty token is made only to obtain the token
 * cookie; the token from that cookie then signs the second, real request.
 *
 * @param {object} dataObject - Request data, serialized with JSON.stringify.
 * @param {AbortSignal} [signal] - Optional abort signal.
 * @returns {Promise<object>} Parsed JSON response, or `{}` when no token cookie was received.
 */
async function fetchYoukuMtopSearch(dataObject, signal) {
  const api = "mtop.youku.soku.yksearch";
  const appKey = "23774304";
  const data = JSON.stringify(dataObject);
  const baseHeaders = {
    ...getRequestHeaders("youku"),
    referer: "https://www.youku.com/",
  };
  const requestOptions = (headers) => ({
    headers,
    redirect: "follow",
    signal: fetchSignal(signal, 8_000),
  });

  const handshake = await fetch(
    buildYoukuMtopUrl({ api, appKey, data, token: "" }),
    requestOptions(baseHeaders),
  );
  await handshake.text();

  const cookieHeader = handshake.headers.get("set-cookie") || "";
  const token = extractMtopToken(cookieHeader);
  if (!token) {
    return {};
  }

  const signedUrl = buildYoukuMtopUrl({ api, appKey, data, token });
  const response = await fetch(
    signedUrl,
    requestOptions({ ...baseHeaders, cookie: compactMtopCookie(cookieHeader) }),
  );
  return response.json();
}
/**
 * Build a signed Youku mtop request URL.
 *
 * The signature is the hex MD5 of `token&timestamp&appKey&data`, where the
 * timestamp is the current time in milliseconds.
 *
 * @param {{api: string, appKey: string, data: string, token: string}} params
 * @returns {string} Fully qualified request URL.
 */
function buildYoukuMtopUrl({ api, appKey, data, token }) {
  const timestamp = String(Date.now());
  const hasher = crypto.createHash("md5");
  hasher.update(`${token}&${timestamp}&${appKey}&${data}`);
  const sign = hasher.digest("hex");

  const query = new URLSearchParams();
  const fields = [
    ["jsv", "2.7.2"],
    ["appKey", appKey],
    ["t", timestamp],
    ["sign", sign],
    ["api", api],
    ["v", "2.0"],
    ["type", "GET"],
    ["dataType", "json"],
    ["ecode", "1"],
    ["data", data],
  ];
  for (const [name, value] of fields) {
    query.append(name, value);
  }
  return `https://acs.youku.com/h5/${api}/2.0/?${query.toString()}`;
}
/**
 * Extract the token part of the `_m_h5_tk` cookie (the text before the first
 * underscore of its value) from a Set-Cookie header string.
 *
 * @param {string} cookieHeader
 * @returns {string} The token, or "" when the cookie is absent.
 */
function extractMtopToken(cookieHeader) {
  const found = cookieHeader.match(/_m_h5_tk=([^_;]+)/);
  if (!found) {
    return "";
  }
  const [token] = found[1].split("_");
  return token || "";
}
/**
 * Reduce a combined Set-Cookie header string to a Cookie request header that
 * carries only the `name=value` pairs whose name starts with `_m_h5` or `mtop`.
 *
 * @param {string} cookieHeader - Set-Cookie values joined by ", ".
 * @returns {string} Pairs joined with "; ".
 */
function compactMtopCookie(cookieHeader) {
  const kept = [];
  for (const match of cookieHeader.matchAll(/(?:^|, )([^=;, ]+=[^;]+)/g)) {
    const pair = match[1];
    if (pair.startsWith("_m_h5") || pair.startsWith("mtop")) {
      kept.push(pair);
    }
  }
  return kept.join("; ");
}
/**
 * Collect suggestion strings from a Youku search response: every non-empty
 * string stored under a key named `w`, `show_w` or `keyword`, at any depth,
 * with HTML tags removed and whitespace trimmed.
 *
 * @param {*} json - Parsed API response.
 * @returns {string[]} Texts in traversal order (duplicates are kept).
 */
function extractYoukuSuggestionTexts(json) {
  const suggestionKeys = ["w", "show_w", "keyword"];
  const texts = [];
  walkJson(json, (key, value) => {
    const isCandidate = typeof value === "string" && suggestionKeys.includes(key);
    if (!isCandidate) return;
    const cleaned = stripHtml(value).trim();
    if (cleaned) {
      texts.push(cleaned);
    }
  });
  return texts;
}
/**
 * Depth-first, pre-order traversal of a JSON-like value.
 *
 * `visit(key, child)` is called for every own enumerable property of every
 * object or array reached, before descending into that child.
 */
function walkJson(value, visit) {
  if (typeof value !== "object" || value === null) {
    return;
  }
  Object.entries(value).forEach(([key, child]) => {
    visit(key, child);
    walkJson(child, visit);
  });
}
/**
 * Remove anything that looks like an HTML tag (`<...>`) from `value`.
 * Falsy input is treated as the empty string.
 *
 * @param {*} value
 * @returns {string}
 */
function stripHtml(value) {
  const text = value ? String(value) : "";
  const tagPattern = /<[^>]+>/g;
  return text.replace(tagPattern, "");
}
/**
 * Trim the given keywords and drop empties and duplicates.
 *
 * Two keywords are duplicates when normalizeSearchText maps them to the same
 * string; the first spelling encountered is the one kept.
 *
 * @param {Iterable<*>} keywords
 * @returns {string[]} Distinct keywords in first-seen order.
 */
function uniqueKeywords(keywords) {
  const firstByKey = new Map();
  for (const raw of keywords) {
    const text = String(raw || "").trim();
    if (!text) continue;
    const key = normalizeSearchText(text);
    if (!firstByKey.has(key)) {
      firstByKey.set(key, text);
    }
  }
  return [...firstByKey.values()];
}
/**
 * Build search keyword variants for iQiyi.
 *
 * For titles shaped like `<prefix><1-2 digit season>之<title>`, two extra
 * variants are added after the original: one without the "之" separator and
 * one with the season number dropped (`<prefix> <title>`).
 *
 * @param {*} keyword - Program title; falsy values are treated as "".
 * @returns {string[]} Up to five distinct keywords, original first.
 */
export function iqiyiSearchKeywords(keyword) {
  const value = String(keyword || "").trim();
  const keywords = [value];
  const seasonMatch = value.match(/^(.+?)(\d{1,2})之(.+)$/);
  if (seasonMatch) {
    const [, prefix, season, title] = seasonMatch;
    // This variant used to be pushed twice in a row; the second push was a
    // no-op because uniqueKeywords removes duplicates.
    keywords.push(`${prefix}${season}${title}`);
    keywords.push(`${prefix} ${title}`);
  }
  return uniqueKeywords(keywords).slice(0, 5);
}
/**
 * List the external search engine requests for `query`, in the order they
 * are tried: Bing RSS, Bing HTML, DuckDuckGo HTML, Baidu, Sogou.
 *
 * @param {string} query
 * @returns {{url: string, referer: string}[]}
 */
function fallbackSearchUrls(query) {
  const encoded = encodeURIComponent(query);
  // [engine home page (also used as referer), path and query prefix]
  const engines = [
    ["https://www.bing.com/", "search?format=rss&q="],
    ["https://www.bing.com/", "search?q="],
    ["https://duckduckgo.com/", "html/?q="],
    ["https://www.baidu.com/", "s?wd="],
    ["https://www.sogou.com/", "web?query="],
  ];
  return engines.map(([referer, pathPrefix]) => ({
    url: `${referer}${pathPrefix}${encoded}`,
    referer,
  }));
}
/**
 * Gather candidate program URLs from a result page: links found directly in
 * the HTML, targets of short links, and - only when fewer than two direct
 * candidates were found - links discovered on intermediate "bridge" pages.
 *
 * @returns {Promise<object[]>} Merged candidates (best score per URL, top 10).
 */
async function candidateUrlsFromHtml(platform, html, baseUrl, config, keyword, signal) {
  const direct = extractCandidateUrls(html, baseUrl, config, keyword);
  const expanded = await expandShortLinkCandidates(platform, html, config, keyword, signal);

  let bridge = [];
  if (direct.length < 2) {
    bridge = await expandBridgePageCandidates(platform, html, baseUrl, config, keyword, signal);
  }
  return mergeCandidates(direct, expanded, bridge);
}
/**
 * Extract scored candidate program URLs from raw HTML, JSON or RSS text.
 *
 * Structured result blocks are considered first, then every URL-looking
 * match of several link patterns. For each URL only the highest-scoring
 * entry is kept; link matches carry a text snippet around the match as evidence.
 *
 * @param {string} html - Page source.
 * @param {string} baseUrl - Base for resolving relative links.
 * @param {object} config - Platform configuration used for scoring.
 * @param {string} keyword - Program title.
 * @returns {{url: string, score: number, evidence: string}[]} Top 10 by score.
 */
export function extractCandidateUrls(html, baseUrl, config, keyword) {
  const decoded = decodeEscapedText(html);
  const bestByUrl = new Map();
  const beatsExisting = (url, score) => {
    const existing = bestByUrl.get(url);
    return !existing || score > existing.score;
  };

  for (const structured of extractStructuredSearchCandidates(decoded, baseUrl, config, keyword)) {
    if (beatsExisting(structured.url, structured.score)) {
      bestByUrl.set(structured.url, structured);
    }
  }

  // Scanned in this order: href attributes, URL-valued fields, RSS <link>
  // elements, quoted absolute URLs, then bare platform URLs with and without scheme.
  const linkPatterns = [
    /\bhref\s*=\s*["']([^"']+)["']/gi,
    /\b(?:url|playUrl|pageUrl|coverUrl|jumpUrl|target)\s*[:=]\s*["']([^"']+)["']/gi,
    /<link>\s*([^<\s]+)\s*<\/link>/gi,
    /["']((?:https?:)?\/\/[^"']+)["']/gi,
    /\b((?:https?:)?\/\/(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi,
    /\b((?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi,
  ];
  for (const pattern of linkPatterns) {
    for (const match of decoded.matchAll(pattern)) {
      const url = normalizeUrl(match[1], baseUrl);
      if (!url) continue;
      const score = scoreUrl(url, config, keyword);
      if (score <= 0 || !beatsExisting(url, score)) continue;
      bestByUrl.set(url, {
        url,
        score,
        evidence: cleanSnippet(decoded, match.index ?? 0, 700),
      });
    }
  }

  return Array.from(bestByUrl.values())
    .sort((left, right) => right.score - left.score)
    .slice(0, 10);
}
/**
 * Turn a Tencent search API response into scored candidates.
 *
 * Items whose evidence text does not match the keyword are ignored. Each
 * remaining item's program URLs are scored with scoreUrl plus a fixed bonus
 * of 140; the best entry per URL is kept.
 *
 * @param {object} json - Parsed API response.
 * @param {string} keyword - Program title.
 * @param {object} [config] - Platform configuration (defaults to Tencent's).
 * @returns {{url: string, score: number, evidence: string}[]} Top 10 by score.
 */
export function extractTencentSearchCandidates(json, keyword, config = SEARCH_CONFIGS.tencent) {
  const apiBonus = 140;
  const bestByUrl = new Map();

  tencentSearchItems(json).forEach(({ item, boxShowName }) => {
    const evidence = tencentItemEvidence(item, boxShowName);
    if (keywordMatchScore(evidence, keyword) <= 0) return;

    tencentItemProgramUrls(item).forEach((url) => {
      const baseScore = scoreUrl(url, config, keyword);
      if (baseScore <= 0) return;
      const score = baseScore + apiBonus;
      const existing = bestByUrl.get(url);
      if (!existing || score > existing.score) {
        bestByUrl.set(url, { url, score, evidence });
      }
    });
  });

  const ranked = Array.from(bestByUrl.values());
  ranked.sort((left, right) => right.score - left.score);
  return ranked.slice(0, 10);
}
/**
 * Flatten the item lists of a Tencent search response.
 *
 * Reads `data.normalList` followed by every entry of `data.areaBoxList`, and
 * yields each distinct item object once, paired with the `boxShowName` of the
 * list it was first seen in ("" when the list has none).
 *
 * @param {object} json - Parsed API response.
 * @returns {{item: object, boxShowName: string}[]}
 */
function tencentSearchItems(json) {
  const data = json?.data;
  const lists = [data?.normalList, ...(data?.areaBoxList || [])];
  const alreadySeen = new Set();
  const collected = [];

  lists.forEach((list) => {
    if (!list) return;
    (list.itemList || []).forEach((item) => {
      const isObject = Boolean(item) && typeof item === "object";
      if (!isObject || alreadySeen.has(item)) return;
      alreadySeen.add(item);
      collected.push({ item, boxShowName: list.boxShowName || "" });
    });
  });
  return collected;
}
/**
 * Collect the distinct program URLs referenced by a Tencent search item.
 *
 * Looks at the item itself and at its `videoInfo` and `doc` sub-objects. A
 * node whose `dataType` is 2 contributes a cover URL built from its id; every
 * node also contributes its URL-valued fields in canonical form.
 *
 * @param {object} item
 * @returns {string[]} URLs in first-seen order.
 */
function tencentItemProgramUrls(item) {
  const linkKeys = ["url", "playUrl", "pageUrl", "coverUrl", "jumpUrl", "target"];
  const collected = new Set();

  for (const node of [item, item?.videoInfo, item?.doc]) {
    if (!node) continue;
    if (Number(node.dataType) === 2) {
      const coverUrl = tencentCoverUrlFromCid(tencentNodeId(node));
      if (coverUrl) collected.add(coverUrl);
    }
    for (const key of linkKeys) {
      const url = canonicalTencentProgramUrl(node[key]);
      if (url) collected.add(url);
    }
  }
  return [...collected];
}
/**
 * Return the first truthy identifier among `cid`, `coverId`, `cover_id` and
 * `id` of `node`, as a trimmed string ("" when none is set).
 */
function tencentNodeId(node) {
  const candidates = [node?.cid, node?.coverId, node?.cover_id, node?.id];
  const firstSet = candidates.find(Boolean);
  return String(firstSet || "").trim();
}
/**
 * Build the Tencent cover page URL for a cover id.
 *
 * @param {*} cid - Cover id; must be 8-40 ASCII letters or digits after trimming.
 * @returns {string} The cover URL, or "" when the id has an unexpected shape.
 */
function tencentCoverUrlFromCid(cid) {
  const id = String(cid || "").trim();
  const looksLikeCoverId = /^[a-z0-9]{8,40}$/i.test(id);
  return looksLikeCoverId ? `https://v.qq.com/x/cover/${id}.html` : "";
}
/**
 * Normalize a Tencent link and collapse episode-level cover URLs
 * (`/x/cover/<cid>/<vid>.html`) to the cover page (`/x/cover/<cid>.html`).
 *
 * @param {*} rawUrl - Link as found in the API response.
 * @returns {string} Canonical URL, the normalized URL when it is not a
 *   v.qq.com cover link, or "" when the link cannot be normalized.
 */
function canonicalTencentProgramUrl(rawUrl) {
  const url = normalizeUrl(rawUrl, "https://v.qq.com/");
  if (!url) {
    return "";
  }
  try {
    const { hostname, pathname } = new URL(url);
    if (hostname === "v.qq.com") {
      const coverPattern = /^\/x\/cover\/([^/]+)(?:\/[^/]+)?\.html$/;
      const coverMatch = coverPattern.exec(safeDecodeURIComponent(pathname));
      if (coverMatch) {
        return `https://v.qq.com/x/cover/${coverMatch[1]}.html`;
      }
    }
  } catch {}
  return url;
}
/**
 * Build the evidence text for a Tencent search item: the box name followed
 * by the item's descriptive strings, with HTML tags removed, trimmed,
 * de-duplicated and joined by single spaces.
 */
function tencentItemEvidence(item, boxShowName = "") {
  const rawValues = [boxShowName];
  collectTencentEvidenceStrings(item, rawValues);

  const distinct = new Set();
  for (const raw of rawValues) {
    const text = stripHtml(raw).trim();
    if (text) {
      distinct.add(text);
    }
  }
  return [...distinct].join(" ");
}
/**
 * Recursively append descriptive strings of `value` to `results`.
 *
 * A string is collected when its key contains title, name, subtitle, desc,
 * keyword or text (case-insensitive). Recursion stops below depth 3, and an
 * object is not entered at all once `results` already holds more than 80 entries.
 *
 * @param {*} value - Object or array to scan.
 * @param {string[]} results - Output array, mutated in place.
 * @param {number} [depth] - Current nesting depth.
 */
function collectTencentEvidenceStrings(value, results, depth = 0) {
  if (typeof value !== "object" || value === null) return;
  if (depth > 3 || results.length > 80) return;

  const descriptiveKey = /title|name|subtitle|desc|keyword|text/i;
  Object.entries(value).forEach(([key, child]) => {
    switch (typeof child) {
      case "string":
        if (descriptiveKey.test(key)) {
          results.push(child);
        }
        break;
      case "object":
        if (child) {
          collectTencentEvidenceStrings(child, results, depth + 1);
        }
        break;
      default:
        break;
    }
  });
}
/**
 * Follow short links found in `html` and keep the ones that redirect to a
 * program page.
 *
 * Only the first five short links are considered (duplicates among them are
 * skipped). A resolved target scores scoreUrl plus a bonus of 120 and keeps
 * the short link's evidence. Request failures are ignored.
 *
 * @returns {Promise<{url: string, score: number, evidence: string}[]>}
 */
async function expandShortLinkCandidates(platform, html, config, keyword, signal) {
  const redirectBonus = 120;
  const shortLinks = extractShortLinks(decodeEscapedText(html), keyword, platform).slice(0, 5);
  const visited = new Set();
  const resolved = [];

  for (const { url: shortUrl, evidence } of shortLinks) {
    if (visited.has(shortUrl)) continue;
    visited.add(shortUrl);
    try {
      const response = await fetch(shortUrl, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(signal, 5_000),
      });
      const target = response.url || "";
      const score = scoreUrl(target, config, keyword);
      if (score > 0) {
        resolved.push({ url: target, score: score + redirectBonus, evidence });
      }
    } catch {}
  }
  return resolved;
}
/**
 * Find short links and Weibo post links in `text`.
 *
 * When `keyword` is given, the text around a link must match it; when
 * `platform` is given, that text must also mention the platform.
 *
 * @param {string} text
 * @param {string} [keyword]
 * @param {string} [platform]
 * @returns {{url: string, evidence: string}[]} Links in document order.
 */
export function extractShortLinks(text, keyword = "", platform = "") {
  const decoded = decodeEscapedText(text);
  const shortLinkPattern = /\bhttps?:\/\/(?:t\.cn|url\.cn|m\.weibo\.cn\/status|weibo\.com\/ttarticle\/x\/m\/show)[^\s"'<>),。;]+/gi;

  return [...decoded.matchAll(shortLinkPattern)]
    .map((match) => ({
      url: match[0],
      evidence: cleanSnippet(decoded, match.index ?? 0, 500),
    }))
    .filter(({ evidence }) => {
      if (keyword && keywordMatchScore(evidence, keyword) <= 0) return false;
      if (platform && !platformEvidenceMatches(evidence, platform)) return false;
      return true;
    });
}
/**
 * Fetch up to three "bridge" pages (third-party result pages that match the
 * keyword) and harvest program links from them.
 *
 * Links found directly on a bridge page get a bonus of 60 and the bridge's
 * evidence prepended to their own; short links on the page are expanded too.
 * Pages that fail to load or respond with a non-OK status are skipped.
 *
 * @returns {Promise<{url: string, score: number, evidence: string}[]>}
 */
async function expandBridgePageCandidates(platform, html, baseUrl, config, keyword, signal) {
  const bridgeBonus = 60;
  const bridges = extractBridgePageUrls(html, baseUrl, keyword, platform).slice(0, 3);
  const harvested = [];

  for (const { url: bridgeUrl, evidence: bridgeEvidence } of bridges) {
    try {
      const response = await fetch(bridgeUrl, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(signal, 6_000),
      });
      if (!response.ok) continue;
      const pageHtml = await response.text();

      const onPage = extractCandidateUrls(pageHtml, response.url || bridgeUrl, config, keyword);
      for (const candidate of onPage) {
        harvested.push({
          ...candidate,
          score: candidate.score + bridgeBonus,
          evidence: `${bridgeEvidence} ${candidate.evidence}`.trim(),
        });
      }

      const viaShortLinks = await expandShortLinkCandidates(platform, pageHtml, config, keyword, signal);
      harvested.push(...viaShortLinks);
    } catch {}
  }
  return harvested;
}
/**
 * Find third-party pages in a search result listing that are worth fetching
 * as "bridges" towards a program page.
 *
 * Result blocks (RSS items and result-like divs) are kept when their text
 * matches the keyword and mentions the platform (each check applies only if
 * the corresponding argument is non-empty). From those blocks, links are
 * returned unless they already score as program pages or point at a search engine.
 *
 * @returns {{url: string, evidence: string}[]} Distinct URLs in document order.
 */
export function extractBridgePageUrls(html, baseUrl, keyword = "", platform = "") {
  const decoded = decodeEscapedText(html);
  const blockPatterns = [
    /<item\b[\s\S]*?<\/item>/gi,
    /<div\b[^>]*(?:class|id)=["'][^"']*(?:result|b_algo|vrwrap|news-box|result-op)[^"']*["'][\s\S]*?<\/div>/gi,
  ];
  const linkPatterns = [
    /<link>\s*([^<\s]+)\s*<\/link>/gi,
    /\bhref\s*=\s*["']([^"']+)["']/gi,
  ];
  const platformConfig = configForBridge(platform);
  const seen = new Set();
  const bridges = [];

  for (const blockPattern of blockPatterns) {
    for (const [rawBlock] of decoded.matchAll(blockPattern)) {
      const blockText = decodePercentText(rawBlock)
        .replace(/<[^>]+>/g, " ")
        .replace(/\s+/g, " ")
        .trim();
      if (keyword && keywordMatchScore(blockText, keyword) <= 0) continue;
      if (platform && !platformEvidenceMatches(blockText, platform)) continue;

      for (const linkPattern of linkPatterns) {
        for (const link of rawBlock.matchAll(linkPattern)) {
          const url = normalizeUrl(link[1], baseUrl);
          if (!url || seen.has(url)) continue;
          // Links that already qualify as program pages are not bridges.
          if (scoreUrl(url, platformConfig, keyword) > 0) continue;
          if (isSearchOrEngineUrl(url)) continue;
          seen.add(url);
          bridges.push({ url, evidence: blockText });
        }
      }
    }
  }
  return bridges;
}
/**
 * Return the search configuration for `platform`, or an empty configuration
 * (no allowed hosts, no path rules) when the platform is unknown.
 */
function configForBridge(platform) {
  const known = SEARCH_CONFIGS[platform];
  if (known) {
    return known;
  }
  return { allowHosts: [], includePaths: [], excludePaths: [] };
}
/**
 * Tell whether `url` points at a Bing, Baidu, Sogou or Google host.
 * Unparsable URLs are reported as `true` so that callers skip them.
 */
function isSearchOrEngineUrl(url) {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch {
    return true;
  }
  const enginePattern = /(?:bing|baidu|sogou|google)\./;
  return enginePattern.test(hostname.toLowerCase());
}
/**
 * Check whether `text` mentions `platform` by one of its known names.
 * Platforms without a term list always match.
 */
function platformEvidenceMatches(text, platform) {
  const normalized = normalizeSearchText(text);
  const termsByPlatform = {
    tencent: ["腾讯视频", "腾讯", "小企鹅乐园", "企鹅乐园", "vqq"],
    youku: ["优酷", "youku"],
    iqiyi: ["爱奇艺", "iqiyi"],
    mgtv: ["芒果tv", "芒果", "mgtv"],
  };
  const terms = termsByPlatform[platform] || [];
  if (terms.length === 0) {
    return true;
  }
  return terms.some((term) => normalized.includes(normalizeSearchText(term)));
}
/**
 * Extract candidates from structured result blocks (RSS items and
 * result-like divs) whose text matches the keyword.
 *
 * Every acceptable URL inside such a block is scored with scoreUrl plus a
 * bonus of 80 and uses the block's plain text as evidence. The same URL may
 * appear more than once in the output.
 *
 * @param {string} decoded - Page source with escapes already decoded.
 * @returns {{url: string, score: number, evidence: string}[]}
 */
function extractStructuredSearchCandidates(decoded, baseUrl, config, keyword) {
  const blockBonus = 80;
  const blockPatterns = [
    /<item\b[\s\S]*?<\/item>/gi,
    /<div\b[^>]*(?:class|id)=["'][^"']*(?:result|b_algo|vrwrap|news-box|result-op)[^"']*["'][\s\S]*?<\/div>/gi,
  ];
  const urlPatterns = [
    /<link>\s*([^<\s]+)\s*<\/link>/gi,
    /\bhref\s*=\s*["']([^"']+)["']/gi,
    /["']((?:https?:)?\/\/[^"']+)["']/gi,
    /\b((?:https?:)?\/\/(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi,
  ];
  const found = [];

  for (const blockPattern of blockPatterns) {
    for (const [rawBlock] of decoded.matchAll(blockPattern)) {
      const blockText = decodePercentText(rawBlock)
        .replace(/<[^>]+>/g, " ")
        .replace(/\s+/g, " ")
        .trim();
      if (keywordMatchScore(blockText, keyword) <= 0) continue;

      for (const urlPattern of urlPatterns) {
        for (const match of rawBlock.matchAll(urlPattern)) {
          const url = normalizeUrl(match[1], baseUrl);
          if (!url) continue;
          const score = scoreUrl(url, config, keyword);
          if (score > 0) {
            found.push({ url, score: score + blockBonus, evidence: blockText });
          }
        }
      }
    }
  }
  return found;
}
/**
 * Re-score the first eight candidates using their page titles.
 *
 * A candidate is dropped when its page has a title that does not match the
 * keyword at all, or when neither its evidence nor its title matches. The
 * keyword score is added to the candidate's score. Pages are fetched one at a time.
 *
 * @returns {Promise<object[]>} Candidates with pageTitle and keywordScore, best first.
 */
async function rankCandidates(platform, candidates, keyword, signal) {
  const kept = [];
  for (const candidate of candidates.slice(0, 8)) {
    const pageTitle = await fetchPageTitle(platform, candidate.url, signal);
    if (titleConflictsWithKeyword(pageTitle, keyword)) continue;

    const keywordScore = keywordMatchScore(`${candidate.evidence} ${pageTitle}`, keyword);
    if (!(keywordScore > 0)) continue;

    kept.push({
      ...candidate,
      pageTitle,
      keywordScore,
      score: candidate.score + keywordScore,
    });
  }
  kept.sort((left, right) => right.score - left.score);
  return kept.slice(0, 10);
}
/**
 * Fetch `url` and return the text of its `<title>` element with escapes
 * decoded and whitespace collapsed.
 *
 * @returns {Promise<string>} The title, or "" when the request fails or the
 *   page has no title.
 */
async function fetchPageTitle(platform, url, signal) {
  try {
    const response = await fetch(url, {
      headers: getRequestHeaders(platform),
      redirect: "follow",
      signal: fetchSignal(signal, 6_000),
    });
    const html = await response.text();
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const rawTitle = titleMatch?.[1] || "";
    const collapsed = decodeEscapedText(rawTitle).replace(/\s+/g, " ");
    return collapsed.trim();
  } catch {
    return "";
  }
}
/**
 * Tell whether any candidate reaches the "strong" score threshold of 180,
 * which callers use to stop trying further search strategies.
 */
function hasStrongCandidate(candidates) {
  const strongScore = 180;
  const isStrong = (candidate) => candidate.score >= strongScore;
  return candidates.some(isStrong);
}
/**
 * A page title conflicts with the keyword when it is non-empty and has a
 * keyword match score of zero. A missing title never conflicts.
 */
export function titleConflictsWithKeyword(pageTitle, keyword) {
  const title = String(pageTitle || "").trim();
  return title !== "" && keywordMatchScore(title, keyword) === 0;
}
/**
 * Detect pages that ask for verification instead of showing results, by
 * looking for known anti-bot markers (case-insensitive) anywhere in the HTML.
 */
function isBlockedSearchPage(html) {
  const markers = [
    "_____tmd_____",
    "x5secdata",
    "captcha",
    "验证码",
    "安全验证",
    "人机验证",
    "访问过于频繁",
    "请求过于频繁",
  ];
  const blockedPattern = new RegExp(markers.join("|"), "i");
  return blockedPattern.test(html);
}
/**
 * Score how well `text` matches `keyword`.
 *
 * 220 points when the normalized text contains the whole normalized keyword,
 * 180 when every keyword token occurs, plus 45 per matching token. Returns 0
 * when the text is empty or the keyword yields no tokens.
 *
 * @returns {number}
 */
function keywordMatchScore(text, keyword) {
  const haystack = normalizeSearchText(text);
  const tokens = keywordTokens(keyword);
  if (!haystack || tokens.length === 0) {
    return 0;
  }

  const wholeBonus = haystack.includes(normalizeSearchText(keyword)) ? 220 : 0;
  const hits = tokens.reduce((count, token) => (haystack.includes(token) ? count + 1 : count), 0);
  const allTokensBonus = hits === tokens.length ? 180 : 0;
  return wholeBonus + allTokensBonus + hits * 45;
}
/**
 * Split a keyword on whitespace, colons, hyphens, underscores and slashes,
 * normalize each part, and return the distinct parts that are at least two
 * characters long.
 */
function keywordTokens(keyword) {
  const distinct = new Set();
  for (const part of String(keyword).split(/[\s:\-_/]+/)) {
    const token = normalizeSearchText(part);
    if (token.length >= 2) {
      distinct.add(token);
    }
  }
  return [...distinct];
}
/**
 * Canonical form used for text comparison: percent-escapes decoded,
 * lower-cased, and with brackets, colons, whitespace, hyphens, underscores
 * and slashes removed.
 */
function normalizeSearchText(value) {
  const decoded = decodePercentText(String(value || ""));
  const noisePattern = /[《》【】[\]():\s\-_/]+/g;
  return decoded.toLowerCase().replace(noisePattern, "");
}
/**
 * Score a URL as a potential program page for the platform in `config`.
 *
 * Returns 0 for unparsable URLs, hosts outside `allowHosts`, excluded paths,
 * bare `/a_` or `/v_` paths, Youku `/video` URLs without an `s` parameter,
 * paths matching no include pattern, truncated URLs and static assets.
 * Otherwise the score is 80, +20 for series-level paths, +5 for episode-level
 * paths and +5 when the URL contains the keyword.
 *
 * @returns {number}
 */
function scoreUrl(url, config, keyword) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return 0;
  }

  const host = parsed.hostname.toLowerCase();
  const path = safeDecodeURIComponent(parsed.pathname);
  const matchesPath = (pattern) => pattern.test(path);

  const hostAllowed = config.allowHosts.some(
    (allowedHost) => host === allowedHost || host.endsWith(`.${allowedHost}`),
  );
  if (!hostAllowed) return 0;
  if (config.excludePaths.some(matchesPath)) return 0;
  if (/^\/(?:a_|v_)\/?$/.test(path)) return 0;
  if (host.includes("youku.com") && path === "/video" && !parsed.searchParams.get("s")) return 0;
  if (!config.includePaths.some(matchesPath)) return 0;

  const seriesLevel = [/\/a_/, /\/show_page\//, /\/x\/cover\//, /\/b\//];
  const episodeLevel = [/\/v_/, /\/v_show\//, /\/x\/page\//];

  let total = 80;
  if (seriesLevel.some(matchesPath)) total += 20;
  if (episodeLevel.some(matchesPath)) total += 5;
  const mentionsKeyword = url.includes(encodeURIComponent(keyword)) || url.includes(keyword);
  if (mentionsKeyword) total += 5;

  // Ellipsized URLs (as shown in result snippets) and static assets never qualify.
  const isTruncated = url.includes("...") || url.includes("%E2%80%A6");
  const isStaticAsset = /\.(jpg|jpeg|png|gif|webp|css|js|ico|svg)$/i.test(path);
  return isTruncated || isStaticAsset ? 0 : total;
}
/**
 * Merge candidate lists, keeping the highest-scoring entry per URL.
 * Entries without a URL are ignored.
 *
 * @param {...object[]} groups - Candidate arrays.
 * @returns {object[]} Up to ten candidates, best score first.
 */
function mergeCandidates(...groups) {
  const bestByUrl = new Map();
  groups.flat().forEach((candidate) => {
    if (!candidate?.url) return;
    const current = bestByUrl.get(candidate.url);
    const improves = !current || candidate.score > current.score;
    if (improves) {
      bestByUrl.set(candidate.url, candidate);
    }
  });

  const ranked = Array.from(bestByUrl.values());
  ranked.sort((left, right) => right.score - left.score);
  return ranked.slice(0, 10);
}
/**
 * Turn a link found in a page or API response into an absolute, cleaned URL.
 *
 * Steps: decode escapes, reject `javascript:` and fragment-only links, add a
 * scheme to bare platform hosts, resolve against `baseUrl`, unwrap redirect
 * wrappers (returning the wrapped target), append `.html` to extension-less
 * Tencent cover paths, and strip tracking parameters and the fragment.
 *
 * @param {*} rawUrl - Link text; anything that is not a non-empty string yields "".
 * @param {string} baseUrl - Base for resolving relative links.
 * @returns {string} Normalized URL, or "" when the link is unusable.
 */
function normalizeUrl(rawUrl, baseUrl) {
  // API responses may carry numbers or objects in URL fields; calling
  // `.trim()` on those used to throw and abort the caller's whole extraction.
  if (typeof rawUrl !== "string" || !rawUrl) return "";
  const trimmed = decodeEscapedText(rawUrl.trim());
  if (trimmed.startsWith("javascript:") || trimmed.startsWith("#")) return "";
  try {
    const absolute = /^(?:https?:)?\/\//i.test(trimmed)
      ? trimmed
      : /^(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\//i.test(trimmed)
        ? `https://${trimmed}`
        : trimmed;
    const parsed = new URL(absolute, baseUrl);
    const unwrapped = decodeWrappedTarget(parsed);
    if (unwrapped) {
      // A protocol-relative target ("//host/path") cannot be parsed on its
      // own; give it the wrapper's scheme instead of discarding the link.
      const target = unwrapped.startsWith("//") ? `${parsed.protocol}${unwrapped}` : unwrapped;
      return new URL(target).toString();
    }
    normalizeTencentPath(parsed);
    return cleanupUrl(parsed);
  } catch {
    return "";
  }
}
/**
 * Append `.html` to v.qq.com cover paths that have no file extension.
 * Mutates `parsed` in place; other URLs are left untouched.
 *
 * @param {URL} parsed
 */
function normalizeTencentPath(parsed) {
  const isTencentCover = parsed.hostname === "v.qq.com" && /^\/x\/cover\//.test(parsed.pathname);
  if (isTencentCover && !pathExtension(parsed.pathname)) {
    parsed.pathname = `${parsed.pathname}.html`;
  }
}
/**
 * Tell whether `pathname` ends with a file extension, i.e. a dot followed by
 * one or more ASCII letters or digits.
 *
 * @returns {boolean}
 */
function pathExtension(pathname) {
  const extensionPattern = /\.[a-z0-9]+$/i;
  return extensionPattern.exec(pathname) !== null;
}
/**
 * Extract the real destination from a search-engine redirect wrapper.
 *
 * Handles Bing's base64url-encoded `u` parameter (optionally prefixed with
 * "a1") and plain redirect parameters (`url`, `u`, `uddg`, `target`, `to`,
 * `redirect`, `jump`), including DuckDuckGo's `uddg`.
 *
 * @param {URL} parsed - The possibly wrapping URL.
 * @returns {string} Absolute http(s) destination, or "" when `parsed` does
 *   not wrap another URL.
 */
function decodeWrappedTarget(parsed) {
  const isAbsoluteHttp = (candidate) => /^https?:\/\//i.test(candidate);
  const host = parsed.hostname.toLowerCase();

  // Match bing.com and its subdomains only, not hosts that merely end in the same letters.
  if (host === "bing.com" || host.endsWith(".bing.com")) {
    const encoded = parsed.searchParams.get("u");
    if (encoded) {
      try {
        const payload = encoded.startsWith("a1") ? encoded.slice(2) : encoded;
        const decoded = Buffer.from(payload, "base64url").toString("utf8");
        // Base64 decoding never fails on bad input, it just yields garbage;
        // accept only real URLs and otherwise try the plain parameters below.
        if (isAbsoluteHttp(decoded)) return decoded;
      } catch {}
    }
  }

  for (const key of ["url", "u", "uddg", "target", "to", "redirect", "jump"]) {
    const value = parsed.searchParams.get(key);
    if (!value) continue;
    const decoded = decodePercentText(value);
    if (isAbsoluteHttp(decoded)) return decoded;
    // Protocol-relative targets inherit the wrapper's scheme so that the
    // result is always parsable on its own.
    if (decoded.startsWith("//")) return `${parsed.protocol}${decoded}`;
  }
  return "";
}
/**
 * Strip the fragment and tracking/search parameters from `parsed` (mutating
 * it) and return the serialized URL.
 *
 * A parameter is removed when its name starts, case-insensitively, with one
 * of: ptag, from, fromvsogou, query, wd, q, src, source, utm_, spm, cxid.
 *
 * @param {URL} parsed
 * @returns {string}
 */
function cleanupUrl(parsed) {
  parsed.hash = "";
  const trackingPrefix = /^(ptag|from|fromvsogou|query|wd|q|src|source|utm_|spm|cxid)/i;
  const doomed = [...parsed.searchParams.keys()].filter((name) => trackingPrefix.test(name));
  doomed.forEach((name) => parsed.searchParams.delete(name));
  return parsed.toString();
}
/**
 * Decode JavaScript-style escapes (`\uXXXX`, `\xXX`, `\/`) and then HTML
 * entities in `value`.
 *
 * @param {*} value - Converted to a string first.
 * @returns {string}
 */
function decodeEscapedText(value) {
  const fromHex = (_, hex) => String.fromCharCode(Number.parseInt(hex, 16));
  const unescaped = String(value)
    .replace(/\\u([0-9a-f]{4})/gi, fromHex)
    .replace(/\\x([0-9a-f]{2})/gi, fromHex)
    .replace(/\\\//g, "/");
  return decodeHtmlEntities(unescaped);
}
/**
 * Decode HTML character references in `value`.
 *
 * Supports the named entities nbsp (decoded to a regular space), quot, apos,
 * amp, lt and gt, plus decimal (`&#NNN;`) and hexadecimal (`&#xHH;`) numeric
 * references. Decoding happens in a single pass, so already-escaped text such
 * as `&amp;lt;` becomes `&lt;` and is not decoded a second time. Unknown
 * entities and invalid code points are left untouched.
 *
 * @param {*} value - Converted to a string first.
 * @returns {string}
 */
function decodeHtmlEntities(value) {
  const named = {
    nbsp: " ",
    quot: "\"",
    apos: "'",
    amp: "&",
    lt: "<",
    gt: ">",
  };
  const entityPattern = /&(?:(nbsp|quot|apos|amp|lt|gt)|#(\d{1,7})|#[xX]([0-9a-fA-F]{1,6}));/g;

  return String(value).replace(entityPattern, (entity, name, decimal, hex) => {
    if (name !== undefined) {
      return named[name];
    }
    const codePoint = decimal !== undefined
      ? Number.parseInt(decimal, 10)
      : Number.parseInt(hex, 16);
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
    return isValid ? String.fromCodePoint(codePoint) : entity;
  });
}
/**
 * Return the plain text surrounding position `index` of `text`: up to
 * `padding` characters on each side, percent-decoded, with tags replaced by
 * spaces and whitespace collapsed.
 *
 * @param {string} text
 * @param {number} index
 * @param {number} [padding]
 * @returns {string}
 */
function cleanSnippet(text, index, padding = 160) {
  const from = Math.max(0, index - padding);
  const to = Math.min(text.length, index + padding);
  const window = decodePercentText(text.slice(from, to));
  const withoutTags = window.replace(/<[^>]+>/g, " ");
  return withoutTags.replace(/\s+/g, " ").trim();
}
/**
 * Decode runs of percent-escapes embedded in arbitrary text. Each run is
 * decoded independently, and a run that is not valid UTF-8 is left as is.
 *
 * @param {*} value - Converted to a string first.
 * @returns {string}
 */
function decodePercentText(value) {
  const escapeRun = /%[0-9a-f]{2}(?:%[0-9a-f]{2})*/gi;
  const decodeRun = (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      return run;
    }
  };
  return String(value).replace(escapeRun, decodeRun);
}
/**
 * `decodeURIComponent` that never throws: malformed input is returned
 * unchanged as a string ("" for falsy input).
 *
 * @param {*} value
 * @returns {string}
 */
function safeDecodeURIComponent(value) {
  let decoded;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    decoded = String(value || "");
  }
  return decoded;
}