1142 lines
35 KiB
JavaScript
1142 lines
35 KiB
JavaScript
import crypto from "node:crypto";
|
||
import { getRequestHeaders } from "./sites.js";
|
||
|
||
/** Timeout, in milliseconds, applied to the first request against a platform's own search page. */
const SEARCH_TIMEOUT_MS = 6 * 1_000;

/** Overall budget, in milliseconds, after which the quick search variant aborts its work. */
const QUICK_SEARCH_TIMEOUT_MS = 6 * 1_000;
|
||
|
||
/**
 * Per-platform search configuration.
 *
 * Fields:
 * - searchUrl: builds the platform's own search-page URL for a keyword.
 * - preferFallback: when truthy, the direct search-page request is skipped.
 * - siteSearchUrls: alternative on-site search URL builders, tried in order.
 * - allowHosts / includePaths / excludePaths: host and path rules used when
 *   scoring a URL as a possible program page.
 * - fallbackQueries: query builders for general web search engines, tried in order.
 *
 * Every value is declared here once; the object is no longer patched after
 * construction, so what you read below is what is used at runtime.
 */
const SEARCH_CONFIGS = {
  tencent: {
    searchUrl: (keyword) => `https://v.qq.com/x/search/?q=${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://v.qq.com/x/search/?q=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["v.qq.com"],
    includePaths: [/\/x\/cover\//, /\/x\/page\//],
    excludePaths: [/\/x\/search\//, /\/search/],
    fallbackQueries: [
      (keyword) => `site:v.qq.com/x/cover ${keyword} 腾讯视频`,
      (keyword) => `site:v.qq.com/x/page ${keyword} 腾讯视频`,
      (keyword) => `${keyword} 腾讯视频 少儿`,
      (keyword) => `${keyword} 小企鹅乐园`,
      (keyword) => `${keyword} 腾讯视频`,
    ],
  },
  youku: {
    searchUrl: (keyword) => `https://so.youku.com/search_video/q_${encodeURIComponent(keyword)}`,
    preferFallback: true,
    siteSearchUrls: [
      (keyword) => `https://so.youku.com/search_video/q_${encodeURIComponent(keyword)}`,
      (keyword) => `https://www.youku.com/search_video?keyword=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["v.youku.com", "www.youku.com", "youku.com"],
    includePaths: [/\/v_show\//, /^\/video$/, /\/show_page\//],
    excludePaths: [/\/search/],
    fallbackQueries: [
      (keyword) => `site:v.youku.com/v_show ${keyword}`,
      (keyword) => `site:v.youku.com/video ${keyword}`,
      (keyword) => `site:www.youku.com/show_page ${keyword}`,
      (keyword) => `site:youku.com ${keyword} youku`,
      (keyword) => `site:v.youku.com ${keyword} 优酷`,
      (keyword) => `site:youku.com/show_page ${keyword} 优酷`,
      (keyword) => `${keyword} 优酷`,
    ],
  },
  iqiyi: {
    searchUrl: (keyword) => `https://so.iqiyi.com/so/q_${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://so.iqiyi.com/so/q_${encodeURIComponent(keyword)}`,
      (keyword) => `https://www.iqiyi.com/search?keyword=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["www.iqiyi.com"],
    includePaths: [/\/v_/, /\/a_/],
    excludePaths: [/\/so(?:\/|$)/, /\/search/],
    fallbackQueries: [
      (keyword) => `${keyword} 爱奇艺`,
      (keyword) => `site:www.iqiyi.com/a_ ${keyword} 爱奇艺 热度`,
      (keyword) => `site:www.iqiyi.com/v_ ${keyword} 爱奇艺`,
    ],
  },
  mgtv: {
    searchUrl: (keyword) => `https://so.mgtv.com/so?k=${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://so.mgtv.com/so?k=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["www.mgtv.com", "mgtv.com"],
    includePaths: [/\/b\//, /\/h\//, /\/l\//],
    excludePaths: [/\/so/],
    fallbackQueries: [
      (keyword) => `site:www.mgtv.com/h ${keyword} 芒果TV`,
      (keyword) => `site:www.mgtv.com/b ${keyword} 芒果TV`,
      (keyword) => `${keyword} 芒果TV`,
    ],
  },
};
|
||
|
||
/**
 * Locate the program page for `keyword` on the given platform.
 *
 * Strategies are tried in order, each one only while no strong candidate has
 * been found yet: the platform's search page (unless the config prefers
 * fallbacks), the configured on-site search URLs, the Tencent search API
 * (tencent only), a DuckDuckGo lookup (iqiyi only), and finally the generic
 * web-search fallback.
 *
 * @param {string} platform - Key into SEARCH_CONFIGS.
 * @param {string} keyword - Program title to look for.
 * @param {{signal?: AbortSignal}} [options] - Optional abort signal forwarded to every request.
 * @returns {Promise<object>} Result with `platform`, `keyword`, `url`, `status`
 *   ("ok" | "no_match" | "error"), `error`, `candidates`, and on the non-error
 *   path `searchUrl`. Failures are reported through the result, not thrown.
 */
export async function findProgramPage(platform, keyword, options = {}) {
  const config = SEARCH_CONFIGS[platform];
  const failure = (message) => ({
    platform,
    keyword,
    url: "",
    status: "error",
    error: message,
    candidates: [],
  });

  if (!config) {
    return failure(`unsupported platform: ${platform}`);
  }

  try {
    let keywordAliases;
    if (platform === "youku") {
      keywordAliases = await youkuHomeSearchKeywords(keyword, options.signal);
    } else if (platform === "iqiyi") {
      keywordAliases = iqiyiSearchKeywords(keyword);
    } else {
      keywordAliases = [keyword];
    }

    const searchUrl = config.searchUrl(keyword);
    const skipDirectSearch = Boolean(config.preferFallback);
    let html = "";
    let blockedSearch = skipDirectSearch;
    let responseOk = true;

    if (!skipDirectSearch) {
      const response = await fetch(searchUrl, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(options.signal, SEARCH_TIMEOUT_MS),
      });
      html = await response.text();
      const throttled = response.status === 403 || response.status === 429;
      blockedSearch = throttled || isBlockedSearchPage(html);
      responseOk = response.ok;
    }

    // A plain HTTP failure that is not a block/verification page ends the search.
    if (!(responseOk || blockedSearch)) {
      return failure("search HTTP error");
    }

    let candidates = [];
    if (!blockedSearch) {
      const extracted = await candidateUrlsFromHtml(platform, html, searchUrl, config, keyword, options.signal);
      candidates = await rankCandidates(platform, extracted, keyword, options.signal);
    }
    let matchedSearchUrl = searchUrl;

    // Fold one strategy's outcome into the running result.
    const absorb = (found) => {
      candidates = mergeCandidates(candidates, found.candidates);
      matchedSearchUrl = found.searchUrl || matchedSearchUrl;
    };

    if (!hasStrongCandidate(candidates) && config.siteSearchUrls?.length) {
      absorb(await findFromSiteSearches(platform, config, keywordAliases, options.signal));
    }

    if (platform === "tencent" && !hasStrongCandidate(candidates)) {
      absorb(await findFromTencentStationSearch(config, keywordAliases, options.signal));
    }

    if (platform === "iqiyi" && !hasStrongCandidate(candidates)) {
      absorb(await findIqiyiFromDuckDuckGo(config, keywordAliases, options.signal));
    }

    if (!hasStrongCandidate(candidates)) {
      absorb(await findFromFallbackSearch(platform, config, keywordAliases, options.signal));
    }

    const [best] = candidates;
    let errorText = "";
    if (!best) {
      errorText = blockedSearch
        ? "search page requires verification"
        : "no program page found from search page";
    }

    return {
      platform,
      keyword,
      url: best?.url || "",
      status: best ? "ok" : "no_match",
      error: errorText,
      candidates,
      searchUrl: matchedSearchUrl,
    };
  } catch (error) {
    return failure(error.message);
  }
}
|
||
|
||
/**
 * Run findProgramPage under an overall deadline of QUICK_SEARCH_TIMEOUT_MS.
 *
 * findProgramPage reports failures through its result object instead of
 * throwing, and its helpers swallow aborted requests. A timeout would therefore
 * come back as a generic abort message or a misleading "no_match". The abort
 * flag is checked after the call so a timed-out search without a URL is always
 * reported as a timeout error.
 *
 * @param {string} platform - Key into SEARCH_CONFIGS.
 * @param {string} keyword - Program title to look for.
 * @returns {Promise<object>} The findProgramPage result, or an error result with
 *   `status: "error"`, empty `candidates` and empty `searchUrl` on timeout/failure.
 */
export async function findProgramPageQuick(platform, keyword) {
  const controller = new AbortController();
  const errorResult = (message) => ({
    platform,
    keyword,
    url: "",
    status: "error",
    error: message,
    candidates: [],
    searchUrl: "",
  });
  const timeoutMessage = `quick search timeout ${QUICK_SEARCH_TIMEOUT_MS}ms`;

  let timer;
  try {
    timer = setTimeout(() => controller.abort(), QUICK_SEARCH_TIMEOUT_MS);
    const result = await findProgramPage(platform, keyword, { signal: controller.signal });
    // The deadline fired and nothing usable was found: the search was cut
    // short, so report the timeout rather than an incomplete outcome.
    if (controller.signal.aborted && !result?.url) {
      return errorResult(timeoutMessage);
    }
    return result;
  } catch (error) {
    return errorResult(controller.signal.aborted ? timeoutMessage : error.message);
  } finally {
    clearTimeout(timer);
  }
}
|
||
|
||
/**
 * Build the abort signal for a single request.
 *
 * @param {AbortSignal|undefined} parentSignal - Caller's signal, if any.
 * @param {number} timeoutMs - Per-request timeout in milliseconds.
 * @returns {AbortSignal} A timeout signal, combined with `parentSignal` when one is given.
 */
function fetchSignal(parentSignal, timeoutMs) {
  if (!parentSignal) {
    return AbortSignal.timeout(timeoutMs);
  }
  const sources = [parentSignal, AbortSignal.timeout(timeoutMs)];
  return AbortSignal.any(sources);
}
|
||
|
||
/**
 * Try each configured on-site search URL for each keyword variant.
 *
 * Returns as soon as one attempt yields a strong candidate. Otherwise returns
 * the attempt that produced the most candidates. Failed, non-OK and blocked
 * responses are skipped.
 *
 * @param {string} platform - Platform key, used for request headers and ranking.
 * @param {object} config - Platform entry from SEARCH_CONFIGS.
 * @param {string[]} keywords - Keyword variants to try.
 * @param {AbortSignal} [signal] - Optional caller abort signal.
 * @returns {Promise<{candidates: object[], searchUrl: string}>}
 */
async function findFromSiteSearches(platform, config, keywords, signal) {
  const best = { candidates: [], searchUrl: "" };

  // One search attempt; resolves to ranked candidates, or null when the page is unusable.
  const attempt = async (searchUrl, keyword) => {
    const response = await fetch(searchUrl, {
      headers: getRequestHeaders(platform),
      redirect: "follow",
      signal: fetchSignal(signal, 8_000),
    });
    if (!response.ok) return null;

    const html = await response.text();
    if (isBlockedSearchPage(html)) return null;

    const extracted = await candidateUrlsFromHtml(platform, html, searchUrl, config, keyword, signal);
    return rankCandidates(platform, extracted, keyword, signal);
  };

  for (const keyword of uniqueKeywords(keywords)) {
    for (const buildSearchUrl of config.siteSearchUrls || []) {
      const searchUrl = buildSearchUrl(keyword);
      try {
        const candidates = await attempt(searchUrl, keyword);
        if (candidates) {
          if (hasStrongCandidate(candidates)) {
            return { candidates, searchUrl };
          }
          if (candidates.length > best.candidates.length) {
            best.candidates = candidates;
            best.searchUrl = searchUrl;
          }
        }
      } catch {
        // Best effort: a failed attempt just moves on to the next URL.
      }
    }
  }

  return best;
}
|
||
|
||
/** Tencent search API endpoints, queried in this order via POST. */
const TENCENT_SEARCH_API_URLS = [
  // Multi-terminal search endpoint.
  "https://pbaccess.video.qq.com/trpc.videosearch.mobile_search.MultiTerminalSearch/MbSearch?vversion_platform=2",
  // HTTP mobile recall endpoint.
  "https://pbaccess.video.qq.com/trpc.videosearch.mobile_search.HttpMobileRecall/MbSearchHttp",
];
|
||
|
||
/**
 * Query the Tencent search API endpoints for each keyword variant.
 *
 * Returns as soon as an endpoint yields a strong candidate. Otherwise returns
 * the attempt with the most candidates. Failed and non-OK requests are skipped.
 *
 * @param {object} config - Tencent entry from SEARCH_CONFIGS.
 * @param {string[]} keywords - Keyword variants to try.
 * @param {AbortSignal} [signal] - Optional caller abort signal.
 * @returns {Promise<{candidates: object[], searchUrl: string}>}
 */
async function findFromTencentStationSearch(config, keywords, signal) {
  const best = { candidates: [], searchUrl: "" };

  // One API call; resolves to ranked candidates, or null for a non-OK response.
  const query = async (searchUrl, keyword) => {
    const response = await fetch(searchUrl, {
      method: "POST",
      headers: getTencentSearchApiHeaders(keyword),
      body: JSON.stringify(buildTencentSearchPayload(keyword)),
      redirect: "follow",
      signal: fetchSignal(signal, 8_000),
    });
    if (!response.ok) return null;

    const json = await response.json();
    const extracted = extractTencentSearchCandidates(json, keyword, config);
    return rankCandidates("tencent", extracted, keyword, signal);
  };

  for (const keyword of uniqueKeywords(keywords)) {
    for (const searchUrl of TENCENT_SEARCH_API_URLS) {
      try {
        const candidates = await query(searchUrl, keyword);
        if (candidates) {
          if (hasStrongCandidate(candidates)) {
            return { candidates, searchUrl };
          }
          if (candidates.length > best.candidates.length) {
            best.candidates = candidates;
            best.searchUrl = searchUrl;
          }
        }
      } catch {
        // Best effort: ignore this endpoint and try the next one.
      }
    }
  }

  return best;
}
|
||
|
||
/**
 * Headers for a Tencent search API request: the shared tencent request headers
 * overlaid with JSON content negotiation, the v.qq.com origin, and a referer
 * pointing at the search page for `keyword`.
 *
 * @param {string} keyword - Keyword used to build the referer URL.
 * @returns {object} Header map.
 */
function getTencentSearchApiHeaders(keyword) {
  const baseHeaders = getRequestHeaders("tencent");
  const searchPageUrl = SEARCH_CONFIGS.tencent.searchUrl(keyword);
  return {
    ...baseHeaders,
    accept: "application/json, text/plain, */*",
    "content-type": "application/json",
    origin: "https://v.qq.com",
    referer: searchPageUrl,
  };
}
|
||
|
||
/**
 * Build the JSON body for a Tencent search API request.
 *
 * All fields except `query` and `uuid` are fixed values; `uuid` is freshly
 * generated on every call.
 *
 * @param {string} keyword - Search text, sent as `query`.
 * @returns {object} Request payload (first page, 20 results).
 */
function buildTencentSearchPayload(keyword) {
  const extraInfo = {
    isNewMarkLabel: "0",
    multi_terminal_pc: "1",
    themeType: "0",
    sugRelatedIds: "{}",
    appVersion: "",
    frontVersion: "26041606",
  };
  const featureList = [
    "DEFAULT_FEFEATURE",
    "PC_SHORT_VIDEOS_WATERFALL",
    "PC_WANT_EPISODE_V2",
    "PC_WANT_EPISODE",
  ];

  // Key order is kept stable because the object is serialized as the request body.
  return {
    query: keyword,
    pagenum: 0,
    pagesize: 20,
    queryFrom: 0,
    filterValue: "",
    sceneId: 21,
    searchDatakey: "",
    transInfo: "",
    isneedQc: true,
    preQid: "",
    adClientInfo: "",
    extraInfo,
    version: "26022601",
    clientType: 1,
    uuid: crypto.randomUUID(),
    retry: 0,
    featureList,
  };
}
|
||
|
||
/**
 * Search general web search engines using the platform's fallback queries.
 *
 * Iterates keyword variants, then query builders, then engines. Returns as soon
 * as one result page yields a strong candidate. Otherwise returns the attempt
 * with the most candidates. Failed and non-OK requests are skipped.
 *
 * @param {string} platform - Platform key, used for extraction and ranking.
 * @param {object} config - Platform entry from SEARCH_CONFIGS.
 * @param {string[]} keywords - Keyword variants to try.
 * @param {AbortSignal} [signal] - Optional caller abort signal.
 * @returns {Promise<{candidates: object[], searchUrl: string}>}
 */
async function findFromFallbackSearch(platform, config, keywords, signal) {
  const best = { candidates: [], searchUrl: "" };

  // One engine request; resolves to ranked candidates, or null for a non-OK response.
  const askEngine = async (engine, keyword) => {
    const response = await fetch(engine.url, {
      headers: {
        ...getRequestHeaders(""),
        referer: engine.referer,
      },
      redirect: "follow",
      signal: fetchSignal(signal, 8_000),
    });
    if (!response.ok) return null;

    const html = await response.text();
    const extracted = await candidateUrlsFromHtml(platform, html, engine.url, config, keyword, signal);
    return rankCandidates(platform, extracted, keyword, signal);
  };

  for (const keyword of uniqueKeywords(keywords)) {
    for (const buildQuery of config.fallbackQueries || []) {
      const query = buildQuery(keyword);
      for (const engine of fallbackSearchUrls(query)) {
        try {
          const candidates = await askEngine(engine, keyword);
          if (candidates) {
            if (hasStrongCandidate(candidates)) {
              return { candidates, searchUrl: engine.url };
            }
            if (candidates.length > best.candidates.length) {
              best.candidates = candidates;
              best.searchUrl = engine.url;
            }
          }
        } catch {
          // Best effort: a failing engine is skipped.
        }
      }
    }
  }

  return best;
}
|
||
|
||
/**
 * Look up iQIYI program pages through DuckDuckGo's HTML results.
 *
 * Candidates are scored from the result snippet alone; no page title is
 * fetched. Only an attempt that yields a strong candidate is returned;
 * otherwise the result is empty.
 *
 * @param {object} config - iQIYI entry from SEARCH_CONFIGS.
 * @param {string[]} keywords - Keyword variants to try.
 * @param {AbortSignal} [signal] - Optional caller abort signal.
 * @returns {Promise<{candidates: object[], searchUrl: string}>}
 */
async function findIqiyiFromDuckDuckGo(config, keywords, signal) {
  for (const keyword of uniqueKeywords(keywords)) {
    const query = `${keyword} 爱奇艺`;
    const searchUrl = `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`;

    try {
      const response = await fetch(searchUrl, {
        headers: {
          ...getRequestHeaders(""),
          referer: "https://duckduckgo.com/",
        },
        redirect: "follow",
        signal: fetchSignal(signal, 8_000),
      });
      if (!response.ok) continue;

      const html = await response.text();
      const scored = [];
      for (const candidate of extractCandidateUrls(html, searchUrl, config, keyword)) {
        const keywordScore = keywordMatchScore(candidate.evidence, keyword);
        if (keywordScore > 0) {
          scored.push({
            ...candidate,
            keywordScore,
            score: candidate.score + keywordScore,
          });
        }
      }
      scored.sort((left, right) => right.score - left.score);
      const candidates = scored.slice(0, 10);

      if (hasStrongCandidate(candidates)) {
        return { candidates, searchUrl };
      }
    } catch {
      // Best effort: try the next keyword variant.
    }
  }

  return { candidates: [], searchUrl: "" };
}
|
||
|
||
/**
 * Expand a keyword with text returned by Youku's search API.
 *
 * The original keyword always comes first. Any failure while fetching or
 * parsing is ignored, leaving just the original keyword.
 *
 * @param {string} keyword - Original search keyword.
 * @param {AbortSignal} [signal] - Optional caller abort signal.
 * @returns {Promise<string[]>} Up to five de-duplicated keyword variants.
 */
async function youkuHomeSearchKeywords(keyword, signal) {
  // Key order is kept stable: the payload is serialized and signed downstream.
  const payload = {
    pg: "1",
    pz: "12",
    searchFrom: "home",
    utdId: "XlQcF5xQrCcCAWoLKdGqIOhS",
    ykPid: "",
    sdkver: 314,
    pcKuFlixMode: 1,
    appScene: "kubox",
    appCaller: "pc",
    s: "pc",
    device: "pc",
    platform: "pc",
    keyword,
  };

  const variants = [keyword];
  try {
    const json = await fetchYoukuMtopSearch(payload, signal);
    for (const suggestion of extractYoukuSuggestionTexts(json)) {
      variants.push(suggestion);
    }
  } catch {
    // Suggestions are optional; fall back to the original keyword only.
  }

  return uniqueKeywords(variants).slice(0, 5);
}
|
||
|
||
/**
 * Call the Youku MTOP search API.
 *
 * Two requests are made: the first, signed with an empty token, is only used to
 * obtain the token cookie from its `set-cookie` header; the second is signed
 * with that token and sends the relevant cookies back.
 *
 * @param {object} dataObject - Request data, serialized to JSON.
 * @param {AbortSignal} [signal] - Optional caller abort signal.
 * @returns {Promise<object>} Parsed JSON of the signed response, or `{}` when no token was issued.
 */
async function fetchYoukuMtopSearch(dataObject, signal) {
  const request = {
    appKey: "23774304",
    api: "mtop.youku.soku.yksearch",
    data: JSON.stringify(dataObject),
  };
  const headers = {
    ...getRequestHeaders("youku"),
    referer: "https://www.youku.com/",
  };

  const handshake = await fetch(buildYoukuMtopUrl({ ...request, token: "" }), {
    headers,
    redirect: "follow",
    signal: fetchSignal(signal, 8_000),
  });
  // Drain the body; only the response cookies are of interest here.
  await handshake.text();

  const cookieHeader = handshake.headers.get("set-cookie") || "";
  const token = extractMtopToken(cookieHeader);
  if (!token) {
    return {};
  }

  const signedHeaders = { ...headers, cookie: compactMtopCookie(cookieHeader) };
  const signed = await fetch(buildYoukuMtopUrl({ ...request, token }), {
    headers: signedHeaders,
    redirect: "follow",
    signal: fetchSignal(signal, 8_000),
  });
  return signed.json();
}
|
||
|
||
/**
 * Build a signed Youku MTOP request URL.
 *
 * The signature is the hex MD5 of `token&timestamp&appKey&data`, with the
 * timestamp taken from the current time in milliseconds.
 *
 * @param {{api: string, appKey: string, data: string, token: string}} request
 * @returns {string} Full request URL on acs.youku.com.
 */
function buildYoukuMtopUrl({ api, appKey, data, token }) {
  const timestamp = String(Date.now());
  const signatureInput = [token, timestamp, appKey, data].join("&");
  const sign = crypto.createHash("md5").update(signatureInput).digest("hex");

  const params = new URLSearchParams();
  params.append("jsv", "2.7.2");
  params.append("appKey", appKey);
  params.append("t", timestamp);
  params.append("sign", sign);
  params.append("api", api);
  params.append("v", "2.0");
  params.append("type", "GET");
  params.append("dataType", "json");
  params.append("ecode", "1");
  params.append("data", data);

  return `https://acs.youku.com/h5/${api}/2.0/?${params.toString()}`;
}
|
||
|
||
/**
 * Extract the token from the `_m_h5_tk` cookie: the part of its value before
 * the first underscore.
 *
 * @param {string} cookieHeader - Raw `set-cookie` header text.
 * @returns {string} The token, or "" when the cookie is absent.
 */
function extractMtopToken(cookieHeader) {
  const found = cookieHeader.match(/_m_h5_tk=([^_;]+)/);
  // The capture group already stops at the first "_" or ";".
  return found ? found[1] : "";
}
|
||
|
||
/**
 * Reduce a combined `set-cookie` header to a `cookie` request header holding
 * only the name=value pairs whose name starts with `_m_h5` or `mtop`.
 *
 * @param {string} cookieHeader - `set-cookie` text, individual cookies separated by ", ".
 * @returns {string} Selected pairs joined with "; ".
 */
function compactMtopCookie(cookieHeader) {
  const kept = [];
  // A pair counts only at the start of the header or right after a ", " separator.
  for (const [, pair] of cookieHeader.matchAll(/(?:^|, )([^=;, ]+=[^;]+)/g)) {
    if (pair.startsWith("_m_h5") || pair.startsWith("mtop")) {
      kept.push(pair);
    }
  }
  return kept.join("; ");
}
|
||
|
||
/**
 * Collect text from every string value stored under a `w`, `show_w` or
 * `keyword` key, at any depth of the JSON value.
 *
 * @param {*} json - Parsed API response.
 * @returns {string[]} Non-empty texts with HTML tags removed, in traversal order.
 */
function extractYoukuSuggestionTexts(json) {
  const suggestionKeys = new Set(["w", "show_w", "keyword"]);
  const texts = [];

  walkJson(json, (key, value) => {
    if (typeof value === "string" && suggestionKeys.has(key)) {
      const text = stripHtml(value).trim();
      if (text) {
        texts.push(text);
      }
    }
  });

  return texts;
}
|
||
|
||
/**
 * Depth-first walk over a JSON-like value.
 *
 * `visit(key, child)` is called for every entry of every object or array, and
 * always before descending into that child.
 *
 * @param {*} value - Value to traverse; non-objects are ignored.
 * @param {(key: string, child: *) => void} visit - Callback for each entry.
 */
function walkJson(value, visit) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) return;

  Object.entries(value).forEach(([key, child]) => {
    visit(key, child);
    walkJson(child, visit);
  });
}
|
||
|
||
/**
 * Remove anything that looks like an HTML tag (`<...>`) from a value.
 *
 * @param {*} value - Input; falsy values are treated as an empty string.
 * @returns {string} Text without tags.
 */
function stripHtml(value) {
  const text = String(value || "");
  const tagPattern = /<[^>]+>/g;
  return text.replace(tagPattern, "");
}
|
||
|
||
/**
 * De-duplicate keywords, keeping first occurrences in order.
 *
 * Entries are trimmed, empty entries are dropped, and two entries count as
 * duplicates when their normalized search text is equal.
 *
 * @param {Iterable<*>} keywords - Raw keyword values.
 * @returns {string[]} Trimmed, unique keywords.
 */
function uniqueKeywords(keywords) {
  const seenKeys = new Set();
  const unique = [];

  for (const raw of keywords) {
    const trimmed = String(raw || "").trim();
    const normalized = normalizeSearchText(trimmed);
    const isNew = Boolean(trimmed) && !seenKeys.has(normalized);
    if (isNew) {
      seenKeys.add(normalized);
      unique.push(trimmed);
    }
  }

  return unique;
}
|
||
|
||
/**
 * Produce keyword variants for iQIYI searches.
 *
 * For a title shaped like `<prefix><1-2 digits>之<title>`, three extra variants
 * are added: `<prefix> 第N季 <title>`, `<prefix>第N季<title>` and
 * `<prefix> <title>`.
 *
 * @param {*} keyword - Original keyword; falsy values are treated as "".
 * @returns {string[]} Up to five unique variants, the trimmed original first.
 */
export function iqiyiSearchKeywords(keyword) {
  const value = String(keyword || "").trim();
  const seasonMatch = value.match(/^(.+?)(\d{1,2})之(.+)$/);

  let variants = [];
  if (seasonMatch) {
    const prefix = seasonMatch[1];
    const season = seasonMatch[2];
    const title = seasonMatch[3];
    variants = [
      `${prefix} 第${season}季 ${title}`,
      `${prefix}第${season}季${title}`,
      `${prefix} ${title}`,
    ];
  }

  return uniqueKeywords([value, ...variants]).slice(0, 5);
}
|
||
|
||
/**
 * Build the list of general search-engine requests for a query, in the order
 * they should be tried: Bing RSS, Bing HTML, DuckDuckGo HTML, Baidu, Sogou.
 *
 * @param {string} query - Search query text.
 * @returns {{url: string, referer: string}[]} Request URL and referer per engine.
 */
function fallbackSearchUrls(query) {
  const encoded = encodeURIComponent(query);
  // [URL prefix that the encoded query is appended to, referer]
  const engines = [
    ["https://www.bing.com/search?format=rss&q=", "https://www.bing.com/"],
    ["https://www.bing.com/search?q=", "https://www.bing.com/"],
    ["https://duckduckgo.com/html/?q=", "https://duckduckgo.com/"],
    ["https://www.baidu.com/s?wd=", "https://www.baidu.com/"],
    ["https://www.sogou.com/web?query=", "https://www.sogou.com/"],
  ];
  return engines.map(([prefix, referer]) => ({ url: `${prefix}${encoded}`, referer }));
}
|
||
|
||
/**
 * Gather candidate program URLs from a search result page.
 *
 * Combines URLs found directly in the page with URLs reached through short
 * links. Bridge pages are followed only when fewer than two direct candidates
 * were found.
 *
 * @returns {Promise<object[]>} Merged candidate list.
 */
async function candidateUrlsFromHtml(platform, html, baseUrl, config, keyword, signal) {
  const direct = extractCandidateUrls(html, baseUrl, config, keyword);
  const expanded = await expandShortLinkCandidates(platform, html, config, keyword, signal);

  let bridge = [];
  if (direct.length < 2) {
    bridge = await expandBridgePageCandidates(platform, html, baseUrl, config, keyword, signal);
  }

  return mergeCandidates(direct, expanded, bridge);
}
|
||
|
||
/**
 * Extract candidate program URLs from HTML/RSS/JSON-like text.
 *
 * Candidates from structured result blocks are collected first. Then every
 * link-like match in the whole text is scored. For each URL, only the
 * highest-scoring entry is kept.
 *
 * @param {string} html - Raw page text.
 * @param {string} baseUrl - Base for resolving relative links.
 * @param {object} config - Platform config used for URL scoring.
 * @param {string} keyword - Keyword used for scoring.
 * @returns {{url: string, score: number, evidence: string}[]} Top ten candidates, best first.
 */
export function extractCandidateUrls(html, baseUrl, config, keyword) {
  const decoded = decodeEscapedText(html);
  const bestByUrl = new Map();

  const beats = (url, score) => {
    const known = bestByUrl.get(url);
    return !known || score > known.score;
  };

  for (const candidate of extractStructuredSearchCandidates(decoded, baseUrl, config, keyword)) {
    if (beats(candidate.url, candidate.score)) {
      bestByUrl.set(candidate.url, candidate);
    }
  }

  // Scan order matters: on equal scores the earliest match is kept.
  const linkPatterns = [
    /\bhref\s*=\s*["']([^"']+)["']/gi,
    /\b(?:url|playUrl|pageUrl|coverUrl|jumpUrl|target)\s*[:=]\s*["']([^"']+)["']/gi,
    /<link>\s*([^<\s]+)\s*<\/link>/gi,
    /["']((?:https?:)?\/\/[^"']+)["']/gi,
    /\b((?:https?:)?\/\/(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi,
    /\b((?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi,
  ];
  const linkMatches = linkPatterns.flatMap((pattern) => [...decoded.matchAll(pattern)]);

  for (const match of linkMatches) {
    const url = normalizeUrl(match[1], baseUrl);
    if (!url) continue;

    const score = scoreUrl(url, config, keyword);
    if (score <= 0) continue;

    if (beats(url, score)) {
      const evidence = cleanSnippet(decoded, match.index ?? 0, 700);
      bestByUrl.set(url, { url, score, evidence });
    }
  }

  const ordered = [...bestByUrl.values()];
  ordered.sort((left, right) => right.score - left.score);
  return ordered.slice(0, 10);
}
|
||
|
||
/**
 * Turn a Tencent search API response into candidate program URLs.
 *
 * Items whose evidence text does not match the keyword are skipped. Each
 * accepted URL gets its URL score plus a fixed bonus of 140, and only the
 * best entry per URL is kept.
 *
 * @param {object} json - Parsed API response.
 * @param {string} keyword - Keyword the item evidence must match.
 * @param {object} [config] - Platform config; defaults to the tencent entry.
 * @returns {{url: string, score: number, evidence: string}[]} Top ten candidates, best first.
 */
export function extractTencentSearchCandidates(json, keyword, config = SEARCH_CONFIGS.tencent) {
  const bestByUrl = new Map();

  for (const { item, boxShowName } of tencentSearchItems(json)) {
    const evidence = tencentItemEvidence(item, boxShowName);
    if (keywordMatchScore(evidence, keyword) <= 0) continue;

    for (const url of tencentItemProgramUrls(item)) {
      const urlScore = scoreUrl(url, config, keyword);
      if (urlScore <= 0) continue;

      const boosted = urlScore + 140;
      const known = bestByUrl.get(url);
      if (!known || boosted > known.score) {
        bestByUrl.set(url, { url, score: boosted, evidence });
      }
    }
  }

  const ordered = [...bestByUrl.values()];
  ordered.sort((left, right) => right.score - left.score);
  return ordered.slice(0, 10);
}
|
||
|
||
/**
 * Flatten the result lists of a Tencent search response into items.
 *
 * Reads `data.normalList` and every entry of `data.areaBoxList`, and takes each
 * list's `itemList`. Non-object items and repeated item objects are skipped.
 *
 * Malformed payloads are tolerated: an `areaBoxList` or `itemList` that is not
 * an array is treated as empty instead of raising "not iterable".
 *
 * @param {object} json - Parsed API response.
 * @returns {{item: object, boxShowName: string}[]} Items paired with their
 *   list's `boxShowName` ("" when absent), in response order.
 */
function tencentSearchItems(json) {
  const data = json?.data;
  const areaBoxes = Array.isArray(data?.areaBoxList) ? data.areaBoxList : [];
  const lists = [data?.normalList, ...areaBoxes].filter(Boolean);

  const items = [];
  const seen = new Set();

  for (const list of lists) {
    const itemList = Array.isArray(list.itemList) ? list.itemList : [];
    for (const item of itemList) {
      if (!item || typeof item !== "object" || seen.has(item)) continue;
      seen.add(item);
      items.push({ item, boxShowName: list.boxShowName || "" });
    }
  }

  return items;
}
|
||
|
||
/**
 * Collect program page URLs referenced by one Tencent search item.
 *
 * The item itself and its `videoInfo` and `doc` sub-objects are inspected. A
 * node with `dataType` 2 contributes a cover URL built from its id, and any of
 * the known URL fields contributes its canonicalized URL.
 *
 * @param {object} item - Search result item.
 * @returns {string[]} Unique URLs in discovery order.
 */
function tencentItemProgramUrls(item) {
  const urlFields = ["url", "playUrl", "pageUrl", "coverUrl", "jumpUrl", "target"];
  const found = new Set();

  for (const node of [item, item?.videoInfo, item?.doc]) {
    if (!node) continue;

    if (Number(node.dataType) === 2) {
      const coverUrl = tencentCoverUrlFromCid(tencentNodeId(node));
      if (coverUrl) found.add(coverUrl);
    }

    for (const field of urlFields) {
      const programUrl = canonicalTencentProgramUrl(node[field]);
      if (programUrl) found.add(programUrl);
    }
  }

  return [...found];
}
|
||
|
||
/**
 * Read a node's identifier from the first truthy field among `cid`,
 * `coverId`, `cover_id` and `id`.
 *
 * @param {object|null|undefined} node - API node.
 * @returns {string} Trimmed identifier, or "" when none is set.
 */
function tencentNodeId(node) {
  for (const field of ["cid", "coverId", "cover_id", "id"]) {
    const candidate = node?.[field];
    if (candidate) {
      return String(candidate).trim();
    }
  }
  return "";
}
|
||
|
||
/**
 * Build a Tencent cover page URL from a cover id.
 *
 * @param {*} cid - Cover id; must be 8-40 ASCII letters/digits after trimming.
 * @returns {string} The cover URL, or "" when the id does not have that shape.
 */
function tencentCoverUrlFromCid(cid) {
  const id = String(cid || "").trim();
  const looksLikeCid = /^[a-z0-9]{8,40}$/i.test(id);
  return looksLikeCid ? `https://v.qq.com/x/cover/${id}.html` : "";
}
|
||
|
||
/**
 * Normalize a Tencent URL, collapsing cover URLs to their canonical form.
 *
 * A v.qq.com path of the form `/x/cover/<cid>.html` or `/x/cover/<cid>/<vid>.html`
 * becomes `https://v.qq.com/x/cover/<cid>.html`. Any other URL is returned as
 * normalized.
 *
 * @param {*} rawUrl - URL as found in the API payload.
 * @returns {string} Canonical URL, or "" when the input cannot be normalized.
 */
function canonicalTencentProgramUrl(rawUrl) {
  const normalized = normalizeUrl(rawUrl, "https://v.qq.com/");
  if (!normalized) {
    return "";
  }

  try {
    const { hostname, pathname } = new URL(normalized);
    const decodedPath = safeDecodeURIComponent(pathname);
    const cover = decodedPath.match(/^\/x\/cover\/([^/]+)(?:\/[^/]+)?\.html$/);
    if (hostname === "v.qq.com" && cover) {
      return `https://v.qq.com/x/cover/${cover[1]}.html`;
    }
  } catch {
    // Unparseable URL: fall through and return the normalized text unchanged.
  }

  return normalized;
}
|
||
|
||
/**
 * Build the evidence text for a Tencent search item: the box name followed by
 * the item's descriptive strings, with HTML tags removed, trimmed,
 * de-duplicated and joined by single spaces.
 *
 * @param {object} item - Search result item.
 * @param {string} [boxShowName] - Name of the list the item came from.
 * @returns {string} Evidence text.
 */
function tencentItemEvidence(item, boxShowName = "") {
  const collected = [boxShowName];
  collectTencentEvidenceStrings(item, collected);

  const unique = new Set();
  for (const raw of collected) {
    const text = stripHtml(raw).trim();
    if (text) {
      unique.add(text);
    }
  }

  return [...unique].join(" ");
}
|
||
|
||
/**
 * Recursively append descriptive strings from `value` to `results`.
 *
 * A string is collected when its key contains one of title, name, subtitle,
 * desc, keyword or text (case-insensitive). Recursion stops below depth 3, and
 * an object is not entered once `results` holds more than 80 entries.
 *
 * @param {*} value - Object or array to scan; anything else is ignored.
 * @param {string[]} results - Output array, mutated in place.
 * @param {number} [depth] - Current nesting depth.
 */
function collectTencentEvidenceStrings(value, results, depth = 0) {
  const isContainer = Boolean(value) && typeof value === "object";
  if (!isContainer || depth > 3 || results.length > 80) return;

  const descriptiveKey = /title|name|subtitle|desc|keyword|text/i;

  for (const key of Object.keys(value)) {
    const child = value[key];
    if (typeof child === "string") {
      if (descriptiveKey.test(key)) {
        results.push(child);
      }
    } else if (child && typeof child === "object") {
      collectTencentEvidenceStrings(child, results, depth + 1);
    }
  }
}
|
||
|
||
/**
 * Follow short links found in the page and keep those that land on a program page.
 *
 * At most the first five short links are considered. Each distinct link is
 * fetched with redirects followed and the final URL is scored; accepted targets
 * get a bonus of 120. Request failures are ignored.
 *
 * @returns {Promise<{url: string, score: number, evidence: string}[]>}
 */
async function expandShortLinkCandidates(platform, html, config, keyword, signal) {
  const decoded = decodeEscapedText(html);
  const resolved = [];
  const visited = new Set();
  const shortLinks = extractShortLinks(decoded, keyword, platform);

  for (const link of shortLinks.slice(0, 5)) {
    if (visited.has(link.url)) continue;
    visited.add(link.url);

    try {
      const response = await fetch(link.url, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(signal, 5_000),
      });
      // After redirects, response.url is the final landing URL.
      const landingUrl = response.url || "";
      const landingScore = scoreUrl(landingUrl, config, keyword);
      if (landingScore > 0) {
        resolved.push({
          url: landingUrl,
          score: landingScore + 120,
          evidence: link.evidence,
        });
      }
    } catch {
      // Best effort: an unreachable short link is skipped.
    }
  }

  return resolved;
}
|
||
|
||
/**
 * Find short links and Weibo post/article links in text.
 *
 * A link is kept only if the text around it matches the keyword (when one is
 * given) and mentions the platform (when one is given).
 *
 * @param {string} text - Raw text to scan.
 * @param {string} [keyword] - Keyword the surrounding snippet must match.
 * @param {string} [platform] - Platform the surrounding snippet must mention.
 * @returns {{url: string, evidence: string}[]} Links with their surrounding snippet.
 */
export function extractShortLinks(text, keyword = "", platform = "") {
  const decoded = decodeEscapedText(text);
  const shortLinkPattern = /\bhttps?:\/\/(?:t\.cn|url\.cn|m\.weibo\.cn\/status|weibo\.com\/ttarticle\/x\/m\/show)[^\s"'<>),。;]+/gi;
  const links = [];

  for (const match of decoded.matchAll(shortLinkPattern)) {
    const evidence = cleanSnippet(decoded, match.index ?? 0, 500);

    const offTopic = keyword && keywordMatchScore(evidence, keyword) <= 0;
    if (offTopic) continue;

    const offPlatform = platform && !platformEvidenceMatches(evidence, platform);
    if (offPlatform) continue;

    links.push({ url: match[0], evidence });
  }

  return links;
}
|
||
|
||
/**
 * Follow up to three bridge pages linked from a result page and extract
 * candidates from them.
 *
 * Candidates found on a bridge page get a bonus of 60 and carry the bridge's
 * evidence in front of their own. Short links on the bridge page are expanded
 * as well. Failed and non-OK requests are ignored.
 *
 * @returns {Promise<{url: string, score: number, evidence: string}[]>}
 */
async function expandBridgePageCandidates(platform, html, baseUrl, config, keyword, signal) {
  const bridgePages = extractBridgePageUrls(html, baseUrl, keyword, platform);
  const collected = [];

  for (const bridge of bridgePages.slice(0, 3)) {
    try {
      const response = await fetch(bridge.url, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(signal, 6_000),
      });
      if (!response.ok) continue;

      const pageHtml = await response.text();
      const pageUrl = response.url || bridge.url;
      const onPage = extractCandidateUrls(pageHtml, pageUrl, config, keyword).map((candidate) => {
        const evidence = `${bridge.evidence} ${candidate.evidence}`.trim();
        return { ...candidate, score: candidate.score + 60, evidence };
      });
      collected.push(...onPage);

      const viaShortLinks = await expandShortLinkCandidates(platform, pageHtml, config, keyword, signal);
      collected.push(...viaShortLinks);
    } catch {
      // Best effort: a failing bridge page is skipped.
    }
  }

  return collected;
}
|
||
|
||
/**
 * Find links to bridge pages inside search result blocks.
 *
 * Only result blocks (RSS `<item>` elements or result-like `<div>`s) whose text
 * matches the keyword and mentions the platform (when given) are considered.
 * Links that already score as program pages, and links to search engines, are
 * excluded.
 *
 * @param {string} html - Raw result page text.
 * @param {string} baseUrl - Base for resolving relative links.
 * @param {string} [keyword] - Keyword the block text must match.
 * @param {string} [platform] - Platform the block text must mention.
 * @returns {{url: string, evidence: string}[]} Unique bridge URLs with their block text.
 */
export function extractBridgePageUrls(html, baseUrl, keyword = "", platform = "") {
  const decoded = decodeEscapedText(html);
  const bridges = [];
  const seenUrls = new Set();

  const blockPatterns = [
    /<item\b[\s\S]*?<\/item>/gi,
    /<div\b[^>]*(?:class|id)=["'][^"']*(?:result|b_algo|vrwrap|news-box|result-op)[^"']*["'][\s\S]*?<\/div>/gi,
  ];
  const blocks = blockPatterns.flatMap((pattern) => [...decoded.matchAll(pattern)]);

  for (const [rawBlock] of blocks) {
    const blockText = decodePercentText(rawBlock)
      .replace(/<[^>]+>/g, " ")
      .replace(/\s+/g, " ")
      .trim();

    const offTopic = keyword && keywordMatchScore(blockText, keyword) <= 0;
    if (offTopic) continue;

    const offPlatform = platform && !platformEvidenceMatches(blockText, platform);
    if (offPlatform) continue;

    const links = [
      ...rawBlock.matchAll(/<link>\s*([^<\s]+)\s*<\/link>/gi),
      ...rawBlock.matchAll(/\bhref\s*=\s*["']([^"']+)["']/gi),
    ];

    for (const link of links) {
      const url = normalizeUrl(link[1], baseUrl);
      if (!url || seenUrls.has(url)) continue;
      // A URL that already scores as a program page is not a bridge.
      if (scoreUrl(url, configForBridge(platform), keyword) > 0) continue;
      if (isSearchOrEngineUrl(url)) continue;

      seenUrls.add(url);
      bridges.push({ url, evidence: blockText });
    }
  }

  return bridges;
}
|
||
|
||
/**
 * Resolve the config used to score bridge-page links.
 *
 * @param {string} platform - Platform key.
 * @returns {object} The platform's SEARCH_CONFIGS entry, or a config with empty
 *   host and path rules when the platform has none.
 */
function configForBridge(platform) {
  const known = SEARCH_CONFIGS[platform];
  if (known) {
    return known;
  }
  return { allowHosts: [], includePaths: [], excludePaths: [] };
}
|
||
|
||
/**
 * Tell whether a URL points at a Bing, Baidu, Sogou or Google host.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean} True for those hosts, and also for URLs that cannot be parsed.
 */
function isSearchOrEngineUrl(url) {
  let hostname;
  try {
    hostname = new URL(url).hostname;
  } catch {
    // Unparseable URLs are treated as unusable.
    return true;
  }
  return /(?:bing|baidu|sogou|google)\./.test(hostname.toLowerCase());
}
|
||
|
||
/**
 * Check whether text mentions the given platform by one of its known names.
 *
 * @param {string} text - Evidence text.
 * @param {string} platform - Platform key.
 * @returns {boolean} True when a known term occurs in the normalized text, or
 *   when no terms are defined for the platform.
 */
function platformEvidenceMatches(text, platform) {
  const normalized = normalizeSearchText(text);
  const termsByPlatform = {
    tencent: ["腾讯视频", "腾讯", "小企鹅乐园", "企鹅乐园", "vqq"],
    youku: ["优酷", "youku"],
    iqiyi: ["爱奇艺", "iqiyi"],
    mgtv: ["芒果tv", "芒果", "mgtv"],
  };
  const terms = termsByPlatform[platform] || [];

  if (terms.length === 0) {
    return true;
  }
  return terms.some((term) => normalized.includes(normalizeSearchText(term)));
}
|
||
|
||
/**
 * Extract candidates from structured result blocks (RSS `<item>` elements or
 * result-like `<div>`s).
 *
 * Only blocks whose text matches the keyword are used. Every acceptable URL in
 * such a block gets its URL score plus a bonus of 80, with the block text as
 * evidence. The same URL may appear more than once in the output.
 *
 * @param {string} decoded - Already-unescaped page text.
 * @param {string} baseUrl - Base for resolving relative links.
 * @param {object} config - Platform config used for URL scoring.
 * @param {string} keyword - Keyword the block text must match.
 * @returns {{url: string, score: number, evidence: string}[]}
 */
function extractStructuredSearchCandidates(decoded, baseUrl, config, keyword) {
  const found = [];

  const blockPatterns = [
    /<item\b[\s\S]*?<\/item>/gi,
    /<div\b[^>]*(?:class|id)=["'][^"']*(?:result|b_algo|vrwrap|news-box|result-op)[^"']*["'][\s\S]*?<\/div>/gi,
  ];
  const blocks = blockPatterns.flatMap((pattern) => [...decoded.matchAll(pattern)]);

  for (const [rawBlock] of blocks) {
    const blockText = decodePercentText(rawBlock)
      .replace(/<[^>]+>/g, " ")
      .replace(/\s+/g, " ")
      .trim();

    if (keywordMatchScore(blockText, keyword) <= 0) continue;

    const urlMatches = [
      ...rawBlock.matchAll(/<link>\s*([^<\s]+)\s*<\/link>/gi),
      ...rawBlock.matchAll(/\bhref\s*=\s*["']([^"']+)["']/gi),
      ...rawBlock.matchAll(/["']((?:https?:)?\/\/[^"']+)["']/gi),
      ...rawBlock.matchAll(/\b((?:https?:)?\/\/(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi),
    ];

    for (const urlMatch of urlMatches) {
      const url = normalizeUrl(urlMatch[1], baseUrl);
      if (!url) continue;

      const urlScore = scoreUrl(url, config, keyword);
      if (urlScore > 0) {
        found.push({ url, score: urlScore + 80, evidence: blockText });
      }
    }
  }

  return found;
}
|
||
|
||
/**
 * Rank candidates by checking each one's page title against the keyword.
 *
 * Only the first eight candidates are inspected, one after another. A candidate
 * is dropped when its page title conflicts with the keyword, or when neither
 * its evidence nor its title matches the keyword. The keyword score is added to
 * the candidate's score.
 *
 * @param {string} platform - Platform key, used for request headers.
 * @param {object[]} candidates - Candidates with `url`, `score` and `evidence`.
 * @param {string} keyword - Keyword to match.
 * @param {AbortSignal} [signal] - Optional caller abort signal.
 * @returns {Promise<object[]>} Up to ten candidates extended with `pageTitle`
 *   and `keywordScore`, best first.
 */
async function rankCandidates(platform, candidates, keyword, signal) {
  const kept = [];

  for (const candidate of candidates.slice(0, 8)) {
    const pageTitle = await fetchPageTitle(platform, candidate.url, signal);
    if (titleConflictsWithKeyword(pageTitle, keyword)) continue;

    const keywordScore = keywordMatchScore(`${candidate.evidence} ${pageTitle}`, keyword);
    if (keywordScore > 0) {
      kept.push({
        ...candidate,
        pageTitle,
        keywordScore,
        score: candidate.score + keywordScore,
      });
    }
  }

  kept.sort((left, right) => right.score - left.score);
  return kept.slice(0, 10);
}
|
||
|
||
/**
 * Fetch a page and return the text of its `<title>` element.
 *
 * @param {string} platform - Platform key, used for request headers.
 * @param {string} url - Page URL.
 * @param {AbortSignal} [signal] - Optional caller abort signal.
 * @returns {Promise<string>} Unescaped title with whitespace collapsed, or ""
 *   when the page has no title or the request fails.
 */
async function fetchPageTitle(platform, url, signal) {
  try {
    const response = await fetch(url, {
      headers: getRequestHeaders(platform),
      redirect: "follow",
      signal: fetchSignal(signal, 6_000),
    });
    const html = await response.text();
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    const rawTitle = titleMatch?.[1] || "";
    return decodeEscapedText(rawTitle).replace(/\s+/g, " ").trim();
  } catch {
    return "";
  }
}
|
||
|
||
/**
 * Tell whether any candidate is strong enough to stop searching.
 *
 * @param {{score: number}[]} candidates - Candidate list.
 * @returns {boolean} True when at least one score is 180 or higher.
 */
function hasStrongCandidate(candidates) {
  const isStrong = (candidate) => candidate.score >= 180;
  return candidates.some(isStrong);
}
|
||
|
||
/**
 * Tell whether a page title contradicts the keyword.
 *
 * An empty or missing title never conflicts. A non-empty title conflicts when
 * its keyword match score is zero.
 *
 * @param {*} pageTitle - Page title, possibly empty.
 * @param {string} keyword - Keyword to match.
 * @returns {boolean}
 */
export function titleConflictsWithKeyword(pageTitle, keyword) {
  const title = String(pageTitle || "").trim();
  return title !== "" && keywordMatchScore(title, keyword) === 0;
}
|
||
|
||
/**
 * Detect a verification or rate-limit page by looking for known markers.
 *
 * @param {string} html - Page text.
 * @returns {boolean} True when any marker occurs (case-insensitive).
 */
function isBlockedSearchPage(html) {
  const blockMarkers = /_____tmd_____|x5secdata|captcha|验证码|安全验证|人机验证|访问过于频繁|请求过于频繁/i;
  return blockMarkers.test(html);
}
|
||
|
||
/**
 * Score how well text matches a keyword.
 *
 * Scoring: 220 when the normalized text contains the whole normalized keyword,
 * 180 when every keyword token occurs, plus 45 per token that occurs.
 *
 * @param {string} text - Text to search in.
 * @param {string} keyword - Keyword to look for.
 * @returns {number} Score; 0 when the text is empty or the keyword has no usable tokens.
 */
function keywordMatchScore(text, keyword) {
  const haystack = normalizeSearchText(text);
  const tokens = keywordTokens(keyword);
  if (!haystack || tokens.length === 0) {
    return 0;
  }

  const wholePhraseBonus = haystack.includes(normalizeSearchText(keyword)) ? 220 : 0;

  let hits = 0;
  for (const token of tokens) {
    if (haystack.includes(token)) {
      hits += 1;
    }
  }
  const allTokensBonus = hits === tokens.length ? 180 : 0;

  return wholePhraseBonus + allTokensBonus + hits * 45;
}
|
||
|
||
/**
 * Splits a search keyword into unique, normalized tokens.
 *
 * The keyword is split on whitespace, ASCII and full-width colons, hyphens,
 * underscores and slashes. Each piece is normalized with
 * `normalizeSearchText`; pieces shorter than 2 characters after normalization
 * are discarded.
 *
 * @param {string} keyword - Raw search keyword.
 * @returns {string[]} De-duplicated tokens in first-occurrence order.
 */
function keywordTokens(keyword) {
  // \uFF1A is the full-width colon commonly used in Chinese titles; it is
  // written as an escape so it cannot be folded into an ASCII ":" by tooling.
  const separators = /[\s:\uFF1A\-_/]+/;
  const tokens = String(keyword)
    .split(separators)
    .map(normalizeSearchText)
    .filter((token) => token.length >= 2);
  return [...new Set(tokens)];
}
|
||
|
||
/**
 * Normalizes text for keyword comparison.
 *
 * Percent-escapes are decoded, the text is lower-cased, and title
 * punctuation is removed: CJK book-title marks and lenticular brackets,
 * square brackets, ASCII and full-width parentheses and colons, whitespace,
 * hyphens, underscores and slashes.
 *
 * @param {*} value - Text to normalize; falsy values are treated as "".
 * @returns {string} The normalized text.
 */
function normalizeSearchText(value) {
  // \uFF08 \uFF09 \uFF1A are the full-width "(", ")" and ":"; they are written
  // as escapes so they cannot be folded into their ASCII forms by tooling.
  const punctuation = /[《》【】[\]()\uFF08\uFF09:\uFF1A\s\-_/]+/g;
  return decodePercentText(String(value || ""))
    .toLowerCase()
    .replace(punctuation, "");
}
|
||
|
||
/**
 * Scores a URL as a potential video/detail page for one platform.
 *
 * A URL scores 0 when it cannot be parsed, its host is not one of
 * `config.allowHosts` (or a subdomain of one), its path matches an exclude
 * pattern, it is a bare `/a_` or `/v_` path, it is a Youku `/video` URL
 * without an `s` query parameter, its path matches no include pattern, it
 * looks truncated (contains "..." or an encoded ellipsis), or it points at a
 * static asset. Otherwise it starts at 80 and earns small bonuses for
 * recognised path shapes and for containing the keyword.
 *
 * @param {string} url - Absolute URL to score.
 * @param {{allowHosts: string[], includePaths: RegExp[], excludePaths: RegExp[]}} config - Platform search config.
 * @param {string} keyword - Search keyword.
 * @returns {number} 0 for rejected URLs, otherwise a positive score.
 */
function scoreUrl(url, config, keyword) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return 0;
  }

  const host = parsed.hostname.toLowerCase();
  const path = safeDecodeURIComponent(parsed.pathname);
  const matchesPath = (pattern) => pattern.test(path);

  const hostAllowed = config.allowHosts.some(
    (allowedHost) => host === allowedHost || host.endsWith(`.${allowedHost}`),
  );
  if (!hostAllowed) return 0;
  if (config.excludePaths.some(matchesPath)) return 0;
  if (/^\/(?:a_|v_)\/?$/.test(path)) return 0;

  const isBareYoukuVideo =
    host.includes("youku.com") && path === "/video" && !parsed.searchParams.get("s");
  if (isBareYoukuVideo) return 0;
  if (!config.includePaths.some(matchesPath)) return 0;

  // Path shapes that earn the larger (+20) and the smaller (+5) bonus.
  const strongPathHints = [/\/a_/, /\/show_page\//, /\/x\/cover\//, /\/b\//];
  const weakPathHints = [/\/v_/, /\/v_show\//, /\/x\/page\//];

  let score = 80;
  if (strongPathHints.some(matchesPath)) score += 20;
  if (weakPathHints.some(matchesPath)) score += 5;
  if (url.includes(encodeURIComponent(keyword)) || url.includes(keyword)) score += 5;

  const looksTruncated = url.includes("...") || url.includes("%E2%80%A6");
  const isStaticAsset = /\.(jpg|jpeg|png|gif|webp|css|js|ico|svg)$/i.test(path);
  return looksTruncated || isStaticAsset ? 0 : score;
}
|
||
|
||
/**
 * Merges several candidate lists into one de-duplicated, ranked list.
 *
 * Entries without a `url` are ignored. When the same URL appears more than
 * once, the entry with the strictly higher score wins (the earlier one is
 * kept on ties).
 *
 * @param {...Array<{url: string, score: number}>} groups - Candidate lists.
 * @returns {Array<object>} Up to 10 candidates, highest score first.
 */
function mergeCandidates(...groups) {
  const bestByUrl = groups.flat().reduce((best, candidate) => {
    if (!candidate?.url) return best;
    const current = best.get(candidate.url);
    const isBetter = !current || candidate.score > current.score;
    if (isBetter) best.set(candidate.url, candidate);
    return best;
  }, new Map());

  const ranked = Array.from(bestByUrl.values());
  ranked.sort((left, right) => right.score - left.score);
  return ranked.slice(0, 10);
}
|
||
|
||
/**
 * Turns a raw link found in a page into a clean absolute URL.
 *
 * Steps: unescape the raw text, reject `javascript:` and fragment-only links,
 * add `https://` to scheme-less links that start with a known video host,
 * resolve against `baseUrl`, unwrap redirect/wrapper links, append `.html`
 * to Tencent cover paths where needed, and strip tracking parameters.
 *
 * @param {string} rawUrl - Link text as found in the source document.
 * @param {string} [baseUrl] - Base used to resolve relative links.
 * @returns {string} The normalized URL, or "" when the link is empty,
 *   unusable or cannot be parsed.
 */
function normalizeUrl(rawUrl, baseUrl) {
  if (!rawUrl) return "";
  const trimmed = decodeEscapedText(rawUrl.trim());
  if (trimmed.startsWith("javascript:") || trimmed.startsWith("#")) return "";
  try {
    const absolute = /^(?:https?:)?\/\//i.test(trimmed)
      ? trimmed
      : /^(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\//i.test(trimmed)
        ? `https://${trimmed}`
        : trimmed;
    const parsed = new URL(absolute, baseUrl);

    const unwrapped = decodeWrappedTarget(parsed);
    if (unwrapped) {
      // A wrapped target may be protocol-relative ("//host/path"). `new URL`
      // rejects that without a base, so borrow the wrapper link's scheme.
      const target = unwrapped.startsWith("//") ? `${parsed.protocol}${unwrapped}` : unwrapped;
      return new URL(target).toString();
    }

    normalizeTencentPath(parsed);
    return cleanupUrl(parsed);
  } catch {
    // Unparseable links are treated as "no URL".
    return "";
  }
}
|
||
|
||
/**
 * Appends ".html" to a Tencent Video cover path that has no file extension.
 *
 * Mutates `parsed` in place; URLs on other hosts, outside `/x/cover/`, or
 * already carrying an extension are left untouched.
 *
 * @param {URL} parsed - URL object to adjust.
 * @returns {void}
 */
function normalizeTencentPath(parsed) {
  const isTencentCover =
    parsed.hostname === "v.qq.com" && /^\/x\/cover\//.test(parsed.pathname);
  if (isTencentCover && !pathExtension(parsed.pathname)) {
    parsed.pathname += ".html";
  }
}
|
||
|
||
/**
 * Checks whether a path ends with a file extension (a dot followed by one or
 * more ASCII letters or digits).
 *
 * @param {string} pathname - URL path to inspect.
 * @returns {boolean} True when the path ends with such an extension.
 */
function pathExtension(pathname) {
  const trailingExtension = /\.[a-z0-9]+$/i;
  return trailingExtension.test(pathname);
}
|
||
|
||
/**
 * Extracts the real destination from a redirect/wrapper link.
 *
 * Two forms are recognised:
 *  - Bing result links, whose `u` parameter carries the target as base64url,
 *    optionally prefixed with "a1";
 *  - generic wrappers that carry the target in one of the `url`, `u`,
 *    `target`, `to`, `redirect` or `jump` query parameters.
 *
 * A value is only accepted when it looks like an http(s) or protocol-relative
 * URL, so an undecodable Bing payload falls through to the generic scan
 * instead of being returned as garbage.
 *
 * @param {URL} parsed - The (possibly wrapping) URL.
 * @returns {string} The wrapped target, or "" when none is found.
 */
function decodeWrappedTarget(parsed) {
  const looksLikeUrl = (candidate) => /^https?:\/\//i.test(candidate) || /^\/\//.test(candidate);

  // Match bing.com and its subdomains only (not e.g. "notbing.com").
  const host = parsed.hostname;
  if (host === "bing.com" || host.endsWith(".bing.com")) {
    const encoded = parsed.searchParams.get("u");
    if (encoded) {
      const payload = encoded.startsWith("a1") ? encoded.slice(2) : encoded;
      const decoded = Buffer.from(payload, "base64url").toString("utf8");
      if (looksLikeUrl(decoded)) return decoded;
    }
  }

  for (const key of ["url", "u", "target", "to", "redirect", "jump"]) {
    const value = parsed.searchParams.get(key);
    if (!value) continue;
    const decoded = decodePercentText(value);
    if (looksLikeUrl(decoded)) return decoded;
  }

  return "";
}
|
||
|
||
/**
 * Strips the fragment and tracking/search query parameters from a URL.
 *
 * A parameter is removed when its name starts (case-insensitively) with one
 * of: ptag, from, fromvsogou, query, wd, q, src, source, utm_, spm, cxid.
 * Mutates `parsed` in place.
 *
 * @param {URL} parsed - URL object to clean.
 * @returns {string} The serialized, cleaned URL.
 */
function cleanupUrl(parsed) {
  const trackingKey = /^(ptag|from|fromvsogou|query|wd|q|src|source|utm_|spm|cxid)/i;
  parsed.hash = "";

  // Snapshot the keys first: deleting while iterating the live view is unsafe.
  const keys = Array.from(parsed.searchParams.keys());
  keys
    .filter((key) => trackingKey.test(key))
    .forEach((key) => parsed.searchParams.delete(key));

  return parsed.toString();
}
|
||
|
||
/**
 * Undoes JavaScript-style escaping found in text embedded in scripts or JSON,
 * then decodes HTML entities.
 *
 * Handles `\uXXXX` and `\xXX` escapes and escaped slashes (`\/`), in that
 * order, before handing the result to `decodeHtmlEntities`.
 *
 * @param {*} value - Text to decode (coerced to string).
 * @returns {string} The decoded text.
 */
function decodeEscapedText(value) {
  const fromHex = (_, hex) => String.fromCharCode(Number.parseInt(hex, 16));
  const unescaped = String(value)
    .replace(/\\u([0-9a-f]{4})/gi, fromHex)
    .replace(/\\x([0-9a-f]{2})/gi, fromHex)
    .replace(/\\\//g, "/");
  return decodeHtmlEntities(unescaped);
}
|
||
|
||
/**
 * Decodes HTML character references in a string.
 *
 * Supports the named references nbsp (decoded to a plain space), quot, apos,
 * amp, lt and gt, plus decimal and hexadecimal numeric references. Decoding
 * is done in a single pass, so an escaped ampersand followed by "lt;" yields
 * the literal reference text rather than "<". Unknown names and out-of-range
 * code points are left untouched.
 *
 * @param {*} value - Text to decode (coerced to string).
 * @returns {string} The decoded text.
 */
function decodeHtmlEntities(value) {
  const namedEntities = new Map([
    ["nbsp", " "],
    ["quot", "\""],
    ["apos", "'"],
    ["amp", "&"],
    ["lt", "<"],
    ["gt", ">"],
  ]);

  return String(value).replace(
    /&(?:#[xX]([0-9a-fA-F]+)|#([0-9]+)|([a-zA-Z]+));/g,
    (match, hex, decimal, name) => {
      if (name !== undefined) return namedEntities.get(name) ?? match;
      const codePoint = hex !== undefined
        ? Number.parseInt(hex, 16)
        : Number.parseInt(decimal, 10);
      // String.fromCodePoint throws above U+10FFFF; keep such references as-is.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    },
  );
}
|
||
|
||
/**
 * Cuts a readable snippet out of a larger HTML/text blob.
 *
 * Takes `padding` characters on each side of `index`, decodes
 * percent-escapes, replaces tags with spaces and collapses whitespace.
 *
 * @param {string} text - Source text.
 * @param {number} index - Position the snippet is centred on.
 * @param {number} [padding=160] - Characters to keep on each side.
 * @returns {string} The cleaned snippet.
 */
function cleanSnippet(text, index, padding = 160) {
  const from = Math.max(0, index - padding);
  const to = Math.min(text.length, index + padding);
  const window = text.slice(from, to);

  const withoutTags = decodePercentText(window).replace(/<[^>]+>/g, " ");
  return withoutTags.replace(/\s+/g, " ").trim();
}
|
||
|
||
/**
 * Decodes percent-escaped runs inside arbitrary text.
 *
 * Each maximal run of `%XX` escapes is decoded as a unit; a run that is not
 * valid UTF-8 is left exactly as it was, and text outside of runs is never
 * touched (so a stray "%" cannot make the whole string fail).
 *
 * @param {*} value - Text to decode (coerced to string).
 * @returns {string} The text with decodable runs replaced.
 */
function decodePercentText(value) {
  const escapedRun = /%[0-9a-f]{2}(?:%[0-9a-f]{2})*/gi;
  const decodeRun = (run) => {
    try {
      return decodeURIComponent(run);
    } catch {
      return run;
    }
  };
  return String(value).replace(escapedRun, decodeRun);
}
|
||
|
||
/**
 * `decodeURIComponent` that never throws.
 *
 * @param {*} value - Encoded text.
 * @returns {string} The decoded text, or the input as a string (with falsy
 *   input mapped to "") when it is malformed.
 */
function safeDecodeURIComponent(value) {
  let decoded;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Malformed escape sequence: fall back to the raw text.
    decoded = String(value || "");
  }
  return decoded;
}
|