import crypto from "node:crypto";
import { getRequestHeaders } from "./sites.js";

/** Timeout (ms) for the request to a platform's primary search page. */
const SEARCH_TIMEOUT_MS = 6_000;

/** Overall time budget (ms) that findProgramPageQuick gives a whole lookup. */
const QUICK_SEARCH_TIMEOUT_MS = 6_000;

/** Per-request timeout (ms) for the site-search, Tencent API and search-engine stages. */
const STAGE_FETCH_TIMEOUT_MS = 8_000;

/**
 * Per-platform search settings, keyed by platform id.
 *
 * - `searchUrl(keyword)`: primary search page URL.
 * - `preferFallback`: when truthy the primary page is not fetched at all.
 * - `siteSearchUrls`: builders for the platform's own search pages, tried in order.
 * - `allowHosts` / `includePaths` / `excludePaths`: host and path filters handed to
 *   the URL scoring code.
 * - `fallbackQueries`: query builders for the generic search-engine stage, tried in order.
 */
const SEARCH_CONFIGS = {
  tencent: {
    searchUrl: (keyword) => `https://v.qq.com/x/search/?q=${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://v.qq.com/x/search/?q=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["v.qq.com"],
    includePaths: [/\/x\/cover\//, /\/x\/page\//],
    excludePaths: [/\/x\/search\//, /\/search/],
    fallbackQueries: [
      (keyword) => `site:v.qq.com/x/cover ${keyword} 腾讯视频`,
      (keyword) => `site:v.qq.com/x/page ${keyword} 腾讯视频`,
      (keyword) => `${keyword} 腾讯视频 少儿`,
      (keyword) => `${keyword} 小企鹅乐园`,
      (keyword) => `${keyword} 腾讯视频`,
    ],
  },
  youku: {
    searchUrl: (keyword) => `https://so.youku.com/search_video/q_${encodeURIComponent(keyword)}`,
    preferFallback: true,
    siteSearchUrls: [
      (keyword) => `https://so.youku.com/search_video/q_${encodeURIComponent(keyword)}`,
      (keyword) => `https://www.youku.com/search_video?keyword=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["v.youku.com", "www.youku.com", "youku.com"],
    includePaths: [/\/v_show\//, /^\/video$/, /\/show_page\//],
    excludePaths: [/\/search/],
    fallbackQueries: [
      (keyword) => `site:v.youku.com/v_show ${keyword}`,
      (keyword) => `site:v.youku.com/video ${keyword}`,
      (keyword) => `site:www.youku.com/show_page ${keyword}`,
      (keyword) => `site:youku.com ${keyword} youku`,
      (keyword) => `site:v.youku.com ${keyword} 优酷`,
      (keyword) => `site:youku.com/show_page ${keyword} 优酷`,
      (keyword) => `${keyword} 优酷`,
    ],
  },
  iqiyi: {
    searchUrl: (keyword) => `https://so.iqiyi.com/so/q_${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://so.iqiyi.com/so/q_${encodeURIComponent(keyword)}`,
      (keyword) => `https://www.iqiyi.com/search?keyword=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["www.iqiyi.com"],
    includePaths: [/\/v_/, /\/a_/],
    excludePaths: [/\/so(?:\/|$)/, /\/search/],
    fallbackQueries: [
      (keyword) => `${keyword} 爱奇艺`,
      (keyword) => `site:www.iqiyi.com/a_ ${keyword} 爱奇艺 热度`,
      (keyword) => `site:www.iqiyi.com/v_ ${keyword} 爱奇艺`,
    ],
  },
  mgtv: {
    searchUrl: (keyword) => `https://so.mgtv.com/so?k=${encodeURIComponent(keyword)}`,
    siteSearchUrls: [
      (keyword) => `https://so.mgtv.com/so?k=${encodeURIComponent(keyword)}`,
    ],
    allowHosts: ["www.mgtv.com", "mgtv.com"],
    includePaths: [/\/b\//, /\/h\//, /\/l\//],
    excludePaths: [/\/so/],
    fallbackQueries: [
      (keyword) => `site:www.mgtv.com/h ${keyword} 芒果TV`,
      (keyword) => `site:www.mgtv.com/b ${keyword} 芒果TV`,
      (keyword) => `${keyword} 芒果TV`,
    ],
  },
};

/**
 * Builds the result object returned for every failed lookup.
 *
 * @param {string} platform - Platform id the lookup was requested for.
 * @param {string} keyword - Keyword the lookup was requested for.
 * @param {string} message - Human-readable failure reason.
 * @returns {{platform: string, keyword: string, url: string, status: string, error: string, candidates: Array, searchUrl: string}}
 */
function buildErrorResult(platform, keyword, message) {
  return {
    platform,
    keyword,
    url: "",
    status: "error",
    error: message,
    candidates: [],
    searchUrl: "",
  };
}

/**
 * Finds the program page for `keyword` on `platform`.
 *
 * Stages run in order and each later stage only runs while no strong candidate
 * has been found: primary search page, the platform's site searches, the Tencent
 * search API (tencent only), a DuckDuckGo query (iqiyi only), then the generic
 * search-engine queries.
 *
 * Never rejects: failures are reported as a result with `status: "error"`.
 *
 * @param {string} platform - Key of SEARCH_CONFIGS.
 * @param {string} keyword - Program title to look for.
 * @param {{signal?: AbortSignal}} [options] - Optional abort signal applied to every request.
 * @returns {Promise<object>} `{ platform, keyword, url, status, error, candidates, searchUrl }`
 *   where `status` is `"ok"`, `"no_match"` or `"error"`.
 */
export async function findProgramPage(platform, keyword, options = {}) {
  // Own-property check so inherited keys such as "constructor" count as unsupported.
  if (!Object.hasOwn(SEARCH_CONFIGS, platform)) {
    return buildErrorResult(platform, keyword, `unsupported platform: ${platform}`);
  }
  const config = SEARCH_CONFIGS[platform];
  const signal = options?.signal;

  try {
    let keywordAliases = [keyword];
    if (platform === "youku") {
      keywordAliases = await youkuHomeSearchKeywords(keyword, signal);
    } else if (platform === "iqiyi") {
      keywordAliases = iqiyiSearchKeywords(keyword);
    }

    const searchUrl = config.searchUrl(keyword);
    // preferFallback platforms skip the primary page and are treated as blocked from the start.
    let blockedSearch = Boolean(config.preferFallback);
    let candidates = [];

    if (!config.preferFallback) {
      const response = await fetch(searchUrl, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(signal, SEARCH_TIMEOUT_MS),
      });
      const html = await response.text();
      blockedSearch = response.status === 403
        || response.status === 429
        || isBlockedSearchPage(html);
      if (!response.ok && !blockedSearch) {
        return buildErrorResult(platform, keyword, "search HTTP error");
      }
      if (!blockedSearch) {
        const found = await candidateUrlsFromHtml(platform, html, searchUrl, config, keyword, signal);
        candidates = await rankCandidates(platform, found, keyword, signal);
      }
    }

    let matchedSearchUrl = searchUrl;

    if (!hasStrongCandidate(candidates) && config.siteSearchUrls?.length) {
      const siteSearch = await findFromSiteSearches(platform, config, keywordAliases, signal);
      candidates = mergeCandidates(candidates, siteSearch.candidates);
      matchedSearchUrl = siteSearch.searchUrl || matchedSearchUrl;
    }

    if (platform === "tencent" && !hasStrongCandidate(candidates)) {
      const stationSearch = await findFromTencentStationSearch(config, keywordAliases, signal);
      candidates = mergeCandidates(candidates, stationSearch.candidates);
      matchedSearchUrl = stationSearch.searchUrl || matchedSearchUrl;
    }

    if (platform === "iqiyi" && !hasStrongCandidate(candidates)) {
      const iqiyiFallback = await findIqiyiFromDuckDuckGo(config, keywordAliases, signal);
      candidates = mergeCandidates(candidates, iqiyiFallback.candidates);
      matchedSearchUrl = iqiyiFallback.searchUrl || matchedSearchUrl;
    }

    if (!hasStrongCandidate(candidates)) {
      const fallback = await findFromFallbackSearch(platform, config, keywordAliases, signal);
      candidates = mergeCandidates(candidates, fallback.candidates);
      matchedSearchUrl = fallback.searchUrl || matchedSearchUrl;
    }

    const best = candidates[0];
    let error = "";
    if (!best) {
      error = blockedSearch
        ? "search page requires verification"
        : "no program page found from search page";
    }
    return {
      platform,
      keyword,
      url: best?.url || "",
      status: best ? "ok" : "no_match",
      error,
      candidates,
      searchUrl: matchedSearchUrl,
    };
  } catch (failure) {
    return buildErrorResult(platform, keyword, failure?.message ?? String(failure));
  }
}

/**
 * Runs findProgramPage under an overall budget of QUICK_SEARCH_TIMEOUT_MS.
 *
 * findProgramPage turns its own failures (including aborted requests) into
 * result objects instead of throwing, so the timeout is detected from the
 * controller's signal: when the budget expired and no URL was found, a timeout
 * error result is returned. A URL found before the abort is still returned.
 *
 * @param {string} platform - Key of SEARCH_CONFIGS.
 * @param {string} keyword - Program title to look for.
 * @returns {Promise<object>} Same shape as findProgramPage's result.
 */
export async function findProgramPageQuick(platform, keyword) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), QUICK_SEARCH_TIMEOUT_MS);
  const timeoutMessage = `quick search timeout ${QUICK_SEARCH_TIMEOUT_MS}ms`;
  try {
    const result = await findProgramPage(platform, keyword, { signal: controller.signal });
    if (controller.signal.aborted && !result.url) {
      return buildErrorResult(platform, keyword, timeoutMessage);
    }
    return result;
  } catch (failure) {
    const message = controller.signal.aborted
      ? timeoutMessage
      : failure?.message ?? String(failure);
    return buildErrorResult(platform, keyword, message);
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Combines an optional caller signal with a per-request timeout.
 *
 * @param {AbortSignal|undefined} parentSignal - Caller-supplied signal, if any.
 * @param {number} timeoutMs - Timeout for this single request.
 * @returns {AbortSignal} Signal that aborts on whichever fires first.
 */
function fetchSignal(parentSignal, timeoutMs) {
  const timeoutSignal = AbortSignal.timeout(timeoutMs);
  return parentSignal ? AbortSignal.any([parentSignal, timeoutSignal]) : timeoutSignal;
}

/**
 * Queries the platform's own search pages for each keyword alias.
 *
 * Returns as soon as a strong candidate is found; otherwise returns the largest
 * candidate list seen. Failed or blocked requests are skipped (best effort).
 *
 * @param {string} platform - Platform id, used for request headers and ranking.
 * @param {object} config - Entry of SEARCH_CONFIGS.
 * @param {string[]} keywords - Keyword aliases to try.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<{candidates: Array, searchUrl: string}>}
 */
async function findFromSiteSearches(platform, config, keywords, signal) {
  let bestCandidates = [];
  let bestSearchUrl = "";

  for (const keyword of uniqueKeywords(keywords)) {
    for (const buildSearchUrl of config.siteSearchUrls || []) {
      // Once the caller has aborted, every further fetch would reject immediately.
      if (signal?.aborted) return { candidates: bestCandidates, searchUrl: bestSearchUrl };

      const searchUrl = buildSearchUrl(keyword);
      try {
        const response = await fetch(searchUrl, {
          headers: getRequestHeaders(platform),
          redirect: "follow",
          signal: fetchSignal(signal, STAGE_FETCH_TIMEOUT_MS),
        });
        if (!response.ok) continue;

        const html = await response.text();
        if (isBlockedSearchPage(html)) continue;

        const found = await candidateUrlsFromHtml(platform, html, searchUrl, config, keyword, signal);
        const candidates = await rankCandidates(platform, found, keyword, signal);
        if (hasStrongCandidate(candidates)) return { candidates, searchUrl };
        if (candidates.length > bestCandidates.length) {
          bestCandidates = candidates;
          bestSearchUrl = searchUrl;
        }
      } catch {
        // Best effort: a failed search page must not stop the remaining ones.
        continue;
      }
    }
  }

  return { candidates: bestCandidates, searchUrl: bestSearchUrl };
}

/** Tencent search API endpoints, tried in order for every keyword alias. */
const TENCENT_SEARCH_API_URLS = [
  "https://pbaccess.video.qq.com/trpc.videosearch.mobile_search.MultiTerminalSearch/MbSearch?vversion_platform=2",
  "https://pbaccess.video.qq.com/trpc.videosearch.mobile_search.HttpMobileRecall/MbSearchHttp",
];

/**
 * Queries the Tencent search API endpoints for each keyword alias.
 *
 * Returns as soon as a strong candidate is found; otherwise returns the largest
 * candidate list seen. Failed requests are skipped (best effort).
 *
 * @param {object} config - SEARCH_CONFIGS.tencent (or a compatible config).
 * @param {string[]} keywords - Keyword aliases to try.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<{candidates: Array, searchUrl: string}>}
 */
async function findFromTencentStationSearch(config, keywords, signal) {
  let bestCandidates = [];
  let bestSearchUrl = "";

  for (const keyword of uniqueKeywords(keywords)) {
    for (const searchUrl of TENCENT_SEARCH_API_URLS) {
      if (signal?.aborted) return { candidates: bestCandidates, searchUrl: bestSearchUrl };

      try {
        const response = await fetch(searchUrl, {
          method: "POST",
          headers: getTencentSearchApiHeaders(keyword),
          body: JSON.stringify(buildTencentSearchPayload(keyword)),
          redirect: "follow",
          signal: fetchSignal(signal, STAGE_FETCH_TIMEOUT_MS),
        });
        if (!response.ok) continue;

        const json = await response.json();
        const candidates = await rankCandidates(
          "tencent",
          extractTencentSearchCandidates(json, keyword, config),
          keyword,
          signal,
        );
        if (hasStrongCandidate(candidates)) return { candidates, searchUrl };
        if (candidates.length > bestCandidates.length) {
          bestCandidates = candidates;
          bestSearchUrl = searchUrl;
        }
      } catch {
        // Best effort: try the next endpoint / keyword.
        continue;
      }
    }
  }

  return { candidates: bestCandidates, searchUrl: bestSearchUrl };
}

/**
 * Request headers for the Tencent search API: the shared tencent headers plus
 * JSON content negotiation and an origin/referer pointing at the web search page.
 *
 * @param {string} keyword - Keyword used to build the referer URL.
 * @returns {object} Header map.
 */
function getTencentSearchApiHeaders(keyword) {
  return {
    ...getRequestHeaders("tencent"),
    accept: "application/json, text/plain, */*",
    "content-type": "application/json",
    origin: "https://v.qq.com",
    referer: SEARCH_CONFIGS.tencent.searchUrl(keyword),
  };
}

/**
 * Builds the JSON body posted to the Tencent search API.
 * A fresh random `uuid` is generated for every call; all other fields are fixed.
 *
 * @param {string} keyword - Search query.
 * @returns {object} Request payload.
 */
function buildTencentSearchPayload(keyword) {
  return {
    query: keyword,
    pagenum: 0,
    pagesize: 20,
    queryFrom: 0,
    filterValue: "",
    sceneId: 21,
    searchDatakey: "",
    transInfo: "",
    isneedQc: true,
    preQid: "",
    adClientInfo: "",
    extraInfo: {
      isNewMarkLabel: "0",
      multi_terminal_pc: "1",
      themeType: "0",
      sugRelatedIds: "{}",
      appVersion: "",
      frontVersion: "26041606",
    },
    version: "26022601",
    clientType: 1,
    uuid: crypto.randomUUID(),
    retry: 0,
    featureList: [
      "DEFAULT_FEFEATURE",
      "PC_SHORT_VIDEOS_WATERFALL",
      "PC_WANT_EPISODE_V2",
      "PC_WANT_EPISODE",
    ],
  };
}

/**
 * Runs the platform's fallback queries against every general search engine
 * returned by fallbackSearchUrls, for each keyword alias.
 *
 * Returns as soon as a strong candidate is found; otherwise returns the largest
 * candidate list seen. Failed requests are skipped (best effort).
 *
 * @param {string} platform - Platform id, used for candidate extraction and ranking.
 * @param {object} config - Entry of SEARCH_CONFIGS.
 * @param {string[]} keywords - Keyword aliases to try.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<{candidates: Array, searchUrl: string}>}
 */
async function findFromFallbackSearch(platform, config, keywords, signal) {
  let bestCandidates = [];
  let bestSearchUrl = "";

  for (const keyword of uniqueKeywords(keywords)) {
    for (const buildQuery of config.fallbackQueries || []) {
      const query = buildQuery(keyword);
      for (const engine of fallbackSearchUrls(query)) {
        if (signal?.aborted) return { candidates: bestCandidates, searchUrl: bestSearchUrl };

        try {
          const response = await fetch(engine.url, {
            headers: {
              ...getRequestHeaders(""),
              referer: engine.referer,
            },
            redirect: "follow",
            signal: fetchSignal(signal, STAGE_FETCH_TIMEOUT_MS),
          });
          if (!response.ok) continue;

          const html = await response.text();
          const found = await candidateUrlsFromHtml(platform, html, engine.url, config, keyword, signal);
          const candidates = await rankCandidates(platform, found, keyword, signal);
          if (hasStrongCandidate(candidates)) return { candidates, searchUrl: engine.url };
          if (candidates.length > bestCandidates.length) {
            bestCandidates = candidates;
            bestSearchUrl = engine.url;
          }
        } catch {
          // Best effort: try the next engine / query.
          continue;
        }
      }
    }
  }

  return { candidates: bestCandidates, searchUrl: bestSearchUrl };
}
/**
 * Searches DuckDuckGo's HTML endpoint for "<keyword> 爱奇艺" and scores the
 * extracted candidates by how well their surrounding text matches the keyword
 * (no page-title fetch is made in this stage).
 *
 * @param {object} config - SEARCH_CONFIGS.iqiyi (or a compatible config).
 * @param {string[]} keywords - Keyword aliases to try.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<{candidates: Array, searchUrl: string}>} The first candidate
 *   list containing a strong candidate, or an empty result.
 */
async function findIqiyiFromDuckDuckGo(config, keywords, signal) {
  for (const keyword of uniqueKeywords(keywords)) {
    // Once the caller has aborted, every further fetch would reject immediately.
    if (signal?.aborted) break;

    const query = `${keyword} 爱奇艺`;
    const searchUrl = `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`;
    try {
      const response = await fetch(searchUrl, {
        headers: {
          ...getRequestHeaders(""),
          referer: "https://duckduckgo.com/",
        },
        redirect: "follow",
        signal: fetchSignal(signal, 8_000),
      });
      if (!response.ok) continue;

      const html = await response.text();
      const candidates = extractCandidateUrls(html, searchUrl, config, keyword)
        .map((candidate) => {
          const keywordScore = keywordMatchScore(candidate.evidence, keyword);
          return { ...candidate, keywordScore, score: candidate.score + keywordScore };
        })
        .filter((candidate) => candidate.keywordScore > 0)
        .sort((a, b) => b.score - a.score)
        .slice(0, 10);
      if (hasStrongCandidate(candidates)) return { candidates, searchUrl };
    } catch {
      // Best effort: try the next keyword alias.
      continue;
    }
  }
  return { candidates: [], searchUrl: "" };
}

/**
 * Expands `keyword` with suggestion texts from Youku's mtop search API.
 *
 * @param {string} keyword - Original keyword; always first in the result.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<string[]>} At most five distinct keywords.
 */
async function youkuHomeSearchKeywords(keyword, signal) {
  const keywords = [keyword];
  try {
    const json = await fetchYoukuMtopSearch({
      pg: "1",
      pz: "12",
      searchFrom: "home",
      utdId: "XlQcF5xQrCcCAWoLKdGqIOhS",
      ykPid: "",
      sdkver: 314,
      pcKuFlixMode: 1,
      appScene: "kubox",
      appCaller: "pc",
      s: "pc",
      device: "pc",
      platform: "pc",
      keyword,
    }, signal);
    keywords.push(...extractYoukuSuggestionTexts(json));
  } catch {
    // Suggestions are optional; fall back to the original keyword alone.
  }
  return uniqueKeywords(keywords).slice(0, 5);
}

/**
 * Calls Youku's `mtop.youku.soku.yksearch` API.
 *
 * Two requests are made: the first (signed with an empty token) is only used to
 * obtain the `_m_h5_tk` cookie; the second is signed with the token taken from
 * that cookie and sends the mtop cookies back.
 *
 * @param {object} dataObject - Request data; serialized with JSON.stringify.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<object>} Parsed JSON response, or `{}` when no token cookie was issued.
 */
async function fetchYoukuMtopSearch(dataObject, signal) {
  const appKey = "23774304";
  const api = "mtop.youku.soku.yksearch";
  const data = JSON.stringify(dataObject);
  const headers = {
    ...getRequestHeaders("youku"),
    referer: "https://www.youku.com/",
  };

  const first = await fetch(buildYoukuMtopUrl({ api, appKey, data, token: "" }), {
    headers,
    redirect: "follow",
    signal: fetchSignal(signal, 8_000),
  });
  await first.text();

  const cookieHeader = first.headers.get("set-cookie") || "";
  const token = extractMtopToken(cookieHeader);
  if (!token) return {};

  const response = await fetch(buildYoukuMtopUrl({ api, appKey, data, token }), {
    headers: {
      ...headers,
      cookie: compactMtopCookie(cookieHeader),
    },
    redirect: "follow",
    signal: fetchSignal(signal, 8_000),
  });
  return response.json();
}

/**
 * Builds a signed mtop request URL. The signature is the MD5 hex digest of
 * `token&timestamp&appKey&data`, with the current time as timestamp.
 *
 * @param {{api: string, appKey: string, data: string, token: string}} params
 * @returns {string} Request URL on acs.youku.com.
 */
function buildYoukuMtopUrl({ api, appKey, data, token }) {
  const timestamp = Date.now().toString();
  const sign = crypto
    .createHash("md5")
    .update(`${token}&${timestamp}&${appKey}&${data}`)
    .digest("hex");
  const params = new URLSearchParams({
    jsv: "2.7.2",
    appKey,
    t: timestamp,
    sign,
    api,
    v: "2.0",
    type: "GET",
    dataType: "json",
    ecode: "1",
    data,
  });
  return `https://acs.youku.com/h5/${api}/2.0/?${params.toString()}`;
}

/**
 * Extracts the token part (the text before the first underscore) of the
 * `_m_h5_tk` cookie from a Set-Cookie header value.
 *
 * @param {string} cookieHeader - Combined Set-Cookie header value.
 * @returns {string} Token, or "" when the cookie is absent.
 */
function extractMtopToken(cookieHeader) {
  const cookieValue = cookieHeader.match(/_m_h5_tk=([^_;]+)/)?.[1] || "";
  return cookieValue.split("_")[0] || "";
}

/**
 * Reduces a combined Set-Cookie header value to a Cookie request header that
 * contains only the `_m_h5*` and `mtop*` name=value pairs.
 *
 * @param {string} cookieHeader - Combined Set-Cookie header value.
 * @returns {string} Cookie pairs joined with "; ".
 */
function compactMtopCookie(cookieHeader) {
  return [...cookieHeader.matchAll(/(?:^|, )([^=;, ]+=[^;]+)/g)]
    .map((match) => match[1])
    .filter((cookie) => cookie.startsWith("_m_h5") || cookie.startsWith("mtop"))
    .join("; ");
}

/**
 * Collects every non-empty string stored under a `w`, `show_w` or `keyword` key
 * anywhere in `json`, with HTML tags removed.
 *
 * @param {unknown} json - Parsed API response.
 * @returns {string[]} Suggestion texts in traversal order.
 */
function extractYoukuSuggestionTexts(json) {
  const suggestionKeys = new Set(["w", "show_w", "keyword"]);
  const values = [];
  walkJson(json, (key, value) => {
    if (typeof value !== "string" || !suggestionKeys.has(key)) return;
    const text = stripHtml(value).trim();
    if (text) values.push(text);
  });
  return values;
}

/**
 * Depth-first walk over an object/array tree, calling `visit(key, child)` for
 * every entry before descending into it.
 *
 * @param {unknown} value - Root value; non-objects are ignored.
 * @param {(key: string, child: unknown) => void} visit - Callback per entry.
 */
function walkJson(value, visit) {
  if (!value || typeof value !== "object") return;
  for (const [key, child] of Object.entries(value)) {
    visit(key, child);
    walkJson(child, visit);
  }
}

/**
 * Removes HTML tags from a value.
 *
 * @param {unknown} value - Value to clean; falsy values become "".
 * @returns {string}
 */
function stripHtml(value) {
  return String(value || "").replace(/<[^>]+>/g, "");
}

/**
 * Trims the keywords and drops empty ones and duplicates. Two keywords are
 * duplicates when their normalizeSearchText forms are equal; the first spelling wins.
 *
 * @param {Iterable<unknown>} keywords - Keywords to deduplicate.
 * @returns {string[]}
 */
function uniqueKeywords(keywords) {
  const seen = new Set();
  const result = [];
  for (const keyword of keywords ?? []) {
    const value = String(keyword || "").trim();
    const key = normalizeSearchText(value);
    if (!value || seen.has(key)) continue;
    seen.add(key);
    result.push(value);
  }
  return result;
}

/**
 * Builds iQIYI keyword variants. A keyword shaped like
 * `<prefix><1-2 digits>之<title>` also yields "prefix 第N季 title",
 * "prefix第N季title" and "prefix title".
 *
 * @param {string} keyword - Original keyword.
 * @returns {string[]} At most five distinct keywords, the original first.
 */
export function iqiyiSearchKeywords(keyword) {
  const value = String(keyword || "").trim();
  const keywords = [value];
  const seasonMatch = value.match(/^(.+?)(\d{1,2})之(.+)$/);
  if (seasonMatch) {
    const [, prefix, season, title] = seasonMatch;
    keywords.push(
      `${prefix} 第${season}季 ${title}`,
      `${prefix}第${season}季${title}`,
      `${prefix} ${title}`,
    );
  }
  return uniqueKeywords(keywords).slice(0, 5);
}

/**
 * Search-engine result URLs for `query`, in the order they are tried.
 *
 * @param {string} query - Search query.
 * @returns {Array<{url: string, referer: string}>}
 */
function fallbackSearchUrls(query) {
  const encoded = encodeURIComponent(query);
  return [
    { url: `https://www.bing.com/search?format=rss&q=${encoded}`, referer: "https://www.bing.com/" },
    { url: `https://www.bing.com/search?q=${encoded}`, referer: "https://www.bing.com/" },
    { url: `https://duckduckgo.com/html/?q=${encoded}`, referer: "https://duckduckgo.com/" },
    { url: `https://www.baidu.com/s?wd=${encoded}`, referer: "https://www.baidu.com/" },
    { url: `https://www.sogou.com/web?query=${encoded}`, referer: "https://www.sogou.com/" },
  ];
}

/**
 * Gathers candidate URLs from a search result page: URLs found directly in the
 * page, targets of short links, and - only when fewer than two direct
 * candidates exist - candidates found on linked bridge pages.
 *
 * @param {string} platform - Platform id.
 * @param {string} html - Search result page body.
 * @param {string} baseUrl - URL the page was fetched from.
 * @param {object} config - Entry of SEARCH_CONFIGS.
 * @param {string} keyword - Keyword being searched.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<Array>} Merged candidates.
 */
async function candidateUrlsFromHtml(platform, html, baseUrl, config, keyword, signal) {
  const direct = extractCandidateUrls(html, baseUrl, config, keyword);
  const expanded = await expandShortLinkCandidates(platform, html, config, keyword, signal);
  const bridge = direct.length >= 2
    ? []
    : await expandBridgePageCandidates(platform, html, baseUrl, config, keyword, signal);
  return mergeCandidates(direct, expanded, bridge);
}

/**
 * Extracts program page candidates from page text.
 *
 * Candidates from structured result blocks are collected first; every URL-like
 * match in the text is then scored with scoreUrl. Per URL the highest-scoring
 * entry is kept (the first one on ties).
 *
 * @param {string} html - Page text (HTML, RSS or JSON-ish).
 * @param {string} baseUrl - Base for resolving relative URLs.
 * @param {object} config - Host/path filters, see SEARCH_CONFIGS.
 * @param {string} keyword - Keyword being searched.
 * @returns {Array<{url: string, score: number, evidence: string}>} Up to ten
 *   candidates, highest score first.
 */
export function extractCandidateUrls(html, baseUrl, config, keyword) {
  const decoded = decodeEscapedText(html);
  const candidates = new Map();

  for (const candidate of extractStructuredSearchCandidates(decoded, baseUrl, config, keyword)) {
    const previous = candidates.get(candidate.url);
    if (!previous || candidate.score > previous.score) {
      candidates.set(candidate.url, candidate);
    }
  }

  const linkMatches = [
    ...decoded.matchAll(/\bhref\s*=\s*["']([^"']+)["']/gi),
    ...decoded.matchAll(/\b(?:url|playUrl|pageUrl|coverUrl|jumpUrl|target)\s*[:=]\s*["']([^"']+)["']/gi),
    // RSS <link> elements (the Bing query in fallbackSearchUrls asks for RSS).
    ...decoded.matchAll(/<link>\s*([^<\s]+)\s*<\/link>/gi),
    ...decoded.matchAll(/["']((?:https?:)?\/\/[^"']+)["']/gi),
    ...decoded.matchAll(/\b((?:https?:)?\/\/(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi),
    ...decoded.matchAll(/\b((?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi),
  ];

  for (const match of linkMatches) {
    const url = normalizeUrl(match[1], baseUrl);
    if (!url) continue;
    const score = scoreUrl(url, config, keyword);
    if (score <= 0) continue;
    const previous = candidates.get(url);
    if (!previous || score > previous.score) {
      candidates.set(url, {
        url,
        score,
        evidence: cleanSnippet(decoded, match.index ?? 0, 700),
      });
    }
  }

  return [...candidates.values()]
    .sort((a, b) => b.score - a.score)
    .slice(0, 10);
}

/**
 * Extracts program page candidates from a Tencent search API response.
 * Items whose text does not match the keyword are ignored; accepted URLs get a
 * +140 bonus on top of their scoreUrl score.
 *
 * @param {object} json - Parsed API response.
 * @param {string} keyword - Keyword being searched.
 * @param {object} [config] - Host/path filters; defaults to SEARCH_CONFIGS.tencent.
 * @returns {Array<{url: string, score: number, evidence: string}>} Up to ten
 *   candidates, highest score first.
 */
export function extractTencentSearchCandidates(json, keyword, config = SEARCH_CONFIGS.tencent) {
  const candidates = new Map();
  for (const { item, boxShowName } of tencentSearchItems(json)) {
    const evidence = tencentItemEvidence(item, boxShowName);
    if (keywordMatchScore(evidence, keyword) <= 0) continue;

    for (const url of tencentItemProgramUrls(item)) {
      const score = scoreUrl(url, config, keyword);
      if (score <= 0) continue;
      const candidate = { url, score: score + 140, evidence };
      const previous = candidates.get(url);
      if (!previous || candidate.score > previous.score) {
        candidates.set(url, candidate);
      }
    }
  }
  return [...candidates.values()]
    .sort((a, b) => b.score - a.score)
    .slice(0, 10);
}

/**
 * Flattens `data.normalList` and every entry of `data.areaBoxList` into a list
 * of result items, each paired with its list's `boxShowName`.
 *
 * The response is untrusted: lists and item lists that are not arrays are
 * ignored instead of throwing, so one malformed section does not discard the
 * rest of the response.
 *
 * @param {object} json - Parsed API response.
 * @returns {Array<{item: object, boxShowName: string}>} Distinct object items.
 */
function tencentSearchItems(json) {
  const data = json?.data;
  const areaBoxes = Array.isArray(data?.areaBoxList) ? data.areaBoxList : [];
  const lists = [data?.normalList, ...areaBoxes]
    .filter((list) => list && typeof list === "object");

  const items = [];
  const seen = new Set();
  for (const list of lists) {
    const itemList = Array.isArray(list.itemList) ? list.itemList : [];
    for (const item of itemList) {
      if (!item || typeof item !== "object" || seen.has(item)) continue;
      seen.add(item);
      items.push({ item, boxShowName: list.boxShowName || "" });
    }
  }
  return items;
}

/**
 * Collects program URLs from a result item and its `videoInfo` / `doc` children:
 * a cover URL built from the node id when `dataType` is 2, plus every URL-valued
 * field after canonicalization.
 *
 * @param {object} item - Result item.
 * @returns {string[]} Distinct URLs.
 */
function tencentItemProgramUrls(item) {
  const urlKeys = ["url", "playUrl", "pageUrl", "coverUrl", "jumpUrl", "target"];
  const urls = [];
  const nodes = [item, item?.videoInfo, item?.doc].filter(Boolean);

  for (const node of nodes) {
    if (Number(node.dataType) === 2) {
      const coverUrl = tencentCoverUrlFromCid(tencentNodeId(node));
      if (coverUrl) urls.push(coverUrl);
    }
    for (const key of urlKeys) {
      const rawUrl = node[key];
      // API fields are untrusted; only non-empty strings can be URLs.
      if (typeof rawUrl !== "string" || !rawUrl) continue;
      const url = canonicalTencentProgramUrl(rawUrl);
      if (url) urls.push(url);
    }
  }
  return [...new Set(urls)];
}

/**
 * Returns the first non-empty id field of a node (`cid`, `coverId`, `cover_id`, `id`).
 *
 * @param {object} node - Result node.
 * @returns {string} Trimmed id, or "".
 */
function tencentNodeId(node) {
  return String(node?.cid || node?.coverId || node?.cover_id || node?.id || "").trim();
}

/**
 * Builds a cover page URL from a cover id.
 *
 * @param {unknown} cid - Cover id; must be 8-40 alphanumeric characters.
 * @returns {string} Cover URL, or "" when the id has another shape.
 */
function tencentCoverUrlFromCid(cid) {
  const value = String(cid || "").trim();
  if (!/^[a-z0-9]{8,40}$/i.test(value)) return "";
  return `https://v.qq.com/x/cover/${value}.html`;
}
/** Minimum score at which a candidate counts as a confident match. */
const STRONG_CANDIDATE_SCORE = 180;

/** Named HTML entities decoded by decodeHtmlEntities. */
const NAMED_HTML_ENTITIES = {
  nbsp: " ",
  quot: "\"",
  apos: "'",
  amp: "&",
  lt: "<",
  gt: ">",
};

/**
 * Normalizes a Tencent URL; cover URLs of the form `/x/cover/<cid>.html` or
 * `/x/cover/<cid>/<vid>.html` are reduced to `/x/cover/<cid>.html`.
 *
 * @param {unknown} rawUrl - URL as found in API data.
 * @returns {string} Canonical URL, or "" when `rawUrl` is not a usable URL.
 */
function canonicalTencentProgramUrl(rawUrl) {
  const url = normalizeUrl(rawUrl, "https://v.qq.com/");
  if (!url) return "";
  try {
    const parsed = new URL(url);
    const path = safeDecodeURIComponent(parsed.pathname);
    const coverMatch = path.match(/^\/x\/cover\/([^/]+)(?:\/[^/]+)?\.html$/);
    if (parsed.hostname === "v.qq.com" && coverMatch) {
      return `https://v.qq.com/x/cover/${coverMatch[1]}.html`;
    }
  } catch {
    // Unparseable URL: fall through and return the normalized form as is.
  }
  return url;
}

/**
 * Joins the box name and the item's descriptive strings into one evidence text
 * (tags stripped, trimmed, de-duplicated, space separated).
 *
 * @param {object} item - Tencent search result item.
 * @param {string} [boxShowName] - Name of the list the item came from.
 * @returns {string}
 */
function tencentItemEvidence(item, boxShowName = "") {
  const values = [boxShowName];
  collectTencentEvidenceStrings(item, values);
  const cleaned = values
    .map(stripHtml)
    .map((value) => value.trim())
    .filter(Boolean);
  return [...new Set(cleaned)].join(" ");
}

/**
 * Appends to `results` every string stored under a key that contains
 * title/name/subtitle/desc/keyword/text. Recursion stops below depth 3 and once
 * more than 80 strings have been collected.
 *
 * @param {unknown} value - Node to inspect.
 * @param {string[]} results - Output array, mutated in place.
 * @param {number} [depth] - Current nesting depth.
 */
function collectTencentEvidenceStrings(value, results, depth = 0) {
  if (!value || typeof value !== "object" || depth > 3 || results.length > 80) return;
  for (const [key, child] of Object.entries(value)) {
    if (typeof child === "string") {
      if (/title|name|subtitle|desc|keyword|text/i.test(key)) results.push(child);
    } else if (child && typeof child === "object") {
      collectTencentEvidenceStrings(child, results, depth + 1);
    }
  }
}

/**
 * Follows up to five short links found in `html` and keeps those whose final
 * URL scores as a program page (with a +120 bonus).
 *
 * @param {string} platform - Platform id.
 * @param {string} html - Page text to scan for short links.
 * @param {object} config - Host/path filters.
 * @param {string} keyword - Keyword being searched.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<Array<{url: string, score: number, evidence: string}>>}
 */
async function expandShortLinkCandidates(platform, html, config, keyword, signal) {
  const decoded = decodeEscapedText(html);
  const results = [];
  const seen = new Set();

  for (const item of extractShortLinks(decoded, keyword, platform).slice(0, 5)) {
    if (seen.has(item.url)) continue;
    seen.add(item.url);
    try {
      const response = await fetch(item.url, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(signal, 5_000),
      });
      const target = response.url || "";
      // Only the final URL is needed; cancel the unread body so it is not left pending.
      await response.body?.cancel().catch(() => {});

      const score = scoreUrl(target, config, keyword);
      if (score <= 0) continue;
      results.push({ url: target, score: score + 120, evidence: item.evidence });
    } catch {
      // Best effort: an unreachable short link is simply skipped.
    }
  }
  return results;
}

/**
 * Finds t.cn / url.cn / Weibo links in `text` whose surrounding text matches the
 * keyword and platform (each filter is skipped when its argument is empty).
 *
 * @param {string} text - Page text.
 * @param {string} [keyword] - Keyword the surrounding text must match.
 * @param {string} [platform] - Platform the surrounding text must mention.
 * @returns {Array<{url: string, evidence: string}>}
 */
export function extractShortLinks(text, keyword = "", platform = "") {
  const decoded = decodeEscapedText(text);
  // The URL ends at whitespace, quotes, angle brackets or ASCII / CJK punctuation.
  const shortLinkPattern = /\bhttps?:\/\/(?:t\.cn|url\.cn|m\.weibo\.cn\/status|weibo\.com\/ttarticle\/x\/m\/show)[^\s"'<>),;\u3002\uFF09\uFF0C\uFF1B]+/gi;
  const results = [];

  for (const match of decoded.matchAll(shortLinkPattern)) {
    const evidence = cleanSnippet(decoded, match.index ?? 0, 500);
    if (keyword && keywordMatchScore(evidence, keyword) <= 0) continue;
    if (platform && !platformEvidenceMatches(evidence, platform)) continue;
    results.push({ url: match[0], evidence });
  }
  return results;
}

/**
 * Fetches up to three bridge pages linked from a search result page and
 * extracts candidates from them: direct candidates get +60 and the bridge
 * block's text prepended to their evidence; short links on the bridge page are
 * expanded as well.
 *
 * @param {string} platform - Platform id.
 * @param {string} html - Search result page text.
 * @param {string} baseUrl - URL the page was fetched from.
 * @param {object} config - Host/path filters.
 * @param {string} keyword - Keyword being searched.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<Array>} Candidates found via bridge pages.
 */
async function expandBridgePageCandidates(platform, html, baseUrl, config, keyword, signal) {
  const results = [];
  const bridgePages = extractBridgePageUrls(html, baseUrl, keyword, platform);

  for (const bridge of bridgePages.slice(0, 3)) {
    try {
      const response = await fetch(bridge.url, {
        headers: getRequestHeaders(platform),
        redirect: "follow",
        signal: fetchSignal(signal, 6_000),
      });
      if (!response.ok) continue;

      const pageHtml = await response.text();
      const pageUrl = response.url || bridge.url;
      for (const candidate of extractCandidateUrls(pageHtml, pageUrl, config, keyword)) {
        results.push({
          ...candidate,
          score: candidate.score + 60,
          evidence: `${bridge.evidence} ${candidate.evidence}`.trim(),
        });
      }
      results.push(...await expandShortLinkCandidates(platform, pageHtml, config, keyword, signal));
    } catch {
      // Best effort: an unreachable bridge page is simply skipped.
    }
  }
  return results;
}

/**
 * Splits search result markup into result blocks: RSS `<item>` elements and
 * `<div>` elements whose class/id contains a known result marker.
 *
 * @param {string} decoded - Page text after decodeEscapedText.
 * @returns {Array<{raw: string, text: string}>} Raw markup and its plain text.
 */
function searchResultBlocks(decoded) {
  const matches = [
    ...decoded.matchAll(/<item>[\s\S]*?<\/item>/gi),
    ...decoded.matchAll(/<div[^>]*(?:class|id)=["'][^"']*(?:result|b_algo|vrwrap|news-box|result-op)[^"']*["'][\s\S]*?<\/div>/gi),
  ];
  return matches.map((match) => ({
    raw: match[0],
    text: decodePercentText(match[0])
      .replace(/<[^>]+>/g, " ")
      .replace(/\s+/g, " ")
      .trim(),
  }));
}

/**
 * Finds links to third-party pages inside result blocks that match the keyword
 * and platform. URLs that already score as program pages and search-engine URLs
 * are excluded.
 *
 * @param {string} html - Search result page text.
 * @param {string} baseUrl - Base for resolving relative URLs.
 * @param {string} [keyword] - Keyword the block text must match (skipped when empty).
 * @param {string} [platform] - Platform the block text must mention (skipped when empty).
 * @returns {Array<{url: string, evidence: string}>} Distinct bridge page URLs.
 */
export function extractBridgePageUrls(html, baseUrl, keyword = "", platform = "") {
  const results = [];
  const seen = new Set();

  for (const block of searchResultBlocks(decodeEscapedText(html))) {
    if (keyword && keywordMatchScore(block.text, keyword) <= 0) continue;
    if (platform && !platformEvidenceMatches(block.text, platform)) continue;

    const linkMatches = [
      ...block.raw.matchAll(/<link>\s*([^<\s]+)\s*<\/link>/gi),
      ...block.raw.matchAll(/\bhref\s*=\s*["']([^"']+)["']/gi),
    ];
    for (const match of linkMatches) {
      const url = normalizeUrl(match[1], baseUrl);
      if (!url || seen.has(url)) continue;
      // Program pages are handled as direct candidates, not as bridges.
      if (scoreUrl(url, configForBridge(platform), keyword) > 0) continue;
      if (isSearchOrEngineUrl(url)) continue;
      seen.add(url);
      results.push({ url, evidence: block.text });
    }
  }
  return results;
}

/**
 * Returns the platform's config, or an empty filter set for unknown platforms.
 *
 * @param {string} platform - Platform id.
 * @returns {object}
 */
function configForBridge(platform) {
  return SEARCH_CONFIGS[platform] || {
    allowHosts: [],
    includePaths: [],
    excludePaths: [],
  };
}

/**
 * Tells whether a URL belongs to one of the search engines; unparseable URLs
 * count as search-engine URLs so that they are skipped.
 *
 * @param {string} url - Absolute URL.
 * @returns {boolean}
 */
function isSearchOrEngineUrl(url) {
  try {
    const host = new URL(url).hostname.toLowerCase();
    return /(?:bing|baidu|sogou|google|duckduckgo)\./.test(host);
  } catch {
    return true;
  }
}

/**
 * Checks whether `text` mentions the platform by one of its known names.
 * Platforms without known names always match.
 *
 * @param {string} text - Evidence text.
 * @param {string} platform - Platform id.
 * @returns {boolean}
 */
function platformEvidenceMatches(text, platform) {
  const platformTerms = {
    tencent: ["腾讯视频", "腾讯", "小企鹅乐园", "企鹅乐园", "vqq"],
    youku: ["优酷", "youku"],
    iqiyi: ["爱奇艺", "iqiyi"],
    mgtv: ["芒果tv", "芒果", "mgtv"],
  };
  const terms = Object.hasOwn(platformTerms, platform) ? platformTerms[platform] : [];
  if (terms.length === 0) return true;
  const normalized = normalizeSearchText(text);
  return terms.some((term) => normalized.includes(normalizeSearchText(term)));
}

/**
 * Extracts candidates from result blocks whose text matches the keyword. Every
 * accepted URL gets a +80 bonus and the block text as evidence. The same URL
 * may appear more than once; callers de-duplicate.
 *
 * @param {string} decoded - Page text after decodeEscapedText.
 * @param {string} baseUrl - Base for resolving relative URLs.
 * @param {object} config - Host/path filters.
 * @param {string} keyword - Keyword being searched.
 * @returns {Array<{url: string, score: number, evidence: string}>}
 */
function extractStructuredSearchCandidates(decoded, baseUrl, config, keyword) {
  const results = [];

  for (const block of searchResultBlocks(decoded)) {
    if (keywordMatchScore(block.text, keyword) <= 0) continue;

    const urlMatches = [
      ...block.raw.matchAll(/<link>\s*([^<\s]+)\s*<\/link>/gi),
      ...block.raw.matchAll(/\bhref\s*=\s*["']([^"']+)["']/gi),
      ...block.raw.matchAll(/["']((?:https?:)?\/\/[^"']+)["']/gi),
      ...block.raw.matchAll(/\b((?:https?:)?\/\/(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\/[^"'<>\s]+)/gi),
    ];
    for (const match of urlMatches) {
      const url = normalizeUrl(match[1], baseUrl);
      if (!url) continue;
      const score = scoreUrl(url, config, keyword);
      if (score <= 0) continue;
      results.push({ url, score: score + 80, evidence: block.text });
    }
  }
  return results;
}

/**
 * Ranks the first eight candidates by keyword match, using each candidate's
 * evidence plus the title of its page. Candidates whose page title does not
 * match the keyword at all, or whose keyword score is zero, are dropped.
 *
 * Page titles are fetched concurrently; fetchPageTitle resolves to "" on
 * failure, so one unreachable page does not affect the others.
 *
 * @param {string} platform - Platform id, used for request headers.
 * @param {Array<{url: string, score: number, evidence: string}>} candidates
 * @param {string} keyword - Keyword being searched.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<Array>} Ranked candidates with `pageTitle` and `keywordScore` added.
 */
async function rankCandidates(platform, candidates, keyword, signal) {
  const shortlist = candidates.slice(0, 8);
  const pageTitles = await Promise.all(
    shortlist.map((candidate) => fetchPageTitle(platform, candidate.url, signal)),
  );

  const ranked = [];
  shortlist.forEach((candidate, index) => {
    const pageTitle = pageTitles[index];
    if (titleConflictsWithKeyword(pageTitle, keyword)) return;
    const keywordScore = keywordMatchScore(`${candidate.evidence} ${pageTitle}`, keyword);
    if (keywordScore <= 0) return;
    ranked.push({
      ...candidate,
      pageTitle,
      keywordScore,
      score: candidate.score + keywordScore,
    });
  });

  return ranked
    .sort((a, b) => b.score - a.score)
    .slice(0, 10);
}

/**
 * Extracts the text of the first `<title>` element, entity-decoded and with
 * whitespace collapsed.
 *
 * @param {string} html - Page markup.
 * @returns {string} Title text, or "" when there is no title element.
 */
function extractHtmlTitle(html) {
  const rawTitle = String(html).match(/<title[^>]*>([\s\S]*?)<\/title>/i)?.[1] || "";
  return decodeEscapedText(rawTitle)
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Downloads a page and returns its title.
 *
 * @param {string} platform - Platform id, used for request headers.
 * @param {string} url - Page URL.
 * @param {AbortSignal} [signal] - Caller's abort signal.
 * @returns {Promise<string>} Title, or "" when the request fails.
 */
async function fetchPageTitle(platform, url, signal) {
  try {
    const response = await fetch(url, {
      headers: getRequestHeaders(platform),
      redirect: "follow",
      signal: fetchSignal(signal, 6_000),
    });
    return extractHtmlTitle(await response.text());
  } catch {
    return "";
  }
}

/**
 * @param {Array<{score: number}>} candidates
 * @returns {boolean} True when any candidate reaches STRONG_CANDIDATE_SCORE.
 */
function hasStrongCandidate(candidates) {
  return candidates.some((candidate) => candidate.score >= STRONG_CANDIDATE_SCORE);
}

/**
 * A non-empty page title conflicts with the keyword when it matches none of the
 * keyword's tokens. An empty title never conflicts.
 *
 * @param {string} pageTitle - Title of the candidate page.
 * @param {string} keyword - Keyword being searched.
 * @returns {boolean}
 */
export function titleConflictsWithKeyword(pageTitle, keyword) {
  const title = String(pageTitle || "").trim();
  if (!title) return false;
  return keywordMatchScore(title, keyword) === 0;
}

/**
 * Detects verification / rate-limit pages by their marker strings.
 *
 * @param {string} html - Page text.
 * @returns {boolean}
 */
function isBlockedSearchPage(html) {
  return /_____tmd_____|x5secdata|captcha|验证码|安全验证|人机验证|访问过于频繁|请求过于频繁/i.test(html);
}

/**
 * Scores how well `text` matches `keyword`: +220 when the whole normalized
 * keyword occurs, +180 when every token occurs, +45 per matched token.
 *
 * @param {string} text - Text to search in.
 * @param {string} keyword - Keyword to look for.
 * @returns {number} 0 when nothing matches or the keyword has no usable token.
 */
function keywordMatchScore(text, keyword) {
  const haystack = normalizeSearchText(text);
  const tokens = keywordTokens(keyword);
  if (!haystack || tokens.length === 0) return 0;

  const matched = tokens.filter((token) => haystack.includes(token)).length;
  let score = matched * 45;
  if (haystack.includes(normalizeSearchText(keyword))) score += 220;
  if (matched === tokens.length) score += 180;
  return score;
}

/**
 * Splits a keyword on whitespace, colons (ASCII and full-width), hyphens,
 * underscores and slashes; tokens shorter than two characters are dropped.
 *
 * @param {string} keyword - Keyword to tokenize.
 * @returns {string[]} Distinct normalized tokens.
 */
function keywordTokens(keyword) {
  const tokens = String(keyword || "")
    .split(/[\s:\uFF1A\-_\/]+/)
    .map(normalizeSearchText)
    .filter((token) => token.length >= 2);
  return [...new Set(tokens)];
}

/**
 * Normalizes text for comparison: percent-decoded, lower-cased, with brackets,
 * parentheses and colons (ASCII and full-width), whitespace, hyphens,
 * underscores and slashes removed.
 *
 * @param {unknown} value - Text to normalize.
 * @returns {string}
 */
function normalizeSearchText(value) {
  return decodePercentText(String(value || ""))
    .toLowerCase()
    .replace(/[《》【】[\]()\uFF08\uFF09:\uFF1A\s\-_\/]+/g, "");
}

/**
 * Scores a URL as a program page for the given config.
 *
 * Returns 0 when the host is not allowed, the path is excluded or not included,
 * the URL is truncated ("..."), or it points at a static asset. Otherwise the
 * base score is 80, +20 for album/cover style paths, +5 for episode style
 * paths, +5 when the URL contains the keyword.
 *
 * @param {string} url - Absolute URL.
 * @param {{allowHosts: string[], includePaths: RegExp[], excludePaths: RegExp[]}} config
 * @param {string} keyword - Keyword being searched.
 * @returns {number}
 */
function scoreUrl(url, config, keyword) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    return 0;
  }
  const host = parsed.hostname.toLowerCase();
  const path = safeDecodeURIComponent(parsed.pathname);

  const hostAllowed = config.allowHosts
    .some((allowedHost) => host === allowedHost || host.endsWith(`.${allowedHost}`));
  if (!hostAllowed) return 0;
  if (config.excludePaths.some((pattern) => pattern.test(path))) return 0;
  if (/^\/(?:a_|v_)\/?$/.test(path)) return 0;
  if (host.includes("youku.com") && path === "/video" && !parsed.searchParams.get("s")) return 0;
  if (!config.includePaths.some((pattern) => pattern.test(path))) return 0;
  if (url.includes("...") || url.includes("%E2%80%A6")) return 0;
  if (/\.(jpg|jpeg|png|gif|webp|css|js|ico|svg)$/i.test(path)) return 0;

  let score = 80;
  if (/\/a_/.test(path) || /\/show_page\//.test(path) || /\/x\/cover\//.test(path) || /\/b\//.test(path)) {
    score += 20;
  }
  if (/\/v_/.test(path) || /\/v_show\//.test(path) || /\/x\/page\//.test(path)) {
    score += 5;
  }
  // An empty keyword is contained in every string, so it earns no bonus.
  const keywordText = String(keyword ?? "");
  if (keywordText && (url.includes(encodeURIComponent(keywordText)) || url.includes(keywordText))) {
    score += 5;
  }
  return score;
}

/**
 * Merges candidate lists, keeping the highest-scoring entry per URL (the first
 * one on ties).
 *
 * @param {...Array} groups - Candidate lists.
 * @returns {Array} Up to ten candidates, highest score first.
 */
function mergeCandidates(...groups) {
  const merged = new Map();
  for (const candidate of groups.flat()) {
    if (!candidate?.url) continue;
    const previous = merged.get(candidate.url);
    if (!previous || candidate.score > previous.score) {
      merged.set(candidate.url, candidate);
    }
  }
  return [...merged.values()]
    .sort((a, b) => b.score - a.score)
    .slice(0, 10);
}

/**
 * Turns a raw link into an absolute, cleaned URL.
 *
 * Scheme-less platform links ("v.qq.com/...") get "https://"; redirect wrappers
 * are replaced by their target; Tencent cover paths get a ".html" suffix;
 * tracking parameters and the fragment are removed.
 *
 * @param {unknown} rawUrl - Link text; anything but a non-empty string yields "".
 * @param {string} baseUrl - Base for resolving relative links.
 * @returns {string} Normalized URL, or "" when the link is unusable.
 */
function normalizeUrl(rawUrl, baseUrl) {
  if (typeof rawUrl !== "string" || !rawUrl) return "";
  const trimmed = decodeEscapedText(rawUrl.trim());
  if (trimmed.startsWith("javascript:") || trimmed.startsWith("#")) return "";

  try {
    let absolute = trimmed;
    const hasScheme = /^(?:https?:)?\/\//i.test(trimmed);
    if (!hasScheme && /^(?:v\.qq\.com|(?:v\.|www\.)?youku\.com|www\.iqiyi\.com|(?:www\.)?mgtv\.com)\//i.test(trimmed)) {
      absolute = `https://${trimmed}`;
    }
    const parsed = new URL(absolute, baseUrl);

    const unwrapped = decodeWrappedTarget(parsed);
    if (unwrapped) return new URL(unwrapped).toString();

    normalizeTencentPath(parsed);
    return cleanupUrl(parsed);
  } catch {
    return "";
  }
}

/**
 * Appends ".html" to v.qq.com cover paths that have no file extension.
 *
 * @param {URL} parsed - URL to adjust in place.
 */
function normalizeTencentPath(parsed) {
  if (parsed.hostname !== "v.qq.com") return;
  if (!/^\/x\/cover\//.test(parsed.pathname)) return;
  if (pathExtension(parsed.pathname)) return;
  parsed.pathname = `${parsed.pathname}.html`;
}

/**
 * @param {string} pathname - URL path.
 * @returns {boolean} True when the path ends with an extension.
 */
function pathExtension(pathname) {
  return /\.[a-z0-9]+$/i.test(pathname);
}

/**
 * Returns the target of a redirect-wrapper URL, or "" when `parsed` is not one.
 *
 * Handles Bing's `u` parameter (base64url, optionally prefixed with "a1") and
 * plain target parameters. `uddg` is included on the assumption that DuckDuckGo
 * result links carry their target there. Protocol-relative targets inherit the
 * wrapper's scheme so that the result is always an absolute URL.
 *
 * @param {URL} parsed - Possibly wrapping URL.
 * @returns {string} Absolute http(s) target, or "".
 */
function decodeWrappedTarget(parsed) {
  if (parsed.hostname.endsWith("bing.com")) {
    const encoded = parsed.searchParams.get("u");
    if (encoded) {
      const payload = encoded.startsWith("a1") ? encoded.slice(2) : encoded;
      const decoded = Buffer.from(payload, "base64url").toString("utf8");
      // Only accept the payload when it actually decodes to a URL.
      if (/^https?:\/\//i.test(decoded)) return decoded;
    }
  }

  for (const key of ["url", "u", "uddg", "target", "to", "redirect", "jump"]) {
    const value = parsed.searchParams.get(key);
    if (!value) continue;
    const decoded = decodePercentText(value);
    if (/^https?:\/\//i.test(decoded)) return decoded;
    if (decoded.startsWith("//")) return `${parsed.protocol}${decoded}`;
  }
  return "";
}

/**
 * Serializes a URL without its fragment and without query parameters whose name
 * starts with one of the tracking/search prefixes.
 *
 * @param {URL} parsed - URL to clean; mutated in place.
 * @returns {string}
 */
function cleanupUrl(parsed) {
  parsed.hash = "";
  // Copy the keys first: deleting while iterating searchParams would skip entries.
  for (const key of [...parsed.searchParams.keys()]) {
    if (/^(ptag|from|fromvsogou|query|wd|q|src|source|utm_|spm|cxid)/i.test(key)) {
      parsed.searchParams.delete(key);
    }
  }
  return parsed.toString();
}

/**
 * Decodes JavaScript-style escapes (`\uXXXX`, `\xXX`, `\/`) and then HTML entities.
 *
 * @param {unknown} value - Text to decode.
 * @returns {string}
 */
function decodeEscapedText(value) {
  const unescaped = String(value)
    .replace(/\\u([0-9a-f]{4})/gi, (_, hex) => String.fromCharCode(Number.parseInt(hex, 16)))
    .replace(/\\x([0-9a-f]{2})/gi, (_, hex) => String.fromCharCode(Number.parseInt(hex, 16)))
    .replace(/\\\//g, "/");
  return decodeHtmlEntities(unescaped);
}

/**
 * Decodes the named entities in NAMED_HTML_ENTITIES and numeric (decimal or
 * hexadecimal) character references in one pass, so text is never decoded
 * twice. Unknown or out-of-range references are left unchanged.
 *
 * @param {unknown} value - Text to decode.
 * @returns {string}
 */
function decodeHtmlEntities(value) {
  return String(value).replace(/&(#x[0-9a-f]{1,6}|#\d{1,7}|[a-z]+);/gi, (match, body) => {
    if (body.startsWith("#")) {
      const isHex = body[1] === "x" || body[1] === "X";
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      return codePoint > 0 && codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match;
    }
    const name = body.toLowerCase();
    return Object.hasOwn(NAMED_HTML_ENTITIES, name) ? NAMED_HTML_ENTITIES[name] : match;
  });
}

/**
 * Returns the text within `padding` characters on either side of `index`,
 * percent-decoded, with tags removed and whitespace collapsed.
 *
 * @param {string} text - Source text.
 * @param {number} index - Centre of the snippet.
 * @param {number} [padding] - Characters kept on each side.
 * @returns {string}
 */
function cleanSnippet(text, index, padding = 160) {
  const start = Math.max(0, index - padding);
  const end = Math.min(text.length, index + padding);
  return decodePercentText(text.slice(start, end))
    .replace(/<[^>]+>/g, " ")
    .replace(/\s+/g, " ")
    .trim();
}

/**
 * Decodes runs of percent-escapes inside arbitrary text; runs that are not
 * valid UTF-8 are left as they are.
 *
 * @param {unknown} value - Text to decode.
 * @returns {string}
 */
function decodePercentText(value) {
  return String(value).replace(/%[0-9a-f]{2}(?:%[0-9a-f]{2})*/gi, (match) => {
    try {
      return decodeURIComponent(match);
    } catch {
      return match;
    }
  });
}

/**
 * decodeURIComponent that returns the input as a string instead of throwing on
 * malformed escapes.
 *
 * @param {unknown} value - Text to decode.
 * @returns {string}
 */
function safeDecodeURIComponent(value) {
  try {
    return decodeURIComponent(value);
  } catch {
    return String(value || "");
  }
}