#!python
# -*- coding: utf-8 -*-
"""
yunpan1 search tool: search the yunpan1 cloud-drive sharing forum and extract
Quark (pan.quark.cn) share links from the result page.

Usage:
    py -X utf8 yunpan1_search.py <keyword>
    py -X utf8 yunpan1_search.py 遮天

Notes:
    - Discuz! has to build a search index the first time a keyword is searched
      (this may take 60-120 seconds); later searches for the same keyword
      return immediately.
    - Cookie file: tmp/yunpan1_cookies.txt (log in through Playwright first to
      obtain it).
    - Dependencies: Python standard library only.
"""

import re
import os
import sys
import urllib.request
import urllib.parse
import time

# -- Configuration ---------------------------------------------------------
BASE_URL = 'https://yunpan1.cc'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
COOKIE_FILE = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp', 'yunpan1_cookies.txt'))
OUTPUT_FILE = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp', 'quark_links.txt'))
SEARCH_URL = BASE_URL + '/search.php?mod=forum&srchtxt={keyword}&searchsubmit=yes'

# Seconds to wait on the socket. The value is generous because the first
# search for a keyword can block for a minute or two.
REQUEST_TIMEOUT = 300

# Shared opener. The timeout is NOT configured here: OpenerDirector.open()
# only honours the ``timeout`` argument passed to it, so assigning an
# attribute on the opener has no effect.
_opener = urllib.request.build_opener()

# Complete share link: the ID is at least 12 alphanumeric characters.
_FULL_LINK_RE = re.compile(r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{12,}')
# Truncated share link: the ID is 1-11 characters AND ends there. The negative
# lookahead stops the pattern from matching the first 11 characters of a
# complete link.
_TRUNCATED_LINK_RE = re.compile(r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{1,11}(?![a-zA-Z0-9])')
# Anchor pointing at a thread page; group 1 is the text up to the next tag.
_THREAD_TITLE_RE = re.compile(r'<a[^>]+href="forum\.php\?mod=viewthread[^>]+>([^<]+)')


# -- Helpers ---------------------------------------------------------------
def load_cookie():
    """Read COOKIE_FILE and return its contents as a Cookie header value.

    The file holds one ``key=value`` pair per line. Lines are stripped and
    joined with ``'; '``; blank lines are skipped so they do not produce
    empty segments in the header.

    Prints a hint and exits with status 1 when the file does not exist.
    """
    path = COOKIE_FILE
    if not os.path.exists(path):
        print(f'❌ Cookie 文件不存在: {path}')
        print('请通过 Playwright 登录 yunpan1.cc 后,将 Cookie 保存到该文件(一行一个 key=value)')
        sys.exit(1)
    with open(path, 'r', encoding='utf-8') as f:
        pairs = [line.strip() for line in f]
    return '; '.join(pair for pair in pairs if pair)


def request(url, cookie, timeout=REQUEST_TIMEOUT):
    """Fetch ``url`` with the given Cookie header.

    Args:
        url: Absolute URL to fetch.
        cookie: Value for the ``Cookie`` request header.
        timeout: Socket timeout in seconds, passed to the opener.

    Returns:
        A tuple ``(status, html, final_url, elapsed_seconds)``. The body is
        decoded as UTF-8 with undecodable bytes replaced.

    Raises:
        urllib.error.URLError: Propagated from the opener on network or HTTP
            errors.
    """
    req = urllib.request.Request(url, headers={
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    })
    started = time.monotonic()
    # The context manager closes the connection even if read() fails.
    with _opener.open(req, timeout=timeout) as resp:
        html = resp.read().decode('utf-8', errors='replace')
        status = resp.status
        final_url = resp.url
    return status, html, final_url, time.monotonic() - started


def _display_path(path):
    """Return ``path`` relative to the current directory when possible."""
    try:
        return os.path.relpath(path)
    except ValueError:
        # On Windows relpath() fails when the two paths are on different drives.
        return path


# -- Extraction ------------------------------------------------------------
def extract_quark_links(html):
    """Return the sorted, de-duplicated complete Quark links found in ``html``.

    A complete link has an ID of 12 or more alphanumeric characters. A match
    that ends in ``https`` is dropped: it means another URL was glued directly
    onto the ID (``.../s/xxxhttps``), so the captured ID is not trustworthy.
    """
    candidates = _FULL_LINK_RE.findall(html)
    return sorted({link for link in candidates if not link.endswith('https')})


def extract_truncated_links(html):
    """Return the sorted, de-duplicated truncated Quark links in ``html``.

    A truncated link has an ID of 1-11 alphanumeric characters that is not
    followed by another alphanumeric character; the thread has to be opened
    to get the full link.
    """
    return sorted(set(_TRUNCATED_LINK_RE.findall(html)))


def extract_threads(html):
    """Return thread titles found in ``html``, in page order, without duplicates.

    Titles are stripped; titles of 5 characters or fewer are ignored.
    """
    seen = set()
    titles = []
    for raw_title in _THREAD_TITLE_RE.findall(html):
        title = raw_title.strip()
        if len(title) > 5 and title not in seen:
            seen.add(title)
            titles.append(title)
    return titles


# -- Main flow -------------------------------------------------------------
def main():
    """Search for the keyword given on the command line and report the links.

    Complete links are printed and written to OUTPUT_FILE (one per line).
    """
    if len(sys.argv) < 2:
        print('用法: py -X utf8 yunpan1_search.py <关键词>')
        print('示例: py -X utf8 yunpan1_search.py 遮天')
        sys.exit(1)

    keyword = sys.argv[1]
    cookie = load_cookie()

    print(f'🔍 搜索 "{keyword}" ...')
    print('⏱ 首次搜索需建索引,可能等待 1-2 分钟,请耐心等待...')
    # Flush so the progress message is visible during the long wait.
    sys.stdout.flush()

    status, html, final_url, elapsed = request(
        SEARCH_URL.format(keyword=urllib.parse.quote(keyword)), cookie
    )

    links = extract_quark_links(html)
    truncated = extract_truncated_links(html)
    threads = extract_threads(html)

    print(f'\n✅ 完成(耗时 {elapsed:.0f}s,状态码 {status})')
    print(f' HTML: {len(html)} 字符 / {len(threads)} 条帖子')

    if links:
        print(f'\n{"=" * 50}')
        print(f'✅ 完整夸克链接(可直接转存): {len(links)}')
        print(f'{"=" * 50}')
        for link in links:
            print(f' {link}')
        os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.write('\n'.join(links) + '\n')
        print(f'\n💾 链接已保存: {_display_path(OUTPUT_FILE)}')
    else:
        print('\n⚠️ 未找到完整夸克链接')

    # NOTE(review): the source formatting was collapsed, so the original
    # nesting of the two sections below could not be recovered; they are
    # printed whether or not complete links were found - confirm.
    if truncated:
        print(f'\n⚠️ 被截断的链接({len(truncated)} 个,需点进帖子):')
        # Only the first five are shown.
        for link in truncated[:5]:
            print(f' {link}')

    print('\n📌 帖子预览(前 10 条):')
    for title in threads[:10]:
        print(f' • {title[:60]}')


if __name__ == '__main__':
    main()