From 3fced326252d76cb846ea6318ee8068d50db3df4 Mon Sep 17 00:00:00 2001 From: Kaxi <1042864399@qq.com> Date: Sat, 16 May 2026 21:44:37 +0800 Subject: [PATCH] refactor: yunpan1 full Playwright workflow (verified) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 删除 yunpan1_search.py(Python 不稳定) - 删除 tmp/yunpan1_cookies.txt - usage.md: Playwright 完整流程(登录→搜索→回复→获取链接) - 全部步骤经 Playwright 实机验证 Co-Authored-By: Claude Opus 4.6 --- sites/yunpan1/v1/urls.md | 28 +--- sites/yunpan1/v1/usage.md | 105 +++++++++------ sites/yunpan1/v1/yunpan1_search.py | 200 ----------------------------- 3 files changed, 75 insertions(+), 258 deletions(-) delete mode 100644 sites/yunpan1/v1/yunpan1_search.py diff --git a/sites/yunpan1/v1/urls.md b/sites/yunpan1/v1/urls.md index 1fbf27e..2ec6e38 100644 --- a/sites/yunpan1/v1/urls.md +++ b/sites/yunpan1/v1/urls.md @@ -7,28 +7,6 @@ | 站点 | https://yunpan1.cc/ | | 旧版 | https://old.yunpan1.wang | | 类型 | Discuz! 论坛 | -| 登录 | 需要账号 | -| Cookie 有效期 | 约 1 天 | - -## 账号 - -| 项目 | 值 | -|------|-----| -| 邮箱 | 需要向用户索要 | -| 密码 | 需要向用户索要 | - -## Cookie - -成功登录后 Cookie 保存在 `tmp/yunpan1_cookies.txt`,有效期约 1 天。 - -Cookie 过期后需要通过 Playwright 重新登录。登录流程: - -``` -1. 打开 https://yunpan1.cc/member.php?mod=logging&action=login -2. 填写邮箱 + 密码 + 点击登录 -3. 从浏览器上下文中提取 auth、saltkey 等 Cookie -4. 写入 tmp/yunpan1_cookies.txt -``` ## 板块链接 @@ -36,3 +14,9 @@ Cookie 过期后需要通过 Playwright 重新登录。登录流程: |------|-----| | 动漫 | https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3 | | 影视 | https://yunpan1.cc/forum.php?mod=forumdisplay&fid=2 | + +## 搜索 + +``` +https://yunpan1.cc/search.php?mod=forum&srchtxt=关键词&searchsubmit=yes +``` diff --git a/sites/yunpan1/v1/usage.md b/sites/yunpan1/v1/usage.md index 9c4188c..504a188 100644 --- a/sites/yunpan1/v1/usage.md +++ b/sites/yunpan1/v1/usage.md @@ -1,51 +1,84 @@ # yunpan1 — 获取资源 -## Python 脚本搜索(推荐) +使用 Playwright 浏览器完成搜索、查看帖子、回复获取隐藏链接。 -依赖:Python 标准库(无需额外安装) +## 前置检查 -```bash -py -X utf8 sites/yunpan1/v1/yunpan1_search.py <关键词> -``` - -示例: - -```bash -py -X utf8 sites/yunpan1/v1/yunpan1_search.py 遮天 -py -X utf8 sites/yunpan1/v1/yunpan1_search.py 完美世界 -``` - -搜索结果: -- **完整夸克链接**(12 位 ID,可直接转存到夸克网盘) -- **被截断的链接**(部分帖子在搜索结果中截断了链接,需点进帖子查看) -- 链接自动保存到 `tmp/quark_links.txt` - -首次搜索因 Discuz! 后端建索引可能等待 1-2 分钟,同一关键词后续秒回。 - -## Cookie 维护 - -搜索需要登录态,Cookie 保存在 `tmp/yunpan1_cookies.txt`,约 1 天过期。 - -Cookie 过期后通过 Playwright 重新登录获取: +浏览器可能已登录 yunpan1,先检查登录状态: + +```javascript +const text = await page.evaluate(() => document.body.innerText); +const loggedIn = !text.includes('登录发现更多内容'); +``` + +未登录时先登录: ```javascript -// Playwright 登录流程 await page.goto('https://yunpan1.cc/member.php?mod=logging&action=login'); -await page.locator('form[name="login"] input[name="username"]').fill('向用户索要邮箱'); -await page.locator('form[name="login"] input[name="password"]').fill('向用户索要密码'); +await page.locator('form[name="login"] input[name="username"]').fill('账号'); +await page.locator('form[name="login"] input[name="password"]').fill('密码'); await page.locator('form[name="login"] button[name="loginsubmit"]').click(); -// 等待跳转回首页确认登录成功 - -// 提取 Cookie 保存到文件 -const cookies = await page.context().cookies(); -// 关键 Cookie: 2dF6_2132_auth, 2dF6_2132_saltkey, 2dF6_2132_lastvisit ``` -## 直接浏览板块 +## 搜索 + +```javascript +await page.goto('https://yunpan1.cc/search.php?mod=forum&srchtxt=' + encodeURIComponent('关键词') + '&searchsubmit=yes'); +await page.waitForTimeout(3000); + +const data = await page.evaluate(() => { + const text = document.body.innerText; + const links = text.match(/https?:\/\/pan\.quark\.cn\/s\/[a-zA-Z0-9]+/g) || []; + const threads = Array.from(document.querySelectorAll('a[href*="viewthread"]')); + const titles = threads.map(a => ({ title: a.innerText.trim(), url: a.href })); + return { + links: [...new Set(links)], + threads: [...new Map(titles.filter(t => t.title.length > 5).map(t => [t.title, t])).values()] + }; +}); +``` + +## 查看帖子详情 + +截断的链接需打开帖子查看。帖子有回复可见机制时,先回复再刷新: + +```javascript +// 打开帖子 +await page.goto('帖子URL'); +await page.waitForTimeout(2000); + +// 检测是否有隐藏内容 +const hasHidden = await page.evaluate(() => + document.body.innerText.includes('本内容被作者隐藏') +); + +if (hasHidden) { + // 点击回复按钮 + await page.getByRole('link', { name: '回复', exact: true }).first().click(); + await page.waitForTimeout(1000); + // 填写回复内容 + await page.locator('#postmessage').fill('谢谢分享'); + // 提交回复 + await page.locator('button[name="replysubmit"]').first().click(); + await page.waitForTimeout(3000); + // 刷新页面查看完整内容 + await page.goto('帖子URL'); + await page.waitForTimeout(2000); +} + +// 提取夸克链接 +const links = await page.evaluate(() => { + const text = document.body.innerText; + return [...new Set(text.match(/https?:\/\/pan\.quark\.cn\/s\/[a-zA-Z0-9]+/g) || [])]; +}); +``` + +## 板块浏览 + +搜索不可用时直接看动漫板块最新帖子: ``` -动漫:https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3 -影视:https://yunpan1.cc/forum.php?mod=forumdisplay&fid=2 +https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3&orderby=dateline ``` ## 拿到链接后的操作 diff --git a/sites/yunpan1/v1/yunpan1_search.py b/sites/yunpan1/v1/yunpan1_search.py deleted file mode 100644 index fbc0b12..0000000 --- a/sites/yunpan1/v1/yunpan1_search.py +++ /dev/null @@ -1,200 +0,0 @@ -#!python -# -*- coding: utf-8 -*- -""" -yunpan1 搜索工具 — 搜索云盘资源分享社区并提取夸克链接 - -用法: - py -X utf8 yunpan1_search.py <关键词> - py -X utf8 yunpan1_search.py 遮天 - -说明: - - 优先搜索,30 秒超时后自动降级到浏览动漫板块最新帖子 - - Cookie 文件: tmp/yunpan1_cookies.txt(格式:key=value 一行一个 或 Netscape 格式均可) - - 依赖: Python 标准库(无需额外安装) -""" - -import re -import os -import sys -import urllib.request -import urllib.parse -import time - -# ── 配置 ────────────────────────────────────────────────────────── -BASE_URL = 'https://yunpan1.cc' -SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) -COOKIE_FILE = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp', 'yunpan1_cookies.txt')) -OUTPUT_FILE = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp', 'quark_links.txt')) - -SEARCH_URL = BASE_URL + '/search.php?mod=forum&srchtxt={keyword}&searchsubmit=yes' -FORUM_URL = BASE_URL + '/forum.php?mod=forumdisplay&fid=3&orderby=dateline' - -SEARCH_TIMEOUT = 30 # 搜索超时(秒),超时后降级到浏览板块 -FORUM_TIMEOUT = 15 # 板块页面超时 - -_opener = urllib.request.build_opener() -_opener.timeout = SEARCH_TIMEOUT - - -# ── Cookie 加载(兼容两种格式) ────────────────────────────────── - -def load_cookie(): - """加载 Cookie,兼容 key=value 一行一个 和 Netscape 格式""" - path = COOKIE_FILE - if not os.path.exists(path): - print(f'❌ Cookie 文件不存在: {path}') - print(f'请通过 Playwright 登录 yunpan1.cc 后,将 Cookie 保存到该文件') - sys.exit(1) - - with open(path, 'r', encoding='utf-8') as f: - raw = f.read() - - # 如果是 Netscape 格式(含 tab、domain、TRUE/FALSE 等字段) - if '\t' in raw and ('TRUE' in raw or 'FALSE' in raw): - parts = [] - for line in raw.strip().splitlines(): - line = line.strip() - if not line or line.startswith('#'): - continue - cols = line.split('\t') - if len(cols) >= 7: - parts.append(f'{cols[5]}={cols[6]}') - if parts: - return '; '.join(parts) - - # 否则按 key=value 一行一个处理 - return raw.strip().replace('\n', '; ').replace('\r', '') - - -# ── HTTP 请求 ───────────────────────────────────────────────────── - -def request(url, cookie, timeout=SEARCH_TIMEOUT): - req = urllib.request.Request(url, headers={ - 'Cookie': cookie, - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - }) - _opener.timeout = timeout - t0 = time.time() - try: - resp = _opener.open(req) - html = resp.read().decode('utf-8', errors='replace') - return resp.status, html, resp.url, time.time() - t0, None - except Exception as e: - return None, '', '', time.time() - t0, str(e) - - -# ── 提取函数 ────────────────────────────────────────────────────── - -def extract_quark_links(html): - raw = re.findall(r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{12,}', html) - return sorted(set(l for l in raw if not l.endswith('https'))) - - -def extract_truncated_links(html): - raw = re.findall(r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{1,11}', html) - return sorted(set(raw)) - - -def extract_threads(html): - """提取帖子标题,兼容 viewthread 和 thread 格式""" - patterns = [ - r']+href="forum\.php\?mod=viewthread[^>]+>([^<]+)', - r']+href="thread\.php\?tid=\d+[^>]*>([^<]+)', - r'class="s xst"[^>]*>([^<]+)', - r']+class="xst"[^>]*>([^<]+)', - ] - all_titles = [] - for p in patterns: - all_titles.extend(re.findall(p, html)) - - seen = set() - result = [] - for t in all_titles: - t = t.strip() - if t and len(t) > 5 and t not in seen: - seen.add(t) - result.append(t) - return result - - -# ── 主流程 ──────────────────────────────────────────────────────── - -def search(keyword, cookie): - """搜索,超时则返回 None""" - url = SEARCH_URL.format(keyword=urllib.parse.quote(keyword)) - print(f'🔍 搜索 "{keyword}"({SEARCH_TIMEOUT}s 超时)...') - sys.stdout.flush() - return request(url, cookie, timeout=SEARCH_TIMEOUT) - - -def browse_forum(cookie): - """浏览动漫板块最新帖子作为降级方案""" - print(f'⏩ 降级到浏览动漫板块最新帖子...') - sys.stdout.flush() - return request(FORUM_URL, cookie, timeout=FORUM_TIMEOUT) - - -def print_results(links, truncated, threads, source, elapsed): - print(f'\n✅ 完成(来源: {source},耗时 {elapsed:.0f}s)') - - if links: - print(f'\n{"=" * 50}') - print(f'✅ 完整夸克链接(可直接转存): {len(links)}') - print(f'{"=" * 50}') - for l in links: - print(f' {l}') - - os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) - with open(OUTPUT_FILE, 'w', encoding='utf-8') as f: - f.write('\n'.join(links) + '\n') - print(f'\n💾 链接已保存: {os.path.relpath(OUTPUT_FILE)}') - else: - print('\n⚠️ 未找到完整夸克链接') - - if truncated: - print(f'\n⚠️ 被截断的链接({len(truncated)} 个,需点进帖子):') - for l in truncated[:5]: - print(f' {l}') - - if threads: - print(f'\n📌 帖子预览(前 10 条):') - for t in threads[:10]: - print(f' • {t[:60]}') - - -def main(): - if len(sys.argv) < 2: - print('用法: py -X utf8 yunpan1_search.py <关键词>') - print('示例: py -X utf8 yunpan1_search.py 遮天') - sys.exit(1) - - keyword = sys.argv[1] - cookie = load_cookie() - - # 尝试搜索 - status, html, _, elapsed, err = search(keyword, cookie) - - if err and 'timeout' in err.lower(): - print(f'⏱ 搜索超时({elapsed:.0f}s),机器配置较低时首次建索引可能卡死') - # 降级到浏览板块 - status, html, _, elapsed, err2 = browse_forum(cookie) - if err2: - print(f'❌ 降级也失败了: {err2}') - sys.exit(1) - source = '板块浏览(搜索超时降级)' - elif err: - print(f'❌ 搜索失败: {err}') - # 尝试降级 - status, html, _, elapsed, _ = browse_forum(cookie) - source = '板块浏览(搜索失败降级)' - else: - source = '搜索' - - links = extract_quark_links(html) - truncated = extract_truncated_links(html) - threads = extract_threads(html) - print_results(links, truncated, threads, source, elapsed) - - -if __name__ == '__main__': - main()