refactor: yunpan1 full Playwright workflow (verified)

- 删除 yunpan1_search.py（Python 不稳定）
- 删除 tmp/yunpan1_cookies.txt
- usage.md: Playwright 完整流程（登录→搜索→回复→获取链接）
- 全部步骤经 Playwright 实机验证

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-16 21:44:37 +08:00
parent 01dbdc8455
commit 3fced32625
3 changed files with 75 additions and 258 deletions
@@ -7,28 +7,6 @@
 | 站点 | https://yunpan1.cc/ |
 | 旧版 | https://old.yunpan1.wang |
 | 类型 | Discuz! 论坛 |
-| 登录 | 需要账号 |
-| Cookie 有效期 | 约 1 天 |
-
-## 账号
-
-| 项目 | 值 |
-|------|-----|
-| 邮箱 | 需要向用户索要 |
-| 密码 | 需要向用户索要 |
-
-## Cookie
-
-成功登录后 Cookie 保存在 `tmp/yunpan1_cookies.txt`，有效期约 1 天。
-
-Cookie 过期后需要通过 Playwright 重新登录。登录流程：
-
-```
-1. 打开 https://yunpan1.cc/member.php?mod=logging&action=login
-2. 填写邮箱 + 密码 + 点击登录
-3. 从浏览器上下文中提取 auth、saltkey 等 Cookie
-4. 写入 tmp/yunpan1_cookies.txt
-```

 ## 板块链接

@@ -36,3 +14,9 @@ Cookie 过期后需要通过 Playwright 重新登录。登录流程：
 |------|-----|
 | 动漫 | https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3 |
 | 影视 | https://yunpan1.cc/forum.php?mod=forumdisplay&fid=2 |
+
+## 搜索
+
+```
+https://yunpan1.cc/search.php?mod=forum&srchtxt=关键词&searchsubmit=yes
+```
@@ -1,51 +1,84 @@
 # yunpan1 — 获取资源

-## Python 脚本搜索（推荐）
+使用 Playwright 浏览器完成搜索、查看帖子、回复获取隐藏链接。

-依赖：Python 标准库（无需额外安装）
+## 前置检查

-```bash
-py -X utf8 sites/yunpan1/v1/yunpan1_search.py <关键词>
-```
-
-示例：
-
-```bash
-py -X utf8 sites/yunpan1/v1/yunpan1_search.py 遮天
-py -X utf8 sites/yunpan1/v1/yunpan1_search.py 完美世界
-```
-
-搜索结果：
- **完整夸克链接**（12 位 ID，可直接转存到夸克网盘）
- **被截断的链接**（部分帖子在搜索结果中截断了链接，需点进帖子查看）
- 链接自动保存到 `tmp/quark_links.txt`
-
-首次搜索因 Discuz! 后端建索引可能等待 1-2 分钟，同一关键词后续秒回。
-
-## Cookie 维护
-
-搜索需要登录态，Cookie 保存在 `tmp/yunpan1_cookies.txt`，约 1 天过期。
-
-Cookie 过期后通过 Playwright 重新登录获取：
+浏览器可能已登录 yunpan1，先检查登录状态：
+
+```javascript
+const text = await page.evaluate(() => document.body.innerText);
+const loggedIn = !text.includes('登录发现更多内容');
+```
+
+未登录时先登录：

 ```javascript
-// Playwright 登录流程
 await page.goto('https://yunpan1.cc/member.php?mod=logging&action=login');
-await page.locator('form[name="login"] input[name="username"]').fill('向用户索要邮箱');
-await page.locator('form[name="login"] input[name="password"]').fill('向用户索要密码');
+await page.locator('form[name="login"] input[name="username"]').fill('账号');
+await page.locator('form[name="login"] input[name="password"]').fill('密码');
 await page.locator('form[name="login"] button[name="loginsubmit"]').click();
-// 等待跳转回首页确认登录成功
-
-// 提取 Cookie 保存到文件
-const cookies = await page.context().cookies();
-// 关键 Cookie: 2dF6_2132_auth, 2dF6_2132_saltkey, 2dF6_2132_lastvisit
 ```

-## 直接浏览板块
+## 搜索
+
+```javascript
+await page.goto('https://yunpan1.cc/search.php?mod=forum&srchtxt=' + encodeURIComponent('关键词') + '&searchsubmit=yes');
+await page.waitForTimeout(3000);
+
+const data = await page.evaluate(() => {
+  const text = document.body.innerText;
+  const links = text.match(/https?:\/\/pan\.quark\.cn\/s\/[a-zA-Z0-9]+/g) || [];
+  const threads = Array.from(document.querySelectorAll('a[href*="viewthread"]'));
+  const titles = threads.map(a => ({ title: a.innerText.trim(), url: a.href }));
+  return {
+    links: [...new Set(links)],
+    threads: [...new Map(titles.filter(t => t.title.length > 5).map(t => [t.title, t])).values()]
+  };
+});
+```
+
+## 查看帖子详情
+
+搜索结果中被截断的链接需打开帖子查看。帖子设置了“回复可见”时，需先回复，再重新打开帖子：
+
+```javascript
+// 打开帖子
+await page.goto('帖子URL');
+await page.waitForTimeout(2000);
+
+// 检测是否有隐藏内容
+const hasHidden = await page.evaluate(() =>
+  document.body.innerText.includes('本内容被作者隐藏')
+);
+
+if (hasHidden) {
+  // 点击回复按钮
+  await page.getByRole('link', { name: '回复', exact: true }).first().click();
+  await page.waitForTimeout(1000);
+  // 填写回复内容
+  await page.locator('#postmessage').fill('谢谢分享');
+  // 提交回复
+  await page.locator('button[name="replysubmit"]').first().click();
+  await page.waitForTimeout(3000);
+  // 刷新页面查看完整内容
+  await page.goto('帖子URL');
+  await page.waitForTimeout(2000);
+}
+
+// 提取夸克链接
+const links = await page.evaluate(() => {
+  const text = document.body.innerText;
+  return [...new Set(text.match(/https?:\/\/pan\.quark\.cn\/s\/[a-zA-Z0-9]+/g) || [])];
+});
+```
+
+## 板块浏览
+
+搜索不可用时直接看动漫板块最新帖子：

 ```
-动漫：https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3
-影视：https://yunpan1.cc/forum.php?mod=forumdisplay&fid=2
+https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3&orderby=dateline
 ```

 ## 拿到链接后的操作
@@ -1,200 +0,0 @@
-#!python
-# -*- coding: utf-8 -*-
-"""
-yunpan1 搜索工具 — 搜索云盘资源分享社区并提取夸克链接
-
-用法：
-  py -X utf8 yunpan1_search.py <关键词>
-  py -X utf8 yunpan1_search.py 遮天
-
-说明：
-  - 优先搜索，30 秒超时后自动降级到浏览动漫板块最新帖子
-  - Cookie 文件: tmp/yunpan1_cookies.txt（格式：key=value 一行一个 或 Netscape 格式均可）
-  - 依赖: Python 标准库（无需额外安装）
-"""
-
-import re
-import os
-import sys
-import urllib.request
-import urllib.parse
-import time
-
-# ── 配置 ──────────────────────────────────────────────────────────
-BASE_URL = 'https://yunpan1.cc'
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-COOKIE_FILE = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp', 'yunpan1_cookies.txt'))
-OUTPUT_FILE = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp', 'quark_links.txt'))
-
-SEARCH_URL = BASE_URL + '/search.php?mod=forum&srchtxt={keyword}&searchsubmit=yes'
-FORUM_URL = BASE_URL + '/forum.php?mod=forumdisplay&fid=3&orderby=dateline'
-
-SEARCH_TIMEOUT = 30     # 搜索超时（秒），超时后降级到浏览板块
-FORUM_TIMEOUT = 15      # 板块页面超时
-
-_opener = urllib.request.build_opener()
-_opener.timeout = SEARCH_TIMEOUT
-
-
-# ── Cookie 加载（兼容两种格式） ──────────────────────────────────
-
-def load_cookie():
-    """加载 Cookie，兼容 key=value 一行一个 和 Netscape 格式"""
-    path = COOKIE_FILE
-    if not os.path.exists(path):
-        print(f'❌ Cookie 文件不存在: {path}')
-        print(f'请通过 Playwright 登录 yunpan1.cc 后，将 Cookie 保存到该文件')
-        sys.exit(1)
-
-    with open(path, 'r', encoding='utf-8') as f:
-        raw = f.read()
-
-    # 如果是 Netscape 格式（含 tab、domain、TRUE/FALSE 等字段）
-    if '\t' in raw and ('TRUE' in raw or 'FALSE' in raw):
-        parts = []
-        for line in raw.strip().splitlines():
-            line = line.strip()
-            if not line or line.startswith('#'):
-                continue
-            cols = line.split('\t')
-            if len(cols) >= 7:
-                parts.append(f'{cols[5]}={cols[6]}')
-        if parts:
-            return '; '.join(parts)
-
-    # 否则按 key=value 一行一个处理
-    return raw.strip().replace('\n', '; ').replace('\r', '')
-
-
-# ── HTTP 请求 ─────────────────────────────────────────────────────
-
-def request(url, cookie, timeout=SEARCH_TIMEOUT):
-    req = urllib.request.Request(url, headers={
-        'Cookie': cookie,
-        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
-    })
-    _opener.timeout = timeout
-    t0 = time.time()
-    try:
-        resp = _opener.open(req)
-        html = resp.read().decode('utf-8', errors='replace')
-        return resp.status, html, resp.url, time.time() - t0, None
-    except Exception as e:
-        return None, '', '', time.time() - t0, str(e)
-
-
-# ── 提取函数 ──────────────────────────────────────────────────────
-
-def extract_quark_links(html):
-    raw = re.findall(r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{12,}', html)
-    return sorted(set(l for l in raw if not l.endswith('https')))
-
-
-def extract_truncated_links(html):
-    raw = re.findall(r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{1,11}', html)
-    return sorted(set(raw))
-
-
-def extract_threads(html):
-    """提取帖子标题，兼容 viewthread 和 thread 格式"""
-    patterns = [
-        r'<a[^>]+href="forum\.php\?mod=viewthread[^>]+>([^<]+)</a>',
-        r'<a[^>]+href="thread\.php\?tid=\d+[^>]*>([^<]+)</a>',
-        r'class="s xst"[^>]*>([^<]+)</a>',
-        r'<a[^>]+class="xst"[^>]*>([^<]+)</a>',
-    ]
-    all_titles = []
-    for p in patterns:
-        all_titles.extend(re.findall(p, html))
-
-    seen = set()
-    result = []
-    for t in all_titles:
-        t = t.strip()
-        if t and len(t) > 5 and t not in seen:
-            seen.add(t)
-            result.append(t)
-    return result
-
-
-# ── 主流程 ────────────────────────────────────────────────────────
-
-def search(keyword, cookie):
-    """搜索，超时则返回 None"""
-    url = SEARCH_URL.format(keyword=urllib.parse.quote(keyword))
-    print(f'🔍 搜索 "{keyword}"（{SEARCH_TIMEOUT}s 超时）...')
-    sys.stdout.flush()
-    return request(url, cookie, timeout=SEARCH_TIMEOUT)
-
-
-def browse_forum(cookie):
-    """浏览动漫板块最新帖子作为降级方案"""
-    print(f'⏩ 降级到浏览动漫板块最新帖子...')
-    sys.stdout.flush()
-    return request(FORUM_URL, cookie, timeout=FORUM_TIMEOUT)
-
-
-def print_results(links, truncated, threads, source, elapsed):
-    print(f'\n✅ 完成（来源: {source}，耗时 {elapsed:.0f}s）')
-
-    if links:
-        print(f'\n{"=" * 50}')
-        print(f'✅ 完整夸克链接（可直接转存）: {len(links)}')
-        print(f'{"=" * 50}')
-        for l in links:
-            print(f'   {l}')
-
-        os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
-        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
-            f.write('\n'.join(links) + '\n')
-        print(f'\n💾 链接已保存: {os.path.relpath(OUTPUT_FILE)}')
-    else:
-        print('\n⚠️  未找到完整夸克链接')
-
-    if truncated:
-        print(f'\n⚠️  被截断的链接（{len(truncated)} 个，需点进帖子）:')
-        for l in truncated[:5]:
-            print(f'   {l}')
-
-    if threads:
-        print(f'\n📌 帖子预览（前 10 条）:')
-        for t in threads[:10]:
-            print(f'   • {t[:60]}')
-
-
-def main():
-    if len(sys.argv) < 2:
-        print('用法: py -X utf8 yunpan1_search.py <关键词>')
-        print('示例: py -X utf8 yunpan1_search.py 遮天')
-        sys.exit(1)
-
-    keyword = sys.argv[1]
-    cookie = load_cookie()
-
-    # 尝试搜索
-    status, html, _, elapsed, err = search(keyword, cookie)
-
-    if err and 'timeout' in err.lower():
-        print(f'⏱  搜索超时（{elapsed:.0f}s），机器配置较低时首次建索引可能卡死')
-        # 降级到浏览板块
-        status, html, _, elapsed, err2 = browse_forum(cookie)
-        if err2:
-            print(f'❌ 降级也失败了: {err2}')
-            sys.exit(1)
-        source = '板块浏览（搜索超时降级）'
-    elif err:
-        print(f'❌ 搜索失败: {err}')
-        # 尝试降级
-        status, html, _, elapsed, _ = browse_forum(cookie)
-        source = '板块浏览（搜索失败降级）'
-    else:
-        source = '搜索'
-
-    links = extract_quark_links(html)
-    truncated = extract_truncated_links(html)
-    threads = extract_threads(html)
-    print_results(links, truncated, threads, source, elapsed)
-
-
-if __name__ == '__main__':
-    main()