refactor: yunpan1 full Playwright workflow (verified)
- 删除 yunpan1_search.py(Python 不稳定) - 删除 tmp/yunpan1_cookies.txt - usage.md: Playwright 完整流程(登录→搜索→回复→获取链接) - 全部步骤经 Playwright 实机验证 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,28 +7,6 @@
|
||||
| 站点 | https://yunpan1.cc/ |
|
||||
| 旧版 | https://old.yunpan1.wang |
|
||||
| 类型 | Discuz! 论坛 |
|
||||
| 登录 | 需要账号 |
|
||||
| Cookie 有效期 | 约 1 天 |
|
||||
|
||||
## 账号
|
||||
|
||||
| 项目 | 值 |
|
||||
|------|-----|
|
||||
| 邮箱 | 需要向用户索要 |
|
||||
| 密码 | 需要向用户索要 |
|
||||
|
||||
## Cookie
|
||||
|
||||
成功登录后 Cookie 保存在 `tmp/yunpan1_cookies.txt`,有效期约 1 天。
|
||||
|
||||
Cookie 过期后需要通过 Playwright 重新登录。登录流程:
|
||||
|
||||
```
|
||||
1. 打开 https://yunpan1.cc/member.php?mod=logging&action=login
|
||||
2. 填写邮箱 + 密码 + 点击登录
|
||||
3. 从浏览器上下文中提取 auth、saltkey 等 Cookie
|
||||
4. 写入 tmp/yunpan1_cookies.txt
|
||||
```
|
||||
|
||||
## 板块链接
|
||||
|
||||
@@ -36,3 +14,9 @@ Cookie 过期后需要通过 Playwright 重新登录。登录流程:
|
||||
|------|-----|
|
||||
| 动漫 | https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3 |
|
||||
| 影视 | https://yunpan1.cc/forum.php?mod=forumdisplay&fid=2 |
|
||||
|
||||
## 搜索
|
||||
|
||||
```
|
||||
https://yunpan1.cc/search.php?mod=forum&srchtxt=关键词&searchsubmit=yes
|
||||
```
|
||||
|
||||
+69
-36
@@ -1,51 +1,84 @@
|
||||
# yunpan1 — 获取资源
|
||||
|
||||
## Python 脚本搜索(推荐)
|
||||
使用 Playwright 浏览器完成搜索、查看帖子、回复获取隐藏链接。
|
||||
|
||||
依赖:Python 标准库(无需额外安装)
|
||||
## 前置检查
|
||||
|
||||
```bash
|
||||
py -X utf8 sites/yunpan1/v1/yunpan1_search.py <关键词>
|
||||
```
|
||||
|
||||
示例:
|
||||
|
||||
```bash
|
||||
py -X utf8 sites/yunpan1/v1/yunpan1_search.py 遮天
|
||||
py -X utf8 sites/yunpan1/v1/yunpan1_search.py 完美世界
|
||||
```
|
||||
|
||||
搜索结果:
|
||||
- **完整夸克链接**(12 位 ID,可直接转存到夸克网盘)
|
||||
- **被截断的链接**(部分帖子在搜索结果中截断了链接,需点进帖子查看)
|
||||
- 链接自动保存到 `tmp/quark_links.txt`
|
||||
|
||||
首次搜索因 Discuz! 后端建索引可能等待 1-2 分钟,同一关键词后续秒回。
|
||||
|
||||
## Cookie 维护
|
||||
|
||||
搜索需要登录态,Cookie 保存在 `tmp/yunpan1_cookies.txt`,约 1 天过期。
|
||||
|
||||
Cookie 过期后通过 Playwright 重新登录获取:
|
||||
浏览器可能已登录 yunpan1,先检查登录状态:
|
||||
|
||||
```javascript
|
||||
const text = await page.evaluate(() => document.body.innerText);
|
||||
const loggedIn = !text.includes('登录发现更多内容');
|
||||
```
|
||||
|
||||
未登录时先登录:
|
||||
|
||||
```javascript
|
||||
// Playwright 登录流程
|
||||
await page.goto('https://yunpan1.cc/member.php?mod=logging&action=login');
|
||||
await page.locator('form[name="login"] input[name="username"]').fill('向用户索要邮箱');
|
||||
await page.locator('form[name="login"] input[name="password"]').fill('向用户索要密码');
|
||||
await page.locator('form[name="login"] input[name="username"]').fill('账号');
|
||||
await page.locator('form[name="login"] input[name="password"]').fill('密码');
|
||||
await page.locator('form[name="login"] button[name="loginsubmit"]').click();
|
||||
// 等待跳转回首页确认登录成功
|
||||
|
||||
// 提取 Cookie 保存到文件
|
||||
const cookies = await page.context().cookies();
|
||||
// 关键 Cookie: 2dF6_2132_auth, 2dF6_2132_saltkey, 2dF6_2132_lastvisit
|
||||
```
|
||||
|
||||
## 直接浏览板块
|
||||
## 搜索
|
||||
|
||||
```javascript
|
||||
await page.goto('https://yunpan1.cc/search.php?mod=forum&srchtxt=' + encodeURIComponent('关键词') + '&searchsubmit=yes');
|
||||
await page.waitForTimeout(3000);
|
||||
|
||||
const data = await page.evaluate(() => {
|
||||
const text = document.body.innerText;
|
||||
const links = text.match(/https?:\/\/pan\.quark\.cn\/s\/[a-zA-Z0-9]+/g) || [];
|
||||
const threads = Array.from(document.querySelectorAll('a[href*="viewthread"]'));
|
||||
const titles = threads.map(a => ({ title: a.innerText.trim(), url: a.href }));
|
||||
return {
|
||||
links: [...new Set(links)],
|
||||
threads: [...new Map(titles.filter(t => t.title.length > 5).map(t => [t.title, t])).values()]
|
||||
};
|
||||
});
|
||||
```
|
||||
|
||||
## 查看帖子详情
|
||||
|
||||
截断的链接需打开帖子查看。帖子有回复可见机制时,先回复再刷新:
|
||||
|
||||
```javascript
|
||||
// 打开帖子
|
||||
await page.goto('帖子URL');
|
||||
await page.waitForTimeout(2000);
|
||||
|
||||
// 检测是否有隐藏内容
|
||||
const hasHidden = await page.evaluate(() =>
|
||||
document.body.innerText.includes('本内容被作者隐藏')
|
||||
);
|
||||
|
||||
if (hasHidden) {
|
||||
// 点击回复按钮
|
||||
await page.getByRole('link', { name: '回复', exact: true }).first().click();
|
||||
await page.waitForTimeout(1000);
|
||||
// 填写回复内容
|
||||
await page.locator('#postmessage').fill('谢谢分享');
|
||||
// 提交回复
|
||||
await page.locator('button[name="replysubmit"]').first().click();
|
||||
await page.waitForTimeout(3000);
|
||||
// 刷新页面查看完整内容
|
||||
await page.goto('帖子URL');
|
||||
await page.waitForTimeout(2000);
|
||||
}
|
||||
|
||||
// 提取夸克链接
|
||||
const links = await page.evaluate(() => {
|
||||
const text = document.body.innerText;
|
||||
return [...new Set(text.match(/https?:\/\/pan\.quark\.cn\/s\/[a-zA-Z0-9]+/g) || [])];
|
||||
});
|
||||
```
|
||||
|
||||
## 板块浏览
|
||||
|
||||
搜索不可用时直接看动漫板块最新帖子:
|
||||
|
||||
```
|
||||
动漫:https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3
|
||||
影视:https://yunpan1.cc/forum.php?mod=forumdisplay&fid=2
|
||||
https://yunpan1.cc/forum.php?mod=forumdisplay&fid=3&orderby=dateline
|
||||
```
|
||||
|
||||
## 拿到链接后的操作
|
||||
|
||||
@@ -1,200 +0,0 @@
|
||||
#!python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
yunpan1 搜索工具 — 搜索云盘资源分享社区并提取夸克链接
|
||||
|
||||
用法:
|
||||
py -X utf8 yunpan1_search.py <关键词>
|
||||
py -X utf8 yunpan1_search.py 遮天
|
||||
|
||||
说明:
|
||||
- 优先搜索,30 秒超时后自动降级到浏览动漫板块最新帖子
|
||||
- Cookie 文件: tmp/yunpan1_cookies.txt(格式:key=value 一行一个 或 Netscape 格式均可)
|
||||
- 依赖: Python 标准库(无需额外安装)
|
||||
"""
|
||||
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import time
|
||||
|
||||
# ── 配置 ──────────────────────────────────────────────────────────
|
||||
BASE_URL = 'https://yunpan1.cc'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Both scratch files live in the tmp/ directory three levels above this script.
_TMP_DIR = os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp')
COOKIE_FILE = os.path.abspath(os.path.join(_TMP_DIR, 'yunpan1_cookies.txt'))
OUTPUT_FILE = os.path.abspath(os.path.join(_TMP_DIR, 'quark_links.txt'))

# ``{keyword}`` is filled in later with the URL-quoted search term.
SEARCH_URL = f'{BASE_URL}/search.php?mod=forum&srchtxt={{keyword}}&searchsubmit=yes'
FORUM_URL = f'{BASE_URL}/forum.php?mod=forumdisplay&fid=3&orderby=dateline'

SEARCH_TIMEOUT = 30  # seconds allowed for a search before falling back to the board listing
FORUM_TIMEOUT = 15   # seconds allowed for the board listing page

_opener = urllib.request.build_opener()
# NOTE(review): urllib does not read this attribute; a timeout only takes
# effect when passed to ``_opener.open(..., timeout=...)``.
_opener.timeout = SEARCH_TIMEOUT
|
||||
|
||||
|
||||
# ── Cookie 加载(兼容两种格式) ──────────────────────────────────
|
||||
|
||||
def load_cookie():
    """Read the saved cookie file and return it as a ``Cookie`` header value.

    Two on-disk layouts are accepted:

    * Netscape ``cookies.txt`` rows (at least seven tab-separated columns);
      columns 6 and 7 are joined as ``name=value``.
    * Plain ``key=value`` text, one pair per line.

    Returns:
        str: the cookie pairs joined with ``'; '``.

    Exits the process with status 1, after printing a hint, when
    ``COOKIE_FILE`` does not exist.
    """
    path = COOKIE_FILE
    if not os.path.exists(path):
        print(f'❌ Cookie 文件不存在: {path}')
        print('请通过 Playwright 登录 yunpan1.cc 后,将 Cookie 保存到该文件')
        sys.exit(1)

    with open(path, 'r', encoding='utf-8') as f:
        raw = f.read()

    # Heuristic for the Netscape layout: tab-separated rows with TRUE/FALSE flags.
    if '\t' in raw and ('TRUE' in raw or 'FALSE' in raw):
        http_only_prefix = '#HttpOnly_'
        pairs = []
        for line in raw.splitlines():
            line = line.strip()
            # Exporters mark HttpOnly cookies by prefixing the row with
            # "#HttpOnly_"; such rows are real cookies, not comments, so the
            # prefix is removed before the comment check below.
            if line.startswith(http_only_prefix):
                line = line[len(http_only_prefix):]
            if not line or line.startswith('#'):
                continue
            cols = line.split('\t')
            if len(cols) >= 7:
                pairs.append(f'{cols[5]}={cols[6]}')
        if pairs:
            return '; '.join(pairs)

    # Otherwise treat the file as one key=value pair per line. Blank lines are
    # skipped so they do not become empty "; ;" segments in the header.
    stripped_lines = (line.strip() for line in raw.splitlines())
    return '; '.join(line for line in stripped_lines if line)
|
||||
|
||||
|
||||
# ── HTTP 请求 ─────────────────────────────────────────────────────
|
||||
|
||||
def request(url, cookie, timeout=SEARCH_TIMEOUT):
    """Fetch ``url`` with the given cookie header and report the outcome.

    Args:
        url: absolute URL to GET.
        cookie: value sent verbatim as the ``Cookie`` request header.
        timeout: socket timeout in seconds for this request.

    Returns:
        A 5-tuple ``(status, html, final_url, elapsed_seconds, error)``.
        On success ``error`` is ``None``; on any failure ``status`` is
        ``None``, ``html`` and ``final_url`` are empty strings and ``error``
        is ``str(exception)``.
    """
    req = urllib.request.Request(url, headers={
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    })
    started = time.time()
    try:
        # The timeout has to be passed to open(); setting an attribute on the
        # opener object is ignored by urllib and leaves the request unbounded.
        with _opener.open(req, timeout=timeout) as resp:
            html = resp.read().decode('utf-8', errors='replace')
            return resp.status, html, resp.url, time.time() - started, None
    except Exception as e:
        # Deliberately broad: callers expect failures as an error string in
        # the returned tuple rather than as a raised exception.
        return None, '', '', time.time() - started, str(e)
|
||||
|
||||
|
||||
# ── 提取函数 ──────────────────────────────────────────────────────
|
||||
|
||||
def extract_quark_links(html):
    """Return the sorted, de-duplicated complete Quark share links in ``html``.

    A link counts as complete when its share ID has at least 12 alphanumeric
    characters. The ID stops before an ``http://`` or ``https://`` that is
    glued directly onto it, so back-to-back links in the page text are
    returned as separate, clean links instead of being dropped or returned
    with a trailing ``http``/``https``.
    """
    pattern = r'https?://pan\.quark\.cn/s/(?:(?!https?://)[a-zA-Z0-9]){12,}'
    return sorted(set(re.findall(pattern, html)))
|
||||
|
||||
|
||||
def extract_truncated_links(html):
    """Return the sorted, de-duplicated truncated Quark share links in ``html``.

    A link is truncated when its share ID has between 1 and 11 alphanumeric
    characters. The negative lookahead requires the ID to actually end
    there; without it the first 11 characters of every complete ID would
    also match and each full link would be reported as truncated.
    """
    pattern = r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{1,11}(?![a-zA-Z0-9])'
    return sorted(set(re.findall(pattern, html)))
|
||||
|
||||
|
||||
def extract_threads(html):
    """Collect thread titles from forum or search-result HTML.

    Four anchor shapes are recognised: ``forum.php?mod=viewthread`` links,
    ``thread.php?tid=N`` links, and anchors carrying the ``s xst`` or ``xst``
    class. HTML entities in a title are decoded before it is stripped.

    Returns:
        list[str]: unique titles longer than 5 characters, in first-seen
        order (all matches of the first pattern, then the second, and so on).
    """
    # Imported by name because the parameter ``html`` shadows the module.
    from html import unescape

    patterns = (
        r'<a[^>]+href="forum\.php\?mod=viewthread[^>]+>([^<]+)</a>',
        r'<a[^>]+href="thread\.php\?tid=\d+[^>]*>([^<]+)</a>',
        r'class="s xst"[^>]*>([^<]+)</a>',
        r'<a[^>]+class="xst"[^>]*>([^<]+)</a>',
    )

    # A dict keeps insertion order, giving ordered de-duplication.
    titles = {}
    for pattern in patterns:
        for raw_title in re.findall(pattern, html):
            # Decode entities such as &amp; so titles read as displayed and
            # the length filter counts visible characters.
            title = unescape(raw_title).strip()
            if len(title) > 5:
                titles.setdefault(title, None)
    return list(titles)
|
||||
|
||||
|
||||
# ── 主流程 ────────────────────────────────────────────────────────
|
||||
|
||||
def search(keyword, cookie):
    """Run a forum search for ``keyword`` using the given cookie header.

    The keyword is URL-quoted into ``SEARCH_URL`` and fetched with a
    ``SEARCH_TIMEOUT``-second timeout.

    Returns:
        Whatever ``request()`` returns for the search URL: the tuple
        ``(status, html, final_url, elapsed_seconds, error)``.
    """
    quoted_keyword = urllib.parse.quote(keyword)
    target = SEARCH_URL.format(keyword=quoted_keyword)
    print(f'🔍 搜索 "{keyword}"({SEARCH_TIMEOUT}s 超时)...')
    # Flush so the progress line shows before the blocking request starts.
    sys.stdout.flush()
    return request(target, cookie, timeout=SEARCH_TIMEOUT)
|
||||
|
||||
|
||||
def browse_forum(cookie):
    """Fetch the board listing at ``FORUM_URL`` as a fallback for search.

    Returns:
        Whatever ``request()`` returns for ``FORUM_URL`` with a
        ``FORUM_TIMEOUT``-second timeout: the tuple
        ``(status, html, final_url, elapsed_seconds, error)``.
    """
    print('⏩ 降级到浏览动漫板块最新帖子...')
    # Flush so the notice shows before the blocking request starts.
    sys.stdout.flush()
    outcome = request(FORUM_URL, cookie, timeout=FORUM_TIMEOUT)
    return outcome
|
||||
|
||||
|
||||
def print_results(links, truncated, threads, source, elapsed):
    """Print a summary of the extraction and save complete links to disk.

    Args:
        links: complete Quark links; when non-empty they are printed and
            written to ``OUTPUT_FILE`` one per line (the file is overwritten).
        truncated: truncated links; at most the first 5 are printed.
        threads: thread titles; at most the first 10 are printed, each cut
            to 60 characters.
        source: label describing where the HTML came from.
        elapsed: seconds the fetch took, printed rounded to whole seconds.
    """
    print(f'\n✅ 完成(来源: {source},耗时 {elapsed:.0f}s)')

    if links:
        banner = '=' * 50
        print(f'\n{banner}')
        print(f'✅ 完整夸克链接(可直接转存): {len(links)}')
        print(banner)
        for link in links:
            print(f' {link}')

        os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.write('\n'.join(links) + '\n')

        # relpath raises ValueError on Windows when the file and the current
        # directory are on different drives; show the absolute path then
        # instead of crashing after the file was already written.
        try:
            shown_path = os.path.relpath(OUTPUT_FILE)
        except ValueError:
            shown_path = OUTPUT_FILE
        print(f'\n💾 链接已保存: {shown_path}')
    else:
        print('\n⚠️ 未找到完整夸克链接')

    if truncated:
        print(f'\n⚠️ 被截断的链接({len(truncated)} 个,需点进帖子):')
        for link in truncated[:5]:
            print(f' {link}')

    if threads:
        print(f'\n📌 帖子预览(前 10 条):')
        for title in threads[:10]:
            print(f' • {title[:60]}')
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: search for a keyword and print Quark links.

    Reads the keyword from ``sys.argv[1]``, loads the cookie, and runs the
    search. If the search fails for any reason it falls back to the board
    listing. Exits with status 1 when no keyword is given or when the
    fallback request fails as well.
    """
    if len(sys.argv) < 2:
        print('用法: py -X utf8 yunpan1_search.py <关键词>')
        print('示例: py -X utf8 yunpan1_search.py 遮天')
        sys.exit(1)

    keyword = sys.argv[1]
    cookie = load_cookie()

    _, html, _, elapsed, err = search(keyword, cookie)

    if err:
        lowered = err.lower()
        # Socket timeouts stringify as "timed out" (e.g. "<urlopen error
        # timed out>"), which does not contain "timeout"; check both.
        if 'timeout' in lowered or 'timed out' in lowered:
            print(f'⏱ 搜索超时({elapsed:.0f}s),机器配置较低时首次建索引可能卡死')
            source = '板块浏览(搜索超时降级)'
        else:
            print(f'❌ 搜索失败: {err}')
            source = '板块浏览(搜索失败降级)'

        # Fall back to the board listing. A failed fallback is fatal in both
        # cases, so an empty page is never reported as a successful run.
        _, html, _, elapsed, err2 = browse_forum(cookie)
        if err2:
            print(f'❌ 降级也失败了: {err2}')
            sys.exit(1)
    else:
        source = '搜索'

    links = extract_quark_links(html)
    truncated = extract_truncated_links(html)
    threads = extract_threads(html)
    print_results(links, truncated, threads, source, elapsed)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user