feat: add yunpan1 search source
- sites/yunpan1/v1/: 新增云盘资源分享社区搜索源 - intro.md: 论坛介绍、板块列表 - urls.md: 站点链接、Cookie 维护说明 - usage.md: 搜索脚本使用、登录流程 - yunpan1_search.py: Python 搜索脚本(标准库零依赖) - .gitignore: 追加 .idea/ __pycache__/ *.pyc Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,138 @@
|
||||
#!python
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
yunpan1 搜索工具 — 搜索云盘资源分享社区并提取夸克链接
|
||||
|
||||
用法:
|
||||
py -X utf8 yunpan1_search.py <关键词>
|
||||
py -X utf8 yunpan1_search.py 遮天
|
||||
|
||||
说明:
|
||||
- Discuz! 首次搜索需建索引(可能等 60-120 秒),同一关键词后续秒回
|
||||
- Cookie 文件: tmp/yunpan1_cookies.txt(需先通过 Playwright 登录获取)
|
||||
- 依赖: Python 标准库(无需额外安装)
|
||||
"""
|
||||
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import time
|
||||
|
||||
# ── 配置 ──────────────────────────────────────────────────────────
|
||||
# Root of the target forum and the search endpoint derived from it.
# ``{keyword}`` is filled in with the URL-quoted search term.
BASE_URL = 'https://yunpan1.cc'
SEARCH_URL = BASE_URL + '/search.php?mod=forum&srchtxt={keyword}&searchsubmit=yes'

# Runtime files live in the "tmp" directory three levels above this script.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_TMP_DIR = os.path.abspath(os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp'))
COOKIE_FILE = os.path.join(_TMP_DIR, 'yunpan1_cookies.txt')
OUTPUT_FILE = os.path.join(_TMP_DIR, 'quark_links.txt')

# Shared opener. The 300-second value is meant to keep slow chunked
# responses from being cut off.
# NOTE(review): OpenerDirector.open() takes its timeout as a call argument
# and does not read this attribute by itself, so the value only takes
# effect when the caller passes it on explicitly — confirm at call sites.
_opener = urllib.request.build_opener()
_opener.timeout = 300
|
||||
|
||||
|
||||
# ── 工具函数 ──────────────────────────────────────────────────────
|
||||
|
||||
def load_cookie():
    """Read the saved login cookies and return them as a ``Cookie`` header value.

    The file at ``COOKIE_FILE`` holds one ``key=value`` pair per line. Each
    line is stripped and blank lines are skipped before the pairs are joined
    with ``'; '``, so stray whitespace or empty lines cannot produce empty
    fragments such as ``'; ; '`` in the header.

    Returns:
        str: The cookie pairs joined with ``'; '`` (empty if the file has no
        non-blank lines).

    If the file does not exist, instructions are printed and the process
    exits with status 1.
    """
    path = COOKIE_FILE
    try:
        # utf-8-sig also reads plain UTF-8, but drops a leading BOM that
        # would otherwise become part of the first cookie name.
        with open(path, 'r', encoding='utf-8-sig') as f:
            lines = f.read().splitlines()
    except FileNotFoundError:
        print(f'❌ Cookie 文件不存在: {path}')
        print('请通过 Playwright 登录 yunpan1.cc 后,将 Cookie 保存到该文件(一行一个 key=value)')
        sys.exit(1)

    pairs = (line.strip() for line in lines)
    return '; '.join(pair for pair in pairs if pair)
|
||||
|
||||
|
||||
def request(url, cookie, timeout=None):
    """Fetch *url* with the login cookie and return the decoded response.

    Args:
        url: Absolute URL to request.
        cookie: Value sent in the ``Cookie`` request header.
        timeout: Socket timeout in seconds. When omitted, the ``timeout``
            attribute stored on the shared opener is used, falling back to
            300 if that attribute is absent.

    Returns:
        tuple: ``(status, html, final_url, elapsed)`` — the HTTP status code,
        the body decoded as UTF-8 (undecodable bytes replaced), the response
        URL reported by the opener, and the elapsed time in seconds.

    Errors raised by the opener (network failures, HTTP error statuses,
    timeouts) are not caught here and propagate to the caller.
    """
    if timeout is None:
        # OpenerDirector.open() does not read a timeout from the opener
        # object; it must be passed per call or the socket never times out.
        timeout = getattr(_opener, 'timeout', 300)

    req = urllib.request.Request(url, headers={
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    })

    # Monotonic clock: the measured duration is unaffected by system clock
    # adjustments during a long wait.
    started = time.monotonic()
    # The context manager closes the connection even if reading fails.
    with _opener.open(req, timeout=timeout) as resp:
        body = resp.read()
        status = resp.status
        final_url = resp.url
    html = body.decode('utf-8', errors='replace')
    return status, html, final_url, time.monotonic() - started
|
||||
|
||||
|
||||
# ── 提取函数 ──────────────────────────────────────────────────────
|
||||
|
||||
def extract_quark_links(html):
    """Return the complete Quark share links found in *html*.

    A link counts as complete when its share ID has at least 12 letters or
    digits. When two URLs are glued together in the page, the greedy ID match
    swallows the scheme of the following URL (``...<id>https`` or
    ``...<id>http``); such matches are discarded as false positives.

    Args:
        html: Page source to scan.

    Returns:
        list[str]: Unique links in sorted order.
    """
    candidates = re.findall(r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{12,}', html)
    # The pattern accepts both http and https URLs, so a glued-on scheme can
    # be either one; reject both suffixes.
    return sorted({
        link for link in candidates
        if not link.endswith(('https', 'http'))
    })
|
||||
|
||||
|
||||
def extract_truncated_links(html):
    """Return Quark links whose share ID was cut short in *html*.

    A link is treated as truncated when its ID is 1-11 letters or digits long;
    the full link then has to be read from the thread page itself.

    Args:
        html: Page source to scan.

    Returns:
        list[str]: Unique truncated links in sorted order.
    """
    # The negative lookahead requires the ID to end after at most 11
    # characters. Without it, the first 11 characters of every complete
    # (12+ character) link would also be reported as a truncated link.
    found = re.findall(
        r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{1,11}(?![a-zA-Z0-9])',
        html,
    )
    return sorted(set(found))
|
||||
|
||||
|
||||
def extract_threads(html):
    """Collect thread titles from the anchors that point at ``viewthread`` pages.

    Anchor texts are stripped of surrounding whitespace; texts of five
    characters or fewer are ignored, and repeated titles are reported once,
    in the order they first appear.

    Args:
        html: Page source to scan.

    Returns:
        list[str]: De-duplicated titles in first-seen order.
    """
    anchor_texts = re.findall(
        r'<a[^>]+href="forum\.php\?mod=viewthread[^>]+>([^<]+)</a>', html
    )
    stripped = (text.strip() for text in anchor_texts)
    # dict keys keep insertion order, which gives an order-preserving dedupe.
    return list(dict.fromkeys(title for title in stripped if len(title) > 5))
|
||||
|
||||
|
||||
# ── 主流程 ────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Command-line entry point: run one forum search and report the results.

    The command-line arguments form the search keyword. The search page is
    fetched with the saved login cookie, then complete Quark links, truncated
    links and thread titles are extracted from it and printed. Complete links
    are also written to ``OUTPUT_FILE``, one per line.

    Exits with status 1 when no keyword is given or the request fails.
    """
    if len(sys.argv) < 2:
        print('用法: py -X utf8 yunpan1_search.py <关键词>')
        print('示例: py -X utf8 yunpan1_search.py 遮天')
        sys.exit(1)

    # Join all arguments so an unquoted multi-word keyword is searched as a
    # whole instead of being cut down to its first word.
    keyword = ' '.join(sys.argv[1:])
    cookie = load_cookie()

    print(f'🔍 搜索 "{keyword}" ...')
    print('⏱ 首次搜索需建索引,可能等待 1-2 分钟,请耐心等待...')
    # Make the notice visible before the potentially long blocking request.
    sys.stdout.flush()

    search_url = SEARCH_URL.format(keyword=urllib.parse.quote(keyword))
    try:
        status, html, _final_url, elapsed = request(search_url, cookie)
    except OSError as exc:
        # URLError, HTTPError and socket timeouts are all OSError subclasses;
        # report them briefly instead of dumping a traceback.
        print(f'\n❌ 请求失败: {type(exc).__name__}: {exc}')
        sys.exit(1)

    links = extract_quark_links(html)
    truncated = extract_truncated_links(html)
    threads = extract_threads(html)

    print(f'\n✅ 完成(耗时 {elapsed:.0f}s,状态码 {status})')
    print(f' HTML: {len(html)} 字符 / {len(threads)} 条帖子')

    if links:
        print(f'\n{"=" * 50}')
        print(f'✅ 完整夸克链接(可直接转存): {len(links)}')
        print(f'{"=" * 50}')
        for link in links:
            print(f' {link}')

        os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.write('\n'.join(links) + '\n')

        try:
            shown_path = os.path.relpath(OUTPUT_FILE)
        except ValueError:
            # On Windows relpath() raises when the current directory and the
            # output file are on different drives; show the absolute path.
            shown_path = OUTPUT_FILE
        print(f'\n💾 链接已保存: {shown_path}')
    else:
        print('\n⚠️ 未找到完整夸克链接')

    if truncated:
        print(f'\n⚠️ 被截断的链接({len(truncated)} 个,需点进帖子):')
        for link in truncated[:5]:
            print(f' {link}')

    print('\n📌 帖子预览(前 10 条):')
    for title in threads[:10]:
        print(f' • {title[:60]}')


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user