Files
media-center/sites/yunpan1/v1/yunpan1_search.py
T
sutong ecc1b9d6dc feat: add yunpan1 search source
- sites/yunpan1/v1/: 新增云盘资源分享社区搜索源
  - intro.md: 论坛介绍、板块列表
  - urls.md: 站点链接、Cookie 维护说明
  - usage.md: 搜索脚本使用、登录流程
  - yunpan1_search.py: Python 搜索脚本(标准库零依赖)
- .gitignore: 追加 .idea/ __pycache__/ *.pyc

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-16 20:44:28 +08:00

139 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!python
# -*- coding: utf-8 -*-
"""
yunpan1 搜索工具 — 搜索云盘资源分享社区并提取夸克链接
用法:
py -X utf8 yunpan1_search.py <关键词>
py -X utf8 yunpan1_search.py 遮天
说明:
- Discuz! 首次搜索需建索引(可能等 60-120 秒),同一关键词后续秒回
- Cookie 文件: tmp/yunpan1_cookies.txt(需先通过 Playwright 登录获取)
- 依赖: Python 标准库(无需额外安装)
"""
import re
import os
import sys
import urllib.request
import urllib.parse
import time
# ── Configuration ─────────────────────────────────────────────────
BASE_URL = 'https://yunpan1.cc'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Cookie input and link output share the "tmp" directory located three
# levels above this script.
_TMP_DIR = os.path.abspath(
    os.path.join(SCRIPT_DIR, os.pardir, os.pardir, os.pardir, 'tmp')
)
COOKIE_FILE = os.path.join(_TMP_DIR, 'yunpan1_cookies.txt')
OUTPUT_FILE = os.path.join(_TMP_DIR, 'quark_links.txt')
# ``{keyword}`` is filled in later via str.format() with the URL-quoted term.
SEARCH_URL = BASE_URL + '/search.php?mod=forum&srchtxt={keyword}&searchsubmit=yes'

# Module-wide opener. The 300 s value is meant to keep long (chunked)
# responses from being cut off.
# NOTE(review): OpenerDirector.open() receives its timeout as a call argument
# and does not read this attribute on its own, so the value only takes effect
# where a caller passes it along explicitly.
_opener = urllib.request.build_opener()
_opener.timeout = 300
# ── 工具函数 ──────────────────────────────────────────────────────
def load_cookie():
    """Read the cookie file and return it as one ``Cookie`` header value.

    The file at ``COOKIE_FILE`` is expected to hold one ``key=value`` pair
    per line. Each line is stripped, blank lines are skipped, and the
    remaining pairs are joined with ``'; '``.

    Returns:
        str: The joined cookie string (empty if the file has no content).

    Exits the process with status 1, after printing a hint, when the cookie
    file does not exist.
    """
    path = COOKIE_FILE
    try:
        # utf-8-sig drops a leading BOM (written by some Windows editors);
        # otherwise it would become part of the first cookie name.
        with open(path, 'r', encoding='utf-8-sig') as f:
            pairs = [line.strip() for line in f]
    except FileNotFoundError:
        print(f'❌ Cookie 文件不存在: {path}')
        print(f'请通过 Playwright 登录 yunpan1.cc 后,将 Cookie 保存到该文件(一行一个 key=value)')
        sys.exit(1)
    # Skipping empty entries avoids "; ; " segments caused by blank lines.
    return '; '.join(pair for pair in pairs if pair)
def request(url, cookie, timeout=None):
    """GET *url* with the given cookies and return the decoded response.

    Args:
        url: Absolute URL to fetch.
        cookie: Value sent verbatim as the ``Cookie`` request header.
        timeout: Socket timeout in seconds. When omitted, the ``timeout``
            attribute stored on the module-level opener is used (300 if
            that attribute is absent).

    Returns:
        tuple: ``(status, html, final_url, elapsed)`` where ``html`` is the
        body decoded as UTF-8 (undecodable bytes replaced), ``final_url`` is
        the URL reported by the response object, and ``elapsed`` is the
        number of seconds spent opening and reading the response.

    Raises:
        urllib.error.URLError: Propagated from the opener on network or
            HTTP errors.
    """
    if timeout is None:
        timeout = getattr(_opener, 'timeout', 300)
    req = urllib.request.Request(url, headers={
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    })
    started = time.monotonic()
    # The timeout must be passed to open(); an attribute set on the opener is
    # not consulted by urllib, which would leave the call able to block forever.
    with _opener.open(req, timeout=timeout) as resp:
        html = resp.read().decode('utf-8', errors='replace')
        status = resp.status
        final_url = resp.url
    return status, html, final_url, time.monotonic() - started
# ── 提取函数 ──────────────────────────────────────────────────────
def extract_quark_links(html):
    """Return the complete Quark share links found in *html*.

    A link counts as complete when its share ID has at least 12
    alphanumeric characters. The ID stops before a directly following
    ``http://`` / ``https://`` so that two links glued together without a
    separator are both recognised instead of being merged into one bogus
    match. Matches that still end in ``https`` (a glued scheme whose
    ``://`` is missing) are discarded.

    Args:
        html: Page source to scan.

    Returns:
        list[str]: Unique links in sorted order.
    """
    # (?!https?://) keeps the ID from swallowing the scheme of the next URL.
    pattern = r'https?://pan\.quark\.cn/s/(?:(?!https?://)[a-zA-Z0-9]){12,}'
    candidates = re.findall(pattern, html)
    return sorted({link for link in candidates if not link.endswith('https')})
def extract_truncated_links(html):
    """Return Quark links whose share ID was cut short in *html*.

    A link is reported only when its whole alphanumeric ID is 1-11
    characters long, so the leading part of a complete (12+ character) link
    is not listed as truncated. The ID stops before a directly following
    ``http://`` / ``https://`` so a glued next URL does not inflate its
    length.

    Args:
        html: Page source to scan.

    Returns:
        list[str]: Unique truncated links in sorted order.
    """
    # Capture the full ID run first, then judge its length; a bounded
    # quantifier alone would also match a prefix of every longer ID.
    pattern = r'(https?://pan\.quark\.cn/s/)((?:(?!https?://)[a-zA-Z0-9])+)'
    return sorted({
        prefix + share_id
        for prefix, share_id in re.findall(pattern, html)
        if len(share_id) <= 11
    })
def extract_threads(html):
    """Collect thread titles from *html*.

    Titles are the text of ``<a>`` tags whose ``href`` starts with
    ``forum.php?mod=viewthread``. Each title is stripped; titles of five
    characters or fewer are dropped, and duplicates are removed while
    keeping first-seen order.
    """
    anchor_texts = re.findall(
        r'<a[^>]+href="forum\.php\?mod=viewthread[^>]+>([^<]+)</a>', html
    )
    stripped = (text.strip() for text in anchor_texts)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(title for title in stripped if len(title) > 5))
# ── 主流程 ────────────────────────────────────────────────────────
def main():
    """Run one search from the command line and report the links found.

    All command-line arguments are joined with spaces to form the keyword,
    so an unquoted multi-word keyword is searched as a whole. The search
    page is fetched with the stored cookies, then the complete links,
    truncated links and thread titles extracted from it are printed.
    Complete links are also written to ``OUTPUT_FILE``, one per line.

    Exits with status 1 when no keyword is given or when the request fails.
    """
    if len(sys.argv) < 2:
        print('用法: py -X utf8 yunpan1_search.py <关键词>')
        print('示例: py -X utf8 yunpan1_search.py 遮天')
        sys.exit(1)

    keyword = ' '.join(sys.argv[1:])
    cookie = load_cookie()
    print(f'🔍 搜索 "{keyword}" ...')
    print(f'⏱ 首次搜索需建索引,可能等待 1-2 分钟,请耐心等待...')
    # Flush so the progress hint is visible before the long blocking request.
    sys.stdout.flush()

    search_url = SEARCH_URL.format(keyword=urllib.parse.quote(keyword))
    try:
        status, html, _final_url, elapsed = request(search_url, cookie)
    except OSError as exc:
        # urllib's URLError/HTTPError and socket timeouts all derive from
        # OSError; report them briefly instead of dumping a traceback.
        print(f'\n❌ 请求失败: {exc}')
        sys.exit(1)

    links = extract_quark_links(html)
    truncated = extract_truncated_links(html)
    threads = extract_threads(html)

    print(f'\n✅ 完成(耗时 {elapsed:.0f}s,状态码 {status})')
    print(f' HTML: {len(html)} 字符 / {len(threads)} 条帖子')

    if links:
        print(f'\n{"=" * 50}')
        print(f'✅ 完整夸克链接(可直接转存): {len(links)}')
        print(f'{"=" * 50}')
        for link in links:
            print(f' {link}')
        os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.write('\n'.join(links) + '\n')
        try:
            shown_path = os.path.relpath(OUTPUT_FILE)
        except ValueError:
            # On Windows relpath() raises when the working directory and the
            # target are on different drives; show the absolute path instead.
            shown_path = OUTPUT_FILE
        print(f'\n💾 链接已保存: {shown_path}')
    else:
        print('\n⚠️ 未找到完整夸克链接')

    if truncated:
        print(f'\n⚠️ 被截断的链接({len(truncated)} 个,需点进帖子):')
        # Only the first five are shown to keep the output short.
        for link in truncated[:5]:
            print(f' {link}')

    print(f'\n📌 帖子预览(前 10 条):')
    for title in threads[:10]:
        print(f'{title[:60]}')
# Run the search only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()