# media-center/sites/yunpan1/v1/yunpan1_search.py

#!python
# -*- coding: utf-8 -*-
"""
yunpan1 搜索工具 — 搜索云盘资源分享社区并提取夸克链接

用法：
  py -X utf8 yunpan1_search.py <关键词>
  py -X utf8 yunpan1_search.py 遮天

说明：
  - Discuz! 首次搜索需建索引（可能等 60-120 秒），同一关键词后续秒回
  - Cookie 文件: tmp/yunpan1_cookies.txt（需先通过 Playwright 登录获取）
  - 依赖: Python 标准库（无需额外安装）
"""

import re
import os
import sys
import urllib.request
import urllib.parse
import time

# ── 配置 ──────────────────────────────────────────────────────────
BASE_URL = 'https://yunpan1.cc'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Both scratch files live in the `tmp` directory three levels above this script.
_TMP_DIR = os.path.join(SCRIPT_DIR, '..', '..', '..', 'tmp')
COOKIE_FILE = os.path.abspath(os.path.join(_TMP_DIR, 'yunpan1_cookies.txt'))
OUTPUT_FILE = os.path.abspath(os.path.join(_TMP_DIR, 'quark_links.txt'))

# Forum search endpoint; `{keyword}` is substituted with str.format().
SEARCH_URL = BASE_URL + '/search.php?mod=forum&srchtxt={keyword}&searchsubmit=yes'

# Shared opener. The 300 s value is intended to keep slow chunked responses
# from being cut off.
# NOTE(review): OpenerDirector.open() receives its timeout as an argument and
# does not read this attribute on its own -- confirm callers pass it along.
_opener = urllib.request.build_opener()
_opener.timeout = 300


# ── 工具函数 ──────────────────────────────────────────────────────

def load_cookie(path=None):
    """Read the saved cookies and return them as one ``Cookie`` header value.

    The file is expected to contain one ``key=value`` pair per line. Blank
    lines, surrounding whitespace and trailing semicolons are dropped so the
    result is always a clean ``k1=v1; k2=v2`` string. A UTF-8 byte-order mark
    at the start of the file is ignored.

    Args:
        path: Cookie file to read. Defaults to ``COOKIE_FILE``.

    Returns:
        The cookie pairs joined with ``'; '`` (empty string for an empty file).

    Exits the process with status 1, after printing a hint, when the file
    does not exist.
    """
    if path is None:
        path = COOKIE_FILE
    try:
        # utf-8-sig reads plain UTF-8 as well and strips a leading BOM, which
        # would otherwise become part of the first cookie name.
        with open(path, 'r', encoding='utf-8-sig') as f:
            lines = list(f)
    except FileNotFoundError:
        print(f'❌ Cookie 文件不存在: {path}')
        print('请通过 Playwright 登录 yunpan1.cc 后，将 Cookie 保存到该文件（一行一个 key=value）')
        sys.exit(1)

    # Normalise every line before joining so that empty or ';'-terminated
    # lines cannot yield fragments such as "a=1; ; b=2" or "a=1;; b=2".
    pairs = (line.strip().rstrip(';').rstrip() for line in lines)
    return '; '.join(pair for pair in pairs if pair)


def request(url, cookie, timeout=None):
    """Fetch *url* with the given cookies and return the decoded page.

    Args:
        url: Absolute URL to request.
        cookie: Value sent verbatim as the ``Cookie`` header.
        timeout: Socket timeout in seconds. When omitted, the ``timeout``
            attribute stored on the shared opener is used (300 if absent).

    Returns:
        A tuple ``(status, html, final_url, elapsed_seconds)``; ``final_url``
        is the URL reported by the response object.

    Errors raised by urllib while opening or reading the response propagate
    to the caller.
    """
    if timeout is None:
        # OpenerDirector.open() does not consult the opener's own `timeout`
        # attribute, so the configured value has to be handed over explicitly.
        timeout = getattr(_opener, 'timeout', 300)

    req = urllib.request.Request(url, headers={
        'Cookie': cookie,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    })
    started = time.monotonic()
    # The context manager closes the connection even if read() fails.
    with _opener.open(req, timeout=timeout) as resp:
        body = resp.read()
        charset = resp.headers.get_content_charset() or 'utf-8'
        status, final_url = resp.status, resp.url

    try:
        html = body.decode(charset, errors='replace')
    except LookupError:
        # The server announced a codec Python does not know; fall back.
        html = body.decode('utf-8', errors='replace')
    return status, html, final_url, time.monotonic() - started


# ── 提取函数 ──────────────────────────────────────────────────────

def extract_quark_links(html):
    """Return the sorted, de-duplicated complete Quark share links in *html*.

    A link is complete when its share ID has at least 12 alphanumeric
    characters. If another URL is glued directly onto the ID
    (``.../s/<id>https://...``), the ID is cut where that URL's scheme
    begins, so the link is recovered instead of being matched together with
    a bogus ``http``/``https`` tail. Matches that still end in ``https`` are
    discarded as false positives.
    """
    # Each ID character must not be the start of "http://" or "https://".
    pattern = r'https?://pan\.quark\.cn/s/(?:(?!https?://)[a-zA-Z0-9]){12,}'
    matches = re.findall(pattern, html)
    return sorted({link for link in matches if not link.endswith('https')})


def extract_truncated_links(html):
    """Return the sorted, de-duplicated truncated Quark links in *html*.

    A link is truncated when its share ID is only 1-11 alphanumeric
    characters long; such links need the thread page to be opened to get the
    full ID. The negative lookahead requires the ID to end there, so the
    first 11 characters of a complete (12+ character) ID are not reported.
    """
    pattern = r'https?://pan\.quark\.cn/s/[a-zA-Z0-9]{1,11}(?![a-zA-Z0-9])'
    return sorted(set(re.findall(pattern, html)))


def extract_threads(html):
    """Collect thread titles from the anchors that point at ``viewthread``.

    Titles are whitespace-trimmed, must be longer than five characters, and
    are returned once each in order of first appearance.
    """
    anchor_text = re.compile(
        r'<a[^>]+href="forum\.php\?mod=viewthread[^>]+>([^<]+)</a>'
    )
    trimmed = (text.strip() for text in anchor_text.findall(html))
    # dict.fromkeys keeps insertion order, giving an ordered de-duplication.
    return list(dict.fromkeys(title for title in trimmed if len(title) > 5))


# ── 主流程 ────────────────────────────────────────────────────────

def _display_path(path):
    """Return *path* relative to the working directory when that is possible."""
    try:
        return os.path.relpath(path)
    except ValueError:
        # On Windows there is no relative form across different drives.
        return path


def main():
    """Run one search from the command line and report the Quark links found.

    All command-line arguments are joined into the search keyword. Complete
    links are printed and written to ``OUTPUT_FILE``; truncated links (first
    five) and the first ten thread titles are printed as hints.

    Exits with status 1 when no keyword is given or the request fails.
    """
    if len(sys.argv) < 2:
        print('用法: py -X utf8 yunpan1_search.py <关键词>')
        print('示例: py -X utf8 yunpan1_search.py 遮天')
        sys.exit(1)

    # Join every argument so an unquoted multi-word keyword is not cut short.
    keyword = ' '.join(sys.argv[1:])
    cookie = load_cookie()

    print(f'🔍 搜索 "{keyword}" ...')
    print('⏱  首次搜索需建索引，可能等待 1-2 分钟，请耐心等待...')
    sys.stdout.flush()

    search_url = SEARCH_URL.format(keyword=urllib.parse.quote(keyword))
    try:
        status, html, _final_url, elapsed = request(search_url, cookie)
    except OSError as exc:
        # urllib's URLError/HTTPError and socket timeouts all derive from
        # OSError; report them briefly instead of dumping a traceback.
        print(f'\n❌ 请求失败: {exc}')
        sys.exit(1)

    links = extract_quark_links(html)
    truncated = extract_truncated_links(html)
    threads = extract_threads(html)

    print(f'\n✅ 完成（耗时 {elapsed:.0f}s，状态码 {status}）')
    print(f'   HTML: {len(html)} 字符 / {len(threads)} 条帖子')

    if links:
        print(f'\n{"=" * 50}')
        print(f'✅ 完整夸克链接（可直接转存）: {len(links)}')
        print(f'{"=" * 50}')
        for link in links:
            print(f'   {link}')

        os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.write('\n'.join(links) + '\n')
        print(f'\n💾 链接已保存: {_display_path(OUTPUT_FILE)}')
    else:
        print('\n⚠️  未找到完整夸克链接')

    if truncated:
        print(f'\n⚠️  被截断的链接（{len(truncated)} 个，需点进帖子）:')
        for link in truncated[:5]:
            print(f'   {link}')

    print('\n📌 帖子预览（前 10 条）:')
    for title in threads[:10]:
        print(f'   • {title[:60]}')


if __name__ == '__main__':
    main()