feat: 新增 mineru_mvp 文档解析组件并适配 Linux 路径

2026-06-11 11:24:12 +08:00
parent 4c51d8ce7f
commit 91a7e22840
5 changed files with 297 additions and 3 deletions
--- a/backend/.env.example
+++ b/backend/.env.example
@@ -5,6 +5,10 @@ DEEPSEEK_BASE_URL=https://api.deepseek.com
 # MinerU (required for document parsing)
 MINERU_API_TOKEN=your_mineru_api_token_here
-# MinerU venv path (absolute path to python.exe)
+# MinerU venv path (absolute path to the python interpreter in mineru_mvp's venv)
-MINERU_PYTHON=F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe
+# Linux:   /home/user/GraphRAGAgent/mineru_mvp/.venv/bin/python
-MINERU_PIPELINE=F:/GraphRAGAgent/mineru_mvp/pipeline.py
+# Windows: F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe
 MINERU_PYTHON=/root/projects/GraphRAGAgent/mineru_mvp/.venv/bin/python
 # Linux:   /home/user/GraphRAGAgent/mineru_mvp/pipeline.py
 # Windows: F:/GraphRAGAgent/mineru_mvp/pipeline.py
 MINERU_PIPELINE=/root/projects/GraphRAGAgent/mineru_mvp/pipeline.py
--- a/mineru_mvp/.env.example
+++ b/mineru_mvp/.env.example
@@ -0,0 +1,3 @@
 # MinerU Cloud API Token
 # Get yours at: https://mineru.net/apiManage/token
 MINERU_API_TOKEN=your_mineru_api_token_here
--- a/mineru_mvp/CLAUDE.md
+++ b/mineru_mvp/CLAUDE.md
@@ -0,0 +1,64 @@
 # MinerU MVP — 文档解析组件
 ## 路径
 ```
 GraphRAGAgent/mineru_mvp/
 ```
 ## 功能
 通过 MinerU Cloud API 将 PDF/DOCX 等文档解析为结构化 JSON（`content_list.json`），供后端索引流水线消费。
 ## 安装
 ```bash
 cd mineru_mvp
 uv venv --python 3.12
 source .venv/bin/activate        # Linux / macOS
 # .venv\Scripts\activate         # Windows
 uv pip install -r requirements.txt
 ```
 ## 配置
 复制 `.env.example` 为 `.env`，填入 MinerU API Token：
 ```env
 MINERU_API_TOKEN=your_token_here
 ```
 Token 获取地址：https://mineru.net/apiManage/token
 ## 使用
 ```bash
 # 激活 venv 后（或直接指定解释器路径）：
 python pipeline.py /path/to/document.pdf
 # 或由 backend 通过 subprocess 调用：
 /path/to/mineru_mvp/.venv/bin/python /path/to/mineru_mvp/pipeline.py /path/to/document.pdf
 ```
 ## 输出
 解析结果输出到 `mineru_mvp/output/{pdf_stem}/` 目录（`pdf_stem` 为输入文件名去掉扩展名后的部分）：
 ```
 output/
 └── {pdf_stem}/
    ├── {uuid}_content_list.json   ← 核心产物，供 backend 读取
    ├── full.md
    ├── {uuid}_origin.pdf
    ├── layout.json
    └── images/
        └── {hash}.jpg
 ```
 ## 流水线步骤
 1. POST `/file-urls/batch` — 获取预签名上传 URL
 2. PUT 文件到预签名 URL（不带 Content-Type）
 3. 轮询 GET `/extract-results/batch/{batch_id}`
 4. 下载 ZIP → 解压到 `output/`
 5. 打印摘要到 stdout
--- a/mineru_mvp/pipeline.py
+++ b/mineru_mvp/pipeline.py
@@ -0,0 +1,221 @@
 #!/usr/bin/env python3
 """
 MinerU MVP — Document Parsing Pipeline (Cloud API)
 Usage:
    python pipeline.py <pdf_path>
 Flow:
    1. POST /file-urls/batch  →  get presigned upload URL + batch_id
    2. PUT <pdf> to presigned URL
    3. Poll GET /extract-results/batch/{batch_id}
    4. Download & extract ZIP → output/{pdf_stem}/
    5. Print summary to stdout
 The backend (indexing_service.py) calls this via subprocess and reads
 output/{pdf_stem}/*_content_list.json for downstream KG construction.
 """
 from __future__ import annotations
 import argparse
 import io
 import json
 import os
 import sys
 import time
 import zipfile
 from pathlib import Path
 from typing import Any
 import requests
 from dotenv import load_dotenv
 # ── Config ──────────────────────────────────────────────────────────────────────
 # Load the .env file that sits next to this script. override=True makes values
 # from that file replace variables already set in the process environment.
 load_dotenv(Path(__file__).parent / ".env", override=True)
 # Base URL that every MinerU endpoint path in this file is appended to.
 API_BASE = "https://mineru.net/api/v4"
 # API token read from the environment; empty string when the variable is unset.
 TOKEN = os.getenv("MINERU_API_TOKEN", "")
 POLL_INTERVAL = 5          # seconds between status checks
 MAX_WAIT = 600             # max total wait time (seconds) — matches backend timeout
 # NOTE: this check runs at import time, so importing the module without a token
 # terminates the interpreter with exit status 1.
 if not TOKEN:
    print("ERROR: MINERU_API_TOKEN not set in mineru_mvp/.env", file=sys.stderr)
    sys.exit(1)
 # Headers for the JSON API calls (steps 1 and 3). The presigned upload in step 2
 # deliberately does not use them.
 HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Content-Type": "application/json",
 }
 # Parse results are written beneath <directory of this file>/output.
 OUTPUT_ROOT = Path(__file__).parent / "output"
 # ── Pipeline Steps ──────────────────────────────────────────────────────────────
 def _request(method: str, url: str, **kwargs: Any) -> dict:
    """Send one HTTP request to the MinerU API and return the decoded JSON body.

    Args:
        method: HTTP verb handed to ``requests.request``.
        url: Absolute endpoint URL.
        **kwargs: Forwarded unchanged to ``requests.request``. When the caller
            does not pass ``timeout``, a default of 30 seconds is applied.

    Returns:
        The parsed JSON response body.

    Raises:
        requests.HTTPError: The response status is 4xx/5xx.
        RuntimeError: The ``code`` field of the body is not ``0``.
    """
    # requests applies no timeout by default; without one a stalled connection
    # would block the whole pipeline indefinitely.
    kwargs.setdefault("timeout", 30)
    resp = requests.request(method, url, **kwargs)
    resp.raise_for_status()
    body: dict = resp.json()
    if body.get("code") != 0:
        raise RuntimeError(f"API error code={body.get('code')} msg={body.get('msg')}")
    return body
 def step1_get_upload_url(filename: str) -> tuple[str, str]:
    """Request a presigned upload slot for a single file.

    Issues POST /file-urls/batch with formula and table extraction enabled and
    returns ``(batch_id, upload_url)`` read from the ``data`` field of the reply.
    """
    print(f"[1/5] Requesting presigned upload URL for: {filename}")
    payload = {
        "files": [{"name": filename}],
        "enable_formula": True,
        "enable_table": True,
        "language": "en",
    }
    endpoint = f"{API_BASE}/file-urls/batch"
    reply = _request("POST", endpoint, headers=HEADERS, json=payload)
    info = reply["data"]
    batch_id: str = info["batch_id"]
    # Only one file is submitted, so the first URL is the one for it.
    upload_url: str = info["file_urls"][0]
    print(f"       batch_id: {batch_id}")
    return batch_id, upload_url
 def step2_upload(upload_url: str, pdf_path: Path) -> None:
    """PUT the file to the presigned URL — MUST NOT include a Content-Type header.

    Args:
        upload_url: Presigned URL obtained in step 1.
        pdf_path: Local file whose bytes are streamed as the request body.

    Raises:
        RuntimeError: The server answered with a non-2xx/3xx status.
        requests.RequestException: Connection failure or timeout.
    """
    print(f"[2/5] Uploading file ({pdf_path.stat().st_size / 1024:.0f} KB)...")
    with open(pdf_path, "rb") as f:
        # No headers argument = no Content-Type. The (connect, read) timeout
        # keeps a stalled transfer from hanging the pipeline forever.
        resp = requests.put(upload_url, data=f, timeout=(10, 300))
    if not resp.ok:
        raise RuntimeError(f"Upload failed: HTTP {resp.status_code} — {resp.text[:300]}")
    print("       Upload complete.")
 def step3_poll(batch_id: str) -> dict:
    """Poll GET /extract-results/batch/{batch_id} until parsing is done or failed.

    Args:
        batch_id: Identifier returned by step 1.

    Returns:
        A dict with keys ``zip_url``, ``pages`` and ``total``. The two page
        values are ``"?"`` when the API does not report progress.

    Raises:
        TimeoutError: More than ``MAX_WAIT`` seconds elapsed before completion.
        RuntimeError: The API reported state ``failed``.
    """
    print(f"[3/5] Waiting for parsing to complete (polling every {POLL_INTERVAL}s)...")
    # Monotonic clock: the deadline must not move when the wall clock is
    # adjusted (NTP sync, manual change) while we are waiting.
    started = time.monotonic()
    last_state = ""
    while True:
        elapsed = time.monotonic() - started
        if elapsed > MAX_WAIT:
            raise TimeoutError(f"Parsing timed out after {MAX_WAIT}s")
        body = _request(
            "GET",
            f"{API_BASE}/extract-results/batch/{batch_id}",
            headers=HEADERS,
        )
        # A single file was submitted, so only the first result is relevant.
        item = body["data"]["extract_result"][0]
        state: str = item["state"]
        # Print only on transitions to keep the log short.
        if state != last_state:
            print(f"       state: {state}")
            last_state = state
        if state == "done":
            zip_url: str = item["full_zip_url"]
            # "or {}" also covers a key that is present but null.
            progress = item.get("extract_progress") or {}
            pages = progress.get("extracted_pages", "?")
            total = progress.get("total_pages", "?")
            print(f"       Parsing done — {pages}/{total} pages extracted.")
            return {"zip_url": zip_url, "pages": pages, "total": total}
        if state == "failed":
            err = item.get("err_msg", "unknown error")
            raise RuntimeError(f"Parsing failed: {err}")
        time.sleep(POLL_INTERVAL)
 def step4_download_and_extract(zip_url: str, output_dir: Path) -> list[str]:
    """Download the result ZIP and extract it into ``output_dir``.

    Args:
        zip_url: URL of the result archive reported by step 3.
        output_dir: Destination directory; created if missing.

    Returns:
        Sorted names of the top-level entries in ``output_dir`` that do not
        start with a dot (files and directories alike).

    Raises:
        requests.RequestException: Download failure, HTTP error or timeout.
        zipfile.BadZipFile: The downloaded payload is not a valid ZIP archive.
    """
    print(f"[4/5] Downloading & extracting results...")
    # (connect, read) timeout so a stalled download cannot hang the pipeline.
    resp = requests.get(zip_url, timeout=(10, 300))
    resp.raise_for_status()
    # NOTE: an existing directory is reused as-is; files left by an earlier
    # run are not removed before extraction.
    output_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        zf.extractall(str(output_dir))
    files = sorted(
        name for name in os.listdir(output_dir) if not name.startswith(".")
    )
    for name in files:
        fpath = output_dir / name
        # Sizes are printed for regular files only; directories are skipped.
        if fpath.is_file():
            print(f"       {name} ({fpath.stat().st_size / 1024:.0f} KB)")
    return files
 def step5_summary(output_dir: Path) -> None:
    """Print a block/page summary of the newest ``*_content_list.json``.

    Args:
        output_dir: Directory that step 4 extracted the results into.

    Prints the file name, the number of blocks, the number of distinct
    ``page_idx`` values and a per-type block count to stdout. When no
    matching file exists, a warning goes to stderr and the function returns.
    """
    print(f"[5/5] Summary")
    # Find content_list.json
    matches = list(output_dir.glob("*_content_list.json"))
    if not matches:
        print("       WARNING: No content_list.json found!", file=sys.stderr)
        return
    # glob() yields entries in arbitrary order and the directory may still hold
    # a differently named content list from an earlier run, so pick the most
    # recently modified file (name as tie-breaker) instead of matches[0].
    content_list_path = max(matches, key=lambda p: (p.stat().st_mtime, p.name))
    with open(content_list_path, "r", encoding="utf-8") as f:
        content_list = json.load(f)
    # Count block types
    type_counts: dict[str, int] = {}
    pages: set[int] = set()
    for block in content_list:
        t = block.get("type", "unknown")
        type_counts[t] = type_counts.get(t, 0) + 1
        # Blocks without page_idx are counted as page 0.
        pages.add(block.get("page_idx", 0))
    print(f"       File:     {content_list_path.name}")
    print(f"       Blocks:   {len(content_list)}")
    print(f"       Pages:    {len(pages)}")
    for t, c in sorted(type_counts.items()):
        print(f"         {t}: {c}")
 # ── Main ────────────────────────────────────────────────────────────────────────
 def main() -> None:
    """Parse the command line and run the five pipeline steps in order.

    Exits with status 1 when the input path is not an existing regular file
    or when any step raises; the error message is written to stderr.
    """
    parser = argparse.ArgumentParser(description="MinerU document parsing pipeline")
    parser.add_argument("pdf_path", help="Path to the PDF file to parse")
    args = parser.parse_args()
    pdf_path = Path(args.pdf_path).resolve()
    # is_file() rather than exists(): a directory would otherwise pass the
    # check and fail only after an upload URL had already been requested.
    if not pdf_path.is_file():
        print(f"ERROR: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)
    stem = pdf_path.stem
    output_dir = OUTPUT_ROOT / stem
    try:
        batch_id, upload_url = step1_get_upload_url(pdf_path.name)
        step2_upload(upload_url, pdf_path)
        result = step3_poll(batch_id)
        step4_download_and_extract(result["zip_url"], output_dir)
        step5_summary(output_dir)
        print(f"\nDone. Output: {output_dir}")
    except Exception as exc:
        # Top-level boundary: report any failure and signal it via exit status.
        print(f"\nERROR: {exc}", file=sys.stderr)
        sys.exit(1)
 if __name__ == "__main__":
    main()
--- a/mineru_mvp/requirements.txt
+++ b/mineru_mvp/requirements.txt
@@ -0,0 +1,2 @@
 requests>=2.28
 python-dotenv>=1.0