feat: 新增 mineru_mvp 文档解析组件并适配 Linux 路径

2026-06-11 11:24:12 +08:00
parent 4c51d8ce7f
commit 91a7e22840
5 changed files with 297 additions and 3 deletions
--- a/mineru_mvp/pipeline.py
+++ b/mineru_mvp/pipeline.py
@@ -0,0 +1,221 @@
+#!/usr/bin/env python3
+"""
+MinerU MVP — Document Parsing Pipeline (Cloud API)
+
+Usage:
+    python pipeline.py <pdf_path>
+
+Flow:
+    1. POST /file-urls/batch  →  get presigned upload URL + batch_id
+    2. PUT <pdf> to presigned URL
+    3. Poll GET /extract-results/batch/{batch_id}
+    4. Download & extract ZIP → output/{pdf_stem}/
+    5. Print summary to stdout
+
+The backend (indexing_service.py) calls this via subprocess and reads
+output/{pdf_stem}/*_content_list.json for downstream KG construction.
+"""
+
+from __future__ import annotations
+
+import argparse
+import io
+import json
+import os
+import sys
+import time
+import zipfile
+from pathlib import Path
+from typing import Any
+
+import requests
+from dotenv import load_dotenv
+
+# ── Config ──────────────────────────────────────────────────────────────────────
+
+# Load settings from the .env file that sits next to this script.
+# override=True asks python-dotenv to let values from that file replace
+# variables that are already set in the process environment.
+load_dotenv(Path(__file__).parent / ".env", override=True)
+
+# Base URL that every JSON API endpoint below is appended to.
+API_BASE = "https://mineru.net/api/v4"
+# Bearer token for the API; an empty string means "not configured".
+TOKEN = os.getenv("MINERU_API_TOKEN", "")
+POLL_INTERVAL = 5          # seconds between status checks
+MAX_WAIT = 600             # max total wait time (seconds) — matches backend timeout
+
+# Fail fast: this check runs at module load, so the script exits with status 1
+# before any request is made when the token is missing.
+if not TOKEN:
+    print("ERROR: MINERU_API_TOKEN not set in mineru_mvp/.env", file=sys.stderr)
+    sys.exit(1)
+
+# Headers for the JSON API calls (steps 1 and 3). The presigned upload and the
+# ZIP download deliberately do not send them.
+HEADERS = {
+    "Authorization": f"Bearer {TOKEN}",
+    "Content-Type": "application/json",
+}
+
+# Root of all parse results; main() writes each document to a sub-directory
+# named after the input file's stem.
+OUTPUT_ROOT = Path(__file__).parent / "output"
+
+
+# ── Pipeline Steps ──────────────────────────────────────────────────────────────
+
+def _request(method: str, url: str, **kwargs: Any) -> dict:
+    """Wrapper around requests that raises on HTTP errors."""
+    resp = requests.request(method, url, **kwargs)
+    resp.raise_for_status()
+    body: dict = resp.json()
+    if body.get("code") != 0:
+        raise RuntimeError(f"API error code={body.get('code')} msg={body.get('msg')}")
+    return body
+
+
+def step1_get_upload_url(filename: str) -> tuple[str, str]:
+    """POST /file-urls/batch → (batch_id, upload_url)."""
+    print(f"[1/5] Requesting presigned upload URL for: {filename}")
+
+    body = _request(
+        "POST",
+        f"{API_BASE}/file-urls/batch",
+        headers=HEADERS,
+        json={
+            "files": [{"name": filename}],
+            "enable_formula": True,
+            "enable_table": True,
+            "language": "en",
+        },
+    )
+
+    data = body["data"]
+    batch_id: str = data["batch_id"]
+    upload_url: str = data["file_urls"][0]
+    print(f"       batch_id: {batch_id}")
+    return batch_id, upload_url
+
+
+def step2_upload(upload_url: str, pdf_path: Path) -> None:
+    """PUT file to presigned URL — MUST NOT include Content-Type header."""
+    print(f"[2/5] Uploading file ({pdf_path.stat().st_size / 1024:.0f} KB)...")
+
+    with open(pdf_path, "rb") as f:
+        resp = requests.put(upload_url, data=f)  # no headers = no Content-Type
+        if not resp.ok:
+            raise RuntimeError(f"Upload failed: HTTP {resp.status_code} — {resp.text[:300]}")
+    print("       Upload complete.")
+
+
+def step3_poll(batch_id: str) -> dict:
+    """Poll GET /extract-results/batch/{batch_id} until done or failed."""
+    print(f"[3/5] Waiting for parsing to complete (polling every {POLL_INTERVAL}s)...")
+
+    started = time.time()
+    last_state = ""
+
+    while True:
+        elapsed = time.time() - started
+        if elapsed > MAX_WAIT:
+            raise TimeoutError(f"Parsing timed out after {MAX_WAIT}s")
+
+        body = _request(
+            "GET",
+            f"{API_BASE}/extract-results/batch/{batch_id}",
+            headers=HEADERS,
+        )
+
+        item = body["data"]["extract_result"][0]
+        state: str = item["state"]
+
+        if state != last_state:
+            print(f"       state: {state}")
+            last_state = state
+
+        if state == "done":
+            zip_url: str = item["full_zip_url"]
+            progress = item.get("extract_progress", {})
+            pages = progress.get("extracted_pages", "?")
+            total = progress.get("total_pages", "?")
+            print(f"       Parsing done — {pages}/{total} pages extracted.")
+            return {"zip_url": zip_url, "pages": pages, "total": total}
+
+        if state == "failed":
+            err = item.get("err_msg", "unknown error")
+            raise RuntimeError(f"Parsing failed: {err}")
+
+        time.sleep(POLL_INTERVAL)
+
+
+def step4_download_and_extract(zip_url: str, output_dir: Path) -> list[str]:
+    """Download ZIP and extract to output/ directory."""
+    print(f"[4/5] Downloading & extracting results...")
+
+    resp = requests.get(zip_url)
+    resp.raise_for_status()
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
+        zf.extractall(str(output_dir))
+
+    files = sorted(
+        f for f in os.listdir(output_dir) if not f.startswith(".")
+    )
+    for f in files:
+        fpath = output_dir / f
+        if fpath.is_file():
+            print(f"       {f} ({fpath.stat().st_size / 1024:.0f} KB)")
+
+    return files
+
+
+def step5_summary(output_dir: Path) -> None:
+    """Print summary of parsed output."""
+    print(f"[5/5] Summary")
+
+    # Find content_list.json
+    matches = list(output_dir.glob("*_content_list.json"))
+    if not matches:
+        print("       WARNING: No content_list.json found!", file=sys.stderr)
+        return
+
+    content_list_path = matches[0]
+    with open(content_list_path, "r", encoding="utf-8") as f:
+        content_list = json.load(f)
+
+    # Count block types
+    type_counts: dict[str, int] = {}
+    pages: set[int] = set()
+    for block in content_list:
+        t = block.get("type", "unknown")
+        type_counts[t] = type_counts.get(t, 0) + 1
+        pages.add(block.get("page_idx", 0))
+
+    print(f"       File:     {content_list_path.name}")
+    print(f"       Blocks:   {len(content_list)}")
+    print(f"       Pages:    {len(pages)}")
+    for t, c in sorted(type_counts.items()):
+        print(f"         {t}: {c}")
+
+
+# ── Main ────────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="MinerU document parsing pipeline")
+    parser.add_argument("pdf_path", help="Path to the PDF file to parse")
+    args = parser.parse_args()
+
+    pdf_path = Path(args.pdf_path).resolve()
+    if not pdf_path.exists():
+        print(f"ERROR: File not found: {pdf_path}", file=sys.stderr)
+        sys.exit(1)
+
+    stem = pdf_path.stem
+    output_dir = OUTPUT_ROOT / stem
+
+    try:
+        batch_id, upload_url = step1_get_upload_url(pdf_path.name)
+        step2_upload(upload_url, pdf_path)
+        result = step3_poll(batch_id)
+        step4_download_and_extract(result["zip_url"], output_dir)
+        step5_summary(output_dir)
+        print(f"\nDone. Output: {output_dir}")
+    except Exception as exc:
+        print(f"\nERROR: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()