feat: 新增 mineru_mvp 文档解析组件并适配 Linux 路径
This commit is contained in:
221
mineru_mvp/pipeline.py
Normal file
221
mineru_mvp/pipeline.py
Normal file
@@ -0,0 +1,221 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
MinerU MVP — Document Parsing Pipeline (Cloud API)
|
||||
|
||||
Usage:
|
||||
python pipeline.py <pdf_path>
|
||||
|
||||
Flow:
|
||||
1. POST /file-urls/batch → get presigned upload URL + batch_id
|
||||
2. PUT <pdf> to presigned URL
|
||||
3. Poll GET /extract-results/batch/{batch_id}
|
||||
4. Download & extract ZIP → output/{pdf_stem}/
|
||||
5. Print summary to stdout
|
||||
|
||||
The backend (indexing_service.py) calls this via subprocess and reads
|
||||
output/{pdf_stem}/*_content_list.json for downstream KG construction.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Directory that holds this script; the .env file and the output tree live here.
_SCRIPT_DIR = Path(__file__).parent

API_BASE = "https://mineru.net/api/v4"
POLL_INTERVAL = 5  # seconds between status checks
MAX_WAIT = 600  # max total wait time (seconds) — matches backend timeout
OUTPUT_ROOT = _SCRIPT_DIR / "output"

# Values from the .env file override variables already set in the environment.
load_dotenv(_SCRIPT_DIR / ".env", override=True)
TOKEN = os.getenv("MINERU_API_TOKEN", "")

# Without a token no API call can succeed, so abort before doing anything else.
if not TOKEN:
    print("ERROR: MINERU_API_TOKEN not set in mineru_mvp/.env", file=sys.stderr)
    sys.exit(1)

# Headers sent with every JSON API call (not with the presigned upload).
HEADERS = {
    "Authorization": "Bearer " + TOKEN,
    "Content-Type": "application/json",
}
|
||||
|
||||
|
||||
# ── Pipeline Steps ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _request(method: str, url: str, **kwargs: Any) -> dict:
    """Send an HTTP request and return the decoded JSON body.

    Args:
        method: HTTP verb passed to ``requests.request``.
        url: Absolute URL of the endpoint.
        **kwargs: Extra keyword arguments forwarded to ``requests.request``.
            A ``timeout`` of 30 seconds is supplied when the caller gives none.

    Returns:
        The parsed JSON response body.

    Raises:
        requests.HTTPError: If the response status indicates an HTTP error.
        RuntimeError: If the body is not a JSON object, or its ``code`` field
            is not ``0``.
    """
    # requests never times out on its own; without a timeout a stalled
    # connection would block the whole pipeline indefinitely.
    kwargs.setdefault("timeout", 30)

    resp = requests.request(method, url, **kwargs)
    resp.raise_for_status()

    body = resp.json()
    if not isinstance(body, dict):
        # Guard the .get() calls below against a list/scalar JSON payload.
        raise RuntimeError(
            f"Unexpected API response type: {type(body).__name__}"
        )
    if body.get("code") != 0:
        raise RuntimeError(f"API error code={body.get('code')} msg={body.get('msg')}")
    return body
|
||||
|
||||
|
||||
def step1_get_upload_url(filename: str) -> tuple[str, str]:
    """Request a presigned upload URL for *filename*.

    Sends ``POST /file-urls/batch`` with formula and table extraction enabled
    and the language set to ``"en"``.

    Args:
        filename: Name of the file to register with the API.

    Returns:
        ``(batch_id, upload_url)`` taken from the response's ``data`` object;
        ``upload_url`` is the first entry of ``file_urls``.
    """
    print(f"[1/5] Requesting presigned upload URL for: {filename}")

    endpoint = f"{API_BASE}/file-urls/batch"
    payload = {
        "files": [{"name": filename}],
        "enable_formula": True,
        "enable_table": True,
        "language": "en",
    }
    response_data = _request("POST", endpoint, headers=HEADERS, json=payload)["data"]

    # Only one file is submitted, so only the first URL is relevant.
    batch_id: str = response_data["batch_id"]
    upload_url: str = response_data["file_urls"][0]

    print(f" batch_id: {batch_id}")
    return batch_id, upload_url
|
||||
|
||||
|
||||
def step2_upload(upload_url: str, pdf_path: Path) -> None:
    """PUT the file to the presigned URL — MUST NOT include Content-Type header.

    Args:
        upload_url: Presigned URL returned by ``step1_get_upload_url``.
        pdf_path: Local path of the file to upload.

    Raises:
        RuntimeError: If the server answers with a non-success status.
        requests.Timeout: If connecting takes longer than 10 s or the
            connection stalls for more than 300 s.
    """
    print(f"[2/5] Uploading file ({pdf_path.stat().st_size / 1024:.0f} KB)...")

    with open(pdf_path, "rb") as f:
        # No headers are passed, so no Content-Type is sent. The
        # (connect, read) timeout keeps a dead connection from hanging the
        # pipeline forever.
        resp = requests.put(upload_url, data=f, timeout=(10, 300))

    if not resp.ok:
        raise RuntimeError(f"Upload failed: HTTP {resp.status_code} — {resp.text[:300]}")
    print(" Upload complete.")
|
||||
|
||||
|
||||
def step3_poll(batch_id: str) -> dict:
    """Poll ``GET /extract-results/batch/{batch_id}`` until done or failed.

    The first entry of ``data.extract_result`` is inspected on every poll and
    each state change is printed once.

    Args:
        batch_id: Batch identifier returned by ``step1_get_upload_url``.

    Returns:
        ``{"zip_url": ..., "pages": ..., "total": ...}``; ``pages`` and
        ``total`` are ``"?"`` when the response carries no progress info.

    Raises:
        TimeoutError: If no terminal state is reached within ``MAX_WAIT``
            seconds.
        RuntimeError: If the reported state is ``"failed"``.
    """
    print(f"[3/5] Waiting for parsing to complete (polling every {POLL_INTERVAL}s)...")

    # A monotonic clock is immune to wall-clock adjustments (NTP, DST), which
    # could otherwise shorten or extend the wait.
    deadline = time.monotonic() + MAX_WAIT
    last_state = ""

    while True:
        if time.monotonic() > deadline:
            raise TimeoutError(f"Parsing timed out after {MAX_WAIT}s")

        body = _request(
            "GET",
            f"{API_BASE}/extract-results/batch/{batch_id}",
            headers=HEADERS,
        )

        results = body["data"]["extract_result"]
        if not results:
            # NOTE(review): an empty list is treated as "result not available
            # yet" and polling continues until the deadline — confirm against
            # the API's actual behaviour.
            time.sleep(POLL_INTERVAL)
            continue

        item = results[0]
        state: str = item["state"]

        if state != last_state:
            print(f" state: {state}")
            last_state = state

        if state == "done":
            zip_url: str = item["full_zip_url"]
            # `or {}` also covers the key being present with a null value.
            progress = item.get("extract_progress") or {}
            pages = progress.get("extracted_pages", "?")
            total = progress.get("total_pages", "?")
            print(f" Parsing done — {pages}/{total} pages extracted.")
            return {"zip_url": zip_url, "pages": pages, "total": total}

        if state == "failed":
            err = item.get("err_msg", "unknown error")
            raise RuntimeError(f"Parsing failed: {err}")

        time.sleep(POLL_INTERVAL)
|
||||
|
||||
|
||||
def step4_download_and_extract(zip_url: str, output_dir: Path) -> list[str]:
    """Download the result ZIP and extract it into *output_dir*.

    Args:
        zip_url: URL of the ZIP archive.
        output_dir: Destination directory; created (with parents) if missing.

    Returns:
        Sorted names of the non-hidden entries directly inside *output_dir*
        after extraction (files and directories alike).

    Raises:
        requests.HTTPError: If the download returns an HTTP error status.
        requests.Timeout: If connecting takes longer than 10 s or the
            transfer stalls for more than 120 s.
        zipfile.BadZipFile: If the downloaded payload is not a valid ZIP.
    """
    print("[4/5] Downloading & extracting results...")

    # requests has no default timeout; bound the wait so a stalled download
    # cannot hang the pipeline.
    resp = requests.get(zip_url, timeout=(10, 120))
    resp.raise_for_status()

    output_dir.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        zf.extractall(str(output_dir))

    files = sorted(
        entry.name
        for entry in output_dir.iterdir()
        if not entry.name.startswith(".")
    )
    for name in files:
        fpath = output_dir / name
        # Directories are returned but not listed with a size.
        if fpath.is_file():
            print(f" {name} ({fpath.stat().st_size / 1024:.0f} KB)")

    return files
|
||||
|
||||
|
||||
def step5_summary(output_dir: Path) -> None:
    """Print a summary of the parsed output found in *output_dir*.

    Looks for ``*_content_list.json`` directly inside *output_dir*, loads it
    and prints the file name, the number of blocks, the number of distinct
    ``page_idx`` values and a per-``type`` block count (sorted by type).
    When no such file exists a warning is written to stderr and nothing else
    is printed.

    Args:
        output_dir: Directory holding the extracted results.
    """
    print("[5/5] Summary")

    # glob() yields entries in filesystem order; sort so the same file is
    # picked on every run when more than one matches.
    matches = sorted(output_dir.glob("*_content_list.json"))
    if not matches:
        print(" WARNING: No content_list.json found!", file=sys.stderr)
        return

    content_list_path = matches[0]
    with open(content_list_path, "r", encoding="utf-8") as f:
        content_list = json.load(f)

    # Count block types and collect the distinct page indices.
    type_counts: dict[str, int] = {}
    pages: set[int] = set()
    for block in content_list:
        block_type = block.get("type", "unknown")
        type_counts[block_type] = type_counts.get(block_type, 0) + 1
        # Blocks without a page index are attributed to page 0.
        pages.add(block.get("page_idx", 0))

    print(f" File: {content_list_path.name}")
    print(f" Blocks: {len(content_list)}")
    print(f" Pages: {len(pages)}")
    for block_type, count in sorted(type_counts.items()):
        print(f" {block_type}: {count}")
|
||||
|
||||
|
||||
# ── Main ────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the five pipeline steps in order.

    Expects one positional argument, the path of the PDF to parse. Results
    are extracted to ``OUTPUT_ROOT / <pdf stem>``.

    Exits with status 1 (message on stderr) when the path is not an existing
    regular file or when any pipeline step raises.
    """
    parser = argparse.ArgumentParser(description="MinerU document parsing pipeline")
    parser.add_argument("pdf_path", help="Path to the PDF file to parse")
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path).resolve()
    # is_file() rather than exists(): a directory would pass exists() and only
    # fail later, after the upload URL has already been requested.
    if not pdf_path.is_file():
        print(f"ERROR: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    stem = pdf_path.stem
    output_dir = OUTPUT_ROOT / stem

    try:
        batch_id, upload_url = step1_get_upload_url(pdf_path.name)
        step2_upload(upload_url, pdf_path)
        result = step3_poll(batch_id)
        step4_download_and_extract(result["zip_url"], output_dir)
        step5_summary(output_dir)
        print(f"\nDone. Output: {output_dir}")
    except Exception as exc:
        # Top-level boundary: report any failure on stderr and signal it to
        # the calling process through the exit status.
        print(f"\nERROR: {exc}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
# Script entry point: run the pipeline only when this file is executed directly.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user