feat: 新增 mineru_mvp 文档解析组件并适配 Linux 路径

This commit is contained in:
2026-06-11 11:24:12 +08:00
parent 4c51d8ce7f
commit 91a7e22840
5 changed files with 297 additions and 3 deletions

3
mineru_mvp/.env.example Normal file
View File

@@ -0,0 +1,3 @@
# MinerU Cloud API Token
# Get yours at: https://mineru.net/apiManage/token
MINERU_API_TOKEN=your_mineru_api_token_here

64
mineru_mvp/CLAUDE.md Normal file
View File

@@ -0,0 +1,64 @@
# MinerU MVP — 文档解析组件
## 路径
```
GraphRAGAgent/mineru_mvp/
```
## 功能
通过 MinerU Cloud API 将 PDF/DOCX 等文档解析为结构化 JSON(`content_list.json`),供后端索引流水线消费。
## 安装
```bash
cd mineru_mvp
uv venv --python 3.12
source .venv/bin/activate # Linux / macOS
# .venv\Scripts\activate # Windows
uv pip install -r requirements.txt
```
## 配置
复制 `.env.example` 为 `.env`,填入 MinerU API Token:
```env
MINERU_API_TOKEN=your_token_here
```
Token 获取地址:https://mineru.net/apiManage/token
## 使用
```bash
# 激活 venv 后(或直接指定解释器路径):
python pipeline.py /path/to/document.pdf
# 或由 backend 通过 subprocess 调用:
/path/to/mineru_mvp/.venv/bin/python /path/to/mineru_mvp/pipeline.py /path/to/document.pdf
```
## 输出
解析结果输出到 `output/{文件名}/` 目录:
```
output/
└── {pdf_stem}/
├── {uuid}_content_list.json ← 核心产物,供 backend 读取
├── full.md
├── {uuid}_origin.pdf
├── layout.json
└── images/
└── {hash}.jpg
```
## 流水线步骤
1. POST `/file-urls/batch` — 获取预签名上传 URL
2. PUT 文件到预签名 URL(不带 Content-Type)
3. 轮询 GET `/extract-results/batch/{batch_id}`
4. 下载 ZIP → 解压到 `output/`
5. 打印摘要到 stdout

221
mineru_mvp/pipeline.py Normal file
View File

@@ -0,0 +1,221 @@
#!/usr/bin/env python3
"""
MinerU MVP — Document Parsing Pipeline (Cloud API)
Usage:
python pipeline.py <pdf_path>
Flow:
1. POST /file-urls/batch → get presigned upload URL + batch_id
2. PUT <pdf> to presigned URL
3. Poll GET /extract-results/batch/{batch_id}
4. Download & extract ZIP → output/{pdf_stem}/
5. Print summary to stdout
The backend (indexing_service.py) calls this via subprocess and reads
output/{pdf_stem}/*_content_list.json for downstream KG construction.
"""
from __future__ import annotations
import argparse
import io
import json
import os
import sys
import time
import zipfile
from pathlib import Path
from typing import Any
import requests
from dotenv import load_dotenv
# ── Config ──────────────────────────────────────────────────────────────────────
# Settings come from the .env file that sits beside this script; override=True
# means values in that file win over variables already set in the environment.
_HERE = Path(__file__).parent
load_dotenv(_HERE / ".env", override=True)

API_BASE = "https://mineru.net/api/v4"
TOKEN = os.getenv("MINERU_API_TOKEN", "")
POLL_INTERVAL = 5  # seconds between status checks
MAX_WAIT = 600  # max total wait time (seconds) — matches backend timeout

# Without a token no API call can be authorised, so stop immediately.
if not TOKEN:
    print("ERROR: MINERU_API_TOKEN not set in mineru_mvp/.env", file=sys.stderr)
    sys.exit(1)

# Headers sent with every JSON API call made through _request().
HEADERS = {
    "Authorization": "Bearer " + TOKEN,
    "Content-Type": "application/json",
}

# Parsed results are written below <script dir>/output/.
OUTPUT_ROOT = _HERE / "output"
# ── Pipeline Steps ──────────────────────────────────────────────────────────────
def _request(method: str, url: str, **kwargs: Any) -> dict:
    """Send an HTTP request and return the decoded JSON body.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        url: Absolute URL to call.
        **kwargs: Forwarded to ``requests.request``. A ``timeout`` of 30
            seconds is applied unless the caller supplies its own.

    Returns:
        The parsed JSON response body.

    Raises:
        requests.HTTPError: The server answered with an error status.
        RuntimeError: The body's ``code`` field is not ``0``.
    """
    # requests never times out by default, so one stalled connection would
    # hang the whole pipeline. Callers may still pass their own timeout.
    kwargs.setdefault("timeout", 30)
    resp = requests.request(method, url, **kwargs)
    resp.raise_for_status()
    body: dict = resp.json()
    if body.get("code") != 0:
        raise RuntimeError(f"API error code={body.get('code')} msg={body.get('msg')}")
    return body
def step1_get_upload_url(filename: str) -> tuple[str, str]:
    """Register *filename* via POST /file-urls/batch.

    Returns:
        A ``(batch_id, upload_url)`` pair taken from the response's ``data``
        object; the upload URL is the first entry of ``file_urls``.
    """
    print(f"[1/5] Requesting presigned upload URL for: {filename}")
    endpoint = f"{API_BASE}/file-urls/batch"
    payload = {
        "files": [{"name": filename}],
        "enable_formula": True,
        "enable_table": True,
        "language": "en",
    }
    response = _request("POST", endpoint, headers=HEADERS, json=payload)
    data = response["data"]
    batch_id: str = data["batch_id"]
    # Only one file was submitted, so only the first URL is used.
    upload_url: str = data["file_urls"][0]
    print(f" batch_id: {batch_id}")
    return batch_id, upload_url
def step2_upload(upload_url: str, pdf_path: Path) -> None:
    """PUT the file to the presigned URL — MUST NOT include Content-Type header.

    Args:
        upload_url: Presigned URL to upload to.
        pdf_path: Local file to upload; it is streamed from disk.

    Raises:
        RuntimeError: The upload response has a non-OK status.
    """
    print(f"[2/5] Uploading file ({pdf_path.stat().st_size / 1024:.0f} KB)...")
    with open(pdf_path, "rb") as f:
        # No headers argument = no Content-Type. The (connect, read) timeout
        # keeps a stalled connection from hanging the pipeline forever.
        resp = requests.put(upload_url, data=f, timeout=(10, 300))
    if not resp.ok:
        # Separate status code and body so the message stays readable.
        raise RuntimeError(
            f"Upload failed: HTTP {resp.status_code}: {resp.text[:300]}"
        )
    print(" Upload complete.")
def step3_poll(batch_id: str) -> dict:
    """Poll GET /extract-results/batch/{batch_id} until done or failed.

    Args:
        batch_id: Batch identifier to query.

    Returns:
        A dict with keys ``zip_url``, ``pages`` and ``total``. The page
        counts fall back to ``"?"`` when the API does not report them.

    Raises:
        TimeoutError: More than ``MAX_WAIT`` seconds elapsed.
        RuntimeError: The API reported state ``"failed"``.
    """
    print(f"[3/5] Waiting for parsing to complete (polling every {POLL_INTERVAL}s)...")
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    started = time.monotonic()
    last_state = ""
    while True:
        elapsed = time.monotonic() - started
        if elapsed > MAX_WAIT:
            raise TimeoutError(f"Parsing timed out after {MAX_WAIT}s")

        body = _request(
            "GET",
            f"{API_BASE}/extract-results/batch/{batch_id}",
            headers=HEADERS,
            # Bound each poll so a hung request cannot defeat MAX_WAIT.
            timeout=30,
        )
        item = body["data"]["extract_result"][0]
        state: str = item["state"]

        # Only log state transitions, not every poll.
        if state != last_state:
            print(f" state: {state}")
            last_state = state

        if state == "done":
            zip_url: str = item["full_zip_url"]
            # "or {}" also covers an explicit null value for the key.
            progress = item.get("extract_progress") or {}
            pages = progress.get("extracted_pages", "?")
            total = progress.get("total_pages", "?")
            print(f" Parsing done — {pages}/{total} pages extracted.")
            return {"zip_url": zip_url, "pages": pages, "total": total}

        if state == "failed":
            err = item.get("err_msg", "unknown error")
            raise RuntimeError(f"Parsing failed: {err}")

        time.sleep(POLL_INTERVAL)
def step4_download_and_extract(zip_url: str, output_dir: Path) -> list[str]:
    """Download the result ZIP and extract it into *output_dir*.

    Args:
        zip_url: URL of the ZIP archive to download.
        output_dir: Destination directory; created if missing.

    Returns:
        Sorted names of the non-hidden entries directly inside *output_dir*
        (files and sub-directories alike).

    Raises:
        requests.HTTPError: The download answered with an error status.
    """
    print("[4/5] Downloading & extracting results...")
    # Bound the download so a stalled connection cannot hang the pipeline.
    resp = requests.get(zip_url, timeout=(10, 300))
    resp.raise_for_status()

    output_dir.mkdir(parents=True, exist_ok=True)
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        zf.extractall(str(output_dir))

    files = sorted(
        entry.name
        for entry in output_dir.iterdir()
        if not entry.name.startswith(".")
    )
    for name in files:
        fpath = output_dir / name
        # Sizes are printed for regular files only; directories are skipped.
        if fpath.is_file():
            print(f" {name} ({fpath.stat().st_size / 1024:.0f} KB)")
    return files
def step5_summary(output_dir: Path) -> None:
    """Print a summary of the parsed output found in *output_dir*.

    Looks for ``*_content_list.json`` files and reports on the most recently
    modified one: its name, the number of blocks, the number of distinct
    ``page_idx`` values, and a per-``type`` block count. If no such file
    exists, a warning is written to stderr and nothing else is printed.

    Args:
        output_dir: Directory holding the extracted parsing results.
    """
    print("[5/5] Summary")

    matches = list(output_dir.glob("*_content_list.json"))
    if not matches:
        print(" WARNING: No content_list.json found!", file=sys.stderr)
        return

    # glob order is arbitrary, and earlier runs into the same directory can
    # leave other *_content_list.json files behind. Take the newest one,
    # with the name as a tie-breaker so the choice is deterministic.
    content_list_path = max(matches, key=lambda p: (p.stat().st_mtime, p.name))
    with open(content_list_path, "r", encoding="utf-8") as f:
        content_list = json.load(f)

    # Tally blocks per type and collect the distinct page indices.
    type_counts: dict[str, int] = {}
    pages: set[int] = set()
    for block in content_list:
        t = block.get("type", "unknown")
        type_counts[t] = type_counts.get(t, 0) + 1
        pages.add(block.get("page_idx", 0))

    print(f" File: {content_list_path.name}")
    print(f" Blocks: {len(content_list)}")
    print(f" Pages: {len(pages)}")
    for t, c in sorted(type_counts.items()):
        print(f" {t}: {c}")
# ── Main ────────────────────────────────────────────────────────────────────────
def main() -> None:
    """Parse the command line and run the five pipeline steps in order.

    Exits with status 1 when the input path does not exist, is not a regular
    file, or any pipeline step raises. Results are written to
    ``OUTPUT_ROOT/<file stem>/``.
    """
    parser = argparse.ArgumentParser(description="MinerU document parsing pipeline")
    parser.add_argument("pdf_path", help="Path to the PDF file to parse")
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path).resolve()
    if not pdf_path.exists():
        print(f"ERROR: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)
    # Reject directories and other non-files up front, before any API call
    # is made on their behalf.
    if not pdf_path.is_file():
        print(f"ERROR: Not a regular file: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    stem = pdf_path.stem
    output_dir = OUTPUT_ROOT / stem

    # Top-level boundary: report any failure on stderr and exit non-zero.
    try:
        batch_id, upload_url = step1_get_upload_url(pdf_path.name)
        step2_upload(upload_url, pdf_path)
        result = step3_poll(batch_id)
        step4_download_and_extract(result["zip_url"], output_dir)
        step5_summary(output_dir)
        print(f"\nDone. Output: {output_dir}")
    except Exception as exc:
        print(f"\nERROR: {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,2 @@
requests>=2.28
python-dotenv>=1.0