feat: 新增 mineru_mvp 文档解析组件并适配 Linux 路径
This commit is contained in:
@@ -5,6 +5,10 @@ DEEPSEEK_BASE_URL=https://api.deepseek.com
|
|||||||
# MinerU (required for document parsing)
|
# MinerU (required for document parsing)
|
||||||
MINERU_API_TOKEN=your_mineru_api_token_here
|
MINERU_API_TOKEN=your_mineru_api_token_here
|
||||||
|
|
||||||
# MinerU venv path (absolute path to python.exe)
|
# MinerU venv path (absolute path to the python interpreter in mineru_mvp's venv)
|
||||||
MINERU_PYTHON=F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe
|
# Linux: /home/user/GraphRAGAgent/mineru_mvp/.venv/bin/python
|
||||||
MINERU_PIPELINE=F:/GraphRAGAgent/mineru_mvp/pipeline.py
|
# Windows: F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe
|
||||||
|
MINERU_PYTHON=/root/projects/GraphRAGAgent/mineru_mvp/.venv/bin/python
|
||||||
|
# Linux: /home/user/GraphRAGAgent/mineru_mvp/pipeline.py
|
||||||
|
# Windows: F:/GraphRAGAgent/mineru_mvp/pipeline.py
|
||||||
|
MINERU_PIPELINE=/root/projects/GraphRAGAgent/mineru_mvp/pipeline.py
|
||||||
|
|||||||
3
mineru_mvp/.env.example
Normal file
3
mineru_mvp/.env.example
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
# MinerU Cloud API Token
|
||||||
|
# Get yours at: https://mineru.net/apiManage/token
|
||||||
|
MINERU_API_TOKEN=your_mineru_api_token_here
|
||||||
64
mineru_mvp/CLAUDE.md
Normal file
64
mineru_mvp/CLAUDE.md
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
# MinerU MVP — 文档解析组件
|
||||||
|
|
||||||
|
## 路径
|
||||||
|
|
||||||
|
```
|
||||||
|
GraphRAGAgent/mineru_mvp/
|
||||||
|
```
|
||||||
|
|
||||||
|
## 功能
|
||||||
|
|
||||||
|
通过 MinerU Cloud API 将 PDF/DOCX 等文档解析为结构化 JSON(`content_list.json`),供后端索引流水线消费。
|
||||||
|
|
||||||
|
## 安装
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd mineru_mvp
|
||||||
|
uv venv --python 3.12
|
||||||
|
source .venv/bin/activate # Linux / macOS
|
||||||
|
# .venv\Scripts\activate # Windows
|
||||||
|
uv pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
## 配置
|
||||||
|
|
||||||
|
复制 `.env.example` 为 `.env`,填入 MinerU API Token:
|
||||||
|
|
||||||
|
```env
|
||||||
|
MINERU_API_TOKEN=your_token_here
|
||||||
|
```
|
||||||
|
|
||||||
|
Token 获取地址:https://mineru.net/apiManage/token
|
||||||
|
|
||||||
|
## 使用
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 激活 venv 后(或直接指定解释器路径):
|
||||||
|
python pipeline.py /path/to/document.pdf
|
||||||
|
|
||||||
|
# 或由 backend 通过 subprocess 调用:
|
||||||
|
/path/to/mineru_mvp/.venv/bin/python /path/to/mineru_mvp/pipeline.py /path/to/document.pdf
|
||||||
|
```
|
||||||
|
|
||||||
|
## 输出
|
||||||
|
|
||||||
|
解析结果输出到 `output/{文件名}/` 目录:
|
||||||
|
|
||||||
|
```
|
||||||
|
output/
|
||||||
|
└── {pdf_stem}/
|
||||||
|
├── {uuid}_content_list.json ← 核心产物,供 backend 读取
|
||||||
|
├── full.md
|
||||||
|
├── {uuid}_origin.pdf
|
||||||
|
├── layout.json
|
||||||
|
└── images/
|
||||||
|
└── {hash}.jpg
|
||||||
|
```
|
||||||
|
|
||||||
|
## 流水线步骤
|
||||||
|
|
||||||
|
1. POST `/file-urls/batch` — 获取预签名上传 URL
|
||||||
|
2. PUT 文件到预签名 URL(不带 Content-Type)
|
||||||
|
3. 轮询 GET `/extract-results/batch/{batch_id}`
|
||||||
|
4. 下载 ZIP → 解压到 `output/`
|
||||||
|
5. 打印摘要到 stdout
|
||||||
221
mineru_mvp/pipeline.py
Normal file
221
mineru_mvp/pipeline.py
Normal file
@@ -0,0 +1,221 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
MinerU MVP — Document Parsing Pipeline (Cloud API)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python pipeline.py <pdf_path>
|
||||||
|
|
||||||
|
Flow:
|
||||||
|
1. POST /file-urls/batch → get presigned upload URL + batch_id
|
||||||
|
2. PUT <pdf> to presigned URL
|
||||||
|
3. Poll GET /extract-results/batch/{batch_id}
|
||||||
|
4. Download & extract ZIP → output/{pdf_stem}/
|
||||||
|
5. Print summary to stdout
|
||||||
|
|
||||||
|
The backend (indexing_service.py) calls this via subprocess and reads
|
||||||
|
output/{pdf_stem}/*_content_list.json for downstream KG construction.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# ── Config ──────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Read mineru_mvp/.env from the directory that holds this script.
# override=True lets values in that file replace variables that are already
# present in the process environment.
load_dotenv(Path(__file__).parent / ".env", override=True)

# Base URL shared by every MinerU API call below.
API_BASE = "https://mineru.net/api/v4"

# Polling cadence and overall budget used while waiting for parsing.
POLL_INTERVAL = 5  # seconds between status checks
MAX_WAIT = 600  # max total wait time (seconds) — matches backend timeout

# Bearer token for the MinerU API; the script refuses to start without it.
TOKEN = os.environ.get("MINERU_API_TOKEN", "")
if not TOKEN:
    print("ERROR: MINERU_API_TOKEN not set in mineru_mvp/.env", file=sys.stderr)
    raise SystemExit(1)

# Headers sent with the JSON API requests (not with the presigned upload).
HEADERS = {
    "Authorization": f"Bearer {TOKEN}",
    "Content-Type": "application/json",
}

# Each parsed document gets its own sub-directory below this folder.
OUTPUT_ROOT = Path(__file__).parent / "output"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Pipeline Steps ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _request(method: str, url: str, **kwargs: Any) -> dict:
    """Send one HTTP request to the MinerU API and return its JSON body.

    Args:
        method: HTTP verb such as ``"GET"`` or ``"POST"``.
        url: Absolute URL to call.
        **kwargs: Forwarded unchanged to ``requests.request``. When the caller
            does not pass ``timeout``, a (connect, read) timeout of
            (10, 60) seconds is applied so a stalled connection cannot block
            the pipeline forever.

    Returns:
        The decoded JSON object, guaranteed to carry ``code == 0``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status (via ``raise_for_status``).
        RuntimeError: If the body is not a JSON object or its ``code`` field
            is not ``0``.
    """
    # requests never times out by default; without this a dead socket would
    # hang the process regardless of MAX_WAIT.
    kwargs.setdefault("timeout", (10, 60))

    resp = requests.request(method, url, **kwargs)
    resp.raise_for_status()

    body = resp.json()
    if not isinstance(body, dict):
        # Report an unexpected payload shape clearly instead of failing
        # later with an AttributeError on ``.get``.
        raise RuntimeError(
            f"API returned unexpected payload of type {type(body).__name__}"
        )
    if body.get("code") != 0:
        raise RuntimeError(f"API error code={body.get('code')} msg={body.get('msg')}")
    return body
|
||||||
|
|
||||||
|
|
||||||
|
def step1_get_upload_url(filename: str) -> tuple[str, str]:
    """Register *filename* via POST /file-urls/batch.

    Args:
        filename: Name of the file that will be uploaded.

    Returns:
        A ``(batch_id, upload_url)`` pair taken from the response's ``data``
        object; ``upload_url`` is the first entry of ``file_urls``.
    """
    print(f"[1/5] Requesting presigned upload URL for: {filename}")

    # Request payload: one file, formula and table extraction switched on.
    payload = {
        "files": [{"name": filename}],
        "enable_formula": True,
        "enable_table": True,
        "language": "en",
    }
    endpoint = f"{API_BASE}/file-urls/batch"
    response = _request("POST", endpoint, headers=HEADERS, json=payload)

    result = response["data"]
    batch_id: str = result["batch_id"]
    upload_url: str = result["file_urls"][0]

    print(f" batch_id: {batch_id}")
    return batch_id, upload_url
|
||||||
|
|
||||||
|
|
||||||
|
def step2_upload(upload_url: str, pdf_path: Path) -> None:
    """PUT the file to the presigned URL — MUST NOT include Content-Type header.

    Args:
        upload_url: Presigned URL returned by ``step1_get_upload_url``.
        pdf_path: Local file to upload; it is streamed from disk.

    Raises:
        requests.RequestException: On connection errors or timeouts.
        RuntimeError: If the server answers with a non-success status.
    """
    size_kb = pdf_path.stat().st_size / 1024
    print(f"[2/5] Uploading file ({size_kb:.0f} KB)...")

    with open(pdf_path, "rb") as f:
        # No headers argument, so no Content-Type is sent. The explicit
        # (connect, read) timeout keeps a stalled upload from hanging the
        # pipeline, since requests applies no timeout by default.
        resp = requests.put(upload_url, data=f, timeout=(10, 300))

    if not resp.ok:
        raise RuntimeError(f"Upload failed: HTTP {resp.status_code} — {resp.text[:300]}")
    print(" Upload complete.")
|
||||||
|
|
||||||
|
|
||||||
|
def step3_poll(batch_id: str) -> dict:
    """Poll GET /extract-results/batch/{batch_id} until done or failed.

    Args:
        batch_id: Identifier returned by ``step1_get_upload_url``.

    Returns:
        A dict with keys ``zip_url``, ``pages`` and ``total``. The page
        counters fall back to ``"?"`` when the API does not report them.

    Raises:
        TimeoutError: If more than ``MAX_WAIT`` seconds pass without a
            terminal state.
        RuntimeError: If the API reports the ``failed`` state.
    """
    print(f"[3/5] Waiting for parsing to complete (polling every {POLL_INTERVAL}s)...")

    # A monotonic clock keeps the deadline correct even if the system wall
    # clock is adjusted while we wait.
    started = time.monotonic()
    last_state = ""

    while True:
        if time.monotonic() - started > MAX_WAIT:
            raise TimeoutError(f"Parsing timed out after {MAX_WAIT}s")

        body = _request(
            "GET",
            f"{API_BASE}/extract-results/batch/{batch_id}",
            headers=HEADERS,
        )

        results = body["data"]["extract_result"]
        if not results:
            # NOTE(review): an empty result list is treated as "not ready
            # yet" rather than crashing with IndexError — confirm against
            # the API's behaviour. The wait is still bounded by MAX_WAIT.
            time.sleep(POLL_INTERVAL)
            continue

        item = results[0]
        state: str = item["state"]

        # Only log state transitions so the output stays readable.
        if state != last_state:
            print(f" state: {state}")
            last_state = state

        if state == "done":
            zip_url: str = item["full_zip_url"]
            # ``or {}`` also covers an explicit null value for the key.
            progress = item.get("extract_progress") or {}
            pages = progress.get("extracted_pages", "?")
            total = progress.get("total_pages", "?")
            print(f" Parsing done — {pages}/{total} pages extracted.")
            return {"zip_url": zip_url, "pages": pages, "total": total}

        if state == "failed":
            err = item.get("err_msg", "unknown error")
            raise RuntimeError(f"Parsing failed: {err}")

        time.sleep(POLL_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
def step4_download_and_extract(zip_url: str, output_dir: Path) -> list[str]:
    """Download the result ZIP and unpack it into *output_dir*.

    Args:
        zip_url: URL of the result archive reported by ``step3_poll``.
        output_dir: Destination directory; created if it does not exist.

    Returns:
        Sorted names of the entries directly inside *output_dir* whose name
        does not start with a dot (files and sub-directories alike).

    Raises:
        requests.RequestException: On connection errors, timeouts, or an
            HTTP error status.
        zipfile.BadZipFile: If the downloaded payload is not a ZIP archive.
    """
    print("[4/5] Downloading & extracting results...")

    # requests applies no timeout by default; bound the download so a
    # stalled transfer cannot hang the pipeline.
    resp = requests.get(zip_url, timeout=(10, 300))
    resp.raise_for_status()

    output_dir.mkdir(parents=True, exist_ok=True)

    # The archive is unpacked straight from memory.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        zf.extractall(str(output_dir))

    files = sorted(
        entry.name for entry in output_dir.iterdir() if not entry.name.startswith(".")
    )
    for name in files:
        entry = output_dir / name
        # Sizes are printed for regular files only; directories are listed
        # in the return value but not printed.
        if entry.is_file():
            print(f" {name} ({entry.stat().st_size / 1024:.0f} KB)")

    return files
|
||||||
|
|
||||||
|
|
||||||
|
def step5_summary(output_dir: Path) -> None:
    """Print a short summary of the parsed output to stdout.

    Looks for ``*_content_list.json`` in *output_dir*, then prints the file
    name, the number of blocks, the number of distinct ``page_idx`` values
    and a per-type block count. A warning goes to stderr when no such file
    exists.

    Args:
        output_dir: Directory that the result archive was extracted into.
    """
    # Local import keeps this block self-contained.
    from collections import Counter

    print("[5/5] Summary")

    candidates = list(output_dir.glob("*_content_list.json"))
    if not candidates:
        print(" WARNING: No content_list.json found!", file=sys.stderr)
        return

    # glob() order is arbitrary. If several content lists are present, pick
    # the most recently written one, with the name as a tie-breaker, so the
    # summary is deterministic and describes the latest extraction.
    content_list_path = max(candidates, key=lambda p: (p.stat().st_mtime, p.name))
    with open(content_list_path, "r", encoding="utf-8") as f:
        content_list = json.load(f)

    # Tally block types and collect the distinct page indices.
    type_counts = Counter(block.get("type", "unknown") for block in content_list)
    pages = {block.get("page_idx", 0) for block in content_list}

    print(f" File: {content_list_path.name}")
    print(f" Blocks: {len(content_list)}")
    print(f" Pages: {len(pages)}")
    for block_type, count in sorted(type_counts.items()):
        print(f" {block_type}: {count}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main ────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: parse one document through the pipeline.

    Reads the document path from ``argv``, runs the five pipeline steps in
    order and writes the results below ``OUTPUT_ROOT / <file stem>``. Exits
    with status 1 if the input is missing, is not a regular file, or any
    step raises.
    """
    parser = argparse.ArgumentParser(description="MinerU document parsing pipeline")
    parser.add_argument("pdf_path", help="Path to the PDF file to parse")
    args = parser.parse_args()

    pdf_path = Path(args.pdf_path).resolve()
    if not pdf_path.exists():
        print(f"ERROR: File not found: {pdf_path}", file=sys.stderr)
        sys.exit(1)
    if not pdf_path.is_file():
        # Fail fast: a directory passes exists() and would otherwise only
        # blow up at open() after an upload URL has already been requested.
        print(f"ERROR: Not a regular file: {pdf_path}", file=sys.stderr)
        sys.exit(1)

    stem = pdf_path.stem
    output_dir = OUTPUT_ROOT / stem

    try:
        batch_id, upload_url = step1_get_upload_url(pdf_path.name)
        step2_upload(upload_url, pdf_path)
        result = step3_poll(batch_id)
        step4_download_and_extract(result["zip_url"], output_dir)
        step5_summary(output_dir)
        print(f"\nDone. Output: {output_dir}")
    except Exception as exc:
        # Top-level boundary: turn any failure into a one-line stderr
        # message plus a non-zero exit status.
        print(f"\nERROR: {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||||
2
mineru_mvp/requirements.txt
Normal file
2
mineru_mvp/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
requests>=2.28
|
||||||
|
python-dotenv>=1.0
|
||||||
Reference in New Issue
Block a user