diff --git a/backend/.env.example b/backend/.env.example index d7a1c42..070c7bc 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -5,6 +5,10 @@ DEEPSEEK_BASE_URL=https://api.deepseek.com # MinerU (required for document parsing) MINERU_API_TOKEN=your_mineru_api_token_here -# MinerU venv path (absolute path to python.exe) -MINERU_PYTHON=F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe -MINERU_PIPELINE=F:/GraphRAGAgent/mineru_mvp/pipeline.py +# MinerU venv path (absolute path to the python interpreter in mineru_mvp's venv) +# Linux: /home/user/GraphRAGAgent/mineru_mvp/.venv/bin/python +# Windows: F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe +MINERU_PYTHON=/root/projects/GraphRAGAgent/mineru_mvp/.venv/bin/python +# Linux: /home/user/GraphRAGAgent/mineru_mvp/pipeline.py +# Windows: F:/GraphRAGAgent/mineru_mvp/pipeline.py +MINERU_PIPELINE=/root/projects/GraphRAGAgent/mineru_mvp/pipeline.py diff --git a/mineru_mvp/.env.example b/mineru_mvp/.env.example new file mode 100644 index 0000000..27f919a --- /dev/null +++ b/mineru_mvp/.env.example @@ -0,0 +1,3 @@ +# MinerU Cloud API Token +# Get yours at: https://mineru.net/apiManage/token +MINERU_API_TOKEN=your_mineru_api_token_here diff --git a/mineru_mvp/CLAUDE.md b/mineru_mvp/CLAUDE.md new file mode 100644 index 0000000..2964e92 --- /dev/null +++ b/mineru_mvp/CLAUDE.md @@ -0,0 +1,64 @@ +# MinerU MVP — 文档解析组件 + +## 路径 + +``` +GraphRAGAgent/mineru_mvp/ +``` + +## 功能 + +通过 MinerU Cloud API 将 PDF/DOCX 等文档解析为结构化 JSON(`content_list.json`),供后端索引流水线消费。 + +## 安装 + +```bash +cd mineru_mvp +uv venv --python 3.12 +source .venv/bin/activate # Linux / macOS +# .venv\Scripts\activate # Windows +uv pip install -r requirements.txt +``` + +## 配置 + +复制 `.env.example` 为 `.env`,填入 MinerU API Token: + +```env +MINERU_API_TOKEN=your_token_here +``` + +Token 获取地址:https://mineru.net/apiManage/token + +## 使用 + +```bash +# 激活 venv 后(或直接指定解释器路径): +python pipeline.py /path/to/document.pdf + +# 或由 backend 通过 subprocess 调用: 
+/path/to/mineru_mvp/.venv/bin/python /path/to/mineru_mvp/pipeline.py /path/to/document.pdf
+```
+
+## 输出
+
+解析结果输出到 `output/{文件名}/` 目录:
+
+```
+output/
+└── {pdf_stem}/
+    ├── {uuid}_content_list.json ← 核心产物,供 backend 读取
+    ├── full.md
+    ├── {uuid}_origin.pdf
+    ├── layout.json
+    └── images/
+        └── {hash}.jpg
+```
+
+## 流水线步骤
+
+1. POST `/file-urls/batch` — 获取预签名上传 URL
+2. PUT 文件到预签名 URL(不带 Content-Type)
+3. 轮询 GET `/extract-results/batch/{batch_id}`
+4. 下载 ZIP → 解压到 `output/`
+5. 打印摘要到 stdout
diff --git a/mineru_mvp/pipeline.py b/mineru_mvp/pipeline.py
new file mode 100644
index 0000000..fda0176
--- /dev/null
+++ b/mineru_mvp/pipeline.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python3
+"""
+MinerU MVP — Document Parsing Pipeline (Cloud API)
+
+Usage:
+    python pipeline.py PDF_PATH
+
+Flow:
+    1. POST /file-urls/batch → get presigned upload URL + batch_id
+    2. PUT the file to the presigned URL
+    3. Poll GET /extract-results/batch/{batch_id}
+    4. Download & extract ZIP → output/{pdf_stem}/
+    5. Print summary to stdout
+
+The backend (indexing_service.py) calls this via subprocess and reads
+output/{pdf_stem}/*_content_list.json for downstream KG construction.
+"""
+
+from __future__ import annotations
+
+import argparse
+import io
+import json
+import os
+import sys
+import time
+import zipfile
+from collections import Counter
+from pathlib import Path
+from typing import Any
+
+import requests
+from dotenv import load_dotenv
+
+# ── Config ──────────────────────────────────────────────────────────────────────
+
+load_dotenv(Path(__file__).parent / ".env", override=True)
+
+API_BASE = "https://mineru.net/api/v4"
+TOKEN = os.getenv("MINERU_API_TOKEN", "")
+POLL_INTERVAL = 5  # seconds between status checks
+MAX_WAIT = 600  # max total wait time (seconds) — matches backend timeout
+# (connect, read) timeouts in seconds for every HTTP call. Without them
+# `requests` waits forever on a stalled connection, so MAX_WAIT could never fire.
+REQUEST_TIMEOUT = (10, 60)
+
+if not TOKEN:
+    print("ERROR: MINERU_API_TOKEN not set in mineru_mvp/.env", file=sys.stderr)
+    sys.exit(1)
+
+HEADERS = {
+    "Authorization": f"Bearer {TOKEN}",
+    "Content-Type": "application/json",
+}
+
+OUTPUT_ROOT = Path(__file__).parent / "output"
+
+
+# ── Pipeline Steps ──────────────────────────────────────────────────────────────
+
+def _request(method: str, url: str, **kwargs: Any) -> dict:
+    """Send a MinerU API request and return the decoded JSON body.
+
+    Args:
+        method: HTTP verb, e.g. "GET" or "POST".
+        url: Absolute endpoint URL.
+        **kwargs: Passed through to ``requests.request``; ``timeout`` defaults
+            to REQUEST_TIMEOUT when the caller does not supply one.
+
+    Raises:
+        requests.HTTPError: On a 4xx/5xx response.
+        RuntimeError: When the JSON body's ``code`` field is not 0.
+    """
+    kwargs.setdefault("timeout", REQUEST_TIMEOUT)
+    resp = requests.request(method, url, **kwargs)
+    resp.raise_for_status()
+    body: dict = resp.json()
+    if body.get("code") != 0:
+        raise RuntimeError(f"API error code={body.get('code')} msg={body.get('msg')}")
+    return body
+
+
+def step1_get_upload_url(filename: str) -> tuple[str, str]:
+    """POST /file-urls/batch → (batch_id, upload_url).
+
+    Raises:
+        RuntimeError: When the response carries no upload URL.
+    """
+    print(f"[1/5] Requesting presigned upload URL for: {filename}")
+
+    body = _request(
+        "POST",
+        f"{API_BASE}/file-urls/batch",
+        headers=HEADERS,
+        json={
+            "files": [{"name": filename}],
+            "enable_formula": True,
+            "enable_table": True,
+            "language": "en",
+        },
+    )
+
+    data = body["data"]
+    batch_id: str = data["batch_id"]
+    file_urls = data.get("file_urls") or []
+    if not file_urls:
+        # Fail with a readable message instead of a bare IndexError.
+        raise RuntimeError("API returned no upload URL for the file")
+    upload_url: str = file_urls[0]
+    print(f" batch_id: {batch_id}")
+    return batch_id, upload_url
+
+
+def step2_upload(upload_url: str, pdf_path: Path) -> None:
+    """PUT file to presigned URL — MUST NOT include Content-Type header.
+
+    Raises:
+        RuntimeError: When the upload is answered with an error status.
+    """
+    print(f"[2/5] Uploading file ({pdf_path.stat().st_size / 1024:.0f} KB)...")
+
+    with open(pdf_path, "rb") as f:
+        # no headers = no Content-Type
+        resp = requests.put(upload_url, data=f, timeout=REQUEST_TIMEOUT)
+    if not resp.ok:
+        raise RuntimeError(f"Upload failed: HTTP {resp.status_code} — {resp.text[:300]}")
+    print(" Upload complete.")
+
+
+def step3_poll(batch_id: str) -> dict:
+    """Poll GET /extract-results/batch/{batch_id} until done or failed.
+
+    Returns:
+        A dict with keys "zip_url", "pages" and "total"; the page counts
+        fall back to "?" when the API omits progress information.
+
+    Raises:
+        TimeoutError: When parsing has not finished within MAX_WAIT seconds.
+        RuntimeError: When the API reports state "failed".
+    """
+    print(f"[3/5] Waiting for parsing to complete (polling every {POLL_INTERVAL}s)...")
+
+    # Monotonic clock: wall-clock adjustments must not stretch or cut the wait.
+    started = time.monotonic()
+    last_state = ""
+
+    while True:
+        if time.monotonic() - started > MAX_WAIT:
+            raise TimeoutError(f"Parsing timed out after {MAX_WAIT}s")
+
+        body = _request(
+            "GET",
+            f"{API_BASE}/extract-results/batch/{batch_id}",
+            headers=HEADERS,
+        )
+
+        results = body["data"].get("extract_result") or []
+        if not results:
+            # NOTE(review): assumes an empty result list means "not registered
+            # yet" — keep polling until MAX_WAIT instead of raising IndexError.
+            time.sleep(POLL_INTERVAL)
+            continue
+
+        item = results[0]
+        state: str = item["state"]
+
+        if state != last_state:
+            print(f" state: {state}")
+            last_state = state
+
+        if state == "done":
+            zip_url: str = item["full_zip_url"]
+            # `or {}` also covers an explicit null for extract_progress.
+            progress = item.get("extract_progress") or {}
+            pages = progress.get("extracted_pages", "?")
+            total = progress.get("total_pages", "?")
+            print(f" Parsing done — {pages}/{total} pages extracted.")
+            return {"zip_url": zip_url, "pages": pages, "total": total}
+
+        if state == "failed":
+            err = item.get("err_msg", "unknown error")
+            raise RuntimeError(f"Parsing failed: {err}")
+
+        time.sleep(POLL_INTERVAL)
+
+
+def step4_download_and_extract(zip_url: str, output_dir: Path) -> list[str]:
+    """Download ZIP and extract to output/ directory.
+
+    Returns:
+        Sorted names of the non-hidden entries directly inside output_dir.
+    """
+    print("[4/5] Downloading & extracting results...")
+
+    resp = requests.get(zip_url, timeout=REQUEST_TIMEOUT)
+    resp.raise_for_status()
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
+        zf.extractall(output_dir)
+
+    files = sorted(
+        p.name for p in output_dir.iterdir() if not p.name.startswith(".")
+    )
+    for name in files:
+        fpath = output_dir / name
+        if fpath.is_file():
+            print(f" {name} ({fpath.stat().st_size / 1024:.0f} KB)")
+
+    return files
+
+
+def step5_summary(output_dir: Path) -> None:
+    """Print summary of parsed output."""
+    print("[5/5] Summary")
+
+    # Find content_list.json
+    matches = list(output_dir.glob("*_content_list.json"))
+    if not matches:
+        print(" WARNING: No content_list.json found!", file=sys.stderr)
+        return
+
+    # output_dir is reused across runs, so earlier {uuid}_content_list.json
+    # files may still be present; summarize the most recently written one.
+    content_list_path = max(matches, key=lambda p: p.stat().st_mtime)
+    with open(content_list_path, "r", encoding="utf-8") as f:
+        content_list = json.load(f)
+
+    # Count block types
+    type_counts = Counter(block.get("type", "unknown") for block in content_list)
+    pages = {block.get("page_idx", 0) for block in content_list}
+
+    print(f" File: {content_list_path.name}")
+    print(f" Blocks: {len(content_list)}")
+    print(f" Pages: {len(pages)}")
+    for t, c in sorted(type_counts.items()):
+        print(f" {t}: {c}")
+
+
+# ── Main ────────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    """CLI entry point: parse one document and exit non-zero on any failure."""
+    parser = argparse.ArgumentParser(description="MinerU document parsing pipeline")
+    parser.add_argument("pdf_path", help="Path to the PDF file to parse")
+    args = parser.parse_args()
+
+    pdf_path = Path(args.pdf_path).resolve()
+    # is_file() also rejects directories, which exists() would let through.
+    if not pdf_path.is_file():
+        print(f"ERROR: File not found: {pdf_path}", file=sys.stderr)
+        sys.exit(1)
+
+    stem = pdf_path.stem
+    output_dir = OUTPUT_ROOT / stem
+
+    # Top-level boundary: report any failure on stderr and signal it to the
+    # calling subprocess through the exit status.
+    try:
+        batch_id, upload_url = step1_get_upload_url(pdf_path.name)
+        step2_upload(upload_url, pdf_path)
+        result = step3_poll(batch_id)
+        step4_download_and_extract(result["zip_url"], output_dir)
+        step5_summary(output_dir)
+        print(f"\nDone. Output: {output_dir}")
+    except Exception as exc:
+        print(f"\nERROR: {exc}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/mineru_mvp/requirements.txt b/mineru_mvp/requirements.txt
new file mode 100644
index 0000000..0285233
--- /dev/null
+++ b/mineru_mvp/requirements.txt
@@ -0,0 +1,2 @@
+requests>=2.28
+python-dotenv>=1.0