GraphRAG Studio — initial commit: multimodal RAG system with KG visualization

Full-stack application for a document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-07 17:30:04 +08:00
commit b02d3378fc
127 changed files with 37218 additions and 0 deletions
--- a/backend/services/indexing_service.py
+++ b/backend/services/indexing_service.py
@@ -0,0 +1,255 @@
+"""Indexing Service — Pipeline orchestration (parsing → extracting → indexing)."""
+from __future__ import annotations
+
+import json
+import os
+import subprocess
+import threading
+import time
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+from storage import file_store as fs
+from services.document_service import update_doc_status
+
+load_dotenv(Path(__file__).parent.parent / ".env", override=True)  # .env two directories above this file; override=True lets its values win over existing env vars
+
+MINERU_PYTHON = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe"))  # interpreter used to launch MinerU (env: MINERU_PYTHON)
+MINERU_PIPELINE = Path(os.getenv("MINERU_PIPELINE", "F:/GraphRAGAgent/mineru_mvp/pipeline.py"))  # script handed to that interpreter (env: MINERU_PIPELINE)
+
+# In-memory registry of job worker threads {job_id: threading.Thread}; _run_pipeline removes its own entry when it finishes
+_active_threads: dict[str, threading.Thread] = {}
+_cancel_flags: dict[str, bool] = {}  # {job_id: bool}; set to True by cancel_job and polled by _run_pipeline
+
+
+def start_indexing(doc_id: str) -> dict:
+    doc = fs.get_doc(doc_id)
+    if not doc:
+        return None  # type: ignore
+
+    job_id = f"job_{uuid.uuid4().hex[:8]}"
+    now = datetime.now(timezone.utc).isoformat()
+
+    meta = {
+        "job_id": job_id,
+        "doc_id": doc_id,
+        "status": "submitted",
+        "stage": "Job submitted",
+        "progress": {"parsed_pages": 0, "total_pages": 0, "extracted_entities": 0},
+        "created_at": now,
+        "elapsed_seconds": 0.0,
+        "error": None,
+        "pdf_name": doc["filename"],
+        "pdf_path": str(fs.UPLOADS_DIR / doc.get("upload_filename", "")),
+    }
+    fs.save_job_meta(job_id, meta)
+
+    _cancel_flags[job_id] = False
+    thread = threading.Thread(target=_run_pipeline, args=(job_id,), daemon=True)
+    _active_threads[job_id] = thread
+    thread.start()
+
+    return meta
+
+
+def _update_meta(job_id: str, **kwargs) -> None:
+    meta = fs.load_job_meta(job_id) or {}
+    meta.update(kwargs)
+    meta["elapsed_seconds"] = round(
+        (datetime.now(timezone.utc) - datetime.fromisoformat(meta["created_at"])).total_seconds(), 1
+    )
+    fs.save_job_meta(job_id, meta)
+
+
+def _run_pipeline(job_id: str) -> None:
+    meta = fs.load_job_meta(job_id)
+    if not meta:
+        return
+
+    doc_id = meta["doc_id"]
+    pdf_path = Path(meta["pdf_path"])
+    job_dir = fs.job_dir(job_id)
+    start_time = time.time()
+
+    try:
+        # ── Stage 1: parsing ──────────────────────────────────────────────
+        if _cancel_flags.get(job_id):
+            _update_meta(job_id, status="cancelled", stage="Cancelled")
+            return
+
+        _update_meta(job_id, status="parsing", stage="MinerU document parsing...")
+        mineru_out_dir = job_dir / "mineru_output"
+        mineru_out_dir.mkdir(parents=True, exist_ok=True)
+
+        result = subprocess.run(
+            [str(MINERU_PYTHON), str(MINERU_PIPELINE), str(pdf_path)],
+            cwd=str(MINERU_PIPELINE.parent),
+            capture_output=True,
+            text=True,
+            timeout=600,
+        )
+
+        if result.returncode != 0:
+            raise RuntimeError(f"MinerU failed: {result.stderr[:500]}")
+
+        # Find content_list.json in MinerU output
+        # MinerU writes output to mineru_mvp/output/{stem}/
+        stem = pdf_path.stem
+        mineru_default_out = MINERU_PIPELINE.parent / "output" / stem
+        content_list_path = None
+
+        if mineru_default_out.exists():
+            matches = list(mineru_default_out.glob("*_content_list.json"))
+            if matches:
+                content_list_path = matches[0]
+                # Copy to our job dir
+                import shutil
+                shutil.copytree(str(mineru_default_out), str(mineru_out_dir), dirs_exist_ok=True)
+
+        if not content_list_path:
+            # Fallback: search job mineru_output dir
+            matches = list(mineru_out_dir.glob("*_content_list.json"))
+            if matches:
+                content_list_path = matches[0]
+
+        if not content_list_path or not content_list_path.exists():
+            raise RuntimeError(f"MinerU output content_list.json not found. stdout: {result.stdout[:300]}")
+
+        # ── Stage 2: extracting ───────────────────────────────────────────
+        if _cancel_flags.get(job_id):
+            _update_meta(job_id, status="cancelled", stage="Cancelled")
+            return
+
+        from pipeline.text_assembler import load_content_list, assemble_pages, count_blocks_by_type
+        from pipeline.entity_extractor import create_model, extract_entities
+        from pipeline.kg_builder import build_kg, extractions_to_records
+
+        content_list = load_content_list(content_list_path)
+        pages = assemble_pages(content_list)
+        total_pages = len(pages)
+        block_types = count_blocks_by_type(content_list)
+
+        _update_meta(
+            job_id,
+            status="extracting",
+            stage=f"Extracting entities (LangExtract + DeepSeek)...",
+            progress={"parsed_pages": total_pages, "total_pages": total_pages, "extracted_entities": 0},
+        )
+        update_doc_status(doc_id, "indexing", pages=total_pages)
+
+        model = create_model()
+        annotated_docs = []
+        total_entities = 0
+
+        for i, page in enumerate(pages):
+            if _cancel_flags.get(job_id):
+                _update_meta(job_id, status="cancelled", stage="Cancelled")
+                return
+
+            _update_meta(
+                job_id,
+                stage=f"Extracting entities page {i+1}/{total_pages} (LangExtract + DeepSeek)...",
+                progress={"parsed_pages": total_pages, "total_pages": total_pages,
+                          "extracted_entities": total_entities},
+            )
+            ann_doc = extract_entities(page.text, model)
+            annotated_docs.append(ann_doc)
+            total_entities += len(ann_doc.extractions) if ann_doc.extractions else 0
+
+        # Save raw extractions
+        records = extractions_to_records(pages, annotated_docs, doc_id)
+        fs.write_json(job_dir / "extractions.json", records)
+
+        # ── Stage 3: indexing ─────────────────────────────────────────────
+        _update_meta(job_id, status="indexing", stage="Building knowledge graph...")
+
+        nodes, edges = build_kg(pages, annotated_docs, doc_id)
+        fs.write_json(job_dir / "kg_nodes.json", nodes)
+        fs.write_json(job_dir / "kg_edges.json", edges)
+
+        # Merge into global KG
+        fs.merge_kg(nodes, edges, doc_id)
+
+        # Count alignment types
+        alignment_counts: dict[str, int] = {}
+        type_counts: dict[str, int] = {}
+        for r in records:
+            al = r.get("alignment") or "null"
+            alignment_counts[al] = alignment_counts.get(al, 0) + 1
+            t = r.get("type", "UNKNOWN")
+            type_counts[t] = type_counts.get(t, 0) + 1
+
+        elapsed = round(time.time() - start_time, 1)
+        stats = {
+            "blocks": len(content_list),
+            "block_types": block_types,
+            "pages": total_pages,
+            "raw_extractions": len(records),
+            "nodes": len(nodes),
+            "edges": len(edges),
+            "type_counts": type_counts,
+            "alignment_counts": alignment_counts,
+            "elapsed_seconds": elapsed,
+        }
+        fs.write_json(job_dir / "stats.json", stats)
+
+        _update_meta(
+            job_id,
+            status="done",
+            stage="Complete",
+            progress={"parsed_pages": total_pages, "total_pages": total_pages,
+                      "extracted_entities": len(records)},
+        )
+        update_doc_status(doc_id, "indexed", pages=total_pages)
+
+    except Exception as exc:
+        _update_meta(job_id, status="failed", stage=f"Error: {exc}", error=str(exc))
+        update_doc_status(doc_id, "failed")
+    finally:
+        _active_threads.pop(job_id, None)
+        _cancel_flags.pop(job_id, None)
+
+
+def get_job_status(job_id: str) -> dict | None:
+    return fs.load_job_meta(job_id)
+
+
+def get_job_result(job_id: str) -> dict | None:
+    meta = fs.load_job_meta(job_id)
+    if not meta:
+        return None
+    if meta["status"] != "done":
+        return meta
+
+    job_dir = fs.job_dir(job_id)
+    stats = fs.read_json(job_dir / "stats.json") or {}
+    extractions = fs.read_json(job_dir / "extractions.json") or []
+    nodes = fs.read_json(job_dir / "kg_nodes.json") or []
+    edges = fs.read_json(job_dir / "kg_edges.json") or []
+
+    return {
+        "job_id": meta["job_id"],
+        "doc_id": meta["doc_id"],
+        "status": "done",
+        "stats": stats,
+        "extractions": extractions,
+        "nodes": nodes,
+        "edges": edges,
+    }
+
+
+def cancel_job(job_id: str) -> tuple[bool, str]:
+    meta = fs.load_job_meta(job_id)
+    if not meta:
+        return False, "not_found"
+    prev_status = meta["status"]
+    _cancel_flags[job_id] = True
+    _update_meta(job_id, status="cancelled", stage="Cancelled by user")
+    return True, prev_status
+
+
+def count_active_jobs() -> int:
+    return sum(1 for t in _active_threads.values() if t.is_alive())