GraphRAG Studio — initial commit: multimodal RAG system with KG visualization

Full-stack application for a document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-07 17:30:04 +08:00
commit b02d3378fc
127 changed files with 37218 additions and 0 deletions
--- a/backend/routers/system.py
+++ b/backend/routers/system.py
@@ -0,0 +1,171 @@
+"""F 组：系统（4 个端点）"""
+import os
+import time
+from pathlib import Path
+
+from fastapi import APIRouter
+
+from models.schemas import APIResponse
+from storage import file_store as fs
+
+# Router that collects the system endpoints defined in this module; all of
+# them are grouped under the "System" tag.
+router = APIRouter(tags=["System"])
+
+# Wall-clock timestamp taken when this module is first imported; /health
+# subtracts it from the current time to report uptime.
+_START_TIME = time.time()
+
+
+@router.get("/health")
+async def health_check():
+    env_path = Path(__file__).parent.parent / ".env"
+    from dotenv import load_dotenv
+    load_dotenv(env_path, override=False)
+
+    mineru_python = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe"))
+    backend_python = Path(__file__).parent.parent / ".venv" / "Scripts" / "python.exe"
+    deepseek_key = os.getenv("DEEPSEEK_API_KEY", "")
+    deepseek_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
+
+    # Check if langextract is importable from backend's venv
+    try:
+        import subprocess
+        result = subprocess.run(
+            [str(backend_python), "-c", "import langextract; print('ok')"],
+            capture_output=True, text=True, timeout=10
+        )
+        langextract_ok = result.returncode == 0 and "ok" in result.stdout
+    except Exception:
+        langextract_ok = False
+
+    components = {
+        "mineru_venv": {
+            "status": "ok" if mineru_python.exists() else "error",
+            "path": str(mineru_python),
+            "exists": mineru_python.exists(),
+        },
+        "langextract_venv": {
+            "status": "ok" if langextract_ok else "error",
+            "path": str(backend_python),
+            "exists": backend_python.exists(),
+        },
+        "deepseek_api": {
+            "status": "ok" if deepseek_key else "error",
+            "base_url": deepseek_url,
+            "key_configured": bool(deepseek_key),
+        },
+        "storage": {
+            "status": "ok",
+            "kg_nodes_exists": fs.kg_nodes_path().exists(),
+            "kg_edges_exists": fs.kg_edges_path().exists(),
+            "uploads_dir_exists": fs.UPLOADS_DIR.exists(),
+        },
+    }
+
+    overall = "healthy" if all(c["status"] == "ok" for c in components.values()) else "degraded"
+
+    return APIResponse.ok({
+        "status": overall,
+        "version": "1.0.0",
+        "uptime_seconds": round(time.time() - _START_TIME, 1),
+        "components": components,
+    })
+
+
+@router.get("/system/stats")
+async def system_stats():
+    from services import indexing_service as idx_svc
+
+    docs = list(fs.load_docs_index().values())
+    nodes = fs.load_kg_nodes()
+    edges = fs.load_kg_edges()
+    history = fs.load_query_history()
+
+    type_dist: dict[str, int] = {}
+    for n in nodes:
+        t = n.get("type", "UNKNOWN")
+        type_dist[t] = type_dist.get(t, 0) + 1
+
+    return APIResponse.ok({
+        "total_documents": len(docs),
+        "indexed_documents": sum(1 for d in docs if d.get("status") == "indexed"),
+        "failed_documents": sum(1 for d in docs if d.get("status") == "failed"),
+        "total_nodes": len(nodes),
+        "total_edges": len(edges),
+        "type_distribution": type_dist,
+        "total_queries": len(history),
+        "active_jobs": idx_svc.count_active_jobs(),
+        "storage_used_mb": fs.storage_used_mb(),
+    })
+
+
+@router.get("/system/formats")
+async def list_formats():
+    return APIResponse.ok({
+        "formats": [
+            {"ext": "pdf",  "description": "PDF 文档（文本型/扫描型/混合型）", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
+            {"ext": "docx", "description": "Microsoft Word（新版）", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
+            {"ext": "doc",  "description": "Microsoft Word（旧版）", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
+            {"ext": "pptx", "description": "PowerPoint（新版）", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
+            {"ext": "ppt",  "description": "PowerPoint（旧版）", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
+            {"ext": "png",  "description": "PNG 图片（单页）", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True},
+            {"ext": "jpg",  "description": "JPEG 图片（单页）", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True},
+            {"ext": "jpeg", "description": "JPEG 图片（单页）", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True},
+            {"ext": "html", "description": "HTML 文件", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
+        ],
+        "ocr_languages": [
+            {"code": "ch", "name": "中文（默认）"},
+            {"code": "en", "name": "英文"},
+            {"code": "japan", "name": "日文"},
+            {"code": "korean", "name": "韩文"},
+            {"code": "french", "name": "法文"},
+            {"code": "german", "name": "德文"},
+        ],
+        "notes": [
+            "language 参数默认值为 'ch'（非 'zh'），遵循 PaddleOCR v3 语言代码规范",
+            "上传时不需要携带 Content-Type，服务端自动识别",
+            "PNG/JPG/JPEG 单次最多处理 1 页",
+        ],
+    })
+
+
+@router.get("/system/demo")
+async def get_demo_data():
+    # Try backend KG first, then fall back to graphrag_pipeline/output
+    nodes = fs.load_kg_nodes()
+    edges = fs.load_kg_edges()
+
+    if not nodes:
+        # Fallback: load from existing graphrag_pipeline output
+        legacy_nodes_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_nodes.json")
+        legacy_edges_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_edges.json")
+        if legacy_nodes_path.exists():
+            import json
+            nodes = json.loads(legacy_nodes_path.read_text(encoding="utf-8"))
+            edges = json.loads(legacy_edges_path.read_text(encoding="utf-8")) if legacy_edges_path.exists() else []
+        else:
+            from fastapi.responses import JSONResponse
+            return JSONResponse(
+                status_code=400,
+                content=APIResponse.err(3002, "No demo data available. Index a document first.").model_dump(),
+            )
+
+    type_counts: dict[str, int] = {}
+    for n in nodes:
+        t = n.get("type", "UNKNOWN")
+        type_counts[t] = type_counts.get(t, 0) + 1
+
+    import networkx as nx
+    G = nx.Graph()
+    for n in nodes:
+        G.add_node(n["id"])
+    for e in edges:
+        G.add_edge(e["source"], e["target"])
+
+    return APIResponse.ok({
+        "nodes": nodes,
+        "edges": edges,
+        "stats": {
+            "nodes": len(nodes),
+            "edges": len(edges),
+            "type_counts": type_counts,
+            "density": round(nx.density(G), 4) if G.number_of_nodes() > 1 else 0.0,
+        },
+    })