GraphRAG Studio — initial commit: multimodal RAG system with KG visualization
Full-stack application for document-to-knowledge-graph pipeline: - Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing - Frontend: React 19 + Vite + D3.js + shadcn/ui - Pipeline: MinerU parsing → LangExtract entity extraction → KG building Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
0
backend/routers/__init__.py
Normal file
0
backend/routers/__init__.py
Normal file
71
backend/routers/documents.py
Normal file
71
backend/routers/documents.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""A 组:文档管理(4 个端点)"""
|
||||
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse
|
||||
from services import document_service as svc
|
||||
|
||||
router = APIRouter(prefix="/documents", tags=["Documents"])
|
||||
|
||||
|
||||
@router.post("/upload", status_code=200)
async def upload_document(
    file: UploadFile = File(...),
    language: str = Form("ch"),
    enable_formula: bool = Form(True),
    enable_table: bool = Form(True),
):
    """Accept a multipart upload, validate it and register it as a document.

    The whole payload is read into memory so that its byte length can be
    handed to ``svc.validate_upload``. A rejected upload yields an HTTP 400
    response carrying the code and message returned by the validator;
    otherwise the bytes are stored through ``svc.save_upload`` and the
    resulting record is returned wrapped in ``APIResponse.ok``.
    """
    payload = await file.read()
    valid, err_code, err_msg = svc.validate_upload(file.filename or "", len(payload))
    if valid:
        record = svc.save_upload(
            file.filename or "upload",
            payload,
            language,
            enable_formula,
            enable_table,
        )
        # "upload_filename" is an internal field and is stripped before replying.
        record.pop("upload_filename", None)
        return APIResponse.ok(record)

    rejection = APIResponse.err(err_code, err_msg)
    return JSONResponse(status_code=400, content=rejection.model_dump())
|
||||
|
||||
|
||||
@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return a single document record.

    Responds with HTTP 404 (error code 2001) when ``svc.get_document``
    returns a falsy value for ``doc_id``.
    """
    record = svc.get_document(doc_id)
    if record:
        # Strip the internal field before the record leaves the API.
        record.pop("upload_filename", None)
        return APIResponse.ok(record)

    not_found = APIResponse.err(2001, f"Document '{doc_id}' not found")
    return JSONResponse(status_code=404, content=not_found.model_dump())
|
||||
|
||||
|
||||
@router.get("")
async def list_documents(
    page: int = 1,
    page_size: int = 20,
    status: str | None = None,
    format: str | None = None,
):
    """Return one page of document records.

    Args:
        page: 1-based page number; values below 1 are treated as 1.
        page_size: Items per page, clamped to the range 1..100.
        status: Optional filter passed through to the service.
        format: Optional filter passed through to the service.

    The internal ``upload_filename`` key is removed from every item before
    the result is wrapped in ``APIResponse.ok``.
    """
    # Clamp both ends so zero or negative paging values never reach the service.
    page = max(1, page)
    page_size = max(1, min(page_size, 100))
    result = svc.list_documents(page, page_size, status, format)
    for item in result["items"]:
        item.pop("upload_filename", None)
    return APIResponse.ok(result)
|
||||
|
||||
|
||||
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete a document and report the graph elements removed with it.

    Responds with HTTP 404 (error code 2001) when ``svc.get_document`` finds
    nothing for ``doc_id``. Otherwise the payload echoes ``doc_id`` together
    with the node and edge counts returned by ``svc.delete_document``.
    """
    if not svc.get_document(doc_id):
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2001, f"Document '{doc_id}' not found").model_dump(),
        )

    ok, removed_nodes, removed_edges = svc.delete_document(doc_id)
    return APIResponse.ok({
        # Report the service's own flag rather than a hard-coded True, so a
        # deletion that did not go through is not announced as done.
        # NOTE(review): assumes the first tuple element is a success flag.
        "deleted": bool(ok),
        "doc_id": doc_id,
        "removed_nodes": removed_nodes,
        "removed_edges": removed_edges,
    })
|
||||
70
backend/routers/indexing.py
Normal file
70
backend/routers/indexing.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""B 组:Indexing Pipeline(4 个端点)"""
|
||||
from fastapi import APIRouter
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse, StartIndexRequest
|
||||
from services import document_service as doc_svc
|
||||
from services import indexing_service as idx_svc
|
||||
|
||||
router = APIRouter(prefix="/index", tags=["Indexing"])
|
||||
|
||||
|
||||
@router.post("/start", status_code=202)
async def start_indexing(body: StartIndexRequest):
    """Start an indexing job for an existing document.

    Responds with HTTP 404 (error code 2001) when the document lookup is
    falsy. Otherwise the job metadata returned by ``idx_svc.start_indexing``
    is reduced to the five fields exposed to clients.
    """
    if not doc_svc.get_document(body.doc_id):
        missing = APIResponse.err(2001, f"Document '{body.doc_id}' not found")
        return JSONResponse(status_code=404, content=missing.model_dump())

    meta = idx_svc.start_indexing(body.doc_id)
    exposed = ("job_id", "doc_id", "status", "stage", "created_at")
    return APIResponse.ok({field: meta[field] for field in exposed})
|
||||
|
||||
|
||||
@router.get("/status/{job_id}")
async def get_job_status(job_id: str):
    """Return the metadata of an indexing job, or HTTP 404 (code 2002) if none is found."""
    job_meta = idx_svc.get_job_status(job_id)
    if job_meta:
        return APIResponse.ok(job_meta)

    missing = APIResponse.err(2002, f"Job '{job_id}' not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/result/{job_id}")
async def get_job_result(job_id: str):
    """Return the result of an indexing job.

    * HTTP 404 (code 2002) when the service returns a falsy result.
    * HTTP 400 (code 2003) when the status is not ``"done"`` and the result
      has no ``"stats"`` key.
    * Otherwise the result wrapped in ``APIResponse.ok``.
    """
    result = idx_svc.get_job_result(job_id)
    if not result:
        missing = APIResponse.err(2002, f"Job '{job_id}' not found")
        return JSONResponse(status_code=404, content=missing.model_dump())

    state = result.get("status")
    # A result counts as available once it is done or already carries stats.
    available = state == "done" or "stats" in result
    if not available:
        pending = APIResponse.err(2003, f"Job '{job_id}' is still running (status={state})")
        return JSONResponse(status_code=400, content=pending.model_dump())

    return APIResponse.ok(result)
|
||||
|
||||
|
||||
@router.delete("/jobs/{job_id}")
async def cancel_job(job_id: str):
    """Cancel an indexing job and report its previous status.

    Responds with HTTP 404 (error code 2002) when ``idx_svc.get_job_status``
    finds nothing for ``job_id``. Otherwise the payload carries the outcome
    and the previous status returned by ``idx_svc.cancel_job``.
    """
    if not idx_svc.get_job_status(job_id):
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2002, f"Job '{job_id}' not found").model_dump(),
        )

    ok, prev_status = idx_svc.cancel_job(job_id)
    return APIResponse.ok({
        # Report the service's own flag rather than a hard-coded True, so a
        # cancellation that did not take effect is not announced as done.
        # NOTE(review): assumes the first tuple element is a success flag.
        "cancelled": bool(ok),
        "job_id": job_id,
        "previous_status": prev_status,
    })
|
||||
72
backend/routers/kg.py
Normal file
72
backend/routers/kg.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""C 组:知识图谱(6 个端点)"""
|
||||
from fastapi import APIRouter
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse
|
||||
from services import kg_service as svc
|
||||
|
||||
router = APIRouter(prefix="/kg", tags=["Knowledge Graph"])
|
||||
|
||||
|
||||
@router.get("/nodes")
async def list_nodes(
    type: str | None = None,
    doc_id: str | None = None,
    confidence: str | None = None,
    page: int = 1,
    page_size: int = 50,
):
    """Return one page of knowledge-graph nodes.

    Args:
        type: Optional filter passed through to the service.
        doc_id: Optional filter passed through to the service.
        confidence: Optional filter passed through to the service.
        page: 1-based page number; values below 1 are treated as 1.
        page_size: Items per page, clamped to the range 1..200.

    Responds with HTTP 400 (error code 3002) when the unfiltered listing
    reports a total of zero; a filtered query with no matches returns the
    empty page normally.
    """
    # Clamp both ends so zero or negative paging values never reach the service.
    page = max(1, page)
    page_size = max(1, min(page_size, 200))
    result = svc.get_nodes(page, page_size, type, doc_id, confidence)

    unfiltered = not any((type, doc_id, confidence))
    if unfiltered and result["total"] == 0:
        return JSONResponse(
            status_code=400,
            content=APIResponse.err(3002, "Knowledge graph is empty. Index documents first.").model_dump(),
        )
    return APIResponse.ok(result)
|
||||
|
||||
|
||||
@router.get("/edges")
async def list_edges(
    doc_id: str | None = None,
    relation: str | None = None,
    page: int = 1,
    page_size: int = 100,
):
    """Return one page of knowledge-graph edges.

    Args:
        doc_id: Optional filter passed through to the service.
        relation: Optional filter passed through to the service.
        page: 1-based page number; values below 1 are treated as 1.
        page_size: Items per page, clamped to the range 1..500.
    """
    # Clamp both ends so zero or negative paging values never reach the service.
    page = max(1, page)
    page_size = max(1, min(page_size, 500))
    return APIResponse.ok(svc.get_edges(page, page_size, doc_id, relation))
|
||||
|
||||
|
||||
@router.get("/nodes/{node_id}")
async def get_node_detail(node_id: str):
    """Return the detail record of one node, or HTTP 404 (code 3001) if the lookup is falsy."""
    detail = svc.get_node_detail(node_id)
    if detail:
        return APIResponse.ok(detail)

    missing = APIResponse.err(3001, f"Node '{node_id}' not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/nodes/{node_id}/neighbors")
async def get_node_neighbors(node_id: str, hops: int = 1):
    """Return the neighbourhood of a node as computed by ``svc.get_neighbors``.

    Only a ``None`` result is treated as "node not found" (HTTP 404, code
    3001); any other value, including an empty one, is returned as-is.
    """
    neighborhood = svc.get_neighbors(node_id, hops)
    if neighborhood is not None:
        return APIResponse.ok(neighborhood)

    missing = APIResponse.err(3001, f"Node '{node_id}' not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/stats")
async def get_kg_stats():
    """Return the knowledge-graph statistics produced by ``svc.get_stats``."""
    return APIResponse.ok(svc.get_stats())
|
||||
|
||||
|
||||
@router.get("/export")
async def export_kg(format: str = "json", doc_id: str | None = None):
    """Export the knowledge graph, optionally restricted to one document.

    NOTE(review): ``format`` is accepted as a query parameter but is not
    forwarded to ``svc.export_kg``; only ``doc_id`` influences the result.
    """
    exported = svc.export_kg(doc_id)
    return APIResponse.ok(exported)
|
||||
66
backend/routers/query.py
Normal file
66
backend/routers/query.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""D 组:QA 问答(4 个端点)"""
|
||||
import asyncio
|
||||
from functools import partial
|
||||
|
||||
from fastapi import APIRouter
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse, BatchQueryRequest, QueryRequest
|
||||
from services import qa_service as svc
|
||||
|
||||
router = APIRouter(prefix="/query", tags=["QA"])
|
||||
|
||||
|
||||
@router.post("")
async def run_query(body: QueryRequest):
    """Answer a single question through the QA service.

    ``svc.run_query`` is synchronous, so it runs in the default executor to
    keep the event loop free while it works.

    Responses:
        * ``APIResponse.ok(result)`` on success.
        * HTTP 400 (code 3002) when a ``ValueError`` whose text contains
          ``"KG_EMPTY"`` is raised.
        * HTTP 500 (code 4001) for any other ``ValueError`` or exception.
    """
    try:
        # get_running_loop() is the documented way to obtain the loop from
        # inside a coroutine; get_event_loop() has policy-dependent behaviour.
        loop = asyncio.get_running_loop()
        history = [m.model_dump() for m in body.history]
        result = await loop.run_in_executor(
            None,
            partial(svc.run_query, body.question, history),
        )
        return APIResponse.ok(result)
    except ValueError as e:
        if "KG_EMPTY" in str(e):
            return JSONResponse(
                status_code=400,
                content=APIResponse.err(3002, "Knowledge graph is empty. Index documents first.").model_dump(),
            )
        return JSONResponse(
            status_code=500,
            content=APIResponse.err(4001, str(e)).model_dump(),
        )
    except Exception as e:
        # Top-level boundary: turn any unexpected failure into a structured 500.
        return JSONResponse(
            status_code=500,
            content=APIResponse.err(4001, f"QA service error: {e}").model_dump(),
        )
|
||||
|
||||
|
||||
@router.post("/batch", status_code=202)
async def start_batch(body: BatchQueryRequest):
    """Submit a batch of questions to the QA service.

    Batches with more than 20 questions are rejected with HTTP 400
    (error code 1001) before the service is called.
    """
    if len(body.questions) <= 20:
        return APIResponse.ok(svc.start_batch(body.questions))

    too_many = APIResponse.err(1001, "Maximum 20 questions per batch")
    return JSONResponse(status_code=400, content=too_many.model_dump())
|
||||
|
||||
|
||||
@router.get("/batch/{batch_id}")
async def get_batch_result(batch_id: str):
    """Return the stored result of a batch, or HTTP 404 (code 2002) if the lookup is falsy."""
    batch = svc.get_batch_result(batch_id)
    if batch:
        return APIResponse.ok(batch)

    missing = APIResponse.err(2002, f"Batch '{batch_id}' not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/history")
async def get_query_history(page: int = 1, page_size: int = 20):
    """Return one page of the query history.

    Args:
        page: 1-based page number; values below 1 are treated as 1.
        page_size: Items per page, clamped to the range 1..50.
    """
    # Clamp both ends so zero or negative paging values never reach the service.
    page = max(1, page)
    page_size = max(1, min(page_size, 50))
    return APIResponse.ok(svc.get_history(page, page_size))
|
||||
43
backend/routers/search.py
Normal file
43
backend/routers/search.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""E 组:搜索(3 个端点)"""
|
||||
from fastapi import APIRouter, Query, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse
|
||||
from services import search_service as svc
|
||||
|
||||
router = APIRouter(prefix="/search", tags=["Search"])
|
||||
|
||||
|
||||
@router.get("/entities")
async def search_entities(q: str, type: str | None = None, limit: int = 15):
    """Search entities matching ``q``.

    Args:
        q: Search text passed through to the service.
        type: Optional filter passed through to the service.
        limit: Maximum number of hits, clamped to the range 1..100.
    """
    # Clamp both ends so a zero or negative limit never reaches the service.
    limit = max(1, min(limit, 100))
    return APIResponse.ok(svc.search_entities(q, type, limit))
|
||||
|
||||
|
||||
@router.get("/path")
async def search_path(request: Request, max_hops: int = 3):
    """Find a path between two nodes named by the ``from`` and ``to`` query parameters.

    ``from`` is a Python keyword and cannot be a parameter name, so both
    endpoints are read from the raw query string.

    * HTTP 400 (code 1001) when either endpoint is missing or empty.
    * HTTP 404 (code 3001) when the service returns ``None``.
    * ``max_hops`` is clamped to the range 1..5 before the search.
    """
    query = request.query_params
    from_id = query.get("from")
    to_id = query.get("to")

    if not (from_id and to_id):
        bad_request = APIResponse.err(1001, "Parameters 'from' and 'to' are required")
        return JSONResponse(status_code=400, content=bad_request.model_dump())

    hop_limit = min(max(max_hops, 1), 5)
    path = svc.search_path(from_id, to_id, hop_limit)
    if path is not None:
        return APIResponse.ok(path)

    missing = APIResponse.err(3001, "One or both nodes not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/graph")
async def search_graph(q: str, include_neighbors: bool = False):
    """Run a graph search for ``q`` and return the service result wrapped in ``APIResponse.ok``."""
    return APIResponse.ok(svc.search_graph(q, include_neighbors))
|
||||
171
backend/routers/system.py
Normal file
171
backend/routers/system.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""F 组:系统(4 个端点)"""
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from models.schemas import APIResponse
|
||||
from storage import file_store as fs
|
||||
|
||||
router = APIRouter(tags=["System"])
|
||||
|
||||
_START_TIME = time.time()
|
||||
|
||||
|
||||
def _langextract_importable(python_exe: Path) -> bool:
    """Return True if ``import langextract`` succeeds under the interpreter *python_exe*."""
    import subprocess

    try:
        probe = subprocess.run(
            [str(python_exe), "-c", "import langextract; print('ok')"],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except (OSError, ValueError, subprocess.SubprocessError):
        # A missing interpreter, bad arguments or a timeout all mean "not available".
        return False
    return probe.returncode == 0 and "ok" in probe.stdout


@router.get("/health")
async def health_check():
    """Report the health of the backend and the components it depends on.

    Checks performed:
        * ``mineru_venv``: the interpreter named by ``MINERU_PYTHON`` exists.
        * ``langextract_venv``: ``langextract`` imports under the backend venv.
        * ``deepseek_api``: ``DEEPSEEK_API_KEY`` is set (no network call).
        * ``storage``: existence flags for the KG files and uploads directory;
          its status is always ``"ok"``.

    The overall status is ``"healthy"`` only when every component reports
    ``"ok"``, otherwise ``"degraded"``.
    """
    import asyncio

    from dotenv import load_dotenv

    backend_root = Path(__file__).parent.parent
    load_dotenv(backend_root / ".env", override=False)

    mineru_python = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe"))
    backend_python = backend_root / ".venv" / "Scripts" / "python.exe"
    deepseek_key = os.getenv("DEEPSEEK_API_KEY", "")
    deepseek_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")

    # The probe spawns a subprocess that may take up to 10 seconds; run it in
    # a worker thread so the event loop keeps serving other requests.
    langextract_ok = await asyncio.to_thread(_langextract_importable, backend_python)
    mineru_exists = mineru_python.exists()

    components = {
        "mineru_venv": {
            "status": "ok" if mineru_exists else "error",
            "path": str(mineru_python),
            "exists": mineru_exists,
        },
        "langextract_venv": {
            "status": "ok" if langextract_ok else "error",
            "path": str(backend_python),
            "exists": backend_python.exists(),
        },
        "deepseek_api": {
            "status": "ok" if deepseek_key else "error",
            "base_url": deepseek_url,
            "key_configured": bool(deepseek_key),
        },
        "storage": {
            "status": "ok",
            "kg_nodes_exists": fs.kg_nodes_path().exists(),
            "kg_edges_exists": fs.kg_edges_path().exists(),
            "uploads_dir_exists": fs.UPLOADS_DIR.exists(),
        },
    }

    overall = "healthy" if all(c["status"] == "ok" for c in components.values()) else "degraded"

    return APIResponse.ok({
        "status": overall,
        "version": "1.0.0",
        "uptime_seconds": round(time.time() - _START_TIME, 1),
        "components": components,
    })
|
||||
|
||||
|
||||
@router.get("/system/stats")
async def system_stats():
    """Return aggregate counters about documents, the knowledge graph and queries.

    Everything is read from the file store on each call; the number of
    active jobs comes from the indexing service.
    """
    from services import indexing_service as idx_svc

    documents = list(fs.load_docs_index().values())
    kg_nodes = fs.load_kg_nodes()
    kg_edges = fs.load_kg_edges()
    queries = fs.load_query_history()

    # Count nodes per type; nodes without a "type" key fall under "UNKNOWN".
    per_type: dict[str, int] = {}
    for node in kg_nodes:
        label = node.get("type", "UNKNOWN")
        per_type[label] = 1 + per_type.get(label, 0)

    doc_states = [d.get("status") for d in documents]

    return APIResponse.ok({
        "total_documents": len(documents),
        "indexed_documents": doc_states.count("indexed"),
        "failed_documents": doc_states.count("failed"),
        "total_nodes": len(kg_nodes),
        "total_edges": len(kg_edges),
        "type_distribution": per_type,
        "total_queries": len(queries),
        "active_jobs": idx_svc.count_active_jobs(),
        "storage_used_mb": fs.storage_used_mb(),
    })
|
||||
|
||||
|
||||
@router.get("/system/formats")
async def list_formats():
    """Return the static catalogue of upload formats, OCR languages and usage notes."""
    # (extension, description, max pages, requires OCR); every format shares
    # the same 200 MB size limit.
    format_rows = (
        ("pdf", "PDF 文档(文本型/扫描型/混合型)", 600, False),
        ("docx", "Microsoft Word(新版)", 600, False),
        ("doc", "Microsoft Word(旧版)", 600, False),
        ("pptx", "PowerPoint(新版)", 600, False),
        ("ppt", "PowerPoint(旧版)", 600, False),
        ("png", "PNG 图片(单页)", 1, True),
        ("jpg", "JPEG 图片(单页)", 1, True),
        ("jpeg", "JPEG 图片(单页)", 1, True),
        ("html", "HTML 文件", 600, False),
    )
    # (language code, display name)
    language_rows = (
        ("ch", "中文(默认)"),
        ("en", "英文"),
        ("japan", "日文"),
        ("korean", "韩文"),
        ("french", "法文"),
        ("german", "德文"),
    )

    formats = [
        {
            "ext": ext,
            "description": description,
            "max_size_mb": 200,
            "max_pages": pages,
            "requires_ocr": needs_ocr,
        }
        for ext, description, pages, needs_ocr in format_rows
    ]
    ocr_languages = [{"code": code, "name": name} for code, name in language_rows]
    notes = [
        "language 参数默认值为 'ch'(非 'zh'),遵循 PaddleOCR v3 语言代码规范",
        "上传时不需要携带 Content-Type,服务端自动识别",
        "PNG/JPG/JPEG 单次最多处理 1 页",
    ]

    return APIResponse.ok({
        "formats": formats,
        "ocr_languages": ocr_languages,
        "notes": notes,
    })
|
||||
|
||||
|
||||
@router.get("/system/demo")
async def get_demo_data():
    """Return the full knowledge graph plus summary statistics for the demo view.

    The backend's own KG files are used when they contain nodes. Otherwise
    the legacy ``graphrag_pipeline`` output is loaded from a fixed path; if
    that is missing as well the endpoint answers HTTP 400 (error code 3002).
    """
    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()

    if not nodes:
        # Fallback: reuse the output of the earlier graphrag_pipeline run.
        legacy_nodes_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_nodes.json")
        legacy_edges_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_edges.json")

        if not legacy_nodes_path.exists():
            from fastapi.responses import JSONResponse

            no_data = APIResponse.err(3002, "No demo data available. Index a document first.")
            return JSONResponse(status_code=400, content=no_data.model_dump())

        import json

        nodes = json.loads(legacy_nodes_path.read_text(encoding="utf-8"))
        if legacy_edges_path.exists():
            edges = json.loads(legacy_edges_path.read_text(encoding="utf-8"))
        else:
            edges = []

    # Count nodes per type; nodes without a "type" key fall under "UNKNOWN".
    type_counts: dict[str, int] = {}
    for node in nodes:
        label = node.get("type", "UNKNOWN")
        type_counts[label] = 1 + type_counts.get(label, 0)

    import networkx as nx

    # Build an undirected graph only to compute its density.
    graph = nx.Graph()
    for node in nodes:
        graph.add_node(node["id"])
    for edge in edges:
        graph.add_edge(edge["source"], edge["target"])

    if graph.number_of_nodes() > 1:
        density = round(nx.density(graph), 4)
    else:
        density = 0.0

    return APIResponse.ok({
        "nodes": nodes,
        "edges": edges,
        "stats": {
            "nodes": len(nodes),
            "edges": len(edges),
            "type_counts": type_counts,
            "density": density,
        },
    })
|
||||
Reference in New Issue
Block a user