GraphRAG Studio — initial commit: multimodal RAG system with KG visualization

Full-stack application for document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
plf
2026-06-07 17:30:04 +08:00
commit b02d3378fc
127 changed files with 37218 additions and 0 deletions

View File

@@ -0,0 +1,268 @@
"""
File Store — unified JSON read/write for all backend data.
All data lives under backend/data/.
"""
from __future__ import annotations
import json
import os
import shutil
from pathlib import Path
from typing import Any
# Root data directory relative to this file
_BASE = Path(__file__).parent.parent / "data"
UPLOADS_DIR = _BASE / "uploads"
JOBS_DIR = _BASE / "jobs"
KG_DIR = _BASE / "kg"
QUERY_DIR = _BASE / "jobs" # query_history.jsonl lives here
# Ensure directories exist at import time
for _d in (UPLOADS_DIR, JOBS_DIR, KG_DIR):
_d.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------------------------
# Generic helpers
# ---------------------------------------------------------------------------
def read_json(path: Path) -> Any:
"""Read and parse a JSON file. Returns None if file doesn't exist."""
if not path.exists():
return None
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def write_json(path: Path, data: Any) -> None:
"""Atomically write data as JSON (write to .tmp then rename)."""
path.parent.mkdir(parents=True, exist_ok=True)
tmp = path.with_suffix(".tmp")
with open(tmp, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
os.replace(tmp, path)
def append_jsonl(path: Path, record: dict) -> None:
"""Append a record to a JSONL file."""
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "a", encoding="utf-8") as f:
f.write(json.dumps(record, ensure_ascii=False) + "\n")
def read_jsonl(path: Path) -> list[dict]:
"""Read all records from a JSONL file."""
if not path.exists():
return []
records = []
with open(path, "r", encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
try:
records.append(json.loads(line))
except json.JSONDecodeError:
pass
return records
# ---------------------------------------------------------------------------
# Document helpers
# ---------------------------------------------------------------------------
def docs_index_path() -> Path:
return _BASE / "docs_index.json"
def load_docs_index() -> dict[str, dict]:
"""Load the documents index {doc_id: DocumentInfo dict}."""
data = read_json(docs_index_path())
return data if isinstance(data, dict) else {}
def save_docs_index(index: dict[str, dict]) -> None:
write_json(docs_index_path(), index)
def get_doc(doc_id: str) -> dict | None:
return load_docs_index().get(doc_id)
def save_doc(doc: dict) -> None:
index = load_docs_index()
index[doc["doc_id"]] = doc
save_docs_index(index)
def delete_doc(doc_id: str) -> bool:
index = load_docs_index()
if doc_id not in index:
return False
del index[doc_id]
save_docs_index(index)
# Remove upload file
doc_info = index.get(doc_id, {})
upload_path = UPLOADS_DIR / doc_info.get("upload_filename", "")
if upload_path.exists():
upload_path.unlink()
return True
# ---------------------------------------------------------------------------
# Job helpers
# ---------------------------------------------------------------------------
def job_dir(job_id: str) -> Path:
return JOBS_DIR / job_id
def job_meta_path(job_id: str) -> Path:
return job_dir(job_id) / "meta.json"
def load_job_meta(job_id: str) -> dict | None:
return read_json(job_meta_path(job_id))
def save_job_meta(job_id: str, meta: dict) -> None:
job_dir(job_id).mkdir(parents=True, exist_ok=True)
write_json(job_meta_path(job_id), meta)
def list_all_jobs() -> list[dict]:
metas = []
for d in JOBS_DIR.iterdir():
if d.is_dir():
meta = read_json(d / "meta.json")
if meta:
metas.append(meta)
return metas
def delete_job(job_id: str) -> None:
jd = job_dir(job_id)
if jd.exists():
shutil.rmtree(jd)
# ---------------------------------------------------------------------------
# Global KG helpers
# ---------------------------------------------------------------------------
def kg_nodes_path() -> Path:
return KG_DIR / "kg_nodes.json"
def kg_edges_path() -> Path:
return KG_DIR / "kg_edges.json"
def load_kg_nodes() -> list[dict]:
data = read_json(kg_nodes_path())
return data if isinstance(data, list) else []
def load_kg_edges() -> list[dict]:
data = read_json(kg_edges_path())
return data if isinstance(data, list) else []
def save_kg_nodes(nodes: list[dict]) -> None:
write_json(kg_nodes_path(), nodes)
def save_kg_edges(edges: list[dict]) -> None:
write_json(kg_edges_path(), edges)
def merge_kg(new_nodes: list[dict], new_edges: list[dict], doc_id: str) -> tuple[int, int]:
"""Merge job KG output into global KG. Returns (removed_old, added_new)."""
existing_nodes = load_kg_nodes()
existing_edges = load_kg_edges()
# Remove nodes/edges from this doc
existing_nodes = [n for n in existing_nodes if n.get("source_doc") != doc_id]
existing_edges = [e for e in existing_edges if e.get("doc_id") != doc_id]
# Merge: deduplicate nodes by (name.lower(), type)
node_keys: set[tuple] = {(n["name"].lower(), n["type"]) for n in existing_nodes}
for n in new_nodes:
key = (n["name"].lower(), n["type"])
if key not in node_keys:
existing_nodes.append(n)
node_keys.add(key)
# Merge edges: deduplicate by (min(src,tgt), max(src,tgt), doc_id)
edge_keys: set[tuple] = set()
for e in existing_edges:
s, t = e["source"], e["target"]
edge_keys.add((min(s, t), max(s, t), e["doc_id"]))
for e in new_edges:
s, t = e["source"], e["target"]
key = (min(s, t), max(s, t), e["doc_id"])
if key not in edge_keys:
existing_edges.append(e)
edge_keys.add(key)
save_kg_nodes(existing_nodes)
save_kg_edges(existing_edges)
return len(existing_nodes), len(existing_edges)
def remove_doc_from_kg(doc_id: str) -> tuple[int, int]:
"""Remove all nodes/edges from a document. Returns (removed_nodes, removed_edges)."""
nodes = load_kg_nodes()
edges = load_kg_edges()
old_n, old_e = len(nodes), len(edges)
nodes = [n for n in nodes if n.get("source_doc") != doc_id]
edges = [e for e in edges if e.get("doc_id") != doc_id]
save_kg_nodes(nodes)
save_kg_edges(edges)
return old_n - len(nodes), old_e - len(edges)
# ---------------------------------------------------------------------------
# Query history helpers
# ---------------------------------------------------------------------------
def query_history_path() -> Path:
return _BASE / "query_history.jsonl"
def append_query_history(result: dict) -> None:
append_jsonl(query_history_path(), result)
def load_query_history() -> list[dict]:
records = read_jsonl(query_history_path())
return list(reversed(records)) # newest first
# ---------------------------------------------------------------------------
# Batch job helpers
# ---------------------------------------------------------------------------
def batch_meta_path(batch_id: str) -> Path:
return _BASE / "batches" / f"{batch_id}.json"
def load_batch_meta(batch_id: str) -> dict | None:
return read_json(batch_meta_path(batch_id))
def save_batch_meta(batch_id: str, meta: dict) -> None:
write_json(batch_meta_path(batch_id), meta)
# ---------------------------------------------------------------------------
# Storage usage
# ---------------------------------------------------------------------------
def storage_used_mb() -> float:
total = 0
for path in _BASE.rglob("*"):
if path.is_file():
total += path.stat().st_size
return round(total / (1024 * 1024), 2)