GraphRAG Studio — initial commit: multimodal RAG system with KG visualization

Full-stack application for document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
plf
2026-06-07 17:30:04 +08:00
commit b02d3378fc
127 changed files with 37218 additions and 0 deletions

View File

@@ -0,0 +1,107 @@
"""
Text Assembler — MinerU content_list.json → per-page plain text.
Independent implementation for the GraphRAG Studio backend.
"""
from __future__ import annotations
import dataclasses
import json
from collections import defaultdict
from pathlib import Path
from bs4 import BeautifulSoup
@dataclasses.dataclass
class BlockSpan:
    """Location of one source block inside a page's assembled text."""

    # Position of the block in the full content list (across all pages).
    block_index: int
    # Value of the block's "type" field, e.g. "text" or "table".
    block_type: str
    # Page the block was grouped under.
    page_idx: int
    # Half-open character range [char_start, char_end) within the page text.
    char_start: int
    char_end: int
    # Bounding box copied as-is from the block's "bbox" field.
    bbox: list
@dataclasses.dataclass
class PageText:
    """Plain text assembled for a single page, plus the spans that built it."""

    # Page index shared by every block that contributed to this text.
    page_idx: int
    # The page's block texts joined into one string.
    text: str
    # One entry per contributing block, in the order the blocks were appended.
    block_spans: list[BlockSpan]
def html_table_to_text(table_body: str) -> str:
    """Flatten an HTML table into plain text.

    Each ``<tr>`` becomes one output line; the stripped text of its
    ``<td>``/``<th>`` cells is joined with ``" | "``. Lines are joined with
    newlines, so a row without cell text contributes an empty line.

    Args:
        table_body: HTML markup containing the table rows.

    Returns:
        The table as newline-separated rows, or an empty string when the
        markup contains no ``<tr>`` elements.
    """
    parsed = BeautifulSoup(table_body, "html.parser")
    return "\n".join(
        " | ".join(
            cell.get_text(strip=True) for cell in row.find_all(["td", "th"])
        )
        for row in parsed.find_all("tr")
    )
def load_content_list(path: Path | str) -> list[dict]:
    """Load a content list JSON file from a file path or a directory.

    When *path* is a directory, the file is looked up inside it: names
    matching ``*_content_list.json`` are preferred, with ``*content_list.json``
    as a fallback. If several files match, the first one in sorted order is
    used so the choice does not depend on filesystem listing order.

    Args:
        path: Path to the JSON file itself, or to a directory containing it.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If *path* is a directory with no matching file,
            or the file to open does not exist.
    """
    path = Path(path)
    if path.is_dir():
        # glob() yields entries in an OS-dependent order; sort before picking.
        matches = sorted(path.glob("*_content_list.json")) or sorted(
            path.glob("*content_list.json")
        )
        if not matches:
            raise FileNotFoundError(f"No content_list.json found in {path}")
        path = matches[0]
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
def assemble_pages(content_list: list[dict]) -> list[PageText]:
    """Group blocks by page and flatten each page into plain text.

    Only ``"text"`` and ``"table"`` blocks contribute; every other block type,
    and every block whose text ends up empty, is skipped. The kept blocks of
    a page are joined with a single newline in their original order. Each one
    gets a ``BlockSpan`` such that
    ``page.text[span.char_start:span.char_end]`` is exactly that block's text.

    Args:
        content_list: Block dicts as loaded from ``content_list.json``. A
            block without ``page_idx`` is placed on page 0.

    Returns:
        One ``PageText`` per page index present in the input, ordered by
        ``page_idx``. A page whose blocks were all skipped has empty text and
        no spans.
    """
    # Bucket blocks per page, remembering each block's index in the full list.
    by_page: dict[int, list[tuple[int, dict]]] = defaultdict(list)
    for index, block in enumerate(content_list):
        by_page[block.get("page_idx", 0)].append((index, block))

    result = []
    for page_idx in sorted(by_page):
        parts = []
        spans = []
        cursor = 0
        for block_index, block in by_page[page_idx]:
            block_type = block.get("type", "unknown")
            if block_type == "text":
                # ``or ""`` tolerates an explicit null text value.
                block_text = (block.get("text") or "").rstrip()
            elif block_type == "table":
                table_body = block.get("table_body") or ""
                # Empty trailing rows leave trailing newlines; drop them so a
                # span never covers characters missing from the page text.
                block_text = (
                    html_table_to_text(table_body).rstrip("\n")
                    if table_body
                    else ""
                )
            else:
                continue
            if not block_text:
                continue

            char_end = cursor + len(block_text)
            spans.append(BlockSpan(
                block_index=block_index,
                block_type=block_type,
                page_idx=page_idx,
                char_start=cursor,
                char_end=char_end,
                bbox=block.get("bbox", [0, 0, 0, 0]),
            ))
            parts.append(block_text)
            # Skip over the newline that separates this block from the next.
            cursor = char_end + 1

        result.append(
            PageText(page_idx=page_idx, text="\n".join(parts), block_spans=spans)
        )
    return result
def count_blocks_by_type(content_list: list[dict]) -> dict[str, int]:
    """Tally how many blocks of each type the content list contains.

    Args:
        content_list: Block dicts; a block without a ``"type"`` key is
            counted under ``"unknown"``.

    Returns:
        A plain dict mapping each block type to its number of occurrences,
        keyed in order of first appearance.
    """
    tally: dict[str, int] = {}
    for entry in content_list:
        kind = entry.get("type", "unknown")
        tally[kind] = tally.get(kind, 0) + 1
    return tally