GraphRAG Studio — initial commit: multimodal RAG system with KG visualization
Full-stack application for document-to-knowledge-graph pipeline: - Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing - Frontend: React 19 + Vite + D3.js + shadcn/ui - Pipeline: MinerU parsing → LangExtract entity extraction → KG building Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
123
backend/pipeline/kg_builder.py
Normal file
123
backend/pipeline/kg_builder.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""
|
||||
KG Builder — node deduplication + CO_OCCURS_IN edge generation.
|
||||
Independent implementation for the GraphRAG Studio backend.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import langextract as lx
|
||||
|
||||
from pipeline.text_assembler import PageText
|
||||
|
||||
ACCEPTED_ALIGNMENTS = {"match_exact", "match_greater", "match_lesser"}
|
||||
|
||||
|
||||
def build_kg(
|
||||
pages: list[PageText],
|
||||
annotated_docs: list[lx.data.AnnotatedDocument],
|
||||
source_doc_id: str,
|
||||
) -> tuple[list[dict], list[dict]]:
|
||||
"""Build KG nodes and edges from LangExtract results.
|
||||
|
||||
Returns:
|
||||
(nodes, edges) — deduplicated node list and edge list.
|
||||
"""
|
||||
# Phase 1: collect raw entities
|
||||
raw_entities = []
|
||||
for page, doc in zip(pages, annotated_docs):
|
||||
if not doc.extractions:
|
||||
continue
|
||||
for ext in doc.extractions:
|
||||
status = ext.alignment_status.value if ext.alignment_status else None
|
||||
if status not in ACCEPTED_ALIGNMENTS:
|
||||
continue
|
||||
char_start = ext.char_interval.start_pos if ext.char_interval else None
|
||||
char_end = ext.char_interval.end_pos if ext.char_interval else None
|
||||
raw_entities.append({
|
||||
"name": ext.extraction_text,
|
||||
"type": ext.extraction_class,
|
||||
"char_start": char_start,
|
||||
"char_end": char_end,
|
||||
"confidence": status,
|
||||
"page": page.page_idx,
|
||||
"source_doc": source_doc_id,
|
||||
})
|
||||
|
||||
# Phase 2: deduplicate nodes
|
||||
seen: dict[tuple[str, str], int] = {}
|
||||
nodes: list[dict] = []
|
||||
node_pages: dict[int, set[int]] = defaultdict(set)
|
||||
|
||||
for entity in raw_entities:
|
||||
type_prefix = entity["type"].lower()[:4]
|
||||
name_slug = entity["name"].lower().replace(" ", "")[:12]
|
||||
dedup_key = (entity["name"].lower(), entity["type"])
|
||||
if dedup_key not in seen:
|
||||
node_idx = len(nodes)
|
||||
seen[dedup_key] = node_idx
|
||||
nodes.append({
|
||||
"id": f"{type_prefix}_{name_slug}_{node_idx}",
|
||||
"name": entity["name"],
|
||||
"type": entity["type"],
|
||||
"source_doc": entity["source_doc"],
|
||||
"char_start": entity["char_start"],
|
||||
"char_end": entity["char_end"],
|
||||
"confidence": entity["confidence"],
|
||||
"page": entity["page"],
|
||||
})
|
||||
node_idx = seen[dedup_key]
|
||||
node_pages[node_idx].add(entity["page"])
|
||||
|
||||
# Phase 3: CO_OCCURS_IN edges
|
||||
page_nodes: dict[int, list[int]] = defaultdict(list)
|
||||
for node_idx, page_set in node_pages.items():
|
||||
for page_idx in page_set:
|
||||
page_nodes[page_idx].append(node_idx)
|
||||
|
||||
edges: list[dict] = []
|
||||
edge_seen: set[tuple] = set()
|
||||
|
||||
for page_idx, node_indices in sorted(page_nodes.items()):
|
||||
for i in range(len(node_indices)):
|
||||
for j in range(i + 1, len(node_indices)):
|
||||
a = nodes[node_indices[i]]["id"]
|
||||
b = nodes[node_indices[j]]["id"]
|
||||
src, tgt = (a, b) if a < b else (b, a)
|
||||
key = (src, tgt, source_doc_id, page_idx)
|
||||
if key in edge_seen:
|
||||
continue
|
||||
edge_seen.add(key)
|
||||
edges.append({
|
||||
"source": src,
|
||||
"target": tgt,
|
||||
"relation": "CO_OCCURS_IN",
|
||||
"doc_id": source_doc_id,
|
||||
"page": page_idx,
|
||||
})
|
||||
|
||||
return nodes, edges
|
||||
|
||||
|
||||
def extractions_to_records(
|
||||
pages: list[PageText],
|
||||
annotated_docs: list[lx.data.AnnotatedDocument],
|
||||
doc_id: str,
|
||||
) -> list[dict]:
|
||||
"""Flatten LangExtract results to ExtractionRecord dicts."""
|
||||
records = []
|
||||
for page, doc in zip(pages, annotated_docs):
|
||||
if not doc.extractions:
|
||||
continue
|
||||
for ext in doc.extractions:
|
||||
status = ext.alignment_status.value if ext.alignment_status else None
|
||||
records.append({
|
||||
"text": ext.extraction_text,
|
||||
"type": ext.extraction_class,
|
||||
"char_start": ext.char_interval.start_pos if ext.char_interval else None,
|
||||
"char_end": ext.char_interval.end_pos if ext.char_interval else None,
|
||||
"alignment": status,
|
||||
"page": page.page_idx,
|
||||
"doc_id": doc_id,
|
||||
})
|
||||
return records
|
||||
Reference in New Issue
Block a user