GraphRAG Studio — initial commit: multimodal RAG system with KG visualization

Full-stack application for document-to-knowledge-graph pipeline: - Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing - Frontend: React 19 + Vite + D3.js + shadcn/ui - Pipeline: MinerU parsing → LangExtract entity extraction → KG building Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-07 17:30:04 +08:00
commit b02d3378fc
127 changed files with 37218 additions and 0 deletions
--- a/backend/pipeline/kg_builder.py
+++ b/backend/pipeline/kg_builder.py
@@ -0,0 +1,123 @@
+"""
+KG Builder — node deduplication + CO_OCCURS_IN edge generation.
+Independent implementation for the GraphRAG Studio backend.
+"""
+from __future__ import annotations
+
+from collections import defaultdict
+
+import langextract as lx
+
+from pipeline.text_assembler import PageText
+
+ACCEPTED_ALIGNMENTS = {"match_exact", "match_greater", "match_lesser"}
+
+
+def build_kg(
+    pages: list[PageText],
+    annotated_docs: list[lx.data.AnnotatedDocument],
+    source_doc_id: str,
+) -> tuple[list[dict], list[dict]]:
+    """Build KG nodes and edges from LangExtract results.
+
+    Returns:
+        (nodes, edges) — deduplicated node list and edge list.
+    """
+    # Phase 1: collect raw entities
+    raw_entities = []
+    for page, doc in zip(pages, annotated_docs):
+        if not doc.extractions:
+            continue
+        for ext in doc.extractions:
+            status = ext.alignment_status.value if ext.alignment_status else None
+            if status not in ACCEPTED_ALIGNMENTS:
+                continue
+            char_start = ext.char_interval.start_pos if ext.char_interval else None
+            char_end = ext.char_interval.end_pos if ext.char_interval else None
+            raw_entities.append({
+                "name": ext.extraction_text,
+                "type": ext.extraction_class,
+                "char_start": char_start,
+                "char_end": char_end,
+                "confidence": status,
+                "page": page.page_idx,
+                "source_doc": source_doc_id,
+            })
+
+    # Phase 2: deduplicate nodes
+    seen: dict[tuple[str, str], int] = {}
+    nodes: list[dict] = []
+    node_pages: dict[int, set[int]] = defaultdict(set)
+
+    for entity in raw_entities:
+        type_prefix = entity["type"].lower()[:4]
+        name_slug = entity["name"].lower().replace(" ", "")[:12]
+        dedup_key = (entity["name"].lower(), entity["type"])
+        if dedup_key not in seen:
+            node_idx = len(nodes)
+            seen[dedup_key] = node_idx
+            nodes.append({
+                "id": f"{type_prefix}_{name_slug}_{node_idx}",
+                "name": entity["name"],
+                "type": entity["type"],
+                "source_doc": entity["source_doc"],
+                "char_start": entity["char_start"],
+                "char_end": entity["char_end"],
+                "confidence": entity["confidence"],
+                "page": entity["page"],
+            })
+        node_idx = seen[dedup_key]
+        node_pages[node_idx].add(entity["page"])
+
+    # Phase 3: CO_OCCURS_IN edges
+    page_nodes: dict[int, list[int]] = defaultdict(list)
+    for node_idx, page_set in node_pages.items():
+        for page_idx in page_set:
+            page_nodes[page_idx].append(node_idx)
+
+    edges: list[dict] = []
+    edge_seen: set[tuple] = set()
+
+    for page_idx, node_indices in sorted(page_nodes.items()):
+        for i in range(len(node_indices)):
+            for j in range(i + 1, len(node_indices)):
+                a = nodes[node_indices[i]]["id"]
+                b = nodes[node_indices[j]]["id"]
+                src, tgt = (a, b) if a < b else (b, a)
+                key = (src, tgt, source_doc_id, page_idx)
+                if key in edge_seen:
+                    continue
+                edge_seen.add(key)
+                edges.append({
+                    "source": src,
+                    "target": tgt,
+                    "relation": "CO_OCCURS_IN",
+                    "doc_id": source_doc_id,
+                    "page": page_idx,
+                })
+
+    return nodes, edges
+
+
+def extractions_to_records(
+    pages: list[PageText],
+    annotated_docs: list[lx.data.AnnotatedDocument],
+    doc_id: str,
+) -> list[dict]:
+    """Flatten LangExtract results to ExtractionRecord dicts."""
+    records = []
+    for page, doc in zip(pages, annotated_docs):
+        if not doc.extractions:
+            continue
+        for ext in doc.extractions:
+            status = ext.alignment_status.value if ext.alignment_status else None
+            records.append({
+                "text": ext.extraction_text,
+                "type": ext.extraction_class,
+                "char_start": ext.char_interval.start_pos if ext.char_interval else None,
+                "char_end": ext.char_interval.end_pos if ext.char_interval else None,
+                "alignment": status,
+                "page": page.page_idx,
+                "doc_id": doc_id,
+            })
+    return records