GraphRAG Studio — initial commit: multimodal RAG system with KG visualization

Full-stack application for document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
plf
2026-06-07 17:30:04 +08:00
commit b02d3378fc
127 changed files with 37218 additions and 0 deletions

View File

@@ -0,0 +1,66 @@
"""
Entity Extractor — LangExtract + DeepSeek entity extraction.
Independent implementation for the GraphRAG Studio backend.
"""
from __future__ import annotations
import os
from pathlib import Path
from dotenv import load_dotenv
import langextract as lx
from langextract.providers.openai import OpenAILanguageModel
load_dotenv(Path(__file__).parent.parent / ".env", override=True)
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
MODEL_ID = "deepseek-chat"
PROMPT_DESCRIPTION = (
"Extract named entities from the text in order of appearance. "
"Entity types: TECHNOLOGY (software, algorithms, models, tools), "
"ORGANIZATION (companies, research groups, institutions), "
"PERSON (individual people), "
"LOCATION (places, geographic entities), "
"CONCEPT (technical concepts, methodologies, frameworks)."
)
EXAMPLES = [
lx.data.ExampleData(
text=(
"LangChain is a framework created by Harrison Chase for building "
"LLM applications. It integrates with OpenAI models and Pinecone "
"vector database for semantic search."
),
extractions=[
lx.data.Extraction(extraction_class="TECHNOLOGY", extraction_text="LangChain"),
lx.data.Extraction(extraction_class="PERSON", extraction_text="Harrison Chase"),
lx.data.Extraction(extraction_class="CONCEPT", extraction_text="LLM applications"),
lx.data.Extraction(extraction_class="TECHNOLOGY", extraction_text="OpenAI models"),
lx.data.Extraction(extraction_class="TECHNOLOGY", extraction_text="Pinecone"),
lx.data.Extraction(extraction_class="CONCEPT", extraction_text="semantic search"),
],
)
]
def create_model() -> OpenAILanguageModel:
if not DEEPSEEK_API_KEY:
raise ValueError("DEEPSEEK_API_KEY not set in backend/.env")
return OpenAILanguageModel(
model_id=MODEL_ID,
api_key=DEEPSEEK_API_KEY,
base_url=DEEPSEEK_BASE_URL,
)
def extract_entities(page_text: str, model: OpenAILanguageModel) -> lx.data.AnnotatedDocument:
return lx.extract(
text_or_documents=page_text,
prompt_description=PROMPT_DESCRIPTION,
examples=EXAMPLES,
model=model,
show_progress=False,
)