GraphRAG Studio — initial commit: multimodal RAG system with KG visualization
Full-stack application for a document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
66
backend/pipeline/entity_extractor.py
Normal file
66
backend/pipeline/entity_extractor.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""
|
||||
Entity Extractor — LangExtract + DeepSeek entity extraction.
|
||||
Independent implementation for the GraphRAG Studio backend.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
import langextract as lx
|
||||
from langextract.providers.openai import OpenAILanguageModel
|
||||
|
||||
# Load settings from the ".env" file located two directory levels above this
# module's path (the error raised by create_model() calls it "backend/.env").
# override=True is passed so that, per python-dotenv, values from the file
# replace variables that are already present in the process environment.
load_dotenv(
    dotenv_path=Path(__file__).parent.parent / ".env",
    override=True,
)

# DeepSeek connection settings, read once at import time.
# An unset API key becomes the empty string; create_model() rejects that value.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
# Falls back to the public DeepSeek endpoint when no base URL is configured.
DEEPSEEK_BASE_URL = os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
# Model identifier handed to the language-model wrapper.
MODEL_ID = "deepseek-chat"
|
||||
|
||||
# Instruction text passed to LangExtract as ``prompt_description``.
# It consists of a fixed lead-in sentence followed by the comma-separated list
# of entity types (each with a short parenthesised gloss) and a closing period.
PROMPT_DESCRIPTION = (
    "Extract named entities from the text in order of appearance. "
    "Entity types: "
    + ", ".join(
        (
            "TECHNOLOGY (software, algorithms, models, tools)",
            "ORGANIZATION (companies, research groups, institutions)",
            "PERSON (individual people)",
            "LOCATION (places, geographic entities)",
            "CONCEPT (technical concepts, methodologies, frameworks)",
        )
    )
    + "."
)
|
||||
|
||||
# Few-shot example passed to LangExtract as ``examples``: one sample sentence
# together with the entities expected from it. The (class, text) pairs are
# listed in the order the spans occur in the sample sentence.
EXAMPLES = [
    lx.data.ExampleData(
        text=(
            "LangChain is a framework created by Harrison Chase for building "
            "LLM applications. It integrates with OpenAI models and Pinecone "
            "vector database for semantic search."
        ),
        extractions=[
            lx.data.Extraction(extraction_class=label, extraction_text=span)
            for label, span in (
                ("TECHNOLOGY", "LangChain"),
                ("PERSON", "Harrison Chase"),
                ("CONCEPT", "LLM applications"),
                ("TECHNOLOGY", "OpenAI models"),
                ("TECHNOLOGY", "Pinecone"),
                ("CONCEPT", "semantic search"),
            )
        ],
    )
]
|
||||
|
||||
|
||||
def create_model() -> OpenAILanguageModel:
    """Build the language-model wrapper used for entity extraction.

    The wrapper is constructed from the module-level ``MODEL_ID``,
    ``DEEPSEEK_API_KEY`` and ``DEEPSEEK_BASE_URL`` settings.

    Returns:
        An ``OpenAILanguageModel`` configured with those three settings.

    Raises:
        ValueError: If ``DEEPSEEK_API_KEY`` is empty.
    """
    if DEEPSEEK_API_KEY:
        connection = {
            "model_id": MODEL_ID,
            "api_key": DEEPSEEK_API_KEY,
            "base_url": DEEPSEEK_BASE_URL,
        }
        return OpenAILanguageModel(**connection)

    # No usable key: fail before any client object is created.
    raise ValueError("DEEPSEEK_API_KEY not set in backend/.env")
|
||||
|
||||
|
||||
def extract_entities(page_text: str, model: OpenAILanguageModel) -> lx.data.AnnotatedDocument:
    """Run LangExtract entity extraction over a piece of text.

    Args:
        page_text: The text to extract entities from.
        model: The language model handed to ``lx.extract`` (for instance the
            one returned by ``create_model()``).

    Returns:
        The result of ``lx.extract`` for ``page_text``, using the module's
        ``PROMPT_DESCRIPTION`` and ``EXAMPLES``.
    """
    # Fixed extraction settings; progress output is switched off.
    extraction_options = {
        "prompt_description": PROMPT_DESCRIPTION,
        "examples": EXAMPLES,
        "model": model,
        "show_progress": False,
    }
    return lx.extract(text_or_documents=page_text, **extraction_options)
|
||||
Reference in New Issue
Block a user