GraphRAG Studio — initial commit: multimodal RAG system with KG visualization

Full-stack application for document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
plf
2026-06-07 17:30:04 +08:00
commit b02d3378fc
127 changed files with 37218 additions and 0 deletions

10
backend/.env.example Normal file
View File

@@ -0,0 +1,10 @@
# DeepSeek API (required for entity extraction + QA)
DEEPSEEK_API_KEY=your_deepseek_api_key_here
DEEPSEEK_BASE_URL=https://api.deepseek.com
# MinerU (required for document parsing)
MINERU_API_TOKEN=your_mineru_api_token_here
# MinerU venv path (absolute path to python.exe)
MINERU_PYTHON=F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe
MINERU_PIPELINE=F:/GraphRAGAgent/mineru_mvp/pipeline.py

10
backend/.gitignore vendored Normal file
View File

@@ -0,0 +1,10 @@
.env
.venv/
__pycache__/
*.pyc
*.pyo
data/uploads/
data/jobs/
data/kg/
*.egg-info/
dist/

28
backend/CLAUDE.md Normal file
View File

@@ -0,0 +1,28 @@
# Backend — GraphRAG Studio API
## 路径
```
F:\GraphRAGAgent\backend\
```
## 启动命令
```bash
cd F:/GraphRAGAgent/backend
.venv/Scripts/python.exe -m uvicorn main:app --host 0.0.0.0 --port 8000 --reload
```
## 接口测试
服务启动后,运行:
```bash
.venv/Scripts/python.exe tests/test_api.py
```
## API 文档
- Swagger UI: http://localhost:8000/docs
- ReDoc: http://localhost:8000/redoc
- 健康检查: http://localhost:8000/api/v1/health

58
backend/main.py Normal file
View File

@@ -0,0 +1,58 @@
"""
GraphRAG Studio — FastAPI Backend
Entry point: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
"""
import sys
from pathlib import Path
# Ensure backend/ is in sys.path for absolute imports
sys.path.insert(0, str(Path(__file__).parent))
from dotenv import load_dotenv
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
load_dotenv(Path(__file__).parent / ".env", override=True)
from routers import documents, indexing, kg, query, search, system
import os  # used for the CORS setting below; main.py already interleaves imports with setup code

# ASGI application object; uvicorn serves it as "main:app".
app = FastAPI(
    title="GraphRAG Studio API",
    description="Multimodal RAG Q&A system backend — MinerU + LangExtract + Agentic-RAG",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# Allowed CORS origins: a comma-separated list in CORS_ALLOW_ORIGINS, or "*"
# (the default, matching the previous behaviour) to accept any origin.
# Blank entries are ignored; an empty setting falls back to "*".
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # A wildcard origin must not be combined with credentials (browsers reject
    # "Access-Control-Allow-Origin: *" on credentialed requests), so cookies /
    # auth headers are only allowed once explicit origins are configured.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Every router is mounted below the shared /api/v1 prefix and contributes its
# own sub-prefix on top of it:
#   documents.router  "/documents" → /api/v1/documents
#   indexing.router   "/index"     → /api/v1/index
#   kg.router         "/kg"        → /api/v1/kg
#   query.router      "/query"     → /api/v1/query
#   search.router     "/search"    → /api/v1/search
#   system.router     (no prefix)  → /api/v1/health, /api/v1/system/...
PREFIX = "/api/v1"

# Registration order is kept exactly as listed above.
for _router_module in (documents, indexing, kg, query, search, system):
    app.include_router(_router_module.router, prefix=PREFIX)
@app.get("/")
async def root():
    # Landing payload: API banner plus pointers to the docs and health routes.
    # (Kept as a comment, not a docstring, so the OpenAPI description is unchanged.)
    landing = {
        "msg": "GraphRAG Studio API v1.0.0",
        "docs": "/docs",
        "health": "/api/v1/health",
    }
    return landing
if __name__ == "__main__":
    # Direct-execution launcher; same settings as the uvicorn command quoted
    # in the module docstring (host 0.0.0.0, port 8000, auto-reload).
    from uvicorn import run as _run_server

    _run_server("main:app", host="0.0.0.0", port=8000, reload=True)

View File

360
backend/models/schemas.py Normal file
View File

@@ -0,0 +1,360 @@
"""
Pydantic v2 schemas — all API data objects per backend_service_specification-v1.0.md
"""
from __future__ import annotations
import uuid
from typing import Any, Generic, Optional, TypeVar
from pydantic import BaseModel, Field
T = TypeVar("T")
# ---------------------------------------------------------------------------
# Universal response envelope
# ---------------------------------------------------------------------------
def _new_request_id() -> str:
    """Return a random UUID4 rendered as a string."""
    return str(uuid.uuid4())


class APIResponse(BaseModel, Generic[T]):
    # Common envelope for API payloads: status code, message, a request id
    # generated per instance, and an optional payload of type T.
    code: int = 0
    msg: str = "success"
    request_id: str = Field(default_factory=_new_request_id)
    data: Optional[T] = None

    @classmethod
    def ok(cls, data: Any = None) -> "APIResponse":
        """Build a success envelope (code 0, msg "success") carrying ``data``."""
        envelope = cls(code=0, msg="success", data=data)
        return envelope

    @classmethod
    def err(cls, code: int, msg: str) -> "APIResponse":
        """Build an error envelope with the given code and message and no payload."""
        envelope = cls(code=code, msg=msg, data=None)
        return envelope
# ---------------------------------------------------------------------------
# A. Document schemas
# ---------------------------------------------------------------------------
class DocumentInfo(BaseModel):
    # One document record as exposed by the API.
    doc_id: str
    filename: str
    format: str
    size_bytes: int
    pages: Optional[int] = None  # optional; None when no page count is supplied
    uploaded_at: str
    status: str # uploaded | indexed | failed
    language: str = "ch"  # NOTE(review): presumably a MinerU language code — confirm
    enable_formula: bool = True
    enable_table: bool = True
class DocumentListData(BaseModel):
    # Paginated list payload whose items are DocumentInfo records.
    total: int
    page: int
    page_size: int
    items: list[DocumentInfo]
class DeleteDocumentData(BaseModel):
    # Payload reporting the outcome of a document deletion, including how
    # many graph nodes and edges were removed with it.
    deleted: bool
    doc_id: str
    removed_nodes: int
    removed_edges: int
# ---------------------------------------------------------------------------
# B. Indexing job schemas
# ---------------------------------------------------------------------------
class IndexingProgress(BaseModel):
    # Progress counters for an indexing job; every counter starts at 0.
    parsed_pages: int = 0
    total_pages: int = 0
    extracted_entities: int = 0
class IndexingJobStatus(BaseModel):
    # Status snapshot of one indexing job for one document.
    job_id: str
    doc_id: str
    status: str # submitted|queued|parsing|extracting|indexing|done|failed|cancelled
    stage: str = ""
    # default_factory gives each instance its own zeroed IndexingProgress.
    progress: IndexingProgress = Field(default_factory=IndexingProgress)
    created_at: str
    elapsed_seconds: float = 0.0
    error: Optional[str] = None  # optional; defaults to None
class StartIndexRequest(BaseModel):
    # Request body carrying the id of the document to index.
    doc_id: str
class CancelJobData(BaseModel):
    # Payload reporting the outcome of a job cancellation request.
    cancelled: bool
    job_id: str
    previous_status: str
# ---------------------------------------------------------------------------
# C. KG schemas
# ---------------------------------------------------------------------------
class KGNode(BaseModel):
    # A knowledge-graph node (entity).
    id: str
    name: str
    type: str
    source_doc: str
    char_start: Optional[int] = None
    char_end: Optional[int] = None
    # A string, not a number; the KG builder in this commit stores the
    # LangExtract alignment status under this key.
    confidence: Optional[str] = None
    page: int = 0
    degree: int = 0
class KGNodeDetail(KGNode):
    # KGNode extended with two extra graph metrics (both default to zero).
    degree_centrality: float = 0.0
    neighbor_count: int = 0
class KGEdge(BaseModel):
    # A knowledge-graph edge between two node ids.
    source: str
    target: str
    relation: str = "CO_OCCURS_IN"  # default relation label
    doc_id: str
    page: int = 0
class KGNodeListData(BaseModel):
    # Paginated list payload whose items are KGNode records.
    total: int
    page: int
    page_size: int
    items: list[KGNode]
class KGEdgeListData(BaseModel):
    # Paginated list payload whose items are KGEdge records.
    total: int
    page: int
    page_size: int
    items: list[KGEdge]
class KGStatsData(BaseModel):
    # Aggregate statistics payload for the knowledge graph.
    total_nodes: int
    total_edges: int
    density: float
    type_distribution: dict[str, int]  # string key → integer count
    relation_types: dict[str, int]  # string key → integer count
    top5_central_nodes: list[dict]  # untyped dicts; item schema is not defined here
    source_documents: list[str]
class KGExportData(BaseModel):
    # Export payload carrying node and edge lists plus summary counts.
    format: str
    doc_id: Optional[str]  # no default: must be supplied explicitly, but may be None
    total_nodes: int
    total_edges: int
    exported_at: str
    nodes: list[KGNode]
    edges: list[KGEdge]
class NeighborInfo(BaseModel):
    # Compact node summary used inside NeighborsData.
    id: str
    name: str
    type: str
    page: int
class NeighborsData(BaseModel):
    # Neighborhood payload: a center node plus neighbors grouped per hop.
    center: NeighborInfo
    hops: int
    # Keys are strings. NOTE(review): presumably the hop number rendered as text — confirm.
    neighbors_by_hop: dict[str, list[NeighborInfo]]
    total_neighbors: int
# ---------------------------------------------------------------------------
# D. QA schemas
# ---------------------------------------------------------------------------
class ChatMessage(BaseModel):
    # One message of a conversation.
    role: str # human | ai
    content: str
class QueryRequest(BaseModel):
    # Request body: a question plus optional chat history.
    question: str
    history: list[ChatMessage] = Field(default_factory=list)  # fresh empty list when omitted
class ToolCallRecord(BaseModel):
    # Record of a single tool invocation: tool name, input dict, text output.
    tool: str
    input: dict
    output: str
class QAResult(BaseModel):
    # Result of one answered question.
    query_id: str
    question: str
    answer: str
    tool_calls: list[ToolCallRecord] = Field(default_factory=list)  # empty list when omitted
    cited_nodes: list[str] = Field(default_factory=list)  # empty list when omitted
    elapsed_seconds: float
    created_at: str
class QAHistoryData(BaseModel):
    # Paginated list payload whose items are QAResult records.
    total: int
    page: int
    page_size: int
    items: list[QAResult]
class BatchQueryRequest(BaseModel):
    # Request body for a batch of questions.
    # Required field (`...`); max_length caps the list at 20 entries.
    questions: list[str] = Field(..., max_length=20)
class BatchQueryData(BaseModel):
    # Payload identifying a batch and reporting its size and status.
    batch_id: str
    total: int
    status: str
    created_at: str
class BatchResultData(BaseModel):
    # Batch outcome: completed/failed counters plus the collected QAResult items.
    batch_id: str
    total: int
    completed: int
    failed: int
    status: str
    results: list[QAResult]
# ---------------------------------------------------------------------------
# E. Search schemas
# ---------------------------------------------------------------------------
class EntitySearchData(BaseModel):
    # Entity search payload: the query, a total count and the matching nodes.
    query: str
    total: int
    items: list[KGNode]
class PathNode(BaseModel):
    # Minimal node description used in path results.
    id: str
    name: str
    type: str
class PathEdge(BaseModel):
    # Minimal edge description used in path results.
    source: str
    target: str
    relation: str
class PathInfo(BaseModel):
    # One path: its length together with the nodes and edges along it.
    length: int
    nodes: list[PathNode]
    edges: list[PathEdge]
class PathSearchData(BaseModel):
    # Path search payload between two nodes.
    # The endpoints are aliased to "from"/"to"; `from` is a Python keyword and
    # cannot be used as an attribute name, hence the *_node field names.
    from_node: PathNode = Field(alias="from")
    to_node: PathNode = Field(alias="to")
    max_hops: int
    paths: list[PathInfo]
    total_paths: int
    # populate_by_name: accept the field names (from_node/to_node) as well as
    # the aliases when constructing the model.
    model_config = {"populate_by_name": True}
class GraphSearchData(BaseModel):
    # Graph search payload: the query, matched nodes and subgraph edges.
    query: str
    matched_nodes: list[KGNode]
    subgraph_edges: list[KGEdge]
# ---------------------------------------------------------------------------
# F. System schemas
# ---------------------------------------------------------------------------
class ComponentHealth(BaseModel):
    # Health entry for one component. Only `status` is required; every other
    # field is optional and defaults to None.
    status: str # ok | error
    path: Optional[str] = None
    exists: Optional[bool] = None
    base_url: Optional[str] = None
    key_configured: Optional[bool] = None
    kg_nodes_exists: Optional[bool] = None
    kg_edges_exists: Optional[bool] = None
    uploads_dir_exists: Optional[bool] = None
class HealthData(BaseModel):
    # Health payload: overall status, version, uptime and per-component entries.
    status: str
    version: str
    uptime_seconds: float
    components: dict[str, ComponentHealth]  # keyed by a string component name
class SystemStatsData(BaseModel):
    # System-wide counters covering documents, graph size, queries, jobs and storage.
    total_documents: int
    indexed_documents: int
    failed_documents: int
    total_nodes: int
    total_edges: int
    type_distribution: dict[str, int]  # string key → integer count
    total_queries: int
    active_jobs: int
    storage_used_mb: float
class FormatInfo(BaseModel):
    # Description of one file format together with its limits.
    ext: str
    description: str
    max_size_mb: int
    max_pages: int
    requires_ocr: bool
class FormatsData(BaseModel):
    # Payload listing file formats, OCR languages and free-text notes.
    formats: list[FormatInfo]
    ocr_languages: list[dict]  # untyped dicts; item schema is not defined here
    notes: list[str]
class DemoData(BaseModel):
    # Demo payload: a node list, an edge list and an untyped stats dict.
    nodes: list[KGNode]
    edges: list[KGEdge]
    stats: dict
# ---------------------------------------------------------------------------
# B3 index result
# ---------------------------------------------------------------------------
class IndexResultStats(BaseModel):
    # Summary counters for an index result; counters default to zero and the
    # dict fields default to fresh empty dicts.
    blocks: int = 0
    block_types: dict[str, int] = Field(default_factory=dict)
    pages: int = 0
    raw_extractions: int = 0
    nodes: int = 0
    edges: int = 0
    type_counts: dict[str, int] = Field(default_factory=dict)
    alignment_counts: dict[str, int] = Field(default_factory=dict)
    elapsed_seconds: float = 0.0
class ExtractionRecord(BaseModel):
    # One flattened extraction: text, type, optional character span,
    # optional alignment label, page and owning document id.
    text: str
    type: str
    char_start: Optional[int] = None
    char_end: Optional[int] = None
    alignment: Optional[str] = None
    page: int = 0
    doc_id: str
class IndexResultData(BaseModel):
    # Index result payload for one job; everything after `status` is optional
    # and defaults to None.
    job_id: str
    doc_id: str
    status: str
    stats: Optional[IndexResultStats] = None
    extractions: Optional[list[ExtractionRecord]] = None
    nodes: Optional[list[KGNode]] = None
    edges: Optional[list[KGEdge]] = None

View File

@@ -0,0 +1,367 @@
[
{
"type": "text",
"text": "GraphRAG System ",
"text_level": 1,
"bbox": [
344,
175,
655,
204
],
"page_idx": 0
},
{
"type": "text",
"text": "Technical Architecture Overview ",
"bbox": [
289,
234,
710,
254
],
"page_idx": 0
},
{
"type": "text",
"text": "Version 1.0 | March 2026 ",
"bbox": [
364,
272,
633,
290
],
"page_idx": 0
},
{
"type": "text",
"text": "1. Abstract ",
"text_level": 1,
"bbox": [
52,
42,
200,
61
],
"page_idx": 1
},
{
"type": "text",
"text": "This document presents the technical architecture of a Multimodal GraphRAG System designed for intelligent document parsing and knowledge graph construction. The system integrates MinerU for document parsing, LangExtract for structured entity extraction, and a graph database for knowledge storage and retrieval. ",
"bbox": [
48,
83,
951,
171
],
"page_idx": 1
},
{
"type": "text",
"text": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files. Extracted entities and relations are stored as graph nodes and edges, enabling semantic search and question answering over large document collections. ",
"bbox": [
48,
200,
949,
265
],
"page_idx": 1
},
{
"type": "text",
"text": "2. System Components ",
"text_level": 1,
"bbox": [
50,
299,
321,
318
],
"page_idx": 1
},
{
"type": "text",
"text": "2.1 Document Parsing Module ",
"text_level": 1,
"bbox": [
50,
343,
349,
361
],
"page_idx": 1
},
{
"type": "text",
"text": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG, JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted images. ",
"bbox": [
48,
373,
951,
436
],
"page_idx": 1
},
{
"type": "text",
"text": "2.2 Entity Extraction Module ",
"text_level": 1,
"bbox": [
50,
461,
357,
479
],
"page_idx": 1
},
{
"type": "text",
"text": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes character-level position anchoring. ",
"bbox": [
48,
492,
949,
555
],
"page_idx": 1
},
{
"type": "text",
"text": "2.3 Knowledge Graph Module ",
"text_level": 1,
"bbox": [
50,
580,
337,
596
],
"page_idx": 1
},
{
"type": "text",
"text": "Extracted entities and relationships are stored in a graph database. Node types include: Person, Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY, LOCATED_IN. ",
"bbox": [
48,
608,
949,
674
],
"page_idx": 1
},
{
"type": "text",
"text": "2.4 Retrieval Module ",
"text_level": 1,
"bbox": [
50,
697,
272,
715
],
"page_idx": 1
},
{
"type": "text",
"text": "The retrieval layer supports hybrid search combining vector similarity and graph traversal. \nQuery results are ranked by relevance score and returned with source document references. ",
"bbox": [
48,
727,
944,
766
],
"page_idx": 1
},
{
"type": "text",
"text": "3. Data Pipeline ",
"text_level": 1,
"bbox": [
50,
42,
268,
61
],
"page_idx": 2
},
{
"type": "text",
"text": "The end-to-end data pipeline consists of the following stages: ",
"bbox": [
50,
83,
623,
99
],
"page_idx": 2
},
{
"type": "text",
"text": "Stage 1: Document Ingestion ",
"bbox": [
68,
130,
322,
146
],
"page_idx": 2
},
{
"type": "text",
"text": "- Accept raw documents (PDF, DOCX, images, HTML) - Submit to MinerU API for parsing - Poll task status until state $\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }$ done ",
"bbox": [
85,
153,
531,
217
],
"page_idx": 2
},
{
"type": "text",
"text": "Stage 2: Content Extraction ",
"bbox": [
68,
249,
322,
263
],
"page_idx": 2
},
{
"type": "text",
"text": "- Download and decompress full_zip_url - Parse content_list.json into Document objects - Separate text blocks, tables, images, equations ",
"bbox": [
85,
272,
542,
335
],
"page_idx": 2
},
{
"type": "text",
"text": "Stage 3: Entity & Relation Extraction ",
"bbox": [
67,
367,
415,
381
],
"page_idx": 2
},
{
"type": "text",
"text": "- Feed text blocks to LangExtract - Extract entities with char_interval positions - Extract relationships between entities ",
"bbox": [
85,
390,
526,
454
],
"page_idx": 2
},
{
"type": "text",
"text": "Stage 4: Graph Construction ",
"bbox": [
68,
485,
322,
500
],
"page_idx": 2
},
{
"type": "text",
"text": "- Map extractions to graph nodes and edges - Store with source provenance (page_idx, bbox) - Build vector embeddings for semantic search ",
"bbox": [
85,
508,
522,
571
],
"page_idx": 2
},
{
"type": "text",
"text": "4. Supported File Formats ",
"text_level": 1,
"bbox": [
50,
604,
326,
620
],
"page_idx": 2
},
{
"type": "table",
"img_path": "images/1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg",
"table_caption": [],
"table_footnote": [],
"table_body": "<table><tr><td rowspan=1 colspan=1>Format</td><td rowspan=1 colspan=1>Extension</td><td rowspan=1 colspan=1>OCR Required</td><td rowspan=1 colspan=1>ModeI</td></tr><tr><td rowspan=1 colspan=1>PDF (text)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline / vlm</td></tr><tr><td rowspan=1 colspan=1>PDF (scan)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>Yes</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>Word</td><td rowspan=1 colspan=1>. docx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>PowerPoint</td><td rowspan=1 colspan=1>.pptx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>Image</td><td rowspan=1 colspan=1>.png / .jpg</td><td rowspan=1 colspan=1>Auto</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>HTML</td><td rowspan=1 colspan=1>.html</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>MinerU-HTML</td></tr></table>",
"bbox": [
45,
634,
882,
806
],
"page_idx": 2
},
{
"type": "text",
"text": "5. API Configuration Reference ",
"text_level": 1,
"bbox": [
48,
42,
457,
63
],
"page_idx": 3
},
{
"type": "text",
"text": "The following environment variables must be configured before running the MinerU parsing service: ",
"bbox": [
48,
83,
952,
123
],
"page_idx": 3
},
{
"type": "text",
"text": "MINERU_API_TOKEN : Bearer token for API authentication \nMINERU_USER_UID : User UUID for quota management \nMINERU_BASE_URL : https://mineru.net/api/v4 \nMINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML \nMINERU_LANGUAGE : ch (Chinese) | en (English) \nMINERU_IS_OCR : false (text PDF) | true (scanned PDF) \nMINERU_ENABLE_FORMULA: true | false \nMINERU_ENABLE_TABLE : true | false ",
"bbox": [
65,
152,
636,
337
],
"page_idx": 3
},
{
"type": "text",
"text": "Rate Limits: ",
"bbox": [
48,
367,
161,
381
],
"page_idx": 3
},
{
"type": "text",
"text": "- Max file size : 200 MB per file - Max pages : 600 pages per file - Daily quota : 2000 pages (high priority) - Batch limit : 200 files per request ",
"bbox": [
65,
388,
504,
478
],
"page_idx": 3
}
]

View File

@@ -0,0 +1,71 @@
# GraphRAG System
Technical Architecture Overview
Version 1.0 | March 2026
# 1. Abstract
This document presents the technical architecture of a Multimodal GraphRAG System designed for intelligent document parsing and knowledge graph construction. The system integrates MinerU for document parsing, LangExtract for structured entity extraction, and a graph database for knowledge storage and retrieval.
The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files. Extracted entities and relations are stored as graph nodes and edges, enabling semantic search and question answering over large document collections.
# 2. System Components
# 2.1 Document Parsing Module
MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG, JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted images.
# 2.2 Entity Extraction Module
LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes character-level position anchoring.
# 2.3 Knowledge Graph Module
Extracted entities and relationships are stored in a graph database. Node types include: Person, Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY, LOCATED_IN.
# 2.4 Retrieval Module
The retrieval layer supports hybrid search combining vector similarity and graph traversal.
Query results are ranked by relevance score and returned with source document references.
# 3. Data Pipeline
The end-to-end data pipeline consists of the following stages:
Stage 1: Document Ingestion
- Accept raw documents (PDF, DOCX, images, HTML) - Submit to MinerU API for parsing - Poll task status until state $\underline { { \underline { { \mathbf { \delta \pi } } } } }$ done
Stage 2: Content Extraction
- Download and decompress full_zip_url - Parse content_list.json into Document objects - Separate text blocks, tables, images, equations
Stage 3: Entity & Relation Extraction
- Feed text blocks to LangExtract - Extract entities with char_interval positions - Extract relationships between entities
Stage 4: Graph Construction
- Map extractions to graph nodes and edges - Store with source provenance (page_idx, bbox) - Build vector embeddings for semantic search
# 4. Supported File Formats
<table><tr><td rowspan=1 colspan=1>Format</td><td rowspan=1 colspan=1>Extension</td><td rowspan=1 colspan=1>OCR Required</td><td rowspan=1 colspan=1>ModeI</td></tr><tr><td rowspan=1 colspan=1>PDF (text)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline / vlm</td></tr><tr><td rowspan=1 colspan=1>PDF (scan)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>Yes</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>Word</td><td rowspan=1 colspan=1>. docx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>PowerPoint</td><td rowspan=1 colspan=1>.pptx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>Image</td><td rowspan=1 colspan=1>.png / .jpg</td><td rowspan=1 colspan=1>Auto</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>HTML</td><td rowspan=1 colspan=1>.html</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>MinerU-HTML</td></tr></table>
# 5. API Configuration Reference
The following environment variables must be configured before running the MinerU parsing service:
MINERU_API_TOKEN : Bearer token for API authentication
MINERU_USER_UID : User UUID for quota management
MINERU_BASE_URL : https://mineru.net/api/v4
MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML
MINERU_LANGUAGE : ch (Chinese) | en (English)
MINERU_IS_OCR : false (text PDF) | true (scanned PDF)
MINERU_ENABLE_FORMULA: true | false
MINERU_ENABLE_TABLE : true | false
Rate Limits:
- Max file size : 200 MB per file - Max pages : 600 pages per file - Daily quota : 2000 pages (high priority) - Batch limit : 200 files per request

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,10 @@
{
"total_blocks": 32,
"type_distribution": {
"text": 31,
"table": 1
},
"total_pages": 4,
"text_block_count": 31,
"table_block_count": 1
}

View File

View File

@@ -0,0 +1,66 @@
"""
Entity Extractor — LangExtract + DeepSeek entity extraction.
Independent implementation for the GraphRAG Studio backend.
"""
from __future__ import annotations
import os
from pathlib import Path
from dotenv import load_dotenv
import langextract as lx
from langextract.providers.openai import OpenAILanguageModel
load_dotenv(Path(__file__).parent.parent / ".env", override=True)
# DeepSeek connection settings, read from the environment after the .env
# load above. The key falls back to "" so create_model() can detect a
# missing key and raise.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
MODEL_ID = "deepseek-chat"

# Task description passed to LangExtract as `prompt_description`; the
# fragments are joined with single spaces into one sentence-style prompt.
PROMPT_DESCRIPTION = " ".join(
    (
        "Extract named entities from the text in order of appearance.",
        "Entity types: TECHNOLOGY (software, algorithms, models, tools),",
        "ORGANIZATION (companies, research groups, institutions),",
        "PERSON (individual people),",
        "LOCATION (places, geographic entities),",
        "CONCEPT (technical concepts, methodologies, frameworks).",
    )
)
# The single few-shot example handed to LangExtract: one passage plus the
# (class, text) pairs expected from it, in order of appearance.
_EXAMPLE_TEXT = (
    "LangChain is a framework created by Harrison Chase for building "
    "LLM applications. It integrates with OpenAI models and Pinecone "
    "vector database for semantic search."
)
_EXAMPLE_ENTITIES = (
    ("TECHNOLOGY", "LangChain"),
    ("PERSON", "Harrison Chase"),
    ("CONCEPT", "LLM applications"),
    ("TECHNOLOGY", "OpenAI models"),
    ("TECHNOLOGY", "Pinecone"),
    ("CONCEPT", "semantic search"),
)
EXAMPLES = [
    lx.data.ExampleData(
        text=_EXAMPLE_TEXT,
        extractions=[
            lx.data.Extraction(extraction_class=label, extraction_text=span)
            for label, span in _EXAMPLE_ENTITIES
        ],
    )
]
def create_model() -> OpenAILanguageModel:
    """Build the LangExtract OpenAI-compatible model pointed at DeepSeek.

    Raises:
        ValueError: if DEEPSEEK_API_KEY is empty.
    """
    if DEEPSEEK_API_KEY:
        settings = {
            "model_id": MODEL_ID,
            "api_key": DEEPSEEK_API_KEY,
            "base_url": DEEPSEEK_BASE_URL,
        }
        return OpenAILanguageModel(**settings)
    raise ValueError("DEEPSEEK_API_KEY not set in backend/.env")
def extract_entities(page_text: str, model: OpenAILanguageModel) -> lx.data.AnnotatedDocument:
    """Run LangExtract on ``page_text`` with the module prompt and examples.

    Args:
        page_text: Text to extract entities from.
        model: Language model instance, e.g. the one built by create_model().

    Returns:
        Whatever ``lx.extract`` returns for the given text.
    """
    extraction_options = {
        "prompt_description": PROMPT_DESCRIPTION,
        "examples": EXAMPLES,
        "model": model,
        "show_progress": False,  # progress output is switched off
    }
    return lx.extract(text_or_documents=page_text, **extraction_options)

View File

@@ -0,0 +1,123 @@
"""
KG Builder — node deduplication + CO_OCCURS_IN edge generation.
Independent implementation for the GraphRAG Studio backend.
"""
from __future__ import annotations
from collections import defaultdict
import langextract as lx
from pipeline.text_assembler import PageText
# Alignment statuses that build_kg() keeps; an extraction with any other
# status, or with no status at all, is left out of the graph.
ACCEPTED_ALIGNMENTS = {"match_exact", "match_greater", "match_lesser"}


def build_kg(
    pages: list[PageText],
    annotated_docs: list[lx.data.AnnotatedDocument],
    source_doc_id: str,
) -> tuple[list[dict], list[dict]]:
    """Convert per-page LangExtract output into KG nodes and edges.

    ``pages`` and ``annotated_docs`` are walked pairwise with ``zip``, so the
    i-th annotated document is attributed to the i-th page.

    Args:
        pages: Page objects; only ``page_idx`` is read.
        annotated_docs: LangExtract results, one per page.
        source_doc_id: Id stamped on every node (``source_doc``) and edge (``doc_id``).

    Returns:
        (nodes, edges) — nodes deduplicated on (lower-cased name, type), and one
        CO_OCCURS_IN edge per node pair per page on which both appear.
    """
    # Step 1: gather every extraction whose alignment status is accepted.
    mentions: list[dict] = []
    for page, annotated in zip(pages, annotated_docs):
        for extraction in annotated.extractions or ():
            alignment = extraction.alignment_status
            status = alignment.value if alignment else None
            if status not in ACCEPTED_ALIGNMENTS:
                continue
            interval = extraction.char_interval
            mentions.append({
                "name": extraction.extraction_text,
                "type": extraction.extraction_class,
                "char_start": interval.start_pos if interval else None,
                "char_end": interval.end_pos if interval else None,
                "confidence": status,
                "page": page.page_idx,
                "source_doc": source_doc_id,
            })

    # Step 2: collapse mentions into unique nodes. The first mention supplies
    # the node's attributes; later mentions only add to its page set.
    copied_fields = ("name", "type", "source_doc", "char_start", "char_end", "confidence", "page")
    index_by_key: dict[tuple[str, str], int] = {}
    nodes: list[dict] = []
    pages_by_node: dict[int, set[int]] = defaultdict(set)
    for mention in mentions:
        prefix = mention["type"].lower()[:4]
        slug = mention["name"].lower().replace(" ", "")[:12]
        key = (mention["name"].lower(), mention["type"])
        position = index_by_key.get(key)
        if position is None:
            position = len(nodes)
            index_by_key[key] = position
            # The running index as suffix keeps ids unique even when the
            # truncated prefix/slug of two nodes coincide.
            node = {"id": f"{prefix}_{slug}_{position}"}
            node.update((field, mention[field]) for field in copied_fields)
            nodes.append(node)
        pages_by_node[position].add(mention["page"])

    # Step 3: invert to page → nodes, then link every pair sharing a page.
    nodes_on_page: dict[int, list[int]] = defaultdict(list)
    for position, page_set in pages_by_node.items():
        for page_idx in page_set:
            nodes_on_page[page_idx].append(position)

    edges: list[dict] = []
    emitted: set[tuple] = set()
    for page_idx in sorted(nodes_on_page):
        members = nodes_on_page[page_idx]
        for offset, first in enumerate(members):
            for second in members[offset + 1:]:
                # Endpoints are ordered lexicographically so an undirected
                # pair always yields the same (source, target).
                src, tgt = sorted((nodes[first]["id"], nodes[second]["id"]))
                signature = (src, tgt, source_doc_id, page_idx)
                if signature in emitted:
                    continue
                emitted.add(signature)
                edges.append({
                    "source": src,
                    "target": tgt,
                    "relation": "CO_OCCURS_IN",
                    "doc_id": source_doc_id,
                    "page": page_idx,
                })
    return nodes, edges
def extractions_to_records(
    pages: list[PageText],
    annotated_docs: list[lx.data.AnnotatedDocument],
    doc_id: str,
) -> list[dict]:
    """Flatten LangExtract results into ExtractionRecord-shaped dicts.

    Unlike build_kg(), nothing is filtered by alignment status: every
    extraction of every (page, document) pair becomes one record.
    """

    def _to_record(extraction, page_idx) -> dict:
        # Missing alignment status / char interval are reported as None.
        alignment = extraction.alignment_status
        interval = extraction.char_interval
        return {
            "text": extraction.extraction_text,
            "type": extraction.extraction_class,
            "char_start": interval.start_pos if interval else None,
            "char_end": interval.end_pos if interval else None,
            "alignment": alignment.value if alignment else None,
            "page": page_idx,
            "doc_id": doc_id,
        }

    flattened: list[dict] = []
    for page, annotated in zip(pages, annotated_docs):
        found = annotated.extractions
        if found:
            flattened.extend(_to_record(item, page.page_idx) for item in found)
    return flattened

View File

@@ -0,0 +1,217 @@
"""
QA Agent — LangGraph ReAct agent over the knowledge graph.
Independent implementation for the GraphRAG Studio backend.
"""
from __future__ import annotations
import os
import re
from pathlib import Path
import networkx as nx
from dotenv import load_dotenv
from langchain.tools import tool
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage, SystemMessage
from langgraph.prebuilt import create_react_agent
load_dotenv(Path(__file__).parent.parent / ".env", override=True)
# DeepSeek credentials and endpoint, read from the environment after the .env
# load above. The key falls back to "" so run_qa() can detect a missing key.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
def build_kg_graph(nodes: list[dict], edges: list[dict]) -> nx.Graph:
    """Load node and edge dicts into an undirected NetworkX graph.

    Each node is keyed by its ``"id"`` and keeps all of its dict entries as
    node attributes. Each edge connects ``"source"`` to ``"target"`` and
    keeps its remaining entries as edge attributes.
    """
    endpoint_keys = ("source", "target")
    graph = nx.Graph()
    for node in nodes:
        graph.add_node(node["id"], **node)
    for edge in edges:
        attributes = {
            key: value for key, value in edge.items() if key not in endpoint_keys
        }
        graph.add_edge(edge["source"], edge["target"], **attributes)
    return graph
def make_tools(G: nx.Graph) -> list:
    """Create the LangChain tools the QA agent uses to query graph ``G``.

    Every tool is a closure over ``G`` and returns a plain-text report meant
    to be read by the LLM.

    Args:
        G: Knowledge graph whose nodes carry ``name``, ``type``, ``id`` (and
            optionally ``confidence`` / ``page``) attributes.

    Returns:
        ``[search_entities, get_neighbors, get_entities_by_type, describe_graph]``.
    """
    # Entity types the extraction prompt asks for; get_entities_by_type also
    # accepts any other type that actually occurs in the graph.
    default_types = {"TECHNOLOGY", "CONCEPT", "PERSON", "ORGANIZATION", "LOCATION"}

    @tool
    def search_entities(query: str) -> str:
        """Search knowledge graph entities by name (case-insensitive substring).

        Args:
            query: Keyword to search for in entity names.
        """
        q = query.lower()
        matches = [data for _, data in G.nodes(data=True) if q in data.get("name", "").lower()]
        if not matches:
            sample = ", ".join(d.get("name", "") for _, d in list(G.nodes(data=True))[:8])
            return f"No entities found matching '{query}'. Sample: {sample}"
        lines = [f"Found {len(matches)} entity(ies) matching '{query}':"]
        for m in matches[:15]:
            lines.append(
                f" [{m.get('type','?')}] \"{m.get('name','?')}\" "
                f"(confidence={m.get('confidence','?')}, page={m.get('page',0)}, id={m.get('id','?')})"
            )
        # Say so when the listing is truncated, like the other tools do.
        if len(matches) > 15:
            lines.append(f" ... and {len(matches)-15} more")
        return "\n".join(lines)

    @tool
    def get_neighbors(entity_name: str, hops: int = 1) -> str:
        """Get N-hop neighbors of an entity in the knowledge graph.

        Args:
            entity_name: Entity name (partial match).
            hops: Number of hops (1-3, default 1).
        """
        hops = max(1, min(int(hops), 3))  # clamp to the documented 1-3 range
        needle = entity_name.lower()
        candidates = [(nid, d) for nid, d in G.nodes(data=True)
                      if needle in d.get("name", "").lower()]
        if not candidates:
            return f"Entity '{entity_name}' not found. Use search_entities first."
        # Prefer an exact (case-insensitive) name match; otherwise fall back
        # to the first partial match.
        node_id, node_data = next(
            (c for c in candidates if c[1].get("name", "").lower() == needle),
            candidates[0],
        )
        reachable = nx.single_source_shortest_path_length(G, node_id, cutoff=hops)
        by_hop: dict[int, list] = {}
        for nid, dist in reachable.items():
            if dist > 0:  # distance 0 is the center node itself
                by_hop.setdefault(dist, []).append(G.nodes[nid])
        lines = [
            f"Neighbors of '{node_data.get('name','?')}' "
            f"[{node_data.get('type','?')}] within {hops} hop(s):"
        ]
        for hop in sorted(by_hop):
            hop_nodes = by_hop[hop]
            # The separator keeps the hop number and the count apart.
            lines.append(f"\n Hop {hop}: {len(hop_nodes)} related entities:")
            for n in hop_nodes[:20]:
                lines.append(f" [{n.get('type','?')}] {n.get('name','?')}")
            if len(hop_nodes) > 20:
                lines.append(f" ... and {len(hop_nodes)-20} more")
        lines.append(f"\n Total related entities: {sum(len(v) for v in by_hop.values())}")
        return "\n".join(lines)

    @tool
    def get_entities_by_type(entity_type: str) -> str:
        """List all entities of a specific type.

        Args:
            entity_type: TECHNOLOGY, CONCEPT, PERSON, ORGANIZATION, or LOCATION.
        """
        t_upper = entity_type.strip().upper()
        present_types = {d.get("type", "") for _, d in G.nodes(data=True)}
        # Accept the default types plus whatever types the graph really
        # contains, so a type listed under "Present" is never rejected.
        known = default_types | {str(t).upper() for t in present_types}
        if t_upper not in known:
            present = sorted(present_types)
            return f"Unknown type '{entity_type}'. Present: {present}"
        # Case-insensitive comparison, so differently cased types still match.
        matches = [d for _, d in G.nodes(data=True)
                   if str(d.get("type", "")).upper() == t_upper]
        if not matches:
            return f"No {t_upper} entities found."
        lines = [f"Found {len(matches)} {t_upper} entities:"]
        for m in matches[:30]:
            lines.append(f" \"{m.get('name','?')}\" (page={m.get('page',0)}, id={m.get('id','?')})")
        if len(matches) > 30:
            lines.append(f" ... and {len(matches)-30} more")
        return "\n".join(lines)

    @tool
    def describe_graph() -> str:
        """Get an overview of the knowledge graph statistics."""
        n_nodes = G.number_of_nodes()
        n_edges = G.number_of_edges()
        type_counts: dict[str, int] = {}
        for _, d in G.nodes(data=True):
            t = d.get("type", "UNKNOWN")
            type_counts[t] = type_counts.get(t, 0) + 1
        lines = [
            "Knowledge Graph Overview:",
            f" Nodes: {n_nodes}",
            f" Edges: {n_edges}",
            f" Entity types: {type_counts}",
        ]
        if n_nodes > 0:
            centrality = nx.degree_centrality(G)
            top5 = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:5]
            lines.append(" Top 5 central nodes:")
            for nid, c in top5:
                nd = G.nodes[nid]
                lines.append(f" [{nd.get('type','?')}] {nd.get('name','?')} (centrality={c:.3f})")
        return "\n".join(lines)

    return [search_entities, get_neighbors, get_entities_by_type, describe_graph]
def _history_to_messages(history: list[dict]) -> list:
    """Convert the last 8 history entries into LangChain messages."""
    messages: list = []
    for msg in history[-8:]:
        role = msg.get("role", "human")
        content = msg.get("content", "") or msg.get("answer", "")
        # "user" is accepted as an alias of "human"; previously it fell through
        # to the AI branch and the user's own turn was replayed as the model's.
        if role in ("human", "user"):
            messages.append(HumanMessage(content=msg.get("question", content)))
        else:
            messages.append(AIMessage(content=content))
    return messages


def _collect_tool_calls(all_messages: list) -> tuple[list[dict], set[str]]:
    """Pair every tool call with its output and gather node ids cited in outputs."""
    # First ToolMessage per tool_call_id, indexed once instead of rescanning
    # the message list for every call.
    outputs: dict = {}
    for msg in all_messages:
        if isinstance(msg, ToolMessage):
            outputs.setdefault(msg.tool_call_id, msg.content)

    tool_calls: list[dict] = []
    cited_node_ids: set[str] = set()
    step = 0
    for msg in all_messages:
        if not (isinstance(msg, AIMessage) and msg.tool_calls):
            continue
        for tc in msg.tool_calls:
            step += 1
            output = outputs.get(tc.get("id"), "")
            tool_calls.append({
                "step": step,
                "tool_name": tc.get("name", ""),
                "tool_input": str(tc.get("args", {})),
                "tool_output": str(output),
            })
            # Tool outputs embed node ids as "id=<value>".
            cited_node_ids.update(re.findall(r'\bid=([^\s,\)\]]+)', str(output)))
    return tool_calls, cited_node_ids


def run_qa(
    question: str,
    history: list[dict],
    nodes: list[dict],
    edges: list[dict],
) -> dict:
    """Run Agentic-RAG QA. Returns dict with answer, tool_calls, cited_nodes.

    Args:
        question: The current user question.
        history: Earlier turns; only the last 8 are replayed to the agent.
        nodes: KG node records used to build the in-memory graph.
        edges: KG edge records used to build the in-memory graph.

    Raises:
        ValueError: If DEEPSEEK_API_KEY is not configured.
    """
    if not DEEPSEEK_API_KEY:
        raise ValueError("DEEPSEEK_API_KEY not set in backend/.env")
    G = build_kg_graph(nodes, edges)
    tools = make_tools(G)
    llm = ChatOpenAI(
        model="deepseek-chat",
        api_key=DEEPSEEK_API_KEY,
        base_url=DEEPSEEK_BASE_URL,
        temperature=0,
    )
    system_prompt = (
        "You are a helpful assistant with access to a knowledge graph (KG) built from the user's documents.\n"
        "\n"
        "Guidelines:\n"
        "- If the question is clearly unrelated to the KG (greetings, math, general knowledge, etc.), "
        "answer directly WITHOUT using any tools.\n"
        "- If the question might be answered by the KG (topics related to entities in the documents), "
        "use the tools to search and explore before answering.\n"
        "- When you DO use the KG, cite the entity names and types you found.\n"
        "- If the KG has no relevant information, say so honestly and answer from general knowledge if possible.\n"
        "\n"
        "Available tools: search entities by name, get neighbors, list entities by type, get graph overview."
    )
    agent = create_react_agent(llm, tools, prompt=system_prompt)

    # History first, then the current question as the final human turn.
    messages = _history_to_messages(history)
    messages.append(HumanMessage(content=question))
    result = agent.invoke({"messages": messages})
    all_messages = result.get("messages", [])

    # The answer is the last AI message that carries text and no tool calls.
    answer = ""
    for msg in reversed(all_messages):
        if isinstance(msg, AIMessage) and msg.content and not msg.tool_calls:
            answer = msg.content
            break

    tool_calls, cited_node_ids = _collect_tool_calls(all_messages)
    return {
        "answer": answer,
        "tool_calls": tool_calls,
        # Sorted so the response is deterministic (set order is not).
        "cited_nodes": sorted(cited_node_ids),
    }

View File

@@ -0,0 +1,107 @@
"""
Text Assembler — MinerU content_list.json → per-page plain text.
Independent implementation for the GraphRAG Studio backend.
"""
from __future__ import annotations
import dataclasses
import json
from collections import defaultdict
from pathlib import Path
from bs4 import BeautifulSoup
@dataclasses.dataclass
class BlockSpan:
    """Position of one source block inside an assembled page text."""

    # Index of the block in the original content list.
    block_index: int
    # The block's "type" value.
    block_type: str
    # Page the block was grouped under.
    page_idx: int
    # Start offset of the block text within the page text.
    char_start: int
    # End offset (exclusive) of the block text within the page text.
    char_end: int
    # The block's "bbox" value as found in the content list.
    bbox: list
@dataclasses.dataclass
class PageText:
    """Plain text of one page together with the spans of its blocks."""

    # Page index shared by all blocks on this page.
    page_idx: int
    # Assembled page text.
    text: str
    # One span per block that contributed text to this page.
    block_spans: list[BlockSpan]
def html_table_to_text(table_body: str) -> str:
    """Flatten an HTML table into text: one line per <tr>, cells joined by " | "."""
    parsed = BeautifulSoup(table_body, "html.parser")
    return "\n".join(
        " | ".join(cell.get_text(strip=True) for cell in row.find_all(["td", "th"]))
        for row in parsed.find_all("tr")
    )
def load_content_list(path: Path) -> list[dict]:
    """Load a MinerU content list from a JSON file or from a directory.

    Args:
        path: Either the JSON file itself or a directory containing one.
            In a directory, ``*_content_list.json`` is preferred over the
            looser ``*content_list.json``.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If ``path`` is a directory with no matching file.
    """
    if path.is_dir():
        # glob() yields entries in arbitrary order; sort so the same file is
        # picked on every run when several candidates exist.
        matches = sorted(path.glob("*_content_list.json"))
        if not matches:
            matches = sorted(path.glob("*content_list.json"))
        if not matches:
            raise FileNotFoundError(f"No content_list.json found in {path}")
        path = matches[0]
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
def assemble_pages(content_list: list[dict]) -> list[PageText]:
    """Group blocks by page and join their text, recording each block's span.

    Only "text" and "table" blocks contribute; blocks with no text are skipped.
    Block texts on a page are separated by a single newline, and each
    BlockSpan covers ``text[char_start:char_end]`` of the resulting page text.

    Args:
        content_list: Block dicts; ``page_idx`` defaults to 0 when absent.

    Returns:
        One PageText per page, ordered by page index.
    """
    pages: dict[int, list[tuple[int, dict]]] = defaultdict(list)
    for i, block in enumerate(content_list):
        pages[block.get("page_idx", 0)].append((i, block))

    result = []
    for page_idx in sorted(pages):
        pieces: list[str] = []
        spans: list[BlockSpan] = []
        cursor = 0
        for block_index, block in pages[page_idx]:
            block_type = block.get("type", "unknown")
            if block_type == "text":
                # `or ""` guards against an explicit null text value.
                block_text = (block.get("text") or "").rstrip()
            elif block_type == "table":
                table_body = block.get("table_body") or ""
                # Trailing newlines (from empty rows) are dropped so the last
                # span can never extend past the end of the page text.
                block_text = html_table_to_text(table_body).rstrip("\n") if table_body else ""
            else:
                continue
            if not block_text:
                continue
            spans.append(BlockSpan(
                block_index=block_index,
                block_type=block_type,
                page_idx=page_idx,
                char_start=cursor,
                char_end=cursor + len(block_text),
                bbox=block.get("bbox", [0, 0, 0, 0]),
            ))
            pieces.append(block_text)
            # +1 accounts for the newline separator that follows each block.
            cursor += len(block_text) + 1
        result.append(PageText(page_idx=page_idx, text="\n".join(pieces), block_spans=spans))
    return result
def count_blocks_by_type(content_list: list[dict]) -> dict[str, int]:
    """Return how many blocks exist per "type" value ("unknown" when missing)."""
    tally: dict[str, int] = {}
    for entry in content_list:
        kind = entry.get("type", "unknown")
        tally[kind] = tally.get(kind, 0) + 1
    return tally

22
backend/pyproject.toml Normal file
View File

@@ -0,0 +1,22 @@
[project]
name = "graphrag-studio-backend"
version = "1.0.0"
description = "GraphRAG Studio — FastAPI backend service"
requires-python = ">=3.12"
dependencies = [
"fastapi>=0.104.0",
"uvicorn[standard]>=0.24.0",
"python-multipart>=0.0.6",
"langextract[all]>=0.1.0",
"langchain>=0.2.0",
"langchain-openai>=0.1.0",
"langgraph>=0.1.0",
"networkx>=3.0",
"python-dotenv>=1.0.0",
"requests>=2.31.0",
"beautifulsoup4>=4.12.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

View File

View File

@@ -0,0 +1,71 @@
"""Group A: document management (4 endpoints)."""
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse
from models.schemas import APIResponse
from services import document_service as svc
router = APIRouter(prefix="/documents", tags=["Documents"])
@router.post("/upload", status_code=200)
async def upload_document(
    file: UploadFile = File(...),
    language: str = Form("ch"),
    enable_formula: bool = Form(True),
    enable_table: bool = Form(True),
):
    """Validate and store an uploaded file, returning its document record."""
    payload = await file.read()
    valid, err_code, err_msg = svc.validate_upload(file.filename or "", len(payload))
    if not valid:
        rejection = APIResponse.err(err_code, err_msg).model_dump()
        return JSONResponse(status_code=400, content=rejection)

    record = svc.save_upload(
        file.filename or "upload", payload, language, enable_formula, enable_table
    )
    # The stored filename is an internal detail and is not returned to clients.
    record.pop("upload_filename", None)
    return APIResponse.ok(record)
@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return one document record, or a 404 envelope (code 2001) if unknown."""
    record = svc.get_document(doc_id)
    if record:
        # Hide the internal stored filename from the response.
        record.pop("upload_filename", None)
        return APIResponse.ok(record)
    not_found = APIResponse.err(2001, f"Document '{doc_id}' not found").model_dump()
    return JSONResponse(status_code=404, content=not_found)
@router.get("")
async def list_documents(
    page: int = 1,
    page_size: int = 20,
    status: str | None = None,
    format: str | None = None,
):
    """List documents with pagination and optional status/format filters.

    ``page`` is clamped to >= 1 and ``page_size`` to 1..100. Without the lower
    bounds a zero or negative value produced a negative slice start in the
    service and returned the wrong items.
    """
    page = max(1, page)
    page_size = max(1, min(page_size, 100))
    result = svc.list_documents(page, page_size, status, format)
    for item in result["items"]:
        # Hide the internal stored filename from the response.
        item.pop("upload_filename", None)
    return APIResponse.ok(result)
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete a document and report how many KG nodes/edges were removed.

    The service's own success flag decides the outcome. Previously it was
    ignored, so a delete that found nothing (e.g. the document vanished
    between the lookup and the delete) was still reported as ``deleted``.
    """
    ok, removed_nodes, removed_edges = svc.delete_document(doc_id)
    if not ok:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2001, f"Document '{doc_id}' not found").model_dump(),
        )
    return APIResponse.ok({
        "deleted": True,
        "doc_id": doc_id,
        "removed_nodes": removed_nodes,
        "removed_edges": removed_edges,
    })

View File

@@ -0,0 +1,70 @@
"""Group B: indexing pipeline (4 endpoints)."""
from fastapi import APIRouter
from fastapi.responses import JSONResponse
from models.schemas import APIResponse, StartIndexRequest
from services import document_service as doc_svc
from services import indexing_service as idx_svc
router = APIRouter(prefix="/index", tags=["Indexing"])
@router.post("/start", status_code=202)
async def start_indexing(body: StartIndexRequest):
    """Start an indexing job for a document and return the job descriptor.

    Returns a 404 envelope (code 2001) when the document does not exist,
    including the case where the service itself reports it missing.
    """
    doc = doc_svc.get_document(body.doc_id)
    # The service returns None when the document is gone by the time it looks
    # it up; without this guard that surfaced as a TypeError on subscript.
    meta = idx_svc.start_indexing(body.doc_id) if doc else None
    if not meta:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2001, f"Document '{body.doc_id}' not found").model_dump(),
        )
    return APIResponse.ok({
        "job_id": meta["job_id"],
        "doc_id": meta["doc_id"],
        "status": meta["status"],
        "stage": meta["stage"],
        "created_at": meta["created_at"],
    })
@router.get("/status/{job_id}")
async def get_job_status(job_id: str):
    """Return the stored metadata of a job, or a 404 envelope (code 2002)."""
    meta = idx_svc.get_job_status(job_id)
    if meta:
        return APIResponse.ok(meta)
    missing = APIResponse.err(2002, f"Job '{job_id}' not found").model_dump()
    return JSONResponse(status_code=404, content=missing)
@router.get("/result/{job_id}")
async def get_job_result(job_id: str):
    """Return a job's result; 404 if unknown, 400 if it has no result yet."""
    outcome = idx_svc.get_job_result(job_id)
    if not outcome:
        missing = APIResponse.err(2002, f"Job '{job_id}' not found").model_dump()
        return JSONResponse(status_code=404, content=missing)

    # A payload counts as a result when it is marked done or carries stats.
    has_result = outcome.get("status") == "done" or "stats" in outcome
    if not has_result:
        pending = APIResponse.err(
            2003, f"Job '{job_id}' is still running (status={outcome.get('status')})"
        ).model_dump()
        return JSONResponse(status_code=400, content=pending)
    return APIResponse.ok(outcome)
@router.delete("/jobs/{job_id}")
async def cancel_job(job_id: str):
    """Request cancellation of a job and report its previous status.

    ``cancelled`` now reflects the flag returned by the service instead of a
    hard-coded ``True``.
    """
    meta = idx_svc.get_job_status(job_id)
    if not meta:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2002, f"Job '{job_id}' not found").model_dump(),
        )
    ok, prev_status = idx_svc.cancel_job(job_id)
    return APIResponse.ok({
        "cancelled": bool(ok),
        "job_id": job_id,
        "previous_status": prev_status,
    })

72
backend/routers/kg.py Normal file
View File

@@ -0,0 +1,72 @@
"""Group C: knowledge graph (6 endpoints)."""
from fastapi import APIRouter
from fastapi.responses import JSONResponse
from models.schemas import APIResponse
from services import kg_service as svc
router = APIRouter(prefix="/kg", tags=["Knowledge Graph"])
@router.get("/nodes")
async def list_nodes(
    type: str | None = None,
    doc_id: str | None = None,
    confidence: str | None = None,
    page: int = 1,
    page_size: int = 50,
):
    """List KG nodes with optional filters and pagination.

    ``page`` is clamped to >= 1 and ``page_size`` to 1..200; zero or negative
    values previously produced negative slice bounds in the service. An
    unfiltered query on an empty graph returns a 400 envelope (code 3002).
    """
    page = max(1, page)
    page_size = max(1, min(page_size, 200))
    result = svc.get_nodes(page, page_size, type, doc_id, confidence)
    if result["total"] == 0 and not any([type, doc_id, confidence]):
        return JSONResponse(
            status_code=400,
            content=APIResponse.err(3002, "Knowledge graph is empty. Index documents first.").model_dump(),
        )
    return APIResponse.ok(result)
@router.get("/edges")
async def list_edges(
    doc_id: str | None = None,
    relation: str | None = None,
    page: int = 1,
    page_size: int = 100,
):
    """List KG edges with optional filters and pagination.

    ``page`` is clamped to >= 1 and ``page_size`` to 1..500 so that zero or
    negative values cannot produce negative slice bounds in the service.
    """
    page = max(1, page)
    page_size = max(1, min(page_size, 500))
    result = svc.get_edges(page, page_size, doc_id, relation)
    return APIResponse.ok(result)
@router.get("/nodes/{node_id}")
async def get_node_detail(node_id: str):
    """Return one node's detail record, or a 404 envelope (code 3001)."""
    detail = svc.get_node_detail(node_id)
    if detail:
        return APIResponse.ok(detail)
    missing = APIResponse.err(3001, f"Node '{node_id}' not found").model_dump()
    return JSONResponse(status_code=404, content=missing)
@router.get("/nodes/{node_id}/neighbors")
async def get_node_neighbors(node_id: str, hops: int = 1):
    """Return the neighbors of a node within ``hops``, or 404 (code 3001)."""
    neighborhood = svc.get_neighbors(node_id, hops)
    # Only None means "unknown node"; any other payload is returned as-is.
    if neighborhood is not None:
        return APIResponse.ok(neighborhood)
    missing = APIResponse.err(3001, f"Node '{node_id}' not found").model_dump()
    return JSONResponse(status_code=404, content=missing)
@router.get("/stats")
async def get_kg_stats():
    """Return the knowledge-graph statistics computed by the KG service."""
    return APIResponse.ok(svc.get_stats())
@router.get("/export")
async def export_kg(format: str = "json", doc_id: str | None = None):
    """Export the knowledge graph, optionally restricted to one document."""
    # NOTE(review): `format` is accepted but never used here — confirm whether
    # other export formats are meant to be supported.
    return APIResponse.ok(svc.export_kg(doc_id))

66
backend/routers/query.py Normal file
View File

@@ -0,0 +1,66 @@
"""Group D: QA (4 endpoints)."""
import asyncio
from functools import partial
from fastapi import APIRouter
from fastapi.responses import JSONResponse
from models.schemas import APIResponse, BatchQueryRequest, QueryRequest
from services import qa_service as svc
router = APIRouter(prefix="/query", tags=["QA"])
@router.post("")
async def run_query(body: QueryRequest):
    """Answer a question with the QA service, run in the default executor.

    Error mapping: a ValueError mentioning "KG_EMPTY" becomes a 400 envelope
    (code 3002); every other failure becomes a 500 envelope (code 4001).
    """
    try:
        # get_running_loop() is the supported way to obtain the loop inside a
        # coroutine; get_event_loop() is deprecated for this use.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            partial(svc.run_query, body.question, [m.model_dump() for m in body.history]),
        )
        return APIResponse.ok(result)
    except ValueError as e:
        if "KG_EMPTY" in str(e):
            return JSONResponse(
                status_code=400,
                content=APIResponse.err(3002, "Knowledge graph is empty. Index documents first.").model_dump(),
            )
        return JSONResponse(
            status_code=500,
            content=APIResponse.err(4001, str(e)).model_dump(),
        )
    except Exception as e:
        # Top-level boundary: report the failure in the API envelope.
        return JSONResponse(
            status_code=500,
            content=APIResponse.err(4001, f"QA service error: {e}").model_dump(),
        )
@router.post("/batch", status_code=202)
async def start_batch(body: BatchQueryRequest):
    """Start a batch of questions; more than 20 yields a 400 (code 1001)."""
    if len(body.questions) <= 20:
        return APIResponse.ok(svc.start_batch(body.questions))
    too_many = APIResponse.err(1001, "Maximum 20 questions per batch").model_dump()
    return JSONResponse(status_code=400, content=too_many)
@router.get("/batch/{batch_id}")
async def get_batch_result(batch_id: str):
    """Return a batch's result, or a 404 envelope (code 2002) if unknown."""
    outcome = svc.get_batch_result(batch_id)
    if outcome:
        return APIResponse.ok(outcome)
    missing = APIResponse.err(2002, f"Batch '{batch_id}' not found").model_dump()
    return JSONResponse(status_code=404, content=missing)
@router.get("/history")
async def get_query_history(page: int = 1, page_size: int = 20):
    """Return paginated query history.

    ``page`` is clamped to >= 1 and ``page_size`` to 1..50; previously only
    the upper bound was enforced.
    """
    page = max(1, page)
    page_size = max(1, min(page_size, 50))
    result = svc.get_history(page, page_size)
    return APIResponse.ok(result)

43
backend/routers/search.py Normal file
View File

@@ -0,0 +1,43 @@
"""Group E: search (3 endpoints)."""
from fastapi import APIRouter, Query, Request
from fastapi.responses import JSONResponse
from models.schemas import APIResponse
from services import search_service as svc
router = APIRouter(prefix="/search", tags=["Search"])
@router.get("/entities")
async def search_entities(q: str, type: str | None = None, limit: int = 15):
    """Search entities matching ``q``, optionally restricted to one type.

    ``limit`` is clamped to 1..100; previously a zero or negative value was
    passed straight through to the service.
    """
    limit = max(1, min(limit, 100))
    result = svc.search_entities(q, type, limit)
    return APIResponse.ok(result)
@router.get("/path")
async def search_path(request: Request, max_hops: int = 3):
    """Find a path between two nodes given by the ``from`` and ``to`` params.

    Returns 400 (code 1001) when either parameter is missing and 404
    (code 3001) when the service returns None. ``max_hops`` is clamped to 1..5.
    """
    # `from` is a Python keyword, so both ids are read from the raw query string.
    query = request.query_params
    from_id, to_id = query.get("from"), query.get("to")
    if not (from_id and to_id):
        bad_request = APIResponse.err(1001, "Parameters 'from' and 'to' are required").model_dump()
        return JSONResponse(status_code=400, content=bad_request)

    bounded_hops = max(1, min(max_hops, 5))
    found = svc.search_path(from_id, to_id, bounded_hops)
    if found is None:
        missing = APIResponse.err(3001, "One or both nodes not found").model_dump()
        return JSONResponse(status_code=404, content=missing)
    return APIResponse.ok(found)
@router.get("/graph")
async def search_graph(q: str, include_neighbors: bool = False):
    """Run a graph search for ``q``, optionally including neighbors."""
    return APIResponse.ok(svc.search_graph(q, include_neighbors))

171
backend/routers/system.py Normal file
View File

@@ -0,0 +1,171 @@
"""Group F: system (4 endpoints)."""
import os
import time
from pathlib import Path
from fastapi import APIRouter
from models.schemas import APIResponse
from storage import file_store as fs
router = APIRouter(tags=["System"])
_START_TIME = time.time()
@router.get("/health")
async def health_check():
    """Report the health of the MinerU venv, langextract, DeepSeek config and storage.

    The overall status is "healthy" only when every component reports "ok",
    otherwise "degraded".
    """
    import asyncio
    import subprocess

    from dotenv import load_dotenv

    env_path = Path(__file__).parent.parent / ".env"
    load_dotenv(env_path, override=False)
    mineru_python = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe"))
    backend_python = Path(__file__).parent.parent / ".venv" / "Scripts" / "python.exe"
    deepseek_key = os.getenv("DEEPSEEK_API_KEY", "")
    deepseek_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")

    def _probe_langextract() -> bool:
        """Check whether langextract is importable from the backend's venv."""
        try:
            result = subprocess.run(
                [str(backend_python), "-c", "import langextract; print('ok')"],
                capture_output=True, text=True, timeout=10
            )
        except (OSError, subprocess.SubprocessError):
            # Missing interpreter, launch failure or timeout: not available.
            return False
        return result.returncode == 0 and "ok" in result.stdout

    # The probe can block for up to 10 s; run it in a worker thread so the
    # event loop keeps serving other requests meanwhile.
    langextract_ok = await asyncio.to_thread(_probe_langextract)

    mineru_exists = mineru_python.exists()
    components = {
        "mineru_venv": {
            "status": "ok" if mineru_exists else "error",
            "path": str(mineru_python),
            "exists": mineru_exists,
        },
        "langextract_venv": {
            "status": "ok" if langextract_ok else "error",
            "path": str(backend_python),
            "exists": backend_python.exists(),
        },
        "deepseek_api": {
            "status": "ok" if deepseek_key else "error",
            "base_url": deepseek_url,
            "key_configured": bool(deepseek_key),
        },
        "storage": {
            "status": "ok",
            "kg_nodes_exists": fs.kg_nodes_path().exists(),
            "kg_edges_exists": fs.kg_edges_path().exists(),
            "uploads_dir_exists": fs.UPLOADS_DIR.exists(),
        },
    }
    overall = "healthy" if all(c["status"] == "ok" for c in components.values()) else "degraded"
    return APIResponse.ok({
        "status": overall,
        "version": "1.0.0",
        "uptime_seconds": round(time.time() - _START_TIME, 1),
        "components": components,
    })
@router.get("/system/stats")
async def system_stats():
    """Return counts of documents, KG nodes/edges, queries, jobs and storage use."""
    from services import indexing_service as idx_svc

    documents = list(fs.load_docs_index().values())
    kg_nodes = fs.load_kg_nodes()
    kg_edges = fs.load_kg_edges()
    queries = fs.load_query_history()

    # Nodes per entity type; nodes without a type count as UNKNOWN.
    type_dist: dict[str, int] = {}
    for node in kg_nodes:
        label = node.get("type", "UNKNOWN")
        type_dist[label] = type_dist.get(label, 0) + 1

    statuses = [d.get("status") for d in documents]
    return APIResponse.ok({
        "total_documents": len(documents),
        "indexed_documents": statuses.count("indexed"),
        "failed_documents": statuses.count("failed"),
        "total_nodes": len(kg_nodes),
        "total_edges": len(kg_edges),
        "type_distribution": type_dist,
        "total_queries": len(queries),
        "active_jobs": idx_svc.count_active_jobs(),
        "storage_used_mb": fs.storage_used_mb(),
    })
@router.get("/system/formats")
async def list_formats():
    """Return the static catalogue of upload formats, OCR languages and usage notes."""
    # The payload is a fixed literal; nothing here is computed at request time.
    return APIResponse.ok({
        "formats": [
            {"ext": "pdf", "description": "PDF 文档(文本型/扫描型/混合型)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
            {"ext": "docx", "description": "Microsoft Word新版", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
            {"ext": "doc", "description": "Microsoft Word旧版", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
            {"ext": "pptx", "description": "PowerPoint新版", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
            {"ext": "ppt", "description": "PowerPoint旧版", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
            {"ext": "png", "description": "PNG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True},
            {"ext": "jpg", "description": "JPEG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True},
            {"ext": "jpeg", "description": "JPEG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True},
            {"ext": "html", "description": "HTML 文件", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False},
        ],
        "ocr_languages": [
            {"code": "ch", "name": "中文(默认)"},
            {"code": "en", "name": "英文"},
            {"code": "japan", "name": "日文"},
            {"code": "korean", "name": "韩文"},
            {"code": "french", "name": "法文"},
            {"code": "german", "name": "德文"},
        ],
        "notes": [
            "language 参数默认值为 'ch'(非 'zh'),遵循 PaddleOCR v3 语言代码规范",
            "上传时不需要携带 Content-Type服务端自动识别",
            "PNG/JPG/JPEG 单次最多处理 1 页",
        ],
    })
@router.get("/system/demo")
async def get_demo_data():
    """Return KG nodes/edges plus summary stats for the demo view.

    Uses the backend KG when it has nodes; otherwise falls back to the legacy
    graphrag_pipeline output files, and returns a 400 envelope (code 3002)
    when neither source is available.
    """
    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()
    if not nodes:
        # Fallback: output of the older graphrag_pipeline run.
        legacy_nodes_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_nodes.json")
        legacy_edges_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_edges.json")
        if not legacy_nodes_path.exists():
            from fastapi.responses import JSONResponse
            return JSONResponse(
                status_code=400,
                content=APIResponse.err(3002, "No demo data available. Index a document first.").model_dump(),
            )
        import json
        nodes = json.loads(legacy_nodes_path.read_text(encoding="utf-8"))
        if legacy_edges_path.exists():
            edges = json.loads(legacy_edges_path.read_text(encoding="utf-8"))
        else:
            edges = []

    type_counts: dict[str, int] = {}
    for node in nodes:
        label = node.get("type", "UNKNOWN")
        type_counts[label] = type_counts.get(label, 0) + 1

    # A bare graph (ids only) is enough to compute the density.
    import networkx as nx
    G = nx.Graph()
    for node in nodes:
        G.add_node(node["id"])
    for edge in edges:
        G.add_edge(edge["source"], edge["target"])
    density = round(nx.density(G), 4) if G.number_of_nodes() > 1 else 0.0

    return APIResponse.ok({
        "nodes": nodes,
        "edges": edges,
        "stats": {
            "nodes": len(nodes),
            "edges": len(edges),
            "type_counts": type_counts,
            "density": density,
        },
    })

View File

View File

@@ -0,0 +1,109 @@
"""Document Service — file upload, metadata CRUD."""
from __future__ import annotations
import uuid
from datetime import datetime, timezone
from pathlib import Path
from storage import file_store as fs
# File extensions (lower-case, without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {"pdf", "docx", "doc", "pptx", "ppt", "png", "jpg", "jpeg", "html"}
# Upper bound for a single upload, in mebibytes.
MAX_FILE_SIZE_MB = 200


def validate_upload(filename: str, size_bytes: int) -> tuple[bool, int, str]:
    """Check an upload's name, extension and size.

    Returns:
        ``(ok, error_code, error_msg)``: ``(True, 0, "")`` on success, else
        code 1001 (bad filename), 1002 (unsupported extension) or 1003
        (file too large) with a human-readable message.
    """
    # Reject empty names and anything containing a path separator.
    if not filename or any(sep in filename for sep in ("/", "\\")):
        return False, 1001, "Invalid filename"

    ext = Path(filename).suffix.lower().lstrip(".")
    if ext not in ALLOWED_EXTENSIONS:
        supported = ', '.join(sorted(ALLOWED_EXTENSIONS))
        return False, 1002, f"Unsupported file format: .{ext}. Supported: {supported}"

    size_mb = size_bytes / (1024 * 1024)
    if size_mb > MAX_FILE_SIZE_MB:
        return False, 1003, f"File size {size_mb:.1f}MB exceeds {MAX_FILE_SIZE_MB}MB limit"
    return True, 0, ""
def save_upload(filename: str, content: bytes, language: str = "ch",
                enable_formula: bool = True, enable_table: bool = True) -> dict:
    """Write the upload to disk, register its metadata and return the record.

    The file is stored under the uploads directory as ``<doc_id>_<filename>``.
    """
    doc_id = uuid.uuid4().hex[:8]
    stored_name = f"{doc_id}_{filename}"
    (fs.UPLOADS_DIR / stored_name).write_bytes(content)

    record = {
        "doc_id": doc_id,
        "filename": filename,
        "format": Path(filename).suffix.lower().lstrip("."),
        "size_bytes": len(content),
        "pages": None,
        "uploaded_at": datetime.now(timezone.utc).isoformat(),
        "status": "uploaded",
        "language": language,
        "enable_formula": enable_formula,
        "enable_table": enable_table,
        # Internal: the actual filename on disk.
        "upload_filename": stored_name,
    }
    fs.save_doc(record)
    return record
def get_document(doc_id: str) -> dict | None:
    """Look up a document record by id via the file store."""
    record = fs.get_doc(doc_id)
    return record
def list_documents(page: int = 1, page_size: int = 20,
                   status: str | None = None, fmt: str | None = None) -> dict:
    """Return one page of document records, newest upload first.

    Args:
        page: 1-based page number.
        page_size: Number of records per page.
        status: Keep only documents with this status, when given.
        fmt: Keep only documents with this format (case-insensitive), when given.
    """
    records = sorted(
        fs.load_docs_index().values(),
        key=lambda d: d.get("uploaded_at", ""),
        reverse=True,
    )
    if status:
        records = [d for d in records if d.get("status") == status]
    if fmt:
        wanted_format = fmt.lower()
        records = [d for d in records if d.get("format") == wanted_format]

    offset = (page - 1) * page_size
    return {
        "total": len(records),
        "page": page,
        "page_size": page_size,
        "items": records[offset: offset + page_size],
    }
def delete_document(doc_id: str) -> tuple[bool, int, int]:
    """Delete doc and its KG contributions. Returns (ok, removed_nodes, removed_edges).

    Removes the document's KG nodes/edges, its uploaded file, its jobs and its
    index entry. Returns ``(False, 0, 0)`` when the document is unknown.
    """
    doc = fs.get_doc(doc_id)
    if not doc:
        return False, 0, 0
    # Remove from KG
    removed_nodes, removed_edges = fs.remove_doc_from_kg(doc_id)
    # Remove upload file. An empty stored name would resolve to the uploads
    # directory itself, and unlink() on a directory raises — so only proceed
    # with a real name that points at a regular file.
    upload_filename = doc.get("upload_filename", "")
    if upload_filename:
        upload_path = fs.UPLOADS_DIR / upload_filename
        if upload_path.is_file():
            upload_path.unlink(missing_ok=True)
    # Remove associated jobs
    for meta in fs.list_all_jobs():
        if meta.get("doc_id") == doc_id:
            fs.delete_job(meta["job_id"])
    # Remove from index
    index = fs.load_docs_index()
    index.pop(doc_id, None)
    fs.save_docs_index(index)
    return True, removed_nodes, removed_edges
def update_doc_status(doc_id: str, status: str, pages: int | None = None) -> None:
    """Set a document's status (and page count, when given) in the index.

    Does nothing when the document is not in the index.
    """
    index = fs.load_docs_index()
    entry = index.get(doc_id) if doc_id in index else None
    if entry is None:
        return
    entry["status"] = status
    if pages is not None:
        entry["pages"] = pages
    fs.save_docs_index(index)

View File

@@ -0,0 +1,255 @@
"""Indexing Service — Pipeline orchestration (parsing → extracting → indexing)."""
from __future__ import annotations
import json
import os
import subprocess
import threading
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from dotenv import load_dotenv
from storage import file_store as fs
from services.document_service import update_doc_status
# Load backend/.env at import time; override=True lets its values replace
# variables already present in the process environment.
load_dotenv(Path(__file__).parent.parent / ".env", override=True)
# Interpreter and script used to run MinerU as a subprocess; both can be
# overridden through the environment.
MINERU_PYTHON = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe"))
MINERU_PIPELINE = Path(os.getenv("MINERU_PIPELINE", "F:/GraphRAGAgent/mineru_mvp/pipeline.py"))
# In-memory registry of active jobs {job_id: threading.Thread}
_active_threads: dict[str, threading.Thread] = {}
# Cancellation requests {job_id: bool}, polled by the pipeline thread.
_cancel_flags: dict[str, bool] = {}
def start_indexing(doc_id: str) -> dict | None:
    """Create a job for ``doc_id`` and run the pipeline in a daemon thread.

    Returns:
        The initial job metadata, or None when the document does not exist.
        (The annotation previously claimed ``dict`` and hid the None return
        behind a ``type: ignore``.)
    """
    doc = fs.get_doc(doc_id)
    if not doc:
        return None
    job_id = f"job_{uuid.uuid4().hex[:8]}"
    now = datetime.now(timezone.utc).isoformat()
    meta = {
        "job_id": job_id,
        "doc_id": doc_id,
        "status": "submitted",
        "stage": "Job submitted",
        "progress": {"parsed_pages": 0, "total_pages": 0, "extracted_entities": 0},
        "created_at": now,
        "elapsed_seconds": 0.0,
        "error": None,
        "pdf_name": doc["filename"],
        "pdf_path": str(fs.UPLOADS_DIR / doc.get("upload_filename", "")),
    }
    # Persist the meta and the cancel flag before the worker starts; the
    # worker reads both.
    fs.save_job_meta(job_id, meta)
    _cancel_flags[job_id] = False
    thread = threading.Thread(target=_run_pipeline, args=(job_id,), daemon=True)
    _active_threads[job_id] = thread
    thread.start()
    return meta
def _update_meta(job_id: str, **kwargs) -> None:
    """Merge ``kwargs`` into the job's stored metadata and refresh elapsed time.

    Does nothing when the job has no stored metadata. Previously that case
    raised KeyError on ``created_at`` — including from inside the pipeline's
    own error handler — so skipping keeps the outcome (nothing written)
    without the crash.
    """
    meta = fs.load_job_meta(job_id)
    if not meta:
        return
    meta.update(kwargs)
    created_at = meta.get("created_at")
    if created_at:
        elapsed = datetime.now(timezone.utc) - datetime.fromisoformat(created_at)
        meta["elapsed_seconds"] = round(elapsed.total_seconds(), 1)
    fs.save_job_meta(job_id, meta)
def _run_pipeline(job_id: str) -> None:
    """Run parsing, entity extraction and KG building for one job.

    Intended as a thread target: progress and failures are written to the job
    metadata instead of being raised. The cancel flag is polled before every
    stage and before every page extraction.
    """
    meta = fs.load_job_meta(job_id)
    if not meta:
        return
    doc_id = meta["doc_id"]
    pdf_path = Path(meta["pdf_path"])
    job_dir = fs.job_dir(job_id)
    start_time = time.time()

    def _cancelled() -> bool:
        """Mark the job cancelled and return True when cancellation was requested."""
        if not _cancel_flags.get(job_id):
            return False
        _update_meta(job_id, status="cancelled", stage="Cancelled")
        return True

    # NOTE(review): a cancelled job leaves the document status untouched
    # (possibly "indexing") — confirm whether it should be reset.
    try:
        # ── Stage 1: parsing ──────────────────────────────────────────────
        if _cancelled():
            return
        _update_meta(job_id, status="parsing", stage="MinerU document parsing...")
        mineru_out_dir = job_dir / "mineru_output"
        mineru_out_dir.mkdir(parents=True, exist_ok=True)
        result = subprocess.run(
            [str(MINERU_PYTHON), str(MINERU_PIPELINE), str(pdf_path)],
            cwd=str(MINERU_PIPELINE.parent),
            capture_output=True,
            text=True,
            timeout=600,
        )
        if result.returncode != 0:
            raise RuntimeError(f"MinerU failed: {result.stderr[:500]}")

        # Find content_list.json in MinerU output
        # MinerU writes output to mineru_mvp/output/{stem}/
        stem = pdf_path.stem
        mineru_default_out = MINERU_PIPELINE.parent / "output" / stem
        content_list_path = None
        if mineru_default_out.exists():
            matches = list(mineru_default_out.glob("*_content_list.json"))
            if matches:
                content_list_path = matches[0]
                # Copy to our job dir
                import shutil
                shutil.copytree(str(mineru_default_out), str(mineru_out_dir), dirs_exist_ok=True)
        if not content_list_path:
            # Fallback: search job mineru_output dir
            matches = list(mineru_out_dir.glob("*_content_list.json"))
            if matches:
                content_list_path = matches[0]
        if not content_list_path or not content_list_path.exists():
            raise RuntimeError(f"MinerU output content_list.json not found. stdout: {result.stdout[:300]}")

        # ── Stage 2: extracting ───────────────────────────────────────────
        if _cancelled():
            return
        from pipeline.text_assembler import load_content_list, assemble_pages, count_blocks_by_type
        from pipeline.entity_extractor import create_model, extract_entities
        from pipeline.kg_builder import build_kg, extractions_to_records

        content_list = load_content_list(content_list_path)
        pages = assemble_pages(content_list)
        total_pages = len(pages)
        block_types = count_blocks_by_type(content_list)
        _update_meta(
            job_id,
            status="extracting",
            stage="Extracting entities (LangExtract + DeepSeek)...",
            progress={"parsed_pages": total_pages, "total_pages": total_pages, "extracted_entities": 0},
        )
        update_doc_status(doc_id, "indexing", pages=total_pages)

        model = create_model()
        annotated_docs = []
        total_entities = 0
        for i, page in enumerate(pages):
            if _cancelled():
                return
            _update_meta(
                job_id,
                stage=f"Extracting entities page {i+1}/{total_pages} (LangExtract + DeepSeek)...",
                progress={"parsed_pages": total_pages, "total_pages": total_pages,
                          "extracted_entities": total_entities},
            )
            ann_doc = extract_entities(page.text, model)
            annotated_docs.append(ann_doc)
            total_entities += len(ann_doc.extractions) if ann_doc.extractions else 0

        # Save raw extractions
        records = extractions_to_records(pages, annotated_docs, doc_id)
        fs.write_json(job_dir / "extractions.json", records)

        # ── Stage 3: indexing ─────────────────────────────────────────────
        # A cancellation requested during the last page's extraction used to
        # be ignored: the job went on to merge into the global KG and was
        # marked "done". Check once more before touching the shared KG.
        if _cancelled():
            return
        _update_meta(job_id, status="indexing", stage="Building knowledge graph...")
        nodes, edges = build_kg(pages, annotated_docs, doc_id)
        fs.write_json(job_dir / "kg_nodes.json", nodes)
        fs.write_json(job_dir / "kg_edges.json", edges)
        # Merge into global KG
        fs.merge_kg(nodes, edges, doc_id)

        # Count alignment types
        alignment_counts: dict[str, int] = {}
        type_counts: dict[str, int] = {}
        for r in records:
            al = r.get("alignment") or "null"
            alignment_counts[al] = alignment_counts.get(al, 0) + 1
            t = r.get("type", "UNKNOWN")
            type_counts[t] = type_counts.get(t, 0) + 1

        elapsed = round(time.time() - start_time, 1)
        stats = {
            "blocks": len(content_list),
            "block_types": block_types,
            "pages": total_pages,
            "raw_extractions": len(records),
            "nodes": len(nodes),
            "edges": len(edges),
            "type_counts": type_counts,
            "alignment_counts": alignment_counts,
            "elapsed_seconds": elapsed,
        }
        fs.write_json(job_dir / "stats.json", stats)
        _update_meta(
            job_id,
            status="done",
            stage="Complete",
            progress={"parsed_pages": total_pages, "total_pages": total_pages,
                      "extracted_entities": len(records)},
        )
        update_doc_status(doc_id, "indexed", pages=total_pages)
    except Exception as exc:
        # Thread boundary: record the failure on the job and the document.
        _update_meta(job_id, status="failed", stage=f"Error: {exc}", error=str(exc))
        update_doc_status(doc_id, "failed")
    finally:
        # Drop the in-memory bookkeeping whatever the outcome.
        _active_threads.pop(job_id, None)
        _cancel_flags.pop(job_id, None)
def get_job_status(job_id: str) -> dict | None:
    """Return the stored metadata of a job as loaded by the file store."""
    meta = fs.load_job_meta(job_id)
    return meta
def get_job_result(job_id: str) -> dict | None:
    """Return the full result of a finished job.

    Returns None for an unknown job, the raw metadata while the job is not
    "done", and otherwise the stats, extractions, nodes and edges read from
    the job directory (empty defaults when a file yields nothing).
    """
    meta = fs.load_job_meta(job_id)
    if not meta:
        return None
    if meta["status"] != "done":
        return meta

    artifacts = fs.job_dir(job_id)

    def _read(name: str, fallback):
        """Read one JSON artifact of the job, substituting ``fallback`` when falsy."""
        return fs.read_json(artifacts / name) or fallback

    return {
        "job_id": meta["job_id"],
        "doc_id": meta["doc_id"],
        "status": "done",
        "stats": _read("stats.json", {}),
        "extractions": _read("extractions.json", []),
        "nodes": _read("kg_nodes.json", []),
        "edges": _read("kg_edges.json", []),
    }
def cancel_job(job_id: str) -> tuple[bool, str]:
    """Request cancellation of a job.

    Returns:
        ``(False, "not_found")`` for an unknown job, ``(False, status)`` for a
        job that already finished (done / failed / cancelled), otherwise
        ``(True, previous_status)`` after flagging the job as cancelled.
    """
    meta = fs.load_job_meta(job_id)
    if not meta:
        return False, "not_found"
    prev_status = meta["status"]
    # A finished job must keep its final status: overwriting "done" with
    # "cancelled" made its stored result unreachable through get_job_result.
    if prev_status in ("done", "failed", "cancelled"):
        return False, prev_status
    _cancel_flags[job_id] = True
    _update_meta(job_id, status="cancelled", stage="Cancelled by user")
    return True, prev_status
def count_active_jobs() -> int:
    """Return the number of pipeline threads that are still alive."""
    # Worker threads remove themselves from the registry when they finish, so
    # iterate over a snapshot to avoid "dictionary changed size during iteration".
    return sum(1 for t in list(_active_threads.values()) if t.is_alive())

View File

@@ -0,0 +1,167 @@
"""KG Service — NetworkX graph operations over the global KG."""
from __future__ import annotations
import networkx as nx
from storage import file_store as fs
def _load_graph() -> nx.Graph:
    """Build an undirected graph from the stored KG nodes and edges.

    Every node keeps all of its record fields as attributes; every edge keeps
    ``relation`` (default "CO_OCCURS_IN"), ``doc_id`` and ``page``.
    """
    node_records = fs.load_kg_nodes()
    edge_records = fs.load_kg_edges()
    graph = nx.Graph()
    for record in node_records:
        graph.add_node(record["id"], **record)
    for record in edge_records:
        attrs = {
            "relation": record.get("relation", "CO_OCCURS_IN"),
            "doc_id": record.get("doc_id", ""),
            "page": record.get("page", 0),
        }
        graph.add_edge(record["source"], record["target"], **attrs)
    return graph
def get_nodes(page: int = 1, page_size: int = 50,
              node_type: str | None = None,
              doc_id: str | None = None,
              confidence: str | None = None) -> dict:
    """Return one page of KG nodes, each annotated with its degree.

    Args:
        page: 1-based page number.
        page_size: Number of nodes per page.
        node_type: Keep only nodes of this type (case-insensitive), when given.
        doc_id: Keep only nodes whose ``source_doc`` equals this id, when given.
        confidence: Keep only nodes with this confidence value, when given.
    """
    records = fs.load_kg_nodes()
    degree_of = dict(_load_graph().degree())
    for record in records:
        record["degree"] = degree_of.get(record["id"], 0)

    def _keep(record: dict) -> bool:
        """Apply every filter that was actually supplied."""
        if node_type and record.get("type", "").upper() != node_type.upper():
            return False
        if doc_id and record.get("source_doc") != doc_id:
            return False
        if confidence and record.get("confidence") != confidence:
            return False
        return True

    selected = [record for record in records if _keep(record)]
    offset = (page - 1) * page_size
    return {"total": len(selected), "page": page, "page_size": page_size,
            "items": selected[offset: offset + page_size]}
def get_edges(page: int = 1, page_size: int = 100,
              doc_id: str | None = None,
              relation: str | None = None) -> dict:
    """Return one page of KG edges, optionally filtered by document and relation."""
    selected = [
        edge for edge in fs.load_kg_edges()
        if (not doc_id or edge.get("doc_id") == doc_id)
        and (not relation or edge.get("relation") == relation)
    ]
    offset = (page - 1) * page_size
    return {"total": len(selected), "page": page, "page_size": page_size,
            "items": selected[offset: offset + page_size]}
def get_node_detail(node_id: str) -> dict | None:
    """Return the stored node dict enriched with degree statistics.

    Adds ``degree``, ``degree_centrality`` (rounded to 4 places) and
    ``neighbor_count`` to the node. Returns ``None`` when no node has this id.
    """
    node = None
    for candidate in fs.load_kg_nodes():
        if candidate["id"] == node_id:
            node = candidate
            break
    if not node:
        return None

    graph = _load_graph()
    if node_id in graph:
        degree = graph.degree(node_id)
        score = nx.degree_centrality(graph).get(node_id, 0.0)
        node.update(degree=degree,
                    degree_centrality=round(score, 4),
                    neighbor_count=degree)
    else:
        node.update(degree=0, degree_centrality=0.0, neighbor_count=0)
    return node
def get_neighbors(node_id: str, hops: int = 1) -> dict | None:
    """Group the nodes reachable from *node_id* by hop distance.

    ``hops`` is clamped to the range 1..3 before the traversal. Returns
    ``None`` when the node is unknown; otherwise a dict with the center node
    summary, the hop limit, ``neighbors_by_hop`` (keyed by the distance as a
    string) and the total neighbor count.
    """
    node = next(filter(lambda item: item["id"] == node_id, fs.load_kg_nodes()), None)
    if not node:
        return None

    graph = _load_graph()
    center = {
        "id": node_id,
        "name": node["name"],
        "type": node["type"],
        "page": node.get("page", 0),
    }
    if node_id not in graph:
        # This branch reports the hop count exactly as it was passed in.
        return {"center": center, "hops": hops,
                "neighbors_by_hop": {}, "total_neighbors": 0}

    hops = max(1, min(hops, 3))
    distances = nx.single_source_shortest_path_length(graph, node_id, cutoff=hops)
    grouped: dict[str, list] = {}
    found = 0
    for other_id, distance in distances.items():
        if distance == 0:
            # Distance 0 is the center node itself.
            continue
        attrs = graph.nodes[other_id]
        grouped.setdefault(str(distance), []).append({
            "id": other_id,
            "name": attrs.get("name", ""),
            "type": attrs.get("type", ""),
            "page": attrs.get("page", 0),
        })
        found += 1
    return {
        "center": center,
        "hops": hops,
        "neighbors_by_hop": grouped,
        "total_neighbors": found,
    }
def get_stats() -> dict:
    """Summarise the global KG.

    Reports node and edge counts, graph density, per-type and per-relation
    tallies, the five nodes with the highest degree centrality, and the
    distinct source documents.
    """
    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()
    graph = _load_graph()

    def _tally(labels) -> dict[str, int]:
        counts: dict[str, int] = {}
        for label in labels:
            counts[label] = counts.get(label, 0) + 1
        return counts

    type_dist = _tally(n.get("type", "UNKNOWN") for n in nodes)
    relation_types = _tally(e.get("relation", "CO_OCCURS_IN") for e in edges)

    node_total = graph.number_of_nodes()
    density = round(nx.density(graph), 4) if node_total > 1 else 0.0

    top5: list[dict] = []
    if node_total > 0:
        ranked = sorted(nx.degree_centrality(graph).items(),
                        key=lambda pair: pair[1], reverse=True)
        for nid, score in ranked[:5]:
            attrs = graph.nodes[nid]
            top5.append({
                "node_id": nid,
                "name": attrs.get("name", ""),
                "type": attrs.get("type", ""),
                "centrality": round(score, 4),
            })

    source_docs = list({n.get("source_doc", "") for n in nodes if n.get("source_doc")})
    return {
        "total_nodes": len(nodes),
        "total_edges": len(edges),
        "density": density,
        "type_distribution": type_dist,
        "relation_types": relation_types,
        "top5_central_nodes": top5,
        "source_documents": source_docs,
    }
def export_kg(doc_id: str | None = None) -> dict:
    """Export the KG (or one document's share of it) as a JSON-ready dict.

    Every node is annotated with its degree in the full graph before the
    optional ``doc_id`` filter is applied to nodes and edges.
    """
    from datetime import datetime, timezone

    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()
    degree_of = dict(_load_graph().degree())
    for node in nodes:
        node["degree"] = degree_of.get(node["id"], 0)

    if doc_id:
        nodes = [node for node in nodes if node.get("source_doc") == doc_id]
        edges = [edge for edge in edges if edge.get("doc_id") == doc_id]

    payload = {
        "format": "json",
        "doc_id": doc_id,
        "total_nodes": len(nodes),
        "total_edges": len(edges),
        "exported_at": datetime.now(timezone.utc).isoformat(),
        "nodes": nodes,
        "edges": edges,
    }
    return payload

View File

@@ -0,0 +1,85 @@
"""QA Service — Agentic-RAG wrapper."""
from __future__ import annotations
import time
import uuid
from datetime import datetime, timezone
from storage import file_store as fs
def run_query(question: str, history: list[dict]) -> dict:
    """Answer *question* against the global KG and log the exchange.

    Raises:
        ValueError: with message ``"KG_EMPTY"`` when the KG has no nodes.

    Returns:
        The record that was appended to the query history.
    """
    from pipeline.qa_agent import run_qa

    kg_nodes = fs.load_kg_nodes()
    kg_edges = fs.load_kg_edges()
    if not kg_nodes:
        raise ValueError("KG_EMPTY")

    started_at = time.time()
    outcome = run_qa(question, history, kg_nodes, kg_edges)
    took = round(time.time() - started_at, 2)

    record = {
        "id": f"q_{uuid.uuid4().hex[:10]}",
        "question": question,
        "answer": outcome["answer"],
        "tool_calls": outcome["tool_calls"],
        "cited_nodes": outcome["cited_nodes"],
        "duration_seconds": took,
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    fs.append_query_history(record)
    return record
def get_history(page: int = 1, page_size: int = 20) -> dict:
    """Return one page of the stored query history, in the order the store yields it."""
    records = fs.load_query_history()
    offset = (page - 1) * page_size
    window = records[offset: offset + page_size]
    return {"total": len(records), "page": page, "page_size": page_size, "items": window}
def start_batch(questions: list[str]) -> dict:
    """Start a background thread that answers *questions* one after another.

    The batch metadata is saved once on submission and once more when every
    question has been processed. Returns a submission receipt immediately.
    """
    import threading

    batch_id = f"batch_{uuid.uuid4().hex[:10]}"
    created = datetime.now(timezone.utc).isoformat()
    state = {
        "batch_id": batch_id,
        "total": len(questions),
        "completed": 0,
        "failed": 0,
        "status": "submitted",
        "created_at": created,
        "results": [],
    }
    fs.save_batch_meta(batch_id, state)

    def _worker() -> None:
        for question in questions:
            try:
                answer = run_query(question, [])
            except Exception as exc:
                # A failing question is recorded and the batch carries on.
                state["failed"] += 1
                state["results"].append({"question": question, "error": str(exc)})
            else:
                state["results"].append(answer)
                state["completed"] += 1
        state["status"] = "done"
        fs.save_batch_meta(batch_id, state)

    threading.Thread(target=_worker, daemon=True).start()
    return {
        "batch_id": batch_id,
        "total": len(questions),
        "status": "submitted",
        "created_at": created,
    }
def get_batch_result(batch_id: str) -> dict | None:
    """Return whatever the file store holds for *batch_id*."""
    stored = fs.load_batch_meta(batch_id)
    return stored

View File

@@ -0,0 +1,106 @@
"""Search Service — entity, path, and graph search."""
from __future__ import annotations
import networkx as nx
from storage import file_store as fs
def _load_graph() -> nx.Graph:
    """Assemble an undirected graph from the persisted KG nodes and edges."""
    stored_nodes = fs.load_kg_nodes()
    stored_edges = fs.load_kg_edges()
    graph = nx.Graph()
    for item in stored_nodes:
        graph.add_node(item["id"], **item)
    for link in stored_edges:
        graph.add_edge(
            link["source"],
            link["target"],
            relation=link.get("relation", "CO_OCCURS_IN"),
            doc_id=link.get("doc_id", ""),
            page=link.get("page", 0),
        )
    return graph
def search_entities(q: str, entity_type: str | None = None, limit: int = 15) -> dict:
    """Find nodes whose name contains *q* (case-insensitive).

    Matches can be narrowed to ``entity_type`` (case-insensitive) and are
    truncated to ``limit``; ``total`` is the length of the truncated list.
    """
    nodes = fs.load_kg_nodes()
    degree_of = dict(_load_graph().degree())
    needle = q.lower()

    hits = [node for node in nodes if needle in node.get("name", "").lower()]
    if entity_type:
        wanted = entity_type.upper()
        hits = [node for node in hits if node.get("type", "").upper() == wanted]
    for node in hits:
        node["degree"] = degree_of.get(node["id"], 0)

    page = hits[:limit]
    return {"query": q, "total": len(page), "items": page}
def search_path(from_id: str, to_id: str, max_hops: int = 3) -> dict | None:
    """List every simple path between two nodes within a hop limit.

    ``max_hops`` is clamped to 1..5. Returns ``None`` when either endpoint
    is not a stored node; otherwise a dict with both endpoint summaries, the
    effective hop limit and the discovered paths.
    """
    node_map = {record["id"]: record for record in fs.load_kg_nodes()}
    if not (from_id in node_map and to_id in node_map):
        return None  # node not found

    graph = _load_graph()
    max_hops = max(1, min(max_hops, 5))
    try:
        found = list(nx.all_simple_paths(graph, from_id, to_id, cutoff=max_hops))
    except nx.NetworkXError:
        found = []

    def _step(nid: str) -> dict:
        # Intermediate nodes fall back to their id when no name is stored.
        info = node_map.get(nid, {})
        return {"id": nid, "name": info.get("name", nid), "type": info.get("type", "")}

    def _endpoint(nid: str) -> dict:
        info = node_map[nid]
        return {"id": nid, "name": info.get("name", ""), "type": info.get("type", "")}

    paths = []
    for hop_ids in found:
        links = [
            {
                "source": left,
                "target": right,
                "relation": graph.edges[left, right].get("relation", "CO_OCCURS_IN"),
            }
            for left, right in zip(hop_ids, hop_ids[1:])
        ]
        paths.append({
            "length": len(hop_ids) - 1,
            "nodes": [_step(nid) for nid in hop_ids],
            "edges": links,
        })

    return {
        "from": _endpoint(from_id),
        "to": _endpoint(to_id),
        "max_hops": max_hops,
        "paths": paths,
        "total_paths": len(paths),
    }
def search_graph(q: str, include_neighbors: bool = False) -> dict:
    """Return the nodes matching *q* plus the stored edges among them.

    With ``include_neighbors`` the direct graph neighbors of every match
    also count as relevant when selecting edges. An edge is kept only when
    both of its endpoints are relevant.
    """
    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()
    graph = _load_graph()
    degree_of = dict(graph.degree())
    needle = q.lower()

    matched = []
    for node in nodes:
        if needle in node.get("name", "").lower():
            matched.append(node)
    relevant = {node["id"] for node in matched}
    for node in matched:
        node["degree"] = degree_of.get(node["id"], 0)

    if include_neighbors:
        # Iterate over a snapshot so the set can grow while we walk the matches.
        for nid in tuple(relevant):
            if nid in graph:
                relevant.update(graph.neighbors(nid))

    subgraph_edges = []
    for edge in edges:
        if edge.get("source") in relevant and edge.get("target") in relevant:
            subgraph_edges.append(edge)

    return {"query": q, "matched_nodes": matched, "subgraph_edges": subgraph_edges}

View File

View File

@@ -0,0 +1,268 @@
"""
File Store — unified JSON read/write for all backend data.
All data lives under backend/data/.
"""
from __future__ import annotations
import json
import os
import shutil
from pathlib import Path
from typing import Any
# Root data directory: "data" inside the directory one level above this file's folder
_BASE = Path(__file__).parent.parent / "data"
UPLOADS_DIR = _BASE / "uploads"
JOBS_DIR = _BASE / "jobs"
KG_DIR = _BASE / "kg"
QUERY_DIR = _BASE / "jobs"  # same path as JOBS_DIR. NOTE(review): query_history_path() resolves to _BASE / "query_history.jsonl", not this directory — confirm whether QUERY_DIR is still needed
# Ensure directories exist at import time (QUERY_DIR is covered because it equals JOBS_DIR)
for _d in (UPLOADS_DIR, JOBS_DIR, KG_DIR):
    _d.mkdir(parents=True, exist_ok=True)
# ---------------------------------------------------------------------------
# Generic helpers
# ---------------------------------------------------------------------------
def read_json(path: Path) -> Any:
    """Parse the JSON document stored at *path*; a missing file yields ``None``."""
    if path.exists():
        with path.open("r", encoding="utf-8") as handle:
            return json.load(handle)
    return None
def write_json(path: Path, data: Any) -> None:
    """Atomically write *data* as JSON to *path*.

    The payload is first written to a uniquely named temporary file in the
    destination directory and then moved over *path* with ``os.replace``.
    A unique name (instead of a fixed ``<stem>.tmp``) means two concurrent
    writers, or two targets that share a stem such as ``a.json`` and
    ``a.jsonl``, never clobber each other's temp file. If serialisation
    fails, the temp file is removed and an existing *path* is left untouched.

    Note: ``tempfile.mkstemp`` creates the file readable and writable only
    by the owner, so on POSIX the result carries those permissions.

    Args:
        path: Destination file; missing parent directories are created.
        data: Any JSON-serialisable object.

    Raises:
        TypeError: if *data* is not JSON-serialisable.
        ValueError: if *data* contains a circular reference.
        OSError: if the file cannot be written or replaced.
    """
    import tempfile

    path.parent.mkdir(parents=True, exist_ok=True)
    fd, tmp_name = tempfile.mkstemp(
        dir=path.parent, prefix=f"{path.name}.", suffix=".tmp"
    )
    try:
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp_name, path)
    except BaseException:
        # Never leave a half-written temp file behind.
        try:
            os.unlink(tmp_name)
        except OSError:
            pass
        raise
def append_jsonl(path: Path, record: dict) -> None:
    """Add *record* as one JSON line at the end of *path*, creating parents as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as sink:
        serialised = json.dumps(record, ensure_ascii=False)
        sink.write(f"{serialised}\n")
def read_jsonl(path: Path) -> list[dict]:
    """Load every parseable record from a JSONL file.

    A missing file yields an empty list. Blank lines and lines that are not
    valid JSON are skipped silently.
    """
    if not path.exists():
        return []
    parsed = []
    with path.open("r", encoding="utf-8") as source:
        for raw in source:
            text = raw.strip()
            if not text:
                continue
            try:
                parsed.append(json.loads(text))
            except json.JSONDecodeError:
                # Best-effort read: a corrupt line must not hide the others.
                continue
    return parsed
# ---------------------------------------------------------------------------
# Document helpers
# ---------------------------------------------------------------------------
def docs_index_path() -> Path:
    """Return the path of the documents index file under the data root."""
    return _BASE.joinpath("docs_index.json")
def load_docs_index() -> dict[str, dict]:
    """Return the documents index ``{doc_id: DocumentInfo dict}``.

    Anything other than a stored dict (including a missing file) yields ``{}``.
    """
    loaded = read_json(docs_index_path())
    if isinstance(loaded, dict):
        return loaded
    return {}
def save_docs_index(index: dict[str, dict]) -> None:
    """Persist the whole documents index, replacing the stored one."""
    destination = docs_index_path()
    write_json(destination, index)
def get_doc(doc_id: str) -> dict | None:
    """Look up one document record by id; ``None`` when it is not indexed."""
    index = load_docs_index()
    return index.get(doc_id)
def save_doc(doc: dict) -> None:
    """Insert or overwrite *doc* in the documents index, keyed by ``doc["doc_id"]``."""
    index = load_docs_index()
    index.update({doc["doc_id"]: doc})
    save_docs_index(index)
def delete_doc(doc_id: str) -> bool:
    """Remove a document from the index and delete its uploaded file.

    The index entry is captured *before* it is removed so that its
    ``upload_filename`` is still available for the file cleanup. An entry
    without a filename skips the cleanup; joining an empty name onto
    ``UPLOADS_DIR`` would address the uploads directory itself.

    Returns:
        ``False`` when *doc_id* is not indexed, ``True`` otherwise.
    """
    index = load_docs_index()
    if doc_id not in index:
        return False
    doc_info = index.pop(doc_id) or {}
    save_docs_index(index)

    # Remove the upload file, but only when the record names a regular file.
    upload_name = doc_info.get("upload_filename") or ""
    if upload_name:
        upload_path = UPLOADS_DIR / upload_name
        if upload_path.is_file():
            upload_path.unlink()
    return True
# ---------------------------------------------------------------------------
# Job helpers
# ---------------------------------------------------------------------------
def job_dir(job_id: str) -> Path:
    """Return the directory that holds the files of job *job_id*."""
    return JOBS_DIR.joinpath(job_id)
def job_meta_path(job_id: str) -> Path:
    """Return the path of ``meta.json`` inside the job's directory."""
    folder = job_dir(job_id)
    return folder / "meta.json"
def load_job_meta(job_id: str) -> dict | None:
    """Read the job's ``meta.json``; ``None`` when the file does not exist."""
    meta_file = job_meta_path(job_id)
    return read_json(meta_file)
def save_job_meta(job_id: str, meta: dict) -> None:
    """Write *meta* to the job's ``meta.json``, creating the job directory first."""
    folder = job_dir(job_id)
    folder.mkdir(parents=True, exist_ok=True)
    write_json(job_meta_path(job_id), meta)
def list_all_jobs() -> list[dict]:
    """Collect the metadata of every job directory that has a non-empty ``meta.json``."""
    job_folders = (entry for entry in JOBS_DIR.iterdir() if entry.is_dir())
    loaded = (read_json(folder / "meta.json") for folder in job_folders)
    return [meta for meta in loaded if meta]
def delete_job(job_id: str) -> None:
    """Delete the job's directory tree; a missing directory is a no-op."""
    target = job_dir(job_id)
    if not target.exists():
        return
    shutil.rmtree(target)
# ---------------------------------------------------------------------------
# Global KG helpers
# ---------------------------------------------------------------------------
def kg_nodes_path() -> Path:
    """Return the path of the global KG node file."""
    return KG_DIR.joinpath("kg_nodes.json")
def kg_edges_path() -> Path:
    """Return the path of the global KG edge file."""
    return KG_DIR.joinpath("kg_edges.json")
def load_kg_nodes() -> list[dict]:
    """Load the global KG nodes; anything other than a stored list yields ``[]``."""
    loaded = read_json(kg_nodes_path())
    if isinstance(loaded, list):
        return loaded
    return []
def load_kg_edges() -> list[dict]:
    """Load the global KG edges; anything other than a stored list yields ``[]``."""
    loaded = read_json(kg_edges_path())
    if isinstance(loaded, list):
        return loaded
    return []
def save_kg_nodes(nodes: list[dict]) -> None:
    """Persist *nodes* as the complete global KG node list."""
    destination = kg_nodes_path()
    write_json(destination, nodes)
def save_kg_edges(edges: list[dict]) -> None:
    """Persist *edges* as the complete global KG edge list."""
    destination = kg_edges_path()
    write_json(destination, edges)
def merge_kg(new_nodes: list[dict], new_edges: list[dict], doc_id: str) -> tuple[int, int]:
    """Merge a job's KG output into the global KG and persist the result.

    Nodes and edges previously stored for *doc_id* are dropped first, so
    re-indexing a document replaces its earlier contribution. New nodes are
    then de-duplicated by ``(name.lower(), type)`` and new edges by the
    unordered endpoint pair plus ``doc_id``.

    Args:
        new_nodes: Node dicts; each must carry ``name`` and ``type``.
        new_edges: Edge dicts; each must carry ``source`` and ``target``.
        doc_id: Document whose earlier nodes/edges are replaced.

    Returns:
        ``(total_nodes, total_edges)``: the sizes of the global KG after
        the merge (not the number of removed or added items).
    """
    existing_nodes = [n for n in load_kg_nodes() if n.get("source_doc") != doc_id]
    existing_edges = [e for e in load_kg_edges() if e.get("doc_id") != doc_id]

    # Merge nodes: the first node seen for a (name, type) key wins.
    node_keys: set[tuple] = {(n["name"].lower(), n["type"]) for n in existing_nodes}
    for n in new_nodes:
        key = (n["name"].lower(), n["type"])
        if key not in node_keys:
            existing_nodes.append(n)
            node_keys.add(key)
    # NOTE(review): a new node skipped as a duplicate is not aliased to the
    # surviving node, so new edges that reference its id may point at an id
    # missing from the node list — confirm node ids are stable across documents.

    def _edge_key(e: dict) -> tuple:
        # `.get` mirrors the filter above: an edge stored without a doc_id
        # survives that filter and must not raise KeyError here.
        s, t = e["source"], e["target"]
        return (min(s, t), max(s, t), e.get("doc_id"))

    edge_keys: set[tuple] = {_edge_key(e) for e in existing_edges}
    for e in new_edges:
        key = _edge_key(e)
        if key not in edge_keys:
            existing_edges.append(e)
            edge_keys.add(key)

    save_kg_nodes(existing_nodes)
    save_kg_edges(existing_edges)
    return len(existing_nodes), len(existing_edges)
def remove_doc_from_kg(doc_id: str) -> tuple[int, int]:
    """Drop every node and edge that belongs to *doc_id* from the global KG.

    Returns:
        ``(removed_nodes, removed_edges)``.
    """
    all_nodes = load_kg_nodes()
    all_edges = load_kg_edges()
    kept_nodes = [node for node in all_nodes if node.get("source_doc") != doc_id]
    kept_edges = [edge for edge in all_edges if edge.get("doc_id") != doc_id]
    save_kg_nodes(kept_nodes)
    save_kg_edges(kept_edges)
    removed_nodes = len(all_nodes) - len(kept_nodes)
    removed_edges = len(all_edges) - len(kept_edges)
    return removed_nodes, removed_edges
# ---------------------------------------------------------------------------
# Query history helpers
# ---------------------------------------------------------------------------
def query_history_path() -> Path:
    """Return the path of the query history JSONL file under the data root."""
    return _BASE.joinpath("query_history.jsonl")
def append_query_history(result: dict) -> None:
    """Append one query record to the history file."""
    history_file = query_history_path()
    append_jsonl(history_file, result)
def load_query_history() -> list[dict]:
    """Return all stored query records, newest first."""
    oldest_first = read_jsonl(query_history_path())
    return oldest_first[::-1]
# ---------------------------------------------------------------------------
# Batch job helpers
# ---------------------------------------------------------------------------
def batch_meta_path(batch_id: str) -> Path:
    """Return the path of the batch's JSON file inside the ``batches`` folder."""
    return _BASE.joinpath("batches", f"{batch_id}.json")
def load_batch_meta(batch_id: str) -> dict | None:
    """Read the batch's metadata file; ``None`` when it does not exist."""
    meta_file = batch_meta_path(batch_id)
    return read_json(meta_file)
def save_batch_meta(batch_id: str, meta: dict) -> None:
    """Write *meta* to the batch's metadata file."""
    meta_file = batch_meta_path(batch_id)
    write_json(meta_file, meta)
# ---------------------------------------------------------------------------
# Storage usage
# ---------------------------------------------------------------------------
def storage_used_mb() -> float:
    """Return the combined size of all files under the data root, in MiB (2 decimals)."""
    byte_count = sum(
        entry.stat().st_size for entry in _BASE.rglob("*") if entry.is_file()
    )
    return round(byte_count / (1024 * 1024), 2)

256
backend/tests/test_api.py Normal file
View File

@@ -0,0 +1,256 @@
"""
API integration tests — tests all major endpoints against a running server.
Run with: python tests/test_api.py
Server must be running on http://localhost:8000
"""
import json
import sys
import time
import urllib.request
import urllib.error
from pathlib import Path
# Base URL of the API under test; req() appends the endpoint path to it
BASE = "http://localhost:8000/api/v1"
# Console tags wrapped in ANSI colour escape sequences
PASS = "\033[92m[PASS]\033[0m"
FAIL = "\033[91m[FAIL]\033[0m"
INFO = "\033[94m[INFO]\033[0m"
# Running tally of assertions, updated by check()
results = {"passed": 0, "failed": 0}
def req(method: str, path: str, body: dict | None = None, form: dict | None = None) -> dict:
    """Send one HTTP request to the API and return the decoded JSON body.

    A plain GET (no body, no form) is issued directly from the URL; any
    other call goes through a ``Request`` object, with a JSON payload when
    *body* is given. The body of an HTTP error response is decoded and
    returned the same way as a successful one.
    """
    url = BASE + path
    try:
        if method == "GET" and not body and not form:
            target = url
        elif body is not None:
            target = urllib.request.Request(
                url,
                data=json.dumps(body).encode(),
                method=method,
                headers={"Content-Type": "application/json"},
            )
        else:
            target = urllib.request.Request(url, method=method)
        response = urllib.request.urlopen(target, timeout=30)
        return json.loads(response.read().decode())
    except urllib.error.HTTPError as err:
        return json.loads(err.read().decode())
def check(name: str, condition: bool, detail: str = "") -> None:
    """Record one assertion in ``results`` and print a PASS or FAIL line for it."""
    if not condition:
        results["failed"] += 1
        print(f" {FAIL} {name} {detail}")
        return
    results["passed"] += 1
    print(f" {PASS} {name}")
def wait_for_server(max_retries: int = 15) -> bool:
    """Poll the server root until it answers, retrying once per second.

    Any HTTP response counts as "up", including error statuses such as 404:
    an ``HTTPError`` means the server accepted the connection and replied.
    Only failures to get a response at all (connection refused, timeout, ...)
    trigger a retry.

    Args:
        max_retries: Maximum number of attempts before giving up.

    Returns:
        ``True`` as soon as the server responds, ``False`` when every
        attempt failed.
    """
    print(f"{INFO} Waiting for server at {BASE}...")
    root_url = BASE.replace("/api/v1", "/")
    for _ in range(max_retries):
        try:
            urllib.request.urlopen(root_url, timeout=3).close()
        except urllib.error.HTTPError:
            # The server replied with an error status, so it is running.
            pass
        except Exception:
            time.sleep(1)
            continue
        print(f"{INFO} Server is up.")
        return True
    return False
# ─────────────────────────────────────────────────────────────────────────────
# Test groups
# ─────────────────────────────────────────────────────────────────────────────
def test_system():
    """Exercise the system endpoints: health, stats, formats and demo data.

    Every response's ``data`` field is normalised with ``or {}`` before use,
    so a response carrying ``"data": null`` is reported as failed checks
    instead of crashing the script on the informational print.
    """
    print("\n── F 组: System ──")
    r = req("GET", "/health")
    health = r.get("data") or {}
    check("GET /health returns code=0", r.get("code") == 0)
    check("health data.status exists", "status" in health)
    check("health data.components exists", "components" in health)
    print(f" {INFO} status={health.get('status')} uptime={health.get('uptime_seconds')}s")

    r = req("GET", "/system/stats")
    check("GET /system/stats returns code=0", r.get("code") == 0)
    d = r.get("data") or {}
    check("stats has total_documents", "total_documents" in d)
    check("stats has total_nodes", "total_nodes" in d)
    print(f" {INFO} docs={d.get('total_documents')} nodes={d.get('total_nodes')} edges={d.get('total_edges')}")

    r = req("GET", "/system/formats")
    check("GET /system/formats returns code=0", r.get("code") == 0)
    d = r.get("data") or {}
    check("formats list is non-empty", len(d.get("formats", [])) > 0)
    exts = [f["ext"] for f in d.get("formats", [])]
    check("pdf format present", "pdf" in exts)
    check("docx format present", "docx" in exts)

    r = req("GET", "/system/demo")
    check("GET /system/demo returns code=0 or 3002", r.get("code") in (0, 3002))
    if r.get("code") == 0:
        d = r.get("data") or {}
        check("demo data has nodes", "nodes" in d)
        print(f" {INFO} demo: {len(d.get('nodes',[]))} nodes, {len(d.get('edges',[]))} edges")
    else:
        print(f" {INFO} demo data not available (no KG yet) — code={r.get('code')}")
def test_documents():
    """Exercise the document endpoints: listing, upload validation, lookup.

    The upload check posts a file with an unsupported extension and expects
    the response body to carry ``code=1002``. The body is read on both the
    success and the HTTP-error path, so the check works whether the API
    signals the rejection through the status code or only in the JSON body.
    A body that is not a JSON object is treated as an empty response.
    """
    print("\n── A 组: Documents ──")
    r = req("GET", "/documents")
    check("GET /documents returns code=0", r.get("code") == 0)
    d = r.get("data") or {}
    check("documents list has total field", "total" in d)
    check("documents list has items field", "items" in d)
    print(f" {INFO} total documents={d.get('total', 0)}")

    # Upload a test text file (not a real supported format to test validation)
    print(" Testing upload validation...")
    boundary = "boundary123"
    body_parts = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="file"; filename="test.xyz"\r\n'
        f"Content-Type: application/octet-stream\r\n\r\n"
        f"dummy content\r\n"
        f"--{boundary}--\r\n"
    ).encode()
    req_obj = urllib.request.Request(
        BASE + "/documents/upload",
        data=body_parts,
        method="POST",
        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
    )
    try:
        with urllib.request.urlopen(req_obj, timeout=10) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        raw = e.read()
    try:
        r_upload = json.loads(raw.decode())
    except ValueError:
        r_upload = {}
    if not isinstance(r_upload, dict):
        r_upload = {}
    check("upload unsupported format returns code=1002", r_upload.get("code") == 1002)

    r = req("GET", "/documents/nonexistent_id")
    check("GET /documents/nonexistent returns code=2001", r.get("code") == 2001)
def test_indexing():
    """Check that the indexing endpoints reject unknown document and job ids."""
    print("\n── B 组: Indexing ──")
    started = req("POST", "/index/start", body={"doc_id": "nonexistent_doc"})
    check("start indexing nonexistent doc returns 2001", started.get("code") == 2001)

    # Every job-scoped endpoint must answer 2002 for an unknown job id.
    unknown_job_cases = (
        ("GET", "/index/status/nonexistent_job", "get status nonexistent job returns 2002"),
        ("GET", "/index/result/nonexistent_job", "get result nonexistent job returns 2002"),
        ("DELETE", "/index/jobs/nonexistent_job", "cancel nonexistent job returns 2002"),
    )
    for method, path, label in unknown_job_cases:
        reply = req(method, path)
        check(label, reply.get("code") == 2002)
def test_kg():
    """Exercise the knowledge-graph endpoints: stats, nodes, edges, export.

    When the node list is non-empty, the first node's detail and neighbor
    endpoints are probed as well. The node id is percent-encoded before it
    is placed in the URL path, so ids containing non-ASCII or reserved
    characters do not break the request.
    """
    from urllib.parse import quote

    print("\n── C 组: Knowledge Graph ──")
    r = req("GET", "/kg/stats")
    check("GET /kg/stats returns code=0", r.get("code") == 0)
    d = r.get("data") or {}
    check("stats has total_nodes", "total_nodes" in d)
    check("stats has total_edges", "total_edges" in d)
    print(f" {INFO} KG: {d.get('total_nodes')} nodes, {d.get('total_edges')} edges")

    r = req("GET", "/kg/nodes")
    check("GET /kg/nodes returns code 0 or 3002", r.get("code") in (0, 3002))
    if r.get("code") == 0:
        d = r.get("data") or {}
        check("nodes data has items", "items" in d)
        print(f" {INFO} nodes total={d.get('total')}")
        if d.get("items"):
            node_id = d["items"][0]["id"]
            # Labels keep the raw id; only the URL uses the encoded form.
            node_path = quote(str(node_id), safe="")
            r2 = req("GET", f"/kg/nodes/{node_path}")
            check(f"GET /kg/nodes/{node_id} returns code=0", r2.get("code") == 0)
            r3 = req("GET", f"/kg/nodes/{node_path}/neighbors?hops=1")
            check(f"GET /kg/nodes/{node_id}/neighbors returns code=0", r3.get("code") == 0)
    else:
        print(f" {INFO} KG is empty (code=3002) — skipping node detail tests")

    r = req("GET", "/kg/nodes/definitely_not_a_real_node")
    check("GET /kg/nodes/invalid returns code=3001", r.get("code") == 3001)
    r = req("GET", "/kg/edges")
    check("GET /kg/edges returns code=0", r.get("code") == 0)
    r = req("GET", "/kg/export")
    check("GET /kg/export returns code=0", r.get("code") == 0)
def test_search():
    """Exercise the search endpoints: entity search, path search and graph search."""
    print("\n── E 组: Search ──")

    entity_reply = req("GET", "/search/entities?q=graph")
    check("GET /search/entities returns code=0", entity_reply.get("code") == 0)
    entity_data = entity_reply.get("data") or {}
    for field in ("query", "items"):
        check(f"search entities has {field} field", field in entity_data)
    print(f" {INFO} 'graph' search: {entity_data.get('total', 0)} results")

    typed_reply = req("GET", "/search/entities?q=technology&type=TECHNOLOGY")
    check("GET /search/entities with type filter returns code=0", typed_reply.get("code") == 0)

    # Omitting from/to must be rejected as a parameter error.
    path_reply = req("GET", "/search/path?max_hops=2")
    check("path search without from/to returns 1001", path_reply.get("code") == 1001)

    graph_reply = req("GET", "/search/graph?q=knowledge")
    check("GET /search/graph returns code=0", graph_reply.get("code") == 0)
    graph_data = graph_reply.get("data") or {}
    check("graph search has matched_nodes", "matched_nodes" in graph_data)
def test_query():
    """Exercise the QA endpoints that do not need an LLM answer: history and batch."""
    print("\n── D 组: QA Query ──")

    # POST /query is skipped here because it needs the DeepSeek API and KG data.
    history_reply = req("GET", "/query/history")
    check("GET /query/history returns code=0", history_reply.get("code") == 0)
    history = history_reply.get("data") or {}
    for field in ("total", "items"):
        check(f"history has {field} field", field in history)
    print(f" {INFO} query history: {history.get('total', 0)} records")

    missing_reply = req("GET", "/query/batch/nonexistent_batch")
    check("GET /query/batch/nonexistent returns 2002", missing_reply.get("code") == 2002)

    batch_reply = req("POST", "/query/batch", body={"questions": ["test question"]})
    check("POST /query/batch returns code=0", batch_reply.get("code") == 0)
    batch = batch_reply.get("data") or {}
    check("batch has batch_id", "batch_id" in batch)
# ─────────────────────────────────────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Abort early when nothing is listening; the suites need a live server.
    if not wait_for_server():
        print(f"\n{FAIL} Server not responding. Start with: python main.py")
        sys.exit(1)

    for suite in (test_system, test_documents, test_indexing,
                  test_kg, test_search, test_query):
        suite()

    passed, failed = results["passed"], results["failed"]
    print(f"\n{'='*50}")
    print(f"Results: {passed}/{passed + failed} passed, {failed} failed")
    if failed == 0:
        print(f"{PASS} All tests passed!")
    else:
        print(f"{FAIL} {failed} test(s) failed")
    print(f"{'='*50}")
    # The exit code mirrors the outcome so the script can gate automation.
    sys.exit(1 if failed else 0)