GraphRAG Studio — initial commit: multimodal RAG system with KG visualization
Full-stack application for document-to-knowledge-graph pipeline: - Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing - Frontend: React 19 + Vite + D3.js + shadcn/ui - Pipeline: MinerU parsing → LangExtract entity extraction → KG building Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
10
backend/.env.example
Normal file
10
backend/.env.example
Normal file
@@ -0,0 +1,10 @@
|
||||
# DeepSeek API (required for entity extraction + QA)
|
||||
DEEPSEEK_API_KEY=your_deepseek_api_key_here
|
||||
DEEPSEEK_BASE_URL=https://api.deepseek.com
|
||||
|
||||
# MinerU (required for document parsing)
|
||||
MINERU_API_TOKEN=your_mineru_api_token_here
|
||||
|
||||
# MinerU venv path (absolute path to python.exe)
|
||||
MINERU_PYTHON=F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe
|
||||
MINERU_PIPELINE=F:/GraphRAGAgent/mineru_mvp/pipeline.py
|
||||
10
backend/.gitignore
vendored
Normal file
10
backend/.gitignore
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
.env
|
||||
.venv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
data/uploads/
|
||||
data/jobs/
|
||||
data/kg/
|
||||
*.egg-info/
|
||||
dist/
|
||||
28
backend/CLAUDE.md
Normal file
28
backend/CLAUDE.md
Normal file
@@ -0,0 +1,28 @@
|
||||
# Backend — GraphRAG Studio API
|
||||
|
||||
## 路径
|
||||
|
||||
```
|
||||
F:\GraphRAGAgent\backend\
|
||||
```
|
||||
|
||||
## 启动命令
|
||||
|
||||
```bash
|
||||
cd F:/GraphRAGAgent/backend
|
||||
.venv/Scripts/python.exe -m uvicorn main:app --host 0.0.0.0 --port 8000 --reload
|
||||
```
|
||||
|
||||
## 接口测试
|
||||
|
||||
服务启动后,运行:
|
||||
|
||||
```bash
|
||||
.venv/Scripts/python.exe tests/test_api.py
|
||||
```
|
||||
|
||||
## API 文档
|
||||
|
||||
- Swagger UI:http://localhost:8000/docs
|
||||
- ReDoc:http://localhost:8000/redoc
|
||||
- 健康检查:http://localhost:8000/api/v1/health
|
||||
58
backend/main.py
Normal file
58
backend/main.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""
|
||||
GraphRAG Studio — FastAPI Backend
|
||||
Entry point: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure backend/ is in sys.path for absolute imports
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
|
||||
load_dotenv(Path(__file__).parent / ".env", override=True)
|
||||
|
||||
from routers import documents, indexing, kg, query, search, system
|
||||
|
||||
import os  # used only for the CORS origin setting below

# FastAPI application object; interactive docs are served at /docs and /redoc.
app = FastAPI(
    title="GraphRAG Studio API",
    description="Multimodal RAG Q&A system backend — MinerU + LangExtract + Agentic-RAG",
    version="1.0.0",
    docs_url="/docs",
    redoc_url="/redoc",
)

# CORS policy.
#
# The original configuration combined allow_origins=["*"] with
# allow_credentials=True. FastAPI's CORS documentation states that a wildcard
# origin list must not be used together with credentials: it would let any
# website issue credentialed requests against this API.
#
# Origins are now read from CORS_ALLOW_ORIGINS (comma-separated, e.g.
# "http://localhost:5173,https://studio.example.com"). When the variable is
# unset or contains "*", every origin is still accepted but credentials are
# turned off; credentials are only enabled for an explicit origin list.
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]
_cors_is_wildcard = "*" in _cors_origins

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"] if _cors_is_wildcard else _cors_origins,
    allow_credentials=not _cors_is_wildcard,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Every router is mounted below the shared /api/v1 prefix; the router's own
# sub-prefix (declared in its module) is appended to it:
#   documents.router  prefix="/documents"  → /api/v1/documents
#   indexing.router   prefix="/index"      → /api/v1/index
#   kg.router         prefix="/kg"         → /api/v1/kg
#   query.router      prefix="/query"      → /api/v1/query
#   search.router     prefix="/search"     → /api/v1/search
#   system.router     no prefix            → /api/v1/health, /api/v1/system/...
PREFIX = "/api/v1"

# Registration order is kept identical to the original explicit calls.
for _router_module in (documents, indexing, kg, query, search, system):
    app.include_router(_router_module.router, prefix=PREFIX)
|
||||
|
||||
|
||||
# Index route: returns a small JSON payload that names the API and points at
# the docs and health-check URLs. (Documented with comments rather than a
# docstring so the generated OpenAPI description stays unchanged.)
@app.get("/")
async def root():
    index_payload = {
        "msg": "GraphRAG Studio API v1.0.0",
        "docs": "/docs",
        "health": "/api/v1/health",
    }
    return index_payload
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Direct-execution entry point: serve this module's `app` with uvicorn on
    # all interfaces, port 8000, with auto-reload enabled.
    import uvicorn

    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=8000,
        reload=True,
    )
|
||||
0
backend/models/__init__.py
Normal file
0
backend/models/__init__.py
Normal file
360
backend/models/schemas.py
Normal file
360
backend/models/schemas.py
Normal file
@@ -0,0 +1,360 @@
|
||||
"""
|
||||
Pydantic v2 schemas — all API data objects per backend_service_specification-v1.0.md
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from typing import Any, Generic, Optional, TypeVar
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
T = TypeVar("T")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Universal response envelope
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Generic response envelope: a status code, a message, a per-response
# request id and an optional payload of type T.
class APIResponse(BaseModel, Generic[T]):
    code: int = 0
    msg: str = "success"
    # A fresh UUID4 string is generated for every envelope instance.
    request_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    data: Optional[T] = None

    @classmethod
    def ok(cls, data: Any = None) -> "APIResponse":
        """Build a success envelope (code 0, msg "success") wrapping *data*."""
        envelope = cls(code=0, msg="success", data=data)
        return envelope

    @classmethod
    def err(cls, code: int, msg: str) -> "APIResponse":
        """Build an error envelope with the given *code* and *msg* and no payload."""
        envelope = cls(code=code, msg=msg, data=None)
        return envelope
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# A. Document schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Record describing one document known to the API.
class DocumentInfo(BaseModel):
    doc_id: str
    filename: str
    format: str
    size_bytes: int
    pages: Optional[int] = None  # None when no page count is supplied
    uploaded_at: str  # timestamp carried as a plain string
    status: str  # uploaded | indexed | failed
    # Options with defaults; presumably forwarded to the parser — confirm
    # against the documents router.
    language: str = "ch"
    enable_formula: bool = True
    enable_table: bool = True
|
||||
|
||||
|
||||
# Paginated listing of DocumentInfo records.
class DocumentListData(BaseModel):
    total: int
    page: int
    page_size: int
    items: list[DocumentInfo]
|
||||
|
||||
|
||||
# Result payload of a document deletion, including how many KG nodes and
# edges were reported as removed.
class DeleteDocumentData(BaseModel):
    deleted: bool
    doc_id: str
    removed_nodes: int
    removed_edges: int
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# B. Indexing job schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Progress counters of an indexing job; every counter starts at zero.
class IndexingProgress(BaseModel):
    parsed_pages: int = 0
    total_pages: int = 0
    extracted_entities: int = 0
|
||||
|
||||
|
||||
# Status snapshot of one indexing job.
class IndexingJobStatus(BaseModel):
    job_id: str
    doc_id: str
    status: str  # submitted|queued|parsing|extracting|indexing|done|failed|cancelled
    stage: str = ""
    # default_factory gives each instance its own zeroed progress object.
    progress: IndexingProgress = Field(default_factory=IndexingProgress)
    created_at: str
    elapsed_seconds: float = 0.0
    error: Optional[str] = None  # None unless an error message is recorded
|
||||
|
||||
|
||||
# Request body naming the document to index.
class StartIndexRequest(BaseModel):
    doc_id: str
|
||||
|
||||
|
||||
# Result payload of a job cancellation; previous_status carries the status
# string the job had before the request.
class CancelJobData(BaseModel):
    cancelled: bool
    job_id: str
    previous_status: str
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# C. KG schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# One knowledge-graph node as exposed by the API.
class KGNode(BaseModel):
    id: str
    name: str
    type: str
    source_doc: str
    # Character span of the mention; None when no span is available.
    char_start: Optional[int] = None
    char_end: Optional[int] = None
    # NOTE(review): pipeline/kg_builder.py appears to store the LangExtract
    # alignment status string in this field — confirm.
    confidence: Optional[str] = None
    page: int = 0
    degree: int = 0
|
||||
|
||||
|
||||
# KGNode extended with two graph metrics, both defaulting to zero.
class KGNodeDetail(KGNode):
    degree_centrality: float = 0.0
    neighbor_count: int = 0
|
||||
|
||||
|
||||
# One knowledge-graph edge between two node ids.
class KGEdge(BaseModel):
    source: str
    target: str
    relation: str = "CO_OCCURS_IN"  # relation label used when none is given
    doc_id: str
    page: int = 0
|
||||
|
||||
|
||||
# Paginated listing of KGNode records.
class KGNodeListData(BaseModel):
    total: int
    page: int
    page_size: int
    items: list[KGNode]
|
||||
|
||||
|
||||
# Paginated listing of KGEdge records.
class KGEdgeListData(BaseModel):
    total: int
    page: int
    page_size: int
    items: list[KGEdge]
|
||||
|
||||
|
||||
# Aggregate statistics over the knowledge graph.
class KGStatsData(BaseModel):
    total_nodes: int
    total_edges: int
    density: float
    type_distribution: dict[str, int]  # count per node type name
    relation_types: dict[str, int]  # count per relation label
    top5_central_nodes: list[dict]  # free-form dicts; schema not fixed here
    source_documents: list[str]
|
||||
|
||||
|
||||
# Export payload containing the nodes and edges of the knowledge graph.
class KGExportData(BaseModel):
    format: str
    # Under Pydantic v2 an Optional[...] annotation without a default is a
    # *required* nullable field. Every other optional field in this module
    # defaults to None, so the same default is given here; callers that pass
    # doc_id explicitly are unaffected.
    doc_id: Optional[str] = None
    total_nodes: int
    total_edges: int
    exported_at: str  # timestamp carried as a plain string
    nodes: list[KGNode]
    edges: list[KGEdge]
|
||||
|
||||
|
||||
# Compact node description used in neighbor query results.
class NeighborInfo(BaseModel):
    id: str
    name: str
    type: str
    page: int
|
||||
|
||||
|
||||
# Result of a neighborhood query around one center node.
class NeighborsData(BaseModel):
    center: NeighborInfo
    hops: int
    # Neighbor lists grouped under string keys (presumably the hop distance
    # as a string — confirm against the kg router).
    neighbors_by_hop: dict[str, list[NeighborInfo]]
    total_neighbors: int
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# D. QA schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# One message of a chat history.
class ChatMessage(BaseModel):
    role: str  # human | ai
    content: str
|
||||
|
||||
|
||||
# Request body for a question, with an optional prior chat history.
class QueryRequest(BaseModel):
    question: str
    # default_factory gives each request its own empty list.
    history: list[ChatMessage] = Field(default_factory=list)
|
||||
|
||||
|
||||
# Record of one tool invocation: tool name, input dict and output string.
class ToolCallRecord(BaseModel):
    tool: str
    input: dict
    output: str
|
||||
|
||||
|
||||
# Result of one answered question.
class QAResult(BaseModel):
    query_id: str
    question: str
    answer: str
    # Both lists default to empty, one fresh list per instance.
    tool_calls: list[ToolCallRecord] = Field(default_factory=list)
    cited_nodes: list[str] = Field(default_factory=list)
    elapsed_seconds: float
    created_at: str  # timestamp carried as a plain string
|
||||
|
||||
|
||||
# Paginated listing of QAResult records.
class QAHistoryData(BaseModel):
    total: int
    page: int
    page_size: int
    items: list[QAResult]
|
||||
|
||||
|
||||
# Request body for a batch of questions. The field is required and capped at
# 20 entries.
# NOTE(review): no lower bound is set, so an empty list validates.
class BatchQueryRequest(BaseModel):
    questions: list[str] = Field(..., max_length=20)
|
||||
|
||||
|
||||
# Descriptor of a batch: its id, size, status string and creation time.
class BatchQueryData(BaseModel):
    batch_id: str
    total: int
    status: str
    created_at: str
|
||||
|
||||
|
||||
# Batch outcome: completed/failed counters plus the list of QAResult items.
class BatchResultData(BaseModel):
    batch_id: str
    total: int
    completed: int
    failed: int
    status: str
    results: list[QAResult]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# E. Search schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Entity search result: the query string, a total and the matching nodes.
class EntitySearchData(BaseModel):
    query: str
    total: int
    items: list[KGNode]
|
||||
|
||||
|
||||
# Node as it appears inside a path result.
class PathNode(BaseModel):
    id: str
    name: str
    type: str
|
||||
|
||||
|
||||
# Edge as it appears inside a path result; relation has no default here.
class PathEdge(BaseModel):
    source: str
    target: str
    relation: str
|
||||
|
||||
|
||||
# One path: its length plus the nodes and edges it consists of.
class PathInfo(BaseModel):
    length: int
    nodes: list[PathNode]
    edges: list[PathEdge]
|
||||
|
||||
|
||||
# Result of a path search between two nodes.
class PathSearchData(BaseModel):
    # "from" is a Python keyword, so the endpoints are stored as
    # from_node/to_node and exposed under the aliases "from"/"to".
    from_node: PathNode = Field(alias="from")
    to_node: PathNode = Field(alias="to")
    max_hops: int
    paths: list[PathInfo]
    total_paths: int

    # Lets callers construct the model with the field names as well as the
    # aliases.
    model_config = {"populate_by_name": True}
|
||||
|
||||
|
||||
# Graph search result: matched nodes and the edges of the returned subgraph.
class GraphSearchData(BaseModel):
    query: str
    matched_nodes: list[KGNode]
    subgraph_edges: list[KGEdge]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# F. System schemas
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Health report for one component. Only `status` is required; the remaining
# fields are optional and default to None.
class ComponentHealth(BaseModel):
    status: str  # ok | error
    path: Optional[str] = None
    exists: Optional[bool] = None
    base_url: Optional[str] = None
    key_configured: Optional[bool] = None
    kg_nodes_exists: Optional[bool] = None
    kg_edges_exists: Optional[bool] = None
    uploads_dir_exists: Optional[bool] = None
|
||||
|
||||
|
||||
# Overall health payload with per-component reports keyed by component name.
class HealthData(BaseModel):
    status: str
    version: str
    uptime_seconds: float
    components: dict[str, ComponentHealth]
|
||||
|
||||
|
||||
# System-wide counters: documents, graph size, queries, jobs and storage.
class SystemStatsData(BaseModel):
    total_documents: int
    indexed_documents: int
    failed_documents: int
    total_nodes: int
    total_edges: int
    type_distribution: dict[str, int]  # count per node type name
    total_queries: int
    active_jobs: int
    storage_used_mb: float
|
||||
|
||||
|
||||
# Description of one supported file format and its limits.
class FormatInfo(BaseModel):
    ext: str
    description: str
    max_size_mb: int
    max_pages: int
    requires_ocr: bool
|
||||
|
||||
|
||||
# Supported-formats payload: format list, OCR languages and notes.
class FormatsData(BaseModel):
    formats: list[FormatInfo]
    ocr_languages: list[dict]  # free-form dicts; schema not fixed here
    notes: list[str]
|
||||
|
||||
|
||||
# Demo payload: nodes, edges and a free-form stats dict.
class DemoData(BaseModel):
    nodes: list[KGNode]
    edges: list[KGEdge]
    stats: dict
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# B3 index result
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Statistics of a finished indexing run. Every field has a zero or empty
# default, with one fresh dict per instance.
class IndexResultStats(BaseModel):
    blocks: int = 0
    block_types: dict[str, int] = Field(default_factory=dict)
    pages: int = 0
    raw_extractions: int = 0
    nodes: int = 0
    edges: int = 0
    type_counts: dict[str, int] = Field(default_factory=dict)
    alignment_counts: dict[str, int] = Field(default_factory=dict)
    elapsed_seconds: float = 0.0
|
||||
|
||||
|
||||
# One raw extraction: the extracted text, its type and where it was found.
class ExtractionRecord(BaseModel):
    text: str
    type: str
    # Character span and alignment label; None when not available.
    char_start: Optional[int] = None
    char_end: Optional[int] = None
    alignment: Optional[str] = None
    page: int = 0
    doc_id: str
|
||||
|
||||
|
||||
# Result payload of an indexing job. The identifiers and status are always
# present; the detail sections are optional and default to None.
class IndexResultData(BaseModel):
    job_id: str
    doc_id: str
    status: str
    stats: Optional[IndexResultStats] = None
    extractions: Optional[list[ExtractionRecord]] = None
    nodes: Optional[list[KGNode]] = None
    edges: Optional[list[KGEdge]] = None
|
||||
@@ -0,0 +1,367 @@
|
||||
[
|
||||
{
|
||||
"type": "text",
|
||||
"text": "GraphRAG System ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
344,
|
||||
175,
|
||||
655,
|
||||
204
|
||||
],
|
||||
"page_idx": 0
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Technical Architecture Overview ",
|
||||
"bbox": [
|
||||
289,
|
||||
234,
|
||||
710,
|
||||
254
|
||||
],
|
||||
"page_idx": 0
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Version 1.0 | March 2026 ",
|
||||
"bbox": [
|
||||
364,
|
||||
272,
|
||||
633,
|
||||
290
|
||||
],
|
||||
"page_idx": 0
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "1. Abstract ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
52,
|
||||
42,
|
||||
200,
|
||||
61
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "This document presents the technical architecture of a Multimodal GraphRAG System designed for intelligent document parsing and knowledge graph construction. The system integrates MinerU for document parsing, LangExtract for structured entity extraction, and a graph database for knowledge storage and retrieval. ",
|
||||
"bbox": [
|
||||
48,
|
||||
83,
|
||||
951,
|
||||
171
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files. Extracted entities and relations are stored as graph nodes and edges, enabling semantic search and question answering over large document collections. ",
|
||||
"bbox": [
|
||||
48,
|
||||
200,
|
||||
949,
|
||||
265
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "2. System Components ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
50,
|
||||
299,
|
||||
321,
|
||||
318
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "2.1 Document Parsing Module ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
50,
|
||||
343,
|
||||
349,
|
||||
361
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG, JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted images. ",
|
||||
"bbox": [
|
||||
48,
|
||||
373,
|
||||
951,
|
||||
436
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "2.2 Entity Extraction Module ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
50,
|
||||
461,
|
||||
357,
|
||||
479
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes character-level position anchoring. ",
|
||||
"bbox": [
|
||||
48,
|
||||
492,
|
||||
949,
|
||||
555
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "2.3 Knowledge Graph Module ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
50,
|
||||
580,
|
||||
337,
|
||||
596
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Extracted entities and relationships are stored in a graph database. Node types include: Person, Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY, LOCATED_IN. ",
|
||||
"bbox": [
|
||||
48,
|
||||
608,
|
||||
949,
|
||||
674
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "2.4 Retrieval Module ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
50,
|
||||
697,
|
||||
272,
|
||||
715
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The retrieval layer supports hybrid search combining vector similarity and graph traversal. \nQuery results are ranked by relevance score and returned with source document references. ",
|
||||
"bbox": [
|
||||
48,
|
||||
727,
|
||||
944,
|
||||
766
|
||||
],
|
||||
"page_idx": 1
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "3. Data Pipeline ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
50,
|
||||
42,
|
||||
268,
|
||||
61
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The end-to-end data pipeline consists of the following stages: ",
|
||||
"bbox": [
|
||||
50,
|
||||
83,
|
||||
623,
|
||||
99
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Stage 1: Document Ingestion ",
|
||||
"bbox": [
|
||||
68,
|
||||
130,
|
||||
322,
|
||||
146
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "- Accept raw documents (PDF, DOCX, images, HTML) - Submit to MinerU API for parsing - Poll task status until state $\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }$ done ",
|
||||
"bbox": [
|
||||
85,
|
||||
153,
|
||||
531,
|
||||
217
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Stage 2: Content Extraction ",
|
||||
"bbox": [
|
||||
68,
|
||||
249,
|
||||
322,
|
||||
263
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "- Download and decompress full_zip_url - Parse content_list.json into Document objects - Separate text blocks, tables, images, equations ",
|
||||
"bbox": [
|
||||
85,
|
||||
272,
|
||||
542,
|
||||
335
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Stage 3: Entity & Relation Extraction ",
|
||||
"bbox": [
|
||||
67,
|
||||
367,
|
||||
415,
|
||||
381
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "- Feed text blocks to LangExtract - Extract entities with char_interval positions - Extract relationships between entities ",
|
||||
"bbox": [
|
||||
85,
|
||||
390,
|
||||
526,
|
||||
454
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Stage 4: Graph Construction ",
|
||||
"bbox": [
|
||||
68,
|
||||
485,
|
||||
322,
|
||||
500
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "- Map extractions to graph nodes and edges - Store with source provenance (page_idx, bbox) - Build vector embeddings for semantic search ",
|
||||
"bbox": [
|
||||
85,
|
||||
508,
|
||||
522,
|
||||
571
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "4. Supported File Formats ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
50,
|
||||
604,
|
||||
326,
|
||||
620
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "table",
|
||||
"img_path": "images/1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg",
|
||||
"table_caption": [],
|
||||
"table_footnote": [],
|
||||
"table_body": "<table><tr><td rowspan=1 colspan=1>Format</td><td rowspan=1 colspan=1>Extension</td><td rowspan=1 colspan=1>OCR Required</td><td rowspan=1 colspan=1>ModeI</td></tr><tr><td rowspan=1 colspan=1>PDF (text)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline / vlm</td></tr><tr><td rowspan=1 colspan=1>PDF (scan)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>Yes</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>Word</td><td rowspan=1 colspan=1>. docx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>PowerPoint</td><td rowspan=1 colspan=1>.pptx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>Image</td><td rowspan=1 colspan=1>.png / .jpg</td><td rowspan=1 colspan=1>Auto</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>HTML</td><td rowspan=1 colspan=1>.html</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>MinerU-HTML</td></tr></table>",
|
||||
"bbox": [
|
||||
45,
|
||||
634,
|
||||
882,
|
||||
806
|
||||
],
|
||||
"page_idx": 2
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "5. API Configuration Reference ",
|
||||
"text_level": 1,
|
||||
"bbox": [
|
||||
48,
|
||||
42,
|
||||
457,
|
||||
63
|
||||
],
|
||||
"page_idx": 3
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "The following environment variables must be configured before running the MinerU parsing service: ",
|
||||
"bbox": [
|
||||
48,
|
||||
83,
|
||||
952,
|
||||
123
|
||||
],
|
||||
"page_idx": 3
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "MINERU_API_TOKEN : Bearer token for API authentication \nMINERU_USER_UID : User UUID for quota management \nMINERU_BASE_URL : https://mineru.net/api/v4 \nMINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML \nMINERU_LANGUAGE : ch (Chinese) | en (English) \nMINERU_IS_OCR : false (text PDF) | true (scanned PDF) \nMINERU_ENABLE_FORMULA: true | false \nMINERU_ENABLE_TABLE : true | false ",
|
||||
"bbox": [
|
||||
65,
|
||||
152,
|
||||
636,
|
||||
337
|
||||
],
|
||||
"page_idx": 3
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Rate Limits: ",
|
||||
"bbox": [
|
||||
48,
|
||||
367,
|
||||
161,
|
||||
381
|
||||
],
|
||||
"page_idx": 3
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": "- Max file size : 200 MB per file - Max pages : 600 pages per file - Daily quota : 2000 pages (high priority) - Batch limit : 200 files per request ",
|
||||
"bbox": [
|
||||
65,
|
||||
388,
|
||||
504,
|
||||
478
|
||||
],
|
||||
"page_idx": 3
|
||||
}
|
||||
]
|
||||
Binary file not shown.
71
backend/output/8456b615_sample_graphrag_overview/full.md
Normal file
71
backend/output/8456b615_sample_graphrag_overview/full.md
Normal file
@@ -0,0 +1,71 @@
|
||||
# GraphRAG System
|
||||
|
||||
Technical Architecture Overview
|
||||
|
||||
Version 1.0 | March 2026
|
||||
|
||||
# 1. Abstract
|
||||
|
||||
This document presents the technical architecture of a Multimodal GraphRAG System designed for intelligent document parsing and knowledge graph construction. The system integrates MinerU for document parsing, LangExtract for structured entity extraction, and a graph database for knowledge storage and retrieval.
|
||||
|
||||
The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files. Extracted entities and relations are stored as graph nodes and edges, enabling semantic search and question answering over large document collections.
|
||||
|
||||
# 2. System Components
|
||||
|
||||
# 2.1 Document Parsing Module
|
||||
|
||||
MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG, JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted images.
|
||||
|
||||
# 2.2 Entity Extraction Module
|
||||
|
||||
LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes character-level position anchoring.
|
||||
|
||||
# 2.3 Knowledge Graph Module
|
||||
|
||||
Extracted entities and relationships are stored in a graph database. Node types include: Person, Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY, LOCATED_IN.
|
||||
|
||||
# 2.4 Retrieval Module
|
||||
|
||||
The retrieval layer supports hybrid search combining vector similarity and graph traversal.
|
||||
Query results are ranked by relevance score and returned with source document references.
|
||||
|
||||
# 3. Data Pipeline
|
||||
|
||||
The end-to-end data pipeline consists of the following stages:
|
||||
|
||||
Stage 1: Document Ingestion
|
||||
|
||||
- Accept raw documents (PDF, DOCX, images, HTML) - Submit to MinerU API for parsing - Poll task status until state = done
|
||||
|
||||
Stage 2: Content Extraction
|
||||
|
||||
- Download and decompress full_zip_url - Parse content_list.json into Document objects - Separate text blocks, tables, images, equations
|
||||
|
||||
Stage 3: Entity & Relation Extraction
|
||||
|
||||
- Feed text blocks to LangExtract - Extract entities with char_interval positions - Extract relationships between entities
|
||||
|
||||
Stage 4: Graph Construction
|
||||
|
||||
- Map extractions to graph nodes and edges - Store with source provenance (page_idx, bbox) - Build vector embeddings for semantic search
|
||||
|
||||
# 4. Supported File Formats
|
||||
|
||||
<table><tr><td rowspan=1 colspan=1>Format</td><td rowspan=1 colspan=1>Extension</td><td rowspan=1 colspan=1>OCR Required</td><td rowspan=1 colspan=1>Model</td></tr><tr><td rowspan=1 colspan=1>PDF (text)</td><td rowspan=1 colspan=1>.pdf</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline / vlm</td></tr><tr><td rowspan=1 colspan=1>PDF (scan)</td><td rowspan=1 colspan=1>.pdf</td><td rowspan=1 colspan=1>Yes</td><td rowspan=1 colspan=1>vlm</td></tr><tr><td rowspan=1 colspan=1>Word</td><td rowspan=1 colspan=1>.docx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>PowerPoint</td><td rowspan=1 colspan=1>.pptx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>Image</td><td rowspan=1 colspan=1>.png / .jpg</td><td rowspan=1 colspan=1>Auto</td><td rowspan=1 colspan=1>vlm</td></tr><tr><td rowspan=1 colspan=1>HTML</td><td rowspan=1 colspan=1>.html</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>MinerU-HTML</td></tr></table>
|
||||
|
||||
# 5. API Configuration Reference
|
||||
|
||||
The following environment variables must be configured before running the MinerU parsing service:
|
||||
|
||||
MINERU_API_TOKEN : Bearer token for API authentication
|
||||
MINERU_USER_UID : User UUID for quota management
|
||||
MINERU_BASE_URL : https://mineru.net/api/v4
|
||||
MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML
|
||||
MINERU_LANGUAGE : ch (Chinese) | en (English)
|
||||
MINERU_IS_OCR : false (text PDF) | true (scanned PDF)
|
||||
MINERU_ENABLE_FORMULA: true | false
|
||||
MINERU_ENABLE_TABLE : true | false
|
||||
|
||||
Rate Limits:
|
||||
|
||||
- Max file size : 200 MB per file - Max pages : 600 pages per file - Daily quota : 2000 pages (high priority) - Batch limit : 200 files per request
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 56 KiB |
4063
backend/output/8456b615_sample_graphrag_overview/layout.json
Normal file
4063
backend/output/8456b615_sample_graphrag_overview/layout.json
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"total_blocks": 32,
|
||||
"type_distribution": {
|
||||
"text": 31,
|
||||
"table": 1
|
||||
},
|
||||
"total_pages": 4,
|
||||
"text_block_count": 31,
|
||||
"table_block_count": 1
|
||||
}
|
||||
0
backend/pipeline/__init__.py
Normal file
0
backend/pipeline/__init__.py
Normal file
66
backend/pipeline/entity_extractor.py
Normal file
66
backend/pipeline/entity_extractor.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""
|
||||
Entity Extractor — LangExtract + DeepSeek entity extraction.
|
||||
Independent implementation for the GraphRAG Studio backend.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
import langextract as lx
|
||||
from langextract.providers.openai import OpenAILanguageModel
|
||||
|
||||
load_dotenv(Path(__file__).parent.parent / ".env", override=True)
|
||||
|
||||
# Connection settings, read from the process environment at import time.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = os.environ.get("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
MODEL_ID = "deepseek-chat"

# Task description handed to LangExtract: an instruction sentence followed by
# the entity types, joined with single spaces.
PROMPT_DESCRIPTION = " ".join(
    (
        "Extract named entities from the text in order of appearance.",
        "Entity types: TECHNOLOGY (software, algorithms, models, tools),",
        "ORGANIZATION (companies, research groups, institutions),",
        "PERSON (individual people),",
        "LOCATION (places, geographic entities),",
        "CONCEPT (technical concepts, methodologies, frameworks).",
    )
)

# Few-shot example: one sample passage and the entities expected from it,
# listed in order of appearance as (class, text) pairs.
_EXAMPLE_TEXT = (
    "LangChain is a framework created by Harrison Chase for building "
    "LLM applications. It integrates with OpenAI models and Pinecone "
    "vector database for semantic search."
)
_EXAMPLE_ENTITIES = (
    ("TECHNOLOGY", "LangChain"),
    ("PERSON", "Harrison Chase"),
    ("CONCEPT", "LLM applications"),
    ("TECHNOLOGY", "OpenAI models"),
    ("TECHNOLOGY", "Pinecone"),
    ("CONCEPT", "semantic search"),
)

EXAMPLES = [
    lx.data.ExampleData(
        text=_EXAMPLE_TEXT,
        extractions=[
            lx.data.Extraction(extraction_class=entity_class, extraction_text=entity_text)
            for entity_class, entity_text in _EXAMPLE_ENTITIES
        ],
    )
]
|
||||
|
||||
|
||||
def create_model() -> OpenAILanguageModel:
    """Create the LangExtract OpenAI-provider model pointed at DeepSeek.

    Uses the module-level MODEL_ID, DEEPSEEK_API_KEY and DEEPSEEK_BASE_URL.

    Raises:
        ValueError: if DEEPSEEK_API_KEY is empty.
    """
    if DEEPSEEK_API_KEY:
        settings = {
            "model_id": MODEL_ID,
            "api_key": DEEPSEEK_API_KEY,
            "base_url": DEEPSEEK_BASE_URL,
        }
        return OpenAILanguageModel(**settings)
    raise ValueError("DEEPSEEK_API_KEY not set in backend/.env")
|
||||
|
||||
|
||||
def extract_entities(page_text: str, model: OpenAILanguageModel) -> lx.data.AnnotatedDocument:
    """Run LangExtract over *page_text* with the module's prompt and examples.

    Args:
        page_text: Text to extract entities from.
        model: Language model instance passed through to ``lx.extract``.

    Returns:
        Whatever ``lx.extract`` returns for this call (annotated as an
        ``AnnotatedDocument``).
    """
    extract_kwargs = {
        "text_or_documents": page_text,
        "prompt_description": PROMPT_DESCRIPTION,
        "examples": EXAMPLES,
        "model": model,
        "show_progress": False,  # keep progress output off
    }
    return lx.extract(**extract_kwargs)
|
||||
123
backend/pipeline/kg_builder.py
Normal file
123
backend/pipeline/kg_builder.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""
|
||||
KG Builder — node deduplication + CO_OCCURS_IN edge generation.
|
||||
Independent implementation for the GraphRAG Studio backend.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import defaultdict
|
||||
|
||||
import langextract as lx
|
||||
|
||||
from pipeline.text_assembler import PageText
|
||||
|
||||
# Alignment-status values (``ext.alignment_status.value``) that build_kg keeps;
# extractions with any other status, or with no status at all, are dropped.
ACCEPTED_ALIGNMENTS = {"match_exact", "match_greater", "match_lesser"}
|
||||
|
||||
|
||||
def build_kg(
    pages: list[PageText],
    annotated_docs: list[lx.data.AnnotatedDocument],
    source_doc_id: str,
) -> tuple[list[dict], list[dict]]:
    """Turn per-page LangExtract output into knowledge-graph nodes and edges.

    ``pages`` and ``annotated_docs`` are walked in lockstep. Extractions whose
    alignment status is not in ``ACCEPTED_ALIGNMENTS`` are ignored. Nodes are
    de-duplicated on (lower-cased name, type); the first mention supplies the
    node's attributes. Every pair of nodes seen on the same page is linked by
    one ``CO_OCCURS_IN`` edge for that page.

    Args:
        pages: Page objects; only ``page_idx`` is read.
        annotated_docs: LangExtract results, one per page.
        source_doc_id: Document ID stamped on every node and edge.

    Returns:
        ``(nodes, edges)`` as lists of plain dicts.
    """
    # Phase 1: flatten accepted extractions into one list of mentions.
    mentions: list[dict] = []
    for page, doc in zip(pages, annotated_docs):
        for ext in doc.extractions or ():
            status = None
            if ext.alignment_status:
                status = ext.alignment_status.value
            if status not in ACCEPTED_ALIGNMENTS:
                continue
            mentions.append({
                "name": ext.extraction_text,
                "type": ext.extraction_class,
                "char_start": ext.char_interval.start_pos if ext.char_interval else None,
                "char_end": ext.char_interval.end_pos if ext.char_interval else None,
                "confidence": status,
                "page": page.page_idx,
                "source_doc": source_doc_id,
            })

    # Phase 2: one node per (lower-cased name, type); remember its pages.
    index_by_key: dict[tuple[str, str], int] = {}
    nodes: list[dict] = []
    pages_by_node: dict[int, set[int]] = defaultdict(set)

    for mention in mentions:
        key = (mention["name"].lower(), mention["type"])
        idx = index_by_key.get(key)
        if idx is None:
            idx = len(nodes)
            index_by_key[key] = idx
            prefix = mention["type"].lower()[:4]
            slug = mention["name"].lower().replace(" ", "")[:12]
            nodes.append({
                # The positional suffix keeps IDs unique even when slugs collide.
                "id": f"{prefix}_{slug}_{idx}",
                "name": mention["name"],
                "type": mention["type"],
                "source_doc": mention["source_doc"],
                "char_start": mention["char_start"],
                "char_end": mention["char_end"],
                "confidence": mention["confidence"],
                "page": mention["page"],
            })
        pages_by_node[idx].add(mention["page"])

    # Phase 3: invert to page -> nodes, then link every pair on a page.
    nodes_on_page: dict[int, list[int]] = defaultdict(list)
    for idx, page_set in pages_by_node.items():
        for page_idx in page_set:
            nodes_on_page[page_idx].append(idx)

    edges: list[dict] = []
    emitted: set[tuple] = set()

    for page_idx in sorted(nodes_on_page):
        members = nodes_on_page[page_idx]
        for pos, left in enumerate(members):
            for right in members[pos + 1:]:
                first_id = nodes[left]["id"]
                second_id = nodes[right]["id"]
                # Edges are undirected: store endpoints in sorted order.
                src, tgt = min(first_id, second_id), max(first_id, second_id)
                key = (src, tgt, source_doc_id, page_idx)
                if key not in emitted:
                    emitted.add(key)
                    edges.append({
                        "source": src,
                        "target": tgt,
                        "relation": "CO_OCCURS_IN",
                        "doc_id": source_doc_id,
                        "page": page_idx,
                    })

    return nodes, edges
|
||||
|
||||
|
||||
def extractions_to_records(
    pages: list[PageText],
    annotated_docs: list[lx.data.AnnotatedDocument],
    doc_id: str,
) -> list[dict]:
    """Flatten LangExtract results into one plain dict per extraction.

    Unlike ``build_kg`` this applies no alignment filter: every extraction is
    emitted, with ``alignment`` set to the status value or ``None``.

    Args:
        pages: Page objects; only ``page_idx`` is read.
        annotated_docs: LangExtract results, one per page.
        doc_id: Document ID stamped on every record.

    Returns:
        Records with keys ``text``, ``type``, ``char_start``, ``char_end``,
        ``alignment``, ``page`` and ``doc_id``.
    """
    return [
        {
            "text": ext.extraction_text,
            "type": ext.extraction_class,
            "char_start": ext.char_interval.start_pos if ext.char_interval else None,
            "char_end": ext.char_interval.end_pos if ext.char_interval else None,
            "alignment": ext.alignment_status.value if ext.alignment_status else None,
            "page": page.page_idx,
            "doc_id": doc_id,
        }
        for page, doc in zip(pages, annotated_docs)
        # A document without extractions (None or empty) contributes nothing.
        for ext in (doc.extractions or ())
    ]
|
||||
217
backend/pipeline/qa_agent.py
Normal file
217
backend/pipeline/qa_agent.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""
|
||||
QA Agent — LangGraph ReAct agent over the knowledge graph.
|
||||
Independent implementation for the GraphRAG Studio backend.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import networkx as nx
|
||||
from dotenv import load_dotenv
|
||||
from langchain.tools import tool
|
||||
from langchain_openai import ChatOpenAI
|
||||
from langchain_core.messages import HumanMessage, AIMessage, ToolMessage, SystemMessage
|
||||
from langgraph.prebuilt import create_react_agent
|
||||
|
||||
# Load backend/.env at import time. override=True lets values from the file
# replace variables that are already set in the process environment.
load_dotenv(Path(__file__).parent.parent / ".env", override=True)

# DeepSeek credentials and endpoint, read once when the module is imported.
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")
|
||||
|
||||
|
||||
def build_kg_graph(nodes: list[dict], edges: list[dict]) -> nx.Graph:
    """Load KG node and edge dicts into an undirected NetworkX graph.

    Args:
        nodes: Node dicts; ``id`` becomes the graph key and every field
            (including ``id``) is stored as a node attribute.
        edges: Edge dicts; ``source``/``target`` pick the endpoints and all
            remaining fields are stored as edge attributes.

    Returns:
        The populated ``nx.Graph``.
    """
    graph = nx.Graph()
    for node in nodes:
        graph.add_node(node["id"], **node)
    endpoint_keys = ("source", "target")
    for edge in edges:
        attributes = {
            field: value for field, value in edge.items() if field not in endpoint_keys
        }
        graph.add_edge(edge["source"], edge["target"], **attributes)
    return graph
|
||||
|
||||
|
||||
def make_tools(G: nx.Graph) -> list:
    """Create the LangChain tools that let the QA agent query graph ``G``.

    The inner docstrings are kept verbatim: ``@tool`` uses a function's
    docstring as the tool description shown to the model.

    Node attributes are read with ``.get`` throughout, and IDs come from the
    graph key, because ``G`` may contain attribute-less nodes (``add_edge``
    creates missing endpoints implicitly).

    Args:
        G: Knowledge graph whose nodes carry ``name``/``type``/``page`` data.

    Returns:
        ``[search_entities, get_neighbors, get_entities_by_type, describe_graph]``.
    """

    @tool
    def search_entities(query: str) -> str:
        """Search knowledge graph entities by name (case-insensitive substring).
        Args:
            query: Keyword to search for in entity names.
        """
        q = query.lower()
        matches = [
            (nid, data) for nid, data in G.nodes(data=True)
            if q in data.get("name", "").lower()
        ]
        if not matches:
            sample = ", ".join(d.get("name", "") for _, d in list(G.nodes(data=True))[:8])
            return f"No entities found matching '{query}'. Sample: {sample}"
        lines = [f"Found {len(matches)} entity(ies) matching '{query}':"]
        for nid, m in matches[:15]:
            lines.append(
                f" [{m.get('type','?')}] \"{m.get('name','?')}\" "
                f"(confidence={m.get('confidence','?')}, page={m.get('page',0)}, id={nid})"
            )
        return "\n".join(lines)

    @tool
    def get_neighbors(entity_name: str, hops: int = 1) -> str:
        """Get N-hop neighbors of an entity in the knowledge graph.
        Args:
            entity_name: Entity name (partial match).
            hops: Number of hops (1-3, default 1).
        """
        hops = max(1, min(int(hops), 3))
        needle = entity_name.lower()
        candidates = [
            (nid, d) for nid, d in G.nodes(data=True)
            if needle in d.get("name", "").lower()
        ]
        if not candidates:
            return f"Entity '{entity_name}' not found. Use search_entities first."
        # Prefer an exact (case-insensitive) name match; otherwise fall back to
        # the first substring hit, so "AI" does not resolve to "OpenAI" merely
        # because that node was inserted earlier.
        node_id, node_data = next(
            (c for c in candidates if c[1].get("name", "").lower() == needle),
            candidates[0],
        )
        reachable = nx.single_source_shortest_path_length(G, node_id, cutoff=hops)
        by_hop: dict[int, list] = {}
        for nid, dist in reachable.items():
            if dist > 0:  # distance 0 is the start node itself
                by_hop.setdefault(dist, []).append(G.nodes[nid])
        lines = [
            f"Neighbors of '{node_data.get('name','?')}' "
            f"[{node_data.get('type','?')}] within {hops} hop(s):"
        ]
        for hop in sorted(by_hop):
            hop_nodes = by_hop[hop]
            lines.append(f"\n Hop {hop} — {len(hop_nodes)} related entities:")
            for n in hop_nodes[:20]:
                lines.append(f" [{n.get('type','?')}] {n.get('name','?')}")
            if len(hop_nodes) > 20:
                lines.append(f" ... and {len(hop_nodes)-20} more")
        lines.append(f"\n Total related entities: {sum(len(v) for v in by_hop.values())}")
        return "\n".join(lines)

    @tool
    def get_entities_by_type(entity_type: str) -> str:
        """List all entities of a specific type.
        Args:
            entity_type: TECHNOLOGY, CONCEPT, PERSON, ORGANIZATION, or LOCATION.
        """
        t_upper = entity_type.strip().upper()
        valid = {"TECHNOLOGY", "CONCEPT", "PERSON", "ORGANIZATION", "LOCATION"}
        if t_upper not in valid:
            present = sorted({d.get("type","") for _, d in G.nodes(data=True)})
            return f"Unknown type '{entity_type}'. Present: {present}"
        matches = [(nid, d) for nid, d in G.nodes(data=True) if d.get("type","") == t_upper]
        if not matches:
            return f"No {t_upper} entities found."
        lines = [f"Found {len(matches)} {t_upper} entities:"]
        for nid, m in matches[:30]:
            lines.append(f" \"{m.get('name','?')}\" (page={m.get('page',0)}, id={nid})")
        if len(matches) > 30:
            lines.append(f" ... and {len(matches)-30} more")
        return "\n".join(lines)

    @tool
    def describe_graph() -> str:
        """Get an overview of the knowledge graph statistics."""
        n_nodes = G.number_of_nodes()
        n_edges = G.number_of_edges()
        type_counts: dict[str, int] = {}
        for _, d in G.nodes(data=True):
            t = d.get("type", "UNKNOWN")
            type_counts[t] = type_counts.get(t, 0) + 1
        lines = [
            "Knowledge Graph Overview:",
            f" Nodes: {n_nodes}",
            f" Edges: {n_edges}",
            f" Entity types: {type_counts}",
        ]
        if n_nodes > 0:
            centrality = nx.degree_centrality(G)
            top5 = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:5]
            lines.append(" Top 5 central nodes:")
            for nid, c in top5:
                nd = G.nodes[nid]
                lines.append(f" [{nd.get('type','?')}] {nd.get('name','?')} (centrality={c:.3f})")
        return "\n".join(lines)

    return [search_entities, get_neighbors, get_entities_by_type, describe_graph]
|
||||
|
||||
|
||||
def run_qa(
    question: str,
    history: list[dict],
    nodes: list[dict],
    edges: list[dict],
) -> dict:
    """Answer ``question`` with a ReAct agent that can query the knowledge graph.

    Args:
        question: The current user question.
        history: Earlier turns; only the last 8 entries are replayed.
        nodes: KG node dicts (each needs an ``id``).
        edges: KG edge dicts (each needs ``source`` and ``target``).

    Returns:
        Dict with ``answer`` (content of the last AI message that made no tool
        calls, or ``""``), ``tool_calls`` (one record per call, inputs and
        outputs stringified) and ``cited_nodes`` (sorted node IDs that appear
        as ``id=...`` in tool outputs).

    Raises:
        ValueError: If ``DEEPSEEK_API_KEY`` is empty.
    """
    if not DEEPSEEK_API_KEY:
        raise ValueError("DEEPSEEK_API_KEY not set in backend/.env")

    G = build_kg_graph(nodes, edges)
    tools = make_tools(G)

    llm = ChatOpenAI(
        model="deepseek-chat",
        api_key=DEEPSEEK_API_KEY,
        base_url=DEEPSEEK_BASE_URL,
        temperature=0,
    )

    system_prompt = (
        "You are a helpful assistant with access to a knowledge graph (KG) built from the user's documents.\n"
        "\n"
        "Guidelines:\n"
        "- If the question is clearly unrelated to the KG (greetings, math, general knowledge, etc.), "
        "answer directly WITHOUT using any tools.\n"
        "- If the question might be answered by the KG (topics related to entities in the documents), "
        "use the tools to search and explore before answering.\n"
        "- When you DO use the KG, cite the entity names and types you found.\n"
        "- If the KG has no relevant information, say so honestly and answer from general knowledge if possible.\n"
        "\n"
        "Available tools: search entities by name, get neighbors, list entities by type, get graph overview."
    )

    agent = create_react_agent(llm, tools, prompt=system_prompt)

    # Build messages: history + current question. The system prompt is passed
    # to the agent via ``prompt=`` above, not as a message here.
    # NOTE(review): only role == "human" is treated as a user turn; confirm the
    # client never sends "user", which would be replayed as an AI message.
    messages: list = []
    for msg in history[-8:]:
        role = msg.get("role", "human")
        content = msg.get("content", "") or msg.get("answer", "")
        if role == "human":
            messages.append(HumanMessage(content=msg.get("question", content)))
        else:
            messages.append(AIMessage(content=content))
    messages.append(HumanMessage(content=question))

    result = agent.invoke({"messages": messages})
    all_messages = result.get("messages", [])

    # The answer is the last AI message that carries text and no tool calls.
    answer = ""
    for msg in reversed(all_messages):
        if isinstance(msg, AIMessage) and msg.content and not msg.tool_calls:
            answer = msg.content
            break

    # Index tool outputs by call ID once instead of rescanning per tool call.
    outputs_by_call_id: dict = {}
    for msg in all_messages:
        if isinstance(msg, ToolMessage):
            outputs_by_call_id.setdefault(msg.tool_call_id, msg.content)

    tool_calls = []
    cited_node_ids: set[str] = set()
    step = 0
    for msg in all_messages:
        if not (isinstance(msg, AIMessage) and msg.tool_calls):
            continue
        for tc in msg.tool_calls:
            step += 1
            output = outputs_by_call_id.get(tc.get("id"), "")
            tool_calls.append({
                "step": step,
                "tool_name": tc.get("name", ""),
                "tool_input": str(tc.get("args", {})),
                "tool_output": str(output),
            })
            # The tools print node IDs as "id=<node id>"; collect them.
            cited_node_ids.update(re.findall(r'\bid=([^\s,\)\]]+)', str(output)))

    return {
        "answer": answer,
        "tool_calls": tool_calls,
        # Sorted so the response does not depend on set iteration order.
        "cited_nodes": sorted(cited_node_ids),
    }
|
||||
107
backend/pipeline/text_assembler.py
Normal file
107
backend/pipeline/text_assembler.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
|
||||
Text Assembler — MinerU content_list.json → per-page plain text.
|
||||
Independent implementation for the GraphRAG Studio backend.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
@dataclasses.dataclass
class BlockSpan:
    """Where one content-list block landed inside its page's assembled text."""

    # Position of the block in the original content list.
    block_index: int
    # The block's "type" value; assemble_pages only emits "text" and "table".
    block_type: str
    # Page the block belongs to.
    page_idx: int
    # Offset of the block's first character within the page text.
    char_start: int
    # Offset just past the block's last character within the page text.
    char_end: int
    # The block's "bbox" value; assemble_pages uses [0, 0, 0, 0] when absent.
    bbox: list
|
||||
|
||||
|
||||
@dataclasses.dataclass
class PageText:
    """Assembled plain text of one page plus the spans of its source blocks."""

    # Page index taken from the blocks' "page_idx" field.
    page_idx: int
    # Block texts of the page joined by newlines.
    text: str
    # One BlockSpan per block that contributed text, in content-list order.
    block_spans: list[BlockSpan]
|
||||
|
||||
|
||||
def html_table_to_text(table_body: str) -> str:
    """Render an HTML table as plain text, one line per ``<tr>``.

    Cell texts (``<td>`` and ``<th>``, whitespace-stripped) are joined with
    ``" | "``; rows are joined with newlines.
    """
    document = BeautifulSoup(table_body, "html.parser")
    return "\n".join(
        " | ".join(cell.get_text(strip=True) for cell in row.find_all(["td", "th"]))
        for row in document.find_all("tr")
    )
|
||||
|
||||
|
||||
def load_content_list(path: Path) -> list[dict]:
    """Load a MinerU ``content_list.json`` from a file or a directory.

    Args:
        path: Either the JSON file itself or a directory containing it. For a
            directory, files matching ``*_content_list.json`` are preferred,
            then ``*content_list.json``.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If ``path`` is a directory with no matching file.
    """
    if path.is_dir():
        # glob() yields entries in arbitrary order; sort so the choice is
        # deterministic when several files match.
        matches = sorted(path.glob("*_content_list.json")) or sorted(
            path.glob("*content_list.json")
        )
        if not matches:
            raise FileNotFoundError(f"No content_list.json found in {path}")
        path = matches[0]
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
def assemble_pages(content_list: list[dict]) -> list[PageText]:
    """Group content-list blocks by page and build each page's plain text.

    Only ``text`` and ``table`` blocks contribute; tables are flattened with
    ``html_table_to_text``. Blocks with no text are skipped. Block texts are
    joined with a single newline, and for every recorded span
    ``page.text[span.char_start:span.char_end]`` is exactly that block's text.

    Args:
        content_list: Block dicts; ``page_idx`` defaults to 0 when missing.

    Returns:
        One ``PageText`` per page index, in ascending page order.
    """
    grouped: dict[int, list[tuple[int, dict]]] = defaultdict(list)
    for i, block in enumerate(content_list):
        grouped[block.get("page_idx", 0)].append((i, block))

    result = []
    for page_idx in sorted(grouped):
        parts = []
        spans = []
        cursor = 0

        for block_index, block in grouped[page_idx]:
            block_type = block.get("type", "unknown")

            # ``or ""`` guards against an explicit null value in the JSON.
            if block_type == "text":
                block_text = (block.get("text") or "").rstrip()
            elif block_type == "table":
                table_body = block.get("table_body") or ""
                block_text = html_table_to_text(table_body) if table_body else ""
            else:
                continue

            if not block_text:
                continue

            char_end = cursor + len(block_text)
            spans.append(BlockSpan(
                block_index=block_index,
                block_type=block_type,
                page_idx=page_idx,
                char_start=cursor,
                char_end=char_end,
                bbox=block.get("bbox", [0, 0, 0, 0]),
            ))
            parts.append(block_text)
            cursor = char_end + 1  # +1 for the newline separator

        # Joining (rather than appending "\n" and stripping at the end) removes
        # only the separator after the last block, so a block that itself ends
        # in newlines keeps them and its span stays inside the text.
        text = "\n".join(parts)
        result.append(PageText(page_idx=page_idx, text=text, block_spans=spans))

    return result
|
||||
|
||||
|
||||
def count_blocks_by_type(content_list: list[dict]) -> dict[str, int]:
    """Count content-list blocks per ``type`` value.

    Blocks without a ``type`` key are counted under ``"unknown"``.
    """
    tally: dict[str, int] = {}
    for block in content_list:
        kind = block.get("type", "unknown")
        tally[kind] = tally.get(kind, 0) + 1
    return tally
|
||||
22
backend/pyproject.toml
Normal file
22
backend/pyproject.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[project]
|
||||
name = "graphrag-studio-backend"
|
||||
version = "1.0.0"
|
||||
description = "GraphRAG Studio — FastAPI backend service"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"fastapi>=0.104.0",
|
||||
"uvicorn[standard]>=0.24.0",
|
||||
"python-multipart>=0.0.6",
|
||||
"langextract[all]>=0.1.0",
|
||||
"langchain>=0.2.0",
|
||||
"langchain-openai>=0.1.0",
|
||||
"langgraph>=0.1.0",
|
||||
"networkx>=3.0",
|
||||
"python-dotenv>=1.0.0",
|
||||
"requests>=2.31.0",
|
||||
"beautifulsoup4>=4.12.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
0
backend/routers/__init__.py
Normal file
0
backend/routers/__init__.py
Normal file
71
backend/routers/documents.py
Normal file
71
backend/routers/documents.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""A 组:文档管理(4 个端点)"""
|
||||
from fastapi import APIRouter, File, Form, HTTPException, UploadFile
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse
|
||||
from services import document_service as svc
|
||||
|
||||
router = APIRouter(prefix="/documents", tags=["Documents"])
|
||||
|
||||
|
||||
@router.post("/upload", status_code=200)
async def upload_document(
    file: UploadFile = File(...),
    language: str = Form("ch"),
    enable_formula: bool = Form(True),
    enable_table: bool = Form(True),
):
    """Validate and store an uploaded document.

    Responds 400 with the validator's code and message when the upload is
    rejected; otherwise returns the saved document record without its internal
    ``upload_filename`` field.
    """
    payload = await file.read()
    is_valid, err_code, err_msg = svc.validate_upload(file.filename or "", len(payload))
    if is_valid:
        record = svc.save_upload(
            file.filename or "upload", payload, language, enable_formula, enable_table
        )
        # Internal field: not part of the API response.
        record.pop("upload_filename", None)
        return APIResponse.ok(record)
    return JSONResponse(
        status_code=400,
        content=APIResponse.err(err_code, err_msg).model_dump(),
    )
|
||||
|
||||
|
||||
@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return one document record, or 404 (code 2001) when it is unknown."""
    record = svc.get_document(doc_id)
    if record:
        # Internal field: not part of the API response.
        record.pop("upload_filename", None)
        return APIResponse.ok(record)
    not_found = APIResponse.err(2001, f"Document '{doc_id}' not found")
    return JSONResponse(status_code=404, content=not_found.model_dump())
|
||||
|
||||
|
||||
@router.get("")
async def list_documents(
    page: int = 1,
    page_size: int = 20,
    status: str | None = None,
    format: str | None = None,
):
    """List documents with pagination and optional status/format filters.

    ``page`` is clamped to at least 1 and ``page_size`` to 1..100, so zero or
    negative values from the query string never reach the service layer.
    """
    page = max(page, 1)
    page_size = max(1, min(page_size, 100))
    result = svc.list_documents(page, page_size, status, format)
    for item in result["items"]:
        # Internal field: not part of the API response.
        item.pop("upload_filename", None)
    return APIResponse.ok(result)
|
||||
|
||||
|
||||
@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete a document and report how much of the KG was removed with it.

    Responds 404 (code 2001) when the document is unknown. ``deleted``
    reflects the success flag returned by the service instead of always being
    ``True``.
    """
    doc = svc.get_document(doc_id)
    if not doc:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2001, f"Document '{doc_id}' not found").model_dump(),
        )
    ok, removed_nodes, removed_edges = svc.delete_document(doc_id)
    return APIResponse.ok({
        "deleted": bool(ok),
        "doc_id": doc_id,
        "removed_nodes": removed_nodes,
        "removed_edges": removed_edges,
    })
|
||||
70
backend/routers/indexing.py
Normal file
70
backend/routers/indexing.py
Normal file
@@ -0,0 +1,70 @@
|
||||
"""B 组:Indexing Pipeline(4 个端点)"""
|
||||
from fastapi import APIRouter
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse, StartIndexRequest
|
||||
from services import document_service as doc_svc
|
||||
from services import indexing_service as idx_svc
|
||||
|
||||
router = APIRouter(prefix="/index", tags=["Indexing"])
|
||||
|
||||
|
||||
@router.post("/start", status_code=202)
async def start_indexing(body: StartIndexRequest):
    """Start an indexing job for a document.

    Responds 404 (code 2001) when the document is unknown; otherwise returns
    the new job's ID, document ID, status, stage and creation time.
    """
    if not doc_svc.get_document(body.doc_id):
        missing = APIResponse.err(2001, f"Document '{body.doc_id}' not found")
        return JSONResponse(status_code=404, content=missing.model_dump())
    job = idx_svc.start_indexing(body.doc_id)
    exposed_fields = ("job_id", "doc_id", "status", "stage", "created_at")
    return APIResponse.ok({field: job[field] for field in exposed_fields})
|
||||
|
||||
|
||||
@router.get("/status/{job_id}")
async def get_job_status(job_id: str):
    """Return the status metadata of a job, or 404 (code 2002) when unknown."""
    job = idx_svc.get_job_status(job_id)
    if job:
        return APIResponse.ok(job)
    missing = APIResponse.err(2002, f"Job '{job_id}' not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/result/{job_id}")
async def get_job_result(job_id: str):
    """Return the result of an indexing job.

    Responds 404 (code 2002) when the job is unknown and 400 (code 2003) when
    its status is not ``done`` and the result carries no ``stats`` yet.
    """
    outcome = idx_svc.get_job_result(job_id)
    if not outcome:
        missing = APIResponse.err(2002, f"Job '{job_id}' not found")
        return JSONResponse(status_code=404, content=missing.model_dump())
    # A result is ready when the job is done or stats are already present.
    is_ready = outcome.get("status") == "done" or "stats" in outcome
    if is_ready:
        return APIResponse.ok(outcome)
    pending = APIResponse.err(
        2003, f"Job '{job_id}' is still running (status={outcome.get('status')})"
    )
    return JSONResponse(status_code=400, content=pending.model_dump())
|
||||
|
||||
|
||||
@router.delete("/jobs/{job_id}")
async def cancel_job(job_id: str):
    """Cancel an indexing job.

    Responds 404 (code 2002) when the job is unknown. ``cancelled`` reflects
    the success flag returned by the service instead of always being ``True``.
    """
    meta = idx_svc.get_job_status(job_id)
    if not meta:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2002, f"Job '{job_id}' not found").model_dump(),
        )
    ok, prev_status = idx_svc.cancel_job(job_id)
    return APIResponse.ok({
        "cancelled": bool(ok),
        "job_id": job_id,
        "previous_status": prev_status,
    })
|
||||
72
backend/routers/kg.py
Normal file
72
backend/routers/kg.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""C 组:知识图谱(6 个端点)"""
|
||||
from fastapi import APIRouter
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse
|
||||
from services import kg_service as svc
|
||||
|
||||
router = APIRouter(prefix="/kg", tags=["Knowledge Graph"])
|
||||
|
||||
|
||||
@router.get("/nodes")
async def list_nodes(
    type: str | None = None,
    doc_id: str | None = None,
    confidence: str | None = None,
    page: int = 1,
    page_size: int = 50,
):
    """List KG nodes with pagination and optional filters.

    ``page`` is clamped to at least 1 and ``page_size`` to 1..200. An
    unfiltered request that finds no nodes at all responds 400 (code 3002);
    a filtered request with no hits returns the empty result normally.
    """
    page = max(page, 1)
    page_size = max(1, min(page_size, 200))
    result = svc.get_nodes(page, page_size, type, doc_id, confidence)
    if result["total"] == 0 and not any([type, doc_id, confidence]):
        return JSONResponse(
            status_code=400,
            content=APIResponse.err(3002, "Knowledge graph is empty. Index documents first.").model_dump(),
        )
    return APIResponse.ok(result)
|
||||
|
||||
|
||||
@router.get("/edges")
async def list_edges(
    doc_id: str | None = None,
    relation: str | None = None,
    page: int = 1,
    page_size: int = 100,
):
    """List KG edges with pagination and optional document/relation filters.

    ``page`` is clamped to at least 1 and ``page_size`` to 1..500.
    """
    page = max(page, 1)
    page_size = max(1, min(page_size, 500))
    result = svc.get_edges(page, page_size, doc_id, relation)
    return APIResponse.ok(result)
|
||||
|
||||
|
||||
@router.get("/nodes/{node_id}")
async def get_node_detail(node_id: str):
    """Return one KG node, or 404 (code 3001) when it is unknown."""
    detail = svc.get_node_detail(node_id)
    if detail:
        return APIResponse.ok(detail)
    missing = APIResponse.err(3001, f"Node '{node_id}' not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/nodes/{node_id}/neighbors")
async def get_node_neighbors(node_id: str, hops: int = 1):
    """Return the neighbourhood of a node, or 404 (code 3001) when unknown.

    Only a ``None`` result counts as "not found"; an empty result is returned
    as a normal response.
    """
    neighborhood = svc.get_neighbors(node_id, hops)
    if neighborhood is not None:
        return APIResponse.ok(neighborhood)
    missing = APIResponse.err(3001, f"Node '{node_id}' not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/stats")
async def get_kg_stats():
    """Return knowledge-graph statistics from the KG service."""
    return APIResponse.ok(svc.get_stats())
|
||||
|
||||
|
||||
@router.get("/export")
async def export_kg(format: str = "json", doc_id: str | None = None):
    """Export the knowledge graph, optionally restricted to one document.

    ``format`` is accepted but not used here; the service is called with
    ``doc_id`` only.
    """
    exported = svc.export_kg(doc_id)
    return APIResponse.ok(exported)
|
||||
66
backend/routers/query.py
Normal file
66
backend/routers/query.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""D 组:QA 问答(4 个端点)"""
|
||||
import asyncio
|
||||
from functools import partial
|
||||
|
||||
from fastapi import APIRouter
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse, BatchQueryRequest, QueryRequest
|
||||
from services import qa_service as svc
|
||||
|
||||
router = APIRouter(prefix="/query", tags=["QA"])
|
||||
|
||||
|
||||
@router.post("")
async def run_query(body: QueryRequest):
    """Answer a question against the knowledge graph.

    The QA call runs in a worker thread via ``asyncio.to_thread`` so it does
    not occupy the event loop. A ``ValueError`` whose text contains
    ``KG_EMPTY`` becomes 400 (code 3002); any other error becomes 500
    (code 4001).
    """
    try:
        history = [m.model_dump() for m in body.history]
        result = await asyncio.to_thread(svc.run_query, body.question, history)
        return APIResponse.ok(result)
    except ValueError as e:
        if "KG_EMPTY" in str(e):
            return JSONResponse(
                status_code=400,
                content=APIResponse.err(3002, "Knowledge graph is empty. Index documents first.").model_dump(),
            )
        return JSONResponse(
            status_code=500,
            content=APIResponse.err(4001, str(e)).model_dump(),
        )
    except Exception as e:
        # Top-level boundary: report every other failure as a 500 response.
        return JSONResponse(
            status_code=500,
            content=APIResponse.err(4001, f"QA service error: {e}").model_dump(),
        )
|
||||
|
||||
|
||||
@router.post("/batch", status_code=202)
async def start_batch(body: BatchQueryRequest):
    """Start a batch of questions; more than 20 is rejected with 400 (code 1001)."""
    if len(body.questions) <= 20:
        return APIResponse.ok(svc.start_batch(body.questions))
    too_many = APIResponse.err(1001, "Maximum 20 questions per batch")
    return JSONResponse(status_code=400, content=too_many.model_dump())
|
||||
|
||||
|
||||
@router.get("/batch/{batch_id}")
async def get_batch_result(batch_id: str):
    """Return a batch's result, or 404 (code 2002) when the batch is unknown."""
    batch = svc.get_batch_result(batch_id)
    if batch:
        return APIResponse.ok(batch)
    missing = APIResponse.err(2002, f"Batch '{batch_id}' not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/history")
async def get_query_history(page: int = 1, page_size: int = 20):
    """Return paginated query history.

    ``page`` is clamped to at least 1 and ``page_size`` to 1..50.
    """
    page = max(page, 1)
    page_size = max(1, min(page_size, 50))
    result = svc.get_history(page, page_size)
    return APIResponse.ok(result)
|
||||
43
backend/routers/search.py
Normal file
43
backend/routers/search.py
Normal file
@@ -0,0 +1,43 @@
|
||||
"""E 组:搜索(3 个端点)"""
|
||||
from fastapi import APIRouter, Query, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from models.schemas import APIResponse
|
||||
from services import search_service as svc
|
||||
|
||||
router = APIRouter(prefix="/search", tags=["Search"])
|
||||
|
||||
|
||||
@router.get("/entities")
async def search_entities(q: str, type: str | None = None, limit: int = 15):
    """Search entities by keyword, optionally restricted to one type.

    ``limit`` is clamped to 1..100 so zero or negative values never reach the
    service layer.
    """
    limit = max(1, min(limit, 100))
    result = svc.search_entities(q, type, limit)
    return APIResponse.ok(result)
|
||||
|
||||
|
||||
@router.get("/path")
async def search_path(request: Request, max_hops: int = 3):
    """Find a path between two nodes given as ``from`` and ``to`` query params.

    Responds 400 (code 1001) when either parameter is missing or empty and
    404 (code 3001) when the service returns ``None``. ``max_hops`` is clamped
    to 1..5.
    """
    # 'from' is a Python keyword, so both endpoints are read from the raw query.
    query = dict(request.query_params)
    start_id, end_id = query.get("from"), query.get("to")

    if not (start_id and end_id):
        bad_request = APIResponse.err(1001, "Parameters 'from' and 'to' are required")
        return JSONResponse(status_code=400, content=bad_request.model_dump())

    hop_limit = max(1, min(max_hops, 5))
    found = svc.search_path(start_id, end_id, hop_limit)
    if found is not None:
        return APIResponse.ok(found)
    missing = APIResponse.err(3001, "One or both nodes not found")
    return JSONResponse(status_code=404, content=missing.model_dump())
|
||||
|
||||
|
||||
@router.get("/graph")
async def search_graph(q: str, include_neighbors: bool = False):
    """Search the graph for ``q``, passing ``include_neighbors`` to the service."""
    return APIResponse.ok(svc.search_graph(q, include_neighbors))
|
||||
171
backend/routers/system.py
Normal file
171
backend/routers/system.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""F 组:系统(4 个端点)"""
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from models.schemas import APIResponse
|
||||
from storage import file_store as fs
|
||||
|
||||
router = APIRouter(tags=["System"])
|
||||
|
||||
_START_TIME = time.time()
|
||||
|
||||
|
||||
@router.get("/health")
async def health_check():
    """Report the health of the backend's components.

    Checks that the MinerU interpreter exists, that ``langextract`` can be
    imported by the backend venv's interpreter, that a DeepSeek key is
    configured, and which storage paths exist. Overall status is ``healthy``
    only when every component reports ``ok``, otherwise ``degraded``.
    """
    import asyncio
    import subprocess

    from dotenv import load_dotenv

    env_path = Path(__file__).parent.parent / ".env"
    # override=False: variables already in the environment are left untouched.
    load_dotenv(env_path, override=False)

    mineru_python = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe"))
    backend_python = Path(__file__).parent.parent / ".venv" / "Scripts" / "python.exe"
    deepseek_key = os.getenv("DEEPSEEK_API_KEY", "")
    deepseek_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")

    # Check if langextract is importable from backend's venv. The probe can
    # take up to 10 s, so it runs in a worker thread instead of blocking the
    # event loop for every other request.
    try:
        probe = await asyncio.to_thread(
            subprocess.run,
            [str(backend_python), "-c", "import langextract; print('ok')"],
            capture_output=True,
            text=True,
            timeout=10,
        )
        langextract_ok = probe.returncode == 0 and "ok" in probe.stdout
    except Exception:
        # Best effort: a missing interpreter or a timeout only marks the
        # component as failed; the health endpoint itself must not error.
        langextract_ok = False

    mineru_exists = mineru_python.exists()
    components = {
        "mineru_venv": {
            "status": "ok" if mineru_exists else "error",
            "path": str(mineru_python),
            "exists": mineru_exists,
        },
        "langextract_venv": {
            "status": "ok" if langextract_ok else "error",
            "path": str(backend_python),
            "exists": backend_python.exists(),
        },
        "deepseek_api": {
            "status": "ok" if deepseek_key else "error",
            "base_url": deepseek_url,
            "key_configured": bool(deepseek_key),
        },
        "storage": {
            "status": "ok",
            "kg_nodes_exists": fs.kg_nodes_path().exists(),
            "kg_edges_exists": fs.kg_edges_path().exists(),
            "uploads_dir_exists": fs.UPLOADS_DIR.exists(),
        },
    }

    overall = "healthy" if all(c["status"] == "ok" for c in components.values()) else "degraded"

    return APIResponse.ok({
        "status": overall,
        "version": "1.0.0",
        "uptime_seconds": round(time.time() - _START_TIME, 1),
        "components": components,
    })
|
||||
|
||||
|
||||
@router.get("/system/stats")
async def system_stats():
    """Return aggregate counts for documents, the KG, queries and storage."""
    from services import indexing_service as idx_svc

    documents = list(fs.load_docs_index().values())
    kg_nodes = fs.load_kg_nodes()
    kg_edges = fs.load_kg_edges()
    queries = fs.load_query_history()

    # Node count per entity type; nodes without a type go under "UNKNOWN".
    per_type: dict[str, int] = {}
    for node in kg_nodes:
        label = node.get("type", "UNKNOWN")
        per_type[label] = per_type.get(label, 0) + 1

    statuses = [document.get("status") for document in documents]

    return APIResponse.ok({
        "total_documents": len(documents),
        "indexed_documents": statuses.count("indexed"),
        "failed_documents": statuses.count("failed"),
        "total_nodes": len(kg_nodes),
        "total_edges": len(kg_edges),
        "type_distribution": per_type,
        "total_queries": len(queries),
        "active_jobs": idx_svc.count_active_jobs(),
        "storage_used_mb": fs.storage_used_mb(),
    })
|
||||
|
||||
|
||||
@router.get("/system/formats")
async def list_formats():
    """Return the supported upload formats, OCR languages and usage notes."""
    # (ext, description, max_pages, requires_ocr); every format shares the
    # same 200 MB size limit.
    format_table = [
        ("pdf", "PDF 文档(文本型/扫描型/混合型)", 600, False),
        ("docx", "Microsoft Word(新版)", 600, False),
        ("doc", "Microsoft Word(旧版)", 600, False),
        ("pptx", "PowerPoint(新版)", 600, False),
        ("ppt", "PowerPoint(旧版)", 600, False),
        ("png", "PNG 图片(单页)", 1, True),
        ("jpg", "JPEG 图片(单页)", 1, True),
        ("jpeg", "JPEG 图片(单页)", 1, True),
        ("html", "HTML 文件", 600, False),
    ]
    # (code, display name)
    language_table = [
        ("ch", "中文(默认)"),
        ("en", "英文"),
        ("japan", "日文"),
        ("korean", "韩文"),
        ("french", "法文"),
        ("german", "德文"),
    ]
    return APIResponse.ok({
        "formats": [
            {
                "ext": ext,
                "description": description,
                "max_size_mb": 200,
                "max_pages": max_pages,
                "requires_ocr": requires_ocr,
            }
            for ext, description, max_pages, requires_ocr in format_table
        ],
        "ocr_languages": [
            {"code": code, "name": name} for code, name in language_table
        ],
        "notes": [
            "language 参数默认值为 'ch'(非 'zh'),遵循 PaddleOCR v3 语言代码规范",
            "上传时不需要携带 Content-Type,服务端自动识别",
            "PNG/JPG/JPEG 单次最多处理 1 页",
        ],
    })
|
||||
|
||||
|
||||
@router.get("/system/demo")
async def get_demo_data():
    """Return a demo graph: the backend KG if it has nodes, else the legacy pipeline output.

    Responds with HTTP 400 / error code 3002 when neither source has nodes.
    """
    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()

    if not nodes:
        # Fallback: output files of the older graphrag_pipeline run.
        legacy_nodes_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_nodes.json")
        legacy_edges_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_edges.json")
        if not legacy_nodes_path.exists():
            from fastapi.responses import JSONResponse
            error_body = APIResponse.err(3002, "No demo data available. Index a document first.").model_dump()
            return JSONResponse(status_code=400, content=error_body)

        import json
        nodes = json.loads(legacy_nodes_path.read_text(encoding="utf-8"))
        if legacy_edges_path.exists():
            edges = json.loads(legacy_edges_path.read_text(encoding="utf-8"))
        else:
            edges = []

    # Histogram of node types; nodes without a "type" key count as UNKNOWN.
    type_counts: dict[str, int] = {}
    for node in nodes:
        label = node.get("type", "UNKNOWN")
        type_counts[label] = type_counts.get(label, 0) + 1

    # Build an undirected graph only to compute its density.
    import networkx as nx
    graph = nx.Graph()
    for node in nodes:
        graph.add_node(node["id"])
    for edge in edges:
        graph.add_edge(edge["source"], edge["target"])

    if graph.number_of_nodes() > 1:
        density = round(nx.density(graph), 4)
    else:
        density = 0.0

    stats = {
        "nodes": len(nodes),
        "edges": len(edges),
        "type_counts": type_counts,
        "density": density,
    }
    return APIResponse.ok({"nodes": nodes, "edges": edges, "stats": stats})
|
||||
0
backend/services/__init__.py
Normal file
0
backend/services/__init__.py
Normal file
109
backend/services/document_service.py
Normal file
109
backend/services/document_service.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""Document Service — file upload, metadata CRUD."""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from storage import file_store as fs
|
||||
|
||||
# Lower-case extensions (without the dot) accepted for upload.
ALLOWED_EXTENSIONS = {"pdf", "docx", "doc", "pptx", "ppt", "png", "jpg", "jpeg", "html"}
# Upper bound on a single upload, in mebibytes.
MAX_FILE_SIZE_MB = 200


def validate_upload(filename: str, size_bytes: int) -> tuple[bool, int, str]:
    """Check an upload's name, extension and size.

    Returns ``(ok, error_code, error_msg)``: ``(True, 0, "")`` on success,
    otherwise code 1001 (empty name or name containing a path separator),
    1002 (extension not in ALLOWED_EXTENSIONS) or 1003 (larger than
    MAX_FILE_SIZE_MB).
    """
    if not filename:
        return False, 1001, "Invalid filename"
    if any(separator in filename for separator in ("/", "\\")):
        return False, 1001, "Invalid filename"

    extension = Path(filename).suffix.lower().lstrip(".")
    if extension not in ALLOWED_EXTENSIONS:
        supported = ", ".join(sorted(ALLOWED_EXTENSIONS))
        return False, 1002, f"Unsupported file format: .{extension}. Supported: {supported}"

    megabytes = size_bytes / (1024 * 1024)
    if megabytes > MAX_FILE_SIZE_MB:
        return False, 1003, f"File size {megabytes:.1f}MB exceeds {MAX_FILE_SIZE_MB}MB limit"

    return True, 0, ""
|
||||
|
||||
|
||||
def save_upload(filename: str, content: bytes, language: str = "ch",
                enable_formula: bool = True, enable_table: bool = True) -> dict:
    """Write the uploaded bytes to the uploads directory and register the document.

    The file is stored as ``<doc_id>_<filename>`` where ``doc_id`` is the first
    eight hex characters of a random UUID. Returns the metadata dict that was
    saved to the documents index.
    """
    doc_id = uuid.uuid4().hex[:8]
    extension = Path(filename).suffix.lower().lstrip(".")
    stored_name = f"{doc_id}_{filename}"
    (fs.UPLOADS_DIR / stored_name).write_bytes(content)

    record = {
        "doc_id": doc_id,
        "filename": filename,
        "format": extension,
        "size_bytes": len(content),
        "pages": None,
        "uploaded_at": datetime.now(timezone.utc).isoformat(),
        "status": "uploaded",
        "language": language,
        "enable_formula": enable_formula,
        "enable_table": enable_table,
        # internal: the actual name of the stored file
        "upload_filename": stored_name,
    }
    fs.save_doc(record)
    return record
|
||||
|
||||
|
||||
def get_document(doc_id: str) -> dict | None:
    """Return the stored metadata for ``doc_id`` as provided by the file store."""
    record = fs.get_doc(doc_id)
    return record
|
||||
|
||||
|
||||
def list_documents(page: int = 1, page_size: int = 20,
                   status: str | None = None, fmt: str | None = None) -> dict:
    """Return one page of document records, newest upload first.

    Args:
        page: 1-based page number. Values below 1 are treated as the first page.
        page_size: Items per page. Negative values are treated as 0.
        status: When given, keep only documents with this exact status.
        fmt: When given, keep only documents whose format equals ``fmt.lower()``.

    Returns:
        ``{"total", "page", "page_size", "items"}`` where ``total`` is the
        number of documents after filtering and ``page``/``page_size`` echo
        the arguments.
    """
    items = sorted(
        fs.load_docs_index().values(),
        key=lambda d: d.get("uploaded_at", ""),
        reverse=True,
    )
    if status:
        items = [d for d in items if d.get("status") == status]
    if fmt:
        wanted_format = fmt.lower()
        items = [d for d in items if d.get("format") == wanted_format]

    # Clamp so page <= 0 cannot yield a negative start, which would slice
    # from the tail of the list instead of returning the first page.
    size = max(page_size, 0)
    start = max(page - 1, 0) * size
    return {
        "total": len(items),
        "page": page,
        "page_size": page_size,
        "items": items[start: start + size],
    }
|
||||
|
||||
|
||||
def delete_document(doc_id: str) -> tuple[bool, int, int]:
    """Delete a document, its KG contributions, its upload file and its jobs.

    Returns ``(ok, removed_nodes, removed_edges)``; ``(False, 0, 0)`` when the
    document is not in the index.
    """
    doc = fs.get_doc(doc_id)
    if not doc:
        return False, 0, 0

    # Remove from KG
    removed_nodes, removed_edges = fs.remove_doc_from_kg(doc_id)

    # Remove upload file. An empty/missing upload_filename would resolve to
    # UPLOADS_DIR itself, and unlink() on a directory raises, so only touch
    # the path when it names a regular file.
    upload_filename = doc.get("upload_filename") or ""
    if upload_filename:
        upload_path = fs.UPLOADS_DIR / upload_filename
        if upload_path.is_file():
            upload_path.unlink(missing_ok=True)

    # Remove associated jobs
    for meta in fs.list_all_jobs():
        if meta.get("doc_id") == doc_id:
            fs.delete_job(meta["job_id"])

    # Remove from index (re-read so concurrent index changes are not clobbered
    # by the copy loaded at the top of this function).
    index = fs.load_docs_index()
    index.pop(doc_id, None)
    fs.save_docs_index(index)

    return True, removed_nodes, removed_edges
|
||||
|
||||
|
||||
def update_doc_status(doc_id: str, status: str, pages: int | None = None) -> None:
    """Set a document's status (and optionally its page count) in the index.

    Does nothing when ``doc_id`` is not in the index.
    """
    index = fs.load_docs_index()
    if doc_id not in index:
        return

    entry = index[doc_id]
    entry["status"] = status
    if pages is not None:
        entry["pages"] = pages
    fs.save_docs_index(index)
|
||||
255
backend/services/indexing_service.py
Normal file
255
backend/services/indexing_service.py
Normal file
@@ -0,0 +1,255 @@
|
||||
"""Indexing Service — Pipeline orchestration (parsing → extracting → indexing)."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from storage import file_store as fs
|
||||
from services.document_service import update_doc_status
|
||||
|
||||
# Load backend/.env (the directory above services/) at import time.
# NOTE(review): override=True presumably lets values in .env replace variables
# already present in the process environment — confirm against python-dotenv docs.
load_dotenv(Path(__file__).parent.parent / ".env", override=True)

# Interpreter and entry script used to run MinerU as a subprocess; both can be
# overridden through environment variables, otherwise the hard-coded paths apply.
MINERU_PYTHON = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe"))
MINERU_PIPELINE = Path(os.getenv("MINERU_PIPELINE", "F:/GraphRAGAgent/mineru_mvp/pipeline.py"))

# In-memory registry of active jobs {job_id: threading.Thread}
_active_threads: dict[str, threading.Thread] = {}
# Per-job cancellation requests {job_id: bool}; polled by the pipeline thread.
_cancel_flags: dict[str, bool] = {}
|
||||
|
||||
|
||||
def start_indexing(doc_id: str) -> dict:
    """Create a job record for ``doc_id`` and start the pipeline in a daemon thread.

    Returns the initial job metadata, or ``None`` when the document does not
    exist (the annotation says ``dict``; callers must handle ``None``).
    """
    doc = fs.get_doc(doc_id)
    if not doc:
        return None  # type: ignore

    job_id = "job_" + uuid.uuid4().hex[:8]
    created_at = datetime.now(timezone.utc).isoformat()
    initial_progress = {"parsed_pages": 0, "total_pages": 0, "extracted_entities": 0}

    meta = {
        "job_id": job_id,
        "doc_id": doc_id,
        "status": "submitted",
        "stage": "Job submitted",
        "progress": initial_progress,
        "created_at": created_at,
        "elapsed_seconds": 0.0,
        "error": None,
        "pdf_name": doc["filename"],
        "pdf_path": str(fs.UPLOADS_DIR / doc.get("upload_filename", "")),
    }
    fs.save_job_meta(job_id, meta)

    # Register the flag and the thread before starting it, so the worker's
    # cleanup always finds its own entries.
    _cancel_flags[job_id] = False
    worker = threading.Thread(target=_run_pipeline, args=(job_id,), daemon=True)
    _active_threads[job_id] = worker
    worker.start()

    return meta
|
||||
|
||||
|
||||
def _update_meta(job_id: str, **kwargs) -> None:
    """Merge ``kwargs`` into the job's stored metadata and refresh ``elapsed_seconds``.

    If the metadata no longer exists (for example the job directory was
    deleted while the pipeline was running) the call is a no-op. Previously
    this path fell through to an empty dict and raised ``KeyError`` on
    ``created_at``.
    """
    meta = fs.load_job_meta(job_id)
    if not meta:
        return

    meta.update(kwargs)

    created_at = meta.get("created_at")
    if created_at:
        started = datetime.fromisoformat(created_at)
        elapsed = (datetime.now(timezone.utc) - started).total_seconds()
        meta["elapsed_seconds"] = round(elapsed, 1)

    fs.save_job_meta(job_id, meta)
|
||||
|
||||
|
||||
def _abort_if_cancelled(job_id: str) -> bool:
    """Return True (after recording status "cancelled") when the job's cancel flag is set."""
    if _cancel_flags.get(job_id):
        _update_meta(job_id, status="cancelled", stage="Cancelled")
        return True
    return False


def _run_pipeline(job_id: str) -> None:
    """Run the three-stage pipeline for one job: parse, extract, build the KG.

    Stage 1 runs MinerU as a subprocess and locates its ``*_content_list.json``.
    Stage 2 assembles pages and extracts entities page by page.
    Stage 3 builds the per-document graph, merges it into the global KG and
    writes ``stats.json``.

    The cancel flag is polled before each stage and before every page; any
    exception marks both the job and the document as failed. Registry entries
    for the job are always removed on exit.
    """
    meta = fs.load_job_meta(job_id)
    if not meta:
        # Nothing to run; still drop the registry entries created by
        # start_indexing so they do not linger.
        _active_threads.pop(job_id, None)
        _cancel_flags.pop(job_id, None)
        return

    doc_id = meta["doc_id"]
    pdf_path = Path(meta["pdf_path"])
    job_dir = fs.job_dir(job_id)
    start_time = time.time()

    try:
        # ── Stage 1: parsing ──────────────────────────────────────────────
        if _abort_if_cancelled(job_id):
            return

        _update_meta(job_id, status="parsing", stage="MinerU document parsing...")
        mineru_out_dir = job_dir / "mineru_output"
        mineru_out_dir.mkdir(parents=True, exist_ok=True)

        result = subprocess.run(
            [str(MINERU_PYTHON), str(MINERU_PIPELINE), str(pdf_path)],
            cwd=str(MINERU_PIPELINE.parent),
            capture_output=True,
            text=True,
            timeout=600,
        )

        if result.returncode != 0:
            raise RuntimeError(f"MinerU failed: {result.stderr[:500]}")

        # Find content_list.json in MinerU output
        # MinerU writes output to mineru_mvp/output/{stem}/
        stem = pdf_path.stem
        mineru_default_out = MINERU_PIPELINE.parent / "output" / stem
        content_list_path = None

        if mineru_default_out.exists():
            matches = list(mineru_default_out.glob("*_content_list.json"))
            if matches:
                content_list_path = matches[0]
                # Copy to our job dir
                import shutil
                shutil.copytree(str(mineru_default_out), str(mineru_out_dir), dirs_exist_ok=True)

        if not content_list_path:
            # Fallback: search job mineru_output dir
            matches = list(mineru_out_dir.glob("*_content_list.json"))
            if matches:
                content_list_path = matches[0]

        if not content_list_path or not content_list_path.exists():
            raise RuntimeError(f"MinerU output content_list.json not found. stdout: {result.stdout[:300]}")

        # ── Stage 2: extracting ───────────────────────────────────────────
        if _abort_if_cancelled(job_id):
            return

        from pipeline.text_assembler import load_content_list, assemble_pages, count_blocks_by_type
        from pipeline.entity_extractor import create_model, extract_entities
        from pipeline.kg_builder import build_kg, extractions_to_records

        content_list = load_content_list(content_list_path)
        pages = assemble_pages(content_list)
        total_pages = len(pages)
        block_types = count_blocks_by_type(content_list)

        _update_meta(
            job_id,
            status="extracting",
            stage="Extracting entities (LangExtract + DeepSeek)...",
            progress={"parsed_pages": total_pages, "total_pages": total_pages, "extracted_entities": 0},
        )
        update_doc_status(doc_id, "indexing", pages=total_pages)

        model = create_model()
        annotated_docs = []
        total_entities = 0

        for i, page in enumerate(pages):
            if _abort_if_cancelled(job_id):
                return

            _update_meta(
                job_id,
                stage=f"Extracting entities page {i+1}/{total_pages} (LangExtract + DeepSeek)...",
                progress={"parsed_pages": total_pages, "total_pages": total_pages,
                          "extracted_entities": total_entities},
            )
            ann_doc = extract_entities(page.text, model)
            annotated_docs.append(ann_doc)
            total_entities += len(ann_doc.extractions) if ann_doc.extractions else 0

        # Save raw extractions
        records = extractions_to_records(pages, annotated_docs, doc_id)
        fs.write_json(job_dir / "extractions.json", records)

        # ── Stage 3: indexing ─────────────────────────────────────────────
        # A cancel that arrives while the last page is being extracted must
        # not be overwritten by "indexing"/"done" nor merged into the global KG.
        if _abort_if_cancelled(job_id):
            return

        _update_meta(job_id, status="indexing", stage="Building knowledge graph...")

        nodes, edges = build_kg(pages, annotated_docs, doc_id)
        fs.write_json(job_dir / "kg_nodes.json", nodes)
        fs.write_json(job_dir / "kg_edges.json", edges)

        # Merge into global KG
        fs.merge_kg(nodes, edges, doc_id)

        # Count alignment types
        alignment_counts: dict[str, int] = {}
        type_counts: dict[str, int] = {}
        for r in records:
            al = r.get("alignment") or "null"
            alignment_counts[al] = alignment_counts.get(al, 0) + 1
            t = r.get("type", "UNKNOWN")
            type_counts[t] = type_counts.get(t, 0) + 1

        elapsed = round(time.time() - start_time, 1)
        stats = {
            "blocks": len(content_list),
            "block_types": block_types,
            "pages": total_pages,
            "raw_extractions": len(records),
            "nodes": len(nodes),
            "edges": len(edges),
            "type_counts": type_counts,
            "alignment_counts": alignment_counts,
            "elapsed_seconds": elapsed,
        }
        fs.write_json(job_dir / "stats.json", stats)

        _update_meta(
            job_id,
            status="done",
            stage="Complete",
            progress={"parsed_pages": total_pages, "total_pages": total_pages,
                      "extracted_entities": len(records)},
        )
        update_doc_status(doc_id, "indexed", pages=total_pages)

    except Exception as exc:
        # Top-level boundary of the worker thread: record the failure instead
        # of letting the thread die silently.
        _update_meta(job_id, status="failed", stage=f"Error: {exc}", error=str(exc))
        update_doc_status(doc_id, "failed")
    finally:
        _active_threads.pop(job_id, None)
        _cancel_flags.pop(job_id, None)
|
||||
|
||||
|
||||
def get_job_status(job_id: str) -> dict | None:
    """Return the job's metadata exactly as the file store loads it."""
    job_meta = fs.load_job_meta(job_id)
    return job_meta
|
||||
|
||||
|
||||
def get_job_result(job_id: str) -> dict | None:
    """Return the full result of a finished job.

    Returns ``None`` for an unknown job, the bare metadata while the job's
    status is anything other than ``"done"``, and otherwise a dict combining
    the job ids with the stats, extractions, nodes and edges written to the
    job directory (empty containers when a file is missing or empty).
    """
    meta = fs.load_job_meta(job_id)
    if not meta:
        return None
    if meta["status"] != "done":
        return meta

    base = fs.job_dir(job_id)

    def _artifact(name: str, fallback):
        return fs.read_json(base / name) or fallback

    stats = _artifact("stats.json", {})
    extractions = _artifact("extractions.json", [])
    nodes = _artifact("kg_nodes.json", [])
    edges = _artifact("kg_edges.json", [])

    return {
        "job_id": meta["job_id"],
        "doc_id": meta["doc_id"],
        "status": "done",
        "stats": stats,
        "extractions": extractions,
        "nodes": nodes,
        "edges": edges,
    }
|
||||
|
||||
|
||||
def cancel_job(job_id: str) -> tuple[bool, str]:
    """Request cancellation of a job.

    Returns ``(False, "not_found")`` for an unknown job, otherwise
    ``(True, previous_status)``.

    A job that already reached a terminal status ("done", "failed",
    "cancelled") is left untouched: rewriting a finished job as "cancelled"
    would hide its results from ``get_job_result`` even though its graph was
    already merged. Callers can tell from ``previous_status``.
    """
    meta = fs.load_job_meta(job_id)
    if not meta:
        return False, "not_found"

    prev_status = meta["status"]
    if prev_status in ("done", "failed", "cancelled"):
        return True, prev_status

    # Only raise the flag for a job that still has a registry entry; setting
    # it for a job with no worker would leave a stale entry behind forever.
    if job_id in _cancel_flags:
        _cancel_flags[job_id] = True
    _update_meta(job_id, status="cancelled", stage="Cancelled by user")
    return True, prev_status
|
||||
|
||||
|
||||
def count_active_jobs() -> int:
    """Return the number of registered pipeline threads that are still alive."""
    # Snapshot the values first: worker threads pop their own entry from
    # _active_threads when they finish, and iterating the live dict while it
    # shrinks raises "dictionary changed size during iteration".
    workers = list(_active_threads.values())
    return sum(1 for worker in workers if worker.is_alive())
|
||||
167
backend/services/kg_service.py
Normal file
167
backend/services/kg_service.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""KG Service — NetworkX graph operations over the global KG."""
|
||||
from __future__ import annotations
|
||||
|
||||
import networkx as nx
|
||||
|
||||
from storage import file_store as fs
|
||||
|
||||
|
||||
def _load_graph() -> nx.Graph:
    """Build an undirected NetworkX graph from the stored KG nodes and edges.

    Every node dict becomes the node's attributes; edges carry ``relation``
    (default "CO_OCCURS_IN"), ``doc_id`` (default "") and ``page`` (default 0).
    """
    graph = nx.Graph()
    for node in fs.load_kg_nodes():
        graph.add_node(node["id"], **node)
    for edge in fs.load_kg_edges():
        attributes = {
            "relation": edge.get("relation", "CO_OCCURS_IN"),
            "doc_id": edge.get("doc_id", ""),
            "page": edge.get("page", 0),
        }
        graph.add_edge(edge["source"], edge["target"], **attributes)
    return graph
|
||||
|
||||
|
||||
def get_nodes(page: int = 1, page_size: int = 50,
              node_type: str | None = None,
              doc_id: str | None = None,
              confidence: str | None = None) -> dict:
    """Return one page of KG nodes, each annotated with its graph degree.

    Args:
        page: 1-based page number. Values below 1 are treated as the first page.
        page_size: Items per page. Negative values are treated as 0.
        node_type: Case-insensitive filter on the node's ``type``.
        doc_id: Exact filter on the node's ``source_doc``.
        confidence: Exact filter on the node's ``confidence``.

    Returns:
        ``{"total", "page", "page_size", "items"}``; ``total`` counts nodes
        after filtering and ``page``/``page_size`` echo the arguments.
    """
    nodes = fs.load_kg_nodes()
    degrees = dict(_load_graph().degree())
    # Attach degree
    for n in nodes:
        n["degree"] = degrees.get(n["id"], 0)

    if node_type:
        wanted_type = node_type.upper()
        # "or ''" tolerates nodes whose type is stored as null.
        nodes = [n for n in nodes if (n.get("type") or "").upper() == wanted_type]
    if doc_id:
        nodes = [n for n in nodes if n.get("source_doc") == doc_id]
    if confidence:
        nodes = [n for n in nodes if n.get("confidence") == confidence]

    # Clamp so page <= 0 cannot produce a negative start that slices from the tail.
    size = max(page_size, 0)
    start = max(page - 1, 0) * size
    return {"total": len(nodes), "page": page, "page_size": page_size,
            "items": nodes[start: start + size]}
|
||||
|
||||
|
||||
def get_edges(page: int = 1, page_size: int = 100,
              doc_id: str | None = None,
              relation: str | None = None) -> dict:
    """Return one page of KG edges, optionally filtered by document and relation.

    ``page`` below 1 is treated as the first page and a negative
    ``page_size`` as 0. ``total`` counts edges after filtering;
    ``page``/``page_size`` echo the arguments.
    """
    edges = fs.load_kg_edges()
    if doc_id:
        edges = [e for e in edges if e.get("doc_id") == doc_id]
    if relation:
        edges = [e for e in edges if e.get("relation") == relation]

    # Clamp so page <= 0 cannot produce a negative start that slices from the tail.
    size = max(page_size, 0)
    start = max(page - 1, 0) * size
    return {"total": len(edges), "page": page, "page_size": page_size,
            "items": edges[start: start + size]}
|
||||
|
||||
|
||||
def get_node_detail(node_id: str) -> dict | None:
    """Return the stored node dict extended with degree, degree centrality and neighbor count.

    Returns ``None`` when no stored node has this id. ``neighbor_count`` is
    set to the same value as ``degree``.
    """
    record = next((item for item in fs.load_kg_nodes() if item["id"] == node_id), None)
    if not record:
        return None

    graph = _load_graph()
    if node_id in graph:
        degree = graph.degree(node_id)
        centrality_by_node = nx.degree_centrality(graph)
        record["degree"] = degree
        record["degree_centrality"] = round(centrality_by_node.get(node_id, 0.0), 4)
        record["neighbor_count"] = degree
    else:
        # Node known to the store but absent from the graph: report zeros.
        record["degree"] = 0
        record["degree_centrality"] = 0.0
        record["neighbor_count"] = 0
    return record
|
||||
|
||||
|
||||
def get_neighbors(node_id: str, hops: int = 1) -> dict | None:
    """Return the nodes reachable from ``node_id`` within ``hops`` steps, grouped by distance.

    ``hops`` is clamped to the range 1..3 before the search. Returns ``None``
    when no stored node has this id. ``neighbors_by_hop`` maps the distance
    (as a string) to a list of ``{"id", "name", "type", "page"}`` summaries.
    """
    center_record = next((item for item in fs.load_kg_nodes() if item["id"] == node_id), None)
    if not center_record:
        return None

    def _center_summary() -> dict:
        return {"id": node_id, "name": center_record["name"],
                "type": center_record["type"], "page": center_record.get("page", 0)}

    graph = _load_graph()
    if node_id not in graph:
        # This branch reports the caller's hops value as given (not clamped).
        return {
            "center": _center_summary(),
            "hops": hops, "neighbors_by_hop": {}, "total_neighbors": 0,
        }

    hops = max(1, min(hops, 3))
    distances = nx.single_source_shortest_path_length(graph, node_id, cutoff=hops)

    grouped: dict[str, list] = {}
    for other_id, distance in distances.items():
        if distance != 0:  # distance 0 is the center itself
            attrs = graph.nodes[other_id]
            summary = {"id": other_id, "name": attrs.get("name", ""),
                       "type": attrs.get("type", ""), "page": attrs.get("page", 0)}
            grouped.setdefault(str(distance), []).append(summary)

    return {
        "center": _center_summary(),
        "hops": hops,
        "neighbors_by_hop": grouped,
        "total_neighbors": sum(map(len, grouped.values())),
    }
|
||||
|
||||
|
||||
def get_stats() -> dict:
    """Summarise the global KG: sizes, density, type/relation histograms,
    the five nodes with the highest degree centrality, and the source documents."""
    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()
    graph = _load_graph()

    def _histogram(items: list, key: str, default: str) -> dict[str, int]:
        counts: dict[str, int] = {}
        for item in items:
            label = item.get(key, default)
            counts[label] = counts.get(label, 0) + 1
        return counts

    type_dist = _histogram(nodes, "type", "UNKNOWN")
    relation_types = _histogram(edges, "relation", "CO_OCCURS_IN")

    if graph.number_of_nodes() > 1:
        density = round(nx.density(graph), 4)
    else:
        density = 0.0

    top5: list[dict] = []
    if graph.number_of_nodes() > 0:
        ranked = sorted(nx.degree_centrality(graph).items(),
                        key=lambda pair: pair[1], reverse=True)
        for nid, score in ranked[:5]:
            attrs = graph.nodes[nid]
            top5.append({"node_id": nid, "name": attrs.get("name", ""),
                         "type": attrs.get("type", ""), "centrality": round(score, 4)})

    # Distinct, non-empty source document ids.
    source_docs = list({n.get("source_doc", "") for n in nodes if n.get("source_doc")})

    return {
        "total_nodes": len(nodes),
        "total_edges": len(edges),
        "density": density,
        "type_distribution": type_dist,
        "relation_types": relation_types,
        "top5_central_nodes": top5,
        "source_documents": source_docs,
    }
|
||||
|
||||
|
||||
def export_kg(doc_id: str | None = None) -> dict:
    """Export the KG (or only one document's part of it) as a JSON-ready dict.

    Each node gets a ``degree`` computed on the full graph before any
    per-document filtering is applied.
    """
    from datetime import datetime, timezone

    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()

    degree_of = dict(_load_graph().degree())
    for node in nodes:
        node["degree"] = degree_of.get(node["id"], 0)

    if doc_id:
        nodes = [node for node in nodes if node.get("source_doc") == doc_id]
        edges = [edge for edge in edges if edge.get("doc_id") == doc_id]

    export = {
        "format": "json",
        "doc_id": doc_id,
        "total_nodes": len(nodes),
        "total_edges": len(edges),
        "exported_at": datetime.now(timezone.utc).isoformat(),
        "nodes": nodes,
        "edges": edges,
    }
    return export
|
||||
85
backend/services/qa_service.py
Normal file
85
backend/services/qa_service.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""QA Service — Agentic-RAG wrapper."""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from storage import file_store as fs
|
||||
|
||||
|
||||
def run_query(question: str, history: list[dict]) -> dict:
    """Answer ``question`` with the QA agent over the current KG and log the result.

    Raises:
        ValueError: with message ``"KG_EMPTY"`` when the KG has no nodes.

    Returns the record that was appended to the query history.
    """
    from pipeline.qa_agent import run_qa

    kg_nodes = fs.load_kg_nodes()
    kg_edges = fs.load_kg_edges()
    if not kg_nodes:
        raise ValueError("KG_EMPTY")

    started = time.time()
    outcome = run_qa(question, history, kg_nodes, kg_edges)
    duration = round(time.time() - started, 2)

    query_id = "q_" + uuid.uuid4().hex[:10]
    timestamp = datetime.now(timezone.utc).isoformat()

    record = {
        "id": query_id,
        "question": question,
        "answer": outcome["answer"],
        "tool_calls": outcome["tool_calls"],
        "cited_nodes": outcome["cited_nodes"],
        "duration_seconds": duration,
        "timestamp": timestamp,
    }
    fs.append_query_history(record)
    return record
|
||||
|
||||
|
||||
def get_history(page: int = 1, page_size: int = 20) -> dict:
    """Return one page of the stored query history, in stored order.

    ``page`` below 1 is treated as the first page and a negative
    ``page_size`` as 0; ``page``/``page_size`` in the result echo the arguments.
    """
    all_records = fs.load_query_history()

    # Clamp so page <= 0 cannot produce a negative start that slices from the tail.
    size = max(page_size, 0)
    start = max(page - 1, 0) * size
    return {
        "total": len(all_records),
        "page": page,
        "page_size": page_size,
        "items": all_records[start: start + size],
    }
|
||||
|
||||
|
||||
def start_batch(questions: list[str]) -> dict:
    """Run ``questions`` one after another in a background daemon thread.

    The batch metadata is saved immediately with status "submitted", saved
    again after every question so ``completed``/``failed``/``results`` can be
    polled while the batch runs, and saved a final time with status "done".

    Returns ``{"batch_id", "total", "status", "created_at"}`` for the new batch.
    """
    import threading

    batch_id = f"batch_{uuid.uuid4().hex[:10]}"
    now = datetime.now(timezone.utc).isoformat()
    # Snapshot the input so later mutation by the caller cannot change the run.
    pending = list(questions)
    meta = {
        "batch_id": batch_id,
        "total": len(pending),
        "completed": 0,
        "failed": 0,
        "status": "submitted",
        "created_at": now,
        "results": [],
    }
    fs.save_batch_meta(batch_id, meta)

    def _run():
        for q in pending:
            try:
                res = run_query(q, [])
            except Exception as e:
                # One failing question must not abort the rest of the batch.
                meta["failed"] += 1
                meta["results"].append({"question": q, "error": str(e)})
            else:
                meta["results"].append(res)
                meta["completed"] += 1
            # Persist progress; previously nothing was written until the end.
            fs.save_batch_meta(batch_id, meta)
        meta["status"] = "done"
        fs.save_batch_meta(batch_id, meta)

    threading.Thread(target=_run, daemon=True).start()
    return {"batch_id": batch_id, "total": len(pending), "status": "submitted", "created_at": now}
|
||||
|
||||
|
||||
def get_batch_result(batch_id: str) -> dict | None:
    """Return the stored batch metadata exactly as the file store loads it."""
    batch_meta = fs.load_batch_meta(batch_id)
    return batch_meta
|
||||
106
backend/services/search_service.py
Normal file
106
backend/services/search_service.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""Search Service — entity, path, and graph search."""
|
||||
from __future__ import annotations
|
||||
|
||||
import networkx as nx
|
||||
|
||||
from storage import file_store as fs
|
||||
|
||||
|
||||
def _load_graph() -> nx.Graph:
    """Build an undirected NetworkX graph from the stored KG nodes and edges.

    Every node dict becomes the node's attributes; edges carry ``relation``
    (default "CO_OCCURS_IN"), ``doc_id`` (default "") and ``page`` (default 0).
    """
    graph = nx.Graph()
    for node in fs.load_kg_nodes():
        graph.add_node(node["id"], **node)
    for edge in fs.load_kg_edges():
        graph.add_edge(
            edge["source"],
            edge["target"],
            relation=edge.get("relation", "CO_OCCURS_IN"),
            doc_id=edge.get("doc_id", ""),
            page=edge.get("page", 0),
        )
    return graph
|
||||
|
||||
|
||||
def search_entities(q: str, entity_type: str | None = None, limit: int = 15) -> dict:
    """Find nodes whose name contains ``q`` (case-insensitive).

    Args:
        q: Substring to look for in node names.
        entity_type: Optional case-insensitive filter on the node ``type``.
        limit: Maximum number of items returned; negative values are treated as 0.

    Returns:
        ``{"query", "total", "items"}``. ``total`` is the number of matches
        before truncation to ``limit`` (it used to be taken after truncation,
        so it could never exceed ``limit``); ``items`` carry a ``degree``.
    """
    nodes = fs.load_kg_nodes()
    degrees = dict(_load_graph().degree())
    q_lower = q.lower()

    # "or ''" tolerates nodes whose name/type is stored as null.
    matches = [n for n in nodes if q_lower in (n.get("name") or "").lower()]
    if entity_type:
        wanted_type = entity_type.upper()
        matches = [n for n in matches if (n.get("type") or "").upper() == wanted_type]

    total = len(matches)
    items = matches[:max(limit, 0)]
    for n in items:
        n["degree"] = degrees.get(n["id"], 0)
    return {"query": q, "total": total, "items": items}
|
||||
|
||||
|
||||
def search_path(from_id: str, to_id: str, max_hops: int = 3) -> dict | None:
    """List every simple path between two nodes using at most ``max_hops`` edges.

    ``max_hops`` is clamped to the range 1..5. Returns ``None`` when either
    endpoint is not a stored node.
    """
    node_map = {node["id"]: node for node in fs.load_kg_nodes()}
    if not (from_id in node_map and to_id in node_map):
        return None  # node not found

    graph = _load_graph()
    max_hops = max(1, min(max_hops, 5))

    try:
        raw_paths = list(nx.all_simple_paths(graph, from_id, to_id, cutoff=max_hops))
    except nx.NetworkXError:
        raw_paths = []

    def _endpoint(nid: str) -> dict:
        info = node_map[nid]
        return {"id": nid, "name": info.get("name", ""), "type": info.get("type", "")}

    paths = []
    for node_ids in raw_paths:
        # Consecutive node pairs along the path are its edges.
        hops = [
            {"source": src, "target": dst,
             "relation": graph.edges[src, dst].get("relation", "CO_OCCURS_IN")}
            for src, dst in zip(node_ids, node_ids[1:])
        ]
        # Path members may be edge endpoints missing from node_map, hence .get().
        members = [
            {"id": nid, "name": node_map.get(nid, {}).get("name", nid),
             "type": node_map.get(nid, {}).get("type", "")}
            for nid in node_ids
        ]
        paths.append({"length": len(node_ids) - 1, "nodes": members, "edges": hops})

    return {
        "from": _endpoint(from_id),
        "to": _endpoint(to_id),
        "max_hops": max_hops,
        "paths": paths,
        "total_paths": len(paths),
    }
|
||||
|
||||
|
||||
def search_graph(q: str, include_neighbors: bool = False) -> dict:
    """Return the nodes whose name contains ``q`` plus the edges among the relevant ids.

    The relevant ids are the matched nodes, extended by their direct
    neighbours when ``include_neighbors`` is true. Only edges with both
    endpoints in that set are returned; ``matched_nodes`` holds the name
    matches only, each with a ``degree``.
    """
    nodes = fs.load_kg_nodes()
    edges = fs.load_kg_edges()
    graph = _load_graph()
    degree_of = dict(graph.degree())
    needle = q.lower()

    matched = [node for node in nodes if needle in node.get("name", "").lower()]
    matched_ids = {node["id"] for node in matched}
    for node in matched:
        node["degree"] = degree_of.get(node["id"], 0)

    relevant = matched_ids
    if include_neighbors:
        neighbor_ids = set()
        for nid in matched_ids:
            if nid in graph:
                neighbor_ids.update(graph.neighbors(nid))
        relevant = matched_ids | neighbor_ids

    subgraph_edges = []
    for edge in edges:
        if edge.get("source") in relevant and edge.get("target") in relevant:
            subgraph_edges.append(edge)

    return {
        "query": q,
        "matched_nodes": matched,
        "subgraph_edges": subgraph_edges,
    }
|
||||
0
backend/storage/__init__.py
Normal file
0
backend/storage/__init__.py
Normal file
268
backend/storage/file_store.py
Normal file
268
backend/storage/file_store.py
Normal file
@@ -0,0 +1,268 @@
|
||||
"""
|
||||
File Store — unified JSON read/write for all backend data.
|
||||
All data lives under backend/data/.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# Root data directory relative to this file
|
||||
# backend/data, resolved relative to this module (storage/ -> backend/ -> data/).
_BASE = Path(__file__).parent.parent / "data"

# Sub-directories for uploaded files, per-job working directories and the global KG.
UPLOADS_DIR = _BASE / "uploads"
JOBS_DIR = _BASE / "jobs"
KG_DIR = _BASE / "kg"
# Same directory as JOBS_DIR.
QUERY_DIR = _BASE / "jobs"  # query_history.jsonl lives here

# Ensure directories exist at import time
for _d in (UPLOADS_DIR, JOBS_DIR, KG_DIR):
    _d.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Generic helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def read_json(path: Path) -> Any:
    """Parse the JSON file at ``path``; return ``None`` when the file does not exist.

    Malformed JSON is not handled here — ``json.JSONDecodeError`` propagates.
    """
    if path.exists():
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return None
|
||||
|
||||
|
||||
def write_json(path: Path, data: Any) -> None:
    """Atomically write ``data`` as JSON (write to a temp file, then rename).

    The temp file lives next to ``path`` and has a unique name, so two
    threads writing the same target (for example a job's meta.json) never
    share a half-written staging file. If serialisation or the rename fails,
    the temp file is removed and the existing target is left untouched.

    Raises:
        TypeError: if ``data`` is not JSON-serialisable.
        OSError: if the file cannot be written or renamed.
    """
    import uuid  # function-scope: this module's top-level imports do not include uuid

    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp, path)
    finally:
        # After a successful replace the temp name is gone; this only
        # removes leftovers from a failed write.
        tmp.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def append_jsonl(path: Path, record: dict) -> None:
    """Append ``record`` as one JSON line to ``path``, creating parent directories as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "a", encoding="utf-8") as handle:
        serialized = json.dumps(record, ensure_ascii=False)
        handle.write(f"{serialized}\n")
|
||||
|
||||
|
||||
def read_jsonl(path: Path) -> list[dict]:
    """Read all records from a JSONL file.

    Returns ``[]`` when the file does not exist. Blank lines and lines that
    are not valid JSON are skipped silently.
    """
    if not path.exists():
        return []

    parsed = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue
            try:
                parsed.append(json.loads(text))
            except json.JSONDecodeError:
                # Best effort: a corrupt line must not hide the valid ones.
                continue
    return parsed
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Document helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def docs_index_path() -> Path:
    """Return the path of the documents index file, ``docs_index.json`` under the data root."""
    index_file = _BASE / "docs_index.json"
    return index_file
|
||||
|
||||
|
||||
def load_docs_index() -> dict[str, dict]:
    """Load the documents index ``{doc_id: document dict}``.

    Returns an empty dict when the file is missing or does not contain a JSON object.
    """
    loaded = read_json(docs_index_path())
    if isinstance(loaded, dict):
        return loaded
    return {}
|
||||
|
||||
|
||||
def save_docs_index(index: dict[str, dict]) -> None:
    """Write the whole documents index back to its JSON file."""
    target = docs_index_path()
    write_json(target, index)
|
||||
|
||||
|
||||
def get_doc(doc_id: str) -> dict | None:
    """Return the index entry for ``doc_id``, or ``None`` when it is not indexed."""
    index = load_docs_index()
    return index.get(doc_id)
|
||||
|
||||
|
||||
def save_doc(doc: dict) -> None:
    """Insert or overwrite *doc* in the documents index, keyed by its "doc_id"."""
    current = load_docs_index()
    current.update({doc["doc_id"]: doc})
    save_docs_index(current)
|
||||
|
||||
|
||||
def delete_doc(doc_id: str) -> bool:
    """Remove a document from the index and delete its uploaded file.

    Args:
        doc_id: Identifier of the document to remove.

    Returns:
        True if the document was in the index and has been removed,
        False if no such document exists.
    """
    index = load_docs_index()
    if doc_id not in index:
        return False

    # Capture the record while removing it: the upload filename is needed
    # below, and it is no longer reachable through the index after this.
    doc_info = index.pop(doc_id)
    save_docs_index(index)

    # Remove upload file. An empty/missing filename must be skipped, since
    # UPLOADS_DIR / "" is the uploads directory itself.
    upload_filename = doc_info.get("upload_filename") if isinstance(doc_info, dict) else None
    if upload_filename:
        upload_path = UPLOADS_DIR / upload_filename
        if upload_path.is_file():
            upload_path.unlink()
    return True
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Job helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def job_dir(job_id: str) -> Path:
    """Return the directory that holds the files of job *job_id*."""
    directory = JOBS_DIR / job_id
    return directory
|
||||
|
||||
|
||||
def job_meta_path(job_id: str) -> Path:
    """Return the path of the ``meta.json`` file inside the job's directory."""
    base = job_dir(job_id)
    return base / "meta.json"
|
||||
|
||||
|
||||
def load_job_meta(job_id: str) -> dict | None:
    """Read the job's metadata file; None when the file does not exist."""
    meta_file = job_meta_path(job_id)
    return read_json(meta_file)
|
||||
|
||||
|
||||
def save_job_meta(job_id: str, meta: dict) -> None:
    """Write *meta* to the job's metadata file, creating the job directory first."""
    directory = job_dir(job_id)
    directory.mkdir(parents=True, exist_ok=True)
    meta_file = job_meta_path(job_id)
    write_json(meta_file, meta)
|
||||
|
||||
|
||||
def list_all_jobs() -> list[dict]:
    """Collect the metadata of every job found under the jobs directory.

    Each sub-directory of ``JOBS_DIR`` is inspected for a ``meta.json`` file;
    directories without one (or with an empty/falsy payload) are skipped.

    Returns:
        The list of loaded metadata objects, in directory-iteration order.
        Empty when the jobs directory does not exist yet.
    """
    # On a fresh install no job has been created, so the directory may be
    # missing; iterdir() would raise FileNotFoundError in that case.
    if not JOBS_DIR.is_dir():
        return []

    metas = []
    for d in JOBS_DIR.iterdir():
        if not d.is_dir():
            continue
        meta = read_json(d / "meta.json")
        if meta:
            metas.append(meta)
    return metas
|
||||
|
||||
|
||||
def delete_job(job_id: str) -> None:
    """Recursively delete the job's directory; a no-op when it does not exist."""
    target = job_dir(job_id)
    if not target.exists():
        return
    shutil.rmtree(target)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Global KG helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def kg_nodes_path() -> Path:
    """Return the path of the global knowledge-graph nodes file."""
    nodes_file = KG_DIR / "kg_nodes.json"
    return nodes_file
|
||||
|
||||
|
||||
def kg_edges_path() -> Path:
    """Return the path of the global knowledge-graph edges file."""
    edges_file = KG_DIR / "kg_edges.json"
    return edges_file
|
||||
|
||||
|
||||
def load_kg_nodes() -> list[dict]:
    """Load the global KG node list.

    Returns an empty list when the file is missing or its top-level JSON
    value is not an array.
    """
    loaded = read_json(kg_nodes_path())
    if isinstance(loaded, list):
        return loaded
    return []
|
||||
|
||||
|
||||
def load_kg_edges() -> list[dict]:
    """Load the global KG edge list.

    Returns an empty list when the file is missing or its top-level JSON
    value is not an array.
    """
    loaded = read_json(kg_edges_path())
    if isinstance(loaded, list):
        return loaded
    return []
|
||||
|
||||
|
||||
def save_kg_nodes(nodes: list[dict]) -> None:
    """Overwrite the global KG nodes file with *nodes*."""
    target = kg_nodes_path()
    write_json(target, nodes)
|
||||
|
||||
|
||||
def save_kg_edges(edges: list[dict]) -> None:
    """Overwrite the global KG edges file with *edges*."""
    target = kg_edges_path()
    write_json(target, edges)
|
||||
|
||||
|
||||
def merge_kg(new_nodes: list[dict], new_edges: list[dict], doc_id: str) -> tuple[int, int]:
    """Merge one job's KG output into the global KG and persist the result.

    Nodes whose "source_doc" and edges whose "doc_id" equal *doc_id* are
    dropped from the stored graph first, so merging the same document again
    replaces its earlier contribution instead of accumulating duplicates.

    Args:
        new_nodes: Nodes to add; each must have "name" and "type".
        new_edges: Edges to add; each must have "source", "target", "doc_id".
        doc_id: Document whose previous nodes/edges are replaced.

    Returns:
        ``(total_nodes, total_edges)``: the sizes of the global node and edge
        lists after the merge (not the number of items removed or added).

    Raises:
        KeyError: If a node or edge lacks one of the required keys.
    """

    def _edge_key(edge: dict) -> tuple:
        # Endpoint order is normalized, so A->B and B->A within the same
        # document count as the same edge.
        s, t = edge["source"], edge["target"]
        return (min(s, t), max(s, t), edge["doc_id"])

    # Remove nodes/edges from this doc
    existing_nodes = [n for n in load_kg_nodes() if n.get("source_doc") != doc_id]
    existing_edges = [e for e in load_kg_edges() if e.get("doc_id") != doc_id]

    # Merge: deduplicate nodes by (name.lower(), type)
    node_keys: set[tuple] = {(n["name"].lower(), n["type"]) for n in existing_nodes}
    for n in new_nodes:
        key = (n["name"].lower(), n["type"])
        if key not in node_keys:
            existing_nodes.append(n)
            node_keys.add(key)

    # Merge edges: deduplicate by (min(src,tgt), max(src,tgt), doc_id)
    edge_keys: set[tuple] = {_edge_key(e) for e in existing_edges}
    for e in new_edges:
        key = _edge_key(e)
        if key not in edge_keys:
            existing_edges.append(e)
            edge_keys.add(key)

    save_kg_nodes(existing_nodes)
    save_kg_edges(existing_edges)
    return len(existing_nodes), len(existing_edges)
|
||||
|
||||
|
||||
def remove_doc_from_kg(doc_id: str) -> tuple[int, int]:
    """Drop a document's nodes and edges from the global KG.

    Nodes are matched on "source_doc" and edges on "doc_id". Both files are
    rewritten even when nothing matched.

    Returns:
        ``(removed_nodes, removed_edges)``.
    """
    nodes_before = load_kg_nodes()
    edges_before = load_kg_edges()

    kept_nodes = [node for node in nodes_before if node.get("source_doc") != doc_id]
    kept_edges = [edge for edge in edges_before if edge.get("doc_id") != doc_id]

    save_kg_nodes(kept_nodes)
    save_kg_edges(kept_edges)

    removed_nodes = len(nodes_before) - len(kept_nodes)
    removed_edges = len(edges_before) - len(kept_edges)
    return removed_nodes, removed_edges
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Query history helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def query_history_path() -> Path:
    """Return the path of the JSONL file that stores query history."""
    history_file = _BASE / "query_history.jsonl"
    return history_file
|
||||
|
||||
|
||||
def append_query_history(result: dict) -> None:
    """Append one query result record to the history file."""
    target = query_history_path()
    append_jsonl(target, result)
|
||||
|
||||
|
||||
def load_query_history() -> list[dict]:
    """Return all stored query records, most recently appended first."""
    chronological = read_jsonl(query_history_path())
    # The file is append-only, so reversing file order yields newest first.
    return chronological[::-1]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Batch job helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def batch_meta_path(batch_id: str) -> Path:
    """Return the path of the JSON file holding metadata for batch *batch_id*."""
    batches_dir = _BASE / "batches"
    return batches_dir / f"{batch_id}.json"
|
||||
|
||||
|
||||
def load_batch_meta(batch_id: str) -> dict | None:
    """Read a batch's metadata file; None when the file does not exist."""
    meta_file = batch_meta_path(batch_id)
    return read_json(meta_file)
|
||||
|
||||
|
||||
def save_batch_meta(batch_id: str, meta: dict) -> None:
    """Write *meta* to the batch's metadata file."""
    meta_file = batch_meta_path(batch_id)
    write_json(meta_file, meta)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Storage usage
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def storage_used_mb() -> float:
    """Return the combined size of all files under the base directory, in MiB.

    The directory tree is walked recursively and the result is rounded to
    two decimal places.
    """
    size_bytes = sum(
        entry.stat().st_size
        for entry in _BASE.rglob("*")
        if entry.is_file()
    )
    return round(size_bytes / (1024 * 1024), 2)
|
||||
256
backend/tests/test_api.py
Normal file
256
backend/tests/test_api.py
Normal file
@@ -0,0 +1,256 @@
|
||||
"""
|
||||
API integration tests — tests all major endpoints against a running server.
|
||||
Run with: python tests/test_api.py
|
||||
Server must be running on http://localhost:8000
|
||||
"""
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from pathlib import Path
|
||||
|
||||
# Root URL of the API under test; every request path is appended to it.
BASE = "http://localhost:8000/api/v1"

# Console tags wrapped in ANSI color escapes (92 = bright green,
# 91 = bright red, 94 = bright blue); "\033[0m" resets the color.
PASS = "\033[92m[PASS]\033[0m"
FAIL = "\033[91m[FAIL]\033[0m"
INFO = "\033[94m[INFO]\033[0m"

# Running tally of assertion outcomes, updated by check().
results = {"passed": 0, "failed": 0}
|
||||
|
||||
|
||||
def req(method: str, path: str, body: dict | None = None, form: dict | None = None) -> dict:
    """Send one HTTP request to the API and return the decoded JSON body.

    HTTP error responses (4xx/5xx) are not raised: their body is decoded and
    returned the same way as a successful one, so callers can inspect the
    application-level "code" field in either case.

    Args:
        method: HTTP method, e.g. "GET", "POST", "DELETE".
        path: Path (and optional query string) appended to ``BASE``.
        body: Optional payload sent as a JSON request body.
        form: Accepted for call-site compatibility; no form data is sent.

    Returns:
        The parsed JSON response. When the response body is not valid JSON,
        a fallback dict ``{"code": None, "http_status": ..., "raw": ...}`` is
        returned so that a single bad response fails one check instead of
        aborting the whole run.

    Raises:
        urllib.error.URLError: If the server cannot be reached.
    """
    url = BASE + path

    # A plain GET with no (or an empty) body is sent without any payload.
    plain_get = method == "GET" and not body and not form
    if body is not None and not plain_get:
        req_obj = urllib.request.Request(
            url,
            data=json.dumps(body).encode(),
            method=method,
            headers={"Content-Type": "application/json"},
        )
    else:
        req_obj = urllib.request.Request(url, method=method)

    try:
        with urllib.request.urlopen(req_obj, timeout=30) as r:
            status = r.status
            raw = r.read()
    except urllib.error.HTTPError as e:
        # Error responses still carry a body with the API error code.
        status = e.code
        raw = e.read()
        e.close()

    try:
        return json.loads(raw.decode())
    except (UnicodeDecodeError, json.JSONDecodeError):
        return {"code": None, "http_status": status, "raw": raw.decode(errors="replace")}
|
||||
|
||||
|
||||
def check(name: str, condition: bool, detail: str = "") -> None:
    """Record one assertion outcome in ``results`` and print a PASS/FAIL line.

    *detail* is only shown when the check fails.
    """
    if not condition:
        results["failed"] += 1
        print(f" {FAIL} {name} {detail}")
        return
    results["passed"] += 1
    print(f" {PASS} {name}")
|
||||
|
||||
|
||||
def wait_for_server(max_retries: int = 15) -> bool:
    """Poll the server root URL until it answers.

    Makes up to *max_retries* attempts (3 s timeout each), sleeping one second
    after every failed attempt. Returns True as soon as a request succeeds and
    False when all attempts fail.
    """
    print(f"{INFO} Waiting for server at {BASE}...")
    root_url = BASE.replace("/api/v1", "/")
    attempts_left = max_retries
    while attempts_left > 0:
        attempts_left -= 1
        try:
            urllib.request.urlopen(root_url, timeout=3)
            print(f"{INFO} Server is up.")
            return True
        except Exception:
            time.sleep(1)
    return False
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Test groups
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_system():
    """Exercise the system endpoints: health, stats, formats and demo data.

    Every response payload is read through ``r.get("data") or {}`` so that a
    null "data" field results in failed checks rather than an AttributeError
    that would abort the run.
    """
    print("\n── F 组: System ──")

    r = req("GET", "/health")
    d = r.get("data") or {}
    check("GET /health returns code=0", r.get("code") == 0)
    check("health data.status exists", "status" in d)
    check("health data.components exists", "components" in d)
    print(f" {INFO} status={d.get('status')} uptime={d.get('uptime_seconds')}s")

    r = req("GET", "/system/stats")
    check("GET /system/stats returns code=0", r.get("code") == 0)
    d = r.get("data") or {}
    check("stats has total_documents", "total_documents" in d)
    check("stats has total_nodes", "total_nodes" in d)
    print(f" {INFO} docs={d.get('total_documents')} nodes={d.get('total_nodes')} edges={d.get('total_edges')}")

    r = req("GET", "/system/formats")
    check("GET /system/formats returns code=0", r.get("code") == 0)
    d = r.get("data") or {}
    formats = d.get("formats") or []
    check("formats list is non-empty", len(formats) > 0)
    # Tolerate malformed entries: a missing "ext" key just fails the checks below.
    exts = [f.get("ext") for f in formats if isinstance(f, dict)]
    check("pdf format present", "pdf" in exts)
    check("docx format present", "docx" in exts)

    r = req("GET", "/system/demo")
    check("GET /system/demo returns code=0 or 3002", r.get("code") in (0, 3002))
    if r.get("code") == 0:
        d = r.get("data") or {}
        check("demo data has nodes", "nodes" in d)
        print(f" {INFO} demo: {len(d.get('nodes',[]))} nodes, {len(d.get('edges',[]))} edges")
    else:
        print(f" {INFO} demo data not available (no KG yet) — code={r.get('code')}")
|
||||
|
||||
|
||||
def test_documents():
    """Exercise the document endpoints: listing, upload validation, lookup.

    The upload check posts a hand-built multipart body with an unsupported
    ``.xyz`` extension and expects the API to answer with code 1002. The
    response body is decoded for both success and HTTP-error statuses, so the
    check works whether or not the server uses an error status for it.
    """
    print("\n── A 组: Documents ──")

    r = req("GET", "/documents")
    check("GET /documents returns code=0", r.get("code") == 0)
    d = r.get("data") or {}
    check("documents list has total field", "total" in d)
    check("documents list has items field", "items" in d)
    print(f" {INFO} total documents={d.get('total', 0)}")

    # Upload a test text file (not a real supported format to test validation)
    print(" Testing upload validation...")
    boundary = "boundary123"
    body_parts = (
        f"--{boundary}\r\n"
        f'Content-Disposition: form-data; name="file"; filename="test.xyz"\r\n'
        f"Content-Type: application/octet-stream\r\n\r\n"
        f"dummy content\r\n"
        f"--{boundary}--\r\n"
    ).encode()
    req_obj = urllib.request.Request(
        BASE + "/documents/upload",
        data=body_parts,
        method="POST",
        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
    )
    try:
        with urllib.request.urlopen(req_obj, timeout=10) as resp:
            raw = resp.read()
    except urllib.error.HTTPError as e:
        raw = e.read()
        e.close()
    try:
        r_upload = json.loads(raw.decode())
    except (UnicodeDecodeError, json.JSONDecodeError):
        # Non-JSON body: let the check below fail instead of crashing.
        r_upload = {}
    if not isinstance(r_upload, dict):
        r_upload = {}
    check("upload unsupported format returns code=1002", r_upload.get("code") == 1002)

    r = req("GET", "/documents/nonexistent_id")
    check("GET /documents/nonexistent returns code=2001", r.get("code") == 2001)
|
||||
|
||||
|
||||
def test_indexing():
    """Verify the indexing endpoints reject unknown document and job ids."""
    print("\n── B 组: Indexing ──")

    # (method, path, JSON body, check label, expected API code)
    cases = (
        ("POST", "/index/start", {"doc_id": "nonexistent_doc"},
         "start indexing nonexistent doc returns 2001", 2001),
        ("GET", "/index/status/nonexistent_job", None,
         "get status nonexistent job returns 2002", 2002),
        ("GET", "/index/result/nonexistent_job", None,
         "get result nonexistent job returns 2002", 2002),
        ("DELETE", "/index/jobs/nonexistent_job", None,
         "cancel nonexistent job returns 2002", 2002),
    )
    for method, path, payload, label, expected in cases:
        response = req(method, path, body=payload)
        check(label, response.get("code") == expected)
|
||||
|
||||
|
||||
def test_kg():
    """Exercise the knowledge-graph endpoints: stats, nodes, edges, export.

    Node detail and neighbor lookups only run when the node listing succeeds
    and contains at least one item.
    """
    print("\n── C 组: Knowledge Graph ──")

    r = req("GET", "/kg/stats")
    check("GET /kg/stats returns code=0", r.get("code") == 0)
    d = r.get("data") or {}
    check("stats has total_nodes", "total_nodes" in d)
    check("stats has total_edges", "total_edges" in d)
    print(f" {INFO} KG: {d.get('total_nodes')} nodes, {d.get('total_edges')} edges")

    # Code 3002 is accepted here too; the message below treats it as "KG is empty".
    r = req("GET", "/kg/nodes")
    check("GET /kg/nodes returns code 0 or 3002", r.get("code") in (0, 3002))
    if r.get("code") == 0:
        d = r.get("data") or {}
        check("nodes data has items", "items" in d)
        print(f" {INFO} nodes total={d.get('total')}")

        if d.get("items"):
            # Use the first listed node for the detail and neighbor lookups.
            node_id = d["items"][0]["id"]
            r2 = req("GET", f"/kg/nodes/{node_id}")
            check(f"GET /kg/nodes/{node_id} returns code=0", r2.get("code") == 0)

            r3 = req("GET", f"/kg/nodes/{node_id}/neighbors?hops=1")
            check(f"GET /kg/nodes/{node_id}/neighbors returns code=0", r3.get("code") == 0)
    else:
        print(f" {INFO} KG is empty (code=3002) — skipping node detail tests")

    # An id that cannot exist must produce the "node not found" code.
    r = req("GET", "/kg/nodes/definitely_not_a_real_node")
    check("GET /kg/nodes/invalid returns code=3001", r.get("code") == 3001)

    r = req("GET", "/kg/edges")
    check("GET /kg/edges returns code=0", r.get("code") == 0)

    r = req("GET", "/kg/export")
    check("GET /kg/export returns code=0", r.get("code") == 0)
|
||||
|
||||
|
||||
def test_search():
    """Exercise the search endpoints: entity search, path search, graph search."""
    print("\n── E 组: Search ──")

    entity_resp = req("GET", "/search/entities?q=graph")
    check("GET /search/entities returns code=0", entity_resp.get("code") == 0)
    entity_data = entity_resp.get("data") or {}
    for field in ("query", "items"):
        check(f"search entities has {field} field", field in entity_data)
    print(f" {INFO} 'graph' search: {entity_data.get('total', 0)} results")

    typed_resp = req("GET", "/search/entities?q=technology&type=TECHNOLOGY")
    check("GET /search/entities with type filter returns code=0", typed_resp.get("code") == 0)

    # Omitting the from/to parameters must be rejected as a parameter error.
    path_resp = req("GET", "/search/path?max_hops=2")
    check("path search without from/to returns 1001", path_resp.get("code") == 1001)

    graph_resp = req("GET", "/search/graph?q=knowledge")
    check("GET /search/graph returns code=0", graph_resp.get("code") == 0)
    graph_data = graph_resp.get("data") or {}
    check("graph search has matched_nodes", "matched_nodes" in graph_data)
|
||||
|
||||
|
||||
def test_query():
    """Exercise the QA endpoints that work without an LLM call: history and batch."""
    print("\n── D 组: QA Query ──")

    # Don't call /query (POST) in basic tests as it needs DeepSeek API + KG data
    history_resp = req("GET", "/query/history")
    check("GET /query/history returns code=0", history_resp.get("code") == 0)
    history = history_resp.get("data") or {}
    for field in ("total", "items"):
        check(f"history has {field} field", field in history)
    print(f" {INFO} query history: {history.get('total', 0)} records")

    missing_resp = req("GET", "/query/batch/nonexistent_batch")
    check("GET /query/batch/nonexistent returns 2002", missing_resp.get("code") == 2002)

    batch_resp = req("POST", "/query/batch", body={"questions": ["test question"]})
    check("POST /query/batch returns code=0", batch_resp.get("code") == 0)
    batch = batch_resp.get("data") or {}
    check("batch has batch_id", "batch_id" in batch)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
if __name__ == "__main__":
    # Abort early with a non-zero exit status when the server never answers.
    if not wait_for_server():
        print(f"\n{FAIL} Server not responding. Start with: python main.py")
        sys.exit(1)

    # Run the test groups in a fixed order.
    for run_group in (
        test_system,
        test_documents,
        test_indexing,
        test_kg,
        test_search,
        test_query,
    ):
        run_group()

    # Summary; the exit status is 0 only when no check failed.
    passed_count = results["passed"]
    failed_count = results["failed"]
    total = passed_count + failed_count
    divider = "=" * 50
    print(f"\n{divider}")
    print(f"Results: {passed_count}/{total} passed, {failed_count} failed")
    if failed_count == 0:
        print(f"{PASS} All tests passed!")
    else:
        print(f"{FAIL} {failed_count} test(s) failed")
    print(f"{divider}")
    sys.exit(0 if failed_count == 0 else 1)
|
||||
Reference in New Issue
Block a user