From b02d3378fc1949786e3819ad318e43ae64f9a87e Mon Sep 17 00:00:00 2001 From: plf Date: Sun, 7 Jun 2026 17:30:04 +0800 Subject: [PATCH] =?UTF-8?q?GraphRAG=20Studio=20=E2=80=94=20initial=20commi?= =?UTF-8?q?t:=20multimodal=20RAG=20system=20with=20KG=20visualization?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Full-stack application for document-to-knowledge-graph pipeline: - Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing - Frontend: React 19 + Vite + D3.js + shadcn/ui - Pipeline: MinerU parsing → LangExtract entity extraction → KG building Co-Authored-By: Claude Opus 4.7 --- .gitignore | 39 + CLAUDE.md | 31 + backend/.env.example | 10 + backend/.gitignore | 10 + backend/CLAUDE.md | 28 + backend/main.py | 58 + backend/models/__init__.py | 0 backend/models/schemas.py | 360 + ...4-4a58-824b-7331d50db9bb_content_list.json | 367 + ...e1f-bba4-4a58-824b-7331d50db9bb_origin.pdf | Bin 0 -> 11789 bytes .../8456b615_sample_graphrag_overview/full.md | 71 + ...e76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg | Bin 0 -> 57747 bytes .../layout.json | 4063 +++++++++++ .../parse_summary.json | 10 + backend/pipeline/__init__.py | 0 backend/pipeline/entity_extractor.py | 66 + backend/pipeline/kg_builder.py | 123 + backend/pipeline/qa_agent.py | 217 + backend/pipeline/text_assembler.py | 107 + backend/pyproject.toml | 22 + backend/routers/__init__.py | 0 backend/routers/documents.py | 71 + backend/routers/indexing.py | 70 + backend/routers/kg.py | 72 + backend/routers/query.py | 66 + backend/routers/search.py | 43 + backend/routers/system.py | 171 + backend/services/__init__.py | 0 backend/services/document_service.py | 109 + backend/services/indexing_service.py | 255 + backend/services/kg_service.py | 167 + backend/services/qa_service.py | 85 + backend/services/search_service.py | 106 + backend/storage/__init__.py | 0 backend/storage/file_store.py | 268 + backend/tests/test_api.py | 256 + 
docs/agentic_rag_specification-v1.0.md | 779 +++ docs/backend_service_specification-v1.0.md | 1757 +++++ docs/bridge_pipeline_specification-v1.0.md | 481 ++ docs/frontend_design_specification-v1.0.md | 1232 ++++ docs/langextract_specification-v1.0.md | 604 ++ docs/langextract_specification.md | 672 ++ docs/mineru_specification-v1.0.md | 879 +++ docs/mineru_specification.md | 680 ++ docs/product_requirements_document-v1.0.md | 1442 ++++ frontend/.gitignore | 2 + frontend/ATTRIBUTIONS.md | 3 + frontend/CLAUDE.md | 33 + frontend/README.md | 11 + frontend/guidelines/Guidelines.md | 61 + frontend/index.html | 15 + frontend/package-lock.json | 6034 +++++++++++++++++ frontend/package.json | 95 + frontend/pnpm-lock.yaml | 4676 +++++++++++++ frontend/postcss.config.mjs | 15 + frontend/src/app/App.tsx | 7 + frontend/src/app/api.ts | 279 + .../components/figma/ImageWithFallback.tsx | 27 + .../src/app/components/layout/AppLayout.tsx | 47 + frontend/src/app/components/layout/Header.tsx | 145 + .../src/app/components/layout/Sidebar.tsx | 110 + .../src/app/components/layout/StatusBar.tsx | 34 + .../src/app/components/pages/Dashboard.tsx | 210 + .../src/app/components/pages/Documents.tsx | 439 ++ .../src/app/components/pages/KGExplorer.tsx | 439 ++ frontend/src/app/components/pages/QAChat.tsx | 377 + .../src/app/components/pages/SearchPage.tsx | 469 ++ frontend/src/app/components/ui/accordion.tsx | 66 + .../src/app/components/ui/alert-dialog.tsx | 157 + frontend/src/app/components/ui/alert.tsx | 66 + .../src/app/components/ui/aspect-ratio.tsx | 11 + frontend/src/app/components/ui/avatar.tsx | 53 + frontend/src/app/components/ui/badge.tsx | 46 + frontend/src/app/components/ui/breadcrumb.tsx | 109 + frontend/src/app/components/ui/button.tsx | 58 + frontend/src/app/components/ui/calendar.tsx | 75 + frontend/src/app/components/ui/card.tsx | 92 + frontend/src/app/components/ui/carousel.tsx | 241 + frontend/src/app/components/ui/chart.tsx | 353 + 
frontend/src/app/components/ui/checkbox.tsx | 32 + .../src/app/components/ui/collapsible.tsx | 33 + frontend/src/app/components/ui/command.tsx | 177 + .../src/app/components/ui/context-menu.tsx | 252 + frontend/src/app/components/ui/dialog.tsx | 135 + frontend/src/app/components/ui/drawer.tsx | 132 + .../src/app/components/ui/dropdown-menu.tsx | 257 + frontend/src/app/components/ui/form.tsx | 168 + frontend/src/app/components/ui/hover-card.tsx | 44 + frontend/src/app/components/ui/input-otp.tsx | 77 + frontend/src/app/components/ui/input.tsx | 21 + frontend/src/app/components/ui/label.tsx | 24 + frontend/src/app/components/ui/menubar.tsx | 276 + .../src/app/components/ui/navigation-menu.tsx | 168 + frontend/src/app/components/ui/pagination.tsx | 127 + frontend/src/app/components/ui/popover.tsx | 48 + frontend/src/app/components/ui/progress.tsx | 31 + .../src/app/components/ui/radio-group.tsx | 45 + frontend/src/app/components/ui/resizable.tsx | 56 + .../src/app/components/ui/scroll-area.tsx | 58 + frontend/src/app/components/ui/select.tsx | 189 + frontend/src/app/components/ui/separator.tsx | 28 + frontend/src/app/components/ui/sheet.tsx | 139 + frontend/src/app/components/ui/sidebar.tsx | 726 ++ frontend/src/app/components/ui/skeleton.tsx | 13 + frontend/src/app/components/ui/slider.tsx | 63 + frontend/src/app/components/ui/sonner.tsx | 25 + frontend/src/app/components/ui/switch.tsx | 31 + frontend/src/app/components/ui/table.tsx | 116 + frontend/src/app/components/ui/tabs.tsx | 66 + frontend/src/app/components/ui/textarea.tsx | 18 + .../src/app/components/ui/toggle-group.tsx | 73 + frontend/src/app/components/ui/toggle.tsx | 47 + frontend/src/app/components/ui/tooltip.tsx | 61 + frontend/src/app/components/ui/use-mobile.ts | 21 + frontend/src/app/components/ui/utils.ts | 6 + frontend/src/app/mock-data.ts | 286 + frontend/src/app/routes.tsx | 22 + frontend/src/app/store.tsx | 404 ++ .../product_requirements_document-v1.0.md | 1442 ++++ frontend/src/main.tsx | 7 + 
frontend/src/styles/app.css | 96 + frontend/src/styles/fonts.css | 0 frontend/src/styles/index.css | 3 + frontend/src/styles/tailwind.css | 4 + frontend/src/styles/theme.css | 181 + frontend/tests/integration_test.py | 266 + frontend/vite.config.ts | 22 + 127 files changed, 37218 insertions(+) create mode 100644 .gitignore create mode 100644 CLAUDE.md create mode 100644 backend/.env.example create mode 100644 backend/.gitignore create mode 100644 backend/CLAUDE.md create mode 100644 backend/main.py create mode 100644 backend/models/__init__.py create mode 100644 backend/models/schemas.py create mode 100644 backend/output/8456b615_sample_graphrag_overview/99c9be1f-bba4-4a58-824b-7331d50db9bb_content_list.json create mode 100644 backend/output/8456b615_sample_graphrag_overview/99c9be1f-bba4-4a58-824b-7331d50db9bb_origin.pdf create mode 100644 backend/output/8456b615_sample_graphrag_overview/full.md create mode 100644 backend/output/8456b615_sample_graphrag_overview/images/1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg create mode 100644 backend/output/8456b615_sample_graphrag_overview/layout.json create mode 100644 backend/output/8456b615_sample_graphrag_overview/parse_summary.json create mode 100644 backend/pipeline/__init__.py create mode 100644 backend/pipeline/entity_extractor.py create mode 100644 backend/pipeline/kg_builder.py create mode 100644 backend/pipeline/qa_agent.py create mode 100644 backend/pipeline/text_assembler.py create mode 100644 backend/pyproject.toml create mode 100644 backend/routers/__init__.py create mode 100644 backend/routers/documents.py create mode 100644 backend/routers/indexing.py create mode 100644 backend/routers/kg.py create mode 100644 backend/routers/query.py create mode 100644 backend/routers/search.py create mode 100644 backend/routers/system.py create mode 100644 backend/services/__init__.py create mode 100644 backend/services/document_service.py create mode 100644 backend/services/indexing_service.py 
create mode 100644 backend/services/kg_service.py create mode 100644 backend/services/qa_service.py create mode 100644 backend/services/search_service.py create mode 100644 backend/storage/__init__.py create mode 100644 backend/storage/file_store.py create mode 100644 backend/tests/test_api.py create mode 100644 docs/agentic_rag_specification-v1.0.md create mode 100644 docs/backend_service_specification-v1.0.md create mode 100644 docs/bridge_pipeline_specification-v1.0.md create mode 100644 docs/frontend_design_specification-v1.0.md create mode 100644 docs/langextract_specification-v1.0.md create mode 100644 docs/langextract_specification.md create mode 100644 docs/mineru_specification-v1.0.md create mode 100644 docs/mineru_specification.md create mode 100644 docs/product_requirements_document-v1.0.md create mode 100644 frontend/.gitignore create mode 100644 frontend/ATTRIBUTIONS.md create mode 100644 frontend/CLAUDE.md create mode 100644 frontend/README.md create mode 100644 frontend/guidelines/Guidelines.md create mode 100644 frontend/index.html create mode 100644 frontend/package-lock.json create mode 100644 frontend/package.json create mode 100644 frontend/pnpm-lock.yaml create mode 100644 frontend/postcss.config.mjs create mode 100644 frontend/src/app/App.tsx create mode 100644 frontend/src/app/api.ts create mode 100644 frontend/src/app/components/figma/ImageWithFallback.tsx create mode 100644 frontend/src/app/components/layout/AppLayout.tsx create mode 100644 frontend/src/app/components/layout/Header.tsx create mode 100644 frontend/src/app/components/layout/Sidebar.tsx create mode 100644 frontend/src/app/components/layout/StatusBar.tsx create mode 100644 frontend/src/app/components/pages/Dashboard.tsx create mode 100644 frontend/src/app/components/pages/Documents.tsx create mode 100644 frontend/src/app/components/pages/KGExplorer.tsx create mode 100644 frontend/src/app/components/pages/QAChat.tsx create mode 100644 
frontend/src/app/components/pages/SearchPage.tsx create mode 100644 frontend/src/app/components/ui/accordion.tsx create mode 100644 frontend/src/app/components/ui/alert-dialog.tsx create mode 100644 frontend/src/app/components/ui/alert.tsx create mode 100644 frontend/src/app/components/ui/aspect-ratio.tsx create mode 100644 frontend/src/app/components/ui/avatar.tsx create mode 100644 frontend/src/app/components/ui/badge.tsx create mode 100644 frontend/src/app/components/ui/breadcrumb.tsx create mode 100644 frontend/src/app/components/ui/button.tsx create mode 100644 frontend/src/app/components/ui/calendar.tsx create mode 100644 frontend/src/app/components/ui/card.tsx create mode 100644 frontend/src/app/components/ui/carousel.tsx create mode 100644 frontend/src/app/components/ui/chart.tsx create mode 100644 frontend/src/app/components/ui/checkbox.tsx create mode 100644 frontend/src/app/components/ui/collapsible.tsx create mode 100644 frontend/src/app/components/ui/command.tsx create mode 100644 frontend/src/app/components/ui/context-menu.tsx create mode 100644 frontend/src/app/components/ui/dialog.tsx create mode 100644 frontend/src/app/components/ui/drawer.tsx create mode 100644 frontend/src/app/components/ui/dropdown-menu.tsx create mode 100644 frontend/src/app/components/ui/form.tsx create mode 100644 frontend/src/app/components/ui/hover-card.tsx create mode 100644 frontend/src/app/components/ui/input-otp.tsx create mode 100644 frontend/src/app/components/ui/input.tsx create mode 100644 frontend/src/app/components/ui/label.tsx create mode 100644 frontend/src/app/components/ui/menubar.tsx create mode 100644 frontend/src/app/components/ui/navigation-menu.tsx create mode 100644 frontend/src/app/components/ui/pagination.tsx create mode 100644 frontend/src/app/components/ui/popover.tsx create mode 100644 frontend/src/app/components/ui/progress.tsx create mode 100644 frontend/src/app/components/ui/radio-group.tsx create mode 100644 
frontend/src/app/components/ui/resizable.tsx create mode 100644 frontend/src/app/components/ui/scroll-area.tsx create mode 100644 frontend/src/app/components/ui/select.tsx create mode 100644 frontend/src/app/components/ui/separator.tsx create mode 100644 frontend/src/app/components/ui/sheet.tsx create mode 100644 frontend/src/app/components/ui/sidebar.tsx create mode 100644 frontend/src/app/components/ui/skeleton.tsx create mode 100644 frontend/src/app/components/ui/slider.tsx create mode 100644 frontend/src/app/components/ui/sonner.tsx create mode 100644 frontend/src/app/components/ui/switch.tsx create mode 100644 frontend/src/app/components/ui/table.tsx create mode 100644 frontend/src/app/components/ui/tabs.tsx create mode 100644 frontend/src/app/components/ui/textarea.tsx create mode 100644 frontend/src/app/components/ui/toggle-group.tsx create mode 100644 frontend/src/app/components/ui/toggle.tsx create mode 100644 frontend/src/app/components/ui/tooltip.tsx create mode 100644 frontend/src/app/components/ui/use-mobile.ts create mode 100644 frontend/src/app/components/ui/utils.ts create mode 100644 frontend/src/app/mock-data.ts create mode 100644 frontend/src/app/routes.tsx create mode 100644 frontend/src/app/store.tsx create mode 100644 frontend/src/imports/product_requirements_document-v1.0.md create mode 100644 frontend/src/main.tsx create mode 100644 frontend/src/styles/app.css create mode 100644 frontend/src/styles/fonts.css create mode 100644 frontend/src/styles/index.css create mode 100644 frontend/src/styles/tailwind.css create mode 100644 frontend/src/styles/theme.css create mode 100644 frontend/tests/integration_test.py create mode 100644 frontend/vite.config.ts diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1480d44 --- /dev/null +++ b/.gitignore @@ -0,0 +1,39 @@ +# IDE / Editor +.idea/ +.vscode/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Dependencies +node_modules/ +.pnpm-store/ + +# Build output +dist/ +build/ + +# 
Environment & secrets +.env +.env.local +.env.*.local + +# Python +__pycache__/ +*.pyc +*.pyo +.venv/ +*.egg-info/ + +# Logs +*.log + +# OMC +.omc/ +**/.git_embedded_backup/ + +# Claude Code personal config +settings.json diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..ab3dc8e --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,31 @@ +# GraphRAG Studio — Project Conventions + +## 1. 目录结构 + +- **前端代码** 统一放在 `frontend/` 目录下 +- **后端代码** 统一放在 `backend/` 目录下 + +``` +GraphRAGAgent/ +├── frontend/ # 所有前端代码(HTML/CSS/JS) +├── backend/ # 所有后端代码(FastAPI 服务) +└── docs/ # 规范文档 +``` + +## 2. 环境变量与敏感配置 + +- 所有外部配置(API Key、Base URL、Token 等)统一在 `backend/.env` 中管理 +- `.env` 文件**禁止提交到 Git**,必须在 `.gitignore` 中忽略 +- 提供 `backend/.env.example` 作为配置模板(不含真实值) + +## 3. 后端虚拟环境 + +- 后端服务必须使用 `uv` 创建独立虚拟环境: + +```bash +cd backend +uv venv +uv pip install -r requirements.txt +``` + +- 虚拟环境目录 `.venv/` 不提交到 Git diff --git a/backend/.env.example b/backend/.env.example new file mode 100644 index 0000000..d7a1c42 --- /dev/null +++ b/backend/.env.example @@ -0,0 +1,10 @@ +# DeepSeek API (required for entity extraction + QA) +DEEPSEEK_API_KEY=your_deepseek_api_key_here +DEEPSEEK_BASE_URL=https://api.deepseek.com + +# MinerU (required for document parsing) +MINERU_API_TOKEN=your_mineru_api_token_here + +# MinerU venv path (absolute path to python.exe) +MINERU_PYTHON=F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe +MINERU_PIPELINE=F:/GraphRAGAgent/mineru_mvp/pipeline.py diff --git a/backend/.gitignore b/backend/.gitignore new file mode 100644 index 0000000..87b96af --- /dev/null +++ b/backend/.gitignore @@ -0,0 +1,10 @@ +.env +.venv/ +__pycache__/ +*.pyc +*.pyo +data/uploads/ +data/jobs/ +data/kg/ +*.egg-info/ +dist/ diff --git a/backend/CLAUDE.md b/backend/CLAUDE.md new file mode 100644 index 0000000..fe37c1b --- /dev/null +++ b/backend/CLAUDE.md @@ -0,0 +1,28 @@ +# Backend — GraphRAG Studio API + +## 路径 + +``` +F:\GraphRAGAgent\backend\ +``` + +## 启动命令 + +```bash +cd 
F:/GraphRAGAgent/backend +.venv/Scripts/python.exe -m uvicorn main:app --host 0.0.0.0 --port 8000 --reload +``` + +## 接口测试 + +服务启动后,运行: + +```bash +.venv/Scripts/python.exe tests/test_api.py +``` + +## API 文档 + +- Swagger UI:http://localhost:8000/docs +- ReDoc:http://localhost:8000/redoc +- 健康检查:http://localhost:8000/api/v1/health diff --git a/backend/main.py b/backend/main.py new file mode 100644 index 0000000..0bcd2e7 --- /dev/null +++ b/backend/main.py @@ -0,0 +1,58 @@ +""" +GraphRAG Studio — FastAPI Backend +Entry point: uvicorn main:app --host 0.0.0.0 --port 8000 --reload +""" +import sys +from pathlib import Path + +# Ensure backend/ is in sys.path for absolute imports +sys.path.insert(0, str(Path(__file__).parent)) + +from dotenv import load_dotenv +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware + +load_dotenv(Path(__file__).parent / ".env", override=True) + +from routers import documents, indexing, kg, query, search, system + +app = FastAPI( + title="GraphRAG Studio API", + description="Multimodal RAG Q&A system backend — MinerU + LangExtract + Agentic-RAG", + version="1.0.0", + docs_url="/docs", + redoc_url="/redoc", +) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +# All routers under /api/v1. Each router carries its own sub-prefix. +# documents.router prefix="/documents" → /api/v1/documents +# indexing.router prefix="/index" → /api/v1/index +# kg.router prefix="/kg" → /api/v1/kg +# query.router prefix="/query" → /api/v1/query +# search.router prefix="/search" → /api/v1/search +# system.router no prefix → /api/v1/health, /api/v1/system/... 
+PREFIX = "/api/v1" +app.include_router(documents.router, prefix=PREFIX) +app.include_router(indexing.router, prefix=PREFIX) +app.include_router(kg.router, prefix=PREFIX) +app.include_router(query.router, prefix=PREFIX) +app.include_router(search.router, prefix=PREFIX) +app.include_router(system.router, prefix=PREFIX) + + +@app.get("/") +async def root(): + return {"msg": "GraphRAG Studio API v1.0.0", "docs": "/docs", "health": "/api/v1/health"} + + +if __name__ == "__main__": + import uvicorn + uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True) diff --git a/backend/models/__init__.py b/backend/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/models/schemas.py b/backend/models/schemas.py new file mode 100644 index 0000000..aa3dbc8 --- /dev/null +++ b/backend/models/schemas.py @@ -0,0 +1,360 @@ +""" +Pydantic v2 schemas — all API data objects per backend_service_specification-v1.0.md +""" +from __future__ import annotations + +import uuid +from typing import Any, Generic, Optional, TypeVar + +from pydantic import BaseModel, Field + +T = TypeVar("T") + + +# --------------------------------------------------------------------------- +# Universal response envelope +# --------------------------------------------------------------------------- + +class APIResponse(BaseModel, Generic[T]): + code: int = 0 + msg: str = "success" + request_id: str = Field(default_factory=lambda: str(uuid.uuid4())) + data: Optional[T] = None + + @classmethod + def ok(cls, data: Any = None) -> "APIResponse": + return cls(code=0, msg="success", data=data) + + @classmethod + def err(cls, code: int, msg: str) -> "APIResponse": + return cls(code=code, msg=msg, data=None) + + +# --------------------------------------------------------------------------- +# A. 
Document schemas +# --------------------------------------------------------------------------- + +class DocumentInfo(BaseModel): + doc_id: str + filename: str + format: str + size_bytes: int + pages: Optional[int] = None + uploaded_at: str + status: str # uploaded | indexed | failed + language: str = "ch" + enable_formula: bool = True + enable_table: bool = True + + +class DocumentListData(BaseModel): + total: int + page: int + page_size: int + items: list[DocumentInfo] + + +class DeleteDocumentData(BaseModel): + deleted: bool + doc_id: str + removed_nodes: int + removed_edges: int + + +# --------------------------------------------------------------------------- +# B. Indexing job schemas +# --------------------------------------------------------------------------- + +class IndexingProgress(BaseModel): + parsed_pages: int = 0 + total_pages: int = 0 + extracted_entities: int = 0 + + +class IndexingJobStatus(BaseModel): + job_id: str + doc_id: str + status: str # submitted|queued|parsing|extracting|indexing|done|failed|cancelled + stage: str = "" + progress: IndexingProgress = Field(default_factory=IndexingProgress) + created_at: str + elapsed_seconds: float = 0.0 + error: Optional[str] = None + + +class StartIndexRequest(BaseModel): + doc_id: str + + +class CancelJobData(BaseModel): + cancelled: bool + job_id: str + previous_status: str + + +# --------------------------------------------------------------------------- +# C. 
KG schemas +# --------------------------------------------------------------------------- + +class KGNode(BaseModel): + id: str + name: str + type: str + source_doc: str + char_start: Optional[int] = None + char_end: Optional[int] = None + confidence: Optional[str] = None + page: int = 0 + degree: int = 0 + + +class KGNodeDetail(KGNode): + degree_centrality: float = 0.0 + neighbor_count: int = 0 + + +class KGEdge(BaseModel): + source: str + target: str + relation: str = "CO_OCCURS_IN" + doc_id: str + page: int = 0 + + +class KGNodeListData(BaseModel): + total: int + page: int + page_size: int + items: list[KGNode] + + +class KGEdgeListData(BaseModel): + total: int + page: int + page_size: int + items: list[KGEdge] + + +class KGStatsData(BaseModel): + total_nodes: int + total_edges: int + density: float + type_distribution: dict[str, int] + relation_types: dict[str, int] + top5_central_nodes: list[dict] + source_documents: list[str] + + +class KGExportData(BaseModel): + format: str + doc_id: Optional[str] + total_nodes: int + total_edges: int + exported_at: str + nodes: list[KGNode] + edges: list[KGEdge] + + +class NeighborInfo(BaseModel): + id: str + name: str + type: str + page: int + + +class NeighborsData(BaseModel): + center: NeighborInfo + hops: int + neighbors_by_hop: dict[str, list[NeighborInfo]] + total_neighbors: int + + +# --------------------------------------------------------------------------- +# D. 
QA schemas +# --------------------------------------------------------------------------- + +class ChatMessage(BaseModel): + role: str # human | ai + content: str + + +class QueryRequest(BaseModel): + question: str + history: list[ChatMessage] = Field(default_factory=list) + + +class ToolCallRecord(BaseModel): + tool: str + input: dict + output: str + + +class QAResult(BaseModel): + query_id: str + question: str + answer: str + tool_calls: list[ToolCallRecord] = Field(default_factory=list) + cited_nodes: list[str] = Field(default_factory=list) + elapsed_seconds: float + created_at: str + + +class QAHistoryData(BaseModel): + total: int + page: int + page_size: int + items: list[QAResult] + + +class BatchQueryRequest(BaseModel): + questions: list[str] = Field(..., max_length=20) + + +class BatchQueryData(BaseModel): + batch_id: str + total: int + status: str + created_at: str + + +class BatchResultData(BaseModel): + batch_id: str + total: int + completed: int + failed: int + status: str + results: list[QAResult] + + +# --------------------------------------------------------------------------- +# E. Search schemas +# --------------------------------------------------------------------------- + +class EntitySearchData(BaseModel): + query: str + total: int + items: list[KGNode] + + +class PathNode(BaseModel): + id: str + name: str + type: str + + +class PathEdge(BaseModel): + source: str + target: str + relation: str + + +class PathInfo(BaseModel): + length: int + nodes: list[PathNode] + edges: list[PathEdge] + + +class PathSearchData(BaseModel): + from_node: PathNode = Field(alias="from") + to_node: PathNode = Field(alias="to") + max_hops: int + paths: list[PathInfo] + total_paths: int + + model_config = {"populate_by_name": True} + + +class GraphSearchData(BaseModel): + query: str + matched_nodes: list[KGNode] + subgraph_edges: list[KGEdge] + + +# --------------------------------------------------------------------------- +# F. 
System schemas +# --------------------------------------------------------------------------- + +class ComponentHealth(BaseModel): + status: str # ok | error + path: Optional[str] = None + exists: Optional[bool] = None + base_url: Optional[str] = None + key_configured: Optional[bool] = None + kg_nodes_exists: Optional[bool] = None + kg_edges_exists: Optional[bool] = None + uploads_dir_exists: Optional[bool] = None + + +class HealthData(BaseModel): + status: str + version: str + uptime_seconds: float + components: dict[str, ComponentHealth] + + +class SystemStatsData(BaseModel): + total_documents: int + indexed_documents: int + failed_documents: int + total_nodes: int + total_edges: int + type_distribution: dict[str, int] + total_queries: int + active_jobs: int + storage_used_mb: float + + +class FormatInfo(BaseModel): + ext: str + description: str + max_size_mb: int + max_pages: int + requires_ocr: bool + + +class FormatsData(BaseModel): + formats: list[FormatInfo] + ocr_languages: list[dict] + notes: list[str] + + +class DemoData(BaseModel): + nodes: list[KGNode] + edges: list[KGEdge] + stats: dict + + +# --------------------------------------------------------------------------- +# B3 index result +# --------------------------------------------------------------------------- + +class IndexResultStats(BaseModel): + blocks: int = 0 + block_types: dict[str, int] = Field(default_factory=dict) + pages: int = 0 + raw_extractions: int = 0 + nodes: int = 0 + edges: int = 0 + type_counts: dict[str, int] = Field(default_factory=dict) + alignment_counts: dict[str, int] = Field(default_factory=dict) + elapsed_seconds: float = 0.0 + + +class ExtractionRecord(BaseModel): + text: str + type: str + char_start: Optional[int] = None + char_end: Optional[int] = None + alignment: Optional[str] = None + page: int = 0 + doc_id: str + + +class IndexResultData(BaseModel): + job_id: str + doc_id: str + status: str + stats: Optional[IndexResultStats] = None + extractions: 
Optional[list[ExtractionRecord]] = None + nodes: Optional[list[KGNode]] = None + edges: Optional[list[KGEdge]] = None diff --git a/backend/output/8456b615_sample_graphrag_overview/99c9be1f-bba4-4a58-824b-7331d50db9bb_content_list.json b/backend/output/8456b615_sample_graphrag_overview/99c9be1f-bba4-4a58-824b-7331d50db9bb_content_list.json new file mode 100644 index 0000000..6388389 --- /dev/null +++ b/backend/output/8456b615_sample_graphrag_overview/99c9be1f-bba4-4a58-824b-7331d50db9bb_content_list.json @@ -0,0 +1,367 @@ +[ + { + "type": "text", + "text": "GraphRAG System ", + "text_level": 1, + "bbox": [ + 344, + 175, + 655, + 204 + ], + "page_idx": 0 + }, + { + "type": "text", + "text": "Technical Architecture Overview ", + "bbox": [ + 289, + 234, + 710, + 254 + ], + "page_idx": 0 + }, + { + "type": "text", + "text": "Version 1.0 | March 2026 ", + "bbox": [ + 364, + 272, + 633, + 290 + ], + "page_idx": 0 + }, + { + "type": "text", + "text": "1. Abstract ", + "text_level": 1, + "bbox": [ + 52, + 42, + 200, + 61 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "This document presents the technical architecture of a Multimodal GraphRAG System designed for intelligent document parsing and knowledge graph construction. The system integrates MinerU for document parsing, LangExtract for structured entity extraction, and a graph database for knowledge storage and retrieval. ", + "bbox": [ + 48, + 83, + 951, + 171 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files. Extracted entities and relations are stored as graph nodes and edges, enabling semantic search and question answering over large document collections. ", + "bbox": [ + 48, + 200, + 949, + 265 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "2. 
System Components ", + "text_level": 1, + "bbox": [ + 50, + 299, + 321, + 318 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "2.1 Document Parsing Module ", + "text_level": 1, + "bbox": [ + 50, + 343, + 349, + 361 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG, JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted images. ", + "bbox": [ + 48, + 373, + 951, + 436 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "2.2 Entity Extraction Module ", + "text_level": 1, + "bbox": [ + 50, + 461, + 357, + 479 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes character-level position anchoring. ", + "bbox": [ + 48, + 492, + 949, + 555 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "2.3 Knowledge Graph Module ", + "text_level": 1, + "bbox": [ + 50, + 580, + 337, + 596 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "Extracted entities and relationships are stored in a graph database. Node types include: Person, Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY, LOCATED_IN. ", + "bbox": [ + 48, + 608, + 949, + 674 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "2.4 Retrieval Module ", + "text_level": 1, + "bbox": [ + 50, + 697, + 272, + 715 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "The retrieval layer supports hybrid search combining vector similarity and graph traversal. \nQuery results are ranked by relevance score and returned with source document references. ", + "bbox": [ + 48, + 727, + 944, + 766 + ], + "page_idx": 1 + }, + { + "type": "text", + "text": "3. 
Data Pipeline ", + "text_level": 1, + "bbox": [ + 50, + 42, + 268, + 61 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "The end-to-end data pipeline consists of the following stages: ", + "bbox": [ + 50, + 83, + 623, + 99 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "Stage 1: Document Ingestion ", + "bbox": [ + 68, + 130, + 322, + 146 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "- Accept raw documents (PDF, DOCX, images, HTML) - Submit to MinerU API for parsing - Poll task status until state $\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }$ done ", + "bbox": [ + 85, + 153, + 531, + 217 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "Stage 2: Content Extraction ", + "bbox": [ + 68, + 249, + 322, + 263 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "- Download and decompress full_zip_url - Parse content_list.json into Document objects - Separate text blocks, tables, images, equations ", + "bbox": [ + 85, + 272, + 542, + 335 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "Stage 3: Entity & Relation Extraction ", + "bbox": [ + 67, + 367, + 415, + 381 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "- Feed text blocks to LangExtract - Extract entities with char_interval positions - Extract relationships between entities ", + "bbox": [ + 85, + 390, + 526, + 454 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "Stage 4: Graph Construction ", + "bbox": [ + 68, + 485, + 322, + 500 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "- Map extractions to graph nodes and edges - Store with source provenance (page_idx, bbox) - Build vector embeddings for semantic search ", + "bbox": [ + 85, + 508, + 522, + 571 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "4. 
Supported File Formats ", + "text_level": 1, + "bbox": [ + 50, + 604, + 326, + 620 + ], + "page_idx": 2 + }, + { + "type": "table", + "img_path": "images/1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg", + "table_caption": [], + "table_footnote": [], + "table_body": "
FormatExtensionOCR RequiredModeI
PDF (text). pdfNopipeline / vlm
PDF (scan). pdfYesvIlm
Word. docxNopipeline
PowerPoint.pptxNopipeline
Image.png / .jpgAutovIlm
HTML.htmlNoMinerU-HTML
", + "bbox": [ + 45, + 634, + 882, + 806 + ], + "page_idx": 2 + }, + { + "type": "text", + "text": "5. API Configuration Reference ", + "text_level": 1, + "bbox": [ + 48, + 42, + 457, + 63 + ], + "page_idx": 3 + }, + { + "type": "text", + "text": "The following environment variables must be configured before running the MinerU parsing service: ", + "bbox": [ + 48, + 83, + 952, + 123 + ], + "page_idx": 3 + }, + { + "type": "text", + "text": "MINERU_API_TOKEN : Bearer token for API authentication \nMINERU_USER_UID : User UUID for quota management \nMINERU_BASE_URL : https://mineru.net/api/v4 \nMINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML \nMINERU_LANGUAGE : ch (Chinese) | en (English) \nMINERU_IS_OCR : false (text PDF) | true (scanned PDF) \nMINERU_ENABLE_FORMULA: true | false \nMINERU_ENABLE_TABLE : true | false ", + "bbox": [ + 65, + 152, + 636, + 337 + ], + "page_idx": 3 + }, + { + "type": "text", + "text": "Rate Limits: ", + "bbox": [ + 48, + 367, + 161, + 381 + ], + "page_idx": 3 + }, + { + "type": "text", + "text": "- Max file size : 200 MB per file - Max pages : 600 pages per file - Daily quota : 2000 pages (high priority) - Batch limit : 200 files per request ", + "bbox": [ + 65, + 388, + 504, + 478 + ], + "page_idx": 3 + } +] \ No newline at end of file diff --git a/backend/output/8456b615_sample_graphrag_overview/99c9be1f-bba4-4a58-824b-7331d50db9bb_origin.pdf b/backend/output/8456b615_sample_graphrag_overview/99c9be1f-bba4-4a58-824b-7331d50db9bb_origin.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9346f8e9b427cb03a126f4e83112c05e79c18693 GIT binary patch literal 11789 zcmc(FcUV)+w=N=xfQZr*1&PvoIsuUm(osN=DlI@L0ZHghl&VNCQUvLU^dbtqLR6GOP}>(z9%M*;+I8CxJR6k=i&NVFM#0x)jmuZh&`k!VR%EXodjJPSZj z4uzEmpdD>&a0c+I%EqpCj#z-88X75WXX9v#CNyIJc!$-0O(JcKHMX&{_-z`z)4!&X zwnJl)Xe6zX|7H-c`^%sl$_9&cz&UGUj77>I zP3_E(6oSe~v<21@00sf^S{SSY(%6>5CD9gW$0QmR(;X4OSg4z|I*EII*i95Q>vF5KfQ 
zb)?w@%aW4kWKW)Ogl^M_PLWfnqz^J%&>ZG_vv%!XPGsktVvy7GuDh|W_oNUGHm7GM zzO313pe3c3XnOBVFwoAx|p!&5H#gRAkr0h9Mo+C2yQw{}a{ zE*b?oXHKLgvN~0k(geFQnmUwqXgxo1vWd44zmCV?j#m`FGk!(ppHRd9hfp7Fp+A~5NwUQKBB{oeVdyB(erPN!A z#ceqwhqmS~Xh7-dc`i>By}XS$ zm%_gA8AK<`P@S^bCA(E&MtKWv(fzY9_awE^hP5*a}K{l7CaRs6J_UZTg(sDZ~D%?z5n2BC9<1$^J(`5@37ZCfgasP`l;Pj zC*nzYTtcO5^;%qD!1gOCf~f_EDXvrO74Q?^IUff~R|ekDsgwRXa$ffF&^&Q$@3JM6 zXkSvFn1f3ax8cjdK_p{dr7dY}S*&`LLYaPLRCt4sG)1QrkhCRaGL2zir}5-uHh)P= zyZ+;|B)$QutaLGrZqdvdw-98^F?v__TayMkMFrHh8&w%ClBHK!JycDKZs-?tUBX;O zPoD}SrhSj}Khc?J)A1ia-J=5 zsU@E?$=J*5>3uj{18ZBTM8Dtk{cE64oKwkWBUDbMiP7HiTl*`jpA=GT4F=ZKC?ySm znt%E&VcEVch=<^Z1Xk%3pr}eU|4N~OmOOddCc{t1fwjz!{hqZ`%dz|}UtUmo1>c^} z7tv^XA7+xR{eGCcy7q3UvNzk#9i)R}80LNZPrK{=g)ddCr=Dr~RDUU^rGDc_T-pVq zN!&Qy&mg1zsOB<95TC541xIP3DEXM}lPR{7s74=SQm0l}@ygjbz9JJ5WfI*_t=z#( z37RwKVyzN8yP=2jv6de5m*Nxl5b?}?Ofz-X080D8T4m)ZG>|h}I5syRi@!q9R^*Ag)lz%@ zbZYOJ`wAX`Z>0Al!s`66`@1g|hTDHo?ZUn@j46Y-MaG8BQQj@A=`#j>I#&z;U;4Dw z?Sp-X?|g_#Z6KSbx^-Vd5%~Ra2%}}h*9Vu0E@bj<@oS*q)MN>#Q|sTh7^ET|o!)1t zmA-34lk9qOV@R_(&eZhO+JIGL@tgr`^lOZ0n>3F{v-F!(sd%V5MStyWifBSXEi;=d$uZ`OeXB~2U5kpSIX02 zb?wz|cpleaqv{ZmE!aT0hKSGl4B<=}jf=F-ae=fYjK35{*|*gi-T$gveqScjVF)qI zOclyOPzDXI@??|4eVF<;gd9OiZG>@ z)fBaViJiFG`9?0f!#<{x`J7Qs$|vcJ-57 zCP#^{H&ynOIAmRgxp9?JCUXxhWwk~X9~&q8&S^oM^M(=k-xUadSMsZ1{8NFBi}`<7 z?7{e3mVa&_{-{8B-CqX(V+8^ug#T56o@y&$KEuz+u?ydqSN&l>bsIW;hObg(itHg- za##AdH!9Rsj&e6y63jBocR$vQkFras14x{qbh3f_POF~d0ut|k%BOSj9NIb5OtW}t z>qko|R`0coFWa|rrC1rl6g7p%51Ph{LTwrlmBtz|^>srqy<*0WL5eQQn5wzTnr|f} zSyJ{HbQykoNz8Q(Od+J7i?0(IG&s9AP1+lq?-?7JoplztOFTe65?R|CZ&2%ObzX47 zjWii{J;Or4nD1(-(zdv9@~M}@?@H-ZKFl^vVn#;a4_R*pWmfsW;ar*wS64QcKaZ8` zQBp>o;hju(ySqP9x61Tlc4a2mp5xp?ROXUjZ5-;wj{j(Jc~pZE#~|5mRpaLoU;7ut zQr&|M*JDCIa#^!uVXD&?A3kU8idd64fiNnCsttxl&BHoNGO$u6%9Dwur6DJy6=&R- zz7A8=2le|N^pHnLQ6S#a#;-(tzyA}kvlSET={;mwv6U$}e3?e2ClZx4*x>F&Djv47 zBl$t`TweIJ!bn&$!^|sgSM0DWj8%;|5<+xSmS$?n*W92axwRgquQq5!SD0NECL355 zvpBYf=77UEH)W3JG^TlgZBN<2xyDAO 
zMjNWYndxVcnbs7>$gti|=UEh`t#3aoOl48HeX9_r*lt++rNCY^ZLFu^P%Vz~rL0b9 zmItLhW80;qoL98jrQEOw2bO6F;cUIRX@p_K2v1o$YnntB)=_4SC7hy+p0Q({J~%g< zZ9p?D0QFu;x@x8!>slf>_cYeQ@BIZ&{ zQsJq$VW8koAyzzm!Tzb}tIOq$Yr-S$^p;bnWOC#0pAwg5TS|ELlUHxTwxFw!s2ioO zS9zA>kycWvkaAuK<~lUvUFd$_2#FTvik%iN|2po2soDdOX?ej->m3({z{~ z-}Ii3P$pFvkk5~MF+$vH@mh!2nz;TzJ>5vFdN$MRi$+EM<5oE{ErR<=&TmnJZ$8bq z$|kqih0Op@Tu%B)p3M^w-PmY+Cu!P=JJ6bvyQDH=?2T{!6Dh9Ilto3ljL`fi6hZ(= zto}ywkQ}Fi?Z#)Pi}jy;XQfJeS6yFH-p;lUgy@}lb%hQt!yZ`aO{|l6>Jra2!Ry7E zj|{qcUp!>yC=+V~DON^GE>`x(lW7JLU#AzHp>jcY2oUF9hj#3W#hZ>w^A3tguN?mfhD;BDz3lCt>Z9&1*CHCf}JTgOysBSZ!#Av!%UaoAZp$JEauo-oIJK_?f=iw6mhT>c(S! zr(hG7J>gs<)HCxZ3~56*-HW4hL@rbW_UdpOD~HaHrJppD`@j@;W#rUrUv8dK3gB0w zv@>oGj3Y(Tx!kXOTrv1509^h_lmUJA`R^2szu*6rQU6KNpyM3;-{$ClJ>>n7qmPt- znfy;lT1e=Rr-VD>cD=xJV)1#*s-L@=Ip%+7blaykax42XfX5159AY^i`yO!Us$^#PKin!>_2Gf?I=jeC?5^*?~jqE_a8BqwOD66FyQ-;CB=D{2o`oXX3cMS$0fbK|k%BQMG* z2hZhD1dFAexc_qhs#W)>9xXSLNvZn*d!EQdULm{e?l!M z>L5%fcj}vS-KiNbWa*hsZ{y}7#TIt+NIR0nCpWI=^GNBjdD$_)%8fY~=oEQyF=Ba9 zVA))-D|CdH)I(PAL++16>)KHozt9VUK3uMA&hMU2?VcO+bv2l{lXplCySz?a8X{6J z*1w4Q#u1sM(SP4mAcayOoB2?DbDP;j$Xkk|?QWb+WK$wNCS!=7eZ7~9{pFRhLlJ0J zaJJ!FZ=wU;UF*LuPtfoDkCZV6DTm`s0D>wwl1NgLR|91$k3>-j${;bOxQBjYG#1~D zqlQ3uHV4lH$)a&22nuZh5L7V3aVsdSE5AI2poXIf_J{<+Q$ax7)4tsuGz!m_06+v% z>6jzJTl{w6eNP-^gAqagC!bssGNl2I=NfM;vJ}H6(CG<%jNpKiJk|9Vk z1WEQtf*p^PBlHohfC%{DvPUC91blG9d>{foxHO@UfKP}p1BieRiVG?(JRkx-Avi%I z;1eQ52}HmLJMJUkgCFY=@IeUJK?Ho@V_yjPAcP2l2>67KeIek367Yct_#lLMf(ZB^ zg!#b7hzNb)V?=~VfR9(~cq9Q4oZvP1c*UfS5P^?ZjDQ_XKm;cE0wy4W9!ms7(BqNE zD@Is%FaZ&afC#6K-&xROUyczS`$9kjC9DycfCxsgf(r;x_XC{szag$)vylLb{!7z{39Yop9?#|DG~rGYDFS^$L5@5Sf83fAux=l_)> z2g6{$RUwzeZ*_Cx$Uw(q1_T6}&$=lbGoL>u2o4B1dWph-3i@vG!54S1Aw@O)=M5CzuSY7CrK!^|c^<{V?~eb13mhlkOs1*?g!9$=Pf>kr^B)Wk~= zHI3f!-ht=5DA)Te`n9dw!&cOMPume~KDUTvV6doJZ5KD%FSsjoVOBdvKHAcEM(|b+ z<%I&Vy{_sX;{X}Z>H9YY6O+ZaN?vKSNU&hQjwgt|x6HIWW2yJ(_G{`>nWWpYd-jOq z?fum?$a;ehRYGfA#`KsD0>jJ%jru=+iS8RyBe$Fj z{rDr}%evCX=d^Z@?#bUo7xb=+0m1~;=D+E*xaJ6W2kyLz5=*HFi?XXac+S%4S?9Ge 
zCzC}aH74q?Hn(x`ecM``J$*}TCv2qd6F^kq@~BgmbHI}Eo@*{U0oM-TT zuy38{+vnc~r>d3NQX}}K%D$N`^Cx16WQ27hg@?gQ&z&|qCm)%KzYJgGruN-DOoBN) zQB>s84U&94sKJpeMSb7!L4vkIaMDTYYiK*-TRvYe&NtT2Z;WzqBw-U-DVh~u=S%CJ zy{lO>FEW~vre{W@Y5TD*o|MagSAxgyJpxOgc6J)t7vl4L!;p1}rq6oZ2ZUAa@D}z?PWXnN zY)ncw`S?JnO}t*zzD+#&mP|OHM8Z9+q@=tqZszouIlLUGroO#+^VHlMp*Ld}yaS@_ zR31#^<*mgDA$C?sMK1-6O~{DQ%*&gkHK>`&WbJoFWoh`Rx{A<$jatg35&7~ps)u_3 z(fBl-N^AS)x1U(n&nxfPu|-=?#>k#^bUvcmj*=F2aU{9UUpvQ3x27~uH0^D#Me6OX zVoAb%zIuyxjcuyVaLu6Svs%SUJ^31!Nw@n9w*yo4J>&s>uKvk@3vEaj8!ArOV6tJ` zS%37VbND{P&Z!;RHHI#i*nLiqf{q*ARgI&wUX2H=TjyEUZf?sSD($nZly7fsd(Vo~ ztku5(@{6+T(|M`AVc)NMM$1C078Kjazj|lYaaFL1MKK&Jp&MoxJ{BbwW-&wRs!@H_ zTj>z9!}&NzG&@ntP^&tBUt%Y2A=}Y0c{jZx#Y^GfQmM|u0cGlz^LcMNsVTXvJZSx9 z!1!L$?l-%HWiIc!sR*LC=t#2GhOk6zw0Q}8$5Xcwh;X0nM+^6B6eo3pQeCWadd<`N z%}Q>^HavJ};{V{jyU9X4hjqjPqvQ~bO}JD=5K4cYbH#^^eD~S44q-`6{-zDHJF92d zW2t3y)XUhGedDj73@1dqaxbOiKNE(0B5^dhjfpq0PTgD7GHHE{{tTX)$O^k4)Ozo# zL*!j!eJ9iAM*pOnz0(M-A7&0lM&N>*)6wk+d$S29<@*(3siY{5ggi7Iv+{Y97fH;| zlsCez$+&q%G_Rf81Svl5p~d!omGVn77l?h5VFF5Bq{+2-l$hljCV=J>6raE4@*w*B z1ac)D5i{sjw8df;`L1n3@m1xH$M+p^cO@@M=Zt!=cH3g%>##7-eHl8P{fpxCvKktZ zrtcCVk#dssBW|M+2l|O$A3onI&yjT92)_}dm!rsghpuyEpucIb<%# z@WbQ*aAPMu+s)xkR`#oEq*Ao`ym1}&ZJ{M4$8z+0eLj~UO9pHI#8Mgo|D^2q)kH%M zOB|P(TTdA_$YrHpHT#q(A3nP;8~0G)Do0;nuJRoc%7^9G zGOk!L(WO_sb2q7B8}gHhMf3&n%D&WydS5uvR@~KrG18yJj)$7)O+O+jg<%&Y8Lkog z>{i%@ec`VjYD7ievP!oez9p)kJD=d-D%CaKoSJ{lIP>n176 zi~PDpowYg72GkdQ?7uNurLKQ)ka3m|l)F45v^;v^v!0=$W5{)1l!A$FYBZHo=_TNp zQ}6&~5X_35)@N!ZZRG}ykFU$o{+%KI;6 z53pFWDWoncI5_2&-9T(ge0hRgF+PEzf97@%)9Eq%s=KTEiU?L}c=P%p^vgi2YR}i| z&cu61A&YoXt)^a{qdpx;wSk5L2d?PC1oVF{}hdu1LbB(sHm=wtMaL^;0bZ zaeI4LG1VPSrAh+@cK2#j?e6(t7ketZOtcF-;+C&M8L2Bh72p?as*CRN+*lz(76 z=<;ILHBb1Px}#}g9!W7_Vo-P8nAo4cxl2JmfvVeEa>V)8mhiZ5k)0cXp3)%ob z>6+Z$1WV9Xa>aO5L6m(e6G?FzSw`z?ssWgddxgFh%-vn9mj@~S*du-G`RTWHsa}QG z#o*3OEnghy`mAni1|qIokoXvj&;rv;iY{IEms=$1Fptv$#5y5tuuiVJ!5^J21jMS3 zfyKC;qmA2pb1ss-Bd1@pQbf!Ec<%b!1U@{a zM8l@!$f^**o*xYfX4(zTPL*;C!cg_rd 
z##Ef|*-Iw*ep}OFOY_Fflcfa=hxS3k_Gg}YhJ0HoVSf2qyJ(Zrvv}wUq?!bBnKTI?iq}F8sdjUUQz$OmMT`_qv zZl?ScXBL9xfCkxp6s2USv=7Ab)x?TF_bg8cCGPXA%P&l<+-zsbHeAqn>g`Bfpz`r+ z1M!D3u{&qI_`@;ER@-5@8$+UV7fJy=kH_9k_=oaEGhcRRW-x`X+!z^9+oN@uo*5l+ z;a=t>1rnK3n+SzGy!diJ#*E9wooQg}%pHyNM8KPvN%z;M$TTz!u`e^YmU6X+4ALWt zx5K=;(~IUJ%oYqa3&^CUd88;QNUyG3an$9v(Cw4;0lL449@90aUTd+c`b4AtD(u7!Xe_H9+b+HT@1X`_hw=E zPRG>tgE@BF9~-C$*Jfl-&&OpcxA%&BYx^k&S=8_4yAr26K{c`rNmreBbuN18{Ukj& zkh^LzFIax9uuW;+ZX~riXC$^56iVwQbeXbpVk*B#hGxW4B!+Q6p>;7de0@fEO>~#N z_`!~j!->R?tBkc)Gr+fj%-s?vJI)+3HS<1TX|xFrHrW8Wl6W&~o~CUjqc)aFA$#C_ zei>=R_h6Q%LHAT1{RZ+==A7}4nvvuA@W?fLif+=cU$4cr>=qOm@IL6(uzn$OYS@An?Nu80YAF7tR~g*fqW~@$ z5ik%82SRYHxG)$7Md)(>#`Cy1;7E8N01TnPljnpF01kseU;uN#KW%t^3;zM2|7io^ zWAsNG7>qlo_@fO1#2uXc(FQ}{*rh+(;BXvM^6xe<7=}ajM;iq4*BlVI@L&ExgoOUG z2Ze+GvIiA{|78y<4E+lm9D+X%{&M1q4LEaNFaL zaew;)0s+CejsNd)Akgn~U>%Hcj3Mq|1oyJF2FeYIUtd9WJ3A}@zfEv2B^@1U;iGXR z$N=<2fxFormatExtensionOCR RequiredModeIPDF (text). pdfNopipeline / vlmPDF (scan). pdfYesvIlmWord. docxNopipelinePowerPoint.pptxNopipelineImage.png / .jpgAutovIlmHTML.htmlNoMinerU-HTML + +# 5. 
API Configuration Reference + +The following environment variables must be configured before running the MinerU parsing service: + +MINERU_API_TOKEN : Bearer token for API authentication +MINERU_USER_UID : User UUID for quota management +MINERU_BASE_URL : https://mineru.net/api/v4 +MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML +MINERU_LANGUAGE : ch (Chinese) | en (English) +MINERU_IS_OCR : false (text PDF) | true (scanned PDF) +MINERU_ENABLE_FORMULA: true | false +MINERU_ENABLE_TABLE : true | false + +Rate Limits: + +- Max file size : 200 MB per file - Max pages : 600 pages per file - Daily quota : 2000 pages (high priority) - Batch limit : 200 files per request \ No newline at end of file diff --git a/backend/output/8456b615_sample_graphrag_overview/images/1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg b/backend/output/8456b615_sample_graphrag_overview/images/1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f14bd7e3f97d12d64a09cabf3f44ee19cb02acee GIT binary patch literal 57747 zcmeFa2V7HK)-N1H3DS`+Rk|V|y%SI=iZtn6MQWtC&{3o}0RaW+Ae~6>AYDXy?-F_^ z)PMo*ah@s4ym#h#-kJN(ckgHT9g^&vy>s?DYwy+mYvpY8>??@qzU)0&5C#SagaQ14 z&L%*2K-gGVV602nU@-XdWo#TgQhYpITs*2PB!r~2SLx|!uhP&kFmtgnFy3UMpW4yX#hKa!l zxe{RY2) zppdYL=$*SVvU2kG?yIS5XliNe=o&vYF*P%{uyk;Aa(02ay7|8J^A89N3XY0?9TOY( z=52ghdPe5^tn8fJ;*!#`@`}o;>ZazF*0%Of9iRIL28V`6M#siy5p(kki%ZKZtH_<* zz5Rnj)Y0+xd0~Jsf14KY|F?;qn-?)KuL~D1VqOG)pBKgj7vR7ozIf>x57rfFRq$gw z5=P$V*rc~3Q;QleGx0s#CVOJvheOWHKg)vrKD9s1?DtLV#eZvNf1TLh<~0t&!^8k? 
z9wspe0@~xI@=z__wr(Jg6pV3q<&W)jsJ`HH>1Jodn{fNA-F$kK5jp)Hr)9(8q!C)f z{tQIbFrOuT26_Z**mEpv(fsf}zUuY+fuglD(02Vz$g=K`_f};CDi<>Aq3zvUmU{-8 zvGYc1%#&(-V^%IRgP(z#P#?}fsZfnGP(9)dgvdSv`5_;lfo>1yoq;@C&p@qOuT5!`HuiQ1=&Om7H5W-{fGNW&s+^07pe!5`SOQi11bVErEXCPce^Qm8f*6Fv9 zGf;5)DMf3*39c^uQ*!297&pX&pN)uVXGn{9>ig^J{w8;U z`}K=<-*u$_2;XTCqs~BCp4otAhCQ8C`5x^WxkS|t--om0ADn?A{UB?DXP{;EL}ve7 zzKP$lqO#Us)%{v5KWnPLHk#I7+u(mvgtVi+Www@f&2BstFb-=?7oR)XvgBylk3j85 zoK_Rgo9AcIX&K9ZUR_E@o1TH962T|W0Sl|Ie}veeq0Q##daw5=9rp<>r_A;lNTGk9 zVwiR4rp&yvv#Dd~r;aFz!Ql_ieqR>hOkrs?mAFhDBw;ktKCY(>*blq8ZiiOv+Uj5tH z9L=%H)N_s`Ga2(Wp=!_l?F7aCwgjw?;?KntdK{v=%+x?>aPx7OgZeVFEZ2IiaT9HH zS55g+Julybp_|?%0?-m{&lX`x8DrIzU1oO`dA&FVHPQc zmj+)`vnW{|=*-Z&XZ57TRaL&FI23O_%6+L$CFQXUsR~0{Fgxh?9=W1slhtP+cn&1d z@Q2rImBX#89L3wiVHp!B)7_Tig_@~}R`wUueflXeC!dz+PuvpJ`u7Z68h23vG9Ro& z5ua7!G}F`=$s{OA_D>xhEG5+f(rTKsmJNw}OV<*O6+{cOvgMYrb&apyp|?)Fm&ZA4 zaMkDv%5t+GEGk*lQ6AaB6FKB!CY<@j_w{ZrN8fyO<3m`Q!3A%;d(R~MtztK+S1A3b zN@=sg8+7M9?xh2s0F?jXgDX_zmv0ri#V|2RqWA|3@wGpv1@BSo*$}MI6P9)vt85>y z3JLMmK2RKJ4JGkyle4)`{UNGqX;8}wxFfV%FPV^nepUie1?^*-oXoF2ksjyKc|t@H z7Fo8Ijen&j+=}8eFCo;ocd`id|z`m4f&)=U%k}v z!U67-26|%1IG)n?=fgeN5MmR7n3KA4BvRi3M-_-*U@)@qNl5tLhW{tG>qkMVk*c$C z24ceT9?3Za_2%$Krph=s;FU`786q!r)#p89d(Ii+$JOaui8oKYraMizZJOvl7iQi3 zu>8u{XspKV_1d_m9c+o~BFs3PNft5&!wlgQy7d))&aGrEfg$g{Vlm)ks>`ba*Ur|T zUwVSu>_Ywld!eS3S7{<>dW>cVhGwr^cOX@jxq!owg_Mmk$y)ZJt}OGBLLrXM+A6huIoEa*h7Pk^WPkY|$Br4md`n(P3mgS_q-6qUCem zA+uSBRw^WJ4`CwC$`?be6l-^Pk_gF4GVUjlZ(o4D7@npQ zP?$)+=y3lz+K+0PkV?1^r`ig)<)A!Xe4MW2(a5-dv1=}qXVmR8(6rjgjq%=>`+{(5 za0nXHJ%2_TXSMp-vi~A`lk2BZ%16S?rUk9pqstBW{9;EpuhrC(JSgwvH6h(xc;oVs zhR|)v2~Mh`kXq=MYO!UKMI}BKuOrmK=RE07Xqn{=mvNotzC)R7$g5jcBS|_J2Wx2sZt4AbKH_J;VUbz?h@J!GQ z6QiD+3D)HFIK1y3Z}xF=4>-Y z*>G*-W>vby_&Sjh$`W7JrdW2`VUm}|p$!CQRr3e;j-*(ga&3p#InfShrp$EMM&8tm znrHubK$=k|BeM*+Kqoq&h`U33HQ4J>f|sm(DfEpmUUBp7Su!}yvofA7xUtkaOdifU z6sb8(#fa#w?2JC)2==_wv;rTS(pwR-c#J*8S@VVR(??{-vZyJ%d9K4+Kv8ku-MM-Pb0QtHQ z^=h#WsLvBQHs*c024K^0kiatUtr^F5{JY_5su0$9*0$Sqd&z&sq!KS42N8 
zGRxg=pgeb%fa5_#2t;d@tGei=VuP{HGkF|WA__?}h5LVGTa+NSFtE4Zbygbb0 zj=e{WY2_0vX=YoVHmn=rWmrL(9{NTVdTsHhdr|TSXrE#;W;g34vd^dpod`Xo#Bc!( z=M_c9%CKAC9p6K>&~*op6ga#0 zT=T(Fp3^s}omQMz`_SEi`AY2T;yaBO50+1v7_%Y-1Y~eh<3Mr~LdsWaKIx%c!NK$5s`>43RQQ%!Fl!JS!%O3+TPfc6M=Ow;k=ZNcv2-l` zOVv=8F&hUfuJ*B$;U3kk`{NAt`D23xR@|H7*AT)5Ps)>3D%GokS(^Cro>vl#EoP2* z7};gOcB~-s73CGB6w=ld9hyRT@>7+O?;L!vIN}FuA90r@-q6nXD}0+-eUj8i&CJg5 zE_(2trgcHSTsA6qPD}nRQ|wYVar7suZg$m80)r4Qv66}p$nqQ;=#^HS&p2tJ7%>;; zkgQeXBS_gIx(0+o*F*Mr=1P-ml=GqN^!>#*Vq5B(g4y5yF16%!77Y`xBax%C!aH3$*w=Pi> zeQ(x)TD53IoXy+GJ_72>BZ$0A`5LoOnv&j6qZDZ!S2OLVN0``yCcSalu^Z`=hvKSc&`U#`&YHf>T3$CsSc>}$6`dRBlLV5! zzJ5CtE{{3$?n!O3#-O#6=flO*bzkvKo$XuT566xL!nKMlSz$&kg4vzb3-5P2R`Xre zP&68}&O1FS!<+D%oYT|OE?kWEmmj)?i<$8w@*8W?MeAg7!^mf5H3OY$s5ZYst1jg8 zuHUeq{Dw3%Fu>mmOsn2jN=zxJ$uEztEx67E#Y_UBEjkRFLW=s((us}}5)i3=E5?dG zR*mXlnI~=b?opl+k}?L2FGD2oPHmVP_R2!0w?UfKMP$btdsANJmS{C#w$g<89f`?J2c?_cxhFZt+U zVo3F14kx9iCT{POpn5IMZQKHyEV=z1*!&0<0IRH&R8&X;N%{@$YL*8#mvE#~8 zY`gO^IP)>;_LTVJxmn3V;vrxC2hW}JHfZn&ra0L2N79B$J+E{yz$TQQr#U~twwD*7 zk5TI_%S$F)BeK*?co0daQO%o#4N-lVp7mC1aGz2jQ4+BU^)zeIC2I>~EXHLoxPNq~ zytzVzUUx~Hw|tz!zw8V|tS3j-HB`E0WyTy*$-uVdvMx@%EO2Zkfpwr5kbyfidN`o-NrY>*^OaR}E?d+g%HU5UtFb z7Adi=+J^sGHoO12HKrAa#^Gq>&4>+mNU4|+zNH&oWQM*p83Mlb{xx3-W^%#=rVmG8 zZg5#gG!A=O_N(7xP5nbPeR&3d-n_7^gLF1pAtqq(Ngbm=_<)M9TIlnWI#eUwCjk@uVfRgA73{wmP`>k5TZLp^jR(z#vf$Fsy16&RdJ5I8;M0FNeFTi zPZvfqc04k2Wa0C{?#A*Yka3JcRWwr^cII13(j}Iu+^KH03pBAq-=&jNO752FGC{J!i-NPjL$%YzDHL%lHe^i%Eo=C zeBK>OzNa6N-peb{z`~gy4VcQ{;p8SWH3Yz*`98mgLI2q~xuOkd-=}9FSI8cZR_+fD zk%zHg>?Z4ENY#k`h(7#}&eiy9L%f$1{;qZ{|He)bP<{BS)h462J1YJFPK z7d6;H9z@b{5QKy0+8=(4P{8*Lgp=HEwA&+h2KpEWAQ_-TXQ0JWgEP=*1L5gw1p}-5 z(YlQ9)FzL%PRStQ4NFN8XQ1cYXP}n8zw5bt|N3rcrTyLCbt?N(ob7zj?hN{dH?4P% z1pOK40uZnYNZhV-pd>fQ=|untK_YjqCp?usJ;|Usimk_c zI|uD_1`^-%Mql=m>kGcXNFcj&3?pgmbE!@?G zKt5Rlh=*q(N6@>#q_dU{@#q1oA_iIj8)qHfA_@S`Qbsq|Ih420zOKG*g5 zO0dYnqzI$O;du(0pYJQM$3p7xi=^kt@*gK@egp|x1nCtyd$eVfJ|Nj0aV7bNvuUS% 
zZsjF8dybXrx%%tg9g=l`D>Ov;`ZrBpqsXl@{npq9&cvo}*Am^8w4xKYx1<&yYDw?jCOk+K#cH#$zt<{JfI zE%f_k{z=*U@;fxMs^(3#EqEY_+)EFG-sgs+X1Hyb3nw}FgSK%(t@Q3(6a^7woY47-g{}fMRW95bzofm z-H4s;o%N}1hG4O~cng7_zD6Evcpnu|cpq>#pMf&3jPM}K7pcW;aHsSoKRq6ri;`^= zQ^$*LFT0jN?oH;HBOnIF*ODLM7va)-%AKooq*~)l)d3i6afYZkvs75D;w483sA_FE zSz=qVHA@p6YLak8-=>A0(!7DFG(j03rg_gvLe{-11ab1KUVU3PD3ty-=UWqv$X}orjB)2zR|ebk&n%ILfqY^ptCSO5dRM@vJh5Q9*v! z#i((e%PPF@h+hpQ=gC$ZZQ!OUT3(=4YPws6+jf%rLHT5-A;w{b0BxQ`eFl0NJXQ;V zud*uYn(gwhaf?vAdz9D=4u6fYT2F;kK)sp-_X-qnbO0c_YaB*wU%I`v*o505y33aS zA-VX_d%h}qVx2tfY@cdFI%96hAH}iJ-Z%jP(W-zJiZ!@4%0*f}`C&52WG8Q|0@T*k z6cie}e91|p6~-bn5+wdO=iZl%oRe21gK}5cRga^kXss4bk`l4wk2i76LoO2XKPN3( z=2cc((d76f$tx5LDVBnjMyz{pV2k?`vL)$>TT@PiE&;B^-qOqBsv-W!n{MLfARb!D zd(P(H^$fmu=mI4=~Cw<@0 z$6x7~hdqy!*k~+^R(}0Xui87`to@qQ67oPG4|%y(rJl0>s=WoF$-!mG(Gh&cj>IfU zZBTe3uQ_IMdDV-Zaj_7c^rXua_6j8ansh1Q*oEdWUzGuIo0EuqX2|0~FDtG^0ghev zqf{@iLtFP@il#;6Uxu=<9^NI9tjDPT*S?$ zude=-9`P|!iuOCG=hE*v>}E(C=93Kpj6-YI>FZX5uyYMkv|-j-GjZV^~dFg|5MHWCkz@^@*`&r(XUSP3BO% zxMvO#+X8K?E~m_Kh~yXRwTi~ZWM9-2nO(>U+Njju-@^3x_(G&7JB3gMNgtRm&Y7-| zYstwL^^!3IS~YL{QG>Q4myR0wZ2V-td;sO3)Bl!fmuxEqC!7}x0}Pko@I_TmbL`xF z8EHN|u)tgEC%*StGe$9E-cNxZtGgDP=4 zz%pql?i)TBg&QF2*S(koas^Nx%1G|nMK}R?{N{VEbd)|89^?W($X$kPNvkWw^zqqHV`u`Az6QPTL*M!Mie#44e! 
ztvMz@kKYdi`)B7C(it6bInNw$0sCntu0gBq&Kan~0mwNia#lWJ*9C2|7s?^nLJB=7 zW=EaG^t3k(Hs@Zp9WCgIv=JH#=i_eCz}lN{52yoZt1Ow+$b%)FVn&2>YH7N2Zk+&f&{3L&AU^P!jD zitUsoF6#D(P?TsoyRZg^E0?X3n_~e0v4`zQ7~dVTc7htMuT4>nOWdZNh{1VbpH(Rg zi2*sQYeh!{PT#EWXF-xDzhAzW!C-Nve-Y_7Ulf6$Jd$QG1MkCSwl}M+5XyvOQ1a)o z)Y;8`*12v(UN4EI$o7;;bfMKrMUSoUR#;}NU9!8Q z#5Iff3yVP7epCz>VqV-5d3`_j81W zQ!bZ=DbbqAl@4z7YtL2(naI2beI$NIba@GxR~9pysA=HPeAP5Z+=8aUd4(^{%$(T_ zQ=n;uP8?|#w2!Z+*vB6xU^cMD9JIMPHGE0RHXS$COjzO;mc%mVz;Ypdv!SGW7p8s< z@>0IDmKzP6Z1tnUwQ;_f9`R!>@fqS;-D3_|6fK{$0RwmZ&5T-`a(<)fxdkxI990UT zPFl8kby}AjP`wdc;jJpcyQTMI(#Gu|Hwf>dW(asky_lenS zFNF(V9I7)`uju0sDY(B8=s$(~O0;MlTqTUTrsGB2zj<@-s0{y}Hv*c|agYE%;FZ*F zFD?X+@MHA$$vlBIs1Nchkg*%z3QrrTrl+xgA&&tq+5P9Q&@v7=T%476?(a0a^1lFuD7 zTr%a|bmLk@)T8*82eU-w6oYeR)@PtD^BQ0wd(oe;$5xMnp4!O~F$(u#td&W@8HNQi zw;S52H*|NE2hGZ&AIPhZ>rq{P<{P0jw63lKegm^^)0vN)hF9H z)|h%Madl4QpxbrhzV4T`mL&&W+*Tm=d}3SLT?}sFL@F zt6tbPAN1}yaFz`UapK+K`W)5`b_lLXMa|qpdJL?T6ItGvzC%R^)eKWBpG-z9?FkI4 zO3_1>-CX>|uUXio!+M%XCM{JSoXQM3EAmdIiWuaJ9W2)2HpOK%dlPP!L1zf$(%&D( zOGx$$?o!fRnygJ+3f=;w8(~R*t;}oM<}IX;Bz zj9jy_X%c-YDCcgdBifwAt3x2Vx2C%X&<%yzRxYS37&NO_0MQ81@*^MCj_>l3rLpE5 zu<;nq3OBoYkog`Z*t-?K__Vr#*uR^T@CUP1 z!Hv^=#DQSgRPCWCdH9Dr%=y)=B}Bp6K9@V@%G@sf5KjC{Sn-cMpHl{EZp?Yndbd{u zLDruDfvvvR7!e0LJsViocvn1=zfs#HeAwvTe7h1qw_eNXIXe?kPL7o3>Bo!KpSX4V zArGVALXWgRWvlxTDaJe<^>9SfN|_a*;eZoXMbwT$9!DqC;f7K{L>yDPt*}7N_K3ou zo+^?sSb*dDO{FsXVreLQbA~1MrGO_6{dG8h4Cnr%((_-v<_L&8RYx6q_{sQ zo8u(zZFz4aUZ51mzI7Bi=nR)d8u`s44HK=*B)+`5NufH0C37(<;CYWV)Xj%)>(MQg z8)C}7m6Hmb(?qO4(xlB&jxtGa6;~z=j(F{KL#}%&%(+wr9Ejp;E3YRVLhHsCWPUA9 z(0ZFz{Q@h-lDWsr1O~q~VMymh1 zhC^s5{e$OS4&@JF_nVvhRz_wB3z?DXJlKkP+I*KYF)=C#aFQ}c8dRmI=qSqKJDab1 z*0qM!z3jlAfm$VVF}jY6Nbu?IsY=oK9ir7wc|sKF1>dIgP8u95*t>>9JkysX7ohO> z%&Kkr?N@W0CtGEO=+#y&_CRVZL2Q1A$UCyD8m|w*FRlZ-+Dn*={NSdz#pACbmCT~b z0r20$lm; zGO7N!yeurQ-8CphUYcnW(UNK?E{Sx+eo)|q@%q;8JWz|`XtA`RsGbRMX>_ClQR6Bg zz+4SlH!l?XMvZ`+;KkFQ;wGPgLehVJn?leQJok6~AioCsfn>jE^l#*&^OF7VSZ(2y 
z(ZWkKpiw9y8(Mw$00Dhu9NZBZLN$lGOD^2}C`lAkimpV)Zf10b7zoro?FGOT`T*?0 zfc|^~$eCY@_9P5Rf9kJDcm}%E0~FNgJNLd9Z2W5|fOx-X1u(~df*Azb(12MTVROZM zwAK3zlz!d&{MNhRL70hJ)ppu7E?gnXU5g80kL^hINIi9?zx9zVWr?m7W6kjwpZ_QB z-LLKCKeA+g7ukQd6S7X*Pypoc4lCeyc(DZ7WNVSp50LF>&e3b4i_s@(jzT8M>xv73 zwO?^IrV=MUI3&HsSlj!xs(!7i-*k&#;_*Kt9-p8APi7v5FGP1r%2j|4gW5vNe{0XI%K)sxZnu{8C#8eN*O$*R?fgFX4n{Q3}5}?IdcAYSV9SADxpLTtkNu0W2Sh`@u^dJbAD0x5J^Ead>@M9{@rpYlDuLXXT=b zCz_7QI?8V4POZA+riSvEErdC=Ewy{S@DUSbPxAp__^f&gbIEUQ9IK-1D>z%yKgFB zb@?7jtGSOjrTOkXYgjk_&3iVks$Vt|O0A6YwU|wqASR_4UQ`g95n_}xs-x1k^(L7Z z4!hHa502mm+ySGbAvueFgtLFbg`0u~;g!uuUusXmKX8+w2 z=A?hKVaEXQgz5d}2@}ixe!FjQ49|CSP`e(V%;jswsT2THAZRblQ8g4h*Cas7qd7 zj!!D};5kV&W9b=lGG9zE+h$m8ANLVypoEIXwVa4fG`k7;mDR{ERD~;qOx+*(6wa)c zM4+T-z7iSOV4jC+WM=gbiC)*N)2xlwjcC!OXcOxpwm7Az9ok?Wh#Gr9kS+9-q~loS zV)b!|(KuT(??)b%?&KSa5eOA}Ds;et{!DYCcgwLVl17)!cS84hnd{r7t1oSzJv*7o zzbyV}X}Si2gMzkP%?f8lRsztokG$f=s6#$YyKw89K zZ?TMso4K&@m4E2tk#J^YSKVtAm>9tj__(dyLLHJ&QgRoG%eK`s^ks^!1#bfO2`&CI4q9U+J@%`itu6VU`Enq|mtS zWi2}T?Tw8Oe*7!*)JaNPA4p0BURPGjU0k}@0&TQEGwMj~q~C&J39m!O zgO>K;l&FR#H%HyJoH<&Y#Vxm^Q~GeqLw2?pEzO(q`zxE!EPyvdou;VOwlj1us(v2N z4-d0pr?l5_Ua0oLeLDA{kVLCbz(C>Sj`=va2?$|z3^P**Ipjalx*>a;zIHxxFzIdK zp+Vn15OBESAzAyP%@{v~&)^Er{GtH8&|E7>qh=6oG0RPga+WaGD;>%g2uZcLdHv~! 
zlf<6ep#<8Vo*Ym_H5|?Ht>ak1il5~6RCACP<$KsS`9T2*Xybs6Zil$DTMLk#?-&MJ zh$xdrBs@oKk?6o=TaIFfquX8cKG)`32H4+KK78vl_kk2+=HI%b8=H%kXXZ+#2;h?& z+|rw*OyRvxzkw|(FH8v@hzGTxvJjlgP1Bd-)cOq`ZdfzQOg?{(H_C)nRp6k73Yx`d zK+FzPPz^6Xq@9wIX}Fq@0%oJ#?=*uC zOKu)!duT;BIo4xZNH+g6)Wl)(DRM8r(-#3JL(6X$776)t?M*Q@-2)%rx&G=(Z3)Jq zoD}3L$ajw5ErR7m+bmj!WK<#VWBG9MBb7%_l!m!$I3sRjLHlbZvY^Lq-2z=+653UC zstRNkJI71uG)obmKS*9TKlU$4Te-C|9Hrkuo@c`!oTx$iuA*1V=?^)N|IX2D=S=_s zQ0y}YJTu=>>>C^*D7)`45>_Cb3hcrFj6?+!_;Q;Z)V+ObIbusUG0*ybqucKy_ZYly zbffN>Nv3BR!A2gAj>pzq_l}Dsz2TfoXid0c5Er!ZzIOXP+{%V3$~AvMzR#_MlmX`x z%H;jD1wBEjJO1sdjm^}cmyr++vC-Y5UNEK;l6T8+L_oMe+&B|v*wkkF0>jR_q|32$ zDA3qX|5!_F`>;LEkZrp@&(86c{_0ir2$Hg_$*PYmI-WvLsx}}BL-lm4Ze?$`J(8~V zZ*e89_Ad^pW<gl0oZGsa9;XZuj%qw^xPKrsNyR<{Ah+m=M_FCV09V`FZbQmLjPGCQgVao3_m( zxF_tG!&}W=ns*3l?}3SO4VVcC8b=sV0-C8 z<=OBU?{u`lz~0I2p^eQsojWZjvIc=C*eQg9%Jd$$=?F`Zstq69A7`ILtox5zS9KD{ zK0cIc6FwfXo8>fXRrR<#C@zGMxIMft`>vwz4FhptkZmT|UPL?zR;cK~1ds4_^NK!+ zu{%KRM~XE*uasNlnc8bCTpf^aP!6LQuhIR)f7lmwsq&PB;T3Izz6#jpyzcjJcn^N# z0qBj97yfLRNV&12w$4C|7a{v+Ao^3EJr%6^AMK99?#fR;=rRV0KUE+iBWo?CuyfVA z)z0qQFpgBU%+58afX#p*u#^L9#j;T8_eHHUiUq17Sz2sLH`OA$EPY5p zq6F8^UDbb=}IxuQF*7q1OVp3NAS0l)pHHb{3^c&`wJ0& z(dQtNrC#>K*eWxyt=n@?v#fck@Q2}^(fi&o!8=ysLKEhil84ry!kPP@Ckf~1nr@ZcS z4K-KUF@ZJubKhTs{i4|^>;Gq1qsj&o<wAMD$Rxz9^(D`YSC4+NO*n=SWZr$!gALnQfN21fNcj1EVOub0?(gIahF=@U zcUAn?&Im}A|A-VJBt})GLdzwI{GnUq)0QeHMc_!ze0ILFqtDqQXP~Pb=_nxN&q(^4 zasC1@&JWSUAr0RsW&zDH$kNlOH7pZ0a%cOS4md*n9(L6(PthwgYP)XOmWuZei3n9Q zA!=b98r!jzF!H!!CZQA{)qG9mLNn4$1kHy~eC(K-NVo7vzP|6Ymd^@5v4Pv`7PN2O z%smem8pO0StsOJQmzd#Wm$biwjxESC-(I0?UfZ@fq&(fFGJ<^3Dna?Sh;A%}UfVYF zGpQ&^;&^Q5>MibN7L^E6UshJCFU43U)k-$^Vnh;z&B1C}5D6jfyRNv(#@Tm>0_$gL z3w?cH5)l1iAZsiHttZryXx21N25t>;rxS=!O^Pm4$BTIg@B!7` z(Ij$R?8l1x%vs?De(-f}Ctu;jSbbZod96;(s3t^hkW(YULp5+{w4B&hlt8MGx}^v$ z4Z(+(B`W9?S69@OaN1=S#AiiPhvzZO$9xi&+{0KCrOR*%!)jLJah2ViV%<5ko~XnO zMJAPbeefUw?gAGOh&3bve3KU{vVV6;@bjkrIpsgE=v+fKX3_U9`Setp4~27RRo-^195E%AKcPj#@d%cRu?Af5-ymZq7IvU#Sb 
zblW`vni%<(wpb*Sc$u~LYwN|52qa?+1ZK!!LakRd6A+%aC$t8fEW`tC7%PbG;nls) z{^}bK+KcDt=GxyPLFKiHCi%^Ve6wQhyf+@F)jtt5wE$_#g?(&9=RuoX=s0;-b&Z0a zgvC!r6tBKw(5QSUy1w4L5)4{1hqG3q{NUy)t6tQDX%vPWB>B3=%+e26m?6w>Xz*)+ zRWM>0m)`7clcC$5Yrr{#l3x0pO~mK&k+w$V(&}Hs#eWGK|IX*16Axm>w5Ye&P9$W&OoQ3DLBU!EpPa3f;1&bs*7>SCM{)_5)XAs%I?n%V!S&npw(;Lcn{l>+3qdA zf`4CxX418C=ir511+s>0I*OJ?4Up|b4p;Qg?e_K~c)huW}ESQdh< zc}lUeQZTyRwr*GhkJ8KIlpkYLUF(ew)Q|_cn3|(J+od@m11vz$3E{42-6H>Rpi4C8_8p*uMG<*;43Iu zL~`F`EF(U7$J74lM2}V^g;2FJGPOKbJ@oUkUo6j1AT8u-&-P}POa8Lv$l>_YYLhQ1 zaiwpf`o2bXOYQY2?VIk*zmK$Y!L($uih% zpGZLN?>Atho_nax8bEZk(diE4OQAG=k8D4>KfMZq&-Rcedg~yulAg6--m4~`ZAqw| zeBls^=eW}K%(B8k+d|02c{svVoQFShBDMMc$EWOHcibG$rz`3q^1g*}H@TIaq+hHo zFOAY3^^tm%5l*NJD$6ib&R#xVX2eWAx4TJlmmAQlPv?#pj5)o16qzvycc96YSU}$KBO_1n(JO^}b&I*gVpx%C& zar73)Ni(a9b^DqW-Ik0AP<1b&#gKYnKX9N^uToR-ZTG;W7g!+gUyBm~$p?QVQ7<|4 z_|jIO2e#GT+CA$+M0L1KILl;nf2E%|`|KnNj0S!ieff zEe@gx3+f2#{UQV{hu-#K3nA)(qXAEuo_n4uP=J-hFUBrZ)C?}&>Lx&nu{bW7PzZw~ z@ZrJxo;MU3n-!lUipvrk<7J;MydkU>S=lR~1|2X+s@a~kJ8nO-k^*Z{NIJFBw3a3ZVW|GW~ zj{2k5?bD`hpu_JOTRdnss6@m|es{e$N<1G07xz}*z(2~_nn zB2PY6BXbrw-vV+Lh)A`p4u5|*r_yIKAXC>EOu}4q+UuJhj)k$eSk9h8pn#C`=c?8E zdX)c+N@o3GW^1K`Pj`SYTe&RM9~quGXG00&G9ahFG(E2TQpjF`F~%yxj^eP`_25EX zd@yJ?nH1%&TQP_hLDXIOVrXPx>ql2rHAh#D$do}8v!U~#qcWZ`+27njG~2szKcy>4uuYJq5kIwad@ArXZd+>+1dNBwTrYYaPzEL3!k|`K>{Bk#UwAmN~gm_9Rz161xG8k#T@uSj+`f)@%)BM3m*&JXT#sRr=!LD z^968SDyoC@#g&Kb*S#YPtAjyEZY-FuZm)n1d56vHT9=V>oYot{BBH0^DsH^Vvkc+x zPpY|bP(o@Ay=W~vc*{+=`2F>^Nx?bm1o;PZ4PD1tR%tNj=H?BrXPQzrg}SR&TQyu$ zcFP=^KH}~QaV^ud+dD94Wsql$T6t)rjWaCTsugv=VBejMaWp;@z_G*4CL#S zpr9SE7X(DU=M3{(;``4HMUr+XPR$Z>mj~p<%3}I*O|>!+D#}@> z;5AEk4s+11zHHSE7_%8uuaCP5I9F9a>nBe7nFT>kj#}4$W~CpQP>R-o?DJ2Y6!GIF z^n$fy?2~`sVxaEu4=ji=)c5Pl|Ga?zUn2Xt&`Q?NbF;6<+GeKMhJKm8hGdphs&WMp zSw_iHa?Cjx=h0UH4O;4Al`$UGN158DAXVnv^j^dBDD1?U+ zzN|IK8q-^SC}quQeUmnnFMIXYq;Dg-S_SO=Cp*DdS0B28$U|}EZ$#gbR~gs8elqvT zYx{9c${EN}vkL$+YS%L(B_jMb))(u;XNNa8IGJ|Tuh1ToL8xj!;w{PylEL zC;;>fu*h?7tgoGK0j)3p2+qqlB7!BgH%F~RPRofU%5GR)+fk_YeKl 
z>Uogf--D*LEY2ki`H0=;=tSQ`O0G3~lb3GcPqtg~hrF4axvsXzR4*mc&GSl3bFm6B zbw?5Pc6Nq9nzpKV@Nvwd*+YG@i(y&9v=V@e`SrixQaYb<^Zh;9Pe5U4_0M2ODcYg$ z6-UU(8sA!FLY*|O9W`LaC696A{K}JaG3O$BjEyaPv@k$3>XeglQ;~~uu|ZpVqWyYP z?VBU%dAnpgDGK&nUGeyr;}Y-UOL?TT`R`jZ1}+dIKhY%dZCY;pgoXXD=Jk8q{&?Dv z!cFod@>p#AmX3Vc4(vzu#u!cWdp!?E6bR9=2|eE^03HVdgxrDCa^W zHnm@-de|~eM$rOw5pAcp>MHM^b`2Jog!e3gmtJ2$JI||Jaus**HglSrxx&&CJ!(1^~5`Sjl(I5)7*EIskveCFzh#;0d*RH{vo&=s!a2 z!O^fiUGGsdC3}y~OpZXc65C#M&Rl@T*@)~~Z*;S&T!w58Wrd;z7bS0-FdhljOvLWA z&rR52OLZFhYJ@=k49H8mp&IkYEfiCv~pmBpBb#g zYEN*_)MPFdl0^nL`CsjQXH-*d)^=#30;2Se3W`dTuC!33i5O6Nl_pIsKTAmzd$EF*E$M+hUDgQ8&CAJ@IMpOFaN)uGC2eNn2|V`fKs zZ%ER6PM6Qs`$9irPr2rz{Y~6j! zPVXbcX?3p!n6(VjcWeCgNcPG2S+gJ~u&}vZ5wi1(qAKjSL-q z&IiGz)Fe>sq5P5>^H{SH%Sed>iMmbR&WX=!VH-8#M5&-*^j=t|jLO)~fQjaJ<%=sq z#i5cgkFx@=K25twGc{vEzD&)c><;xdHl$ziFEq8#l-3uw7Jd@6h%BM#Y^j2ye92 z(6|^XMy$*4EE5sZUMI{~6_iw`Zh@Fb zmR9wU-C-EYF62En`Z3u`$gKXlYWVNn-K8HpB`u^Hx)r=+nL-$iVba^ z+15Nz6_9^*oge}UC?nKcR}5AT}z@sW};XncKyPI ze%`z6k!<8a9Rq~M`V(r4SGwJA$?h@4rhuY9RVRl5f`M_&%c)18$R}&t{c0Blsw)Fw zw{zoLB3?ShW+s9j;ndMp@pjUznRqHGr>@WZ_nqr&7uzDsOvVBhwvDza9;=PdD>@rwcnT9R`zC^1t%gFW|OF zi55)GvKfM#IMpDY1SFmWPhS?^Rv0t|_`d(bDE%wO`;~i+Aw4#V{s^WXB4%fB zGtyB($K)bKrjlx(+urPi$3!nlvYR=jF5$E+QLwNEt(M3fjC)>TFU`lO1BvcaXJ}(7 zld^nXWpF=Sc<}%-*LXTRp+A1dz)h*!J)!Z`+f{waIE1~U-Tb=qpjz4RLJHf~K3UAV zGcf!@=w`UHbOb7qVN39KHju4r2#HLv9lpe>)Ta!0nSY$)`4%TO%KSzf~ctZ z`b6R!WVpeu|KZhmCmRr`ih_y|dQe9jg5H#CLXQWy4sd8_8Wg#iUR0dA?d)|(0eULU zFoIB7Ff!>}c0(N+cib-@{SrFO^JVhVlWiWCErPfgUa4c~(&%a9v!@t18Z+=i`XaKUFG&_q6*F zHq;IwWF%FDAFN1UcK811xW_?!o>pZwrrTd~n4Nk>93G5(kH3bo(BF=H960l#3*Q;` z_RUb8>5vsT-PeXZQmiUV6wVtt=tbUDi?fbD6^5)l7)4S6xmflU82d6CNKK1s?{bg*{X`>(W|3;sC#N|*ZY-wp8I8Fj1v3> zQWQUkB0UVz0mvgn#;_k&y!nY&J+y}rLS2^JJ(pPo%N_<&mRLe1I#NR;P+&Ua86Gp% z`&fvB#@1bbC5~EsBt|>ksR#?v9x0Jaiz-sN94*i~)hE-Nb#~@dV_h?vFU-S3XuwJ2 zl_O`Y^ZPDO&hd1t^knCZ$f%?O$!5fk1n<@UhpD#)UNu^|50W8*s%gvv#zmHmFO2XB 
zJD#>-vj42%WM><%Ota`fM0%=1njPz+j$YJXSs`N(YbYL9U-O-aDUaS~6IUnYpC6y0AI`Z@u8S4>zYR@$gWj2o>&a(v&P4+?>7y*P}OxMs~8!eQ!@K0a8#Su?L7+JMW@!+BXp|3U$6gC1VwyXi?C0`wA&k zRWvju?V}jldtC(j?@ssPY!CnK)c#*g>48II7+zIv=`d;t1^}WahJF`p{QVU3C;Mc- z67^}+Nq8RuG|G+q(o-@At z@k~rzy4w9ez={s%pYOqWL(}lx5=WpzMx(rA0R^;v{E+*f_?`QYPW0~>Da3&!cVy=! zN03HU?AL2w^x`d8NR7i*Zmj?JcR<1`p**sk;QWM_qgL!8i16t80=mOlCBaj@GOIzzjr^;5gpngC#6KFL^4a=MeO z_y1@b4uy4PhT-YZd%>5RhO1uG)QLqfx+MHxqjct!y_WP+VO+(Q1M?u3?yXQxs z)Q=uVATu0zUxU}Jil7ZAhB;LTpTee#aOa3mti+MjT`MXlm2G9j*&#}AVgUwC_6(sT zP}yi=|0)xW*Je1LbHLNXxF0EW6 zLKc*AV;$eFQu_#xt4pqX<`24%WHtB>D^)wc?Wre5A$Al4VDsrzzU&X08Zdb~eM-COR1op2JRk&m2{K`MqzuC*+f#D#ml+h@HCBZ9w1xEEKwgq{r)QM*q+@ihL4|K4|ob(7I0ba82j^yL-|X|VFx zqSe?FIA=`*AIQFS(Tc5yJ>-#(`KU$G5vW6-n9hTKKxB6EYITj#unLQ>|K8Hp+anO= zPGJtVv=nK$tdsh{?tICb_IN~fZ7I3ZI1lVBX!k7lpfrsN=QC)gccuL}A85)Ypodnq z4e6U7UJYhn-jDsUwm4Svf~o?2yY+@FOKi8l_&R+Thb~63Df=+458n$N|M1bIg!95?!)LkADW_+>g-3K{73f&vs4`!ofgJIjA*p|UH$rog>F^b#K-<)r z)$LN#7~H!tOiU|_`fnP8|JLlsZ)LHcKofr_IV2Iu{T!Q^YiJ9=oDQfPjFDy=)6eEM z<6il3T&@RGVM7o$0I>M9^)6L~YH8o|sh1zlDzFQ>?lIu55sfi@^fuI?kMX zM<92{Ly7{T=ImxmH8usTRX#RRL}F*`RQaO8J1+aOOV5I;y(}_ZjN|56nb^s3renfs z+)?>83CgM|#NUNOzI+Gqxy+U>$*|@ByG7A|@b~{Jhybp7tdPbvS9*GI-ezawYD273 zp-DB8_<<Ku&M6cw+G7W%bz%AsH6m-2fL1QClkS`l?FKYFTh&pys`o zS9;u};#hodAdJZ|_sffH_Sm>F6ji%kKH1jm#TNxUKZrA)!OKT(GyQp)%wse2{|Fjg z?0a_5u*&;;@rsiPPcq5+ou|QDW@}*}i-S*HGz3c~%;r?M+ew1T6dr`n#iN_F8`1Pp zZU?IN^oE!##WU<7@oBxa+%8Xx_Ou3NC*zG1r4V8QKU_Lk3ZJ>ZdmZW*nwxdbKogxd zgYD>Ocu(C!+x+bL>)^uvQ${(YHP`W+XasU8T=I6Ik_(GU?o`!AbZ!a(=#iHz96+gI zD9qp0VG~eSVKhjrF=Xm>+=J>!w?-sFIot3?YzJ}biXS^PL1)xQw$Mh5DG%QG1}PUq zS7i6(lG@^T_WrGm+o;BaC<%jBl7T9rfFV_eCw^|6PDkzfD%fnOeaW zP!x)&y@!#$eCjRc)w*(3oW5%{f)jfp`~k7^xqo%qOwd%@Hx_X z1P0I?`&MqB&cIuQO@UdFeZ!1MXo+d8&cQIxBZ2)?0GAg-*WxhjBV~OBdJowt5JA0p zRh~S~31}#q&|o9?+u3PXKk0}o1osPe*f?lx==m)tNK$qH+{(sLA_y^N$#3Qf=BJES z&W)@H4UN7aSZEfvevEz1)C=#LuZdGT&F}a~<#hu|Wf@QA#U5wKybg{PC1vDeb1Hy7 zoJUeEFYWPjgCcH`VEqewvI5Uv?LTf3_qOKC+1j`mYq45$haAkIN`jy(|KX1Ap)tpr 
ztLJRd5M23{hO1M{xfXzlG;>z~sGfi8TPylIL0fg=!g=Nh>8rh{#-f55hBf?vodeBA z+*ri&;pT_9^H{|NO-ynaPQlWRg=)FDU$7zJYv;74IR>{{0sskDD2bu@cgCt6u}W^M z+`eL(b#`F}Y37_Y8?(?i=O`D}_5BFZcQVNAP4%(UJ-*IY6`yPpF-XKYLqsM3Z85sM zxg*d8J!p~oxX<;X2?NpG0=~H>|Aq|>|30N!+o5qBGk>Hczolj2$`vR5AE=O7|Qt9MfN|!l$rEHOW!Pm z8TmLztI1S3eFr>3h{XX)8${9;$v1*ds0Nji(<_Hvkfnl^_JaTPPTe=Ni?nsFb z?@-Hx#5+9FBh0K)eMf$dI~#~FT&ol=Tgx1q7*i`WR z8Fc<3852Jq1boyvL;{GR{$vQCP;WDDvd`%1h@%AlPh=4o9gKa54~YR@5p?;lQpk2q&i`Do2g&JxbZ+LhY4ytk^pe5;h>@kK*Ll~yT#u5a7`f#%PS&oH@j z@GKOlBX}ert(i2bdAkaz6s(xaPJO+oC^LBRnQsipHS?Gk0=gwXdpd2{&t3>X?Ed8F z{JD=gp`2rWPVe|@DINE7pps(_IqsOB)3U<}8v(MMcJnoQTKl}Ww zv$H3r`0wy{Vv650ffG~wZ!pCof&>sZ(l@~R8?GV^uc~?NYEl_1Hpb`nmGv%O7}|qR z9`irWY#?-g7~eWh&cT)9hB;k}H_cJ168fO;(E*D$*^d=5`);9Fp>Z05F`!)V)3N&` zb0=Z$B+UJxa{G5-PI7R8KYW`P(x%>KT4FQ)(6>f(Ypy|snc3tO(JPIIKDOvjL2saJ zjBCPbqQmCiOh1x#3z^OBFwcY{uMw5~gpC7HjwNsi9A%s}RjUE7Zn)>4t{|6(z*hqcf9CvO&5C zr7Wq0BIAy-6I^z*@i7m&fuPE=%D@eg=4cpOp*uRLL?d#b8B3iRmm2Hqs%Lh`b-GoZ zbua9l9%QYDL*{0q+&GG%85HK~jr9?i?CL1q4_wrW`7An>5HPZpMG4JrHb>7!AmrWV z!0g6TEBkR7@do`41I|CfB@(t+aP`El>n3v82YOuS&=}!i$>&WkcsABl1+EFINU4GB zej>>D7hxv9s(Wk5)lIhABr!j*K9D%NJ5pR0z(Ymzh}@q|No1r4zAOMC2}YJYHxY;P z&dd`RE8~}MXHneceuYVnEI|m&AIdE|tm$ZkPP_?3qSVQQU4J;AVcgEg>U4=pU;0?J z+&udT8PZju##=>w{aTl-X)UM!nN1m1653b$_!vg@{p!G^7wxa%>W^i|l)&DTC2FS% z^QVmmtQ=u>BBS8dN3oVxafe?>IWSpdt~#06B>Wb8)C^^1UZwB_j7w18whBdrF1JQ z-P(p1b$XS2#`e!m+8Hc=>i&wfg66yk|MEkHxZsvl!w_lSM@`3_67IVeMoJGWD+PVq zoWy$?m+PX(I^(azSj3Ed4ApwEcwvPCB>pSoIKO(&|E$f|kc}u^=)&jcW5E#nt6M98 zXz)&)TU813&1+MZ%DZrL6gWoYwjT}jzP%`Zg=`YrS*hwM#1YQ|^(ni(;<>L*+VoMk zfkVY@E?p|G-Yv;UTyUIx;?X^Gp8H;V18me)qHK(nHTL#9*#Rvh(|ak+<{vynokj`) zF<(usGu2I3XBgk4X;wsDWA9I#nWr^(+$iS?TC?p+Ux&YbQY)%+bEBW z0x8-|(Wab96EemfyVIo%{=73^=;UD2Tq4dW6qz!ccS=qu4WlRLqqjLz86+ST1x;U_ z`;i9Lxh>Tyd`^oT_pQ^-T?`hzSQ&eP0dS1VfCPmGzQ zdfxd(@4nK-SOu*rM1y0{g_-L2-D}@6tTGHYXTMmvZ`2QZf!3`{`OrxLVlmAOP-pMW z3wFD;XIP>~^h_9L9vH@@hhHg=<<7*K&Zm`9b-`dT)GUqT$c|Xxt&s;3tPhDDiCLCf z?t=E1xH}*bA_n|OMb+#48ueB!be$LF;kBgtxES{+n8)0t 
z*mK<^UtL9GI^@c^Q<}Q~uK_SD@HesL7F6bG1MHjcigg()y62MJ`JP=4LPS4xnFOe8 zVUflJmFN!v`LhPAOw*G!IplsuJIjeTT|V3dSc45vl(>Yr^RV&YmVF9vx8M0%Oh$sE zsgoOTg}iezL5{i#cgF4gPf7K;J#cG!35Osy)ajRc7J}7m5njOdi~M`o>&kMsY~QySi3zh zLc%hq-&-;sX=|mSca~2#;dvO+U$S`>!UPG!bK1 z2=FHXc`48;DV3+)8@6BD;LmSKx`G@RI{b0YE4ArQf0-msnY6IQ3}fI69LU5cDj207 z`#^>@okm+U==GHKV1e*{*GA56V~`+OKJOwC?#v*-{rUnsbp~rx0D5|_3>aOQl-2bU zNArc`_*`tNkL~6fT}Q>vt&rE(d8+d1_U_g9F5D`W2j=EXF|e@5A2;|oYh!g31r-T6 z>`rY00VTPk44d3u>qS0-5HRd6PXTl+h}puHWmxiKs$zs5Vs+e#Ol)dA#G}1rEEb}&ybb830(M#y9!VdXhpAb9{`4hAWg6L7aN zA8JJ^dJ8&AB%gkmdKs45;xQr->uXm%?8++EF-wo&J(u#N?98?vn>+DFN_oG)hVN=2 zEL0mZujVKt9@3NecBXovzsw;zNxPLTlh>{`n0K-(RVQ3}j6W@3uR2?y%KoW$!m13* zY$R;48p?&QGXh3aM853=6hgHrRJ)MiaO1Cc((b8fTn5?uE|78|Yrp+@jf#JU$X%>1 zxjJiv-So~gf`8*fNPZNnftVCXj$fHKy1N01G`=vj!#j&)I$vM@A`_*h{Ta5I`NbHh zA)NMY!SPUHRsF{Jk=>5{ku*~)x#0y#qt8OFv8mQ@_* zu~h*a3=7qC?Q*v-@=YDu)`mz*P=-i~SY^vf46>Fz;(SW*q+ALRa`_5OCIM>U(8+F( z{c40q`J`aHjO8vL8{TK1KyFdo-5_f$cyiBC zlKPBRdo$^KaZF^LA63UKXlsA7?SJwDrT9ifcv$V2jO)1Kkz5~qg zHBEo?a%HSNA#F@9DY-&tKFis00I^&S;P@EOI_q3tN0YAQ zIY3GxIB^78n4LGw5Afr_%BuyL7!0^0sio^Zo#^IfB^`cfv$$UNmvTSxg(tD#|K(Wt zQ|V3jn5UJ7glAj;Gdvnothe7K8OoR%3`KGZg_4cH34{sOGH|zM#1pdb1H0e|B;&=Mfo)o7A%{k_p+7G(bu~?oPhP;) Rby!*J^e%b>9i*d){{k(KAlU!_ literal 0 HcmV?d00001 diff --git a/backend/output/8456b615_sample_graphrag_overview/layout.json b/backend/output/8456b615_sample_graphrag_overview/layout.json new file mode 100644 index 0000000..a25e876 --- /dev/null +++ b/backend/output/8456b615_sample_graphrag_overview/layout.json @@ -0,0 +1,4063 @@ +{ + "pdf_info": [ + { + "preproc_blocks": [ + { + "type": "title", + "bbox": [ + 205, + 148, + 390, + 172 + ], + "lines": [ + { + "bbox": [ + 203, + 144, + 393, + 177 + ], + "spans": [ + { + "bbox": [ + 203, + 144, + 393, + 177 + ], + "score": 1.0, + "content": "GraphRAG System", + "type": "text" + } + ], + "index": 0 + } + ], + "index": 0 + }, + { + "type": "text", + "bbox": [ + 172, + 197, + 423, + 214 + ], + "lines": [ + { + "bbox": [ + 171, 
+ 195, + 424, + 216 + ], + "spans": [ + { + "bbox": [ + 171, + 195, + 424, + 216 + ], + "score": 1.0, + "content": "Technical Architecture Overview", + "type": "text" + } + ], + "index": 1 + } + ], + "index": 1 + }, + { + "type": "text", + "bbox": [ + 217, + 229, + 377, + 244 + ], + "lines": [ + { + "bbox": [ + 216, + 227, + 378, + 246 + ], + "spans": [ + { + "bbox": [ + 216, + 227, + 378, + 246 + ], + "score": 1.0, + "content": "Version 1.0 | March 2026", + "type": "text" + } + ], + "index": 2 + } + ], + "index": 2 + } + ], + "page_idx": 0, + "page_size": [ + 595, + 841 + ], + "discarded_blocks": [], + "para_blocks": [ + { + "type": "title", + "bbox": [ + 205, + 148, + 390, + 172 + ], + "lines": [ + { + "bbox": [ + 203, + 144, + 393, + 177 + ], + "spans": [ + { + "bbox": [ + 203, + 144, + 393, + 177 + ], + "score": 1.0, + "content": "GraphRAG System", + "type": "text" + } + ], + "index": 0 + } + ], + "index": 0 + }, + { + "type": "text", + "bbox": [ + 172, + 197, + 423, + 214 + ], + "lines": [ + { + "bbox": [ + 171, + 195, + 424, + 216 + ], + "spans": [ + { + "bbox": [ + 171, + 195, + 424, + 216 + ], + "score": 1.0, + "content": "Technical Architecture Overview", + "type": "text" + } + ], + "index": 1 + } + ], + "index": 1, + "bbox_fs": [ + 171, + 195, + 424, + 216 + ] + }, + { + "type": "text", + "bbox": [ + 217, + 229, + 377, + 244 + ], + "lines": [ + { + "bbox": [ + 216, + 227, + 378, + 246 + ], + "spans": [ + { + "bbox": [ + 216, + 227, + 378, + 246 + ], + "score": 1.0, + "content": "Version 1.0 | March 2026", + "type": "text" + } + ], + "index": 2 + } + ], + "index": 2, + "bbox_fs": [ + 216, + 227, + 378, + 246 + ] + } + ] + }, + { + "preproc_blocks": [ + { + "type": "title", + "bbox": [ + 31, + 36, + 119, + 52 + ], + "lines": [ + { + "bbox": [ + 27, + 34, + 121, + 54 + ], + "spans": [ + { + "bbox": [ + 27, + 34, + 121, + 54 + ], + "score": 1.0, + "content": "1. 
Abstract", + "type": "text" + } + ], + "index": 0 + } + ], + "index": 0 + }, + { + "type": "text", + "bbox": [ + 29, + 70, + 566, + 144 + ], + "lines": [ + { + "bbox": [ + 29, + 69, + 565, + 84 + ], + "spans": [ + { + "bbox": [ + 29, + 69, + 565, + 84 + ], + "score": 1.0, + "content": "This document presents the technical architecture of a Multimodal GraphRAG System designed for", + "type": "text" + } + ], + "index": 1 + }, + { + "bbox": [ + 30, + 89, + 565, + 104 + ], + "spans": [ + { + "bbox": [ + 30, + 89, + 565, + 104 + ], + "score": 1.0, + "content": "intelligent document parsing and knowledge graph construction. The system integrates MinerU for", + "type": "text" + } + ], + "index": 2 + }, + { + "bbox": [ + 28, + 110, + 565, + 125 + ], + "spans": [ + { + "bbox": [ + 28, + 110, + 565, + 125 + ], + "score": 1.0, + "content": "document parsing, LangExtract for structured entity extraction, and a graph database for", + "type": "text" + } + ], + "index": 3 + }, + { + "bbox": [ + 28, + 129, + 207, + 145 + ], + "spans": [ + { + "bbox": [ + 28, + 129, + 207, + 145 + ], + "score": 1.0, + "content": "knowledge storage and retrieval.", + "type": "text" + } + ], + "index": 4 + } + ], + "index": 2.5 + }, + { + "type": "text", + "bbox": [ + 29, + 169, + 565, + 223 + ], + "lines": [ + { + "bbox": [ + 27, + 167, + 565, + 185 + ], + "spans": [ + { + "bbox": [ + 27, + 167, + 565, + 185 + ], + "score": 1.0, + "content": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files.", + "type": "text" + } + ], + "index": 5 + }, + { + "bbox": [ + 28, + 188, + 567, + 205 + ], + "spans": [ + { + "bbox": [ + 28, + 188, + 567, + 205 + ], + "score": 1.0, + "content": "Extracted entities and relations are stored as graph nodes and edges, enabling semantic search", + "type": "text" + } + ], + "index": 6 + }, + { + "bbox": [ + 28, + 209, + 331, + 224 + ], + "spans": [ + { + "bbox": [ + 28, + 209, + 331, + 224 + ], + "score": 1.0, + "content": "and question 
answering over large document collections.", + "type": "text" + } + ], + "index": 7 + } + ], + "index": 6 + }, + { + "type": "title", + "bbox": [ + 30, + 252, + 191, + 268 + ], + "lines": [ + { + "bbox": [ + 27, + 249, + 193, + 271 + ], + "spans": [ + { + "bbox": [ + 27, + 249, + 193, + 271 + ], + "score": 1.0, + "content": "2. System Components", + "type": "text" + } + ], + "index": 8 + } + ], + "index": 8 + }, + { + "type": "title", + "bbox": [ + 30, + 289, + 208, + 304 + ], + "lines": [ + { + "bbox": [ + 28, + 288, + 208, + 306 + ], + "spans": [ + { + "bbox": [ + 28, + 288, + 208, + 306 + ], + "score": 1.0, + "content": "2.1 Document Parsing Module", + "type": "text" + } + ], + "index": 9 + } + ], + "index": 9 + }, + { + "type": "text", + "bbox": [ + 29, + 314, + 566, + 367 + ], + "lines": [ + { + "bbox": [ + 28, + 313, + 563, + 329 + ], + "spans": [ + { + "bbox": [ + 28, + 313, + 563, + 329 + ], + "score": 1.0, + "content": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG,", + "type": "text" + } + ], + "index": 10 + }, + { + "bbox": [ + 29, + 334, + 566, + 348 + ], + "spans": [ + { + "bbox": [ + 29, + 334, + 566, + 348 + ], + "score": 1.0, + "content": "JPG, and HTML files. 
Output includes Markdown text, structured content_list.json, and extracted", + "type": "text" + } + ], + "index": 11 + }, + { + "bbox": [ + 28, + 352, + 69, + 370 + ], + "spans": [ + { + "bbox": [ + 28, + 352, + 69, + 370 + ], + "score": 1.0, + "content": "images.", + "type": "text" + } + ], + "index": 12 + } + ], + "index": 11 + }, + { + "type": "title", + "bbox": [ + 30, + 388, + 213, + 403 + ], + "lines": [ + { + "bbox": [ + 28, + 388, + 214, + 404 + ], + "spans": [ + { + "bbox": [ + 28, + 388, + 214, + 404 + ], + "score": 1.0, + "content": "2.2 Entity Extraction Module", + "type": "text" + } + ], + "index": 13 + } + ], + "index": 13 + }, + { + "type": "text", + "bbox": [ + 29, + 414, + 565, + 467 + ], + "lines": [ + { + "bbox": [ + 28, + 412, + 567, + 428 + ], + "spans": [ + { + "bbox": [ + 28, + 412, + 567, + 428 + ], + "score": 1.0, + "content": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot", + "type": "text" + } + ], + "index": 14 + }, + { + "bbox": [ + 28, + 432, + 565, + 448 + ], + "spans": [ + { + "bbox": [ + 28, + 432, + 565, + 448 + ], + "score": 1.0, + "content": "prompting with LLM backends (Gemini, OpenAI, or local Ollama). 
Each extraction includes", + "type": "text" + } + ], + "index": 15 + }, + { + "bbox": [ + 27, + 451, + 223, + 469 + ], + "spans": [ + { + "bbox": [ + 27, + 451, + 223, + 469 + ], + "score": 1.0, + "content": "character-level position anchoring.", + "type": "text" + } + ], + "index": 16 + } + ], + "index": 15 + }, + { + "type": "title", + "bbox": [ + 30, + 488, + 201, + 502 + ], + "lines": [ + { + "bbox": [ + 28, + 487, + 201, + 504 + ], + "spans": [ + { + "bbox": [ + 28, + 487, + 201, + 504 + ], + "score": 1.0, + "content": "2.3 Knowledge Graph Module", + "type": "text" + } + ], + "index": 17 + } + ], + "index": 17 + }, + { + "type": "text", + "bbox": [ + 29, + 512, + 565, + 567 + ], + "lines": [ + { + "bbox": [ + 27, + 510, + 564, + 529 + ], + "spans": [ + { + "bbox": [ + 27, + 510, + 564, + 529 + ], + "score": 1.0, + "content": "Extracted entities and relationships are stored in a graph database. Node types include: Person,", + "type": "text" + } + ], + "index": 18 + }, + { + "bbox": [ + 28, + 531, + 563, + 547 + ], + "spans": [ + { + "bbox": [ + 28, + 531, + 563, + 547 + ], + "score": 1.0, + "content": "Organization, Location, Event, Concept. 
Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY,", + "type": "text" + } + ], + "index": 19 + }, + { + "bbox": [ + 28, + 552, + 91, + 567 + ], + "spans": [ + { + "bbox": [ + 28, + 552, + 91, + 567 + ], + "score": 1.0, + "content": "LOCATED_IN.", + "type": "text" + } + ], + "index": 20 + } + ], + "index": 19 + }, + { + "type": "title", + "bbox": [ + 30, + 587, + 162, + 602 + ], + "lines": [ + { + "bbox": [ + 28, + 587, + 162, + 603 + ], + "spans": [ + { + "bbox": [ + 28, + 587, + 162, + 603 + ], + "score": 1.0, + "content": "2.4 Retrieval Module", + "type": "text" + } + ], + "index": 21 + } + ], + "index": 21 + }, + { + "type": "text", + "bbox": [ + 29, + 612, + 562, + 645 + ], + "lines": [ + { + "bbox": [ + 28, + 610, + 563, + 627 + ], + "spans": [ + { + "bbox": [ + 28, + 610, + 563, + 627 + ], + "score": 1.0, + "content": "The retrieval layer supports hybrid search combining vector similarity and graph traversal.", + "type": "text" + } + ], + "index": 22 + }, + { + "bbox": [ + 28, + 631, + 519, + 646 + ], + "spans": [ + { + "bbox": [ + 28, + 631, + 519, + 646 + ], + "score": 1.0, + "content": "Query results are ranked by relevance score and returned with source document references.", + "type": "text" + } + ], + "index": 23 + } + ], + "index": 22.5 + } + ], + "page_idx": 1, + "page_size": [ + 595, + 841 + ], + "discarded_blocks": [], + "para_blocks": [ + { + "type": "title", + "bbox": [ + 31, + 36, + 119, + 52 + ], + "lines": [ + { + "bbox": [ + 27, + 34, + 121, + 54 + ], + "spans": [ + { + "bbox": [ + 27, + 34, + 121, + 54 + ], + "score": 1.0, + "content": "1. 
Abstract", + "type": "text" + } + ], + "index": 0 + } + ], + "index": 0 + }, + { + "type": "text", + "bbox": [ + 29, + 70, + 566, + 144 + ], + "lines": [ + { + "bbox": [ + 29, + 69, + 565, + 84 + ], + "spans": [ + { + "bbox": [ + 29, + 69, + 565, + 84 + ], + "score": 1.0, + "content": "This document presents the technical architecture of a Multimodal GraphRAG System designed for", + "type": "text" + } + ], + "index": 1 + }, + { + "bbox": [ + 30, + 89, + 565, + 104 + ], + "spans": [ + { + "bbox": [ + 30, + 89, + 565, + 104 + ], + "score": 1.0, + "content": "intelligent document parsing and knowledge graph construction. The system integrates MinerU for", + "type": "text" + } + ], + "index": 2 + }, + { + "bbox": [ + 28, + 110, + 565, + 125 + ], + "spans": [ + { + "bbox": [ + 28, + 110, + 565, + 125 + ], + "score": 1.0, + "content": "document parsing, LangExtract for structured entity extraction, and a graph database for", + "type": "text" + } + ], + "index": 3 + }, + { + "bbox": [ + 28, + 129, + 207, + 145 + ], + "spans": [ + { + "bbox": [ + 28, + 129, + 207, + 145 + ], + "score": 1.0, + "content": "knowledge storage and retrieval.", + "type": "text" + } + ], + "index": 4 + } + ], + "index": 2.5, + "bbox_fs": [ + 28, + 69, + 565, + 145 + ] + }, + { + "type": "text", + "bbox": [ + 29, + 169, + 565, + 223 + ], + "lines": [ + { + "bbox": [ + 27, + 167, + 565, + 185 + ], + "spans": [ + { + "bbox": [ + 27, + 167, + 565, + 185 + ], + "score": 1.0, + "content": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files.", + "type": "text" + } + ], + "index": 5 + }, + { + "bbox": [ + 28, + 188, + 567, + 205 + ], + "spans": [ + { + "bbox": [ + 28, + 188, + 567, + 205 + ], + "score": 1.0, + "content": "Extracted entities and relations are stored as graph nodes and edges, enabling semantic search", + "type": "text" + } + ], + "index": 6 + }, + { + "bbox": [ + 28, + 209, + 331, + 224 + ], + "spans": [ + { + "bbox": [ + 28, + 209, + 331, + 224 + ], 
+ "score": 1.0, + "content": "and question answering over large document collections.", + "type": "text" + } + ], + "index": 7 + } + ], + "index": 6, + "bbox_fs": [ + 27, + 167, + 567, + 224 + ] + }, + { + "type": "title", + "bbox": [ + 30, + 252, + 191, + 268 + ], + "lines": [ + { + "bbox": [ + 27, + 249, + 193, + 271 + ], + "spans": [ + { + "bbox": [ + 27, + 249, + 193, + 271 + ], + "score": 1.0, + "content": "2. System Components", + "type": "text" + } + ], + "index": 8 + } + ], + "index": 8 + }, + { + "type": "title", + "bbox": [ + 30, + 289, + 208, + 304 + ], + "lines": [ + { + "bbox": [ + 28, + 288, + 208, + 306 + ], + "spans": [ + { + "bbox": [ + 28, + 288, + 208, + 306 + ], + "score": 1.0, + "content": "2.1 Document Parsing Module", + "type": "text" + } + ], + "index": 9 + } + ], + "index": 9 + }, + { + "type": "text", + "bbox": [ + 29, + 314, + 566, + 367 + ], + "lines": [ + { + "bbox": [ + 28, + 313, + 563, + 329 + ], + "spans": [ + { + "bbox": [ + 28, + 313, + 563, + 329 + ], + "score": 1.0, + "content": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG,", + "type": "text" + } + ], + "index": 10 + }, + { + "bbox": [ + 29, + 334, + 566, + 348 + ], + "spans": [ + { + "bbox": [ + 29, + 334, + 566, + 348 + ], + "score": 1.0, + "content": "JPG, and HTML files. 
Output includes Markdown text, structured content_list.json, and extracted", + "type": "text" + } + ], + "index": 11 + }, + { + "bbox": [ + 28, + 352, + 69, + 370 + ], + "spans": [ + { + "bbox": [ + 28, + 352, + 69, + 370 + ], + "score": 1.0, + "content": "images.", + "type": "text" + } + ], + "index": 12 + } + ], + "index": 11, + "bbox_fs": [ + 28, + 313, + 566, + 370 + ] + }, + { + "type": "title", + "bbox": [ + 30, + 388, + 213, + 403 + ], + "lines": [ + { + "bbox": [ + 28, + 388, + 214, + 404 + ], + "spans": [ + { + "bbox": [ + 28, + 388, + 214, + 404 + ], + "score": 1.0, + "content": "2.2 Entity Extraction Module", + "type": "text" + } + ], + "index": 13 + } + ], + "index": 13 + }, + { + "type": "text", + "bbox": [ + 29, + 414, + 565, + 467 + ], + "lines": [ + { + "bbox": [ + 28, + 412, + 567, + 428 + ], + "spans": [ + { + "bbox": [ + 28, + 412, + 567, + 428 + ], + "score": 1.0, + "content": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot", + "type": "text" + } + ], + "index": 14 + }, + { + "bbox": [ + 28, + 432, + 565, + 448 + ], + "spans": [ + { + "bbox": [ + 28, + 432, + 565, + 448 + ], + "score": 1.0, + "content": "prompting with LLM backends (Gemini, OpenAI, or local Ollama). 
Each extraction includes", + "type": "text" + } + ], + "index": 15 + }, + { + "bbox": [ + 27, + 451, + 223, + 469 + ], + "spans": [ + { + "bbox": [ + 27, + 451, + 223, + 469 + ], + "score": 1.0, + "content": "character-level position anchoring.", + "type": "text" + } + ], + "index": 16 + } + ], + "index": 15, + "bbox_fs": [ + 27, + 412, + 567, + 469 + ] + }, + { + "type": "title", + "bbox": [ + 30, + 488, + 201, + 502 + ], + "lines": [ + { + "bbox": [ + 28, + 487, + 201, + 504 + ], + "spans": [ + { + "bbox": [ + 28, + 487, + 201, + 504 + ], + "score": 1.0, + "content": "2.3 Knowledge Graph Module", + "type": "text" + } + ], + "index": 17 + } + ], + "index": 17 + }, + { + "type": "text", + "bbox": [ + 29, + 512, + 565, + 567 + ], + "lines": [ + { + "bbox": [ + 27, + 510, + 564, + 529 + ], + "spans": [ + { + "bbox": [ + 27, + 510, + 564, + 529 + ], + "score": 1.0, + "content": "Extracted entities and relationships are stored in a graph database. Node types include: Person,", + "type": "text" + } + ], + "index": 18 + }, + { + "bbox": [ + 28, + 531, + 563, + 547 + ], + "spans": [ + { + "bbox": [ + 28, + 531, + 563, + 547 + ], + "score": 1.0, + "content": "Organization, Location, Event, Concept. 
Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY,", + "type": "text" + } + ], + "index": 19 + }, + { + "bbox": [ + 28, + 552, + 91, + 567 + ], + "spans": [ + { + "bbox": [ + 28, + 552, + 91, + 567 + ], + "score": 1.0, + "content": "LOCATED_IN.", + "type": "text" + } + ], + "index": 20 + } + ], + "index": 19, + "bbox_fs": [ + 27, + 510, + 564, + 567 + ] + }, + { + "type": "title", + "bbox": [ + 30, + 587, + 162, + 602 + ], + "lines": [ + { + "bbox": [ + 28, + 587, + 162, + 603 + ], + "spans": [ + { + "bbox": [ + 28, + 587, + 162, + 603 + ], + "score": 1.0, + "content": "2.4 Retrieval Module", + "type": "text" + } + ], + "index": 21 + } + ], + "index": 21 + }, + { + "type": "list", + "bbox": [ + 29, + 612, + 562, + 645 + ], + "lines": [ + { + "bbox": [ + 28, + 610, + 563, + 627 + ], + "spans": [ + { + "bbox": [ + 28, + 610, + 563, + 627 + ], + "score": 1.0, + "content": "The retrieval layer supports hybrid search combining vector similarity and graph traversal.", + "type": "text" + } + ], + "index": 22, + "is_list_end_line": true + }, + { + "bbox": [ + 28, + 631, + 519, + 646 + ], + "spans": [ + { + "bbox": [ + 28, + 631, + 519, + 646 + ], + "score": 1.0, + "content": "Query results are ranked by relevance score and returned with source document references.", + "type": "text" + } + ], + "index": 23, + "is_list_start_line": true, + "is_list_end_line": true + } + ], + "index": 22.5, + "bbox_fs": [ + 28, + 610, + 563, + 646 + ] + } + ] + }, + { + "preproc_blocks": [ + { + "type": "title", + "bbox": [ + 30, + 36, + 160, + 52 + ], + "lines": [ + { + "bbox": [ + 27, + 34, + 162, + 54 + ], + "spans": [ + { + "bbox": [ + 27, + 34, + 162, + 54 + ], + "score": 1.0, + "content": "3. 
Data Pipeline", + "type": "text" + } + ], + "index": 0 + } + ], + "index": 0 + }, + { + "type": "text", + "bbox": [ + 30, + 70, + 371, + 84 + ], + "lines": [ + { + "bbox": [ + 28, + 68, + 373, + 86 + ], + "spans": [ + { + "bbox": [ + 28, + 68, + 373, + 86 + ], + "score": 1.0, + "content": "The end-to-end data pipeline consists of the following stages:", + "type": "text" + } + ], + "index": 1 + } + ], + "index": 1 + }, + { + "type": "text", + "bbox": [ + 41, + 110, + 192, + 123 + ], + "lines": [ + { + "bbox": [ + 39, + 109, + 192, + 124 + ], + "spans": [ + { + "bbox": [ + 39, + 109, + 192, + 124 + ], + "score": 1.0, + "content": "Stage 1: Document Ingestion", + "type": "text" + } + ], + "index": 2 + } + ], + "index": 2 + }, + { + "type": "text", + "bbox": [ + 51, + 129, + 316, + 183 + ], + "lines": [ + { + "bbox": [ + 51, + 129, + 317, + 145 + ], + "spans": [ + { + "bbox": [ + 51, + 129, + 317, + 145 + ], + "score": 1.0, + "content": "- Accept raw documents (PDF, DOCX, images, HTML)", + "type": "text" + } + ], + "index": 3 + }, + { + "bbox": [ + 50, + 149, + 242, + 165 + ], + "spans": [ + { + "bbox": [ + 50, + 149, + 242, + 165 + ], + "score": 1.0, + "content": "- Submit to MinerU API for parsing", + "type": "text" + } + ], + "index": 4 + }, + { + "bbox": [ + 51, + 169, + 258, + 184 + ], + "spans": [ + { + "bbox": [ + 51, + 169, + 220, + 184 + ], + "score": 1.0, + "content": "- Poll task status until state", + "type": "text" + }, + { + "bbox": [ + 221, + 171, + 231, + 181 + ], + "score": 0.76, + "content": "\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }", + "type": "inline_equation" + }, + { + "bbox": [ + 231, + 169, + 258, + 184 + ], + "score": 1.0, + "content": "done", + "type": "text" + } + ], + "index": 5 + } + ], + "index": 4 + }, + { + "type": "text", + "bbox": [ + 41, + 210, + 192, + 222 + ], + "lines": [ + { + "bbox": [ + 40, + 209, + 192, + 223 + ], + "spans": [ + { + "bbox": [ + 40, + 209, + 192, + 223 + ], + "score": 1.0, + "content": 
"Stage 2: Content Extraction", + "type": "text" + } + ], + "index": 6 + } + ], + "index": 6 + }, + { + "type": "text", + "bbox": [ + 51, + 229, + 323, + 282 + ], + "lines": [ + { + "bbox": [ + 51, + 229, + 262, + 243 + ], + "spans": [ + { + "bbox": [ + 51, + 229, + 262, + 243 + ], + "score": 1.0, + "content": "- Download and decompress full_zip_url", + "type": "text" + } + ], + "index": 7 + }, + { + "bbox": [ + 50, + 248, + 313, + 263 + ], + "spans": [ + { + "bbox": [ + 50, + 248, + 313, + 263 + ], + "score": 1.0, + "content": "- Parse content_list.json into Document objects", + "type": "text" + } + ], + "index": 8 + }, + { + "bbox": [ + 51, + 269, + 323, + 284 + ], + "spans": [ + { + "bbox": [ + 51, + 269, + 323, + 284 + ], + "score": 1.0, + "content": "- Separate text blocks, tables, images, equations", + "type": "text" + } + ], + "index": 9 + } + ], + "index": 8 + }, + { + "type": "text", + "bbox": [ + 40, + 309, + 247, + 321 + ], + "lines": [ + { + "bbox": [ + 40, + 308, + 247, + 322 + ], + "spans": [ + { + "bbox": [ + 40, + 308, + 247, + 322 + ], + "score": 1.0, + "content": "Stage 3: Entity & Relation Extraction", + "type": "text" + } + ], + "index": 10 + } + ], + "index": 10 + }, + { + "type": "text", + "bbox": [ + 51, + 328, + 313, + 382 + ], + "lines": [ + { + "bbox": [ + 51, + 327, + 236, + 342 + ], + "spans": [ + { + "bbox": [ + 51, + 327, + 236, + 342 + ], + "score": 1.0, + "content": "- Feed text blocks to LangExtract", + "type": "text" + } + ], + "index": 11 + }, + { + "bbox": [ + 50, + 348, + 312, + 362 + ], + "spans": [ + { + "bbox": [ + 50, + 348, + 312, + 362 + ], + "score": 1.0, + "content": "- Extract entities with char_interval positions", + "type": "text" + } + ], + "index": 12 + }, + { + "bbox": [ + 51, + 368, + 274, + 382 + ], + "spans": [ + { + "bbox": [ + 51, + 368, + 274, + 382 + ], + "score": 1.0, + "content": "- Extract relationships between entities", + "type": "text" + } + ], + "index": 13 + } + ], + "index": 12 + }, + { + "type": 
"text", + "bbox": [ + 41, + 408, + 192, + 421 + ], + "lines": [ + { + "bbox": [ + 40, + 408, + 192, + 422 + ], + "spans": [ + { + "bbox": [ + 40, + 408, + 192, + 422 + ], + "score": 1.0, + "content": "Stage 4: Graph Construction", + "type": "text" + } + ], + "index": 14 + } + ], + "index": 14 + }, + { + "type": "text", + "bbox": [ + 51, + 428, + 311, + 481 + ], + "lines": [ + { + "bbox": [ + 50, + 426, + 285, + 443 + ], + "spans": [ + { + "bbox": [ + 50, + 426, + 285, + 443 + ], + "score": 1.0, + "content": "- Map extractions to graph nodes and edges", + "type": "text" + } + ], + "index": 15 + }, + { + "bbox": [ + 51, + 447, + 311, + 462 + ], + "spans": [ + { + "bbox": [ + 51, + 447, + 311, + 462 + ], + "score": 1.0, + "content": "- Store with source provenance (page_idx, bbox)", + "type": "text" + } + ], + "index": 16 + }, + { + "bbox": [ + 51, + 467, + 302, + 481 + ], + "spans": [ + { + "bbox": [ + 51, + 467, + 302, + 481 + ], + "score": 1.0, + "content": "- Build vector embeddings for semantic search", + "type": "text" + } + ], + "index": 17 + } + ], + "index": 16 + }, + { + "type": "title", + "bbox": [ + 30, + 508, + 194, + 522 + ], + "lines": [ + { + "bbox": [ + 28, + 507, + 195, + 524 + ], + "spans": [ + { + "bbox": [ + 28, + 507, + 195, + 524 + ], + "score": 1.0, + "content": "4. Supported File Formats", + "type": "text" + } + ], + "index": 18 + } + ], + "index": 18 + }, + { + "type": "table", + "bbox": [ + 27, + 534, + 525, + 678 + ], + "blocks": [ + { + "type": "table_body", + "bbox": [ + 27, + 534, + 525, + 678 + ], + "group_id": 0, + "lines": [ + { + "bbox": [ + 27, + 534, + 525, + 678 + ], + "spans": [ + { + "bbox": [ + 27, + 534, + 525, + 678 + ], + "score": 0.985, + "html": "
FormatExtensionOCR RequiredModeI
PDF (text). pdfNopipeline / vlm
PDF (scan). pdfYesvIlm
Word. docxNopipeline
PowerPoint.pptxNopipeline
Image.png / .jpgAutovIlm
HTML.htmlNoMinerU-HTML
", + "type": "table", + "image_path": "1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg" + } + ] + } + ], + "index": 20, + "virtual_lines": [ + { + "bbox": [ + 27, + 534, + 525, + 582.0 + ], + "spans": [], + "index": 19 + }, + { + "bbox": [ + 27, + 582.0, + 525, + 630.0 + ], + "spans": [], + "index": 20 + }, + { + "bbox": [ + 27, + 630.0, + 525, + 678.0 + ], + "spans": [], + "index": 21 + } + ] + } + ], + "index": 20 + } + ], + "page_idx": 2, + "page_size": [ + 595, + 841 + ], + "discarded_blocks": [], + "para_blocks": [ + { + "type": "title", + "bbox": [ + 30, + 36, + 160, + 52 + ], + "lines": [ + { + "bbox": [ + 27, + 34, + 162, + 54 + ], + "spans": [ + { + "bbox": [ + 27, + 34, + 162, + 54 + ], + "score": 1.0, + "content": "3. Data Pipeline", + "type": "text" + } + ], + "index": 0 + } + ], + "index": 0 + }, + { + "type": "text", + "bbox": [ + 30, + 70, + 371, + 84 + ], + "lines": [ + { + "bbox": [ + 28, + 68, + 373, + 86 + ], + "spans": [ + { + "bbox": [ + 28, + 68, + 373, + 86 + ], + "score": 1.0, + "content": "The end-to-end data pipeline consists of the following stages:", + "type": "text" + } + ], + "index": 1 + } + ], + "index": 1, + "bbox_fs": [ + 28, + 68, + 373, + 86 + ] + }, + { + "type": "text", + "bbox": [ + 41, + 110, + 192, + 123 + ], + "lines": [ + { + "bbox": [ + 39, + 109, + 192, + 124 + ], + "spans": [ + { + "bbox": [ + 39, + 109, + 192, + 124 + ], + "score": 1.0, + "content": "Stage 1: Document Ingestion", + "type": "text" + } + ], + "index": 2 + } + ], + "index": 2, + "bbox_fs": [ + 39, + 109, + 192, + 124 + ] + }, + { + "type": "text", + "bbox": [ + 51, + 129, + 316, + 183 + ], + "lines": [ + { + "bbox": [ + 51, + 129, + 317, + 145 + ], + "spans": [ + { + "bbox": [ + 51, + 129, + 317, + 145 + ], + "score": 1.0, + "content": "- Accept raw documents (PDF, DOCX, images, HTML)", + "type": "text" + } + ], + "index": 3 + }, + { + "bbox": [ + 50, + 149, + 242, + 165 + ], + "spans": [ + { + "bbox": [ + 50, + 149, + 242, + 165 + ], 
+ "score": 1.0, + "content": "- Submit to MinerU API for parsing", + "type": "text" + } + ], + "index": 4 + }, + { + "bbox": [ + 51, + 169, + 258, + 184 + ], + "spans": [ + { + "bbox": [ + 51, + 169, + 220, + 184 + ], + "score": 1.0, + "content": "- Poll task status until state", + "type": "text" + }, + { + "bbox": [ + 221, + 171, + 231, + 181 + ], + "score": 0.76, + "content": "\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }", + "type": "inline_equation" + }, + { + "bbox": [ + 231, + 169, + 258, + 184 + ], + "score": 1.0, + "content": "done", + "type": "text" + } + ], + "index": 5 + } + ], + "index": 4, + "bbox_fs": [ + 50, + 129, + 317, + 184 + ] + }, + { + "type": "text", + "bbox": [ + 41, + 210, + 192, + 222 + ], + "lines": [ + { + "bbox": [ + 40, + 209, + 192, + 223 + ], + "spans": [ + { + "bbox": [ + 40, + 209, + 192, + 223 + ], + "score": 1.0, + "content": "Stage 2: Content Extraction", + "type": "text" + } + ], + "index": 6 + } + ], + "index": 6, + "bbox_fs": [ + 40, + 209, + 192, + 223 + ] + }, + { + "type": "text", + "bbox": [ + 51, + 229, + 323, + 282 + ], + "lines": [ + { + "bbox": [ + 51, + 229, + 262, + 243 + ], + "spans": [ + { + "bbox": [ + 51, + 229, + 262, + 243 + ], + "score": 1.0, + "content": "- Download and decompress full_zip_url", + "type": "text" + } + ], + "index": 7 + }, + { + "bbox": [ + 50, + 248, + 313, + 263 + ], + "spans": [ + { + "bbox": [ + 50, + 248, + 313, + 263 + ], + "score": 1.0, + "content": "- Parse content_list.json into Document objects", + "type": "text" + } + ], + "index": 8 + }, + { + "bbox": [ + 51, + 269, + 323, + 284 + ], + "spans": [ + { + "bbox": [ + 51, + 269, + 323, + 284 + ], + "score": 1.0, + "content": "- Separate text blocks, tables, images, equations", + "type": "text" + } + ], + "index": 9 + } + ], + "index": 8, + "bbox_fs": [ + 50, + 229, + 323, + 284 + ] + }, + { + "type": "text", + "bbox": [ + 40, + 309, + 247, + 321 + ], + "lines": [ + { + "bbox": [ + 40, + 308, + 247, + 322 + ], + 
"spans": [ + { + "bbox": [ + 40, + 308, + 247, + 322 + ], + "score": 1.0, + "content": "Stage 3: Entity & Relation Extraction", + "type": "text" + } + ], + "index": 10 + } + ], + "index": 10, + "bbox_fs": [ + 40, + 308, + 247, + 322 + ] + }, + { + "type": "text", + "bbox": [ + 51, + 328, + 313, + 382 + ], + "lines": [ + { + "bbox": [ + 51, + 327, + 236, + 342 + ], + "spans": [ + { + "bbox": [ + 51, + 327, + 236, + 342 + ], + "score": 1.0, + "content": "- Feed text blocks to LangExtract", + "type": "text" + } + ], + "index": 11 + }, + { + "bbox": [ + 50, + 348, + 312, + 362 + ], + "spans": [ + { + "bbox": [ + 50, + 348, + 312, + 362 + ], + "score": 1.0, + "content": "- Extract entities with char_interval positions", + "type": "text" + } + ], + "index": 12 + }, + { + "bbox": [ + 51, + 368, + 274, + 382 + ], + "spans": [ + { + "bbox": [ + 51, + 368, + 274, + 382 + ], + "score": 1.0, + "content": "- Extract relationships between entities", + "type": "text" + } + ], + "index": 13 + } + ], + "index": 12, + "bbox_fs": [ + 50, + 327, + 312, + 382 + ] + }, + { + "type": "text", + "bbox": [ + 41, + 408, + 192, + 421 + ], + "lines": [ + { + "bbox": [ + 40, + 408, + 192, + 422 + ], + "spans": [ + { + "bbox": [ + 40, + 408, + 192, + 422 + ], + "score": 1.0, + "content": "Stage 4: Graph Construction", + "type": "text" + } + ], + "index": 14 + } + ], + "index": 14, + "bbox_fs": [ + 40, + 408, + 192, + 422 + ] + }, + { + "type": "text", + "bbox": [ + 51, + 428, + 311, + 481 + ], + "lines": [ + { + "bbox": [ + 50, + 426, + 285, + 443 + ], + "spans": [ + { + "bbox": [ + 50, + 426, + 285, + 443 + ], + "score": 1.0, + "content": "- Map extractions to graph nodes and edges", + "type": "text" + } + ], + "index": 15 + }, + { + "bbox": [ + 51, + 447, + 311, + 462 + ], + "spans": [ + { + "bbox": [ + 51, + 447, + 311, + 462 + ], + "score": 1.0, + "content": "- Store with source provenance (page_idx, bbox)", + "type": "text" + } + ], + "index": 16 + }, + { + "bbox": [ + 51, + 467, + 302, + 
481 + ], + "spans": [ + { + "bbox": [ + 51, + 467, + 302, + 481 + ], + "score": 1.0, + "content": "- Build vector embeddings for semantic search", + "type": "text" + } + ], + "index": 17 + } + ], + "index": 16, + "bbox_fs": [ + 50, + 426, + 311, + 481 + ] + }, + { + "type": "title", + "bbox": [ + 30, + 508, + 194, + 522 + ], + "lines": [ + { + "bbox": [ + 28, + 507, + 195, + 524 + ], + "spans": [ + { + "bbox": [ + 28, + 507, + 195, + 524 + ], + "score": 1.0, + "content": "4. Supported File Formats", + "type": "text" + } + ], + "index": 18 + } + ], + "index": 18 + }, + { + "type": "table", + "bbox": [ + 27, + 534, + 525, + 678 + ], + "blocks": [ + { + "type": "table_body", + "bbox": [ + 27, + 534, + 525, + 678 + ], + "group_id": 0, + "lines": [ + { + "bbox": [ + 27, + 534, + 525, + 678 + ], + "spans": [ + { + "bbox": [ + 27, + 534, + 525, + 678 + ], + "score": 0.985, + "html": "
FormatExtensionOCR RequiredModeI
PDF (text). pdfNopipeline / vlm
PDF (scan). pdfYesvIlm
Word. docxNopipeline
PowerPoint.pptxNopipeline
Image.png / .jpgAutovIlm
HTML.htmlNoMinerU-HTML
", + "type": "table", + "image_path": "1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg" + } + ] + } + ], + "index": 20, + "virtual_lines": [ + { + "bbox": [ + 27, + 534, + 525, + 582.0 + ], + "spans": [], + "index": 19 + }, + { + "bbox": [ + 27, + 582.0, + 525, + 630.0 + ], + "spans": [], + "index": 20 + }, + { + "bbox": [ + 27, + 630.0, + 525, + 678.0 + ], + "spans": [], + "index": 21 + } + ] + } + ], + "index": 20 + } + ] + }, + { + "preproc_blocks": [ + { + "type": "title", + "bbox": [ + 29, + 36, + 272, + 53 + ], + "lines": [ + { + "bbox": [ + 27, + 33, + 274, + 55 + ], + "spans": [ + { + "bbox": [ + 27, + 33, + 274, + 55 + ], + "score": 1.0, + "content": "5. API Configuration Reference", + "type": "text" + } + ], + "index": 0 + } + ], + "index": 0 + }, + { + "type": "text", + "bbox": [ + 29, + 70, + 567, + 104 + ], + "lines": [ + { + "bbox": [ + 27, + 67, + 567, + 87 + ], + "spans": [ + { + "bbox": [ + 27, + 67, + 567, + 87 + ], + "score": 1.0, + "content": "The following environment variables must be configured before running the MinerU parsing", + "type": "text" + } + ], + "index": 1 + }, + { + "bbox": [ + 27, + 90, + 77, + 105 + ], + "spans": [ + { + "bbox": [ + 27, + 90, + 77, + 105 + ], + "score": 1.0, + "content": "service:", + "type": "text" + } + ], + "index": 2 + } + ], + "index": 1.5 + }, + { + "type": "text", + "bbox": [ + 39, + 128, + 379, + 284 + ], + "lines": [ + { + "bbox": [ + 39, + 129, + 362, + 145 + ], + "spans": [ + { + "bbox": [ + 39, + 130, + 132, + 144 + ], + "score": 1.0, + "content": "MINERU_API_TOKEN", + "type": "text" + }, + { + "bbox": [ + 155, + 129, + 362, + 145 + ], + "score": 1.0, + "content": ": Bearer token for API authentication", + "type": "text" + } + ], + "index": 3 + }, + { + "bbox": [ + 39, + 149, + 335, + 165 + ], + "spans": [ + { + "bbox": [ + 39, + 149, + 126, + 165 + ], + "score": 1.0, + "content": "MINERU_USER_UID", + "type": "text" + }, + { + "bbox": [ + 156, + 149, + 335, + 164 + ], + "score": 
1.0, + "content": ": User UUID for quota management", + "type": "text" + } + ], + "index": 4 + }, + { + "bbox": [ + 39, + 170, + 307, + 183 + ], + "spans": [ + { + "bbox": [ + 39, + 170, + 126, + 183 + ], + "score": 1.0, + "content": "MINERU_BASE_URL", + "type": "text" + }, + { + "bbox": [ + 156, + 170, + 307, + 183 + ], + "score": 1.0, + "content": ": https://mineru.net/api/v4", + "type": "text" + } + ], + "index": 5 + }, + { + "bbox": [ + 39, + 189, + 379, + 204 + ], + "spans": [ + { + "bbox": [ + 39, + 189, + 379, + 204 + ], + "score": 1.0, + "content": "MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML", + "type": "text" + } + ], + "index": 6 + }, + { + "bbox": [ + 40, + 210, + 316, + 223 + ], + "spans": [ + { + "bbox": [ + 40, + 210, + 126, + 223 + ], + "score": 1.0, + "content": "MINERU_LANGUAGE", + "type": "text" + }, + { + "bbox": [ + 156, + 210, + 316, + 223 + ], + "score": 1.0, + "content": ": ch (Chinese) | en (English)", + "type": "text" + } + ], + "index": 7 + }, + { + "bbox": [ + 39, + 229, + 371, + 244 + ], + "spans": [ + { + "bbox": [ + 39, + 230, + 115, + 244 + ], + "score": 1.0, + "content": "MINERU_IS_OCR", + "type": "text" + }, + { + "bbox": [ + 156, + 229, + 371, + 243 + ], + "score": 1.0, + "content": ": false (text PDF) | true (scanned PDF)", + "type": "text" + } + ], + "index": 8 + }, + { + "bbox": [ + 39, + 249, + 236, + 263 + ], + "spans": [ + { + "bbox": [ + 39, + 249, + 236, + 263 + ], + "score": 1.0, + "content": "MINERU_ENABLE_FORMULA: true | false", + "type": "text" + } + ], + "index": 9 + }, + { + "bbox": [ + 39, + 269, + 236, + 282 + ], + "spans": [ + { + "bbox": [ + 39, + 269, + 236, + 282 + ], + "score": 1.0, + "content": "MINERU_ENABLE_TABLE : true | false", + "type": "text" + } + ], + "index": 10 + } + ], + "index": 6.5 + }, + { + "type": "text", + "bbox": [ + 29, + 309, + 96, + 321 + ], + "lines": [ + { + "bbox": [ + 28, + 308, + 97, + 322 + ], + "spans": [ + { + "bbox": [ + 28, + 308, + 97, + 322 + ], + "score": 
1.0, + "content": "Rate Limits:", + "type": "text" + } + ], + "index": 11 + } + ], + "index": 11 + }, + { + "type": "text", + "bbox": [ + 39, + 327, + 300, + 402 + ], + "lines": [ + { + "bbox": [ + 39, + 327, + 242, + 343 + ], + "spans": [ + { + "bbox": [ + 39, + 327, + 126, + 342 + ], + "score": 1.0, + "content": "- Max file size", + "type": "text" + }, + { + "bbox": [ + 139, + 327, + 242, + 343 + ], + "score": 1.0, + "content": ": 200 MB per file", + "type": "text" + } + ], + "index": 12 + }, + { + "bbox": [ + 39, + 347, + 258, + 364 + ], + "spans": [ + { + "bbox": [ + 39, + 347, + 104, + 364 + ], + "score": 1.0, + "content": "- Max pages", + "type": "text" + }, + { + "bbox": [ + 145, + 348, + 258, + 363 + ], + "score": 1.0, + "content": ": 600 pages per file", + "type": "text" + } + ], + "index": 13 + }, + { + "bbox": [ + 39, + 367, + 300, + 383 + ], + "spans": [ + { + "bbox": [ + 39, + 367, + 115, + 383 + ], + "score": 1.0, + "content": "- Daily quota", + "type": "text" + }, + { + "bbox": [ + 145, + 368, + 300, + 383 + ], + "score": 1.0, + "content": ": 2000 pages (high priority)", + "type": "text" + } + ], + "index": 14 + }, + { + "bbox": [ + 39, + 387, + 274, + 403 + ], + "spans": [ + { + "bbox": [ + 39, + 387, + 116, + 402 + ], + "score": 1.0, + "content": "- Batch limit", + "type": "text" + }, + { + "bbox": [ + 144, + 387, + 274, + 403 + ], + "score": 1.0, + "content": ": 200 files per request", + "type": "text" + } + ], + "index": 15 + } + ], + "index": 13.5 + } + ], + "page_idx": 3, + "page_size": [ + 595, + 841 + ], + "discarded_blocks": [], + "para_blocks": [ + { + "type": "title", + "bbox": [ + 29, + 36, + 272, + 53 + ], + "lines": [ + { + "bbox": [ + 27, + 33, + 274, + 55 + ], + "spans": [ + { + "bbox": [ + 27, + 33, + 274, + 55 + ], + "score": 1.0, + "content": "5. 
API Configuration Reference", + "type": "text" + } + ], + "index": 0 + } + ], + "index": 0 + }, + { + "type": "text", + "bbox": [ + 29, + 70, + 567, + 104 + ], + "lines": [ + { + "bbox": [ + 27, + 67, + 567, + 87 + ], + "spans": [ + { + "bbox": [ + 27, + 67, + 567, + 87 + ], + "score": 1.0, + "content": "The following environment variables must be configured before running the MinerU parsing", + "type": "text" + } + ], + "index": 1 + }, + { + "bbox": [ + 27, + 90, + 77, + 105 + ], + "spans": [ + { + "bbox": [ + 27, + 90, + 77, + 105 + ], + "score": 1.0, + "content": "service:", + "type": "text" + } + ], + "index": 2 + } + ], + "index": 1.5, + "bbox_fs": [ + 27, + 67, + 567, + 105 + ] + }, + { + "type": "list", + "bbox": [ + 39, + 128, + 379, + 284 + ], + "lines": [ + { + "bbox": [ + 39, + 129, + 362, + 145 + ], + "spans": [ + { + "bbox": [ + 39, + 130, + 132, + 144 + ], + "score": 1.0, + "content": "MINERU_API_TOKEN", + "type": "text" + }, + { + "bbox": [ + 155, + 129, + 362, + 145 + ], + "score": 1.0, + "content": ": Bearer token for API authentication", + "type": "text" + } + ], + "index": 3, + "is_list_start_line": true + }, + { + "bbox": [ + 39, + 149, + 335, + 165 + ], + "spans": [ + { + "bbox": [ + 39, + 149, + 126, + 165 + ], + "score": 1.0, + "content": "MINERU_USER_UID", + "type": "text" + }, + { + "bbox": [ + 156, + 149, + 335, + 164 + ], + "score": 1.0, + "content": ": User UUID for quota management", + "type": "text" + } + ], + "index": 4, + "is_list_start_line": true + }, + { + "bbox": [ + 39, + 170, + 307, + 183 + ], + "spans": [ + { + "bbox": [ + 39, + 170, + 126, + 183 + ], + "score": 1.0, + "content": "MINERU_BASE_URL", + "type": "text" + }, + { + "bbox": [ + 156, + 170, + 307, + 183 + ], + "score": 1.0, + "content": ": https://mineru.net/api/v4", + "type": "text" + } + ], + "index": 5, + "is_list_start_line": true + }, + { + "bbox": [ + 39, + 189, + 379, + 204 + ], + "spans": [ + { + "bbox": [ + 39, + 189, + 379, + 204 + ], + "score": 1.0, + 
"content": "MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML", + "type": "text" + } + ], + "index": 6, + "is_list_start_line": true + }, + { + "bbox": [ + 40, + 210, + 316, + 223 + ], + "spans": [ + { + "bbox": [ + 40, + 210, + 126, + 223 + ], + "score": 1.0, + "content": "MINERU_LANGUAGE", + "type": "text" + }, + { + "bbox": [ + 156, + 210, + 316, + 223 + ], + "score": 1.0, + "content": ": ch (Chinese) | en (English)", + "type": "text" + } + ], + "index": 7, + "is_list_start_line": true + }, + { + "bbox": [ + 39, + 229, + 371, + 244 + ], + "spans": [ + { + "bbox": [ + 39, + 230, + 115, + 244 + ], + "score": 1.0, + "content": "MINERU_IS_OCR", + "type": "text" + }, + { + "bbox": [ + 156, + 229, + 371, + 243 + ], + "score": 1.0, + "content": ": false (text PDF) | true (scanned PDF)", + "type": "text" + } + ], + "index": 8, + "is_list_start_line": true + }, + { + "bbox": [ + 39, + 249, + 236, + 263 + ], + "spans": [ + { + "bbox": [ + 39, + 249, + 236, + 263 + ], + "score": 1.0, + "content": "MINERU_ENABLE_FORMULA: true | false", + "type": "text" + } + ], + "index": 9, + "is_list_start_line": true + }, + { + "bbox": [ + 39, + 269, + 236, + 282 + ], + "spans": [ + { + "bbox": [ + 39, + 269, + 236, + 282 + ], + "score": 1.0, + "content": "MINERU_ENABLE_TABLE : true | false", + "type": "text" + } + ], + "index": 10, + "is_list_start_line": true + } + ], + "index": 6.5, + "bbox_fs": [ + 39, + 129, + 379, + 282 + ] + }, + { + "type": "text", + "bbox": [ + 29, + 309, + 96, + 321 + ], + "lines": [ + { + "bbox": [ + 28, + 308, + 97, + 322 + ], + "spans": [ + { + "bbox": [ + 28, + 308, + 97, + 322 + ], + "score": 1.0, + "content": "Rate Limits:", + "type": "text" + } + ], + "index": 11 + } + ], + "index": 11, + "bbox_fs": [ + 28, + 308, + 97, + 322 + ] + }, + { + "type": "text", + "bbox": [ + 39, + 327, + 300, + 402 + ], + "lines": [ + { + "bbox": [ + 39, + 327, + 242, + 343 + ], + "spans": [ + { + "bbox": [ + 39, + 327, + 126, + 342 + ], + "score": 1.0, + 
"content": "- Max file size", + "type": "text" + }, + { + "bbox": [ + 139, + 327, + 242, + 343 + ], + "score": 1.0, + "content": ": 200 MB per file", + "type": "text" + } + ], + "index": 12 + }, + { + "bbox": [ + 39, + 347, + 258, + 364 + ], + "spans": [ + { + "bbox": [ + 39, + 347, + 104, + 364 + ], + "score": 1.0, + "content": "- Max pages", + "type": "text" + }, + { + "bbox": [ + 145, + 348, + 258, + 363 + ], + "score": 1.0, + "content": ": 600 pages per file", + "type": "text" + } + ], + "index": 13 + }, + { + "bbox": [ + 39, + 367, + 300, + 383 + ], + "spans": [ + { + "bbox": [ + 39, + 367, + 115, + 383 + ], + "score": 1.0, + "content": "- Daily quota", + "type": "text" + }, + { + "bbox": [ + 145, + 368, + 300, + 383 + ], + "score": 1.0, + "content": ": 2000 pages (high priority)", + "type": "text" + } + ], + "index": 14 + }, + { + "bbox": [ + 39, + 387, + 274, + 403 + ], + "spans": [ + { + "bbox": [ + 39, + 387, + 116, + 402 + ], + "score": 1.0, + "content": "- Batch limit", + "type": "text" + }, + { + "bbox": [ + 144, + 387, + 274, + 403 + ], + "score": 1.0, + "content": ": 200 files per request", + "type": "text" + } + ], + "index": 15 + } + ], + "index": 13.5, + "bbox_fs": [ + 39, + 327, + 300, + 403 + ] + } + ] + } + ], + "_backend": "pipeline", + "_version_name": "2.7.6" +} \ No newline at end of file diff --git a/backend/output/8456b615_sample_graphrag_overview/parse_summary.json b/backend/output/8456b615_sample_graphrag_overview/parse_summary.json new file mode 100644 index 0000000..d88cbc1 --- /dev/null +++ b/backend/output/8456b615_sample_graphrag_overview/parse_summary.json @@ -0,0 +1,10 @@ +{ + "total_blocks": 32, + "type_distribution": { + "text": 31, + "table": 1 + }, + "total_pages": 4, + "text_block_count": 31, + "table_block_count": 1 +} \ No newline at end of file diff --git a/backend/pipeline/__init__.py b/backend/pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/pipeline/entity_extractor.py 
b/backend/pipeline/entity_extractor.py new file mode 100644 index 0000000..87cc98f --- /dev/null +++ b/backend/pipeline/entity_extractor.py @@ -0,0 +1,66 @@ +""" +Entity Extractor — LangExtract + DeepSeek entity extraction. +Independent implementation for the GraphRAG Studio backend. +""" +from __future__ import annotations + +import os +from pathlib import Path + +from dotenv import load_dotenv + +import langextract as lx +from langextract.providers.openai import OpenAILanguageModel + +load_dotenv(Path(__file__).parent.parent / ".env", override=True) + +DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "") +DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com") +MODEL_ID = "deepseek-chat" + +PROMPT_DESCRIPTION = ( + "Extract named entities from the text in order of appearance. " + "Entity types: TECHNOLOGY (software, algorithms, models, tools), " + "ORGANIZATION (companies, research groups, institutions), " + "PERSON (individual people), " + "LOCATION (places, geographic entities), " + "CONCEPT (technical concepts, methodologies, frameworks)." +) + +EXAMPLES = [ + lx.data.ExampleData( + text=( + "LangChain is a framework created by Harrison Chase for building " + "LLM applications. It integrates with OpenAI models and Pinecone " + "vector database for semantic search." 
ACCEPTED_ALIGNMENTS = {"match_exact", "match_greater", "match_lesser"}

# Characters removed from node-ID slugs, in addition to all whitespace.
# ",", ")" and "]" terminate the `id=...` pattern the QA agent uses to recover
# cited node IDs from tool output; "/", "?" and "#" cannot be carried inside a
# single URL path segment such as /kg/nodes/{node_id}.
_ID_UNSAFE_CHARS = frozenset(",)]/?#")


def _id_slug(name: str, max_len: int = 12) -> str:
    """Return a lowercase, ID-safe fragment of *name*, at most *max_len* chars."""
    kept = (
        ch
        for ch in name.lower()
        if not ch.isspace() and ch not in _ID_UNSAFE_CHARS
    )
    return "".join(kept)[:max_len]


def _collect_entities(pages, annotated_docs, source_doc_id: str) -> list[dict]:
    """Flatten accepted extractions into raw entity dicts (one per mention)."""
    raw_entities: list[dict] = []
    for page, doc in zip(pages, annotated_docs):
        # Tolerate a missing document as well as one without extractions.
        if doc is None or not doc.extractions:
            continue
        for ext in doc.extractions:
            status = ext.alignment_status.value if ext.alignment_status else None
            # Only mentions that were aligned back to the source text are kept.
            if status not in ACCEPTED_ALIGNMENTS:
                continue
            interval = ext.char_interval
            raw_entities.append({
                "name": ext.extraction_text,
                "type": ext.extraction_class,
                "char_start": interval.start_pos if interval else None,
                "char_end": interval.end_pos if interval else None,
                "confidence": status,
                "page": page.page_idx,
                "source_doc": source_doc_id,
            })
    return raw_entities


def _dedupe_nodes(raw_entities: list[dict]) -> tuple[list[dict], dict[int, set[int]]]:
    """Merge mentions into unique nodes and record the pages each node occurs on."""
    seen: dict[tuple[str, str], int] = {}
    nodes: list[dict] = []
    node_pages: dict[int, set[int]] = defaultdict(set)

    for entity in raw_entities:
        # Case-insensitive and whitespace-insensitive: "Knowledge\nGraph" and
        # "knowledge graph" are the same entity. The first mention wins and
        # supplies the node's display name, offsets, confidence and page.
        normalized_name = " ".join(entity["name"].split()).lower()
        dedup_key = (normalized_name, entity["type"])
        node_idx = seen.get(dedup_key)
        if node_idx is None:
            node_idx = len(nodes)
            seen[dedup_key] = node_idx
            type_prefix = entity["type"].lower()[:4]
            nodes.append({
                # The positional suffix keeps IDs unique even if slugs collide.
                "id": f"{type_prefix}_{_id_slug(entity['name'])}_{node_idx}",
                "name": entity["name"],
                "type": entity["type"],
                "source_doc": entity["source_doc"],
                "char_start": entity["char_start"],
                "char_end": entity["char_end"],
                "confidence": entity["confidence"],
                "page": entity["page"],
            })
        node_pages[node_idx].add(entity["page"])

    return nodes, node_pages


def _co_occurrence_edges(
    nodes: list[dict],
    node_pages: dict[int, set[int]],
    source_doc_id: str,
) -> list[dict]:
    """Emit one CO_OCCURS_IN edge per unordered node pair per shared page."""
    page_nodes: dict[int, list[int]] = defaultdict(list)
    for node_idx, page_set in node_pages.items():
        for page_idx in page_set:
            page_nodes[page_idx].append(node_idx)

    edges: list[dict] = []
    for page_idx, node_indices in sorted(page_nodes.items()):
        ids = [nodes[idx]["id"] for idx in node_indices]
        # Node IDs are unique and each node appears once per page, so every
        # (pair, page) combination is produced exactly once.
        for pos, first in enumerate(ids):
            for second in ids[pos + 1:]:
                # Canonical ordering makes the undirected edge reproducible.
                src, tgt = (first, second) if first < second else (second, first)
                edges.append({
                    "source": src,
                    "target": tgt,
                    "relation": "CO_OCCURS_IN",
                    "doc_id": source_doc_id,
                    "page": page_idx,
                })
    return edges


def build_kg(
    pages: list[PageText],
    annotated_docs: list[lx.data.AnnotatedDocument],
    source_doc_id: str,
) -> tuple[list[dict], list[dict]]:
    """Build KG nodes and edges from LangExtract results.

    Args:
        pages: Per-page texts; only ``page_idx`` is read here.
        annotated_docs: Extraction results, paired positionally with *pages*.
        source_doc_id: Document identifier stamped on every node and edge.

    Returns:
        (nodes, edges) — deduplicated node list and edge list.
    """
    raw_entities = _collect_entities(pages, annotated_docs, source_doc_id)
    nodes, node_pages = _dedupe_nodes(raw_entities)
    edges = _co_occurrence_edges(nodes, node_pages, source_doc_id)
    return nodes, edges


def extractions_to_records(
    pages: list[PageText],
    annotated_docs: list[lx.data.AnnotatedDocument],
    doc_id: str,
) -> list[dict]:
    """Flatten LangExtract results to ExtractionRecord dicts.

    Unlike ``build_kg`` this keeps every extraction, whatever its alignment
    status, so the records reflect exactly what the extractor returned.
    """
    records: list[dict] = []
    for page, doc in zip(pages, annotated_docs):
        if doc is None or not doc.extractions:
            continue
        for ext in doc.extractions:
            interval = ext.char_interval
            records.append({
                "text": ext.extraction_text,
                "type": ext.extraction_class,
                "char_start": interval.start_pos if interval else None,
                "char_end": interval.end_pos if interval else None,
                "alignment": ext.alignment_status.value if ext.alignment_status else None,
                "page": page.page_idx,
                "doc_id": doc_id,
            })
    return records
load_dotenv(Path(__file__).parent.parent / ".env", override=True)

DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com")


def build_kg_graph(nodes: list[dict], edges: list[dict]) -> nx.Graph:
    """Build an undirected networkx graph from KG node and edge dicts.

    Every node dict is stored as the attributes of the node keyed by its
    ``id``; every edge keeps all of its keys except ``source``/``target`` as
    edge attributes. Because the graph is a plain ``nx.Graph``, several edges
    between the same pair of nodes collapse into one.
    """
    G = nx.Graph()
    for n in nodes:
        G.add_node(n["id"], **n)
    for e in edges:
        attrs = {k: v for k, v in e.items() if k not in ("source", "target")}
        G.add_edge(e["source"], e["target"], **attrs)
    return G


def make_tools(G: nx.Graph) -> list:
    """Create the LangChain tools that let the agent query graph *G*.

    NOTE: the docstrings of the nested tool functions are handed to the LLM as
    tool descriptions, so they are part of runtime behavior — documentation
    for maintainers lives in ``#`` comments instead.
    """

    @tool
    def search_entities(query: str) -> str:
        """Search knowledge graph entities by name (case-insensitive substring).
        Args:
            query: Keyword to search for in entity names.
        """
        q = query.lower()
        matches = [data for _, data in G.nodes(data=True) if q in data.get("name", "").lower()]
        if not matches:
            # Show a few real names so the agent can reformulate its query.
            sample = ", ".join(d.get("name", "") for _, d in list(G.nodes(data=True))[:8])
            return f"No entities found matching '{query}'. Sample: {sample}"
        lines = [f"Found {len(matches)} entity(ies) matching '{query}':"]
        for m in matches[:15]:
            # .get() throughout: nodes created implicitly by an edge carry no
            # attributes and must not crash the tool.
            lines.append(
                f" [{m.get('type', '?')}] \"{m.get('name', '?')}\" "
                f"(confidence={m.get('confidence','?')}, page={m.get('page',0)}, id={m.get('id', '?')})"
            )
        return "\n".join(lines)

    @tool
    def get_neighbors(entity_name: str, hops: int = 1) -> str:
        """Get N-hop neighbors of an entity in the knowledge graph.
        Args:
            entity_name: Entity name (partial match).
            hops: Number of hops (1-3, default 1).
        """
        hops = max(1, min(int(hops), 3))
        needle = entity_name.lower()
        candidates = [(nid, d) for nid, d in G.nodes(data=True)
                      if needle in d.get("name", "").lower()]
        if not candidates:
            return f"Entity '{entity_name}' not found. Use search_entities first."
        # Prefer an exact (case-insensitive) name match; otherwise fall back
        # to the first partial match, as before.
        exact = [c for c in candidates if c[1].get("name", "").lower() == needle]
        node_id, node_data = (exact or candidates)[0]
        reachable = nx.single_source_shortest_path_length(G, node_id, cutoff=hops)
        by_hop: dict[int, list] = {}
        for nid, dist in reachable.items():
            if dist > 0:  # distance 0 is the start node itself
                by_hop.setdefault(dist, []).append(G.nodes[nid])
        lines = [
            f"Neighbors of '{node_data.get('name', '?')}' "
            f"[{node_data.get('type', '?')}] within {hops} hop(s):"
        ]
        for hop in sorted(by_hop):
            hop_nodes = by_hop[hop]
            lines.append(f"\n Hop {hop} — {len(hop_nodes)} related entities:")
            for n in hop_nodes[:20]:
                lines.append(f" [{n.get('type','?')}] {n.get('name','?')}")
            if len(hop_nodes) > 20:
                lines.append(f" ... and {len(hop_nodes)-20} more")
        lines.append(f"\n Total related entities: {sum(len(v) for v in by_hop.values())}")
        return "\n".join(lines)

    @tool
    def get_entities_by_type(entity_type: str) -> str:
        """List all entities of a specific type.
        Args:
            entity_type: TECHNOLOGY, CONCEPT, PERSON, ORGANIZATION, or LOCATION.
        """
        t_upper = entity_type.strip().upper()
        valid = {"TECHNOLOGY", "CONCEPT", "PERSON", "ORGANIZATION", "LOCATION"}
        if t_upper not in valid:
            present = sorted({d.get("type", "") for _, d in G.nodes(data=True)})
            return f"Unknown type '{entity_type}'. Present: {present}"
        matches = [d for _, d in G.nodes(data=True) if d.get("type", "") == t_upper]
        if not matches:
            return f"No {t_upper} entities found."
        lines = [f"Found {len(matches)} {t_upper} entities:"]
        for m in matches[:30]:
            lines.append(f" \"{m.get('name', '?')}\" (page={m.get('page',0)}, id={m.get('id', '?')})")
        if len(matches) > 30:
            lines.append(f" ... and {len(matches)-30} more")
        return "\n".join(lines)

    @tool
    def describe_graph() -> str:
        """Get an overview of the knowledge graph statistics."""
        n_nodes = G.number_of_nodes()
        n_edges = G.number_of_edges()
        type_counts: dict[str, int] = {}
        for _, d in G.nodes(data=True):
            t = d.get("type", "UNKNOWN")
            type_counts[t] = type_counts.get(t, 0) + 1
        lines = [
            "Knowledge Graph Overview:",
            f" Nodes: {n_nodes}",
            f" Edges: {n_edges}",
            f" Entity types: {type_counts}",
        ]
        if n_nodes > 0:
            centrality = nx.degree_centrality(G)
            top5 = sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:5]
            lines.append(" Top 5 central nodes:")
            for nid, c in top5:
                nd = G.nodes[nid]
                lines.append(f" [{nd.get('type','?')}] {nd.get('name','?')} (centrality={c:.3f})")
        return "\n".join(lines)

    return [search_entities, get_neighbors, get_entities_by_type, describe_graph]


def _history_to_messages(history: list[dict]) -> list:
    """Convert the last 8 history entries into LangChain chat messages."""
    messages: list = []
    for msg in history[-8:]:
        role = msg.get("role", "human")
        content = msg.get("content", "") or msg.get("answer", "")
        # "user" is accepted as an alias so a user turn is never replayed to
        # the model as if the assistant had said it.
        if role in ("human", "user"):
            messages.append(HumanMessage(content=msg.get("question", content)))
        else:
            messages.append(AIMessage(content=content))
    return messages


def run_qa(
    question: str,
    history: list[dict],
    nodes: list[dict],
    edges: list[dict],
) -> dict:
    """Run Agentic-RAG QA over the knowledge graph.

    Args:
        question: The user's current question.
        history: Earlier turns as dicts (``role`` plus ``content`` /
            ``question`` / ``answer``); only the last 8 are replayed.
        nodes: KG node dicts (each needs an ``id``).
        edges: KG edge dicts (each needs ``source`` and ``target``).

    Returns:
        Dict with ``answer`` (str), ``tool_calls`` (list of step dicts) and
        ``cited_nodes`` (sorted list of node IDs seen in tool output).

    Raises:
        ValueError: If ``DEEPSEEK_API_KEY`` is not configured.
    """
    if not DEEPSEEK_API_KEY:
        raise ValueError("DEEPSEEK_API_KEY not set in backend/.env")

    G = build_kg_graph(nodes, edges)
    tools = make_tools(G)

    llm = ChatOpenAI(
        model="deepseek-chat",
        api_key=DEEPSEEK_API_KEY,
        base_url=DEEPSEEK_BASE_URL,
        temperature=0,
    )

    system_prompt = (
        "You are a helpful assistant with access to a knowledge graph (KG) built from the user's documents.\n"
        "\n"
        "Guidelines:\n"
        "- If the question is clearly unrelated to the KG (greetings, math, general knowledge, etc.), "
        "answer directly WITHOUT using any tools.\n"
        "- If the question might be answered by the KG (topics related to entities in the documents), "
        "use the tools to search and explore before answering.\n"
        "- When you DO use the KG, cite the entity names and types you found.\n"
        "- If the KG has no relevant information, say so honestly and answer from general knowledge if possible.\n"
        "\n"
        "Available tools: search entities by name, get neighbors, list entities by type, get graph overview."
    )

    agent = create_react_agent(llm, tools, prompt=system_prompt)

    # Conversation sent to the agent: replayed history + the current question.
    messages = _history_to_messages(history)
    messages.append(HumanMessage(content=question))

    result = agent.invoke({"messages": messages})
    all_messages = result.get("messages", [])

    # Only messages produced by THIS run may supply the answer. The current
    # question is the last HumanMessage in the transcript (the agent itself
    # emits only AI/Tool messages), so everything after it is new. Scanning
    # the whole transcript could return a stale answer from replayed history
    # when the run ends without a final text message.
    last_human = max(
        (i for i, m in enumerate(all_messages) if isinstance(m, HumanMessage)),
        default=-1,
    )
    new_messages = all_messages[last_human + 1:]

    # The answer is the last AI message that has text and requests no tools.
    answer = ""
    for msg in reversed(new_messages):
        if isinstance(msg, AIMessage) and msg.content and not msg.tool_calls:
            answer = msg.content
            break

    # Pair every tool call with its ToolMessage and harvest cited node IDs.
    tool_calls = []
    cited_node_ids: set[str] = set()
    step = 0
    for i, msg in enumerate(new_messages):
        if not (isinstance(msg, AIMessage) and msg.tool_calls):
            continue
        for tc in msg.tool_calls:
            step += 1
            output = ""
            for later in new_messages[i + 1:]:
                if isinstance(later, ToolMessage) and later.tool_call_id == tc.get("id"):
                    output = later.content
                    break
            tool_calls.append({
                "step": step,
                "tool_name": tc.get("name", ""),
                "tool_input": str(tc.get("args", {})),
                "tool_output": str(output),
            })
            for node_id in re.findall(r'\bid=([^\s,\)\]]+)', str(output)):
                # Keep only IDs that really exist in the graph; the pattern
                # can capture truncated or unrelated tokens.
                if node_id in G:
                    cited_node_ids.add(node_id)

    return {
        "answer": answer,
        "tool_calls": tool_calls,
        # Sorted so the response is deterministic.
        "cited_nodes": sorted(cited_node_ids),
    }
@dataclasses.dataclass
class BlockSpan:
    """Location of one content block inside its page's assembled text."""

    block_index: int  # position of the block in the original content list
    block_type: str   # "text" or "table" (other types are never assembled)
    page_idx: int     # page the block belongs to
    char_start: int   # inclusive start offset within PageText.text
    char_end: int     # exclusive end offset within PageText.text
    bbox: list        # block's "bbox" value, or [0, 0, 0, 0] when absent


@dataclasses.dataclass
class PageText:
    """Plain text of one page plus the spans of the blocks it was built from."""

    page_idx: int
    text: str
    block_spans: list[BlockSpan]


def html_table_to_text(table_body: str) -> str:
    """Render an HTML table as text: one line per row, cells joined by " | "."""
    soup = BeautifulSoup(table_body, "html.parser")
    rows = []
    for tr in soup.find_all("tr"):
        cells = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])]
        rows.append(" | ".join(cells))
    return "\n".join(rows)


def load_content_list(path: Path) -> list[dict]:
    """Load a MinerU content list from a JSON file or from a directory.

    Args:
        path: A JSON file, or a directory containing ``*_content_list.json``
            (falling back to ``*content_list.json``). ``str`` is accepted too.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If *path* is a directory without a matching file.
    """
    path = Path(path)
    if path.is_dir():
        # Sorted so the choice is deterministic when several files match;
        # glob order is otherwise filesystem-dependent.
        matches = sorted(path.glob("*_content_list.json")) or sorted(
            path.glob("*content_list.json")
        )
        if not matches:
            raise FileNotFoundError(f"No content_list.json found in {path}")
        path = matches[0]
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def assemble_pages(content_list: list[dict]) -> list[PageText]:
    """Assemble per-page plain text from content-list blocks.

    Only ``text`` and ``table`` blocks contribute; every other block type and
    every block that yields no text is skipped. Blocks on a page are joined
    with a single newline, and each contributing block gets a ``BlockSpan``
    whose offsets index into the page text.

    Args:
        content_list: Block dicts; ``page_idx`` defaults to 0 when missing.

    Returns:
        One ``PageText`` per page index, ordered by page index.
    """
    by_page: dict[int, list[tuple[int, dict]]] = defaultdict(list)
    for index, block in enumerate(content_list):
        by_page[block.get("page_idx", 0)].append((index, block))

    result = []
    for page_idx in sorted(by_page):
        parts: list[str] = []
        spans: list[BlockSpan] = []
        cursor = 0

        for block_index, block in by_page[page_idx]:
            block_type = block.get("type", "unknown")

            if block_type == "text":
                # `or ""` also covers an explicit null text value.
                block_text = (block.get("text") or "").rstrip()
            elif block_type == "table":
                table_body = block.get("table_body") or ""
                # Right-strip like text blocks so a trailing empty row cannot
                # push the span past the end of the page text.
                block_text = html_table_to_text(table_body).rstrip() if table_body else ""
            else:
                continue

            if not block_text:
                continue

            spans.append(BlockSpan(
                block_index=block_index,
                block_type=block_type,
                page_idx=page_idx,
                char_start=cursor,
                char_end=cursor + len(block_text),
                bbox=block.get("bbox", [0, 0, 0, 0]),
            ))
            parts.append(block_text)
            cursor += len(block_text) + 1  # +1 for the newline separator

        result.append(PageText(page_idx=page_idx, text="\n".join(parts), block_spans=spans))

    return result


def count_blocks_by_type(content_list: list[dict]) -> dict[str, int]:
    """Count blocks per ``type`` value ("unknown" when the key is missing)."""
    counts: dict[str, int] = defaultdict(int)
    for block in content_list:
        counts[block.get("type", "unknown")] += 1
    return dict(counts)
@router.get("/{doc_id}")
async def get_document(doc_id: str):
    """Return one document's metadata, or a 404 envelope (code 2001)."""
    doc = svc.get_document(doc_id)
    if not doc:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2001, f"Document '{doc_id}' not found").model_dump(),
        )
    # Internal field, never exposed to API clients.
    doc.pop("upload_filename", None)
    return APIResponse.ok(doc)


@router.get("")
async def list_documents(
    page: int = 1,
    page_size: int = 20,
    status: str | None = None,
    format: str | None = None,
):
    """Return a page of documents, optionally filtered by status and format.

    ``page`` is clamped to at least 1 and ``page_size`` to the range 1-100 so
    zero or negative values cannot reach the service layer.
    """
    page = max(1, page)
    page_size = max(1, min(page_size, 100))
    result = svc.list_documents(page, page_size, status, format)
    for item in result["items"]:
        # Internal field, never exposed to API clients.
        item.pop("upload_filename", None)
    return APIResponse.ok(result)


@router.delete("/{doc_id}")
async def delete_document(doc_id: str):
    """Delete a document and report how many KG nodes/edges went with it.

    Responds with a 404 envelope (code 2001) when the document is unknown.
    """
    doc = svc.get_document(doc_id)
    if not doc:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2001, f"Document '{doc_id}' not found").model_dump(),
        )
    ok, removed_nodes, removed_edges = svc.delete_document(doc_id)
    return APIResponse.ok({
        # Report what the service actually did instead of assuming success.
        "deleted": bool(ok),
        "doc_id": doc_id,
        "removed_nodes": removed_nodes,
        "removed_edges": removed_edges,
    })
router = APIRouter(prefix="/index", tags=["Indexing"])


def _job_not_found(job_id: str) -> JSONResponse:
    """Build the shared 404 envelope (code 2002) for an unknown job."""
    return JSONResponse(
        status_code=404,
        content=APIResponse.err(2002, f"Job '{job_id}' not found").model_dump(),
    )


@router.post("/start", status_code=202)
async def start_indexing(body: StartIndexRequest):
    """Start an indexing job for an uploaded document.

    Responds with a 404 envelope (code 2001) when the document is unknown,
    otherwise with the new job's identifying fields.
    """
    doc = doc_svc.get_document(body.doc_id)
    if not doc:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2001, f"Document '{body.doc_id}' not found").model_dump(),
        )
    meta = idx_svc.start_indexing(body.doc_id)
    # Expose only the public subset of the job metadata.
    public_keys = ("job_id", "doc_id", "status", "stage", "created_at")
    return APIResponse.ok({key: meta[key] for key in public_keys})


@router.get("/status/{job_id}")
async def get_job_status(job_id: str):
    """Return the metadata of one indexing job, or a 404 envelope."""
    meta = idx_svc.get_job_status(job_id)
    if not meta:
        return _job_not_found(job_id)
    return APIResponse.ok(meta)


@router.get("/result/{job_id}")
async def get_job_result(job_id: str):
    """Return the result of an indexing job.

    Responds 404 (code 2002) for an unknown job and 400 (code 2003) when the
    job is not done and carries no ``stats`` yet.
    """
    result = idx_svc.get_job_result(job_id)
    if not result:
        return _job_not_found(job_id)
    if result.get("status") != "done" and "stats" not in result:
        # The job may be running, but it may equally have stopped without a
        # result, so the message states only what is known.
        return JSONResponse(
            status_code=400,
            content=APIResponse.err(
                2003,
                f"Job '{job_id}' has no result available (status={result.get('status')})",
            ).model_dump(),
        )
    return APIResponse.ok(result)


@router.delete("/jobs/{job_id}")
async def cancel_job(job_id: str):
    """Request cancellation of an indexing job and report the outcome."""
    meta = idx_svc.get_job_status(job_id)
    if not meta:
        return _job_not_found(job_id)
    ok, prev_status = idx_svc.cancel_job(job_id)
    return APIResponse.ok({
        # Report what the service actually did instead of assuming success.
        "cancelled": bool(ok),
        "job_id": job_id,
        "previous_status": prev_status,
    })
router = APIRouter(prefix="/kg", tags=["Knowledge Graph"])


def _node_not_found(node_id: str) -> JSONResponse:
    """Build the shared 404 envelope (code 3001) for an unknown node."""
    return JSONResponse(
        status_code=404,
        content=APIResponse.err(3001, f"Node '{node_id}' not found").model_dump(),
    )


@router.get("/nodes")
async def list_nodes(
    type: str | None = None,
    doc_id: str | None = None,
    confidence: str | None = None,
    page: int = 1,
    page_size: int = 50,
):
    """Return a page of KG nodes, optionally filtered.

    ``page`` is clamped to at least 1 and ``page_size`` to 1-200. An empty
    result with no filter applied means the graph itself is empty, which is
    reported as a 400 envelope (code 3002).
    """
    page = max(1, page)
    page_size = max(1, min(page_size, 200))
    result = svc.get_nodes(page, page_size, type, doc_id, confidence)
    if result["total"] == 0 and not any((type, doc_id, confidence)):
        return JSONResponse(
            status_code=400,
            content=APIResponse.err(3002, "Knowledge graph is empty. Index documents first.").model_dump(),
        )
    return APIResponse.ok(result)


@router.get("/edges")
async def list_edges(
    doc_id: str | None = None,
    relation: str | None = None,
    page: int = 1,
    page_size: int = 100,
):
    """Return a page of KG edges; ``page`` >= 1, ``page_size`` within 1-500."""
    page = max(1, page)
    page_size = max(1, min(page_size, 500))
    result = svc.get_edges(page, page_size, doc_id, relation)
    return APIResponse.ok(result)


@router.get("/nodes/{node_id}")
async def get_node_detail(node_id: str):
    """Return the details of one node, or a 404 envelope (code 3001)."""
    node = svc.get_node_detail(node_id)
    if not node:
        return _node_not_found(node_id)
    return APIResponse.ok(node)


@router.get("/nodes/{node_id}/neighbors")
async def get_node_neighbors(node_id: str, hops: int = 1):
    """Return the neighborhood of a node, or a 404 envelope (code 3001).

    ``hops`` is raised to at least 1; zero or negative radii are meaningless.
    """
    result = svc.get_neighbors(node_id, max(1, hops))
    if result is None:
        return _node_not_found(node_id)
    return APIResponse.ok(result)


@router.get("/stats")
async def get_kg_stats():
    """Return aggregate statistics about the knowledge graph."""
    return APIResponse.ok(svc.get_stats())


@router.get("/export")
async def export_kg(format: str = "json", doc_id: str | None = None):
    """Export the knowledge graph, optionally restricted to one document."""
    # NOTE(review): `format` is accepted but not forwarded to the service —
    # confirm whether formats other than JSON are meant to be supported.
    return APIResponse.ok(svc.export_kg(doc_id))
router = APIRouter(prefix="/query", tags=["QA"])


@router.post("")
async def run_query(body: QueryRequest):
    """Answer one question with the QA service.

    The service call is synchronous, so it runs in the default executor to
    keep the event loop free. A ``ValueError`` mentioning ``KG_EMPTY`` maps to
    a 400 envelope (code 3002); every other failure maps to 500 (code 4001).
    """
    try:
        # get_running_loop() is the supported way to reach the loop from
        # inside a coroutine.
        loop = asyncio.get_running_loop()
        history = [m.model_dump() for m in body.history]
        result = await loop.run_in_executor(
            None,
            partial(svc.run_query, body.question, history),
        )
        return APIResponse.ok(result)
    except ValueError as e:
        if "KG_EMPTY" in str(e):
            return JSONResponse(
                status_code=400,
                content=APIResponse.err(3002, "Knowledge graph is empty. Index documents first.").model_dump(),
            )
        return JSONResponse(
            status_code=500,
            content=APIResponse.err(4001, str(e)).model_dump(),
        )
    except Exception as e:
        # Top-level boundary: turn any failure into an error envelope.
        return JSONResponse(
            status_code=500,
            content=APIResponse.err(4001, f"QA service error: {e}").model_dump(),
        )


@router.post("/batch", status_code=202)
async def start_batch(body: BatchQueryRequest):
    """Start a batch of questions; more than 20 is rejected (code 1001)."""
    if len(body.questions) > 20:
        return JSONResponse(
            status_code=400,
            content=APIResponse.err(1001, "Maximum 20 questions per batch").model_dump(),
        )
    return APIResponse.ok(svc.start_batch(body.questions))


@router.get("/batch/{batch_id}")
async def get_batch_result(batch_id: str):
    """Return the state of a batch, or a 404 envelope (code 2002)."""
    result = svc.get_batch_result(batch_id)
    if not result:
        return JSONResponse(
            status_code=404,
            content=APIResponse.err(2002, f"Batch '{batch_id}' not found").model_dump(),
        )
    return APIResponse.ok(result)


@router.get("/history")
async def get_query_history(page: int = 1, page_size: int = 20):
    """Return a page of past queries; ``page`` >= 1, ``page_size`` within 1-50."""
    page = max(1, page)
    page_size = max(1, min(page_size, 50))
    return APIResponse.ok(svc.get_history(page, page_size))
index 0000000..58b6451 --- /dev/null +++ b/backend/routers/search.py @@ -0,0 +1,43 @@ +"""E 组:搜索(3 个端点)""" +from fastapi import APIRouter, Query, Request +from fastapi.responses import JSONResponse + +from models.schemas import APIResponse +from services import search_service as svc + +router = APIRouter(prefix="/search", tags=["Search"]) + + +@router.get("/entities") +async def search_entities(q: str, type: str | None = None, limit: int = 15): + limit = min(limit, 100) + result = svc.search_entities(q, type, limit) + return APIResponse.ok(result) + + +@router.get("/path") +async def search_path(request: Request, max_hops: int = 3): + # 'from' is a Python keyword, read from raw query params + params = dict(request.query_params) + from_id = params.get("from") + to_id = params.get("to") + + if not from_id or not to_id: + return JSONResponse( + status_code=400, + content=APIResponse.err(1001, "Parameters 'from' and 'to' are required").model_dump(), + ) + max_hops = max(1, min(max_hops, 5)) + result = svc.search_path(from_id, to_id, max_hops) + if result is None: + return JSONResponse( + status_code=404, + content=APIResponse.err(3001, "One or both nodes not found").model_dump(), + ) + return APIResponse.ok(result) + + +@router.get("/graph") +async def search_graph(q: str, include_neighbors: bool = False): + result = svc.search_graph(q, include_neighbors) + return APIResponse.ok(result) diff --git a/backend/routers/system.py b/backend/routers/system.py new file mode 100644 index 0000000..7605ce0 --- /dev/null +++ b/backend/routers/system.py @@ -0,0 +1,171 @@ +"""F 组:系统(4 个端点)""" +import os +import time +from pathlib import Path + +from fastapi import APIRouter + +from models.schemas import APIResponse +from storage import file_store as fs + +router = APIRouter(tags=["System"]) + +_START_TIME = time.time() + + +@router.get("/health") +async def health_check(): + env_path = Path(__file__).parent.parent / ".env" + from dotenv import load_dotenv + load_dotenv(env_path, 
override=False) + + mineru_python = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe")) + backend_python = Path(__file__).parent.parent / ".venv" / "Scripts" / "python.exe" + deepseek_key = os.getenv("DEEPSEEK_API_KEY", "") + deepseek_url = os.getenv("DEEPSEEK_BASE_URL", "https://api.deepseek.com") + + # Check if langextract is importable from backend's venv + try: + import subprocess + result = subprocess.run( + [str(backend_python), "-c", "import langextract; print('ok')"], + capture_output=True, text=True, timeout=10 + ) + langextract_ok = result.returncode == 0 and "ok" in result.stdout + except Exception: + langextract_ok = False + + components = { + "mineru_venv": { + "status": "ok" if mineru_python.exists() else "error", + "path": str(mineru_python), + "exists": mineru_python.exists(), + }, + "langextract_venv": { + "status": "ok" if langextract_ok else "error", + "path": str(backend_python), + "exists": backend_python.exists(), + }, + "deepseek_api": { + "status": "ok" if deepseek_key else "error", + "base_url": deepseek_url, + "key_configured": bool(deepseek_key), + }, + "storage": { + "status": "ok", + "kg_nodes_exists": fs.kg_nodes_path().exists(), + "kg_edges_exists": fs.kg_edges_path().exists(), + "uploads_dir_exists": fs.UPLOADS_DIR.exists(), + }, + } + + overall = "healthy" if all(c["status"] == "ok" for c in components.values()) else "degraded" + + return APIResponse.ok({ + "status": overall, + "version": "1.0.0", + "uptime_seconds": round(time.time() - _START_TIME, 1), + "components": components, + }) + + +@router.get("/system/stats") +async def system_stats(): + from services import indexing_service as idx_svc + + docs = list(fs.load_docs_index().values()) + nodes = fs.load_kg_nodes() + edges = fs.load_kg_edges() + history = fs.load_query_history() + + type_dist: dict[str, int] = {} + for n in nodes: + t = n.get("type", "UNKNOWN") + type_dist[t] = type_dist.get(t, 0) + 1 + + return APIResponse.ok({ + 
"total_documents": len(docs), + "indexed_documents": sum(1 for d in docs if d.get("status") == "indexed"), + "failed_documents": sum(1 for d in docs if d.get("status") == "failed"), + "total_nodes": len(nodes), + "total_edges": len(edges), + "type_distribution": type_dist, + "total_queries": len(history), + "active_jobs": idx_svc.count_active_jobs(), + "storage_used_mb": fs.storage_used_mb(), + }) + + +@router.get("/system/formats") +async def list_formats(): + return APIResponse.ok({ + "formats": [ + {"ext": "pdf", "description": "PDF 文档(文本型/扫描型/混合型)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False}, + {"ext": "docx", "description": "Microsoft Word(新版)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False}, + {"ext": "doc", "description": "Microsoft Word(旧版)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False}, + {"ext": "pptx", "description": "PowerPoint(新版)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False}, + {"ext": "ppt", "description": "PowerPoint(旧版)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False}, + {"ext": "png", "description": "PNG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True}, + {"ext": "jpg", "description": "JPEG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True}, + {"ext": "jpeg", "description": "JPEG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": True}, + {"ext": "html", "description": "HTML 文件", "max_size_mb": 200, "max_pages": 600, "requires_ocr": False}, + ], + "ocr_languages": [ + {"code": "ch", "name": "中文(默认)"}, + {"code": "en", "name": "英文"}, + {"code": "japan", "name": "日文"}, + {"code": "korean", "name": "韩文"}, + {"code": "french", "name": "法文"}, + {"code": "german", "name": "德文"}, + ], + "notes": [ + "language 参数默认值为 'ch'(非 'zh'),遵循 PaddleOCR v3 语言代码规范", + "上传时不需要携带 Content-Type,服务端自动识别", + "PNG/JPG/JPEG 单次最多处理 1 页", + ], + }) + + +@router.get("/system/demo") +async def get_demo_data(): + # Try backend KG first, then fall back to 
graphrag_pipeline/output + nodes = fs.load_kg_nodes() + edges = fs.load_kg_edges() + + if not nodes: + # Fallback: load from existing graphrag_pipeline output + legacy_nodes_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_nodes.json") + legacy_edges_path = Path("F:/GraphRAGAgent/graphrag_pipeline/output/kg_edges.json") + if legacy_nodes_path.exists(): + import json + nodes = json.loads(legacy_nodes_path.read_text(encoding="utf-8")) + edges = json.loads(legacy_edges_path.read_text(encoding="utf-8")) if legacy_edges_path.exists() else [] + else: + from fastapi.responses import JSONResponse + return JSONResponse( + status_code=400, + content=APIResponse.err(3002, "No demo data available. Index a document first.").model_dump(), + ) + + type_counts: dict[str, int] = {} + for n in nodes: + t = n.get("type", "UNKNOWN") + type_counts[t] = type_counts.get(t, 0) + 1 + + import networkx as nx + G = nx.Graph() + for n in nodes: + G.add_node(n["id"]) + for e in edges: + G.add_edge(e["source"], e["target"]) + + return APIResponse.ok({ + "nodes": nodes, + "edges": edges, + "stats": { + "nodes": len(nodes), + "edges": len(edges), + "type_counts": type_counts, + "density": round(nx.density(G), 4) if G.number_of_nodes() > 1 else 0.0, + }, + }) diff --git a/backend/services/__init__.py b/backend/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/services/document_service.py b/backend/services/document_service.py new file mode 100644 index 0000000..cc8c91d --- /dev/null +++ b/backend/services/document_service.py @@ -0,0 +1,109 @@ +"""Document Service — file upload, metadata CRUD.""" +from __future__ import annotations + +import uuid +from datetime import datetime, timezone +from pathlib import Path + +from storage import file_store as fs + +ALLOWED_EXTENSIONS = {"pdf", "docx", "doc", "pptx", "ppt", "png", "jpg", "jpeg", "html"} +MAX_FILE_SIZE_MB = 200 + + +def validate_upload(filename: str, size_bytes: int) -> tuple[bool, int, str]: + 
"""Returns (ok, error_code, error_msg).""" + if not filename or "/" in filename or "\\" in filename: + return False, 1001, "Invalid filename" + ext = Path(filename).suffix.lower().lstrip(".") + if ext not in ALLOWED_EXTENSIONS: + return False, 1002, f"Unsupported file format: .{ext}. Supported: {', '.join(sorted(ALLOWED_EXTENSIONS))}" + size_mb = size_bytes / (1024 * 1024) + if size_mb > MAX_FILE_SIZE_MB: + return False, 1003, f"File size {size_mb:.1f}MB exceeds {MAX_FILE_SIZE_MB}MB limit" + return True, 0, "" + + +def save_upload(filename: str, content: bytes, language: str = "ch", + enable_formula: bool = True, enable_table: bool = True) -> dict: + doc_id = uuid.uuid4().hex[:8] + ext = Path(filename).suffix.lower().lstrip(".") + upload_filename = f"{doc_id}_{filename}" + upload_path = fs.UPLOADS_DIR / upload_filename + upload_path.write_bytes(content) + + doc = { + "doc_id": doc_id, + "filename": filename, + "format": ext, + "size_bytes": len(content), + "pages": None, + "uploaded_at": datetime.now(timezone.utc).isoformat(), + "status": "uploaded", + "language": language, + "enable_formula": enable_formula, + "enable_table": enable_table, + "upload_filename": upload_filename, # internal: actual stored filename + } + fs.save_doc(doc) + return doc + + +def get_document(doc_id: str) -> dict | None: + return fs.get_doc(doc_id) + + +def list_documents(page: int = 1, page_size: int = 20, + status: str | None = None, fmt: str | None = None) -> dict: + index = fs.load_docs_index() + items = list(index.values()) + items.sort(key=lambda d: d.get("uploaded_at", ""), reverse=True) + if status: + items = [d for d in items if d.get("status") == status] + if fmt: + items = [d for d in items if d.get("format") == fmt.lower()] + total = len(items) + start = (page - 1) * page_size + return { + "total": total, + "page": page, + "page_size": page_size, + "items": items[start: start + page_size], + } + + +def delete_document(doc_id: str) -> tuple[bool, int, int]: + """Delete doc and 
its KG contributions. Returns (ok, removed_nodes, removed_edges).""" + doc = fs.get_doc(doc_id) + if not doc: + return False, 0, 0 + + # Remove from KG + removed_nodes, removed_edges = fs.remove_doc_from_kg(doc_id) + + # Remove upload file + upload_filename = doc.get("upload_filename", "") + upload_path = fs.UPLOADS_DIR / upload_filename + if upload_path.exists(): + upload_path.unlink(missing_ok=True) + + # Remove associated jobs + for meta in fs.list_all_jobs(): + if meta.get("doc_id") == doc_id: + fs.delete_job(meta["job_id"]) + + # Remove from index + index = fs.load_docs_index() + index.pop(doc_id, None) + fs.save_docs_index(index) + + return True, removed_nodes, removed_edges + + +def update_doc_status(doc_id: str, status: str, pages: int | None = None) -> None: + index = fs.load_docs_index() + if doc_id in index: + index[doc_id]["status"] = status + if pages is not None: + index[doc_id]["pages"] = pages + fs.save_docs_index(index) diff --git a/backend/services/indexing_service.py b/backend/services/indexing_service.py new file mode 100644 index 0000000..b359108 --- /dev/null +++ b/backend/services/indexing_service.py @@ -0,0 +1,255 @@ +"""Indexing Service — Pipeline orchestration (parsing → extracting → indexing).""" +from __future__ import annotations + +import json +import os +import subprocess +import threading +import time +import uuid +from datetime import datetime, timezone +from pathlib import Path + +from dotenv import load_dotenv + +from storage import file_store as fs +from services.document_service import update_doc_status + +load_dotenv(Path(__file__).parent.parent / ".env", override=True) + +MINERU_PYTHON = Path(os.getenv("MINERU_PYTHON", "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe")) +MINERU_PIPELINE = Path(os.getenv("MINERU_PIPELINE", "F:/GraphRAGAgent/mineru_mvp/pipeline.py")) + +# In-memory registry of active jobs {job_id: threading.Thread} +_active_threads: dict[str, threading.Thread] = {} +_cancel_flags: dict[str, bool] = {} + + 
+def start_indexing(doc_id: str) -> dict: + doc = fs.get_doc(doc_id) + if not doc: + return None # type: ignore + + job_id = f"job_{uuid.uuid4().hex[:8]}" + now = datetime.now(timezone.utc).isoformat() + + meta = { + "job_id": job_id, + "doc_id": doc_id, + "status": "submitted", + "stage": "Job submitted", + "progress": {"parsed_pages": 0, "total_pages": 0, "extracted_entities": 0}, + "created_at": now, + "elapsed_seconds": 0.0, + "error": None, + "pdf_name": doc["filename"], + "pdf_path": str(fs.UPLOADS_DIR / doc.get("upload_filename", "")), + } + fs.save_job_meta(job_id, meta) + + _cancel_flags[job_id] = False + thread = threading.Thread(target=_run_pipeline, args=(job_id,), daemon=True) + _active_threads[job_id] = thread + thread.start() + + return meta + + +def _update_meta(job_id: str, **kwargs) -> None: + meta = fs.load_job_meta(job_id) or {} + meta.update(kwargs) + meta["elapsed_seconds"] = round( + (datetime.now(timezone.utc) - datetime.fromisoformat(meta["created_at"])).total_seconds(), 1 + ) + fs.save_job_meta(job_id, meta) + + +def _run_pipeline(job_id: str) -> None: + meta = fs.load_job_meta(job_id) + if not meta: + return + + doc_id = meta["doc_id"] + pdf_path = Path(meta["pdf_path"]) + job_dir = fs.job_dir(job_id) + start_time = time.time() + + try: + # ── Stage 1: parsing ────────────────────────────────────────────── + if _cancel_flags.get(job_id): + _update_meta(job_id, status="cancelled", stage="Cancelled") + return + + _update_meta(job_id, status="parsing", stage="MinerU document parsing...") + mineru_out_dir = job_dir / "mineru_output" + mineru_out_dir.mkdir(parents=True, exist_ok=True) + + result = subprocess.run( + [str(MINERU_PYTHON), str(MINERU_PIPELINE), str(pdf_path)], + cwd=str(MINERU_PIPELINE.parent), + capture_output=True, + text=True, + timeout=600, + ) + + if result.returncode != 0: + raise RuntimeError(f"MinerU failed: {result.stderr[:500]}") + + # Find content_list.json in MinerU output + # MinerU writes output to 
mineru_mvp/output/{stem}/ + stem = pdf_path.stem + mineru_default_out = MINERU_PIPELINE.parent / "output" / stem + content_list_path = None + + if mineru_default_out.exists(): + matches = list(mineru_default_out.glob("*_content_list.json")) + if matches: + content_list_path = matches[0] + # Copy to our job dir + import shutil + shutil.copytree(str(mineru_default_out), str(mineru_out_dir), dirs_exist_ok=True) + + if not content_list_path: + # Fallback: search job mineru_output dir + matches = list(mineru_out_dir.glob("*_content_list.json")) + if matches: + content_list_path = matches[0] + + if not content_list_path or not content_list_path.exists(): + raise RuntimeError(f"MinerU output content_list.json not found. stdout: {result.stdout[:300]}") + + # ── Stage 2: extracting ─────────────────────────────────────────── + if _cancel_flags.get(job_id): + _update_meta(job_id, status="cancelled", stage="Cancelled") + return + + from pipeline.text_assembler import load_content_list, assemble_pages, count_blocks_by_type + from pipeline.entity_extractor import create_model, extract_entities + from pipeline.kg_builder import build_kg, extractions_to_records + + content_list = load_content_list(content_list_path) + pages = assemble_pages(content_list) + total_pages = len(pages) + block_types = count_blocks_by_type(content_list) + + _update_meta( + job_id, + status="extracting", + stage=f"Extracting entities (LangExtract + DeepSeek)...", + progress={"parsed_pages": total_pages, "total_pages": total_pages, "extracted_entities": 0}, + ) + update_doc_status(doc_id, "indexing", pages=total_pages) + + model = create_model() + annotated_docs = [] + total_entities = 0 + + for i, page in enumerate(pages): + if _cancel_flags.get(job_id): + _update_meta(job_id, status="cancelled", stage="Cancelled") + return + + _update_meta( + job_id, + stage=f"Extracting entities page {i+1}/{total_pages} (LangExtract + DeepSeek)...", + progress={"parsed_pages": total_pages, "total_pages": total_pages, 
+ "extracted_entities": total_entities}, + ) + ann_doc = extract_entities(page.text, model) + annotated_docs.append(ann_doc) + total_entities += len(ann_doc.extractions) if ann_doc.extractions else 0 + + # Save raw extractions + records = extractions_to_records(pages, annotated_docs, doc_id) + fs.write_json(job_dir / "extractions.json", records) + + # ── Stage 3: indexing ───────────────────────────────────────────── + _update_meta(job_id, status="indexing", stage="Building knowledge graph...") + + nodes, edges = build_kg(pages, annotated_docs, doc_id) + fs.write_json(job_dir / "kg_nodes.json", nodes) + fs.write_json(job_dir / "kg_edges.json", edges) + + # Merge into global KG + fs.merge_kg(nodes, edges, doc_id) + + # Count alignment types + alignment_counts: dict[str, int] = {} + type_counts: dict[str, int] = {} + for r in records: + al = r.get("alignment") or "null" + alignment_counts[al] = alignment_counts.get(al, 0) + 1 + t = r.get("type", "UNKNOWN") + type_counts[t] = type_counts.get(t, 0) + 1 + + elapsed = round(time.time() - start_time, 1) + stats = { + "blocks": len(content_list), + "block_types": block_types, + "pages": total_pages, + "raw_extractions": len(records), + "nodes": len(nodes), + "edges": len(edges), + "type_counts": type_counts, + "alignment_counts": alignment_counts, + "elapsed_seconds": elapsed, + } + fs.write_json(job_dir / "stats.json", stats) + + _update_meta( + job_id, + status="done", + stage="Complete", + progress={"parsed_pages": total_pages, "total_pages": total_pages, + "extracted_entities": len(records)}, + ) + update_doc_status(doc_id, "indexed", pages=total_pages) + + except Exception as exc: + _update_meta(job_id, status="failed", stage=f"Error: {exc}", error=str(exc)) + update_doc_status(doc_id, "failed") + finally: + _active_threads.pop(job_id, None) + _cancel_flags.pop(job_id, None) + + +def get_job_status(job_id: str) -> dict | None: + return fs.load_job_meta(job_id) + + +def get_job_result(job_id: str) -> dict | None: + 
meta = fs.load_job_meta(job_id) + if not meta: + return None + if meta["status"] != "done": + return meta + + job_dir = fs.job_dir(job_id) + stats = fs.read_json(job_dir / "stats.json") or {} + extractions = fs.read_json(job_dir / "extractions.json") or [] + nodes = fs.read_json(job_dir / "kg_nodes.json") or [] + edges = fs.read_json(job_dir / "kg_edges.json") or [] + + return { + "job_id": meta["job_id"], + "doc_id": meta["doc_id"], + "status": "done", + "stats": stats, + "extractions": extractions, + "nodes": nodes, + "edges": edges, + } + + +def cancel_job(job_id: str) -> tuple[bool, str]: + meta = fs.load_job_meta(job_id) + if not meta: + return False, "not_found" + prev_status = meta["status"] + _cancel_flags[job_id] = True + _update_meta(job_id, status="cancelled", stage="Cancelled by user") + return True, prev_status + + +def count_active_jobs() -> int: + return sum(1 for t in _active_threads.values() if t.is_alive()) diff --git a/backend/services/kg_service.py b/backend/services/kg_service.py new file mode 100644 index 0000000..fc92c03 --- /dev/null +++ b/backend/services/kg_service.py @@ -0,0 +1,167 @@ +"""KG Service — NetworkX graph operations over the global KG.""" +from __future__ import annotations + +import networkx as nx + +from storage import file_store as fs + + +def _load_graph() -> nx.Graph: + nodes = fs.load_kg_nodes() + edges = fs.load_kg_edges() + G = nx.Graph() + for n in nodes: + G.add_node(n["id"], **n) + for e in edges: + G.add_edge(e["source"], e["target"], + relation=e.get("relation", "CO_OCCURS_IN"), + doc_id=e.get("doc_id", ""), + page=e.get("page", 0)) + return G + + +def get_nodes(page: int = 1, page_size: int = 50, + node_type: str | None = None, + doc_id: str | None = None, + confidence: str | None = None) -> dict: + nodes = fs.load_kg_nodes() + G = _load_graph() + # Attach degree + degrees = dict(G.degree()) + for n in nodes: + n["degree"] = degrees.get(n["id"], 0) + + if node_type: + nodes = [n for n in nodes if n.get("type", 
"").upper() == node_type.upper()] + if doc_id: + nodes = [n for n in nodes if n.get("source_doc") == doc_id] + if confidence: + nodes = [n for n in nodes if n.get("confidence") == confidence] + + total = len(nodes) + start = (page - 1) * page_size + return {"total": total, "page": page, "page_size": page_size, + "items": nodes[start: start + page_size]} + + +def get_edges(page: int = 1, page_size: int = 100, + doc_id: str | None = None, + relation: str | None = None) -> dict: + edges = fs.load_kg_edges() + if doc_id: + edges = [e for e in edges if e.get("doc_id") == doc_id] + if relation: + edges = [e for e in edges if e.get("relation") == relation] + total = len(edges) + start = (page - 1) * page_size + return {"total": total, "page": page, "page_size": page_size, + "items": edges[start: start + page_size]} + + +def get_node_detail(node_id: str) -> dict | None: + nodes = fs.load_kg_nodes() + node = next((n for n in nodes if n["id"] == node_id), None) + if not node: + return None + G = _load_graph() + if node_id not in G: + node["degree"] = 0 + node["degree_centrality"] = 0.0 + node["neighbor_count"] = 0 + return node + deg = G.degree(node_id) + centrality = nx.degree_centrality(G) + node["degree"] = deg + node["degree_centrality"] = round(centrality.get(node_id, 0.0), 4) + node["neighbor_count"] = deg + return node + + +def get_neighbors(node_id: str, hops: int = 1) -> dict | None: + nodes = fs.load_kg_nodes() + node = next((n for n in nodes if n["id"] == node_id), None) + if not node: + return None + G = _load_graph() + if node_id not in G: + return { + "center": {"id": node_id, "name": node["name"], "type": node["type"], "page": node.get("page", 0)}, + "hops": hops, "neighbors_by_hop": {}, "total_neighbors": 0, + } + hops = max(1, min(hops, 3)) + reachable = nx.single_source_shortest_path_length(G, node_id, cutoff=hops) + by_hop: dict[str, list] = {} + for nid, dist in reachable.items(): + if dist == 0: + continue + nd = G.nodes[nid] + 
by_hop.setdefault(str(dist), []).append({ + "id": nid, "name": nd.get("name", ""), "type": nd.get("type", ""), "page": nd.get("page", 0) + }) + total = sum(len(v) for v in by_hop.values()) + return { + "center": {"id": node_id, "name": node["name"], "type": node["type"], "page": node.get("page", 0)}, + "hops": hops, + "neighbors_by_hop": by_hop, + "total_neighbors": total, + } + + +def get_stats() -> dict: + nodes = fs.load_kg_nodes() + edges = fs.load_kg_edges() + G = _load_graph() + + type_dist: dict[str, int] = {} + for n in nodes: + t = n.get("type", "UNKNOWN") + type_dist[t] = type_dist.get(t, 0) + 1 + + relation_types: dict[str, int] = {} + for e in edges: + r = e.get("relation", "CO_OCCURS_IN") + relation_types[r] = relation_types.get(r, 0) + 1 + + density = round(nx.density(G), 4) if G.number_of_nodes() > 1 else 0.0 + + top5: list[dict] = [] + if G.number_of_nodes() > 0: + centrality = nx.degree_centrality(G) + for nid, c in sorted(centrality.items(), key=lambda x: x[1], reverse=True)[:5]: + nd = G.nodes[nid] + top5.append({"node_id": nid, "name": nd.get("name", ""), "type": nd.get("type", ""), + "centrality": round(c, 4)}) + + source_docs = list({n.get("source_doc", "") for n in nodes if n.get("source_doc")}) + + return { + "total_nodes": len(nodes), + "total_edges": len(edges), + "density": density, + "type_distribution": type_dist, + "relation_types": relation_types, + "top5_central_nodes": top5, + "source_documents": source_docs, + } + + +def export_kg(doc_id: str | None = None) -> dict: + from datetime import datetime, timezone + nodes = fs.load_kg_nodes() + edges = fs.load_kg_edges() + G = _load_graph() + degrees = dict(G.degree()) + for n in nodes: + n["degree"] = degrees.get(n["id"], 0) + if doc_id: + nodes = [n for n in nodes if n.get("source_doc") == doc_id] + edges = [e for e in edges if e.get("doc_id") == doc_id] + return { + "format": "json", + "doc_id": doc_id, + "total_nodes": len(nodes), + "total_edges": len(edges), + "exported_at": 
datetime.now(timezone.utc).isoformat(), + "nodes": nodes, + "edges": edges, + } diff --git a/backend/services/qa_service.py b/backend/services/qa_service.py new file mode 100644 index 0000000..62e4fbb --- /dev/null +++ b/backend/services/qa_service.py @@ -0,0 +1,85 @@ +"""QA Service — Agentic-RAG wrapper.""" +from __future__ import annotations + +import time +import uuid +from datetime import datetime, timezone + +from storage import file_store as fs + + +def run_query(question: str, history: list[dict]) -> dict: + from pipeline.qa_agent import run_qa + + nodes = fs.load_kg_nodes() + edges = fs.load_kg_edges() + + if not nodes: + raise ValueError("KG_EMPTY") + + start = time.time() + result = run_qa(question, history, nodes, edges) + elapsed = round(time.time() - start, 2) + + query_id = f"q_{uuid.uuid4().hex[:10]}" + now = datetime.now(timezone.utc).isoformat() + + record = { + "id": query_id, + "question": question, + "answer": result["answer"], + "tool_calls": result["tool_calls"], + "cited_nodes": result["cited_nodes"], + "duration_seconds": elapsed, + "timestamp": now, + } + fs.append_query_history(record) + return record + + +def get_history(page: int = 1, page_size: int = 20) -> dict: + all_records = fs.load_query_history() + total = len(all_records) + start = (page - 1) * page_size + return { + "total": total, + "page": page, + "page_size": page_size, + "items": all_records[start: start + page_size], + } + + +def start_batch(questions: list[str]) -> dict: + import threading + + batch_id = f"batch_{uuid.uuid4().hex[:10]}" + now = datetime.now(timezone.utc).isoformat() + meta = { + "batch_id": batch_id, + "total": len(questions), + "completed": 0, + "failed": 0, + "status": "submitted", + "created_at": now, + "results": [], + } + fs.save_batch_meta(batch_id, meta) + + def _run(): + for q in questions: + try: + res = run_query(q, []) + meta["results"].append(res) + meta["completed"] += 1 + except Exception as e: + meta["failed"] += 1 + 
meta["results"].append({"question": q, "error": str(e)}) + meta["status"] = "done" + fs.save_batch_meta(batch_id, meta) + + threading.Thread(target=_run, daemon=True).start() + return {"batch_id": batch_id, "total": len(questions), "status": "submitted", "created_at": now} + + +def get_batch_result(batch_id: str) -> dict | None: + return fs.load_batch_meta(batch_id) diff --git a/backend/services/search_service.py b/backend/services/search_service.py new file mode 100644 index 0000000..8cdfc10 --- /dev/null +++ b/backend/services/search_service.py @@ -0,0 +1,106 @@ +"""Search Service — entity, path, and graph search.""" +from __future__ import annotations + +import networkx as nx + +from storage import file_store as fs + + +def _load_graph() -> nx.Graph: + nodes = fs.load_kg_nodes() + edges = fs.load_kg_edges() + G = nx.Graph() + for n in nodes: + G.add_node(n["id"], **n) + for e in edges: + G.add_edge(e["source"], e["target"], + relation=e.get("relation", "CO_OCCURS_IN"), + doc_id=e.get("doc_id", ""), page=e.get("page", 0)) + return G + + +def search_entities(q: str, entity_type: str | None = None, limit: int = 15) -> dict: + nodes = fs.load_kg_nodes() + G = _load_graph() + degrees = dict(G.degree()) + q_lower = q.lower() + matches = [n for n in nodes if q_lower in n.get("name", "").lower()] + if entity_type: + matches = [n for n in matches if n.get("type", "").upper() == entity_type.upper()] + for n in matches: + n["degree"] = degrees.get(n["id"], 0) + matches = matches[:limit] + return {"query": q, "total": len(matches), "items": matches} + + +def search_path(from_id: str, to_id: str, max_hops: int = 3) -> dict | None: + nodes = fs.load_kg_nodes() + node_map = {n["id"]: n for n in nodes} + if from_id not in node_map or to_id not in node_map: + return None # node not found + + G = _load_graph() + max_hops = max(1, min(max_hops, 5)) + + try: + raw_paths = list(nx.all_simple_paths(G, from_id, to_id, cutoff=max_hops)) + except nx.NetworkXError: + raw_paths = [] + + 
paths = [] + for path_nodes in raw_paths: + path_edges = [] + for i in range(len(path_nodes) - 1): + s, t = path_nodes[i], path_nodes[i + 1] + edge_data = G.edges[s, t] + path_edges.append({"source": s, "target": t, + "relation": edge_data.get("relation", "CO_OCCURS_IN")}) + paths.append({ + "length": len(path_nodes) - 1, + "nodes": [{"id": nid, "name": node_map.get(nid, {}).get("name", nid), + "type": node_map.get(nid, {}).get("type", "")} for nid in path_nodes], + "edges": path_edges, + }) + + from_node = node_map[from_id] + to_node = node_map[to_id] + return { + "from": {"id": from_id, "name": from_node.get("name", ""), "type": from_node.get("type", "")}, + "to": {"id": to_id, "name": to_node.get("name", ""), "type": to_node.get("type", "")}, + "max_hops": max_hops, + "paths": paths, + "total_paths": len(paths), + } + + +def search_graph(q: str, include_neighbors: bool = False) -> dict: + nodes = fs.load_kg_nodes() + edges = fs.load_kg_edges() + G = _load_graph() + degrees = dict(G.degree()) + q_lower = q.lower() + + matched = [n for n in nodes if q_lower in n.get("name", "").lower()] + matched_ids = {n["id"] for n in matched} + for n in matched: + n["degree"] = degrees.get(n["id"], 0) + + if include_neighbors: + neighbor_ids = set() + for nid in matched_ids: + if nid in G: + neighbor_ids.update(G.neighbors(nid)) + all_relevant = matched_ids | neighbor_ids + else: + all_relevant = matched_ids + + subgraph_edges = [ + e for e in edges + if e.get("source") in all_relevant and e.get("target") in all_relevant + ] + + return { + "query": q, + "matched_nodes": matched, + "subgraph_edges": subgraph_edges, + } diff --git a/backend/storage/__init__.py b/backend/storage/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/storage/file_store.py b/backend/storage/file_store.py new file mode 100644 index 0000000..6da85e9 --- /dev/null +++ b/backend/storage/file_store.py @@ -0,0 +1,268 @@ +""" +File Store — unified JSON read/write for all backend data. 
# (module docstring, continued from the previous patch line)
# All data lives under backend/data/.
from __future__ import annotations

import json
import os
import shutil
import uuid
from pathlib import Path
from typing import Any

# Root data directory relative to this file
_BASE = Path(__file__).parent.parent / "data"

UPLOADS_DIR = _BASE / "uploads"
JOBS_DIR = _BASE / "jobs"
KG_DIR = _BASE / "kg"
# NOTE: nothing in this module reads QUERY_DIR. query_history.jsonl is resolved
# by query_history_path() directly under _BASE, not under jobs/. The name is
# kept only so existing importers keep working.
QUERY_DIR = _BASE / "jobs"

# Ensure directories exist at import time
for _d in (UPLOADS_DIR, JOBS_DIR, KG_DIR):
    _d.mkdir(parents=True, exist_ok=True)


# ---------------------------------------------------------------------------
# Generic helpers
# ---------------------------------------------------------------------------

def read_json(path: Path) -> Any:
    """Read and parse a JSON file. Returns None if the file doesn't exist."""
    if not path.exists():
        return None
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def write_json(path: Path, data: Any) -> None:
    """Atomically write ``data`` as JSON (write to a temp file, then rename).

    The temp file gets a unique name next to the target so two writers of the
    same path never share it, and it is removed if serialization or the rename
    fails.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp = path.with_name(f"{path.name}.{uuid.uuid4().hex}.tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        os.replace(tmp, path)
    finally:
        # No-op after a successful rename; cleans up after a failed write.
        tmp.unlink(missing_ok=True)


def append_jsonl(path: Path, record: dict) -> None:
    """Append a record to a JSONL file."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(record, ensure_ascii=False) + "\n")


def read_jsonl(path: Path) -> list[dict]:
    """Read all records from a JSONL file; a missing file yields an empty list."""
    if not path.exists():
        return []
    records = []
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError:
                # Best effort: a malformed line is skipped, not fatal.
                pass
    return records


# ---------------------------------------------------------------------------
# Document helpers
# ---------------------------------------------------------------------------

def docs_index_path() -> Path:
    """Return the path of the documents index file."""
    return _BASE / "docs_index.json"


def load_docs_index() -> dict[str, dict]:
    """Load the documents index {doc_id: DocumentInfo dict}."""
    data = read_json(docs_index_path())
    return data if isinstance(data, dict) else {}


def save_docs_index(index: dict[str, dict]) -> None:
    """Persist the whole documents index."""
    write_json(docs_index_path(), index)


def get_doc(doc_id: str) -> dict | None:
    """Return the index entry for ``doc_id``, or None if it is unknown."""
    return load_docs_index().get(doc_id)


def save_doc(doc: dict) -> None:
    """Insert or overwrite the index entry keyed by ``doc["doc_id"]``."""
    index = load_docs_index()
    index[doc["doc_id"]] = doc
    save_docs_index(index)


def delete_doc(doc_id: str) -> bool:
    """Remove a document from the index and delete its uploaded file.

    Returns False when ``doc_id`` is not in the index, True otherwise.
    """
    index = load_docs_index()
    # Capture the entry while removing it: looking it up after the deletion
    # would always miss and resolve the upload path to UPLOADS_DIR itself.
    doc_info = index.pop(doc_id, None)
    if doc_info is None:
        return False
    save_docs_index(index)
    # Remove upload file
    upload_filename = doc_info.get("upload_filename") or ""
    if upload_filename:
        upload_path = UPLOADS_DIR / upload_filename
        if upload_path.is_file():
            upload_path.unlink(missing_ok=True)
    return True


# ---------------------------------------------------------------------------
# Job helpers
# ---------------------------------------------------------------------------

def job_dir(job_id: str) -> Path:
    """Return the working directory of a job."""
    return JOBS_DIR / job_id


def job_meta_path(job_id: str) -> Path:
    """Return the path of a job's meta.json."""
    return job_dir(job_id) / "meta.json"


def load_job_meta(job_id: str) -> dict | None:
    """Load a job's metadata, or None if it has none."""
    return read_json(job_meta_path(job_id))


def save_job_meta(job_id: str, meta: dict) -> None:
    """Persist a job's metadata, creating the job directory if needed."""
    job_dir(job_id).mkdir(parents=True, exist_ok=True)
    write_json(job_meta_path(job_id), meta)


def list_all_jobs() -> list[dict]:
    """Return the metadata of every job directory that has a non-empty meta.json."""
    metas = []
    for d in JOBS_DIR.iterdir():
        if d.is_dir():
            meta = read_json(d / "meta.json")
            if meta:
                metas.append(meta)
    return metas


def delete_job(job_id: str) -> None:
    """Delete a job's directory tree, if it exists."""
    jd = job_dir(job_id)
    if jd.exists():
        shutil.rmtree(jd)


# ---------------------------------------------------------------------------
# Global KG helpers
# ---------------------------------------------------------------------------

def kg_nodes_path() -> Path:
    """Return the path of the global KG nodes file."""
    return KG_DIR / "kg_nodes.json"


def kg_edges_path() -> Path:
    """Return the path of the global KG edges file."""
    return KG_DIR / "kg_edges.json"


def load_kg_nodes() -> list[dict]:
    """Load the global node list; a missing or non-list file yields []."""
    data = read_json(kg_nodes_path())
    return data if isinstance(data, list) else []


def load_kg_edges() -> list[dict]:
    """Load the global edge list; a missing or non-list file yields []."""
    data = read_json(kg_edges_path())
    return data if isinstance(data, list) else []


def save_kg_nodes(nodes: list[dict]) -> None:
    """Persist the global node list."""
    write_json(kg_nodes_path(), nodes)


def save_kg_edges(edges: list[dict]) -> None:
    """Persist the global edge list."""
    write_json(kg_edges_path(), edges)


def _edge_key(edge: dict) -> tuple:
    """Return the undirected dedup key (low endpoint, high endpoint, doc_id) of an edge."""
    s, t = edge["source"], edge["target"]
    # .get() keeps this consistent with the doc filter in merge_kg, which
    # already tolerates edges that carry no "doc_id".
    return (min(s, t), max(s, t), edge.get("doc_id"))


def merge_kg(new_nodes: list[dict], new_edges: list[dict], doc_id: str) -> tuple[int, int]:
    """Merge one document's KG output into the global KG.

    Existing nodes/edges attributed to ``doc_id`` are dropped first, so
    re-indexing a document replaces its earlier contribution. New nodes are
    deduplicated by (lower-cased name, type) and new edges by their undirected
    endpoints plus doc_id.

    Returns (total_nodes, total_edges) of the merged graph.
    """
    # Remove nodes/edges from this doc
    existing_nodes = [n for n in load_kg_nodes() if n.get("source_doc") != doc_id]
    existing_edges = [e for e in load_kg_edges() if e.get("doc_id") != doc_id]

    # Merge: deduplicate nodes by (name.lower(), type)
    node_keys: set[tuple] = {(n["name"].lower(), n["type"]) for n in existing_nodes}
    for n in new_nodes:
        key = (n["name"].lower(), n["type"])
        if key not in node_keys:
            existing_nodes.append(n)
            node_keys.add(key)

    # Merge edges: deduplicate by (min(src,tgt), max(src,tgt), doc_id)
    edge_keys: set[tuple] = {_edge_key(e) for e in existing_edges}
    for e in new_edges:
        key = _edge_key(e)
        if key not in edge_keys:
            existing_edges.append(e)
            edge_keys.add(key)

    save_kg_nodes(existing_nodes)
    save_kg_edges(existing_edges)
    return len(existing_nodes), len(existing_edges)


def remove_doc_from_kg(doc_id: str) -> tuple[int, int]:
    """Remove all nodes/edges from a document. Returns (removed_nodes, removed_edges)."""
    nodes = load_kg_nodes()
    edges = load_kg_edges()
    old_n, old_e = len(nodes), len(edges)
    nodes = [n for n in nodes if n.get("source_doc") != doc_id]
    edges = [e for e in edges if e.get("doc_id") != doc_id]
    save_kg_nodes(nodes)
    save_kg_edges(edges)
    return old_n - len(nodes), old_e - len(edges)


# ---------------------------------------------------------------------------
# Query history helpers
# ---------------------------------------------------------------------------

def query_history_path() -> Path:
    """Return the path of the query history JSONL file."""
    return _BASE / "query_history.jsonl"


def append_query_history(result: dict) -> None:
    """Append one query record to the history file."""
    append_jsonl(query_history_path(), result)


def load_query_history() -> list[dict]:
    """Return all recorded queries, newest first."""
    records = read_jsonl(query_history_path())
    return list(reversed(records))  # newest first


# ---------------------------------------------------------------------------
# Batch job helpers
# ---------------------------------------------------------------------------

def batch_meta_path(batch_id: str) -> Path:
    """Return the path of a batch's metadata file."""
    return _BASE / "batches" / f"{batch_id}.json"


def load_batch_meta(batch_id: str) -> dict | None:
    """Load a batch's metadata, or None if it has none."""
    return read_json(batch_meta_path(batch_id))


def save_batch_meta(batch_id: str, meta: dict) -> None:
    """Persist a batch's metadata."""
    write_json(batch_meta_path(batch_id), meta)


# ---------------------------------------------------------------------------
# Storage usage
# ---------------------------------------------------------------------------

def storage_used_mb() -> float:
    """Return the total size of all files under the data directory, in MiB (2 decimals)."""
    total = 0
    for path in _BASE.rglob("*"):
        try:
            if path.is_file():
                total += path.stat().st_size
        except OSError:
            # The file vanished mid-scan (e.g. a temp file that write_json
            # just renamed); skip it instead of failing the whole scan.
            continue
    return round(total / (1024 * 1024), 2)


# (patch metadata and the opening of the next file, preserved verbatim)
# diff --git a/backend/tests/test_api.py b/backend/tests/test_api.py
# new file mode 100644
# index 0000000..527f280
# --- /dev/null
# +++ b/backend/tests/test_api.py
# @@ -0,0 +1,256 @@
# +"""
# +API integration tests — tests all major endpoints against a running server.
+Run with: python tests/test_api.py +Server must be running on http://localhost:8000 +""" +import json +import sys +import time +import urllib.request +import urllib.error +from pathlib import Path + +BASE = "http://localhost:8000/api/v1" + +PASS = "\033[92m[PASS]\033[0m" +FAIL = "\033[91m[FAIL]\033[0m" +INFO = "\033[94m[INFO]\033[0m" + +results = {"passed": 0, "failed": 0} + + +def req(method: str, path: str, body: dict | None = None, form: dict | None = None) -> dict: + url = BASE + path + try: + if method == "GET" and not body and not form: + r = urllib.request.urlopen(url, timeout=30) + else: + if body is not None: + data = json.dumps(body).encode() + req_obj = urllib.request.Request(url, data=data, method=method, + headers={"Content-Type": "application/json"}) + else: + req_obj = urllib.request.Request(url, method=method) + r = urllib.request.urlopen(req_obj, timeout=30) + return json.loads(r.read().decode()) + except urllib.error.HTTPError as e: + return json.loads(e.read().decode()) + + +def check(name: str, condition: bool, detail: str = "") -> None: + if condition: + results["passed"] += 1 + print(f" {PASS} {name}") + else: + results["failed"] += 1 + print(f" {FAIL} {name} {detail}") + + +def wait_for_server(max_retries: int = 15) -> bool: + print(f"{INFO} Waiting for server at {BASE}...") + for i in range(max_retries): + try: + urllib.request.urlopen(BASE.replace("/api/v1", "/"), timeout=3) + print(f"{INFO} Server is up.") + return True + except Exception: + time.sleep(1) + return False + + +# ───────────────────────────────────────────────────────────────────────────── +# Test groups +# ───────────────────────────────────────────────────────────────────────────── + +def test_system(): + print("\n── F 组: System ──") + + r = req("GET", "/health") + check("GET /health returns code=0", r.get("code") == 0) + check("health data.status exists", "status" in (r.get("data") or {})) + check("health data.components exists", "components" in (r.get("data") or {})) + 
print(f" {INFO} status={r.get('data',{}).get('status')} uptime={r.get('data',{}).get('uptime_seconds')}s") + + r = req("GET", "/system/stats") + check("GET /system/stats returns code=0", r.get("code") == 0) + d = r.get("data") or {} + check("stats has total_documents", "total_documents" in d) + check("stats has total_nodes", "total_nodes" in d) + print(f" {INFO} docs={d.get('total_documents')} nodes={d.get('total_nodes')} edges={d.get('total_edges')}") + + r = req("GET", "/system/formats") + check("GET /system/formats returns code=0", r.get("code") == 0) + d = r.get("data") or {} + check("formats list is non-empty", len(d.get("formats", [])) > 0) + exts = [f["ext"] for f in d.get("formats", [])] + check("pdf format present", "pdf" in exts) + check("docx format present", "docx" in exts) + + r = req("GET", "/system/demo") + check("GET /system/demo returns code=0 or 3002", r.get("code") in (0, 3002)) + if r.get("code") == 0: + d = r.get("data") or {} + check("demo data has nodes", "nodes" in d) + print(f" {INFO} demo: {len(d.get('nodes',[]))} nodes, {len(d.get('edges',[]))} edges") + else: + print(f" {INFO} demo data not available (no KG yet) — code={r.get('code')}") + + +def test_documents(): + print("\n── A 组: Documents ──") + + r = req("GET", "/documents") + check("GET /documents returns code=0", r.get("code") == 0) + d = r.get("data") or {} + check("documents list has total field", "total" in d) + check("documents list has items field", "items" in d) + print(f" {INFO} total documents={d.get('total', 0)}") + + # Upload a test text file (not a real supported format to test validation) + print(" Testing upload validation...") + import urllib.request, io + boundary = "boundary123" + body_parts = ( + f"--{boundary}\r\n" + f'Content-Disposition: form-data; name="file"; filename="test.xyz"\r\n' + f"Content-Type: application/octet-stream\r\n\r\n" + f"dummy content\r\n" + f"--{boundary}--\r\n" + ).encode() + req_obj = urllib.request.Request( + BASE + "/documents/upload", + 
data=body_parts, + method="POST", + headers={"Content-Type": f"multipart/form-data; boundary={boundary}"}, + ) + try: + urllib.request.urlopen(req_obj, timeout=10) + r_upload = {} + except urllib.error.HTTPError as e: + r_upload = json.loads(e.read().decode()) + check("upload unsupported format returns code=1002", r_upload.get("code") == 1002) + + r = req("GET", "/documents/nonexistent_id") + check("GET /documents/nonexistent returns code=2001", r.get("code") == 2001) + + +def test_indexing(): + print("\n── B 组: Indexing ──") + + r = req("POST", "/index/start", body={"doc_id": "nonexistent_doc"}) + check("start indexing nonexistent doc returns 2001", r.get("code") == 2001) + + r = req("GET", "/index/status/nonexistent_job") + check("get status nonexistent job returns 2002", r.get("code") == 2002) + + r = req("GET", "/index/result/nonexistent_job") + check("get result nonexistent job returns 2002", r.get("code") == 2002) + + r = req("DELETE", "/index/jobs/nonexistent_job") + check("cancel nonexistent job returns 2002", r.get("code") == 2002) + + +def test_kg(): + print("\n── C 组: Knowledge Graph ──") + + r = req("GET", "/kg/stats") + check("GET /kg/stats returns code=0", r.get("code") == 0) + d = r.get("data") or {} + check("stats has total_nodes", "total_nodes" in d) + check("stats has total_edges", "total_edges" in d) + print(f" {INFO} KG: {d.get('total_nodes')} nodes, {d.get('total_edges')} edges") + + r = req("GET", "/kg/nodes") + check("GET /kg/nodes returns code 0 or 3002", r.get("code") in (0, 3002)) + if r.get("code") == 0: + d = r.get("data") or {} + check("nodes data has items", "items" in d) + print(f" {INFO} nodes total={d.get('total')}") + + if d.get("items"): + node_id = d["items"][0]["id"] + r2 = req("GET", f"/kg/nodes/{node_id}") + check(f"GET /kg/nodes/{node_id} returns code=0", r2.get("code") == 0) + + r3 = req("GET", f"/kg/nodes/{node_id}/neighbors?hops=1") + check(f"GET /kg/nodes/{node_id}/neighbors returns code=0", r3.get("code") == 0) + else: + 
print(f" {INFO} KG is empty (code=3002) — skipping node detail tests") + + r = req("GET", "/kg/nodes/definitely_not_a_real_node") + check("GET /kg/nodes/invalid returns code=3001", r.get("code") == 3001) + + r = req("GET", "/kg/edges") + check("GET /kg/edges returns code=0", r.get("code") == 0) + + r = req("GET", "/kg/export") + check("GET /kg/export returns code=0", r.get("code") == 0) + + +def test_search(): + print("\n── E 组: Search ──") + + r = req("GET", "/search/entities?q=graph") + check("GET /search/entities returns code=0", r.get("code") == 0) + d = r.get("data") or {} + check("search entities has query field", "query" in d) + check("search entities has items field", "items" in d) + print(f" {INFO} 'graph' search: {d.get('total', 0)} results") + + r = req("GET", "/search/entities?q=technology&type=TECHNOLOGY") + check("GET /search/entities with type filter returns code=0", r.get("code") == 0) + + r = req("GET", "/search/path?max_hops=2") + check("path search without from/to returns 1001", r.get("code") == 1001) + + r = req("GET", "/search/graph?q=knowledge") + check("GET /search/graph returns code=0", r.get("code") == 0) + d = r.get("data") or {} + check("graph search has matched_nodes", "matched_nodes" in d) + + +def test_query(): + print("\n── D 组: QA Query ──") + + # Don't call /query (POST) in basic tests as it needs DeepSeek API + KG data + r = req("GET", "/query/history") + check("GET /query/history returns code=0", r.get("code") == 0) + d = r.get("data") or {} + check("history has total field", "total" in d) + check("history has items field", "items" in d) + print(f" {INFO} query history: {d.get('total', 0)} records") + + r = req("GET", "/query/batch/nonexistent_batch") + check("GET /query/batch/nonexistent returns 2002", r.get("code") == 2002) + + r = req("POST", "/query/batch", body={"questions": ["test question"]}) + check("POST /query/batch returns code=0", r.get("code") == 0) + d = r.get("data") or {} + check("batch has batch_id", "batch_id" in 
d) + + +# ───────────────────────────────────────────────────────────────────────────── +# Main +# ───────────────────────────────────────────────────────────────────────────── + +if __name__ == "__main__": + if not wait_for_server(): + print(f"\n{FAIL} Server not responding. Start with: python main.py") + sys.exit(1) + + test_system() + test_documents() + test_indexing() + test_kg() + test_search() + test_query() + + total = results["passed"] + results["failed"] + print(f"\n{'='*50}") + print(f"Results: {results['passed']}/{total} passed, {results['failed']} failed") + if results["failed"] == 0: + print(f"{PASS} All tests passed!") + else: + print(f"{FAIL} {results['failed']} test(s) failed") + print(f"{'='*50}") + sys.exit(0 if results["failed"] == 0 else 1) diff --git a/docs/agentic_rag_specification-v1.0.md b/docs/agentic_rag_specification-v1.0.md new file mode 100644 index 0000000..22f804d --- /dev/null +++ b/docs/agentic_rag_specification-v1.0.md @@ -0,0 +1,779 @@ +# Agentic-RAG 规范文档 v1.0 + +> GraphRAG 问答阶段核心流程:Knowledge Graph → LangChain Agent → QA +> +> 数据来源:Bridge Pipeline 输出(`kg_nodes.json` + `kg_edges.json`) +> 测试验证日期:2026-03-05 +> 全流程运行耗时:~40s(4 个测试查询) + +--- + +## 目录 + +- [一、完整执行思路与脚本位置](#一完整执行思路与脚本位置) +- [二、LangChain Agent 输入输出规范](#二langchain-agent-输入输出规范) +- [三、MinerU ↔ Agentic-RAG 对接规范与核心架构](#三mineru--agentic-rag-对接规范与核心架构) +- [四、问答流程最终数据返回格式规范](#四问答流程最终数据返回格式规范) +- [五、虚拟环境与依赖](#五虚拟环境与依赖) + +--- + +## 一、完整执行思路与脚本位置 + +### 1.1 总体架构定位 + +Agentic-RAG 是 GraphRAG 系统的**问答阶段**,位于 Bridge Pipeline 之后,负责将知识图谱转化为可交互的智能问答能力。 + +``` +【已完成阶段】 【本阶段:Agentic-RAG】 +──────────────────── ────────────────────────── +PDF + ↓ MinerU Cloud API +content_list.json + ↓ Bridge Pipeline +kg_nodes.json (40 nodes) ──────────→ NetworkX Graph (内存) +kg_edges.json (780 edges) ↓ + 4 个 LangChain @tool + ↓ + LangChain v1 create_agent + (DeepSeek deepseek-chat) + ↓ + ReAct 推理循环 + ↓ + 自然语言答案 +``` + +### 1.2 五步执行流程 + +| 步骤 | 模块 | 说明 | +|------|------|------| +| Step 0 | 环境 + 配置 | 加载 
`.env`(DEEPSEEK_API_KEY),初始化 `ChatOpenAI` | +| Step 1 | KG 加载 | 读取 `kg_nodes.json` + `kg_edges.json`,构建 NetworkX 无向图 | +| Step 2 | Tool 注册 | 用 `@tool` 装饰器注册 4 个 KG 检索工具 | +| Step 3 | Agent 构建 | `create_agent(model, tools, system_prompt)` 编译 LangGraph | +| Step 4 | 问答调用 | `agent.invoke({"messages": [("human", question)]})` | +| Step 5 | 结果提取 | `result["messages"][-1].content` 获取最终答案 | + +### 1.3 测试脚本存放位置 + +``` +F:\GraphRAGAgent\graphrag_pipeline\ +├── agentic_rag_mvp.py ← 主测试脚本(本规范对应文件) +├── .env ← DEEPSEEK_API_KEY 配置 +└── output/ + ├── kg_nodes.json ← Bridge Pipeline 生成(40 节点) + └── kg_edges.json ← Bridge Pipeline 生成(780 边) +``` + +### 1.4 运行命令 + +```bash +# MVP 连通性测试(4 个预设测试查询) +F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe \ + F:/GraphRAGAgent/graphrag_pipeline/agentic_rag_mvp.py +``` + +### 1.5 ReAct 推理循环详解 + +Agent 使用 **ReAct(Reasoning + Acting)** 模式,每个问题的处理流如下: + +``` +用户输入 (question: str) + │ + ▼ +┌─────────────────────────────────────────────────┐ +│ LLM Reasoning(DeepSeek deepseek-chat) │ +│ 决策:需要调用哪个工具?参数是什么? │ +└─────────────────────────────────────────────────┘ + │ tool_call + ▼ +┌─────────────────────────────────────────────────┐ +│ Tool Execution(NetworkX 本地计算,无 API 调用) │ +│ search_entities / get_neighbors / │ +│ get_entities_by_type / describe_graph │ +└─────────────────────────────────────────────────┘ + │ ToolMessage(工具返回的文本结果) + ▼ +┌─────────────────────────────────────────────────┐ +│ LLM Observation(观察工具结果) │ +│ 决策:结果够用了吗?还需要调更多工具? 
│ +└─────────────────────────────────────────────────┘ + │ 继续 tool_call 或输出最终答案 + ▼ +AIMessage(最终自然语言答案) +``` + +**实测工具调用模式(4 个测试查询):** + +| 查询类型 | 工具调用序列 | 特点 | +|---------|------------|------| +| 图谱整体概览 | `describe_graph` | 单次工具调用 | +| 类型枚举 | `get_entities_by_type` | 单次工具调用 | +| 多跳关系推理 | `search_entities` → `get_neighbors` | 两步串行调用 | +| 概念精确查找 | `search_entities` → `get_neighbors` | 两步串行调用 | + +--- + +## 二、LangChain Agent 输入输出规范 + +### 2.1 LLM 适配规范 + +#### 2.1.1 DeepSeek → LangChain 标准组件 + +LangChain v1 使用 `ChatOpenAI` 通过 `base_url` 覆盖接入任何 OpenAI 兼容 API: + +```python +from langchain_openai import ChatOpenAI + +llm = ChatOpenAI( + model="deepseek-chat", # DeepSeek 模型名 + api_key=DEEPSEEK_API_KEY, # 来自 graphrag_pipeline/.env + base_url="https://api.deepseek.com", # OpenAI 兼容端点 + temperature=0, # 问答场景确定性输出 +) +``` + +| 参数 | 值 | 说明 | +|------|-----|------| +| `model` | `"deepseek-chat"` | DeepSeek 实际模型标识 | +| `api_key` | `${DEEPSEEK_API_KEY}` | 从 `.env` 读取,与 Bridge Pipeline 共用 | +| `base_url` | `"https://api.deepseek.com"` | SDK 自动补全 `/v1` 路径 | +| `temperature` | `0` | 问答场景设为 0,保证可重现性 | + +#### 2.1.2 与 LangExtract 中 DeepSeek 的区别 + +| 对比项 | LangExtract 中的 DeepSeek | Agentic-RAG 中的 DeepSeek | +|--------|--------------------------|--------------------------| +| 接入方式 | 直接实例化 `OpenAILanguageModel` | LangChain `ChatOpenAI` 标准组件 | +| API Key 环境变量 | `OPENAI_API_KEY` | `DEEPSEEK_API_KEY` | +| 调用方式 | `lx.extract(model=model)` | `agent.invoke({"messages": ...})` | +| 输出格式 | JSON(实体抽取) | 自然语言(问答) | +| Tool Calling | 不支持(单轮推理) | 支持(ReAct 多轮) | + +### 2.2 Agent 构建规范 + +#### 2.2.1 LangChain v1 create_agent + +```python +from langchain.agents import create_agent + +agent = create_agent( + model=llm, # ChatOpenAI 实例 + tools=_tools, # List[BaseTool],4 个工具 + system_prompt=SYSTEM_PROMPT, # 系统提示词字符串 +) +``` + +**版本注意事项:** + +| API | 状态 | 说明 | +|-----|------|------| +| `langchain.agents.create_agent` | ✅ LangChain v1 推荐 | 本项目使用 | +| `langgraph.prebuilt.create_react_agent` | ⚠️ Deprecated 
in LangGraph V1.0 | 已废弃,勿用 | +| `langchain.agents.create_react_agent` (旧版) | ❌ Legacy | 已移除 | + +#### 2.2.2 System Prompt 规范 + +``` +You are a Knowledge Graph QA assistant. You have access to a knowledge graph +extracted from academic documents about GraphRAG and related technologies. + +The graph contains: +- {node_count} deduplicated entities ({type_list} types) +- {edge_count} CO_OCCURS_IN edges representing same-page co-occurrence + +Available tools: +1. search_entities — find entities by keyword substring +2. get_neighbors — explore entity relationships (N-hop BFS) +3. get_entities_by_type — list all entities of a type +4. describe_graph — get graph statistics overview + +Reasoning strategy: +- Always use at least one tool before answering a factual question +- For relationship questions, use get_neighbors after identifying the entity with search_entities +- For enumeration questions, use get_entities_by_type +- Synthesize tool results into a clear, concise answer +- Cite the entity names and types in your final answer +``` + +### 2.3 Agent 输入规范 + +#### 2.3.1 invoke 输入格式 + +```python +result = agent.invoke({ + "messages": [ + ("human", question) # 用户问题(自然语言字符串) + ] +}) +``` + +**输入字段规范:** + +| 字段 | 类型 | 说明 | +|------|------|------| +| `messages` | `list[tuple[str, str]]` | 消息列表,格式 `(role, content)` | +| `role` | `"human"` \| `"ai"` \| `"system"` | 消息角色 | +| `content` | `str` | 消息内容 | + +**多轮对话输入(支持历史上下文):** + +```python +result = agent.invoke({ + "messages": [ + ("human", "What is GraphRAG?"), + ("ai", "GraphRAG is a knowledge graph-enhanced RAG system..."), + ("human", "How does it relate to LLMs?"), # 当前问题 + ] +}) +``` + +### 2.4 Agent 输出规范 + +#### 2.4.1 invoke 原始返回 + +```python +{ + "messages": [ + HumanMessage(content="What is GraphRAG?"), + AIMessage(content="", tool_calls=[...]), # 工具调用 + ToolMessage(content="...", tool_call_id="..."), # 工具结果 + AIMessage(content="GraphRAG is an advanced...") # 最终答案 + ] +} +``` + +#### 2.4.2 消息类型枚举 + +| 消息类型 | 角色 | 说明 | 
+|---------|------|------| +| `HumanMessage` | `human` | 用户输入 | +| `AIMessage`(tool_calls 非空) | `ai` | LLM 决策发起工具调用 | +| `ToolMessage` | `tool` | 工具执行结果 | +| `AIMessage`(tool_calls 为空) | `ai` | 最终自然语言答案 | + +#### 2.4.3 最终答案提取 + +```python +final_msg = result["messages"][-1] +answer = final_msg.content # str,最终自然语言答案 +``` + +### 2.5 四个工具输入输出规范 + +#### Tool 1: `search_entities` + +| 项目 | 规范 | +|------|------| +| 入参 | `query: str` — 关键词(大小写不敏感子串匹配) | +| 匹配逻辑 | `query.lower() in entity_name.lower()` | +| 返回格式 | 多行文本,每行格式:`[{type}] "{name}" (confidence={c}, page={p}, id={id})` | +| 无匹配时 | 返回提示 + 前 8 个样例实体名 | +| 最多返回 | 15 条 | + +**实际调用示例:** + +``` +输入: query="GraphRAG" +输出: +Found 3 entity(ies) matching 'GraphRAG': + [TECHNOLOGY] "GraphRAG" (confidence=match_exact, page=0, id=node_0) + [CONCEPT] "GraphRAG pipeline" (confidence=match_exact, page=0, id=node_12) + [CONCEPT] "GraphRAG (Global)" (confidence=match_exact, page=0, id=node_15) +``` + +#### Tool 2: `get_neighbors` + +| 项目 | 规范 | +|------|------| +| 入参 | `entity_name: str`,`hops: int = 1`(范围 1-3) | +| 匹配逻辑 | 子串匹配找起始节点,取 `candidates[0]` | +| 遍历算法 | `nx.single_source_shortest_path_length(G, node_id, cutoff=hops)` | +| 返回格式 | 按 hop 分组,每组 `[{type}] {name}`,每组最多 20 条 | +| 未找到时 | 返回提示,建议先用 `search_entities` | + +**实际调用示例:** + +``` +输入: entity_name="GraphRAG", hops=1 +输出: +Neighbors of 'GraphRAG' [TECHNOLOGY] within 1 hop(s): + + Hop 1 — 39 related entities: + [CONCEPT] Knowledge Graph Enhanced RAG System + [CONCEPT] retrieval-augmented generation + ... 
+ Total related entities: 39 +``` + +#### Tool 3: `get_entities_by_type` + +| 项目 | 规范 | +|------|------| +| 入参 | `entity_type: str`(自动 `.upper()` 处理) | +| 有效类型 | `TECHNOLOGY`, `CONCEPT`, `PERSON`, `ORGANIZATION`, `LOCATION` | +| 返回格式 | 按 `name` 字母序排列,每行 `• {name} (confidence={c}, page={p})` | +| 无效类型时 | 返回错误 + 图谱中实际存在的类型列表 | + +**实际调用示例:** + +``` +输入: entity_type="TECHNOLOGY" +输出: +TECHNOLOGY entities (4 total): + • GraphRAG (confidence=match_exact, page=0) + • LLMs (confidence=match_exact, page=0) + • LangExtract (confidence=match_exact, page=0) + • MinerU (confidence=match_exact, page=0) +``` + +#### Tool 4: `describe_graph` + +| 项目 | 规范 | +|------|------| +| 入参 | 无参数 | +| 计算指标 | 节点数、边数、关系类型、图密度(`nx.density`)、度中心性(`nx.degree_centrality`) | +| 返回格式 | 结构化文本,包含概览 + 类型分布 + Top-5 中心节点 | + +**实际调用示例(实测输出):** + +``` +=== Knowledge Graph Overview === + Nodes (entities): 40 + Edges (relations): 780 + Relation type: CO_OCCURS_IN (same-page co-occurrence) + Graph density: 1.0000 + + Entity type distribution: + CONCEPT : 36 + TECHNOLOGY : 4 + + Top-5 most connected entities (by degree centrality): + [TECHNOLOGY] GraphRAG (centrality=1.000) + [CONCEPT] Knowledge Graph Enhanced RAG System (centrality=1.000) + [CONCEPT] retrieval-augmented generation (centrality=1.000) + [CONCEPT] knowledge graphs (centrality=1.000) + [CONCEPT] large language models (centrality=1.000) +``` + +--- + +## 三、MinerU ↔ Agentic-RAG 对接规范与核心架构 + +### 3.1 全链路技术架构 + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ 阶段一:文档解析(MinerU Cloud API) │ +│ │ +│ PDF 文件 │ +│ │ POST /file-urls/batch (enable_table=True, language="en") │ +│ ├─ PUT {presigned_url}(裸上传,不带 Content-Type) │ +│ └─ GET /extract-results/batch/{batch_id}(轮询 done) │ +│ ↓ │ +│ full_zip_url → 解压 → {uuid}_content_list.json │ +│ │ +│ 关键输出字段:type, text, text_level, table_body, page_idx, bbox │ +└─────────────────────────────────────────────────────────────────────┘ + ↓ 
+┌─────────────────────────────────────────────────────────────────────┐ +│ 阶段二:知识图谱构建(Bridge Pipeline) │ +│ │ +│ content_list.json │ +│ │ text_assembler.py │ +│ ├─ text blocks → .rstrip() 拼接 │ +│ ├─ table blocks → BeautifulSoup HTML → pipe 分隔文本 │ +│ └─ PageText(page_idx, text, block_spans) │ +│ ↓ │ +│ entity_extractor.py (LangExtract + DeepSeek) │ +│ ↓ │ +│ kg_builder.py (去重 + CO_OCCURS_IN 边) │ +│ ↓ │ +│ kg_nodes.json (40 nodes) + kg_edges.json (780 edges) │ +└─────────────────────────────────────────────────────────────────────┘ + ↓ +┌─────────────────────────────────────────────────────────────────────┐ +│ 阶段三:Agentic-RAG 问答(LangChain + LangGraph) │ +│ │ +│ kg_nodes.json → NetworkX.G.add_node(**node) │ +│ kg_edges.json → NetworkX.G.add_edge(source, target, **edge) │ +│ │ +│ @tool search_entities ← 子串匹配 │ +│ @tool get_neighbors ← BFS N-hop 遍历 │ +│ @tool get_entities_by_type ← 类型过滤 │ +│ @tool describe_graph ← 图统计 │ +│ ↓ │ +│ create_agent(ChatOpenAI("deepseek-chat"), tools, system_prompt) │ +│ ↓ │ +│ ReAct 推理循环(think → tool_call → observe → repeat) │ +│ ↓ │ +│ 自然语言答案(AIMessage.content) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### 3.2 MinerU → KG 关键参数对接 + +| MinerU 输出字段 | Bridge Pipeline 处理 | Agentic-RAG 使用 | +|---------------|-------------------|----------------| +| `block["type"]` | 区分 `text`/`table`/`image` | 不直接使用(已由 Bridge 转换) | +| `block["text"]` | `.rstrip()` 后加入 PageText | 已内化为 `node["name"]` | +| `block["table_body"]` | BeautifulSoup → pipe 分隔文本 | 已内化为实体描述 | +| `block["page_idx"]` | 分组依据,记入 BlockSpan | `node["page"]` 字段 | +| `block["bbox"]` | 记录字符偏移位置 | `node["char_start"]` / `node["char_end"]` | +| `{uuid}_content_list.json 文件名` | UUID 作为 `source_doc_id` | `node["source_doc"]` / `edge["doc_id"]` | + +### 3.3 NetworkX 图构建规范 + +```python +import networkx as nx + +G = nx.Graph() # 无向图(CO_OCCURS_IN 关系无方向) + +# 节点:来自 kg_nodes.json +for node in kg_nodes: + G.add_node( + node["id"], # 主键:node_0, node_1, ... 
+ **node # 所有字段作为节点属性 + ) + +# 边:来自 kg_edges.json +for edge in kg_edges: + G.add_edge( + edge["source"], # node_0 + edge["target"], # node_1 + relation=edge["relation"], # "CO_OCCURS_IN" + doc_id=edge["doc_id"], # UUID + page=edge["page"], # 0-indexed + ) +``` + +**图属性:** + +| 属性 | 实测值 | 说明 | +|------|--------|------| +| `G.number_of_nodes()` | `40` | 去重实体数 | +| `G.number_of_edges()` | `780` | CO_OCCURS_IN 边数 | +| `nx.density(G)` | `1.0` | 完全图(单页文档所有节点两两连接) | +| `G.nodes[nid]` | `dict` | 节点属性字典(id, name, type, page, confidence, ...) | + +### 3.4 MinerU API 关键参数(与 Agentic-RAG 相关部分) + +| 参数 | 推荐值 | 影响 Agentic-RAG 的原因 | +|------|--------|----------------------| +| `enable_table` | `True` | 表格被解析为 HTML ``,Bridge 转为文本参与实体抽取,影响 KG 节点质量 | +| `enable_formula` | `True`(默认) | 公式以 LaTeX 内联写入文本,影响文本纯净度,可能产生噪声实体 | +| `language` | `"en"` / `"ch"` | 影响 OCR 精度,直接影响文本质量和实体对齐率 | +| `model_version` | `"pipeline"` | 输出 `{uuid}_content_list.json`,Bridge 通过 glob `*_content_list.json` 匹配 | +| `page_ranges` | 按需设置 | 多页文档可分批处理,减少每批实体数和边数规模 | + +### 3.5 Agent 系统扩展点 + +当 KG 数据更新后(新文档接入),Agentic-RAG 只需**重新加载 JSON 文件**,不需要重新构建 agent: + +```python +# 动态重载 KG(新文档处理完成后) +G.clear() +G = _load_kg() # 重新读取 kg_nodes.json + kg_edges.json +# agent 实例无需重建,tools 引用同一 G 对象 +``` + +--- + +## 四、问答流程最终数据返回格式规范 + +### 4.1 invoke 完整返回结构 + +```python +result = agent.invoke({"messages": [("human", question)]}) +# result 类型: dict +# result.keys(): ["messages"] +``` + +`result["messages"]` 是一个有序列表,包含完整的对话历史: + +```python +[ + HumanMessage, # 用户输入 + AIMessage, # 工具调用决策(可能多轮) + ToolMessage, # 工具执行结果(可能多轮) + ... 
# 可能有多轮 AIMessage + ToolMessage + AIMessage, # 最终答案(tool_calls=[]) +] +``` + +### 4.2 HumanMessage 格式 + +```python +HumanMessage( + content="What technology entities are in the knowledge graph?", + additional_kwargs={}, + response_metadata={}, + id="uuid-string", # 自动生成 +) +``` + +### 4.3 AIMessage(工具调用)格式 + +```python +AIMessage( + content="", # 内容为空(LLM 决策调用工具) + additional_kwargs={ + "tool_calls": [ + { + "id": "call_abc123", + "type": "function", + "function": { + "name": "get_entities_by_type", + "arguments": "{\"entity_type\": \"TECHNOLOGY\"}" + } + } + ] + }, + tool_calls=[ + { + "name": "get_entities_by_type", + "args": {"entity_type": "TECHNOLOGY"}, + "id": "call_abc123", + "type": "tool_call", + } + ], + response_metadata={ + "model_name": "deepseek-chat", + "finish_reason": "tool_calls", + "usage": { + "prompt_tokens": 580, + "completion_tokens": 18, + "total_tokens": 598, + } + }, +) +``` + +### 4.4 ToolMessage 格式 + +```python +ToolMessage( + content="TECHNOLOGY entities (4 total):\n • GraphRAG ...\n • LLMs ...", + tool_call_id="call_abc123", # 与 AIMessage.tool_calls[i].id 对应 + name="get_entities_by_type", # 工具名称 + additional_kwargs={}, + response_metadata={}, +) +``` + +### 4.5 AIMessage(最终答案)格式 + +```python +AIMessage( + content="## Technology Entities in the Knowledge Graph\n\n1. 
**GraphRAG** ...", + additional_kwargs={ + "tool_calls": [] # 空列表,表示无更多工具调用 + }, + tool_calls=[], + response_metadata={ + "model_name": "deepseek-chat", + "finish_reason": "stop", + "usage": { + "prompt_tokens": 820, + "completion_tokens": 350, + "total_tokens": 1170, + } + }, + id="msg-uuid-string", +) +``` + +### 4.6 最终答案提取规范 + +```python +# 标准提取方式 +final_msg = result["messages"][-1] # 最后一条消息必为最终 AIMessage +answer: str = final_msg.content # 自然语言答案 + +# 安全提取方式(防御性编程) +answer = ( + final_msg.content + if hasattr(final_msg, "content") + else str(final_msg) +) +``` + +### 4.7 推荐封装数据格式 + +业务层调用时建议封装为以下结构,便于下游使用: + +```python +from dataclasses import dataclass +from typing import Any + +@dataclass +class AgenticRAGResponse: + question: str # 用户原始问题 + answer: str # 最终答案(Markdown 格式) + tool_calls: list[dict] # 工具调用链记录 + total_messages: int # 对话轮次(含 human/ai/tool 全部) + token_usage: dict[str, int] # Token 用量统计 + kg_stats: dict[str, Any] # KG 规模信息 +``` + +**填充示例:** + +```python +def run_query_with_metadata(question: str) -> AgenticRAGResponse: + result = agent.invoke({"messages": [("human", question)]}) + messages = result["messages"] + + # 提取工具调用链 + tool_calls = [] + for msg in messages: + if hasattr(msg, "tool_calls") and msg.tool_calls: + for tc in msg.tool_calls: + tool_calls.append({ + "tool": tc["name"], + "args": tc["args"], + "call_id": tc["id"], + }) + + # Token 统计(来自最后一条 AIMessage) + last_ai = messages[-1] + usage = last_ai.response_metadata.get("usage", {}) + + return AgenticRAGResponse( + question=question, + answer=messages[-1].content, + tool_calls=tool_calls, + total_messages=len(messages), + token_usage={ + "prompt_tokens": usage.get("prompt_tokens", 0), + "completion_tokens": usage.get("completion_tokens", 0), + "total_tokens": usage.get("total_tokens", 0), + }, + kg_stats={ + "nodes": G.number_of_nodes(), + "edges": G.number_of_edges(), + "density": nx.density(G), + }, + ) +``` + +### 4.8 实测问答响应样例 + +#### 样例 1:T1-Overview(图谱概览类问题) + +``` +问题:Give me an 
overview of the knowledge graph. + What types of entities does it contain and which entities are most central? + +工具调用链: + [1] describe_graph() + +最终答案(节选): + ## Knowledge Graph Overview + **Structure:** + - 40 entities (nodes) connected by 780 edges + - All edges represent CO_OCCURS_IN relationships + - Graph density: 1.000 — fully connected graph + + **Entity Types:** + 1. TECHNOLOGY (4): GraphRAG, LLMs, LangExtract, MinerU + 2. CONCEPT (36): retrieval-augmented generation, knowledge graphs, ... + + **Most Central Entities (centrality=1.000):** + 1. [TECHNOLOGY] GraphRAG + 2. [CONCEPT] Knowledge Graph Enhanced RAG System + ... + +消息轮次:4 条(human + ai_tool_call + tool_result + ai_final) +Token 用量:约 900 tokens +``` + +#### 样例 2:T3-MultiHop(多跳推理类问题) + +``` +问题:What concepts and technologies are most closely related to GraphRAG? + Explore the graph neighborhood and explain the connections. + +工具调用链: + [1] search_entities(query="GraphRAG") + [2] get_neighbors(entity_name="GraphRAG", hops=1) + +最终答案(节选): + ## Concepts and Technologies Most Closely Related to GraphRAG + + ### Core Technologies (Directly Connected): + 1. LLMs — The foundation models that GraphRAG enhances + 2. LangExtract — Used for language extraction in the pipeline + 3. MinerU — Part of the data processing ecosystem + + ### Key Concepts: + - Knowledge Graph Enhanced RAG System (overarching architecture) + - retrieval-augmented generation (core paradigm) + - multi-hop reasoning (key capability) + ... 
+ +消息轮次:6 条(human + 2×ai_tool_call + 2×tool_result + ai_final) +Token 用量:约 1,200 tokens +``` + +### 4.9 错误与边界情况处理 + +| 情况 | Agent 行为 | 返回内容 | +|------|------------|---------| +| 实体不存在 | 工具返回提示 + 样例实体名 | Agent 改写查询或给出不确定性说明 | +| 类型不合法 | 工具返回有效类型列表 | Agent 自动纠正并重试 | +| 问题超出 KG 范围 | 无工具调用结果支撑 | Agent 如实说明 "信息不在当前 KG 中" | +| Token 超限 | LangChain 内部截断 | 减少 `hops` 或缩短问题 | + +--- + +## 五、虚拟环境与依赖 + +### 5.1 运行环境 + +| 项目 | 值 | +|------|-----| +| 虚拟环境 | `F:\GraphRAGAgent\langextract_src\.venv\`(复用 Bridge Pipeline 的 venv) | +| Python 版本 | 3.12 | +| 安装方式 | uv | + +### 5.2 Agentic-RAG 新增依赖 + +| 包 | 版本(实测) | 用途 | +|----|------------|------| +| `langchain` | 1.2.10 | `@tool` 装饰器、`create_agent` | +| `langchain-openai` | latest | `ChatOpenAI`(DeepSeek 适配) | +| `langgraph` | latest | `create_agent` 底层运行时 | +| `networkx` | latest | KG 图构建、BFS 遍历、中心性计算 | + +### 5.3 完整依赖安装 + +```bash +uv pip install langchain langchain-openai langgraph networkx \ + --python F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe +``` + +### 5.4 环境变量 + +`F:\GraphRAGAgent\graphrag_pipeline\.env`: + +```env +DEEPSEEK_API_KEY=sk-xxxxxxxxxxxxxxxx +DEEPSEEK_BASE_URL=https://api.deepseek.com +``` + +--- + +## 附录:各阶段文件依赖速查 + +| 阶段 | 输入 | 输出 | 关键脚本 | +|------|------|------|---------| +| MinerU 解析 | `*.pdf` | `{uuid}_content_list.json` | `mineru_mvp/pipeline.py` | +| Bridge Pipeline | `*_content_list.json` | `kg_nodes.json` + `kg_edges.json` | `graphrag_pipeline/bridge.py` | +| Agentic-RAG | `kg_nodes.json` + `kg_edges.json` | 自然语言答案 | `graphrag_pipeline/agentic_rag_mvp.py` | + +| 规范文档 | 覆盖范围 | +|---------|---------| +| `docs/mineru_specification-v1.0.md` | MinerU 解析阶段输入/输出 | +| `docs/langextract_specification-v1.0.md` | LangExtract 实体抽取参数 | +| `docs/bridge_pipeline_specification-v1.0.md` | Bridge Pipeline 对接规范与 KG 输出格式 | +| `docs/agentic_rag_specification-v1.0.md` | **本文件** — Agentic-RAG 问答阶段规范 | diff --git a/docs/backend_service_specification-v1.0.md b/docs/backend_service_specification-v1.0.md new file mode 
100644 index 0000000..5373e80 --- /dev/null +++ b/docs/backend_service_specification-v1.0.md @@ -0,0 +1,1757 @@ +# 多模态 RAG 后端服务接口规范 v1.0 + +> 基于 MinerU + LangExtract Bridge Pipeline + Agentic-RAG MVP 实测验证结果 +> Web 框架:FastAPI (Python 3.12 async) +> 存储方案:纯文件系统(JSON) +> 更新日期:2026-03-05 + +--- + +## 目录 + +- [一、系统架构总览](#一系统架构总览) + - [1.1 四层架构](#11-四层架构) + - [1.2 双 venv 协调方案](#12-双-venv-协调方案) + - [1.3 完整数据流](#13-完整数据流) + - [1.4 Job 状态机](#14-job-状态机) + - [1.5 FastAPI 项目目录结构](#15-fastapi-项目目录结构) + - [1.6 文件系统存储结构](#16-文件系统存储结构) +- [二、统一响应封装格式](#二统一响应封装格式) + - [2.1 通用响应结构](#21-通用响应结构) + - [2.2 错误码体系](#22-错误码体系) +- [三、核心数据对象 Schema](#三核心数据对象-schema) + - [3.1 DocumentInfo](#31-documentinfo) + - [3.2 IndexingJobStatus](#32-indexingjobstatus) + - [3.3 KGNode](#33-kgnode) + - [3.4 KGEdge](#34-kgedge) + - [3.5 ExtractionRecord](#35-extractionrecord) + - [3.6 QAResult](#36-qaresult) +- [四、A 组:文档管理(4 个端点)](#四a-组文档管理4-个端点) +- [五、B 组:Indexing Pipeline(4 个端点)](#五b-组indexing-pipeline4-个端点) +- [六、C 组:知识图谱(6 个端点)](#六c-组知识图谱6-个端点) +- [七、D 组:QA 问答(4 个端点)](#七d-组qa-问答4-个端点) +- [八、E 组:搜索(3 个端点)](#八e-组搜索3-个端点) +- [九、F 组:系统(4 个端点)](#九f-组系统4-个端点) +- [十、文件格式支持矩阵](#十文件格式支持矩阵) +- [十一、依赖与运行](#十一依赖与运行) + +--- + +## 一、系统架构总览 + +### 1.1 四层架构 + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ 客户端层 │ +│ 浏览器 / API 调用方 / 可视化前端 │ +└──────────────────────────────┬──────────────────────────────────────┘ + │ HTTP/HTTPS +┌──────────────────────────────▼──────────────────────────────────────┐ +│ API 网关层 │ +│ Nginx 反向代理 | 限流(per-IP/per-key) | 请求日志 | TLS 终止 │ +└──────────────────────────────┬──────────────────────────────────────┘ + │ +┌──────────────────────────────▼──────────────────────────────────────┐ +│ 服务层 — FastAPI Application │ +│ Python 3.12 async / uvicorn │ +│ │ +│ ┌────────────────┐ ┌────────────────┐ ┌───────────────────────┐ │ +│ │ DocumentService│ │ IndexingService│ │ KGService │ │ +│ │ 文件上传/管理 │ │ Pipeline 调度 │ │ NetworkX 图操作 │ │ +│ └────────────────┘ 
└────────────────┘ └───────────────────────┘ │ +│ ┌────────────────┐ ┌────────────────┐ ┌───────────────────────┐ │ +│ │ QAService │ │ SearchService │ │ SystemService │ │ +│ │ Agentic-RAG │ │ 实体/图谱搜索 │ │ 健康检查 / 统计 │ │ +│ └────────────────┘ └────────────────┘ └───────────────────────┘ │ +└──────────────────────────────┬──────────────────────────────────────┘ + │ +┌──────────────────────────────▼──────────────────────────────────────┐ +│ Pipeline 执行层 │ +│ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ MinerU Pipeline(subprocess → mineru_mvp/.venv) │ │ +│ │ 输入: 文件路径 输出: *content_list.json + layout.json │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ Bridge Pipeline(直接 import → langextract_src/.venv) │ │ +│ │ text_assembler → entity_extractor → kg_builder │ │ +│ │ 输出: kg_nodes.json + kg_edges.json │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ Agentic-RAG(LangChain create_agent → langextract_src/.venv)│ │ +│ │ 工具: search_entities / get_neighbors / get_entities_by_type │ │ +│ │ describe_graph │ │ +│ │ LLM: DeepSeek deepseek-chat via ChatOpenAI │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +└──────────────────────────────┬──────────────────────────────────────┘ + │ +┌──────────────────────────────▼──────────────────────────────────────┐ +│ 存储层(纯文件系统) │ +│ uploads/ ← 原始上传文件 │ +│ jobs/{job_id}/ ← 每个 job 的中间产物和结果 JSON │ +│ kg/ ← 全局合并的 KG(kg_nodes.json + kg_edges.json) │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### 1.2 双 venv 协调方案 + +项目中存在两个隔离的 Python 虚拟环境,FastAPI 服务通过以下方式协调: + +| 组件 | 虚拟环境 | 调用方式 | +|------|---------|---------| +| **FastAPI 服务本体** | `langextract_src/.venv` | 直接运行 | +| **Bridge Pipeline** | `langextract_src/.venv` | `from text_assembler import ...` 直接 import | +| 
**Agentic-RAG** | `langextract_src/.venv` | `from agentic_rag_mvp import ...` 直接 import | +| **MinerU Pipeline** | `mineru_mvp/.venv` | `subprocess.run([MINERU_PYTHON, MINERU_PIPELINE, pdf_path])` | + +```python +# 双 venv 协调核心代码 +MINERU_PYTHON = Path("F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe") +MINERU_PIPELINE = Path("F:/GraphRAGAgent/mineru_mvp/pipeline.py") + +# Stage 1: MinerU — subprocess 隔离调用 +result = subprocess.run( + [str(MINERU_PYTHON), str(MINERU_PIPELINE), str(pdf_path)], + cwd=str(MINERU_PIPELINE.parent), capture_output=True, text=True, timeout=600 +) + +# Stage 2-4: Bridge + RAG — 直接 import(同 venv) +from text_assembler import load_content_list, assemble_pages +from entity_extractor import create_model, extract_entities +from kg_builder import build_kg +``` + +### 1.3 完整数据流 + +``` +上传文件(PDF/DOCX/PPT/PNG/JPG/HTML) + │ + ▼ POST /api/v1/documents/upload +DocumentService: 保存到 uploads/{doc_id}_{filename} + │ + ▼ POST /api/v1/index/start +IndexingService: 启动后台 threading.Thread + │ + ├─ Stage: parsing + │ MinerU subprocess → mineru_mvp/output/{stem}/*_content_list.json + │ + ├─ Stage: extracting + │ text_assembler.assemble_pages() → PageText[] + │ entity_extractor.extract_entities() → AnnotatedDocument[] + │ → ExtractionRecord[] 保存到 jobs/{job_id}/extractions.json + │ + ├─ Stage: indexing + │ kg_builder.build_kg() → KGNode[] + KGEdge[] + │ → 保存到 jobs/{job_id}/kg_nodes.json + kg_edges.json + │ → 合并到全局 kg/kg_nodes.json + kg/kg_edges.json + │ + └─ Status: done + GET /api/v1/index/result/{job_id} → 完整结果 + +用户查询(自然语言问题) + │ + ▼ POST /api/v1/query +QAService: 加载全局 KG → NetworkX Graph + │ + ├─ LangChain create_agent(DeepSeek) + │ ReAct 循环: think → tool_call → observe → repeat + │ 工具调用链: search_entities / get_neighbors / ... 
+ │ + └─ QAResult: answer + tool_calls + cited_nodes +``` + +### 1.4 Job 状态机 + +``` + ┌─────────┐ + │submitted│ + └────┬────┘ + │ 后台线程启动 + ┌────▼────┐ + │ queued │ (等待线程池,当前实现立即转 parsing) + └────┬────┘ + │ MinerU subprocess 开始 + ┌────▼────┐ + │ parsing │ MinerU 云端 API 解析 + └────┬────┘ + │ content_list.json 就绪 + ┌─────▼──────┐ + │ extracting │ LangExtract + DeepSeek 实体抽取 + └─────┬──────┘ + │ extractions.json 就绪 + ┌─────▼──────┐ + │ indexing │ kg_builder 构建知识图谱 + └─────┬──────┘ + │ kg_nodes/edges 就绪 + ┌──────────▼──────────┐ + ┌─────▼─────┐ ┌──────▼──────┐ + │ done │ │ failed │ + └───────────┘ └─────────────┘ +``` + +**进度字段说明(`progress` 对象):** + +| 阶段 | `parsed_pages` | `total_pages` | `extracted_entities` | +|------|----------------|---------------|----------------------| +| parsing | 实时更新(MinerU 进度) | MinerU 返回总页数 | 0 | +| extracting | total_pages | total_pages | 实时累加 | +| indexing | total_pages | total_pages | 最终值 | +| done | total_pages | total_pages | 最终值 | + +### 1.5 FastAPI 项目目录结构 + +``` +F:\GraphRAGAgent\graphrag_pipeline\ +├── api_server.py # FastAPI 主入口(app 实例、路由注册、启动配置) +├── routers/ +│ ├── __init__.py +│ ├── documents.py # A 组:文档管理(4 个端点) +│ ├── indexing.py # B 组:Indexing Pipeline(4 个端点) +│ ├── kg.py # C 组:知识图谱(6 个端点) +│ ├── query.py # D 组:QA 问答(4 个端点) +│ ├── search.py # E 组:搜索(3 个端点) +│ └── system.py # F 组:系统(4 个端点) +├── services/ +│ ├── __init__.py +│ ├── document_service.py # 文件保存、元数据读写 +│ ├── indexing_service.py # Pipeline 调度(MinerU subprocess + Bridge import) +│ ├── kg_service.py # NetworkX 图加载、BFS、中心性计算 +│ ├── qa_service.py # create_agent 封装、ReAct 调用、结果解析 +│ └── search_service.py # 实体搜索、路径搜索、子图搜索 +├── models/ +│ ├── __init__.py +│ └── schemas.py # Pydantic v2 models(所有数据对象 Schema) +├── storage/ +│ ├── __init__.py +│ └── file_store.py # 统一文件读写(JSON 序列化/反序列化、目录管理) +├── .env # DEEPSEEK_API_KEY + DEEPSEEK_BASE_URL + MINERU_API_TOKEN +│ +│ # 现有文件(不修改) +├── bridge.py +├── text_assembler.py +├── entity_extractor.py +├── kg_builder.py +├── 
agentic_rag_mvp.py +├── web_server.py # 旧 Flask 原型(保留,不删除) +└── output/ + ├── kg_nodes.json # 向后兼容的全局 KG(与 kg/ 目录同步) + └── kg_edges.json +``` + +### 1.6 文件系统存储结构 + +``` +F:\GraphRAGAgent\graphrag_pipeline\ +│ +├── uploads/ +│ └── {doc_id}_{filename} # 上传的原始文件(如 abc12345_paper.pdf) +│ +├── jobs/ +│ └── {job_id}/ +│ ├── meta.json # job 元数据 +│ │ { +│ │ "job_id": "job_xyz789", +│ │ "doc_id": "abc12345", +│ │ "status": "done", +│ │ "stage": "Complete", +│ │ "progress": {...}, +│ │ "created_at": "ISO8601", +│ │ "elapsed_seconds": 42.1, +│ │ "error": null, +│ │ "pdf_name": "paper.pdf", +│ │ "pdf_path": "uploads/abc12345_paper.pdf" +│ │ } +│ ├── mineru_output/ # MinerU 解析产物(原样保留) +│ │ ├── {uuid}_content_list.json +│ │ ├── layout.json +│ │ ├── full.md +│ │ ├── {uuid}_origin.pdf +│ │ └── images/ +│ │ └── {sha256}.jpg +│ ├── extractions.json # LangExtract 全部抽取记录(ExtractionRecord[]) +│ ├── kg_nodes.json # 本 job 生成的 KG 节点(KGNode[]) +│ └── kg_edges.json # 本 job 生成的 KG 边(KGEdge[]) +│ +└── kg/ + ├── kg_nodes.json # 全局合并的 KG 节点(所有 job 合并去重) + └── kg_edges.json # 全局合并的 KG 边(所有 job 合并去重) +``` + +--- + +## 二、统一响应封装格式 + +### 2.1 通用响应结构 + +所有 API 端点均使用以下统一包装格式: + +```json +{ + "code": 0, + "msg": "success", + "request_id": "f47ac10b-58cc-4372-a567-0e02b2c3d479", + "data": { ... 
} +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `code` | `int` | `0` = 成功;非 `0` = 失败(见错误码表) | +| `msg` | `string` | 状态描述(成功为 `"success"`,失败为错误信息) | +| `request_id` | `string` | UUID v4,用于日志追踪 | +| `data` | `object \| null` | 业务数据(失败时为 `null`) | + +**HTTP 状态码映射:** + +| HTTP 状态码 | 适用场景 | +|------------|---------| +| `200 OK` | 同步请求成功 | +| `202 Accepted` | 异步任务已接受(Job 启动) | +| `400 Bad Request` | 参数校验失败或业务状态不满足(code 1001–1004/2002–2004/3002) | +| `404 Not Found` | 资源不存在(code 2001/3001) | +| `500 Internal Server Error` | 服务器内部错误(code 4001/5000) | + +**FastAPI Pydantic 响应模型:** + +```python +from pydantic import BaseModel, Field +from typing import Generic, TypeVar, Optional +import uuid + +T = TypeVar("T") + +class APIResponse(BaseModel, Generic[T]): + code: int = 0 + msg: str = "success" + request_id: str = Field(default_factory=lambda: str(uuid.uuid4())) + data: Optional[T] = None +``` + +### 2.2 错误码体系 + +| code | HTTP 状态码 | 含义 | 说明 | +|------|------------|------|------| +| `0` | 200 | 成功 | | +| `1001` | 400 | 参数校验失败 | 缺少必填字段或类型错误 | +| `1002` | 400 | 文件格式不支持 | 仅支持 pdf/docx/doc/pptx/ppt/png/jpg/jpeg/html | +| `1003` | 400 | 文件超出大小限制 | 单文件最大 200MB(MinerU 限制) | +| `1004` | 400 | 文件页数超限 | 单文件最大 600 页(MinerU 限制) | +| `2001` | 404 | 文档不存在 | `doc_id` 对应的文档未找到 | +| `2002` | 400 | Job 不存在 | `job_id` 对应的任务未找到 | +| `2003` | 400 | Job 仍在执行 | 请求结果时任务尚未完成 | +| `2004` | 400 | Job 状态不可取消 | 仅 submitted/queued 可取消 | +| `3001` | 404 | KG 节点不存在 | `node_id` 对应节点未找到 | +| `3002` | 400 | KG 为空 | 尚未完成任何 Indexing,无图谱数据 | +| `4001` | 500 | QA 服务异常 | LangChain Agent 或 DeepSeek API 调用失败 | +| `5000` | 500 | 服务器内部错误 | 未预期的系统异常 | + +**错误响应示例:** + +```json +{ + "code": 1002, + "msg": "Unsupported file format: .xlsx. 
Supported formats: pdf, docx, doc, pptx, ppt, png, jpg, jpeg, html", + "request_id": "f47ac10b-58cc-4372-a567-0e02b2c3d479", + "data": null +} +``` + +--- + +## 三、核心数据对象 Schema + +### 3.1 DocumentInfo + +文档元数据对象,由 `POST /api/v1/documents/upload` 创建,持久化到 `jobs/` 下的 `meta.json`。 + +```json +{ + "doc_id": "abc12345", + "filename": "graphrag_overview.pdf", + "format": "pdf", + "size_bytes": 1048576, + "pages": 4, + "uploaded_at": "2026-03-05T10:00:00Z", + "status": "indexed", + "language": "en", + "enable_formula": true, + "enable_table": true +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `doc_id` | `string` | 文档唯一 ID(UUID hex 前 8 位,如 `"abc12345"`) | +| `filename` | `string` | 原始文件名 | +| `format` | `string` | 文件格式(小写扩展名,不含点) | +| `size_bytes` | `int` | 文件大小(字节) | +| `pages` | `int \| null` | 总页数(MinerU 解析后填充;上传时为 `null`) | +| `uploaded_at` | `string` | ISO 8601 上传时间 | +| `status` | `string` | `"uploaded"` / `"indexed"` / `"failed"` | +| `language` | `string` | OCR 语言码(PaddleOCR,默认 `"ch"`) | +| `enable_formula` | `bool` | 是否启用公式识别 | +| `enable_table` | `bool` | 是否启用表格识别 | + +### 3.2 IndexingJobStatus + +Indexing Pipeline 的任务状态对象。 + +```json +{ + "job_id": "job_xyz789", + "doc_id": "abc12345", + "status": "extracting", + "stage": "Extracting entities (LangExtract + DeepSeek)...", + "progress": { + "parsed_pages": 4, + "total_pages": 4, + "extracted_entities": 23 + }, + "created_at": "2026-03-05T10:00:05Z", + "elapsed_seconds": 18.3, + "error": null +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `job_id` | `string` | 任务唯一 ID(`"job_"` + UUID hex 前 8 位) | +| `doc_id` | `string` | 关联文档 ID | +| `status` | `string` | 状态枚举(见 1.4 状态机) | +| `stage` | `string` | 当前阶段人类可读描述 | +| `progress.parsed_pages` | `int` | 已解析页数 | +| `progress.total_pages` | `int` | 总页数(0 = 未知) | +| `progress.extracted_entities` | `int` | 已抽取实体数 | +| `created_at` | `string` | ISO 8601 任务创建时间 | +| `elapsed_seconds` | `float` | 已耗时(秒) | +| `error` | `string \| null` | 错误信息(失败时非 null) | + +### 
3.3 KGNode + +知识图谱节点,直接对应 `kg_nodes.json` 格式,新增 `degree` 字段。 + +```json +{ + "id": "tech_graphrag_0", + "name": "GraphRAG", + "type": "TECHNOLOGY", + "source_doc": "abc12345", + "char_start": 0, + "char_end": 8, + "confidence": "match_exact", + "page": 0, + "degree": 39 +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `id` | `string` | 节点唯一 ID(来自 kg_nodes.json) | +| `name` | `string` | 实体名称 | +| `type` | `string` | 实体类型:`TECHNOLOGY` / `CONCEPT` / `PERSON` / `ORGANIZATION` / `LOCATION` | +| `source_doc` | `string` | 来源文档 ID(doc_id) | +| `char_start` | `int` | 实体在原文中的起始字符位置(LangExtract `char_interval.start_pos`) | +| `char_end` | `int` | 实体在原文中的结束字符位置(不含,`char_interval.end_pos`) | +| `confidence` | `string` | LangExtract 对齐状态:`match_exact` / `match_greater` / `match_lesser` / `match_fuzzy` | +| `page` | `int` | 所在页码(0-indexed,来自 MinerU content_list.json `page_idx`) | +| `degree` | `int` | 节点度数(连接边数,NetworkX 计算,仅 API 返回时填充) | + +### 3.4 KGEdge + +知识图谱边,直接对应 `kg_edges.json` 格式。 + +```json +{ + "source": "tech_graphrag_0", + "target": "concept_knowledgegraph_1", + "relation": "CO_OCCURS_IN", + "doc_id": "abc12345", + "page": 0 +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `source` | `string` | 起始节点 ID | +| `target` | `string` | 目标节点 ID | +| `relation` | `string` | 关系类型(当前固定为 `"CO_OCCURS_IN"`,表示同页共现) | +| `doc_id` | `string` | 边来源文档 ID | +| `page` | `int` | 共现所在页码(0-indexed) | + +### 3.5 ExtractionRecord + +LangExtract 单条实体抽取记录,对应 `AnnotatedDocument.extractions[]` 的扁平化结构。 + +```json +{ + "text": "GraphRAG", + "type": "TECHNOLOGY", + "char_start": 0, + "char_end": 8, + "alignment": "match_exact", + "page": 0, + "doc_id": "abc12345" +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `text` | `string` | 实体文本(`extraction_text`,原文子串) | +| `type` | `string` | 实体类型(`extraction_class`) | +| `char_start` | `int \| null` | 字符起始位置(`char_interval.start_pos`) | +| `char_end` | `int \| null` | 字符结束位置(`char_interval.end_pos`,不含) | +| `alignment` | `string \| null` | 
对齐状态(`alignment_status.value`,`null` 表示未对齐) | +| `page` | `int` | 所在页码(0-indexed) | +| `doc_id` | `string` | 来源文档 ID | + +> **过滤规则**:KG 构建时过滤掉 `alignment = null`(未对齐),`match_fuzzy` 根据项目配置可选是否过滤。当前实测:`match_exact` 占 94%+。 + +### 3.6 QAResult + +Agentic-RAG 问答返回对象,包含答案 + 完整推理溯源链。 + +```json +{ + "query_id": "q_20260305_001", + "question": "What is GraphRAG and how does it relate to knowledge graphs?", + "answer": "GraphRAG is a knowledge graph-enhanced retrieval-augmented generation system...", + "tool_calls": [ + { + "tool": "search_entities", + "input": {"query": "GraphRAG"}, + "output": "Found 1 entity(ies) matching 'GraphRAG':\n [TECHNOLOGY] \"GraphRAG\" (confidence=match_exact, page=0, id=tech_graphrag_0)" + }, + { + "tool": "get_neighbors", + "input": {"entity_name": "GraphRAG", "hops": 1}, + "output": "Neighbors of 'GraphRAG' [TECHNOLOGY] within 1 hop(s):\n Hop 1 — 39 related entities:\n [CONCEPT] knowledge graphs\n ..." + } + ], + "cited_nodes": ["tech_graphrag_0", "concept_knowledgegraph_1"], + "elapsed_seconds": 8.4, + "created_at": "2026-03-05T10:30:00Z" +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `query_id` | `string` | 查询唯一 ID | +| `question` | `string` | 用户原始问题 | +| `answer` | `string` | Agent 生成的最终自然语言答案(`result["messages"][-1].content`) | +| `tool_calls` | `array` | ReAct 循环中的工具调用记录(顺序) | +| `tool_calls[].tool` | `string` | 工具名(4 个 KG 工具之一) | +| `tool_calls[].input` | `object` | 工具调用参数 | +| `tool_calls[].output` | `string` | 工具返回的文本结果(ToolMessage.content) | +| `cited_nodes` | `string[]` | 答案中引用的节点 ID 列表(从 tool_calls 解析) | +| `elapsed_seconds` | `float` | 问答总耗时(包括所有 LLM 调用) | +| `created_at` | `string` | ISO 8601 查询时间 | + +--- + +## 四、A 组:文档管理(4 个端点) + +### A1. 
上传文件 + +``` +POST /api/v1/documents/upload +Content-Type: multipart/form-data +``` + +**Request(Form Data):** + +| 字段 | 类型 | 必填 | 默认值 | 说明 | +|------|------|------|--------|------| +| `file` | `binary` | **是** | — | 文件二进制内容 | +| `language` | `string` | 否 | `"ch"` | OCR 语言(PaddleOCR 语言码) | +| `enable_formula` | `bool` | 否 | `true` | 是否启用公式识别 | +| `enable_table` | `bool` | 否 | `true` | 是否启用表格识别 | + +**验证规则:** +- 文件扩展名必须在支持列表中(见第十章) +- 文件大小不得超过 200MB +- 文件名不得包含路径分隔符(防目录穿越) + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "f47ac10b-...", + "data": { + "doc_id": "abc12345", + "filename": "graphrag_overview.pdf", + "format": "pdf", + "size_bytes": 1048576, + "pages": null, + "uploaded_at": "2026-03-05T10:00:00Z", + "status": "uploaded", + "language": "en", + "enable_formula": true, + "enable_table": true + } +} +``` + +**错误响应:** + +```json +// 1002: 格式不支持 +{ "code": 1002, "msg": "Unsupported file format: .xlsx", "data": null } + +// 1003: 超过大小限制 +{ "code": 1003, "msg": "File size 256MB exceeds 200MB limit", "data": null } +``` + +--- + +### A2. 获取文档信息 + +``` +GET /api/v1/documents/{doc_id} +``` + +**Path Params:** + +| 参数 | 类型 | 说明 | +|------|------|------| +| `doc_id` | `string` | 文档 ID | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "doc_id": "abc12345", + "filename": "graphrag_overview.pdf", + "format": "pdf", + "size_bytes": 1048576, + "pages": 4, + "uploaded_at": "2026-03-05T10:00:00Z", + "status": "indexed", + "language": "en", + "enable_formula": true, + "enable_table": true + } +} +``` + +**错误:** `2001` (doc_id 不存在) + +--- + +### A3. 
列出所有文档 + +``` +GET /api/v1/documents +``` + +**Query Params:** + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `page` | `int` | `1` | 页码(从 1 开始) | +| `page_size` | `int` | `20` | 每页数量(最大 100) | +| `status` | `string` | — | 按状态筛选:`uploaded` / `indexed` / `failed` | +| `format` | `string` | — | 按格式筛选:如 `pdf` | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "total": 5, + "page": 1, + "page_size": 20, + "items": [ + { + "doc_id": "abc12345", + "filename": "graphrag_overview.pdf", + "format": "pdf", + "size_bytes": 1048576, + "pages": 4, + "uploaded_at": "2026-03-05T10:00:00Z", + "status": "indexed", + "language": "en", + "enable_formula": true, + "enable_table": true + } + ] + } +} +``` + +--- + +### A4. 删除文档 + +``` +DELETE /api/v1/documents/{doc_id} +``` + +**说明:** 删除文档及其关联的 job 产物文件(`uploads/`、`jobs/` 下的对应目录),并从全局 KG 中移除该文档贡献的节点和边。 + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "deleted": true, + "doc_id": "abc12345", + "removed_nodes": 40, + "removed_edges": 780 + } +} +``` + +**错误:** `2001` (doc_id 不存在) + +--- + +## 五、B 组:Indexing Pipeline(4 个端点) + +### B1. 
启动索引任务 + +``` +POST /api/v1/index/start +Content-Type: application/json +``` + +**Request Body:** + +```json +{ + "doc_id": "abc12345" +} +``` + +| 字段 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `doc_id` | `string` | **是** | 已上传文档的 ID(状态须为 `uploaded`) | + +**Response 202:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "job_id": "job_xyz789", + "doc_id": "abc12345", + "status": "submitted", + "stage": "Job submitted", + "created_at": "2026-03-05T10:00:05Z" + } +} +``` + +**实现说明:** +```python +# IndexingService 内部实现 +def start_indexing(doc_id: str) -> IndexingJobStatus: + job_id = f"job_{uuid.uuid4().hex[:8]}" + job_dir = JOBS_DIR / job_id + job_dir.mkdir(parents=True) + + meta = { "job_id": job_id, "doc_id": doc_id, "status": "submitted", ... } + save_meta(job_dir / "meta.json", meta) + + thread = threading.Thread(target=run_pipeline, args=(job_id,), daemon=True) + thread.start() + return meta +``` + +**Pipeline 执行顺序(后台线程):** + +1. `status = "parsing"` → `subprocess.run([MINERU_PYTHON, MINERU_PIPELINE, pdf_path])` +2. `status = "extracting"` → `load_content_list()` → `assemble_pages()` → `extract_entities()` per page +3. `status = "indexing"` → `build_kg()` → 保存 `jobs/{job_id}/kg_nodes.json` → 合并到 `kg/` +4. `status = "done"` + +--- + +### B2. 
查询任务状态(含实时进度) + +``` +GET /api/v1/index/status/{job_id} +``` + +**推荐轮询间隔:** 3 秒 + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "job_id": "job_xyz789", + "doc_id": "abc12345", + "status": "extracting", + "stage": "Extracting entities page 2/4 (LangExtract + DeepSeek)...", + "progress": { + "parsed_pages": 4, + "total_pages": 4, + "extracted_entities": 23 + }, + "created_at": "2026-03-05T10:00:05Z", + "elapsed_seconds": 18.3, + "error": null + } +} +``` + +**各状态 `stage` 典型值:** + +| status | stage | +|--------|-------| +| `submitted` | `"Job submitted"` | +| `queued` | `"Waiting for worker..."` | +| `parsing` | `"MinerU PDF parsing (cloud API)..."` | +| `extracting` | `"Extracting entities page 2/4 (LangExtract + DeepSeek)..."` | +| `indexing` | `"Building knowledge graph..."` | +| `done` | `"Complete"` | +| `failed` | `"Error: {error message}"` | + +**错误:** `2002` (job_id 不存在) + +--- + +### B3. 获取索引结果(完整数据) + +``` +GET /api/v1/index/result/{job_id} +``` + +**Response 200(status = done):** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "job_id": "job_xyz789", + "doc_id": "abc12345", + "status": "done", + "stats": { + "blocks": 32, + "block_types": {"text": 31, "table": 1}, + "pages": 4, + "raw_extractions": 45, + "nodes": 40, + "edges": 780, + "type_counts": {"TECHNOLOGY": 4, "CONCEPT": 36}, + "alignment_counts": {"match_exact": 40, "match_fuzzy": 5}, + "elapsed_seconds": 42.1 + }, + "extractions": [ + { + "text": "GraphRAG", + "type": "TECHNOLOGY", + "char_start": 0, + "char_end": 8, + "alignment": "match_exact", + "page": 0, + "doc_id": "abc12345" + } + ], + "nodes": [ + { + "id": "tech_graphrag_0", + "name": "GraphRAG", + "type": "TECHNOLOGY", + "source_doc": "abc12345", + "char_start": 0, + "char_end": 8, + "confidence": "match_exact", + "page": 0, + "degree": 39 + } + ], + "edges": [ + { + "source": "tech_graphrag_0", + "target": "concept_knowledgegraph_1", + 
"relation": "CO_OCCURS_IN", + "doc_id": "abc12345", + "page": 0 + } + ] + } +} +``` + +**Response 200(status ≠ done):** 返回 `IndexingJobStatus`(不含 stats/extractions/nodes/edges) + +**错误:** `2002` (job_id 不存在) + +--- + +### B4. 取消任务 + +``` +DELETE /api/v1/index/jobs/{job_id} +``` + +**限制:** 仅 `submitted` 或 `queued` 状态可取消;`parsing`/`extracting`/`indexing` 状态无法中断后台线程,仅标记状态为 `cancelled`。 + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "cancelled": true, + "job_id": "job_xyz789", + "previous_status": "submitted" + } +} +``` + +**错误:** `2002` (不存在), `2004` (状态不可取消) + +--- + +## 六、C 组:知识图谱(6 个端点) + +### C1. 获取所有节点(分页 + 筛选) + +``` +GET /api/v1/kg/nodes +``` + +**Query Params:** + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `type` | `string` | — | 实体类型筛选(大小写不敏感) | +| `doc_id` | `string` | — | 按来源文档筛选 | +| `confidence` | `string` | — | 对齐状态筛选(如 `match_exact`) | +| `page` | `int` | `1` | 页码 | +| `page_size` | `int` | `50` | 每页数量(最大 200) | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "total": 40, + "page": 1, + "page_size": 50, + "items": [ + { + "id": "tech_graphrag_0", + "name": "GraphRAG", + "type": "TECHNOLOGY", + "source_doc": "abc12345", + "char_start": 0, + "char_end": 8, + "confidence": "match_exact", + "page": 0, + "degree": 39 + } + ] + } +} +``` + +**错误:** `3002` (KG 为空) + +--- + +### C2. 
获取所有边(分页) + +``` +GET /api/v1/kg/edges +``` + +**Query Params:** + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `doc_id` | `string` | — | 按来源文档筛选 | +| `relation` | `string` | — | 关系类型筛选(如 `CO_OCCURS_IN`) | +| `page` | `int` | `1` | 页码 | +| `page_size` | `int` | `100` | 每页数量(最大 500) | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "total": 780, + "page": 1, + "page_size": 100, + "items": [ + { + "source": "tech_graphrag_0", + "target": "concept_knowledgegraph_1", + "relation": "CO_OCCURS_IN", + "doc_id": "abc12345", + "page": 0 + } + ] + } +} +``` + +--- + +### C3. 获取单个节点详情 + +``` +GET /api/v1/kg/nodes/{node_id} +``` + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "id": "tech_graphrag_0", + "name": "GraphRAG", + "type": "TECHNOLOGY", + "source_doc": "abc12345", + "char_start": 0, + "char_end": 8, + "confidence": "match_exact", + "page": 0, + "degree": 39, + "degree_centrality": 1.000, + "neighbor_count": 39 + } +} +``` + +**额外字段(仅单节点详情):** + +| 字段 | 说明 | +|------|------| +| `degree_centrality` | NetworkX `degree_centrality(G)[node_id]`(0-1 范围) | +| `neighbor_count` | 直接邻居数量(等于 `degree`) | + +**错误:** `3001` (节点不存在) + +--- + +### C4. 
获取节点邻居(N-hop BFS) + +``` +GET /api/v1/kg/nodes/{node_id}/neighbors +``` + +**Query Params:** + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `hops` | `int` | `1` | 跳数(1-3) | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "center": { + "id": "tech_graphrag_0", + "name": "GraphRAG", + "type": "TECHNOLOGY", + "page": 0 + }, + "hops": 1, + "neighbors_by_hop": { + "1": [ + { "id": "concept_knowledgegraph_1", "name": "knowledge graphs", "type": "CONCEPT", "page": 0 } + ] + }, + "total_neighbors": 39 + } +} +``` + +**实现参考(来自 `agentic_rag_mvp.py`):** + +```python +reachable = nx.single_source_shortest_path_length(G, node_id, cutoff=hops) +by_hop = {dist: [] for dist in range(1, hops+1)} +for nid, dist in reachable.items(): + if dist > 0: + by_hop[dist].append(G.nodes[nid]) +``` + +**错误:** `3001` (节点不存在) + +--- + +### C5. 知识图谱统计 + +``` +GET /api/v1/kg/stats +``` + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "total_nodes": 40, + "total_edges": 780, + "density": 1.0000, + "type_distribution": { + "TECHNOLOGY": 4, + "CONCEPT": 36 + }, + "relation_types": { + "CO_OCCURS_IN": 780 + }, + "top5_central_nodes": [ + { "node_id": "tech_graphrag_0", "name": "GraphRAG", "type": "TECHNOLOGY", "centrality": 1.000 }, + { "node_id": "concept_kgrag_1", "name": "Knowledge Graph Enhanced RAG System", "type": "CONCEPT", "centrality": 1.000 }, + { "node_id": "concept_rag_2", "name": "retrieval-augmented generation", "type": "CONCEPT", "centrality": 1.000 }, + { "node_id": "concept_kg_3", "name": "knowledge graphs", "type": "CONCEPT", "centrality": 1.000 }, + { "node_id": "concept_llm_4", "name": "large language models", "type": "CONCEPT", "centrality": 1.000 } + ], + "source_documents": ["abc12345", "def67890"] + } +} +``` + +--- + +### C6. 
导出完整 KG + +``` +GET /api/v1/kg/export +``` + +**Query Params:** + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `format` | `string` | `"json"` | 导出格式(当前仅支持 `json`) | +| `doc_id` | `string` | — | 可选,仅导出指定文档的 KG | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "format": "json", + "doc_id": null, + "total_nodes": 40, + "total_edges": 780, + "exported_at": "2026-03-05T12:00:00Z", + "nodes": [ ...KGNode[] ], + "edges": [ ...KGEdge[] ] + } +} +``` + +--- + +## 七、D 组:QA 问答(4 个端点) + +### D1. 提交 QA 查询(同步) + +``` +POST /api/v1/query +Content-Type: application/json +``` + +**Request Body:** + +```json +{ + "question": "What is GraphRAG and how does it relate to knowledge graphs?", + "history": [ + { "role": "human", "content": "Previous question..." }, + { "role": "ai", "content": "Previous answer..." } + ] +} +``` + +| 字段 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `question` | `string` | **是** | 用户自然语言问题 | +| `history` | `array` | 否 | 多轮对话历史(最多 10 轮,即 20 条消息) | +| `history[].role` | `"human"` \| `"ai"` | — | 消息角色 | +| `history[].content` | `string` | — | 消息内容 | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "query_id": "q_20260305_a1b2c3", + "question": "What is GraphRAG and how does it relate to knowledge graphs?", + "answer": "Based on the knowledge graph, GraphRAG [TECHNOLOGY] is a knowledge graph-enhanced retrieval-augmented generation system that...", + "tool_calls": [ + { + "tool": "search_entities", + "input": { "query": "GraphRAG" }, + "output": "Found 1 entity(ies) matching 'GraphRAG':\n [TECHNOLOGY] \"GraphRAG\" (confidence=match_exact, page=0, id=tech_graphrag_0)" + }, + { + "tool": "get_neighbors", + "input": { "entity_name": "GraphRAG", "hops": 1 }, + "output": "Neighbors of 'GraphRAG' [TECHNOLOGY] within 1 hop(s):\n Hop 1 — 39 related entities:\n [CONCEPT] knowledge graphs\n ..." 
+ } + ], + "cited_nodes": ["tech_graphrag_0", "concept_knowledgegraph_1"], + "elapsed_seconds": 8.4, + "created_at": "2026-03-05T10:30:00Z" + } +} +``` + +**实现说明(QAService 核心逻辑):** + +```python +# 将 history 拼接为 LangChain messages 格式 +messages = [] +for h in request.history: + messages.append((h["role"], h["content"])) +messages.append(("human", request.question)) + +# 调用 LangChain create_agent +result = agent.invoke({"messages": messages}) + +# 提取工具调用链(遍历 result["messages"]) +tool_calls, calls_by_id = [], {} +for msg in result["messages"]: + if hasattr(msg, "tool_calls") and msg.tool_calls: + for tc in msg.tool_calls: + tool_calls.append(calls_by_id.setdefault(tc["id"], {"tool": tc["name"], "input": tc["args"], "output": ""})) + elif hasattr(msg, "tool_call_id"): # ToolMessage:按 tool_call_id 回填,兼容单轮并行工具调用 + if msg.tool_call_id in calls_by_id: + calls_by_id[msg.tool_call_id]["output"] = msg.content + +# 最终答案 +answer = result["messages"][-1].content +``` + +**错误:** `3002` (KG 为空), `4001` (Agent/LLM 调用失败) + +**注意:** 此接口为同步调用,通常耗时 5-30 秒(取决于 DeepSeek API 响应速度和工具调用次数)。 + +--- + +### D2. 批量查询(异步) + +``` +POST /api/v1/query/batch +Content-Type: application/json +``` + +**Request Body:** + +```json +{ + "questions": [ + "What is GraphRAG?", + "List all TECHNOLOGY entities in the knowledge graph.", + "How does MinerU relate to LangExtract?" + ] +} +``` + +| 字段 | 类型 | 必填 | 约束 | 说明 | +|------|------|------|------|------| +| `questions` | `string[]` | **是** | 最多 20 个 | 问题列表 | + +**Response 202:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "batch_id": "batch_20260305_x1y2", + "total": 3, + "status": "submitted", + "created_at": "2026-03-05T10:30:00Z" + } +} +``` + +--- + +### D3. 
获取批量查询状态与结果 + +``` +GET /api/v1/query/batch/{batch_id} +``` + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "batch_id": "batch_20260305_x1y2", + "total": 3, + "completed": 2, + "failed": 0, + "status": "running", + "results": [ + { ...QAResult }, + { ...QAResult } + ] + } +} +``` + +**错误:** `2002` (batch_id 不存在) + +--- + +### D4. 查询历史 + +``` +GET /api/v1/query/history +``` + +**Query Params:** + +| 参数 | 类型 | 默认值 | 说明 | +|------|------|--------|------| +| `page` | `int` | `1` | 页码 | +| `page_size` | `int` | `20` | 每页数量(最大 50) | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "total": 50, + "page": 1, + "page_size": 20, + "items": [ ...QAResult[] ] + } +} +``` + +**存储说明:** 历史记录以 JSONL 格式持久化到 `jobs/query_history.jsonl`,每行一条 `QAResult`。 + +--- + +## 八、E 组:搜索(3 个端点) + +### E1. 实体关键词搜索 + +``` +GET /api/v1/search/entities +``` + +**Query Params:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `q` | `string` | **是** | 关键词(大小写不敏感子串匹配,对应 `agentic_rag_mvp.py: search_entities`) | +| `type` | `string` | 否 | 类型过滤(如 `TECHNOLOGY`) | +| `limit` | `int` | 否 | 最多返回数量(默认 15,最大 100) | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "query": "GraphRAG", + "total": 1, + "items": [ + { + "id": "tech_graphrag_0", + "name": "GraphRAG", + "type": "TECHNOLOGY", + "source_doc": "abc12345", + "char_start": 0, + "char_end": 8, + "confidence": "match_exact", + "page": 0, + "degree": 39 + } + ] + } +} +``` + +**实现(参考 `agentic_rag_mvp.py: search_entities`):** + +```python +q = query.lower() +matches = [data for _, data in G.nodes(data=True) if q in data.get("name", "").lower()] +``` + +--- + +### E2. 
图谱路径搜索(两节点间路径) + +``` +GET /api/v1/search/path +``` + +**Query Params:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `from` | `string` | **是** | 起始节点 ID | +| `to` | `string` | **是** | 目标节点 ID | +| `max_hops` | `int` | 否 | 最大路径长度(默认 3,最大 5) | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "from": { "id": "tech_graphrag_0", "name": "GraphRAG", "type": "TECHNOLOGY" }, + "to": { "id": "tech_mineru_3", "name": "MinerU", "type": "TECHNOLOGY" }, + "max_hops": 3, + "paths": [ + { + "length": 1, + "nodes": [ + { "id": "tech_graphrag_0", "name": "GraphRAG", "type": "TECHNOLOGY" }, + { "id": "tech_mineru_3", "name": "MinerU", "type": "TECHNOLOGY" } + ], + "edges": [ + { "source": "tech_graphrag_0", "target": "tech_mineru_3", "relation": "CO_OCCURS_IN" } + ] + } + ], + "total_paths": 1 + } +} +``` + +**实现(NetworkX):** + +```python +paths = list(nx.all_simple_paths(G, from_id, to_id, cutoff=max_hops)) +``` + +**错误:** `3001` (节点不存在) + +--- + +### E3. 全图关键词搜索(含子图) + +``` +GET /api/v1/search/graph +``` + +**Query Params:** + +| 参数 | 类型 | 必填 | 说明 | +|------|------|------|------| +| `q` | `string` | **是** | 关键词(大小写不敏感子串匹配) | +| `include_neighbors` | `bool` | 否 | 是否返回匹配节点的直接邻居边(默认 `false`) | + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "query": "retrieval", + "matched_nodes": [ + { "id": "concept_rag_2", "name": "retrieval-augmented generation", "type": "CONCEPT", "page": 0 } + ], + "subgraph_edges": [ + { "source": "concept_rag_2", "target": "tech_graphrag_0", "relation": "CO_OCCURS_IN" } + ] + } +} +``` + +--- + +## 九、F 组:系统(4 个端点) + +### F1. 
健康检查 + +``` +GET /api/v1/health +``` + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "status": "healthy", + "version": "1.0.0", + "uptime_seconds": 3600, + "components": { + "mineru_venv": { + "status": "ok", + "path": "F:/GraphRAGAgent/mineru_mvp/.venv/Scripts/python.exe", + "exists": true + }, + "langextract_venv": { + "status": "ok", + "path": "F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe", + "exists": true + }, + "deepseek_api": { + "status": "ok", + "base_url": "https://api.deepseek.com", + "key_configured": true + }, + "storage": { + "status": "ok", + "kg_nodes_exists": true, + "kg_edges_exists": true, + "uploads_dir_exists": true + } + } + } +} +``` + +**说明:** 此端点仅检查配置和文件存在性,不发起实际 API 调用(避免消耗 DeepSeek token)。 + +--- + +### F2. 系统统计 + +``` +GET /api/v1/system/stats +``` + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "total_documents": 5, + "indexed_documents": 4, + "failed_documents": 1, + "total_nodes": 200, + "total_edges": 3900, + "type_distribution": { "TECHNOLOGY": 20, "CONCEPT": 180 }, + "total_queries": 50, + "active_jobs": 1, + "storage_used_mb": 12.4 + } +} +``` + +--- + +### F3. 
支持的文件格式列表 + +``` +GET /api/v1/system/formats +``` + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "formats": [ + { "ext": "pdf", "description": "PDF 文档(文本型/扫描型/混合型)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": false }, + { "ext": "docx", "description": "Microsoft Word(新版)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": false }, + { "ext": "doc", "description": "Microsoft Word(旧版)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": false }, + { "ext": "pptx", "description": "PowerPoint(新版)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": false }, + { "ext": "ppt", "description": "PowerPoint(旧版)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": false }, + { "ext": "png", "description": "PNG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": true }, + { "ext": "jpg", "description": "JPEG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": true }, + { "ext": "jpeg", "description": "JPEG 图片(单页)", "max_size_mb": 200, "max_pages": 1, "requires_ocr": true }, + { "ext": "html", "description": "HTML 文件(需指定 model_version=MinerU-HTML)", "max_size_mb": 200, "max_pages": 600, "requires_ocr": false } + ], + "ocr_languages": [ + { "code": "ch", "name": "中文(默认)" }, + { "code": "en", "name": "英文" }, + { "code": "japan", "name": "日文" }, + { "code": "korean", "name": "韩文" }, + { "code": "french", "name": "法文" }, + { "code": "german", "name": "德文" } + ], + "notes": [ + "language 参数默认值为 'ch'(非 'zh'),遵循 PaddleOCR v3 语言代码规范", + "上传时不需要携带 Content-Type: application/pdf 等,服务端自动识别", + "PNG/JPG/JPEG 单次最多处理 1 页(图片文件视为单页文档)" + ] + } +} +``` + +--- + +### F4. 
Demo 数据(快速预览) + +``` +GET /api/v1/system/demo +``` + +**说明:** 返回现有 `output/kg_nodes.json` + `output/kg_edges.json` 数据,无需上传 PDF 即可预览 KG 可视化效果。与旧版 `GET /api/demo`(Flask web_server.py)兼容。 + +**Response 200:** + +```json +{ + "code": 0, + "msg": "success", + "request_id": "...", + "data": { + "nodes": [ ...KGNode[] ], + "edges": [ ...KGEdge[] ], + "stats": { + "nodes": 40, + "edges": 780, + "type_counts": { "TECHNOLOGY": 4, "CONCEPT": 36 }, + "density": 1.0000 + } + } +} +``` + +**错误:** `3002` (demo 数据文件不存在,需先运行 bridge.py 生成) + +--- + +## 十、文件格式支持矩阵 + +| 格式 | 扩展名 | 最大体积 | 最大页数 | OCR | MinerU model_version | 说明 | +|------|--------|---------|---------|-----|----------------------|------| +| PDF | `.pdf` | 200MB | 600 页 | 可选 | `pipeline`(默认) | 核心能力,文本型/扫描型/混合型均支持 | +| Word(新) | `.docx` | 200MB | 600 页 | 可选 | `pipeline` | | +| Word(旧) | `.doc` | 200MB | 600 页 | 可选 | `pipeline` | | +| PPT(新) | `.pptx` | 200MB | 600 页 | 可选 | `pipeline` | | +| PPT(旧) | `.ppt` | 200MB | 600 页 | 可选 | `pipeline` | | +| PNG 图片 | `.png` | 200MB | 1 页 | 必须 | `pipeline` | EXIF 方向自动校正 | +| JPEG 图片 | `.jpg` | 200MB | 1 页 | 必须 | `pipeline` | EXIF 方向自动校正 | +| JPEG 图片 | `.jpeg` | 200MB | 1 页 | 必须 | `pipeline` | 同 `.jpg` | +| HTML | `.html` | 200MB | 600 页 | 否 | `MinerU-HTML` | 必须指定特定 model_version | + +**MinerU 云端 API 限制(来自 mineru_specification-v1.0.md):** + +| 约束项 | 限制值 | +|--------|--------| +| 单文件最大体积 | 200 MB | +| 单文件最大页数 | 600 页 | +| 批量请求最大文件数 | 200 个 | +| 预签名上传 URL 有效期 | 24 小时 | +| 云端 API 每日最高优先级额度 | 2,000 页(超出降低优先级) | + +**服务端验证代码(FastAPI + Pydantic):** + +```python +ALLOWED_EXTENSIONS = {"pdf", "docx", "doc", "pptx", "ppt", "png", "jpg", "jpeg", "html"} +MAX_FILE_SIZE_MB = 200 + +async def upload_document(file: UploadFile = File(...), ...): + ext = Path(file.filename).suffix.lower().lstrip(".") + if ext not in ALLOWED_EXTENSIONS: + raise HTTPException(400, detail=f"Unsupported format: .{ext}") + + content = await file.read() + size_mb = len(content) / (1024 * 1024) + if size_mb > 
MAX_FILE_SIZE_MB: + raise HTTPException(400, detail=f"File size {size_mb:.1f}MB exceeds 200MB limit") +``` + +--- + +## 十一、依赖与运行 + +### 安装依赖 + +```bash +# FastAPI + uvicorn + multipart 文件上传 +uv pip install fastapi uvicorn[standard] python-multipart \ + --python F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe + +# 已有依赖(无需重复安装) +# langextract[all]、langchain、langchain-openai、networkx、python-dotenv、flask、requests +``` + +### 启动服务 + +```bash +# 开发模式(--reload 热重载) +F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe -m uvicorn \ + graphrag_pipeline.api_server:app \ + --host 0.0.0.0 --port 8000 --reload + +# 或直接运行主入口 +F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe \ + F:/GraphRAGAgent/graphrag_pipeline/api_server.py +``` + +### API 文档访问 + +FastAPI 自动生成 OpenAPI 文档,启动后可访问: + +| 地址 | 说明 | +|------|------| +| `http://localhost:8000/api/v1/health` | 健康检查(验证服务启动) | +| `http://localhost:8000/docs` | Swagger UI(交互式 API 文档) | +| `http://localhost:8000/redoc` | ReDoc(只读 API 文档) | +| `http://localhost:8000/openapi.json` | OpenAPI JSON Schema | + +### 端口说明 + +| 服务 | 端口 | 说明 | +|------|------|------| +| **FastAPI(新)** | `8000` | 本规范描述的生产级 API | +| Flask web_server.py(旧) | `5000` | 原型,保留用于对比 | diff --git a/docs/bridge_pipeline_specification-v1.0.md b/docs/bridge_pipeline_specification-v1.0.md new file mode 100644 index 0000000..d67282f --- /dev/null +++ b/docs/bridge_pipeline_specification-v1.0.md @@ -0,0 +1,481 @@ +# Bridge Pipeline Specification v1.0 + +> GraphRAG 索引阶段核心流程:MinerU → LangExtract → Knowledge Graph + +--- + +## 1. 
Pipeline 执行思路 + +### 1.1 整体架构 + +Bridge Pipeline 是 GraphRAG 索引阶段的核心流程,负责将 MinerU 解析后的结构化 PDF 内容送入 LangExtract 完成实体抽取,最终生成知识图谱的节点(Nodes)和边(Edges)。 + +``` +MinerU output Bridge Pipeline KG output +───────────── ─────────────── ───────── +{uuid}_content_list.json → text_assembler.py + ├─ text blocks ├─ 按页拼接纯文本 + └─ table blocks (HTML) ├─ HTML表格→纯文本 + └─ 记录每个block的char偏移 + → entity_extractor.py + ├─ 逐页调用 lx.extract() + └─ DeepSeek via OpenAI Provider + → kg_builder.py + ├─ 过滤低质量对齐 → kg_nodes.json + ├─ 节点去重 (name.lower(), type) + └─ 同页实体对→CO_OCCURS_IN边 → kg_edges.json +``` + +### 1.2 五步执行流程 + +| 步骤 | 模块 | 说明 | +|------|------|------| +| Step 1 | `bridge.py` | 加载 MinerU 输出 `content_list.json`,解析输入路径和 source_doc_id | +| Step 2 | `text_assembler.py` | 按 `page_idx` 分组,拼接纯文本,记录每个 block 的字符偏移 | +| Step 3 | `entity_extractor.py` | 逐页调用 LangExtract + DeepSeek 完成实体抽取 | +| Step 4 | `kg_builder.py` | 过滤低质量对齐 → 节点去重 → 同页配对生成 CO_OCCURS_IN 边 | +| Step 5 | `bridge.py` | 保存 `kg_nodes.json` + `kg_edges.json` 到 output 目录 | + +### 1.3 文件存放位置 + +``` +F:\GraphRAGAgent\graphrag_pipeline\ +├── .env # DeepSeek API 配置 +├── CLAUDE.md # 组件开发规范 +├── bridge.py # 主入口(串联完整 Pipeline) +├── text_assembler.py # MinerU JSON → 按页纯文本 + 偏移映射 +├── entity_extractor.py # LangExtract + DeepSeek 封装 +├── kg_builder.py # KG 节点去重 + 边生成 +└── output/ + ├── kg_nodes.json # 知识图谱节点(9,851 bytes) + └── kg_edges.json # 知识图谱边(129,093 bytes) +``` + +### 1.4 运行命令 + +```bash +# 使用默认测试输入 +F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe F:/GraphRAGAgent/graphrag_pipeline/bridge.py + +# 指定输入文件 +F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe F:/GraphRAGAgent/graphrag_pipeline/bridge.py path/to/content_list.json + +# 指定输入目录(自动查找 *_content_list.json) +F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe F:/GraphRAGAgent/graphrag_pipeline/bridge.py path/to/output_dir/ +``` + +--- + +## 2. 
实际本地输出文档规范 + +### 2.1 测试运行结果 + +- **输入文件**: `F:\GraphRAGAgent\mineru_mvp\output\test_sample\8a719db4-2b50-405b-826d-7bb27b224fa0_content_list.json` +- **输入规模**: 10 blocks(9 text + 1 table),1 页,2102 字符 +- **抽取结果**: 45 raw extractions → 40 去重节点,780 CO_OCCURS_IN 边 +- **对齐质量**: 全部 40 节点均为 `match_exact`(1 个 `match_fuzzy` 已被过滤) +- **执行时间**: ~22s(DeepSeek API 调用) + +### 2.2 kg_nodes.json — 实际输出 + +**文件大小**: 9,851 bytes | **节点数**: 40 + +**节点类型分布**: + +| 类型 | 数量 | 示例 | +|------|------|------| +| TECHNOLOGY | 4 | GraphRAG, MinerU, LLMs, LangExtract | +| CONCEPT | 36 | knowledge graphs, retrieval-augmented generation, multi-hop reasoning | + +**节点格式(实际样例)**: + +```json +{ + "id": "node_0", + "name": "GraphRAG", + "type": "TECHNOLOGY", + "source_doc": "8a719db4-2b50-405b-826d-7bb27b224fa0", + "char_start": 0, + "char_end": 8, + "confidence": "match_exact", + "page": 0 +} +``` + +**完整节点列表(前 10 个)**: + +| id | name | type | confidence | +|----|------|------|-----------| +| node_0 | GraphRAG | TECHNOLOGY | match_exact | +| node_1 | Knowledge Graph Enhanced RAG System | CONCEPT | match_exact | +| node_2 | retrieval-augmented generation | CONCEPT | match_exact | +| node_3 | knowledge graphs | CONCEPT | match_exact | +| node_4 | large language models | CONCEPT | match_exact | +| node_5 | question answering | CONCEPT | match_exact | +| node_6 | document collections | CONCEPT | match_exact | +| node_7 | RAG systems | CONCEPT | match_exact | +| node_8 | vector similarity search | CONCEPT | match_exact | +| node_9 | hierarchical knowledge graph | CONCEPT | match_exact | + +### 2.3 kg_edges.json — 实际输出 + +**文件大小**: 129,093 bytes | **边数**: 780 + +**数学验证**: 40 个节点全部在同一页 → C(40,2) = 40×39/2 = 780 条边 ✓ + +**边格式(实际样例)**: + +```json +{ + "source": "node_0", + "target": "node_1", + "relation": "CO_OCCURS_IN", + "doc_id": "8a719db4-2b50-405b-826d-7bb27b224fa0", + "page": 0 +} +``` + +**完整性校验结果**: +- 自环数: 0 ✓ +- 重复边数: 0 ✓ +- 关系类型: 全部为 `CO_OCCURS_IN` ✓ + +--- + +## 3. 
MinerU Pipeline 关键参数规范 + +### 3.1 输入格式:content_list.json + +MinerU 解析 PDF 后输出的 `{uuid}_content_list.json` 是一个 JSON 数组,每个元素代表一个内容块。 + +**text block 结构**: + +```json +{ + "type": "text", + "text": "GraphRAG: Knowledge Graph Enhanced RAG System...", + "text_level": null, + "page_idx": 0, + "bbox": [72, 43, 523, 57] +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `type` | string | 块类型:`"text"` \| `"table"` \| `"image"` | +| `text` | string | 文本内容(末尾可能有空格) | +| `text_level` | int \| null | `null`=正文,`1`=一级标题 | +| `page_idx` | int | 页码(从 0 开始) | +| `bbox` | list[int] | 边界框坐标 `[x0, y0, x1, y1]`(归一化 0-1000) | + +**table block 结构**: + +```json +{ + "type": "table", + "table_body": "
<table><tr><td>Method</td><td>Score</td></tr>...</table>
", + "table_caption": [], + "page_idx": 0, + "bbox": [72, 400, 523, 500] +} +``` + +| 字段 | 类型 | 说明 | +|------|------|------| +| `table_body` | string | HTML `` 标签完整内容 | +| `table_caption` | list | 表格标题(通常为空数组) | + +### 3.2 关键约束 + +- 文件命名: `{uuid}_content_list.json`,UUID 用作 source_doc_id +- block 排列顺序与 PDF 阅读顺序一致 +- `text` 字段末尾可能有多余空格,需 `.rstrip()` 处理 +- `image` 类型块不含可提取文本,Bridge 跳过处理 + +--- + +## 4. LangExtract Pipeline 关键参数规范 + +### 4.1 模型配置 + +```python +from langextract.providers.openai import OpenAILanguageModel + +model = OpenAILanguageModel( + model_id="deepseek-chat", + api_key=DEEPSEEK_API_KEY, + base_url="https://api.deepseek.com", +) +``` + +**重要**: 必须直接实例化 `OpenAILanguageModel`,不能使用 `model_id` 路由。LangExtract 的 `model_id` 同时用于内部路由和 API 请求参数,DeepSeek 不识别 GPT 模型名称。 + +### 4.2 抽取调用 + +```python +result = lx.extract( + text_or_documents=page_text, # 纯文本字符串 + prompt_description=PROMPT, # 实体类型描述 + examples=EXAMPLES, # Few-shot 示例 + model=model, # 直接传入模型实例 + show_progress=True, +) +``` + +### 4.3 Prompt 配置 + +``` +Extract named entities from the text in order of appearance. +Entity types: + TECHNOLOGY — software, algorithms, models, tools + ORGANIZATION — companies, research groups, institutions + PERSON — individual people + LOCATION — places, geographic entities + CONCEPT — technical concepts, methodologies, frameworks +``` + +### 4.4 Few-shot 示例 + +验证可用的示例(MVP 测试 94.1% match_exact): + +```python +lx.data.ExampleData( + text="LangChain is a framework created by Harrison Chase for building " + "LLM applications. 
It integrates with OpenAI models and Pinecone " + "vector database for semantic search.", + extractions=[ + lx.data.Extraction(extraction_class="TECHNOLOGY", extraction_text="LangChain"), + lx.data.Extraction(extraction_class="PERSON", extraction_text="Harrison Chase"), + lx.data.Extraction(extraction_class="CONCEPT", extraction_text="LLM applications"), + lx.data.Extraction(extraction_class="TECHNOLOGY", extraction_text="OpenAI models"), + lx.data.Extraction(extraction_class="TECHNOLOGY", extraction_text="Pinecone"), + lx.data.Extraction(extraction_class="CONCEPT", extraction_text="semantic search"), + ], +) +``` + +### 4.5 输出格式:AnnotatedDocument + +每页抽取返回一个 `AnnotatedDocument`,其 `extractions` 列表中每个元素包含: + +| 字段 | 类型 | 说明 | +|------|------|------| +| `extraction_text` | string | 实体名称(必须为输入文本的精确子串) | +| `extraction_class` | string | 实体类型(TECHNOLOGY/ORGANIZATION/PERSON/LOCATION/CONCEPT) | +| `char_interval.start_pos` | int | 在输入文本中的起始字符位置 | +| `char_interval.end_pos` | int | 在输入文本中的结束字符位置 | +| `alignment_status` | enum | 对齐质量:`match_exact` \| `match_greater` \| `match_lesser` \| `match_fuzzy` \| `None` | +| `extraction_index` | int | 抽取序号(从 1 开始) | +| `group_index` | int | 组序号(从 0 开始) | + +### 4.6 对齐质量过滤规则 + +| alignment_status | 含义 | Bridge 处理 | +|-----------------|------|------------| +| `match_exact` | LLM 输出与原文完全匹配 | ✅ 接受 | +| `match_greater` | LLM 输出是原文子串的超集 | ✅ 接受 | +| `match_lesser` | LLM 输出是原文子串的子集 | ✅ 接受 | +| `match_fuzzy` | 模糊匹配,偏移不可靠 | ❌ 过滤 | +| `None` | 无法对齐 | ❌ 过滤 | + +--- + +## 5. MinerU ↔ LangExtract 接口对接规范 + +### 5.1 核心挑战 + +MinerU 输出结构化 JSON 块(含 HTML 表格),而 LangExtract 仅接受纯文本 `str`。Bridge 的 `text_assembler` 模块负责转换和偏移映射。 + +### 5.2 对接转换规则 + +| 对接点 | MinerU 规范 | LangExtract 规范 | Bridge 处理 | +|--------|------------|-----------------|------------| +| 输入格式 | `content_list.json`(JSON 数组) | 仅接受纯文本 `str` | `text_assembler` 拼接转换 | +| 文本块 | `block["text"]`,末尾可能有空格 | `extraction_text` 须为原文精确子串 | `.rstrip()` 去尾部空格 | +| 表格块 | `table_body` 是 `
` HTML | 不接受 HTML | BeautifulSoup 转 pipe 分隔纯文本 | +| 标题判断 | `text_level` 缺失=正文,存在=标题 | 不区分标题/正文 | 标题和正文一起拼入文本 | +| 坐标系 | bbox 归一化 0-1000 | char_interval 基于输入字符 | BlockSpan 记录偏移映射 | +| 分页 | `page_idx` 区分不同页 | 单次调用处理一段文本 | 逐页分别调用 `lx.extract()` | +| 文件名 | `{uuid}_content_list.json` | — | glob `*_content_list.json` 匹配 | + +### 5.3 文本拼接算法 + +``` +输入: content_list (按 page_idx 分组) +输出: PageText 列表 + +对每页: + cursor = 0 + 对每个 block (保持原顺序): + if type == "text": + block_text = block["text"].rstrip() + elif type == "table": + block_text = html_table_to_text(block["table_body"]) + else: + 跳过 (image / equation 等) + + 记录 BlockSpan(char_start=cursor, char_end=cursor+len(block_text)) + buffer.append(block_text + "\n") + cursor += len(block_text) + 1 + + PageText.text = "".join(buffer).rstrip("\n") +``` + +### 5.4 偏移映射数据结构 + +```python +@dataclasses.dataclass +class BlockSpan: + block_index: int # content_list 数组下标 + block_type: str # "text" | "table" + page_idx: int # 页码 + char_start: int # 在拼接文本中的起始位置 + char_end: int # 在拼接文本中的结束位置(不含) + bbox: list[int] # MinerU 原始 bbox + +@dataclasses.dataclass +class PageText: + page_idx: int # 页码 + text: str # 拼接后的纯文本 + block_spans: list[BlockSpan] # 每个 block 在 text 中的位置 +``` + +### 5.5 HTML 表格转换 + +```python +def html_table_to_text(table_body: str) -> str: + """Convert
HTML → pipe-delimited plain text""" + soup = BeautifulSoup(table_body, "html.parser") + rows = [] + for tr in soup.find_all("tr"): + cells = [td.get_text(strip=True) for td in tr.find_all(["td", "th"])] + rows.append(" | ".join(cells)) + return "\n".join(rows) +``` + +转换示例: + +```html +
<table><tr><td>Method</td><td>Score</td></tr>
<tr><td>GraphRAG</td><td>0.85</td></tr></table>
+``` + +→ + +``` +Method | Score +GraphRAG | 0.85 +``` + +--- + +## 6. Bridge Pipeline 最终输出关键参数规范 + +### 6.1 kg_nodes.json + +**文件路径**: `graphrag_pipeline/output/kg_nodes.json` + +**结构**: JSON 数组,每个元素为一个去重后的实体节点。 + +| 字段 | 类型 | 说明 | 示例 | +|------|------|------|------| +| `id` | string | 节点唯一标识,格式 `node_{index}` | `"node_0"` | +| `name` | string | 实体名称(原文子串) | `"GraphRAG"` | +| `type` | string | 实体类型 | `"TECHNOLOGY"` | +| `source_doc` | string | 来源文档 UUID | `"8a719db4-2b50-405b-826d-7bb27b224fa0"` | +| `char_start` | int | 在拼接文本中的起始字符位置 | `0` | +| `char_end` | int | 在拼接文本中的结束字符位置 | `8` | +| `confidence` | string | 对齐质量(仅 `match_exact`/`match_greater`/`match_lesser`) | `"match_exact"` | +| `page` | int | 来源页码(从 0 开始) | `0` | + +**去重规则**: key = `(name.lower(), type)`,保留首次出现的实体。 + +**实体类型枚举**: + +| 类型 | 说明 | +|------|------| +| `TECHNOLOGY` | 软件、算法、模型、工具 | +| `ORGANIZATION` | 公司、研究机构 | +| `PERSON` | 个人 | +| `LOCATION` | 地理位置 | +| `CONCEPT` | 技术概念、方法论、框架 | + +### 6.2 kg_edges.json + +**文件路径**: `graphrag_pipeline/output/kg_edges.json` + +**结构**: JSON 数组,每个元素为一条同页共现关系边。 + +| 字段 | 类型 | 说明 | 示例 | +|------|------|------|------| +| `source` | string | 源节点 ID | `"node_0"` | +| `target` | string | 目标节点 ID | `"node_1"` | +| `relation` | string | 关系类型(固定 `"CO_OCCURS_IN"`) | `"CO_OCCURS_IN"` | +| `doc_id` | string | 来源文档 UUID | `"8a719db4-..."` | +| `page` | int | 共现页码 | `0` | + +**边生成规则**: +1. 按页分组所有去重后的节点 ID +2. 同页节点两两配对 → 生成 `CO_OCCURS_IN` 边 +3. 边方向规范化: `source < target`(字典序) +4. 去重 key: `(source, target, doc_id, page)` +5. 无自环(source ≠ target) + +**边数公式**: 若某页有 N 个节点,则该页产生 C(N,2) = N×(N-1)/2 条边。 + +### 6.3 输出完整性约束 + +| 约束 | 说明 | +|------|------| +| 节点 ID 唯一 | 每个节点的 `id` 字段全局唯一 | +| 边引用合法 | 每条边的 `source` 和 `target` 必须对应存在的节点 `id` | +| 无自环 | 不存在 `source == target` 的边 | +| 无重复边 | 同一 `(source, target, doc_id, page)` 组合仅出现一次 | +| 对齐质量保证 | 所有节点的 `confidence` 仅为 accepted 值(非 fuzzy/null) | +| char 偏移有效 | `char_start < char_end`,且可定位到拼接文本中的实体子串 | + +--- + +## 7. 
虚拟环境规范 + +Bridge Pipeline **复用 LangExtract 的虚拟环境**,不单独创建 venv。 + +| 项目 | 值 | +|------|------| +| 虚拟环境路径 | `F:\GraphRAGAgent\langextract_src\.venv\` | +| Python 版本 | 3.12 | +| 核心依赖 | `langextract[all]`、`beautifulsoup4`、`python-dotenv` | +| 安装新依赖 | `uv pip install --python F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe` | + +**所有 Python 命令必须使用该虚拟环境运行,禁止使用全局 Python 或其他组件的 venv。** + +--- + +## 8. 环境配置 + +### 8.1 .env 文件 + +位置: `F:\GraphRAGAgent\graphrag_pipeline\.env` + +```env +DEEPSEEK_API_KEY= +DEEPSEEK_BASE_URL=https://api.deepseek.com +``` + +### 8.2 依赖安装 + +```bash +uv pip install beautifulsoup4 python-dotenv --python F:/GraphRAGAgent/langextract_src/.venv/Scripts/python.exe +``` + +--- + +## 9. 测试验证清单 + +- [x] text_assembler 正确读取 content_list.json(10 blocks: 9 text + 1 table) +- [x] 表格 HTML 转为 pipe 分隔纯文本,无 HTML 标签残留 +- [x] 按页拼接文本长度合理(2102 字符/页) +- [x] LangExtract 成功调用 DeepSeek 返回 AnnotatedDocument +- [x] 抽取实体数 45,match_exact 占比 > 95% +- [x] kg_nodes.json 节点已去重(40 个),每个节点有完整字段 +- [x] kg_edges.json 边为 CO_OCCURS_IN 关系(780 条),无自环,无重复 +- [x] match_fuzzy 对齐的实体已被过滤(1 个) diff --git a/docs/frontend_design_specification-v1.0.md b/docs/frontend_design_specification-v1.0.md new file mode 100644 index 0000000..3278855 --- /dev/null +++ b/docs/frontend_design_specification-v1.0.md @@ -0,0 +1,1232 @@ +# GraphRAG Studio — 前端 Web 系统设计规范 v1.0 + +> 基于 `docs/backend_service_specification-v1.0.md` 接口规范 +> 前端架构:原生 HTML + CSS + JS + D3.js v7(SPA,零构建依赖) +> 更新日期:2026-03-05 + +--- + +## 目录 + +- [一、总体架构](#一总体架构) +- [二、设计语言与风格系统](#二设计语言与风格系统) +- [三、整体布局](#三整体布局) +- [四、页面清单与详细设计](#四页面清单与详细设计) + - [Page 1 — Dashboard](#page-1--dashboard-dashboard) + - [Page 2 — Document Manager](#page-2--document-manager-documents) + - [Page 3 — KG Explorer](#page-3--kg-explorer-graph) + - [Page 4 — QA Chat](#page-4--qa-chat-chat) + - [Page 5 — Search](#page-5--search-search) +- [五、响应式设计规范](#五响应式设计规范) +- [六、关键交互模式规范](#六关键交互模式规范) +- [七、文件结构](#七文件结构) + +--- + +## 一、总体架构 + +### 1.1 技术选型 + +| 组件 | 选择 
| 理由 | +|------|------|------| +| 应用类型 | **SPA(单页应用)** | 5 页无缝切换,无刷新体验 | +| 路由 | **Hash 路由**(原生 JS) | 无需构建工具,`#/dashboard` `#/documents` 等 | +| 框架 | **原生 HTML + CSS + JS** | 与现有 `index.html` 一致,零构建依赖,直接在浏览器运行 | +| 图形渲染 | **D3.js v7**(CDN) | 复用现有 KG 可视化逻辑(`graphrag_pipeline/static/index.html`) | +| Markdown 渲染 | **marked.js v9**(CDN) | Chat 页 AI 答案 Markdown 渲染 | +| API 通信 | **Fetch API** | 原生支持,封装统一错误处理 | +| 图标 | **Unicode / SVG 内联** | 零依赖(无需图标库 CDN) | + +### 1.2 路由设计 + +``` +hash 路由 → DOM 区域显示/隐藏 + +#/dashboard → 显示
+#/documents → 显示
+#/graph → 显示
+ 初始化 D3 +#/chat → 显示
+#/search → 显示