GraphRAG Studio — initial commit: multimodal RAG system with KG visualization

Full-stack application for a document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-07 17:30:04 +08:00
commit b02d3378fc
127 changed files with 37218 additions and 0 deletions
--- a/backend/services/document_service.py
+++ b/backend/services/document_service.py
@@ -0,0 +1,109 @@
+"""Document Service — file upload, metadata CRUD."""
+from __future__ import annotations
+
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+from storage import file_store as fs
+
+# File extensions accepted by validate_upload(); compared against the
+# lower-cased filename suffix with its leading dot removed.
+ALLOWED_EXTENSIONS = {"pdf", "docx", "doc", "pptx", "ppt", "png", "jpg", "jpeg", "html"}
+# Upper bound on upload size; validate_upload() converts the byte count
+# with 1024 * 1024 before comparing against this value.
+MAX_FILE_SIZE_MB = 200
+
+
+def validate_upload(filename: str, size_bytes: int) -> tuple[bool, int, str]:
+    """Returns (ok, error_code, error_msg)."""
+    if not filename or "/" in filename or "\\" in filename:
+        return False, 1001, "Invalid filename"
+    ext = Path(filename).suffix.lower().lstrip(".")
+    if ext not in ALLOWED_EXTENSIONS:
+        return False, 1002, f"Unsupported file format: .{ext}. Supported: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
+    size_mb = size_bytes / (1024 * 1024)
+    if size_mb > MAX_FILE_SIZE_MB:
+        return False, 1003, f"File size {size_mb:.1f}MB exceeds {MAX_FILE_SIZE_MB}MB limit"
+    return True, 0, ""
+
+
+def save_upload(filename: str, content: bytes, language: str = "ch",
+                enable_formula: bool = True, enable_table: bool = True) -> dict:
+    doc_id = uuid.uuid4().hex[:8]
+    ext = Path(filename).suffix.lower().lstrip(".")
+    upload_filename = f"{doc_id}_{filename}"
+    upload_path = fs.UPLOADS_DIR / upload_filename
+    upload_path.write_bytes(content)
+
+    doc = {
+        "doc_id": doc_id,
+        "filename": filename,
+        "format": ext,
+        "size_bytes": len(content),
+        "pages": None,
+        "uploaded_at": datetime.now(timezone.utc).isoformat(),
+        "status": "uploaded",
+        "language": language,
+        "enable_formula": enable_formula,
+        "enable_table": enable_table,
+        "upload_filename": upload_filename,  # internal: actual stored filename
+    }
+    fs.save_doc(doc)
+    return doc
+
+
+def get_document(doc_id: str) -> dict | None:
+    return fs.get_doc(doc_id)
+
+
+def list_documents(page: int = 1, page_size: int = 20,
+                   status: str | None = None, fmt: str | None = None) -> dict:
+    index = fs.load_docs_index()
+    items = list(index.values())
+    items.sort(key=lambda d: d.get("uploaded_at", ""), reverse=True)
+    if status:
+        items = [d for d in items if d.get("status") == status]
+    if fmt:
+        items = [d for d in items if d.get("format") == fmt.lower()]
+    total = len(items)
+    start = (page - 1) * page_size
+    return {
+        "total": total,
+        "page": page,
+        "page_size": page_size,
+        "items": items[start: start + page_size],
+    }
+
+
+def delete_document(doc_id: str) -> tuple[bool, int, int]:
+    """Delete doc and its KG contributions. Returns (ok, removed_nodes, removed_edges)."""
+    doc = fs.get_doc(doc_id)
+    if not doc:
+        return False, 0, 0
+
+    # Remove from KG
+    removed_nodes, removed_edges = fs.remove_doc_from_kg(doc_id)
+
+    # Remove upload file
+    upload_filename = doc.get("upload_filename", "")
+    upload_path = fs.UPLOADS_DIR / upload_filename
+    if upload_path.exists():
+        upload_path.unlink(missing_ok=True)
+
+    # Remove associated jobs
+    for meta in fs.list_all_jobs():
+        if meta.get("doc_id") == doc_id:
+            fs.delete_job(meta["job_id"])
+
+    # Remove from index
+    index = fs.load_docs_index()
+    index.pop(doc_id, None)
+    fs.save_docs_index(index)
+
+    return True, removed_nodes, removed_edges
+
+
+def update_doc_status(doc_id: str, status: str, pages: int | None = None) -> None:
+    index = fs.load_docs_index()
+    if doc_id in index:
+        index[doc_id]["status"] = status
+        if pages is not None:
+            index[doc_id]["pages"] = pages
+        fs.save_docs_index(index)