Full-stack application for a document-to-knowledge-graph pipeline:
- Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing
- Frontend: React 19 + Vite + D3.js + shadcn/ui
- Pipeline: MinerU parsing → LangExtract entity extraction → KG building

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
110 lines
3.5 KiB
Python
110 lines
3.5 KiB
Python
"""Document Service — file upload, metadata CRUD."""
|
|
from __future__ import annotations
|
|
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from storage import file_store as fs
|
|
|
|
# File extensions accepted for upload: lower-case, without the leading dot.
ALLOWED_EXTENSIONS = {"pdf", "docx", "doc", "pptx", "ppt", "png", "jpg", "jpeg", "html"}
# Largest accepted upload, expressed in units of 1024 * 1024 bytes.
MAX_FILE_SIZE_MB = 200


def validate_upload(filename: str, size_bytes: int) -> tuple[bool, int, str]:
    """Check an upload's name, extension and size before it is stored.

    Returns:
        ``(ok, error_code, error_msg)``. ``(True, 0, "")`` when every check
        passes; otherwise ``ok`` is ``False`` and the code is one of:

        * 1001 -- the filename is empty or contains ``/`` or ``\\``
        * 1002 -- the extension is not in ``ALLOWED_EXTENSIONS``
        * 1003 -- the size exceeds ``MAX_FILE_SIZE_MB``
    """
    bytes_per_mb = 1024 * 1024

    # Name check: reject empty names and anything carrying a path separator.
    if not filename:
        return False, 1001, "Invalid filename"
    if any(separator in filename for separator in ("/", "\\")):
        return False, 1001, "Invalid filename"

    # Extension check: case-insensitive, compared without the leading dot.
    suffix = Path(filename).suffix
    extension = suffix.lower().lstrip(".")
    if extension not in ALLOWED_EXTENSIONS:
        supported = ", ".join(sorted(ALLOWED_EXTENSIONS))
        return False, 1002, f"Unsupported file format: .{extension}. Supported: {supported}"

    # Size check: a file of exactly the limit is still accepted.
    megabytes = size_bytes / bytes_per_mb
    if megabytes > MAX_FILE_SIZE_MB:
        return False, 1003, f"File size {megabytes:.1f}MB exceeds {MAX_FILE_SIZE_MB}MB limit"

    return True, 0, ""
|
|
|
|
|
|
def save_upload(filename: str, content: bytes, language: str = "ch",
                enable_formula: bool = True, enable_table: bool = True) -> dict:
    """Write an uploaded file to disk and register its metadata record.

    The bytes are written under ``fs.UPLOADS_DIR`` as ``<doc_id>_<basename>``,
    then the metadata dict is handed to ``fs.save_doc`` and returned.

    Args:
        filename: Client-supplied file name; stored unchanged in the record.
        content: Raw file bytes.
        language: Stored verbatim in the record under ``"language"``.
        enable_formula: Stored verbatim in the record under ``"enable_formula"``.
        enable_table: Stored verbatim in the record under ``"enable_table"``.

    Returns:
        The metadata dict that was passed to ``fs.save_doc``.
    """
    # Short id: the first 8 hex characters of a random UUID4.
    doc_id = uuid.uuid4().hex[:8]
    ext = Path(filename).suffix.lower().lstrip(".")

    # Defense in depth: build the stored name from the final path component
    # only, so a caller that skipped validate_upload cannot steer the write
    # outside UPLOADS_DIR with "/" or "\\" separators. Names without
    # separators are stored exactly as before.
    basename = filename.replace("\\", "/").rsplit("/", 1)[-1]
    upload_filename = f"{doc_id}_{basename}"
    upload_path = fs.UPLOADS_DIR / upload_filename
    upload_path.write_bytes(content)

    doc = {
        "doc_id": doc_id,
        "filename": filename,
        "format": ext,
        "size_bytes": len(content),
        "pages": None,  # unknown at upload time; set later via update_doc_status
        "uploaded_at": datetime.now(timezone.utc).isoformat(),
        "status": "uploaded",
        "language": language,
        "enable_formula": enable_formula,
        "enable_table": enable_table,
        "upload_filename": upload_filename,  # internal: actual stored filename
    }
    fs.save_doc(doc)
    return doc
|
|
|
|
|
|
def get_document(doc_id: str) -> dict | None:
    """Fetch the metadata record for ``doc_id`` from the file store.

    Returns whatever ``fs.get_doc`` returns for that id.
    """
    record = fs.get_doc(doc_id)
    return record
|
|
|
|
|
|
def list_documents(page: int = 1, page_size: int = 20,
                   status: str | None = None, fmt: str | None = None) -> dict:
    """Return one page of document records, newest upload first.

    Args:
        page: 1-based page number; values below 1 are treated as 1.
        page_size: Records per page; negative values are treated as 0.
        status: When given, keep only records whose ``"status"`` equals it.
        fmt: When given, keep only records whose ``"format"`` equals it
            (compared lower-cased).

    Returns:
        A dict with ``"total"`` (record count after filtering), the effective
        ``"page"`` and ``"page_size"``, and ``"items"`` (the page's records).
    """
    # A page below 1 or a negative page_size would turn into a negative slice
    # bound, which silently returns records from the tail of the list or drops
    # trailing ones. Clamp both to the nearest sensible value instead.
    page = max(page, 1)
    page_size = max(page_size, 0)

    items = sorted(
        fs.load_docs_index().values(),
        key=lambda d: d.get("uploaded_at", ""),
        reverse=True,
    )
    if status:
        items = [d for d in items if d.get("status") == status]
    if fmt:
        wanted_format = fmt.lower()  # computed once, not once per record
        items = [d for d in items if d.get("format") == wanted_format]

    start = (page - 1) * page_size
    return {
        "total": len(items),
        "page": page,
        "page_size": page_size,
        "items": items[start:start + page_size],
    }
|
|
|
|
|
|
def delete_document(doc_id: str) -> tuple[bool, int, int]:
    """Delete a document, its KG contributions, its upload file and its jobs.

    The steps run in order with no rollback: knowledge-graph removal, upload
    file removal, job removal, then removal from the document index.

    Returns:
        ``(ok, removed_nodes, removed_edges)``. ``(False, 0, 0)`` when no
        record exists for ``doc_id``; otherwise ``True`` plus the counts
        reported by ``fs.remove_doc_from_kg``.
    """
    doc = fs.get_doc(doc_id)
    if not doc:
        return False, 0, 0

    # Remove from KG
    removed_nodes, removed_edges = fs.remove_doc_from_kg(doc_id)

    # Remove upload file. A record without "upload_filename" must be skipped:
    # joining an empty name yields UPLOADS_DIR itself, and unlink() on a
    # directory raises, aborting the delete after the KG was already changed.
    upload_filename = doc.get("upload_filename")
    if upload_filename:
        upload_path = fs.UPLOADS_DIR / upload_filename
        if upload_path.is_file():
            # missing_ok covers the file vanishing between the check and here.
            upload_path.unlink(missing_ok=True)

    # Remove associated jobs
    for meta in fs.list_all_jobs():
        if meta.get("doc_id") == doc_id:
            fs.delete_job(meta["job_id"])

    # Remove from index
    index = fs.load_docs_index()
    index.pop(doc_id, None)
    fs.save_docs_index(index)

    return True, removed_nodes, removed_edges
|
|
|
|
|
|
def update_doc_status(doc_id: str, status: str, pages: int | None = None) -> None:
    """Set a document's status, and its page count when one is supplied.

    Does nothing when ``doc_id`` is not present in the document index.
    """
    docs = fs.load_docs_index()
    # NOTE(review): the index is written back only when the document exists;
    # this assumes the save call belonged inside the membership check -- confirm.
    if doc_id not in docs:
        return

    entry = docs[doc_id]
    changes = {"status": status}
    if pages is not None:
        changes["pages"] = pages
    entry.update(changes)

    fs.save_docs_index(docs)
|