{ "pdf_info": [ { "preproc_blocks": [ { "type": "title", "bbox": [ 205, 148, 390, 172 ], "lines": [ { "bbox": [ 203, 144, 393, 177 ], "spans": [ { "bbox": [ 203, 144, 393, 177 ], "score": 1.0, "content": "GraphRAG System", "type": "text" } ], "index": 0 } ], "index": 0 }, { "type": "text", "bbox": [ 172, 197, 423, 214 ], "lines": [ { "bbox": [ 171, 195, 424, 216 ], "spans": [ { "bbox": [ 171, 195, 424, 216 ], "score": 1.0, "content": "Technical Architecture Overview", "type": "text" } ], "index": 1 } ], "index": 1 }, { "type": "text", "bbox": [ 217, 229, 377, 244 ], "lines": [ { "bbox": [ 216, 227, 378, 246 ], "spans": [ { "bbox": [ 216, 227, 378, 246 ], "score": 1.0, "content": "Version 1.0 | March 2026", "type": "text" } ], "index": 2 } ], "index": 2 } ], "page_idx": 0, "page_size": [ 595, 841 ], "discarded_blocks": [], "para_blocks": [ { "type": "title", "bbox": [ 205, 148, 390, 172 ], "lines": [ { "bbox": [ 203, 144, 393, 177 ], "spans": [ { "bbox": [ 203, 144, 393, 177 ], "score": 1.0, "content": "GraphRAG System", "type": "text" } ], "index": 0 } ], "index": 0 }, { "type": "text", "bbox": [ 172, 197, 423, 214 ], "lines": [ { "bbox": [ 171, 195, 424, 216 ], "spans": [ { "bbox": [ 171, 195, 424, 216 ], "score": 1.0, "content": "Technical Architecture Overview", "type": "text" } ], "index": 1 } ], "index": 1, "bbox_fs": [ 171, 195, 424, 216 ] }, { "type": "text", "bbox": [ 217, 229, 377, 244 ], "lines": [ { "bbox": [ 216, 227, 378, 246 ], "spans": [ { "bbox": [ 216, 227, 378, 246 ], "score": 1.0, "content": "Version 1.0 | March 2026", "type": "text" } ], "index": 2 } ], "index": 2, "bbox_fs": [ 216, 227, 378, 246 ] } ] }, { "preproc_blocks": [ { "type": "title", "bbox": [ 31, 36, 119, 52 ], "lines": [ { "bbox": [ 27, 34, 121, 54 ], "spans": [ { "bbox": [ 27, 34, 121, 54 ], "score": 1.0, "content": "1. 
Abstract", "type": "text" } ], "index": 0 } ], "index": 0 }, { "type": "text", "bbox": [ 29, 70, 566, 144 ], "lines": [ { "bbox": [ 29, 69, 565, 84 ], "spans": [ { "bbox": [ 29, 69, 565, 84 ], "score": 1.0, "content": "This document presents the technical architecture of a Multimodal GraphRAG System designed for", "type": "text" } ], "index": 1 }, { "bbox": [ 30, 89, 565, 104 ], "spans": [ { "bbox": [ 30, 89, 565, 104 ], "score": 1.0, "content": "intelligent document parsing and knowledge graph construction. The system integrates MinerU for", "type": "text" } ], "index": 2 }, { "bbox": [ 28, 110, 565, 125 ], "spans": [ { "bbox": [ 28, 110, 565, 125 ], "score": 1.0, "content": "document parsing, LangExtract for structured entity extraction, and a graph database for", "type": "text" } ], "index": 3 }, { "bbox": [ 28, 129, 207, 145 ], "spans": [ { "bbox": [ 28, 129, 207, 145 ], "score": 1.0, "content": "knowledge storage and retrieval.", "type": "text" } ], "index": 4 } ], "index": 2.5 }, { "type": "text", "bbox": [ 29, 169, 565, 223 ], "lines": [ { "bbox": [ 27, 167, 565, 185 ], "spans": [ { "bbox": [ 27, 167, 565, 185 ], "score": 1.0, "content": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files.", "type": "text" } ], "index": 5 }, { "bbox": [ 28, 188, 567, 205 ], "spans": [ { "bbox": [ 28, 188, 567, 205 ], "score": 1.0, "content": "Extracted entities and relations are stored as graph nodes and edges, enabling semantic search", "type": "text" } ], "index": 6 }, { "bbox": [ 28, 209, 331, 224 ], "spans": [ { "bbox": [ 28, 209, 331, 224 ], "score": 1.0, "content": "and question answering over large document collections.", "type": "text" } ], "index": 7 } ], "index": 6 }, { "type": "title", "bbox": [ 30, 252, 191, 268 ], "lines": [ { "bbox": [ 27, 249, 193, 271 ], "spans": [ { "bbox": [ 27, 249, 193, 271 ], "score": 1.0, "content": "2. 
System Components", "type": "text" } ], "index": 8 } ], "index": 8 }, { "type": "title", "bbox": [ 30, 289, 208, 304 ], "lines": [ { "bbox": [ 28, 288, 208, 306 ], "spans": [ { "bbox": [ 28, 288, 208, 306 ], "score": 1.0, "content": "2.1 Document Parsing Module", "type": "text" } ], "index": 9 } ], "index": 9 }, { "type": "text", "bbox": [ 29, 314, 566, 367 ], "lines": [ { "bbox": [ 28, 313, 563, 329 ], "spans": [ { "bbox": [ 28, 313, 563, 329 ], "score": 1.0, "content": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG,", "type": "text" } ], "index": 10 }, { "bbox": [ 29, 334, 566, 348 ], "spans": [ { "bbox": [ 29, 334, 566, 348 ], "score": 1.0, "content": "JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted", "type": "text" } ], "index": 11 }, { "bbox": [ 28, 352, 69, 370 ], "spans": [ { "bbox": [ 28, 352, 69, 370 ], "score": 1.0, "content": "images.", "type": "text" } ], "index": 12 } ], "index": 11 }, { "type": "title", "bbox": [ 30, 388, 213, 403 ], "lines": [ { "bbox": [ 28, 388, 214, 404 ], "spans": [ { "bbox": [ 28, 388, 214, 404 ], "score": 1.0, "content": "2.2 Entity Extraction Module", "type": "text" } ], "index": 13 } ], "index": 13 }, { "type": "text", "bbox": [ 29, 414, 565, 467 ], "lines": [ { "bbox": [ 28, 412, 567, 428 ], "spans": [ { "bbox": [ 28, 412, 567, 428 ], "score": 1.0, "content": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot", "type": "text" } ], "index": 14 }, { "bbox": [ 28, 432, 565, 448 ], "spans": [ { "bbox": [ 28, 432, 565, 448 ], "score": 1.0, "content": "prompting with LLM backends (Gemini, OpenAI, or local Ollama). 
Each extraction includes", "type": "text" } ], "index": 15 }, { "bbox": [ 27, 451, 223, 469 ], "spans": [ { "bbox": [ 27, 451, 223, 469 ], "score": 1.0, "content": "character-level position anchoring.", "type": "text" } ], "index": 16 } ], "index": 15 }, { "type": "title", "bbox": [ 30, 488, 201, 502 ], "lines": [ { "bbox": [ 28, 487, 201, 504 ], "spans": [ { "bbox": [ 28, 487, 201, 504 ], "score": 1.0, "content": "2.3 Knowledge Graph Module", "type": "text" } ], "index": 17 } ], "index": 17 }, { "type": "text", "bbox": [ 29, 512, 565, 567 ], "lines": [ { "bbox": [ 27, 510, 564, 529 ], "spans": [ { "bbox": [ 27, 510, 564, 529 ], "score": 1.0, "content": "Extracted entities and relationships are stored in a graph database. Node types include: Person,", "type": "text" } ], "index": 18 }, { "bbox": [ 28, 531, 563, 547 ], "spans": [ { "bbox": [ 28, 531, 563, 547 ], "score": 1.0, "content": "Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY,", "type": "text" } ], "index": 19 }, { "bbox": [ 28, 552, 91, 567 ], "spans": [ { "bbox": [ 28, 552, 91, 567 ], "score": 1.0, "content": "LOCATED_IN.", "type": "text" } ], "index": 20 } ], "index": 19 }, { "type": "title", "bbox": [ 30, 587, 162, 602 ], "lines": [ { "bbox": [ 28, 587, 162, 603 ], "spans": [ { "bbox": [ 28, 587, 162, 603 ], "score": 1.0, "content": "2.4 Retrieval Module", "type": "text" } ], "index": 21 } ], "index": 21 }, { "type": "text", "bbox": [ 29, 612, 562, 645 ], "lines": [ { "bbox": [ 28, 610, 563, 627 ], "spans": [ { "bbox": [ 28, 610, 563, 627 ], "score": 1.0, "content": "The retrieval layer supports hybrid search combining vector similarity and graph traversal.", "type": "text" } ], "index": 22 }, { "bbox": [ 28, 631, 519, 646 ], "spans": [ { "bbox": [ 28, 631, 519, 646 ], "score": 1.0, "content": "Query results are ranked by relevance score and returned with source document references.", "type": "text" } ], "index": 23 } ], "index": 22.5 } ], "page_idx": 1, 
"page_size": [ 595, 841 ], "discarded_blocks": [], "para_blocks": [ { "type": "title", "bbox": [ 31, 36, 119, 52 ], "lines": [ { "bbox": [ 27, 34, 121, 54 ], "spans": [ { "bbox": [ 27, 34, 121, 54 ], "score": 1.0, "content": "1. Abstract", "type": "text" } ], "index": 0 } ], "index": 0 }, { "type": "text", "bbox": [ 29, 70, 566, 144 ], "lines": [ { "bbox": [ 29, 69, 565, 84 ], "spans": [ { "bbox": [ 29, 69, 565, 84 ], "score": 1.0, "content": "This document presents the technical architecture of a Multimodal GraphRAG System designed for", "type": "text" } ], "index": 1 }, { "bbox": [ 30, 89, 565, 104 ], "spans": [ { "bbox": [ 30, 89, 565, 104 ], "score": 1.0, "content": "intelligent document parsing and knowledge graph construction. The system integrates MinerU for", "type": "text" } ], "index": 2 }, { "bbox": [ 28, 110, 565, 125 ], "spans": [ { "bbox": [ 28, 110, 565, 125 ], "score": 1.0, "content": "document parsing, LangExtract for structured entity extraction, and a graph database for", "type": "text" } ], "index": 3 }, { "bbox": [ 28, 129, 207, 145 ], "spans": [ { "bbox": [ 28, 129, 207, 145 ], "score": 1.0, "content": "knowledge storage and retrieval.", "type": "text" } ], "index": 4 } ], "index": 2.5, "bbox_fs": [ 28, 69, 565, 145 ] }, { "type": "text", "bbox": [ 29, 169, 565, 223 ], "lines": [ { "bbox": [ 27, 167, 565, 185 ], "spans": [ { "bbox": [ 27, 167, 565, 185 ], "score": 1.0, "content": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files.", "type": "text" } ], "index": 5 }, { "bbox": [ 28, 188, 567, 205 ], "spans": [ { "bbox": [ 28, 188, 567, 205 ], "score": 1.0, "content": "Extracted entities and relations are stored as graph nodes and edges, enabling semantic search", "type": "text" } ], "index": 6 }, { "bbox": [ 28, 209, 331, 224 ], "spans": [ { "bbox": [ 28, 209, 331, 224 ], "score": 1.0, "content": "and question answering over large document collections.", "type": "text" } ], "index": 7 } ], "index": 6, 
"bbox_fs": [ 27, 167, 567, 224 ] }, { "type": "title", "bbox": [ 30, 252, 191, 268 ], "lines": [ { "bbox": [ 27, 249, 193, 271 ], "spans": [ { "bbox": [ 27, 249, 193, 271 ], "score": 1.0, "content": "2. System Components", "type": "text" } ], "index": 8 } ], "index": 8 }, { "type": "title", "bbox": [ 30, 289, 208, 304 ], "lines": [ { "bbox": [ 28, 288, 208, 306 ], "spans": [ { "bbox": [ 28, 288, 208, 306 ], "score": 1.0, "content": "2.1 Document Parsing Module", "type": "text" } ], "index": 9 } ], "index": 9 }, { "type": "text", "bbox": [ 29, 314, 566, 367 ], "lines": [ { "bbox": [ 28, 313, 563, 329 ], "spans": [ { "bbox": [ 28, 313, 563, 329 ], "score": 1.0, "content": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG,", "type": "text" } ], "index": 10 }, { "bbox": [ 29, 334, 566, 348 ], "spans": [ { "bbox": [ 29, 334, 566, 348 ], "score": 1.0, "content": "JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted", "type": "text" } ], "index": 11 }, { "bbox": [ 28, 352, 69, 370 ], "spans": [ { "bbox": [ 28, 352, 69, 370 ], "score": 1.0, "content": "images.", "type": "text" } ], "index": 12 } ], "index": 11, "bbox_fs": [ 28, 313, 566, 370 ] }, { "type": "title", "bbox": [ 30, 388, 213, 403 ], "lines": [ { "bbox": [ 28, 388, 214, 404 ], "spans": [ { "bbox": [ 28, 388, 214, 404 ], "score": 1.0, "content": "2.2 Entity Extraction Module", "type": "text" } ], "index": 13 } ], "index": 13 }, { "type": "text", "bbox": [ 29, 414, 565, 467 ], "lines": [ { "bbox": [ 28, 412, 567, 428 ], "spans": [ { "bbox": [ 28, 412, 567, 428 ], "score": 1.0, "content": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot", "type": "text" } ], "index": 14 }, { "bbox": [ 28, 432, 565, 448 ], "spans": [ { "bbox": [ 28, 432, 565, 448 ], "score": 1.0, "content": "prompting with LLM backends (Gemini, OpenAI, or local Ollama). 
Each extraction includes", "type": "text" } ], "index": 15 }, { "bbox": [ 27, 451, 223, 469 ], "spans": [ { "bbox": [ 27, 451, 223, 469 ], "score": 1.0, "content": "character-level position anchoring.", "type": "text" } ], "index": 16 } ], "index": 15, "bbox_fs": [ 27, 412, 567, 469 ] }, { "type": "title", "bbox": [ 30, 488, 201, 502 ], "lines": [ { "bbox": [ 28, 487, 201, 504 ], "spans": [ { "bbox": [ 28, 487, 201, 504 ], "score": 1.0, "content": "2.3 Knowledge Graph Module", "type": "text" } ], "index": 17 } ], "index": 17 }, { "type": "text", "bbox": [ 29, 512, 565, 567 ], "lines": [ { "bbox": [ 27, 510, 564, 529 ], "spans": [ { "bbox": [ 27, 510, 564, 529 ], "score": 1.0, "content": "Extracted entities and relationships are stored in a graph database. Node types include: Person,", "type": "text" } ], "index": 18 }, { "bbox": [ 28, 531, 563, 547 ], "spans": [ { "bbox": [ 28, 531, 563, 547 ], "score": 1.0, "content": "Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY,", "type": "text" } ], "index": 19 }, { "bbox": [ 28, 552, 91, 567 ], "spans": [ { "bbox": [ 28, 552, 91, 567 ], "score": 1.0, "content": "LOCATED_IN.", "type": "text" } ], "index": 20 } ], "index": 19, "bbox_fs": [ 27, 510, 564, 567 ] }, { "type": "title", "bbox": [ 30, 587, 162, 602 ], "lines": [ { "bbox": [ 28, 587, 162, 603 ], "spans": [ { "bbox": [ 28, 587, 162, 603 ], "score": 1.0, "content": "2.4 Retrieval Module", "type": "text" } ], "index": 21 } ], "index": 21 }, { "type": "list", "bbox": [ 29, 612, 562, 645 ], "lines": [ { "bbox": [ 28, 610, 563, 627 ], "spans": [ { "bbox": [ 28, 610, 563, 627 ], "score": 1.0, "content": "The retrieval layer supports hybrid search combining vector similarity and graph traversal.", "type": "text" } ], "index": 22, "is_list_end_line": true }, { "bbox": [ 28, 631, 519, 646 ], "spans": [ { "bbox": [ 28, 631, 519, 646 ], "score": 1.0, "content": "Query results are ranked by relevance score and returned with source 
document references.", "type": "text" } ], "index": 23, "is_list_start_line": true, "is_list_end_line": true } ], "index": 22.5, "bbox_fs": [ 28, 610, 563, 646 ] } ] }, { "preproc_blocks": [ { "type": "title", "bbox": [ 30, 36, 160, 52 ], "lines": [ { "bbox": [ 27, 34, 162, 54 ], "spans": [ { "bbox": [ 27, 34, 162, 54 ], "score": 1.0, "content": "3. Data Pipeline", "type": "text" } ], "index": 0 } ], "index": 0 }, { "type": "text", "bbox": [ 30, 70, 371, 84 ], "lines": [ { "bbox": [ 28, 68, 373, 86 ], "spans": [ { "bbox": [ 28, 68, 373, 86 ], "score": 1.0, "content": "The end-to-end data pipeline consists of the following stages:", "type": "text" } ], "index": 1 } ], "index": 1 }, { "type": "text", "bbox": [ 41, 110, 192, 123 ], "lines": [ { "bbox": [ 39, 109, 192, 124 ], "spans": [ { "bbox": [ 39, 109, 192, 124 ], "score": 1.0, "content": "Stage 1: Document Ingestion", "type": "text" } ], "index": 2 } ], "index": 2 }, { "type": "text", "bbox": [ 51, 129, 316, 183 ], "lines": [ { "bbox": [ 51, 129, 317, 145 ], "spans": [ { "bbox": [ 51, 129, 317, 145 ], "score": 1.0, "content": "- Accept raw documents (PDF, DOCX, images, HTML)", "type": "text" } ], "index": 3 }, { "bbox": [ 50, 149, 242, 165 ], "spans": [ { "bbox": [ 50, 149, 242, 165 ], "score": 1.0, "content": "- Submit to MinerU API for parsing", "type": "text" } ], "index": 4 }, { "bbox": [ 51, 169, 258, 184 ], "spans": [ { "bbox": [ 51, 169, 220, 184 ], "score": 1.0, "content": "- Poll task status until state", "type": "text" }, { "bbox": [ 221, 171, 231, 181 ], "score": 0.76, "content": "==", "type": "inline_equation" }, { "bbox": [ 231, 169, 258, 184 ], "score": 1.0, "content": "done", "type": "text" } ], "index": 5 } ], "index": 4 }, { "type": "text", "bbox": [ 41, 210, 192, 222 ], "lines": [ { "bbox": [ 40, 209, 192, 223 ], "spans": [ { "bbox": [ 40, 209, 192, 223 ], "score": 1.0, "content": "Stage 2: Content Extraction", "type": "text" } ], 
"index": 6 } ], "index": 6 }, { "type": "text", "bbox": [ 51, 229, 323, 282 ], "lines": [ { "bbox": [ 51, 229, 262, 243 ], "spans": [ { "bbox": [ 51, 229, 262, 243 ], "score": 1.0, "content": "- Download and decompress full_zip_url", "type": "text" } ], "index": 7 }, { "bbox": [ 50, 248, 313, 263 ], "spans": [ { "bbox": [ 50, 248, 313, 263 ], "score": 1.0, "content": "- Parse content_list.json into Document objects", "type": "text" } ], "index": 8 }, { "bbox": [ 51, 269, 323, 284 ], "spans": [ { "bbox": [ 51, 269, 323, 284 ], "score": 1.0, "content": "- Separate text blocks, tables, images, equations", "type": "text" } ], "index": 9 } ], "index": 8 }, { "type": "text", "bbox": [ 40, 309, 247, 321 ], "lines": [ { "bbox": [ 40, 308, 247, 322 ], "spans": [ { "bbox": [ 40, 308, 247, 322 ], "score": 1.0, "content": "Stage 3: Entity & Relation Extraction", "type": "text" } ], "index": 10 } ], "index": 10 }, { "type": "text", "bbox": [ 51, 328, 313, 382 ], "lines": [ { "bbox": [ 51, 327, 236, 342 ], "spans": [ { "bbox": [ 51, 327, 236, 342 ], "score": 1.0, "content": "- Feed text blocks to LangExtract", "type": "text" } ], "index": 11 }, { "bbox": [ 50, 348, 312, 362 ], "spans": [ { "bbox": [ 50, 348, 312, 362 ], "score": 1.0, "content": "- Extract entities with char_interval positions", "type": "text" } ], "index": 12 }, { "bbox": [ 51, 368, 274, 382 ], "spans": [ { "bbox": [ 51, 368, 274, 382 ], "score": 1.0, "content": "- Extract relationships between entities", "type": "text" } ], "index": 13 } ], "index": 12 }, { "type": "text", "bbox": [ 41, 408, 192, 421 ], "lines": [ { "bbox": [ 40, 408, 192, 422 ], "spans": [ { "bbox": [ 40, 408, 192, 422 ], "score": 1.0, "content": "Stage 4: Graph Construction", "type": "text" } ], "index": 14 } ], "index": 14 }, { "type": "text", "bbox": [ 51, 428, 311, 481 ], "lines": [ { "bbox": [ 50, 426, 285, 443 ], "spans": [ { "bbox": [ 50, 426, 285, 443 ], "score": 1.0, "content": "- Map extractions to graph nodes and edges", "type": 
"text" } ], "index": 15 }, { "bbox": [ 51, 447, 311, 462 ], "spans": [ { "bbox": [ 51, 447, 311, 462 ], "score": 1.0, "content": "- Store with source provenance (page_idx, bbox)", "type": "text" } ], "index": 16 }, { "bbox": [ 51, 467, 302, 481 ], "spans": [ { "bbox": [ 51, 467, 302, 481 ], "score": 1.0, "content": "- Build vector embeddings for semantic search", "type": "text" } ], "index": 17 } ], "index": 16 }, { "type": "title", "bbox": [ 30, 508, 194, 522 ], "lines": [ { "bbox": [ 28, 507, 195, 524 ], "spans": [ { "bbox": [ 28, 507, 195, 524 ], "score": 1.0, "content": "4. Supported File Formats", "type": "text" } ], "index": 18 } ], "index": 18 }, { "type": "table", "bbox": [ 27, 534, 525, 678 ], "blocks": [ { "type": "table_body", "bbox": [ 27, 534, 525, 678 ], "group_id": 0, "lines": [ { "bbox": [ 27, 534, 525, 678 ], "spans": [ { "bbox": [ 27, 534, 525, 678 ], "score": 0.985, "html": "
<table><tr><td>Format</td><td>Extension</td><td>OCR Required</td><td>Model</td></tr>
<tr><td>PDF (text)</td><td>.pdf</td><td>No</td><td>pipeline / vlm</td></tr>
<tr><td>PDF (scan)</td><td>.pdf</td><td>Yes</td><td>vlm</td></tr>
<tr><td>Word</td><td>.docx</td><td>No</td><td>pipeline</td></tr>
<tr><td>PowerPoint</td><td>.pptx</td><td>No</td><td>pipeline</td></tr>
<tr><td>Image</td><td>.png / .jpg</td><td>Auto</td><td>vlm</td></tr>
<tr><td>HTML</td><td>.html</td><td>No</td><td>MinerU-HTML</td></tr></table>
", "type": "table", "image_path": "1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg" } ] } ], "index": 20, "virtual_lines": [ { "bbox": [ 27, 534, 525, 582.0 ], "spans": [], "index": 19 }, { "bbox": [ 27, 582.0, 525, 630.0 ], "spans": [], "index": 20 }, { "bbox": [ 27, 630.0, 525, 678.0 ], "spans": [], "index": 21 } ] } ], "index": 20 } ], "page_idx": 2, "page_size": [ 595, 841 ], "discarded_blocks": [], "para_blocks": [ { "type": "title", "bbox": [ 30, 36, 160, 52 ], "lines": [ { "bbox": [ 27, 34, 162, 54 ], "spans": [ { "bbox": [ 27, 34, 162, 54 ], "score": 1.0, "content": "3. Data Pipeline", "type": "text" } ], "index": 0 } ], "index": 0 }, { "type": "text", "bbox": [ 30, 70, 371, 84 ], "lines": [ { "bbox": [ 28, 68, 373, 86 ], "spans": [ { "bbox": [ 28, 68, 373, 86 ], "score": 1.0, "content": "The end-to-end data pipeline consists of the following stages:", "type": "text" } ], "index": 1 } ], "index": 1, "bbox_fs": [ 28, 68, 373, 86 ] }, { "type": "text", "bbox": [ 41, 110, 192, 123 ], "lines": [ { "bbox": [ 39, 109, 192, 124 ], "spans": [ { "bbox": [ 39, 109, 192, 124 ], "score": 1.0, "content": "Stage 1: Document Ingestion", "type": "text" } ], "index": 2 } ], "index": 2, "bbox_fs": [ 39, 109, 192, 124 ] }, { "type": "text", "bbox": [ 51, 129, 316, 183 ], "lines": [ { "bbox": [ 51, 129, 317, 145 ], "spans": [ { "bbox": [ 51, 129, 317, 145 ], "score": 1.0, "content": "- Accept raw documents (PDF, DOCX, images, HTML)", "type": "text" } ], "index": 3 }, { "bbox": [ 50, 149, 242, 165 ], "spans": [ { "bbox": [ 50, 149, 242, 165 ], "score": 1.0, "content": "- Submit to MinerU API for parsing", "type": "text" } ], "index": 4 }, { "bbox": [ 51, 169, 258, 184 ], "spans": [ { "bbox": [ 51, 169, 220, 184 ], "score": 1.0, "content": "- Poll task status until state", "type": "text" }, { "bbox": [ 221, 171, 231, 181 ], "score": 0.76, "content": "==", "type": "inline_equation" }, { "bbox": [ 
231, 169, 258, 184 ], "score": 1.0, "content": "done", "type": "text" } ], "index": 5 } ], "index": 4, "bbox_fs": [ 50, 129, 317, 184 ] }, { "type": "text", "bbox": [ 41, 210, 192, 222 ], "lines": [ { "bbox": [ 40, 209, 192, 223 ], "spans": [ { "bbox": [ 40, 209, 192, 223 ], "score": 1.0, "content": "Stage 2: Content Extraction", "type": "text" } ], "index": 6 } ], "index": 6, "bbox_fs": [ 40, 209, 192, 223 ] }, { "type": "text", "bbox": [ 51, 229, 323, 282 ], "lines": [ { "bbox": [ 51, 229, 262, 243 ], "spans": [ { "bbox": [ 51, 229, 262, 243 ], "score": 1.0, "content": "- Download and decompress full_zip_url", "type": "text" } ], "index": 7 }, { "bbox": [ 50, 248, 313, 263 ], "spans": [ { "bbox": [ 50, 248, 313, 263 ], "score": 1.0, "content": "- Parse content_list.json into Document objects", "type": "text" } ], "index": 8 }, { "bbox": [ 51, 269, 323, 284 ], "spans": [ { "bbox": [ 51, 269, 323, 284 ], "score": 1.0, "content": "- Separate text blocks, tables, images, equations", "type": "text" } ], "index": 9 } ], "index": 8, "bbox_fs": [ 50, 229, 323, 284 ] }, { "type": "text", "bbox": [ 40, 309, 247, 321 ], "lines": [ { "bbox": [ 40, 308, 247, 322 ], "spans": [ { "bbox": [ 40, 308, 247, 322 ], "score": 1.0, "content": "Stage 3: Entity & Relation Extraction", "type": "text" } ], "index": 10 } ], "index": 10, "bbox_fs": [ 40, 308, 247, 322 ] }, { "type": "text", "bbox": [ 51, 328, 313, 382 ], "lines": [ { "bbox": [ 51, 327, 236, 342 ], "spans": [ { "bbox": [ 51, 327, 236, 342 ], "score": 1.0, "content": "- Feed text blocks to LangExtract", "type": "text" } ], "index": 11 }, { "bbox": [ 50, 348, 312, 362 ], "spans": [ { "bbox": [ 50, 348, 312, 362 ], "score": 1.0, "content": "- Extract entities with char_interval positions", "type": "text" } ], "index": 12 }, { "bbox": [ 51, 368, 274, 382 ], "spans": [ { "bbox": [ 51, 368, 274, 382 ], "score": 1.0, "content": "- Extract relationships between entities", "type": "text" } ], "index": 13 } ], "index": 12, "bbox_fs": [ 
50, 327, 312, 382 ] }, { "type": "text", "bbox": [ 41, 408, 192, 421 ], "lines": [ { "bbox": [ 40, 408, 192, 422 ], "spans": [ { "bbox": [ 40, 408, 192, 422 ], "score": 1.0, "content": "Stage 4: Graph Construction", "type": "text" } ], "index": 14 } ], "index": 14, "bbox_fs": [ 40, 408, 192, 422 ] }, { "type": "text", "bbox": [ 51, 428, 311, 481 ], "lines": [ { "bbox": [ 50, 426, 285, 443 ], "spans": [ { "bbox": [ 50, 426, 285, 443 ], "score": 1.0, "content": "- Map extractions to graph nodes and edges", "type": "text" } ], "index": 15 }, { "bbox": [ 51, 447, 311, 462 ], "spans": [ { "bbox": [ 51, 447, 311, 462 ], "score": 1.0, "content": "- Store with source provenance (page_idx, bbox)", "type": "text" } ], "index": 16 }, { "bbox": [ 51, 467, 302, 481 ], "spans": [ { "bbox": [ 51, 467, 302, 481 ], "score": 1.0, "content": "- Build vector embeddings for semantic search", "type": "text" } ], "index": 17 } ], "index": 16, "bbox_fs": [ 50, 426, 311, 481 ] }, { "type": "title", "bbox": [ 30, 508, 194, 522 ], "lines": [ { "bbox": [ 28, 507, 195, 524 ], "spans": [ { "bbox": [ 28, 507, 195, 524 ], "score": 1.0, "content": "4. Supported File Formats", "type": "text" } ], "index": 18 } ], "index": 18 }, { "type": "table", "bbox": [ 27, 534, 525, 678 ], "blocks": [ { "type": "table_body", "bbox": [ 27, 534, 525, 678 ], "group_id": 0, "lines": [ { "bbox": [ 27, 534, 525, 678 ], "spans": [ { "bbox": [ 27, 534, 525, 678 ], "score": 0.985, "html": "
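<table><tr><td>Format</td><td>Extension</td><td>OCR Required</td><td>Model</td></tr>
<tr><td>PDF (text)</td><td>.pdf</td><td>No</td><td>pipeline / vlm</td></tr>
<tr><td>PDF (scan)</td><td>.pdf</td><td>Yes</td><td>vlm</td></tr>
<tr><td>Word</td><td>.docx</td><td>No</td><td>pipeline</td></tr>
<tr><td>PowerPoint</td><td>.pptx</td><td>No</td><td>pipeline</td></tr>
<tr><td>Image</td><td>.png / .jpg</td><td>Auto</td><td>vlm</td></tr>
<tr><td>HTML</td><td>.html</td><td>No</td><td>MinerU-HTML</td></tr></table>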
", "type": "table", "image_path": "1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg" } ] } ], "index": 20, "virtual_lines": [ { "bbox": [ 27, 534, 525, 582.0 ], "spans": [], "index": 19 }, { "bbox": [ 27, 582.0, 525, 630.0 ], "spans": [], "index": 20 }, { "bbox": [ 27, 630.0, 525, 678.0 ], "spans": [], "index": 21 } ] } ], "index": 20 } ] }, { "preproc_blocks": [ { "type": "title", "bbox": [ 29, 36, 272, 53 ], "lines": [ { "bbox": [ 27, 33, 274, 55 ], "spans": [ { "bbox": [ 27, 33, 274, 55 ], "score": 1.0, "content": "5. API Configuration Reference", "type": "text" } ], "index": 0 } ], "index": 0 }, { "type": "text", "bbox": [ 29, 70, 567, 104 ], "lines": [ { "bbox": [ 27, 67, 567, 87 ], "spans": [ { "bbox": [ 27, 67, 567, 87 ], "score": 1.0, "content": "The following environment variables must be configured before running the MinerU parsing", "type": "text" } ], "index": 1 }, { "bbox": [ 27, 90, 77, 105 ], "spans": [ { "bbox": [ 27, 90, 77, 105 ], "score": 1.0, "content": "service:", "type": "text" } ], "index": 2 } ], "index": 1.5 }, { "type": "text", "bbox": [ 39, 128, 379, 284 ], "lines": [ { "bbox": [ 39, 129, 362, 145 ], "spans": [ { "bbox": [ 39, 130, 132, 144 ], "score": 1.0, "content": "MINERU_API_TOKEN", "type": "text" }, { "bbox": [ 155, 129, 362, 145 ], "score": 1.0, "content": ": Bearer token for API authentication", "type": "text" } ], "index": 3 }, { "bbox": [ 39, 149, 335, 165 ], "spans": [ { "bbox": [ 39, 149, 126, 165 ], "score": 1.0, "content": "MINERU_USER_UID", "type": "text" }, { "bbox": [ 156, 149, 335, 164 ], "score": 1.0, "content": ": User UUID for quota management", "type": "text" } ], "index": 4 }, { "bbox": [ 39, 170, 307, 183 ], "spans": [ { "bbox": [ 39, 170, 126, 183 ], "score": 1.0, "content": "MINERU_BASE_URL", "type": "text" }, { "bbox": [ 156, 170, 307, 183 ], "score": 1.0, "content": ": https://mineru.net/api/v4", "type": "text" } ], "index": 5 }, { "bbox": [ 39, 189, 379, 204 ], "spans": [ { "bbox": [ 39, 
189, 379, 204 ], "score": 1.0, "content": "MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML", "type": "text" } ], "index": 6 }, { "bbox": [ 40, 210, 316, 223 ], "spans": [ { "bbox": [ 40, 210, 126, 223 ], "score": 1.0, "content": "MINERU_LANGUAGE", "type": "text" }, { "bbox": [ 156, 210, 316, 223 ], "score": 1.0, "content": ": ch (Chinese) | en (English)", "type": "text" } ], "index": 7 }, { "bbox": [ 39, 229, 371, 244 ], "spans": [ { "bbox": [ 39, 230, 115, 244 ], "score": 1.0, "content": "MINERU_IS_OCR", "type": "text" }, { "bbox": [ 156, 229, 371, 243 ], "score": 1.0, "content": ": false (text PDF) | true (scanned PDF)", "type": "text" } ], "index": 8 }, { "bbox": [ 39, 249, 236, 263 ], "spans": [ { "bbox": [ 39, 249, 236, 263 ], "score": 1.0, "content": "MINERU_ENABLE_FORMULA: true | false", "type": "text" } ], "index": 9 }, { "bbox": [ 39, 269, 236, 282 ], "spans": [ { "bbox": [ 39, 269, 236, 282 ], "score": 1.0, "content": "MINERU_ENABLE_TABLE : true | false", "type": "text" } ], "index": 10 } ], "index": 6.5 }, { "type": "text", "bbox": [ 29, 309, 96, 321 ], "lines": [ { "bbox": [ 28, 308, 97, 322 ], "spans": [ { "bbox": [ 28, 308, 97, 322 ], "score": 1.0, "content": "Rate Limits:", "type": "text" } ], "index": 11 } ], "index": 11 }, { "type": "text", "bbox": [ 39, 327, 300, 402 ], "lines": [ { "bbox": [ 39, 327, 242, 343 ], "spans": [ { "bbox": [ 39, 327, 126, 342 ], "score": 1.0, "content": "- Max file size", "type": "text" }, { "bbox": [ 139, 327, 242, 343 ], "score": 1.0, "content": ": 200 MB per file", "type": "text" } ], "index": 12 }, { "bbox": [ 39, 347, 258, 364 ], "spans": [ { "bbox": [ 39, 347, 104, 364 ], "score": 1.0, "content": "- Max pages", "type": "text" }, { "bbox": [ 145, 348, 258, 363 ], "score": 1.0, "content": ": 600 pages per file", "type": "text" } ], "index": 13 }, { "bbox": [ 39, 367, 300, 383 ], "spans": [ { "bbox": [ 39, 367, 115, 383 ], "score": 1.0, "content": "- Daily quota", "type": "text" }, { "bbox": [ 145, 368, 
300, 383 ], "score": 1.0, "content": ": 2000 pages (high priority)", "type": "text" } ], "index": 14 }, { "bbox": [ 39, 387, 274, 403 ], "spans": [ { "bbox": [ 39, 387, 116, 402 ], "score": 1.0, "content": "- Batch limit", "type": "text" }, { "bbox": [ 144, 387, 274, 403 ], "score": 1.0, "content": ": 200 files per request", "type": "text" } ], "index": 15 } ], "index": 13.5 } ], "page_idx": 3, "page_size": [ 595, 841 ], "discarded_blocks": [], "para_blocks": [ { "type": "title", "bbox": [ 29, 36, 272, 53 ], "lines": [ { "bbox": [ 27, 33, 274, 55 ], "spans": [ { "bbox": [ 27, 33, 274, 55 ], "score": 1.0, "content": "5. API Configuration Reference", "type": "text" } ], "index": 0 } ], "index": 0 }, { "type": "text", "bbox": [ 29, 70, 567, 104 ], "lines": [ { "bbox": [ 27, 67, 567, 87 ], "spans": [ { "bbox": [ 27, 67, 567, 87 ], "score": 1.0, "content": "The following environment variables must be configured before running the MinerU parsing", "type": "text" } ], "index": 1 }, { "bbox": [ 27, 90, 77, 105 ], "spans": [ { "bbox": [ 27, 90, 77, 105 ], "score": 1.0, "content": "service:", "type": "text" } ], "index": 2 } ], "index": 1.5, "bbox_fs": [ 27, 67, 567, 105 ] }, { "type": "list", "bbox": [ 39, 128, 379, 284 ], "lines": [ { "bbox": [ 39, 129, 362, 145 ], "spans": [ { "bbox": [ 39, 130, 132, 144 ], "score": 1.0, "content": "MINERU_API_TOKEN", "type": "text" }, { "bbox": [ 155, 129, 362, 145 ], "score": 1.0, "content": ": Bearer token for API authentication", "type": "text" } ], "index": 3, "is_list_start_line": true }, { "bbox": [ 39, 149, 335, 165 ], "spans": [ { "bbox": [ 39, 149, 126, 165 ], "score": 1.0, "content": "MINERU_USER_UID", "type": "text" }, { "bbox": [ 156, 149, 335, 164 ], "score": 1.0, "content": ": User UUID for quota management", "type": "text" } ], "index": 4, "is_list_start_line": true }, { "bbox": [ 39, 170, 307, 183 ], "spans": [ { "bbox": [ 39, 170, 126, 183 ], "score": 1.0, "content": "MINERU_BASE_URL", "type": "text" }, { "bbox": [ 156, 
170, 307, 183 ], "score": 1.0, "content": ": https://mineru.net/api/v4", "type": "text" } ], "index": 5, "is_list_start_line": true }, { "bbox": [ 39, 189, 379, 204 ], "spans": [ { "bbox": [ 39, 189, 379, 204 ], "score": 1.0, "content": "MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML", "type": "text" } ], "index": 6, "is_list_start_line": true }, { "bbox": [ 40, 210, 316, 223 ], "spans": [ { "bbox": [ 40, 210, 126, 223 ], "score": 1.0, "content": "MINERU_LANGUAGE", "type": "text" }, { "bbox": [ 156, 210, 316, 223 ], "score": 1.0, "content": ": ch (Chinese) | en (English)", "type": "text" } ], "index": 7, "is_list_start_line": true }, { "bbox": [ 39, 229, 371, 244 ], "spans": [ { "bbox": [ 39, 230, 115, 244 ], "score": 1.0, "content": "MINERU_IS_OCR", "type": "text" }, { "bbox": [ 156, 229, 371, 243 ], "score": 1.0, "content": ": false (text PDF) | true (scanned PDF)", "type": "text" } ], "index": 8, "is_list_start_line": true }, { "bbox": [ 39, 249, 236, 263 ], "spans": [ { "bbox": [ 39, 249, 236, 263 ], "score": 1.0, "content": "MINERU_ENABLE_FORMULA: true | false", "type": "text" } ], "index": 9, "is_list_start_line": true }, { "bbox": [ 39, 269, 236, 282 ], "spans": [ { "bbox": [ 39, 269, 236, 282 ], "score": 1.0, "content": "MINERU_ENABLE_TABLE : true | false", "type": "text" } ], "index": 10, "is_list_start_line": true } ], "index": 6.5, "bbox_fs": [ 39, 129, 379, 282 ] }, { "type": "text", "bbox": [ 29, 309, 96, 321 ], "lines": [ { "bbox": [ 28, 308, 97, 322 ], "spans": [ { "bbox": [ 28, 308, 97, 322 ], "score": 1.0, "content": "Rate Limits:", "type": "text" } ], "index": 11 } ], "index": 11, "bbox_fs": [ 28, 308, 97, 322 ] }, { "type": "text", "bbox": [ 39, 327, 300, 402 ], "lines": [ { "bbox": [ 39, 327, 242, 343 ], "spans": [ { "bbox": [ 39, 327, 126, 342 ], "score": 1.0, "content": "- Max file size", "type": "text" }, { "bbox": [ 139, 327, 242, 343 ], "score": 1.0, "content": ": 200 MB per file", "type": "text" } ], "index": 12 }, { "bbox": 
[ 39, 347, 258, 364 ], "spans": [ { "bbox": [ 39, 347, 104, 364 ], "score": 1.0, "content": "- Max pages", "type": "text" }, { "bbox": [ 145, 348, 258, 363 ], "score": 1.0, "content": ": 600 pages per file", "type": "text" } ], "index": 13 }, { "bbox": [ 39, 367, 300, 383 ], "spans": [ { "bbox": [ 39, 367, 115, 383 ], "score": 1.0, "content": "- Daily quota", "type": "text" }, { "bbox": [ 145, 368, 300, 383 ], "score": 1.0, "content": ": 2000 pages (high priority)", "type": "text" } ], "index": 14 }, { "bbox": [ 39, 387, 274, 403 ], "spans": [ { "bbox": [ 39, 387, 116, 402 ], "score": 1.0, "content": "- Batch limit", "type": "text" }, { "bbox": [ 144, 387, 274, 403 ], "score": 1.0, "content": ": 200 files per request", "type": "text" } ], "index": 15 } ], "index": 13.5, "bbox_fs": [ 39, 327, 300, 403 ] } ] } ], "_backend": "pipeline", "_version_name": "2.7.6" }