{
"pdf_info": [
{
"preproc_blocks": [
{
"type": "title",
"bbox": [
205,
148,
390,
172
],
"lines": [
{
"bbox": [
203,
144,
393,
177
],
"spans": [
{
"bbox": [
203,
144,
393,
177
],
"score": 1.0,
"content": "GraphRAG System",
"type": "text"
}
],
"index": 0
}
],
"index": 0
},
{
"type": "text",
"bbox": [
172,
197,
423,
214
],
"lines": [
{
"bbox": [
171,
195,
424,
216
],
"spans": [
{
"bbox": [
171,
195,
424,
216
],
"score": 1.0,
"content": "Technical Architecture Overview",
"type": "text"
}
],
"index": 1
}
],
"index": 1
},
{
"type": "text",
"bbox": [
217,
229,
377,
244
],
"lines": [
{
"bbox": [
216,
227,
378,
246
],
"spans": [
{
"bbox": [
216,
227,
378,
246
],
"score": 1.0,
"content": "Version 1.0 | March 2026",
"type": "text"
}
],
"index": 2
}
],
"index": 2
}
],
"page_idx": 0,
"page_size": [
595,
841
],
"discarded_blocks": [],
"para_blocks": [
{
"type": "title",
"bbox": [
205,
148,
390,
172
],
"lines": [
{
"bbox": [
203,
144,
393,
177
],
"spans": [
{
"bbox": [
203,
144,
393,
177
],
"score": 1.0,
"content": "GraphRAG System",
"type": "text"
}
],
"index": 0
}
],
"index": 0
},
{
"type": "text",
"bbox": [
172,
197,
423,
214
],
"lines": [
{
"bbox": [
171,
195,
424,
216
],
"spans": [
{
"bbox": [
171,
195,
424,
216
],
"score": 1.0,
"content": "Technical Architecture Overview",
"type": "text"
}
],
"index": 1
}
],
"index": 1,
"bbox_fs": [
171,
195,
424,
216
]
},
{
"type": "text",
"bbox": [
217,
229,
377,
244
],
"lines": [
{
"bbox": [
216,
227,
378,
246
],
"spans": [
{
"bbox": [
216,
227,
378,
246
],
"score": 1.0,
"content": "Version 1.0 | March 2026",
"type": "text"
}
],
"index": 2
}
],
"index": 2,
"bbox_fs": [
216,
227,
378,
246
]
}
]
},
{
"preproc_blocks": [
{
"type": "title",
"bbox": [
31,
36,
119,
52
],
"lines": [
{
"bbox": [
27,
34,
121,
54
],
"spans": [
{
"bbox": [
27,
34,
121,
54
],
"score": 1.0,
"content": "1. Abstract",
"type": "text"
}
],
"index": 0
}
],
"index": 0
},
{
"type": "text",
"bbox": [
29,
70,
566,
144
],
"lines": [
{
"bbox": [
29,
69,
565,
84
],
"spans": [
{
"bbox": [
29,
69,
565,
84
],
"score": 1.0,
"content": "This document presents the technical architecture of a Multimodal GraphRAG System designed for",
"type": "text"
}
],
"index": 1
},
{
"bbox": [
30,
89,
565,
104
],
"spans": [
{
"bbox": [
30,
89,
565,
104
],
"score": 1.0,
"content": "intelligent document parsing and knowledge graph construction. The system integrates MinerU for",
"type": "text"
}
],
"index": 2
},
{
"bbox": [
28,
110,
565,
125
],
"spans": [
{
"bbox": [
28,
110,
565,
125
],
"score": 1.0,
"content": "document parsing, LangExtract for structured entity extraction, and a graph database for",
"type": "text"
}
],
"index": 3
},
{
"bbox": [
28,
129,
207,
145
],
"spans": [
{
"bbox": [
28,
129,
207,
145
],
"score": 1.0,
"content": "knowledge storage and retrieval.",
"type": "text"
}
],
"index": 4
}
],
"index": 2.5
},
{
"type": "text",
"bbox": [
29,
169,
565,
223
],
"lines": [
{
"bbox": [
27,
167,
565,
185
],
"spans": [
{
"bbox": [
27,
167,
565,
185
],
"score": 1.0,
"content": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files.",
"type": "text"
}
],
"index": 5
},
{
"bbox": [
28,
188,
567,
205
],
"spans": [
{
"bbox": [
28,
188,
567,
205
],
"score": 1.0,
"content": "Extracted entities and relations are stored as graph nodes and edges, enabling semantic search",
"type": "text"
}
],
"index": 6
},
{
"bbox": [
28,
209,
331,
224
],
"spans": [
{
"bbox": [
28,
209,
331,
224
],
"score": 1.0,
"content": "and question answering over large document collections.",
"type": "text"
}
],
"index": 7
}
],
"index": 6
},
{
"type": "title",
"bbox": [
30,
252,
191,
268
],
"lines": [
{
"bbox": [
27,
249,
193,
271
],
"spans": [
{
"bbox": [
27,
249,
193,
271
],
"score": 1.0,
"content": "2. System Components",
"type": "text"
}
],
"index": 8
}
],
"index": 8
},
{
"type": "title",
"bbox": [
30,
289,
208,
304
],
"lines": [
{
"bbox": [
28,
288,
208,
306
],
"spans": [
{
"bbox": [
28,
288,
208,
306
],
"score": 1.0,
"content": "2.1 Document Parsing Module",
"type": "text"
}
],
"index": 9
}
],
"index": 9
},
{
"type": "text",
"bbox": [
29,
314,
566,
367
],
"lines": [
{
"bbox": [
28,
313,
563,
329
],
"spans": [
{
"bbox": [
28,
313,
563,
329
],
"score": 1.0,
"content": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG,",
"type": "text"
}
],
"index": 10
},
{
"bbox": [
29,
334,
566,
348
],
"spans": [
{
"bbox": [
29,
334,
566,
348
],
"score": 1.0,
"content": "JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted",
"type": "text"
}
],
"index": 11
},
{
"bbox": [
28,
352,
69,
370
],
"spans": [
{
"bbox": [
28,
352,
69,
370
],
"score": 1.0,
"content": "images.",
"type": "text"
}
],
"index": 12
}
],
"index": 11
},
{
"type": "title",
"bbox": [
30,
388,
213,
403
],
"lines": [
{
"bbox": [
28,
388,
214,
404
],
"spans": [
{
"bbox": [
28,
388,
214,
404
],
"score": 1.0,
"content": "2.2 Entity Extraction Module",
"type": "text"
}
],
"index": 13
}
],
"index": 13
},
{
"type": "text",
"bbox": [
29,
414,
565,
467
],
"lines": [
{
"bbox": [
28,
412,
567,
428
],
"spans": [
{
"bbox": [
28,
412,
567,
428
],
"score": 1.0,
"content": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot",
"type": "text"
}
],
"index": 14
},
{
"bbox": [
28,
432,
565,
448
],
"spans": [
{
"bbox": [
28,
432,
565,
448
],
"score": 1.0,
"content": "prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes",
"type": "text"
}
],
"index": 15
},
{
"bbox": [
27,
451,
223,
469
],
"spans": [
{
"bbox": [
27,
451,
223,
469
],
"score": 1.0,
"content": "character-level position anchoring.",
"type": "text"
}
],
"index": 16
}
],
"index": 15
},
{
"type": "title",
"bbox": [
30,
488,
201,
502
],
"lines": [
{
"bbox": [
28,
487,
201,
504
],
"spans": [
{
"bbox": [
28,
487,
201,
504
],
"score": 1.0,
"content": "2.3 Knowledge Graph Module",
"type": "text"
}
],
"index": 17
}
],
"index": 17
},
{
"type": "text",
"bbox": [
29,
512,
565,
567
],
"lines": [
{
"bbox": [
27,
510,
564,
529
],
"spans": [
{
"bbox": [
27,
510,
564,
529
],
"score": 1.0,
"content": "Extracted entities and relationships are stored in a graph database. Node types include: Person,",
"type": "text"
}
],
"index": 18
},
{
"bbox": [
28,
531,
563,
547
],
"spans": [
{
"bbox": [
28,
531,
563,
547
],
"score": 1.0,
"content": "Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY,",
"type": "text"
}
],
"index": 19
},
{
"bbox": [
28,
552,
91,
567
],
"spans": [
{
"bbox": [
28,
552,
91,
567
],
"score": 1.0,
"content": "LOCATED_IN.",
"type": "text"
}
],
"index": 20
}
],
"index": 19
},
{
"type": "title",
"bbox": [
30,
587,
162,
602
],
"lines": [
{
"bbox": [
28,
587,
162,
603
],
"spans": [
{
"bbox": [
28,
587,
162,
603
],
"score": 1.0,
"content": "2.4 Retrieval Module",
"type": "text"
}
],
"index": 21
}
],
"index": 21
},
{
"type": "text",
"bbox": [
29,
612,
562,
645
],
"lines": [
{
"bbox": [
28,
610,
563,
627
],
"spans": [
{
"bbox": [
28,
610,
563,
627
],
"score": 1.0,
"content": "The retrieval layer supports hybrid search combining vector similarity and graph traversal.",
"type": "text"
}
],
"index": 22
},
{
"bbox": [
28,
631,
519,
646
],
"spans": [
{
"bbox": [
28,
631,
519,
646
],
"score": 1.0,
"content": "Query results are ranked by relevance score and returned with source document references.",
"type": "text"
}
],
"index": 23
}
],
"index": 22.5
}
],
"page_idx": 1,
"page_size": [
595,
841
],
"discarded_blocks": [],
"para_blocks": [
{
"type": "title",
"bbox": [
31,
36,
119,
52
],
"lines": [
{
"bbox": [
27,
34,
121,
54
],
"spans": [
{
"bbox": [
27,
34,
121,
54
],
"score": 1.0,
"content": "1. Abstract",
"type": "text"
}
],
"index": 0
}
],
"index": 0
},
{
"type": "text",
"bbox": [
29,
70,
566,
144
],
"lines": [
{
"bbox": [
29,
69,
565,
84
],
"spans": [
{
"bbox": [
29,
69,
565,
84
],
"score": 1.0,
"content": "This document presents the technical architecture of a Multimodal GraphRAG System designed for",
"type": "text"
}
],
"index": 1
},
{
"bbox": [
30,
89,
565,
104
],
"spans": [
{
"bbox": [
30,
89,
565,
104
],
"score": 1.0,
"content": "intelligent document parsing and knowledge graph construction. The system integrates MinerU for",
"type": "text"
}
],
"index": 2
},
{
"bbox": [
28,
110,
565,
125
],
"spans": [
{
"bbox": [
28,
110,
565,
125
],
"score": 1.0,
"content": "document parsing, LangExtract for structured entity extraction, and a graph database for",
"type": "text"
}
],
"index": 3
},
{
"bbox": [
28,
129,
207,
145
],
"spans": [
{
"bbox": [
28,
129,
207,
145
],
"score": 1.0,
"content": "knowledge storage and retrieval.",
"type": "text"
}
],
"index": 4
}
],
"index": 2.5,
"bbox_fs": [
28,
69,
565,
145
]
},
{
"type": "text",
"bbox": [
29,
169,
565,
223
],
"lines": [
{
"bbox": [
27,
167,
565,
185
],
"spans": [
{
"bbox": [
27,
167,
565,
185
],
"score": 1.0,
"content": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files.",
"type": "text"
}
],
"index": 5
},
{
"bbox": [
28,
188,
567,
205
],
"spans": [
{
"bbox": [
28,
188,
567,
205
],
"score": 1.0,
"content": "Extracted entities and relations are stored as graph nodes and edges, enabling semantic search",
"type": "text"
}
],
"index": 6
},
{
"bbox": [
28,
209,
331,
224
],
"spans": [
{
"bbox": [
28,
209,
331,
224
],
"score": 1.0,
"content": "and question answering over large document collections.",
"type": "text"
}
],
"index": 7
}
],
"index": 6,
"bbox_fs": [
27,
167,
567,
224
]
},
{
"type": "title",
"bbox": [
30,
252,
191,
268
],
"lines": [
{
"bbox": [
27,
249,
193,
271
],
"spans": [
{
"bbox": [
27,
249,
193,
271
],
"score": 1.0,
"content": "2. System Components",
"type": "text"
}
],
"index": 8
}
],
"index": 8
},
{
"type": "title",
"bbox": [
30,
289,
208,
304
],
"lines": [
{
"bbox": [
28,
288,
208,
306
],
"spans": [
{
"bbox": [
28,
288,
208,
306
],
"score": 1.0,
"content": "2.1 Document Parsing Module",
"type": "text"
}
],
"index": 9
}
],
"index": 9
},
{
"type": "text",
"bbox": [
29,
314,
566,
367
],
"lines": [
{
"bbox": [
28,
313,
563,
329
],
"spans": [
{
"bbox": [
28,
313,
563,
329
],
"score": 1.0,
"content": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG,",
"type": "text"
}
],
"index": 10
},
{
"bbox": [
29,
334,
566,
348
],
"spans": [
{
"bbox": [
29,
334,
566,
348
],
"score": 1.0,
"content": "JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted",
"type": "text"
}
],
"index": 11
},
{
"bbox": [
28,
352,
69,
370
],
"spans": [
{
"bbox": [
28,
352,
69,
370
],
"score": 1.0,
"content": "images.",
"type": "text"
}
],
"index": 12
}
],
"index": 11,
"bbox_fs": [
28,
313,
566,
370
]
},
{
"type": "title",
"bbox": [
30,
388,
213,
403
],
"lines": [
{
"bbox": [
28,
388,
214,
404
],
"spans": [
{
"bbox": [
28,
388,
214,
404
],
"score": 1.0,
"content": "2.2 Entity Extraction Module",
"type": "text"
}
],
"index": 13
}
],
"index": 13
},
{
"type": "text",
"bbox": [
29,
414,
565,
467
],
"lines": [
{
"bbox": [
28,
412,
567,
428
],
"spans": [
{
"bbox": [
28,
412,
567,
428
],
"score": 1.0,
"content": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot",
"type": "text"
}
],
"index": 14
},
{
"bbox": [
28,
432,
565,
448
],
"spans": [
{
"bbox": [
28,
432,
565,
448
],
"score": 1.0,
"content": "prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes",
"type": "text"
}
],
"index": 15
},
{
"bbox": [
27,
451,
223,
469
],
"spans": [
{
"bbox": [
27,
451,
223,
469
],
"score": 1.0,
"content": "character-level position anchoring.",
"type": "text"
}
],
"index": 16
}
],
"index": 15,
"bbox_fs": [
27,
412,
567,
469
]
},
{
"type": "title",
"bbox": [
30,
488,
201,
502
],
"lines": [
{
"bbox": [
28,
487,
201,
504
],
"spans": [
{
"bbox": [
28,
487,
201,
504
],
"score": 1.0,
"content": "2.3 Knowledge Graph Module",
"type": "text"
}
],
"index": 17
}
],
"index": 17
},
{
"type": "text",
"bbox": [
29,
512,
565,
567
],
"lines": [
{
"bbox": [
27,
510,
564,
529
],
"spans": [
{
"bbox": [
27,
510,
564,
529
],
"score": 1.0,
"content": "Extracted entities and relationships are stored in a graph database. Node types include: Person,",
"type": "text"
}
],
"index": 18
},
{
"bbox": [
28,
531,
563,
547
],
"spans": [
{
"bbox": [
28,
531,
563,
547
],
"score": 1.0,
"content": "Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY,",
"type": "text"
}
],
"index": 19
},
{
"bbox": [
28,
552,
91,
567
],
"spans": [
{
"bbox": [
28,
552,
91,
567
],
"score": 1.0,
"content": "LOCATED_IN.",
"type": "text"
}
],
"index": 20
}
],
"index": 19,
"bbox_fs": [
27,
510,
564,
567
]
},
{
"type": "title",
"bbox": [
30,
587,
162,
602
],
"lines": [
{
"bbox": [
28,
587,
162,
603
],
"spans": [
{
"bbox": [
28,
587,
162,
603
],
"score": 1.0,
"content": "2.4 Retrieval Module",
"type": "text"
}
],
"index": 21
}
],
"index": 21
},
{
"type": "list",
"bbox": [
29,
612,
562,
645
],
"lines": [
{
"bbox": [
28,
610,
563,
627
],
"spans": [
{
"bbox": [
28,
610,
563,
627
],
"score": 1.0,
"content": "The retrieval layer supports hybrid search combining vector similarity and graph traversal.",
"type": "text"
}
],
"index": 22,
"is_list_end_line": true
},
{
"bbox": [
28,
631,
519,
646
],
"spans": [
{
"bbox": [
28,
631,
519,
646
],
"score": 1.0,
"content": "Query results are ranked by relevance score and returned with source document references.",
"type": "text"
}
],
"index": 23,
"is_list_start_line": true,
"is_list_end_line": true
}
],
"index": 22.5,
"bbox_fs": [
28,
610,
563,
646
]
}
]
},
{
"preproc_blocks": [
{
"type": "title",
"bbox": [
30,
36,
160,
52
],
"lines": [
{
"bbox": [
27,
34,
162,
54
],
"spans": [
{
"bbox": [
27,
34,
162,
54
],
"score": 1.0,
"content": "3. Data Pipeline",
"type": "text"
}
],
"index": 0
}
],
"index": 0
},
{
"type": "text",
"bbox": [
30,
70,
371,
84
],
"lines": [
{
"bbox": [
28,
68,
373,
86
],
"spans": [
{
"bbox": [
28,
68,
373,
86
],
"score": 1.0,
"content": "The end-to-end data pipeline consists of the following stages:",
"type": "text"
}
],
"index": 1
}
],
"index": 1
},
{
"type": "text",
"bbox": [
41,
110,
192,
123
],
"lines": [
{
"bbox": [
39,
109,
192,
124
],
"spans": [
{
"bbox": [
39,
109,
192,
124
],
"score": 1.0,
"content": "Stage 1: Document Ingestion",
"type": "text"
}
],
"index": 2
}
],
"index": 2
},
{
"type": "text",
"bbox": [
51,
129,
316,
183
],
"lines": [
{
"bbox": [
51,
129,
317,
145
],
"spans": [
{
"bbox": [
51,
129,
317,
145
],
"score": 1.0,
"content": "- Accept raw documents (PDF, DOCX, images, HTML)",
"type": "text"
}
],
"index": 3
},
{
"bbox": [
50,
149,
242,
165
],
"spans": [
{
"bbox": [
50,
149,
242,
165
],
"score": 1.0,
"content": "- Submit to MinerU API for parsing",
"type": "text"
}
],
"index": 4
},
{
"bbox": [
51,
169,
258,
184
],
"spans": [
{
"bbox": [
51,
169,
220,
184
],
"score": 1.0,
"content": "- Poll task status until state",
"type": "text"
},
{
"bbox": [
221,
171,
231,
181
],
"score": 0.76,
"content": "\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }",
"type": "inline_equation"
},
{
"bbox": [
231,
169,
258,
184
],
"score": 1.0,
"content": "done",
"type": "text"
}
],
"index": 5
}
],
"index": 4
},
{
"type": "text",
"bbox": [
41,
210,
192,
222
],
"lines": [
{
"bbox": [
40,
209,
192,
223
],
"spans": [
{
"bbox": [
40,
209,
192,
223
],
"score": 1.0,
"content": "Stage 2: Content Extraction",
"type": "text"
}
],
"index": 6
}
],
"index": 6
},
{
"type": "text",
"bbox": [
51,
229,
323,
282
],
"lines": [
{
"bbox": [
51,
229,
262,
243
],
"spans": [
{
"bbox": [
51,
229,
262,
243
],
"score": 1.0,
"content": "- Download and decompress full_zip_url",
"type": "text"
}
],
"index": 7
},
{
"bbox": [
50,
248,
313,
263
],
"spans": [
{
"bbox": [
50,
248,
313,
263
],
"score": 1.0,
"content": "- Parse content_list.json into Document objects",
"type": "text"
}
],
"index": 8
},
{
"bbox": [
51,
269,
323,
284
],
"spans": [
{
"bbox": [
51,
269,
323,
284
],
"score": 1.0,
"content": "- Separate text blocks, tables, images, equations",
"type": "text"
}
],
"index": 9
}
],
"index": 8
},
{
"type": "text",
"bbox": [
40,
309,
247,
321
],
"lines": [
{
"bbox": [
40,
308,
247,
322
],
"spans": [
{
"bbox": [
40,
308,
247,
322
],
"score": 1.0,
"content": "Stage 3: Entity & Relation Extraction",
"type": "text"
}
],
"index": 10
}
],
"index": 10
},
{
"type": "text",
"bbox": [
51,
328,
313,
382
],
"lines": [
{
"bbox": [
51,
327,
236,
342
],
"spans": [
{
"bbox": [
51,
327,
236,
342
],
"score": 1.0,
"content": "- Feed text blocks to LangExtract",
"type": "text"
}
],
"index": 11
},
{
"bbox": [
50,
348,
312,
362
],
"spans": [
{
"bbox": [
50,
348,
312,
362
],
"score": 1.0,
"content": "- Extract entities with char_interval positions",
"type": "text"
}
],
"index": 12
},
{
"bbox": [
51,
368,
274,
382
],
"spans": [
{
"bbox": [
51,
368,
274,
382
],
"score": 1.0,
"content": "- Extract relationships between entities",
"type": "text"
}
],
"index": 13
}
],
"index": 12
},
{
"type": "text",
"bbox": [
41,
408,
192,
421
],
"lines": [
{
"bbox": [
40,
408,
192,
422
],
"spans": [
{
"bbox": [
40,
408,
192,
422
],
"score": 1.0,
"content": "Stage 4: Graph Construction",
"type": "text"
}
],
"index": 14
}
],
"index": 14
},
{
"type": "text",
"bbox": [
51,
428,
311,
481
],
"lines": [
{
"bbox": [
50,
426,
285,
443
],
"spans": [
{
"bbox": [
50,
426,
285,
443
],
"score": 1.0,
"content": "- Map extractions to graph nodes and edges",
"type": "text"
}
],
"index": 15
},
{
"bbox": [
51,
447,
311,
462
],
"spans": [
{
"bbox": [
51,
447,
311,
462
],
"score": 1.0,
"content": "- Store with source provenance (page_idx, bbox)",
"type": "text"
}
],
"index": 16
},
{
"bbox": [
51,
467,
302,
481
],
"spans": [
{
"bbox": [
51,
467,
302,
481
],
"score": 1.0,
"content": "- Build vector embeddings for semantic search",
"type": "text"
}
],
"index": 17
}
],
"index": 16
},
{
"type": "title",
"bbox": [
30,
508,
194,
522
],
"lines": [
{
"bbox": [
28,
507,
195,
524
],
"spans": [
{
"bbox": [
28,
507,
195,
524
],
"score": 1.0,
"content": "4. Supported File Formats",
"type": "text"
}
],
"index": 18
}
],
"index": 18
},
{
"type": "table",
"bbox": [
27,
534,
525,
678
],
"blocks": [
{
"type": "table_body",
"bbox": [
27,
534,
525,
678
],
"group_id": 0,
"lines": [
{
"bbox": [
27,
534,
525,
678
],
"spans": [
{
"bbox": [
27,
534,
525,
678
],
"score": 0.985,
"html": "
| Format | Extension | OCR Required | ModeI |
| PDF (text) | . pdf | No | pipeline / vlm |
| PDF (scan) | . pdf | Yes | vIlm |
| Word | . docx | No | pipeline |
| PowerPoint | .pptx | No | pipeline |
| Image | .png / .jpg | Auto | vIlm |
| HTML | .html | No | MinerU-HTML |
",
"type": "table",
"image_path": "1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg"
}
]
}
],
"index": 20,
"virtual_lines": [
{
"bbox": [
27,
534,
525,
582.0
],
"spans": [],
"index": 19
},
{
"bbox": [
27,
582.0,
525,
630.0
],
"spans": [],
"index": 20
},
{
"bbox": [
27,
630.0,
525,
678.0
],
"spans": [],
"index": 21
}
]
}
],
"index": 20
}
],
"page_idx": 2,
"page_size": [
595,
841
],
"discarded_blocks": [],
"para_blocks": [
{
"type": "title",
"bbox": [
30,
36,
160,
52
],
"lines": [
{
"bbox": [
27,
34,
162,
54
],
"spans": [
{
"bbox": [
27,
34,
162,
54
],
"score": 1.0,
"content": "3. Data Pipeline",
"type": "text"
}
],
"index": 0
}
],
"index": 0
},
{
"type": "text",
"bbox": [
30,
70,
371,
84
],
"lines": [
{
"bbox": [
28,
68,
373,
86
],
"spans": [
{
"bbox": [
28,
68,
373,
86
],
"score": 1.0,
"content": "The end-to-end data pipeline consists of the following stages:",
"type": "text"
}
],
"index": 1
}
],
"index": 1,
"bbox_fs": [
28,
68,
373,
86
]
},
{
"type": "text",
"bbox": [
41,
110,
192,
123
],
"lines": [
{
"bbox": [
39,
109,
192,
124
],
"spans": [
{
"bbox": [
39,
109,
192,
124
],
"score": 1.0,
"content": "Stage 1: Document Ingestion",
"type": "text"
}
],
"index": 2
}
],
"index": 2,
"bbox_fs": [
39,
109,
192,
124
]
},
{
"type": "text",
"bbox": [
51,
129,
316,
183
],
"lines": [
{
"bbox": [
51,
129,
317,
145
],
"spans": [
{
"bbox": [
51,
129,
317,
145
],
"score": 1.0,
"content": "- Accept raw documents (PDF, DOCX, images, HTML)",
"type": "text"
}
],
"index": 3
},
{
"bbox": [
50,
149,
242,
165
],
"spans": [
{
"bbox": [
50,
149,
242,
165
],
"score": 1.0,
"content": "- Submit to MinerU API for parsing",
"type": "text"
}
],
"index": 4
},
{
"bbox": [
51,
169,
258,
184
],
"spans": [
{
"bbox": [
51,
169,
220,
184
],
"score": 1.0,
"content": "- Poll task status until state",
"type": "text"
},
{
"bbox": [
221,
171,
231,
181
],
"score": 0.76,
"content": "\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }",
"type": "inline_equation"
},
{
"bbox": [
231,
169,
258,
184
],
"score": 1.0,
"content": "done",
"type": "text"
}
],
"index": 5
}
],
"index": 4,
"bbox_fs": [
50,
129,
317,
184
]
},
{
"type": "text",
"bbox": [
41,
210,
192,
222
],
"lines": [
{
"bbox": [
40,
209,
192,
223
],
"spans": [
{
"bbox": [
40,
209,
192,
223
],
"score": 1.0,
"content": "Stage 2: Content Extraction",
"type": "text"
}
],
"index": 6
}
],
"index": 6,
"bbox_fs": [
40,
209,
192,
223
]
},
{
"type": "text",
"bbox": [
51,
229,
323,
282
],
"lines": [
{
"bbox": [
51,
229,
262,
243
],
"spans": [
{
"bbox": [
51,
229,
262,
243
],
"score": 1.0,
"content": "- Download and decompress full_zip_url",
"type": "text"
}
],
"index": 7
},
{
"bbox": [
50,
248,
313,
263
],
"spans": [
{
"bbox": [
50,
248,
313,
263
],
"score": 1.0,
"content": "- Parse content_list.json into Document objects",
"type": "text"
}
],
"index": 8
},
{
"bbox": [
51,
269,
323,
284
],
"spans": [
{
"bbox": [
51,
269,
323,
284
],
"score": 1.0,
"content": "- Separate text blocks, tables, images, equations",
"type": "text"
}
],
"index": 9
}
],
"index": 8,
"bbox_fs": [
50,
229,
323,
284
]
},
{
"type": "text",
"bbox": [
40,
309,
247,
321
],
"lines": [
{
"bbox": [
40,
308,
247,
322
],
"spans": [
{
"bbox": [
40,
308,
247,
322
],
"score": 1.0,
"content": "Stage 3: Entity & Relation Extraction",
"type": "text"
}
],
"index": 10
}
],
"index": 10,
"bbox_fs": [
40,
308,
247,
322
]
},
{
"type": "text",
"bbox": [
51,
328,
313,
382
],
"lines": [
{
"bbox": [
51,
327,
236,
342
],
"spans": [
{
"bbox": [
51,
327,
236,
342
],
"score": 1.0,
"content": "- Feed text blocks to LangExtract",
"type": "text"
}
],
"index": 11
},
{
"bbox": [
50,
348,
312,
362
],
"spans": [
{
"bbox": [
50,
348,
312,
362
],
"score": 1.0,
"content": "- Extract entities with char_interval positions",
"type": "text"
}
],
"index": 12
},
{
"bbox": [
51,
368,
274,
382
],
"spans": [
{
"bbox": [
51,
368,
274,
382
],
"score": 1.0,
"content": "- Extract relationships between entities",
"type": "text"
}
],
"index": 13
}
],
"index": 12,
"bbox_fs": [
50,
327,
312,
382
]
},
{
"type": "text",
"bbox": [
41,
408,
192,
421
],
"lines": [
{
"bbox": [
40,
408,
192,
422
],
"spans": [
{
"bbox": [
40,
408,
192,
422
],
"score": 1.0,
"content": "Stage 4: Graph Construction",
"type": "text"
}
],
"index": 14
}
],
"index": 14,
"bbox_fs": [
40,
408,
192,
422
]
},
{
"type": "text",
"bbox": [
51,
428,
311,
481
],
"lines": [
{
"bbox": [
50,
426,
285,
443
],
"spans": [
{
"bbox": [
50,
426,
285,
443
],
"score": 1.0,
"content": "- Map extractions to graph nodes and edges",
"type": "text"
}
],
"index": 15
},
{
"bbox": [
51,
447,
311,
462
],
"spans": [
{
"bbox": [
51,
447,
311,
462
],
"score": 1.0,
"content": "- Store with source provenance (page_idx, bbox)",
"type": "text"
}
],
"index": 16
},
{
"bbox": [
51,
467,
302,
481
],
"spans": [
{
"bbox": [
51,
467,
302,
481
],
"score": 1.0,
"content": "- Build vector embeddings for semantic search",
"type": "text"
}
],
"index": 17
}
],
"index": 16,
"bbox_fs": [
50,
426,
311,
481
]
},
{
"type": "title",
"bbox": [
30,
508,
194,
522
],
"lines": [
{
"bbox": [
28,
507,
195,
524
],
"spans": [
{
"bbox": [
28,
507,
195,
524
],
"score": 1.0,
"content": "4. Supported File Formats",
"type": "text"
}
],
"index": 18
}
],
"index": 18
},
{
"type": "table",
"bbox": [
27,
534,
525,
678
],
"blocks": [
{
"type": "table_body",
"bbox": [
27,
534,
525,
678
],
"group_id": 0,
"lines": [
{
"bbox": [
27,
534,
525,
678
],
"spans": [
{
"bbox": [
27,
534,
525,
678
],
"score": 0.985,
"html": "| Format | Extension | OCR Required | ModeI |
| PDF (text) | . pdf | No | pipeline / vlm |
| PDF (scan) | . pdf | Yes | vIlm |
| Word | . docx | No | pipeline |
| PowerPoint | .pptx | No | pipeline |
| Image | .png / .jpg | Auto | vIlm |
| HTML | .html | No | MinerU-HTML |
",
"type": "table",
"image_path": "1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg"
}
]
}
],
"index": 20,
"virtual_lines": [
{
"bbox": [
27,
534,
525,
582.0
],
"spans": [],
"index": 19
},
{
"bbox": [
27,
582.0,
525,
630.0
],
"spans": [],
"index": 20
},
{
"bbox": [
27,
630.0,
525,
678.0
],
"spans": [],
"index": 21
}
]
}
],
"index": 20
}
]
},
{
"preproc_blocks": [
{
"type": "title",
"bbox": [
29,
36,
272,
53
],
"lines": [
{
"bbox": [
27,
33,
274,
55
],
"spans": [
{
"bbox": [
27,
33,
274,
55
],
"score": 1.0,
"content": "5. API Configuration Reference",
"type": "text"
}
],
"index": 0
}
],
"index": 0
},
{
"type": "text",
"bbox": [
29,
70,
567,
104
],
"lines": [
{
"bbox": [
27,
67,
567,
87
],
"spans": [
{
"bbox": [
27,
67,
567,
87
],
"score": 1.0,
"content": "The following environment variables must be configured before running the MinerU parsing",
"type": "text"
}
],
"index": 1
},
{
"bbox": [
27,
90,
77,
105
],
"spans": [
{
"bbox": [
27,
90,
77,
105
],
"score": 1.0,
"content": "service:",
"type": "text"
}
],
"index": 2
}
],
"index": 1.5
},
{
"type": "text",
"bbox": [
39,
128,
379,
284
],
"lines": [
{
"bbox": [
39,
129,
362,
145
],
"spans": [
{
"bbox": [
39,
130,
132,
144
],
"score": 1.0,
"content": "MINERU_API_TOKEN",
"type": "text"
},
{
"bbox": [
155,
129,
362,
145
],
"score": 1.0,
"content": ": Bearer token for API authentication",
"type": "text"
}
],
"index": 3
},
{
"bbox": [
39,
149,
335,
165
],
"spans": [
{
"bbox": [
39,
149,
126,
165
],
"score": 1.0,
"content": "MINERU_USER_UID",
"type": "text"
},
{
"bbox": [
156,
149,
335,
164
],
"score": 1.0,
"content": ": User UUID for quota management",
"type": "text"
}
],
"index": 4
},
{
"bbox": [
39,
170,
307,
183
],
"spans": [
{
"bbox": [
39,
170,
126,
183
],
"score": 1.0,
"content": "MINERU_BASE_URL",
"type": "text"
},
{
"bbox": [
156,
170,
307,
183
],
"score": 1.0,
"content": ": https://mineru.net/api/v4",
"type": "text"
}
],
"index": 5
},
{
"bbox": [
39,
189,
379,
204
],
"spans": [
{
"bbox": [
39,
189,
379,
204
],
"score": 1.0,
"content": "MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML",
"type": "text"
}
],
"index": 6
},
{
"bbox": [
40,
210,
316,
223
],
"spans": [
{
"bbox": [
40,
210,
126,
223
],
"score": 1.0,
"content": "MINERU_LANGUAGE",
"type": "text"
},
{
"bbox": [
156,
210,
316,
223
],
"score": 1.0,
"content": ": ch (Chinese) | en (English)",
"type": "text"
}
],
"index": 7
},
{
"bbox": [
39,
229,
371,
244
],
"spans": [
{
"bbox": [
39,
230,
115,
244
],
"score": 1.0,
"content": "MINERU_IS_OCR",
"type": "text"
},
{
"bbox": [
156,
229,
371,
243
],
"score": 1.0,
"content": ": false (text PDF) | true (scanned PDF)",
"type": "text"
}
],
"index": 8
},
{
"bbox": [
39,
249,
236,
263
],
"spans": [
{
"bbox": [
39,
249,
236,
263
],
"score": 1.0,
"content": "MINERU_ENABLE_FORMULA: true | false",
"type": "text"
}
],
"index": 9
},
{
"bbox": [
39,
269,
236,
282
],
"spans": [
{
"bbox": [
39,
269,
236,
282
],
"score": 1.0,
"content": "MINERU_ENABLE_TABLE : true | false",
"type": "text"
}
],
"index": 10
}
],
"index": 6.5
},
{
"type": "text",
"bbox": [
29,
309,
96,
321
],
"lines": [
{
"bbox": [
28,
308,
97,
322
],
"spans": [
{
"bbox": [
28,
308,
97,
322
],
"score": 1.0,
"content": "Rate Limits:",
"type": "text"
}
],
"index": 11
}
],
"index": 11
},
{
"type": "text",
"bbox": [
39,
327,
300,
402
],
"lines": [
{
"bbox": [
39,
327,
242,
343
],
"spans": [
{
"bbox": [
39,
327,
126,
342
],
"score": 1.0,
"content": "- Max file size",
"type": "text"
},
{
"bbox": [
139,
327,
242,
343
],
"score": 1.0,
"content": ": 200 MB per file",
"type": "text"
}
],
"index": 12
},
{
"bbox": [
39,
347,
258,
364
],
"spans": [
{
"bbox": [
39,
347,
104,
364
],
"score": 1.0,
"content": "- Max pages",
"type": "text"
},
{
"bbox": [
145,
348,
258,
363
],
"score": 1.0,
"content": ": 600 pages per file",
"type": "text"
}
],
"index": 13
},
{
"bbox": [
39,
367,
300,
383
],
"spans": [
{
"bbox": [
39,
367,
115,
383
],
"score": 1.0,
"content": "- Daily quota",
"type": "text"
},
{
"bbox": [
145,
368,
300,
383
],
"score": 1.0,
"content": ": 2000 pages (high priority)",
"type": "text"
}
],
"index": 14
},
{
"bbox": [
39,
387,
274,
403
],
"spans": [
{
"bbox": [
39,
387,
116,
402
],
"score": 1.0,
"content": "- Batch limit",
"type": "text"
},
{
"bbox": [
144,
387,
274,
403
],
"score": 1.0,
"content": ": 200 files per request",
"type": "text"
}
],
"index": 15
}
],
"index": 13.5
}
],
"page_idx": 3,
"page_size": [
595,
841
],
"discarded_blocks": [],
"para_blocks": [
{
"type": "title",
"bbox": [
29,
36,
272,
53
],
"lines": [
{
"bbox": [
27,
33,
274,
55
],
"spans": [
{
"bbox": [
27,
33,
274,
55
],
"score": 1.0,
"content": "5. API Configuration Reference",
"type": "text"
}
],
"index": 0
}
],
"index": 0
},
{
"type": "text",
"bbox": [
29,
70,
567,
104
],
"lines": [
{
"bbox": [
27,
67,
567,
87
],
"spans": [
{
"bbox": [
27,
67,
567,
87
],
"score": 1.0,
"content": "The following environment variables must be configured before running the MinerU parsing",
"type": "text"
}
],
"index": 1
},
{
"bbox": [
27,
90,
77,
105
],
"spans": [
{
"bbox": [
27,
90,
77,
105
],
"score": 1.0,
"content": "service:",
"type": "text"
}
],
"index": 2
}
],
"index": 1.5,
"bbox_fs": [
27,
67,
567,
105
]
},
{
"type": "list",
"bbox": [
39,
128,
379,
284
],
"lines": [
{
"bbox": [
39,
129,
362,
145
],
"spans": [
{
"bbox": [
39,
130,
132,
144
],
"score": 1.0,
"content": "MINERU_API_TOKEN",
"type": "text"
},
{
"bbox": [
155,
129,
362,
145
],
"score": 1.0,
"content": ": Bearer token for API authentication",
"type": "text"
}
],
"index": 3,
"is_list_start_line": true
},
{
"bbox": [
39,
149,
335,
165
],
"spans": [
{
"bbox": [
39,
149,
126,
165
],
"score": 1.0,
"content": "MINERU_USER_UID",
"type": "text"
},
{
"bbox": [
156,
149,
335,
164
],
"score": 1.0,
"content": ": User UUID for quota management",
"type": "text"
}
],
"index": 4,
"is_list_start_line": true
},
{
"bbox": [
39,
170,
307,
183
],
"spans": [
{
"bbox": [
39,
170,
126,
183
],
"score": 1.0,
"content": "MINERU_BASE_URL",
"type": "text"
},
{
"bbox": [
156,
170,
307,
183
],
"score": 1.0,
"content": ": https://mineru.net/api/v4",
"type": "text"
}
],
"index": 5,
"is_list_start_line": true
},
{
"bbox": [
39,
189,
379,
204
],
"spans": [
{
"bbox": [
39,
189,
379,
204
],
"score": 1.0,
"content": "MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML",
"type": "text"
}
],
"index": 6,
"is_list_start_line": true
},
{
"bbox": [
40,
210,
316,
223
],
"spans": [
{
"bbox": [
40,
210,
126,
223
],
"score": 1.0,
"content": "MINERU_LANGUAGE",
"type": "text"
},
{
"bbox": [
156,
210,
316,
223
],
"score": 1.0,
"content": ": ch (Chinese) | en (English)",
"type": "text"
}
],
"index": 7,
"is_list_start_line": true
},
{
"bbox": [
39,
229,
371,
244
],
"spans": [
{
"bbox": [
39,
230,
115,
244
],
"score": 1.0,
"content": "MINERU_IS_OCR",
"type": "text"
},
{
"bbox": [
156,
229,
371,
243
],
"score": 1.0,
"content": ": false (text PDF) | true (scanned PDF)",
"type": "text"
}
],
"index": 8,
"is_list_start_line": true
},
{
"bbox": [
39,
249,
236,
263
],
"spans": [
{
"bbox": [
39,
249,
236,
263
],
"score": 1.0,
"content": "MINERU_ENABLE_FORMULA: true | false",
"type": "text"
}
],
"index": 9,
"is_list_start_line": true
},
{
"bbox": [
39,
269,
236,
282
],
"spans": [
{
"bbox": [
39,
269,
236,
282
],
"score": 1.0,
"content": "MINERU_ENABLE_TABLE : true | false",
"type": "text"
}
],
"index": 10,
"is_list_start_line": true
}
],
"index": 6.5,
"bbox_fs": [
39,
129,
379,
282
]
},
{
"type": "text",
"bbox": [
29,
309,
96,
321
],
"lines": [
{
"bbox": [
28,
308,
97,
322
],
"spans": [
{
"bbox": [
28,
308,
97,
322
],
"score": 1.0,
"content": "Rate Limits:",
"type": "text"
}
],
"index": 11
}
],
"index": 11,
"bbox_fs": [
28,
308,
97,
322
]
},
{
"type": "text",
"bbox": [
39,
327,
300,
402
],
"lines": [
{
"bbox": [
39,
327,
242,
343
],
"spans": [
{
"bbox": [
39,
327,
126,
342
],
"score": 1.0,
"content": "- Max file size",
"type": "text"
},
{
"bbox": [
139,
327,
242,
343
],
"score": 1.0,
"content": ": 200 MB per file",
"type": "text"
}
],
"index": 12
},
{
"bbox": [
39,
347,
258,
364
],
"spans": [
{
"bbox": [
39,
347,
104,
364
],
"score": 1.0,
"content": "- Max pages",
"type": "text"
},
{
"bbox": [
145,
348,
258,
363
],
"score": 1.0,
"content": ": 600 pages per file",
"type": "text"
}
],
"index": 13
},
{
"bbox": [
39,
367,
300,
383
],
"spans": [
{
"bbox": [
39,
367,
115,
383
],
"score": 1.0,
"content": "- Daily quota",
"type": "text"
},
{
"bbox": [
145,
368,
300,
383
],
"score": 1.0,
"content": ": 2000 pages (high priority)",
"type": "text"
}
],
"index": 14
},
{
"bbox": [
39,
387,
274,
403
],
"spans": [
{
"bbox": [
39,
387,
116,
402
],
"score": 1.0,
"content": "- Batch limit",
"type": "text"
},
{
"bbox": [
144,
387,
274,
403
],
"score": 1.0,
"content": ": 200 files per request",
"type": "text"
}
],
"index": 15
}
],
"index": 13.5,
"bbox_fs": [
39,
327,
300,
403
]
}
]
}
],
"_backend": "pipeline",
"_version_name": "2.7.6"
}