Full-stack application for document-to-knowledge-graph pipeline: - Backend: FastAPI + LangGraph ReAct agent + DeepSeek + MinerU parsing - Frontend: React 19 + Vite + D3.js + shadcn/ui - Pipeline: MinerU parsing → LangExtract entity extraction → KG building Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
4063 lines
154 KiB
JSON
4063 lines
154 KiB
JSON
{
|
|
"pdf_info": [
|
|
{
|
|
"preproc_blocks": [
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
205,
|
|
148,
|
|
390,
|
|
172
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
203,
|
|
144,
|
|
393,
|
|
177
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
203,
|
|
144,
|
|
393,
|
|
177
|
|
],
|
|
"score": 1.0,
|
|
"content": "GraphRAG System",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 0
|
|
}
|
|
],
|
|
"index": 0
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
172,
|
|
197,
|
|
423,
|
|
214
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
171,
|
|
195,
|
|
424,
|
|
216
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
171,
|
|
195,
|
|
424,
|
|
216
|
|
],
|
|
"score": 1.0,
|
|
"content": "Technical Architecture Overview",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 1
|
|
}
|
|
],
|
|
"index": 1
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
217,
|
|
229,
|
|
377,
|
|
244
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
216,
|
|
227,
|
|
378,
|
|
246
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
216,
|
|
227,
|
|
378,
|
|
246
|
|
],
|
|
"score": 1.0,
|
|
"content": "Version 1.0 | March 2026",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 2
|
|
}
|
|
],
|
|
"index": 2
|
|
}
|
|
],
|
|
"page_idx": 0,
|
|
"page_size": [
|
|
595,
|
|
841
|
|
],
|
|
"discarded_blocks": [],
|
|
"para_blocks": [
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
205,
|
|
148,
|
|
390,
|
|
172
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
203,
|
|
144,
|
|
393,
|
|
177
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
203,
|
|
144,
|
|
393,
|
|
177
|
|
],
|
|
"score": 1.0,
|
|
"content": "GraphRAG System",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 0
|
|
}
|
|
],
|
|
"index": 0
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
172,
|
|
197,
|
|
423,
|
|
214
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
171,
|
|
195,
|
|
424,
|
|
216
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
171,
|
|
195,
|
|
424,
|
|
216
|
|
],
|
|
"score": 1.0,
|
|
"content": "Technical Architecture Overview",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 1
|
|
}
|
|
],
|
|
"index": 1,
|
|
"bbox_fs": [
|
|
171,
|
|
195,
|
|
424,
|
|
216
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
217,
|
|
229,
|
|
377,
|
|
244
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
216,
|
|
227,
|
|
378,
|
|
246
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
216,
|
|
227,
|
|
378,
|
|
246
|
|
],
|
|
"score": 1.0,
|
|
"content": "Version 1.0 | March 2026",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 2
|
|
}
|
|
],
|
|
"index": 2,
|
|
"bbox_fs": [
|
|
216,
|
|
227,
|
|
378,
|
|
246
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"preproc_blocks": [
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
31,
|
|
36,
|
|
119,
|
|
52
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
34,
|
|
121,
|
|
54
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
34,
|
|
121,
|
|
54
|
|
],
|
|
"score": 1.0,
|
|
"content": "1. Abstract",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 0
|
|
}
|
|
],
|
|
"index": 0
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
70,
|
|
566,
|
|
144
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
29,
|
|
69,
|
|
565,
|
|
84
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
29,
|
|
69,
|
|
565,
|
|
84
|
|
],
|
|
"score": 1.0,
|
|
"content": "This document presents the technical architecture of a Multimodal GraphRAG System designed for",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 1
|
|
},
|
|
{
|
|
"bbox": [
|
|
30,
|
|
89,
|
|
565,
|
|
104
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
30,
|
|
89,
|
|
565,
|
|
104
|
|
],
|
|
"score": 1.0,
|
|
"content": "intelligent document parsing and knowledge graph construction. The system integrates MinerU for",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 2
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
110,
|
|
565,
|
|
125
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
110,
|
|
565,
|
|
125
|
|
],
|
|
"score": 1.0,
|
|
"content": "document parsing, LangExtract for structured entity extraction, and a graph database for",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 3
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
129,
|
|
207,
|
|
145
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
129,
|
|
207,
|
|
145
|
|
],
|
|
"score": 1.0,
|
|
"content": "knowledge storage and retrieval.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 4
|
|
}
|
|
],
|
|
"index": 2.5
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
169,
|
|
565,
|
|
223
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
167,
|
|
565,
|
|
185
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
167,
|
|
565,
|
|
185
|
|
],
|
|
"score": 1.0,
|
|
"content": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 5
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
188,
|
|
567,
|
|
205
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
188,
|
|
567,
|
|
205
|
|
],
|
|
"score": 1.0,
|
|
"content": "Extracted entities and relations are stored as graph nodes and edges, enabling semantic search",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 6
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
209,
|
|
331,
|
|
224
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
209,
|
|
331,
|
|
224
|
|
],
|
|
"score": 1.0,
|
|
"content": "and question answering over large document collections.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 7
|
|
}
|
|
],
|
|
"index": 6
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
252,
|
|
191,
|
|
268
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
249,
|
|
193,
|
|
271
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
249,
|
|
193,
|
|
271
|
|
],
|
|
"score": 1.0,
|
|
"content": "2. System Components",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 8
|
|
}
|
|
],
|
|
"index": 8
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
289,
|
|
208,
|
|
304
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
288,
|
|
208,
|
|
306
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
288,
|
|
208,
|
|
306
|
|
],
|
|
"score": 1.0,
|
|
"content": "2.1 Document Parsing Module",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 9
|
|
}
|
|
],
|
|
"index": 9
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
314,
|
|
566,
|
|
367
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
313,
|
|
563,
|
|
329
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
313,
|
|
563,
|
|
329
|
|
],
|
|
"score": 1.0,
|
|
"content": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG,",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 10
|
|
},
|
|
{
|
|
"bbox": [
|
|
29,
|
|
334,
|
|
566,
|
|
348
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
29,
|
|
334,
|
|
566,
|
|
348
|
|
],
|
|
"score": 1.0,
|
|
"content": "JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 11
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
352,
|
|
69,
|
|
370
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
352,
|
|
69,
|
|
370
|
|
],
|
|
"score": 1.0,
|
|
"content": "images.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 12
|
|
}
|
|
],
|
|
"index": 11
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
388,
|
|
213,
|
|
403
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
388,
|
|
214,
|
|
404
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
388,
|
|
214,
|
|
404
|
|
],
|
|
"score": 1.0,
|
|
"content": "2.2 Entity Extraction Module",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 13
|
|
}
|
|
],
|
|
"index": 13
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
414,
|
|
565,
|
|
467
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
412,
|
|
567,
|
|
428
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
412,
|
|
567,
|
|
428
|
|
],
|
|
"score": 1.0,
|
|
"content": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 14
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
432,
|
|
565,
|
|
448
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
432,
|
|
565,
|
|
448
|
|
],
|
|
"score": 1.0,
|
|
"content": "prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 15
|
|
},
|
|
{
|
|
"bbox": [
|
|
27,
|
|
451,
|
|
223,
|
|
469
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
451,
|
|
223,
|
|
469
|
|
],
|
|
"score": 1.0,
|
|
"content": "character-level position anchoring.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 16
|
|
}
|
|
],
|
|
"index": 15
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
488,
|
|
201,
|
|
502
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
487,
|
|
201,
|
|
504
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
487,
|
|
201,
|
|
504
|
|
],
|
|
"score": 1.0,
|
|
"content": "2.3 Knowledge Graph Module",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 17
|
|
}
|
|
],
|
|
"index": 17
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
512,
|
|
565,
|
|
567
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
510,
|
|
564,
|
|
529
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
510,
|
|
564,
|
|
529
|
|
],
|
|
"score": 1.0,
|
|
"content": "Extracted entities and relationships are stored in a graph database. Node types include: Person,",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 18
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
531,
|
|
563,
|
|
547
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
531,
|
|
563,
|
|
547
|
|
],
|
|
"score": 1.0,
|
|
"content": "Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY,",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 19
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
552,
|
|
91,
|
|
567
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
552,
|
|
91,
|
|
567
|
|
],
|
|
"score": 1.0,
|
|
"content": "LOCATED_IN.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 20
|
|
}
|
|
],
|
|
"index": 19
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
587,
|
|
162,
|
|
602
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
587,
|
|
162,
|
|
603
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
587,
|
|
162,
|
|
603
|
|
],
|
|
"score": 1.0,
|
|
"content": "2.4 Retrieval Module",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 21
|
|
}
|
|
],
|
|
"index": 21
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
612,
|
|
562,
|
|
645
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
610,
|
|
563,
|
|
627
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
610,
|
|
563,
|
|
627
|
|
],
|
|
"score": 1.0,
|
|
"content": "The retrieval layer supports hybrid search combining vector similarity and graph traversal.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 22
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
631,
|
|
519,
|
|
646
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
631,
|
|
519,
|
|
646
|
|
],
|
|
"score": 1.0,
|
|
"content": "Query results are ranked by relevance score and returned with source document references.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 23
|
|
}
|
|
],
|
|
"index": 22.5
|
|
}
|
|
],
|
|
"page_idx": 1,
|
|
"page_size": [
|
|
595,
|
|
841
|
|
],
|
|
"discarded_blocks": [],
|
|
"para_blocks": [
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
31,
|
|
36,
|
|
119,
|
|
52
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
34,
|
|
121,
|
|
54
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
34,
|
|
121,
|
|
54
|
|
],
|
|
"score": 1.0,
|
|
"content": "1. Abstract",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 0
|
|
}
|
|
],
|
|
"index": 0
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
70,
|
|
566,
|
|
144
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
29,
|
|
69,
|
|
565,
|
|
84
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
29,
|
|
69,
|
|
565,
|
|
84
|
|
],
|
|
"score": 1.0,
|
|
"content": "This document presents the technical architecture of a Multimodal GraphRAG System designed for",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 1
|
|
},
|
|
{
|
|
"bbox": [
|
|
30,
|
|
89,
|
|
565,
|
|
104
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
30,
|
|
89,
|
|
565,
|
|
104
|
|
],
|
|
"score": 1.0,
|
|
"content": "intelligent document parsing and knowledge graph construction. The system integrates MinerU for",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 2
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
110,
|
|
565,
|
|
125
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
110,
|
|
565,
|
|
125
|
|
],
|
|
"score": 1.0,
|
|
"content": "document parsing, LangExtract for structured entity extraction, and a graph database for",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 3
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
129,
|
|
207,
|
|
145
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
129,
|
|
207,
|
|
145
|
|
],
|
|
"score": 1.0,
|
|
"content": "knowledge storage and retrieval.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 4
|
|
}
|
|
],
|
|
"index": 2.5,
|
|
"bbox_fs": [
|
|
28,
|
|
69,
|
|
565,
|
|
145
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
169,
|
|
565,
|
|
223
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
167,
|
|
565,
|
|
185
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
167,
|
|
565,
|
|
185
|
|
],
|
|
"score": 1.0,
|
|
"content": "The pipeline supports multiple document formats including PDF, DOCX, PPTX, and image files.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 5
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
188,
|
|
567,
|
|
205
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
188,
|
|
567,
|
|
205
|
|
],
|
|
"score": 1.0,
|
|
"content": "Extracted entities and relations are stored as graph nodes and edges, enabling semantic search",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 6
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
209,
|
|
331,
|
|
224
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
209,
|
|
331,
|
|
224
|
|
],
|
|
"score": 1.0,
|
|
"content": "and question answering over large document collections.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 7
|
|
}
|
|
],
|
|
"index": 6,
|
|
"bbox_fs": [
|
|
27,
|
|
167,
|
|
567,
|
|
224
|
|
]
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
252,
|
|
191,
|
|
268
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
249,
|
|
193,
|
|
271
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
249,
|
|
193,
|
|
271
|
|
],
|
|
"score": 1.0,
|
|
"content": "2. System Components",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 8
|
|
}
|
|
],
|
|
"index": 8
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
289,
|
|
208,
|
|
304
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
288,
|
|
208,
|
|
306
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
288,
|
|
208,
|
|
306
|
|
],
|
|
"score": 1.0,
|
|
"content": "2.1 Document Parsing Module",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 9
|
|
}
|
|
],
|
|
"index": 9
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
314,
|
|
566,
|
|
367
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
313,
|
|
563,
|
|
329
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
313,
|
|
563,
|
|
329
|
|
],
|
|
"score": 1.0,
|
|
"content": "MinerU Cloud API (v4) serves as the document parsing backend. It accepts PDF, DOCX, PPTX, PNG,",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 10
|
|
},
|
|
{
|
|
"bbox": [
|
|
29,
|
|
334,
|
|
566,
|
|
348
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
29,
|
|
334,
|
|
566,
|
|
348
|
|
],
|
|
"score": 1.0,
|
|
"content": "JPG, and HTML files. Output includes Markdown text, structured content_list.json, and extracted",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 11
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
352,
|
|
69,
|
|
370
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
352,
|
|
69,
|
|
370
|
|
],
|
|
"score": 1.0,
|
|
"content": "images.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 12
|
|
}
|
|
],
|
|
"index": 11,
|
|
"bbox_fs": [
|
|
28,
|
|
313,
|
|
566,
|
|
370
|
|
]
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
388,
|
|
213,
|
|
403
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
388,
|
|
214,
|
|
404
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
388,
|
|
214,
|
|
404
|
|
],
|
|
"score": 1.0,
|
|
"content": "2.2 Entity Extraction Module",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 13
|
|
}
|
|
],
|
|
"index": 13
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
414,
|
|
565,
|
|
467
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
412,
|
|
567,
|
|
428
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
412,
|
|
567,
|
|
428
|
|
],
|
|
"score": 1.0,
|
|
"content": "LangExtract (v1.1.1) performs structured information extraction from plain text using few-shot",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 14
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
432,
|
|
565,
|
|
448
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
432,
|
|
565,
|
|
448
|
|
],
|
|
"score": 1.0,
|
|
"content": "prompting with LLM backends (Gemini, OpenAI, or local Ollama). Each extraction includes",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 15
|
|
},
|
|
{
|
|
"bbox": [
|
|
27,
|
|
451,
|
|
223,
|
|
469
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
451,
|
|
223,
|
|
469
|
|
],
|
|
"score": 1.0,
|
|
"content": "character-level position anchoring.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 16
|
|
}
|
|
],
|
|
"index": 15,
|
|
"bbox_fs": [
|
|
27,
|
|
412,
|
|
567,
|
|
469
|
|
]
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
488,
|
|
201,
|
|
502
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
487,
|
|
201,
|
|
504
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
487,
|
|
201,
|
|
504
|
|
],
|
|
"score": 1.0,
|
|
"content": "2.3 Knowledge Graph Module",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 17
|
|
}
|
|
],
|
|
"index": 17
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
512,
|
|
565,
|
|
567
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
510,
|
|
564,
|
|
529
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
510,
|
|
564,
|
|
529
|
|
],
|
|
"score": 1.0,
|
|
"content": "Extracted entities and relationships are stored in a graph database. Node types include: Person,",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 18
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
531,
|
|
563,
|
|
547
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
531,
|
|
563,
|
|
547
|
|
],
|
|
"score": 1.0,
|
|
"content": "Organization, Location, Event, Concept. Edge types include: RELATED_TO, BELONGS_TO, CAUSED_BY,",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 19
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
552,
|
|
91,
|
|
567
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
552,
|
|
91,
|
|
567
|
|
],
|
|
"score": 1.0,
|
|
"content": "LOCATED_IN.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 20
|
|
}
|
|
],
|
|
"index": 19,
|
|
"bbox_fs": [
|
|
27,
|
|
510,
|
|
564,
|
|
567
|
|
]
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
587,
|
|
162,
|
|
602
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
587,
|
|
162,
|
|
603
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
587,
|
|
162,
|
|
603
|
|
],
|
|
"score": 1.0,
|
|
"content": "2.4 Retrieval Module",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 21
|
|
}
|
|
],
|
|
"index": 21
|
|
},
|
|
{
|
|
"type": "list",
|
|
"bbox": [
|
|
29,
|
|
612,
|
|
562,
|
|
645
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
610,
|
|
563,
|
|
627
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
610,
|
|
563,
|
|
627
|
|
],
|
|
"score": 1.0,
|
|
"content": "The retrieval layer supports hybrid search combining vector similarity and graph traversal.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 22,
|
|
"is_list_end_line": true
|
|
},
|
|
{
|
|
"bbox": [
|
|
28,
|
|
631,
|
|
519,
|
|
646
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
631,
|
|
519,
|
|
646
|
|
],
|
|
"score": 1.0,
|
|
"content": "Query results are ranked by relevance score and returned with source document references.",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 23,
|
|
"is_list_start_line": true,
|
|
"is_list_end_line": true
|
|
}
|
|
],
|
|
"index": 22.5,
|
|
"bbox_fs": [
|
|
28,
|
|
610,
|
|
563,
|
|
646
|
|
]
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"preproc_blocks": [
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
36,
|
|
160,
|
|
52
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
34,
|
|
162,
|
|
54
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
34,
|
|
162,
|
|
54
|
|
],
|
|
"score": 1.0,
|
|
"content": "3. Data Pipeline",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 0
|
|
}
|
|
],
|
|
"index": 0
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
30,
|
|
70,
|
|
371,
|
|
84
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
68,
|
|
373,
|
|
86
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
68,
|
|
373,
|
|
86
|
|
],
|
|
"score": 1.0,
|
|
"content": "The end-to-end data pipeline consists of the following stages:",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 1
|
|
}
|
|
],
|
|
"index": 1
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
41,
|
|
110,
|
|
192,
|
|
123
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
109,
|
|
192,
|
|
124
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
109,
|
|
192,
|
|
124
|
|
],
|
|
"score": 1.0,
|
|
"content": "Stage 1: Document Ingestion",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 2
|
|
}
|
|
],
|
|
"index": 2
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
51,
|
|
129,
|
|
316,
|
|
183
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
129,
|
|
317,
|
|
145
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
129,
|
|
317,
|
|
145
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Accept raw documents (PDF, DOCX, images, HTML)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 3
|
|
},
|
|
{
|
|
"bbox": [
|
|
50,
|
|
149,
|
|
242,
|
|
165
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
149,
|
|
242,
|
|
165
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Submit to MinerU API for parsing",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 4
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
169,
|
|
258,
|
|
184
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
169,
|
|
220,
|
|
184
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Poll task status until state",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
221,
|
|
171,
|
|
231,
|
|
181
|
|
],
|
|
"score": 0.76,
|
|
"content": "\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }",
|
|
"type": "inline_equation"
|
|
},
|
|
{
|
|
"bbox": [
|
|
231,
|
|
169,
|
|
258,
|
|
184
|
|
],
|
|
"score": 1.0,
|
|
"content": "done",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 5
|
|
}
|
|
],
|
|
"index": 4
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
41,
|
|
210,
|
|
192,
|
|
222
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
209,
|
|
192,
|
|
223
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
209,
|
|
192,
|
|
223
|
|
],
|
|
"score": 1.0,
|
|
"content": "Stage 2: Content Extraction",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 6
|
|
}
|
|
],
|
|
"index": 6
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
51,
|
|
229,
|
|
323,
|
|
282
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
229,
|
|
262,
|
|
243
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
229,
|
|
262,
|
|
243
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Download and decompress full_zip_url",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 7
|
|
},
|
|
{
|
|
"bbox": [
|
|
50,
|
|
248,
|
|
313,
|
|
263
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
248,
|
|
313,
|
|
263
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Parse content_list.json into Document objects",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 8
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
269,
|
|
323,
|
|
284
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
269,
|
|
323,
|
|
284
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Separate text blocks, tables, images, equations",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 9
|
|
}
|
|
],
|
|
"index": 8
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
40,
|
|
309,
|
|
247,
|
|
321
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
308,
|
|
247,
|
|
322
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
308,
|
|
247,
|
|
322
|
|
],
|
|
"score": 1.0,
|
|
"content": "Stage 3: Entity & Relation Extraction",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 10
|
|
}
|
|
],
|
|
"index": 10
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
51,
|
|
328,
|
|
313,
|
|
382
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
327,
|
|
236,
|
|
342
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
327,
|
|
236,
|
|
342
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Feed text blocks to LangExtract",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 11
|
|
},
|
|
{
|
|
"bbox": [
|
|
50,
|
|
348,
|
|
312,
|
|
362
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
348,
|
|
312,
|
|
362
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Extract entities with char_interval positions",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 12
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
368,
|
|
274,
|
|
382
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
368,
|
|
274,
|
|
382
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Extract relationships between entities",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 13
|
|
}
|
|
],
|
|
"index": 12
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
41,
|
|
408,
|
|
192,
|
|
421
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
408,
|
|
192,
|
|
422
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
408,
|
|
192,
|
|
422
|
|
],
|
|
"score": 1.0,
|
|
"content": "Stage 4: Graph Construction",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 14
|
|
}
|
|
],
|
|
"index": 14
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
51,
|
|
428,
|
|
311,
|
|
481
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
426,
|
|
285,
|
|
443
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
426,
|
|
285,
|
|
443
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Map extractions to graph nodes and edges",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 15
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
447,
|
|
311,
|
|
462
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
447,
|
|
311,
|
|
462
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Store with source provenance (page_idx, bbox)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 16
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
467,
|
|
302,
|
|
481
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
467,
|
|
302,
|
|
481
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Build vector embeddings for semantic search",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 17
|
|
}
|
|
],
|
|
"index": 16
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
508,
|
|
194,
|
|
522
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
507,
|
|
195,
|
|
524
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
507,
|
|
195,
|
|
524
|
|
],
|
|
"score": 1.0,
|
|
"content": "4. Supported File Formats",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 18
|
|
}
|
|
],
|
|
"index": 18
|
|
},
|
|
{
|
|
"type": "table",
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
678
|
|
],
|
|
"blocks": [
|
|
{
|
|
"type": "table_body",
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
678
|
|
],
|
|
"group_id": 0,
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
678
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
678
|
|
],
|
|
"score": 0.985,
|
|
"html": "<table><tr><td rowspan=1 colspan=1>Format</td><td rowspan=1 colspan=1>Extension</td><td rowspan=1 colspan=1>OCR Required</td><td rowspan=1 colspan=1>ModeI</td></tr><tr><td rowspan=1 colspan=1>PDF (text)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline / vlm</td></tr><tr><td rowspan=1 colspan=1>PDF (scan)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>Yes</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>Word</td><td rowspan=1 colspan=1>. docx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>PowerPoint</td><td rowspan=1 colspan=1>.pptx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>Image</td><td rowspan=1 colspan=1>.png / .jpg</td><td rowspan=1 colspan=1>Auto</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>HTML</td><td rowspan=1 colspan=1>.html</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>MinerU-HTML</td></tr></table>",
|
|
"type": "table",
|
|
"image_path": "1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"index": 20,
|
|
"virtual_lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
582.0
|
|
],
|
|
"spans": [],
|
|
"index": 19
|
|
},
|
|
{
|
|
"bbox": [
|
|
27,
|
|
582.0,
|
|
525,
|
|
630.0
|
|
],
|
|
"spans": [],
|
|
"index": 20
|
|
},
|
|
{
|
|
"bbox": [
|
|
27,
|
|
630.0,
|
|
525,
|
|
678.0
|
|
],
|
|
"spans": [],
|
|
"index": 21
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"index": 20
|
|
}
|
|
],
|
|
"page_idx": 2,
|
|
"page_size": [
|
|
595,
|
|
841
|
|
],
|
|
"discarded_blocks": [],
|
|
"para_blocks": [
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
36,
|
|
160,
|
|
52
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
34,
|
|
162,
|
|
54
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
34,
|
|
162,
|
|
54
|
|
],
|
|
"score": 1.0,
|
|
"content": "3. Data Pipeline",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 0
|
|
}
|
|
],
|
|
"index": 0
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
30,
|
|
70,
|
|
371,
|
|
84
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
68,
|
|
373,
|
|
86
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
68,
|
|
373,
|
|
86
|
|
],
|
|
"score": 1.0,
|
|
"content": "The end-to-end data pipeline consists of the following stages:",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 1
|
|
}
|
|
],
|
|
"index": 1,
|
|
"bbox_fs": [
|
|
28,
|
|
68,
|
|
373,
|
|
86
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
41,
|
|
110,
|
|
192,
|
|
123
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
109,
|
|
192,
|
|
124
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
109,
|
|
192,
|
|
124
|
|
],
|
|
"score": 1.0,
|
|
"content": "Stage 1: Document Ingestion",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 2
|
|
}
|
|
],
|
|
"index": 2,
|
|
"bbox_fs": [
|
|
39,
|
|
109,
|
|
192,
|
|
124
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
51,
|
|
129,
|
|
316,
|
|
183
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
129,
|
|
317,
|
|
145
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
129,
|
|
317,
|
|
145
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Accept raw documents (PDF, DOCX, images, HTML)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 3
|
|
},
|
|
{
|
|
"bbox": [
|
|
50,
|
|
149,
|
|
242,
|
|
165
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
149,
|
|
242,
|
|
165
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Submit to MinerU API for parsing",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 4
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
169,
|
|
258,
|
|
184
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
169,
|
|
220,
|
|
184
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Poll task status until state",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
221,
|
|
171,
|
|
231,
|
|
181
|
|
],
|
|
"score": 0.76,
|
|
"content": "\\underline { { \\underline { { \\mathbf { \\delta \\pi } } } } }",
|
|
"type": "inline_equation"
|
|
},
|
|
{
|
|
"bbox": [
|
|
231,
|
|
169,
|
|
258,
|
|
184
|
|
],
|
|
"score": 1.0,
|
|
"content": "done",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 5
|
|
}
|
|
],
|
|
"index": 4,
|
|
"bbox_fs": [
|
|
50,
|
|
129,
|
|
317,
|
|
184
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
41,
|
|
210,
|
|
192,
|
|
222
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
209,
|
|
192,
|
|
223
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
209,
|
|
192,
|
|
223
|
|
],
|
|
"score": 1.0,
|
|
"content": "Stage 2: Content Extraction",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 6
|
|
}
|
|
],
|
|
"index": 6,
|
|
"bbox_fs": [
|
|
40,
|
|
209,
|
|
192,
|
|
223
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
51,
|
|
229,
|
|
323,
|
|
282
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
229,
|
|
262,
|
|
243
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
229,
|
|
262,
|
|
243
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Download and decompress full_zip_url",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 7
|
|
},
|
|
{
|
|
"bbox": [
|
|
50,
|
|
248,
|
|
313,
|
|
263
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
248,
|
|
313,
|
|
263
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Parse content_list.json into Document objects",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 8
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
269,
|
|
323,
|
|
284
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
269,
|
|
323,
|
|
284
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Separate text blocks, tables, images, equations",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 9
|
|
}
|
|
],
|
|
"index": 8,
|
|
"bbox_fs": [
|
|
50,
|
|
229,
|
|
323,
|
|
284
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
40,
|
|
309,
|
|
247,
|
|
321
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
308,
|
|
247,
|
|
322
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
308,
|
|
247,
|
|
322
|
|
],
|
|
"score": 1.0,
|
|
"content": "Stage 3: Entity & Relation Extraction",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 10
|
|
}
|
|
],
|
|
"index": 10,
|
|
"bbox_fs": [
|
|
40,
|
|
308,
|
|
247,
|
|
322
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
51,
|
|
328,
|
|
313,
|
|
382
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
327,
|
|
236,
|
|
342
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
327,
|
|
236,
|
|
342
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Feed text blocks to LangExtract",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 11
|
|
},
|
|
{
|
|
"bbox": [
|
|
50,
|
|
348,
|
|
312,
|
|
362
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
348,
|
|
312,
|
|
362
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Extract entities with char_interval positions",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 12
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
368,
|
|
274,
|
|
382
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
368,
|
|
274,
|
|
382
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Extract relationships between entities",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 13
|
|
}
|
|
],
|
|
"index": 12,
|
|
"bbox_fs": [
|
|
50,
|
|
327,
|
|
312,
|
|
382
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
41,
|
|
408,
|
|
192,
|
|
421
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
408,
|
|
192,
|
|
422
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
408,
|
|
192,
|
|
422
|
|
],
|
|
"score": 1.0,
|
|
"content": "Stage 4: Graph Construction",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 14
|
|
}
|
|
],
|
|
"index": 14,
|
|
"bbox_fs": [
|
|
40,
|
|
408,
|
|
192,
|
|
422
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
51,
|
|
428,
|
|
311,
|
|
481
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
426,
|
|
285,
|
|
443
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
50,
|
|
426,
|
|
285,
|
|
443
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Map extractions to graph nodes and edges",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 15
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
447,
|
|
311,
|
|
462
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
447,
|
|
311,
|
|
462
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Store with source provenance (page_idx, bbox)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 16
|
|
},
|
|
{
|
|
"bbox": [
|
|
51,
|
|
467,
|
|
302,
|
|
481
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
51,
|
|
467,
|
|
302,
|
|
481
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Build vector embeddings for semantic search",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 17
|
|
}
|
|
],
|
|
"index": 16,
|
|
"bbox_fs": [
|
|
50,
|
|
426,
|
|
311,
|
|
481
|
|
]
|
|
},
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
30,
|
|
508,
|
|
194,
|
|
522
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
507,
|
|
195,
|
|
524
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
507,
|
|
195,
|
|
524
|
|
],
|
|
"score": 1.0,
|
|
"content": "4. Supported File Formats",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 18
|
|
}
|
|
],
|
|
"index": 18
|
|
},
|
|
{
|
|
"type": "table",
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
678
|
|
],
|
|
"blocks": [
|
|
{
|
|
"type": "table_body",
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
678
|
|
],
|
|
"group_id": 0,
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
678
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
678
|
|
],
|
|
"score": 0.985,
|
|
"html": "<table><tr><td rowspan=1 colspan=1>Format</td><td rowspan=1 colspan=1>Extension</td><td rowspan=1 colspan=1>OCR Required</td><td rowspan=1 colspan=1>ModeI</td></tr><tr><td rowspan=1 colspan=1>PDF (text)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline / vlm</td></tr><tr><td rowspan=1 colspan=1>PDF (scan)</td><td rowspan=1 colspan=1>. pdf</td><td rowspan=1 colspan=1>Yes</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>Word</td><td rowspan=1 colspan=1>. docx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>PowerPoint</td><td rowspan=1 colspan=1>.pptx</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>pipeline</td></tr><tr><td rowspan=1 colspan=1>Image</td><td rowspan=1 colspan=1>.png / .jpg</td><td rowspan=1 colspan=1>Auto</td><td rowspan=1 colspan=1>vIlm</td></tr><tr><td rowspan=1 colspan=1>HTML</td><td rowspan=1 colspan=1>.html</td><td rowspan=1 colspan=1>No</td><td rowspan=1 colspan=1>MinerU-HTML</td></tr></table>",
|
|
"type": "table",
|
|
"image_path": "1ed7aacecd20fecef8dc27ee2fe76dc1ae7fa93c44f7d10878d17a41f21a6bef.jpg"
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"index": 20,
|
|
"virtual_lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
534,
|
|
525,
|
|
582.0
|
|
],
|
|
"spans": [],
|
|
"index": 19
|
|
},
|
|
{
|
|
"bbox": [
|
|
27,
|
|
582.0,
|
|
525,
|
|
630.0
|
|
],
|
|
"spans": [],
|
|
"index": 20
|
|
},
|
|
{
|
|
"bbox": [
|
|
27,
|
|
630.0,
|
|
525,
|
|
678.0
|
|
],
|
|
"spans": [],
|
|
"index": 21
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"index": 20
|
|
}
|
|
]
|
|
},
|
|
{
|
|
"preproc_blocks": [
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
29,
|
|
36,
|
|
272,
|
|
53
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
33,
|
|
274,
|
|
55
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
33,
|
|
274,
|
|
55
|
|
],
|
|
"score": 1.0,
|
|
"content": "5. API Configuration Reference",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 0
|
|
}
|
|
],
|
|
"index": 0
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
70,
|
|
567,
|
|
104
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
67,
|
|
567,
|
|
87
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
67,
|
|
567,
|
|
87
|
|
],
|
|
"score": 1.0,
|
|
"content": "The following environment variables must be configured before running the MinerU parsing",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 1
|
|
},
|
|
{
|
|
"bbox": [
|
|
27,
|
|
90,
|
|
77,
|
|
105
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
90,
|
|
77,
|
|
105
|
|
],
|
|
"score": 1.0,
|
|
"content": "service:",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 2
|
|
}
|
|
],
|
|
"index": 1.5
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
39,
|
|
128,
|
|
379,
|
|
284
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
129,
|
|
362,
|
|
145
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
130,
|
|
132,
|
|
144
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_API_TOKEN",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
155,
|
|
129,
|
|
362,
|
|
145
|
|
],
|
|
"score": 1.0,
|
|
"content": ": Bearer token for API authentication",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 3
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
149,
|
|
335,
|
|
165
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
149,
|
|
126,
|
|
165
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_USER_UID",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
156,
|
|
149,
|
|
335,
|
|
164
|
|
],
|
|
"score": 1.0,
|
|
"content": ": User UUID for quota management",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 4
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
170,
|
|
307,
|
|
183
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
170,
|
|
126,
|
|
183
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_BASE_URL",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
156,
|
|
170,
|
|
307,
|
|
183
|
|
],
|
|
"score": 1.0,
|
|
"content": ": https://mineru.net/api/v4",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 5
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
189,
|
|
379,
|
|
204
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
189,
|
|
379,
|
|
204
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 6
|
|
},
|
|
{
|
|
"bbox": [
|
|
40,
|
|
210,
|
|
316,
|
|
223
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
210,
|
|
126,
|
|
223
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_LANGUAGE",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
156,
|
|
210,
|
|
316,
|
|
223
|
|
],
|
|
"score": 1.0,
|
|
"content": ": ch (Chinese) | en (English)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 7
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
229,
|
|
371,
|
|
244
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
230,
|
|
115,
|
|
244
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_IS_OCR",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
156,
|
|
229,
|
|
371,
|
|
243
|
|
],
|
|
"score": 1.0,
|
|
"content": ": false (text PDF) | true (scanned PDF)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 8
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
249,
|
|
236,
|
|
263
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
249,
|
|
236,
|
|
263
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_ENABLE_FORMULA: true | false",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 9
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
269,
|
|
236,
|
|
282
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
269,
|
|
236,
|
|
282
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_ENABLE_TABLE : true | false",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 10
|
|
}
|
|
],
|
|
"index": 6.5
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
309,
|
|
96,
|
|
321
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
308,
|
|
97,
|
|
322
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
308,
|
|
97,
|
|
322
|
|
],
|
|
"score": 1.0,
|
|
"content": "Rate Limits:",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 11
|
|
}
|
|
],
|
|
"index": 11
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
39,
|
|
327,
|
|
300,
|
|
402
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
327,
|
|
242,
|
|
343
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
327,
|
|
126,
|
|
342
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Max file size",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
139,
|
|
327,
|
|
242,
|
|
343
|
|
],
|
|
"score": 1.0,
|
|
"content": ": 200 MB per file",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 12
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
347,
|
|
258,
|
|
364
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
347,
|
|
104,
|
|
364
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Max pages",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
145,
|
|
348,
|
|
258,
|
|
363
|
|
],
|
|
"score": 1.0,
|
|
"content": ": 600 pages per file",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 13
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
367,
|
|
300,
|
|
383
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
367,
|
|
115,
|
|
383
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Daily quota",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
145,
|
|
368,
|
|
300,
|
|
383
|
|
],
|
|
"score": 1.0,
|
|
"content": ": 2000 pages (high priority)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 14
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
387,
|
|
274,
|
|
403
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
387,
|
|
116,
|
|
402
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Batch limit",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
144,
|
|
387,
|
|
274,
|
|
403
|
|
],
|
|
"score": 1.0,
|
|
"content": ": 200 files per request",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 15
|
|
}
|
|
],
|
|
"index": 13.5
|
|
}
|
|
],
|
|
"page_idx": 3,
|
|
"page_size": [
|
|
595,
|
|
841
|
|
],
|
|
"discarded_blocks": [],
|
|
"para_blocks": [
|
|
{
|
|
"type": "title",
|
|
"bbox": [
|
|
29,
|
|
36,
|
|
272,
|
|
53
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
33,
|
|
274,
|
|
55
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
33,
|
|
274,
|
|
55
|
|
],
|
|
"score": 1.0,
|
|
"content": "5. API Configuration Reference",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 0
|
|
}
|
|
],
|
|
"index": 0
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
70,
|
|
567,
|
|
104
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
67,
|
|
567,
|
|
87
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
67,
|
|
567,
|
|
87
|
|
],
|
|
"score": 1.0,
|
|
"content": "The following environment variables must be configured before running the MinerU parsing",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 1
|
|
},
|
|
{
|
|
"bbox": [
|
|
27,
|
|
90,
|
|
77,
|
|
105
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
27,
|
|
90,
|
|
77,
|
|
105
|
|
],
|
|
"score": 1.0,
|
|
"content": "service:",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 2
|
|
}
|
|
],
|
|
"index": 1.5,
|
|
"bbox_fs": [
|
|
27,
|
|
67,
|
|
567,
|
|
105
|
|
]
|
|
},
|
|
{
|
|
"type": "list",
|
|
"bbox": [
|
|
39,
|
|
128,
|
|
379,
|
|
284
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
129,
|
|
362,
|
|
145
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
130,
|
|
132,
|
|
144
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_API_TOKEN",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
155,
|
|
129,
|
|
362,
|
|
145
|
|
],
|
|
"score": 1.0,
|
|
"content": ": Bearer token for API authentication",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 3,
|
|
"is_list_start_line": true
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
149,
|
|
335,
|
|
165
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
149,
|
|
126,
|
|
165
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_USER_UID",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
156,
|
|
149,
|
|
335,
|
|
164
|
|
],
|
|
"score": 1.0,
|
|
"content": ": User UUID for quota management",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 4,
|
|
"is_list_start_line": true
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
170,
|
|
307,
|
|
183
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
170,
|
|
126,
|
|
183
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_BASE_URL",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
156,
|
|
170,
|
|
307,
|
|
183
|
|
],
|
|
"score": 1.0,
|
|
"content": ": https://mineru.net/api/v4",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 5,
|
|
"is_list_start_line": true
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
189,
|
|
379,
|
|
204
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
189,
|
|
379,
|
|
204
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_MODEL_VERSION : pipeline (default) | vlm | MinerU-HTML",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 6,
|
|
"is_list_start_line": true
|
|
},
|
|
{
|
|
"bbox": [
|
|
40,
|
|
210,
|
|
316,
|
|
223
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
40,
|
|
210,
|
|
126,
|
|
223
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_LANGUAGE",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
156,
|
|
210,
|
|
316,
|
|
223
|
|
],
|
|
"score": 1.0,
|
|
"content": ": ch (Chinese) | en (English)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 7,
|
|
"is_list_start_line": true
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
229,
|
|
371,
|
|
244
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
230,
|
|
115,
|
|
244
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_IS_OCR",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
156,
|
|
229,
|
|
371,
|
|
243
|
|
],
|
|
"score": 1.0,
|
|
"content": ": false (text PDF) | true (scanned PDF)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 8,
|
|
"is_list_start_line": true
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
249,
|
|
236,
|
|
263
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
249,
|
|
236,
|
|
263
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_ENABLE_FORMULA: true | false",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 9,
|
|
"is_list_start_line": true
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
269,
|
|
236,
|
|
282
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
269,
|
|
236,
|
|
282
|
|
],
|
|
"score": 1.0,
|
|
"content": "MINERU_ENABLE_TABLE : true | false",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 10,
|
|
"is_list_start_line": true
|
|
}
|
|
],
|
|
"index": 6.5,
|
|
"bbox_fs": [
|
|
39,
|
|
129,
|
|
379,
|
|
282
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
29,
|
|
309,
|
|
96,
|
|
321
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
308,
|
|
97,
|
|
322
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
28,
|
|
308,
|
|
97,
|
|
322
|
|
],
|
|
"score": 1.0,
|
|
"content": "Rate Limits:",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 11
|
|
}
|
|
],
|
|
"index": 11,
|
|
"bbox_fs": [
|
|
28,
|
|
308,
|
|
97,
|
|
322
|
|
]
|
|
},
|
|
{
|
|
"type": "text",
|
|
"bbox": [
|
|
39,
|
|
327,
|
|
300,
|
|
402
|
|
],
|
|
"lines": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
327,
|
|
242,
|
|
343
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
327,
|
|
126,
|
|
342
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Max file size",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
139,
|
|
327,
|
|
242,
|
|
343
|
|
],
|
|
"score": 1.0,
|
|
"content": ": 200 MB per file",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 12
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
347,
|
|
258,
|
|
364
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
347,
|
|
104,
|
|
364
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Max pages",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
145,
|
|
348,
|
|
258,
|
|
363
|
|
],
|
|
"score": 1.0,
|
|
"content": ": 600 pages per file",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 13
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
367,
|
|
300,
|
|
383
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
367,
|
|
115,
|
|
383
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Daily quota",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
145,
|
|
368,
|
|
300,
|
|
383
|
|
],
|
|
"score": 1.0,
|
|
"content": ": 2000 pages (high priority)",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 14
|
|
},
|
|
{
|
|
"bbox": [
|
|
39,
|
|
387,
|
|
274,
|
|
403
|
|
],
|
|
"spans": [
|
|
{
|
|
"bbox": [
|
|
39,
|
|
387,
|
|
116,
|
|
402
|
|
],
|
|
"score": 1.0,
|
|
"content": "- Batch limit",
|
|
"type": "text"
|
|
},
|
|
{
|
|
"bbox": [
|
|
144,
|
|
387,
|
|
274,
|
|
403
|
|
],
|
|
"score": 1.0,
|
|
"content": ": 200 files per request",
|
|
"type": "text"
|
|
}
|
|
],
|
|
"index": 15
|
|
}
|
|
],
|
|
"index": 13.5,
|
|
"bbox_fs": [
|
|
39,
|
|
327,
|
|
300,
|
|
403
|
|
]
|
|
}
|
|
]
|
|
}
|
|
],
|
|
"_backend": "pipeline",
|
|
"_version_name": "2.7.6"
|
|
} |