{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:CMDBZD66D25STJEVFILEYMFOWV","short_pith_number":"pith:CMDBZD66","canonical_record":{"source":{"id":"2409.18839","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-09-27T15:35:15Z","cross_cats_sorted":[],"title_canon_sha256":"818fa4332e6a9e69b932219b941c25e6d0005107a01f923c193435bdef7819b0","abstract_canon_sha256":"e86d00729bac6e949a7c8694a6ca9a66683ccb6e67b205c75372e165b03567f5"},"schema_version":"1.0"},"canonical_sha256":"13061c8fde1ebb29a4952a164c30aeb575585f6faadc075ce8f64822a9da0bc3","source":{"kind":"arxiv","id":"2409.18839","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2409.18839","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2409.18839v1","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2409.18839","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"CMDBZD66D25S","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"CMDBZD66D25STJEV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"CMDBZD66","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:CMDBZD66D25STJEVFILEYMFOWV","target":"record","payload":{"canonical_record":{"source":{"id":"2409.18839","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-09-27T15:35:15Z","cross_cats_sorted":[],"title_canon_sha256":"818fa4332e6a9e69b932219b941c25e6d0005107a01f923c193435bdef7819b0","abstract_canon_sha256":"e86d00729bac6e949a7c8694a6ca9a66683ccb6e67b205c75372e165b03567f5"},"schema_version":"1.0"},"canonical_sha256":"13061c8fde1ebb29a4952a164c30aeb575585f6faadc075ce8f64822a9da0bc3","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.166923Z","signature_b64":"pcDGjN2SYbyKWTL4I7xa/Mtoblu57TlVapLDSCilcuFGcjxfCmiwjEK7KhNhgXq+2TiYYvcCV9KjfU6Mij1fCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"13061c8fde1ebb29a4952a164c30aeb575585f6faadc075ce8f64822a9da0bc3","last_reissued_at":"2026-05-17T23:38:49.166438Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.166438Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2409.18839","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"dn45ui4ngahlDo4MGehERVf12crtYnKPGcHlX2qa9fe1a4nByrLIyZjnuVVKuoRORldB+vi5mhXEVs+nlPKkAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T02:10:00.020075Z"},"content_sha256":"06044e8ee84882557faa3a79489c64d02b81b5fb5f743f5bfad8ae11d908e329","schema_version":"1.0","event_id":"sha256:06044e8ee84882557faa3a79489c64d02b81b5fb5f743f5bfad8ae11d908e329"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:CMDBZD66D25STJEVFILEYMFOWV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MinerU: An Open-Source Solution for Precise Document Content Extraction","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"MinerU combines PDF-Extract-Kit models with custom rules to deliver high-precision document content extraction in open source.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bin Wang, Botian Shi, Bo Zhang, Chao Xu, Conghui He, Dahua Lin, Fan Wu, Fukai Shang, Kaiwen Liu, Linke Ouyang, Liqun Wei, Rui Xu, Wei Li, Xiaomeng Zhao, Yuan Qu, Yu Qiao, Zhihao Sui, Zhiyuan Zhao","submitted_at":"2024-09-27T15:35:15Z","abstract_excerpt":"Document content analysis has been a crucial research area in computer vision. Despite significant advancements in methods such as OCR, layout detection, and formula recognition, existing open-source solutions struggle to consistently deliver high-quality content extraction due to the diversity in document types and content. To address these challenges, we present MinerU, an open-source solution for high-precision document content extraction. MinerU leverages the sophisticated PDF-Extract-Kit models to extract content from diverse documents effectively and employs finely-tuned preprocessing an"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Experimental results demonstrate that MinerU consistently achieves high performance across various document types, significantly enhancing the quality and consistency of content extraction.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the PDF-Extract-Kit models plus the authors' preprocessing and postprocessing rules generalize beyond the tested document collection and that the reported performance metrics reflect real-world usage without hidden data selection.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MinerU delivers an open-source pipeline for high-precision document content extraction by integrating specialized models with tuned preprocessing and postprocessing rules.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"MinerU combines PDF-Extract-Kit models with custom rules to deliver high-precision document content extraction in open source.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"77b2a58cbf05d5a29e2ed675aa27c59f7dfff80decb606046344161710d5bcbc"},"source":{"id":"2409.18839","kind":"arxiv","version":1},"verdict":{"id":"356fc045-d559-45ef-add9-5100058d9222","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T03:55:23.006745Z","strongest_claim":"Experimental results demonstrate that MinerU consistently achieves high performance across various document types, significantly enhancing the quality and consistency of content extraction.","one_line_summary":"MinerU delivers an open-source pipeline for high-precision document content extraction by integrating specialized models with tuned preprocessing and postprocessing rules.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the PDF-Extract-Kit models plus the authors' preprocessing and postprocessing rules generalize beyond the tested document collection and that the reported performance metrics reflect real-world usage without hidden data selection.","pith_extraction_headline":"MinerU combines PDF-Extract-Kit models with custom rules to deliver high-precision document content extraction in open source."},"references":{"count":42,"sample":[{"doi":"","year":2023,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","ref_index":1,"cited_arxiv_id":"2303.08774","is_internal_anchor":true},{"doi":"","year":2023,"title":"Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection","work_id":"7316de4a-d07d-41de-88ee-509f9b52e462","ref_index":2,"cited_arxiv_id":"2310.11511","is_internal_anchor":true},{"doi":"","year":null,"title":"pix2tex - latex ocr","work_id":"79002e27-8b88-4e41-b5d5-1ec5efe44db8","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Nougat: Neural Optical Understanding for Academic Documents","work_id":"26c3b627-7e97-40d7-bab3-020936b8196b","ref_index":4,"cited_arxiv_id":"2308.13418","is_internal_anchor":true},{"doi":"","year":2005,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","ref_index":5,"cited_arxiv_id":"2005.14165","is_internal_anchor":true}],"resolved_work":42,"snapshot_sha256":"5349b9a575508fcc5fcd470fa29a759ff7d0431cf6ff5204818d70daa2e8b5a8","internal_anchors":14},"formal_canon":{"evidence_count":2,"snapshot_sha256":"55ef9b60fae9115cb9d971c05748234a3dc8f3977f005150b1919c86f08a371f"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"356fc045-d559-45ef-add9-5100058d9222"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"M/g/h4ikef02cBwl42DoM0FoLyML0jysBpfUh94sMyGpAqeQW8H1hDIzpQN/inv+37lFEhM7Lfs8NGl0b2+gBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-31T02:10:00.020777Z"},"content_sha256":"8c8a8502c5515236237aaa93db38675c9858277c4e1caf44ea869aaf63396cde","schema_version":"1.0","event_id":"sha256:8c8a8502c5515236237aaa93db38675c9858277c4e1caf44ea869aaf63396cde"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/CMDBZD66D25STJEVFILEYMFOWV/bundle.json","state_url":"https://pith.science/pith/CMDBZD66D25STJEVFILEYMFOWV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/CMDBZD66D25STJEVFILEYMFOWV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-31T02:10:00Z","links":{"resolver":"https://pith.science/pith/CMDBZD66D25STJEVFILEYMFOWV","bundle":"https://pith.science/pith/CMDBZD66D25STJEVFILEYMFOWV/bundle.json","state":"https://pith.science/pith/CMDBZD66D25STJEVFILEYMFOWV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/CMDBZD66D25STJEVFILEYMFOWV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:CMDBZD66D25STJEVFILEYMFOWV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"e86d00729bac6e949a7c8694a6ca9a66683ccb6e67b205c75372e165b03567f5","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-09-27T15:35:15Z","title_canon_sha256":"818fa4332e6a9e69b932219b941c25e6d0005107a01f923c193435bdef7819b0"},"schema_version":"1.0","source":{"id":"2409.18839","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2409.18839","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2409.18839v1","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2409.18839","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"CMDBZD66D25S","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"CMDBZD66D25STJEV","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"CMDBZD66","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8c8a8502c5515236237aaa93db38675c9858277c4e1caf44ea869aaf63396cde","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experimental results demonstrate that MinerU consistently achieves high performance across various document types, significantly enhancing the quality and consistency of content extraction."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the PDF-Extract-Kit models plus the authors' preprocessing and postprocessing rules generalize beyond the tested document collection and that the reported performance metrics reflect real-world usage without hidden data selection."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MinerU delivers an open-source pipeline for high-precision document content extraction by integrating specialized models with tuned preprocessing and postprocessing rules."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"MinerU combines PDF-Extract-Kit models with custom rules to deliver high-precision document content extraction in open source."}],"snapshot_sha256":"77b2a58cbf05d5a29e2ed675aa27c59f7dfff80decb606046344161710d5bcbc"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"55ef9b60fae9115cb9d971c05748234a3dc8f3977f005150b1919c86f08a371f"},"paper":{"abstract_excerpt":"Document content analysis has been a crucial research area in computer vision. Despite significant advancements in methods such as OCR, layout detection, and formula recognition, existing open-source solutions struggle to consistently deliver high-quality content extraction due to the diversity in document types and content. To address these challenges, we present MinerU, an open-source solution for high-precision document content extraction. MinerU leverages the sophisticated PDF-Extract-Kit models to extract content from diverse documents effectively and employs finely-tuned preprocessing an","authors_text":"Bin Wang, Botian Shi, Bo Zhang, Chao Xu, Conghui He, Dahua Lin, Fan Wu, Fukai Shang, Kaiwen Liu, Linke Ouyang, Liqun Wei, Rui Xu, Wei Li, Xiaomeng Zhao, Yuan Qu, Yu Qiao, Zhihao Sui, Zhiyuan Zhao","cross_cats":[],"headline":"MinerU combines PDF-Extract-Kit models with custom rules to deliver high-precision document content extraction in open source.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-09-27T15:35:15Z","title":"MinerU: An Open-Source Solution for Precise Document Content Extraction"},"references":{"count":42,"internal_anchors":14,"resolved_work":42,"sample":[{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":2023},{"cited_arxiv_id":"2310.11511","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection","work_id":"7316de4a-d07d-41de-88ee-509f9b52e462","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"pix2tex - latex ocr","work_id":"79002e27-8b88-4e41-b5d5-1ec5efe44db8","year":null},{"cited_arxiv_id":"2308.13418","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Nougat: Neural Optical Understanding for Academic Documents","work_id":"26c3b627-7e97-40d7-bab3-020936b8196b","year":2023},{"cited_arxiv_id":"2005.14165","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","year":2005}],"snapshot_sha256":"5349b9a575508fcc5fcd470fa29a759ff7d0431cf6ff5204818d70daa2e8b5a8"},"source":{"id":"2409.18839","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-16T03:55:23.006745Z","id":"356fc045-d559-45ef-add9-5100058d9222","model_set":{"reader":"grok-4.3"},"one_line_summary":"MinerU delivers an open-source pipeline for high-precision document content extraction by integrating specialized models with tuned preprocessing and postprocessing rules.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"MinerU combines PDF-Extract-Kit models with custom rules to deliver high-precision document content extraction in open source.","strongest_claim":"Experimental results demonstrate that MinerU consistently achieves high performance across various document types, significantly enhancing the quality and consistency of content extraction.","weakest_assumption":"That the PDF-Extract-Kit models plus the authors' preprocessing and postprocessing rules generalize beyond the tested document collection and that the reported performance metrics reflect real-world usage without hidden data selection."}},"verdict_id":"356fc045-d559-45ef-add9-5100058d9222"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:06044e8ee84882557faa3a79489c64d02b81b5fb5f743f5bfad8ae11d908e329","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"e86d00729bac6e949a7c8694a6ca9a66683ccb6e67b205c75372e165b03567f5","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2024-09-27T15:35:15Z","title_canon_sha256":"818fa4332e6a9e69b932219b941c25e6d0005107a01f923c193435bdef7819b0"},"schema_version":"1.0","source":{"id":"2409.18839","kind":"arxiv","version":1}},"canonical_sha256":"13061c8fde1ebb29a4952a164c30aeb575585f6faadc075ce8f64822a9da0bc3","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"13061c8fde1ebb29a4952a164c30aeb575585f6faadc075ce8f64822a9da0bc3","first_computed_at":"2026-05-17T23:38:49.166438Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.166438Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"pcDGjN2SYbyKWTL4I7xa/Mtoblu57TlVapLDSCilcuFGcjxfCmiwjEK7KhNhgXq+2TiYYvcCV9KjfU6Mij1fCQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.166923Z","signed_message":"canonical_sha256_bytes"},"source_id":"2409.18839","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:06044e8ee84882557faa3a79489c64d02b81b5fb5f743f5bfad8ae11d908e329","sha256:8c8a8502c5515236237aaa93db38675c9858277c4e1caf44ea869aaf63396cde"],"state_sha256":"209268bceae31c5eb69b52d67b99c7c6fcc1d664e2c5d40638e9cda38357cfea"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"eQldq2EEcZcVLyOxDrz0p9rY1uABu+FYg3mHSOQrVXR6aa5hR5sAGAnrTBUuMirEOt6Q03oVFZ1SpRUF+HYFAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-31T02:10:00.024763Z","bundle_sha256":"ca156528d45f890465f2e8161771069383fddaeaf717684692b6bdc3d6e9db8b"}}