{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:B7LAWZNLDUKNYWFKMMPRTW5SII","short_pith_number":"pith:B7LAWZNL","canonical_record":{"source":{"id":"2604.04948","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.IR","submitted_at":"2026-03-30T14:40:58Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"faed6d372f21fc87b008678a97c37c4731d601fe02c585039e7af4f50ca62bfe","abstract_canon_sha256":"1c5abb90893fa4fa1e425bfe25f21a3aac15fc4115cdbe716923fe74ca2c39ab"},"schema_version":"1.0"},"canonical_sha256":"0fd60b65ab1d14dc58aa631f19dbb2421849346328f92888ecdfb939b4b1f9fa","source":{"kind":"arxiv","id":"2604.04948","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.04948","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"arxiv_version","alias_value":"2604.04948v2","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.04948","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_12","alias_value":"B7LAWZNLDUKN","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_16","alias_value":"B7LAWZNLDUKNYWFK","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_8","alias_value":"B7LAWZNL","created_at":"2026-05-27T01:04:57Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:B7LAWZNLDUKNYWFKMMPRTW5SII","target":"record","payload":{"canonical_record":{"source":{"id":"2604.04948","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.IR","submitted_at":"2026-03-30T14:40:58Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"faed6d372f21fc87b008678a97c37c4731d601fe02c585039e7af4f50ca62bfe","abstract_canon_sha256":"1c5abb90893fa4fa1e425bfe25f21a3aac15fc4115cdbe716923fe74ca2c39ab"},"schema_version":"1.0"},"canonical_sha256":"0fd60b65ab1d14dc58aa631f19dbb2421849346328f92888ecdfb939b4b1f9fa","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:04:57.708831Z","signature_b64":"XI2r8gze2qYBHhQt4LIsahN30m20GWFP+i0lzdihkaXe1+pKawQJlD4Js0AGo9pRoGEvPNwUoEcj1YC5WX5yDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0fd60b65ab1d14dc58aa631f19dbb2421849346328f92888ecdfb939b4b1f9fa","last_reissued_at":"2026-05-27T01:04:57.708295Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:04:57.708295Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2604.04948","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:04:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"GRCTOKHZaqJ8f//cRnVbaHZo8jaY8LGjW3Kx89b7eP8cJbMkmktdWr7cYQV1yKdItZ5thGk6kafyLtnF9Z2rAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T17:37:30.876985Z"},"content_sha256":"a02b6fdae1b942d97408f477d1ca277afd720a6fe9ad09100f8e18ab468cd495","schema_version":"1.0","event_id":"sha256:a02b6fdae1b942d97408f477d1ca277afd720a6fe9ad09100f8e18ab468cd495"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:B7LAWZNLDUKNYWFKMMPRTW5SII","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"From PDF to RAG-Ready: Evaluating Document Conversion Frameworks for Domain-Specific Question Answering","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"Metadata enrichment and hierarchy-aware chunking improve RAG accuracy more than the choice of PDF conversion framework.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.IR","authors_text":"Alexandre Sousa, Br\\'igida M\\'onica Faria, Henrique Lopes Cardoso, Jos\\'e Duarte, Jos\\'e Guilherme Marques dos Santos, Jos\\'e Lu\\'is Reis, Jos\\'e Paulo Marques dos Santos, Lu\\'is Paulo Reis, Pedro Pimenta, Ricardo Yang, Rui Humberto Pereira","submitted_at":"2026-03-30T14:40:58Z","abstract_excerpt":"Retrieval-Augmented Generation (RAG) systems depend critically on the quality of document preprocessing, yet no prior study has evaluated PDF processing frameworks by their impact on downstream question-answering accuracy. We address this gap through a systematic comparison of four open-source PDF-to-Markdown conversion frameworks, Docling, MinerU, Marker, and DeepSeek OCR, across 21 pipeline configurations, varying the conversion tool, cleaning transformations, splitting strategy, and metadata enrichment. Evaluation was performed using a 50-question benchmark over a corpus of 36 Portuguese ad"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Metadata enrichment and hierarchy-aware chunking contributed more to accuracy than the conversion framework choice alone.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That LLM-as-judge scoring on 50 questions reliably measures true downstream question-answering quality without human validation or error bars on the judge itself.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Docling with hierarchical splitting reaches 94.1% RAG accuracy on domain documents, beating naive PDF loading but trailing manual Markdown curation at 97.1%.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Metadata enrichment and hierarchy-aware chunking improve RAG accuracy more than the choice of PDF conversion framework.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"260ba82bc6f34be09a896a7ce9c1a3eecc6acc6b9d6e179d8a08b0974ddfe7bb"},"source":{"id":"2604.04948","kind":"arxiv","version":2},"verdict":{"id":"3cc2b978-4156-4faf-8c7a-f1392ddd943b","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T02:01:10.447492Z","strongest_claim":"Metadata enrichment and hierarchy-aware chunking contributed more to accuracy than the conversion framework choice alone.","one_line_summary":"Docling with hierarchical splitting reaches 94.1% RAG accuracy on domain documents, beating naive PDF loading but trailing manual Markdown curation at 97.1%.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That LLM-as-judge scoring on 50 questions reliably measures true downstream question-answering quality without human validation or error bars on the judge itself.","pith_extraction_headline":"Metadata enrichment and hierarchy-aware chunking improve RAG accuracy more than the choice of PDF conversion framework."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.04948/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f6f5e0ed5f4782a84fc4d1c0cc8c7c665b8caf693794c6e54accbd3d022efe5d"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"3cc2b978-4156-4faf-8c7a-f1392ddd943b"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-27T01:04:57Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"4GtspePiMtLdivY5EfxXHs28hudQAupLGq9f1wUgPBJrBhFgKjkyraU1CZude9DhgiV4h6v+ez6GUFncnJhdBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T17:37:30.877486Z"},"content_sha256":"32336e8672c6be7be3ab76ad533d9fe0acde57abea8b3a92cffc9c123ed77604","schema_version":"1.0","event_id":"sha256:32336e8672c6be7be3ab76ad533d9fe0acde57abea8b3a92cffc9c123ed77604"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/B7LAWZNLDUKNYWFKMMPRTW5SII/bundle.json","state_url":"https://pith.science/pith/B7LAWZNLDUKNYWFKMMPRTW5SII/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/B7LAWZNLDUKNYWFKMMPRTW5SII/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T17:37:30Z","links":{"resolver":"https://pith.science/pith/B7LAWZNLDUKNYWFKMMPRTW5SII","bundle":"https://pith.science/pith/B7LAWZNLDUKNYWFKMMPRTW5SII/bundle.json","state":"https://pith.science/pith/B7LAWZNLDUKNYWFKMMPRTW5SII/state.json","well_known_bundle":"https://pith.science/.well-known/pith/B7LAWZNLDUKNYWFKMMPRTW5SII/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:B7LAWZNLDUKNYWFKMMPRTW5SII","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"1c5abb90893fa4fa1e425bfe25f21a3aac15fc4115cdbe716923fe74ca2c39ab","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.IR","submitted_at":"2026-03-30T14:40:58Z","title_canon_sha256":"faed6d372f21fc87b008678a97c37c4731d601fe02c585039e7af4f50ca62bfe"},"schema_version":"1.0","source":{"id":"2604.04948","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.04948","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"arxiv_version","alias_value":"2604.04948v2","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.04948","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_12","alias_value":"B7LAWZNLDUKN","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_16","alias_value":"B7LAWZNLDUKNYWFK","created_at":"2026-05-27T01:04:57Z"},{"alias_kind":"pith_short_8","alias_value":"B7LAWZNL","created_at":"2026-05-27T01:04:57Z"}],"graph_snapshots":[{"event_id":"sha256:32336e8672c6be7be3ab76ad533d9fe0acde57abea8b3a92cffc9c123ed77604","target":"graph","created_at":"2026-05-27T01:04:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Metadata enrichment and hierarchy-aware chunking contributed more to accuracy than the conversion framework choice alone."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That LLM-as-judge scoring on 50 questions reliably measures true downstream question-answering quality without human validation or error bars on the judge itself."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Docling with hierarchical splitting reaches 94.1% RAG accuracy on domain documents, beating naive PDF loading but trailing manual Markdown curation at 97.1%."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Metadata enrichment and hierarchy-aware chunking improve RAG accuracy more than the choice of PDF conversion framework."}],"snapshot_sha256":"260ba82bc6f34be09a896a7ce9c1a3eecc6acc6b9d6e179d8a08b0974ddfe7bb"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"f6f5e0ed5f4782a84fc4d1c0cc8c7c665b8caf693794c6e54accbd3d022efe5d"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2604.04948/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Retrieval-Augmented Generation (RAG) systems depend critically on the quality of document preprocessing, yet no prior study has evaluated PDF processing frameworks by their impact on downstream question-answering accuracy. We address this gap through a systematic comparison of four open-source PDF-to-Markdown conversion frameworks, Docling, MinerU, Marker, and DeepSeek OCR, across 21 pipeline configurations, varying the conversion tool, cleaning transformations, splitting strategy, and metadata enrichment. Evaluation was performed using a 50-question benchmark over a corpus of 36 Portuguese ad","authors_text":"Alexandre Sousa, Br\\'igida M\\'onica Faria, Henrique Lopes Cardoso, Jos\\'e Duarte, Jos\\'e Guilherme Marques dos Santos, Jos\\'e Lu\\'is Reis, Jos\\'e Paulo Marques dos Santos, Lu\\'is Paulo Reis, Pedro Pimenta, Ricardo Yang, Rui Humberto Pereira","cross_cats":["cs.AI","cs.LG"],"headline":"Metadata enrichment and hierarchy-aware chunking improve RAG accuracy more than the choice of PDF conversion framework.","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.IR","submitted_at":"2026-03-30T14:40:58Z","title":"From PDF to RAG-Ready: Evaluating Document Conversion Frameworks for Domain-Specific Question Answering"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.04948","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-14T02:01:10.447492Z","id":"3cc2b978-4156-4faf-8c7a-f1392ddd943b","model_set":{"reader":"grok-4.3"},"one_line_summary":"Docling with hierarchical splitting reaches 94.1% RAG accuracy on domain documents, beating naive PDF loading but trailing manual Markdown curation at 97.1%.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Metadata enrichment and hierarchy-aware chunking improve RAG accuracy more than the choice of PDF conversion framework.","strongest_claim":"Metadata enrichment and hierarchy-aware chunking contributed more to accuracy than the conversion framework choice alone.","weakest_assumption":"That LLM-as-judge scoring on 50 questions reliably measures true downstream question-answering quality without human validation or error bars on the judge itself."}},"verdict_id":"3cc2b978-4156-4faf-8c7a-f1392ddd943b"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a02b6fdae1b942d97408f477d1ca277afd720a6fe9ad09100f8e18ab468cd495","target":"record","created_at":"2026-05-27T01:04:57Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"1c5abb90893fa4fa1e425bfe25f21a3aac15fc4115cdbe716923fe74ca2c39ab","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.IR","submitted_at":"2026-03-30T14:40:58Z","title_canon_sha256":"faed6d372f21fc87b008678a97c37c4731d601fe02c585039e7af4f50ca62bfe"},"schema_version":"1.0","source":{"id":"2604.04948","kind":"arxiv","version":2}},"canonical_sha256":"0fd60b65ab1d14dc58aa631f19dbb2421849346328f92888ecdfb939b4b1f9fa","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0fd60b65ab1d14dc58aa631f19dbb2421849346328f92888ecdfb939b4b1f9fa","first_computed_at":"2026-05-27T01:04:57.708295Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-27T01:04:57.708295Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"XI2r8gze2qYBHhQt4LIsahN30m20GWFP+i0lzdihkaXe1+pKawQJlD4Js0AGo9pRoGEvPNwUoEcj1YC5WX5yDQ==","signature_status":"signed_v1","signed_at":"2026-05-27T01:04:57.708831Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.04948","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a02b6fdae1b942d97408f477d1ca277afd720a6fe9ad09100f8e18ab468cd495","sha256:32336e8672c6be7be3ab76ad533d9fe0acde57abea8b3a92cffc9c123ed77604"],"state_sha256":"3fae2e184cec52d5cc04e4d895f8ee360fe3553af04787a6809e96d6416f48f0"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"J5EV0yyAbNceFM2Jf3BQIumJ7KwDsLX3VwgwXfyEVXokn2+fm2DxoKEpg2MhOgFuUhqZTvS3uXZwu1cma2ZGBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T17:37:30.880148Z","bundle_sha256":"208389fa3c78dab04aa0910d00eb6fcc27368741512c375e022093e60f144045"}}