{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:LE7ARFCNHEWEZEENCZ3Z4QA3V5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ef855c401c9db5f58828228443d2d54b7befe49e7a2d658a3c722ed3ecc37174","cross_cats_sorted":["cs.AI","cs.CL","cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2024-10-14T15:04:18Z","title_canon_sha256":"06d798e0973d1a421d2517422dcf3d932d2229c42b4b5b9dc66fd712adbdc73e"},"schema_version":"1.0","source":{"id":"2410.10594","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.10594","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2410.10594v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.10594","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"LE7ARFCNHEWE","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"LE7ARFCNHEWEZEEN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"LE7ARFCN","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:50584031e2bd1bff4683c80f9284481c5ea6d3fbe537627b478475770e5a2ebd","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Experiments demonstrate that VisRAG outperforms traditional RAG in both the retrieval and generation stages, achieving a 20--40% end-to-end performance gain over traditional text-based RAG pipeline."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That vision-language models can reliably embed and retrieve relevant information directly from document images without text parsing, and that the collected open-source plus synthetic training data generalizes to unseen real-world multi-modality documents."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"VisRAG achieves 20-40% better end-to-end performance than text-based RAG by directly embedding and retrieving document images with VLMs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"VisRAG retrieves and generates from multi-modal documents by embedding them directly as images rather than parsing to text."}],"snapshot_sha256":"b322cd4defbaa2d37499b4c3fd6488d49179f019f5515278d81ce571cbe6333c"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"06f901e2d13a8df9d09f1133efd5dab3dfdb4e98ddad94c2d634fa17de4a11bc"},"paper":{"abstract_excerpt":"Retrieval-augmented generation (RAG) is an effective technique that enables large language models (LLMs) to utilize external knowledge sources for generation. However, current RAG systems are solely based on text, rendering it impossible to utilize vision information like layout and images that play crucial roles in real-world multi-modality documents. In this paper, we introduce VisRAG, which tackles this issue by establishing a vision-language model (VLM)-based RAG pipeline. In this pipeline, instead of first parsing the document to obtain text, the document is directly embedded using a VLM ","authors_text":"Bokai Xu, Chaoyue Tang, Junbo Cui, Junhao Ran, Maosong Sun, Shi Yu, Shuo Wang, Xu Han, Yukun Yan, Zhenghao Liu, Zhiyuan Liu","cross_cats":["cs.AI","cs.CL","cs.CV"],"headline":"VisRAG retrieves and generates from multi-modal documents by embedding them directly as images rather than parsing to text.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2024-10-14T15:04:18Z","title":"VisRAG: Vision-based Retrieval-augmented Generation on Multi-modality Documents"},"references":{"count":43,"internal_anchors":11,"resolved_work":43,"sample":[{"cited_arxiv_id":"2303.08774","doi":"","is_internal_anchor":true,"ref_index":1,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"A multitask, multilingual, multimodal evaluation of chatgpt on reasoning, hallucination, and interactivity","work_id":"1438d57e-4867-4699-8c77-b4fdb11e5d85","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Allava: Harness- ing gpt4v-synthesized data for a lite vision-language model","work_id":"4cf5f4e3-c59a-4ccb-a655-563157a9ce74","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"PP-OCR: A practical ultra lightweight OCR system.CoRR, abs/2009.09941","work_id":"555e4547-6e8f-49f5-acf4-0adae9b359e3","year":2009},{"cited_arxiv_id":"2407.01449","doi":"","is_internal_anchor":true,"ref_index":5,"title":"ColPali: Efficient Document Retrieval with Vision Language Models","work_id":"d2468d08-90dc-4690-887a-9b10a6d3574e","year":null}],"snapshot_sha256":"7f5fed8f5fd4cda713d7c3cda7c3c4a24712e79509c6365858410fab1fa160c8"},"source":{"id":"2410.10594","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T15:33:33.637224Z","id":"6c6e11ae-29c1-4f68-86c2-5cc11845d6ed","model_set":{"reader":"grok-4.3"},"one_line_summary":"VisRAG achieves 20-40% better end-to-end performance than text-based RAG by directly embedding and retrieving document images with VLMs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"VisRAG retrieves and generates from multi-modal documents by embedding them directly as images rather than parsing to text.","strongest_claim":"Experiments demonstrate that VisRAG outperforms traditional RAG in both the retrieval and generation stages, achieving a 20--40% end-to-end performance gain over traditional text-based RAG pipeline.","weakest_assumption":"That vision-language models can reliably embed and retrieve relevant information directly from document images without text parsing, and that the collected open-source plus synthetic training data generalizes to unseen real-world multi-modality documents."}},"verdict_id":"6c6e11ae-29c1-4f68-86c2-5cc11845d6ed"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:9b27c566e3bfc9cf48d0baf0fed6ecfd04f902eb754417d3076ac345d1f13a3b","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ef855c401c9db5f58828228443d2d54b7befe49e7a2d658a3c722ed3ecc37174","cross_cats_sorted":["cs.AI","cs.CL","cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.IR","submitted_at":"2024-10-14T15:04:18Z","title_canon_sha256":"06d798e0973d1a421d2517422dcf3d932d2229c42b4b5b9dc66fd712adbdc73e"},"schema_version":"1.0","source":{"id":"2410.10594","kind":"arxiv","version":2}},"canonical_sha256":"593e08944d392c4c908d16779e401baf6845fa73cb450646cc58fec8f40735bd","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"593e08944d392c4c908d16779e401baf6845fa73cb450646cc58fec8f40735bd","first_computed_at":"2026-05-17T23:38:47.418247Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.418247Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"95gO1Rxm2FcNIi18xdDitQ515HjdXfi38YDN1wap3iFrQ6s1RgWJtS3LTRTM7WLoqLd3tlv+OiuskEvTlKq7Cw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.418750Z","signed_message":"canonical_sha256_bytes"},"source_id":"2410.10594","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:9b27c566e3bfc9cf48d0baf0fed6ecfd04f902eb754417d3076ac345d1f13a3b","sha256:50584031e2bd1bff4683c80f9284481c5ea6d3fbe537627b478475770e5a2ebd"],"state_sha256":"99013e48595d3c53df30bf9a4cd46a0513061f409b02655691337c555bf568db"}