{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:SGK536E6KCACMPJDU3T2K5CQFI","short_pith_number":"pith:SGK536E6","schema_version":"1.0","canonical_sha256":"9195ddf89e5080263d23a6e7a574502a11b49b5c68f37b915fa4f4368720acc3","source":{"kind":"arxiv","id":"2505.18603","version":2},"attestation_state":"computed","paper":{"title":"Doc-CoB: Enhancing Document Understanding with Visual Chain-of-Boxes Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.AI","authors_text":"Bo Zhang, Gang Huang, Hangdi Xing, Huan Zhou, Jiajun Bu, Kai Ye, Kehan Chen, Sheng Zhou, Xianwei Mao, Ye Mo, Zirui Shao, Zixu Yan","submitted_at":"2025-05-24T08:53:05Z","abstract_excerpt":"Document understanding aims to perform question answering and information extraction over document images, where the visual content is highly information-dense and most queries rely on only a few relevant layout regions. However, existing methods either adopt a one-pass strategy that implicitly assumes all layouts are equally important, or focus excessively on small regions at the cost of losing critical layout information. To address these limitations, we introduce Doc-CoB (Chain-of-Boxes), a simple-yet-effective framework that integrates coarse-to-fine layout-aware visual reasoning into mult"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2505.18603","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-05-24T08:53:05Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"d303844761d84f8ce23cc7700c82cae9cc4afbe878d366435a9cb9089745d474","abstract_canon_sha256":"7908b9fe0649fa224dd11dbd3eb447458eb791f01da9b82e375c15fe7965b83e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T02:05:03.387205Z","signature_b64":"KTQIoNyUDq4tO0hzWlr1TsPZ8omj2puPKzf/hzDwDpl5SRljvTbhrtLp0vysoAloNbLyRJrT0UwG0zrYZe7jCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9195ddf89e5080263d23a6e7a574502a11b49b5c68f37b915fa4f4368720acc3","last_reissued_at":"2026-05-27T02:05:03.386365Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T02:05:03.386365Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Doc-CoB: Enhancing Document Understanding with Visual Chain-of-Boxes Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.AI","authors_text":"Bo Zhang, Gang Huang, Hangdi Xing, Huan Zhou, Jiajun Bu, Kai Ye, Kehan Chen, Sheng Zhou, Xianwei Mao, Ye Mo, Zirui Shao, Zixu Yan","submitted_at":"2025-05-24T08:53:05Z","abstract_excerpt":"Document understanding aims to perform question answering and information extraction over document images, where the visual content is highly information-dense and most queries rely on only a few relevant layout regions. However, existing methods either adopt a one-pass strategy that implicitly assumes all layouts are equally important, or focus excessively on small regions at the cost of losing critical layout information. To address these limitations, we introduce Doc-CoB (Chain-of-Boxes), a simple-yet-effective framework that integrates coarse-to-fine layout-aware visual reasoning into mult"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2505.18603","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2505.18603/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2505.18603","created_at":"2026-05-27T02:05:03.386496+00:00"},{"alias_kind":"arxiv_version","alias_value":"2505.18603v2","created_at":"2026-05-27T02:05:03.386496+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.18603","created_at":"2026-05-27T02:05:03.386496+00:00"},{"alias_kind":"pith_short_12","alias_value":"SGK536E6KCAC","created_at":"2026-05-27T02:05:03.386496+00:00"},{"alias_kind":"pith_short_16","alias_value":"SGK536E6KCACMPJD","created_at":"2026-05-27T02:05:03.386496+00:00"},{"alias_kind":"pith_short_8","alias_value":"SGK536E6","created_at":"2026-05-27T02:05:03.386496+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.03903","citing_title":"CC-OCR V2: Benchmarking Large Multimodal Models for Literacy in Real-world Document Processing","ref_index":60,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI","json":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI.json","graph_json":"https://pith.science/api/pith-number/SGK536E6KCACMPJDU3T2K5CQFI/graph.json","events_json":"https://pith.science/api/pith-number/SGK536E6KCACMPJDU3T2K5CQFI/events.json","paper":"https://pith.science/paper/SGK536E6"},"agent_actions":{"view_html":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI","download_json":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI.json","view_paper":"https://pith.science/paper/SGK536E6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2505.18603&json=true","fetch_graph":"https://pith.science/api/pith-number/SGK536E6KCACMPJDU3T2K5CQFI/graph.json","fetch_events":"https://pith.science/api/pith-number/SGK536E6KCACMPJDU3T2K5CQFI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI/action/storage_attestation","attest_author":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI/action/author_attestation","sign_citation":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI/action/citation_signature","submit_replication":"https://pith.science/pith/SGK536E6KCACMPJDU3T2K5CQFI/action/replication_record"}},"created_at":"2026-05-27T02:05:03.386496+00:00","updated_at":"2026-05-27T02:05:03.386496+00:00"}