{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:W5TXDENBMIXO2F26MJ3NBRYV7G","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"0721734a9b04f9ec320b89e48c436ad45051e6b51dca5d0413bbd952de1ab46d","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:40:21Z","title_canon_sha256":"b19eb8c9b0772b3c7d646a6b05cf2808298dfc8664910e53fc693adce349ce5c"},"schema_version":"1.0","source":{"id":"2605.12954","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12954","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12954v1","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12954","created_at":"2026-05-18T03:09:09Z"},{"alias_kind":"pith_short_12","alias_value":"W5TXDENBMIXO","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"W5TXDENBMIXO2F26","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"W5TXDENB","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:31a65536925a7506dfdae3666bee401dcca774901bcd6484a5a349d75693eb55","target":"graph","created_at":"2026-05-18T03:09:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"AdaFocus delivers a substantially better efficiency-accuracy trade-off than strong baselines. Compared with conventional dense encoding, AdaFocus achieves improved task performance (e.g., +2.59 accuracy on VideoMME, +8.39 mIoU on Charades-STA over single-pass inference) while reducing visual token consumption by ~33x and eliminating the need for in-memory frame pre-caching through its zero-cache disk retrieval design."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The uncertainty-triggered refinement mechanism can reliably identify when and which high-resolution evidence is needed from the initial low-cost preview, without missing critical details that would require exhaustive preloading."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"AdaFocus achieves better accuracy on long-video benchmarks with roughly 33 times fewer visual tokens by combining query-aware adaptive sampling and zero-cache disk-based refinement."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"AdaFocus improves long-video accuracy while cutting visual tokens by about 33 times through adaptive preview sampling and on-demand disk retrieval."}],"snapshot_sha256":"149ae7405348385853575b2a38e602ea91834d89db54625b86404a9a30957ce3"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Long video understanding is heavily bottlenecked by a rigid one-shot paradigm: existing methods either densely encode videos at prohibitive memory and latency costs, or aggressively compress them into sparse frame sets that irreversibly discard fine-grained evidence needed for downstream reasoning. Consequently, current models struggle to simultaneously balance temporal coverage, visual details, and computational efficiency.\n  We propose AdaFocus, an efficient framework that rethinks long-video understanding as progressive evidence acquisition rather than one-pass encoding. AdaFocus relies on ","authors_text":"Haoxuan Yu, Ning Qin, Xiao Yang, Yingzhe Ma, Zixin Li","cross_cats":["cs.AI"],"headline":"AdaFocus improves long-video accuracy while cutting visual tokens by about 33 times through adaptive preview sampling and on-demand disk retrieval.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:40:21Z","title":"AdaFocus: Adaptive Relevance-Diversity Sampling with Zero-Cache Look-back for Efficient Long Video Understanding"},"references":{"count":33,"internal_anchors":9,"resolved_work":33,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Lisa Anne Hendricks, Oliver Wang, Eli Shechtman, Josef Sivic, Trevor Darrell, and Bryan Russell. 2017. Localizing moments in video with natural language. In Proceedings of the IEEE international confe","work_id":"a050ad1f-bc2f-410c-a773-37094bb7af2b","year":2017},{"cited_arxiv_id":"2511.21631","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","year":2025},{"cited_arxiv_id":"2406.07476","doi":"","is_internal_anchor":true,"ref_index":3,"title":"VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs","work_id":"ccfc3f89-c510-45f1-8a35-ed1a56c0ae5c","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Tri Dao, Dan Fu, Stefano Ermon, Atri Rudra, and Christopher Ré. 2022. Flashat- tention: Fast and memory-efficient exact attention with io-awareness.Advances in neural information processing systems35 ","work_id":"15271a55-5c79-4cdd-b2cb-9ad910540658","year":2022},{"cited_arxiv_id":"2503.21776","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Video-R1: Reinforcing Video Reasoning in MLLMs","work_id":"0ce88332-564c-4361-8e2a-3850eb1ace9c","year":2025}],"snapshot_sha256":"55ce5452aad1530c14d7dea2c266a8ec2d187431d7a16fce547b8554ca71bd6e"},"source":{"id":"2605.12954","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T19:39:53.060195Z","id":"2d444176-e987-4148-9205-cafd94888b09","model_set":{"reader":"grok-4.3"},"one_line_summary":"AdaFocus achieves better accuracy on long-video benchmarks with roughly 33 times fewer visual tokens by combining query-aware adaptive sampling and zero-cache disk-based refinement.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"AdaFocus improves long-video accuracy while cutting visual tokens by about 33 times through adaptive preview sampling and on-demand disk retrieval.","strongest_claim":"AdaFocus delivers a substantially better efficiency-accuracy trade-off than strong baselines. Compared with conventional dense encoding, AdaFocus achieves improved task performance (e.g., +2.59 accuracy on VideoMME, +8.39 mIoU on Charades-STA over single-pass inference) while reducing visual token consumption by ~33x and eliminating the need for in-memory frame pre-caching through its zero-cache disk retrieval design.","weakest_assumption":"The uncertainty-triggered refinement mechanism can reliably identify when and which high-resolution evidence is needed from the initial low-cost preview, without missing critical details that would require exhaustive preloading."}},"verdict_id":"2d444176-e987-4148-9205-cafd94888b09"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f0c8cdbf83f9531d5dcf1206bc6e54e9759ef341550b01b48034ff56fdec2485","target":"record","created_at":"2026-05-18T03:09:09Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"0721734a9b04f9ec320b89e48c436ad45051e6b51dca5d0413bbd952de1ab46d","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-13T03:40:21Z","title_canon_sha256":"b19eb8c9b0772b3c7d646a6b05cf2808298dfc8664910e53fc693adce349ce5c"},"schema_version":"1.0","source":{"id":"2605.12954","kind":"arxiv","version":1}},"canonical_sha256":"b7677191a1622eed175e6276d0c715f9ad347ca20e9771c7a8c04eb571bb8d20","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b7677191a1622eed175e6276d0c715f9ad347ca20e9771c7a8c04eb571bb8d20","first_computed_at":"2026-05-18T03:09:09.314870Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:09:09.314870Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"shNOn90FjoaMisr6MCbUsm9ZXxbHVBM7BUjtrEwFCLyPgZQRqTvpLe7ECAcGiNV8ZoKU+WQ72nWue0RgH2roCg==","signature_status":"signed_v1","signed_at":"2026-05-18T03:09:09.315464Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.12954","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f0c8cdbf83f9531d5dcf1206bc6e54e9759ef341550b01b48034ff56fdec2485","sha256:31a65536925a7506dfdae3666bee401dcca774901bcd6484a5a349d75693eb55"],"state_sha256":"a9de319f8016ec24b5b4c639cb6bc007682d4e92357a50d309671e64f36f7722"}