{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:BVHKYF3DTR7ZLD7LKNRVZBQ62B","short_pith_number":"pith:BVHKYF3D","schema_version":"1.0","canonical_sha256":"0d4eac17639c7f958feb53635c861ed06f006223bf90db558bdbddd92c1c1d80","source":{"kind":"arxiv","id":"2602.04094","version":2},"attestation_state":"computed","paper":{"title":"VideoBrain: Learning Adaptive Frame Sampling for Long Video Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Junbo Zou, Liwen Zhang, Shengjie Zhang, Weining Shen, Ziheng Huang","submitted_at":"2026-02-04T00:08:35Z","abstract_excerpt":"Long-form video understanding remains challenging for Vision-Language Models (VLMs) due to the inherent tension between computational constraints and the need to capture information distributed across thousands of frames. Existing approaches either sample frames uniformly (risking information loss) or select keyframes in a single pass (with no recovery from poor choices). We propose VideoBrain, an end-to-end framework that enables VLMs to adaptively acquire visual information through learned sampling policies. Our approach features dual complementary agents: a CLIP-based agent for semantic ret"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.04094","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-02-04T00:08:35Z","cross_cats_sorted":[],"title_canon_sha256":"704c9b4d263ee671a26dbb004028d77b1a000f12c4b90f9a90aad9159ed6c7e8","abstract_canon_sha256":"5225f08639ec653413ae5105dd520b78b01ce4316e8c4949119f45f4a7f4164c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T01:04:15.605664Z","signature_b64":"OtrSLOPManhHhds9qU3z85ScEPy7Zs+KfLIm4FSKsSOwSJqpcEORGldNR3z2fmVtWTb1jevaW3u7ivkEhNTJAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0d4eac17639c7f958feb53635c861ed06f006223bf90db558bdbddd92c1c1d80","last_reissued_at":"2026-06-02T01:04:15.605117Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T01:04:15.605117Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"VideoBrain: Learning Adaptive Frame Sampling for Long Video Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Junbo Zou, Liwen Zhang, Shengjie Zhang, Weining Shen, Ziheng Huang","submitted_at":"2026-02-04T00:08:35Z","abstract_excerpt":"Long-form video understanding remains challenging for Vision-Language Models (VLMs) due to the inherent tension between computational constraints and the need to capture information distributed across thousands of frames. Existing approaches either sample frames uniformly (risking information loss) or select keyframes in a single pass (with no recovery from poor choices). We propose VideoBrain, an end-to-end framework that enables VLMs to adaptively acquire visual information through learned sampling policies. Our approach features dual complementary agents: a CLIP-based agent for semantic ret"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.04094","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.04094/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.04094","created_at":"2026-06-02T01:04:15.605188+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.04094v2","created_at":"2026-06-02T01:04:15.605188+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.04094","created_at":"2026-06-02T01:04:15.605188+00:00"},{"alias_kind":"pith_short_12","alias_value":"BVHKYF3DTR7Z","created_at":"2026-06-02T01:04:15.605188+00:00"},{"alias_kind":"pith_short_16","alias_value":"BVHKYF3DTR7ZLD7L","created_at":"2026-06-02T01:04:15.605188+00:00"},{"alias_kind":"pith_short_8","alias_value":"BVHKYF3D","created_at":"2026-06-02T01:04:15.605188+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2603.20180","citing_title":"Adaptive Greedy Frame Selection for Long Video Understanding","ref_index":22,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B","json":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B.json","graph_json":"https://pith.science/api/pith-number/BVHKYF3DTR7ZLD7LKNRVZBQ62B/graph.json","events_json":"https://pith.science/api/pith-number/BVHKYF3DTR7ZLD7LKNRVZBQ62B/events.json","paper":"https://pith.science/paper/BVHKYF3D"},"agent_actions":{"view_html":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B","download_json":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B.json","view_paper":"https://pith.science/paper/BVHKYF3D","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.04094&json=true","fetch_graph":"https://pith.science/api/pith-number/BVHKYF3DTR7ZLD7LKNRVZBQ62B/graph.json","fetch_events":"https://pith.science/api/pith-number/BVHKYF3DTR7ZLD7LKNRVZBQ62B/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B/action/timestamp_anchor","attest_storage":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B/action/storage_attestation","attest_author":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B/action/author_attestation","sign_citation":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B/action/citation_signature","submit_replication":"https://pith.science/pith/BVHKYF3DTR7ZLD7LKNRVZBQ62B/action/replication_record"}},"created_at":"2026-06-02T01:04:15.605188+00:00","updated_at":"2026-06-02T01:04:15.605188+00:00"}