{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:Y77WGRQVUQSZAHGKUPP2VBKLEK","short_pith_number":"pith:Y77WGRQV","schema_version":"1.0","canonical_sha256":"c7ff634615a425901ccaa3dfaa854b22bc3e893aa3ee6a97802c5af610c5e5c5","source":{"kind":"arxiv","id":"2406.08035","version":3},"attestation_state":"computed","paper":{"title":"LVBench: An Extreme Long Video Understanding Benchmark","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Bin Xu, Jie Tang, Ji Qi, Ming Ding, Shiyu Huang, Weihan Wang, Wenyi Hong, Xiaohan Zhang, Xiaotao Gu, Yean Cheng, Yuxiao Dong, Zehai He","submitted_at":"2024-06-12T09:36:52Z","abstract_excerpt":"Recent progress in multimodal large language models has markedly enhanced the understanding of short videos (typically under one minute), and several evaluation datasets have emerged accordingly. However, these advancements fall short of meeting the demands of real-world applications such as embodied intelligence for long-term decision-making, in-depth movie reviews and discussions, and live sports commentary, all of which require comprehension of long videos spanning several hours. To address this gap, we introduce LVBench, a benchmark specifically designed for long video understanding. Our d"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2406.08035","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2024-06-12T09:36:52Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"44546f02f27f5a75eb69b64b097649b302d1d80ab123b0fb5cf21474e1132c8e","abstract_canon_sha256":"2cb789c96749183e3cebcd9fb1859ca112707f59c6fe6d492c422654ca2e9843"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-19T11:49:20.128165Z","signature_b64":"Df5Na+6OzVy1/YEI53bxnsuqsTM5K0h6byv+Ifv/b+1j9Qpju59ML+PCTqds5aPsOGvGlnjUFbRKszwj8QA+Cw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c7ff634615a425901ccaa3dfaa854b22bc3e893aa3ee6a97802c5af610c5e5c5","last_reissued_at":"2026-05-19T11:49:20.126063Z","signature_status":"signed_v1","first_computed_at":"2026-05-19T11:49:20.126063Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LVBench: An Extreme Long Video Understanding Benchmark","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Bin Xu, Jie Tang, Ji Qi, Ming Ding, Shiyu Huang, Weihan Wang, Wenyi Hong, Xiaohan Zhang, Xiaotao Gu, Yean Cheng, Yuxiao Dong, Zehai He","submitted_at":"2024-06-12T09:36:52Z","abstract_excerpt":"Recent progress in multimodal large language models has markedly enhanced the understanding of short videos (typically under one minute), and several evaluation datasets have emerged accordingly. However, these advancements fall short of meeting the demands of real-world applications such as embodied intelligence for long-term decision-making, in-depth movie reviews and discussions, and live sports commentary, all of which require comprehension of long videos spanning several hours. To address this gap, we introduce LVBench, a benchmark specifically designed for long video understanding. Our d"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2406.08035","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2406.08035/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2406.08035","created_at":"2026-05-19T11:49:20.126138+00:00"},{"alias_kind":"arxiv_version","alias_value":"2406.08035v3","created_at":"2026-05-19T11:49:20.126138+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.08035","created_at":"2026-05-19T11:49:20.126138+00:00"},{"alias_kind":"pith_short_12","alias_value":"Y77WGRQVUQSZ","created_at":"2026-05-19T11:49:20.126138+00:00"},{"alias_kind":"pith_short_16","alias_value":"Y77WGRQVUQSZAHGK","created_at":"2026-05-19T11:49:20.126138+00:00"},{"alias_kind":"pith_short_8","alias_value":"Y77WGRQV","created_at":"2026-05-19T11:49:20.126138+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":20,"internal_anchor_count":20,"sample":[{"citing_arxiv_id":"2605.15342","citing_title":"Minerva-Ego: Spatiotemporal Hints for Egocentric Video Understanding","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2506.05425","citing_title":"SIV-Bench: A Video Benchmark for Social Interaction Understanding and Reasoning","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2507.06261","citing_title":"Gemini 2.5: Pushing the Frontier with Advanced Reasoning, Multimodality, Long Context, and Next Generation Agentic Capabilities","ref_index":85,"is_internal_anchor":true},{"citing_arxiv_id":"2508.10016","citing_title":"Training-Free Multimodal Large Language Model Orchestration","ref_index":64,"is_internal_anchor":true},{"citing_arxiv_id":"2501.00574","citing_title":"VideoChat-Flash: Hierarchical Compression for Long-Context Video Modeling","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2502.04326","citing_title":"WorldSense: Evaluating Real-world Omnimodal Understanding for Multimodal LLMs","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2408.16500","citing_title":"CogVLM2: Visual Language Models for Image and Video Understanding","ref_index":77,"is_internal_anchor":true},{"citing_arxiv_id":"2601.15724","citing_title":"VideoThinker: Building Agentic VideoLLMs with LLM-Guided Tool Reasoning","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2602.22779","citing_title":"TrajTok: Learning Trajectory Tokens enables better Video Understanding","ref_index":70,"is_internal_anchor":true},{"citing_arxiv_id":"2509.18154","citing_title":"MiniCPM-V 4.5: Cooking Efficient MLLMs via Architecture, Data, and Training Recipe","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2603.20633","citing_title":"Seed1.8 Model Card: Towards Generalized Real-World Agency","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2501.13826","citing_title":"Video-MMMU: Evaluating Knowledge Acquisition from Multi-Discipline Professional Videos","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2604.27393","citing_title":"MiniCPM-o 4.5: Towards Real-Time Full-Duplex Omni-Modal Interaction","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2505.07062","citing_title":"Seed1.5-VL Technical Report","ref_index":143,"is_internal_anchor":true},{"citing_arxiv_id":"2507.01006","citing_title":"GLM-4.5V and GLM-4.1V-Thinking: Towards Versatile Multimodal Reasoning with Scalable Reinforcement Learning","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2501.13106","citing_title":"VideoLLaMA 3: Frontier Multimodal Foundation Models for Image and Video Understanding","ref_index":135,"is_internal_anchor":true},{"citing_arxiv_id":"2602.02276","citing_title":"Kimi K2.5: Visual Agentic Intelligence","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14149","citing_title":"One Token per Highly Selective Frame: Towards Extreme Compression for Long Video Understanding","ref_index":59,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15127","citing_title":"MCSC-Bench: Multimodal Context-to-Script Creation for Realistic Video Production","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16893","citing_title":"EasyVideoR1: Easier RL for Video Understanding","ref_index":37,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK","json":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK.json","graph_json":"https://pith.science/api/pith-number/Y77WGRQVUQSZAHGKUPP2VBKLEK/graph.json","events_json":"https://pith.science/api/pith-number/Y77WGRQVUQSZAHGKUPP2VBKLEK/events.json","paper":"https://pith.science/paper/Y77WGRQV"},"agent_actions":{"view_html":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK","download_json":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK.json","view_paper":"https://pith.science/paper/Y77WGRQV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2406.08035&json=true","fetch_graph":"https://pith.science/api/pith-number/Y77WGRQVUQSZAHGKUPP2VBKLEK/graph.json","fetch_events":"https://pith.science/api/pith-number/Y77WGRQVUQSZAHGKUPP2VBKLEK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK/action/storage_attestation","attest_author":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK/action/author_attestation","sign_citation":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK/action/citation_signature","submit_replication":"https://pith.science/pith/Y77WGRQVUQSZAHGKUPP2VBKLEK/action/replication_record"}},"created_at":"2026-05-19T11:49:20.126138+00:00","updated_at":"2026-05-19T11:49:20.126138+00:00"}