{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:JO2LRJNGHJ24KLDBJPAXPJVIWT","short_pith_number":"pith:JO2LRJNG","canonical_record":{"source":{"id":"2311.17005","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-11-28T17:59:04Z","cross_cats_sorted":[],"title_canon_sha256":"e9370fc3ea60975756dc5270470f9babbffcc54e5543ad4c496f5afa89d98774","abstract_canon_sha256":"daea546185557c858111e16434b39fef7760fcb87d5ec7c76147f5c613d518f3"},"schema_version":"1.0"},"canonical_sha256":"4bb4b8a5a63a75c52c614bc177a6a8b4f2ef75e3d32702851aad90c82f4dce44","source":{"kind":"arxiv","id":"2311.17005","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2311.17005","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2311.17005v4","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2311.17005","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"JO2LRJNGHJ24","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"JO2LRJNGHJ24KLDB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"JO2LRJNG","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:JO2LRJNGHJ24KLDBJPAXPJVIWT","target":"record","payload":{"canonical_record":{"source":{"id":"2311.17005","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-11-28T17:59:04Z","cross_cats_sorted":[],"title_canon_sha256":"e9370fc3ea60975756dc5270470f9babbffcc54e5543ad4c496f5afa89d98774","abstract_canon_sha256":"daea546185557c858111e16434b39fef7760fcb87d5ec7c76147f5c613d518f3"},"schema_version":"1.0"},"canonical_sha256":"4bb4b8a5a63a75c52c614bc177a6a8b4f2ef75e3d32702851aad90c82f4dce44","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.190159Z","signature_b64":"pL8CNvDceL9+Mq+wYzIEq0Skm50ReAZ3tP9jvyL7p8GbiUS+Qt6vrCFlozCLg2mIapGSCGlqnHNFBm3Q20rYCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4bb4b8a5a63a75c52c614bc177a6a8b4f2ef75e3d32702851aad90c82f4dce44","last_reissued_at":"2026-05-17T23:38:13.189427Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.189427Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2311.17005","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"/LmQcfVfG4vEHuF1/FdWvx1Zdl4Cg6vbK/fU+ED4dQnf9s8iTVTSxLR5Jq1MSXh8Vkyu/CNHs/hbkdinqf/1AQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:10:09.516844Z"},"content_sha256":"1c5824888480e9a6dd0b6002832f38ee535b997fbcef8ebcb9104b46c64c0c53","schema_version":"1.0","event_id":"sha256:1c5824888480e9a6dd0b6002832f38ee535b997fbcef8ebcb9104b46c64c0c53"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:JO2LRJNGHJ24KLDBJPAXPJVIWT","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"MVBench: A Comprehensive Multi-modal Video Understanding Benchmark","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Most multi-modal AI models fail at temporal understanding in videos, but a new benchmark and training method lift performance by more than 15 percent.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Guo Chen, Jilan Xu, Kunchang Li, Limin Wang, Ping Luo, Yali Wang, Yi Liu, Yinan He, Yi Wang, Yizhuo Li, Yu Qiao, Zun Wang","submitted_at":"2023-11-28T17:59:04Z","abstract_excerpt":"With the rapid development of Multi-modal Large Language Models (MLLMs), a number of diagnostic benchmarks have recently emerged to evaluate the comprehension capabilities of these models. However, most benchmarks predominantly assess spatial understanding in the static image tasks, while overlooking temporal understanding in the dynamic video tasks. To alleviate this issue, we introduce a comprehensive Multi-modal Video understanding Benchmark, namely MVBench, which covers 20 challenging video tasks that cannot be effectively solved with a single frame. Specifically, we first introduce a nove"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"the existing MLLMs are far from satisfactory in temporal understanding, while our VideoChat2 largely surpasses these leading models by over 15% on MVBench.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That automatically converting public video annotations into multiple-choice QA pairs accurately measures the intended temporal skills without introducing annotation biases or allowing single-frame shortcuts.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"MVBench is a benchmark of 20 temporal video understanding tasks built by transforming static tasks into dynamic ones, with VideoChat2 outperforming prior MLLMs by over 15%.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Most multi-modal AI models fail at temporal understanding in videos, but a new benchmark and training method lift performance by more than 15 percent.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"cf27bad3fd342403d7194c15b35af731d434653f4e74bdcddfd284fdeb256219"},"source":{"id":"2311.17005","kind":"arxiv","version":4},"verdict":{"id":"0422e78a-9fbd-4176-be0e-57845c776662","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T20:18:29.561336Z","strongest_claim":"the existing MLLMs are far from satisfactory in temporal understanding, while our VideoChat2 largely surpasses these leading models by over 15% on MVBench.","one_line_summary":"MVBench is a benchmark of 20 temporal video understanding tasks built by transforming static tasks into dynamic ones, with VideoChat2 outperforming prior MLLMs by over 15%.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That automatically converting public video annotations into multiple-choice QA pairs accurately measures the intended temporal skills without introducing annotation biases or allowing single-frame shortcuts.","pith_extraction_headline":"Most multi-modal AI models fail at temporal understanding in videos, but a new benchmark and training method lift performance by more than 15 percent."},"references":{"count":104,"sample":[{"doi":"","year":2022,"title":"Flamingo: a Visual Language Model for Few-Shot Learning","work_id":"a110f764-38dc-41b2-a802-53744ecea1fc","ref_index":1,"cited_arxiv_id":"2204.14198","is_internal_anchor":true},{"doi":"","year":2023,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","ref_index":2,"cited_arxiv_id":"2308.12966","is_internal_anchor":true},{"doi":"","year":2021,"title":"Frozen in time: A joint video and image encoder for end-to-end retrieval","work_id":"12562377-293a-4224-b83e-3f411bc1cd94","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2019,"title":"Ali Furkan Biten, Rub `en P ´erez Tito, Andr ´es Mafla, Llu ´ıs G´omez, Marc ¸al Rusi˜nol, Ernest Valveny, C. V . Jawahar, and Dimosthenis Karatzas. Scene text visual question answer- ing. In ICCV, 20","work_id":"1b934981-5385-4e63-b823-9601505710bd","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Language models are few-shot learners","work_id":"82a86dd0-f3b8-4511-97b1-c7b263281e6e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":104,"snapshot_sha256":"5450571d31559e0a4e997ba4d8202006403e66cc39b1edf8e2dcf00090e1818c","internal_anchors":24},"formal_canon":{"evidence_count":2,"snapshot_sha256":"86fe5896407563c1307107fb80b8f1d601edf9aa3aa1ca361ebf46b5ec3c871f"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"0422e78a-9fbd-4176-be0e-57845c776662"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:13Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"xw+N0X8GvrN62Vir4ij7NBd81rLzlMRshOnLYcW14E9hAnimpPmAp0WkKdjq4r2+r3H2x6bt41NMPDhepZipCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-18T20:10:09.518003Z"},"content_sha256":"a4a1a86f034464a44bf15ac09e1e8fe5dfcdd0ba5755761ccd4852c05107a364","schema_version":"1.0","event_id":"sha256:a4a1a86f034464a44bf15ac09e1e8fe5dfcdd0ba5755761ccd4852c05107a364"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/JO2LRJNGHJ24KLDBJPAXPJVIWT/bundle.json","state_url":"https://pith.science/pith/JO2LRJNGHJ24KLDBJPAXPJVIWT/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/JO2LRJNGHJ24KLDBJPAXPJVIWT/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-18T20:10:09Z","links":{"resolver":"https://pith.science/pith/JO2LRJNGHJ24KLDBJPAXPJVIWT","bundle":"https://pith.science/pith/JO2LRJNGHJ24KLDBJPAXPJVIWT/bundle.json","state":"https://pith.science/pith/JO2LRJNGHJ24KLDBJPAXPJVIWT/state.json","well_known_bundle":"https://pith.science/.well-known/pith/JO2LRJNGHJ24KLDBJPAXPJVIWT/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:JO2LRJNGHJ24KLDBJPAXPJVIWT","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"daea546185557c858111e16434b39fef7760fcb87d5ec7c76147f5c613d518f3","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-11-28T17:59:04Z","title_canon_sha256":"e9370fc3ea60975756dc5270470f9babbffcc54e5543ad4c496f5afa89d98774"},"schema_version":"1.0","source":{"id":"2311.17005","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2311.17005","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"arxiv_version","alias_value":"2311.17005v4","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2311.17005","created_at":"2026-05-17T23:38:13Z"},{"alias_kind":"pith_short_12","alias_value":"JO2LRJNGHJ24","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"JO2LRJNGHJ24KLDB","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"JO2LRJNG","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:a4a1a86f034464a44bf15ac09e1e8fe5dfcdd0ba5755761ccd4852c05107a364","target":"graph","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"the existing MLLMs are far from satisfactory in temporal understanding, while our VideoChat2 largely surpasses these leading models by over 15% on MVBench."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That automatically converting public video annotations into multiple-choice QA pairs accurately measures the intended temporal skills without introducing annotation biases or allowing single-frame shortcuts."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"MVBench is a benchmark of 20 temporal video understanding tasks built by transforming static tasks into dynamic ones, with VideoChat2 outperforming prior MLLMs by over 15%."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Most multi-modal AI models fail at temporal understanding in videos, but a new benchmark and training method lift performance by more than 15 percent."}],"snapshot_sha256":"cf27bad3fd342403d7194c15b35af731d434653f4e74bdcddfd284fdeb256219"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"86fe5896407563c1307107fb80b8f1d601edf9aa3aa1ca361ebf46b5ec3c871f"},"paper":{"abstract_excerpt":"With the rapid development of Multi-modal Large Language Models (MLLMs), a number of diagnostic benchmarks have recently emerged to evaluate the comprehension capabilities of these models. However, most benchmarks predominantly assess spatial understanding in the static image tasks, while overlooking temporal understanding in the dynamic video tasks. To alleviate this issue, we introduce a comprehensive Multi-modal Video understanding Benchmark, namely MVBench, which covers 20 challenging video tasks that cannot be effectively solved with a single frame. Specifically, we first introduce a nove","authors_text":"Guo Chen, Jilan Xu, Kunchang Li, Limin Wang, Ping Luo, Yali Wang, Yi Liu, Yinan He, Yi Wang, Yizhuo Li, Yu Qiao, Zun Wang","cross_cats":[],"headline":"Most multi-modal AI models fail at temporal understanding in videos, but a new benchmark and training method lift performance by more than 15 percent.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-11-28T17:59:04Z","title":"MVBench: A Comprehensive Multi-modal Video Understanding Benchmark"},"references":{"count":104,"internal_anchors":24,"resolved_work":104,"sample":[{"cited_arxiv_id":"2204.14198","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Flamingo: a Visual Language Model for Few-Shot Learning","work_id":"a110f764-38dc-41b2-a802-53744ecea1fc","year":2022},{"cited_arxiv_id":"2308.12966","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Qwen-VL: A Versatile Vision-Language Model for Understanding, Localization, Text Reading, and Beyond","work_id":"cbc2bb21-b6bb-46c0-80bf-107e195ffe10","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Frozen in time: A joint video and image encoder for end-to-end retrieval","work_id":"12562377-293a-4224-b83e-3f411bc1cd94","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Ali Furkan Biten, Rub `en P ´erez Tito, Andr ´es Mafla, Llu ´ıs G´omez, Marc ¸al Rusi˜nol, Ernest Valveny, C. V . Jawahar, and Dimosthenis Karatzas. Scene text visual question answer- ing. In ICCV, 20","work_id":"1b934981-5385-4e63-b823-9601505710bd","year":2019},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Language models are few-shot learners","work_id":"82a86dd0-f3b8-4511-97b1-c7b263281e6e","year":2020}],"snapshot_sha256":"5450571d31559e0a4e997ba4d8202006403e66cc39b1edf8e2dcf00090e1818c"},"source":{"id":"2311.17005","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-17T20:18:29.561336Z","id":"0422e78a-9fbd-4176-be0e-57845c776662","model_set":{"reader":"grok-4.3"},"one_line_summary":"MVBench is a benchmark of 20 temporal video understanding tasks built by transforming static tasks into dynamic ones, with VideoChat2 outperforming prior MLLMs by over 15%.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Most multi-modal AI models fail at temporal understanding in videos, but a new benchmark and training method lift performance by more than 15 percent.","strongest_claim":"the existing MLLMs are far from satisfactory in temporal understanding, while our VideoChat2 largely surpasses these leading models by over 15% on MVBench.","weakest_assumption":"That automatically converting public video annotations into multiple-choice QA pairs accurately measures the intended temporal skills without introducing annotation biases or allowing single-frame shortcuts."}},"verdict_id":"0422e78a-9fbd-4176-be0e-57845c776662"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1c5824888480e9a6dd0b6002832f38ee535b997fbcef8ebcb9104b46c64c0c53","target":"record","created_at":"2026-05-17T23:38:13Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"daea546185557c858111e16434b39fef7760fcb87d5ec7c76147f5c613d518f3","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2023-11-28T17:59:04Z","title_canon_sha256":"e9370fc3ea60975756dc5270470f9babbffcc54e5543ad4c496f5afa89d98774"},"schema_version":"1.0","source":{"id":"2311.17005","kind":"arxiv","version":4}},"canonical_sha256":"4bb4b8a5a63a75c52c614bc177a6a8b4f2ef75e3d32702851aad90c82f4dce44","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4bb4b8a5a63a75c52c614bc177a6a8b4f2ef75e3d32702851aad90c82f4dce44","first_computed_at":"2026-05-17T23:38:13.189427Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:13.189427Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"pL8CNvDceL9+Mq+wYzIEq0Skm50ReAZ3tP9jvyL7p8GbiUS+Qt6vrCFlozCLg2mIapGSCGlqnHNFBm3Q20rYCA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:13.190159Z","signed_message":"canonical_sha256_bytes"},"source_id":"2311.17005","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1c5824888480e9a6dd0b6002832f38ee535b997fbcef8ebcb9104b46c64c0c53","sha256:a4a1a86f034464a44bf15ac09e1e8fe5dfcdd0ba5755761ccd4852c05107a364"],"state_sha256":"d398530233d4327d71a7042c7d750f780ff0cf20a2e490e2bc077aa092a07dc8"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"c50JeBehOfKQAFaLysvVjQXd+8IYeqYZPV4uA1ATJB7qDGlz9kaNSnENtqvCioqXDzQ0aSHbwknWbOP8m7goCw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-18T20:10:09.521117Z","bundle_sha256":"9cde5a95ea89f623e6cca1b5d47a725b68eb0e77c527a915d30e41ed280b336c"}}