{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ZYNQLIP7HRK76WL4YXJJP2URCT","short_pith_number":"pith:ZYNQLIP7","schema_version":"1.0","canonical_sha256":"ce1b05a1ff3c55ff597cc5d297ea9114fb9d9eb94e2614e293055393d968f55f","source":{"kind":"arxiv","id":"2605.20183","version":1},"attestation_state":"computed","paper":{"title":"MSAVBench: Towards Comprehensive and Reliable Evaluation of Multi-Shot Audio-Video Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Difan Zou, Hongming Shan, Junjie Zhou, Junqiu Yu, Kaixun Jiang, Kai Zhu, Lingyi Hong, Quanhao Li, Ruihang Chu, Shiwei Zhang, Xiang Wang, Xihui Liu, Yang Shi, Yefei He, Yingya Zhang, Yongming Li, Yujie Wei, Yujin Han, Yu Liu, Zhekai Chen, Zhen Xing, Zhihang Liu, Zhiwu Qing","submitted_at":"2026-05-19T17:59:33Z","abstract_excerpt":"Video generation is rapidly evolving from single-shot synthesis to complex multi-shot audio-video (MSAV) narratives to meet real-world demands. However, evaluating such frontier models remains a fundamental challenge. Existing benchmarks are limited in scope and data diversity, and rely on rigid evaluation pipelines, preventing systematic and reliable assessment of modern MSAV models. To bridge these gaps, we introduce MSAVBench, the first comprehensive benchmark and adaptive hybrid evaluation framework for multi-shot audio-video generation. Our benchmark spans four key dimensions, video, audi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.20183","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-19T17:59:33Z","cross_cats_sorted":[],"title_canon_sha256":"8b86889655fb25691ea8c335b9a51fad3bfb43fdf28654910dddd1e2477e5a1a","abstract_canon_sha256":"4e2f9073ce42986b334f59ea35e22a161e34189cf2f19b9667a18511ced597c9"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T02:06:06.876514Z","signature_b64":"9+Ni9ZYwbqxtJtkJu8rREyphsFxCgDtDxdrmWo+ku8wVE44vzd8vKwhAcDg/v540/pnqh1173OaHXyfFUp+BCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ce1b05a1ff3c55ff597cc5d297ea9114fb9d9eb94e2614e293055393d968f55f","last_reissued_at":"2026-05-20T02:06:06.875668Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T02:06:06.875668Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"MSAVBench: Towards Comprehensive and Reliable Evaluation of Multi-Shot Audio-Video Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Difan Zou, Hongming Shan, Junjie Zhou, Junqiu Yu, Kaixun Jiang, Kai Zhu, Lingyi Hong, Quanhao Li, Ruihang Chu, Shiwei Zhang, Xiang Wang, Xihui Liu, Yang Shi, Yefei He, Yingya Zhang, Yongming Li, Yujie Wei, Yujin Han, Yu Liu, Zhekai Chen, Zhen Xing, Zhihang Liu, Zhiwu Qing","submitted_at":"2026-05-19T17:59:33Z","abstract_excerpt":"Video generation is rapidly evolving from single-shot synthesis to complex multi-shot audio-video (MSAV) narratives to meet real-world demands. However, evaluating such frontier models remains a fundamental challenge. Existing benchmarks are limited in scope and data diversity, and rely on rigid evaluation pipelines, preventing systematic and reliable assessment of modern MSAV models. To bridge these gaps, we introduce MSAVBench, the first comprehensive benchmark and adaptive hybrid evaluation framework for multi-shot audio-video generation. Our benchmark spans four key dimensions, video, audi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.20183","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.20183/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.20183","created_at":"2026-05-20T02:06:06.875820+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.20183v1","created_at":"2026-05-20T02:06:06.875820+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.20183","created_at":"2026-05-20T02:06:06.875820+00:00"},{"alias_kind":"pith_short_12","alias_value":"ZYNQLIP7HRK7","created_at":"2026-05-20T02:06:06.875820+00:00"},{"alias_kind":"pith_short_16","alias_value":"ZYNQLIP7HRK76WL4","created_at":"2026-05-20T02:06:06.875820+00:00"},{"alias_kind":"pith_short_8","alias_value":"ZYNQLIP7","created_at":"2026-05-20T02:06:06.875820+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT","json":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT.json","graph_json":"https://pith.science/api/pith-number/ZYNQLIP7HRK76WL4YXJJP2URCT/graph.json","events_json":"https://pith.science/api/pith-number/ZYNQLIP7HRK76WL4YXJJP2URCT/events.json","paper":"https://pith.science/paper/ZYNQLIP7"},"agent_actions":{"view_html":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT","download_json":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT.json","view_paper":"https://pith.science/paper/ZYNQLIP7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.20183&json=true","fetch_graph":"https://pith.science/api/pith-number/ZYNQLIP7HRK76WL4YXJJP2URCT/graph.json","fetch_events":"https://pith.science/api/pith-number/ZYNQLIP7HRK76WL4YXJJP2URCT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT/action/storage_attestation","attest_author":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT/action/author_attestation","sign_citation":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT/action/citation_signature","submit_replication":"https://pith.science/pith/ZYNQLIP7HRK76WL4YXJJP2URCT/action/replication_record"}},"created_at":"2026-05-20T02:06:06.875820+00:00","updated_at":"2026-05-20T02:06:06.875820+00:00"}