{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:BR6I2WQNOODNECFWX76TBPRD57","short_pith_number":"pith:BR6I2WQN","schema_version":"1.0","canonical_sha256":"0c7c8d5a0d7386d208b6bffd30be23efdd9a4796eb4e5ecf49df2b0c10715e42","source":{"kind":"arxiv","id":"2605.26244","version":1},"attestation_state":"computed","paper":{"title":"LongAV-Compass: Towards Unified Evaluation of Minute-Scale Audio-Visual Generation Across T2AV, I2AV, and V2AV","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.MM","cs.SD"],"primary_cat":"cs.CV","authors_text":"Bohan Zeng, Bozhou Li, Fengxiang Wang, Haotian Wang, Jiafu Tang, Jialu Chen, Leye Wang, Liu Yang, Pengfei Wan, Qixun Wang, Tengfei Liu, Xiaohan Zhang, Xinlong Chen, Xuanyu Zhu, Yang Shi, Yuanxing Zhang, Yue Ding, Yuhao Dong, Yuqi Tang, Zhuoran Zhang","submitted_at":"2026-05-25T18:12:09Z","abstract_excerpt":"Audio-visual generation is rapidly advancing from short clips to minute-long content, while existing evaluation protocols remain largely confined to short-form settings. Existing benchmarks primarily focus on 5--10 second text-conditioned generation and rarely support unified evaluation across text, image, and video conditioning modalities. Moreover, they provide limited insight into how identity consistency, narrative coherence, and audio-visual alignment degrade over extended temporal horizons. To bridge this gap, we introduce LongAV-Compass, a systematic benchmark for minute-long audio-visu"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.26244","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-25T18:12:09Z","cross_cats_sorted":["cs.MM","cs.SD"],"title_canon_sha256":"bd937d0e6806207a92ef4d8f9969c9986626986252d4bcd9072950701ebf46e9","abstract_canon_sha256":"62fab8d08fe5fdc63016abf858f25b5db8ea46a694d3de49d48459916c0d277d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T01:05:07.794607Z","signature_b64":"eWD0uGoLgjlzeYJPri8IZbozupkplNggevUsfLUXBAnzbgi1HYMQAuNFZm2goP4tRsk6bS79kRMuRHEj9wQ3Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0c7c8d5a0d7386d208b6bffd30be23efdd9a4796eb4e5ecf49df2b0c10715e42","last_reissued_at":"2026-05-27T01:05:07.793612Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T01:05:07.793612Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LongAV-Compass: Towards Unified Evaluation of Minute-Scale Audio-Visual Generation Across T2AV, I2AV, and V2AV","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.MM","cs.SD"],"primary_cat":"cs.CV","authors_text":"Bohan Zeng, Bozhou Li, Fengxiang Wang, Haotian Wang, Jiafu Tang, Jialu Chen, Leye Wang, Liu Yang, Pengfei Wan, Qixun Wang, Tengfei Liu, Xiaohan Zhang, Xinlong Chen, Xuanyu Zhu, Yang Shi, Yuanxing Zhang, Yue Ding, Yuhao Dong, Yuqi Tang, Zhuoran Zhang","submitted_at":"2026-05-25T18:12:09Z","abstract_excerpt":"Audio-visual generation is rapidly advancing from short clips to minute-long content, while existing evaluation protocols remain largely confined to short-form settings. Existing benchmarks primarily focus on 5--10 second text-conditioned generation and rarely support unified evaluation across text, image, and video conditioning modalities. Moreover, they provide limited insight into how identity consistency, narrative coherence, and audio-visual alignment degrade over extended temporal horizons. To bridge this gap, we introduce LongAV-Compass, a systematic benchmark for minute-long audio-visu"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.26244","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.26244/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.26244","created_at":"2026-05-27T01:05:07.793757+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.26244v1","created_at":"2026-05-27T01:05:07.793757+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.26244","created_at":"2026-05-27T01:05:07.793757+00:00"},{"alias_kind":"pith_short_12","alias_value":"BR6I2WQNOODN","created_at":"2026-05-27T01:05:07.793757+00:00"},{"alias_kind":"pith_short_16","alias_value":"BR6I2WQNOODNECFW","created_at":"2026-05-27T01:05:07.793757+00:00"},{"alias_kind":"pith_short_8","alias_value":"BR6I2WQN","created_at":"2026-05-27T01:05:07.793757+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57","json":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57.json","graph_json":"https://pith.science/api/pith-number/BR6I2WQNOODNECFWX76TBPRD57/graph.json","events_json":"https://pith.science/api/pith-number/BR6I2WQNOODNECFWX76TBPRD57/events.json","paper":"https://pith.science/paper/BR6I2WQN"},"agent_actions":{"view_html":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57","download_json":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57.json","view_paper":"https://pith.science/paper/BR6I2WQN","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.26244&json=true","fetch_graph":"https://pith.science/api/pith-number/BR6I2WQNOODNECFWX76TBPRD57/graph.json","fetch_events":"https://pith.science/api/pith-number/BR6I2WQNOODNECFWX76TBPRD57/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57/action/timestamp_anchor","attest_storage":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57/action/storage_attestation","attest_author":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57/action/author_attestation","sign_citation":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57/action/citation_signature","submit_replication":"https://pith.science/pith/BR6I2WQNOODNECFWX76TBPRD57/action/replication_record"}},"created_at":"2026-05-27T01:05:07.793757+00:00","updated_at":"2026-05-27T01:05:07.793757+00:00"}