{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:XGE7J4DDJEYFKCIHHNJ26X6647","short_pith_number":"pith:XGE7J4DD","schema_version":"1.0","canonical_sha256":"b989f4f06349305509073b53af5fdee7fefac6619023e4a5fe45ceed101bad7d","source":{"kind":"arxiv","id":"2512.21094","version":2},"attestation_state":"computed","paper":{"title":"T2AV-Compass: Towards Unified Evaluation for Text-to-Audio-Video Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chenxi Liao, Jiahao Wang, Jiaheng Liu, Jialu Chen, Jiaming Wang, Miao Deng, Tao Wang, Yanghai Wang, Yize Zhang, Yuanxing Zhang, Yubin Guo, Zhaoxiang Zhang, Zhe Cao","submitted_at":"2025-12-24T10:30:35Z","abstract_excerpt":"Text-to-Audio-Video (T2AV) generation aims to synthesize temporally coherent video and semantically synchronized audio from natural language, yet its evaluation remains fragmented, often relying on unimodal metrics or narrowly scoped benchmarks that fail to capture cross-modal alignment, instruction following, and perceptual realism under complex prompts. To address this limitation, we present T2AV-Compass, a unified benchmark for comprehensive evaluation of T2AV systems, consisting of 500 diverse and complex prompts constructed via a taxonomy-driven pipeline to ensure semantic richness and ph"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2512.21094","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-12-24T10:30:35Z","cross_cats_sorted":[],"title_canon_sha256":"1d037840269e07d9698036772c007079e1d01454cb8c50fb5106432170c60300","abstract_canon_sha256":"bebf32ff392aa61363f2555125d94edbe3debaffb9a088d87eb4e6521467125a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-03T02:05:44.452375Z","signature_b64":"lus4l0+DOAAwnbCk8mggIgmKy2+7EC3H7qjm9QLXcv0OKcpD1rbAdVIQUyo+CcLLfE8ycexECN/xM7MG8IeBDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b989f4f06349305509073b53af5fdee7fefac6619023e4a5fe45ceed101bad7d","last_reissued_at":"2026-06-03T02:05:44.451905Z","signature_status":"signed_v1","first_computed_at":"2026-06-03T02:05:44.451905Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"T2AV-Compass: Towards Unified Evaluation for Text-to-Audio-Video Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Chenxi Liao, Jiahao Wang, Jiaheng Liu, Jialu Chen, Jiaming Wang, Miao Deng, Tao Wang, Yanghai Wang, Yize Zhang, Yuanxing Zhang, Yubin Guo, Zhaoxiang Zhang, Zhe Cao","submitted_at":"2025-12-24T10:30:35Z","abstract_excerpt":"Text-to-Audio-Video (T2AV) generation aims to synthesize temporally coherent video and semantically synchronized audio from natural language, yet its evaluation remains fragmented, often relying on unimodal metrics or narrowly scoped benchmarks that fail to capture cross-modal alignment, instruction following, and perceptual realism under complex prompts. To address this limitation, we present T2AV-Compass, a unified benchmark for comprehensive evaluation of T2AV systems, consisting of 500 diverse and complex prompts constructed via a taxonomy-driven pipeline to ensure semantic richness and ph"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2512.21094","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2512.21094/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2512.21094","created_at":"2026-06-03T02:05:44.451963+00:00"},{"alias_kind":"arxiv_version","alias_value":"2512.21094v2","created_at":"2026-06-03T02:05:44.451963+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.21094","created_at":"2026-06-03T02:05:44.451963+00:00"},{"alias_kind":"pith_short_12","alias_value":"XGE7J4DDJEYF","created_at":"2026-06-03T02:05:44.451963+00:00"},{"alias_kind":"pith_short_16","alias_value":"XGE7J4DDJEYFKCIH","created_at":"2026-06-03T02:05:44.451963+00:00"},{"alias_kind":"pith_short_8","alias_value":"XGE7J4DD","created_at":"2026-06-03T02:05:44.451963+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2605.20183","citing_title":"MSAVBench: Towards Comprehensive and Reliable Evaluation of Multi-Shot Audio-Video Generation","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10542","citing_title":"VidAudio-Bench: Benchmarking V2A and VT2A Generation across Four Audio Categories","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07061","citing_title":"Do Joint Audio-Video Generation Models Understand Physics?","ref_index":6,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647","json":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647.json","graph_json":"https://pith.science/api/pith-number/XGE7J4DDJEYFKCIHHNJ26X6647/graph.json","events_json":"https://pith.science/api/pith-number/XGE7J4DDJEYFKCIHHNJ26X6647/events.json","paper":"https://pith.science/paper/XGE7J4DD"},"agent_actions":{"view_html":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647","download_json":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647.json","view_paper":"https://pith.science/paper/XGE7J4DD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2512.21094&json=true","fetch_graph":"https://pith.science/api/pith-number/XGE7J4DDJEYFKCIHHNJ26X6647/graph.json","fetch_events":"https://pith.science/api/pith-number/XGE7J4DDJEYFKCIHHNJ26X6647/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647/action/storage_attestation","attest_author":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647/action/author_attestation","sign_citation":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647/action/citation_signature","submit_replication":"https://pith.science/pith/XGE7J4DDJEYFKCIHHNJ26X6647/action/replication_record"}},"created_at":"2026-06-03T02:05:44.451963+00:00","updated_at":"2026-06-03T02:05:44.451963+00:00"}