{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:GQ37DA3BEJQ5ZGO6DKY3C3CIWC","short_pith_number":"pith:GQ37DA3B","canonical_record":{"source":{"id":"2512.12772","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.MM","submitted_at":"2025-12-14T17:23:21Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"33a5624ddaf1623adcbeecca9e9824644a2f5f8dada4f0eb79edcebfff59965a","abstract_canon_sha256":"b9b2e3d389fd0cabfe9f758af7ae6591daa7eddc251f5e1570e98b50af9b80a1"},"schema_version":"1.0"},"canonical_sha256":"3437f183612261dc99de1ab1b16c48b0a44c4604c1a8637a855d0b16143662e5","source":{"kind":"arxiv","id":"2512.12772","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2512.12772","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"arxiv_version","alias_value":"2512.12772v2","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.12772","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"pith_short_12","alias_value":"GQ37DA3BEJQ5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"GQ37DA3BEJQ5ZGO6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"GQ37DA3B","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:GQ37DA3BEJQ5ZGO6DKY3C3CIWC","target":"record","payload":{"canonical_record":{"source":{"id":"2512.12772","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.MM","submitted_at":"2025-12-14T17:23:21Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"33a5624ddaf1623adcbeecca9e9824644a2f5f8dada4f0eb79edcebfff59965a","abstract_canon_sha256":"b9b2e3d389fd0cabfe9f758af7ae6591daa7eddc251f5e1570e98b50af9b80a1"},"schema_version":"1.0"},"canonical_sha256":"3437f183612261dc99de1ab1b16c48b0a44c4604c1a8637a855d0b16143662e5","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:00.522070Z","signature_b64":"YneGQ2YW29E9hpUCWAZFOsj+9ZloSm2+9OHNVNjYP9aQgp1+6T2MXGQ1FOxzCmczhQsH5FEXBIuyYaYSzmYlBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3437f183612261dc99de1ab1b16c48b0a44c4604c1a8637a855d0b16143662e5","last_reissued_at":"2026-05-17T23:39:00.521344Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:00.521344Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2512.12772","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"4oz89WknO8rsj06hbZxEb4aYIwOJThZwKIfHIJRfjKRyIEM3LAmVsCyzdNJsuqbWEblGYAzIGowERsA9IA4NBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T04:43:36.844669Z"},"content_sha256":"2f74845a04097a06190c6ecce4c2b4a70deac00f5f67a62e06713769663094a5","schema_version":"1.0","event_id":"sha256:2f74845a04097a06190c6ecce4c2b4a70deac00f5f67a62e06713769663094a5"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:GQ37DA3BEJQ5ZGO6DKY3C3CIWC","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"JointAVBench: A Benchmark for Joint Audio-Visual Reasoning Evaluation","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"Even the best Omni-LLMs reach only 65.3 percent average accuracy on a benchmark that demands strict joint audio-visual reasoning in videos.","cross_cats":["cs.CV"],"primary_cat":"cs.MM","authors_text":"Jianghan Chao, Jianzhang Gao, Liyun Ru, Ruihua Song, Wenhui Tan, Yuchong Sun","submitted_at":"2025-12-14T17:23:21Z","abstract_excerpt":"Understanding videos inherently requires reasoning over both visual and auditory information. To properly evaluate Omni-Large Language Models (Omni-LLMs), which are capable of processing multi-modal information including vision and audio, an effective benchmark must comprehensively cover three key aspects: (1) multi-modal dependency (i.e., questions that cannot be answered using vision or audio alone), (2) diverse audio information types (e.g., speech, sound events), and (3) varying scene spans. However, existing datasets fall short in one or more of these dimensions, limiting strict and compr"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"even the best-performing Omni-LLM achieves an average accuracy of only 65.3%, outperforming uni-modal baselines but revealing substantial room for improvement, especially in cross-scene reasoning.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The automated pipeline using vision-LLMs, audio-LLMs, and general LLMs produces questions and answers that strictly require joint audio-visual understanding without introducing biases or answer leakage from the generation process itself.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"JointAVBench is a benchmark for joint audio-visual reasoning that shows leading Omni-LLMs reach only 65.3% accuracy, with particular weakness in cross-scene tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Even the best Omni-LLMs reach only 65.3 percent average accuracy on a benchmark that demands strict joint audio-visual reasoning in videos.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"6a37a89c38567baaa64ebbb3ed338028c7e4f9b5708b4eaa4633c8c529447780"},"source":{"id":"2512.12772","kind":"arxiv","version":2},"verdict":{"id":"5a06ad73-b59d-4884-9883-eaaf35a5cfa5","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T22:29:35.985190Z","strongest_claim":"even the best-performing Omni-LLM achieves an average accuracy of only 65.3%, outperforming uni-modal baselines but revealing substantial room for improvement, especially in cross-scene reasoning.","one_line_summary":"JointAVBench is a benchmark for joint audio-visual reasoning that shows leading Omni-LLMs reach only 65.3% accuracy, with particular weakness in cross-scene tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The automated pipeline using vision-LLMs, audio-LLMs, and general LLMs produces questions and answers that strictly require joint audio-visual understanding without introducing biases or answer leakage from the generation process itself.","pith_extraction_headline":"Even the best Omni-LLMs reach only 65.3 percent average accuracy on a benchmark that demands strict joint audio-visual reasoning in videos."},"references":{"count":53,"sample":[{"doi":"","year":2024,"title":"achieves optimal performance in identifying potential hallucinations. During the general check, we utilize only the QA pair and its explanation to filter out unqualified QA pairs. This stage includes ","work_id":"e5f43d5f-6dcf-4867-b103-74db94d8b611","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Only include details that are clearly visible in the video","work_id":"f1e28bb8-ab81-4c07-8afe-d9c8218268ee","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Focus on their most significant movements, gestures, and interactions","work_id":"a0717373-54fc-45b9-9a28-724ef9a2f501","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Include the sequence of events and the pacing of the scene to convey how it unfolds over time","work_id":"009947f9-7e91-4bb6-9a23-f1b5a04e8fc2","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Only describe emotions that are clearly expressed through visible actions or expressions","work_id":"0509480d-34b2-4ae0-8ced-cf52a785c310","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":53,"snapshot_sha256":"d971b4c13d48df6847b6945657f19f563d3d5596a8e8f332e9158c9fa8d8a2d4","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"5a06ad73-b59d-4884-9883-eaaf35a5cfa5"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:00Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"25E/sazmLJVEZGtC+D2qi6ef4eJw280m8Lq7v4KyhXdjnQKkooRdm6pasPegEwQoN9xx9ZhvhNaoNDwNPtC5CQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T04:43:36.845606Z"},"content_sha256":"6998f695f49a3a799e5150435a117f0bd050b6ba0e92177f19163c71a5ff9843","schema_version":"1.0","event_id":"sha256:6998f695f49a3a799e5150435a117f0bd050b6ba0e92177f19163c71a5ff9843"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/GQ37DA3BEJQ5ZGO6DKY3C3CIWC/bundle.json","state_url":"https://pith.science/pith/GQ37DA3BEJQ5ZGO6DKY3C3CIWC/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/GQ37DA3BEJQ5ZGO6DKY3C3CIWC/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T04:43:36Z","links":{"resolver":"https://pith.science/pith/GQ37DA3BEJQ5ZGO6DKY3C3CIWC","bundle":"https://pith.science/pith/GQ37DA3BEJQ5ZGO6DKY3C3CIWC/bundle.json","state":"https://pith.science/pith/GQ37DA3BEJQ5ZGO6DKY3C3CIWC/state.json","well_known_bundle":"https://pith.science/.well-known/pith/GQ37DA3BEJQ5ZGO6DKY3C3CIWC/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:GQ37DA3BEJQ5ZGO6DKY3C3CIWC","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b9b2e3d389fd0cabfe9f758af7ae6591daa7eddc251f5e1570e98b50af9b80a1","cross_cats_sorted":["cs.CV"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.MM","submitted_at":"2025-12-14T17:23:21Z","title_canon_sha256":"33a5624ddaf1623adcbeecca9e9824644a2f5f8dada4f0eb79edcebfff59965a"},"schema_version":"1.0","source":{"id":"2512.12772","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2512.12772","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"arxiv_version","alias_value":"2512.12772v2","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2512.12772","created_at":"2026-05-17T23:39:00Z"},{"alias_kind":"pith_short_12","alias_value":"GQ37DA3BEJQ5","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"GQ37DA3BEJQ5ZGO6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"GQ37DA3B","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:6998f695f49a3a799e5150435a117f0bd050b6ba0e92177f19163c71a5ff9843","target":"graph","created_at":"2026-05-17T23:39:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"even the best-performing Omni-LLM achieves an average accuracy of only 65.3%, outperforming uni-modal baselines but revealing substantial room for improvement, especially in cross-scene reasoning."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The automated pipeline using vision-LLMs, audio-LLMs, and general LLMs produces questions and answers that strictly require joint audio-visual understanding without introducing biases or answer leakage from the generation process itself."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"JointAVBench is a benchmark for joint audio-visual reasoning that shows leading Omni-LLMs reach only 65.3% accuracy, with particular weakness in cross-scene tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Even the best Omni-LLMs reach only 65.3 percent average accuracy on a benchmark that demands strict joint audio-visual reasoning in videos."}],"snapshot_sha256":"6a37a89c38567baaa64ebbb3ed338028c7e4f9b5708b4eaa4633c8c529447780"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Understanding videos inherently requires reasoning over both visual and auditory information. To properly evaluate Omni-Large Language Models (Omni-LLMs), which are capable of processing multi-modal information including vision and audio, an effective benchmark must comprehensively cover three key aspects: (1) multi-modal dependency (i.e., questions that cannot be answered using vision or audio alone), (2) diverse audio information types (e.g., speech, sound events), and (3) varying scene spans. However, existing datasets fall short in one or more of these dimensions, limiting strict and compr","authors_text":"Jianghan Chao, Jianzhang Gao, Liyun Ru, Ruihua Song, Wenhui Tan, Yuchong Sun","cross_cats":["cs.CV"],"headline":"Even the best Omni-LLMs reach only 65.3 percent average accuracy on a benchmark that demands strict joint audio-visual reasoning in videos.","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.MM","submitted_at":"2025-12-14T17:23:21Z","title":"JointAVBench: A Benchmark for Joint Audio-Visual Reasoning Evaluation"},"references":{"count":53,"internal_anchors":0,"resolved_work":53,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"achieves optimal performance in identifying potential hallucinations. During the general check, we utilize only the QA pair and its explanation to filter out unqualified QA pairs. This stage includes ","work_id":"e5f43d5f-6dcf-4867-b103-74db94d8b611","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Only include details that are clearly visible in the video","work_id":"f1e28bb8-ab81-4c07-8afe-d9c8218268ee","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Focus on their most significant movements, gestures, and interactions","work_id":"a0717373-54fc-45b9-9a28-724ef9a2f501","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Include the sequence of events and the pacing of the scene to convey how it unfolds over time","work_id":"009947f9-7e91-4bb6-9a23-f1b5a04e8fc2","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Only describe emotions that are clearly expressed through visible actions or expressions","work_id":"0509480d-34b2-4ae0-8ced-cf52a785c310","year":null}],"snapshot_sha256":"d971b4c13d48df6847b6945657f19f563d3d5596a8e8f332e9158c9fa8d8a2d4"},"source":{"id":"2512.12772","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T22:29:35.985190Z","id":"5a06ad73-b59d-4884-9883-eaaf35a5cfa5","model_set":{"reader":"grok-4.3"},"one_line_summary":"JointAVBench is a benchmark for joint audio-visual reasoning that shows leading Omni-LLMs reach only 65.3% accuracy, with particular weakness in cross-scene tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Even the best Omni-LLMs reach only 65.3 percent average accuracy on a benchmark that demands strict joint audio-visual reasoning in videos.","strongest_claim":"even the best-performing Omni-LLM achieves an average accuracy of only 65.3%, outperforming uni-modal baselines but revealing substantial room for improvement, especially in cross-scene reasoning.","weakest_assumption":"The automated pipeline using vision-LLMs, audio-LLMs, and general LLMs produces questions and answers that strictly require joint audio-visual understanding without introducing biases or answer leakage from the generation process itself."}},"verdict_id":"5a06ad73-b59d-4884-9883-eaaf35a5cfa5"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:2f74845a04097a06190c6ecce4c2b4a70deac00f5f67a62e06713769663094a5","target":"record","created_at":"2026-05-17T23:39:00Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b9b2e3d389fd0cabfe9f758af7ae6591daa7eddc251f5e1570e98b50af9b80a1","cross_cats_sorted":["cs.CV"],"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.MM","submitted_at":"2025-12-14T17:23:21Z","title_canon_sha256":"33a5624ddaf1623adcbeecca9e9824644a2f5f8dada4f0eb79edcebfff59965a"},"schema_version":"1.0","source":{"id":"2512.12772","kind":"arxiv","version":2}},"canonical_sha256":"3437f183612261dc99de1ab1b16c48b0a44c4604c1a8637a855d0b16143662e5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3437f183612261dc99de1ab1b16c48b0a44c4604c1a8637a855d0b16143662e5","first_computed_at":"2026-05-17T23:39:00.521344Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:00.521344Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"YneGQ2YW29E9hpUCWAZFOsj+9ZloSm2+9OHNVNjYP9aQgp1+6T2MXGQ1FOxzCmczhQsH5FEXBIuyYaYSzmYlBg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:00.522070Z","signed_message":"canonical_sha256_bytes"},"source_id":"2512.12772","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:2f74845a04097a06190c6ecce4c2b4a70deac00f5f67a62e06713769663094a5","sha256:6998f695f49a3a799e5150435a117f0bd050b6ba0e92177f19163c71a5ff9843"],"state_sha256":"775332e2233e56ac66c92647a4b2a9e39fb1d44278d28f65aa02f08ac8eb9992"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YPKfhAKuhxP1e4I9sV/uULqEF0wO4q0Pyi8FmzgiRHhQg4v95oQCR2gNRStF1EzfdzKgCELSbzQbyorJJZYsDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T04:43:36.849357Z","bundle_sha256":"f335d0bd0c4b976bcc1921d92c5fd844fe582d52c2de331191647fdd3e317a4c"}}