{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:5F7NWAAXPLVHZEXZ54IRMT5N7Q","short_pith_number":"pith:5F7NWAAX","schema_version":"1.0","canonical_sha256":"e97edb00177aea7c92f9ef11164fadfc13f1dd29372bf57206ab68bf43113212","source":{"kind":"arxiv","id":"2509.17901","version":4},"attestation_state":"computed","paper":{"title":"Do Modern Video-LLMs Need to Listen? A Benchmark Audit and Scalable Remedy","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.MM","cs.SD"],"primary_cat":"cs.CV","authors_text":"Geewook Kim, Minjoon Seo","submitted_at":"2025-09-22T15:28:54Z","abstract_excerpt":"Speech and audio encoders developed over years of community effort are routinely excluded from video understanding pipelines, not because they fail, but because benchmarks never required listening. We audit 10 video benchmarks and find items largely solvable from visual cues alone: a single-frame probe answers about 76% of AVQA without audio, suggesting poor measurement of audio-visual reasoning. Building on LLaVA-OneVision, we attach a speech/audio encoder and compare five compressor architectures under 25-fold token reduction (25 Hz to 1 Hz). Across 10 benchmarks, with and without filtering,"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2509.17901","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-09-22T15:28:54Z","cross_cats_sorted":["cs.MM","cs.SD"],"title_canon_sha256":"342efc416f5c6445f8ad5c87639451d61e726d55f4b33a7ec9547d4202f560cc","abstract_canon_sha256":"1bb23ecf533a11fe267d3891738d2de9a04081141195a156b884c3e496f4e96a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T02:13:16.874148Z","signature_b64":"2iU121u2ZBVJ5Z4QSjLpBDE71A/2fQE68QgwPWK2nj662RPJ/h7vr1G3AxcQLTpFAB25VojiW0vNtP5VZeWIAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e97edb00177aea7c92f9ef11164fadfc13f1dd29372bf57206ab68bf43113212","last_reissued_at":"2026-06-23T02:13:16.873757Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T02:13:16.873757Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Do Modern Video-LLMs Need to Listen? A Benchmark Audit and Scalable Remedy","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.MM","cs.SD"],"primary_cat":"cs.CV","authors_text":"Geewook Kim, Minjoon Seo","submitted_at":"2025-09-22T15:28:54Z","abstract_excerpt":"Speech and audio encoders developed over years of community effort are routinely excluded from video understanding pipelines, not because they fail, but because benchmarks never required listening. We audit 10 video benchmarks and find items largely solvable from visual cues alone: a single-frame probe answers about 76% of AVQA without audio, suggesting poor measurement of audio-visual reasoning. Building on LLaVA-OneVision, we attach a speech/audio encoder and compare five compressor architectures under 25-fold token reduction (25 Hz to 1 Hz). Across 10 benchmarks, with and without filtering,"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.17901","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.17901/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.17901","created_at":"2026-06-23T02:13:16.873820+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.17901v4","created_at":"2026-06-23T02:13:16.873820+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.17901","created_at":"2026-06-23T02:13:16.873820+00:00"},{"alias_kind":"pith_short_12","alias_value":"5F7NWAAXPLVH","created_at":"2026-06-23T02:13:16.873820+00:00"},{"alias_kind":"pith_short_16","alias_value":"5F7NWAAXPLVHZEXZ","created_at":"2026-06-23T02:13:16.873820+00:00"},{"alias_kind":"pith_short_8","alias_value":"5F7NWAAX","created_at":"2026-06-23T02:13:16.873820+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.26232","citing_title":"Not All Modalities Are Equal: Instruction-Aware Gating for Multimodal Videos","ref_index":15,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q","json":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q.json","graph_json":"https://pith.science/api/pith-number/5F7NWAAXPLVHZEXZ54IRMT5N7Q/graph.json","events_json":"https://pith.science/api/pith-number/5F7NWAAXPLVHZEXZ54IRMT5N7Q/events.json","paper":"https://pith.science/paper/5F7NWAAX"},"agent_actions":{"view_html":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q","download_json":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q.json","view_paper":"https://pith.science/paper/5F7NWAAX","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.17901&json=true","fetch_graph":"https://pith.science/api/pith-number/5F7NWAAXPLVHZEXZ54IRMT5N7Q/graph.json","fetch_events":"https://pith.science/api/pith-number/5F7NWAAXPLVHZEXZ54IRMT5N7Q/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q/action/timestamp_anchor","attest_storage":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q/action/storage_attestation","attest_author":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q/action/author_attestation","sign_citation":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q/action/citation_signature","submit_replication":"https://pith.science/pith/5F7NWAAXPLVHZEXZ54IRMT5N7Q/action/replication_record"}},"created_at":"2026-06-23T02:13:16.873820+00:00","updated_at":"2026-06-23T02:13:16.873820+00:00"}