{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:TQDOVCMHRCNRS5S6J2XKU7KCOR","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"61a43d92cecf1218fb1e78227d7db963043a0dd84bc744fdcdfbc7f7ac76aa45","cross_cats_sorted":["cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-21T05:18:57Z","title_canon_sha256":"ae0469acb8b7b9b8a2ad95402d4e428399136b96bad89c1f181eb46da9552bf7"},"schema_version":"1.0","source":{"id":"2605.22012","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.22012","created_at":"2026-05-22T01:04:20Z"},{"alias_kind":"arxiv_version","alias_value":"2605.22012v1","created_at":"2026-05-22T01:04:20Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.22012","created_at":"2026-05-22T01:04:20Z"},{"alias_kind":"pith_short_12","alias_value":"TQDOVCMHRCNR","created_at":"2026-05-22T01:04:20Z"},{"alias_kind":"pith_short_16","alias_value":"TQDOVCMHRCNRS5S6","created_at":"2026-05-22T01:04:20Z"},{"alias_kind":"pith_short_8","alias_value":"TQDOVCMH","created_at":"2026-05-22T01:04:20Z"}],"graph_snapshots":[{"event_id":"sha256:3a2f498241e87342a0dc260248b1e164711e8a09acb9f014d86112bbc3046d95","target":"graph","created_at":"2026-05-22T01:04:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.22012/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Joint audio-visual reasoning is essential for omnimodal understanding, yet current multimodal large language models (MLLMs) still struggle when reasoning requires fine-grained evidence from both modalities. A central limitation is that explicit text-based chain-of-thought (CoT) compresses continuous audio-visual signals into discrete tokens, weakening temporal grounding and shifting intermediate reasoning toward language priors. We argue that a unified latent space is a better medium for such reasoning because it preserves dense sensory information while remaining compatible with autoregressiv","authors_text":"Bingyin Mei, Bohan Zeng, Bozhou Li, Chengzhuo Tong, Daili Hua, Fangcheng Fu, Hao Liang, Jialing Liu, Junbo Niu, Pengfei Wan, Tianyu Guo, Wentao Zhang, Xiaochen Ma, Yang Shi, Yifan Dai, Yiyan Ji, Yuanxing Zhang, Yue Ding, Yuran Wang, Yushuo Guan, Zhenhua Wu","cross_cats":["cs.CV"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-21T05:18:57Z","title":"LatentOmni: Rethinking Omni-Modal Understanding via Unified Audio-Visual Latent Reasoning"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.22012","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f2414984a73ef04d1b089468559f560e7ea0f33cc7a03430bac3ffada855f1ef","target":"record","created_at":"2026-05-22T01:04:20Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"61a43d92cecf1218fb1e78227d7db963043a0dd84bc744fdcdfbc7f7ac76aa45","cross_cats_sorted":["cs.CV"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2026-05-21T05:18:57Z","title_canon_sha256":"ae0469acb8b7b9b8a2ad95402d4e428399136b96bad89c1f181eb46da9552bf7"},"schema_version":"1.0","source":{"id":"2605.22012","kind":"arxiv","version":1}},"canonical_sha256":"9c06ea8987889b19765e4eaeaa7d427441de90faf8ee7a685f5e010f2b41885c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"9c06ea8987889b19765e4eaeaa7d427441de90faf8ee7a685f5e010f2b41885c","first_computed_at":"2026-05-22T01:04:20.493677Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-22T01:04:20.493677Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"njpUTeBvOJgCAezDnLreJjaRzxZUVTJixngHN4ENHEviiCer2hx0JnjqkOaDAqATNPR4puNwlkYrojrCN/oXDA==","signature_status":"signed_v1","signed_at":"2026-05-22T01:04:20.494426Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.22012","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f2414984a73ef04d1b089468559f560e7ea0f33cc7a03430bac3ffada855f1ef","sha256:3a2f498241e87342a0dc260248b1e164711e8a09acb9f014d86112bbc3046d95"],"state_sha256":"740341e1febe91dd996d6dafc38a2536dd8975dd9d90aebe31e324126e1e62a0"}