{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:AFH5LHCVFT5UG63OAEDO4ZYWDU","short_pith_number":"pith:AFH5LHCV","schema_version":"1.0","canonical_sha256":"014fd59c552cfb437b6e0106ee67161d16d7aa9f1a77dc36bbb6d53f63e99cc7","source":{"kind":"arxiv","id":"2606.09131","version":1},"attestation_state":"computed","paper":{"title":"Late-Layer Fusion is Enough: Dual-Path Vision Token Routing for Multimodal Large Language Models under Visual Saturation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.CV","cs.LG"],"primary_cat":"cs.AI","authors_text":"Jinyang Wu, Siyuan Liu","submitted_at":"2026-06-08T07:28:14Z","abstract_excerpt":"Multimodal large language models (MLLMs) commonly inherit the deep, symmetric Transformer backbone designed for unimodal text modeling, and apply the same computation uniformly to image and language tokens. This design overlooks a key modality asymmetry: image and text tokens differ substantially in information density, redundancy, and required reasoning depth. Through a layer-wise analysis of LLaVA-1.5, we observe that vision tokens tend to saturate in the middle layers. Specifically, text-to-image attention decreases from 0.68 at layer 0 to 0.07 by layer 4, and stabilizes near 0.04 after lay"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.09131","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.AI","submitted_at":"2026-06-08T07:28:14Z","cross_cats_sorted":["cs.CL","cs.CV","cs.LG"],"title_canon_sha256":"613de62861b4733df430d8e614b8f02eb0532bf4f7ab6620368b65848ac10072","abstract_canon_sha256":"e801c91ae6c8071dfb87bed05a73e1518847ee50e2903958f34bec3dc00155b8"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T02:08:01.028878Z","signature_b64":"ldOaHng1WR+ChpTesuZjGOvfIafR9hkJcEuAfm3vto3NKQgfMoy9uCcT3EhSbEJ+VmrLIf25w8c6DYpiLA6UCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"014fd59c552cfb437b6e0106ee67161d16d7aa9f1a77dc36bbb6d53f63e99cc7","last_reissued_at":"2026-06-09T02:08:01.028022Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T02:08:01.028022Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Late-Layer Fusion is Enough: Dual-Path Vision Token Routing for Multimodal Large Language Models under Visual Saturation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL","cs.CV","cs.LG"],"primary_cat":"cs.AI","authors_text":"Jinyang Wu, Siyuan Liu","submitted_at":"2026-06-08T07:28:14Z","abstract_excerpt":"Multimodal large language models (MLLMs) commonly inherit the deep, symmetric Transformer backbone designed for unimodal text modeling, and apply the same computation uniformly to image and language tokens. This design overlooks a key modality asymmetry: image and text tokens differ substantially in information density, redundancy, and required reasoning depth. Through a layer-wise analysis of LLaVA-1.5, we observe that vision tokens tend to saturate in the middle layers. Specifically, text-to-image attention decreases from 0.68 at layer 0 to 0.07 by layer 4, and stabilizes near 0.04 after lay"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.09131","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.09131/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.09131","created_at":"2026-06-09T02:08:01.028162+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.09131v1","created_at":"2026-06-09T02:08:01.028162+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.09131","created_at":"2026-06-09T02:08:01.028162+00:00"},{"alias_kind":"pith_short_12","alias_value":"AFH5LHCVFT5U","created_at":"2026-06-09T02:08:01.028162+00:00"},{"alias_kind":"pith_short_16","alias_value":"AFH5LHCVFT5UG63O","created_at":"2026-06-09T02:08:01.028162+00:00"},{"alias_kind":"pith_short_8","alias_value":"AFH5LHCV","created_at":"2026-06-09T02:08:01.028162+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU","json":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU.json","graph_json":"https://pith.science/api/pith-number/AFH5LHCVFT5UG63OAEDO4ZYWDU/graph.json","events_json":"https://pith.science/api/pith-number/AFH5LHCVFT5UG63OAEDO4ZYWDU/events.json","paper":"https://pith.science/paper/AFH5LHCV"},"agent_actions":{"view_html":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU","download_json":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU.json","view_paper":"https://pith.science/paper/AFH5LHCV","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.09131&json=true","fetch_graph":"https://pith.science/api/pith-number/AFH5LHCVFT5UG63OAEDO4ZYWDU/graph.json","fetch_events":"https://pith.science/api/pith-number/AFH5LHCVFT5UG63OAEDO4ZYWDU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU/action/storage_attestation","attest_author":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU/action/author_attestation","sign_citation":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU/action/citation_signature","submit_replication":"https://pith.science/pith/AFH5LHCVFT5UG63OAEDO4ZYWDU/action/replication_record"}},"created_at":"2026-06-09T02:08:01.028162+00:00","updated_at":"2026-06-09T02:08:01.028162+00:00"}