{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:NG6ZB7LZWQLRZR7REMX2MFMCGI","short_pith_number":"pith:NG6ZB7LZ","canonical_record":{"source":{"id":"2602.07574","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-07T14:46:05Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"51c56d43dedde3eb2fb8672e8a9e4cff0361af43edd59f664a635f31bbea0a61","abstract_canon_sha256":"d7a6fa4cabdb4d5853c17bde68203f05ba5941a6d596e9fa4dbe13022bcc2ffb"},"schema_version":"1.0"},"canonical_sha256":"69bd90fd79b4171cc7f1232fa615823232b2e59ae0b7e294905836e2b3477781","source":{"kind":"arxiv","id":"2602.07574","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.07574","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"arxiv_version","alias_value":"2602.07574v2","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.07574","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_12","alias_value":"NG6ZB7LZWQLR","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_16","alias_value":"NG6ZB7LZWQLRZR7R","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_8","alias_value":"NG6ZB7LZ","created_at":"2026-05-28T01:04:36Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:NG6ZB7LZWQLRZR7REMX2MFMCGI","target":"record","payload":{"canonical_record":{"source":{"id":"2602.07574","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-07T14:46:05Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"51c56d43dedde3eb2fb8672e8a9e4cff0361af43edd59f664a635f31bbea0a61","abstract_canon_sha256":"d7a6fa4cabdb4d5853c17bde68203f05ba5941a6d596e9fa4dbe13022bcc2ffb"},"schema_version":"1.0"},"canonical_sha256":"69bd90fd79b4171cc7f1232fa615823232b2e59ae0b7e294905836e2b3477781","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-28T01:04:36.557759Z","signature_b64":"vBj1QGBjdg0IO0d3DC70aq/Q0vPTxg/I5IKJBAzeYA5FqzmbSXPYOe+uBaxWMfaWS487FihuUlrky9Oigru1DA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"69bd90fd79b4171cc7f1232fa615823232b2e59ae0b7e294905836e2b3477781","last_reissued_at":"2026-05-28T01:04:36.557344Z","signature_status":"signed_v1","first_computed_at":"2026-05-28T01:04:36.557344Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2602.07574","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-28T01:04:36Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"PUJu5l4LlkzB2zemHNRM+ZGpK2FrrCYaV4iDz0L5U43aZcZcnJpq+NzyzpM3tLucZmLyLiU6tG4WStxRupw6Bw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T12:39:32.358717Z"},"content_sha256":"0c17c39ad84c71891390472c5471cbe82f2404d7909080059eec84aee2843e06","schema_version":"1.0","event_id":"sha256:0c17c39ad84c71891390472c5471cbe82f2404d7909080059eec84aee2843e06"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:NG6ZB7LZWQLRZR7REMX2MFMCGI","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"ViCA: Efficient Multimodal LLMs with Vision-Only Cross-Attention","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Anhao Zhao, Hao Wu, Wenjie Liu, Xiaoyu Shen, Xin Qiu, Xudong Wang, Yihan Zhang, Yingqi Fan, Yunpu Ma","submitted_at":"2026-02-07T14:46:05Z","abstract_excerpt":"Modern multimodal large language models (MLLMs) adopt a unified self-attention design that processes visual and textual tokens at every Transformer layer, incurring substantial computational overhead. In this work, we revisit the necessity of such dense visual processing and show that projected visual embeddings are already well-aligned with the language space, while effective vision-language interaction occurs in only a small subset of layers. Based on these insights, we propose ViCA (Vision-only Cross-Attention), a minimal MLLM architecture in which visual tokens bypass all self-attention an"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.07574","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.07574/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-28T01:04:36Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"haee/jykTZZb2SOW/ZXeKtdwVxJXwSD7ebCbQC4CP9UjIpjrs8YWNDNh+ueSaALxXwfdhg9N77fVLmdpAxRzDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T12:39:32.359094Z"},"content_sha256":"18dc94f124e5d97f08f424588aa92a9da8b90fa1a40470576b5fcbc7c71682c7","schema_version":"1.0","event_id":"sha256:18dc94f124e5d97f08f424588aa92a9da8b90fa1a40470576b5fcbc7c71682c7"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/NG6ZB7LZWQLRZR7REMX2MFMCGI/bundle.json","state_url":"https://pith.science/pith/NG6ZB7LZWQLRZR7REMX2MFMCGI/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/NG6ZB7LZWQLRZR7REMX2MFMCGI/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-01T12:39:32Z","links":{"resolver":"https://pith.science/pith/NG6ZB7LZWQLRZR7REMX2MFMCGI","bundle":"https://pith.science/pith/NG6ZB7LZWQLRZR7REMX2MFMCGI/bundle.json","state":"https://pith.science/pith/NG6ZB7LZWQLRZR7REMX2MFMCGI/state.json","well_known_bundle":"https://pith.science/.well-known/pith/NG6ZB7LZWQLRZR7REMX2MFMCGI/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:NG6ZB7LZWQLRZR7REMX2MFMCGI","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"d7a6fa4cabdb4d5853c17bde68203f05ba5941a6d596e9fa4dbe13022bcc2ffb","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-07T14:46:05Z","title_canon_sha256":"51c56d43dedde3eb2fb8672e8a9e4cff0361af43edd59f664a635f31bbea0a61"},"schema_version":"1.0","source":{"id":"2602.07574","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2602.07574","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"arxiv_version","alias_value":"2602.07574v2","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.07574","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_12","alias_value":"NG6ZB7LZWQLR","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_16","alias_value":"NG6ZB7LZWQLRZR7R","created_at":"2026-05-28T01:04:36Z"},{"alias_kind":"pith_short_8","alias_value":"NG6ZB7LZ","created_at":"2026-05-28T01:04:36Z"}],"graph_snapshots":[{"event_id":"sha256:18dc94f124e5d97f08f424588aa92a9da8b90fa1a40470576b5fcbc7c71682c7","target":"graph","created_at":"2026-05-28T01:04:36Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2602.07574/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Modern multimodal large language models (MLLMs) adopt a unified self-attention design that processes visual and textual tokens at every Transformer layer, incurring substantial computational overhead. In this work, we revisit the necessity of such dense visual processing and show that projected visual embeddings are already well-aligned with the language space, while effective vision-language interaction occurs in only a small subset of layers. Based on these insights, we propose ViCA (Vision-only Cross-Attention), a minimal MLLM architecture in which visual tokens bypass all self-attention an","authors_text":"Anhao Zhao, Hao Wu, Wenjie Liu, Xiaoyu Shen, Xin Qiu, Xudong Wang, Yihan Zhang, Yingqi Fan, Yunpu Ma","cross_cats":["cs.CL"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-07T14:46:05Z","title":"ViCA: Efficient Multimodal LLMs with Vision-Only Cross-Attention"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.07574","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0c17c39ad84c71891390472c5471cbe82f2404d7909080059eec84aee2843e06","target":"record","created_at":"2026-05-28T01:04:36Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"d7a6fa4cabdb4d5853c17bde68203f05ba5941a6d596e9fa4dbe13022bcc2ffb","cross_cats_sorted":["cs.CL"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-02-07T14:46:05Z","title_canon_sha256":"51c56d43dedde3eb2fb8672e8a9e4cff0361af43edd59f664a635f31bbea0a61"},"schema_version":"1.0","source":{"id":"2602.07574","kind":"arxiv","version":2}},"canonical_sha256":"69bd90fd79b4171cc7f1232fa615823232b2e59ae0b7e294905836e2b3477781","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"69bd90fd79b4171cc7f1232fa615823232b2e59ae0b7e294905836e2b3477781","first_computed_at":"2026-05-28T01:04:36.557344Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-28T01:04:36.557344Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"vBj1QGBjdg0IO0d3DC70aq/Q0vPTxg/I5IKJBAzeYA5FqzmbSXPYOe+uBaxWMfaWS487FihuUlrky9Oigru1DA==","signature_status":"signed_v1","signed_at":"2026-05-28T01:04:36.557759Z","signed_message":"canonical_sha256_bytes"},"source_id":"2602.07574","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0c17c39ad84c71891390472c5471cbe82f2404d7909080059eec84aee2843e06","sha256:18dc94f124e5d97f08f424588aa92a9da8b90fa1a40470576b5fcbc7c71682c7"],"state_sha256":"62214f59d39739d4dd577cc8d44c84a87f468891ebffd2b232130a32f87b894e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1mDLbHGlAETHgmRE0CTnhmlQaakVFRQdYKenw18FSvNNfCNq/0VOhUm/cHna7P4X8PSCrEIxeNCBxxgEiJwWBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-01T12:39:32.361137Z","bundle_sha256":"1b47f6b97d9f1ffd091086ac92700a996551a7985d41c8814a2774f100e3fd8b"}}