{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:IBU7HGJK27T6WSP7ALBMKAHUWE","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"9003ac28a69aa0f10bb75909789b3965bfc8a3e84ff787719e0f9567d1b0b158","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-23T17:58:02Z","title_canon_sha256":"65a8942ff8873dde292b8509507bb6db285d6de3ebb4f8fe11a1a464dd71012f"},"schema_version":"1.0","source":{"id":"2603.22278","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.22278","created_at":"2026-06-08T01:04:01Z"},{"alias_kind":"arxiv_version","alias_value":"2603.22278v2","created_at":"2026-06-08T01:04:01Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.22278","created_at":"2026-06-08T01:04:01Z"},{"alias_kind":"pith_short_12","alias_value":"IBU7HGJK27T6","created_at":"2026-06-08T01:04:01Z"},{"alias_kind":"pith_short_16","alias_value":"IBU7HGJK27T6WSP7","created_at":"2026-06-08T01:04:01Z"},{"alias_kind":"pith_short_8","alias_value":"IBU7HGJK","created_at":"2026-06-08T01:04:01Z"}],"graph_snapshots":[{"event_id":"sha256:4d566f806b66442238bf17a40972261acdeffe71d89fef9e484e5847ef8c79ee","target":"graph","created_at":"2026-06-08T01:04:01Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2603.22278/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Many multimodal tasks, such as image captioning and visual question answering, require vision-language models (VLMs) to bind objects with their properties and spatial relations. Yet it remains unclear where and how such associations are computed within VLMs. In this work, we show that VLMs rely on two concurrent mechanisms to represent spatial variable binding. In the language model backbone, intermediate layers represent content-independent spatial relations on top of visual tokens corresponding to objects. However, this mechanism plays only a secondary role in shaping model predictions. Inst","authors_text":"Antonio Torralba, Ayush Raina, David Bau, Kelly Cui, Nikhil Prakash, Shoval Messica, Tamar Rott Shaham","cross_cats":["cs.LG"],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-23T17:58:02Z","title":"The Dual Mechanisms of Spatial Variable Binding in Vision-Language Models"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.22278","kind":"arxiv","version":2},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0d76acd5a52762ae050843c305ffcf16ea1ce007194dbb21b7877bd7a9f44a02","target":"record","created_at":"2026-06-08T01:04:01Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"9003ac28a69aa0f10bb75909789b3965bfc8a3e84ff787719e0f9567d1b0b158","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-23T17:58:02Z","title_canon_sha256":"65a8942ff8873dde292b8509507bb6db285d6de3ebb4f8fe11a1a464dd71012f"},"schema_version":"1.0","source":{"id":"2603.22278","kind":"arxiv","version":2}},"canonical_sha256":"4069f3992ad7e7eb49ff02c2c500f4b101239f2303c15f1c77a1ae072c0e438f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"4069f3992ad7e7eb49ff02c2c500f4b101239f2303c15f1c77a1ae072c0e438f","first_computed_at":"2026-06-08T01:04:01.932818Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-08T01:04:01.932818Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"42rvPrC/pJPStcXh2hwiJPzRPI0ChGe2j8mwJGsgVqKX/Z4RFJTCy3VbVYmZ5xezWUqpkGuiwZ8SxRH9MOP6BA==","signature_status":"signed_v1","signed_at":"2026-06-08T01:04:01.933901Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.22278","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0d76acd5a52762ae050843c305ffcf16ea1ce007194dbb21b7877bd7a9f44a02","sha256:4d566f806b66442238bf17a40972261acdeffe71d89fef9e484e5847ef8c79ee"],"state_sha256":"fb81376d790ded0195f3dfed7c621f2217eab4e62e0954938cf9f9ed5951ed1b"}