{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:NOI6RE2BZ5EATX6SOCKGKDR2RZ","short_pith_number":"pith:NOI6RE2B","canonical_record":{"source":{"id":"2605.12586","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:57:21Z","cross_cats_sorted":["cs.AI","cs.DB"],"title_canon_sha256":"adeb545abf459c52b859a0d20949b0c248ca06ee1fafd74c9033433cdffd0be9","abstract_canon_sha256":"005ba91da2624f69649f69e4b9af5a4867033ae8696317380337d8fb81c19ecd"},"schema_version":"1.0"},"canonical_sha256":"6b91e89341cf4809dfd27094650e3a8e5032b5f57e5c9b3dfe9072a365942312","source":{"kind":"arxiv","id":"2605.12586","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12586","created_at":"2026-05-18T03:10:01Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12586v1","created_at":"2026-05-18T03:10:01Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12586","created_at":"2026-05-18T03:10:01Z"},{"alias_kind":"pith_short_12","alias_value":"NOI6RE2BZ5EA","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"NOI6RE2BZ5EATX6S","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"NOI6RE2B","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:NOI6RE2BZ5EATX6SOCKGKDR2RZ","target":"record","payload":{"canonical_record":{"source":{"id":"2605.12586","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:57:21Z","cross_cats_sorted":["cs.AI","cs.DB"],"title_canon_sha256":"adeb545abf459c52b859a0d20949b0c248ca06ee1fafd74c9033433cdffd0be9","abstract_canon_sha256":"005ba91da2624f69649f69e4b9af5a4867033ae8696317380337d8fb81c19ecd"},"schema_version":"1.0"},"canonical_sha256":"6b91e89341cf4809dfd27094650e3a8e5032b5f57e5c9b3dfe9072a365942312","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:10:01.354695Z","signature_b64":"Myeb0fcz6eoShNXi7nRDmti9PRHDxh2XtaJMr9fQmysw4DNgmIOs5tIGeA/Qn7zDTSm7xrA5+WHrUsxWuXzcDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6b91e89341cf4809dfd27094650e3a8e5032b5f57e5c9b3dfe9072a365942312","last_reissued_at":"2026-05-18T03:10:01.354139Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:10:01.354139Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.12586","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:10:01Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"lb/jSsDq/vtE8Xk3X6UYj4U+4XuWJQCmARzp0oFdWh2lIEc6XoZwXFPy9GKTM4Z2fEwFY79nGDY6H9/RnkVfCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T20:36:10.671531Z"},"content_sha256":"a6f44a33a91e84ced10bfc3671dd52c946aa4476272932f1a09a00b8797258b9","schema_version":"1.0","event_id":"sha256:a6f44a33a91e84ced10bfc3671dd52c946aa4476272932f1a09a00b8797258b9"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:NOI6RE2BZ5EATX6SOCKGKDR2RZ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"3D Primitives are a Spatial Language for VLMs","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Vision-language models gain spatial understanding when they reason through 3D geometric primitives written as executable code.","cross_cats":["cs.AI","cs.DB"],"primary_cat":"cs.CV","authors_text":"Alejandro Mottini, Anping Wang, Arvind Srinivasan, Florian Dubost, Junze Liu, Kai Zhong, Kun Qian, Nan Chen, Qingjun Cui, Sam Zhang, Tian Wang","submitted_at":"2026-05-12T17:57:21Z","abstract_excerpt":"Vision-language models (VLMs) exhibit a striking paradox: they can generate executable code that reconstructs a 3D scene from geometric primitives with correct object counts, classes, and approximate positions, yet the same models fail at simpler spatial questions on the same image. We show that 3D geometric primitives (cubes, spheres, cylinders, expressed in executable code) serve as a powerful intermediate representation for spatial understanding, and exploit this through three contributions. First, we introduce \\textbf{\\textsc{SpatialBabel}}, a benchmark evaluating fourteen VLMs on primitiv"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"3D geometric primitives (cubes, spheres, cylinders, expressed in executable code) serve as a powerful intermediate representation for spatial understanding.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That gains from Code-CoT and S³-FT stem specifically from the primitive representation rather than general code-generation prompting or fine-tuning effects alone.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"3D geometric primitives in executable code act as an effective intermediate spatial language that boosts VLMs on reconstruction and question-answering tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Vision-language models gain spatial understanding when they reason through 3D geometric primitives written as executable code.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3bebcda0b9adf748dac9230950364bbabdb5025f05688d4692c60411e1ea41ea"},"source":{"id":"2605.12586","kind":"arxiv","version":1},"verdict":{"id":"0ffd137e-929d-4091-aa48-30c988ec225c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T21:29:03.665353Z","strongest_claim":"3D geometric primitives (cubes, spheres, cylinders, expressed in executable code) serve as a powerful intermediate representation for spatial understanding.","one_line_summary":"3D geometric primitives in executable code act as an effective intermediate spatial language that boosts VLMs on reconstruction and question-answering tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That gains from Code-CoT and S³-FT stem specifically from the primitive representation rather than general code-generation prompting or fine-tuning effects alone.","pith_extraction_headline":"Vision-language models gain spatial understanding when they reason through 3D geometric primitives written as executable code."},"references":{"count":16,"sample":[{"doi":"","year":null,"title":"The spatial blindspot of vision-language models.arXiv preprint arXiv:2601.09954,","work_id":"3bba1383-654b-4e46-ac5c-0314aa8b8e5e","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","ref_index":2,"cited_arxiv_id":"2108.07732","is_internal_anchor":true},{"doi":"","year":null,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","ref_index":3,"cited_arxiv_id":"2107.03374","is_internal_anchor":true},{"doi":"","year":null,"title":"Why is spatial reasoning hard for vlms? an attention mechanism perspective on focus areas","work_id":"cb9bb652-e52c-4ea5-9f44-b1c3342e7c5c","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","ref_index":5,"cited_arxiv_id":"2310.06770","is_internal_anchor":true}],"resolved_work":16,"snapshot_sha256":"040996ff1b2b78171b3957a26a80151130acaac200ce5a615a7182071df1f219","internal_anchors":6},"formal_canon":{"evidence_count":2,"snapshot_sha256":"fd141b44cf23d998e598a5372615822f7d28a94de4ab909052ff24e397cbb592"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"0ffd137e-929d-4091-aa48-30c988ec225c"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:10:01Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"WmI7lSsEQVGFXNkMhmh2b4wCIK+flVCxTg1r4HU12Tmw71SE8uCtHRSqpxfwFRmNDeIxWPKYnCAIs7tslD/VBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T20:36:10.672485Z"},"content_sha256":"f064d1f3eddeaa9101dce537c2e3b7bd9f6b3a4fb0b8b745ca9cf173ff73ce14","schema_version":"1.0","event_id":"sha256:f064d1f3eddeaa9101dce537c2e3b7bd9f6b3a4fb0b8b745ca9cf173ff73ce14"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/NOI6RE2BZ5EATX6SOCKGKDR2RZ/bundle.json","state_url":"https://pith.science/pith/NOI6RE2BZ5EATX6SOCKGKDR2RZ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/NOI6RE2BZ5EATX6SOCKGKDR2RZ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T20:36:10Z","links":{"resolver":"https://pith.science/pith/NOI6RE2BZ5EATX6SOCKGKDR2RZ","bundle":"https://pith.science/pith/NOI6RE2BZ5EATX6SOCKGKDR2RZ/bundle.json","state":"https://pith.science/pith/NOI6RE2BZ5EATX6SOCKGKDR2RZ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/NOI6RE2BZ5EATX6SOCKGKDR2RZ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:NOI6RE2BZ5EATX6SOCKGKDR2RZ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"005ba91da2624f69649f69e4b9af5a4867033ae8696317380337d8fb81c19ecd","cross_cats_sorted":["cs.AI","cs.DB"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:57:21Z","title_canon_sha256":"adeb545abf459c52b859a0d20949b0c248ca06ee1fafd74c9033433cdffd0be9"},"schema_version":"1.0","source":{"id":"2605.12586","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.12586","created_at":"2026-05-18T03:10:01Z"},{"alias_kind":"arxiv_version","alias_value":"2605.12586v1","created_at":"2026-05-18T03:10:01Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.12586","created_at":"2026-05-18T03:10:01Z"},{"alias_kind":"pith_short_12","alias_value":"NOI6RE2BZ5EA","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"NOI6RE2BZ5EATX6S","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"NOI6RE2B","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:f064d1f3eddeaa9101dce537c2e3b7bd9f6b3a4fb0b8b745ca9cf173ff73ce14","target":"graph","created_at":"2026-05-18T03:10:01Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"3D geometric primitives (cubes, spheres, cylinders, expressed in executable code) serve as a powerful intermediate representation for spatial understanding."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That gains from Code-CoT and S³-FT stem specifically from the primitive representation rather than general code-generation prompting or fine-tuning effects alone."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"3D geometric primitives in executable code act as an effective intermediate spatial language that boosts VLMs on reconstruction and question-answering tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Vision-language models gain spatial understanding when they reason through 3D geometric primitives written as executable code."}],"snapshot_sha256":"3bebcda0b9adf748dac9230950364bbabdb5025f05688d4692c60411e1ea41ea"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"fd141b44cf23d998e598a5372615822f7d28a94de4ab909052ff24e397cbb592"},"paper":{"abstract_excerpt":"Vision-language models (VLMs) exhibit a striking paradox: they can generate executable code that reconstructs a 3D scene from geometric primitives with correct object counts, classes, and approximate positions, yet the same models fail at simpler spatial questions on the same image. We show that 3D geometric primitives (cubes, spheres, cylinders, expressed in executable code) serve as a powerful intermediate representation for spatial understanding, and exploit this through three contributions. First, we introduce \\textbf{\\textsc{SpatialBabel}}, a benchmark evaluating fourteen VLMs on primitiv","authors_text":"Alejandro Mottini, Anping Wang, Arvind Srinivasan, Florian Dubost, Junze Liu, Kai Zhong, Kun Qian, Nan Chen, Qingjun Cui, Sam Zhang, Tian Wang","cross_cats":["cs.AI","cs.DB"],"headline":"Vision-language models gain spatial understanding when they reason through 3D geometric primitives written as executable code.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:57:21Z","title":"3D Primitives are a Spatial Language for VLMs"},"references":{"count":16,"internal_anchors":6,"resolved_work":16,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"The spatial blindspot of vision-language models.arXiv preprint arXiv:2601.09954,","work_id":"3bba1383-654b-4e46-ac5c-0314aa8b8e5e","year":null},{"cited_arxiv_id":"2108.07732","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Program Synthesis with Large Language Models","work_id":"fd241a05-03b9-4de2-9588-9d77ce176125","year":null},{"cited_arxiv_id":"2107.03374","doi":"","is_internal_anchor":true,"ref_index":3,"title":"Evaluating Large Language Models Trained on Code","work_id":"042493e9-b26f-4b4e-bbde-382072ca9b08","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Why is spatial reasoning hard for vlms? an attention mechanism perspective on focus areas","work_id":"cb9bb652-e52c-4ea5-9f44-b1c3342e7c5c","year":null},{"cited_arxiv_id":"2310.06770","doi":"","is_internal_anchor":true,"ref_index":5,"title":"SWE-bench: Can Language Models Resolve Real-World GitHub Issues?","work_id":"d0effe15-a689-441a-8e3f-ea35f1c4e4b1","year":null}],"snapshot_sha256":"040996ff1b2b78171b3957a26a80151130acaac200ce5a615a7182071df1f219"},"source":{"id":"2605.12586","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T21:29:03.665353Z","id":"0ffd137e-929d-4091-aa48-30c988ec225c","model_set":{"reader":"grok-4.3"},"one_line_summary":"3D geometric primitives in executable code act as an effective intermediate spatial language that boosts VLMs on reconstruction and question-answering tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Vision-language models gain spatial understanding when they reason through 3D geometric primitives written as executable code.","strongest_claim":"3D geometric primitives (cubes, spheres, cylinders, expressed in executable code) serve as a powerful intermediate representation for spatial understanding.","weakest_assumption":"That gains from Code-CoT and S³-FT stem specifically from the primitive representation rather than general code-generation prompting or fine-tuning effects alone."}},"verdict_id":"0ffd137e-929d-4091-aa48-30c988ec225c"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a6f44a33a91e84ced10bfc3671dd52c946aa4476272932f1a09a00b8797258b9","target":"record","created_at":"2026-05-18T03:10:01Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"005ba91da2624f69649f69e4b9af5a4867033ae8696317380337d8fb81c19ecd","cross_cats_sorted":["cs.AI","cs.DB"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-12T17:57:21Z","title_canon_sha256":"adeb545abf459c52b859a0d20949b0c248ca06ee1fafd74c9033433cdffd0be9"},"schema_version":"1.0","source":{"id":"2605.12586","kind":"arxiv","version":1}},"canonical_sha256":"6b91e89341cf4809dfd27094650e3a8e5032b5f57e5c9b3dfe9072a365942312","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"6b91e89341cf4809dfd27094650e3a8e5032b5f57e5c9b3dfe9072a365942312","first_computed_at":"2026-05-18T03:10:01.354139Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:10:01.354139Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"Myeb0fcz6eoShNXi7nRDmti9PRHDxh2XtaJMr9fQmysw4DNgmIOs5tIGeA/Qn7zDTSm7xrA5+WHrUsxWuXzcDQ==","signature_status":"signed_v1","signed_at":"2026-05-18T03:10:01.354695Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.12586","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a6f44a33a91e84ced10bfc3671dd52c946aa4476272932f1a09a00b8797258b9","sha256:f064d1f3eddeaa9101dce537c2e3b7bd9f6b3a4fb0b8b745ca9cf173ff73ce14"],"state_sha256":"7e3278db5a873bb7630221686a957cbceda58fdcc872796a0997d11308e70934"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"uck02GaNrL5BvutudSGA9jfdwsaNEITgn20rRj1D1OG8R6NH0kflmdvJPgj/IEX5JxewKpBeeCV6KlyPI2WYBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T20:36:10.676902Z","bundle_sha256":"5c788837b2d2670bddf6977cf648b70d91ec1ec8108e194adffb42b7e00cc625"}}