{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:I5Q2GXT6PRPJUNOWSPA4CDPYMU","short_pith_number":"pith:I5Q2GXT6","schema_version":"1.0","canonical_sha256":"4761a35e7e7c5e9a35d693c1c10df8651c1ed690f8ad515a954873b0898b0623","source":{"kind":"arxiv","id":"2605.27310","version":1},"attestation_state":"computed","paper":{"title":"How and What to Imagine? Visual Thinking in Unified Multimodal Models for Cross-View Spatial Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Aishwarya Agrawal, Ankur Sikarwar, Huy Le, Le Zhang, Perouz Taslakian, Qian Yang, Zhuan Shi","submitted_at":"2026-05-26T17:20:05Z","abstract_excerpt":"Cross-view spatial reasoning remains a weak spot for vision-language models (VLMs): they often reason in language and lose the fine-grained geometry needed for the task. Thinking with images aims to address this by generating an intermediate thinking image, but recent work shows that models often ignore the visual evidence in these traces. We therefore ask how to make visual thinking matter, and what kind of visual thinking works best. We study these questions in unified multimodal models (UMMs), which natively support interleaved image-text generation. For the first question, we propose View "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.27310","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-26T17:20:05Z","cross_cats_sorted":[],"title_canon_sha256":"1e8d49ba841d3b027901d62062a2c2c93d61cf404af0ea422b02d892723ecdf9","abstract_canon_sha256":"9b3a38bd1bb30b6b327e6aafa8f36bb7fb3ef0c0804b23270f4c262f09d645dd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-27T02:06:17.540683Z","signature_b64":"0YKursL282uuoeH+UKVkNytevaLNrjd2XMBEyOUjIUVjz6DeRVAnv0c1j2KadDlGZwe3zu2uFUWg8KVe/7Y+AA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4761a35e7e7c5e9a35d693c1c10df8651c1ed690f8ad515a954873b0898b0623","last_reissued_at":"2026-05-27T02:06:17.539901Z","signature_status":"signed_v1","first_computed_at":"2026-05-27T02:06:17.539901Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"How and What to Imagine? Visual Thinking in Unified Multimodal Models for Cross-View Spatial Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Aishwarya Agrawal, Ankur Sikarwar, Huy Le, Le Zhang, Perouz Taslakian, Qian Yang, Zhuan Shi","submitted_at":"2026-05-26T17:20:05Z","abstract_excerpt":"Cross-view spatial reasoning remains a weak spot for vision-language models (VLMs): they often reason in language and lose the fine-grained geometry needed for the task. Thinking with images aims to address this by generating an intermediate thinking image, but recent work shows that models often ignore the visual evidence in these traces. We therefore ask how to make visual thinking matter, and what kind of visual thinking works best. We study these questions in unified multimodal models (UMMs), which natively support interleaved image-text generation. For the first question, we propose View "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.27310","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.27310/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.27310","created_at":"2026-05-27T02:06:17.540005+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.27310v1","created_at":"2026-05-27T02:06:17.540005+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.27310","created_at":"2026-05-27T02:06:17.540005+00:00"},{"alias_kind":"pith_short_12","alias_value":"I5Q2GXT6PRPJ","created_at":"2026-05-27T02:06:17.540005+00:00"},{"alias_kind":"pith_short_16","alias_value":"I5Q2GXT6PRPJUNOW","created_at":"2026-05-27T02:06:17.540005+00:00"},{"alias_kind":"pith_short_8","alias_value":"I5Q2GXT6","created_at":"2026-05-27T02:06:17.540005+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU","json":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU.json","graph_json":"https://pith.science/api/pith-number/I5Q2GXT6PRPJUNOWSPA4CDPYMU/graph.json","events_json":"https://pith.science/api/pith-number/I5Q2GXT6PRPJUNOWSPA4CDPYMU/events.json","paper":"https://pith.science/paper/I5Q2GXT6"},"agent_actions":{"view_html":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU","download_json":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU.json","view_paper":"https://pith.science/paper/I5Q2GXT6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.27310&json=true","fetch_graph":"https://pith.science/api/pith-number/I5Q2GXT6PRPJUNOWSPA4CDPYMU/graph.json","fetch_events":"https://pith.science/api/pith-number/I5Q2GXT6PRPJUNOWSPA4CDPYMU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU/action/storage_attestation","attest_author":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU/action/author_attestation","sign_citation":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU/action/citation_signature","submit_replication":"https://pith.science/pith/I5Q2GXT6PRPJUNOWSPA4CDPYMU/action/replication_record"}},"created_at":"2026-05-27T02:06:17.540005+00:00","updated_at":"2026-05-27T02:06:17.540005+00:00"}