{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:ZQIGTOMZVV6NZV46VFDBKNA5GR","short_pith_number":"pith:ZQIGTOMZ","canonical_record":{"source":{"id":"2506.09965","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2025-06-11T17:41:50Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"49c6f98b018f769c5ca15f31125e0aa5c5cda43b0936e558bf06194b35f34ade","abstract_canon_sha256":"17d4e617e19f707f2010efdcf41dccb3a32e052c43e04f103239214ff95a16ae"},"schema_version":"1.0"},"canonical_sha256":"cc1069b999ad7cdcd79ea94615341d3443697b464b87cd791eda08e027553984","source":{"kind":"arxiv","id":"2506.09965","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2506.09965","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"arxiv_version","alias_value":"2506.09965v2","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.09965","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"pith_short_12","alias_value":"ZQIGTOMZVV6N","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"ZQIGTOMZVV6NZV46","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"ZQIGTOMZ","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:ZQIGTOMZVV6NZV46VFDBKNA5GR","target":"record","payload":{"canonical_record":{"source":{"id":"2506.09965","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2025-06-11T17:41:50Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"49c6f98b018f769c5ca15f31125e0aa5c5cda43b0936e558bf06194b35f34ade","abstract_canon_sha256":"17d4e617e19f707f2010efdcf41dccb3a32e052c43e04f103239214ff95a16ae"},"schema_version":"1.0"},"canonical_sha256":"cc1069b999ad7cdcd79ea94615341d3443697b464b87cd791eda08e027553984","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:15.062806Z","signature_b64":"nNKpXqQTpdER9mTK6inaYsl3xqV1kllBoz471cf00j88MmkjI41bcIULmReUGhtyp6OmmTt9texZBhgmwWHuAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"cc1069b999ad7cdcd79ea94615341d3443697b464b87cd791eda08e027553984","last_reissued_at":"2026-05-17T23:38:15.062127Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:15.062127Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2506.09965","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"sm6XmX2eP448YfuX0j6xZCC+6a91Bc9yc+m5Jd4qksEVWzbTyc9T3glxdP9ULRPflBAg0XA/gA4+KGRmPhiGCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T21:42:11.287119Z"},"content_sha256":"196591e7758671a36c49a160d6f9f0a7b3bcd27edb22006701840cbb092ff2eb","schema_version":"1.0","event_id":"sha256:196591e7758671a36c49a160d6f9f0a7b3bcd27edb22006701840cbb092ff2eb"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:ZQIGTOMZVV6NZV46VFDBKNA5GR","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Reinforcing Spatial Reasoning in Vision-Language Models with Interwoven Thinking and Visual Drawing","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","headline":"Vision-language models improve spatial reasoning by drawing boxes and lines on images during thinking.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Jian Guan, Junfei Wu, Kaituo Feng, Liang Wang, Qiang Liu, Shu Wu, Tieniu Tan, Wei Wu","submitted_at":"2025-06-11T17:41:50Z","abstract_excerpt":"As textual reasoning with large language models (LLMs) has advanced significantly, there has been growing interest in enhancing the multimodal reasoning capabilities of large vision-language models (LVLMs). However, existing methods primarily approach multimodal reasoning in a straightforward, text-centric manner, where both reasoning and answer derivation are conducted purely through text, with the only difference being the presence of multimodal input. As a result, these methods often encounter fundamental limitations in spatial reasoning tasks that demand precise geometric understanding and"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"our model, named VILASR, consistently outperforms existing methods across diverse spatial reasoning benchmarks, involving maze navigation, static spatial reasoning, video-based reasoning, and multi-view-based reasoning tasks, with an average improvement of 18.4%.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That basic drawing operations (annotating bounding boxes and drawing auxiliary lines) can be learned and used by LVLMs to achieve precise geometric understanding and continuous spatial tracking without specialized external perception tools.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"VILASR integrates visual drawing operations with reasoning in LVLMs via cold-start synthetic training, reflective rejection sampling, and reinforcement learning, yielding an 18.4% average gain on spatial reasoning benchmarks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Vision-language models improve spatial reasoning by drawing boxes and lines on images during thinking.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3bf2a0083579bee24f385826283e3fe21b7a9747c7c929c43ba0939380ffd02e"},"source":{"id":"2506.09965","kind":"arxiv","version":2},"verdict":{"id":"47369373-6a09-416d-b94f-d11b2d96a3cb","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T04:53:42.203896Z","strongest_claim":"our model, named VILASR, consistently outperforms existing methods across diverse spatial reasoning benchmarks, involving maze navigation, static spatial reasoning, video-based reasoning, and multi-view-based reasoning tasks, with an average improvement of 18.4%.","one_line_summary":"VILASR integrates visual drawing operations with reasoning in LVLMs via cold-start synthetic training, reflective rejection sampling, and reinforcement learning, yielding an 18.4% average gain on spatial reasoning benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That basic drawing operations (annotating bounding boxes and drawing auxiliary lines) can be learned and used by LVLMs to achieve precise geometric understanding and continuous spatial tracking without specialized external perception tools.","pith_extraction_headline":"Vision-language models improve spatial reasoning by drawing boxes and lines on images during thinking."},"references":{"count":82,"sample":[{"doi":"","year":2024,"title":"Self-RAG: Learning to retrieve, generate, and critique through self-reflection","work_id":"da3d632e-f0c3-464e-9cae-ad09201f96eb","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":2,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":2008,"title":"Spatial cognition and the brain","work_id":"f820e4fa-cc03-42e8-aab3-bd12faf6dc43","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Spatialbot: Precise spatial understanding with vision language models, 2025","work_id":"2b053aa0-1df8-4728-aa94-69ac92bbf110","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Spatialvlm: Endowing vision-language models with spatial reasoning capabilities","work_id":"62d217f3-07e5-4f8c-8252-21b4514bbae2","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":82,"snapshot_sha256":"6ce627b12ae6db4de84fc4bad84824023d061cdac82ba10da7f4980c0fe2dcdc","internal_anchors":11},"formal_canon":{"evidence_count":1,"snapshot_sha256":"2acacb8b411c1ec02311b29a916a4620c9b9f7637127543c945c6ca860977935"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"47369373-6a09-416d-b94f-d11b2d96a3cb"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:15Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YPUuF464qsSSuBw5U1jcLK/+p90f7jY3tvYAi/z4oghxrY3gN3KCDikSY9JKtB7AxMPFSGrNdT0ChOu9ejkiCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T21:42:11.287844Z"},"content_sha256":"55f25b70941030093109ceeff1fcc73c51c8e32ddddbd42cf3bfc00fcd891c98","schema_version":"1.0","event_id":"sha256:55f25b70941030093109ceeff1fcc73c51c8e32ddddbd42cf3bfc00fcd891c98"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/ZQIGTOMZVV6NZV46VFDBKNA5GR/bundle.json","state_url":"https://pith.science/pith/ZQIGTOMZVV6NZV46VFDBKNA5GR/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/ZQIGTOMZVV6NZV46VFDBKNA5GR/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T21:42:11Z","links":{"resolver":"https://pith.science/pith/ZQIGTOMZVV6NZV46VFDBKNA5GR","bundle":"https://pith.science/pith/ZQIGTOMZVV6NZV46VFDBKNA5GR/bundle.json","state":"https://pith.science/pith/ZQIGTOMZVV6NZV46VFDBKNA5GR/state.json","well_known_bundle":"https://pith.science/.well-known/pith/ZQIGTOMZVV6NZV46VFDBKNA5GR/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:ZQIGTOMZVV6NZV46VFDBKNA5GR","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"17d4e617e19f707f2010efdcf41dccb3a32e052c43e04f103239214ff95a16ae","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2025-06-11T17:41:50Z","title_canon_sha256":"49c6f98b018f769c5ca15f31125e0aa5c5cda43b0936e558bf06194b35f34ade"},"schema_version":"1.0","source":{"id":"2506.09965","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2506.09965","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"arxiv_version","alias_value":"2506.09965v2","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.09965","created_at":"2026-05-17T23:38:15Z"},{"alias_kind":"pith_short_12","alias_value":"ZQIGTOMZVV6N","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"ZQIGTOMZVV6NZV46","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"ZQIGTOMZ","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:55f25b70941030093109ceeff1fcc73c51c8e32ddddbd42cf3bfc00fcd891c98","target":"graph","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"our model, named VILASR, consistently outperforms existing methods across diverse spatial reasoning benchmarks, involving maze navigation, static spatial reasoning, video-based reasoning, and multi-view-based reasoning tasks, with an average improvement of 18.4%."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That basic drawing operations (annotating bounding boxes and drawing auxiliary lines) can be learned and used by LVLMs to achieve precise geometric understanding and continuous spatial tracking without specialized external perception tools."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"VILASR integrates visual drawing operations with reasoning in LVLMs via cold-start synthetic training, reflective rejection sampling, and reinforcement learning, yielding an 18.4% average gain on spatial reasoning benchmarks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Vision-language models improve spatial reasoning by drawing boxes and lines on images during thinking."}],"snapshot_sha256":"3bf2a0083579bee24f385826283e3fe21b7a9747c7c929c43ba0939380ffd02e"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"2acacb8b411c1ec02311b29a916a4620c9b9f7637127543c945c6ca860977935"},"paper":{"abstract_excerpt":"As textual reasoning with large language models (LLMs) has advanced significantly, there has been growing interest in enhancing the multimodal reasoning capabilities of large vision-language models (LVLMs). However, existing methods primarily approach multimodal reasoning in a straightforward, text-centric manner, where both reasoning and answer derivation are conducted purely through text, with the only difference being the presence of multimodal input. As a result, these methods often encounter fundamental limitations in spatial reasoning tasks that demand precise geometric understanding and","authors_text":"Jian Guan, Junfei Wu, Kaituo Feng, Liang Wang, Qiang Liu, Shu Wu, Tieniu Tan, Wei Wu","cross_cats":["cs.AI"],"headline":"Vision-language models improve spatial reasoning by drawing boxes and lines on images during thinking.","license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2025-06-11T17:41:50Z","title":"Reinforcing Spatial Reasoning in Vision-Language Models with Interwoven Thinking and Visual Drawing"},"references":{"count":82,"internal_anchors":11,"resolved_work":82,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Self-RAG: Learning to retrieve, generate, and critique through self-reflection","work_id":"da3d632e-f0c3-464e-9cae-ad09201f96eb","year":2024},{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Spatial cognition and the brain","work_id":"f820e4fa-cc03-42e8-aab3-bd12faf6dc43","year":2008},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Spatialbot: Precise spatial understanding with vision language models, 2025","work_id":"2b053aa0-1df8-4728-aa94-69ac92bbf110","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Spatialvlm: Endowing vision-language models with spatial reasoning capabilities","work_id":"62d217f3-07e5-4f8c-8252-21b4514bbae2","year":2024}],"snapshot_sha256":"6ce627b12ae6db4de84fc4bad84824023d061cdac82ba10da7f4980c0fe2dcdc"},"source":{"id":"2506.09965","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T04:53:42.203896Z","id":"47369373-6a09-416d-b94f-d11b2d96a3cb","model_set":{"reader":"grok-4.3"},"one_line_summary":"VILASR integrates visual drawing operations with reasoning in LVLMs via cold-start synthetic training, reflective rejection sampling, and reinforcement learning, yielding an 18.4% average gain on spatial reasoning benchmarks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Vision-language models improve spatial reasoning by drawing boxes and lines on images during thinking.","strongest_claim":"our model, named VILASR, consistently outperforms existing methods across diverse spatial reasoning benchmarks, involving maze navigation, static spatial reasoning, video-based reasoning, and multi-view-based reasoning tasks, with an average improvement of 18.4%.","weakest_assumption":"That basic drawing operations (annotating bounding boxes and drawing auxiliary lines) can be learned and used by LVLMs to achieve precise geometric understanding and continuous spatial tracking without specialized external perception tools."}},"verdict_id":"47369373-6a09-416d-b94f-d11b2d96a3cb"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:196591e7758671a36c49a160d6f9f0a7b3bcd27edb22006701840cbb092ff2eb","target":"record","created_at":"2026-05-17T23:38:15Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"17d4e617e19f707f2010efdcf41dccb3a32e052c43e04f103239214ff95a16ae","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by-nc-nd/4.0/","primary_cat":"cs.CV","submitted_at":"2025-06-11T17:41:50Z","title_canon_sha256":"49c6f98b018f769c5ca15f31125e0aa5c5cda43b0936e558bf06194b35f34ade"},"schema_version":"1.0","source":{"id":"2506.09965","kind":"arxiv","version":2}},"canonical_sha256":"cc1069b999ad7cdcd79ea94615341d3443697b464b87cd791eda08e027553984","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"cc1069b999ad7cdcd79ea94615341d3443697b464b87cd791eda08e027553984","first_computed_at":"2026-05-17T23:38:15.062127Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:15.062127Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"nNKpXqQTpdER9mTK6inaYsl3xqV1kllBoz471cf00j88MmkjI41bcIULmReUGhtyp6OmmTt9texZBhgmwWHuAQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:15.062806Z","signed_message":"canonical_sha256_bytes"},"source_id":"2506.09965","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:196591e7758671a36c49a160d6f9f0a7b3bcd27edb22006701840cbb092ff2eb","sha256:55f25b70941030093109ceeff1fcc73c51c8e32ddddbd42cf3bfc00fcd891c98"],"state_sha256":"cb2953cb2e10459a89574f44211051e503c3c363d8ce072cc84039982d38744a"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"K4YmufGavcd2KAptsCBjKLu+WvOF0vXVftBNiSVjM997IWKLvFP5hGbI2pTVp+gQJ8+luxFu6idTxX/iYeomBg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T21:42:11.290952Z","bundle_sha256":"65fa3032b19be0ce8ff6d0968703b927a23f538315df450668d33c45d7ac4a75"}}