{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:3FBW3A4AFRFRBSMCK3MCSDEUVD","short_pith_number":"pith:3FBW3A4A","schema_version":"1.0","canonical_sha256":"d9436d83802c4b10c98256d8290c94a8f835f836115044eb71be6654a214e369","source":{"kind":"arxiv","id":"2605.05997","version":2},"attestation_state":"computed","paper":{"title":"4DThinker: Thinking with 4D Imagery for Dynamic Spatial Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"4DThinker lets vision-language models simulate evolving scenes inside their latent space for dynamic spatial reasoning from monocular video.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bo Li, Hongyu Li, Manyuan Zhang, Mingze Sun, Ruqi Huang, Shuang Chen, Xiang An, Xiaobin Hu, Xinlei Yu, Xin Xie, Zhangquan Chen, Zidong Wang","submitted_at":"2026-05-07T10:48:46Z","abstract_excerpt":"Dynamic spatial reasoning from monocular video is essential for bridging visual intelligence and the physical world, yet remains challenging for vision-language models (VLMs). Prior approaches either verbalize spatial-temporal reasoning entirely as text, which is inherently verbose and imprecise for complex dynamics, or rely on external geometric modules that increase inference complexity without fostering intrinsic model capability. In this paper, we present 4DThinker, the first framework that enables VLMs to \"think with 4D\" through dynamic latent mental imagery, i.e., internally simulating h"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.05997","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-07T10:48:46Z","cross_cats_sorted":[],"title_canon_sha256":"c76117be01773671c93740f35074247895a1389591a521ba58600e1e6ddd0340","abstract_canon_sha256":"f4b835433cc3360f396de4aeb7b3cf8f5c7766855e33bf2c12c6d76b1405e159"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-25T02:01:22.041829Z","signature_b64":"aGA3MNz0gQbZi/E7hb5f8/j6zgngXlHY/XcDdCvHMX0F9ji8fOBHEGk50nfeCKjveYnsuzACtrEsWVIq9Z9zDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d9436d83802c4b10c98256d8290c94a8f835f836115044eb71be6654a214e369","last_reissued_at":"2026-05-25T02:01:22.041210Z","signature_status":"signed_v1","first_computed_at":"2026-05-25T02:01:22.041210Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"4DThinker: Thinking with 4D Imagery for Dynamic Spatial Understanding","license":"http://creativecommons.org/licenses/by/4.0/","headline":"4DThinker lets vision-language models simulate evolving scenes inside their latent space for dynamic spatial reasoning from monocular video.","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Bo Li, Hongyu Li, Manyuan Zhang, Mingze Sun, Ruqi Huang, Shuang Chen, Xiang An, Xiaobin Hu, Xinlei Yu, Xin Xie, Zhangquan Chen, Zidong Wang","submitted_at":"2026-05-07T10:48:46Z","abstract_excerpt":"Dynamic spatial reasoning from monocular video is essential for bridging visual intelligence and the physical world, yet remains challenging for vision-language models (VLMs). Prior approaches either verbalize spatial-temporal reasoning entirely as text, which is inherently verbose and imprecise for complex dynamics, or rely on external geometric modules that increase inference complexity without fostering intrinsic model capability. In this paper, we present 4DThinker, the first framework that enables VLMs to \"think with 4D\" through dynamic latent mental imagery, i.e., internally simulating h"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"4DThinker is the first framework that enables VLMs to 'think with 4D' through dynamic latent mental imagery, and extensive experiments demonstrate that it consistently outperforms strong baselines on dynamic spatial reasoning benchmarks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the annotation-free 4D data synthesis pipeline produces sufficiently rich and accurate supervision signals, and that jointly training textual tokens with 4D latents via DIFT plus restricting 4DRL policy gradients to text tokens will yield stable and superior intrinsic dynamic reasoning without external geometric modules.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"4DThinker enables VLMs to perform dynamic spatial reasoning by internally simulating 4D imagery in latent space, outperforming prior text-based and modular approaches.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"4DThinker lets vision-language models simulate evolving scenes inside their latent space for dynamic spatial reasoning from monocular video.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"805ae014780d95237773309a5c07cc43c432b1a2df4593a5ddc42e2c34591e18"},"source":{"id":"2605.05997","kind":"arxiv","version":2},"verdict":{"id":"bf4cb2e5-ce86-43e2-98ca-6e412ff87f61","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-08T14:14:59.690166Z","strongest_claim":"4DThinker is the first framework that enables VLMs to 'think with 4D' through dynamic latent mental imagery, and extensive experiments demonstrate that it consistently outperforms strong baselines on dynamic spatial reasoning benchmarks.","one_line_summary":"4DThinker enables VLMs to perform dynamic spatial reasoning by internally simulating 4D imagery in latent space, outperforming prior text-based and modular approaches.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the annotation-free 4D data synthesis pipeline produces sufficiently rich and accurate supervision signals, and that jointly training textual tokens with 4D latents via DIFT plus restricting 4DRL policy gradients to text tokens will yield stable and superior intrinsic dynamic reasoning without external geometric modules.","pith_extraction_headline":"4DThinker lets vision-language models simulate evolving scenes inside their latent space for dynamic spatial reasoning from monocular video."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.05997/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-20T13:22:04.357253Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-20T08:39:19.201278Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T19:31:19.227551Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T13:04:52.958191Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"17a56dd6107344e775ab73f717d2a4ed8e01fc0780da6777b21258a40d79cb8d"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.05997","created_at":"2026-05-25T02:01:22.041313+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.05997v2","created_at":"2026-05-25T02:01:22.041313+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.05997","created_at":"2026-05-25T02:01:22.041313+00:00"},{"alias_kind":"pith_short_12","alias_value":"3FBW3A4AFRFR","created_at":"2026-05-25T02:01:22.041313+00:00"},{"alias_kind":"pith_short_16","alias_value":"3FBW3A4AFRFRBSMC","created_at":"2026-05-25T02:01:22.041313+00:00"},{"alias_kind":"pith_short_8","alias_value":"3FBW3A4A","created_at":"2026-05-25T02:01:22.041313+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD","json":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD.json","graph_json":"https://pith.science/api/pith-number/3FBW3A4AFRFRBSMCK3MCSDEUVD/graph.json","events_json":"https://pith.science/api/pith-number/3FBW3A4AFRFRBSMCK3MCSDEUVD/events.json","paper":"https://pith.science/paper/3FBW3A4A"},"agent_actions":{"view_html":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD","download_json":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD.json","view_paper":"https://pith.science/paper/3FBW3A4A","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.05997&json=true","fetch_graph":"https://pith.science/api/pith-number/3FBW3A4AFRFRBSMCK3MCSDEUVD/graph.json","fetch_events":"https://pith.science/api/pith-number/3FBW3A4AFRFRBSMCK3MCSDEUVD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD/action/storage_attestation","attest_author":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD/action/author_attestation","sign_citation":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD/action/citation_signature","submit_replication":"https://pith.science/pith/3FBW3A4AFRFRBSMCK3MCSDEUVD/action/replication_record"}},"created_at":"2026-05-25T02:01:22.041313+00:00","updated_at":"2026-05-25T02:01:22.041313+00:00"}