{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:TV7NHZHLXXNCXWA4LXXF226JZT","short_pith_number":"pith:TV7NHZHL","schema_version":"1.0","canonical_sha256":"9d7ed3e4ebbdda2bd81c5dee5d6bc9cccf3a5a9fece4451c88f4367671f9f67d","source":{"kind":"arxiv","id":"2603.19235","version":2},"attestation_state":"computed","paper":{"title":"Generation Models Know Space: Unleashing Implicit 3D Priors for Scene Understanding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Dingkang Liang, Kui Xia, Tianrui Feng, Xiang Bai, Xianjin Wu, Xiaofan Li, Xiao Tan, Yumeng Zhang","submitted_at":"2026-03-19T17:59:58Z","abstract_excerpt":"While Multimodal Large Language Models demonstrate impressive semantic capabilities, they often suffer from spatial blindness, struggling with fine-grained geometric reasoning and physical dynamics. Existing solutions typically rely on explicit 3D modalities or complex geometric scaffolding, which are limited by data scarcity and generalization challenges. In this work, we propose a paradigm shift by leveraging the implicit spatial prior within large-scale video generation models. We posit that to synthesize temporally coherent videos, these models inherently learn robust 3D structural priors "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2603.19235","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-19T17:59:58Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"7b69898d24382f274a14a6cd5678894ff2a2095779146f15b767446d924937b6","abstract_canon_sha256":"b3297281ef5444b447968ad4f29707dc8b7a2bb6d33937576cb0843b70777243"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T02:17:18.764641Z","signature_b64":"pvQapzWuKHAx1i0VkUvBjJ/qR0tkloOmH7ge6tT1/dUgtU4jK7lSeoVxF3Sb6f01FHr6REJ14BMbeAMMuHLUDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9d7ed3e4ebbdda2bd81c5dee5d6bc9cccf3a5a9fece4451c88f4367671f9f67d","last_reissued_at":"2026-06-30T02:17:18.763976Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T02:17:18.763976Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Generation Models Know Space: Unleashing Implicit 3D Priors for Scene Understanding","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Dingkang Liang, Kui Xia, Tianrui Feng, Xiang Bai, Xianjin Wu, Xiaofan Li, Xiao Tan, Yumeng Zhang","submitted_at":"2026-03-19T17:59:58Z","abstract_excerpt":"While Multimodal Large Language Models demonstrate impressive semantic capabilities, they often suffer from spatial blindness, struggling with fine-grained geometric reasoning and physical dynamics. Existing solutions typically rely on explicit 3D modalities or complex geometric scaffolding, which are limited by data scarcity and generalization challenges. In this work, we propose a paradigm shift by leveraging the implicit spatial prior within large-scale video generation models. We posit that to synthesize temporally coherent videos, these models inherently learn robust 3D structural priors "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.19235","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.19235/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2603.19235","created_at":"2026-06-30T02:17:18.764063+00:00"},{"alias_kind":"arxiv_version","alias_value":"2603.19235v2","created_at":"2026-06-30T02:17:18.764063+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.19235","created_at":"2026-06-30T02:17:18.764063+00:00"},{"alias_kind":"pith_short_12","alias_value":"TV7NHZHLXXNC","created_at":"2026-06-30T02:17:18.764063+00:00"},{"alias_kind":"pith_short_16","alias_value":"TV7NHZHLXXNCXWA4","created_at":"2026-06-30T02:17:18.764063+00:00"},{"alias_kind":"pith_short_8","alias_value":"TV7NHZHL","created_at":"2026-06-30T02:17:18.764063+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"2606.01955","citing_title":"WALL-WM: Carving World Action Modeling at the Event Joints","ref_index":78,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15876","citing_title":"Unlocking Dense Metric Depth Estimation in VLMs","ref_index":68,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15876","citing_title":"Unlocking Dense Metric Depth Estimation in VLMs","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09693","citing_title":"Do multimodal models imagine electric sheep?","ref_index":41,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT","json":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT.json","graph_json":"https://pith.science/api/pith-number/TV7NHZHLXXNCXWA4LXXF226JZT/graph.json","events_json":"https://pith.science/api/pith-number/TV7NHZHLXXNCXWA4LXXF226JZT/events.json","paper":"https://pith.science/paper/TV7NHZHL"},"agent_actions":{"view_html":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT","download_json":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT.json","view_paper":"https://pith.science/paper/TV7NHZHL","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2603.19235&json=true","fetch_graph":"https://pith.science/api/pith-number/TV7NHZHLXXNCXWA4LXXF226JZT/graph.json","fetch_events":"https://pith.science/api/pith-number/TV7NHZHLXXNCXWA4LXXF226JZT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT/action/storage_attestation","attest_author":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT/action/author_attestation","sign_citation":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT/action/citation_signature","submit_replication":"https://pith.science/pith/TV7NHZHLXXNCXWA4LXXF226JZT/action/replication_record"}},"created_at":"2026-06-30T02:17:18.764063+00:00","updated_at":"2026-06-30T02:17:18.764063+00:00"}