{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:HS5S2APJEFBPZE5OGTD5PAFGWU","short_pith_number":"pith:HS5S2APJ","canonical_record":{"source":{"id":"2505.23747","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-29T17:59:04Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"f3bf03a7423e285470a8a9b66de09bad5f8da3292e1f37ffdaba5dd24a172cd6","abstract_canon_sha256":"147e0d7fc614f806400cd4c3204facd9d7188c981e2bb33eef44a872e1a7b4bd"},"schema_version":"1.0"},"canonical_sha256":"3cbb2d01e92142fc93ae34c7d780a6b5155da6d2bc87b6c8db06e493bb4d329c","source":{"kind":"arxiv","id":"2505.23747","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2505.23747","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2505.23747v1","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.23747","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"HS5S2APJEFBP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"HS5S2APJEFBPZE5O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"HS5S2APJ","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:HS5S2APJEFBPZE5OGTD5PAFGWU","target":"record","payload":{"canonical_record":{"source":{"id":"2505.23747","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-29T17:59:04Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"f3bf03a7423e285470a8a9b66de09bad5f8da3292e1f37ffdaba5dd24a172cd6","abstract_canon_sha256":"147e0d7fc614f806400cd4c3204facd9d7188c981e2bb33eef44a872e1a7b4bd"},"schema_version":"1.0"},"canonical_sha256":"3cbb2d01e92142fc93ae34c7d780a6b5155da6d2bc87b6c8db06e493bb4d329c","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.490665Z","signature_b64":"ZGOBgO4Wnyt0JQYUQ2nwImSoqw7p/as1jAusCNV56A8gX/mskEMIB5ZVeAlcbjDD/htmzB7DWJRNHw9ZWqjmAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3cbb2d01e92142fc93ae34c7d780a6b5155da6d2bc87b6c8db06e493bb4d329c","last_reissued_at":"2026-05-17T23:38:48.490187Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.490187Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2505.23747","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"V+wYo+hi7rJ+YNYoDe6VmN8V132vzSwYUkdBeEBmfUtZ0MsyAjPm13Vv8pp+UygQLXQl+jywuD3jfXm05Gk0BQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T15:38:32.690974Z"},"content_sha256":"11a7453195d5d09ea85f621e5261818c699e4bb21f9ad377fa34a367b52c142e","schema_version":"1.0","event_id":"sha256:11a7453195d5d09ea85f621e5261818c699e4bb21f9ad377fa34a367b52c142e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:HS5S2APJEFBPZE5OGTD5PAFGWU","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Spatial-MLLM: Boosting MLLM Capabilities in Visual-based Spatial Intelligence","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Spatial-MLLM equips multimodal language models with stronger 3D spatial reasoning using only 2D image and video inputs.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CV","authors_text":"Diankun Wu, Fangfu Liu, Yi-Hsin Hung, Yueqi Duan","submitted_at":"2025-05-29T17:59:04Z","abstract_excerpt":"Recent advancements in Multimodal Large Language Models (MLLMs) have significantly enhanced performance on 2D visual tasks. However, improving their spatial intelligence remains a challenge. Existing 3D MLLMs always rely on additional 3D or 2.5D data to incorporate spatial awareness, restricting their utility in scenarios with only 2D inputs, such as images or videos. In this paper, we present Spatial-MLLM, a novel framework for visual-based spatial reasoning from purely 2D observations. Unlike conventional video MLLMs which rely on CLIP-based visual encoders optimized for semantic understandi"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"our spatial-MLLM achieves state-of-the-art performance in a wide range of visual-based spatial understanding and reasoning tasks","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"that initializing a spatial encoder from the backbone of a feed-forward visual geometry foundation model will reliably extract usable 3D structure features from purely 2D image or video inputs without any 3D supervision","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Spatial-MLLM boosts MLLM spatial intelligence from 2D inputs via dual encoders initialized from geometry models plus space-aware sampling, claiming state-of-the-art results.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Spatial-MLLM equips multimodal language models with stronger 3D spatial reasoning using only 2D image and video inputs.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"34ef909fdb9faa426dfc176dea00e801f3d26d0aae596bb863cc382771694c27"},"source":{"id":"2505.23747","kind":"arxiv","version":1},"verdict":{"id":"0371eabb-c042-4858-9b50-d798baf2a849","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T08:31:09.946115Z","strongest_claim":"our spatial-MLLM achieves state-of-the-art performance in a wide range of visual-based spatial understanding and reasoning tasks","one_line_summary":"Spatial-MLLM boosts MLLM spatial intelligence from 2D inputs via dual encoders initialized from geometry models plus space-aware sampling, claiming state-of-the-art results.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"that initializing a spatial encoder from the backbone of a feed-forward visual geometry foundation model will reliably extract usable 3D structure features from purely 2D image or video inputs without any 3D supervision","pith_extraction_headline":"Spatial-MLLM equips multimodal language models with stronger 3D spatial reasoning using only 2D image and video inputs."},"references":{"count":71,"sample":[{"doi":"","year":2022,"title":"Flamingo: a visual language model for few-shot learning,","work_id":"2aa7036b-9bcf-4f86-9e62-24ceaf7eaea7","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models,","work_id":"9e1df70c-c5c8-459c-a56f-57934f6fd012","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"H. Liu, C. Li, Q. Wu, and Y . J. Lee, “Visual instruction tuning,”NeurIPS, 2024","work_id":"179f5e44-6c7c-4d41-833a-07c4934c1327","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","ref_index":4,"cited_arxiv_id":"2403.05530","is_internal_anchor":true},{"doi":"","year":2024,"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","ref_index":5,"cited_arxiv_id":"2410.21276","is_internal_anchor":true}],"resolved_work":71,"snapshot_sha256":"ef71c75076133de46a9772759062ec25ae45903f6e0db99ba9a89d0437c298f8","internal_anchors":21},"formal_canon":{"evidence_count":2,"snapshot_sha256":"cc2ba9c2d17a48c092bf35e3b73d66b74e462d2389c374e9b88259aa2142b9f9"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"0371eabb-c042-4858-9b50-d798baf2a849"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"18t/E3O/q/Xfh2owI5ydQ1azh5OFKLru0VG4OiuO5irr8JZiSLoVLNVMIU5thb+YALyfYaYjIihbB+Ckcs6VBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T15:38:32.691994Z"},"content_sha256":"87e1296882b34d25c3a0e6c4708005ed4f3480aee1fc8684cf0259bf4e6737ae","schema_version":"1.0","event_id":"sha256:87e1296882b34d25c3a0e6c4708005ed4f3480aee1fc8684cf0259bf4e6737ae"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/HS5S2APJEFBPZE5OGTD5PAFGWU/bundle.json","state_url":"https://pith.science/pith/HS5S2APJEFBPZE5OGTD5PAFGWU/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/HS5S2APJEFBPZE5OGTD5PAFGWU/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-08T15:38:32Z","links":{"resolver":"https://pith.science/pith/HS5S2APJEFBPZE5OGTD5PAFGWU","bundle":"https://pith.science/pith/HS5S2APJEFBPZE5OGTD5PAFGWU/bundle.json","state":"https://pith.science/pith/HS5S2APJEFBPZE5OGTD5PAFGWU/state.json","well_known_bundle":"https://pith.science/.well-known/pith/HS5S2APJEFBPZE5OGTD5PAFGWU/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:HS5S2APJEFBPZE5OGTD5PAFGWU","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"147e0d7fc614f806400cd4c3204facd9d7188c981e2bb33eef44a872e1a7b4bd","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-29T17:59:04Z","title_canon_sha256":"f3bf03a7423e285470a8a9b66de09bad5f8da3292e1f37ffdaba5dd24a172cd6"},"schema_version":"1.0","source":{"id":"2505.23747","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2505.23747","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2505.23747v1","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.23747","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"HS5S2APJEFBP","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"HS5S2APJEFBPZE5O","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"HS5S2APJ","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:87e1296882b34d25c3a0e6c4708005ed4f3480aee1fc8684cf0259bf4e6737ae","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"our spatial-MLLM achieves state-of-the-art performance in a wide range of visual-based spatial understanding and reasoning tasks"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"that initializing a spatial encoder from the backbone of a feed-forward visual geometry foundation model will reliably extract usable 3D structure features from purely 2D image or video inputs without any 3D supervision"},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Spatial-MLLM boosts MLLM spatial intelligence from 2D inputs via dual encoders initialized from geometry models plus space-aware sampling, claiming state-of-the-art results."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Spatial-MLLM equips multimodal language models with stronger 3D spatial reasoning using only 2D image and video inputs."}],"snapshot_sha256":"34ef909fdb9faa426dfc176dea00e801f3d26d0aae596bb863cc382771694c27"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"cc2ba9c2d17a48c092bf35e3b73d66b74e462d2389c374e9b88259aa2142b9f9"},"paper":{"abstract_excerpt":"Recent advancements in Multimodal Large Language Models (MLLMs) have significantly enhanced performance on 2D visual tasks. However, improving their spatial intelligence remains a challenge. Existing 3D MLLMs always rely on additional 3D or 2.5D data to incorporate spatial awareness, restricting their utility in scenarios with only 2D inputs, such as images or videos. In this paper, we present Spatial-MLLM, a novel framework for visual-based spatial reasoning from purely 2D observations. Unlike conventional video MLLMs which rely on CLIP-based visual encoders optimized for semantic understandi","authors_text":"Diankun Wu, Fangfu Liu, Yi-Hsin Hung, Yueqi Duan","cross_cats":["cs.AI","cs.LG"],"headline":"Spatial-MLLM equips multimodal language models with stronger 3D spatial reasoning using only 2D image and video inputs.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-29T17:59:04Z","title":"Spatial-MLLM: Boosting MLLM Capabilities in Visual-based Spatial Intelligence"},"references":{"count":71,"internal_anchors":21,"resolved_work":71,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Flamingo: a visual language model for few-shot learning,","work_id":"2aa7036b-9bcf-4f86-9e62-24ceaf7eaea7","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models,","work_id":"9e1df70c-c5c8-459c-a56f-57934f6fd012","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"H. Liu, C. Li, Q. Wu, and Y . J. Lee, “Visual instruction tuning,”NeurIPS, 2024","work_id":"179f5e44-6c7c-4d41-833a-07c4934c1327","year":2024},{"cited_arxiv_id":"2403.05530","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Gemini 1.5: Unlocking multimodal understanding across millions of tokens of context","work_id":"80e3e977-f1bb-4c83-8d0c-1ab0a0c5c3f1","year":2024},{"cited_arxiv_id":"2410.21276","doi":"","is_internal_anchor":true,"ref_index":5,"title":"GPT-4o System Card","work_id":"f37bf1c7-4964-4e56-9762-d20da8d9009f","year":2024}],"snapshot_sha256":"ef71c75076133de46a9772759062ec25ae45903f6e0db99ba9a89d0437c298f8"},"source":{"id":"2505.23747","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-16T08:31:09.946115Z","id":"0371eabb-c042-4858-9b50-d798baf2a849","model_set":{"reader":"grok-4.3"},"one_line_summary":"Spatial-MLLM boosts MLLM spatial intelligence from 2D inputs via dual encoders initialized from geometry models plus space-aware sampling, claiming state-of-the-art results.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Spatial-MLLM equips multimodal language models with stronger 3D spatial reasoning using only 2D image and video inputs.","strongest_claim":"our spatial-MLLM achieves state-of-the-art performance in a wide range of visual-based spatial understanding and reasoning tasks","weakest_assumption":"that initializing a spatial encoder from the backbone of a feed-forward visual geometry foundation model will reliably extract usable 3D structure features from purely 2D image or video inputs without any 3D supervision"}},"verdict_id":"0371eabb-c042-4858-9b50-d798baf2a849"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:11a7453195d5d09ea85f621e5261818c699e4bb21f9ad377fa34a367b52c142e","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"147e0d7fc614f806400cd4c3204facd9d7188c981e2bb33eef44a872e1a7b4bd","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-29T17:59:04Z","title_canon_sha256":"f3bf03a7423e285470a8a9b66de09bad5f8da3292e1f37ffdaba5dd24a172cd6"},"schema_version":"1.0","source":{"id":"2505.23747","kind":"arxiv","version":1}},"canonical_sha256":"3cbb2d01e92142fc93ae34c7d780a6b5155da6d2bc87b6c8db06e493bb4d329c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"3cbb2d01e92142fc93ae34c7d780a6b5155da6d2bc87b6c8db06e493bb4d329c","first_computed_at":"2026-05-17T23:38:48.490187Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.490187Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ZGOBgO4Wnyt0JQYUQ2nwImSoqw7p/as1jAusCNV56A8gX/mskEMIB5ZVeAlcbjDD/htmzB7DWJRNHw9ZWqjmAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.490665Z","signed_message":"canonical_sha256_bytes"},"source_id":"2505.23747","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:11a7453195d5d09ea85f621e5261818c699e4bb21f9ad377fa34a367b52c142e","sha256:87e1296882b34d25c3a0e6c4708005ed4f3480aee1fc8684cf0259bf4e6737ae"],"state_sha256":"e5fb984dabac1068d40e7ebf4d8122abbc62855cbd151638124dbd3671f0c05e"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"pH6BBCYU+DtObvTwsGUicPdYZaacl5logclwXwcFar72CIFjHKn6H7mPtLkLW3eUUltZrTUyBgqiRhOJJHzACg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-08T15:38:32.696700Z","bundle_sha256":"36b3bd0e7a4131e26296a486f5f042a20ce65b31a8880fa7e91c7be6e737d664"}}