{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:JT2EVYTQNKN245AQDI7SJHF6LK","short_pith_number":"pith:JT2EVYTQ","schema_version":"1.0","canonical_sha256":"4cf44ae2706a9bae74101a3f249cbe5a9b580bd17d3f730a8976da4aa09a2a2c","source":{"kind":"arxiv","id":"2606.05833","version":1},"attestation_state":"computed","paper":{"title":"Learning Geometric Representations from Videos for Spatial Intelligent Multimodal Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Haibo Wang, Lifu Huang","submitted_at":"2026-06-04T08:11:12Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) excel at 2D semantic understanding but lack intrinsic 3D awareness, resulting in representations that fail to maintain geometric and spatial consistency across video frames. Given the scarcity of large-scale 3D data, we present GeoVR, a novel framework that learns geometric representations using purely 2D video sequences. This approach effectively restructures the semantic latent space within MLLMs to unlock spatial intelligence. Rather than employing superficial feature mixing, GeoVR reshapes the internal representations of the MLLM by distilling geome"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.05833","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-06-04T08:11:12Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"b022b7d4c7217034465c80120a09c91637885822f5baf681fae9878a41e12c24","abstract_canon_sha256":"1fd1d37448468cec7e5bf5c528d20d4200013c8e92807f854e9b9246b9cc45c2"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-05T01:15:04.965056Z","signature_b64":"5gqKeYXcFDpN4GWsWsxebuG7w6AnaNsRqQeSClGgPnjX/c2JzenXAF9EhulplR70EsAml5AuAzLzu+oIbOKuDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"4cf44ae2706a9bae74101a3f249cbe5a9b580bd17d3f730a8976da4aa09a2a2c","last_reissued_at":"2026-06-05T01:15:04.964454Z","signature_status":"signed_v1","first_computed_at":"2026-06-05T01:15:04.964454Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Learning Geometric Representations from Videos for Spatial Intelligent Multimodal Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Haibo Wang, Lifu Huang","submitted_at":"2026-06-04T08:11:12Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) excel at 2D semantic understanding but lack intrinsic 3D awareness, resulting in representations that fail to maintain geometric and spatial consistency across video frames. Given the scarcity of large-scale 3D data, we present GeoVR, a novel framework that learns geometric representations using purely 2D video sequences. This approach effectively restructures the semantic latent space within MLLMs to unlock spatial intelligence. Rather than employing superficial feature mixing, GeoVR reshapes the internal representations of the MLLM by distilling geome"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.05833","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.05833/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.05833","created_at":"2026-06-05T01:15:04.964544+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.05833v1","created_at":"2026-06-05T01:15:04.964544+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.05833","created_at":"2026-06-05T01:15:04.964544+00:00"},{"alias_kind":"pith_short_12","alias_value":"JT2EVYTQNKN2","created_at":"2026-06-05T01:15:04.964544+00:00"},{"alias_kind":"pith_short_16","alias_value":"JT2EVYTQNKN245AQ","created_at":"2026-06-05T01:15:04.964544+00:00"},{"alias_kind":"pith_short_8","alias_value":"JT2EVYTQ","created_at":"2026-06-05T01:15:04.964544+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK","json":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK.json","graph_json":"https://pith.science/api/pith-number/JT2EVYTQNKN245AQDI7SJHF6LK/graph.json","events_json":"https://pith.science/api/pith-number/JT2EVYTQNKN245AQDI7SJHF6LK/events.json","paper":"https://pith.science/paper/JT2EVYTQ"},"agent_actions":{"view_html":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK","download_json":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK.json","view_paper":"https://pith.science/paper/JT2EVYTQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.05833&json=true","fetch_graph":"https://pith.science/api/pith-number/JT2EVYTQNKN245AQDI7SJHF6LK/graph.json","fetch_events":"https://pith.science/api/pith-number/JT2EVYTQNKN245AQDI7SJHF6LK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK/action/storage_attestation","attest_author":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK/action/author_attestation","sign_citation":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK/action/citation_signature","submit_replication":"https://pith.science/pith/JT2EVYTQNKN245AQDI7SJHF6LK/action/replication_record"}},"created_at":"2026-06-05T01:15:04.964544+00:00","updated_at":"2026-06-05T01:15:04.964544+00:00"}