{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:HZZT5D5COUDPI5SMUUSUPG2KFO","short_pith_number":"pith:HZZT5D5C","schema_version":"1.0","canonical_sha256":"3e733e8fa27506f4764ca525479b4a2bbd13648030aa4beba980e06a14ac3915","source":{"kind":"arxiv","id":"2511.04670","version":1},"attestation_state":"computed","paper":{"title":"Cambrian-S: Towards Spatial Supersensing in Video","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Daohan Lu, Ellis Brown, Jihan Yang, Li Fei-Fei, Muhan Wang, Pinzhi Huang, Rob Fergus, Saining Xie, Shengbang Tong, Shusheng Yang, Yann LeCun, Yifan Xu, Yue Yu, Zihan Zheng, Zihao Yang","submitted_at":"2025-11-06T18:55:17Z","abstract_excerpt":"We argue that progress in true multimodal intelligence calls for a shift from reactive, task-driven systems and brute-force long context towards a broader paradigm of supersensing. We frame spatial supersensing as four stages beyond linguistic-only understanding: semantic perception (naming what is seen), streaming event cognition (maintaining memory across continuous experiences), implicit 3D spatial cognition (inferring the world behind pixels), and predictive world modeling (creating internal models that filter and organize information). Current benchmarks largely test only the early stages"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2511.04670","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-11-06T18:55:17Z","cross_cats_sorted":[],"title_canon_sha256":"a1bc624a5238b87187593c8704081e663b3c5288c34dfdf5ea63112008b5673d","abstract_canon_sha256":"7f8316be901d6f6ea7bb57184d4b487c3f52a4245f18c43452442aaa74744465"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:41:18.784947Z","signature_b64":"iLj38Lg2sGi5/U3khVpyJE9z54pUgU4U81C/b84bSn9tYdgXT4KNGJxhEB1kVhV41XntPIYlDfsKsDMxCaZnBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3e733e8fa27506f4764ca525479b4a2bbd13648030aa4beba980e06a14ac3915","last_reissued_at":"2026-05-18T03:41:18.784295Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:41:18.784295Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Cambrian-S: Towards Spatial Supersensing in Video","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"Daohan Lu, Ellis Brown, Jihan Yang, Li Fei-Fei, Muhan Wang, Pinzhi Huang, Rob Fergus, Saining Xie, Shengbang Tong, Shusheng Yang, Yann LeCun, Yifan Xu, Yue Yu, Zihan Zheng, Zihao Yang","submitted_at":"2025-11-06T18:55:17Z","abstract_excerpt":"We argue that progress in true multimodal intelligence calls for a shift from reactive, task-driven systems and brute-force long context towards a broader paradigm of supersensing. We frame spatial supersensing as four stages beyond linguistic-only understanding: semantic perception (naming what is seen), streaming event cognition (maintaining memory across continuous experiences), implicit 3D spatial cognition (inferring the world behind pixels), and predictive world modeling (creating internal models that filter and organize information). Current benchmarks largely test only the early stages"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2511.04670","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2511.04670","created_at":"2026-05-18T03:41:18.784381+00:00"},{"alias_kind":"arxiv_version","alias_value":"2511.04670v1","created_at":"2026-05-18T03:41:18.784381+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.04670","created_at":"2026-05-18T03:41:18.784381+00:00"},{"alias_kind":"pith_short_12","alias_value":"HZZT5D5COUDP","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"HZZT5D5COUDPI5SM","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"HZZT5D5C","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":28,"internal_anchor_count":28,"sample":[{"citing_arxiv_id":"2605.23176","citing_title":"DRIVESPATIAL: A Benchmark for Spatiotemporal Intelligence in VLMs for Autonomous Driving","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22558","citing_title":"GeoWeaver: Grounding Visual Tokens with Geometric Evidence before Scene Reasoning","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22536","citing_title":"SpaceDG: Benchmarking Spatial Intelligence under Visual Degradation","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15876","citing_title":"Unlocking Dense Metric Depth Estimation in VLMs","ref_index":75,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18678","citing_title":"Lance: Unified Multimodal Modeling by Multi-Task Synergy","ref_index":141,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15876","citing_title":"Unlocking Dense Metric Depth Estimation in VLMs","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18678","citing_title":"Lance: Unified Multimodal Modeling by Multi-Task Synergy","ref_index":140,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19242","citing_title":"PhyWorld: Physics-Faithful World Model for Video Generation","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13169","citing_title":"PanoWorld: Towards Spatial Supersensing in 360$^\\circ$ Panorama World","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08991","citing_title":"PinpointQA: A Dataset and Benchmark for Small Object-Centric Spatial Understanding in Indoor Videos","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08747","citing_title":"Done, But Not Sure: Disentangling World Completion from Self-Termination in Embodied Agents","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2603.27437","citing_title":"SpatialStack: Layered Geometry-Language Fusion for 3D VLM Spatial Reasoning","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13169","citing_title":"PanoWorld: Towards Spatial Supersensing in 360$^\\circ$ Panorama World","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08747","citing_title":"Done, But Not Sure: Disentangling World Completion from Self-Termination in Embodied Agents","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26934","citing_title":"World2VLM: Distilling World Model Imagination into VLMs for Dynamic Spatial Reasoning","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08747","citing_title":"Done, But Not Sure: Disentangling World Completion from Self-Termination in Embodied Agents","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10921","citing_title":"RoboMemArena: A Comprehensive and Challenging Robotic Memory Benchmark","ref_index":54,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10106","citing_title":"ViSRA: A Video-based Spatial Reasoning Agent for Multi-modal Large Language Models","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09874","citing_title":"EgoMemReason: A Memory-Driven Reasoning Benchmark for Long-Horizon Egocentric Video Understanding","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09449","citing_title":"SpaceMind++: Toward Allocentric Cognitive Maps for Spatially Grounded Video MLLMs","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24443","citing_title":"PhysNote: Self-Knowledge Notes for Evolvable Physical Reasoning in Vision-Language Model","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02134","citing_title":"Video Generation with Predictive Latents","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02130","citing_title":"From Where Things Are to What They Are For: Benchmarking Spatial-Functional Intelligence in Multimodal LLMs","ref_index":77,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08991","citing_title":"PinpointQA: A Dataset and Benchmark for Small Object-Centric Spatial Understanding in Indoor Videos","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07296","citing_title":"OpenSpatial: A Principled Data Engine for Empowering Spatial Intelligence","ref_index":54,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO","json":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO.json","graph_json":"https://pith.science/api/pith-number/HZZT5D5COUDPI5SMUUSUPG2KFO/graph.json","events_json":"https://pith.science/api/pith-number/HZZT5D5COUDPI5SMUUSUPG2KFO/events.json","paper":"https://pith.science/paper/HZZT5D5C"},"agent_actions":{"view_html":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO","download_json":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO.json","view_paper":"https://pith.science/paper/HZZT5D5C","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2511.04670&json=true","fetch_graph":"https://pith.science/api/pith-number/HZZT5D5COUDPI5SMUUSUPG2KFO/graph.json","fetch_events":"https://pith.science/api/pith-number/HZZT5D5COUDPI5SMUUSUPG2KFO/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO/action/timestamp_anchor","attest_storage":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO/action/storage_attestation","attest_author":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO/action/author_attestation","sign_citation":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO/action/citation_signature","submit_replication":"https://pith.science/pith/HZZT5D5COUDPI5SMUUSUPG2KFO/action/replication_record"}},"created_at":"2026-05-18T03:41:18.784381+00:00","updated_at":"2026-05-18T03:41:18.784381+00:00"}