{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:LSX2NP7TXNCRPINJKLEHX3SW7N","short_pith_number":"pith:LSX2NP7T","schema_version":"1.0","canonical_sha256":"5cafa6bff3bb4517a1a952c87bee56fb7b0b35cac0008c0a841dad5b523d247f","source":{"kind":"arxiv","id":"2605.30307","version":1},"attestation_state":"computed","paper":{"title":"Grounded 3D-Aware Spatial Vision-Language Modeling","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"An-Chieh Cheng, Guanqi Zhan, Hongxu Yin, Jan Kautz, Ligeng Zhu, Pavlo Molchanov, Sifei Liu, Song Han, Vidya Nariyambut Murali, Xiaolong Wang, Yang Fu, Yao Lu, Yatai Ji, Zhaojing Yang, Zhuoyang Zhang","submitted_at":"2026-05-28T17:51:38Z","abstract_excerpt":"We present GR3D, a spatial vision language model equipped with three complementary grounding capabilities--explicit 2D grounding, implicit 2D grounding, and monocular 3D grounding--within a single framework. GR3D introduces an implicit grounding mechanism that identifies entity mentions during generation and inserts the corresponding region tokens into the text stream, allowing the model to reference visual evidence on the fly when producing spatial chain-of-thought responses. In parallel, a region-prompted monocular 3D grounding design predicts 3D bounding boxes in the camera view from ground"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.30307","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2026-05-28T17:51:38Z","cross_cats_sorted":[],"title_canon_sha256":"5a3febd33633f8945dedbb9e48fd8f493877cc04b4af9c9d2533fd5c99a0ec65","abstract_canon_sha256":"6aeeb5dc3b96e467c476e26e56c7a71ec8d24f9728e09558bc9275daf80692ad"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T02:06:15.872136Z","signature_b64":"+Tard0CzLiEottKICjtqrQyKxQ2XK5OO5SOzJ44BYTEOoztVNRWw3epyYyuSfPqLNcbD2Metm8R4qye2RSF4Ag==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5cafa6bff3bb4517a1a952c87bee56fb7b0b35cac0008c0a841dad5b523d247f","last_reissued_at":"2026-05-29T02:06:15.871795Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T02:06:15.871795Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Grounded 3D-Aware Spatial Vision-Language Modeling","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CV","authors_text":"An-Chieh Cheng, Guanqi Zhan, Hongxu Yin, Jan Kautz, Ligeng Zhu, Pavlo Molchanov, Sifei Liu, Song Han, Vidya Nariyambut Murali, Xiaolong Wang, Yang Fu, Yao Lu, Yatai Ji, Zhaojing Yang, Zhuoyang Zhang","submitted_at":"2026-05-28T17:51:38Z","abstract_excerpt":"We present GR3D, a spatial vision language model equipped with three complementary grounding capabilities--explicit 2D grounding, implicit 2D grounding, and monocular 3D grounding--within a single framework. GR3D introduces an implicit grounding mechanism that identifies entity mentions during generation and inserts the corresponding region tokens into the text stream, allowing the model to reference visual evidence on the fly when producing spatial chain-of-thought responses. In parallel, a region-prompted monocular 3D grounding design predicts 3D bounding boxes in the camera view from ground"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.30307","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.30307/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.30307","created_at":"2026-05-29T02:06:15.871852+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.30307v1","created_at":"2026-05-29T02:06:15.871852+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.30307","created_at":"2026-05-29T02:06:15.871852+00:00"},{"alias_kind":"pith_short_12","alias_value":"LSX2NP7TXNCR","created_at":"2026-05-29T02:06:15.871852+00:00"},{"alias_kind":"pith_short_16","alias_value":"LSX2NP7TXNCRPINJ","created_at":"2026-05-29T02:06:15.871852+00:00"},{"alias_kind":"pith_short_8","alias_value":"LSX2NP7T","created_at":"2026-05-29T02:06:15.871852+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N","json":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N.json","graph_json":"https://pith.science/api/pith-number/LSX2NP7TXNCRPINJKLEHX3SW7N/graph.json","events_json":"https://pith.science/api/pith-number/LSX2NP7TXNCRPINJKLEHX3SW7N/events.json","paper":"https://pith.science/paper/LSX2NP7T"},"agent_actions":{"view_html":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N","download_json":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N.json","view_paper":"https://pith.science/paper/LSX2NP7T","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.30307&json=true","fetch_graph":"https://pith.science/api/pith-number/LSX2NP7TXNCRPINJKLEHX3SW7N/graph.json","fetch_events":"https://pith.science/api/pith-number/LSX2NP7TXNCRPINJKLEHX3SW7N/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N/action/timestamp_anchor","attest_storage":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N/action/storage_attestation","attest_author":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N/action/author_attestation","sign_citation":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N/action/citation_signature","submit_replication":"https://pith.science/pith/LSX2NP7TXNCRPINJKLEHX3SW7N/action/replication_record"}},"created_at":"2026-05-29T02:06:15.871852+00:00","updated_at":"2026-05-29T02:06:15.871852+00:00"}