{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:2WN3GPRCKQ2TMBKQ64TBJAFVOJ","short_pith_number":"pith:2WN3GPRC","schema_version":"1.0","canonical_sha256":"d59bb33e225435360550f7261480b5725e4bfd3bf18888ca48694a4efcfa2cf4","source":{"kind":"arxiv","id":"2605.29416","version":1},"attestation_state":"computed","paper":{"title":"3DVLA: Enhancing Vision-Language-Action Models via 3D Spatial and Instance Understanding","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.RO","authors_text":"Bingqing Wei, Yongtao Wang, Yousen Tang, Zhongyu Xia","submitted_at":"2026-05-28T06:07:57Z","abstract_excerpt":"Vision-Language-Action models have achieved remarkable progress in robotic manipulation, yet they suffer from a critical limitation: a lack of 3D scene understanding. This deficiency manifests as three intertwined challenges: weak extraction of 3D spatial positions without enforcing multi-view consistency, inadequate 3D instance understanding, and fragile reasoning under occlusion. Although mature 3D perception methods exist, their direct integration into VLA pipelines is hindered by architectural incompatibility and by heavy reliance on costly instance-level annotations. To address the above "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.29416","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.RO","submitted_at":"2026-05-28T06:07:57Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"24cf268ec1f39f0bbd4b5cdf2214d039f5a73c7a42c48a54b2c58b4d89ec70c0","abstract_canon_sha256":"d26258d6ba15aef4cdd69b7ee653a44f01e12be94ca91a0b30af7e4cc4325e25"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T01:05:37.730248Z","signature_b64":"js6D2XWxoHo81b+Wii6yOrYH4IaqLGZtSR8IBMqRxeabgcNcx/y76NB1K4nGQxZVV2PcJIfojKJWORK6bV7WBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d59bb33e225435360550f7261480b5725e4bfd3bf18888ca48694a4efcfa2cf4","last_reissued_at":"2026-05-29T01:05:37.729756Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T01:05:37.729756Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"3DVLA: Enhancing Vision-Language-Action Models via 3D Spatial and Instance Understanding","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.CV"],"primary_cat":"cs.RO","authors_text":"Bingqing Wei, Yongtao Wang, Yousen Tang, Zhongyu Xia","submitted_at":"2026-05-28T06:07:57Z","abstract_excerpt":"Vision-Language-Action models have achieved remarkable progress in robotic manipulation, yet they suffer from a critical limitation: a lack of 3D scene understanding. This deficiency manifests as three intertwined challenges: weak extraction of 3D spatial positions without enforcing multi-view consistency, inadequate 3D instance understanding, and fragile reasoning under occlusion. Although mature 3D perception methods exist, their direct integration into VLA pipelines is hindered by architectural incompatibility and by heavy reliance on costly instance-level annotations. To address the above "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.29416","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.29416/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.29416","created_at":"2026-05-29T01:05:37.729838+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.29416v1","created_at":"2026-05-29T01:05:37.729838+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.29416","created_at":"2026-05-29T01:05:37.729838+00:00"},{"alias_kind":"pith_short_12","alias_value":"2WN3GPRCKQ2T","created_at":"2026-05-29T01:05:37.729838+00:00"},{"alias_kind":"pith_short_16","alias_value":"2WN3GPRCKQ2TMBKQ","created_at":"2026-05-29T01:05:37.729838+00:00"},{"alias_kind":"pith_short_8","alias_value":"2WN3GPRC","created_at":"2026-05-29T01:05:37.729838+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ","json":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ.json","graph_json":"https://pith.science/api/pith-number/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/graph.json","events_json":"https://pith.science/api/pith-number/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/events.json","paper":"https://pith.science/paper/2WN3GPRC"},"agent_actions":{"view_html":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ","download_json":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ.json","view_paper":"https://pith.science/paper/2WN3GPRC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.29416&json=true","fetch_graph":"https://pith.science/api/pith-number/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/graph.json","fetch_events":"https://pith.science/api/pith-number/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/action/storage_attestation","attest_author":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/action/author_attestation","sign_citation":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/action/citation_signature","submit_replication":"https://pith.science/pith/2WN3GPRCKQ2TMBKQ64TBJAFVOJ/action/replication_record"}},"created_at":"2026-05-29T01:05:37.729838+00:00","updated_at":"2026-05-29T01:05:37.729838+00:00"}