{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:HLMHWWYZNIVC2OAJNNBWF55CAI","short_pith_number":"pith:HLMHWWYZ","schema_version":"1.0","canonical_sha256":"3ad87b5b196a2a2d38096b4362f7a20220a2ad1d4fc592bb9bcc53e2caf09734","source":{"kind":"arxiv","id":"2605.14950","version":1},"attestation_state":"computed","paper":{"title":"Evo-Depth: A Lightweight Depth-Enhanced Vision-Language-Action Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Bing Cheng, Bo Zhao, Gen Li, Hongyi Cai, Jiting Liu, Junchi Yan, Kai Ye, Mingkang Dong, Nuobei Zhu, Tao Lin, Yilei Zhong, Yinxinyu Chen, Yiran Mao, Yunhe Li, Yuqian Fu, Yuxin Du, Zewei Ye","submitted_at":"2026-05-14T15:21:36Z","abstract_excerpt":"Vision-Language-Action models have emerged as a promising paradigm for robotic manipulation by unifying perception, language grounding, and action generation. However, they often struggle in scenarios requiring precise spatial understanding, as current VLA models primarily rely on 2D visual representations that lack depth information and detailed spatial relationships. While recent approaches incorporate explicit 3D inputs such as depth maps or point clouds to address this issue, they often increase system complexity, require additional sensors, and remain vulnerable to sensing noise and recon"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.14950","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-05-14T15:21:36Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"2eb18183f368ce7e3fd5b724992af0f3034813e2c71e42924dca79c03bc12830","abstract_canon_sha256":"d97bc93a85c47c0b9921d60fe67a10dd658953dbca384b681167114e689b1b81"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:55.389304Z","signature_b64":"wekMnPB41DunpNkSSeICqPDEZPjq6wkTI5g8aP3D6oTLUWDKsf63gQ09T+daM0/kYG65C04cfM+bZ4GegwocCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"3ad87b5b196a2a2d38096b4362f7a20220a2ad1d4fc592bb9bcc53e2caf09734","last_reissued_at":"2026-05-17T23:38:55.388691Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:55.388691Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Evo-Depth: A Lightweight Depth-Enhanced Vision-Language-Action Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Bing Cheng, Bo Zhao, Gen Li, Hongyi Cai, Jiting Liu, Junchi Yan, Kai Ye, Mingkang Dong, Nuobei Zhu, Tao Lin, Yilei Zhong, Yinxinyu Chen, Yiran Mao, Yunhe Li, Yuqian Fu, Yuxin Du, Zewei Ye","submitted_at":"2026-05-14T15:21:36Z","abstract_excerpt":"Vision-Language-Action models have emerged as a promising paradigm for robotic manipulation by unifying perception, language grounding, and action generation. However, they often struggle in scenarios requiring precise spatial understanding, as current VLA models primarily rely on 2D visual representations that lack depth information and detailed spatial relationships. While recent approaches incorporate explicit 3D inputs such as depth maps or point clouds to address this issue, they often increase system complexity, require additional sensors, and remain vulnerable to sensing noise and recon"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.14950","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.14950","created_at":"2026-05-17T23:38:55.388785+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.14950v1","created_at":"2026-05-17T23:38:55.388785+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14950","created_at":"2026-05-17T23:38:55.388785+00:00"},{"alias_kind":"pith_short_12","alias_value":"HLMHWWYZNIVC","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"HLMHWWYZNIVC2OAJ","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"HLMHWWYZ","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI","json":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI.json","graph_json":"https://pith.science/api/pith-number/HLMHWWYZNIVC2OAJNNBWF55CAI/graph.json","events_json":"https://pith.science/api/pith-number/HLMHWWYZNIVC2OAJNNBWF55CAI/events.json","paper":"https://pith.science/paper/HLMHWWYZ"},"agent_actions":{"view_html":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI","download_json":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI.json","view_paper":"https://pith.science/paper/HLMHWWYZ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.14950&json=true","fetch_graph":"https://pith.science/api/pith-number/HLMHWWYZNIVC2OAJNNBWF55CAI/graph.json","fetch_events":"https://pith.science/api/pith-number/HLMHWWYZNIVC2OAJNNBWF55CAI/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI/action/timestamp_anchor","attest_storage":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI/action/storage_attestation","attest_author":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI/action/author_attestation","sign_citation":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI/action/citation_signature","submit_replication":"https://pith.science/pith/HLMHWWYZNIVC2OAJNNBWF55CAI/action/replication_record"}},"created_at":"2026-05-17T23:38:55.388785+00:00","updated_at":"2026-05-17T23:38:55.388785+00:00"}