{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:3BQG7JNYYOPASRBS4IQZP4RSQE","short_pith_number":"pith:3BQG7JNY","schema_version":"1.0","canonical_sha256":"d8606fa5b8c39e094432e22197f232811660225bb504a34ec0326c9d364a5fc5","source":{"kind":"arxiv","id":"2505.11709","version":3},"attestation_state":"computed","paper":{"title":"EgoDex: Learning Dexterous Manipulation from Large-Scale Egocentric Video","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"EgoDex supplies 829 hours of egocentric video with native 3D hand and finger tracking to train imitation learning policies for dexterous manipulation.","cross_cats":["cs.LG","cs.RO"],"primary_cat":"cs.CV","authors_text":"David J. Yoon, Jian Zhang, Mouli Sivapurapu, Peide Huang, Ryan Hoque","submitted_at":"2025-05-16T21:34:47Z","abstract_excerpt":"Imitation learning for manipulation has a well-known data scarcity problem. Unlike natural language and 2D computer vision, there is no Internet-scale corpus of data for dexterous manipulation. One appealing option is egocentric human video, a passively scalable data source. However, existing large-scale datasets such as Ego4D do not have native hand pose annotations and do not focus on object manipulation. To this end, we use Apple Vision Pro to collect EgoDex: the largest and most diverse dataset of dexterous human manipulation to date. EgoDex has 829 hours of egocentric video with paired 3D"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2505.11709","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-05-16T21:34:47Z","cross_cats_sorted":["cs.LG","cs.RO"],"title_canon_sha256":"eb392e657853695a7528f04f996ef35df4555ba2fc2a1e1762d2e671d691fe85","abstract_canon_sha256":"9f63adc591cb5330f82c858e5f152e65893f67d48ff9608087808f23b866b26a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:51.029241Z","signature_b64":"JLNhjOdc6zB2/XbZp8iVitwri9MVDCbQL8xFQDqmMi8twxL/gO5ktfQtoCP00pNKbFYECY4WNUMDB4qk33CZAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d8606fa5b8c39e094432e22197f232811660225bb504a34ec0326c9d364a5fc5","last_reissued_at":"2026-05-17T23:38:51.028822Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:51.028822Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"EgoDex: Learning Dexterous Manipulation from Large-Scale Egocentric Video","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"EgoDex supplies 829 hours of egocentric video with native 3D hand and finger tracking to train imitation learning policies for dexterous manipulation.","cross_cats":["cs.LG","cs.RO"],"primary_cat":"cs.CV","authors_text":"David J. Yoon, Jian Zhang, Mouli Sivapurapu, Peide Huang, Ryan Hoque","submitted_at":"2025-05-16T21:34:47Z","abstract_excerpt":"Imitation learning for manipulation has a well-known data scarcity problem. Unlike natural language and 2D computer vision, there is no Internet-scale corpus of data for dexterous manipulation. One appealing option is egocentric human video, a passively scalable data source. However, existing large-scale datasets such as Ego4D do not have native hand pose annotations and do not focus on object manipulation. To this end, we use Apple Vision Pro to collect EgoDex: the largest and most diverse dataset of dexterous human manipulation to date. EgoDex has 829 hours of egocentric video with paired 3D"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"EgoDex is the largest and most diverse dataset of dexterous human manipulation to date with 829 hours of egocentric video with paired 3D hand and finger tracking data collected at the time of recording.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the on-device SLAM and multi-camera tracking from Apple Vision Pro produces sufficiently accurate and unbiased 3D hand poses that can be used to train policies which generalize beyond the collected tabletop tasks to real robotic hardware.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"EgoDex delivers the largest egocentric dataset with native 3D hand tracking for dexterous manipulation, enabling imitation learning policies for hand trajectory prediction on 194 tasks.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"EgoDex supplies 829 hours of egocentric video with native 3D hand and finger tracking to train imitation learning policies for dexterous manipulation.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"e4fd9374d3d6a2ce9b6c79acb8ec1cf4162a4c03c4bc42fdd89add7ab3df6722"},"source":{"id":"2505.11709","kind":"arxiv","version":3},"verdict":{"id":"036c40f2-d469-46c6-82cc-cf55d95835ba","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T15:35:48.144579Z","strongest_claim":"EgoDex is the largest and most diverse dataset of dexterous human manipulation to date with 829 hours of egocentric video with paired 3D hand and finger tracking data collected at the time of recording.","one_line_summary":"EgoDex delivers the largest egocentric dataset with native 3D hand tracking for dexterous manipulation, enabling imitation learning policies for hand trajectory prediction on 194 tasks.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the on-device SLAM and multi-camera tracking from Apple Vision Pro produces sufficiently accurate and unbiased 3D hand poses that can be used to train policies which generalize beyond the collected tabletop tasks to real robotic hardware.","pith_extraction_headline":"EgoDex supplies 829 hours of egocentric video with native 3D hand and finger tracking to train imitation learning policies for dexterous manipulation."},"references":{"count":16,"sample":[{"doi":"","year":null,"title":"Maple: Encoding dexterous robotic manipulation priors learned from egocentric videos","work_id":"3cc564bd-641e-48bd-b547-d35363b71a05","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2026,"title":"Kristen Grauman, Andrew Westbury, Eugene Byrne, Zachary Chavis, Antonino Furnari, Rohit Gird- har, Jackson Hamburger, Hao Jiang, Miao Liu, Xingyu Liu, Miguel Martin, Tushar Nagarajan, Ilija Radosavovi","work_id":"b54e7f09-392b-453f-85d2-4b921a5e167b","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"X-il: Exploring the design space of imitation learning policies","work_id":"d55bd7b4-7f57-482b-8c5e-6f007ea1af82","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2026,"title":"Scaling robot supervision to hundreds of hours with robo- turk: Robotic manipulation dataset through human reasoning and dexterity","work_id":"583aa526-62c0-491c-a545-276b2850e592","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Ar- mada: Augmented reality for robot manipulation and robot-free data acquisition.arXiv preprint arXiv:2412.10631,","work_id":"20667670-1396-4dcf-9b38-13d1df591f54","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":16,"snapshot_sha256":"b885edcf40a299d913e64d1441769ff8dc14ab9cdeab2fc49d3329bf436c3985","internal_anchors":3},"formal_canon":{"evidence_count":2,"snapshot_sha256":"caf14012c8aef84626a1e7650fd2b7ad22879d9cb24eb57c2f9d3911882d407d"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2505.11709","created_at":"2026-05-17T23:38:51.028888+00:00"},{"alias_kind":"arxiv_version","alias_value":"2505.11709v3","created_at":"2026-05-17T23:38:51.028888+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2505.11709","created_at":"2026-05-17T23:38:51.028888+00:00"},{"alias_kind":"pith_short_12","alias_value":"3BQG7JNYYOPA","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"3BQG7JNYYOPASRBS","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"3BQG7JNY","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":34,"internal_anchor_count":34,"sample":[{"citing_arxiv_id":"2511.18127","citing_title":"SFHand: Learning Embodied Manipulation by Streaming Egocentric 3D Hand Forecasting","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2603.03243","citing_title":"HoMMI: Learning Whole-Body Mobile Manipulation from Human Demonstrations","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05945","citing_title":"MobileEgo Anywhere: Open Infrastructure for long horizon egocentric data on commodity hardware","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18553","citing_title":"StableHand: Quality-Aware Flow Matching for World-Space Dual-Hand Motion Estimation from Egocentric Video","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16743","citing_title":"LACE: Latent Visual Representation for Cross-Embodiment Learning","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16797","citing_title":"EgoKit: Towards Unified Low-Cost Egocentric Data Collection with Heterogeneous Devices","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15298","citing_title":"PhysBrain 1.0 Technical Report","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2508.13073","citing_title":"Large VLM-based Vision-Language-Action Models for Robotic Manipulation: A Survey","ref_index":233,"is_internal_anchor":true},{"citing_arxiv_id":"2507.15493","citing_title":"GR-3 Technical Report","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2602.06949","citing_title":"DreamDojo: A Generalist Robot World Model from Large-Scale Human Videos","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2507.12898","citing_title":"Vidar: Embodied Video Diffusion Model for Generalist Manipulation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2603.02115","citing_title":"Robometer: Scaling General-Purpose Robotic Reward Models via Trajectory Comparisons","ref_index":125,"is_internal_anchor":true},{"citing_arxiv_id":"2603.11755","citing_title":"Controllable Egocentric Video Generation via Occlusion-Aware Sparse 3D Hand Joints","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05945","citing_title":"MobileEgo Anywhere: Open Infrastructure for long horizon egocentric data on commodity hardware","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05945","citing_title":"MobileEgo Anywhere: Open Infrastructure for long horizon egocentric data on commodity hardware","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13083","citing_title":"TouchAnything: A Dataset and Framework for Bimanual Tactile Estimation from Egocentric Video","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2603.29844","citing_title":"DIAL: Decoupling Intent and Action via Latent World Modeling for End-to-End VLA","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05945","citing_title":"MobileEgo Anywhere: Open Infrastructure for long horizon egocentric data on commodity hardware","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12090","citing_title":"World Action Models: The Next Frontier in Embodied AI","ref_index":206,"is_internal_anchor":true},{"citing_arxiv_id":"2512.13030","citing_title":"Motus: A Unified Latent Action World Model","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03637","citing_title":"Bridging the Embodiment Gap: Disentangled Cross-Embodiment Video Editing","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26848","citing_title":"STARRY: Spatial-Temporal Action-Centric World Modeling for Robotic Manipulation","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22615","citing_title":"GazeVLA: Learning Human Intention for Robotic Manipulation","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05945","citing_title":"MobileEgo Anywhere: Open Infrastructure for long horizon egocentric data on commodity hardware","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2602.15922","citing_title":"World Action Models are Zero-shot Policies","ref_index":37,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE","json":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE.json","graph_json":"https://pith.science/api/pith-number/3BQG7JNYYOPASRBS4IQZP4RSQE/graph.json","events_json":"https://pith.science/api/pith-number/3BQG7JNYYOPASRBS4IQZP4RSQE/events.json","paper":"https://pith.science/paper/3BQG7JNY"},"agent_actions":{"view_html":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE","download_json":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE.json","view_paper":"https://pith.science/paper/3BQG7JNY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2505.11709&json=true","fetch_graph":"https://pith.science/api/pith-number/3BQG7JNYYOPASRBS4IQZP4RSQE/graph.json","fetch_events":"https://pith.science/api/pith-number/3BQG7JNYYOPASRBS4IQZP4RSQE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE/action/storage_attestation","attest_author":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE/action/author_attestation","sign_citation":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE/action/citation_signature","submit_replication":"https://pith.science/pith/3BQG7JNYYOPASRBS4IQZP4RSQE/action/replication_record"}},"created_at":"2026-05-17T23:38:51.028888+00:00","updated_at":"2026-05-17T23:38:51.028888+00:00"}