{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:B3MFR4N7YWZSXPIWNIYG62VWMM","short_pith_number":"pith:B3MFR4N7","canonical_record":{"source":{"id":"2409.16283","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2024-09-24T17:57:33Z","cross_cats_sorted":["cs.CV","cs.LG","eess.IV"],"title_canon_sha256":"3f9ca604e7808f1291056c97bf44e06cc35ffd74c485e0535211086a76b907c9","abstract_canon_sha256":"34fb22a907aed15928b0a7e293e229a831eafa8c063e11d912c7ca183fdceb80"},"schema_version":"1.0"},"canonical_sha256":"0ed858f1bfc5b32bbd166a306f6ab6632284e5e44e96a97f8ee3ab25c759d4b4","source":{"kind":"arxiv","id":"2409.16283","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2409.16283","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2409.16283v1","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2409.16283","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"B3MFR4N7YWZS","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"B3MFR4N7YWZSXPIW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"B3MFR4N7","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:B3MFR4N7YWZSXPIWNIYG62VWMM","target":"record","payload":{"canonical_record":{"source":{"id":"2409.16283","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2024-09-24T17:57:33Z","cross_cats_sorted":["cs.CV","cs.LG","eess.IV"],"title_canon_sha256":"3f9ca604e7808f1291056c97bf44e06cc35ffd74c485e0535211086a76b907c9","abstract_canon_sha256":"34fb22a907aed15928b0a7e293e229a831eafa8c063e11d912c7ca183fdceb80"},"schema_version":"1.0"},"canonical_sha256":"0ed858f1bfc5b32bbd166a306f6ab6632284e5e44e96a97f8ee3ab25c759d4b4","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:52.572957Z","signature_b64":"yKTtTdgTvE3U3YnSpBOogbmgf8vZK1RiU5GGuC3y9Olz+9lSmnK/5ZiOD29Z/pxB/XqUJJE+GKvXdwD7fT36Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0ed858f1bfc5b32bbd166a306f6ab6632284e5e44e96a97f8ee3ab25c759d4b4","last_reissued_at":"2026-05-17T23:38:52.572500Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:52.572500Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2409.16283","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"QGkM3ad9xnDelJOdJiyqaTBZooZKJCUtgKOBZv7FBbSAYrk1JgbpNrmlFW4B8U3fYaKi1CSczZW4XarRga3ZCQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T21:26:56.853768Z"},"content_sha256":"0aac050ad07ee711144474909ca4eb19fa2bf3be0d9ae7298126b7bd84461d7d","schema_version":"1.0","event_id":"sha256:0aac050ad07ee711144474909ca4eb19fa2bf3be0d9ae7298126b7bd84461d7d"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:B3MFR4N7YWZSXPIWNIYG62VWMM","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Gen2Act: Human Video Generation in Novel Scenarios enables Generalizable Robot Manipulation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Generating human videos from web data lets a single robot policy manipulate unseen objects and novel motions without fine-tuning.","cross_cats":["cs.CV","cs.LG","eess.IV"],"primary_cat":"cs.RO","authors_text":"Abhinav Gupta, Carl Doersch, Debidatta Dwibedi, Dhruv Shah, Dorsa Sadigh, Fei Xia, Homanga Bharadhwaj, Sean Kirmani, Shubham Tulsiani, Ted Xiao","submitted_at":"2024-09-24T17:57:33Z","abstract_excerpt":"How can robot manipulation policies generalize to novel tasks involving unseen object types and new motions? In this paper, we provide a solution in terms of predicting motion information from web data through human video generation and conditioning a robot policy on the generated video. Instead of attempting to scale robot data collection which is expensive, we show how we can leverage video generation models trained on easily available web data, for enabling generalization. Our approach Gen2Act casts language-conditioned manipulation as zero-shot human video generation followed by execution "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Our results on diverse real-world scenarios show how Gen2Act enables manipulating unseen object types and performing novel motions for tasks not present in the robot data.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That videos generated by a pre-trained model from web data provide sufficiently accurate and transferable motion information for a robot policy to execute novel tasks without any fine-tuning of the video model or additional domain adaptation.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Gen2Act enables generalizable robot manipulation for unseen objects and novel motions by using zero-shot human video generation from web data to condition a policy trained on an order of magnitude less robot interaction data.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Generating human videos from web data lets a single robot policy manipulate unseen objects and novel motions without fine-tuning.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"1bbe4e4338af2e5d31282bc788cb896d47082880c17f6866ee3925319f5771eb"},"source":{"id":"2409.16283","kind":"arxiv","version":1},"verdict":{"id":"1bf39bd6-76af-49ba-909c-68383b3c8f09","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T12:13:10.717237Z","strongest_claim":"Our results on diverse real-world scenarios show how Gen2Act enables manipulating unseen object types and performing novel motions for tasks not present in the robot data.","one_line_summary":"Gen2Act enables generalizable robot manipulation for unseen objects and novel motions by using zero-shot human video generation from web data to condition a policy trained on an order of magnitude less robot interaction data.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That videos generated by a pre-trained model from web data provide sufficiently accurate and transferable motion information for a robot policy to execute novel tasks without any fine-tuning of the video model or additional domain adaptation.","pith_extraction_headline":"Generating human videos from web data lets a single robot policy manipulate unseen objects and novel motions without fine-tuning."},"references":{"count":61,"sample":[{"doi":"","year":2022,"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","ref_index":1,"cited_arxiv_id":"2212.06817","is_internal_anchor":true},{"doi":"","year":2024,"title":"Roboagent: Generalization and efficiency in robot manipulation via semantic augmen- tations and action chunking,","work_id":"cf539f96-21ec-42f3-9343-020ac356a037","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset","work_id":"13253de2-3d89-415c-8c2f-3adb25d4c337","ref_index":4,"cited_arxiv_id":"2403.12945","is_internal_anchor":true},{"doi":"","year":2022,"title":"R3M: A Universal Visual Representation for Robot Manipulation","work_id":"1fb6c1b7-913d-4a89-bbad-842fdb5fca1d","ref_index":5,"cited_arxiv_id":"2203.12601","is_internal_anchor":true},{"doi":"","year":2023,"title":"Where are we in the search for an artificial vi- sual cortex for embodied intelligence?","work_id":"a30a45a3-bdd7-45bf-a4f4-66d48e56bd4a","ref_index":6,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":61,"snapshot_sha256":"9e6d99e7c63ae2fdb0ab688cfa1d7b4abf4342d125363741532386a8d130fec2","internal_anchors":12},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"1bf39bd6-76af-49ba-909c-68383b3c8f09"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:52Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"5IiWHQkMcmj7/U5UQnI+UjpuRTEtVgWJh7imUGbGWfcx26QgI5qc57euPvMFR4gHcRMZy8RsuoEgJGLBaB1oDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T21:26:56.854722Z"},"content_sha256":"9573058811ae0222fc5417b2edaf810434c79374c2894a1ff88f27865290e4b6","schema_version":"1.0","event_id":"sha256:9573058811ae0222fc5417b2edaf810434c79374c2894a1ff88f27865290e4b6"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/B3MFR4N7YWZSXPIWNIYG62VWMM/bundle.json","state_url":"https://pith.science/pith/B3MFR4N7YWZSXPIWNIYG62VWMM/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/B3MFR4N7YWZSXPIWNIYG62VWMM/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T21:26:56Z","links":{"resolver":"https://pith.science/pith/B3MFR4N7YWZSXPIWNIYG62VWMM","bundle":"https://pith.science/pith/B3MFR4N7YWZSXPIWNIYG62VWMM/bundle.json","state":"https://pith.science/pith/B3MFR4N7YWZSXPIWNIYG62VWMM/state.json","well_known_bundle":"https://pith.science/.well-known/pith/B3MFR4N7YWZSXPIWNIYG62VWMM/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:B3MFR4N7YWZSXPIWNIYG62VWMM","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"34fb22a907aed15928b0a7e293e229a831eafa8c063e11d912c7ca183fdceb80","cross_cats_sorted":["cs.CV","cs.LG","eess.IV"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2024-09-24T17:57:33Z","title_canon_sha256":"3f9ca604e7808f1291056c97bf44e06cc35ffd74c485e0535211086a76b907c9"},"schema_version":"1.0","source":{"id":"2409.16283","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2409.16283","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"arxiv_version","alias_value":"2409.16283v1","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2409.16283","created_at":"2026-05-17T23:38:52Z"},{"alias_kind":"pith_short_12","alias_value":"B3MFR4N7YWZS","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"B3MFR4N7YWZSXPIW","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"B3MFR4N7","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:9573058811ae0222fc5417b2edaf810434c79374c2894a1ff88f27865290e4b6","target":"graph","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Our results on diverse real-world scenarios show how Gen2Act enables manipulating unseen object types and performing novel motions for tasks not present in the robot data."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That videos generated by a pre-trained model from web data provide sufficiently accurate and transferable motion information for a robot policy to execute novel tasks without any fine-tuning of the video model or additional domain adaptation."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Gen2Act enables generalizable robot manipulation for unseen objects and novel motions by using zero-shot human video generation from web data to condition a policy trained on an order of magnitude less robot interaction data."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Generating human videos from web data lets a single robot policy manipulate unseen objects and novel motions without fine-tuning."}],"snapshot_sha256":"1bbe4e4338af2e5d31282bc788cb896d47082880c17f6866ee3925319f5771eb"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"How can robot manipulation policies generalize to novel tasks involving unseen object types and new motions? In this paper, we provide a solution in terms of predicting motion information from web data through human video generation and conditioning a robot policy on the generated video. Instead of attempting to scale robot data collection which is expensive, we show how we can leverage video generation models trained on easily available web data, for enabling generalization. Our approach Gen2Act casts language-conditioned manipulation as zero-shot human video generation followed by execution ","authors_text":"Abhinav Gupta, Carl Doersch, Debidatta Dwibedi, Dhruv Shah, Dorsa Sadigh, Fei Xia, Homanga Bharadhwaj, Sean Kirmani, Shubham Tulsiani, Ted Xiao","cross_cats":["cs.CV","cs.LG","eess.IV"],"headline":"Generating human videos from web data lets a single robot policy manipulate unseen objects and novel motions without fine-tuning.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2024-09-24T17:57:33Z","title":"Gen2Act: Human Video Generation in Novel Scenarios enables Generalizable Robot Manipulation"},"references":{"count":61,"internal_anchors":12,"resolved_work":61,"sample":[{"cited_arxiv_id":"2212.06817","doi":"","is_internal_anchor":true,"ref_index":1,"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Roboagent: Generalization and efficiency in robot manipulation via semantic augmen- tations and action chunking,","work_id":"cf539f96-21ec-42f3-9343-020ac356a037","year":2024},{"cited_arxiv_id":"2403.12945","doi":"","is_internal_anchor":true,"ref_index":4,"title":"DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset","work_id":"13253de2-3d89-415c-8c2f-3adb25d4c337","year":2024},{"cited_arxiv_id":"2203.12601","doi":"","is_internal_anchor":true,"ref_index":5,"title":"R3M: A Universal Visual Representation for Robot Manipulation","work_id":"1fb6c1b7-913d-4a89-bbad-842fdb5fca1d","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":6,"title":"Where are we in the search for an artificial vi- sual cortex for embodied intelligence?","work_id":"a30a45a3-bdd7-45bf-a4f4-66d48e56bd4a","year":2023}],"snapshot_sha256":"9e6d99e7c63ae2fdb0ab688cfa1d7b4abf4342d125363741532386a8d130fec2"},"source":{"id":"2409.16283","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T12:13:10.717237Z","id":"1bf39bd6-76af-49ba-909c-68383b3c8f09","model_set":{"reader":"grok-4.3"},"one_line_summary":"Gen2Act enables generalizable robot manipulation for unseen objects and novel motions by using zero-shot human video generation from web data to condition a policy trained on an order of magnitude less robot interaction data.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Generating human videos from web data lets a single robot policy manipulate unseen objects and novel motions without fine-tuning.","strongest_claim":"Our results on diverse real-world scenarios show how Gen2Act enables manipulating unseen object types and performing novel motions for tasks not present in the robot data.","weakest_assumption":"That videos generated by a pre-trained model from web data provide sufficiently accurate and transferable motion information for a robot policy to execute novel tasks without any fine-tuning of the video model or additional domain adaptation."}},"verdict_id":"1bf39bd6-76af-49ba-909c-68383b3c8f09"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0aac050ad07ee711144474909ca4eb19fa2bf3be0d9ae7298126b7bd84461d7d","target":"record","created_at":"2026-05-17T23:38:52Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"34fb22a907aed15928b0a7e293e229a831eafa8c063e11d912c7ca183fdceb80","cross_cats_sorted":["cs.CV","cs.LG","eess.IV"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2024-09-24T17:57:33Z","title_canon_sha256":"3f9ca604e7808f1291056c97bf44e06cc35ffd74c485e0535211086a76b907c9"},"schema_version":"1.0","source":{"id":"2409.16283","kind":"arxiv","version":1}},"canonical_sha256":"0ed858f1bfc5b32bbd166a306f6ab6632284e5e44e96a97f8ee3ab25c759d4b4","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0ed858f1bfc5b32bbd166a306f6ab6632284e5e44e96a97f8ee3ab25c759d4b4","first_computed_at":"2026-05-17T23:38:52.572500Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:52.572500Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"yKTtTdgTvE3U3YnSpBOogbmgf8vZK1RiU5GGuC3y9Olz+9lSmnK/5ZiOD29Z/pxB/XqUJJE+GKvXdwD7fT36Dw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:52.572957Z","signed_message":"canonical_sha256_bytes"},"source_id":"2409.16283","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0aac050ad07ee711144474909ca4eb19fa2bf3be0d9ae7298126b7bd84461d7d","sha256:9573058811ae0222fc5417b2edaf810434c79374c2894a1ff88f27865290e4b6"],"state_sha256":"335abd2892a6a4f751f75ebf84dd14430e69c9aed65a735360ac0882bfa45757"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Immfb9o95vJ5nJB9/XpCafpjFeQilO7GKwBN/BXucWDeIK0RzBgsQWPbWOKfrSCMWTBaNAy9jR2P+7y0qt7+Bw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T21:26:56.858941Z","bundle_sha256":"cb93ae1436c136eead9a0c7698fd4b11ed24f005eb458e1d930e79c16c2f7ff5"}}