{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:BQDFLROG4STK3HB6IUQSMANU74","short_pith_number":"pith:BQDFLROG","canonical_record":{"source":{"id":"2508.00795","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2025-08-01T17:23:49Z","cross_cats_sorted":[],"title_canon_sha256":"a444d10e3f74deb76778d265a9478199e5e5cd0b7f6e5b90f7db6678eca95ff5","abstract_canon_sha256":"5e72f95ab4a2684d67183c98a995b8cb49ff1d51ed5898e203b6071544df66aa"},"schema_version":"1.0"},"canonical_sha256":"0c0655c5c6e4a6ad9c3e45212601b4ff3af1553ebbe9880dfd774649f1429ae8","source":{"kind":"arxiv","id":"2508.00795","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2508.00795","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2508.00795v1","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2508.00795","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"BQDFLROG4STK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BQDFLROG4STK3HB6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BQDFLROG","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:BQDFLROG4STK3HB6IUQSMANU74","target":"record","payload":{"canonical_record":{"source":{"id":"2508.00795","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2025-08-01T17:23:49Z","cross_cats_sorted":[],"title_canon_sha256":"a444d10e3f74deb76778d265a9478199e5e5cd0b7f6e5b90f7db6678eca95ff5","abstract_canon_sha256":"5e72f95ab4a2684d67183c98a995b8cb49ff1d51ed5898e203b6071544df66aa"},"schema_version":"1.0"},"canonical_sha256":"0c0655c5c6e4a6ad9c3e45212601b4ff3af1553ebbe9880dfd774649f1429ae8","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.057425Z","signature_b64":"0817qYkn3aS7+kZ0jfdFXetT60zoHBIj+RqJ1kwMqGCZoDOEredNBLB8NobSgEzrMHpH4T1GCbIOiMWRZfqFBg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0c0655c5c6e4a6ad9c3e45212601b4ff3af1553ebbe9880dfd774649f1429ae8","last_reissued_at":"2026-05-17T23:38:50.056926Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.056926Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2508.00795","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"JX313c0W7Ld7CB+sDvHuZFawhLTSY1LNeW7SAt1C4ZMiqXkC+ws6MdhAWo8NU2Nq96tVokjTRHT35QSOEfCrCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-19T17:50:09.190645Z"},"content_sha256":"1c6d3984364ab89b96e5c3e63517026f51d76a6961662b5fe7ed1bd857da8298","schema_version":"1.0","event_id":"sha256:1c6d3984364ab89b96e5c3e63517026f51d76a6961662b5fe7ed1bd857da8298"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:BQDFLROG4STK3HB6IUQSMANU74","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Video Generators are Robot Policies","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Video generation models can serve as robot policies by predicting future behavior frames and extracting actions from them.","cross_cats":[],"primary_cat":"cs.RO","authors_text":"Carl Vondrick, Junbang Liang, Paarth Shah, Pavel Tokmakov, Rares Ambrus, Ruoshi Liu, Sruthi Sudhakar","submitted_at":"2025-08-01T17:23:49Z","abstract_excerpt":"Despite tremendous progress in dexterous manipulation, current visuomotor policies remain fundamentally limited by two challenges: they struggle to generalize under perceptual or behavioral distribution shifts, and their performance is constrained by the size of human demonstration data. In this paper, we use video generation as a proxy for robot policy learning to address both limitations simultaneously. We propose Video Policy, a modular framework that combines video and action generation that can be trained end-to-end. Our results demonstrate that learning to generate videos of robot behavi"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"learning to generate videos of robot behavior allows for the extraction of policies with minimal demonstration data, significantly improving robustness and sample efficiency","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"that the video generator produces videos whose implied actions are both feasible and optimal for the robot, without introducing dynamics that do not match the physical system","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Training models to generate videos of robot actions produces policies that generalize better to new objects and tasks while using far less demonstration data than standard behavior cloning.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Video generation models can serve as robot policies by predicting future behavior frames and extracting actions from them.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"c8167f37032029487d31e32dbc2a7f3f97e5345635d09bf9c8b64b944f14f53a"},"source":{"id":"2508.00795","kind":"arxiv","version":1},"verdict":{"id":"833cdadc-8905-4dc4-81dd-c9cbccea6062","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T21:40:58.620100Z","strongest_claim":"learning to generate videos of robot behavior allows for the extraction of policies with minimal demonstration data, significantly improving robustness and sample efficiency","one_line_summary":"Training models to generate videos of robot actions produces policies that generalize better to new objects and tasks while using far less demonstration data than standard behavior cloning.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"that the video generator produces videos whose implied actions are both feasible and optimal for the robot, without introducing dynamics that do not match the physical system","pith_extraction_headline":"Video generation models can serve as robot policies by predicting future behavior frames and extracting actions from them."},"references":{"count":63,"sample":[{"doi":"","year":1995,"title":"M. Bain and C. Sammut. A framework for behavioural cloning. In Machine intelligence 15 , pages 103–129, 1995","work_id":"0d1cbebc-440b-42b4-b9b3-db36b6cf40be","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"C. Chi, S. Feng, Y . Du, Z. Xu, E. Cousineau, B. Burchfiel, and S. Song. Diffusion policy: Visuomotor policy learning via action diffusion. In RSS, 2023","work_id":"15569f87-3ba9-410a-b163-979639add640","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"A. Brohan, N. Brown, J. Carbajal, Y . Chebotar, J. Dabis, C. Finn, K. Gopalakrishnan, K. Haus- man, A. Herzog, J. Hsu, et al. RT-1: Robotics transformer for real-world control at scale. In RSS, 2022","work_id":"150cd0c4-b39a-4fe7-9119-ad8e42133820","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"O. M. Team, D. Ghosh, H. Walke, K. Pertsch, K. Black, O. Mees, S. Dasari, J. Hejna, T. Kreiman, C. Xu, et al. Octo: An open-source generalist robot policy. In RSS, 2024","work_id":"8b84caca-9bd4-4a9d-8ed5-0ca81a86f547","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"K. Black, N. Brown, D. Driess, A. Esmail, M. Equi, C. Finn, N. Fusai, L. Groom, K. Hausman, B. Ichter, et al. π0: A vision-language-action flow model for general robot control. RSS, 2025","work_id":"a26bc36c-e311-4408-812d-bb59153fcbe0","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":63,"snapshot_sha256":"b36c0947e9a8c9133a217e4eab11789d405cdf155fcf08939490dd7799d135ef","internal_anchors":7},"formal_canon":{"evidence_count":2,"snapshot_sha256":"2c7317f3b91e5c496f90a5d0adf37f5b3197e23f104d56f0ef7aef2b31cd8300"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"833cdadc-8905-4dc4-81dd-c9cbccea6062"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"cENjDJf4RMJZVMXn11IbCRCNeRirzZrEipKrtNrXA90WbPgiotIq+pRaKXtXXiVHwAdFbEz5kigiyy3BpxDNAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-19T17:50:09.191667Z"},"content_sha256":"4e31a5c40eea5b9b6e726b4eaf87fddc303ad4e18f313f4b0ef7baa3a74aa255","schema_version":"1.0","event_id":"sha256:4e31a5c40eea5b9b6e726b4eaf87fddc303ad4e18f313f4b0ef7baa3a74aa255"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/BQDFLROG4STK3HB6IUQSMANU74/bundle.json","state_url":"https://pith.science/pith/BQDFLROG4STK3HB6IUQSMANU74/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/BQDFLROG4STK3HB6IUQSMANU74/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-19T17:50:09Z","links":{"resolver":"https://pith.science/pith/BQDFLROG4STK3HB6IUQSMANU74","bundle":"https://pith.science/pith/BQDFLROG4STK3HB6IUQSMANU74/bundle.json","state":"https://pith.science/pith/BQDFLROG4STK3HB6IUQSMANU74/state.json","well_known_bundle":"https://pith.science/.well-known/pith/BQDFLROG4STK3HB6IUQSMANU74/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:BQDFLROG4STK3HB6IUQSMANU74","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5e72f95ab4a2684d67183c98a995b8cb49ff1d51ed5898e203b6071544df66aa","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2025-08-01T17:23:49Z","title_canon_sha256":"a444d10e3f74deb76778d265a9478199e5e5cd0b7f6e5b90f7db6678eca95ff5"},"schema_version":"1.0","source":{"id":"2508.00795","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2508.00795","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2508.00795v1","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2508.00795","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"BQDFLROG4STK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BQDFLROG4STK3HB6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BQDFLROG","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:4e31a5c40eea5b9b6e726b4eaf87fddc303ad4e18f313f4b0ef7baa3a74aa255","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"learning to generate videos of robot behavior allows for the extraction of policies with minimal demonstration data, significantly improving robustness and sample efficiency"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"that the video generator produces videos whose implied actions are both feasible and optimal for the robot, without introducing dynamics that do not match the physical system"},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Training models to generate videos of robot actions produces policies that generalize better to new objects and tasks while using far less demonstration data than standard behavior cloning."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Video generation models can serve as robot policies by predicting future behavior frames and extracting actions from them."}],"snapshot_sha256":"c8167f37032029487d31e32dbc2a7f3f97e5345635d09bf9c8b64b944f14f53a"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"2c7317f3b91e5c496f90a5d0adf37f5b3197e23f104d56f0ef7aef2b31cd8300"},"paper":{"abstract_excerpt":"Despite tremendous progress in dexterous manipulation, current visuomotor policies remain fundamentally limited by two challenges: they struggle to generalize under perceptual or behavioral distribution shifts, and their performance is constrained by the size of human demonstration data. In this paper, we use video generation as a proxy for robot policy learning to address both limitations simultaneously. We propose Video Policy, a modular framework that combines video and action generation that can be trained end-to-end. Our results demonstrate that learning to generate videos of robot behavi","authors_text":"Carl Vondrick, Junbang Liang, Paarth Shah, Pavel Tokmakov, Rares Ambrus, Ruoshi Liu, Sruthi Sudhakar","cross_cats":[],"headline":"Video generation models can serve as robot policies by predicting future behavior frames and extracting actions from them.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2025-08-01T17:23:49Z","title":"Video Generators are Robot Policies"},"references":{"count":63,"internal_anchors":7,"resolved_work":63,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"M. Bain and C. Sammut. A framework for behavioural cloning. In Machine intelligence 15 , pages 103–129, 1995","work_id":"0d1cbebc-440b-42b4-b9b3-db36b6cf40be","year":1995},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"C. Chi, S. Feng, Y . Du, Z. Xu, E. Cousineau, B. Burchfiel, and S. Song. Diffusion policy: Visuomotor policy learning via action diffusion. In RSS, 2023","work_id":"15569f87-3ba9-410a-b163-979639add640","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"A. Brohan, N. Brown, J. Carbajal, Y . Chebotar, J. Dabis, C. Finn, K. Gopalakrishnan, K. Haus- man, A. Herzog, J. Hsu, et al. RT-1: Robotics transformer for real-world control at scale. In RSS, 2022","work_id":"150cd0c4-b39a-4fe7-9119-ad8e42133820","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"O. M. Team, D. Ghosh, H. Walke, K. Pertsch, K. Black, O. Mees, S. Dasari, J. Hejna, T. Kreiman, C. Xu, et al. Octo: An open-source generalist robot policy. In RSS, 2024","work_id":"8b84caca-9bd4-4a9d-8ed5-0ca81a86f547","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"K. Black, N. Brown, D. Driess, A. Esmail, M. Equi, C. Finn, N. Fusai, L. Groom, K. Hausman, B. Ichter, et al. π0: A vision-language-action flow model for general robot control. RSS, 2025","work_id":"a26bc36c-e311-4408-812d-bb59153fcbe0","year":2025}],"snapshot_sha256":"b36c0947e9a8c9133a217e4eab11789d405cdf155fcf08939490dd7799d135ef"},"source":{"id":"2508.00795","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T21:40:58.620100Z","id":"833cdadc-8905-4dc4-81dd-c9cbccea6062","model_set":{"reader":"grok-4.3"},"one_line_summary":"Training models to generate videos of robot actions produces policies that generalize better to new objects and tasks while using far less demonstration data than standard behavior cloning.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Video generation models can serve as robot policies by predicting future behavior frames and extracting actions from them.","strongest_claim":"learning to generate videos of robot behavior allows for the extraction of policies with minimal demonstration data, significantly improving robustness and sample efficiency","weakest_assumption":"that the video generator produces videos whose implied actions are both feasible and optimal for the robot, without introducing dynamics that do not match the physical system"}},"verdict_id":"833cdadc-8905-4dc4-81dd-c9cbccea6062"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:1c6d3984364ab89b96e5c3e63517026f51d76a6961662b5fe7ed1bd857da8298","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5e72f95ab4a2684d67183c98a995b8cb49ff1d51ed5898e203b6071544df66aa","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.RO","submitted_at":"2025-08-01T17:23:49Z","title_canon_sha256":"a444d10e3f74deb76778d265a9478199e5e5cd0b7f6e5b90f7db6678eca95ff5"},"schema_version":"1.0","source":{"id":"2508.00795","kind":"arxiv","version":1}},"canonical_sha256":"0c0655c5c6e4a6ad9c3e45212601b4ff3af1553ebbe9880dfd774649f1429ae8","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0c0655c5c6e4a6ad9c3e45212601b4ff3af1553ebbe9880dfd774649f1429ae8","first_computed_at":"2026-05-17T23:38:50.056926Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.056926Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"0817qYkn3aS7+kZ0jfdFXetT60zoHBIj+RqJ1kwMqGCZoDOEredNBLB8NobSgEzrMHpH4T1GCbIOiMWRZfqFBg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.057425Z","signed_message":"canonical_sha256_bytes"},"source_id":"2508.00795","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:1c6d3984364ab89b96e5c3e63517026f51d76a6961662b5fe7ed1bd857da8298","sha256:4e31a5c40eea5b9b6e726b4eaf87fddc303ad4e18f313f4b0ef7baa3a74aa255"],"state_sha256":"38bd6c43e23d1dc69beebdaf09a642837ba200dbfb92ca5d2af1477c1b2a304f"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kRfvgJxgTFgwWCoOzkJ/Z0HYBLFdZzi7i52myJXzQe0RZxvllsbvDpC81qGQ/NaoJT+XPwl+/Q8jlyCbCVEgAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-19T17:50:09.195530Z","bundle_sha256":"79a1d6a0c5bc035d43fcd08c771b8bc7376afcd7335896675023235ee75bd85b"}}