{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:2RCKNUMH66EZWGGSE6X4BKL32A","short_pith_number":"pith:2RCKNUMH","canonical_record":{"source":{"id":"2310.10639","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2023-10-16T17:57:23Z","cross_cats_sorted":[],"title_canon_sha256":"1afa1eb71c2abc380fef589399b6b9ce5c674f28e4c2d67bdf69760ca03cf0eb","abstract_canon_sha256":"3a5b18f67532489bed26bb37ff3209f173b78c96f1e8dc820dd2ab374d3cfa83"},"schema_version":"1.0"},"canonical_sha256":"d444a6d187f7899b18d227afc0a97bd0068e1f401790682db06cd5a4f03c3c19","source":{"kind":"arxiv","id":"2310.10639","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2310.10639","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2310.10639v1","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.10639","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"2RCKNUMH66EZ","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"2RCKNUMH66EZWGGS","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"2RCKNUMH","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:2RCKNUMH66EZWGGSE6X4BKL32A","target":"record","payload":{"canonical_record":{"source":{"id":"2310.10639","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2023-10-16T17:57:23Z","cross_cats_sorted":[],"title_canon_sha256":"1afa1eb71c2abc380fef589399b6b9ce5c674f28e4c2d67bdf69760ca03cf0eb","abstract_canon_sha256":"3a5b18f67532489bed26bb37ff3209f173b78c96f1e8dc820dd2ab374d3cfa83"},"schema_version":"1.0"},"canonical_sha256":"d444a6d187f7899b18d227afc0a97bd0068e1f401790682db06cd5a4f03c3c19","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.917523Z","signature_b64":"nqn+u3XR1cV6GQY/kmoXqPfIyTLmMlxQMfXC5W4WVvndviuo89mKTeN+nwyvTY7B1drUrKxKVVXRjgJr+DsNAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d444a6d187f7899b18d227afc0a97bd0068e1f401790682db06cd5a4f03c3c19","last_reissued_at":"2026-05-17T23:38:48.917021Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.917021Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2310.10639","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"0o0XrcBHwbsS5ruf/iKokox+0PGWI8vf9JuIqt7Gcz9su+hLtc1VMqDQqw6BzE51Hhq/O1JeyHRVGqsUNL0tCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T01:00:09.500263Z"},"content_sha256":"61719e19f1b79086c5f39cb7cf94b1c99bb71834bf0ff8dea24065771998425c","schema_version":"1.0","event_id":"sha256:61719e19f1b79086c5f39cb7cf94b1c99bb71834bf0ff8dea24065771998425c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:2RCKNUMH66EZWGGSE6X4BKL32A","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Zero-Shot Robotic Manipulation with Pretrained Image-Editing Diffusion Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A finetuned image-editing diffusion model generates subgoal images that let a low-level policy complete manipulation tasks on objects and instructions absent from robot training data.","cross_cats":[],"primary_cat":"cs.RO","authors_text":"Aviral Kumar, Chelsea Finn, Homer Walke, Kevin Black, Mitsuhiko Nakamoto, Pranav Atreya, Sergey Levine","submitted_at":"2023-10-16T17:57:23Z","abstract_excerpt":"If generalist robots are to operate in truly unstructured environments, they need to be able to recognize and reason about novel objects and scenarios. Such objects and scenarios might not be present in the robot's own training data. We propose SuSIE, a method that leverages an image-editing diffusion model to act as a high-level planner by proposing intermediate subgoals that a low-level controller can accomplish. Specifically, we finetune InstructPix2Pix on video data, consisting of both human videos and robot rollouts, such that it outputs hypothetical future \"subgoal\" observations given th"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We achieve state-of-the-art results on the CALVIN benchmark, and also demonstrate robust generalization on real-world manipulation tasks, beating strong baselines that have access to privileged information or that utilize orders of magnitude more compute and training data.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That subgoal images generated by the finetuned diffusion model remain sufficiently accurate and executable for the low-level policy when the robot encounters objects, lighting, or instructions outside the finetuning distribution.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SuSIE uses a finetuned InstructPix2Pix diffusion model to propose subgoal images that guide a low-level goal-conditioned policy, achieving SOTA zero-shot performance on CALVIN and real-world manipulation.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A finetuned image-editing diffusion model generates subgoal images that let a low-level policy complete manipulation tasks on objects and instructions absent from robot training data.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"253f64bfecedb08354612fff5d71577a46cdc5be8de0d115200c18ebd5862462"},"source":{"id":"2310.10639","kind":"arxiv","version":1},"verdict":{"id":"489a6c1b-4720-498b-a71d-957d779ff58c","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T05:50:34.186645Z","strongest_claim":"We achieve state-of-the-art results on the CALVIN benchmark, and also demonstrate robust generalization on real-world manipulation tasks, beating strong baselines that have access to privileged information or that utilize orders of magnitude more compute and training data.","one_line_summary":"SuSIE uses a finetuned InstructPix2Pix diffusion model to propose subgoal images that guide a low-level goal-conditioned policy, achieving SOTA zero-shot performance on CALVIN and real-world manipulation.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That subgoal images generated by the finetuned diffusion model remain sufficiently accurate and executable for the low-level policy when the robot encounters objects, lighting, or instructions outside the finetuning distribution.","pith_extraction_headline":"A finetuned image-editing diffusion model generates subgoal images that let a low-level policy complete manipulation tasks on objects and instructions absent from robot training data."},"references":{"count":66,"sample":[{"doi":"","year":2023,"title":"Anurag Ajay, Yilun Du, Abhi Gupta, Joshua B. Tenenbaum, Tommi S. Jaakkola, and Pulkit Agrawal. Is conditional generative modeling all you need for decision making? In The Eleventh International Confer","work_id":"773059c2-43a3-4b27-89bc-0f2c9d64cff5","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Compositional founda- tion models for hierarchical planning","work_id":"31122d0b-b1dc-41ad-8818-e05c05898071","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Fitvid: Overﬁtting in pixel-level video prediction","work_id":"98b75ffa-1d61-4641-a59f-5967267b7d2c","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Robotic offline rl from internet videos via value-function pre-training","work_id":"51083cdf-e320-4588-923c-a475af2728ba","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Introducing ChatGPT and Whis- per APIs","work_id":"749457d5-cf67-4ff2-893e-3b8b82dcff9b","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":66,"snapshot_sha256":"fb7e009c4ae4a695ff90c22c427ee6aa1089b29b0a973bf0e7abd1ca1aba7c38","internal_anchors":17},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"489a6c1b-4720-498b-a71d-957d779ff58c"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"MJ89eg21laK2c3kM9h1UQhPjXc3B7yYy07Mq+oPEzqkh839p86bUmFAgEQVkhuqsuiJiACJkpNpc+YapKuUUDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T01:00:09.501260Z"},"content_sha256":"20acfac922c747017d3cf01ebd399df32966b471d6772454c220c3df310d5233","schema_version":"1.0","event_id":"sha256:20acfac922c747017d3cf01ebd399df32966b471d6772454c220c3df310d5233"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/2RCKNUMH66EZWGGSE6X4BKL32A/bundle.json","state_url":"https://pith.science/pith/2RCKNUMH66EZWGGSE6X4BKL32A/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/2RCKNUMH66EZWGGSE6X4BKL32A/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T01:00:09Z","links":{"resolver":"https://pith.science/pith/2RCKNUMH66EZWGGSE6X4BKL32A","bundle":"https://pith.science/pith/2RCKNUMH66EZWGGSE6X4BKL32A/bundle.json","state":"https://pith.science/pith/2RCKNUMH66EZWGGSE6X4BKL32A/state.json","well_known_bundle":"https://pith.science/.well-known/pith/2RCKNUMH66EZWGGSE6X4BKL32A/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:2RCKNUMH66EZWGGSE6X4BKL32A","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"3a5b18f67532489bed26bb37ff3209f173b78c96f1e8dc820dd2ab374d3cfa83","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2023-10-16T17:57:23Z","title_canon_sha256":"1afa1eb71c2abc380fef589399b6b9ce5c674f28e4c2d67bdf69760ca03cf0eb"},"schema_version":"1.0","source":{"id":"2310.10639","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2310.10639","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2310.10639v1","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.10639","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"2RCKNUMH66EZ","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"2RCKNUMH66EZWGGS","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"2RCKNUMH","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:20acfac922c747017d3cf01ebd399df32966b471d6772454c220c3df310d5233","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We achieve state-of-the-art results on the CALVIN benchmark, and also demonstrate robust generalization on real-world manipulation tasks, beating strong baselines that have access to privileged information or that utilize orders of magnitude more compute and training data."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That subgoal images generated by the finetuned diffusion model remain sufficiently accurate and executable for the low-level policy when the robot encounters objects, lighting, or instructions outside the finetuning distribution."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SuSIE uses a finetuned InstructPix2Pix diffusion model to propose subgoal images that guide a low-level goal-conditioned policy, achieving SOTA zero-shot performance on CALVIN and real-world manipulation."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A finetuned image-editing diffusion model generates subgoal images that let a low-level policy complete manipulation tasks on objects and instructions absent from robot training data."}],"snapshot_sha256":"253f64bfecedb08354612fff5d71577a46cdc5be8de0d115200c18ebd5862462"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"If generalist robots are to operate in truly unstructured environments, they need to be able to recognize and reason about novel objects and scenarios. Such objects and scenarios might not be present in the robot's own training data. We propose SuSIE, a method that leverages an image-editing diffusion model to act as a high-level planner by proposing intermediate subgoals that a low-level controller can accomplish. Specifically, we finetune InstructPix2Pix on video data, consisting of both human videos and robot rollouts, such that it outputs hypothetical future \"subgoal\" observations given th","authors_text":"Aviral Kumar, Chelsea Finn, Homer Walke, Kevin Black, Mitsuhiko Nakamoto, Pranav Atreya, Sergey Levine","cross_cats":[],"headline":"A finetuned image-editing diffusion model generates subgoal images that let a low-level policy complete manipulation tasks on objects and instructions absent from robot training data.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2023-10-16T17:57:23Z","title":"Zero-Shot Robotic Manipulation with Pretrained Image-Editing Diffusion Models"},"references":{"count":66,"internal_anchors":17,"resolved_work":66,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Anurag Ajay, Yilun Du, Abhi Gupta, Joshua B. Tenenbaum, Tommi S. Jaakkola, and Pulkit Agrawal. Is conditional generative modeling all you need for decision making? In The Eleventh International Confer","work_id":"773059c2-43a3-4b27-89bc-0f2c9d64cff5","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Compositional founda- tion models for hierarchical planning","work_id":"31122d0b-b1dc-41ad-8818-e05c05898071","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Fitvid: Overﬁtting in pixel-level video prediction","work_id":"98b75ffa-1d61-4641-a59f-5967267b7d2c","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Robotic offline rl from internet videos via value-function pre-training","work_id":"51083cdf-e320-4588-923c-a475af2728ba","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Introducing ChatGPT and Whis- per APIs","work_id":"749457d5-cf67-4ff2-893e-3b8b82dcff9b","year":2023}],"snapshot_sha256":"fb7e009c4ae4a695ff90c22c427ee6aa1089b29b0a973bf0e7abd1ca1aba7c38"},"source":{"id":"2310.10639","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-16T05:50:34.186645Z","id":"489a6c1b-4720-498b-a71d-957d779ff58c","model_set":{"reader":"grok-4.3"},"one_line_summary":"SuSIE uses a finetuned InstructPix2Pix diffusion model to propose subgoal images that guide a low-level goal-conditioned policy, achieving SOTA zero-shot performance on CALVIN and real-world manipulation.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A finetuned image-editing diffusion model generates subgoal images that let a low-level policy complete manipulation tasks on objects and instructions absent from robot training data.","strongest_claim":"We achieve state-of-the-art results on the CALVIN benchmark, and also demonstrate robust generalization on real-world manipulation tasks, beating strong baselines that have access to privileged information or that utilize orders of magnitude more compute and training data.","weakest_assumption":"That subgoal images generated by the finetuned diffusion model remain sufficiently accurate and executable for the low-level policy when the robot encounters objects, lighting, or instructions outside the finetuning distribution."}},"verdict_id":"489a6c1b-4720-498b-a71d-957d779ff58c"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:61719e19f1b79086c5f39cb7cf94b1c99bb71834bf0ff8dea24065771998425c","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"3a5b18f67532489bed26bb37ff3209f173b78c96f1e8dc820dd2ab374d3cfa83","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2023-10-16T17:57:23Z","title_canon_sha256":"1afa1eb71c2abc380fef589399b6b9ce5c674f28e4c2d67bdf69760ca03cf0eb"},"schema_version":"1.0","source":{"id":"2310.10639","kind":"arxiv","version":1}},"canonical_sha256":"d444a6d187f7899b18d227afc0a97bd0068e1f401790682db06cd5a4f03c3c19","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"d444a6d187f7899b18d227afc0a97bd0068e1f401790682db06cd5a4f03c3c19","first_computed_at":"2026-05-17T23:38:48.917021Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.917021Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"nqn+u3XR1cV6GQY/kmoXqPfIyTLmMlxQMfXC5W4WVvndviuo89mKTeN+nwyvTY7B1drUrKxKVVXRjgJr+DsNAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.917523Z","signed_message":"canonical_sha256_bytes"},"source_id":"2310.10639","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:61719e19f1b79086c5f39cb7cf94b1c99bb71834bf0ff8dea24065771998425c","sha256:20acfac922c747017d3cf01ebd399df32966b471d6772454c220c3df310d5233"],"state_sha256":"d5304b225aeadce51bbabffe06178476008daea0239a7ff8b80626ed60f2fee5"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"8VtK0H2HGVZ60rGDaXb63+xX+usbbYCB72yEIwMTVn1iexjvRJrR8Tom06CHbSELAp7fildiADJVeHnNHNr8AA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T01:00:09.505661Z","bundle_sha256":"43c91af57ce743a2ee0cad16dc801ad599a6b5d711dc7f2785693e02be69d019"}}