{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:MVQLCYDAV5XYQOSFXCJZJTZ4P5","short_pith_number":"pith:MVQLCYDA","canonical_record":{"source":{"id":"2502.19417","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-02-26T18:58:41Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"3b1fdea721df6a4839273c19af265454d6171db77f88a82f2f1d4d419a24a8a0","abstract_canon_sha256":"2b64350ff6a13afc04f9ab60c2db11011409494aafe124453dddad25a14cfe73"},"schema_version":"1.0"},"canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","source":{"kind":"arxiv","id":"2502.19417","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2502.19417","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2502.19417v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.19417","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"MVQLCYDAV5XY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MVQLCYDAV5XYQOSF","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MVQLCYDA","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:MVQLCYDAV5XYQOSFXCJZJTZ4P5","target":"record","payload":{"canonical_record":{"source":{"id":"2502.19417","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-02-26T18:58:41Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"3b1fdea721df6a4839273c19af265454d6171db77f88a82f2f1d4d419a24a8a0","abstract_canon_sha256":"2b64350ff6a13afc04f9ab60c2db11011409494aafe124453dddad25a14cfe73"},"schema_version":"1.0"},"canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.872916Z","signature_b64":"5ov1jo/6hx8sNEsXK1RwGpHG4T72r7bVtAWm4UUR+H0Csh1P+g59n30fGDWUc9X34YMljJ49F8qAPg2mJIuJCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","last_reissued_at":"2026-05-17T23:38:49.872422Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.872422Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2502.19417","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"8hneb9me8MpXQpqwayI8Kh1c+qK9Kb7uSIbBvUwEeLxgex2o5uDbPFbBcrNqwOVODIsUNZESCDDKMdvuu7duBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T08:45:18.705013Z"},"content_sha256":"4d821e20505623b3776b09d9dd1b05ca44b9b9c8759fb51e1717723bf482cc56","schema_version":"1.0","event_id":"sha256:4d821e20505623b3776b09d9dd1b05ca44b9b9c8759fb51e1717723bf482cc56"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:MVQLCYDAV5XYQOSFXCJZJTZ4P5","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Hi Robot: Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.RO","authors_text":"Adrian Li-Bell, Anna Walling, Brian Ichter, Chelsea Finn, Danny Driess, Haohuan Wang, James Tanner, Karl Pertsch, Lachy Groom, Liyiming Ke, Lucy Xiaoyang Shi, Michael Equi, Niccolo Fusai, Quan Vuong, Sergey Levine","submitted_at":"2025-02-26T18:58:41Z","abstract_excerpt":"Generalist robots that can perform a range of different tasks in open-world settings must be able to not only reason about the steps needed to accomplish their goals, but also process complex instructions, prompts, and even feedback during task execution. Intricate instructions (e.g., \"Could you make me a vegetarian sandwich?\" or \"I don't like that one\") require not just the ability to physically perform the individual steps, but the ability to situate complex commands and feedback in the physical world. In this work, we describe a system that uses vision-language models in a hierarchical stru"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"our system can reason through complex prompts and incorporate situated feedback during task execution ('that's not trash')","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the high-level VLM can reliably map open-ended natural language and visual feedback into correct next-step decisions without hallucinating or misinterpreting physical context.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A hierarchical VLA architecture lets robots follow complex instructions and situated feedback by separating high-level reasoning from low-level control.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a911d2a90d0654a3b68453cba48345a116270656d7efdc74e4788fb3186b2344"},"source":{"id":"2502.19417","kind":"arxiv","version":2},"verdict":{"id":"b6905406-1385-4c55-88d8-097e0df14877","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:49:39.001717Z","strongest_claim":"our system can reason through complex prompts and incorporate situated feedback during task execution ('that's not trash')","one_line_summary":"A hierarchical VLA architecture lets robots follow complex instructions and situated feedback by separating high-level reasoning from low-level control.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the high-level VLM can reliably map open-ended natural language and visual feedback into correct next-step decisions without hallucinating or misinterpreting physical context.","pith_extraction_headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps."},"references":{"count":51,"sample":[{"doi":"","year":2024,"title":"RT-H: Action Hierarchies Using Language","work_id":"ecf7cf18-c1a8-4a6b-bc2a-fb165643aa0d","ref_index":1,"cited_arxiv_id":"2403.01823","is_internal_anchor":true},{"doi":"","year":2024,"title":"PaliGemma: A versatile 3B VLM for transfer","work_id":"df6f48b3-5792-47c7-9614-cb856ea31ad9","ref_index":2,"cited_arxiv_id":"2407.07726","is_internal_anchor":true},{"doi":"","year":2024,"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","ref_index":3,"cited_arxiv_id":"2410.24164","is_internal_anchor":true},{"doi":"","year":2022,"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","ref_index":4,"cited_arxiv_id":"2212.06817","is_internal_anchor":true},{"doi":"","year":2023,"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","ref_index":5,"cited_arxiv_id":"2307.15818","is_internal_anchor":true}],"resolved_work":51,"snapshot_sha256":"35b3c23e920f2dff07e8dada247e4fa541fc8b0e2a40cdbe3cf143930ac69cc7","internal_anchors":15},"formal_canon":{"evidence_count":1,"snapshot_sha256":"1ab0a28f36eda47538f2f25b3a78edc1a5d12c7e26d9cdde158c42a79a1a0378"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"b6905406-1385-4c55-88d8-097e0df14877"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:49Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"UTxA7SFTQpHfGFSOQG75iGhAeIsVudBkRZXrYbHV6g/+H8XRtgBzJciCbGbRJboRvEmE2m/65ZX9GPRyPyN5DA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T08:45:18.706089Z"},"content_sha256":"ca97a6a4f4e56c5d3fe43d4c5a391434de02d6a65c880b433e8ef928ef0110f7","schema_version":"1.0","event_id":"sha256:ca97a6a4f4e56c5d3fe43d4c5a391434de02d6a65c880b433e8ef928ef0110f7"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/bundle.json","state_url":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-23T08:45:18Z","links":{"resolver":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5","bundle":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/bundle.json","state":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/state.json","well_known_bundle":"https://pith.science/.well-known/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:MVQLCYDAV5XYQOSFXCJZJTZ4P5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"2b64350ff6a13afc04f9ab60c2db11011409494aafe124453dddad25a14cfe73","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-02-26T18:58:41Z","title_canon_sha256":"3b1fdea721df6a4839273c19af265454d6171db77f88a82f2f1d4d419a24a8a0"},"schema_version":"1.0","source":{"id":"2502.19417","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2502.19417","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"arxiv_version","alias_value":"2502.19417v2","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.19417","created_at":"2026-05-17T23:38:49Z"},{"alias_kind":"pith_short_12","alias_value":"MVQLCYDAV5XY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MVQLCYDAV5XYQOSF","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MVQLCYDA","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:ca97a6a4f4e56c5d3fe43d4c5a391434de02d6a65c880b433e8ef928ef0110f7","target":"graph","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"our system can reason through complex prompts and incorporate situated feedback during task execution ('that's not trash')"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the high-level VLM can reliably map open-ended natural language and visual feedback into correct next-step decisions without hallucinating or misinterpreting physical context."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"A hierarchical VLA architecture lets robots follow complex instructions and situated feedback by separating high-level reasoning from low-level control."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps."}],"snapshot_sha256":"a911d2a90d0654a3b68453cba48345a116270656d7efdc74e4788fb3186b2344"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"1ab0a28f36eda47538f2f25b3a78edc1a5d12c7e26d9cdde158c42a79a1a0378"},"paper":{"abstract_excerpt":"Generalist robots that can perform a range of different tasks in open-world settings must be able to not only reason about the steps needed to accomplish their goals, but also process complex instructions, prompts, and even feedback during task execution. Intricate instructions (e.g., \"Could you make me a vegetarian sandwich?\" or \"I don't like that one\") require not just the ability to physically perform the individual steps, but the ability to situate complex commands and feedback in the physical world. In this work, we describe a system that uses vision-language models in a hierarchical stru","authors_text":"Adrian Li-Bell, Anna Walling, Brian Ichter, Chelsea Finn, Danny Driess, Haohuan Wang, James Tanner, Karl Pertsch, Lachy Groom, Liyiming Ke, Lucy Xiaoyang Shi, Michael Equi, Niccolo Fusai, Quan Vuong, Sergey Levine","cross_cats":["cs.AI","cs.LG"],"headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-02-26T18:58:41Z","title":"Hi Robot: Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models"},"references":{"count":51,"internal_anchors":15,"resolved_work":51,"sample":[{"cited_arxiv_id":"2403.01823","doi":"","is_internal_anchor":true,"ref_index":1,"title":"RT-H: Action Hierarchies Using Language","work_id":"ecf7cf18-c1a8-4a6b-bc2a-fb165643aa0d","year":2024},{"cited_arxiv_id":"2407.07726","doi":"","is_internal_anchor":true,"ref_index":2,"title":"PaliGemma: A versatile 3B VLM for transfer","work_id":"df6f48b3-5792-47c7-9614-cb856ea31ad9","year":2024},{"cited_arxiv_id":"2410.24164","doi":"","is_internal_anchor":true,"ref_index":3,"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","year":2024},{"cited_arxiv_id":"2212.06817","doi":"","is_internal_anchor":true,"ref_index":4,"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","year":2022},{"cited_arxiv_id":"2307.15818","doi":"","is_internal_anchor":true,"ref_index":5,"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","year":2023}],"snapshot_sha256":"35b3c23e920f2dff07e8dada247e4fa541fc8b0e2a40cdbe3cf143930ac69cc7"},"source":{"id":"2502.19417","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T22:49:39.001717Z","id":"b6905406-1385-4c55-88d8-097e0df14877","model_set":{"reader":"grok-4.3"},"one_line_summary":"A hierarchical VLA architecture lets robots follow complex instructions and situated feedback by separating high-level reasoning from low-level control.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","strongest_claim":"our system can reason through complex prompts and incorporate situated feedback during task execution ('that's not trash')","weakest_assumption":"That the high-level VLM can reliably map open-ended natural language and visual feedback into correct next-step decisions without hallucinating or misinterpreting physical context."}},"verdict_id":"b6905406-1385-4c55-88d8-097e0df14877"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:4d821e20505623b3776b09d9dd1b05ca44b9b9c8759fb51e1717723bf482cc56","target":"record","created_at":"2026-05-17T23:38:49Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"2b64350ff6a13afc04f9ab60c2db11011409494aafe124453dddad25a14cfe73","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-02-26T18:58:41Z","title_canon_sha256":"3b1fdea721df6a4839273c19af265454d6171db77f88a82f2f1d4d419a24a8a0"},"schema_version":"1.0","source":{"id":"2502.19417","kind":"arxiv","version":2}},"canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","first_computed_at":"2026-05-17T23:38:49.872422Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:49.872422Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"5ov1jo/6hx8sNEsXK1RwGpHG4T72r7bVtAWm4UUR+H0Csh1P+g59n30fGDWUc9X34YMljJ49F8qAPg2mJIuJCA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:49.872916Z","signed_message":"canonical_sha256_bytes"},"source_id":"2502.19417","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:4d821e20505623b3776b09d9dd1b05ca44b9b9c8759fb51e1717723bf482cc56","sha256:ca97a6a4f4e56c5d3fe43d4c5a391434de02d6a65c880b433e8ef928ef0110f7"],"state_sha256":"08e5af86a631c79eefc3696660f952e9484c2946330e404f0b28abb24c1c1b1d"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"8LcEyKuF7Da7ON1ZE9hDLtBvvygHlnM9KYVfizvFJGYvkJ8z4z5h8gkiBju5mmnDwlc/bNYEk264iwRGpGt8DA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-23T08:45:18.710856Z","bundle_sha256":"52b524435e52fd7076bacbf2d5993114cd70c8aa868194af88c5916d3271872d"}}