{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:MVQLCYDAV5XYQOSFXCJZJTZ4P5","short_pith_number":"pith:MVQLCYDA","schema_version":"1.0","canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","source":{"kind":"arxiv","id":"2502.19417","version":2},"attestation_state":"computed","paper":{"title":"Hi Robot: Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.RO","authors_text":"Adrian Li-Bell, Anna Walling, Brian Ichter, Chelsea Finn, Danny Driess, Haohuan Wang, James Tanner, Karl Pertsch, Lachy Groom, Liyiming Ke, Lucy Xiaoyang Shi, Michael Equi, Niccolo Fusai, Quan Vuong, Sergey Levine","submitted_at":"2025-02-26T18:58:41Z","abstract_excerpt":"Generalist robots that can perform a range of different tasks in open-world settings must be able to not only reason about the steps needed to accomplish their goals, but also process complex instructions, prompts, and even feedback during task execution. Intricate instructions (e.g., \"Could you make me a vegetarian sandwich?\" or \"I don't like that one\") require not just the ability to physically perform the individual steps, but the ability to situate complex commands and feedback in the physical world. In this work, we describe a system that uses vision-language models in a hierarchical stru"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2502.19417","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-02-26T18:58:41Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"3b1fdea721df6a4839273c19af265454d6171db77f88a82f2f1d4d419a24a8a0","abstract_canon_sha256":"2b64350ff6a13afc04f9ab60c2db11011409494aafe124453dddad25a14cfe73"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.872916Z","signature_b64":"5ov1jo/6hx8sNEsXK1RwGpHG4T72r7bVtAWm4UUR+H0Csh1P+g59n30fGDWUc9X34YMljJ49F8qAPg2mJIuJCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6560b16060af6f883a45b89394cf3c7f69d4dc0f491bc183b2b52a5082e3020b","last_reissued_at":"2026-05-17T23:38:49.872422Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.872422Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Hi Robot: Open-Ended Instruction Following with Hierarchical Vision-Language-Action Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.RO","authors_text":"Adrian Li-Bell, Anna Walling, Brian Ichter, Chelsea Finn, Danny Driess, Haohuan Wang, James Tanner, Karl Pertsch, Lachy Groom, Liyiming Ke, Lucy Xiaoyang Shi, Michael Equi, Niccolo Fusai, Quan Vuong, Sergey Levine","submitted_at":"2025-02-26T18:58:41Z","abstract_excerpt":"Generalist robots that can perform a range of different tasks in open-world settings must be able to not only reason about the steps needed to accomplish their goals, but also process complex instructions, prompts, and even feedback during task execution. Intricate instructions (e.g., \"Could you make me a vegetarian sandwich?\" or \"I don't like that one\") require not just the ability to physically perform the individual steps, but the ability to situate complex commands and feedback in the physical world. In this work, we describe a system that uses vision-language models in a hierarchical stru"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"our system can reason through complex prompts and incorporate situated feedback during task execution ('that's not trash')","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the high-level VLM can reliably map open-ended natural language and visual feedback into correct next-step decisions without hallucinating or misinterpreting physical context.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"A hierarchical VLA architecture lets robots follow complex instructions and situated feedback by separating high-level reasoning from low-level control.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a911d2a90d0654a3b68453cba48345a116270656d7efdc74e4788fb3186b2344"},"source":{"id":"2502.19417","kind":"arxiv","version":2},"verdict":{"id":"b6905406-1385-4c55-88d8-097e0df14877","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T22:49:39.001717Z","strongest_claim":"our system can reason through complex prompts and incorporate situated feedback during task execution ('that's not trash')","one_line_summary":"A hierarchical VLA architecture lets robots follow complex instructions and situated feedback by separating high-level reasoning from low-level control.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the high-level VLM can reliably map open-ended natural language and visual feedback into correct next-step decisions without hallucinating or misinterpreting physical context.","pith_extraction_headline":"A hierarchical vision-language model lets robots interpret complex instructions and real-time feedback to choose and perform next steps."},"references":{"count":51,"sample":[{"doi":"","year":2024,"title":"RT-H: Action Hierarchies Using Language","work_id":"ecf7cf18-c1a8-4a6b-bc2a-fb165643aa0d","ref_index":1,"cited_arxiv_id":"2403.01823","is_internal_anchor":true},{"doi":"","year":2024,"title":"PaliGemma: A versatile 3B VLM for transfer","work_id":"df6f48b3-5792-47c7-9614-cb856ea31ad9","ref_index":2,"cited_arxiv_id":"2407.07726","is_internal_anchor":true},{"doi":"","year":2024,"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","ref_index":3,"cited_arxiv_id":"2410.24164","is_internal_anchor":true},{"doi":"","year":2022,"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","ref_index":4,"cited_arxiv_id":"2212.06817","is_internal_anchor":true},{"doi":"","year":2023,"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","ref_index":5,"cited_arxiv_id":"2307.15818","is_internal_anchor":true}],"resolved_work":51,"snapshot_sha256":"35b3c23e920f2dff07e8dada247e4fa541fc8b0e2a40cdbe3cf143930ac69cc7","internal_anchors":15},"formal_canon":{"evidence_count":1,"snapshot_sha256":"1ab0a28f36eda47538f2f25b3a78edc1a5d12c7e26d9cdde158c42a79a1a0378"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2502.19417","created_at":"2026-05-17T23:38:49.872505+00:00"},{"alias_kind":"arxiv_version","alias_value":"2502.19417v2","created_at":"2026-05-17T23:38:49.872505+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.19417","created_at":"2026-05-17T23:38:49.872505+00:00"},{"alias_kind":"pith_short_12","alias_value":"MVQLCYDAV5XY","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"MVQLCYDAV5XYQOSF","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"MVQLCYDA","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":32,"internal_anchor_count":32,"sample":[{"citing_arxiv_id":"2503.03480","citing_title":"SafeVLA: Towards Safety Alignment of Vision-Language-Action Model via Constrained Learning","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2504.16054","citing_title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","ref_index":72,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22183","citing_title":"Action with Visual Primitives","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22812","citing_title":"GesVLA: Gesture-Aware Vision-Language-Action Model Embedded Representations","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22816","citing_title":"AwareVLN: Reasoning with Self-awareness for Vision-Language Navigation","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2603.14371","citing_title":"OxyGen: Unified KV Cache Management for VLA Inference under Multi-Task Parallelism","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17486","citing_title":"DyGRO-VLA: Cross-Task Scaling of Vision-Language-Action Models via Dynamic Grouped Residual Optimization","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17522","citing_title":"RoboFlow4D: A Lightweight Flow World Model Toward Real-Time Flow-Guided Robotic Manipulation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2507.16815","citing_title":"ThinkAct: Vision-Language-Action Reasoning via Reinforced Visual Latent Planning","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2507.01925","citing_title":"A Survey on Vision-Language-Action Models: An Action Tokenization Perspective","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2507.04447","citing_title":"DreamVLA: A Vision-Language-Action Model Dreamed with Comprehensive World Knowledge","ref_index":100,"is_internal_anchor":true},{"citing_arxiv_id":"2503.15558","citing_title":"Cosmos-Reason1: From Physical Common Sense To Embodied Reasoning","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2510.10125","citing_title":"Ctrl-World: A Controllable Generative World Model for Robot Manipulation","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2602.13193","citing_title":"Steerable Vision-Language-Action Policies for Embodied Reasoning and Hierarchical Control","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2602.20231","citing_title":"UniLACT: Depth-Aware RGB Latent Action Learning for Vision-Language-Action Models","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22003","citing_title":"VP-VLA: Visual Prompting as an Interface for Vision-Language-Action Models","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2603.25044","citing_title":"ThermoAct:Thermal-Aware Vision-Language-Action Models for Robotic Perception and Decision-Making","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2511.20857","citing_title":"Evo-Memory: Benchmarking LLM Agent Test-time Learning with Self-Evolving Memory","ref_index":242,"is_internal_anchor":true},{"citing_arxiv_id":"2510.13778","citing_title":"InternVLA-M1: A Spatially Guided Vision-Language-Action Framework for Generalist Robot Policy","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13119","citing_title":"Towards Long-horizon Embodied Agents with Tool-Aligned Vision-Language-Action Models","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02786","citing_title":"QuadAgent: A Responsive Agent System for Vision-Language Guided Quadrotor Agile Flight","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2511.14759","citing_title":"$\\pi^{*}_{0.6}$: a VLA That Learns From Experience","ref_index":87,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14125","citing_title":"HiVLA: A Visual-Grounded-Centric Hierarchical Embodied Manipulation System","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09959","citing_title":"G-Zero: Self-Play for Open-Ended Generation from Zero Data","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24447","citing_title":"Characterizing Vision-Language-Action Models across XPUs: Constraints and Acceleration for On-Robot Deployment","ref_index":18,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":1,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5","json":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5.json","graph_json":"https://pith.science/api/pith-number/MVQLCYDAV5XYQOSFXCJZJTZ4P5/graph.json","events_json":"https://pith.science/api/pith-number/MVQLCYDAV5XYQOSFXCJZJTZ4P5/events.json","paper":"https://pith.science/paper/MVQLCYDA"},"agent_actions":{"view_html":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5","download_json":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5.json","view_paper":"https://pith.science/paper/MVQLCYDA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2502.19417&json=true","fetch_graph":"https://pith.science/api/pith-number/MVQLCYDAV5XYQOSFXCJZJTZ4P5/graph.json","fetch_events":"https://pith.science/api/pith-number/MVQLCYDAV5XYQOSFXCJZJTZ4P5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/storage_attestation","attest_author":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/author_attestation","sign_citation":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/citation_signature","submit_replication":"https://pith.science/pith/MVQLCYDAV5XYQOSFXCJZJTZ4P5/action/replication_record"}},"created_at":"2026-05-17T23:38:49.872505+00:00","updated_at":"2026-05-17T23:38:49.872505+00:00"}