{"work":{"id":"e2db69c7-ee8a-4cb7-a761-7b8de1dfcf97","openalex_id":null,"doi":null,"arxiv_id":"2503.14734","raw_key":null,"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","authors":null,"authors_text":"NVIDIA: Johan Bjorck, Fernando Castañeda, Nikita Cherniadev, Xingye Da, Runyu Ding, Linxi \"Jim\" Fan","year":2025,"venue":"cs.RO","abstract":"General-purpose robots need a versatile body and an intelligent mind. Recent advancements in humanoid robots have shown great promise as a hardware platform for building generalist autonomy in the human world. A robot foundation model, trained on massive and diverse data sources, is essential for enabling the robots to reason about novel situations, robustly handle real-world variability, and rapidly learn new tasks. To this end, we introduce GR00T N1, an open foundation model for humanoid robots. GR00T N1 is a Vision-Language-Action (VLA) model with a dual-system architecture. The vision-language module (System 2) interprets the environment through vision and language instructions. The subsequent diffusion transformer module (System 1) generates fluid motor actions in real time. Both modules are tightly coupled and jointly trained end-to-end. We train GR00T N1 with a heterogeneous mixture of real-robot trajectories, human videos, and synthetically generated datasets. We show that our generalist robot model GR00T N1 outperforms the state-of-the-art imitation learning baselines on standard simulation benchmarks across multiple robot embodiments. 
Furthermore, we deploy our model on the Fourier GR-1 humanoid robot for language-conditioned bimanual manipulation tasks, achieving strong performance with high data efficiency.","external_url":"https://arxiv.org/abs/2503.14734","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-19T03:28:47.976142+00:00","pith_arxiv_id":"2503.14734","created_at":"2026-05-09T06:05:34.979134+00:00","updated_at":"2026-05-19T03:28:47.976142+00:00","title_quality_ok":true,"display_title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","render_title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots"},"hub":{"state":{"work_id":"e2db69c7-ee8a-4cb7-a761-7b8de1dfcf97","tier":"super_hub","tier_reason":"100+ Pith inbound or 10,000+ external citations","pith_inbound_count":172,"external_cited_by_count":null,"distinct_field_count":6,"first_pith_cited_at":"2025-02-09T11:25:56+00:00","last_pith_cited_at":"2026-05-14T17:51:40+00:00","author_build_status":"needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-19T06:21:14.445412+00:00","tier_text":"super_hub"},"tier":"super_hub","role_counts":[{"context_role":"background","n":50},{"context_role":"baseline","n":17},{"context_role":"method","n":7},{"context_role":"other","n":1}],"polarity_counts":[{"context_polarity":"background","n":47},{"context_polarity":"baseline","n":18},{"context_polarity":"unclear","n":5},{"context_polarity":"use_method","n":5}],"runs":{"ask_index":{"job_type":"ask_index","status":"succeeded","result":{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","claims":[{"claim_text":"General-purpose robots need a versatile body and an intelligent mind. Recent advancements in humanoid robots have shown great promise as a hardware platform for building generalist autonomy in the human world. 
A robot foundation model, trained on massive and diverse data sources, is essential for enabling the robots to reason about novel situations, robustly handle real-world variability, and rapidly learn new tasks. To this end, we introduce GR00T N1, an open foundation model for humanoid robots. GR00T N1 is a Vision-Language-Action (VLA) model with a dual-system architecture. The vision-lang","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GR00T N1: An Open Foundation Model for Generalist Humanoid Robots because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:23:58.518770+00:00"},"author_expand":{"job_type":"author_expand","status":"succeeded","result":{"authors_linked":[{"id":"ecadb9c8-aea5-4ecf-ad67-ca0fc261bce5","orcid":null,"display_name":"NVIDIA: Johan Bjorck"},{"id":"aa2ed323-b20a-4a4a-a165-04b8bfa15a1c","orcid":null,"display_name":"Fernando Castañeda"},{"id":"db16c256-3fc5-4625-8ca0-7b9d6440d2ab","orcid":null,"display_name":"Nikita Cherniadev"},{"id":"63101f72-e314-4cb1-b8aa-0d139d362850","orcid":null,"display_name":"Xingye Da"},{"id":"2184635f-82de-4ed9-9cc0-970a2c74c382","orcid":null,"display_name":"Runyu Ding"},{"id":"c9a81992-cc64-43df-a948-d652f44e2c28","orcid":null,"display_name":"Linxi \"Jim\" Fan"}]},"error":null,"updated_at":"2026-05-14T02:14:05.681897+00:00"},"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T02:14:00.939660+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":67},{"title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","work_id":"d1ad7304-d09a-49bc-809e-846439f6aff9","shared_citers":53},{"title":"OpenVLA: An Open-Source 
Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":41},{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","shared_citers":30},{"title":"Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success","work_id":"04f46bb3-4346-47e8-bf09-c75d91f96e87","shared_citers":28},{"title":"RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation","work_id":"9b985126-4a2f-4bdf-b014-2a7524ec634e","shared_citers":28},{"title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","work_id":"83a8f966-6cfa-4f21-81f3-87440aae238f","shared_citers":25},{"title":"DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset","work_id":"13253de2-3d89-415c-8c2f-3adb25d4c337","shared_citers":23},{"title":"Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware","work_id":"6fe159e0-fa73-481a-88d4-4719c15140be","shared_citers":22},{"title":"SmolVLA: A Vision-Language-Action Model for Affordable and Efficient Robotics","work_id":"0c5e9314-5fa7-4613-ad12-605a71d561d2","shared_citers":22},{"title":"Octo: An Open-Source Generalist Robot Policy","work_id":"f9ca0722-8855-48c3-a27a-0eefb7e19253","shared_citers":21},{"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","shared_citers":19},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":19},{"title":"Gemini Robotics: Bringing AI into the Physical World","work_id":"f7c5ce10-8364-4fbe-964f-2802b81c3a98","shared_citers":18},{"title":"AgiBot World Colosseo: A Large-scale Manipulation Platform for Scalable and Intelligent Embodied Systems","work_id":"f797e9ec-510f-43a7-8a0c-18009ce332e5","shared_citers":17},{"title":"UniVLA: Learning to Act Anywhere with Task-centric 
Latent Actions","work_id":"e05d654d-db73-48f6-9318-381b6798bac9","shared_citers":17},{"title":"WorldVLA: Towards Autoregressive Action World Model","work_id":"d8c0c873-b2fc-44a5-a0c8-0d4a698783fb","shared_citers":17},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":16},{"title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","work_id":"843ab5eb-2815-4db8-b3bc-890b23fa5ffa","shared_citers":16},{"title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","work_id":"4b158d3e-3dff-4412-85cd-baa879465a5e","shared_citers":15},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":15},{"title":"RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation","work_id":"12319725-bc7d-4c32-a229-ad270a7460bc","shared_citers":15},{"title":"RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots","work_id":"11232b35-bd17-402a-9234-951c46015815","shared_citers":15},{"title":"World Action Models are Zero-shot Policies","work_id":"9a85fc69-74df-450e-94cd-69d186e9e830","shared_citers":15}],"time_series":[{"n":5,"year":2025},{"n":101,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T02:14:09.333044+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T02:14:09.159462+00:00"},"role_polarity":{"job_type":"role_polarity","status":"succeeded","result":{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid 
Robots","claims":[{"claim_text":"General-purpose robots need a versatile body and an intelligent mind. Recent advancements in humanoid robots have shown great promise as a hardware platform for building generalist autonomy in the human world. A robot foundation model, trained on massive and diverse data sources, is essential for enabling the robots to reason about novel situations, robustly handle real-world variability, and rapidly learn new tasks. To this end, we introduce GR00T N1, an open foundation model for humanoid robots. GR00T N1 is a Vision-Language-Action (VLA) model with a dual-system architecture. The vision-lang","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GR00T N1: An Open Foundation Model for Generalist Humanoid Robots because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:14:00.943475+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","claims":[{"claim_text":"General-purpose robots need a versatile body and an intelligent mind. Recent advancements in humanoid robots have shown great promise as a hardware platform for building generalist autonomy in the human world. A robot foundation model, trained on massive and diverse data sources, is essential for enabling the robots to reason about novel situations, robustly handle real-world variability, and rapidly learn new tasks. To this end, we introduce GR00T N1, an open foundation model for humanoid robots. GR00T N1 is a Vision-Language-Action (VLA) model with a dual-system architecture. 
The vision-lang","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GR00T N1: An Open Foundation Model for Generalist Humanoid Robots because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T02:14:04.258026+00:00"}},"summary":{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","claims":[{"claim_text":"General-purpose robots need a versatile body and an intelligent mind. Recent advancements in humanoid robots have shown great promise as a hardware platform for building generalist autonomy in the human world. A robot foundation model, trained on massive and diverse data sources, is essential for enabling the robots to reason about novel situations, robustly handle real-world variability, and rapidly learn new tasks. To this end, we introduce GR00T N1, an open foundation model for humanoid robots. GR00T N1 is a Vision-Language-Action (VLA) model with a dual-system architecture. The vision-lang","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks GR00T N1: An Open Foundation Model for Generalist Humanoid Robots because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":67},{"title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","work_id":"d1ad7304-d09a-49bc-809e-846439f6aff9","shared_citers":53},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":41},{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","shared_citers":30},{"title":"Fine-Tuning Vision-Language-Action Models: Optimizing Speed and 
Success","work_id":"04f46bb3-4346-47e8-bf09-c75d91f96e87","shared_citers":28},{"title":"RoboTwin 2.0: A Scalable Data Generator and Benchmark with Strong Domain Randomization for Robust Bimanual Robotic Manipulation","work_id":"9b985126-4a2f-4bdf-b014-2a7524ec634e","shared_citers":28},{"title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","work_id":"83a8f966-6cfa-4f21-81f3-87440aae238f","shared_citers":25},{"title":"DROID: A Large-Scale In-The-Wild Robot Manipulation Dataset","work_id":"13253de2-3d89-415c-8c2f-3adb25d4c337","shared_citers":23},{"title":"Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware","work_id":"6fe159e0-fa73-481a-88d4-4719c15140be","shared_citers":22},{"title":"SmolVLA: A Vision-Language-Action Model for Affordable and Efficient Robotics","work_id":"0c5e9314-5fa7-4613-ad12-605a71d561d2","shared_citers":22},{"title":"Octo: An Open-Source Generalist Robot Policy","work_id":"f9ca0722-8855-48c3-a27a-0eefb7e19253","shared_citers":21},{"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","shared_citers":19},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":19},{"title":"Gemini Robotics: Bringing AI into the Physical World","work_id":"f7c5ce10-8364-4fbe-964f-2802b81c3a98","shared_citers":18},{"title":"AgiBot World Colosseo: A Large-scale Manipulation Platform for Scalable and Intelligent Embodied Systems","work_id":"f797e9ec-510f-43a7-8a0c-18009ce332e5","shared_citers":17},{"title":"UniVLA: Learning to Act Anywhere with Task-centric Latent Actions","work_id":"e05d654d-db73-48f6-9318-381b6798bac9","shared_citers":17},{"title":"WorldVLA: Towards Autoregressive Action World Model","work_id":"d8c0c873-b2fc-44a5-a0c8-0d4a698783fb","shared_citers":17},{"title":"Flow Matching for Generative 
Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":16},{"title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","work_id":"843ab5eb-2815-4db8-b3bc-890b23fa5ffa","shared_citers":16},{"title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","work_id":"4b158d3e-3dff-4412-85cd-baa879465a5e","shared_citers":15},{"title":"Qwen3-VL Technical Report","work_id":"1fe243aa-e3c0-4da6-b391-4cbcfc88d5c0","shared_citers":15},{"title":"RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation","work_id":"12319725-bc7d-4c32-a229-ad270a7460bc","shared_citers":15},{"title":"RoboCasa: Large-Scale Simulation of Everyday Tasks for Generalist Robots","work_id":"11232b35-bd17-402a-9234-951c46015815","shared_citers":15},{"title":"World Action Models are Zero-shot Policies","work_id":"9a85fc69-74df-450e-94cd-69d186e9e830","shared_citers":15}],"time_series":[{"n":5,"year":2025},{"n":101,"year":2026}],"dependency_candidates":[]},"authors":[{"id":"aa2ed323-b20a-4a4a-a165-04b8bfa15a1c","orcid":null,"display_name":"Fernando Castañeda","source":"manual","import_confidence":0.72},{"id":"c9a81992-cc64-43df-a948-d652f44e2c28","orcid":null,"display_name":"Linxi \"Jim\" Fan","source":"manual","import_confidence":0.72},{"id":"db16c256-3fc5-4625-8ca0-7b9d6440d2ab","orcid":null,"display_name":"Nikita Cherniadev","source":"manual","import_confidence":0.72},{"id":"ecadb9c8-aea5-4ecf-ad67-ca0fc261bce5","orcid":null,"display_name":"NVIDIA: Johan Bjorck","source":"manual","import_confidence":0.72},{"id":"2184635f-82de-4ed9-9cc0-970a2c74c382","orcid":null,"display_name":"Runyu Ding","source":"manual","import_confidence":0.72},{"id":"63101f72-e314-4cb1-b8aa-0d139d362850","orcid":null,"display_name":"Xingye Da","source":"manual","import_confidence":0.72}]}}