{"work":{"id":"843ab5eb-2815-4db8-b3bc-890b23fa5ffa","openalex_id":null,"doi":null,"arxiv_id":"2410.06158","raw_key":null,"title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","authors":null,"authors_text":"Chi-Lam Cheang, Guangzeng Chen, Ya Jing, Tao Kong, Hang Li, Yifeng Li","year":2024,"venue":"cs.RO","abstract":"We present GR-2, a state-of-the-art generalist robot agent for versatile and generalizable robot manipulation. GR-2 is first pre-trained on a vast number of Internet videos to capture the dynamics of the world. This large-scale pre-training, involving 38 million video clips and over 50 billion tokens, equips GR-2 with the ability to generalize across a wide range of robotic tasks and environments during subsequent policy learning. Following this, GR-2 is fine-tuned for both video generation and action prediction using robot trajectories. It exhibits impressive multi-task learning capabilities, achieving an average success rate of 97.7% across more than 100 tasks. Moreover, GR-2 demonstrates exceptional generalization to new, previously unseen scenarios, including novel backgrounds, environments, objects, and tasks. Notably, GR-2 scales effectively with model size, underscoring its potential for continued growth and application. Project page: \\url{https://gr2-manipulation.github.io}.","external_url":"https://arxiv.org/abs/2410.06158","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-06-29T13:33:27.768053+00:00","pith_arxiv_id":"2410.06158","created_at":"2026-05-09T06:05:35.112592+00:00","updated_at":"2026-06-29T13:33:27.768053+00:00","title_quality_ok":true,"display_title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","render_title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation"},"hub":{"state":{"work_id":"843ab5eb-2815-4db8-b3bc-890b23fa5ffa","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":68,"external_cited_by_count":null,"distinct_field_count":4,"first_pith_cited_at":"2024-11-29T12:06:03+00:00","last_pith_cited_at":"2026-05-28T14:36:53+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-29T18:39:07.737197+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":30},{"context_role":"method","n":4},{"context_role":"baseline","n":1}],"polarity_counts":[{"context_polarity":"background","n":28},{"context_polarity":"use_method","n":3},{"context_polarity":"baseline","n":2},{"context_polarity":"unclear","n":2}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T18:06:40.164603+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":24},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":24},{"title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","work_id":"d1ad7304-d09a-49bc-809e-846439f6aff9","shared_citers":17},{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","shared_citers":17},{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","work_id":"e2db69c7-ee8a-4cb7-a761-7b8de1dfcf97","shared_citers":16},{"title":"Unleashing Large-Scale Video Generative Pre-training for Visual Robot Manipulation","work_id":"e92c2c13-4330-45fe-8231-34a6002626bd","shared_citers":16},{"title":"Video Prediction Policy: A Generalist Robot Policy with Predictive Visual Representations","work_id":"62dbe235-8473-4190-8686-17e7437de50f","shared_citers":13},{"title":"World Action Models are Zero-shot Policies","work_id":"9a85fc69-74df-450e-94cd-69d186e9e830","shared_citers":13},{"title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","work_id":"83a8f966-6cfa-4f21-81f3-87440aae238f","shared_citers":12},{"title":"Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success","work_id":"04f46bb3-4346-47e8-bf09-c75d91f96e87","shared_citers":12},{"title":"Gen2act: Human video generation in novel scenarios enables generalizable robot manipulation","work_id":"a3bde288-aace-40db-8067-3ae6656f9509","shared_citers":12},{"title":"RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation","work_id":"12319725-bc7d-4c32-a229-ad270a7460bc","shared_citers":12},{"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","shared_citers":12},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":12},{"title":"AgiBot World Colosseo: A Large-scale Manipulation Platform for Scalable and Intelligent Embodied Systems","work_id":"f797e9ec-510f-43a7-8a0c-18009ce332e5","shared_citers":11},{"title":"Cosmos World Foundation Model Platform for Physical AI","work_id":"a2dba24c-318d-476a-8b21-4289c265810c","shared_citers":11},{"title":"Causal World Modeling for Robot Control","work_id":"a33c4ee0-db06-4f9a-8852-c62e3a72fc27","shared_citers":10},{"title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","work_id":"4b158d3e-3dff-4412-85cd-baa879465a5e","shared_citers":10},{"title":"Motus: A Unified Latent Action World Model","work_id":"d0b2d257-524d-4d67-9daf-5fb43e5e977a","shared_citers":10},{"title":"Ro- bodreamer: Learning compositional world models for robot imagination","work_id":"b1231baa-7c16-4ecf-a6a8-ef49d0875212","shared_citers":10},{"title":"Cosmos Policy: Fine-Tuning Video Models for Visuomotor Control and Planning","work_id":"3d63039f-41b0-4a31-af31-6fc10f5c1b1b","shared_citers":9},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":9},{"title":"Gemini Robotics: Bringing AI into the Physical World","work_id":"f7c5ce10-8364-4fbe-964f-2802b81c3a98","shared_citers":9},{"title":"Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware","work_id":"6fe159e0-fa73-481a-88d4-4719c15140be","shared_citers":9}],"time_series":[{"n":1,"year":2024},{"n":5,"year":2025},{"n":30,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T18:06:19.912409+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T18:06:32.502337+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","claims":[{"claim_text":"We present GR-2, a state-of-the-art generalist robot agent for versatile and generalizable robot manipulation. GR-2 is first pre-trained on a vast number of Internet videos to capture the dynamics of the world. This large-scale pre-training, involving 38 million video clips and over 50 billion tokens, equips GR-2 with the ability to generalize across a wide range of robotic tasks and environments during subsequent policy learning. Following this, GR-2 is fine-tuned for both video generation and action prediction using robot trajectories. It exhibits impressive multi-task learning capabilities,","claim_type":"abstract","evidence_strength":"source_metadata"},{"claim_text":"Vidar [77], Veo-Act [78], pi0.7 [ 79], V AG [80] Implicit VPP [11], VILP [ 81], Video Policy [13], ARDuP [ 82], mimic-video [ 12], LAP A [15], villa-X [ 83], S-V AM [14], OmniVTA [84], MWM [85] Joint W AM Autoregression GR1 [86], grmg [ 87], GR2 [88], Co TVLA [89], WorldVLA [90], rynnvla2 [91] VLA-JEP A [92], F1-VLA [93] Diffusion-based P AD [21], VideoVLA [94], UWM [20], DreamZero [ 17], CosmosPolicy [16], FLARE [95], UV A [96] FRAPPE [97], CoV AR [98], LDA1B [99], W A V [100], DUST [101], Ling","claim_type":"background","confidence":0.9,"evidence_strength":"citation_context"}],"why_cited":"Pith tracks GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation because it crossed a citation-hub threshold. Current citing contexts most often use it as background evidence (1 contexts).","role_counts":[{"n":1,"context_role":"background"}]},"error":null,"updated_at":"2026-05-14T18:06:32.515601+00:00"}},"summary":{"title":"GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation","claims":[{"claim_text":"We present GR-2, a state-of-the-art generalist robot agent for versatile and generalizable robot manipulation. GR-2 is first pre-trained on a vast number of Internet videos to capture the dynamics of the world. This large-scale pre-training, involving 38 million video clips and over 50 billion tokens, equips GR-2 with the ability to generalize across a wide range of robotic tasks and environments during subsequent policy learning. Following this, GR-2 is fine-tuned for both video generation and action prediction using robot trajectories. It exhibits impressive multi-task learning capabilities,","claim_type":"abstract","evidence_strength":"source_metadata"},{"claim_text":"Vidar [77], Veo-Act [78], pi0.7 [ 79], V AG [80] Implicit VPP [11], VILP [ 81], Video Policy [13], ARDuP [ 82], mimic-video [ 12], LAP A [15], villa-X [ 83], S-V AM [14], OmniVTA [84], MWM [85] Joint W AM Autoregression GR1 [86], grmg [ 87], GR2 [88], Co TVLA [89], WorldVLA [90], rynnvla2 [91] VLA-JEP A [92], F1-VLA [93] Diffusion-based P AD [21], VideoVLA [94], UWM [20], DreamZero [ 17], CosmosPolicy [16], FLARE [95], UV A [96] FRAPPE [97], CoV AR [98], LDA1B [99], W A V [100], DUST [101], Ling","claim_type":"background","confidence":0.9,"evidence_strength":"citation_context"}],"why_cited":"Pith tracks GR-2: A Generative Video-Language-Action Model with Web-Scale Knowledge for Robot Manipulation because it crossed a citation-hub threshold. Current citing contexts most often use it as background evidence (1 contexts).","role_counts":[{"n":1,"context_role":"background"}]},"graph":{"co_cited":[{"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","shared_citers":24},{"title":"OpenVLA: An Open-Source Vision-Language-Action Model","work_id":"3e7e65c5-5aed-4fe9-8414-2092bcb31cc7","shared_citers":24},{"title":"$\\pi_{0.5}$: a Vision-Language-Action Model with Open-World Generalization","work_id":"d1ad7304-d09a-49bc-809e-846439f6aff9","shared_citers":17},{"title":"RT-1: Robotics Transformer for Real-World Control at Scale","work_id":"e11bda85-8531-46bc-a07f-d0ade3643ab1","shared_citers":17},{"title":"GR00T N1: An Open Foundation Model for Generalist Humanoid Robots","work_id":"e2db69c7-ee8a-4cb7-a761-7b8de1dfcf97","shared_citers":16},{"title":"Unleashing Large-Scale Video Generative Pre-training for Visual Robot Manipulation","work_id":"e92c2c13-4330-45fe-8231-34a6002626bd","shared_citers":16},{"title":"Video Prediction Policy: A Generalist Robot Policy with Predictive Visual Representations","work_id":"62dbe235-8473-4190-8686-17e7437de50f","shared_citers":13},{"title":"World Action Models are Zero-shot Policies","work_id":"9a85fc69-74df-450e-94cd-69d186e9e830","shared_citers":13},{"title":"FAST: Efficient Action Tokenization for Vision-Language-Action Models","work_id":"83a8f966-6cfa-4f21-81f3-87440aae238f","shared_citers":12},{"title":"Fine-Tuning Vision-Language-Action Models: Optimizing Speed and Success","work_id":"04f46bb3-4346-47e8-bf09-c75d91f96e87","shared_citers":12},{"title":"Gen2act: Human video generation in novel scenarios enables generalizable robot manipulation","work_id":"a3bde288-aace-40db-8067-3ae6656f9509","shared_citers":12},{"title":"RDT-1B: a Diffusion Foundation Model for Bimanual Manipulation","work_id":"12319725-bc7d-4c32-a229-ad270a7460bc","shared_citers":12},{"title":"RT-2: Vision-Language-Action Models Transfer Web Knowledge to Robotic Control","work_id":"ff438a8a-8003-4fae-9131-acd418b3597b","shared_citers":12},{"title":"Wan: Open and Advanced Large-Scale Video Generative Models","work_id":"ad3ebc3b-4224-46c9-b61d-bcf135da0a7c","shared_citers":12},{"title":"AgiBot World Colosseo: A Large-scale Manipulation Platform for Scalable and Intelligent Embodied Systems","work_id":"f797e9ec-510f-43a7-8a0c-18009ce332e5","shared_citers":11},{"title":"Cosmos World Foundation Model Platform for Physical AI","work_id":"a2dba24c-318d-476a-8b21-4289c265810c","shared_citers":11},{"title":"Causal World Modeling for Robot Control","work_id":"a33c4ee0-db06-4f9a-8852-c62e3a72fc27","shared_citers":10},{"title":"CogACT: A Foundational Vision-Language-Action Model for Synergizing Cognition and Action in Robotic Manipulation","work_id":"4b158d3e-3dff-4412-85cd-baa879465a5e","shared_citers":10},{"title":"Motus: A Unified Latent Action World Model","work_id":"d0b2d257-524d-4d67-9daf-5fb43e5e977a","shared_citers":10},{"title":"Ro- bodreamer: Learning compositional world models for robot imagination","work_id":"b1231baa-7c16-4ecf-a6a8-ef49d0875212","shared_citers":10},{"title":"Cosmos Policy: Fine-Tuning Video Models for Visuomotor Control and Planning","work_id":"3d63039f-41b0-4a31-af31-6fc10f5c1b1b","shared_citers":9},{"title":"Flow Matching for Generative Modeling","work_id":"6edb71c4-5d64-40af-a394-9757ea051a36","shared_citers":9},{"title":"Gemini Robotics: Bringing AI into the Physical World","work_id":"f7c5ce10-8364-4fbe-964f-2802b81c3a98","shared_citers":9},{"title":"Learning Fine-Grained Bimanual Manipulation with Low-Cost Hardware","work_id":"6fe159e0-fa73-481a-88d4-4719c15140be","shared_citers":9}],"time_series":[{"n":1,"year":2024},{"n":5,"year":2025},{"n":30,"year":2026}],"dependency_candidates":[]},"authors":[]}}