{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:CIFBAGXB6DSHLM5PYSIZRSHSOY","short_pith_number":"pith:CIFBAGXB","schema_version":"1.0","canonical_sha256":"120a101ae1f0e475b3afc49198c8f276191da960ad08bc5bf7e3687fcc15e680","source":{"kind":"arxiv","id":"2507.15493","version":2},"attestation_state":"computed","paper":{"title":"GR-3 Technical Report","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"GR-3 is a vision-language-action model that generalizes to novel objects, abstract instructions, and long-horizon dexterous tasks through combined web-scale and robot data training.","cross_cats":["cs.AI","cs.CV"],"primary_cat":"cs.RO","authors_text":"Chilam Cheang, Haixin Shi, Hang Li, Hao Niu, Hongtao Wu, Jiafeng Xu, Jiawen Tian, Liqun Huang, Sijin Chen, Tao Kong, Wanli Peng, Wenxuan Ou, Xiao Ma, Xin Xiao, Yichu Yang, Yifeng Li, Yingdong Hu, Yuxiao Liu, Yuyang Xiao, Zeyu Ren, Zhongren Cui","submitted_at":"2025-07-21T10:54:13Z","abstract_excerpt":"We report our recent progress towards building generalist robot policies, the development of GR-3. GR-3 is a large-scale vision-language-action (VLA) model. It showcases exceptional capabilities in generalizing to novel objects, environments, and instructions involving abstract concepts. Furthermore, it can be efficiently fine-tuned with minimal human trajectory data, enabling rapid and cost-effective adaptation to new settings. GR-3 also excels in handling long-horizon and dexterous tasks, including those requiring bi-manual manipulation and mobile movement, showcasing robust and reliable per"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2507.15493","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2025-07-21T10:54:13Z","cross_cats_sorted":["cs.AI","cs.CV"],"title_canon_sha256":"966ac5cf5aae66e8f56a341483469b131fda7e9732a72cb2ad9fa207943c0de8","abstract_canon_sha256":"5ee6562e7038be26845b2d75f72ba35e533effe9e5a823d166c03e11c2c3eef3"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.638031Z","signature_b64":"Xqu7rkRud2YQTrKeVcNthbp7YrgrdkiaRIsSgDS6c6PnvA1FxedyH1Rgam5UugLIwzHl1dAThneGuFGlDQZLAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"120a101ae1f0e475b3afc49198c8f276191da960ad08bc5bf7e3687fcc15e680","last_reissued_at":"2026-05-17T23:38:14.637376Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.637376Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"GR-3 Technical Report","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"GR-3 is a vision-language-action model that generalizes to novel objects, abstract instructions, and long-horizon dexterous tasks through combined web-scale and robot data training.","cross_cats":["cs.AI","cs.CV"],"primary_cat":"cs.RO","authors_text":"Chilam Cheang, Haixin Shi, Hang Li, Hao Niu, Hongtao Wu, Jiafeng Xu, Jiawen Tian, Liqun Huang, Sijin Chen, Tao Kong, Wanli Peng, Wenxuan Ou, Xiao Ma, Xin Xiao, Yichu Yang, Yifeng Li, Yingdong Hu, Yuxiao Liu, Yuyang Xiao, Zeyu Ren, Zhongren Cui","submitted_at":"2025-07-21T10:54:13Z","abstract_excerpt":"We report our recent progress towards building generalist robot policies, the development of GR-3. GR-3 is a large-scale vision-language-action (VLA) model. It showcases exceptional capabilities in generalizing to novel objects, environments, and instructions involving abstract concepts. Furthermore, it can be efficiently fine-tuned with minimal human trajectory data, enabling rapid and cost-effective adaptation to new settings. GR-3 also excels in handling long-horizon and dexterous tasks, including those requiring bi-manual manipulation and mobile movement, showcasing robust and reliable per"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"GR-3 surpasses the state-of-the-art baseline method, π0, on a wide variety of challenging tasks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the multi-faceted training recipe of web-scale vision-language co-training, VR human trajectory data, and robot imitation learning produces the claimed generalization to novel objects, abstract instructions, and long-horizon dexterous tasks without heavy post-hoc tuning or task-specific overfitting.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GR-3 is a VLA model that generalizes to novel objects, environments, and abstract instructions, outperforms the π0 baseline, and integrates with the new ByteMini bi-manual mobile robot.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"GR-3 is a vision-language-action model that generalizes to novel objects, abstract instructions, and long-horizon dexterous tasks through combined web-scale and robot data training.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"75acb5493e69f646abc3e6d11a3b5a4808c98eed9a07e492669eb22e094fb87b"},"source":{"id":"2507.15493","kind":"arxiv","version":2},"verdict":{"id":"bce5f505-d14e-4519-9180-4ff9ba235ec7","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T07:58:50.642162Z","strongest_claim":"GR-3 surpasses the state-of-the-art baseline method, π0, on a wide variety of challenging tasks.","one_line_summary":"GR-3 is a VLA model that generalizes to novel objects, environments, and abstract instructions, outperforms the π0 baseline, and integrates with the new ByteMini bi-manual mobile robot.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the multi-faceted training recipe of web-scale vision-language co-training, VR human trajectory data, and robot imitation learning produces the claimed generalization to novel objects, abstract instructions, and long-horizon dexterous tasks without heavy post-hoc tuning or task-specific overfitting.","pith_extraction_headline":"GR-3 is a vision-language-action model that generalizes to novel objects, abstract instructions, and long-horizon dexterous tasks through combined web-scale and robot data training."},"references":{"count":78,"sample":[{"doi":"","year":2023,"title":"Affordances from human videos as a versatile representation for robotics","work_id":"231f2724-ff91-4970-b0f2-50b9bc160577","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":2,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":2025,"title":"A careful examination of large behavior models for multitask dexterous manipulation","work_id":"a6923092-ec5d-4c9c-aa05-96009ad1e9a7","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"PaliGemma: A versatile 3B VLM for transfer","work_id":"df6f48b3-5792-47c7-9614-cb856ea31ad9","ref_index":4,"cited_arxiv_id":"2407.07726","is_internal_anchor":true},{"doi":"","year":2024,"title":"//arxiv.org/abs/2405.01527","work_id":"88e7a0f8-c0d2-40e5-8449-864ef0385305","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":78,"snapshot_sha256":"b8ca37fd1b2bd6ebe649b22060d7e1cb084f55fef3154d9312b0ab3340bfef7d","internal_anchors":37},"formal_canon":{"evidence_count":2,"snapshot_sha256":"78ac8e22a4469ef3ec2ab19c5eafdf1371c85888e37264d6d4f6e6d414174ce6"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2507.15493","created_at":"2026-05-17T23:38:14.637514+00:00"},{"alias_kind":"arxiv_version","alias_value":"2507.15493v2","created_at":"2026-05-17T23:38:14.637514+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.15493","created_at":"2026-05-17T23:38:14.637514+00:00"},{"alias_kind":"pith_short_12","alias_value":"CIFBAGXB6DSH","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"CIFBAGXB6DSHLM5P","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"CIFBAGXB","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":27,"internal_anchor_count":27,"sample":[{"citing_arxiv_id":"2605.23856","citing_title":"Point Tracking Improves World Action Models","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22183","citing_title":"Action with Visual Primitives","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2602.12978","citing_title":"Learning Native Continuation for Action Chunking Flow Policies","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15157","citing_title":"Hand-in-the-Loop: Improving VLA Policies for Dexterous Manipulation via Seamless Hand-Arm Intervention","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11567","citing_title":"Dynamic Execution Commitment of Vision-Language-Action Models","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18722","citing_title":"Dexora: Open-source VLA for High-DoF Bimanual Dexterity","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18556","citing_title":"Key-Gram: Extensible World Knowledge for Embodied Manipulation","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2511.02776","citing_title":"XR-1: Towards Versatile Vision-Language-Action Models via Learning Unified Vision-Motion Representations","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2508.13073","citing_title":"Large VLM-based Vision-Language-Action Models for Robotic Manipulation: A Survey","ref_index":141,"is_internal_anchor":true},{"citing_arxiv_id":"2511.18960","citing_title":"AVA-VLA: Improving Vision-Language-Action models with Active Visual Attention","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2601.18692","citing_title":"A Pragmatic VLA Foundation Model","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2509.06951","citing_title":"F1: A Vision-Language-Action Model Bridging Understanding and Generation to Actions","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2602.11236","citing_title":"ABot-M0: VLA Foundation Model for Robotic Manipulation with Action Manifold Learning","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2509.09674","citing_title":"SimpleVLA-RL: Scaling VLA Training via Reinforcement Learning","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15157","citing_title":"Hand-in-the-Loop: Improving VLA Policies for Dexterous Manipulation via Seamless Hand-Arm Intervention","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2510.13778","citing_title":"InternVLA-M1: A Spatially Guided Vision-Language-Action Framework for Generalist Robot Policy","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13632","citing_title":"Guide, Think, Act: Interactive Embodied Reasoning in Vision-Language-Action Models","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03181","citing_title":"Multi-View Video Diffusion Policy: A 3D Spatio-Temporal-Aware Video Action Model","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11567","citing_title":"Dynamic Execution Commitment of Vision-Language-Action Models","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2601.21998","citing_title":"Causal World Modeling for Robot Control","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14125","citing_title":"HiVLA: A Visual-Grounded-Centric Hierarchical Embodied Manipulation System","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02130","citing_title":"From Where Things Are to What They Are For: Benchmarking Spatial-Functional Intelligence in Multimodal LLMs","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10170","citing_title":"Device-Conditioned Neural Architecture Search for Efficient Robotic Manipulation","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05484","citing_title":"CoEnv: Driving Embodied Multi-Agent Collaboration via Compositional Environment","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14125","citing_title":"HiVLA: A Visual-Grounded-Centric Hierarchical Embodied Manipulation System","ref_index":9,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY","json":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY.json","graph_json":"https://pith.science/api/pith-number/CIFBAGXB6DSHLM5PYSIZRSHSOY/graph.json","events_json":"https://pith.science/api/pith-number/CIFBAGXB6DSHLM5PYSIZRSHSOY/events.json","paper":"https://pith.science/paper/CIFBAGXB"},"agent_actions":{"view_html":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY","download_json":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY.json","view_paper":"https://pith.science/paper/CIFBAGXB","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2507.15493&json=true","fetch_graph":"https://pith.science/api/pith-number/CIFBAGXB6DSHLM5PYSIZRSHSOY/graph.json","fetch_events":"https://pith.science/api/pith-number/CIFBAGXB6DSHLM5PYSIZRSHSOY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY/action/storage_attestation","attest_author":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY/action/author_attestation","sign_citation":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY/action/citation_signature","submit_replication":"https://pith.science/pith/CIFBAGXB6DSHLM5PYSIZRSHSOY/action/replication_record"}},"created_at":"2026-05-17T23:38:14.637514+00:00","updated_at":"2026-05-17T23:38:14.637514+00:00"}