{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:5HVVMZ2FNWZSAJYWRKQS3UHIDT","short_pith_number":"pith:5HVVMZ2F","schema_version":"1.0","canonical_sha256":"e9eb5667456db32027168aa12dd0e81cce114d9b3680f22295e31f4af977801d","source":{"kind":"arxiv","id":"2511.17502","version":3},"attestation_state":"computed","paper":{"title":"RynnVLA-002: A Unified Vision-Language-Action and World Model","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.RO","authors_text":"Bohan Hou, Chaohui Yu, Deli Zhao, Fan Wang, Hangjie Yuan, Hao Chen, Hao Luo, Jiayan Guo, Jun Cen, Kehan Li, Siteng Huang, Xin Li, Yuming Jiang, Yuqian Yuan","submitted_at":"2025-11-21T18:59:32Z","abstract_excerpt":"We introduce RynnVLA-002, a unified Vision-Language-Action (VLA) and world model. The world model leverages action and visual inputs to predict future image states, learning the underlying physics of the environment to refine action generation. Conversely, the VLA model produces subsequent actions from image observations, enhancing visual understanding and supporting the world model's image generation. The unified framework of RynnVLA-002 enables joint learning of environmental dynamics and action planning. Our experiments show that RynnVLA-002 surpasses individual VLA and world models, demons"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2511.17502","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.RO","submitted_at":"2025-11-21T18:59:32Z","cross_cats_sorted":[],"title_canon_sha256":"e203702bbf9cfbae009587c811a341d3370e719af3c1012224034e7aef68111b","abstract_canon_sha256":"e37e9db43216310e0ab3f7375ef56010b1b9bffd04fbcc6809d1a5ed97f46a3e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T01:03:38.783064Z","signature_b64":"bgfmCXmdra63oPT2UxlTXZw0l9GJtoVG9U4fAFSEWo4KsR1o+4K2jTaacoK/NOiBXCYWYzKWCGypTjVqktgqCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e9eb5667456db32027168aa12dd0e81cce114d9b3680f22295e31f4af977801d","last_reissued_at":"2026-06-02T01:03:38.782512Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T01:03:38.782512Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"RynnVLA-002: A Unified Vision-Language-Action and World Model","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.RO","authors_text":"Bohan Hou, Chaohui Yu, Deli Zhao, Fan Wang, Hangjie Yuan, Hao Chen, Hao Luo, Jiayan Guo, Jun Cen, Kehan Li, Siteng Huang, Xin Li, Yuming Jiang, Yuqian Yuan","submitted_at":"2025-11-21T18:59:32Z","abstract_excerpt":"We introduce RynnVLA-002, a unified Vision-Language-Action (VLA) and world model. The world model leverages action and visual inputs to predict future image states, learning the underlying physics of the environment to refine action generation. Conversely, the VLA model produces subsequent actions from image observations, enhancing visual understanding and supporting the world model's image generation. The unified framework of RynnVLA-002 enables joint learning of environmental dynamics and action planning. Our experiments show that RynnVLA-002 surpasses individual VLA and world models, demons"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2511.17502","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2511.17502/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2511.17502","created_at":"2026-06-02T01:03:38.782570+00:00"},{"alias_kind":"arxiv_version","alias_value":"2511.17502v3","created_at":"2026-06-02T01:03:38.782570+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.17502","created_at":"2026-06-02T01:03:38.782570+00:00"},{"alias_kind":"pith_short_12","alias_value":"5HVVMZ2FNWZS","created_at":"2026-06-02T01:03:38.782570+00:00"},{"alias_kind":"pith_short_16","alias_value":"5HVVMZ2FNWZSAJYW","created_at":"2026-06-02T01:03:38.782570+00:00"},{"alias_kind":"pith_short_8","alias_value":"5HVVMZ2F","created_at":"2026-06-02T01:03:38.782570+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":16,"internal_anchor_count":16,"sample":[{"citing_arxiv_id":"2605.22446","citing_title":"Pre-VLA: Preemptive Runtime Verification for Reliable Vision-Language-Action and World-Model Rollouts","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2602.18532","citing_title":"VLANeXt: Recipes for Building Strong VLA Models","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2603.14371","citing_title":"OxyGen: Unified KV Cache Management for VLA Inference under Multi-Task Parallelism","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19678","citing_title":"RoVLA: Multi-Consistency Constraints for Robust Vision-Language-Action Models","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07931","citing_title":"One Token Per Frame: Reconsidering Visual Bandwidth in World Models for VLA Policy","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2603.16666","citing_title":"Fast-WAM: Do World Action Models Need Test-time Future Imagination?","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12090","citing_title":"World Action Models: The Next Frontier in Embodied AI","ref_index":97,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12167","citing_title":"From Imagined Futures to Executable Actions: Mixture of Latent Actions for Robot Manipulation","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07931","citing_title":"One Token Per Frame: Reconsidering Visual Bandwidth in World Models for VLA Policy","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10942","citing_title":"HarmoWAM: Harmonizing Generalizable and Precise Manipulation via Adaptive World Action Models","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06481","citing_title":"OA-WAM: Object-Addressable World Action Model for Robust Robot Manipulation","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00080","citing_title":"World Model for Robot Learning: A Comprehensive Survey","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07288","citing_title":"Sword: Style-Robust World Models as Simulators via Dynamic Latent Bootstrapping for VLA Policy Post-Training","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07931","citing_title":"One Token Per Frame: Reconsidering Visual Bandwidth in World Models for VLA Policy","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07794","citing_title":"NoiseGate: Learning Per-Latent Timestep Schedules as Information Gating in World Action Models","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07381","citing_title":"Escaping the Diversity Trap in Robotic Manipulation via Anchor-Centric Adaptation","ref_index":15,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT","json":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT.json","graph_json":"https://pith.science/api/pith-number/5HVVMZ2FNWZSAJYWRKQS3UHIDT/graph.json","events_json":"https://pith.science/api/pith-number/5HVVMZ2FNWZSAJYWRKQS3UHIDT/events.json","paper":"https://pith.science/paper/5HVVMZ2F"},"agent_actions":{"view_html":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT","download_json":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT.json","view_paper":"https://pith.science/paper/5HVVMZ2F","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2511.17502&json=true","fetch_graph":"https://pith.science/api/pith-number/5HVVMZ2FNWZSAJYWRKQS3UHIDT/graph.json","fetch_events":"https://pith.science/api/pith-number/5HVVMZ2FNWZSAJYWRKQS3UHIDT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT/action/storage_attestation","attest_author":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT/action/author_attestation","sign_citation":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT/action/citation_signature","submit_replication":"https://pith.science/pith/5HVVMZ2FNWZSAJYWRKQS3UHIDT/action/replication_record"}},"created_at":"2026-06-02T01:03:38.782570+00:00","updated_at":"2026-06-02T01:03:38.782570+00:00"}