{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:7UNFV772FNRC5AX63ECEBJJNLW","short_pith_number":"pith:7UNFV772","schema_version":"1.0","canonical_sha256":"fd1a5afffa2b622e82fed90440a52d5da9bbc0d853c481f1fed1519437d83071","source":{"kind":"arxiv","id":"2605.13403","version":1},"attestation_state":"computed","paper":{"title":"RotVLA: Rotational Latent Action for Vision-Language-Action Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"RotVLA replaces discrete action codes with continuous rotations in SO(n) for vision-language-action models.","cross_cats":["cs.CV"],"primary_cat":"cs.RO","authors_text":"Hangjun Ye, Jiahuan Zhou, Peiyan Li, Qiwei Li, Quanyun Zhou, Xicheng Gong, Xinghang Li, Yadong Mu","submitted_at":"2026-05-13T11:58:02Z","abstract_excerpt":"Latent Action Models (LAMs) have emerged as an effective paradigm for handling heterogeneous datasets during Vision-Language-Action (VLA) model pretraining, offering a unified action space across embodiments. However, existing LAMs often rely on discrete quantization encode and decode pipelines, which can lead to trivial frame reconstruction behavior, limited representational capacity, and a lack of physically meaningful structure. We introduce RotVLA, a VLA framework built on a continuous rotational latent action representation. Latent actions are modeled as elements of SO(n), providing conti"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2605.13403","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.RO","submitted_at":"2026-05-13T11:58:02Z","cross_cats_sorted":["cs.CV"],"title_canon_sha256":"156756c0e83713819ca534a0933f03d1d22d311eef5d9fb16e469ff32a419c15","abstract_canon_sha256":"b0058c9cb08bfe16717aa0f2996ad0907bf79985020d3514a734f2490e937e93"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:47.563245Z","signature_b64":"+YB4vFlj8LSr6rvLu7tU2lGA7JinpKWjW+9m0N1uhq7/BqjFPq1HYIFE7X0mszsTQMZ8qHXrvkVYO4ZCCS3kDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fd1a5afffa2b622e82fed90440a52d5da9bbc0d853c481f1fed1519437d83071","last_reissued_at":"2026-05-18T02:44:47.562788Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:47.562788Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"RotVLA: Rotational Latent Action for Vision-Language-Action Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"RotVLA replaces discrete action codes with continuous rotations in SO(n) for vision-language-action models.","cross_cats":["cs.CV"],"primary_cat":"cs.RO","authors_text":"Hangjun Ye, Jiahuan Zhou, Peiyan Li, Qiwei Li, Quanyun Zhou, Xicheng Gong, Xinghang Li, Yadong Mu","submitted_at":"2026-05-13T11:58:02Z","abstract_excerpt":"Latent Action Models (LAMs) have emerged as an effective paradigm for handling heterogeneous datasets during Vision-Language-Action (VLA) model pretraining, offering a unified action space across embodiments. However, existing LAMs often rely on discrete quantization encode and decode pipelines, which can lead to trivial frame reconstruction behavior, limited representational capacity, and a lack of physically meaningful structure. We introduce RotVLA, a VLA framework built on a continuous rotational latent action representation. Latent actions are modeled as elements of SO(n), providing conti"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"With only 1.7B parameters and 1700+ hours of pretraining data, RotVLA achieves 98.2% on LIBERO and 89.6% / 88.5% on RoboTwin2.0 under clean and randomized settings, respectively. It also demonstrates strong real-world performance on manipulation tasks, consistently outperforming existing VLA models.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That representing latent actions as elements of SO(n) together with a triplet-frame objective inherently supplies continuity, compositionality, and physically meaningful structure while preventing trivial frame-reconstruction solutions.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"RotVLA models latent actions as continuous SO(n) rotations with triplet-frame supervision and flow-matching to reach 98.2% success on LIBERO and 89.6%/88.5% on RoboTwin2.0 using a 1.7B-parameter model.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"RotVLA replaces discrete action codes with continuous rotations in SO(n) for vision-language-action models.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"a9c8e1113c85f322833bd3fa93239dafd3ee05470983c42e33923972ba7d27b0"},"source":{"id":"2605.13403","kind":"arxiv","version":1},"verdict":{"id":"7f51c42f-7b3a-4437-a206-a8768d87e6b6","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T17:44:25.288826Z","strongest_claim":"With only 1.7B parameters and 1700+ hours of pretraining data, RotVLA achieves 98.2% on LIBERO and 89.6% / 88.5% on RoboTwin2.0 under clean and randomized settings, respectively. It also demonstrates strong real-world performance on manipulation tasks, consistently outperforming existing VLA models.","one_line_summary":"RotVLA models latent actions as continuous SO(n) rotations with triplet-frame supervision and flow-matching to reach 98.2% success on LIBERO and 89.6%/88.5% on RoboTwin2.0 using a 1.7B-parameter model.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That representing latent actions as elements of SO(n) together with a triplet-frame objective inherently supplies continuity, compositionality, and physically meaningful structure while preventing trivial frame-reconstruction solutions.","pith_extraction_headline":"RotVLA replaces discrete action codes with continuous rotations in SO(n) for vision-language-action models."},"references":{"count":86,"sample":[{"doi":"","year":2024,"title":"A Survey on Vision-Language-Action Models for Embodied AI","work_id":"9492fb3d-d667-4892-81bb-b2878f12ff0c","ref_index":1,"cited_arxiv_id":"2405.14093","is_internal_anchor":true},{"doi":"","year":2025,"title":"Roumelio- tis, and Manoj Karkee","work_id":"5f0bf2cc-1901-4ad0-940b-1e742cc6d7e7","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Visual instruction tuning.Advances in neural information processing systems, 36:34892–34916","work_id":"115823a2-8918-4227-8872-3d0a36ff07a9","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Internvl: Scaling up vision foundation models and aligning for generic visual-linguistic tasks","work_id":"321b2bd4-950a-44f0-ab50-e70251e75187","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":5,"cited_arxiv_id":"2502.13923","is_internal_anchor":true}],"resolved_work":86,"snapshot_sha256":"114c3a6b5eda750fac0cfaa8fbe9b84492ddf74d37e389105a4d0f5a207e0207","internal_anchors":28},"formal_canon":{"evidence_count":2,"snapshot_sha256":"61af6b48e18519772de59e4d3a20be8a02b289c706a90aad0c87e7dc08b7a177"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.13403","created_at":"2026-05-18T02:44:47.562862+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.13403v1","created_at":"2026-05-18T02:44:47.562862+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13403","created_at":"2026-05-18T02:44:47.562862+00:00"},{"alias_kind":"pith_short_12","alias_value":"7UNFV772FNRC","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"7UNFV772FNRC5AX6","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"7UNFV772","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW","json":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW.json","graph_json":"https://pith.science/api/pith-number/7UNFV772FNRC5AX63ECEBJJNLW/graph.json","events_json":"https://pith.science/api/pith-number/7UNFV772FNRC5AX63ECEBJJNLW/events.json","paper":"https://pith.science/paper/7UNFV772"},"agent_actions":{"view_html":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW","download_json":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW.json","view_paper":"https://pith.science/paper/7UNFV772","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.13403&json=true","fetch_graph":"https://pith.science/api/pith-number/7UNFV772FNRC5AX63ECEBJJNLW/graph.json","fetch_events":"https://pith.science/api/pith-number/7UNFV772FNRC5AX63ECEBJJNLW/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW/action/timestamp_anchor","attest_storage":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW/action/storage_attestation","attest_author":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW/action/author_attestation","sign_citation":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW/action/citation_signature","submit_replication":"https://pith.science/pith/7UNFV772FNRC5AX63ECEBJJNLW/action/replication_record"}},"created_at":"2026-05-18T02:44:47.562862+00:00","updated_at":"2026-05-18T02:44:47.562862+00:00"}