{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:DOUVTC4ESLWD424PE5DBCFLMUV","short_pith_number":"pith:DOUVTC4E","schema_version":"1.0","canonical_sha256":"1ba9598b8492ec3e6b8f274611156ca577a80f1decac602e61c957ef2dbea89a","source":{"kind":"arxiv","id":"2508.20072","version":4},"attestation_state":"computed","paper":{"title":"Discrete Diffusion VLA: Bringing Discrete Diffusion to Action Decoding in Vision-Language-Action Policies","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","cs.RO"],"primary_cat":"cs.CV","authors_text":"Chengyue Wu, Jiangmiao Pang, Liuao Pei, Ping Luo, Shunbo Zhou, Sitong Mao, Tian Nian, Tianshuo Yang, Xiaokang Yang, Yao Mu, Yizhuo Li, Zhixuan Liang","submitted_at":"2025-08-27T17:39:11Z","abstract_excerpt":"Vision-Language-Action (VLA) models adapt large vision-language backbones to map images and instructions into robot actions. However, prevailing VLAs either generate actions autoregressively in a fixed left-to-right order with poor performance or attach separate diffusion heads outside the backbone that fragments information pathways and hinders unified, scalable architectures. Instead, we present Discrete Diffusion VLA that discretizes action chunks and models them with discrete diffusion pattern retaining progressive refinement inside the unified transformer backbone. Our method achieves an "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2508.20072","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2025-08-27T17:39:11Z","cross_cats_sorted":["cs.LG","cs.RO"],"title_canon_sha256":"d00087b7c8e648993aa002a5ca809206dbaefe70ee6f43e73a7426022b0a1c60","abstract_canon_sha256":"4706605381a662bbffad575f0c9bbc8db08f3bbeb86cc0dbf96355c70054224c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:08.354713Z","signature_b64":"vNivRRR+2e+qOUkeBl7WbE9gq+zgOYlgeCcb5FcLhfH7a3jQQyl09UaE3C3F3vgQTTNeh84dTD2eA8skHex4BQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1ba9598b8492ec3e6b8f274611156ca577a80f1decac602e61c957ef2dbea89a","last_reissued_at":"2026-06-02T02:04:08.353829Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:08.353829Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Discrete Diffusion VLA: Bringing Discrete Diffusion to Action Decoding in Vision-Language-Action Policies","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","cs.RO"],"primary_cat":"cs.CV","authors_text":"Chengyue Wu, Jiangmiao Pang, Liuao Pei, Ping Luo, Shunbo Zhou, Sitong Mao, Tian Nian, Tianshuo Yang, Xiaokang Yang, Yao Mu, Yizhuo Li, Zhixuan Liang","submitted_at":"2025-08-27T17:39:11Z","abstract_excerpt":"Vision-Language-Action (VLA) models adapt large vision-language backbones to map images and instructions into robot actions. However, prevailing VLAs either generate actions autoregressively in a fixed left-to-right order with poor performance or attach separate diffusion heads outside the backbone that fragments information pathways and hinders unified, scalable architectures. Instead, we present Discrete Diffusion VLA that discretizes action chunks and models them with discrete diffusion pattern retaining progressive refinement inside the unified transformer backbone. Our method achieves an "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2508.20072","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2508.20072/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2508.20072","created_at":"2026-06-02T02:04:08.353932+00:00"},{"alias_kind":"arxiv_version","alias_value":"2508.20072v4","created_at":"2026-06-02T02:04:08.353932+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2508.20072","created_at":"2026-06-02T02:04:08.353932+00:00"},{"alias_kind":"pith_short_12","alias_value":"DOUVTC4ESLWD","created_at":"2026-06-02T02:04:08.353932+00:00"},{"alias_kind":"pith_short_16","alias_value":"DOUVTC4ESLWD424P","created_at":"2026-06-02T02:04:08.353932+00:00"},{"alias_kind":"pith_short_8","alias_value":"DOUVTC4E","created_at":"2026-06-02T02:04:08.353932+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":18,"internal_anchor_count":18,"sample":[{"citing_arxiv_id":"2602.19710","citing_title":"Universal Pose Pretraining for Generalizable Vision-Language-Action Policies","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2602.12978","citing_title":"Learning Native Continuation for Action Chunking Flow Policies","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19282","citing_title":"Rethinking Muon Beyond Pretraining: Spectral Failures and High-Pass Remedies for VLA and RLVR","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2511.14148","citing_title":"AsyncVLA: Asynchronous Flow Matching for Vision-Language-Action Models","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2602.11236","citing_title":"ABot-M0: VLA Foundation Model for Robotic Manipulation with Action Manifold Learning","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11459","citing_title":"Overcoming Dynamics-Blindness: Training-Free Pace-and-Path Correction for VLA Models","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13382","citing_title":"BlockVLA: Accelerating Autoregressive VLA via Block Diffusion Finetuning","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11832","citing_title":"Learning Action Manifold with Multi-view Latent Priors for Robotic Manipulation","ref_index":81,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12369","citing_title":"GuidedVLA: Specifying Task-Relevant Factors via Plug-and-Play Action Attention Specialization","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11459","citing_title":"Overcoming Dynamics-Blindness: Training-Free Pace-and-Path Correction for VLA Models","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2601.21998","citing_title":"Causal World Modeling for Robot Control","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09302","citing_title":"Discrete Langevin-Inspired Posterior Sampling","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10925","citing_title":"PriorVLA: Prior-Preserving Adaptation for Vision-Language-Action Models","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2604.25050","citing_title":"DiscreteRTC: Discrete Diffusion Policies are Natural Asynchronous Executors","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2604.22152","citing_title":"dWorldEval: Scalable Robotic Policy Evaluation via Discrete Diffusion World Model","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00078","citing_title":"Being-H0.7: A Latent World-Action Model from Egocentric Videos","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2604.20472","citing_title":"Temporal Difference Calibration in Sequential Tasks: Application to Vision-Language-Action Models","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14732","citing_title":"World-Value-Action Model: Implicit Planning for Vision-Language-Action Systems","ref_index":19,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV","json":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV.json","graph_json":"https://pith.science/api/pith-number/DOUVTC4ESLWD424PE5DBCFLMUV/graph.json","events_json":"https://pith.science/api/pith-number/DOUVTC4ESLWD424PE5DBCFLMUV/events.json","paper":"https://pith.science/paper/DOUVTC4E"},"agent_actions":{"view_html":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV","download_json":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV.json","view_paper":"https://pith.science/paper/DOUVTC4E","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2508.20072&json=true","fetch_graph":"https://pith.science/api/pith-number/DOUVTC4ESLWD424PE5DBCFLMUV/graph.json","fetch_events":"https://pith.science/api/pith-number/DOUVTC4ESLWD424PE5DBCFLMUV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV/action/storage_attestation","attest_author":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV/action/author_attestation","sign_citation":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV/action/citation_signature","submit_replication":"https://pith.science/pith/DOUVTC4ESLWD424PE5DBCFLMUV/action/replication_record"}},"created_at":"2026-06-02T02:04:08.353932+00:00","updated_at":"2026-06-02T02:04:08.353932+00:00"}