{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:2IESB3WMK23AGYDUTI5ZDPZ3KL","short_pith_number":"pith:2IESB3WM","schema_version":"1.0","canonical_sha256":"d20920eecc56b60360749a3b91bf3b52f057455c9e45fb5659c1d02b25ff9a62","source":{"kind":"arxiv","id":"2510.27607","version":3},"attestation_state":"computed","paper":{"title":"Dual-Stream Diffusion for World-Model Augmented Vision-Language-Action Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Dongyoung Kim, Huiwon Jang, Jinwoo Shin, John Won, Kyungmin Lee","submitted_at":"2025-10-31T16:32:12Z","abstract_excerpt":"Augmenting vision-language-action models (VLAs) with world models is promising for robotic policy learning but faces challenges in jointly predicting states and actions due to the modality gap. To address this, we propose DUal-STream diffusion (DUST), a world-model augmented VLA framework featuring a multimodal diffusion transformer that maintains separate modality streams while enabling cross-modal knowledge sharing. In addition, DUST utilizes independent noise perturbations and a decoupled flow matching loss to learn cross-modal causal relationships. We further introduce an asynchronous samp"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2510.27607","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-31T16:32:12Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"5bcf56f378723d61bb225af5b28e2dbc3d541a7c822da2a40b730ecf227f3db0","abstract_canon_sha256":"b092c91f41b8a1245cb75a7303e0f53df7a1eef21d91747577ed6fb1ad5f5e7b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T01:04:57.684526Z","signature_b64":"RFgE1S5gF9mHtKMdJDAWPa1cDhh1iKhEH4EqgnmsAMysKvhM5rz+41xwh3JUVXIqJDrBs8QjGKZCrhG3IGB+CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d20920eecc56b60360749a3b91bf3b52f057455c9e45fb5659c1d02b25ff9a62","last_reissued_at":"2026-05-29T01:04:57.684005Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T01:04:57.684005Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Dual-Stream Diffusion for World-Model Augmented Vision-Language-Action Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Dongyoung Kim, Huiwon Jang, Jinwoo Shin, John Won, Kyungmin Lee","submitted_at":"2025-10-31T16:32:12Z","abstract_excerpt":"Augmenting vision-language-action models (VLAs) with world models is promising for robotic policy learning but faces challenges in jointly predicting states and actions due to the modality gap. To address this, we propose DUal-STream diffusion (DUST), a world-model augmented VLA framework featuring a multimodal diffusion transformer that maintains separate modality streams while enabling cross-modal knowledge sharing. In addition, DUST utilizes independent noise perturbations and a decoupled flow matching loss to learn cross-modal causal relationships. We further introduce an asynchronous samp"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2510.27607","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2510.27607/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2510.27607","created_at":"2026-05-29T01:04:57.684066+00:00"},{"alias_kind":"arxiv_version","alias_value":"2510.27607v3","created_at":"2026-05-29T01:04:57.684066+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.27607","created_at":"2026-05-29T01:04:57.684066+00:00"},{"alias_kind":"pith_short_12","alias_value":"2IESB3WMK23A","created_at":"2026-05-29T01:04:57.684066+00:00"},{"alias_kind":"pith_short_16","alias_value":"2IESB3WMK23AGYDU","created_at":"2026-05-29T01:04:57.684066+00:00"},{"alias_kind":"pith_short_8","alias_value":"2IESB3WM","created_at":"2026-05-29T01:04:57.684066+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":11,"internal_anchor_count":11,"sample":[{"citing_arxiv_id":"2605.23856","citing_title":"Point Tracking Improves World Action Models","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2512.14234","citing_title":"ViBES: A Conversational Agent with Behaviorally-Intelligent 3D Virtual Body","ref_index":114,"is_internal_anchor":true},{"citing_arxiv_id":"2603.16666","citing_title":"Fast-WAM: Do World Action Models Need Test-time Future Imagination?","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12090","citing_title":"World Action Models: The Next Frontier in Embodied AI","ref_index":107,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12162","citing_title":"X-Imitator: Spatial-Aware Imitation Learning via Bidirectional Action-Pose Interaction","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"2601.16163","citing_title":"Cosmos Policy: Fine-Tuning Video Models for Visuomotor Control and Planning","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03269","citing_title":"RLDX-1 Technical Report","ref_index":108,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10942","citing_title":"HarmoWAM: Harmonizing Generalizable and Precise Manipulation via Adaptive World Action Models","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2602.15922","citing_title":"World Action Models are Zero-shot Policies","ref_index":80,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09330","citing_title":"VAG: Dual-Stream Video-Action Generation for Embodied Data Synthesis","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03269","citing_title":"RLDX-1 Technical Report","ref_index":108,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL","json":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL.json","graph_json":"https://pith.science/api/pith-number/2IESB3WMK23AGYDUTI5ZDPZ3KL/graph.json","events_json":"https://pith.science/api/pith-number/2IESB3WMK23AGYDUTI5ZDPZ3KL/events.json","paper":"https://pith.science/paper/2IESB3WM"},"agent_actions":{"view_html":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL","download_json":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL.json","view_paper":"https://pith.science/paper/2IESB3WM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2510.27607&json=true","fetch_graph":"https://pith.science/api/pith-number/2IESB3WMK23AGYDUTI5ZDPZ3KL/graph.json","fetch_events":"https://pith.science/api/pith-number/2IESB3WMK23AGYDUTI5ZDPZ3KL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL/action/storage_attestation","attest_author":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL/action/author_attestation","sign_citation":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL/action/citation_signature","submit_replication":"https://pith.science/pith/2IESB3WMK23AGYDUTI5ZDPZ3KL/action/replication_record"}},"created_at":"2026-05-29T01:04:57.684066+00:00","updated_at":"2026-05-29T01:04:57.684066+00:00"}