{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:WUMYKSR74ZAYOLM2FTE775KAD2","short_pith_number":"pith:WUMYKSR7","canonical_record":{"source":{"id":"2510.12796","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-14T17:59:47Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"0d6fd76fc2dd6307b1b71f8456c10fcefcf50453f3ad221fbbd9a0ed2deee3a4","abstract_canon_sha256":"b0c36fc3151591d8a7b9a6ce74f682f50eb4bcf76c0dc2eaedbfb740b51fb4c2"},"schema_version":"1.0"},"canonical_sha256":"b519854a3fe641872d9a2cc9fff5401e9cd4a492cba5a529daca1605f6176e25","source":{"kind":"arxiv","id":"2510.12796","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.12796","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2510.12796v2","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.12796","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"WUMYKSR74ZAY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WUMYKSR74ZAYOLM2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WUMYKSR7","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:WUMYKSR74ZAYOLM2FTE775KAD2","target":"record","payload":{"canonical_record":{"source":{"id":"2510.12796","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-14T17:59:47Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"0d6fd76fc2dd6307b1b71f8456c10fcefcf50453f3ad221fbbd9a0ed2deee3a4","abstract_canon_sha256":"b0c36fc3151591d8a7b9a6ce74f682f50eb4bcf76c0dc2eaedbfb740b51fb4c2"},"schema_version":"1.0"},"canonical_sha256":"b519854a3fe641872d9a2cc9fff5401e9cd4a492cba5a529daca1605f6176e25","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:14.790072Z","signature_b64":"sa/EH0jNsHgde/0PlWKyaSWADuc2YMGDlF2l1nbsatRhgSAbE+uWXng1nOJ4Qn/JvkSAuDF+oo5q+EPVcbL5BA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b519854a3fe641872d9a2cc9fff5401e9cd4a492cba5a529daca1605f6176e25","last_reissued_at":"2026-05-17T23:38:14.789505Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:14.789505Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2510.12796","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"bEThXCyvmHuBIcmxE298sFv52IEtSBxM6d7mURN9NphyZF+HLvQ0QGlHs9rvFbsobNgUR/6E1mMwkOFpla9TDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T16:31:44.871938Z"},"content_sha256":"4a3e38c6d584818462ca459367fd94d4a05ec5508a8393f99cb4c0f035b79772","schema_version":"1.0","event_id":"sha256:4a3e38c6d584818462ca459367fd94d4a05ec5508a8393f99cb4c0f035b79772"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:WUMYKSR74ZAYOLM2FTE775KAD2","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"DriveVLA-W0: World Models Amplify Data Scaling Law in Autonomous Driving","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Adding world modeling to predict future images lets vision-language-action models use large driving datasets more effectively and accelerate performance gains as data scales.","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Bing Zhan, Chufeng Tang, Haochen Wang, Lue Fan, Lu Hou, Shuyao Shang, Weisong Liu, Xiaoman Wang, Yasong An, Yingyan Li, Yuntao Chen, Yuqi Wang, Zhaoxiang Zhang","submitted_at":"2025-10-14T17:59:47Z","abstract_excerpt":"Scaling Vision-Language-Action (VLA) models on large-scale data offers a promising path to achieving a more generalized driving intelligence. However, VLA models are limited by a ``supervision deficit'': the vast model capacity is supervised by sparse, low-dimensional actions, leaving much of their representational power underutilized. To remedy this, we propose \\textbf{DriveVLA-W0}, a training paradigm that employs world modeling to predict future images. This task generates a dense, self-supervised signal that compels the model to learn the underlying dynamics of the driving environment. We "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"we propose DriveVLA-W0, a training paradigm that employs world modeling to predict future images. ... Crucially, it amplifies the data scaling law, showing that performance gains accelerate as the training dataset size increases.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the added world modeling task of predicting future images supplies a dense, unbiased self-supervised signal that meaningfully utilizes unused model capacity without requiring extra labels or introducing new failure modes in driving dynamics.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"DriveVLA-W0 adds world modeling to predict future images in VLA models, overcoming sparse action supervision and amplifying data scaling laws on NAVSIM benchmarks and a large in-house dataset.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Adding world modeling to predict future images lets vision-language-action models use large driving datasets more effectively and accelerate performance gains as data scales.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"b1bf9098c357bc593fcd6e5671323d349b63e1254a40e7a4893235c251225b2a"},"source":{"id":"2510.12796","kind":"arxiv","version":2},"verdict":{"id":"cf6793d1-473e-4e13-9da8-1ff37c4b0c32","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T06:42:20.341660Z","strongest_claim":"we propose DriveVLA-W0, a training paradigm that employs world modeling to predict future images. ... Crucially, it amplifies the data scaling law, showing that performance gains accelerate as the training dataset size increases.","one_line_summary":"DriveVLA-W0 adds world modeling to predict future images in VLA models, overcoming sparse action supervision and amplifying data scaling laws on NAVSIM benchmarks and a large in-house dataset.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the added world modeling task of predicting future images supplies a dense, unbiased self-supervised signal that meaningfully utilizes unused model capacity without requiring extra labels or introducing new failure modes in driving dynamics.","pith_extraction_headline":"Adding world modeling to predict future images lets vision-language-action models use large driving datasets more effectively and accelerate performance gains as data scales."},"references":{"count":39,"sample":[{"doi":"","year":2025,"title":"Covla: Comprehensive vision-language-action dataset for autonomous driving","work_id":"6783599d-5a5e-4a21-a4b0-e2ed5582cf30","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":2,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":null,"title":"Scaling Laws of Mo- tion Forecasting and Planning – Technical Report","work_id":"dfe35d03-cdc6-4f73-940e-1ae1ceb82a53","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Vavim and vavam: Autonomous driving through video generative modeling","work_id":"b75ea66d-dafb-43ec-9345-f50eb3d615e2","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","ref_index":6,"cited_arxiv_id":"2410.24164","is_internal_anchor":true}],"resolved_work":39,"snapshot_sha256":"81a2968186f58f7db9dd6e0c27c178cccb4f6c53537cbf5a001d03d57aa9f2f7","internal_anchors":15},"formal_canon":{"evidence_count":2,"snapshot_sha256":"854227cfb28bcd6bbc663def7c2d1a8978711bed00d69c4e32b3263629f9c637"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"cf6793d1-473e-4e13-9da8-1ff37c4b0c32"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:14Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"sotuanX5+5psDY6bsymuozHejqUl5a9TB3Y6jl7AKAB9gU0fy/bsr4QoKlva0UHaHWdjdHSdfnJQhG/qlkCrAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T16:31:44.873052Z"},"content_sha256":"c7e0ce2ad011667d9f1a831dc83794992d490f84a5b49334453a370657759020","schema_version":"1.0","event_id":"sha256:c7e0ce2ad011667d9f1a831dc83794992d490f84a5b49334453a370657759020"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/WUMYKSR74ZAYOLM2FTE775KAD2/bundle.json","state_url":"https://pith.science/pith/WUMYKSR74ZAYOLM2FTE775KAD2/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/WUMYKSR74ZAYOLM2FTE775KAD2/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T16:31:44Z","links":{"resolver":"https://pith.science/pith/WUMYKSR74ZAYOLM2FTE775KAD2","bundle":"https://pith.science/pith/WUMYKSR74ZAYOLM2FTE775KAD2/bundle.json","state":"https://pith.science/pith/WUMYKSR74ZAYOLM2FTE775KAD2/state.json","well_known_bundle":"https://pith.science/.well-known/pith/WUMYKSR74ZAYOLM2FTE775KAD2/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:WUMYKSR74ZAYOLM2FTE775KAD2","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"b0c36fc3151591d8a7b9a6ce74f682f50eb4bcf76c0dc2eaedbfb740b51fb4c2","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-14T17:59:47Z","title_canon_sha256":"0d6fd76fc2dd6307b1b71f8456c10fcefcf50453f3ad221fbbd9a0ed2deee3a4"},"schema_version":"1.0","source":{"id":"2510.12796","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2510.12796","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"arxiv_version","alias_value":"2510.12796v2","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2510.12796","created_at":"2026-05-17T23:38:14Z"},{"alias_kind":"pith_short_12","alias_value":"WUMYKSR74ZAY","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WUMYKSR74ZAYOLM2","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WUMYKSR7","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:c7e0ce2ad011667d9f1a831dc83794992d490f84a5b49334453a370657759020","target":"graph","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"we propose DriveVLA-W0, a training paradigm that employs world modeling to predict future images. ... Crucially, it amplifies the data scaling law, showing that performance gains accelerate as the training dataset size increases."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the added world modeling task of predicting future images supplies a dense, unbiased self-supervised signal that meaningfully utilizes unused model capacity without requiring extra labels or introducing new failure modes in driving dynamics."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"DriveVLA-W0 adds world modeling to predict future images in VLA models, overcoming sparse action supervision and amplifying data scaling laws on NAVSIM benchmarks and a large in-house dataset."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Adding world modeling to predict future images lets vision-language-action models use large driving datasets more effectively and accelerate performance gains as data scales."}],"snapshot_sha256":"b1bf9098c357bc593fcd6e5671323d349b63e1254a40e7a4893235c251225b2a"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"854227cfb28bcd6bbc663def7c2d1a8978711bed00d69c4e32b3263629f9c637"},"paper":{"abstract_excerpt":"Scaling Vision-Language-Action (VLA) models on large-scale data offers a promising path to achieving a more generalized driving intelligence. However, VLA models are limited by a ``supervision deficit'': the vast model capacity is supervised by sparse, low-dimensional actions, leaving much of their representational power underutilized. To remedy this, we propose \\textbf{DriveVLA-W0}, a training paradigm that employs world modeling to predict future images. This task generates a dense, self-supervised signal that compels the model to learn the underlying dynamics of the driving environment. We ","authors_text":"Bing Zhan, Chufeng Tang, Haochen Wang, Lue Fan, Lu Hou, Shuyao Shang, Weisong Liu, Xiaoman Wang, Yasong An, Yingyan Li, Yuntao Chen, Yuqi Wang, Zhaoxiang Zhang","cross_cats":["cs.AI"],"headline":"Adding world modeling to predict future images lets vision-language-action models use large driving datasets more effectively and accelerate performance gains as data scales.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-14T17:59:47Z","title":"DriveVLA-W0: World Models Amplify Data Scaling Law in Autonomous Driving"},"references":{"count":39,"internal_anchors":15,"resolved_work":39,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Covla: Comprehensive vision-language-action dataset for autonomous driving","work_id":"6783599d-5a5e-4a21-a4b0-e2ed5582cf30","year":2025},{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Scaling Laws of Mo- tion Forecasting and Planning – Technical Report","work_id":"dfe35d03-cdc6-4f73-940e-1ae1ceb82a53","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Vavim and vavam: Autonomous driving through video generative modeling","work_id":"b75ea66d-dafb-43ec-9345-f50eb3d615e2","year":null},{"cited_arxiv_id":"2410.24164","doi":"","is_internal_anchor":true,"ref_index":6,"title":"$\\pi_0$: A Vision-Language-Action Flow Model for General Robot Control","work_id":"f790abdc-a796-482f-a40d-f8ee035ecfc2","year":null}],"snapshot_sha256":"81a2968186f58f7db9dd6e0c27c178cccb4f6c53537cbf5a001d03d57aa9f2f7"},"source":{"id":"2510.12796","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-17T06:42:20.341660Z","id":"cf6793d1-473e-4e13-9da8-1ff37c4b0c32","model_set":{"reader":"grok-4.3"},"one_line_summary":"DriveVLA-W0 adds world modeling to predict future images in VLA models, overcoming sparse action supervision and amplifying data scaling laws on NAVSIM benchmarks and a large in-house dataset.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Adding world modeling to predict future images lets vision-language-action models use large driving datasets more effectively and accelerate performance gains as data scales.","strongest_claim":"we propose DriveVLA-W0, a training paradigm that employs world modeling to predict future images. ... Crucially, it amplifies the data scaling law, showing that performance gains accelerate as the training dataset size increases.","weakest_assumption":"That the added world modeling task of predicting future images supplies a dense, unbiased self-supervised signal that meaningfully utilizes unused model capacity without requiring extra labels or introducing new failure modes in driving dynamics."}},"verdict_id":"cf6793d1-473e-4e13-9da8-1ff37c4b0c32"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:4a3e38c6d584818462ca459367fd94d4a05ec5508a8393f99cb4c0f035b79772","target":"record","created_at":"2026-05-17T23:38:14Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"b0c36fc3151591d8a7b9a6ce74f682f50eb4bcf76c0dc2eaedbfb740b51fb4c2","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-10-14T17:59:47Z","title_canon_sha256":"0d6fd76fc2dd6307b1b71f8456c10fcefcf50453f3ad221fbbd9a0ed2deee3a4"},"schema_version":"1.0","source":{"id":"2510.12796","kind":"arxiv","version":2}},"canonical_sha256":"b519854a3fe641872d9a2cc9fff5401e9cd4a492cba5a529daca1605f6176e25","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b519854a3fe641872d9a2cc9fff5401e9cd4a492cba5a529daca1605f6176e25","first_computed_at":"2026-05-17T23:38:14.789505Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:14.789505Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"sa/EH0jNsHgde/0PlWKyaSWADuc2YMGDlF2l1nbsatRhgSAbE+uWXng1nOJ4Qn/JvkSAuDF+oo5q+EPVcbL5BA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:14.790072Z","signed_message":"canonical_sha256_bytes"},"source_id":"2510.12796","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:4a3e38c6d584818462ca459367fd94d4a05ec5508a8393f99cb4c0f035b79772","sha256:c7e0ce2ad011667d9f1a831dc83794992d490f84a5b49334453a370657759020"],"state_sha256":"1ca9c8b89356e90cc030571f8fb3534f861aeddfc87079783344f96f162e3b90"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9Q1HLxxVqQw5rMjriJZWp2wp+/4Ykdmrcev1sctAWr3HWHR3peANNhXlz0qIXoZ29Czo7T0btlmsBR4jK+2tAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T16:31:44.877177Z","bundle_sha256":"a7756c3ee4412c2ee2519ec9b682728967c231d20616ff73a6542c997c42c268"}}