{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:BYW6BAGAE3LN74WJFDAUFFXDKV","short_pith_number":"pith:BYW6BAGA","canonical_record":{"source":{"id":"2603.14851","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-16T05:50:31Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"93860400555a6041304a7472b3dbd56c6a1cfca83a995f3c6733d8b65bb6d597","abstract_canon_sha256":"5a979f659fa168f61dd8846d36cd771f0c20bdb2fc2aa03ae3ed19ebdb7ad642"},"schema_version":"1.0"},"canonical_sha256":"0e2de080c026d6dff2c928c14296e3555cc9c2b11beb506f886ca8297308b5f7","source":{"kind":"arxiv","id":"2603.14851","version":3},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.14851","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"arxiv_version","alias_value":"2603.14851v3","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.14851","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"pith_short_12","alias_value":"BYW6BAGAE3LN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BYW6BAGAE3LN74WJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BYW6BAGA","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:BYW6BAGAE3LN74WJFDAUFFXDKV","target":"record","payload":{"canonical_record":{"source":{"id":"2603.14851","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-16T05:50:31Z","cross_cats_sorted":["cs.RO"],"title_canon_sha256":"93860400555a6041304a7472b3dbd56c6a1cfca83a995f3c6733d8b65bb6d597","abstract_canon_sha256":"5a979f659fa168f61dd8846d36cd771f0c20bdb2fc2aa03ae3ed19ebdb7ad642"},"schema_version":"1.0"},"canonical_sha256":"0e2de080c026d6dff2c928c14296e3555cc9c2b11beb506f886ca8297308b5f7","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:59.646667Z","signature_b64":"O+Znf6MPGdUrvFKhZNOJQYMgHWdpygTzt3NzlvFf2pFnyC/e8jVb/mZJA+qdDFg5SUhz6JxDI4lj1AsF52LhBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0e2de080c026d6dff2c928c14296e3555cc9c2b11beb506f886ca8297308b5f7","last_reissued_at":"2026-05-17T23:38:59.645972Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:59.645972Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2603.14851","source_version":3,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:59Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yQxtZlDIfMp0tOL8E5f7qKH9o6Iecu7hIhSl++EPipcWiyFlGLcv50g6FdEX7ExIJvVHYAyPFr4R28fULImSAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T12:05:10.790604Z"},"content_sha256":"c624f10e46111974eb1810e361cdc492e2cbfb47becd4bf29a4de6ee5d04a72e","schema_version":"1.0","event_id":"sha256:c624f10e46111974eb1810e361cdc492e2cbfb47becd4bf29a4de6ee5d04a72e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:BYW6BAGAE3LN74WJFDAUFFXDKV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"AutoMoT: A Unified Vision-Language-Action Model with Asynchronous Mixture-of-Transformers for End-to-End Autonomous Driving","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"AutoMoT unifies vision-language reasoning and driving actions in one model using asynchronous mixture-of-transformers.","cross_cats":["cs.RO"],"primary_cat":"cs.CV","authors_text":"Chen Lv, Collister Chua, Long Chen, Qihang Huang, Songyan Zhang, Wenhui Huang, Zhan Chen, Zhidong Wang, Zhiqi Mao","submitted_at":"2026-03-16T05:50:31Z","abstract_excerpt":"Integrating vision-language models (VLMs) into end-to-end (E2E) autonomous driving (AD) systems has shown promise in improving scene understanding. However, existing integration strategies suffer from several limitations: they either struggle to resolve distribution misalignment between reasoning and action spaces, underexploit the general reasoning capabilities of pretrained VLMs, or incur substantial inference latency during action policy generation, which degrades driving performance. To address these challenges, we propose AutoMoT in this work, an end-to-end AD framework that unifies reaso"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"AutoMoT achieves competitive performance compared to state-of-the-art methods on multiple benchmarks under both open- and closed-loop settings, while pre-trained VLMs achieve competitive multi-task scene understanding through semantic prompting alone.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That joint attention sharing in the mixture-of-transformers architecture preserves the general reasoning capabilities of pre-trained VLMs without degradation while enabling efficient asynchronous fast-slow inference.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"AutoMoT unifies vision, language, and action in one model via asynchronous mixture-of-transformers, achieving competitive benchmark performance in autonomous driving while showing pre-trained VLMs suffice for scene understanding but need fine-tuning for actions.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"AutoMoT unifies vision-language reasoning and driving actions in one model using asynchronous mixture-of-transformers.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"4b464ad137385c3c3b5a3060f0ddf40fb619ed82e40e52676296e705e56efcb9"},"source":{"id":"2603.14851","kind":"arxiv","version":3},"verdict":{"id":"51c7e0d1-a27c-4429-a994-52c90d229719","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T10:31:57.898709Z","strongest_claim":"AutoMoT achieves competitive performance compared to state-of-the-art methods on multiple benchmarks under both open- and closed-loop settings, while pre-trained VLMs achieve competitive multi-task scene understanding through semantic prompting alone.","one_line_summary":"AutoMoT unifies vision, language, and action in one model via asynchronous mixture-of-transformers, achieving competitive benchmark performance in autonomous driving while showing pre-trained VLMs suffice for scene understanding but need fine-tuning for actions.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That joint attention sharing in the mixture-of-transformers architecture preserves the general reasoning capabilities of pre-trained VLMs without degradation while enabling efficient asynchronous fast-slow inference.","pith_extraction_headline":"AutoMoT unifies vision-language reasoning and driving actions in one model using asynchronous mixture-of-transformers."},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"576eef3bfc09c5d0a342c1078524e0e958aeec523bf23ac38c37a877b0467d08"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"51c7e0d1-a27c-4429-a994-52c90d229719"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:59Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hd1ob36DFwzsRZ2lX1Fwd0LeL2YrjuOm3GxmgCCNpw8gJAMvB9opm35NUvFM1QWo/040iqfa5RmCyJwspxvtAQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-30T12:05:10.791485Z"},"content_sha256":"098b8c495d4cd644908c3385315c011a731675f6671f800d48568e0807049d2d","schema_version":"1.0","event_id":"sha256:098b8c495d4cd644908c3385315c011a731675f6671f800d48568e0807049d2d"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/BYW6BAGAE3LN74WJFDAUFFXDKV/bundle.json","state_url":"https://pith.science/pith/BYW6BAGAE3LN74WJFDAUFFXDKV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/BYW6BAGAE3LN74WJFDAUFFXDKV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-30T12:05:10Z","links":{"resolver":"https://pith.science/pith/BYW6BAGAE3LN74WJFDAUFFXDKV","bundle":"https://pith.science/pith/BYW6BAGAE3LN74WJFDAUFFXDKV/bundle.json","state":"https://pith.science/pith/BYW6BAGAE3LN74WJFDAUFFXDKV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/BYW6BAGAE3LN74WJFDAUFFXDKV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:BYW6BAGAE3LN74WJFDAUFFXDKV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"5a979f659fa168f61dd8846d36cd771f0c20bdb2fc2aa03ae3ed19ebdb7ad642","cross_cats_sorted":["cs.RO"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-16T05:50:31Z","title_canon_sha256":"93860400555a6041304a7472b3dbd56c6a1cfca83a995f3c6733d8b65bb6d597"},"schema_version":"1.0","source":{"id":"2603.14851","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.14851","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"arxiv_version","alias_value":"2603.14851v3","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.14851","created_at":"2026-05-17T23:38:59Z"},{"alias_kind":"pith_short_12","alias_value":"BYW6BAGAE3LN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"BYW6BAGAE3LN74WJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"BYW6BAGA","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:098b8c495d4cd644908c3385315c011a731675f6671f800d48568e0807049d2d","target":"graph","created_at":"2026-05-17T23:38:59Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"AutoMoT achieves competitive performance compared to state-of-the-art methods on multiple benchmarks under both open- and closed-loop settings, while pre-trained VLMs achieve competitive multi-task scene understanding through semantic prompting alone."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That joint attention sharing in the mixture-of-transformers architecture preserves the general reasoning capabilities of pre-trained VLMs without degradation while enabling efficient asynchronous fast-slow inference."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"AutoMoT unifies vision, language, and action in one model via asynchronous mixture-of-transformers, achieving competitive benchmark performance in autonomous driving while showing pre-trained VLMs suffice for scene understanding but need fine-tuning for actions."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"AutoMoT unifies vision-language reasoning and driving actions in one model using asynchronous mixture-of-transformers."}],"snapshot_sha256":"4b464ad137385c3c3b5a3060f0ddf40fb619ed82e40e52676296e705e56efcb9"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"576eef3bfc09c5d0a342c1078524e0e958aeec523bf23ac38c37a877b0467d08"},"paper":{"abstract_excerpt":"Integrating vision-language models (VLMs) into end-to-end (E2E) autonomous driving (AD) systems has shown promise in improving scene understanding. However, existing integration strategies suffer from several limitations: they either struggle to resolve distribution misalignment between reasoning and action spaces, underexploit the general reasoning capabilities of pretrained VLMs, or incur substantial inference latency during action policy generation, which degrades driving performance. To address these challenges, we propose AutoMoT in this work, an end-to-end AD framework that unifies reaso","authors_text":"Chen Lv, Collister Chua, Long Chen, Qihang Huang, Songyan Zhang, Wenhui Huang, Zhan Chen, Zhidong Wang, Zhiqi Mao","cross_cats":["cs.RO"],"headline":"AutoMoT unifies vision-language reasoning and driving actions in one model using asynchronous mixture-of-transformers.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-16T05:50:31Z","title":"AutoMoT: A Unified Vision-Language-Action Model with Asynchronous Mixture-of-Transformers for End-to-End Autonomous Driving"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.14851","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T10:31:57.898709Z","id":"51c7e0d1-a27c-4429-a994-52c90d229719","model_set":{"reader":"grok-4.3"},"one_line_summary":"AutoMoT unifies vision, language, and action in one model via asynchronous mixture-of-transformers, achieving competitive benchmark performance in autonomous driving while showing pre-trained VLMs suffice for scene understanding but need fine-tuning for actions.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"AutoMoT unifies vision-language reasoning and driving actions in one model using asynchronous mixture-of-transformers.","strongest_claim":"AutoMoT achieves competitive performance compared to state-of-the-art methods on multiple benchmarks under both open- and closed-loop settings, while pre-trained VLMs achieve competitive multi-task scene understanding through semantic prompting alone.","weakest_assumption":"That joint attention sharing in the mixture-of-transformers architecture preserves the general reasoning capabilities of pre-trained VLMs without degradation while enabling efficient asynchronous fast-slow inference."}},"verdict_id":"51c7e0d1-a27c-4429-a994-52c90d229719"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:c624f10e46111974eb1810e361cdc492e2cbfb47becd4bf29a4de6ee5d04a72e","target":"record","created_at":"2026-05-17T23:38:59Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"5a979f659fa168f61dd8846d36cd771f0c20bdb2fc2aa03ae3ed19ebdb7ad642","cross_cats_sorted":["cs.RO"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-03-16T05:50:31Z","title_canon_sha256":"93860400555a6041304a7472b3dbd56c6a1cfca83a995f3c6733d8b65bb6d597"},"schema_version":"1.0","source":{"id":"2603.14851","kind":"arxiv","version":3}},"canonical_sha256":"0e2de080c026d6dff2c928c14296e3555cc9c2b11beb506f886ca8297308b5f7","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"0e2de080c026d6dff2c928c14296e3555cc9c2b11beb506f886ca8297308b5f7","first_computed_at":"2026-05-17T23:38:59.645972Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:59.645972Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"O+Znf6MPGdUrvFKhZNOJQYMgHWdpygTzt3NzlvFf2pFnyC/e8jVb/mZJA+qdDFg5SUhz6JxDI4lj1AsF52LhBA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:59.646667Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.14851","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:c624f10e46111974eb1810e361cdc492e2cbfb47becd4bf29a4de6ee5d04a72e","sha256:098b8c495d4cd644908c3385315c011a731675f6671f800d48568e0807049d2d"],"state_sha256":"2d335a7266b50fa98ae60d27ea65d4f4a38decfcf5f7bb89fe056971ed2ae8a6"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"IFiuumsuq5135WzZEMznxdn/EwrzRDzG1pJj3TBM33nX66US7yOiy8kKRW81HgZpyRx7EcYAAHqf7HRMMm6ODQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-30T12:05:10.795420Z","bundle_sha256":"5c59d98f639ec4f82e42d2dbaaea6c33ef2e1cf90ecffd2b2863a1f8e88edfb9"}}