{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:PEGRU4FOQZO7W74V5UESR33DD3","short_pith_number":"pith:PEGRU4FO","schema_version":"1.0","canonical_sha256":"790d1a70ae865dfb7f95ed0928ef631efa1bb068d132212bfa93186d445258f0","source":{"kind":"arxiv","id":"2606.28758","version":1},"attestation_state":"computed","paper":{"title":"X-Mind: Efficient Visual Chain-of-Thought via Predictive World Model for End-to-End Driving","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Bohao Zhao, Chengrui Wei, Guangfeng Jiang, Hang Zhang, Hanpeng Liu, Jinyun Zhou, Liu Liang, Pengkun Zheng, Qingyu Luo, Rui Guo, Ruixin Liu, Sutao Deng, Xianming Liu, Xinlong Zheng, Xiuyang Fan, Xuejie Lv, Yi Guo, Yutong Zheng, Yu Zhang, Zhuangzhuang Ding","submitted_at":"2026-06-27T06:17:50Z","abstract_excerpt":"Predicting future states is essential for autonomous agents, yet current Vision-Language-Action (VLA) models fundamentally lack this capability, relying instead on reactive perception-action mapping. While integrating Predictive World Models (PWMs) addresses this gap, existing approaches either incur prohibitive cascaded latency or act as shallow terminal tasks that fail to deeply embed forward-looking reasoning. To endow VLA models with this reasoning capability, we propose X-Mind. Rather than treating PWMs as an external auxiliary module, this framework internalizes them as the Visual Chain-"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.28758","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CV","submitted_at":"2026-06-27T06:17:50Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"dacfe072fec00c5f0d0e6f825cf875105045af00cc1411b46c7a4514b3b453a3","abstract_canon_sha256":"ff157d9c8d5a72523df70e6c670be948234f4249c847ebfb0a413ef3fe71e607"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T01:16:50.782919Z","signature_b64":"b7UCU3Uf4ksCjLfJGNHosss2ePMkpZysyDm1CuGFAM5ABIYvH3sdG+afgPW5p4JWzZCwfXZJiAToJ4vIBkNTDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"790d1a70ae865dfb7f95ed0928ef631efa1bb068d132212bfa93186d445258f0","last_reissued_at":"2026-06-30T01:16:50.782422Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T01:16:50.782422Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"X-Mind: Efficient Visual Chain-of-Thought via Predictive World Model for End-to-End Driving","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CV","authors_text":"Bohao Zhao, Chengrui Wei, Guangfeng Jiang, Hang Zhang, Hanpeng Liu, Jinyun Zhou, Liu Liang, Pengkun Zheng, Qingyu Luo, Rui Guo, Ruixin Liu, Sutao Deng, Xianming Liu, Xinlong Zheng, Xiuyang Fan, Xuejie Lv, Yi Guo, Yutong Zheng, Yu Zhang, Zhuangzhuang Ding","submitted_at":"2026-06-27T06:17:50Z","abstract_excerpt":"Predicting future states is essential for autonomous agents, yet current Vision-Language-Action (VLA) models fundamentally lack this capability, relying instead on reactive perception-action mapping. While integrating Predictive World Models (PWMs) addresses this gap, existing approaches either incur prohibitive cascaded latency or act as shallow terminal tasks that fail to deeply embed forward-looking reasoning. To endow VLA models with this reasoning capability, we propose X-Mind. Rather than treating PWMs as an external auxiliary module, this framework internalizes them as the Visual Chain-"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.28758","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.28758/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.28758","created_at":"2026-06-30T01:16:50.782495+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.28758v1","created_at":"2026-06-30T01:16:50.782495+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.28758","created_at":"2026-06-30T01:16:50.782495+00:00"},{"alias_kind":"pith_short_12","alias_value":"PEGRU4FOQZO7","created_at":"2026-06-30T01:16:50.782495+00:00"},{"alias_kind":"pith_short_16","alias_value":"PEGRU4FOQZO7W74V","created_at":"2026-06-30T01:16:50.782495+00:00"},{"alias_kind":"pith_short_8","alias_value":"PEGRU4FO","created_at":"2026-06-30T01:16:50.782495+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3","json":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3.json","graph_json":"https://pith.science/api/pith-number/PEGRU4FOQZO7W74V5UESR33DD3/graph.json","events_json":"https://pith.science/api/pith-number/PEGRU4FOQZO7W74V5UESR33DD3/events.json","paper":"https://pith.science/paper/PEGRU4FO"},"agent_actions":{"view_html":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3","download_json":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3.json","view_paper":"https://pith.science/paper/PEGRU4FO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.28758&json=true","fetch_graph":"https://pith.science/api/pith-number/PEGRU4FOQZO7W74V5UESR33DD3/graph.json","fetch_events":"https://pith.science/api/pith-number/PEGRU4FOQZO7W74V5UESR33DD3/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3/action/timestamp_anchor","attest_storage":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3/action/storage_attestation","attest_author":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3/action/author_attestation","sign_citation":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3/action/citation_signature","submit_replication":"https://pith.science/pith/PEGRU4FOQZO7W74V5UESR33DD3/action/replication_record"}},"created_at":"2026-06-30T01:16:50.782495+00:00","updated_at":"2026-06-30T01:16:50.782495+00:00"}