{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:YIKPG3DR7LRHJO2T6HQE43O6CP","short_pith_number":"pith:YIKPG3DR","schema_version":"1.0","canonical_sha256":"c214f36c71fae274bb53f1e04e6dde13e7269e499ef70a6385ebc17331adb0fb","source":{"kind":"arxiv","id":"2506.06006","version":3},"attestation_state":"computed","paper":{"title":"Can VLMs Predict Future States? Bootstrapping World Models from Inverse Dynamics","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Anna Korhonen, Edoardo M. Ponti, Shay B. Cohen, Yftah Ziser, Yifu Qiu","submitted_at":"2025-06-06T11:50:18Z","abstract_excerpt":"Can unified vision-language models (VLMs) perform forward dynamics prediction (FDP), i.e., predicting the future state (in image form) given the previous observation and an action (in language form)? We find that VLMs struggle to generate physically plausible transitions between frames from instructions. Nevertheless, we identify a crucial asymmetry in multimodal grounding: fine-tuning a VLM to learn inverse dynamics prediction (IDP)-effectively captioning the action between frames-is significantly easier than learning FDP. In turn, IDP can be used to bootstrap FDP through two main strategies:"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2506.06006","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-06-06T11:50:18Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"ff8184da166c039c099917624887bed74af006e032f9ac1741e88760492c80b1","abstract_canon_sha256":"362d33ebf1cf5c60d8a1979271481f5e18ca39453dc66b8a7529705c122ce523"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-04T01:08:29.571813Z","signature_b64":"AZCLoMRgwo4vAT2MjrLCicMgREmG6US7fBN354CDh5WeyHnVPC+JFIu/2eya8ZBQ6I3vXGRvSKUT3rtfuqBSCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c214f36c71fae274bb53f1e04e6dde13e7269e499ef70a6385ebc17331adb0fb","last_reissued_at":"2026-06-04T01:08:29.571185Z","signature_status":"signed_v1","first_computed_at":"2026-06-04T01:08:29.571185Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Can VLMs Predict Future States? Bootstrapping World Models from Inverse Dynamics","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.CV","authors_text":"Anna Korhonen, Edoardo M. Ponti, Shay B. Cohen, Yftah Ziser, Yifu Qiu","submitted_at":"2025-06-06T11:50:18Z","abstract_excerpt":"Can unified vision-language models (VLMs) perform forward dynamics prediction (FDP), i.e., predicting the future state (in image form) given the previous observation and an action (in language form)? We find that VLMs struggle to generate physically plausible transitions between frames from instructions. Nevertheless, we identify a crucial asymmetry in multimodal grounding: fine-tuning a VLM to learn inverse dynamics prediction (IDP)-effectively captioning the action between frames-is significantly easier than learning FDP. In turn, IDP can be used to bootstrap FDP through two main strategies:"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2506.06006","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2506.06006/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2506.06006","created_at":"2026-06-04T01:08:29.571276+00:00"},{"alias_kind":"arxiv_version","alias_value":"2506.06006v3","created_at":"2026-06-04T01:08:29.571276+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.06006","created_at":"2026-06-04T01:08:29.571276+00:00"},{"alias_kind":"pith_short_12","alias_value":"YIKPG3DR7LRH","created_at":"2026-06-04T01:08:29.571276+00:00"},{"alias_kind":"pith_short_16","alias_value":"YIKPG3DR7LRHJO2T","created_at":"2026-06-04T01:08:29.571276+00:00"},{"alias_kind":"pith_short_8","alias_value":"YIKPG3DR","created_at":"2026-06-04T01:08:29.571276+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP","json":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP.json","graph_json":"https://pith.science/api/pith-number/YIKPG3DR7LRHJO2T6HQE43O6CP/graph.json","events_json":"https://pith.science/api/pith-number/YIKPG3DR7LRHJO2T6HQE43O6CP/events.json","paper":"https://pith.science/paper/YIKPG3DR"},"agent_actions":{"view_html":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP","download_json":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP.json","view_paper":"https://pith.science/paper/YIKPG3DR","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2506.06006&json=true","fetch_graph":"https://pith.science/api/pith-number/YIKPG3DR7LRHJO2T6HQE43O6CP/graph.json","fetch_events":"https://pith.science/api/pith-number/YIKPG3DR7LRHJO2T6HQE43O6CP/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP/action/timestamp_anchor","attest_storage":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP/action/storage_attestation","attest_author":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP/action/author_attestation","sign_citation":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP/action/citation_signature","submit_replication":"https://pith.science/pith/YIKPG3DR7LRHJO2T6HQE43O6CP/action/replication_record"}},"created_at":"2026-06-04T01:08:29.571276+00:00","updated_at":"2026-06-04T01:08:29.571276+00:00"}