{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:A5FG35S7CKS6D6ESOMRXLFI35P","short_pith_number":"pith:A5FG35S7","canonical_record":{"source":{"id":"2509.24251","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-09-29T03:52:01Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"8020884362e361106ef159f775c8b48cb15e589341acafc13635e7b10be70580","abstract_canon_sha256":"6e5d09a53c02ec8eaea5cd02f2729d5217e9f565b9604f26438a0960398b3982"},"schema_version":"1.0"},"canonical_sha256":"074a6df65f12a5e1f892732375951bebd1a9aecc134fabdc66ec4e602b6bbda9","source":{"kind":"arxiv","id":"2509.24251","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2509.24251","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2509.24251v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.24251","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"A5FG35S7CKS6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"A5FG35S7CKS6D6ES","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"A5FG35S7","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:A5FG35S7CKS6D6ESOMRXLFI35P","target":"record","payload":{"canonical_record":{"source":{"id":"2509.24251","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-09-29T03:52:01Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"8020884362e361106ef159f775c8b48cb15e589341acafc13635e7b10be70580","abstract_canon_sha256":"6e5d09a53c02ec8eaea5cd02f2729d5217e9f565b9604f26438a0960398b3982"},"schema_version":"1.0"},"canonical_sha256":"074a6df65f12a5e1f892732375951bebd1a9aecc134fabdc66ec4e602b6bbda9","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.558419Z","signature_b64":"LkV4WGEmxCfNsqG7hGwfBNnLfwTKbu8xdySOYop5nBKzFXG+pI9RKuvDaVG84aIY2Uqkj/u/nDl1NPcl4aaaBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"074a6df65f12a5e1f892732375951bebd1a9aecc134fabdc66ec4e602b6bbda9","last_reissued_at":"2026-05-17T23:38:50.557867Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.557867Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2509.24251","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"Q8UJeXTK6W/0HzyRuyCLMLggHDAl/bIjMZD+RKJHl/AuUZm6TG5e858CMxR50a/3kgYQhRkq+la1RhK+JiG5Aw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T05:08:09.670039Z"},"content_sha256":"f05fba278653a0ede6b5691770809090a52767c96a866a857ead3a59ef9fe31f","schema_version":"1.0","event_id":"sha256:f05fba278653a0ede6b5691770809090a52767c96a866a857ead3a59ef9fe31f"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:A5FG35S7CKS6D6ESOMRXLFI35P","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Latent Visual Reasoning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Multimodal models can perform reasoning steps by autoregressively generating latent visual states that reconstruct key image tokens.","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Bangzheng Li, Emad Barsoum, Hao Chen, Jialian Wu, Jiang Liu, Muhao Chen, Xiaodong Yu, Ximeng Sun, Ze Wang, Zicheng Liu","submitted_at":"2025-09-29T03:52:01Z","abstract_excerpt":"Multimodal Large Language Models (MLLMs) have achieved notable gains in various tasks by incorporating Chain-of-Thought (CoT) reasoning in language spaces. Recent work extends this direction by leveraging external tools for visual editing, thereby enhancing the visual signal along the reasoning trajectories. Nevertheless, these approaches remain fundamentally constrained: reasoning is still confined to the language space, with visual information treated as static preconditions. We introduce Latent Visual Reasoning (LVR), a new paradigm that enables autoregressive reasoning directly in the visu"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We introduce Latent Visual Reasoning (LVR), a new paradigm that enables autoregressive reasoning directly in the visual embedding space... By interleaving LVR with standard text generation, our model achieves substantial gains on perception-intensive visual question answering tasks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That generating latent states whose explicit goal is to reconstruct selected visual tokens constitutes genuine visual reasoning that improves downstream task performance beyond what language-only CoT or tool-based editing already achieves.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Latent Visual Reasoning enables autoregressive generation of latent visual states that reconstruct critical image tokens, yielding gains on perception-heavy VQA benchmarks such as 71.67% on MMVP.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Multimodal models can perform reasoning steps by autoregressively generating latent visual states that reconstruct key image tokens.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"26bc37f9fe211d62f1b0009b83692354cfe91b7868af157e78654685760b2c64"},"source":{"id":"2509.24251","kind":"arxiv","version":2},"verdict":{"id":"52775360-78d4-46d3-b050-18e82bf3bd48","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T18:38:00.990162Z","strongest_claim":"We introduce Latent Visual Reasoning (LVR), a new paradigm that enables autoregressive reasoning directly in the visual embedding space... By interleaving LVR with standard text generation, our model achieves substantial gains on perception-intensive visual question answering tasks.","one_line_summary":"Latent Visual Reasoning enables autoregressive generation of latent visual states that reconstruct critical image tokens, yielding gains on perception-heavy VQA benchmarks such as 71.67% on MMVP.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That generating latent states whose explicit goal is to reconstruct selected visual tokens constitutes genuine visual reasoning that improves downstream task performance beyond what language-only CoT or tool-based editing already achieves.","pith_extraction_headline":"Multimodal models can perform reasoning steps by autoregressively generating latent visual states that reconstruct key image tokens."},"references":{"count":29,"sample":[{"doi":"","year":null,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","ref_index":1,"cited_arxiv_id":"2502.13923","is_internal_anchor":true},{"doi":"","year":null,"title":"Diagnosing and mitigating modality interference in multimodal large language models","work_id":"00d9057c-1f23-4fec-ac22-cfb22cdb7aae","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models","work_id":"a521360c-8673-4d0d-a3a3-6eb9f7a71b90","ref_index":3,"cited_arxiv_id":"2504.11468","is_internal_anchor":true},{"doi":"","year":null,"title":"Compressed Chain of Thought: Efficient Reasoning Through Dense Representations","work_id":"5d72fcbb-d14d-4ac0-8644-50807a64d543","ref_index":4,"cited_arxiv_id":"2412.13171","is_internal_anchor":true},{"doi":"","year":null,"title":"v1: Learning to Point Visual Tokens for Multimodal Grounded Reasoning","work_id":"8f83df31-830d-408d-bcad-25a9fffd7d37","ref_index":5,"cited_arxiv_id":"2505.18842","is_internal_anchor":true}],"resolved_work":29,"snapshot_sha256":"96e9f5364930434f5a3f0995d3f1747642a1f13bbd7f7005a2961506168aebfd","internal_anchors":17},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e623ee273f2554bc1d033848a90de2dd768793cc76df7dbb704db601fb349e9a"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"52775360-78d4-46d3-b050-18e82bf3bd48"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NZko9wUeZQbBnY/qf+nTmogGu4aN4wPzyqVYceCheu/lXtGH/f5WtIN7I00l7TqJ2HFnEKpWeFAlZy1DqNR1Cw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-27T05:08:09.670769Z"},"content_sha256":"7f5d6e1355e4d9961b66467bd8118a03093d51b5024232dcccca321cb22e4305","schema_version":"1.0","event_id":"sha256:7f5d6e1355e4d9961b66467bd8118a03093d51b5024232dcccca321cb22e4305"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/A5FG35S7CKS6D6ESOMRXLFI35P/bundle.json","state_url":"https://pith.science/pith/A5FG35S7CKS6D6ESOMRXLFI35P/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/A5FG35S7CKS6D6ESOMRXLFI35P/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-27T05:08:09Z","links":{"resolver":"https://pith.science/pith/A5FG35S7CKS6D6ESOMRXLFI35P","bundle":"https://pith.science/pith/A5FG35S7CKS6D6ESOMRXLFI35P/bundle.json","state":"https://pith.science/pith/A5FG35S7CKS6D6ESOMRXLFI35P/state.json","well_known_bundle":"https://pith.science/.well-known/pith/A5FG35S7CKS6D6ESOMRXLFI35P/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:A5FG35S7CKS6D6ESOMRXLFI35P","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6e5d09a53c02ec8eaea5cd02f2729d5217e9f565b9604f26438a0960398b3982","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-09-29T03:52:01Z","title_canon_sha256":"8020884362e361106ef159f775c8b48cb15e589341acafc13635e7b10be70580"},"schema_version":"1.0","source":{"id":"2509.24251","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2509.24251","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2509.24251v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.24251","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"A5FG35S7CKS6","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"A5FG35S7CKS6D6ES","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"A5FG35S7","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:7f5d6e1355e4d9961b66467bd8118a03093d51b5024232dcccca321cb22e4305","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We introduce Latent Visual Reasoning (LVR), a new paradigm that enables autoregressive reasoning directly in the visual embedding space... By interleaving LVR with standard text generation, our model achieves substantial gains on perception-intensive visual question answering tasks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That generating latent states whose explicit goal is to reconstruct selected visual tokens constitutes genuine visual reasoning that improves downstream task performance beyond what language-only CoT or tool-based editing already achieves."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Latent Visual Reasoning enables autoregressive generation of latent visual states that reconstruct critical image tokens, yielding gains on perception-heavy VQA benchmarks such as 71.67% on MMVP."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Multimodal models can perform reasoning steps by autoregressively generating latent visual states that reconstruct key image tokens."}],"snapshot_sha256":"26bc37f9fe211d62f1b0009b83692354cfe91b7868af157e78654685760b2c64"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"e623ee273f2554bc1d033848a90de2dd768793cc76df7dbb704db601fb349e9a"},"paper":{"abstract_excerpt":"Multimodal Large Language Models (MLLMs) have achieved notable gains in various tasks by incorporating Chain-of-Thought (CoT) reasoning in language spaces. Recent work extends this direction by leveraging external tools for visual editing, thereby enhancing the visual signal along the reasoning trajectories. Nevertheless, these approaches remain fundamentally constrained: reasoning is still confined to the language space, with visual information treated as static preconditions. We introduce Latent Visual Reasoning (LVR), a new paradigm that enables autoregressive reasoning directly in the visu","authors_text":"Bangzheng Li, Emad Barsoum, Hao Chen, Jialian Wu, Jiang Liu, Muhao Chen, Xiaodong Yu, Ximeng Sun, Ze Wang, Zicheng Liu","cross_cats":["cs.CL"],"headline":"Multimodal models can perform reasoning steps by autoregressively generating latent visual states that reconstruct key image tokens.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-09-29T03:52:01Z","title":"Latent Visual Reasoning"},"references":{"count":29,"internal_anchors":17,"resolved_work":29,"sample":[{"cited_arxiv_id":"2502.13923","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Qwen2.5-VL Technical Report","work_id":"69dffacb-bfe8-442d-be86-48624c60426f","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Diagnosing and mitigating modality interference in multimodal large language models","work_id":"00d9057c-1f23-4fec-ac22-cfb22cdb7aae","year":null},{"cited_arxiv_id":"2504.11468","doi":"","is_internal_anchor":true,"ref_index":3,"title":"SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models","work_id":"a521360c-8673-4d0d-a3a3-6eb9f7a71b90","year":null},{"cited_arxiv_id":"2412.13171","doi":"","is_internal_anchor":true,"ref_index":4,"title":"Compressed Chain of Thought: Efficient Reasoning Through Dense Representations","work_id":"5d72fcbb-d14d-4ac0-8644-50807a64d543","year":null},{"cited_arxiv_id":"2505.18842","doi":"","is_internal_anchor":true,"ref_index":5,"title":"v1: Learning to Point Visual Tokens for Multimodal Grounded Reasoning","work_id":"8f83df31-830d-408d-bcad-25a9fffd7d37","year":null}],"snapshot_sha256":"96e9f5364930434f5a3f0995d3f1747642a1f13bbd7f7005a2961506168aebfd"},"source":{"id":"2509.24251","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T18:38:00.990162Z","id":"52775360-78d4-46d3-b050-18e82bf3bd48","model_set":{"reader":"grok-4.3"},"one_line_summary":"Latent Visual Reasoning enables autoregressive generation of latent visual states that reconstruct critical image tokens, yielding gains on perception-heavy VQA benchmarks such as 71.67% on MMVP.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Multimodal models can perform reasoning steps by autoregressively generating latent visual states that reconstruct key image tokens.","strongest_claim":"We introduce Latent Visual Reasoning (LVR), a new paradigm that enables autoregressive reasoning directly in the visual embedding space... By interleaving LVR with standard text generation, our model achieves substantial gains on perception-intensive visual question answering tasks.","weakest_assumption":"That generating latent states whose explicit goal is to reconstruct selected visual tokens constitutes genuine visual reasoning that improves downstream task performance beyond what language-only CoT or tool-based editing already achieves."}},"verdict_id":"52775360-78d4-46d3-b050-18e82bf3bd48"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f05fba278653a0ede6b5691770809090a52767c96a866a857ead3a59ef9fe31f","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6e5d09a53c02ec8eaea5cd02f2729d5217e9f565b9604f26438a0960398b3982","cross_cats_sorted":["cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CV","submitted_at":"2025-09-29T03:52:01Z","title_canon_sha256":"8020884362e361106ef159f775c8b48cb15e589341acafc13635e7b10be70580"},"schema_version":"1.0","source":{"id":"2509.24251","kind":"arxiv","version":2}},"canonical_sha256":"074a6df65f12a5e1f892732375951bebd1a9aecc134fabdc66ec4e602b6bbda9","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"074a6df65f12a5e1f892732375951bebd1a9aecc134fabdc66ec4e602b6bbda9","first_computed_at":"2026-05-17T23:38:50.557867Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.557867Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"LkV4WGEmxCfNsqG7hGwfBNnLfwTKbu8xdySOYop5nBKzFXG+pI9RKuvDaVG84aIY2Uqkj/u/nDl1NPcl4aaaBw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.558419Z","signed_message":"canonical_sha256_bytes"},"source_id":"2509.24251","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f05fba278653a0ede6b5691770809090a52767c96a866a857ead3a59ef9fe31f","sha256:7f5d6e1355e4d9961b66467bd8118a03093d51b5024232dcccca321cb22e4305"],"state_sha256":"b97e5b4429c94573cbe2830dca6c64730fde480b72cb18ee7714d98ab02befbc"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"HBFxv3r9cJkeA/HjgQ2s7uaMVLLhxPZR1vlAOh6IQ9tHd96YqY9vdap33R/JrYeqh9zZwZFUj+67VUAqEOHlBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-27T05:08:09.675452Z","bundle_sha256":"6ac58c5f79a8cb9a294569f45d9782dd060bf1448b4a0ca97815bf72dd728b9d"}}