{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:XJHDCY3YIRSEYDRJNRSGTUPXJG","short_pith_number":"pith:XJHDCY3Y","schema_version":"1.0","canonical_sha256":"ba4e31637844644c0e296c6469d1f749b9824946d1a4786461d2277b6407c470","source":{"kind":"arxiv","id":"2601.00215","version":2},"attestation_state":"computed","paper":{"title":"Disentangling Perception and Reasoning in Multimodal LLMs via Reward Design","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Eftekhar Hossain, Nikhil Singh, Omar Sharif, Patrick Ng","submitted_at":"2026-01-01T05:19:28Z","abstract_excerpt":"Reinforcement learning with verifiable rewards has driven major gains in LLM reasoning, and it is intuitive to assume this recipe will transfer well to multimodal models. However, multimodal models do two things: first, perceive what is in an image, then reason about what it implies. Because these stages are graded jointly, it is hard to tell how much room reasoning alone has to grow. We study this on algorithmic visual puzzles, where both components are necessary and show that perception, not reasoning, is the binding constraint. Replacing images with simple textual descriptions raises perfor"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2601.00215","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.CV","submitted_at":"2026-01-01T05:19:28Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"781d59b1f5285cd91d279de4f215a5ba2a9818fe317d4d988725108ccf982052","abstract_canon_sha256":"956b791ad5f4ed441f0343f33b536938dae842b875d75482c2f6651a387dac10"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-19T16:09:53.432943Z","signature_b64":"Y1KuhJixLBiktpKiHisuCYRNl4ADU3raoFkn8dlnvhoYmB1FizyEHtDLt8mNkB6AUnjmqQuNBEfKyT1MBaB9Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ba4e31637844644c0e296c6469d1f749b9824946d1a4786461d2277b6407c470","last_reissued_at":"2026-06-19T16:09:53.432518Z","signature_status":"signed_v1","first_computed_at":"2026-06-19T16:09:53.432518Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Disentangling Perception and Reasoning in Multimodal LLMs via Reward Design","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["cs.CL"],"primary_cat":"cs.CV","authors_text":"Eftekhar Hossain, Nikhil Singh, Omar Sharif, Patrick Ng","submitted_at":"2026-01-01T05:19:28Z","abstract_excerpt":"Reinforcement learning with verifiable rewards has driven major gains in LLM reasoning, and it is intuitive to assume this recipe will transfer well to multimodal models. However, multimodal models do two things: first, perceive what is in an image, then reason about what it implies. Because these stages are graded jointly, it is hard to tell how much room reasoning alone has to grow. We study this on algorithmic visual puzzles, where both components are necessary and show that perception, not reasoning, is the binding constraint. Replacing images with simple textual descriptions raises perfor"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2601.00215","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2601.00215/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2601.00215","created_at":"2026-06-19T16:09:53.432574+00:00"},{"alias_kind":"arxiv_version","alias_value":"2601.00215v2","created_at":"2026-06-19T16:09:53.432574+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2601.00215","created_at":"2026-06-19T16:09:53.432574+00:00"},{"alias_kind":"pith_short_12","alias_value":"XJHDCY3YIRSE","created_at":"2026-06-19T16:09:53.432574+00:00"},{"alias_kind":"pith_short_16","alias_value":"XJHDCY3YIRSEYDRJ","created_at":"2026-06-19T16:09:53.432574+00:00"},{"alias_kind":"pith_short_8","alias_value":"XJHDCY3Y","created_at":"2026-06-19T16:09:53.432574+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG","json":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG.json","graph_json":"https://pith.science/api/pith-number/XJHDCY3YIRSEYDRJNRSGTUPXJG/graph.json","events_json":"https://pith.science/api/pith-number/XJHDCY3YIRSEYDRJNRSGTUPXJG/events.json","paper":"https://pith.science/paper/XJHDCY3Y"},"agent_actions":{"view_html":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG","download_json":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG.json","view_paper":"https://pith.science/paper/XJHDCY3Y","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2601.00215&json=true","fetch_graph":"https://pith.science/api/pith-number/XJHDCY3YIRSEYDRJNRSGTUPXJG/graph.json","fetch_events":"https://pith.science/api/pith-number/XJHDCY3YIRSEYDRJNRSGTUPXJG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG/action/storage_attestation","attest_author":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG/action/author_attestation","sign_citation":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG/action/citation_signature","submit_replication":"https://pith.science/pith/XJHDCY3YIRSEYDRJNRSGTUPXJG/action/replication_record"}},"created_at":"2026-06-19T16:09:53.432574+00:00","updated_at":"2026-06-19T16:09:53.432574+00:00"}