{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:XWKVJIXSLVYNKHWXKRVTXQIOU2","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"8a45099f14d045accff594ca13ca08c77d46017efad9a353a561b48d2641f330","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-06-07T22:42:29Z","title_canon_sha256":"a0d32bd599754e05eb9948d06ed7aed1b2cdac8f3f64203a8c1b4e2a57a86a6c"},"schema_version":"1.0","source":{"id":"2506.06941","kind":"arxiv","version":3}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2506.06941","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2506.06941v3","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.06941","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"XWKVJIXSLVYN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"XWKVJIXSLVYNKHWX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"XWKVJIXS","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:6e55473daeec7db0ab9b36d9c1c0ac0cf39b2526a995258db818317990db8b00","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"LRMs face a complete accuracy collapse beyond certain complexities. Moreover, they exhibit a counterintuitive scaling limit: their reasoning effort increases with problem complexity up to a point, then declines despite having remaining token budget."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the chosen controllable puzzle environments provide an unbiased and generalizable measure of reasoning complexity without introducing artifacts that do not appear in other domains such as math or coding."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LRMs exhibit complete accuracy collapse beyond certain puzzle complexities, with reasoning effort rising then declining, outperforming standard LLMs only on medium-complexity tasks."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Large Reasoning Models exhibit complete accuracy collapse beyond certain complexities and reduce reasoning effort despite available compute."}],"snapshot_sha256":"6013975e7a8b629077629637ac402effe0523f2865e275237dfbdd1c418085d1"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"62e8360a3887101bdb96469aab1b850550b05d0db1bd52b61d4215d15ae846ce"},"paper":{"abstract_excerpt":"Recent generations of language models have introduced Large Reasoning Models (LRMs) that generate detailed thinking processes before providing answers. While these models demonstrate improved performance on reasoning benchmarks, their fundamental capabilities, scaling properties, and limitations remain insufficiently understood. Current evaluations primarily focus on established math and coding benchmarks, emphasizing final answer accuracy. However, this evaluation paradigm often suffers from contamination and does not provide insights into the reasoning traces. In this work, we systematically","authors_text":"Iman Mirzadeh, Keivan Alizadeh, Maxwell Horton, Mehrdad Farajtabar, Parshin Shojaee, Samy Bengio","cross_cats":["cs.CL","cs.LG"],"headline":"Large Reasoning Models exhibit complete accuracy collapse beyond certain complexities and reduce reasoning effort despite available compute.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-06-07T22:42:29Z","title":"The Illusion of Thinking: Understanding the Strengths and Limitations of Reasoning Models via the Lens of Problem Complexity"},"references":{"count":55,"internal_anchors":12,"resolved_work":55,"sample":[{"cited_arxiv_id":"2412.16720","doi":"","is_internal_anchor":true,"ref_index":1,"title":"OpenAI o1 System Card","work_id":"68d3c334-0fc9-49e3-b7b0-a69afae933e2","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Introducing openai o1","work_id":"5497958f-45dd-499a-b0c6-ccc8449e45bc","year":2024},{"cited_arxiv_id":"2501.12948","doi":"","is_internal_anchor":true,"ref_index":3,"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Claude 3.7 sonnet","work_id":"defe6220-1f5a-4678-923b-c0c9c0ef2c4a","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Gemini flash thinking.Google AI Blog, Jan 2025","work_id":"94b01b54-7289-4d34-b163-52dcb02e770c","year":2025}],"snapshot_sha256":"f1c8446e766d84e24ffbfbd4a9e5adc5f8c9931d9f64ed559c9257ae17eaa78a"},"source":{"id":"2506.06941","kind":"arxiv","version":3},"verdict":{"created_at":"2026-05-15T16:04:32.572571Z","id":"6892fb93-a504-4e34-a60f-6fe793f4beb0","model_set":{"reader":"grok-4.3"},"one_line_summary":"LRMs exhibit complete accuracy collapse beyond certain puzzle complexities, with reasoning effort rising then declining, outperforming standard LLMs only on medium-complexity tasks.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Large Reasoning Models exhibit complete accuracy collapse beyond certain complexities and reduce reasoning effort despite available compute.","strongest_claim":"LRMs face a complete accuracy collapse beyond certain complexities. Moreover, they exhibit a counterintuitive scaling limit: their reasoning effort increases with problem complexity up to a point, then declines despite having remaining token budget.","weakest_assumption":"That the chosen controllable puzzle environments provide an unbiased and generalizable measure of reasoning complexity without introducing artifacts that do not appear in other domains such as math or coding."}},"verdict_id":"6892fb93-a504-4e34-a60f-6fe793f4beb0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:a6b5ec2a6c213f2c0a03415bc2305045a12ed5d938fb536674c82056b70ab488","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"8a45099f14d045accff594ca13ca08c77d46017efad9a353a561b48d2641f330","cross_cats_sorted":["cs.CL","cs.LG"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2025-06-07T22:42:29Z","title_canon_sha256":"a0d32bd599754e05eb9948d06ed7aed1b2cdac8f3f64203a8c1b4e2a57a86a6c"},"schema_version":"1.0","source":{"id":"2506.06941","kind":"arxiv","version":3}},"canonical_sha256":"bd9554a2f25d70d51ed7546b3bc10ea6987dd4cbd948aa53f779b964a512b7c5","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"bd9554a2f25d70d51ed7546b3bc10ea6987dd4cbd948aa53f779b964a512b7c5","first_computed_at":"2026-05-17T23:38:50.945787Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.945787Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"QJojdnMraqS7GGRC/bq8u8DOQyIO2OmylCegjVkwUC5GH0BKZTfQWreUP6tkesPRiMA1gcZJ2fH5SrnJYvReDA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.946361Z","signed_message":"canonical_sha256_bytes"},"source_id":"2506.06941","source_kind":"arxiv","source_version":3}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:a6b5ec2a6c213f2c0a03415bc2305045a12ed5d938fb536674c82056b70ab488","sha256:6e55473daeec7db0ab9b36d9c1c0ac0cf39b2526a995258db818317990db8b00"],"state_sha256":"30d33eac48fda5b99ed5e5755974c7038f33474cbfbe5d8b8d2d83ddfdc9fef5"}