{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:GNBHBX5AV57ZBW4XO7HMQW3VML","short_pith_number":"pith:GNBHBX5A","canonical_record":{"source":{"id":"2410.05229","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-10-07T17:36:37Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"acaeed5a707ab7d7d3f7aaad19d1f014386246dea511a2a9263886064c877d57","abstract_canon_sha256":"1dc6c56c53a6fe5ea687fb26bc48ca00a6117e241edd9dbd65d2c9cd925265e1"},"schema_version":"1.0"},"canonical_sha256":"334270dfa0af7f90db9777cec85b7562e5edc3e94d8b66606e23715261873f72","source":{"kind":"arxiv","id":"2410.05229","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.05229","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"arxiv_version","alias_value":"2410.05229v2","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.05229","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"pith_short_12","alias_value":"GNBHBX5AV57Z","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"GNBHBX5AV57ZBW4X","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"GNBHBX5A","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:GNBHBX5AV57ZBW4XO7HMQW3VML","target":"record","payload":{"canonical_record":{"source":{"id":"2410.05229","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-10-07T17:36:37Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"acaeed5a707ab7d7d3f7aaad19d1f014386246dea511a2a9263886064c877d57","abstract_canon_sha256":"1dc6c56c53a6fe5ea687fb26bc48ca00a6117e241edd9dbd65d2c9cd925265e1"},"schema_version":"1.0"},"canonical_sha256":"334270dfa0af7f90db9777cec85b7562e5edc3e94d8b66606e23715261873f72","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:19.673719Z","signature_b64":"7+wxJHgmst8ruR72E3zmbKZpQayz7dmJV5rqMvRdXXRtU11uFlZMyudEQg6f8cAOZr4lfr9Z+OtQz6i43d9ACA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"334270dfa0af7f90db9777cec85b7562e5edc3e94d8b66606e23715261873f72","last_reissued_at":"2026-05-17T23:39:19.672939Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:19.672939Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2410.05229","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:19Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kmFyDCff5mfpHy5yLdUPuVlTqOMCjaWUtLczT1fcVIdy0qV4yi1KMnr/KbqsP31qMhmAskDSoS2f8cSmR8YYCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T13:01:37.496971Z"},"content_sha256":"cef8b917c35e12a13c698d5bce7d291d7d70a734d7bc6d50f217181b8a50879e","schema_version":"1.0","event_id":"sha256:cef8b917c35e12a13c698d5bce7d291d7d70a734d7bc6d50f217181b8a50879e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:GNBHBX5AV57ZBW4XO7HMQW3VML","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"GSM-Symbolic: Understanding the Limitations of Mathematical Reasoning in Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Large language models cannot perform genuine mathematical reasoning and instead replicate patterns from training data.","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Hooman Shahrokhi, Iman Mirzadeh, Keivan Alizadeh, Mehrdad Farajtabar, Oncel Tuzel, Samy Bengio","submitted_at":"2024-10-07T17:36:37Z","abstract_excerpt":"Recent advancements in Large Language Models (LLMs) have sparked interest in their formal reasoning capabilities, particularly in mathematics. The GSM8K benchmark is widely used to assess the mathematical reasoning of models on grade-school-level questions. While the performance of LLMs on GSM8K has significantly improved in recent years, it remains unclear whether their mathematical reasoning capabilities have genuinely advanced, raising questions about the reliability of the reported metrics. To address these concerns, we conduct a large-scale study on several SOTA open and closed models. To"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"current LLMs cannot perform genuine logical reasoning; they replicate reasoning steps from their training data.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the added clauses are truly irrelevant to the solution process and that performance drops therefore demonstrate absence of genuine reasoning rather than sensitivity to prompt length or surface features.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"LLMs display high variance and major accuracy drops on GSM-Symbolic variants of grade-school math problems, indicating they replicate training patterns rather than execute logical reasoning.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Large language models cannot perform genuine mathematical reasoning and instead replicate patterns from training data.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"24ce1db7409d6c901187d304ad51e0fb5322dc38349390d1c2448d40f64a39dd"},"source":{"id":"2410.05229","kind":"arxiv","version":2},"verdict":{"id":"13ce7d1d-d14a-4603-9e8b-387841dfe564","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T00:38:35.597661Z","strongest_claim":"current LLMs cannot perform genuine logical reasoning; they replicate reasoning steps from their training data.","one_line_summary":"LLMs display high variance and major accuracy drops on GSM-Symbolic variants of grade-school math problems, indicating they replicate training patterns rather than execute logical reasoning.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the added clauses are truly irrelevant to the solution process and that performance drops therefore demonstrate absence of genuine reasoning rather than sensitivity to prompt length or surface features.","pith_extraction_headline":"Large language models cannot perform genuine mathematical reasoning and instead replicate patterns from training data."},"references":{"count":85,"sample":[{"doi":"","year":2024,"title":"Qintong Li and Leyang Cui and Xueliang Zhao and Lingpeng Kong and Wei Bi , editor =. GSM-Plus:. Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long ","work_id":"f828d390-1c29-4d83-9b59-137df9a0f3ff","ref_index":8,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Hwang and Soumya Sanyal and Xiang Ren and Allyson Ettinger and Za","work_id":"a2ac64ea-b2e2-46b4-9eb4-3be4b6a28a1a","ref_index":9,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Chi and Quoc V","work_id":"df55022b-f2f7-4e29-b72f-dde7b4cc35d2","ref_index":10,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Chi and Nathanael Sch","work_id":"2c4d3df1-27b5-4d4e-a4d3-e1984386bb48","ref_index":11,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics , pages=","work_id":"cd336020-437f-4fb5-abe3-0f9376e8553d","ref_index":12,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":85,"snapshot_sha256":"af2a7f1de19547f003f863aa82fef9e00bc691d0c74690c9de26664c33953948","internal_anchors":14},"formal_canon":{"evidence_count":1,"snapshot_sha256":"be9e2e8ced882d7786ad9f36e47f4a5c62668e34da1316b5245d9a7ee384a546"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"13ce7d1d-d14a-4603-9e8b-387841dfe564"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:19Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"NO9zn09pI8Uzp5+gHHc4Ev/IME7tHX1y/BazrgLLoH9yneNyYKju1dWDUbFaGUYnPa/BPrfD6QC5Jd11XKKqBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T13:01:37.497946Z"},"content_sha256":"d7501237884b5db8b919ca0f3bbcdaa8699d5b1a2a4af9224e7a70298782b595","schema_version":"1.0","event_id":"sha256:d7501237884b5db8b919ca0f3bbcdaa8699d5b1a2a4af9224e7a70298782b595"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/GNBHBX5AV57ZBW4XO7HMQW3VML/bundle.json","state_url":"https://pith.science/pith/GNBHBX5AV57ZBW4XO7HMQW3VML/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/GNBHBX5AV57ZBW4XO7HMQW3VML/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T13:01:37Z","links":{"resolver":"https://pith.science/pith/GNBHBX5AV57ZBW4XO7HMQW3VML","bundle":"https://pith.science/pith/GNBHBX5AV57ZBW4XO7HMQW3VML/bundle.json","state":"https://pith.science/pith/GNBHBX5AV57ZBW4XO7HMQW3VML/state.json","well_known_bundle":"https://pith.science/.well-known/pith/GNBHBX5AV57ZBW4XO7HMQW3VML/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:GNBHBX5AV57ZBW4XO7HMQW3VML","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"1dc6c56c53a6fe5ea687fb26bc48ca00a6117e241edd9dbd65d2c9cd925265e1","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-10-07T17:36:37Z","title_canon_sha256":"acaeed5a707ab7d7d3f7aaad19d1f014386246dea511a2a9263886064c877d57"},"schema_version":"1.0","source":{"id":"2410.05229","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.05229","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"arxiv_version","alias_value":"2410.05229v2","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.05229","created_at":"2026-05-17T23:39:19Z"},{"alias_kind":"pith_short_12","alias_value":"GNBHBX5AV57Z","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"GNBHBX5AV57ZBW4X","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"GNBHBX5A","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:d7501237884b5db8b919ca0f3bbcdaa8699d5b1a2a4af9224e7a70298782b595","target":"graph","created_at":"2026-05-17T23:39:19Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"current LLMs cannot perform genuine logical reasoning; they replicate reasoning steps from their training data."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the added clauses are truly irrelevant to the solution process and that performance drops therefore demonstrate absence of genuine reasoning rather than sensitivity to prompt length or surface features."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"LLMs display high variance and major accuracy drops on GSM-Symbolic variants of grade-school math problems, indicating they replicate training patterns rather than execute logical reasoning."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Large language models cannot perform genuine mathematical reasoning and instead replicate patterns from training data."}],"snapshot_sha256":"24ce1db7409d6c901187d304ad51e0fb5322dc38349390d1c2448d40f64a39dd"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"be9e2e8ced882d7786ad9f36e47f4a5c62668e34da1316b5245d9a7ee384a546"},"paper":{"abstract_excerpt":"Recent advancements in Large Language Models (LLMs) have sparked interest in their formal reasoning capabilities, particularly in mathematics. The GSM8K benchmark is widely used to assess the mathematical reasoning of models on grade-school-level questions. While the performance of LLMs on GSM8K has significantly improved in recent years, it remains unclear whether their mathematical reasoning capabilities have genuinely advanced, raising questions about the reliability of the reported metrics. To address these concerns, we conduct a large-scale study on several SOTA open and closed models. To","authors_text":"Hooman Shahrokhi, Iman Mirzadeh, Keivan Alizadeh, Mehrdad Farajtabar, Oncel Tuzel, Samy Bengio","cross_cats":["cs.AI"],"headline":"Large language models cannot perform genuine mathematical reasoning and instead replicate patterns from training data.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-10-07T17:36:37Z","title":"GSM-Symbolic: Understanding the Limitations of Mathematical Reasoning in Large Language Models"},"references":{"count":85,"internal_anchors":14,"resolved_work":85,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":8,"title":"Qintong Li and Leyang Cui and Xueliang Zhao and Lingpeng Kong and Wei Bi , editor =. GSM-Plus:. Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long ","work_id":"f828d390-1c29-4d83-9b59-137df9a0f3ff","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":9,"title":"Hwang and Soumya Sanyal and Xiang Ren and Allyson Ettinger and Za","work_id":"a2ac64ea-b2e2-46b4-9eb4-3be4b6a28a1a","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":10,"title":"Chi and Quoc V","work_id":"df55022b-f2f7-4e29-b72f-dde7b4cc35d2","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":11,"title":"Chi and Nathanael Sch","work_id":"2c4d3df1-27b5-4d4e-a4d3-e1984386bb48","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":12,"title":"Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics , pages=","work_id":"cd336020-437f-4fb5-abe3-0f9376e8553d","year":2021}],"snapshot_sha256":"af2a7f1de19547f003f863aa82fef9e00bc691d0c74690c9de26664c33953948"},"source":{"id":"2410.05229","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T00:38:35.597661Z","id":"13ce7d1d-d14a-4603-9e8b-387841dfe564","model_set":{"reader":"grok-4.3"},"one_line_summary":"LLMs display high variance and major accuracy drops on GSM-Symbolic variants of grade-school math problems, indicating they replicate training patterns rather than execute logical reasoning.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Large language models cannot perform genuine mathematical reasoning and instead replicate patterns from training data.","strongest_claim":"current LLMs cannot perform genuine logical reasoning; they replicate reasoning steps from their training data.","weakest_assumption":"That the added clauses are truly irrelevant to the solution process and that performance drops therefore demonstrate absence of genuine reasoning rather than sensitivity to prompt length or surface features."}},"verdict_id":"13ce7d1d-d14a-4603-9e8b-387841dfe564"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:cef8b917c35e12a13c698d5bce7d291d7d70a734d7bc6d50f217181b8a50879e","target":"record","created_at":"2026-05-17T23:39:19Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"1dc6c56c53a6fe5ea687fb26bc48ca00a6117e241edd9dbd65d2c9cd925265e1","cross_cats_sorted":["cs.AI"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-10-07T17:36:37Z","title_canon_sha256":"acaeed5a707ab7d7d3f7aaad19d1f014386246dea511a2a9263886064c877d57"},"schema_version":"1.0","source":{"id":"2410.05229","kind":"arxiv","version":2}},"canonical_sha256":"334270dfa0af7f90db9777cec85b7562e5edc3e94d8b66606e23715261873f72","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"334270dfa0af7f90db9777cec85b7562e5edc3e94d8b66606e23715261873f72","first_computed_at":"2026-05-17T23:39:19.672939Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:19.672939Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"7+wxJHgmst8ruR72E3zmbKZpQayz7dmJV5rqMvRdXXRtU11uFlZMyudEQg6f8cAOZr4lfr9Z+OtQz6i43d9ACA==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:19.673719Z","signed_message":"canonical_sha256_bytes"},"source_id":"2410.05229","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:cef8b917c35e12a13c698d5bce7d291d7d70a734d7bc6d50f217181b8a50879e","sha256:d7501237884b5db8b919ca0f3bbcdaa8699d5b1a2a4af9224e7a70298782b595"],"state_sha256":"ddfb4efa210656dbf8e43fbb6f2d1cbf91ff6405d20d1af6bc5c0c08c09ce425"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"aB6VDw/cB0+WcXG8zUA76kWNKK/mSXjUtDka8M47uz3/sEtidYI0+7usBRF3ENeka0kpRr4BtS5vDRwGi9ULAg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T13:01:37.502335Z","bundle_sha256":"4a9d408ad744b1dd165affa45ecdc32c28697b1da190e4f5528a34cfeb01c9ec"}}