{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2021:Y2PRIO4E3LEENHWXTSUJMEA4GQ","short_pith_number":"pith:Y2PRIO4E","canonical_record":{"source":{"id":"2103.07191","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-03-12T10:23:47Z","cross_cats_sorted":[],"title_canon_sha256":"27961b34ee3f17ffb2c39dc34923abd3cbb8e795187eb39ceb96d48b3f33953d","abstract_canon_sha256":"620056386ef10602f08752d34cf4fdc9b37ad1f22161507fa11deb87ca0ad34a"},"schema_version":"1.0"},"canonical_sha256":"c69f143b84dac8469ed79ca896101c343937602e2da37b54f59641f4f9c4056c","source":{"kind":"arxiv","id":"2103.07191","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2103.07191","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2103.07191v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2103.07191","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"Y2PRIO4E3LEE","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"Y2PRIO4E3LEENHWX","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"Y2PRIO4E","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2021:Y2PRIO4E3LEENHWXTSUJMEA4GQ","target":"record","payload":{"canonical_record":{"source":{"id":"2103.07191","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-03-12T10:23:47Z","cross_cats_sorted":[],"title_canon_sha256":"27961b34ee3f17ffb2c39dc34923abd3cbb8e795187eb39ceb96d48b3f33953d","abstract_canon_sha256":"620056386ef10602f08752d34cf4fdc9b37ad1f22161507fa11deb87ca0ad34a"},"schema_version":"1.0"},"canonical_sha256":"c69f143b84dac8469ed79ca896101c343937602e2da37b54f59641f4f9c4056c","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.116211Z","signature_b64":"P9H3kPswR3+6ZHFos/bmehUKd/3z6BEoXc7BOFZ2fJj+LvjWyoYVQH3ZppqPvqrHuTp3qPQoMw9FBym+VPzsAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c69f143b84dac8469ed79ca896101c343937602e2da37b54f59641f4f9c4056c","last_reissued_at":"2026-05-17T23:38:47.115752Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.115752Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2103.07191","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"7bHXb52XAGKeL8Ozz0XiYlqgFo7EIb/yZj53gDO00m76qhBGmv5VzIDO0Nf3SDuKxU2YkrDgkurjdp7FNE2cDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-06T22:30:26.526875Z"},"content_sha256":"4e017eee6140f1004a03dc17140c7bd0566205e592c245b28ee3a99992e78c31","schema_version":"1.0","event_id":"sha256:4e017eee6140f1004a03dc17140c7bd0566205e592c245b28ee3a99992e78c31"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2021:Y2PRIO4E3LEENHWXTSUJMEA4GQ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Are NLP Models really able to Solve Simple Math Word Problems?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"NLP solvers for simple math word problems achieve high benchmark scores by exploiting shallow patterns instead of actual reasoning.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Arkil Patel, Navin Goyal, Satwik Bhattamishra","submitted_at":"2021-03-12T10:23:47Z","abstract_excerpt":"The problem of designing NLP solvers for math word problems (MWP) has seen sustained research activity and steady gains in the test accuracy. Since existing solvers achieve high performance on the benchmark datasets for elementary level MWPs containing one-unknown arithmetic word problems, such problems are often considered \"solved\" with the bulk of research attention moving to more complex MWPs. In this paper, we restrict our attention to English MWPs taught in grades four and lower. We provide strong evidence that the existing MWP solvers rely on shallow heuristics to achieve high performanc"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"MWP solvers that do not have access to the question asked in the MWP can still solve a large fraction of MWPs. Similarly, models that treat MWPs as bag-of-words can also achieve surprisingly high accuracy. The best accuracy achieved by state-of-the-art models is substantially lower on SVAMP.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the carefully chosen variations used to create SVAMP are sufficient to block all shallow heuristics while still testing the intended arithmetic reasoning.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"NLP models for elementary math word problems rely on shallow heuristics rather than genuine understanding, performing well without questions or as bag-of-words but dropping substantially on the new SVAMP variation dataset.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"NLP solvers for simple math word problems achieve high benchmark scores by exploiting shallow patterns instead of actual reasoning.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"88643d083ba5b2aa8260df09b6d37f9a40b8e50c0e30100e3091326dd646f777"},"source":{"id":"2103.07191","kind":"arxiv","version":2},"verdict":{"id":"60bd8bfe-824d-4207-bfb1-52a7d9f03f46","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T17:26:35.756651Z","strongest_claim":"MWP solvers that do not have access to the question asked in the MWP can still solve a large fraction of MWPs. Similarly, models that treat MWPs as bag-of-words can also achieve surprisingly high accuracy. The best accuracy achieved by state-of-the-art models is substantially lower on SVAMP.","one_line_summary":"NLP models for elementary math word problems rely on shallow heuristics rather than genuine understanding, performing well without questions or as bag-of-words but dropping substantially on the new SVAMP variation dataset.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the carefully chosen variations used to create SVAMP are sufficient to block all shallow heuristics while still testing the intended arithmetic reasoning.","pith_extraction_headline":"NLP solvers for simple math word problems achieve high benchmark scores by exploiting shallow patterns instead of actual reasoning."},"references":{"count":12,"sample":[{"doi":"","year":2018,"title":"Suchin Gururangan, Swabha Swayamdipta, Omer Levy, Roy Schwartz, Samuel Bowman, and Noah A","work_id":"94e45806-43ab-42d2-a622-ba654423f9cb","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"In Proceed- ings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 975–984, On- line","work_id":"03bc4244-11f6-46ce-846f-67799b8784dc","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"In Proceed- ings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 3702–3710, Online","work_id":"c7adf57e-989f-47fa-bdd4-c2556f46654a","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"IEEE Transac- tions on Pattern Analysis and Machine Intelligence , 42(9):2287–2305","work_id":"a3266838-e9a2-466d-ba59-76b2d13439fc","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"B Implementation Details We use 8 NVIDIA Tesla P100 GPUs each with 16 GB memory to run our experiments","work_id":"6167010d-9c6b-43c9-8e30-20d05cb93cfd","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":12,"snapshot_sha256":"bac01043592fe10ecc8f06eb5051cfc959091af85eb122088c5d7771a87da948","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"26efa7f76fb43cb2be17d454d0d5e33c052606cf54467f76be5bbd2950da5f53"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"60bd8bfe-824d-4207-bfb1-52a7d9f03f46"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"M5BGukegz9H/tzqe1+r8imoaL5LH2mwvamvTKpO+lCPNV4xrVyPfZjDD5HAQPf5b9dbcbvVvYQ7QXNthGatsAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-06T22:30:26.527930Z"},"content_sha256":"7ec0f77d6f2d7bd80a85266677ea53fb47e87191eb60a2c32c6f1b23558e1b49","schema_version":"1.0","event_id":"sha256:7ec0f77d6f2d7bd80a85266677ea53fb47e87191eb60a2c32c6f1b23558e1b49"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/Y2PRIO4E3LEENHWXTSUJMEA4GQ/bundle.json","state_url":"https://pith.science/pith/Y2PRIO4E3LEENHWXTSUJMEA4GQ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/Y2PRIO4E3LEENHWXTSUJMEA4GQ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-06T22:30:26Z","links":{"resolver":"https://pith.science/pith/Y2PRIO4E3LEENHWXTSUJMEA4GQ","bundle":"https://pith.science/pith/Y2PRIO4E3LEENHWXTSUJMEA4GQ/bundle.json","state":"https://pith.science/pith/Y2PRIO4E3LEENHWXTSUJMEA4GQ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/Y2PRIO4E3LEENHWXTSUJMEA4GQ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2021:Y2PRIO4E3LEENHWXTSUJMEA4GQ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"620056386ef10602f08752d34cf4fdc9b37ad1f22161507fa11deb87ca0ad34a","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-03-12T10:23:47Z","title_canon_sha256":"27961b34ee3f17ffb2c39dc34923abd3cbb8e795187eb39ceb96d48b3f33953d"},"schema_version":"1.0","source":{"id":"2103.07191","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2103.07191","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2103.07191v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2103.07191","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"Y2PRIO4E3LEE","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"Y2PRIO4E3LEENHWX","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"Y2PRIO4E","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:7ec0f77d6f2d7bd80a85266677ea53fb47e87191eb60a2c32c6f1b23558e1b49","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"MWP solvers that do not have access to the question asked in the MWP can still solve a large fraction of MWPs. Similarly, models that treat MWPs as bag-of-words can also achieve surprisingly high accuracy. The best accuracy achieved by state-of-the-art models is substantially lower on SVAMP."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the carefully chosen variations used to create SVAMP are sufficient to block all shallow heuristics while still testing the intended arithmetic reasoning."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"NLP models for elementary math word problems rely on shallow heuristics rather than genuine understanding, performing well without questions or as bag-of-words but dropping substantially on the new SVAMP variation dataset."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"NLP solvers for simple math word problems achieve high benchmark scores by exploiting shallow patterns instead of actual reasoning."}],"snapshot_sha256":"88643d083ba5b2aa8260df09b6d37f9a40b8e50c0e30100e3091326dd646f777"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"26efa7f76fb43cb2be17d454d0d5e33c052606cf54467f76be5bbd2950da5f53"},"paper":{"abstract_excerpt":"The problem of designing NLP solvers for math word problems (MWP) has seen sustained research activity and steady gains in the test accuracy. Since existing solvers achieve high performance on the benchmark datasets for elementary level MWPs containing one-unknown arithmetic word problems, such problems are often considered \"solved\" with the bulk of research attention moving to more complex MWPs. In this paper, we restrict our attention to English MWPs taught in grades four and lower. We provide strong evidence that the existing MWP solvers rely on shallow heuristics to achieve high performanc","authors_text":"Arkil Patel, Navin Goyal, Satwik Bhattamishra","cross_cats":[],"headline":"NLP solvers for simple math word problems achieve high benchmark scores by exploiting shallow patterns instead of actual reasoning.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-03-12T10:23:47Z","title":"Are NLP Models really able to Solve Simple Math Word Problems?"},"references":{"count":12,"internal_anchors":0,"resolved_work":12,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Suchin Gururangan, Swabha Swayamdipta, Omer Levy, Roy Schwartz, Samuel Bowman, and Noah A","work_id":"94e45806-43ab-42d2-a622-ba654423f9cb","year":2018},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"In Proceed- ings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 975–984, On- line","work_id":"03bc4244-11f6-46ce-846f-67799b8784dc","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"In Proceed- ings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP) , pages 3702–3710, Online","work_id":"c7adf57e-989f-47fa-bdd4-c2556f46654a","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"IEEE Transac- tions on Pattern Analysis and Machine Intelligence , 42(9):2287–2305","work_id":"a3266838-e9a2-466d-ba59-76b2d13439fc","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"B Implementation Details We use 8 NVIDIA Tesla P100 GPUs each with 16 GB memory to run our experiments","work_id":"6167010d-9c6b-43c9-8e30-20d05cb93cfd","year":null}],"snapshot_sha256":"bac01043592fe10ecc8f06eb5051cfc959091af85eb122088c5d7771a87da948"},"source":{"id":"2103.07191","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T17:26:35.756651Z","id":"60bd8bfe-824d-4207-bfb1-52a7d9f03f46","model_set":{"reader":"grok-4.3"},"one_line_summary":"NLP models for elementary math word problems rely on shallow heuristics rather than genuine understanding, performing well without questions or as bag-of-words but dropping substantially on the new SVAMP variation dataset.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"NLP solvers for simple math word problems achieve high benchmark scores by exploiting shallow patterns instead of actual reasoning.","strongest_claim":"MWP solvers that do not have access to the question asked in the MWP can still solve a large fraction of MWPs. Similarly, models that treat MWPs as bag-of-words can also achieve surprisingly high accuracy. The best accuracy achieved by state-of-the-art models is substantially lower on SVAMP.","weakest_assumption":"That the carefully chosen variations used to create SVAMP are sufficient to block all shallow heuristics while still testing the intended arithmetic reasoning."}},"verdict_id":"60bd8bfe-824d-4207-bfb1-52a7d9f03f46"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:4e017eee6140f1004a03dc17140c7bd0566205e592c245b28ee3a99992e78c31","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"620056386ef10602f08752d34cf4fdc9b37ad1f22161507fa11deb87ca0ad34a","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2021-03-12T10:23:47Z","title_canon_sha256":"27961b34ee3f17ffb2c39dc34923abd3cbb8e795187eb39ceb96d48b3f33953d"},"schema_version":"1.0","source":{"id":"2103.07191","kind":"arxiv","version":2}},"canonical_sha256":"c69f143b84dac8469ed79ca896101c343937602e2da37b54f59641f4f9c4056c","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"c69f143b84dac8469ed79ca896101c343937602e2da37b54f59641f4f9c4056c","first_computed_at":"2026-05-17T23:38:47.115752Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.115752Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"P9H3kPswR3+6ZHFos/bmehUKd/3z6BEoXc7BOFZ2fJj+LvjWyoYVQH3ZppqPvqrHuTp3qPQoMw9FBym+VPzsAg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.116211Z","signed_message":"canonical_sha256_bytes"},"source_id":"2103.07191","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:4e017eee6140f1004a03dc17140c7bd0566205e592c245b28ee3a99992e78c31","sha256:7ec0f77d6f2d7bd80a85266677ea53fb47e87191eb60a2c32c6f1b23558e1b49"],"state_sha256":"d7f2cca0514d8337c0b5ecb83127730f9d9d0613719ded0c410817438f4db1c1"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"CfTH/xE+Z0UblG6Ybwu/ea6999p+rlMoDwcwfBjm/Akno2P7kCyJ7lgSxTwYjbEf6tHEwUobSzaGuMcws/6gBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-06T22:30:26.532396Z","bundle_sha256":"cf83ea522644c7ccb469d2b7228fb90a64e14950ea33419233d4e3d97dc03eeb"}}