{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:63PMY45UWEKZNAZPOIZLJFZBOU","short_pith_number":"pith:63PMY45U","schema_version":"1.0","canonical_sha256":"f6decc73b4b11596832f7232b49721751124dbb68901389158025506ae17b373","source":{"kind":"arxiv","id":"2504.08300","version":5},"attestation_state":"computed","paper":{"title":"Large Language Models Could Be Rote Learners","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Haochao Ying, Jian Wu, Renjun Hu, Wei Lin, Xing Shi, Yuyang Xu","submitted_at":"2025-04-11T07:04:44Z","abstract_excerpt":"Benchmark-based evaluation, e.g., multiple-choice questions (MCQs) and open-ended questions (OEQs), is widely used for evaluating Large Language Models (LLMs), yet their reliability is undermined by benchmark contamination. When pre-exposed to the testing benchmark during training, less capable LLMs have been found to achieve inflated performance, thereby yielding erroneous results in LLM evaluation. In this study, we reframe contamination as an inherent aspect of learning and seek to disentangle and expose genuine capability acquisition from superficial memorization in LLM evaluation. Followi"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2504.08300","kind":"arxiv","version":5},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2025-04-11T07:04:44Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"d84f198908c44edb0972a52a6491c39eb2aef41474fd732c7a6bf47d809900df","abstract_canon_sha256":"35147cbf37c6f35d85cd0ea24ca8dd4d71f48229852595a42a3d80a1e8f0376a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:19.151490Z","signature_b64":"JM2+lkuTUGF4EWJwpwq0GOQlvRYoN79XxGvtV+idsrGSNbzo5tPaaYtV4fQsZduTVI7v5lmOQX9tRi4Huy8yDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f6decc73b4b11596832f7232b49721751124dbb68901389158025506ae17b373","last_reissued_at":"2026-05-20T00:00:19.150759Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:19.150759Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Large Language Models Could Be Rote Learners","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Haochao Ying, Jian Wu, Renjun Hu, Wei Lin, Xing Shi, Yuyang Xu","submitted_at":"2025-04-11T07:04:44Z","abstract_excerpt":"Benchmark-based evaluation, e.g., multiple-choice questions (MCQs) and open-ended questions (OEQs), is widely used for evaluating Large Language Models (LLMs), yet their reliability is undermined by benchmark contamination. When pre-exposed to the testing benchmark during training, less capable LLMs have been found to achieve inflated performance, thereby yielding erroneous results in LLM evaluation. In this study, we reframe contamination as an inherent aspect of learning and seek to disentangle and expose genuine capability acquisition from superficial memorization in LLM evaluation. Followi"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2504.08300","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2504.08300/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2504.08300","created_at":"2026-05-20T00:00:19.150882+00:00"},{"alias_kind":"arxiv_version","alias_value":"2504.08300v5","created_at":"2026-05-20T00:00:19.150882+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2504.08300","created_at":"2026-05-20T00:00:19.150882+00:00"},{"alias_kind":"pith_short_12","alias_value":"63PMY45UWEKZ","created_at":"2026-05-20T00:00:19.150882+00:00"},{"alias_kind":"pith_short_16","alias_value":"63PMY45UWEKZNAZP","created_at":"2026-05-20T00:00:19.150882+00:00"},{"alias_kind":"pith_short_8","alias_value":"63PMY45U","created_at":"2026-05-20T00:00:19.150882+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2601.05905","citing_title":"Illusions of Confidence? Diagnosing LLM Truthfulness via Neighborhood Consistency","ref_index":9,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU","json":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU.json","graph_json":"https://pith.science/api/pith-number/63PMY45UWEKZNAZPOIZLJFZBOU/graph.json","events_json":"https://pith.science/api/pith-number/63PMY45UWEKZNAZPOIZLJFZBOU/events.json","paper":"https://pith.science/paper/63PMY45U"},"agent_actions":{"view_html":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU","download_json":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU.json","view_paper":"https://pith.science/paper/63PMY45U","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2504.08300&json=true","fetch_graph":"https://pith.science/api/pith-number/63PMY45UWEKZNAZPOIZLJFZBOU/graph.json","fetch_events":"https://pith.science/api/pith-number/63PMY45UWEKZNAZPOIZLJFZBOU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU/action/storage_attestation","attest_author":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU/action/author_attestation","sign_citation":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU/action/citation_signature","submit_replication":"https://pith.science/pith/63PMY45UWEKZNAZPOIZLJFZBOU/action/replication_record"}},"created_at":"2026-05-20T00:00:19.150882+00:00","updated_at":"2026-05-20T00:00:19.150882+00:00"}