{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:UHXBMDQASPG5XYGWXVOS6SX6RG","short_pith_number":"pith:UHXBMDQA","schema_version":"1.0","canonical_sha256":"a1ee160e0093cddbe0d6bd5d2f4afe899f4f8a616859e8b6b59d1f38f69cac2f","source":{"kind":"arxiv","id":"2406.04244","version":1},"attestation_state":"computed","paper":{"title":"Benchmark Data Contamination of Large Language Models: A Survey","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Cheng Xu, Derek Greene, M-Tahar Kechadi, Shuhao Guan","submitted_at":"2024-06-06T16:41:39Z","abstract_excerpt":"The rapid development of Large Language Models (LLMs) like GPT-4, Claude-3, and Gemini has transformed the field of natural language processing. However, it has also resulted in a significant issue known as Benchmark Data Contamination (BDC). This occurs when language models inadvertently incorporate evaluation benchmark information from their training data, leading to inaccurate or unreliable performance during the evaluation phase of the process. This paper reviews the complex challenge of BDC in LLM evaluation and explores alternative assessment methods to mitigate the risks associated with"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2406.04244","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-06-06T16:41:39Z","cross_cats_sorted":[],"title_canon_sha256":"d142bdb5ee38989c04bd7e0b7d4be8412f8d73f7813cb9772b1a7b3bc8be4eef","abstract_canon_sha256":"bbd4335ebcbd8bfff3e05e631c12d0350c341ffbc1cef14a1b9a2650f73a44a5"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T23:03:43.315110Z","signature_b64":"vRmmV+PnUkmJSfRIsXUQV0t8VCzLNowJAeSn1nUATuP2tUhq37o3XVpx27ZkTFZzp8hMBFDUHhj0yp1tax9lBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a1ee160e0093cddbe0d6bd5d2f4afe899f4f8a616859e8b6b59d1f38f69cac2f","last_reissued_at":"2026-05-22T23:03:43.311884Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T23:03:43.311884Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Benchmark Data Contamination of Large Language Models: A Survey","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Cheng Xu, Derek Greene, M-Tahar Kechadi, Shuhao Guan","submitted_at":"2024-06-06T16:41:39Z","abstract_excerpt":"The rapid development of Large Language Models (LLMs) like GPT-4, Claude-3, and Gemini has transformed the field of natural language processing. However, it has also resulted in a significant issue known as Benchmark Data Contamination (BDC). This occurs when language models inadvertently incorporate evaluation benchmark information from their training data, leading to inaccurate or unreliable performance during the evaluation phase of the process. This paper reviews the complex challenge of BDC in LLM evaluation and explores alternative assessment methods to mitigate the risks associated with"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2406.04244","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2406.04244/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2406.04244","created_at":"2026-05-22T23:03:43.312062+00:00"},{"alias_kind":"arxiv_version","alias_value":"2406.04244v1","created_at":"2026-05-22T23:03:43.312062+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.04244","created_at":"2026-05-22T23:03:43.312062+00:00"},{"alias_kind":"pith_short_12","alias_value":"UHXBMDQASPG5","created_at":"2026-05-22T23:03:43.312062+00:00"},{"alias_kind":"pith_short_16","alias_value":"UHXBMDQASPG5XYGW","created_at":"2026-05-22T23:03:43.312062+00:00"},{"alias_kind":"pith_short_8","alias_value":"UHXBMDQA","created_at":"2026-05-22T23:03:43.312062+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":25,"internal_anchor_count":25,"sample":[{"citing_arxiv_id":"2508.15503","citing_title":"Guidelines for Empirical Studies in Software Engineering involving Large Language Models","ref_index":150,"is_internal_anchor":true},{"citing_arxiv_id":"2503.17181","citing_title":"A Study of LLMs' Preferences for Libraries and Programming Languages","ref_index":81,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21856","citing_title":"The Illusion of Reasoning: Exposing Evasive Data Contamination in LLMs via Zero-CoT Truncation","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21543","citing_title":"Provable Joint Decontamination for Benchmarking Multiple Large Language Models","ref_index":171,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23267","citing_title":"Fine-tuning vs. In-context Learning in Large Language Models: A Formal Language Learning Perspective","ref_index":73,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19999","citing_title":"LLM Benchmark Datasets Should Be Contamination-Resistant","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19711","citing_title":"Can Large Language Models Reliably Correct Errors in Low-Resource ASR? A Contamination-Aware Case Study on West Frisian","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2507.22359","citing_title":"League of LLMs: A Benchmark-Free Paradigm for Mutual Evaluation of Large Language Models","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2507.23009","citing_title":"Position: Stop Evaluating AI with Human Tests, Develop Principled, AI-specific Tests instead","ref_index":86,"is_internal_anchor":true},{"citing_arxiv_id":"2508.15503","citing_title":"Guidelines for Empirical Studies in Software Engineering involving Large Language Models","ref_index":150,"is_internal_anchor":true},{"citing_arxiv_id":"2512.05929","citing_title":"LLM Harms: A Taxonomy and Discussion","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01847","citing_title":"NeuroState-Bench: A Human-Calibrated Benchmark for Commitment Integrity in LLM Agent Profiles","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14164","citing_title":"Unsteady Metrics and Benchmarking Cultures of AI Model Builders","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2509.16941","citing_title":"SWE-Bench Pro: Can AI Agents Solve Long-Horizon Software Engineering Tasks?","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26048","citing_title":"BioGraphletQA: Knowledge-Anchored Generation of Complex QA Datasets","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23267","citing_title":"Fine-tuning vs. In-context Learning in Large Language Models: A Formal Language Learning Perspective","ref_index":73,"is_internal_anchor":true},{"citing_arxiv_id":"2604.20273","citing_title":"ActuBench: A Multi-Agent LLM Pipeline for Generation and Evaluation of Actuarial Reasoning Tasks","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18543","citing_title":"ClawEnvKit: Automatic Environment Generation for Claw-Like Agents","ref_index":111,"is_internal_anchor":true},{"citing_arxiv_id":"2605.01847","citing_title":"NeuroState-Bench: A Human-Calibrated Benchmark for Commitment Integrity in LLM Agent Profiles","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07650","citing_title":"How Independent are Large Language Models? A Statistical Framework for Auditing Behavioral Entanglement and Reweighting Verifier Ensembles","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06802","citing_title":"Riemann-Bench: A Benchmark for Moonshot Mathematics","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06865","citing_title":"Dataset Watermarking for Closed LLMs with Provable Detection","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2604.05150","citing_title":"Compiled AI: Deterministic Code Generation for LLM-Based Workflow Automation","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2604.04815","citing_title":"LiveFact: A Dynamic, Time-Aware Benchmark for LLM-Driven Fake News Detection","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19642","citing_title":"Micro Language Models Enable Instant Responses","ref_index":7,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG","json":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG.json","graph_json":"https://pith.science/api/pith-number/UHXBMDQASPG5XYGWXVOS6SX6RG/graph.json","events_json":"https://pith.science/api/pith-number/UHXBMDQASPG5XYGWXVOS6SX6RG/events.json","paper":"https://pith.science/paper/UHXBMDQA"},"agent_actions":{"view_html":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG","download_json":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG.json","view_paper":"https://pith.science/paper/UHXBMDQA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2406.04244&json=true","fetch_graph":"https://pith.science/api/pith-number/UHXBMDQASPG5XYGWXVOS6SX6RG/graph.json","fetch_events":"https://pith.science/api/pith-number/UHXBMDQASPG5XYGWXVOS6SX6RG/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG/action/timestamp_anchor","attest_storage":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG/action/storage_attestation","attest_author":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG/action/author_attestation","sign_citation":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG/action/citation_signature","submit_replication":"https://pith.science/pith/UHXBMDQASPG5XYGWXVOS6SX6RG/action/replication_record"}},"created_at":"2026-05-22T23:03:43.312062+00:00","updated_at":"2026-05-22T23:03:43.312062+00:00"}