{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2021:FFKM5KHQJOWAOGZE2VQIBEC7J6","short_pith_number":"pith:FFKM5KHQ","schema_version":"1.0","canonical_sha256":"2954cea8f04bac071b24d56080905f4f8157d97e96558f7a676ad31e51e54ecc","source":{"kind":"arxiv","id":"2102.04664","version":2},"attestation_state":"computed","paper":{"title":"CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"CodeXGLUE introduces a benchmark with 10 tasks across 14 datasets for code understanding and generation.","cross_cats":["cs.CL"],"primary_cat":"cs.SE","authors_text":"Alexey Svyatkovskiy, Ambrosio Blanco, Colin Clement, Dawn Drain, Daxin Jiang, Daya Guo, Duyu Tang, Ge Li, Junjie Huang, Lidong Zhou, Linjun Shou, Long Zhou, Michele Tufano, Ming Gong, Ming Zhou, Nan Duan, Neel Sundaresan, Shao Kun Deng, Shengyu Fu, Shuai Lu, Shujie Liu, Shuo Ren","submitted_at":"2021-02-09T06:16:25Z","abstract_excerpt":"Benchmark datasets have a significant impact on accelerating research in programming language tasks. In this paper, we introduce CodeXGLUE, a benchmark dataset to foster machine learning research for program understanding and generation. CodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison. CodeXGLUE also features three baseline systems, including the BERT-style, GPT-style, and Encoder-Decoder models, to make it easy for researchers to use the platform. The availability of such data and baselines can help the development and validati"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2102.04664","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.SE","submitted_at":"2021-02-09T06:16:25Z","cross_cats_sorted":["cs.CL"],"title_canon_sha256":"860be0ba9c5444e1c8b2bc4245a205e9d9c1e4bea18ae82f2556b8d32393d9c2","abstract_canon_sha256":"577000571d1832c71d8988455cd2943e664fd4fd9798e9932abb6058814ac2be"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:52.655016Z","signature_b64":"mtzjpJGFR/VM7D9mE7IlPGeqZialamVrX3CuJP09p/Kz/mo6QPuhwN9IMzOypbEq7IBMbJ4ck/Wy3dNBuf5BDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2954cea8f04bac071b24d56080905f4f8157d97e96558f7a676ad31e51e54ecc","last_reissued_at":"2026-05-17T23:38:52.654437Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:52.654437Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"CodeXGLUE: A Machine Learning Benchmark Dataset for Code Understanding and Generation","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"CodeXGLUE introduces a benchmark with 10 tasks across 14 datasets for code understanding and generation.","cross_cats":["cs.CL"],"primary_cat":"cs.SE","authors_text":"Alexey Svyatkovskiy, Ambrosio Blanco, Colin Clement, Dawn Drain, Daxin Jiang, Daya Guo, Duyu Tang, Ge Li, Junjie Huang, Lidong Zhou, Linjun Shou, Long Zhou, Michele Tufano, Ming Gong, Ming Zhou, Nan Duan, Neel Sundaresan, Shao Kun Deng, Shengyu Fu, Shuai Lu, Shujie Liu, Shuo Ren","submitted_at":"2021-02-09T06:16:25Z","abstract_excerpt":"Benchmark datasets have a significant impact on accelerating research in programming language tasks. In this paper, we introduce CodeXGLUE, a benchmark dataset to foster machine learning research for program understanding and generation. CodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison. CodeXGLUE also features three baseline systems, including the BERT-style, GPT-style, and Encoder-Decoder models, to make it easy for researchers to use the platform. The availability of such data and baselines can help the development and validati"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"CodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The selected 10 tasks and 14 datasets are assumed to be representative of the broader space of program understanding and generation problems.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"CodeXGLUE supplies a standardized collection of 10 code-related tasks, 14 datasets, an evaluation platform, and BERT-, GPT-, and encoder-decoder-style baselines.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"CodeXGLUE introduces a benchmark with 10 tasks across 14 datasets for code understanding and generation.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"52213557bf4deec84f17e8b355a20d8b4ae6cd4a8055bf71e79df8ed0ae7ab55"},"source":{"id":"2102.04664","kind":"arxiv","version":2},"verdict":{"id":"8627a318-fd9d-4346-966f-3ae52cfd3dd9","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T11:35:59.404148Z","strongest_claim":"CodeXGLUE includes a collection of 10 tasks across 14 datasets and a platform for model evaluation and comparison.","one_line_summary":"CodeXGLUE supplies a standardized collection of 10 code-related tasks, 14 datasets, an evaluation platform, and BERT-, GPT-, and encoder-decoder-style baselines.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The selected 10 tasks and 14 datasets are assumed to be representative of the broader space of program understanding and generation problems.","pith_extraction_headline":"CodeXGLUE introduces a benchmark with 10 tasks across 14 datasets for code understanding and generation."},"references":{"count":107,"sample":[{"doi":"10.1145/3212695","year":2018,"title":"T., Devanbu, P., and Sutton, C","work_id":"db061796-6810-4e22-9e85-3a1b70629115","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2017,"title":"Learning to represent programs with graphs","work_id":"3a9d5702-9b14-4cf1-abe5-abec3d6bae13","ref_index":2,"cited_arxiv_id":"1711.00740","is_internal_anchor":true},{"doi":"","year":2016,"title":"Miltiadis Allamanis, Hao Peng, and Charles Sutton. 2016. A convolutional at- tention network for extreme summarization of source code. In International conference on machine learning . 2091–2100","work_id":"2883b1f0-64a1-4b7f-99f9-02165afecc7e","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2013,"title":"Miltiadis Allamanis and Charles Sutton. 2013. Mining Source Code Repositories at Massive Scale using Language Modeling. In 2013 10th Working Conference on Mining Software Repositories (MSR) . IEEE, 20","work_id":"fd3652df-9444-49be-8b7a-7abff7921ca4","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2014,"title":"Miltiadis Allamanis and Charles Sutton. 2014. Mining idioms from source code. In Proceedings of the 22nd ACM SIGSOFT International Symposium on Foundations of Software Engineering. 472–483","work_id":"3fd64261-694f-4094-a41b-0c41147c3f74","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":107,"snapshot_sha256":"9542bd20c6eea5553e01e67f4b5cd75db0e3b58e44f1a2e8343ad23b3fa0e672","internal_anchors":20},"formal_canon":{"evidence_count":3,"snapshot_sha256":"5fa5712a9ec7732418156e7f6ca5c33e7572e96170866d783481f6650472e619"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2102.04664","created_at":"2026-05-17T23:38:52.654516+00:00"},{"alias_kind":"arxiv_version","alias_value":"2102.04664v2","created_at":"2026-05-17T23:38:52.654516+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2102.04664","created_at":"2026-05-17T23:38:52.654516+00:00"},{"alias_kind":"pith_short_12","alias_value":"FFKM5KHQJOWA","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"FFKM5KHQJOWAOGZE","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"FFKM5KHQ","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":37,"internal_anchor_count":37,"sample":[{"citing_arxiv_id":"2502.14925","citing_title":"CODEPROMPTZIP: Code-specific Prompt Compression for Retrieval-Augmented Generation in Coding Tasks with LMs","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2503.02497","citing_title":"A PennyLane-Centric Dataset to Enhance LLM-based Quantum Code Generation using RAG","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2503.14281","citing_title":"XOXO: Stealthy Cross-Origin Context Poisoning Attacks against AI Coding Assistants","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2509.20881","citing_title":"PseudoBridge: Pseudo Code as the Bridge for Better Semantic and Logic Alignment in Code Retrieval","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20788","citing_title":"BioDefect: The First Dataset for Defect Detection in Bioinformatics Software","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17458","citing_title":"ClaHF: A Human Feedback-inspired Reinforcement Learning Framework for Improving Classification Tasks","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17152","citing_title":"Multilingual and Multimodal LLMs in the Wild: Building for Low-Resource Languages","ref_index":219,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19365","citing_title":"On-the-Fly Input Adaptation for Reliable Code Intelligence","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19369","citing_title":"When to Answer and When to Defer: A Decision Framework for Reliable Code Predictions","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2508.03949","citing_title":"Model Compression vs. Adversarial Robustness: An Empirical Study on Language Models for Code","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2508.16771","citing_title":"EyeMulator: Improving Code Language Models by Mimicking Human Visual Attention","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2510.04166","citing_title":"Multi Language Models for On-the-Fly Syntax Highlighting","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2511.05476","citing_title":"A Metamorphic Testing Perspective on Knowledge Distillation for Language Models of Code: Does the Student Deeply Mimic the Teacher?","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2601.16456","citing_title":"RubberDuckBench: A Benchmark for AI Coding Assistants","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2306.03091","citing_title":"RepoBench: Benchmarking Repository-Level Code Auto-Completion Systems","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2603.00989","citing_title":"Sustainable Code Generation Using Large Language Models: A Systematic Literature Review","ref_index":130,"is_internal_anchor":true},{"citing_arxiv_id":"2603.06276","citing_title":"Story Point Estimation Using Large Language Models","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2109.00859","citing_title":"CodeT5: Identifier-aware Unified Pre-trained Encoder-Decoder Models for Code Understanding and Generation","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13896","citing_title":"Neural Code Translation of Legacy Code: APL to C#","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14503","citing_title":"Not All RAGs Are Created Equal: A Component-Wise Empirical Study for Software Engineering Tasks","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13280","citing_title":"The Readability Spectrum: Patterns, Issues, and Prompt Effects in LLM-Generated Code","ref_index":60,"is_internal_anchor":true},{"citing_arxiv_id":"2305.01210","citing_title":"Is Your Code Generated by ChatGPT Really Correct? Rigorous Evaluation of Large Language Models for Code Generation","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02761","citing_title":"Sustainability Analysis of Prompt Strategies for SLM-based Automated Test Generation","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2406.00515","citing_title":"A Survey on Large Language Models for Code Generation","ref_index":176,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11269","citing_title":"gwBenchmarks: Stress-Testing LLM Agents on High-Precision Gravitational Wave Astronomy","ref_index":53,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":3,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6","json":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6.json","graph_json":"https://pith.science/api/pith-number/FFKM5KHQJOWAOGZE2VQIBEC7J6/graph.json","events_json":"https://pith.science/api/pith-number/FFKM5KHQJOWAOGZE2VQIBEC7J6/events.json","paper":"https://pith.science/paper/FFKM5KHQ"},"agent_actions":{"view_html":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6","download_json":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6.json","view_paper":"https://pith.science/paper/FFKM5KHQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2102.04664&json=true","fetch_graph":"https://pith.science/api/pith-number/FFKM5KHQJOWAOGZE2VQIBEC7J6/graph.json","fetch_events":"https://pith.science/api/pith-number/FFKM5KHQJOWAOGZE2VQIBEC7J6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6/action/storage_attestation","attest_author":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6/action/author_attestation","sign_citation":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6/action/citation_signature","submit_replication":"https://pith.science/pith/FFKM5KHQJOWAOGZE2VQIBEC7J6/action/replication_record"}},"created_at":"2026-05-17T23:38:52.654516+00:00","updated_at":"2026-05-17T23:38:52.654516+00:00"}