{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2022:KNM7SW3MNVTU65JWDNPEVVF6NF","short_pith_number":"pith:KNM7SW3M","schema_version":"1.0","canonical_sha256":"5359f95b6c6d674f75361b5e4ad4be6951e86c150e07c40e37391cef35dfebba","source":{"kind":"arxiv","id":"2204.06745","version":1},"attestation_state":"computed","paper":{"title":"GPT-NeoX-20B: An Open-Source Autoregressive Language Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Ben Wang, Connor Leahy, Eric Hallahan, Horace He, Jason Phang, Jonathan Tow, Kyle McDonell, Laria Reynolds, Laurence Golding, Leo Gao, Michael Pieler, Quentin Anthony, Samuel Weinbach, Shivanshu Purohit, Sid Black, Stella Biderman, USVSN Sai Prashanth","submitted_at":"2022-04-14T04:00:27Z","abstract_excerpt":"We introduce GPT-NeoX-20B, a 20 billion parameter autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license. It is, to the best of our knowledge, the largest dense autoregressive model that has publicly available weights at the time of submission. In this work, we describe \\model{}'s architecture and training and evaluate its performance on a range of language-understanding, mathematics, and knowledge-based tasks. We find that GPT-NeoX-20B is a particularly powerful few-shot reasoner and gains far more i"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2204.06745","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2022-04-14T04:00:27Z","cross_cats_sorted":[],"title_canon_sha256":"70948c06c0b9c04f88c9feb44b61e750ae2def6e81da0c39e88db048e7547ea6","abstract_canon_sha256":"3ac6f4790db2765bfb802f06b5749dd2150551686b1454419508395f9ce70cd1"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:49.488007Z","signature_b64":"dIZ+CayvDl5Ujm/gFwdfcUxcrIuWM9igFwRP1OEhgTDTWRIy6QwvKrn7l4heP5Cz9TytHk3Jp+v6AO/fsoABAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"5359f95b6c6d674f75361b5e4ad4be6951e86c150e07c40e37391cef35dfebba","last_reissued_at":"2026-05-17T23:38:49.487474Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:49.487474Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"GPT-NeoX-20B: An Open-Source Autoregressive Language Model","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Ben Wang, Connor Leahy, Eric Hallahan, Horace He, Jason Phang, Jonathan Tow, Kyle McDonell, Laria Reynolds, Laurence Golding, Leo Gao, Michael Pieler, Quentin Anthony, Samuel Weinbach, Shivanshu Purohit, Sid Black, Stella Biderman, USVSN Sai Prashanth","submitted_at":"2022-04-14T04:00:27Z","abstract_excerpt":"We introduce GPT-NeoX-20B, a 20 billion parameter autoregressive language model trained on the Pile, whose weights will be made freely and openly available to the public through a permissive license. It is, to the best of our knowledge, the largest dense autoregressive model that has publicly available weights at the time of submission. In this work, we describe \\model{}'s architecture and training and evaluate its performance on a range of language-understanding, mathematics, and knowledge-based tasks. We find that GPT-NeoX-20B is a particularly powerful few-shot reasoner and gains far more i"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2204.06745","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2204.06745","created_at":"2026-05-17T23:38:49.487566+00:00"},{"alias_kind":"arxiv_version","alias_value":"2204.06745v1","created_at":"2026-05-17T23:38:49.487566+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2204.06745","created_at":"2026-05-17T23:38:49.487566+00:00"},{"alias_kind":"pith_short_12","alias_value":"KNM7SW3MNVTU","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"KNM7SW3MNVTU65JW","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"KNM7SW3M","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":37,"internal_anchor_count":17,"sample":[{"citing_arxiv_id":"2306.00978","citing_title":"AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2402.01411","citing_title":"CodePori: Large-Scale System for Autonomous Software Development Using Multi-Agent Technology","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2404.10981","citing_title":"A Survey on Retrieval-Augmented Text Generation for Large Language Models","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2410.21316","citing_title":"Deep Optimizer States: Towards Scalable Training of Transformer Models Using Interleaved Offloading","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2502.12120","citing_title":"LLMs on the Line: Data Determines Loss-to-Loss Scaling Laws","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2305.02301","citing_title":"Distilling Step-by-Step! Outperforming Larger Language Models with Less Training Data and Smaller Model Sizes","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2604.13392","citing_title":"ReSS: Learning Reasoning Models for Tabular Data Prediction via Symbolic Scaffold","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2307.06435","citing_title":"A Comprehensive Overview of Large Language Models","ref_index":118,"is_internal_anchor":true},{"citing_arxiv_id":"2305.07922","citing_title":"CodeT5+: Open Code Large Language Models for Code Understanding and Generation","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2508.16745","citing_title":"Beyond Memorization: Extending Reasoning Depth with Recurrence, Memory and Test-Time Compute Scaling","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2410.10819","citing_title":"DuoAttention: Efficient Long-Context LLM Inference with Retrieval and Streaming Heads","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2305.16264","citing_title":"Scaling Data-Constrained Language Models","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2511.17388","citing_title":"Selective Rotary Position Embedding","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2310.16789","citing_title":"Detecting Pretraining Data from Large Language Models","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2311.01378","citing_title":"Vision-Language Foundation Models as Effective Robot Imitators","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2405.14782","citing_title":"Lessons from the Trenches on Reproducible Evaluation of Language Models","ref_index":251,"is_internal_anchor":true},{"citing_arxiv_id":"2501.08313","citing_title":"MiniMax-01: Scaling Foundation Models with Lightning Attention","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2407.04620","citing_title":"Learning to (Learn at Test Time): RNNs with Expressive Hidden States","ref_index":8,"is_internal_anchor":false},{"citing_arxiv_id":"2406.00515","citing_title":"A Survey on Large Language Models for Code Generation","ref_index":29,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10288","citing_title":"BROS: Bias-Corrected Randomized Subspaces for Memory-Efficient Single-Loop Bilevel Optimization","ref_index":4,"is_internal_anchor":false},{"citing_arxiv_id":"2211.09085","citing_title":"Galactica: A Large Language Model for Science","ref_index":145,"is_internal_anchor":false},{"citing_arxiv_id":"2211.09085","citing_title":"Galactica: A Large Language Model for Science","ref_index":99,"is_internal_anchor":false},{"citing_arxiv_id":"2311.05232","citing_title":"A Survey on Hallucination in Large Language Models: Principles, Taxonomy, Challenges, and Open Questions","ref_index":24,"is_internal_anchor":false},{"citing_arxiv_id":"2303.08112","citing_title":"Eliciting Latent Predictions from Transformers with the Tuned Lens","ref_index":16,"is_internal_anchor":false},{"citing_arxiv_id":"2310.11511","citing_title":"Self-RAG: Learning to Retrieve, Generate, and Critique through Self-Reflection","ref_index":91,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF","json":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF.json","graph_json":"https://pith.science/api/pith-number/KNM7SW3MNVTU65JWDNPEVVF6NF/graph.json","events_json":"https://pith.science/api/pith-number/KNM7SW3MNVTU65JWDNPEVVF6NF/events.json","paper":"https://pith.science/paper/KNM7SW3M"},"agent_actions":{"view_html":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF","download_json":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF.json","view_paper":"https://pith.science/paper/KNM7SW3M","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2204.06745&json=true","fetch_graph":"https://pith.science/api/pith-number/KNM7SW3MNVTU65JWDNPEVVF6NF/graph.json","fetch_events":"https://pith.science/api/pith-number/KNM7SW3MNVTU65JWDNPEVVF6NF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF/action/storage_attestation","attest_author":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF/action/author_attestation","sign_citation":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF/action/citation_signature","submit_replication":"https://pith.science/pith/KNM7SW3MNVTU65JWDNPEVVF6NF/action/replication_record"}},"created_at":"2026-05-17T23:38:49.487566+00:00","updated_at":"2026-05-17T23:38:49.487566+00:00"}