{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2020:ASTJHJTFV6SPSM2W3YA3UAD2VF","short_pith_number":"pith:ASTJHJTF","schema_version":"1.0","canonical_sha256":"04a693a665afa4f93356de01ba007aa954c12eac308743f5150934ff9a119141","source":{"kind":"arxiv","id":"2002.08910","version":4},"attestation_state":"computed","paper":{"title":"How Much Knowledge Can You Pack Into the Parameters of a Language Model?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Fine-tuned language models answer questions using only knowledge stored in their parameters.","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.CL","authors_text":"Adam Roberts, Colin Raffel, Noam Shazeer","submitted_at":"2020-02-10T18:55:58Z","abstract_excerpt":"It has recently been observed that neural language models trained on unstructured text can implicitly store and retrieve knowledge using natural language queries. In this short paper, we measure the practical utility of this approach by fine-tuning pre-trained models to answer questions without access to any external context or knowledge. We show that this approach scales with model size and performs competitively with open-domain systems that explicitly retrieve answers from an external knowledge source when answering questions. To facilitate reproducibility and future work, we release our co"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2002.08910","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2020-02-10T18:55:58Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"2963317b125562b73a72cb6679954121a20b97cabd0104f1364ebc3b3f33a2c1","abstract_canon_sha256":"da783045f046730f4f7d55bb7366b33d9c4a68171a5e2b5cc93ce32da568aa65"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.873490Z","signature_b64":"AENH5grUyyZ63jFDeKvMtEhgqFE5p7rxZKZPs8+ER2aJhn0PrLLvDHzo9aiM089dVfn641d/Nc9bhjh2E8xqCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"04a693a665afa4f93356de01ba007aa954c12eac308743f5150934ff9a119141","last_reissued_at":"2026-05-17T23:38:53.873003Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.873003Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"How Much Knowledge Can You Pack Into the Parameters of a Language Model?","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Fine-tuned language models answer questions using only knowledge stored in their parameters.","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.CL","authors_text":"Adam Roberts, Colin Raffel, Noam Shazeer","submitted_at":"2020-02-10T18:55:58Z","abstract_excerpt":"It has recently been observed that neural language models trained on unstructured text can implicitly store and retrieve knowledge using natural language queries. In this short paper, we measure the practical utility of this approach by fine-tuning pre-trained models to answer questions without access to any external context or knowledge. We show that this approach scales with model size and performs competitively with open-domain systems that explicitly retrieve answers from an external knowledge source when answering questions. To facilitate reproducibility and future work, we release our co"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"fine-tuning pre-trained models to answer questions without access to any external context or knowledge ... performs competitively with open-domain systems that explicitly retrieve answers from an external knowledge source","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the knowledge required to answer the questions is present in the pre-training data and can be effectively stored and retrieved from the model parameters through fine-tuning on QA pairs.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Fine-tuned language models store knowledge in parameters to answer questions competitively with retrieval-based open-domain QA systems.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Fine-tuned language models answer questions using only knowledge stored in their parameters.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ded9f367a679ac1c83d5f409932e3506f46aff5435fd6ed7fc90c27fb63ad3dc"},"source":{"id":"2002.08910","kind":"arxiv","version":4},"verdict":{"id":"339ec349-527e-4121-afc6-b43a1a0b9139","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T01:56:16.668688Z","strongest_claim":"fine-tuning pre-trained models to answer questions without access to any external context or knowledge ... performs competitively with open-domain systems that explicitly retrieve answers from an external knowledge source","one_line_summary":"Fine-tuned language models store knowledge in parameters to answer questions competitively with retrieval-based open-domain QA systems.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the knowledge required to answer the questions is present in the pre-training data and can be effectively stored and retrieved from the model parameters through fine-tuning on QA pairs.","pith_extraction_headline":"Fine-tuned language models answer questions using only knowledge stored in their parameters."},"references":{"count":60,"sample":[{"doi":"","year":null,"title":"A discrete hard","work_id":"9150595c-1c04-4fa6-a608-2e4ad4049e47","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Chen, Danqi and Fisch, Adam and Weston, Jason and Bordes, Antoine , journal=. Reading","work_id":"ccb6d06f-15e5-44ed-83d0-4ad20942666a","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Learning to Retrieve Reasoning Paths over","work_id":"67417f41-9e9e-4536-9301-0e60d3d8e11b","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina , journal=","work_id":"688e3467-057d-4547-89bf-cba40f753c3a","ref_index":6,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le, Quoc V. , journal=","work_id":"53a3312e-99a4-4d98-9d8f-0ac83a47cbc0","ref_index":7,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":60,"snapshot_sha256":"0817775aeec006b6dacd4bac7ec7945519cf618aa2dbba21231117a36077c2b1","internal_anchors":18},"formal_canon":{"evidence_count":2,"snapshot_sha256":"9791859ff4da89b5810cad8f8565dac92952843495d6b09e2d79c047de3ccb8e"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2002.08910","created_at":"2026-05-17T23:38:53.873078+00:00"},{"alias_kind":"arxiv_version","alias_value":"2002.08910v4","created_at":"2026-05-17T23:38:53.873078+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2002.08910","created_at":"2026-05-17T23:38:53.873078+00:00"},{"alias_kind":"pith_short_12","alias_value":"ASTJHJTFV6SP","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_16","alias_value":"ASTJHJTFV6SPSM2W","created_at":"2026-05-18T12:33:33.725879+00:00"},{"alias_kind":"pith_short_8","alias_value":"ASTJHJTF","created_at":"2026-05-18T12:33:33.725879+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":23,"internal_anchor_count":23,"sample":[{"citing_arxiv_id":"2208.03299","citing_title":"Atlas: Few-shot Learning with Retrieval Augmented Language Models","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06179","citing_title":"ARIA: Adaptive Retrieval Intelligence Assistant -- A Multimodal RAG Framework for Domain-Specific Engineering Education","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2004.04906","citing_title":"Dense Passage Retrieval for Open-Domain Question Answering","ref_index":101,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14172","citing_title":"Tug-of-War within A Decade: Conflict Resolution in Vulnerability Analysis via Teacher-Guided Retrieval-Augmented Generations","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2502.02737","citing_title":"SmolLM2: When Smol Goes Big -- Data-Centric Training of a Small Language Model","ref_index":216,"is_internal_anchor":true},{"citing_arxiv_id":"2101.03961","citing_title":"Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2202.08906","citing_title":"ST-MoE: Designing Stable and Transferable Sparse Expert Models","ref_index":189,"is_internal_anchor":true},{"citing_arxiv_id":"2112.09118","citing_title":"Unsupervised Dense Information Retrieval with Contrastive Learning","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10640","citing_title":"Towards Understanding Continual Factual Knowledge Acquisition of Language Models: From Theory to Algorithm","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09724","citing_title":"Model Capacity Determines Grokking through Competing Memorisation and Generalisation Speeds","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03344","citing_title":"RAG over Thinking Traces Can Improve Reasoning Tasks","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26981","citing_title":"Budget-Constrained Online Retrieval-Augmented Generation: The Chunk-as-a-Service Model","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2207.05608","citing_title":"Inner Monologue: Embodied Reasoning through Planning with Language Models","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06216","citing_title":"TIDE: Every Layer Knows the Token Beneath the Context","ref_index":85,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05459","citing_title":"Privacy Without Losing Place: A Paradigm for Private Retrieval in Spatial RAGs","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.12610","citing_title":"Transforming External Knowledge into Triplets for Enhanced Retrieval in RAG of LLMs","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2604.08519","citing_title":"Cram Less to Fit More: Training Data Pruning Improves Memorization of Facts","ref_index":73,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07116","citing_title":"Yale-DM-Lab at ArchEHR-QA 2026: Deterministic Grounding and Multi-Pass Evidence Alignment for EHR Question Answering","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2005.11401","citing_title":"Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2005.14165","citing_title":"Language Models are Few-Shot Learners","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18124","citing_title":"TLoRA: Task-aware Low Rank Adaptation of Large Language Models","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17200","citing_title":"Calibrating Model-Based Evaluation Metrics for Summarization","ref_index":117,"is_internal_anchor":true},{"citing_arxiv_id":"2605.02853","citing_title":"Trust, but Verify: Peeling Low-Bit Transformer Networks for Training Monitoring","ref_index":15,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF","json":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF.json","graph_json":"https://pith.science/api/pith-number/ASTJHJTFV6SPSM2W3YA3UAD2VF/graph.json","events_json":"https://pith.science/api/pith-number/ASTJHJTFV6SPSM2W3YA3UAD2VF/events.json","paper":"https://pith.science/paper/ASTJHJTF"},"agent_actions":{"view_html":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF","download_json":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF.json","view_paper":"https://pith.science/paper/ASTJHJTF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2002.08910&json=true","fetch_graph":"https://pith.science/api/pith-number/ASTJHJTFV6SPSM2W3YA3UAD2VF/graph.json","fetch_events":"https://pith.science/api/pith-number/ASTJHJTFV6SPSM2W3YA3UAD2VF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF/action/storage_attestation","attest_author":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF/action/author_attestation","sign_citation":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF/action/citation_signature","submit_replication":"https://pith.science/pith/ASTJHJTFV6SPSM2W3YA3UAD2VF/action/replication_record"}},"created_at":"2026-05-17T23:38:53.873078+00:00","updated_at":"2026-05-17T23:38:53.873078+00:00"}