{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:QEET3LADZ3NCLD36BMTOHQVDRT","short_pith_number":"pith:QEET3LAD","schema_version":"1.0","canonical_sha256":"81093dac03ceda258f7e0b26e3c2a38cc7a8d7e8d5b8bf146611ab68d6d2dc25","source":{"kind":"arxiv","id":"1910.07467","version":1},"attestation_state":"computed","paper":{"title":"Root Mean Square Layer Normalization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"RMSNorm delivers re-scaling invariance and comparable accuracy to LayerNorm while cutting computation by skipping mean subtraction, yielding 7-64% runtime reductions across tested models.","cross_cats":["cs.CL","stat.ML"],"primary_cat":"cs.LG","authors_text":"Biao Zhang, Rico Sennrich","submitted_at":"2019-10-16T16:44:22Z","abstract_excerpt":"Layer normalization (LayerNorm) has been successfully applied to various deep neural networks to help stabilize training and boost model convergence because of its capability in handling re-centering and re-scaling of both inputs and weight matrix. However, the computational overhead introduced by LayerNorm makes these improvements expensive and significantly slows the underlying network, e.g. RNN in particular. In this paper, we hypothesize that re-centering invariance in LayerNorm is dispensable and propose root mean square layer normalization, or RMSNorm. RMSNorm regularizes the summed inpu"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"1910.07467","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-10-16T16:44:22Z","cross_cats_sorted":["cs.CL","stat.ML"],"title_canon_sha256":"13f2e705c8860613b17cff09bdab6ee432490ed93dbe9c4b5baad45f52d78345","abstract_canon_sha256":"010165e0e38b20055e51086183e93942e4b020d8ca5e1b88c446daa3842a151b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.383645Z","signature_b64":"nRAtZjP0SMxQyFMiZ55AzVpxnWfZ0YE7guadG1O3D2jcDQ+R+7P2OodvKCFkxzCa328G3lthBR9d83oQ3xylBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"81093dac03ceda258f7e0b26e3c2a38cc7a8d7e8d5b8bf146611ab68d6d2dc25","last_reissued_at":"2026-05-17T23:38:13.382892Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.382892Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Root Mean Square Layer Normalization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"RMSNorm delivers re-scaling invariance and comparable accuracy to LayerNorm while cutting computation by skipping mean subtraction, yielding 7-64% runtime reductions across tested models.","cross_cats":["cs.CL","stat.ML"],"primary_cat":"cs.LG","authors_text":"Biao Zhang, Rico Sennrich","submitted_at":"2019-10-16T16:44:22Z","abstract_excerpt":"Layer normalization (LayerNorm) has been successfully applied to various deep neural networks to help stabilize training and boost model convergence because of its capability in handling re-centering and re-scaling of both inputs and weight matrix. However, the computational overhead introduced by LayerNorm makes these improvements expensive and significantly slows the underlying network, e.g. RNN in particular. In this paper, we hypothesize that re-centering invariance in LayerNorm is dispensable and propose root mean square layer normalization, or RMSNorm. RMSNorm regularizes the summed inpu"},"claims":{"count":3,"items":[{"kind":"strongest_claim","text":"Extensive experiments on several tasks using diverse network architectures show that RMSNorm achieves comparable performance against LayerNorm but reduces the running time by 7%~64% on different models.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"Re-centering invariance in LayerNorm is dispensable for the stabilization and convergence benefits the method provides.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"RMSNorm delivers re-scaling invariance and comparable accuracy to LayerNorm while cutting computation by skipping mean subtraction, yielding 7-64% runtime reductions across tested models.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"}],"snapshot_sha256":"500d62aac33e3cac7d4290625de31760a2485fa97f9a7fe5b8711432f5c0f68b"},"source":{"id":"1910.07467","kind":"arxiv","version":1},"verdict":{"id":"428befb3-8470-42df-b5d6-636fd531defa","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T18:35:04.596165Z","strongest_claim":"Extensive experiments on several tasks using diverse network architectures show that RMSNorm achieves comparable performance against LayerNorm but reduces the running time by 7%~64% on different models.","one_line_summary":"RMSNorm delivers re-scaling invariance and comparable accuracy to LayerNorm while cutting computation by skipping mean subtraction, yielding 7-64% runtime reductions across tested models.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"Re-centering invariance in LayerNorm is dispensable for the stabilization and convergence benefits the method provides.","pith_extraction_headline":""},"references":{"count":37,"sample":[{"doi":"","year":2016,"title":"Murray, Benoit Steiner, Paul Tucker, Vijay Vasudevan, Pete Warden, Martin Wicke, Yuan Yu, and Xiaoqiang Zheng","work_id":"7d508c46-4bf6-4a21-a869-a2047d225905","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2016,"title":"Normalization Propagation: A Parametric Technique for Removing Internal Covariate Shift in Deep Networks","work_id":"c9de45d0-ed57-47a0-ac8d-f52a961173b4","ref_index":2,"cited_arxiv_id":"1603.01431","is_internal_anchor":true},{"doi":"","year":2016,"title":"Layer Normalization","work_id":"20a2d720-0046-4c7c-bcd6-327ec8143f69","ref_index":3,"cited_arxiv_id":"1607.06450","is_internal_anchor":true},{"doi":"","year":2014,"title":"Neural Machine Translation by Jointly Learning to Align and Translate","work_id":"d831e763-d530-4029-a65c-ac595d82cb2a","ref_index":4,"cited_arxiv_id":"1409.0473","is_internal_anchor":true},{"doi":"","year":2018,"title":"Understanding batch normalization","work_id":"e66d9f22-11a6-481d-b49f-73514654db8e","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":37,"snapshot_sha256":"820b5a6f211d3171f5b53ca84896d0d7c098f19d6e10fb9c703b7f327465d1e0","internal_anchors":16},"formal_canon":{"evidence_count":2,"snapshot_sha256":"04c43cf71823bd0f60267fd06334181c3788186a5fdf182cb393959fc49b918f"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1910.07467","created_at":"2026-05-17T23:38:13.382999+00:00"},{"alias_kind":"arxiv_version","alias_value":"1910.07467v1","created_at":"2026-05-17T23:38:13.382999+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1910.07467","created_at":"2026-05-17T23:38:13.382999+00:00"},{"alias_kind":"pith_short_12","alias_value":"QEET3LADZ3NC","created_at":"2026-05-18T12:33:27.125529+00:00"},{"alias_kind":"pith_short_16","alias_value":"QEET3LADZ3NCLD36","created_at":"2026-05-18T12:33:27.125529+00:00"},{"alias_kind":"pith_short_8","alias_value":"QEET3LAD","created_at":"2026-05-18T12:33:27.125529+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":24,"internal_anchor_count":24,"sample":[{"citing_arxiv_id":"2605.23061","citing_title":"Anytime Training with Schedule-Free Spectral Optimization","ref_index":78,"is_internal_anchor":true},{"citing_arxiv_id":"2407.09577","citing_title":"FlashNorm: Fast Normalization for Transformers","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2601.21798","citing_title":"CG-MLLM: Captioning and Generating 3D content via Multi-modal Large Language Models","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2602.10408","citing_title":"Gated Normalization Removal and Scale Anchoring in Pre-Norm Transformers","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18933","citing_title":"A Geometric Analysis of Sign-Magnitude Asymmetry in a ReLU + RMSNorm Block under Ternary Quantization","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18163","citing_title":"TRACE: Trajectory Correction from Cross-layer Evidence for Hallucination Reduction","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2512.20856","citing_title":"NVIDIA Nemotron 3: Efficient and Open Intelligence","ref_index":116,"is_internal_anchor":true},{"citing_arxiv_id":"2601.14910","citing_title":"PipeWeave: Synergizing Analytical and Learning Models for Unified GPU Performance Prediction","ref_index":77,"is_internal_anchor":true},{"citing_arxiv_id":"2509.06951","citing_title":"F1: A Vision-Language-Action Model Bridging Understanding and Generation to Actions","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2601.17636","citing_title":"HealDA: Highlighting the importance of initial errors in end-to-end AI weather forecasts","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10886","citing_title":"LoKA: Low-precision Kernel Applications for Recommendation Models At Scale","ref_index":82,"is_internal_anchor":true},{"citing_arxiv_id":"2506.21734","citing_title":"Hierarchical Reasoning Model","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12011","citing_title":"CaloArt: Large-Patch x-Prediction Diffusion Transformers for High-Granularity Calorimeter Shower Generation","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2202.08906","citing_title":"ST-MoE: Designing Stable and Transferable Sparse Expert Models","ref_index":213,"is_internal_anchor":true},{"citing_arxiv_id":"2605.10886","citing_title":"LoKA: Low-precision Kernel Applications for Recommendation Models At Scale","ref_index":82,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08300","citing_title":"mHC-SSM: Manifold-Constrained Hyper-Connections for State Space Language Models with Stream-Specialized Adapters","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24809","citing_title":"Nautile-370M: Spectral Memory Meets Attention in a Small Reasoning Model","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23318","citing_title":"Hidden States Know Where Reasoning Diverges: Credit Assignment via Span-Level Wasserstein Distance","ref_index":32,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04527","citing_title":"Velox: Learning Representations of 4D Geometry and Appearance","ref_index":114,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04418","citing_title":"Demystifying Manifold Constraints in LLM Pre-training","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00206","citing_title":"State Stream Transformer (SST) V2: Parallel Training of Nonlinear Recurrence for Latent Space Reasoning","ref_index":54,"is_internal_anchor":true},{"citing_arxiv_id":"2604.12946","citing_title":"Parcae: Scaling Laws For Stable Looped Language Models","ref_index":92,"is_internal_anchor":true},{"citing_arxiv_id":"2403.08295","citing_title":"Gemma: Open Models Based on Gemini Research and Technology","ref_index":99,"is_internal_anchor":true},{"citing_arxiv_id":"2408.00118","citing_title":"Gemma 2: Improving Open Language Models at a Practical Size","ref_index":111,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT","json":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT.json","graph_json":"https://pith.science/api/pith-number/QEET3LADZ3NCLD36BMTOHQVDRT/graph.json","events_json":"https://pith.science/api/pith-number/QEET3LADZ3NCLD36BMTOHQVDRT/events.json","paper":"https://pith.science/paper/QEET3LAD"},"agent_actions":{"view_html":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT","download_json":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT.json","view_paper":"https://pith.science/paper/QEET3LAD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1910.07467&json=true","fetch_graph":"https://pith.science/api/pith-number/QEET3LADZ3NCLD36BMTOHQVDRT/graph.json","fetch_events":"https://pith.science/api/pith-number/QEET3LADZ3NCLD36BMTOHQVDRT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT/action/storage_attestation","attest_author":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT/action/author_attestation","sign_citation":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT/action/citation_signature","submit_replication":"https://pith.science/pith/QEET3LADZ3NCLD36BMTOHQVDRT/action/replication_record"}},"created_at":"2026-05-17T23:38:13.382999+00:00","updated_at":"2026-05-17T23:38:13.382999+00:00"}