{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:NNWZ43PU2DFMGEY6WIMM5Y45BD","short_pith_number":"pith:NNWZ43PU","schema_version":"1.0","canonical_sha256":"6b6d9e6df4d0cac3131eb218cee39d08fadd075e3b3414138cea0dc4fa540c76","source":{"kind":"arxiv","id":"1706.05350","version":1},"attestation_state":"computed","paper":{"title":"L2 Regularization versus Batch and Weight Normalization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Twan van Laarhoven","submitted_at":"2017-06-16T17:08:08Z","abstract_excerpt":"Batch Normalization is a commonly used trick to improve the training of deep neural networks. These neural networks use L2 regularization, also called weight decay, ostensibly to prevent overfitting. However, we show that L2 regularization has no regularizing effect when combined with normalization. Instead, regularization has an influence on the scale of weights, and thereby on the effective learning rate. We investigate this dependence, both in theory, and experimentally. We show that popular optimization methods such as ADAM only partially eliminate the influence of normalization on the lea"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1706.05350","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2017-06-16T17:08:08Z","cross_cats_sorted":["stat.ML"],"title_canon_sha256":"ee8541df693c02f1fe2b0b451abf77af864f5f1012c7aa6fdcfe40114d737c09","abstract_canon_sha256":"f347e157ca4ba14136930554f133074a3890d82db449953e3c157f6857f6efd9"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:42:14.016779Z","signature_b64":"CElL2RjQuu1x22aqO9d+wIqJBFbaMFC7//tejONEYsFwnaKUP48evE8I7bNjfARxyqPUsaFPvkUdIYWMA2l1Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6b6d9e6df4d0cac3131eb218cee39d08fadd075e3b3414138cea0dc4fa540c76","last_reissued_at":"2026-05-18T00:42:14.016068Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:42:14.016068Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"L2 Regularization versus Batch and Weight Normalization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["stat.ML"],"primary_cat":"cs.LG","authors_text":"Twan van Laarhoven","submitted_at":"2017-06-16T17:08:08Z","abstract_excerpt":"Batch Normalization is a commonly used trick to improve the training of deep neural networks. These neural networks use L2 regularization, also called weight decay, ostensibly to prevent overfitting. However, we show that L2 regularization has no regularizing effect when combined with normalization. Instead, regularization has an influence on the scale of weights, and thereby on the effective learning rate. We investigate this dependence, both in theory, and experimentally. We show that popular optimization methods such as ADAM only partially eliminate the influence of normalization on the lea"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1706.05350","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1706.05350","created_at":"2026-05-18T00:42:14.016189+00:00"},{"alias_kind":"arxiv_version","alias_value":"1706.05350v1","created_at":"2026-05-18T00:42:14.016189+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1706.05350","created_at":"2026-05-18T00:42:14.016189+00:00"},{"alias_kind":"pith_short_12","alias_value":"NNWZ43PU2DFM","created_at":"2026-05-18T12:31:34.259226+00:00"},{"alias_kind":"pith_short_16","alias_value":"NNWZ43PU2DFMGEY6","created_at":"2026-05-18T12:31:34.259226+00:00"},{"alias_kind":"pith_short_8","alias_value":"NNWZ43PU","created_at":"2026-05-18T12:31:34.259226+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":8,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2510.04686","citing_title":"How does the optimizer implicitly bias the model merging loss landscape?","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2511.07308","citing_title":"Can Stationary Distributions of Scale-Invariant Neural Networks Be Described by the Thermodynamics of an Ideal Gas?","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2512.15930","citing_title":"Scalable Agentic Reasoning for Designing Biologics Targeting Intrinsically Disordered Proteins","ref_index":63,"is_internal_anchor":true},{"citing_arxiv_id":"1710.10196","citing_title":"Progressive Growing of GANs for Improved Quality, Stability, and Variation","ref_index":50,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10734","citing_title":"XQCfD: Accelerating Fast Actor-Critic Algorithms with Prior Data and Prior Policies","ref_index":40,"is_internal_anchor":false},{"citing_arxiv_id":"2605.04418","citing_title":"Demystifying Manifold Constraints in LLM Pre-training","ref_index":60,"is_internal_anchor":false},{"citing_arxiv_id":"2605.00171","citing_title":"Adaptive Norm-Based Regularization for Neural Networks","ref_index":20,"is_internal_anchor":false},{"citing_arxiv_id":"2604.04539","citing_title":"FlashSAC: Fast and Stable Off-Policy Reinforcement Learning for High-Dimensional Robot Control","ref_index":88,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD","json":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD.json","graph_json":"https://pith.science/api/pith-number/NNWZ43PU2DFMGEY6WIMM5Y45BD/graph.json","events_json":"https://pith.science/api/pith-number/NNWZ43PU2DFMGEY6WIMM5Y45BD/events.json","paper":"https://pith.science/paper/NNWZ43PU"},"agent_actions":{"view_html":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD","download_json":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD.json","view_paper":"https://pith.science/paper/NNWZ43PU","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1706.05350&json=true","fetch_graph":"https://pith.science/api/pith-number/NNWZ43PU2DFMGEY6WIMM5Y45BD/graph.json","fetch_events":"https://pith.science/api/pith-number/NNWZ43PU2DFMGEY6WIMM5Y45BD/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD/action/storage_attestation","attest_author":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD/action/author_attestation","sign_citation":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD/action/citation_signature","submit_replication":"https://pith.science/pith/NNWZ43PU2DFMGEY6WIMM5Y45BD/action/replication_record"}},"created_at":"2026-05-18T00:42:14.016189+00:00","updated_at":"2026-05-18T00:42:14.016189+00:00"}