{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:SETOAZEYAMXLS2ZIS56Z4PQUM4","short_pith_number":"pith:SETOAZEY","schema_version":"1.0","canonical_sha256":"9126e06498032eb96b28977d9e3e146721a5a4ccda077940e46273ad7ec17c3b","source":{"kind":"arxiv","id":"1803.05407","version":3},"attestation_state":"computed","paper":{"title":"Averaging Weights Leads to Wider Optima and Better Generalization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CV","stat.ML"],"primary_cat":"cs.LG","authors_text":"Andrew Gordon Wilson, Dmitrii Podoprikhin, Dmitry Vetrov, Pavel Izmailov, Timur Garipov","submitted_at":"2018-03-14T17:09:27Z","abstract_excerpt":"Deep neural networks are typically trained by optimizing a loss function with an SGD variant, in conjunction with a decaying learning rate, until convergence. We show that simple averaging of multiple points along the trajectory of SGD, with a cyclical or constant learning rate, leads to better generalization than conventional training. We also show that this Stochastic Weight Averaging (SWA) procedure finds much flatter solutions than SGD, and approximates the recent Fast Geometric Ensembling (FGE) approach with a single model. Using SWA we achieve notable improvement in test accuracy over co"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1803.05407","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-03-14T17:09:27Z","cross_cats_sorted":["cs.AI","cs.CV","stat.ML"],"title_canon_sha256":"d607ccfd50cde3bb43b1255345618b2b00a1b5046f1516107637080d707abb64","abstract_canon_sha256":"5e265993e34fccbbb2148419991c16061c9177440d5fb622841a059d5e441e7d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:52:53.272097Z","signature_b64":"lTS72qpnAt9JA3LEe+BLu0X8Aqwr1AdXeV92JQcNIHq/ajJqUicU2G3kmGG6vzk8CXju6TVFdk/mghXZ/+jUAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9126e06498032eb96b28977d9e3e146721a5a4ccda077940e46273ad7ec17c3b","last_reissued_at":"2026-05-17T23:52:53.271272Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:52:53.271272Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Averaging Weights Leads to Wider Optima and Better Generalization","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CV","stat.ML"],"primary_cat":"cs.LG","authors_text":"Andrew Gordon Wilson, Dmitrii Podoprikhin, Dmitry Vetrov, Pavel Izmailov, Timur Garipov","submitted_at":"2018-03-14T17:09:27Z","abstract_excerpt":"Deep neural networks are typically trained by optimizing a loss function with an SGD variant, in conjunction with a decaying learning rate, until convergence. We show that simple averaging of multiple points along the trajectory of SGD, with a cyclical or constant learning rate, leads to better generalization than conventional training. We also show that this Stochastic Weight Averaging (SWA) procedure finds much flatter solutions than SGD, and approximates the recent Fast Geometric Ensembling (FGE) approach with a single model. Using SWA we achieve notable improvement in test accuracy over co"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1803.05407","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1803.05407","created_at":"2026-05-17T23:52:53.271426+00:00"},{"alias_kind":"arxiv_version","alias_value":"1803.05407v3","created_at":"2026-05-17T23:52:53.271426+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1803.05407","created_at":"2026-05-17T23:52:53.271426+00:00"},{"alias_kind":"pith_short_12","alias_value":"SETOAZEYAMXL","created_at":"2026-05-18T12:32:53.628368+00:00"},{"alias_kind":"pith_short_16","alias_value":"SETOAZEYAMXLS2ZI","created_at":"2026-05-18T12:32:53.628368+00:00"},{"alias_kind":"pith_short_8","alias_value":"SETOAZEY","created_at":"2026-05-18T12:32:53.628368+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":34,"internal_anchor_count":14,"sample":[{"citing_arxiv_id":"2605.23061","citing_title":"Anytime Training with Schedule-Free Spectral Optimization","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2410.14375","citing_title":"Causal Fine-Tuning under Latent Confounded Shift","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2504.05902","citing_title":"Defending against Backdoor Attacks via Module Switching","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18878","citing_title":"Prognostic Value of Lung Ultrasound Biomarkers for Readmission Risk in Congestive Heart Failure: A Pilot Data-Driven Analysis","ref_index":144,"is_internal_anchor":true},{"citing_arxiv_id":"2502.05564","citing_title":"TabICL: A Tabular Foundation Model for In-Context Learning on Large Data","ref_index":245,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17575","citing_title":"UniAlign: A Model-Agnostic Framework for Robust Network Traffic Classification under Distribution Shifts","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15459","citing_title":"Don't Stop Me Yet: Sampling Loss Minima via Dissipative Riemannian Mechanics","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2408.07666","citing_title":"Model Merging in LLMs, MLLMs, and Beyond: Methods, Theories, Applications and Opportunities","ref_index":89,"is_internal_anchor":true},{"citing_arxiv_id":"2512.10275","citing_title":"Sample-wise Adaptive Weighting for Transfer Consistency in Adversarial Distillation","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2010.01412","citing_title":"Sharpness-Aware Minimization for Efficiently Improving Generalization","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2509.20354","citing_title":"EmbeddingGemma: Powerful and Lightweight Text Representations","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2405.07987","citing_title":"The Platonic Representation Hypothesis","ref_index":101,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14521","citing_title":"Enjoy Your Layer Normalization with the Computational Efficiency of RMSNorm","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2603.26499","citing_title":"AIRA_2: Overcoming Bottlenecks in AI Research Agents","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2604.02719","citing_title":"MOMO: Mars Orbital Model Foundation Model for Mars Orbital Applications","ref_index":32,"is_internal_anchor":false},{"citing_arxiv_id":"2309.16588","citing_title":"Vision Transformers Need Registers","ref_index":166,"is_internal_anchor":false},{"citing_arxiv_id":"2605.11394","citing_title":"Spatial Adapter: Structured Spatial Decomposition and Closed-Form Covariance for Frozen Predictors","ref_index":21,"is_internal_anchor":false},{"citing_arxiv_id":"2605.08870","citing_title":"TopoGeoScore: A Self-Supervised Source-Only Geometric Framework for OOD Checkpoint Selection","ref_index":12,"is_internal_anchor":false},{"citing_arxiv_id":"2605.08311","citing_title":"Revitalizing the Beginning: Avoiding Storage Dependency for Model Merging in Continual Learning","ref_index":30,"is_internal_anchor":false},{"citing_arxiv_id":"2605.10391","citing_title":"Phoenix-VL 1.5 Medium Technical Report","ref_index":11,"is_internal_anchor":false},{"citing_arxiv_id":"2605.04326","citing_title":"A foundation model of vision, audition, and language for in-silico neuroscience","ref_index":65,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05136","citing_title":"CPCANet: Deep Unfolding Common Principal Component Analysis for Domain Generalization","ref_index":44,"is_internal_anchor":false},{"citing_arxiv_id":"2605.01632","citing_title":"Perturb and Correct: Post-Hoc Ensembles using Affine Redundancy","ref_index":24,"is_internal_anchor":false},{"citing_arxiv_id":"2605.02292","citing_title":"Momentum-Anchored Multi-Scale Fusion Model for Long-Tailed Chest X-Ray Classification","ref_index":14,"is_internal_anchor":false},{"citing_arxiv_id":"2604.20985","citing_title":"Differentially Private Model Merging","ref_index":33,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4","json":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4.json","graph_json":"https://pith.science/api/pith-number/SETOAZEYAMXLS2ZIS56Z4PQUM4/graph.json","events_json":"https://pith.science/api/pith-number/SETOAZEYAMXLS2ZIS56Z4PQUM4/events.json","paper":"https://pith.science/paper/SETOAZEY"},"agent_actions":{"view_html":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4","download_json":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4.json","view_paper":"https://pith.science/paper/SETOAZEY","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1803.05407&json=true","fetch_graph":"https://pith.science/api/pith-number/SETOAZEYAMXLS2ZIS56Z4PQUM4/graph.json","fetch_events":"https://pith.science/api/pith-number/SETOAZEYAMXLS2ZIS56Z4PQUM4/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4/action/timestamp_anchor","attest_storage":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4/action/storage_attestation","attest_author":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4/action/author_attestation","sign_citation":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4/action/citation_signature","submit_replication":"https://pith.science/pith/SETOAZEYAMXLS2ZIS56Z4PQUM4/action/replication_record"}},"created_at":"2026-05-17T23:52:53.271426+00:00","updated_at":"2026-05-17T23:52:53.271426+00:00"}