{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:R3KV3QAS4QJKEGKVXV3S4MEJM2","short_pith_number":"pith:R3KV3QAS","schema_version":"1.0","canonical_sha256":"8ed55dc012e412a21955bd772e308966beab5975f43dca78f62010eaae1032dc","source":{"kind":"arxiv","id":"1712.07628","version":1},"attestation_state":"computed","paper":{"title":"Improving Generalization Performance by Switching from Adam to SGD","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Nitish Shirish Keskar, Richard Socher","submitted_at":"2017-12-20T18:34:08Z","abstract_excerpt":"Despite superior training outcomes, adaptive optimization methods such as Adam, Adagrad or RMSprop have been found to generalize poorly compared to Stochastic gradient descent (SGD). These methods tend to perform well in the initial portion of training but are outperformed by SGD at later stages of training. We investigate a hybrid strategy that begins training with an adaptive method and switches to SGD when appropriate. Concretely, we propose SWATS, a simple strategy which switches from Adam to SGD when a triggering condition is satisfied. The condition we propose relates to the projection o"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1712.07628","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2017-12-20T18:34:08Z","cross_cats_sorted":["math.OC"],"title_canon_sha256":"8289420afd7c9f331074fbeae53c2322559be3bf5d7049c386a51b45c9ddefd0","abstract_canon_sha256":"2b08f046e7acd5bbae3c92f8d779ab1415dd3a25d16d09a78134bc7fd3522732"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:27:34.007092Z","signature_b64":"1zreiMU58iX5O/iOZavO55M0TIgl3/kKumiyHxT5nVppN1IN/TIQtY4NJBmSodaU4OPPydr/xHP+jboD/IVaCA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8ed55dc012e412a21955bd772e308966beab5975f43dca78f62010eaae1032dc","last_reissued_at":"2026-05-18T00:27:34.006363Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:27:34.006363Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Improving Generalization Performance by Switching from Adam to SGD","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Nitish Shirish Keskar, Richard Socher","submitted_at":"2017-12-20T18:34:08Z","abstract_excerpt":"Despite superior training outcomes, adaptive optimization methods such as Adam, Adagrad or RMSprop have been found to generalize poorly compared to Stochastic gradient descent (SGD). These methods tend to perform well in the initial portion of training but are outperformed by SGD at later stages of training. We investigate a hybrid strategy that begins training with an adaptive method and switches to SGD when appropriate. Concretely, we propose SWATS, a simple strategy which switches from Adam to SGD when a triggering condition is satisfied. The condition we propose relates to the projection o"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1712.07628","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1712.07628","created_at":"2026-05-18T00:27:34.006486+00:00"},{"alias_kind":"arxiv_version","alias_value":"1712.07628v1","created_at":"2026-05-18T00:27:34.006486+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1712.07628","created_at":"2026-05-18T00:27:34.006486+00:00"},{"alias_kind":"pith_short_12","alias_value":"R3KV3QAS4QJK","created_at":"2026-05-18T12:31:39.905425+00:00"},{"alias_kind":"pith_short_16","alias_value":"R3KV3QAS4QJKEGKV","created_at":"2026-05-18T12:31:39.905425+00:00"},{"alias_kind":"pith_short_8","alias_value":"R3KV3QAS","created_at":"2026-05-18T12:31:39.905425+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":7,"internal_anchor_count":5,"sample":[{"citing_arxiv_id":"2605.20533","citing_title":"Ada2MS: A Hybrid Optimization Algorithm Based on Exponential Mixing of Elementwise and Global Second-Moment Estimates","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17787","citing_title":"Revisiting the Adam-SGD Gap in LLM Pre-Training: The Role of Large Effective Learning Rates","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19811","citing_title":"LionMuon: Alternating Spectral and Sign Descent for Efficient Training","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15416","citing_title":"Margin-Adaptive Confidence Ranking for Reliable LLM Judgement","ref_index":179,"is_internal_anchor":true},{"citing_arxiv_id":"2603.10079","citing_title":"Large Spikes in Stochastic Gradient Descent: A Large-Deviations View","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2604.25550","citing_title":"Enhancing SignSGD: Small-Batch Convergence Analysis and a Hybrid Switching Strategy","ref_index":8,"is_internal_anchor":false},{"citing_arxiv_id":"2604.22838","citing_title":"Neural Network Optimization Reimagined: Decoupled Techniques for Scratch and Fine-Tuning","ref_index":21,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2","json":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2.json","graph_json":"https://pith.science/api/pith-number/R3KV3QAS4QJKEGKVXV3S4MEJM2/graph.json","events_json":"https://pith.science/api/pith-number/R3KV3QAS4QJKEGKVXV3S4MEJM2/events.json","paper":"https://pith.science/paper/R3KV3QAS"},"agent_actions":{"view_html":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2","download_json":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2.json","view_paper":"https://pith.science/paper/R3KV3QAS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1712.07628&json=true","fetch_graph":"https://pith.science/api/pith-number/R3KV3QAS4QJKEGKVXV3S4MEJM2/graph.json","fetch_events":"https://pith.science/api/pith-number/R3KV3QAS4QJKEGKVXV3S4MEJM2/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2/action/timestamp_anchor","attest_storage":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2/action/storage_attestation","attest_author":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2/action/author_attestation","sign_citation":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2/action/citation_signature","submit_replication":"https://pith.science/pith/R3KV3QAS4QJKEGKVXV3S4MEJM2/action/replication_record"}},"created_at":"2026-05-18T00:27:34.006486+00:00","updated_at":"2026-05-18T00:27:34.006486+00:00"}