{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:UTTGU3KIYM6JKD6ALUV7A7CX74","short_pith_number":"pith:UTTGU3KI","schema_version":"1.0","canonical_sha256":"a4e66a6d48c33c950fc05d2bf07c57ff2c5d85adbe6d2cb53b1f4bf2120e0ec2","source":{"kind":"arxiv","id":"1704.04289","version":2},"attestation_state":"computed","paper":{"title":"Stochastic Gradient Descent as Approximate Bayesian Inference","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"stat.ML","authors_text":"David M. Blei, Matthew D. Hoffman, Stephan Mandt","submitted_at":"2017-04-13T22:17:30Z","abstract_excerpt":"Stochastic Gradient Descent with a constant learning rate (constant SGD) simulates a Markov chain with a stationary distribution. With this perspective, we derive several new results. (1) We show that constant SGD can be used as an approximate Bayesian posterior inference algorithm. Specifically, we show how to adjust the tuning parameters of constant SGD to best match the stationary distribution to a posterior, minimizing the Kullback-Leibler divergence between these two distributions. (2) We demonstrate that constant SGD gives rise to a new variational EM algorithm that optimizes hyperparame"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1704.04289","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"stat.ML","submitted_at":"2017-04-13T22:17:30Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"a24370d325eff13bd544c32f17e24a8c23e3fb11d2f5d1fa8b85f960716a713a","abstract_canon_sha256":"0e98092077ce3bc631cccf09b9e72b769cf7d617e9424f99c254c2ab1c246c5b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:25:29.106710Z","signature_b64":"SFtSeQGyZA+oyeYvdYAB/qyn6JRaoa/Svpzxpgt8fWphlFIwRXHutCEqA0GckdKexQ2sW3pMWxqySt4EqT1vAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a4e66a6d48c33c950fc05d2bf07c57ff2c5d85adbe6d2cb53b1f4bf2120e0ec2","last_reissued_at":"2026-05-18T00:25:29.106134Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:25:29.106134Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Stochastic Gradient Descent as Approximate Bayesian Inference","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG"],"primary_cat":"stat.ML","authors_text":"David M. Blei, Matthew D. Hoffman, Stephan Mandt","submitted_at":"2017-04-13T22:17:30Z","abstract_excerpt":"Stochastic Gradient Descent with a constant learning rate (constant SGD) simulates a Markov chain with a stationary distribution. With this perspective, we derive several new results. (1) We show that constant SGD can be used as an approximate Bayesian posterior inference algorithm. Specifically, we show how to adjust the tuning parameters of constant SGD to best match the stationary distribution to a posterior, minimizing the Kullback-Leibler divergence between these two distributions. (2) We demonstrate that constant SGD gives rise to a new variational EM algorithm that optimizes hyperparame"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1704.04289","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1704.04289","created_at":"2026-05-18T00:25:29.106229+00:00"},{"alias_kind":"arxiv_version","alias_value":"1704.04289v2","created_at":"2026-05-18T00:25:29.106229+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1704.04289","created_at":"2026-05-18T00:25:29.106229+00:00"},{"alias_kind":"pith_short_12","alias_value":"UTTGU3KIYM6J","created_at":"2026-05-18T12:31:49.984773+00:00"},{"alias_kind":"pith_short_16","alias_value":"UTTGU3KIYM6JKD6A","created_at":"2026-05-18T12:31:49.984773+00:00"},{"alias_kind":"pith_short_8","alias_value":"UTTGU3KI","created_at":"2026-05-18T12:31:49.984773+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":5,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2510.21588","citing_title":"Contribution of task-irrelevant stimuli to drift of neural representations","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2511.02258","citing_title":"Limit Theorems for Stochastic Gradient Descent in High-Dimensional Single-Layer Networks","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":150,"is_internal_anchor":true},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":192,"is_internal_anchor":false},{"citing_arxiv_id":"2207.05221","citing_title":"Language Models (Mostly) Know What They Know","ref_index":270,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74","json":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74.json","graph_json":"https://pith.science/api/pith-number/UTTGU3KIYM6JKD6ALUV7A7CX74/graph.json","events_json":"https://pith.science/api/pith-number/UTTGU3KIYM6JKD6ALUV7A7CX74/events.json","paper":"https://pith.science/paper/UTTGU3KI"},"agent_actions":{"view_html":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74","download_json":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74.json","view_paper":"https://pith.science/paper/UTTGU3KI","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1704.04289&json=true","fetch_graph":"https://pith.science/api/pith-number/UTTGU3KIYM6JKD6ALUV7A7CX74/graph.json","fetch_events":"https://pith.science/api/pith-number/UTTGU3KIYM6JKD6ALUV7A7CX74/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74/action/timestamp_anchor","attest_storage":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74/action/storage_attestation","attest_author":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74/action/author_attestation","sign_citation":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74/action/citation_signature","submit_replication":"https://pith.science/pith/UTTGU3KIYM6JKD6ALUV7A7CX74/action/replication_record"}},"created_at":"2026-05-18T00:25:29.106229+00:00","updated_at":"2026-05-18T00:25:29.106229+00:00"}