{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:FM6VC6MWNLIZ4SB6S7UWZ4UXMB","short_pith_number":"pith:FM6VC6MW","schema_version":"1.0","canonical_sha256":"2b3d5179966ad19e483e97e96cf2976070cdffd50968bcb09f72a107501bed11","source":{"kind":"arxiv","id":"1710.06451","version":3},"attestation_state":"computed","paper":{"title":"A Bayesian Perspective on Generalization and Stochastic Gradient Descent","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Quoc V. Le, Samuel L. Smith","submitted_at":"2017-10-17T18:08:04Z","abstract_excerpt":"We consider two questions at the heart of machine learning; how can we predict if a minimum will generalize to the test set, and why does stochastic gradient descent find minima that generalize well? Our work responds to Zhang et al. (2016), who showed deep neural networks can easily memorize randomly labeled training data, despite generalizing well on real labels of the same inputs. We show that the same phenomenon occurs in small linear models. These observations are explained by the Bayesian evidence, which penalizes sharp minima but is invariant to model parameterization. We also demonstra"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1710.06451","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2017-10-17T18:08:04Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"c990ac36a8c87148cbd417831fb12fcd2f9334e00e004dfacac6854fbf5dadab","abstract_canon_sha256":"373bc0967e4392b4d72fccd02acc585a9d4d9662cfefa91c37193f292aac91e7"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:23:16.570272Z","signature_b64":"JAA5W57p1WQqb5qdFv76P5AbOk3SE26Fkzne1xKEiznydYDxIKRKMJuPSf95gvbY78BKHhD+Rdus8fTQovKNCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"2b3d5179966ad19e483e97e96cf2976070cdffd50968bcb09f72a107501bed11","last_reissued_at":"2026-05-18T00:23:16.569517Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:23:16.569517Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"A Bayesian Perspective on Generalization and Stochastic Gradient Descent","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Quoc V. Le, Samuel L. Smith","submitted_at":"2017-10-17T18:08:04Z","abstract_excerpt":"We consider two questions at the heart of machine learning; how can we predict if a minimum will generalize to the test set, and why does stochastic gradient descent find minima that generalize well? Our work responds to Zhang et al. (2016), who showed deep neural networks can easily memorize randomly labeled training data, despite generalizing well on real labels of the same inputs. We show that the same phenomenon occurs in small linear models. These observations are explained by the Bayesian evidence, which penalizes sharp minima but is invariant to model parameterization. We also demonstra"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1710.06451","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1710.06451","created_at":"2026-05-18T00:23:16.569633+00:00"},{"alias_kind":"arxiv_version","alias_value":"1710.06451v3","created_at":"2026-05-18T00:23:16.569633+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1710.06451","created_at":"2026-05-18T00:23:16.569633+00:00"},{"alias_kind":"pith_short_12","alias_value":"FM6VC6MWNLIZ","created_at":"2026-05-18T12:31:15.632608+00:00"},{"alias_kind":"pith_short_16","alias_value":"FM6VC6MWNLIZ4SB6","created_at":"2026-05-18T12:31:15.632608+00:00"},{"alias_kind":"pith_short_8","alias_value":"FM6VC6MW","created_at":"2026-05-18T12:31:15.632608+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":9,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"2605.23778","citing_title":"The physics of AI weather models","ref_index":45,"is_internal_anchor":true},{"citing_arxiv_id":"2212.08989","citing_title":"Deep learning applied to computational mechanics: A comprehensive review, state of the art, and the classics","ref_index":188,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22644","citing_title":"Why SGD is not Brownian Motion: A New Perspective on Stochastic Dynamics","ref_index":234,"is_internal_anchor":true},{"citing_arxiv_id":"2102.01293","citing_title":"Scaling Laws for Transfer","ref_index":154,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11491","citing_title":"Understanding and Preventing Entropy Collapse in RLVR with On-Policy Entropy Flow Optimization","ref_index":47,"is_internal_anchor":false},{"citing_arxiv_id":"1712.00409","citing_title":"Deep Learning Scaling is Predictable, Empirically","ref_index":10,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05115","citing_title":"Manifold Steering Reveals the Shared Geometry of Neural Network Representation and Behavior","ref_index":246,"is_internal_anchor":false},{"citing_arxiv_id":"2112.00861","citing_title":"A General Language Assistant as a Laboratory for Alignment","ref_index":196,"is_internal_anchor":false},{"citing_arxiv_id":"2207.05221","citing_title":"Language Models (Mostly) Know What They Know","ref_index":274,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB","json":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB.json","graph_json":"https://pith.science/api/pith-number/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/graph.json","events_json":"https://pith.science/api/pith-number/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/events.json","paper":"https://pith.science/paper/FM6VC6MW"},"agent_actions":{"view_html":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB","download_json":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB.json","view_paper":"https://pith.science/paper/FM6VC6MW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1710.06451&json=true","fetch_graph":"https://pith.science/api/pith-number/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/graph.json","fetch_events":"https://pith.science/api/pith-number/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/action/timestamp_anchor","attest_storage":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/action/storage_attestation","attest_author":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/action/author_attestation","sign_citation":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/action/citation_signature","submit_replication":"https://pith.science/pith/FM6VC6MWNLIZ4SB6S7UWZ4UXMB/action/replication_record"}},"created_at":"2026-05-18T00:23:16.569633+00:00","updated_at":"2026-05-18T00:23:16.569633+00:00"}