{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:MJ2WA57DQNU52CGKLQGUREQPWC","short_pith_number":"pith:MJ2WA57D","schema_version":"1.0","canonical_sha256":"62756077e38369dd08ca5c0d48920fb08e08ea73a30ffc793368bcb2808112f5","source":{"kind":"arxiv","id":"1811.08888","version":3},"attestation_state":"computed","paper":{"title":"Stochastic Gradient Descent Optimizes Over-parameterized Deep ReLU Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","math.OC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Difan Zou, Dongruo Zhou, Quanquan Gu, Yuan Cao","submitted_at":"2018-11-21T18:58:46Z","abstract_excerpt":"We study the problem of training deep neural networks with Rectified Linear Unit (ReLU) activation function using gradient descent and stochastic gradient descent. In particular, we study the binary classification problem and show that for a broad family of loss functions, with proper random weight initialization, both gradient descent and stochastic gradient descent can find the global minima of the training loss for an over-parameterized deep ReLU network, under mild assumption on the training data. The key idea of our proof is that Gaussian random initialization followed by (stochastic) gra"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1811.08888","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-11-21T18:58:46Z","cross_cats_sorted":["cs.AI","math.OC","stat.ML"],"title_canon_sha256":"ba3d66e38335314c46352c85f975f0a56f2edf99fa82881beb3d59975108a5c7","abstract_canon_sha256":"9b3ac1e631028645c4f2f77f7b1033aa2429571b6f257d3d903a7b2e6a28e50f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:57:21.181773Z","signature_b64":"9rGZulk9NOyfgjf3SYJdn2Glu6yZLbaYVcifoKVKzM2LNgYhW+l8vUUq5X5mxJaCjk+c37LB8ZXGIaauX8GcAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"62756077e38369dd08ca5c0d48920fb08e08ea73a30ffc793368bcb2808112f5","last_reissued_at":"2026-05-17T23:57:21.181083Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:57:21.181083Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Stochastic Gradient Descent Optimizes Over-parameterized Deep ReLU Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","math.OC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Difan Zou, Dongruo Zhou, Quanquan Gu, Yuan Cao","submitted_at":"2018-11-21T18:58:46Z","abstract_excerpt":"We study the problem of training deep neural networks with Rectified Linear Unit (ReLU) activation function using gradient descent and stochastic gradient descent. In particular, we study the binary classification problem and show that for a broad family of loss functions, with proper random weight initialization, both gradient descent and stochastic gradient descent can find the global minima of the training loss for an over-parameterized deep ReLU network, under mild assumption on the training data. The key idea of our proof is that Gaussian random initialization followed by (stochastic) gra"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.08888","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1811.08888","created_at":"2026-05-17T23:57:21.181175+00:00"},{"alias_kind":"arxiv_version","alias_value":"1811.08888v3","created_at":"2026-05-17T23:57:21.181175+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.08888","created_at":"2026-05-17T23:57:21.181175+00:00"},{"alias_kind":"pith_short_12","alias_value":"MJ2WA57DQNU5","created_at":"2026-05-18T12:32:37.024351+00:00"},{"alias_kind":"pith_short_16","alias_value":"MJ2WA57DQNU52CGK","created_at":"2026-05-18T12:32:37.024351+00:00"},{"alias_kind":"pith_short_8","alias_value":"MJ2WA57D","created_at":"2026-05-18T12:32:37.024351+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":7,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"1906.08899","citing_title":"Limitations of Lazy Training of Two-layers Neural Networks","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"1907.00560","citing_title":"On Symmetry and Initialization for Neural Networks","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"1907.04524","citing_title":"Two-block vs. Multi-block ADMM: An empirical evaluation of convergence","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"1907.10732","citing_title":"Hessian based analysis of SGD for Deep Nets: Dynamics and Generalization","ref_index":82,"is_internal_anchor":true},{"citing_arxiv_id":"2102.11840","citing_title":"Convergence rates for gradient descent in the training of overparameterized artificial neural networks with piecewise affine activation","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2401.01335","citing_title":"Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models","ref_index":232,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10208","citing_title":"Mild Over-Parameterization Benefits Asymmetric Tensor PCA","ref_index":6,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC","json":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC.json","graph_json":"https://pith.science/api/pith-number/MJ2WA57DQNU52CGKLQGUREQPWC/graph.json","events_json":"https://pith.science/api/pith-number/MJ2WA57DQNU52CGKLQGUREQPWC/events.json","paper":"https://pith.science/paper/MJ2WA57D"},"agent_actions":{"view_html":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC","download_json":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC.json","view_paper":"https://pith.science/paper/MJ2WA57D","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1811.08888&json=true","fetch_graph":"https://pith.science/api/pith-number/MJ2WA57DQNU52CGKLQGUREQPWC/graph.json","fetch_events":"https://pith.science/api/pith-number/MJ2WA57DQNU52CGKLQGUREQPWC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC/action/storage_attestation","attest_author":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC/action/author_attestation","sign_citation":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC/action/citation_signature","submit_replication":"https://pith.science/pith/MJ2WA57DQNU52CGKLQGUREQPWC/action/replication_record"}},"created_at":"2026-05-17T23:57:21.181175+00:00","updated_at":"2026-05-17T23:57:21.181175+00:00"}