{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:MQTFJ7CKS7QYHTNSIU3H22MTA6","short_pith_number":"pith:MQTFJ7CK","schema_version":"1.0","canonical_sha256":"642654fc4a97e183cdb245367d699307a1dced49f05682222ed6e3c237ce4254","source":{"kind":"arxiv","id":"1811.03804","version":4},"attestation_state":"computed","paper":{"title":"Gradient Descent Finds Global Minima of Deep Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CV","math.OC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Haochuan Li, Jason D. Lee, Liwei Wang, Simon S. Du, Xiyu Zhai","submitted_at":"2018-11-09T07:39:59Z","abstract_excerpt":"Gradient descent finds a global minimum in training deep neural networks despite the objective function being non-convex. The current paper proves gradient descent achieves zero training loss in polynomial time for a deep over-parameterized neural network with residual connections (ResNet). Our analysis relies on the particular structure of the Gram matrix induced by the neural network architecture. This structure allows us to show the Gram matrix is stable throughout the training process and this stability implies the global optimality of the gradient descent algorithm. We further extend our "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1811.03804","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2018-11-09T07:39:59Z","cross_cats_sorted":["cs.AI","cs.CV","math.OC","stat.ML"],"title_canon_sha256":"e83fb9bc7cef7c02f92de7238aba2bd3985e07f70fd6191eae758f4efa58234c","abstract_canon_sha256":"e90b7d4c973220abb227d88a48559d09cdc9dfeaa72b2ddd76bc1a9e119f6197"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:44:49.521187Z","signature_b64":"bjxfQElP57kb6TQY1WetC8eMLHyi3yI2Ye0yTF4xURAZaEa+K4thiwOpBAzckWmwg0xMuAur5ElOQQ4RePbbAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"642654fc4a97e183cdb245367d699307a1dced49f05682222ed6e3c237ce4254","last_reissued_at":"2026-05-17T23:44:49.520621Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:44:49.520621Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Gradient Descent Finds Global Minima of Deep Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI","cs.CV","math.OC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Haochuan Li, Jason D. Lee, Liwei Wang, Simon S. Du, Xiyu Zhai","submitted_at":"2018-11-09T07:39:59Z","abstract_excerpt":"Gradient descent finds a global minimum in training deep neural networks despite the objective function being non-convex. The current paper proves gradient descent achieves zero training loss in polynomial time for a deep over-parameterized neural network with residual connections (ResNet). Our analysis relies on the particular structure of the Gram matrix induced by the neural network architecture. This structure allows us to show the Gram matrix is stable throughout the training process and this stability implies the global optimality of the gradient descent algorithm. We further extend our "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.03804","kind":"arxiv","version":4},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1811.03804","created_at":"2026-05-17T23:44:49.520701+00:00"},{"alias_kind":"arxiv_version","alias_value":"1811.03804v4","created_at":"2026-05-17T23:44:49.520701+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.03804","created_at":"2026-05-17T23:44:49.520701+00:00"},{"alias_kind":"pith_short_12","alias_value":"MQTFJ7CKS7QY","created_at":"2026-05-18T12:32:40.477152+00:00"},{"alias_kind":"pith_short_16","alias_value":"MQTFJ7CKS7QYHTNS","created_at":"2026-05-18T12:32:40.477152+00:00"},{"alias_kind":"pith_short_8","alias_value":"MQTFJ7CK","created_at":"2026-05-18T12:32:40.477152+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"1906.08899","citing_title":"Limitations of Lazy Training of Two-layers Neural Networks","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"1907.04524","citing_title":"Two-block vs. Multi-block ADMM: An empirical evaluation of convergence","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2102.11840","citing_title":"Convergence rates for gradient descent in the training of overparameterized artificial neural networks with piecewise affine activation","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2601.18399","citing_title":"Estimating Dense-Packed Zone Height in Liquid-Liquid Separation: A Physics-Informed Neural Network Approach","ref_index":10,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6","json":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6.json","graph_json":"https://pith.science/api/pith-number/MQTFJ7CKS7QYHTNSIU3H22MTA6/graph.json","events_json":"https://pith.science/api/pith-number/MQTFJ7CKS7QYHTNSIU3H22MTA6/events.json","paper":"https://pith.science/paper/MQTFJ7CK"},"agent_actions":{"view_html":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6","download_json":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6.json","view_paper":"https://pith.science/paper/MQTFJ7CK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1811.03804&json=true","fetch_graph":"https://pith.science/api/pith-number/MQTFJ7CKS7QYHTNSIU3H22MTA6/graph.json","fetch_events":"https://pith.science/api/pith-number/MQTFJ7CKS7QYHTNSIU3H22MTA6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6/action/storage_attestation","attest_author":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6/action/author_attestation","sign_citation":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6/action/citation_signature","submit_replication":"https://pith.science/pith/MQTFJ7CKS7QYHTNSIU3H22MTA6/action/replication_record"}},"created_at":"2026-05-17T23:44:49.520701+00:00","updated_at":"2026-05-17T23:44:49.520701+00:00"}