{"work":{"id":"4c83c7ac-e217-4d73-b433-14f6b522da36","openalex_id":null,"doi":null,"arxiv_id":"1812.04754","raw_key":null,"title":"Gradient Descent Happens in a Tiny Subspace","authors":null,"authors_text":"URLhttps://arxiv","year":2018,"venue":"cs.LG","abstract":"We show that in a variety of large-scale deep learning scenarios the gradient dynamically converges to a very small subspace after a short period of training. The subspace is spanned by a few top eigenvectors of the Hessian (equal to the number of classes in the dataset), and is mostly preserved over long periods of training. A simple argument then suggests that gradient descent may happen mostly in this subspace. We give an example of this effect in a solvable model of classification, and we comment on possible implications for optimization and learning.","external_url":"https://arxiv.org/abs/1812.04754","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T07:35:28.884845+00:00","pith_arxiv_id":"1812.04754","created_at":"2026-05-09T04:51:47.673046+00:00","updated_at":"2026-05-25T07:35:28.884845+00:00","title_quality_ok":true,"display_title":"Gradient descent happens in a tiny subspace.arXiv preprint arXiv:1812.04754","render_title":"Gradient descent happens in a tiny subspace.arXiv preprint arXiv:1812.04754"},"hub":{"state":{"work_id":"4c83c7ac-e217-4d73-b433-14f6b522da36","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":20,"external_cited_by_count":null,"distinct_field_count":5,"first_pith_cited_at":"2019-07-24T21:27:13+00:00","last_pith_cited_at":"2026-05-21T12:55:52+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-25T23:36:08.690123+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":2},{"context_role":"method","n":1}],"polarity_counts":[{"context_polarity":"background","n":2},{"context_polarity":"use_method","n":1}],"runs":{},"summary":{},"graph":{},"authors":[]}}