{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2013:4LYXWUIQT54XZQ5VC2XUC77MLC","short_pith_number":"pith:4LYXWUIQ","schema_version":"1.0","canonical_sha256":"e2f17b51109f797cc3b516af417fec58bca3da720bec2e31904de48c7586a89c","source":{"kind":"arxiv","id":"1312.6120","version":3},"attestation_state":"computed","paper":{"title":"Exact solutions to the nonlinear dynamics of learning in deep linear neural networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cond-mat.dis-nn","cs.CV","cs.LG","q-bio.NC","stat.ML"],"primary_cat":"cs.NE","authors_text":"Andrew M. Saxe, James L. McClelland, Surya Ganguli","submitted_at":"2013-12-20T20:24:00Z","abstract_excerpt":"Despite the widespread practical success of deep learning methods, our theoretical understanding of the dynamics of learning in deep neural networks remains quite sparse. We attempt to bridge the gap between the theory and practice of deep learning by systematically analyzing learning dynamics for the restricted case of deep linear neural networks. Despite the linearity of their input-output map, such networks have nonlinear gradient descent dynamics on weights that change with the addition of each new hidden layer. We show that deep linear networks exhibit nonlinear learning phenomena similar"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1312.6120","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.NE","submitted_at":"2013-12-20T20:24:00Z","cross_cats_sorted":["cond-mat.dis-nn","cs.CV","cs.LG","q-bio.NC","stat.ML"],"title_canon_sha256":"dada7c67511e83bc201c79bd43baa0b2a4715e0f172b113d56cd793a54041f97","abstract_canon_sha256":"420755b6a13adf908b01f74f367405d27a9467e81378b61293338b9a5e413f38"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:58:34.283997Z","signature_b64":"1fvLFTdt09ZvKjDN2m60W+OFyKorpLolGLB7SJIv8NbYyvmSpfnVyfkXt6q8QzjpqiQKQXiltJcopoLdQeo1DQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e2f17b51109f797cc3b516af417fec58bca3da720bec2e31904de48c7586a89c","last_reissued_at":"2026-05-18T02:58:34.282675Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:58:34.282675Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Exact solutions to the nonlinear dynamics of learning in deep linear neural networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cond-mat.dis-nn","cs.CV","cs.LG","q-bio.NC","stat.ML"],"primary_cat":"cs.NE","authors_text":"Andrew M. Saxe, James L. McClelland, Surya Ganguli","submitted_at":"2013-12-20T20:24:00Z","abstract_excerpt":"Despite the widespread practical success of deep learning methods, our theoretical understanding of the dynamics of learning in deep neural networks remains quite sparse. We attempt to bridge the gap between the theory and practice of deep learning by systematically analyzing learning dynamics for the restricted case of deep linear neural networks. Despite the linearity of their input-output map, such networks have nonlinear gradient descent dynamics on weights that change with the addition of each new hidden layer. We show that deep linear networks exhibit nonlinear learning phenomena similar"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1312.6120","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1312.6120","created_at":"2026-05-18T02:58:34.283241+00:00"},{"alias_kind":"arxiv_version","alias_value":"1312.6120v3","created_at":"2026-05-18T02:58:34.283241+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1312.6120","created_at":"2026-05-18T02:58:34.283241+00:00"},{"alias_kind":"pith_short_12","alias_value":"4LYXWUIQT54X","created_at":"2026-05-18T12:27:34.582898+00:00"},{"alias_kind":"pith_short_16","alias_value":"4LYXWUIQT54XZQ5V","created_at":"2026-05-18T12:27:34.582898+00:00"},{"alias_kind":"pith_short_8","alias_value":"4LYXWUIQ","created_at":"2026-05-18T12:27:34.582898+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":37,"internal_anchor_count":19,"sample":[{"citing_arxiv_id":"2605.23033","citing_title":"Uncovering the Latent Potential of Deep Intermediate Representations","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"1907.02911","citing_title":"Weight-space symmetry in deep networks gives rise to permutation saddles, connected by equal-loss valleys across the loss landscape","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2405.13901","citing_title":"Discrete Cosine Transform Based Decorrelated Attention for Vision Transformers","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2503.03206","citing_title":"An Analytical Theory of Spectral Bias in the Learning Dynamics of Diffusion Models","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07870","citing_title":"Spectral Dynamics in Deep Networks: Feature Learning, Outlier Escape, and Learning Rate Transfer","ref_index":66,"is_internal_anchor":true},{"citing_arxiv_id":"2502.07529","citing_title":"Training Deep Learning Models with Norm-Constrained LMOs","ref_index":205,"is_internal_anchor":true},{"citing_arxiv_id":"2510.26745","citing_title":"Deep sequence models tend to memorize geometrically; it is unclear why","ref_index":158,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21174","citing_title":"Exact expression for maximum Lyapunov exponent during transients in computationally powerful dynamical networks","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15416","citing_title":"Margin-Adaptive Confidence Ranking for Reliable LLM Judgement","ref_index":226,"is_internal_anchor":true},{"citing_arxiv_id":"2506.17530","citing_title":"Deep-OFDM: Neural Modulation for High Mobility","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2508.12121","citing_title":"Time-Scale Coupling Between States and Parameters in Recurrent Neural Networks","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2509.06653","citing_title":"Classical Neural Networks on Quantum Devices via Tensor Network Disentanglers: A Case Study in Image Classification","ref_index":38,"is_internal_anchor":true},{"citing_arxiv_id":"2510.21588","citing_title":"Contribution of task-irrelevant stimuli to drift of neural representations","ref_index":36,"is_internal_anchor":true},{"citing_arxiv_id":"2512.23405","citing_title":"On the Sample Complexity of Learning for Blind Inverse Problems","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2603.02622","citing_title":"Implicit Bias in Deep Linear Discriminant Analysis","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2603.22347","citing_title":"Intelligence Inertia: Physical Isomorphism and Applications","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2401.01335","citing_title":"Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12580","citing_title":"CAWI: Copula-Aligned Weight Initialization for Randomized Neural Networks","ref_index":42,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12763","citing_title":"State-Space NTK Collapse Near Bifurcations","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"1502.03167","citing_title":"Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift","ref_index":17,"is_internal_anchor":false},{"citing_arxiv_id":"2604.27031","citing_title":"NORACL: Neurogenesis for Oracle-free Resource-Adaptive Continual Learning","ref_index":20,"is_internal_anchor":false},{"citing_arxiv_id":"2605.08171","citing_title":"Communication Dynamics Neural Networks: FFT-Diagonalized Layers for Improved Hessian Conditioning at Reduced Parameter Count","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09243","citing_title":"How Much is Brain Data Worth for Machine Learning?","ref_index":31,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09355","citing_title":"FLAME: Adaptive Mixture-of-Experts for Continual Multimodal Multi-Task Learning","ref_index":49,"is_internal_anchor":false},{"citing_arxiv_id":"2605.08746","citing_title":"The Global Empirical NTK: Self-Referential Bias and Dimensionality of Gradient Descent Learning","ref_index":67,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC","json":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC.json","graph_json":"https://pith.science/api/pith-number/4LYXWUIQT54XZQ5VC2XUC77MLC/graph.json","events_json":"https://pith.science/api/pith-number/4LYXWUIQT54XZQ5VC2XUC77MLC/events.json","paper":"https://pith.science/paper/4LYXWUIQ"},"agent_actions":{"view_html":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC","download_json":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC.json","view_paper":"https://pith.science/paper/4LYXWUIQ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1312.6120&json=true","fetch_graph":"https://pith.science/api/pith-number/4LYXWUIQT54XZQ5VC2XUC77MLC/graph.json","fetch_events":"https://pith.science/api/pith-number/4LYXWUIQT54XZQ5VC2XUC77MLC/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC/action/storage_attestation","attest_author":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC/action/author_attestation","sign_citation":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC/action/citation_signature","submit_replication":"https://pith.science/pith/4LYXWUIQT54XZQ5VC2XUC77MLC/action/replication_record"}},"created_at":"2026-05-18T02:58:34.283241+00:00","updated_at":"2026-05-18T02:58:34.283241+00:00"}