{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2017:SKJZJQ7ZIEOW6PLDT7BTC3I66P","short_pith_number":"pith:SKJZJQ7Z","schema_version":"1.0","canonical_sha256":"929394c3f9411d6f3d639fc3316d1ef3fd01bd461ad2a89afc8f11284ef108f5","source":{"kind":"arxiv","id":"1706.04454","version":3},"attestation_state":"computed","paper":{"title":"Empirical Analysis of the Hessian of Over-Parametrized Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Leon Bottou, Levent Sagun, Utku Evci, V. Ugur Guney, Yann Dauphin","submitted_at":"2017-06-14T12:50:00Z","abstract_excerpt":"We study the properties of common loss surfaces through their Hessian matrix. In particular, in the context of deep learning, we empirically show that the spectrum of the Hessian is composed of two parts: (1) the bulk centered near zero, (2) and outliers away from the bulk. We present numerical evidence and mathematical justifications to the following conjectures laid out by Sagun et al. (2016): Fixing data, increasing the number of parameters merely scales the bulk of the spectrum; fixing the dimension and changing the data (for instance adding more clusters or making the data less separable)"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1706.04454","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2017-06-14T12:50:00Z","cross_cats_sorted":[],"title_canon_sha256":"8f65047c1bd0116a91b51e7a0bc79b3bf369107262955beabe1df872d4779bb9","abstract_canon_sha256":"f5e3ad147b8b566e36896de2c8d56201c1a248da8bf4a83b781ef3a5c137099c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:16:45.252801Z","signature_b64":"R4+PD7EnY6FfxF7Jfi7P4WAH4d9ZAIXqc6w94Gmb0ZOoAvUQuk4kyEA0p40uXZqJXi1cEA9qteWho7T9OLHUCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"929394c3f9411d6f3d639fc3316d1ef3fd01bd461ad2a89afc8f11284ef108f5","last_reissued_at":"2026-05-18T00:16:45.252198Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:16:45.252198Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Empirical Analysis of the Hessian of Over-Parametrized Neural Networks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Leon Bottou, Levent Sagun, Utku Evci, V. Ugur Guney, Yann Dauphin","submitted_at":"2017-06-14T12:50:00Z","abstract_excerpt":"We study the properties of common loss surfaces through their Hessian matrix. In particular, in the context of deep learning, we empirically show that the spectrum of the Hessian is composed of two parts: (1) the bulk centered near zero, (2) and outliers away from the bulk. We present numerical evidence and mathematical justifications to the following conjectures laid out by Sagun et al. (2016): Fixing data, increasing the number of parameters merely scales the bulk of the spectrum; fixing the dimension and changing the data (for instance adding more clusters or making the data less separable)"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1706.04454","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1706.04454","created_at":"2026-05-18T00:16:45.252282+00:00"},{"alias_kind":"arxiv_version","alias_value":"1706.04454v3","created_at":"2026-05-18T00:16:45.252282+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1706.04454","created_at":"2026-05-18T00:16:45.252282+00:00"},{"alias_kind":"pith_short_12","alias_value":"SKJZJQ7ZIEOW","created_at":"2026-05-18T12:31:43.269735+00:00"},{"alias_kind":"pith_short_16","alias_value":"SKJZJQ7ZIEOW6PLD","created_at":"2026-05-18T12:31:43.269735+00:00"},{"alias_kind":"pith_short_8","alias_value":"SKJZJQ7Z","created_at":"2026-05-18T12:31:43.269735+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":24,"internal_anchor_count":14,"sample":[{"citing_arxiv_id":"1906.10822","citing_title":"Gradient Noise Convolution (GNC): Smoothing Loss Function for Distributed Large-Batch SGD","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23061","citing_title":"Anytime Training with Schedule-Free Spectral Optimization","ref_index":53,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23087","citing_title":"The Implicit Bias of Depth: From Neural Collapse to Softmax Codes","ref_index":103,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23476","citing_title":"Non-normal spectral signatures of instability in neural network training dynamics","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"1907.02911","citing_title":"Weight-space symmetry in deep networks gives rise to permutation saddles, connected by equal-loss valleys across the loss landscape","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"1907.10732","citing_title":"Hessian based analysis of SGD for Deep Nets: Dynamics and Generalization","ref_index":65,"is_internal_anchor":true},{"citing_arxiv_id":"2605.22432","citing_title":"AMUSE: Anytime Muon with Stable Gradient Evaluation","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2602.00545","citing_title":"Depth, Not Data: An Analysis of Hessian Spectral Bifurcation","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07790","citing_title":"Hessian Surgery: Class-Targeted Post-Hoc Rebalancing via Hessian Spike Perturbation","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2505.23737","citing_title":"On the Convergence Analysis of Muon","ref_index":20,"is_internal_anchor":true},{"citing_arxiv_id":"2602.15823","citing_title":"CrispEdit: Low-Curvature Projections for Scalable Non-Destructive LLM Editing","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2603.20527","citing_title":"RMNP: Row-Momentum Normalized Preconditioning for Scalable Matrix-Based Optimization","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28964","citing_title":"Spectral Edge Dynamics: An Analytical-Empirical Study of Phase Transitions in Neural Network Training","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13214","citing_title":"Backdoor Channels Hidden in Latent Space: Cryptographic Undetectability in Modern Neural Networks","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03068","citing_title":"Escape dynamics and implicit bias of one-pass SGD in overparameterized quadratic networks","ref_index":49,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09345","citing_title":"Selection Plateau and a Sparsity-Dependent Hierarchy of Pruning Features","ref_index":21,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09209","citing_title":"Select-then-differentiate: Solving Bilevel Optimization with Manifold Lower-level Solution Sets","ref_index":9,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06615","citing_title":"When and Why SignSGD Outperforms SGD: A Theoretical Study Based on $\\ell_1$-norm Lower Bounds","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06081","citing_title":"Fast Gauss-Newton for Multiclass Cross-Entropy","ref_index":33,"is_internal_anchor":false},{"citing_arxiv_id":"2605.02850","citing_title":"Quantum Tilted Loss in Variational Optimization: Theory and Applications","ref_index":48,"is_internal_anchor":false},{"citing_arxiv_id":"2605.07790","citing_title":"Hessian Surgery: Class-Targeted Post-Hoc Rebalancing via Hessian Spike Perturbation","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2604.07380","citing_title":"The Lifecycle of the Spectral Edge: From Gradient Learning to Weight-Decay Compression","ref_index":14,"is_internal_anchor":false},{"citing_arxiv_id":"2604.14751","citing_title":"Exploiting Correlations in Federated Learning: Opportunities and Practical Limitations","ref_index":32,"is_internal_anchor":false},{"citing_arxiv_id":"2604.19740","citing_title":"Generalization at the Edge of Stability","ref_index":63,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P","json":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P.json","graph_json":"https://pith.science/api/pith-number/SKJZJQ7ZIEOW6PLDT7BTC3I66P/graph.json","events_json":"https://pith.science/api/pith-number/SKJZJQ7ZIEOW6PLDT7BTC3I66P/events.json","paper":"https://pith.science/paper/SKJZJQ7Z"},"agent_actions":{"view_html":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P","download_json":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P.json","view_paper":"https://pith.science/paper/SKJZJQ7Z","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1706.04454&json=true","fetch_graph":"https://pith.science/api/pith-number/SKJZJQ7ZIEOW6PLDT7BTC3I66P/graph.json","fetch_events":"https://pith.science/api/pith-number/SKJZJQ7ZIEOW6PLDT7BTC3I66P/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P/action/timestamp_anchor","attest_storage":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P/action/storage_attestation","attest_author":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P/action/author_attestation","sign_citation":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P/action/citation_signature","submit_replication":"https://pith.science/pith/SKJZJQ7ZIEOW6PLDT7BTC3I66P/action/replication_record"}},"created_at":"2026-05-18T00:16:45.252282+00:00","updated_at":"2026-05-18T00:16:45.252282+00:00"}