{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:WJXR7AP6QAEQT33UZ2LPOPUGZ3","short_pith_number":"pith:WJXR7AP6","schema_version":"1.0","canonical_sha256":"b26f1f81fe800909ef74ce96f73e86cee4c0ee201b1e7dfa29a513436a577278","source":{"kind":"arxiv","id":"2506.04805","version":2},"attestation_state":"computed","paper":{"title":"Adaptive Preconditioners Trigger Loss Spikes in Adam","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Feiyu Xiong, Hongkang Yang, Jiajie Zhao, Xiaolong Li, Yaoyu Zhang, Zhangchen Zhou, Zhi-Qin John Xu, Zhiwei Bai, Zhiyu Li","submitted_at":"2025-06-05T09:31:41Z","abstract_excerpt":"Loss spikes commonly emerge during neural network training with the Adam optimizer across diverse architectures and scales, yet their underlying mechanism remains elusive. While previous explanations attribute these phenomena to sharper loss landscapes at lower loss, we show that landscape geometry alone is insufficient to explain the phenomenon. In this work, we pinpoint the root cause in the internal dynamics of Adam's second moment estimator. We identify a critical ``decoupling'' mechanism where the adaptive preconditioner $v_t$ fails to track the instantaneous squared gradients $g_t^2$, ca"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2506.04805","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-06-05T09:31:41Z","cross_cats_sorted":[],"title_canon_sha256":"ab504f83c6d231fe65aaed65b43ff2c35919ba7a7676ff3699a3a7de2401676c","abstract_canon_sha256":"a1564c2bb461d269c3cd0f4296c4165cf5cbd170d3e3b29117ad8c79658f12bd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T02:03:51.431152Z","signature_b64":"SWvv0ZgHU+4dLa0BAYvLxgSKMHfXUHliu3N9ZkscSg/3d51J5CTJXxALUpagvzY7xIwRrwmzh62A6CbZZKm4Bw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"b26f1f81fe800909ef74ce96f73e86cee4c0ee201b1e7dfa29a513436a577278","last_reissued_at":"2026-05-26T02:03:51.430096Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T02:03:51.430096Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Adaptive Preconditioners Trigger Loss Spikes in Adam","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Feiyu Xiong, Hongkang Yang, Jiajie Zhao, Xiaolong Li, Yaoyu Zhang, Zhangchen Zhou, Zhi-Qin John Xu, Zhiwei Bai, Zhiyu Li","submitted_at":"2025-06-05T09:31:41Z","abstract_excerpt":"Loss spikes commonly emerge during neural network training with the Adam optimizer across diverse architectures and scales, yet their underlying mechanism remains elusive. While previous explanations attribute these phenomena to sharper loss landscapes at lower loss, we show that landscape geometry alone is insufficient to explain the phenomenon. In this work, we pinpoint the root cause in the internal dynamics of Adam's second moment estimator. We identify a critical ``decoupling'' mechanism where the adaptive preconditioner $v_t$ fails to track the instantaneous squared gradients $g_t^2$, ca"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2506.04805","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2506.04805/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2506.04805","created_at":"2026-05-26T02:03:51.430243+00:00"},{"alias_kind":"arxiv_version","alias_value":"2506.04805v2","created_at":"2026-05-26T02:03:51.430243+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2506.04805","created_at":"2026-05-26T02:03:51.430243+00:00"},{"alias_kind":"pith_short_12","alias_value":"WJXR7AP6QAEQ","created_at":"2026-05-26T02:03:51.430243+00:00"},{"alias_kind":"pith_short_16","alias_value":"WJXR7AP6QAEQT33U","created_at":"2026-05-26T02:03:51.430243+00:00"},{"alias_kind":"pith_short_8","alias_value":"WJXR7AP6","created_at":"2026-05-26T02:03:51.430243+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":4,"internal_anchor_count":4,"sample":[{"citing_arxiv_id":"2605.06152","citing_title":"Grokking or Glitching? How Low-Precision Drives Slingshot Loss Spikes","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06152","citing_title":"Grokking or Glitching? How Low-Precision Drives Slingshot Loss Spikes","ref_index":25,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06166","citing_title":"One Algorithm, Two Goals: Dual Scoring for Parameter and Data Selection in LLM Fine-Tuning","ref_index":98,"is_internal_anchor":true},{"citing_arxiv_id":"2604.26687","citing_title":"COPUS: Co-adaptive Parallelism and Batch Size Selection in Large Language Model Training","ref_index":1,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3","json":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3.json","graph_json":"https://pith.science/api/pith-number/WJXR7AP6QAEQT33UZ2LPOPUGZ3/graph.json","events_json":"https://pith.science/api/pith-number/WJXR7AP6QAEQT33UZ2LPOPUGZ3/events.json","paper":"https://pith.science/paper/WJXR7AP6"},"agent_actions":{"view_html":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3","download_json":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3.json","view_paper":"https://pith.science/paper/WJXR7AP6","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2506.04805&json=true","fetch_graph":"https://pith.science/api/pith-number/WJXR7AP6QAEQT33UZ2LPOPUGZ3/graph.json","fetch_events":"https://pith.science/api/pith-number/WJXR7AP6QAEQT33UZ2LPOPUGZ3/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3/action/timestamp_anchor","attest_storage":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3/action/storage_attestation","attest_author":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3/action/author_attestation","sign_citation":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3/action/citation_signature","submit_replication":"https://pith.science/pith/WJXR7AP6QAEQT33UZ2LPOPUGZ3/action/replication_record"}},"created_at":"2026-05-26T02:03:51.430243+00:00","updated_at":"2026-05-26T02:03:51.430243+00:00"}