{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:XMKKDZROK3OLAGJX5VFA7YKND6","short_pith_number":"pith:XMKKDZRO","schema_version":"1.0","canonical_sha256":"bb14a1e62e56dcb01937ed4a0fe14d1fbaac407bbe1fa6016ea2d878c930620b","source":{"kind":"arxiv","id":"2502.11034","version":3},"attestation_state":"computed","paper":{"title":"AdaGC: Enhancing LLM Pretraining Stability via Adaptive Gradient Clipping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Congliang Chen, Dianhai Yu, Guoxia Wang, JiaBin Yang, Jinle Zeng, Li Shen, Shuai Li, Yanjun Ma","submitted_at":"2025-02-16T08:13:23Z","abstract_excerpt":"Loss spikes remain a persistent obstacle in large-scale language model pretraining. While previous research has attempted to identify the root cause of loss spikes by investigating individual factors, we observe that, in practice, such spikes are typically triggered by the confluence of heterogeneous factors. Empirically, loss spikes may arise from a combination of data outliers, hardware or transient computational faults, numerical precision issues, and hyperparameter settings. Regardless of the underlying cause, these spikes manifest as unstable optimizer updates, as abnormal gradients conta"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2502.11034","kind":"arxiv","version":3},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-02-16T08:13:23Z","cross_cats_sorted":[],"title_canon_sha256":"da1d429176ea1a439e9ad17defaee5d76f3da5332069187e9fd5d104724c1e94","abstract_canon_sha256":"e33d8d67eed9faf296c1a4dfde403d07356cb250b95866b1a89a79a87b6a3e39"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-10T01:09:15.419088Z","signature_b64":"IWyk/xzy1NTn2v+RQlRrprWxVOJvo2hWoaXdKrfxMWHrPf4/MUsL1Eed3UN/Z12JKmXSoTAo2HSeIUoM7zGODw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"bb14a1e62e56dcb01937ed4a0fe14d1fbaac407bbe1fa6016ea2d878c930620b","last_reissued_at":"2026-06-10T01:09:15.417904Z","signature_status":"signed_v1","first_computed_at":"2026-06-10T01:09:15.417904Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"AdaGC: Enhancing LLM Pretraining Stability via Adaptive Gradient Clipping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Congliang Chen, Dianhai Yu, Guoxia Wang, JiaBin Yang, Jinle Zeng, Li Shen, Shuai Li, Yanjun Ma","submitted_at":"2025-02-16T08:13:23Z","abstract_excerpt":"Loss spikes remain a persistent obstacle in large-scale language model pretraining. While previous research has attempted to identify the root cause of loss spikes by investigating individual factors, we observe that, in practice, such spikes are typically triggered by the confluence of heterogeneous factors. Empirically, loss spikes may arise from a combination of data outliers, hardware or transient computational faults, numerical precision issues, and hyperparameter settings. Regardless of the underlying cause, these spikes manifest as unstable optimizer updates, as abnormal gradients conta"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2502.11034","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2502.11034/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2502.11034","created_at":"2026-06-10T01:09:15.418088+00:00"},{"alias_kind":"arxiv_version","alias_value":"2502.11034v3","created_at":"2026-06-10T01:09:15.418088+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.11034","created_at":"2026-06-10T01:09:15.418088+00:00"},{"alias_kind":"pith_short_12","alias_value":"XMKKDZROK3OL","created_at":"2026-06-10T01:09:15.418088+00:00"},{"alias_kind":"pith_short_16","alias_value":"XMKKDZROK3OLAGJX","created_at":"2026-06-10T01:09:15.418088+00:00"},{"alias_kind":"pith_short_8","alias_value":"XMKKDZRO","created_at":"2026-06-10T01:09:15.418088+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.08809","citing_title":"SimReg: Achieving Higher Performance in the Pretraining via Embedding Similarity Regularization","ref_index":15,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6","json":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6.json","graph_json":"https://pith.science/api/pith-number/XMKKDZROK3OLAGJX5VFA7YKND6/graph.json","events_json":"https://pith.science/api/pith-number/XMKKDZROK3OLAGJX5VFA7YKND6/events.json","paper":"https://pith.science/paper/XMKKDZRO"},"agent_actions":{"view_html":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6","download_json":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6.json","view_paper":"https://pith.science/paper/XMKKDZRO","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2502.11034&json=true","fetch_graph":"https://pith.science/api/pith-number/XMKKDZROK3OLAGJX5VFA7YKND6/graph.json","fetch_events":"https://pith.science/api/pith-number/XMKKDZROK3OLAGJX5VFA7YKND6/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6/action/timestamp_anchor","attest_storage":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6/action/storage_attestation","attest_author":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6/action/author_attestation","sign_citation":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6/action/citation_signature","submit_replication":"https://pith.science/pith/XMKKDZROK3OLAGJX5VFA7YKND6/action/replication_record"}},"created_at":"2026-06-10T01:09:15.418088+00:00","updated_at":"2026-06-10T01:09:15.418088+00:00"}