{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:BPBE6CBM2PMKAIUWNBNDYRORLE","short_pith_number":"pith:BPBE6CBM","schema_version":"1.0","canonical_sha256":"0bc24f082cd3d8a02296685a3c45d15915b50f104c67787c94efd66184ebdf0b","source":{"kind":"arxiv","id":"2603.14315","version":2},"attestation_state":"computed","paper":{"title":"Enhancing LLM Training via Spectral Clipping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Andrei Semenov, Sebastian U. Stich, Xiaowen Jiang","submitted_at":"2026-03-15T10:16:40Z","abstract_excerpt":"While spectral-based optimizers like Muon operate directly on the spectrum of updates, standard adaptive methods such as AdamW do not account for the spectral structure of weights and gradients, leaving them vulnerable to two empirical issues in large language model (LLM) training: (i) the optimizer updates can have large spectral norms, potentially destabilizing training and degrading generalization; (ii) stochastic gradient noise can exhibit sparse spectral spikes, with a few dominant singular values much larger than the rest. We propose SPECTRA, a general framework addressing these by (i) p"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2603.14315","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-03-15T10:16:40Z","cross_cats_sorted":["math.OC"],"title_canon_sha256":"23022389bd0c8616bd007fde1a55b44f58b519dbe10b257f2ad0d9c9fc44230a","abstract_canon_sha256":"d38863a22bad0ee685062044f4266675dbb00ecf6f6749e8bd0cee5b09471034"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T02:05:42.873640Z","signature_b64":"yiRF0aEEyFhfZWXpHfs8clt+WLQNfxDLomcmlk1MI/6V8qL0Y1yz9U+dezQ0mLzxF+bi6x1Fy+0WNqv1aytEDA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"0bc24f082cd3d8a02296685a3c45d15915b50f104c67787c94efd66184ebdf0b","last_reissued_at":"2026-05-29T02:05:42.872905Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T02:05:42.872905Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Enhancing LLM Training via Spectral Clipping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Andrei Semenov, Sebastian U. Stich, Xiaowen Jiang","submitted_at":"2026-03-15T10:16:40Z","abstract_excerpt":"While spectral-based optimizers like Muon operate directly on the spectrum of updates, standard adaptive methods such as AdamW do not account for the spectral structure of weights and gradients, leaving them vulnerable to two empirical issues in large language model (LLM) training: (i) the optimizer updates can have large spectral norms, potentially destabilizing training and degrading generalization; (ii) stochastic gradient noise can exhibit sparse spectral spikes, with a few dominant singular values much larger than the rest. We propose SPECTRA, a general framework addressing these by (i) p"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.14315","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.14315/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2603.14315","created_at":"2026-05-29T02:05:42.872996+00:00"},{"alias_kind":"arxiv_version","alias_value":"2603.14315v2","created_at":"2026-05-29T02:05:42.872996+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.14315","created_at":"2026-05-29T02:05:42.872996+00:00"},{"alias_kind":"pith_short_12","alias_value":"BPBE6CBM2PMK","created_at":"2026-05-29T02:05:42.872996+00:00"},{"alias_kind":"pith_short_16","alias_value":"BPBE6CBM2PMKAIUW","created_at":"2026-05-29T02:05:42.872996+00:00"},{"alias_kind":"pith_short_8","alias_value":"BPBE6CBM","created_at":"2026-05-29T02:05:42.872996+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2605.04418","citing_title":"Demystifying Manifold Constraints in LLM Pre-training","ref_index":3,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE","json":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE.json","graph_json":"https://pith.science/api/pith-number/BPBE6CBM2PMKAIUWNBNDYRORLE/graph.json","events_json":"https://pith.science/api/pith-number/BPBE6CBM2PMKAIUWNBNDYRORLE/events.json","paper":"https://pith.science/paper/BPBE6CBM"},"agent_actions":{"view_html":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE","download_json":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE.json","view_paper":"https://pith.science/paper/BPBE6CBM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2603.14315&json=true","fetch_graph":"https://pith.science/api/pith-number/BPBE6CBM2PMKAIUWNBNDYRORLE/graph.json","fetch_events":"https://pith.science/api/pith-number/BPBE6CBM2PMKAIUWNBNDYRORLE/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE/action/timestamp_anchor","attest_storage":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE/action/storage_attestation","attest_author":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE/action/author_attestation","sign_citation":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE/action/citation_signature","submit_replication":"https://pith.science/pith/BPBE6CBM2PMKAIUWNBNDYRORLE/action/replication_record"}},"created_at":"2026-05-29T02:05:42.872996+00:00","updated_at":"2026-05-29T02:05:42.872996+00:00"}