{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:57AUUJAEAMMXTZ7IWUHVM3XSAL","short_pith_number":"pith:57AUUJAE","schema_version":"1.0","canonical_sha256":"efc14a2404031979e7e8b50f566ef202eeeb6dcbb1c7db9b7105fdbd73334c2e","source":{"kind":"arxiv","id":"2502.07529","version":2},"attestation_state":"computed","paper":{"title":"Training Deep Learning Models with Norm-Constrained LMOs","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Antonio Silveti-Falls, Kimon Antonakopoulos, Thomas Pethick, Volkan Cevher, Wanyun Xie, Zhenyu Zhu","submitted_at":"2025-02-11T13:10:34Z","abstract_excerpt":"In this work, we study optimization methods that leverage the linear minimization oracle (LMO) over a norm-ball. We propose a new stochastic family of algorithms that uses the LMO to adapt to the geometry of the problem and, perhaps surprisingly, show that they can be applied to unconstrained problems. The resulting update rule unifies several existing optimization methods under a single framework. Furthermore, we propose an explicit choice of norm for deep architectures, which, as a side benefit, leads to the transferability of hyperparameters across model sizes. Experimentally, we demonstrat"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2502.07529","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-02-11T13:10:34Z","cross_cats_sorted":["math.OC"],"title_canon_sha256":"9e53f7a72f384884a4531cd4b63c7246a2a95065ea0c7e0972312afaffd25816","abstract_canon_sha256":"fc403908c7efc199015d6bd89b7d5fe565d4c174eac1238491602e3a7bbe35dd"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-21T21:14:43.496681Z","signature_b64":"sMhCXS0QIkM3M/qOD7Pbsf0ke7gluGyZGsNhCFIFTT68XGgB4UNGS9z/zbTAg9xXragOh9/V7Yq0/sdrcsH5Dw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"efc14a2404031979e7e8b50f566ef202eeeb6dcbb1c7db9b7105fdbd73334c2e","last_reissued_at":"2026-05-21T21:14:43.494748Z","signature_status":"signed_v1","first_computed_at":"2026-05-21T21:14:43.494748Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Training Deep Learning Models with Norm-Constrained LMOs","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Antonio Silveti-Falls, Kimon Antonakopoulos, Thomas Pethick, Volkan Cevher, Wanyun Xie, Zhenyu Zhu","submitted_at":"2025-02-11T13:10:34Z","abstract_excerpt":"In this work, we study optimization methods that leverage the linear minimization oracle (LMO) over a norm-ball. We propose a new stochastic family of algorithms that uses the LMO to adapt to the geometry of the problem and, perhaps surprisingly, show that they can be applied to unconstrained problems. The resulting update rule unifies several existing optimization methods under a single framework. Furthermore, we propose an explicit choice of norm for deep architectures, which, as a side benefit, leads to the transferability of hyperparameters across model sizes. Experimentally, we demonstrat"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2502.07529","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2502.07529/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2502.07529","created_at":"2026-05-21T21:14:43.494854+00:00"},{"alias_kind":"arxiv_version","alias_value":"2502.07529v2","created_at":"2026-05-21T21:14:43.494854+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2502.07529","created_at":"2026-05-21T21:14:43.494854+00:00"},{"alias_kind":"pith_short_12","alias_value":"57AUUJAEAMMX","created_at":"2026-05-21T21:14:43.494854+00:00"},{"alias_kind":"pith_short_16","alias_value":"57AUUJAEAMMXTZ7I","created_at":"2026-05-21T21:14:43.494854+00:00"},{"alias_kind":"pith_short_8","alias_value":"57AUUJAE","created_at":"2026-05-21T21:14:43.494854+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":21,"internal_anchor_count":21,"sample":[{"citing_arxiv_id":"2506.16659","citing_title":"Memory-Efficient LLM Pretraining via Minimalist Optimizer Design","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2510.10777","citing_title":"Preconditioned Norms: A Unified Framework for Steepest Descent, Quasi-Newton and Adaptive Methods","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21104","citing_title":"HORST: Composing Optimizer Geometries for Sparse Transformer Training","ref_index":211,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18999","citing_title":"Distance-Aware Muon: Adaptive Step Scaling for Normalized Optimization","ref_index":7,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19619","citing_title":"MiMuon: Mixed Muon Optimizer with Improved Generalization for Large Models","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2505.23737","citing_title":"On the Convergence Analysis of Muon","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2509.15816","citing_title":"On the Convergence of Muon and Beyond","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2509.11983","citing_title":"Low-rank Orthogonalization for Large-scale Matrix Optimization with Applications to Foundation Model Training","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28254","citing_title":"MuonEq: Balancing Before Orthogonalization with Lightweight Equilibration","ref_index":39,"is_internal_anchor":true},{"citing_arxiv_id":"2605.13434","citing_title":"Rescaled Asynchronous SGD: Optimal Distributed Optimization under Data and System Heterogeneity","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11838","citing_title":"Gradient Clipping Beyond Vector Norms: A Spectral Approach for Matrix-Valued Parameters","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12492","citing_title":"Pion: A Spectrum-Preserving Optimizer via Orthogonal Equivalence Transformation","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11181","citing_title":"Muon is Not That Special: Random or Inverted Spectra Work Just as Well","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09238","citing_title":"Intrinsic Muon: Spectral Optimization on Riemannian Matrix Manifolds","ref_index":49,"is_internal_anchor":true},{"citing_arxiv_id":"2604.23980","citing_title":"SUDA-Muon: Structural Design Principles and Boundaries for Fully Decentralized Muon","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06654","citing_title":"Optimizer-Model Consistency: Full Finetuning with the Same Optimizer as Pretraining Forgets Less","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.04418","citing_title":"Demystifying Manifold Constraints in LLM Pre-training","ref_index":34,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00650","citing_title":"AdaMeZO: Adam-style Zeroth-Order Optimizer for LLM Fine-tuning Without Maintaining the Moments","ref_index":69,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10689","citing_title":"Communication-Efficient Gluon in Federated Learning","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2604.04726","citing_title":"A Muon-Accelerated Algorithm for Low Separation Rank Tensor Generalized Linear Models","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2604.17423","citing_title":"A unified convergence theory for adaptive first-order methods in the nonconvex case, including AdaNorm, full and diagonal AdaGrad, Shampoo and Muo","ref_index":41,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL","json":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL.json","graph_json":"https://pith.science/api/pith-number/57AUUJAEAMMXTZ7IWUHVM3XSAL/graph.json","events_json":"https://pith.science/api/pith-number/57AUUJAEAMMXTZ7IWUHVM3XSAL/events.json","paper":"https://pith.science/paper/57AUUJAE"},"agent_actions":{"view_html":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL","download_json":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL.json","view_paper":"https://pith.science/paper/57AUUJAE","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2502.07529&json=true","fetch_graph":"https://pith.science/api/pith-number/57AUUJAEAMMXTZ7IWUHVM3XSAL/graph.json","fetch_events":"https://pith.science/api/pith-number/57AUUJAEAMMXTZ7IWUHVM3XSAL/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL/action/timestamp_anchor","attest_storage":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL/action/storage_attestation","attest_author":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL/action/author_attestation","sign_citation":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL/action/citation_signature","submit_replication":"https://pith.science/pith/57AUUJAEAMMXTZ7IWUHVM3XSAL/action/replication_record"}},"created_at":"2026-05-21T21:14:43.494854+00:00","updated_at":"2026-05-21T21:14:43.494854+00:00"}