{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:RW2DRVDFL7GRVAJPBJ7BANIHMU","short_pith_number":"pith:RW2DRVDF","schema_version":"1.0","canonical_sha256":"8db438d4655fcd1a812f0a7e1035076514dcbeb006c13bc64cfbe3ec9de6fdcd","source":{"kind":"arxiv","id":"2507.01598","version":5},"attestation_state":"computed","paper":{"title":"Convergence Bound and Critical Batch Size of Muon Optimizer","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Hideaki Iiduka, Hiroki Naganuma, Naoki Sato","submitted_at":"2025-07-02T11:03:13Z","abstract_excerpt":"Muon, a recently proposed optimizer that leverages the inherent matrix structure of neural network parameters, has demonstrated strong empirical performance, indicating its potential as a successor to standard optimizers such as AdamW. This paper presents theoretical analysis to support its practical success. We provide convergence proofs for Muon across four practical settings, systematically examining its behavior with and without the inclusion of Nesterov momentum and weight decay. We then demonstrate that the addition of weight decay ensures almost-sure boundedness of the parameter and gra"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2507.01598","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2025-07-02T11:03:13Z","cross_cats_sorted":[],"title_canon_sha256":"f29d9d5b821d1df470c0e3eca5272b0e226bb21850df3221026ab624e99922a3","abstract_canon_sha256":"805e6514cd09c1431516f4e3c82e6ff31864131f662abdd24c479e6773a95d8a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T02:07:04.960023Z","signature_b64":"/3E2v8w2mFzDeK2gvd1rOT01/tj2dGV4+9qaJ9+6S7/DBo2PRE+l1XhA0IWExy6fL5C6fid9Atw5Tz1RuAYVBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"8db438d4655fcd1a812f0a7e1035076514dcbeb006c13bc64cfbe3ec9de6fdcd","last_reissued_at":"2026-06-09T02:07:04.958953Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T02:07:04.958953Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Convergence Bound and Critical Batch Size of Muon Optimizer","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Hideaki Iiduka, Hiroki Naganuma, Naoki Sato","submitted_at":"2025-07-02T11:03:13Z","abstract_excerpt":"Muon, a recently proposed optimizer that leverages the inherent matrix structure of neural network parameters, has demonstrated strong empirical performance, indicating its potential as a successor to standard optimizers such as AdamW. This paper presents theoretical analysis to support its practical success. We provide convergence proofs for Muon across four practical settings, systematically examining its behavior with and without the inclusion of Nesterov momentum and weight decay. We then demonstrate that the addition of weight decay ensures almost-sure boundedness of the parameter and gra"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2507.01598","kind":"arxiv","version":5},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2507.01598/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2507.01598","created_at":"2026-06-09T02:07:04.959113+00:00"},{"alias_kind":"arxiv_version","alias_value":"2507.01598v5","created_at":"2026-06-09T02:07:04.959113+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2507.01598","created_at":"2026-06-09T02:07:04.959113+00:00"},{"alias_kind":"pith_short_12","alias_value":"RW2DRVDFL7GR","created_at":"2026-06-09T02:07:04.959113+00:00"},{"alias_kind":"pith_short_16","alias_value":"RW2DRVDFL7GRVAJP","created_at":"2026-06-09T02:07:04.959113+00:00"},{"alias_kind":"pith_short_8","alias_value":"RW2DRVDF","created_at":"2026-06-09T02:07:04.959113+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":10,"internal_anchor_count":10,"sample":[{"citing_arxiv_id":"2605.23061","citing_title":"Anytime Training with Schedule-Free Spectral Optimization","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16341","citing_title":"Orth-Dion: Eliminating Geometric Mismatch in Distributed Low-Rank Spectral Optimization","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2509.11983","citing_title":"Low-rank Orthogonalization for Large-scale Matrix Optimization with Applications to Foundation Model Training","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12994","citing_title":"DP-Muon: Differentially Private Optimization via Matrix-Orthogonalized Momentum","ref_index":6,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08980","citing_title":"Muon Does Not Converge on Convex Lipschitz Functions","ref_index":84,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09238","citing_title":"Intrinsic Muon: Spectral Optimization on Riemannian Matrix Manifolds","ref_index":52,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06615","citing_title":"When and Why SignSGD Outperforms SGD: A Theoretical Study Based on $\\ell_1$-norm Lower Bounds","ref_index":28,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05577","citing_title":"Accelerating LMO-Based Optimization via Implicit Gradient Transport","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2604.21616","citing_title":"Convergence Rate Analysis of SOAP with Arbitrary Orthogonal Projection Matrices","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06884","citing_title":"Muon with Nesterov Momentum: Heavy-Tailed Noise and (Randomized) Inexact Polar Decomposition","ref_index":44,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU","json":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU.json","graph_json":"https://pith.science/api/pith-number/RW2DRVDFL7GRVAJPBJ7BANIHMU/graph.json","events_json":"https://pith.science/api/pith-number/RW2DRVDFL7GRVAJPBJ7BANIHMU/events.json","paper":"https://pith.science/paper/RW2DRVDF"},"agent_actions":{"view_html":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU","download_json":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU.json","view_paper":"https://pith.science/paper/RW2DRVDF","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2507.01598&json=true","fetch_graph":"https://pith.science/api/pith-number/RW2DRVDFL7GRVAJPBJ7BANIHMU/graph.json","fetch_events":"https://pith.science/api/pith-number/RW2DRVDFL7GRVAJPBJ7BANIHMU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU/action/storage_attestation","attest_author":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU/action/author_attestation","sign_citation":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU/action/citation_signature","submit_replication":"https://pith.science/pith/RW2DRVDFL7GRVAJPBJ7BANIHMU/action/replication_record"}},"created_at":"2026-06-09T02:07:04.959113+00:00","updated_at":"2026-06-09T02:07:04.959113+00:00"}