{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:GIKSL5R7OXS2E5PBS2Z6E3GVGT","short_pith_number":"pith:GIKSL5R7","schema_version":"1.0","canonical_sha256":"321525f63f75e5a275e196b3e26cd534c6013047fe667098875dd736cb24cce8","source":{"kind":"arxiv","id":"2509.14562","version":3},"attestation_state":"computed","paper":{"title":"LiMuon: Light and Fast Muon Optimizer for Large Models","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Feihu Huang, Songcan Chen, Yuning Luo","submitted_at":"2025-09-18T02:49:27Z","abstract_excerpt":"Large models recently are widely applied in machine learning, so efficient training of large models has received widespread attention. More recently, the useful Muon optimizer is specifically designed for matrix-structured parameters of large models. Although some works have begun to study the Muon optimizer, the existing Muon and its variants still suffer from high sample complexity or high memory for large models. To fill this gap, we propose a light and fast Muon (LiMuon) optimizer for training large models, which builds on the momentum-based variance reduced technique and randomized Singul"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2509.14562","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","primary_cat":"cs.LG","submitted_at":"2025-09-18T02:49:27Z","cross_cats_sorted":["math.OC"],"title_canon_sha256":"b41447fb80b2283531571d91210766eb11eab7326b8834fcf0cdc0ad65d692d7","abstract_canon_sha256":"1e3b8bedc5c9fcc2fd7f63c7704ffcde1879750904718ab6339ff20adb575169"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-01T01:02:20.747377Z","signature_b64":"dzpUhyCWgnnUmMUREkFWL3s918biBe1ahnk5QvL0j/W6A5Z9RSA6I/+T9XiC7IppYOsDLu8b+3qyGxxW5zVnBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"321525f63f75e5a275e196b3e26cd534c6013047fe667098875dd736cb24cce8","last_reissued_at":"2026-06-01T01:02:20.746492Z","signature_status":"signed_v1","first_computed_at":"2026-06-01T01:02:20.746492Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"LiMuon: Light and Fast Muon Optimizer for Large Models","license":"http://creativecommons.org/licenses/by-nc-sa/4.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Feihu Huang, Songcan Chen, Yuning Luo","submitted_at":"2025-09-18T02:49:27Z","abstract_excerpt":"Large models recently are widely applied in machine learning, so efficient training of large models has received widespread attention. More recently, the useful Muon optimizer is specifically designed for matrix-structured parameters of large models. Although some works have begun to study the Muon optimizer, the existing Muon and its variants still suffer from high sample complexity or high memory for large models. To fill this gap, we propose a light and fast Muon (LiMuon) optimizer for training large models, which builds on the momentum-based variance reduced technique and randomized Singul"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.14562","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.14562/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2509.14562","created_at":"2026-06-01T01:02:20.746659+00:00"},{"alias_kind":"arxiv_version","alias_value":"2509.14562v3","created_at":"2026-06-01T01:02:20.746659+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.14562","created_at":"2026-06-01T01:02:20.746659+00:00"},{"alias_kind":"pith_short_12","alias_value":"GIKSL5R7OXS2","created_at":"2026-06-01T01:02:20.746659+00:00"},{"alias_kind":"pith_short_16","alias_value":"GIKSL5R7OXS2E5PB","created_at":"2026-06-01T01:02:20.746659+00:00"},{"alias_kind":"pith_short_8","alias_value":"GIKSL5R7","created_at":"2026-06-01T01:02:20.746659+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2605.18106","citing_title":"Symmetry-Compatible Principle for Optimizer Design: Embeddings, LM Heads, SwiGLU MLPs, and MoE Routers","ref_index":71,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19619","citing_title":"MiMuon: Mixed Muon Optimizer with Improved Generalization for Large Models","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.19811","citing_title":"LionMuon: Alternating Spectral and Sign Descent for Efficient Training","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2603.28254","citing_title":"MuonEq: Balancing Before Orthogonalization with Lightweight Equilibration","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06615","citing_title":"When and Why SignSGD Outperforms SGD: A Theoretical Study Based on $\\ell_1$-norm Lower Bounds","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2604.10689","citing_title":"Communication-Efficient Gluon in Federated Learning","ref_index":12,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT","json":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT.json","graph_json":"https://pith.science/api/pith-number/GIKSL5R7OXS2E5PBS2Z6E3GVGT/graph.json","events_json":"https://pith.science/api/pith-number/GIKSL5R7OXS2E5PBS2Z6E3GVGT/events.json","paper":"https://pith.science/paper/GIKSL5R7"},"agent_actions":{"view_html":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT","download_json":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT.json","view_paper":"https://pith.science/paper/GIKSL5R7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2509.14562&json=true","fetch_graph":"https://pith.science/api/pith-number/GIKSL5R7OXS2E5PBS2Z6E3GVGT/graph.json","fetch_events":"https://pith.science/api/pith-number/GIKSL5R7OXS2E5PBS2Z6E3GVGT/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT/action/timestamp_anchor","attest_storage":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT/action/storage_attestation","attest_author":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT/action/author_attestation","sign_citation":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT/action/citation_signature","submit_replication":"https://pith.science/pith/GIKSL5R7OXS2E5PBS2Z6E3GVGT/action/replication_record"}},"created_at":"2026-06-01T01:02:20.746659+00:00","updated_at":"2026-06-01T01:02:20.746659+00:00"}