{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:Y7FO5DPZ3RPIRHZVWCIHCDC37P","short_pith_number":"pith:Y7FO5DPZ","schema_version":"1.0","canonical_sha256":"c7caee8df9dc5e889f35b090710c5bfbdf3a1bb35f8d3726305dda66fd07290a","source":{"kind":"arxiv","id":"2602.05725","version":2},"attestation_state":"computed","paper":{"title":"Muon in Associative Memory Learning: Training Dynamics and Scaling Laws","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Binghui Li, Han Zhong, Kaifei Wang, Liwei Wang, Pinyan Lu","submitted_at":"2026-02-05T14:49:40Z","abstract_excerpt":"Muon updates matrix parameters via the matrix sign of the gradient and has shown strong empirical gains, yet its dynamics and scaling behavior remain unclear in theory. We study Muon in a linear associative memory model with softmax retrieval and a hierarchical frequency spectrum over query-answer pairs, with and without label noise. In this setting, we show that Gradient Descent (GD) learns frequency components at highly imbalanced rates, leading to slow convergence bottlenecked by low-frequency components. In contrast, the Muon optimizer mitigates this imbalance, leading to faster and more u"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2602.05725","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-02-05T14:49:40Z","cross_cats_sorted":["math.OC","stat.ML"],"title_canon_sha256":"4f1fb0ef774a7c019fb49c2f192caaa153a6ed5a2c711e24bf5c112ad0330a2b","abstract_canon_sha256":"97d5142a20236def7c09759a806abb4dd7d0cddf87701fdae88f5754ea3de9c6"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T02:04:05.117511Z","signature_b64":"Qg4uA8MDxmCgaggD1+hug2XmjiLo+sIcH2t8x7b9Es+QR1fUvQ0o+5g3BVMz0TZlQ1qjGSFgr1/1bWZnMyTMBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c7caee8df9dc5e889f35b090710c5bfbdf3a1bb35f8d3726305dda66fd07290a","last_reissued_at":"2026-05-26T02:04:05.116639Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T02:04:05.116639Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Muon in Associative Memory Learning: Training Dynamics and Scaling Laws","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC","stat.ML"],"primary_cat":"cs.LG","authors_text":"Binghui Li, Han Zhong, Kaifei Wang, Liwei Wang, Pinyan Lu","submitted_at":"2026-02-05T14:49:40Z","abstract_excerpt":"Muon updates matrix parameters via the matrix sign of the gradient and has shown strong empirical gains, yet its dynamics and scaling behavior remain unclear in theory. We study Muon in a linear associative memory model with softmax retrieval and a hierarchical frequency spectrum over query-answer pairs, with and without label noise. In this setting, we show that Gradient Descent (GD) learns frequency components at highly imbalanced rates, leading to slow convergence bottlenecked by low-frequency components. In contrast, the Muon optimizer mitigates this imbalance, leading to faster and more u"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2602.05725","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2602.05725/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2602.05725","created_at":"2026-05-26T02:04:05.116744+00:00"},{"alias_kind":"arxiv_version","alias_value":"2602.05725v2","created_at":"2026-05-26T02:04:05.116744+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2602.05725","created_at":"2026-05-26T02:04:05.116744+00:00"},{"alias_kind":"pith_short_12","alias_value":"Y7FO5DPZ3RPI","created_at":"2026-05-26T02:04:05.116744+00:00"},{"alias_kind":"pith_short_16","alias_value":"Y7FO5DPZ3RPIRHZV","created_at":"2026-05-26T02:04:05.116744+00:00"},{"alias_kind":"pith_short_8","alias_value":"Y7FO5DPZ","created_at":"2026-05-26T02:04:05.116744+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":3,"internal_anchor_count":3,"sample":[{"citing_arxiv_id":"2603.26554","citing_title":"Sharp Capacity Scaling of Spectral Optimizers in Learning Associative Memory","ref_index":31,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09552","citing_title":"Phases of Muon: When Muon Eclipses SignSGD","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09331","citing_title":"Dimension-Free Saddle-Point Escape in Muon","ref_index":20,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P","json":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P.json","graph_json":"https://pith.science/api/pith-number/Y7FO5DPZ3RPIRHZVWCIHCDC37P/graph.json","events_json":"https://pith.science/api/pith-number/Y7FO5DPZ3RPIRHZVWCIHCDC37P/events.json","paper":"https://pith.science/paper/Y7FO5DPZ"},"agent_actions":{"view_html":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P","download_json":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P.json","view_paper":"https://pith.science/paper/Y7FO5DPZ","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2602.05725&json=true","fetch_graph":"https://pith.science/api/pith-number/Y7FO5DPZ3RPIRHZVWCIHCDC37P/graph.json","fetch_events":"https://pith.science/api/pith-number/Y7FO5DPZ3RPIRHZVWCIHCDC37P/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P/action/timestamp_anchor","attest_storage":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P/action/storage_attestation","attest_author":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P/action/author_attestation","sign_citation":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P/action/citation_signature","submit_replication":"https://pith.science/pith/Y7FO5DPZ3RPIRHZVWCIHCDC37P/action/replication_record"}},"created_at":"2026-05-26T02:04:05.116744+00:00","updated_at":"2026-05-26T02:04:05.116744+00:00"}