{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:2ISEK7MSL5YKEBZUPXSAH7ZZNY","short_pith_number":"pith:2ISEK7MS","schema_version":"1.0","canonical_sha256":"d224457d925f70a207347de403ff396e36848c9029324d97e664496d31b4d3bd","source":{"kind":"arxiv","id":"2411.07724","version":2},"attestation_state":"computed","paper":{"title":"Convergence Rate Analysis of LION","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Huan Li, Yiming Dong, Zhouchen Lin","submitted_at":"2024-11-12T11:30:53Z","abstract_excerpt":"The LION (evoLved sIgn mOmeNtum) optimizer for deep neural network training was found by Google via program search, with the simple sign update yet showing impressive performance in training large scale networks. Although previous studies have investigated its convergence properties, a comprehensive analysis, especially the convergence rate, is still desirable. Recognizing that LION can be regarded as solving a specific constrained problem, this paper focuses on demonstrating its convergence to the Karush-Kuhn-Tucker (KKT) point at the rate of $\\cal O(\\sqrt{d}K^{-1/4})$ measured by gradient $\\"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2411.07724","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2024-11-12T11:30:53Z","cross_cats_sorted":["math.OC"],"title_canon_sha256":"151a45ec617174e44fe95d85d46d121032ed19ba348cb9359c2573547af00865","abstract_canon_sha256":"a80864a6353bd7bd2f5a66822d34e4afb512605f35628926780e5e1ff5012e3e"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-23T03:13:43.921948Z","signature_b64":"96S/bYP6OK5EAlSRBZTtQ76X6QR9ky6TS9mjpt875o09FaMLbZT78V4Uc2G9To/dVkBgdEcXFa7A78QEGPY4DQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"d224457d925f70a207347de403ff396e36848c9029324d97e664496d31b4d3bd","last_reissued_at":"2026-06-23T03:13:43.921464Z","signature_status":"signed_v1","first_computed_at":"2026-06-23T03:13:43.921464Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Convergence Rate Analysis of LION","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC"],"primary_cat":"cs.LG","authors_text":"Huan Li, Yiming Dong, Zhouchen Lin","submitted_at":"2024-11-12T11:30:53Z","abstract_excerpt":"The LION (evoLved sIgn mOmeNtum) optimizer for deep neural network training was found by Google via program search, with the simple sign update yet showing impressive performance in training large scale networks. Although previous studies have investigated its convergence properties, a comprehensive analysis, especially the convergence rate, is still desirable. Recognizing that LION can be regarded as solving a specific constrained problem, this paper focuses on demonstrating its convergence to the Karush-Kuhn-Tucker (KKT) point at the rate of $\\cal O(\\sqrt{d}K^{-1/4})$ measured by gradient $\\"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2411.07724","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2411.07724/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2411.07724","created_at":"2026-06-23T03:13:43.921521+00:00"},{"alias_kind":"arxiv_version","alias_value":"2411.07724v2","created_at":"2026-06-23T03:13:43.921521+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2411.07724","created_at":"2026-06-23T03:13:43.921521+00:00"},{"alias_kind":"pith_short_12","alias_value":"2ISEK7MSL5YK","created_at":"2026-06-23T03:13:43.921521+00:00"},{"alias_kind":"pith_short_16","alias_value":"2ISEK7MSL5YKEBZU","created_at":"2026-06-23T03:13:43.921521+00:00"},{"alias_kind":"pith_short_8","alias_value":"2ISEK7MS","created_at":"2026-06-23T03:13:43.921521+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":6,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2506.01897","citing_title":"MLorc: Momentum Low-rank Compression for Memory Efficient Large Language Model Adaptation","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11170","citing_title":"Unlearning with Asymmetric Sources: Improved Unlearning-Utility Trade-off with Public Data","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06615","citing_title":"When and Why SignSGD Outperforms SGD: A Theoretical Study Based on $\\ell_1$-norm Lower Bounds","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05577","citing_title":"Accelerating LMO-Based Optimization via Implicit Gradient Transport","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.15416","citing_title":"StoSignSGD: Unbiased Structural Stochasticity Fixes SignSGD for Training Large Language Models","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14587","citing_title":"CLion: Efficient Cautious Lion Optimizer with Enhanced Generalization","ref_index":7,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY","json":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY.json","graph_json":"https://pith.science/api/pith-number/2ISEK7MSL5YKEBZUPXSAH7ZZNY/graph.json","events_json":"https://pith.science/api/pith-number/2ISEK7MSL5YKEBZUPXSAH7ZZNY/events.json","paper":"https://pith.science/paper/2ISEK7MS"},"agent_actions":{"view_html":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY","download_json":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY.json","view_paper":"https://pith.science/paper/2ISEK7MS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2411.07724&json=true","fetch_graph":"https://pith.science/api/pith-number/2ISEK7MSL5YKEBZUPXSAH7ZZNY/graph.json","fetch_events":"https://pith.science/api/pith-number/2ISEK7MSL5YKEBZUPXSAH7ZZNY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY/action/storage_attestation","attest_author":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY/action/author_attestation","sign_citation":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY/action/citation_signature","submit_replication":"https://pith.science/pith/2ISEK7MSL5YKEBZUPXSAH7ZZNY/action/replication_record"}},"created_at":"2026-06-23T03:13:43.921521+00:00","updated_at":"2026-06-23T03:13:43.921521+00:00"}