{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:6P6EOFSCFGMMUCOH4ZZRE36AXR","short_pith_number":"pith:6P6EOFSC","schema_version":"1.0","canonical_sha256":"f3fc4716422998ca09c7e673126fc0bc77393cd5c61a2bf4f12a5f9325c2c742","source":{"kind":"arxiv","id":"2606.29158","version":1},"attestation_state":"computed","paper":{"title":"On the Nonlinearity of Learning Rate Scaling for LLM Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Huaqing Zhang, Jing Xu, Jingzhao Zhang, Zaiwen Yang","submitted_at":"2026-06-28T02:42:47Z","abstract_excerpt":"Learning-rate transfer can reduce the cost of training large language models: instead of sweeping learning rates at target scale, practitioners extrapolate from smaller runs. Existing approaches often assume that the optimal learning rate follows a log-linear scaling law in data scale and model size. We carefully examine and evaluate this scaling law. In our empirical study of GPT-2--style models from 22M to 707M parameters trained on 5B to 100B tokens, the optimal learning rate develops upward curvature at larger scales, leading to inaccurate extrapolation. We find that this curvature largely"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.29158","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-28T02:42:47Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"17d06fd65211a18b916634cec7fbad2e689721de79f021abf913d790f3432a24","abstract_canon_sha256":"caf5201387685f564e7e353296945cef39146a538bb207b4512e0ba68bbac320"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-30T01:17:55.003245Z","signature_b64":"WgLnzMng885JAjuKCBv3f09IJxAq3uR+S819CyIzkjCUKXhG3Fz7se+O51JDM77vRR4UxFzx4skdWTKyuGy8Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f3fc4716422998ca09c7e673126fc0bc77393cd5c61a2bf4f12a5f9325c2c742","last_reissued_at":"2026-06-30T01:17:55.002648Z","signature_status":"signed_v1","first_computed_at":"2026-06-30T01:17:55.002648Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"On the Nonlinearity of Learning Rate Scaling for LLM Training","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Huaqing Zhang, Jing Xu, Jingzhao Zhang, Zaiwen Yang","submitted_at":"2026-06-28T02:42:47Z","abstract_excerpt":"Learning-rate transfer can reduce the cost of training large language models: instead of sweeping learning rates at target scale, practitioners extrapolate from smaller runs. Existing approaches often assume that the optimal learning rate follows a log-linear scaling law in data scale and model size. We carefully examine and evaluate this scaling law. In our empirical study of GPT-2--style models from 22M to 707M parameters trained on 5B to 100B tokens, the optimal learning rate develops upward curvature at larger scales, leading to inaccurate extrapolation. We find that this curvature largely"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.29158","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.29158/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.29158","created_at":"2026-06-30T01:17:55.002716+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.29158v1","created_at":"2026-06-30T01:17:55.002716+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.29158","created_at":"2026-06-30T01:17:55.002716+00:00"},{"alias_kind":"pith_short_12","alias_value":"6P6EOFSCFGMM","created_at":"2026-06-30T01:17:55.002716+00:00"},{"alias_kind":"pith_short_16","alias_value":"6P6EOFSCFGMMUCOH","created_at":"2026-06-30T01:17:55.002716+00:00"},{"alias_kind":"pith_short_8","alias_value":"6P6EOFSC","created_at":"2026-06-30T01:17:55.002716+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR","json":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR.json","graph_json":"https://pith.science/api/pith-number/6P6EOFSCFGMMUCOH4ZZRE36AXR/graph.json","events_json":"https://pith.science/api/pith-number/6P6EOFSCFGMMUCOH4ZZRE36AXR/events.json","paper":"https://pith.science/paper/6P6EOFSC"},"agent_actions":{"view_html":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR","download_json":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR.json","view_paper":"https://pith.science/paper/6P6EOFSC","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.29158&json=true","fetch_graph":"https://pith.science/api/pith-number/6P6EOFSCFGMMUCOH4ZZRE36AXR/graph.json","fetch_events":"https://pith.science/api/pith-number/6P6EOFSCFGMMUCOH4ZZRE36AXR/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR/action/storage_attestation","attest_author":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR/action/author_attestation","sign_citation":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR/action/citation_signature","submit_replication":"https://pith.science/pith/6P6EOFSCFGMMUCOH4ZZRE36AXR/action/replication_record"}},"created_at":"2026-06-30T01:17:55.002716+00:00","updated_at":"2026-06-30T01:17:55.002716+00:00"}