{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2025:NZ3U7IAKB3ARCOA6KVNUVOERKY","short_pith_number":"pith:NZ3U7IAK","schema_version":"1.0","canonical_sha256":"6e774fa00a0ec111381e555b4ab891561a10b628ccba76f1bdb1bc6afdfb2895","source":{"kind":"arxiv","id":"2511.04981","version":2},"attestation_state":"computed","paper":{"title":"Scaling depth capacity via zero/one-layer model expansion","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Zhiqi Bu","submitted_at":"2025-11-07T04:56:45Z","abstract_excerpt":"Model depth is a double-edged sword in deep learning: deeper models achieve higher accuracy but require higher computational cost. To efficiently train models at scale, progressive training (also known as model expansion) scales up model capacity during training and significantly reduces computation with little performance degradation. In this work, we study the depth expansion of large-scale models through the lens of optimization theory and feature learning, offering insights on the initialization of new layers, hyperparameter transfer, learning rate schedule, and timing of model expansion. "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2511.04981","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2025-11-07T04:56:45Z","cross_cats_sorted":[],"title_canon_sha256":"2e61ede94508fea2be88f20cc53b824cfc70f98d717645d9a19951e5d20459a0","abstract_canon_sha256":"1edac4ee97eec4f628d123f1f141174006979975c6fccd4d0d4000bd52460542"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T02:04:10.532676Z","signature_b64":"lkTusNjUrhYpK9NiPpp1zSvWckbesQwxpqwnsrs1cxoobYbi20OWzplhKRpYZVzqp7gpfuN62oSeOQbzQ8bPAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"6e774fa00a0ec111381e555b4ab891561a10b628ccba76f1bdb1bc6afdfb2895","last_reissued_at":"2026-06-02T02:04:10.532202Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T02:04:10.532202Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Scaling depth capacity via zero/one-layer model expansion","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Zhiqi Bu","submitted_at":"2025-11-07T04:56:45Z","abstract_excerpt":"Model depth is a double-edged sword in deep learning: deeper models achieve higher accuracy but require higher computational cost. To efficiently train models at scale, progressive training (also known as model expansion) scales up model capacity during training and significantly reduces computation with little performance degradation. In this work, we study the depth expansion of large-scale models through the lens of optimization theory and feature learning, offering insights on the initialization of new layers, hyperparameter transfer, learning rate schedule, and timing of model expansion. "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2511.04981","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2511.04981/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2511.04981","created_at":"2026-06-02T02:04:10.532257+00:00"},{"alias_kind":"arxiv_version","alias_value":"2511.04981v2","created_at":"2026-06-02T02:04:10.532257+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2511.04981","created_at":"2026-06-02T02:04:10.532257+00:00"},{"alias_kind":"pith_short_12","alias_value":"NZ3U7IAKB3AR","created_at":"2026-06-02T02:04:10.532257+00:00"},{"alias_kind":"pith_short_16","alias_value":"NZ3U7IAKB3ARCOA6","created_at":"2026-06-02T02:04:10.532257+00:00"},{"alias_kind":"pith_short_8","alias_value":"NZ3U7IAK","created_at":"2026-06-02T02:04:10.532257+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":2,"internal_anchor_count":2,"sample":[{"citing_arxiv_id":"2604.19835","citing_title":"Expert Upcycling: Shifting the Compute-Efficient Frontier of Mixture-of-Experts","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19835","citing_title":"Expert Upcycling: Shifting the Compute-Efficient Frontier of Mixture-of-Experts","ref_index":3,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY","json":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY.json","graph_json":"https://pith.science/api/pith-number/NZ3U7IAKB3ARCOA6KVNUVOERKY/graph.json","events_json":"https://pith.science/api/pith-number/NZ3U7IAKB3ARCOA6KVNUVOERKY/events.json","paper":"https://pith.science/paper/NZ3U7IAK"},"agent_actions":{"view_html":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY","download_json":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY.json","view_paper":"https://pith.science/paper/NZ3U7IAK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2511.04981&json=true","fetch_graph":"https://pith.science/api/pith-number/NZ3U7IAKB3ARCOA6KVNUVOERKY/graph.json","fetch_events":"https://pith.science/api/pith-number/NZ3U7IAKB3ARCOA6KVNUVOERKY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY/action/storage_attestation","attest_author":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY/action/author_attestation","sign_citation":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY/action/citation_signature","submit_replication":"https://pith.science/pith/NZ3U7IAKB3ARCOA6KVNUVOERKY/action/replication_record"}},"created_at":"2026-06-02T02:04:10.532257+00:00","updated_at":"2026-06-02T02:04:10.532257+00:00"}