{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:4DNVTFNK5NUD56MVMPRAMADY6E","short_pith_number":"pith:4DNVTFNK","schema_version":"1.0","canonical_sha256":"e0db5995aaeb683ef99563e2060078f12dc9f48d098aabda297cb11443dc86bc","source":{"kind":"arxiv","id":"1905.09899","version":1},"attestation_state":"computed","paper":{"title":"Blockwise Adaptivity: Faster Training and Better Generalization in Deep Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC","stat.ML"],"primary_cat":"cs.LG","authors_text":"James T. Kwok, Shuai Zheng","submitted_at":"2019-05-23T20:06:10Z","abstract_excerpt":"Stochastic methods with coordinate-wise adaptive stepsize (such as RMSprop and Adam) have been widely used in training deep neural networks. Despite their fast convergence, they can generalize worse than stochastic gradient descent. In this paper, by revisiting the design of Adagrad, we propose to split the network parameters into blocks, and use a blockwise adaptive stepsize. Intuitively, blockwise adaptivity is less aggressive than adaptivity to individual coordinates, and can have a better balance between adaptivity and generalization. We show theoretically that the proposed blockwise adapt"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1905.09899","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2019-05-23T20:06:10Z","cross_cats_sorted":["math.OC","stat.ML"],"title_canon_sha256":"4a09c5f5f93ff7380d03c173e8f8e197e998d6bd8249e6d31fe9a4560fa505bf","abstract_canon_sha256":"a7bd76d796d98ccd323e21ba1e8381ee4c72a244bb16ac1cbc0a1d16c4a26e65"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:45:14.106834Z","signature_b64":"RbhGwI2Bzru7Ufet92fhI1F54d3n8J3AKpVBLlbnv0bBWpsKlyiLN8jzabHjiQF2DJellF54/vMADlDG9U5IDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e0db5995aaeb683ef99563e2060078f12dc9f48d098aabda297cb11443dc86bc","last_reissued_at":"2026-05-17T23:45:14.106301Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:45:14.106301Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Blockwise Adaptivity: Faster Training and Better Generalization in Deep Learning","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["math.OC","stat.ML"],"primary_cat":"cs.LG","authors_text":"James T. Kwok, Shuai Zheng","submitted_at":"2019-05-23T20:06:10Z","abstract_excerpt":"Stochastic methods with coordinate-wise adaptive stepsize (such as RMSprop and Adam) have been widely used in training deep neural networks. Despite their fast convergence, they can generalize worse than stochastic gradient descent. In this paper, by revisiting the design of Adagrad, we propose to split the network parameters into blocks, and use a blockwise adaptive stepsize. Intuitively, blockwise adaptivity is less aggressive than adaptivity to individual coordinates, and can have a better balance between adaptivity and generalization. We show theoretically that the proposed blockwise adapt"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1905.09899","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1905.09899","created_at":"2026-05-17T23:45:14.106391+00:00"},{"alias_kind":"arxiv_version","alias_value":"1905.09899v1","created_at":"2026-05-17T23:45:14.106391+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1905.09899","created_at":"2026-05-17T23:45:14.106391+00:00"},{"alias_kind":"pith_short_12","alias_value":"4DNVTFNK5NUD","created_at":"2026-05-18T12:33:10.108867+00:00"},{"alias_kind":"pith_short_16","alias_value":"4DNVTFNK5NUD56MV","created_at":"2026-05-18T12:33:10.108867+00:00"},{"alias_kind":"pith_short_8","alias_value":"4DNVTFNK","created_at":"2026-05-18T12:33:10.108867+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":0,"sample":[{"citing_arxiv_id":"2605.10335","citing_title":"PowerStep: Memory-Efficient Adaptive Optimization via $\\ell_p$-Norm Steepest Descent","ref_index":3,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E","json":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E.json","graph_json":"https://pith.science/api/pith-number/4DNVTFNK5NUD56MVMPRAMADY6E/graph.json","events_json":"https://pith.science/api/pith-number/4DNVTFNK5NUD56MVMPRAMADY6E/events.json","paper":"https://pith.science/paper/4DNVTFNK"},"agent_actions":{"view_html":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E","download_json":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E.json","view_paper":"https://pith.science/paper/4DNVTFNK","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1905.09899&json=true","fetch_graph":"https://pith.science/api/pith-number/4DNVTFNK5NUD56MVMPRAMADY6E/graph.json","fetch_events":"https://pith.science/api/pith-number/4DNVTFNK5NUD56MVMPRAMADY6E/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E/action/storage_attestation","attest_author":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E/action/author_attestation","sign_citation":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E/action/citation_signature","submit_replication":"https://pith.science/pith/4DNVTFNK5NUD56MVMPRAMADY6E/action/replication_record"}},"created_at":"2026-05-17T23:45:14.106391+00:00","updated_at":"2026-05-17T23:45:14.106391+00:00"}