{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2018:4JIX4JC5OC6QMTEXYQZIW7GQRS","short_pith_number":"pith:4JIX4JC5","schema_version":"1.0","canonical_sha256":"e2517e245d70bd064c97c4328b7cd08cb42f894eff9e64307ef6ae852101b5b3","source":{"kind":"arxiv","id":"1811.00659","version":1},"attestation_state":"computed","paper":{"title":"Implicit Regularization of Stochastic Gradient Descent in Natural Language Processing: Observations and Implications","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","cs.NE"],"primary_cat":"cs.CL","authors_text":"Deren Lei, William Yang Wang, Yijun Xiao, Zichen Sun","submitted_at":"2018-11-01T22:24:25Z","abstract_excerpt":"Deep neural networks with remarkably strong generalization performances are usually over-parameterized. Despite explicit regularization strategies are used for practitioners to avoid over-fitting, the impacts are often small. Some theoretical studies have analyzed the implicit regularization effect of stochastic gradient descent (SGD) on simple machine learning models with certain assumptions. However, how it behaves practically in state-of-the-art models and real-world datasets is still unknown. To bridge this gap, we study the role of SGD implicit regularization in deep learning systems. We "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1811.00659","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2018-11-01T22:24:25Z","cross_cats_sorted":["cs.LG","cs.NE"],"title_canon_sha256":"362c66e256c8e9498c7d996bca1f115c6879b368ca9e0c4ea5a4bc8462c588a0","abstract_canon_sha256":"33d9547b2d40d4cd68e13a59acc6a6c7dfddfd79bc898bf4629c156881bb853f"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T00:01:42.553770Z","signature_b64":"bNzUfaPjbLcPvEvWR7LES163+tId+B3ZlbLFfhHHmyDbQxlwJpIeRy7qjSnPwCgavwl+3eXwUNUA2qJsHmA1CQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"e2517e245d70bd064c97c4328b7cd08cb42f894eff9e64307ef6ae852101b5b3","last_reissued_at":"2026-05-18T00:01:42.553311Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T00:01:42.553311Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Implicit Regularization of Stochastic Gradient Descent in Natural Language Processing: Observations and Implications","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","cs.NE"],"primary_cat":"cs.CL","authors_text":"Deren Lei, William Yang Wang, Yijun Xiao, Zichen Sun","submitted_at":"2018-11-01T22:24:25Z","abstract_excerpt":"Deep neural networks with remarkably strong generalization performances are usually over-parameterized. Despite explicit regularization strategies are used for practitioners to avoid over-fitting, the impacts are often small. Some theoretical studies have analyzed the implicit regularization effect of stochastic gradient descent (SGD) on simple machine learning models with certain assumptions. However, how it behaves practically in state-of-the-art models and real-world datasets is still unknown. To bridge this gap, we study the role of SGD implicit regularization in deep learning systems. We "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1811.00659","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1811.00659","created_at":"2026-05-18T00:01:42.553396+00:00"},{"alias_kind":"arxiv_version","alias_value":"1811.00659v1","created_at":"2026-05-18T00:01:42.553396+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1811.00659","created_at":"2026-05-18T00:01:42.553396+00:00"},{"alias_kind":"pith_short_12","alias_value":"4JIX4JC5OC6Q","created_at":"2026-05-18T12:32:05.422762+00:00"},{"alias_kind":"pith_short_16","alias_value":"4JIX4JC5OC6QMTEX","created_at":"2026-05-18T12:32:05.422762+00:00"},{"alias_kind":"pith_short_8","alias_value":"4JIX4JC5","created_at":"2026-05-18T12:32:05.422762+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":1,"internal_anchor_count":1,"sample":[{"citing_arxiv_id":"2602.07131","citing_title":"Behavior Score Prediction in Resting-State Functional MRI by Deep State Space Modeling","ref_index":55,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS","json":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS.json","graph_json":"https://pith.science/api/pith-number/4JIX4JC5OC6QMTEXYQZIW7GQRS/graph.json","events_json":"https://pith.science/api/pith-number/4JIX4JC5OC6QMTEXYQZIW7GQRS/events.json","paper":"https://pith.science/paper/4JIX4JC5"},"agent_actions":{"view_html":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS","download_json":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS.json","view_paper":"https://pith.science/paper/4JIX4JC5","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1811.00659&json=true","fetch_graph":"https://pith.science/api/pith-number/4JIX4JC5OC6QMTEXYQZIW7GQRS/graph.json","fetch_events":"https://pith.science/api/pith-number/4JIX4JC5OC6QMTEXYQZIW7GQRS/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS/action/timestamp_anchor","attest_storage":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS/action/storage_attestation","attest_author":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS/action/author_attestation","sign_citation":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS/action/citation_signature","submit_replication":"https://pith.science/pith/4JIX4JC5OC6QMTEXYQZIW7GQRS/action/replication_record"}},"created_at":"2026-05-18T00:01:42.553396+00:00","updated_at":"2026-05-18T00:01:42.553396+00:00"}