{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:WMJWGC5EG4WOZOKDBKSPTP2CSM","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"4c06a8085fff6c80d14c8d29003b3d7560e6e52b0b270b4fda82145441072e5d","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T05:29:38Z","title_canon_sha256":"3d0550aa8e6ce25dee32e15e7b0b08246d96347d234a7cb6a8f121b878032dd5"},"schema_version":"1.0","source":{"id":"2605.13026","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13026","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13026v1","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13026","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"pith_short_12","alias_value":"WMJWGC5EG4WO","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"WMJWGC5EG4WOZOKD","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"WMJWGC5E","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:9747b9c354cbdc07537e1777c16963c75d807fc742da5f9cd412bf6aee9a94c6","target":"graph","created_at":"2026-05-18T03:08:59Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"MDMs trained with our training recipe reach the same validation negative log-likelihood (NLL) up to ∼4× faster than standard training on One Billion Word Benchmark (LM1B). We also show faster improvements in generative perplexity, zero-shot perplexity, and downstream task performance on various benchmarks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The locality bias of language is the dominant cause of slow MDM training, and bell-shaped time sampling directly mitigates it without introducing new optimization pathologies or degrading final performance."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Bell-shaped time sampling accelerates masked diffusion language model training by roughly 4x on LM1B by countering locality bias in language data."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Bell-shaped time sampling accelerates masked diffusion language models to target performance up to four times faster."}],"snapshot_sha256":"2331df7118bf0dcf7e38d808aebf7f8324c2b82d421308ffe13c1dcd46f09455"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Masked diffusion models (MDMs) have emerged as a promising alternative to autoregressive models (ARMs) for language modeling. However, MDMs are known to learn substantially more slowly than ARMs, which may become problematic when scaling MDMs to larger models. Therefore, we ask the following question: how can we accelerate standard MDM training while maintaining its final performance? To this end, we first provide a detailed analysis of why MDM training is slow. We find that the main factor is the locality bias of language: the predictive information for a token is concentrated in nearby posit","authors_text":"Chieh-Hsin Lai, Chunsan Hong, Jong Chul Ye, Sanghyun Lee, Satoshi Hayakawa, Seungryong Kim, Yuhta Takida, Yuki Mitsufuji","cross_cats":["cs.AI","cs.CL"],"headline":"Bell-shaped time sampling accelerates masked diffusion language models to target performance up to four times faster.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T05:29:38Z","title":"Understanding and Accelerating the Training of Masked Diffusion Language Models"},"references":{"count":83,"internal_anchors":9,"resolved_work":83,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Chiu, Zhihan Yang, Zhixuan Qi, Jiaqi Han, Subham Sekhar Sahoo, and V olodymyr Kuleshov","work_id":"348f3134-d8cd-4cc2-b5ae-b2d37f39e976","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Struc- tured denoising diffusion models in discrete state-spaces","work_id":"1efe8ef1-51b4-4c60-afc1-6316316a4e5a","year":2021},{"cited_arxiv_id":"2512.15745","doi":"","is_internal_anchor":true,"ref_index":3,"title":"LLaDA2.0: Scaling Up Diffusion Language Models to 100B","work_id":"a1b1080d-0a91-44a4-8f70-2bf3e7a27e0b","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Piqa: Reasoning about phys- ical commonsense in natural language","work_id":"2d3eecc9-292b-40b9-a29b-5a41245865a9","year":2020},{"cited_arxiv_id":"","doi":"10.21437/interspeech.2014-564","is_internal_anchor":false,"ref_index":5,"title":"One billion word benchmark for measuring progress in statistical language modeling","work_id":"860df94a-d88e-401f-bec2-35bff9859f0b","year":2014}],"snapshot_sha256":"3382bf43c40e3f7ba2e687b224c0aaabe7966d8ba86cd64847107ce1f12247f9"},"source":{"id":"2605.13026","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T20:27:51.874375Z","id":"97bac824-ed10-48ce-9e2c-405474fb667e","model_set":{"reader":"grok-4.3"},"one_line_summary":"Bell-shaped time sampling accelerates masked diffusion language model training by roughly 4x on LM1B by countering locality bias in language data.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Bell-shaped time sampling accelerates masked diffusion language models to target performance up to four times faster.","strongest_claim":"MDMs trained with our training recipe reach the same validation negative log-likelihood (NLL) up to ∼4× faster than standard training on One Billion Word Benchmark (LM1B). We also show faster improvements in generative perplexity, zero-shot perplexity, and downstream task performance on various benchmarks.","weakest_assumption":"The locality bias of language is the dominant cause of slow MDM training, and bell-shaped time sampling directly mitigates it without introducing new optimization pathologies or degrading final performance."}},"verdict_id":"97bac824-ed10-48ce-9e2c-405474fb667e"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:333f6d8ecd386c4c124baa3fc97b2f6b7d20a37f7cbe7f42285c40904b8c9b9d","target":"record","created_at":"2026-05-18T03:08:59Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"4c06a8085fff6c80d14c8d29003b3d7560e6e52b0b270b4fda82145441072e5d","cross_cats_sorted":["cs.AI","cs.CL"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-13T05:29:38Z","title_canon_sha256":"3d0550aa8e6ce25dee32e15e7b0b08246d96347d234a7cb6a8f121b878032dd5"},"schema_version":"1.0","source":{"id":"2605.13026","kind":"arxiv","version":1}},"canonical_sha256":"b313630ba4372cecb9430aa4f9bf4293345e7f91f5dd1af0efafc01bcda15d7a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"b313630ba4372cecb9430aa4f9bf4293345e7f91f5dd1af0efafc01bcda15d7a","first_computed_at":"2026-05-18T03:08:59.893399Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:08:59.893399Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"l1x4kNcMNUxBhigRGmEjYBbE6h+zG0KV/DRUP9BXMJrHedeXL3EyeuVP6ExFUVFvkfW502cyP+UwqVlr2umGDA==","signature_status":"signed_v1","signed_at":"2026-05-18T03:08:59.894228Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13026","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:333f6d8ecd386c4c124baa3fc97b2f6b7d20a37f7cbe7f42285c40904b8c9b9d","sha256:9747b9c354cbdc07537e1777c16963c75d807fc742da5f9cd412bf6aee9a94c6"],"state_sha256":"6a308172f62eb8f8633932f4b20712861e3661a154329d545af5ad8ea3108fc1"}