{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:EHWICTYBINN65FLIKWH3MJABOK","short_pith_number":"pith:EHWICTYB","schema_version":"1.0","canonical_sha256":"21ec814f01435bee9568558fb6240172907bcb4fb8ce40eeac86829f4eb06356","source":{"kind":"arxiv","id":"2606.06712","version":1},"attestation_state":"computed","paper":{"title":"Data-Efficient Autoregressive-to-Diffusion Language Models via On-Policy Distillation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Atharv Chagi, Degui Zhi, Dileep Kalathil, Jacob Helwig, James Caverlee, Lakshmi Jotsna, Shubham Parashar, Shuiwang Ji, Xingyu Su","submitted_at":"2026-06-04T20:58:08Z","abstract_excerpt":"We study the transformation of autoregressive models (ARLMs) into diffusion language models (DLMs). Rather than pretraining from scratch, prior work replaces the causal attention in ARLMs with bidirectional attention and then trains the resulting model using a DLM objective. However, these approaches incur two distribution shifts. First, transitioning from a next-token prediction objective to a DLM objective can discard knowledge acquired by the ARLM during training. Second, standard DLMs suffer from a train-inference mismatch, as the training loss is defined on randomly masked sequences rathe"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.06712","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-04T20:58:08Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"6eaabf400d1dee3876f36edbe1326e56a31fa4f6b1726c49ff0075448afadccb","abstract_canon_sha256":"417baf873ee4325cabe9230bf8fe4f5938b5c83e77f70ebe7f1596b0bb9f9f2a"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-08T01:04:24.126932Z","signature_b64":"KRMUV5JCYc8mPRUrdeBIyIi/Z4b8MxCV2diaZOsw3XJ7n4hWy8TBR2+HaYmF7WKq/0NFqrULY8CsDK7HO6M4AA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"21ec814f01435bee9568558fb6240172907bcb4fb8ce40eeac86829f4eb06356","last_reissued_at":"2026-06-08T01:04:24.125950Z","signature_status":"signed_v1","first_computed_at":"2026-06-08T01:04:24.125950Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Data-Efficient Autoregressive-to-Diffusion Language Models via On-Policy Distillation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Atharv Chagi, Degui Zhi, Dileep Kalathil, Jacob Helwig, James Caverlee, Lakshmi Jotsna, Shubham Parashar, Shuiwang Ji, Xingyu Su","submitted_at":"2026-06-04T20:58:08Z","abstract_excerpt":"We study the transformation of autoregressive models (ARLMs) into diffusion language models (DLMs). Rather than pretraining from scratch, prior work replaces the causal attention in ARLMs with bidirectional attention and then trains the resulting model using a DLM objective. However, these approaches incur two distribution shifts. First, transitioning from a next-token prediction objective to a DLM objective can discard knowledge acquired by the ARLM during training. Second, standard DLMs suffer from a train-inference mismatch, as the training loss is defined on randomly masked sequences rathe"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.06712","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.06712/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.06712","created_at":"2026-06-08T01:04:24.126130+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.06712v1","created_at":"2026-06-08T01:04:24.126130+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.06712","created_at":"2026-06-08T01:04:24.126130+00:00"},{"alias_kind":"pith_short_12","alias_value":"EHWICTYBINN6","created_at":"2026-06-08T01:04:24.126130+00:00"},{"alias_kind":"pith_short_16","alias_value":"EHWICTYBINN65FLI","created_at":"2026-06-08T01:04:24.126130+00:00"},{"alias_kind":"pith_short_8","alias_value":"EHWICTYB","created_at":"2026-06-08T01:04:24.126130+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK","json":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK.json","graph_json":"https://pith.science/api/pith-number/EHWICTYBINN65FLIKWH3MJABOK/graph.json","events_json":"https://pith.science/api/pith-number/EHWICTYBINN65FLIKWH3MJABOK/events.json","paper":"https://pith.science/paper/EHWICTYB"},"agent_actions":{"view_html":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK","download_json":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK.json","view_paper":"https://pith.science/paper/EHWICTYB","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.06712&json=true","fetch_graph":"https://pith.science/api/pith-number/EHWICTYBINN65FLIKWH3MJABOK/graph.json","fetch_events":"https://pith.science/api/pith-number/EHWICTYBINN65FLIKWH3MJABOK/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK/action/timestamp_anchor","attest_storage":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK/action/storage_attestation","attest_author":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK/action/author_attestation","sign_citation":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK/action/citation_signature","submit_replication":"https://pith.science/pith/EHWICTYBINN65FLIKWH3MJABOK/action/replication_record"}},"created_at":"2026-06-08T01:04:24.126130+00:00","updated_at":"2026-06-08T01:04:24.126130+00:00"}