{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:GPNWVPAN6IYETZMQ3XYMZ26D3P","short_pith_number":"pith:GPNWVPAN","canonical_record":{"source":{"id":"2605.13043","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T05:56:45Z","cross_cats_sorted":[],"title_canon_sha256":"fdf5a15e1a31d371803789b22e66682c506d27e01edada719e4e3f73e048f153","abstract_canon_sha256":"219965594f065d426432292ade566afe9fc55dc57446b5ed35786b4ad3643bd8"},"schema_version":"1.0"},"canonical_sha256":"33db6abc0df23049e590ddf0ccebc3dbe7a4792be6b6d344749e7868162ffadb","source":{"kind":"arxiv","id":"2605.13043","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13043","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13043v1","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13043","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"pith_short_12","alias_value":"GPNWVPAN6IYE","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"GPNWVPAN6IYETZMQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"GPNWVPAN","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:GPNWVPAN6IYETZMQ3XYMZ26D3P","target":"record","payload":{"canonical_record":{"source":{"id":"2605.13043","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T05:56:45Z","cross_cats_sorted":[],"title_canon_sha256":"fdf5a15e1a31d371803789b22e66682c506d27e01edada719e4e3f73e048f153","abstract_canon_sha256":"219965594f065d426432292ade566afe9fc55dc57446b5ed35786b4ad3643bd8"},"schema_version":"1.0"},"canonical_sha256":"33db6abc0df23049e590ddf0ccebc3dbe7a4792be6b6d344749e7868162ffadb","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T03:08:59.495911Z","signature_b64":"ZlmzdcfNbH3uND4aRlS6eD7B2tV/ONOeld4BQM0bKSwx4bmLZWyEQAxED8QlqafPcAuMSN069swjM9mvvWzoBQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"33db6abc0df23049e590ddf0ccebc3dbe7a4792be6b6d344749e7868162ffadb","last_reissued_at":"2026-05-18T03:08:59.495271Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T03:08:59.495271Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.13043","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:08:59Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yvxlZv7rSlFDEgNARVWmc8Yze7PeKYtm5Mb1y1ZH+todLucGjNL5NxH8FjLThrMweByIauSJbhwXYxdf+QUoBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T12:27:45.022495Z"},"content_sha256":"e641e4cbb7520148d83f32890b6fea7043d78f7a8bfeec48f05ae8e44114a1b1","schema_version":"1.0","event_id":"sha256:e641e4cbb7520148d83f32890b6fea7043d78f7a8bfeec48f05ae8e44114a1b1"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:GPNWVPAN6IYETZMQ3XYMZ26D3P","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Adaptive Steering and Remasking for Safe Generation in Diffusion Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Step-wise remasking guided by a contrastive safety direction reduces jailbreak success in diffusion language models to 0.64 percent while keeping output quality nearly unchanged.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Yejin Lee, Yo-Sub Han","submitted_at":"2026-05-13T05:56:45Z","abstract_excerpt":"Diffusion Language Models (DLMs) provide a promising alternative to autoregressive language models by generating text through iterative denoising and bidirectional refinement. However, this iterative generation paradigm also introduces unique safety vulnerabilities when harmful tokens generated at intermediate denoising steps propagate through subsequent refinement processes and eventually induce unsafe outputs. While there are a few attempts to remedy this issue, they either fail to generate safe outputs or generate safe yet low-quality outputs. This motivates us to propose an inference-time "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"our approaches reduce jailbreak success rates to 0.64% while preserving generation quality close to the original model performance","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The contrastive safety direction reliably identifies harmful semantic alignment at intermediate denoising steps and that remasking those tokens followed by adaptive steering does not degrade final output quality or coherence.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Step-wise detection via a contrastive safety direction followed by remasking and adaptive steering reduces jailbreak success rates in diffusion language models to 0.64% while preserving output quality.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Step-wise remasking guided by a contrastive safety direction reduces jailbreak success in diffusion language models to 0.64 percent while keeping output quality nearly unchanged.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"2a4975b4e2492ace1b07790474a7b42f54941c967ef0541ade3dfd9f66d7bbf1"},"source":{"id":"2605.13043","kind":"arxiv","version":1},"verdict":{"id":"90a02904-92be-4c9f-8f95-c0b7e3d3d426","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T20:04:24.305736Z","strongest_claim":"our approaches reduce jailbreak success rates to 0.64% while preserving generation quality close to the original model performance","one_line_summary":"Step-wise detection via a contrastive safety direction followed by remasking and adaptive steering reduces jailbreak success rates in diffusion language models to 0.64% while preserving output quality.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The contrastive safety direction reliably identifies harmful semantic alignment at intermediate denoising steps and that remasking those tokens followed by adaptive steering does not degrade final output quality or coherence.","pith_extraction_headline":"Step-wise remasking guided by a contrastive safety direction reduces jailbreak success in diffusion language models to 0.64 percent while keeping output quality nearly unchanged."},"references":{"count":40,"sample":[{"doi":"","year":2024,"title":"Refusal in language models is mediated by a single direction","work_id":"265ab035-b05f-4b81-9c37-3de83a7e42f9","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2021,"title":"Johnson, Jonathan Ho, Daniel Tarlow, and Rianne van den Berg","work_id":"98eea868-0cda-444e-89b8-9a1ff3048fc3","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2025,"title":"Pappas, and Eric Wong","work_id":"2645cf19-51c8-4e3a-b2d9-c3e90a2d7d59","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Jailbreakbench: An open robustness benchmark for jailbreaking large language models","work_id":"ee72a9c8-6e48-4324-970a-b620e67b0914","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Richemond, Arnaud Doucet, Robin Strudel, Chris Dyer, Conor Durkan, Curtis Hawthorne, Rémi Leblond, Will Grathwohl, and Jonas Adler","work_id":"402c961d-04b8-4c79-8147-b481c32e9ce9","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":40,"snapshot_sha256":"47736e142ff5c28ee9af273a0d4d1e9ae9ec1bf59e595e12c91a3bb05067d72f","internal_anchors":3},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"90a02904-92be-4c9f-8f95-c0b7e3d3d426"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T03:08:59Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"MwxkIXhm4lhjSFTjP6S582A9xSA49SybQsgPlb7f+JQrUcjmZD3U7b7Zx8hhMofzWtRuEzH6ZJONwiXqFUzoBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T12:27:45.022994Z"},"content_sha256":"8e1062b2e7b9cef0c5379f231a7fc7a47aa0f3edcc2f5ee8e8b6bef368eb7573","schema_version":"1.0","event_id":"sha256:8e1062b2e7b9cef0c5379f231a7fc7a47aa0f3edcc2f5ee8e8b6bef368eb7573"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/GPNWVPAN6IYETZMQ3XYMZ26D3P/bundle.json","state_url":"https://pith.science/pith/GPNWVPAN6IYETZMQ3XYMZ26D3P/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/GPNWVPAN6IYETZMQ3XYMZ26D3P/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T12:27:45Z","links":{"resolver":"https://pith.science/pith/GPNWVPAN6IYETZMQ3XYMZ26D3P","bundle":"https://pith.science/pith/GPNWVPAN6IYETZMQ3XYMZ26D3P/bundle.json","state":"https://pith.science/pith/GPNWVPAN6IYETZMQ3XYMZ26D3P/state.json","well_known_bundle":"https://pith.science/.well-known/pith/GPNWVPAN6IYETZMQ3XYMZ26D3P/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:GPNWVPAN6IYETZMQ3XYMZ26D3P","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"219965594f065d426432292ade566afe9fc55dc57446b5ed35786b4ad3643bd8","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T05:56:45Z","title_canon_sha256":"fdf5a15e1a31d371803789b22e66682c506d27e01edada719e4e3f73e048f153"},"schema_version":"1.0","source":{"id":"2605.13043","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13043","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13043v1","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13043","created_at":"2026-05-18T03:08:59Z"},{"alias_kind":"pith_short_12","alias_value":"GPNWVPAN6IYE","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"GPNWVPAN6IYETZMQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"GPNWVPAN","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8e1062b2e7b9cef0c5379f231a7fc7a47aa0f3edcc2f5ee8e8b6bef368eb7573","target":"graph","created_at":"2026-05-18T03:08:59Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"our approaches reduce jailbreak success rates to 0.64% while preserving generation quality close to the original model performance"},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The contrastive safety direction reliably identifies harmful semantic alignment at intermediate denoising steps and that remasking those tokens followed by adaptive steering does not degrade final output quality or coherence."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Step-wise detection via a contrastive safety direction followed by remasking and adaptive steering reduces jailbreak success rates in diffusion language models to 0.64% while preserving output quality."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Step-wise remasking guided by a contrastive safety direction reduces jailbreak success in diffusion language models to 0.64 percent while keeping output quality nearly unchanged."}],"snapshot_sha256":"2a4975b4e2492ace1b07790474a7b42f54941c967ef0541ade3dfd9f66d7bbf1"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Diffusion Language Models (DLMs) provide a promising alternative to autoregressive language models by generating text through iterative denoising and bidirectional refinement. However, this iterative generation paradigm also introduces unique safety vulnerabilities when harmful tokens generated at intermediate denoising steps propagate through subsequent refinement processes and eventually induce unsafe outputs. While there are a few attempts to remedy this issue, they either fail to generate safe outputs or generate safe yet low-quality outputs. This motivates us to propose an inference-time ","authors_text":"Yejin Lee, Yo-Sub Han","cross_cats":[],"headline":"Step-wise remasking guided by a contrastive safety direction reduces jailbreak success in diffusion language models to 0.64 percent while keeping output quality nearly unchanged.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T05:56:45Z","title":"Adaptive Steering and Remasking for Safe Generation in Diffusion Language Models"},"references":{"count":40,"internal_anchors":3,"resolved_work":40,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Refusal in language models is mediated by a single direction","work_id":"265ab035-b05f-4b81-9c37-3de83a7e42f9","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Johnson, Jonathan Ho, Daniel Tarlow, and Rianne van den Berg","work_id":"98eea868-0cda-444e-89b8-9a1ff3048fc3","year":2021},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Pappas, and Eric Wong","work_id":"2645cf19-51c8-4e3a-b2d9-c3e90a2d7d59","year":2025},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Jailbreakbench: An open robustness benchmark for jailbreaking large language models","work_id":"ee72a9c8-6e48-4324-970a-b620e67b0914","year":2024},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Richemond, Arnaud Doucet, Robin Strudel, Chris Dyer, Conor Durkan, Curtis Hawthorne, Rémi Leblond, Will Grathwohl, and Jonas Adler","work_id":"402c961d-04b8-4c79-8147-b481c32e9ce9","year":2022}],"snapshot_sha256":"47736e142ff5c28ee9af273a0d4d1e9ae9ec1bf59e595e12c91a3bb05067d72f"},"source":{"id":"2605.13043","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T20:04:24.305736Z","id":"90a02904-92be-4c9f-8f95-c0b7e3d3d426","model_set":{"reader":"grok-4.3"},"one_line_summary":"Step-wise detection via a contrastive safety direction followed by remasking and adaptive steering reduces jailbreak success rates in diffusion language models to 0.64% while preserving output quality.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Step-wise remasking guided by a contrastive safety direction reduces jailbreak success in diffusion language models to 0.64 percent while keeping output quality nearly unchanged.","strongest_claim":"our approaches reduce jailbreak success rates to 0.64% while preserving generation quality close to the original model performance","weakest_assumption":"The contrastive safety direction reliably identifies harmful semantic alignment at intermediate denoising steps and that remasking those tokens followed by adaptive steering does not degrade final output quality or coherence."}},"verdict_id":"90a02904-92be-4c9f-8f95-c0b7e3d3d426"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:e641e4cbb7520148d83f32890b6fea7043d78f7a8bfeec48f05ae8e44114a1b1","target":"record","created_at":"2026-05-18T03:08:59Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"219965594f065d426432292ade566afe9fc55dc57446b5ed35786b4ad3643bd8","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T05:56:45Z","title_canon_sha256":"fdf5a15e1a31d371803789b22e66682c506d27e01edada719e4e3f73e048f153"},"schema_version":"1.0","source":{"id":"2605.13043","kind":"arxiv","version":1}},"canonical_sha256":"33db6abc0df23049e590ddf0ccebc3dbe7a4792be6b6d344749e7868162ffadb","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"33db6abc0df23049e590ddf0ccebc3dbe7a4792be6b6d344749e7868162ffadb","first_computed_at":"2026-05-18T03:08:59.495271Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T03:08:59.495271Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"ZlmzdcfNbH3uND4aRlS6eD7B2tV/ONOeld4BQM0bKSwx4bmLZWyEQAxED8QlqafPcAuMSN069swjM9mvvWzoBQ==","signature_status":"signed_v1","signed_at":"2026-05-18T03:08:59.495911Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13043","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:e641e4cbb7520148d83f32890b6fea7043d78f7a8bfeec48f05ae8e44114a1b1","sha256:8e1062b2e7b9cef0c5379f231a7fc7a47aa0f3edcc2f5ee8e8b6bef368eb7573"],"state_sha256":"1e2a5bc5078b2128fdeadcee965180a048f9661629ff4ea868d91f6157aadb57"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"1PVyDZxngXc9yuS3zScZaB32iNbLajv6Y4H+RYx0NmLnM/vsiqA1UAMOsHHej4mPSArU1kCn6MDutPvYxk5MDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T12:27:45.025166Z","bundle_sha256":"1247296d7fc9fd58657532fcb513a583fdec99eaa45ce525822ae1a0069cc992"}}