{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:6XB6HN2Q4LOJE6R4XU2MJPPEZ4","short_pith_number":"pith:6XB6HN2Q","canonical_record":{"source":{"id":"2605.14194","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T23:19:55Z","cross_cats_sorted":[],"title_canon_sha256":"526d2d1ecf63258686820a849a3b08b2b334642dd8905210460ada30bf1017db","abstract_canon_sha256":"669e5eada6fb706ff88053c591c62135ce4f23e1a263c91734aa308796a45956"},"schema_version":"1.0"},"canonical_sha256":"f5c3e3b750e2dc927a3cbd34c4bde4cf167c8ca1c56c34019b92fe40897c3900","source":{"kind":"arxiv","id":"2605.14194","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14194","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14194v1","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14194","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"pith_short_12","alias_value":"6XB6HN2Q4LOJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"6XB6HN2Q4LOJE6R4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"6XB6HN2Q","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:6XB6HN2Q4LOJE6R4XU2MJPPEZ4","target":"record","payload":{"canonical_record":{"source":{"id":"2605.14194","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T23:19:55Z","cross_cats_sorted":[],"title_canon_sha256":"526d2d1ecf63258686820a849a3b08b2b334642dd8905210460ada30bf1017db","abstract_canon_sha256":"669e5eada6fb706ff88053c591c62135ce4f23e1a263c91734aa308796a45956"},"schema_version":"1.0"},"canonical_sha256":"f5c3e3b750e2dc927a3cbd34c4bde4cf167c8ca1c56c34019b92fe40897c3900","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:11.110049Z","signature_b64":"viWJaSvwnzHfo1fHlXlnY5i3SUj4PtUbJySZNec6PZk/n87mHBJuAPwY3zKOF+Lvu+kJmD8/spsDtawz2un9Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f5c3e3b750e2dc927a3cbd34c4bde4cf167c8ca1c56c34019b92fe40897c3900","last_reissued_at":"2026-05-17T23:39:11.109473Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:11.109473Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.14194","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"S/lCSAbJ3sDRY0q+CHv5ccqN7USiLoIPVDIpF9XnNMyZpYthn/PxBguZ5qtKGiLweBudZXXcG3CP6LvpO9oLCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-07T11:17:59.718811Z"},"content_sha256":"f6613c24578cd4bc171408a29df4a19670344cc1810ad0bfec6db600d818e1bc","schema_version":"1.0","event_id":"sha256:f6613c24578cd4bc171408a29df4a19670344cc1810ad0bfec6db600d818e1bc"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:6XB6HN2Q4LOJE6R4XU2MJPPEZ4","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"GradShield: Alignment Preserving Finetuning","license":"http://creativecommons.org/licenses/by/4.0/","headline":"GradShield filters data points by their implicit harmfulness score to keep finetuned LLMs aligned while retaining utility.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Basel Alomair, David Wagner, Emad A. Alghamdi, Patrick Mendoza, Raluca Ada Popa, Xiao Huang, Zhanhao Hu","submitted_at":"2026-05-13T23:19:55Z","abstract_excerpt":"Large Language Models (LLMs) pose a significant risk of safety misalignment after finetuning, as models can be compromised by both explicitly and implicitly harmful data. Even some seemingly benign data can inadvertently steer a model towards misaligned behaviors. To address this, we introduce GradShield, a principled filtering method that safeguards LLMs during finetuning by identifying and removing harmful data points before they corrupt the model's alignment. It removes potentially harmful data by computing a Finetuning Implicit Harmfulness Score (FIHS) for each data point and employs an ad"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"GradShield outperforms all baseline methods, consistently maintaining an Attack Success Rate (ASR) below 6% while preserving utility performance.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The Finetuning Implicit Harmfulness Score (FIHS) computed for each data point accurately identifies examples that will cause misalignment after finetuning, and the adaptive thresholding removes harmful points without discarding too much useful signal.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GradShield removes data points likely to cause safety misalignment during LLM finetuning by computing a Finetuning Implicit Harmfulness Score and applying adaptive thresholding, keeping attack success rates below 6% while preserving utility.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"GradShield filters data points by their implicit harmfulness score to keep finetuned LLMs aligned while retaining utility.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"35afbc3801669b5274ffd87a99ea65cd029408794e4e605566ed90c8b9d48f4e"},"source":{"id":"2605.14194","kind":"arxiv","version":1},"verdict":{"id":"b2008c41-d6cf-4335-bd65-13e959f83538","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T04:40:50.095750Z","strongest_claim":"GradShield outperforms all baseline methods, consistently maintaining an Attack Success Rate (ASR) below 6% while preserving utility performance.","one_line_summary":"GradShield removes data points likely to cause safety misalignment during LLM finetuning by computing a Finetuning Implicit Harmfulness Score and applying adaptive thresholding, keeping attack success rates below 6% while preserving utility.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The Finetuning Implicit Harmfulness Score (FIHS) computed for each data point accurately identifies examples that will cause misalignment after finetuning, and the adaptive thresholding removes harmful points without discarding too much useful signal.","pith_extraction_headline":"GradShield filters data points by their implicit harmfulness score to keep finetuned LLMs aligned while retaining utility."},"references":{"count":51,"sample":[{"doi":"","year":null,"title":"Scaling Learning Algorithms Towards","work_id":"bb2761cc-98d0-411b-92f6-803773d64460","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"and Osindero, Simon and Teh, Yee Whye , journal =","work_id":"0a5921e3-ac4e-46f1-85ae-866119a87be0","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2016,"title":"Deep learning , author=. 2016 , publisher=","work_id":"cf0899e0-53ee-4591-aae4-f38fa5ac12ad","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Training language models to follow instructions with human feedback , author=. 2022 , eprint=","work_id":"5903b651-f252-45b3-9845-98a1abf380ae","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To! , author=. 2023 , eprint=","work_id":"ad93e097-cad6-47a9-a638-c1771e6a877d","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":51,"snapshot_sha256":"32d0762f14b4b934b34bfb9c4de0a91d904f4b113a162b73aa3397a7aa0055d9","internal_anchors":2},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"b2008c41-d6cf-4335-bd65-13e959f83538"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"VW6egmWuIJzXvHRlQVy3kGxB7E0kku3dr8bzWwv0OvMivy0azrQjsyajmKu2+noFNhQftWb8l+bbaf8NkMjkBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-07T11:17:59.719762Z"},"content_sha256":"8b8062bb406d599f32e96f6ea640766a628ed4d6cc024258bbf4fa9681e61298","schema_version":"1.0","event_id":"sha256:8b8062bb406d599f32e96f6ea640766a628ed4d6cc024258bbf4fa9681e61298"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/6XB6HN2Q4LOJE6R4XU2MJPPEZ4/bundle.json","state_url":"https://pith.science/pith/6XB6HN2Q4LOJE6R4XU2MJPPEZ4/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/6XB6HN2Q4LOJE6R4XU2MJPPEZ4/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-07T11:17:59Z","links":{"resolver":"https://pith.science/pith/6XB6HN2Q4LOJE6R4XU2MJPPEZ4","bundle":"https://pith.science/pith/6XB6HN2Q4LOJE6R4XU2MJPPEZ4/bundle.json","state":"https://pith.science/pith/6XB6HN2Q4LOJE6R4XU2MJPPEZ4/state.json","well_known_bundle":"https://pith.science/.well-known/pith/6XB6HN2Q4LOJE6R4XU2MJPPEZ4/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:6XB6HN2Q4LOJE6R4XU2MJPPEZ4","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"669e5eada6fb706ff88053c591c62135ce4f23e1a263c91734aa308796a45956","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T23:19:55Z","title_canon_sha256":"526d2d1ecf63258686820a849a3b08b2b334642dd8905210460ada30bf1017db"},"schema_version":"1.0","source":{"id":"2605.14194","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.14194","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"arxiv_version","alias_value":"2605.14194v1","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.14194","created_at":"2026-05-17T23:39:11Z"},{"alias_kind":"pith_short_12","alias_value":"6XB6HN2Q4LOJ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"6XB6HN2Q4LOJE6R4","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"6XB6HN2Q","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:8b8062bb406d599f32e96f6ea640766a628ed4d6cc024258bbf4fa9681e61298","target":"graph","created_at":"2026-05-17T23:39:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"GradShield outperforms all baseline methods, consistently maintaining an Attack Success Rate (ASR) below 6% while preserving utility performance."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The Finetuning Implicit Harmfulness Score (FIHS) computed for each data point accurately identifies examples that will cause misalignment after finetuning, and the adaptive thresholding removes harmful points without discarding too much useful signal."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GradShield removes data points likely to cause safety misalignment during LLM finetuning by computing a Finetuning Implicit Harmfulness Score and applying adaptive thresholding, keeping attack success rates below 6% while preserving utility."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"GradShield filters data points by their implicit harmfulness score to keep finetuned LLMs aligned while retaining utility."}],"snapshot_sha256":"35afbc3801669b5274ffd87a99ea65cd029408794e4e605566ed90c8b9d48f4e"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Large Language Models (LLMs) pose a significant risk of safety misalignment after finetuning, as models can be compromised by both explicitly and implicitly harmful data. Even some seemingly benign data can inadvertently steer a model towards misaligned behaviors. To address this, we introduce GradShield, a principled filtering method that safeguards LLMs during finetuning by identifying and removing harmful data points before they corrupt the model's alignment. It removes potentially harmful data by computing a Finetuning Implicit Harmfulness Score (FIHS) for each data point and employs an ad","authors_text":"Basel Alomair, David Wagner, Emad A. Alghamdi, Patrick Mendoza, Raluca Ada Popa, Xiao Huang, Zhanhao Hu","cross_cats":[],"headline":"GradShield filters data points by their implicit harmfulness score to keep finetuned LLMs aligned while retaining utility.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T23:19:55Z","title":"GradShield: Alignment Preserving Finetuning"},"references":{"count":51,"internal_anchors":2,"resolved_work":51,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Scaling Learning Algorithms Towards","work_id":"bb2761cc-98d0-411b-92f6-803773d64460","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"and Osindero, Simon and Teh, Yee Whye , journal =","work_id":"0a5921e3-ac4e-46f1-85ae-866119a87be0","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Deep learning , author=. 2016 , publisher=","work_id":"cf0899e0-53ee-4591-aae4-f38fa5ac12ad","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Training language models to follow instructions with human feedback , author=. 2022 , eprint=","work_id":"5903b651-f252-45b3-9845-98a1abf380ae","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Fine-tuning Aligned Language Models Compromises Safety, Even When Users Do Not Intend To! , author=. 2023 , eprint=","work_id":"ad93e097-cad6-47a9-a638-c1771e6a877d","year":2023}],"snapshot_sha256":"32d0762f14b4b934b34bfb9c4de0a91d904f4b113a162b73aa3397a7aa0055d9"},"source":{"id":"2605.14194","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-15T04:40:50.095750Z","id":"b2008c41-d6cf-4335-bd65-13e959f83538","model_set":{"reader":"grok-4.3"},"one_line_summary":"GradShield removes data points likely to cause safety misalignment during LLM finetuning by computing a Finetuning Implicit Harmfulness Score and applying adaptive thresholding, keeping attack success rates below 6% while preserving utility.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"GradShield filters data points by their implicit harmfulness score to keep finetuned LLMs aligned while retaining utility.","strongest_claim":"GradShield outperforms all baseline methods, consistently maintaining an Attack Success Rate (ASR) below 6% while preserving utility performance.","weakest_assumption":"The Finetuning Implicit Harmfulness Score (FIHS) computed for each data point accurately identifies examples that will cause misalignment after finetuning, and the adaptive thresholding removes harmful points without discarding too much useful signal."}},"verdict_id":"b2008c41-d6cf-4335-bd65-13e959f83538"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:f6613c24578cd4bc171408a29df4a19670344cc1810ad0bfec6db600d818e1bc","target":"record","created_at":"2026-05-17T23:39:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"669e5eada6fb706ff88053c591c62135ce4f23e1a263c91734aa308796a45956","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T23:19:55Z","title_canon_sha256":"526d2d1ecf63258686820a849a3b08b2b334642dd8905210460ada30bf1017db"},"schema_version":"1.0","source":{"id":"2605.14194","kind":"arxiv","version":1}},"canonical_sha256":"f5c3e3b750e2dc927a3cbd34c4bde4cf167c8ca1c56c34019b92fe40897c3900","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"f5c3e3b750e2dc927a3cbd34c4bde4cf167c8ca1c56c34019b92fe40897c3900","first_computed_at":"2026-05-17T23:39:11.109473Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:11.109473Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"viWJaSvwnzHfo1fHlXlnY5i3SUj4PtUbJySZNec6PZk/n87mHBJuAPwY3zKOF+Lvu+kJmD8/spsDtawz2un9Aw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:11.110049Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.14194","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:f6613c24578cd4bc171408a29df4a19670344cc1810ad0bfec6db600d818e1bc","sha256:8b8062bb406d599f32e96f6ea640766a628ed4d6cc024258bbf4fa9681e61298"],"state_sha256":"6f5e0beb57616734dc5b03cb5147f237c9be60a298150cbeacb95ad89eac7e59"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"lEVkf098vZiERGQPCCgSqwIerQqTqg4/7hPNoo17kq/kob3dFS/hvBMZ3xA7brbxRLoi1fS9AIP2zF4aV1iZBQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-07T11:17:59.723650Z","bundle_sha256":"671e078871b6dd9a007a879a219f60cd213d727cdb5e8500ca32f960aac506e5"}}