{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2025:TDD5C6FQIQ4X3H4IE6PURMOGXH","short_pith_number":"pith:TDD5C6FQ","canonical_record":{"source":{"id":"2509.05367","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2025-09-04T05:53:20Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"971f2c64a9c37d9f7d08daf341bddcd43c5bc64cd862cd7ff99777ab4d00af26","abstract_canon_sha256":"bcb52421c65cfe9940fb14955c76d366fe164009c631c90d1c1eec156d1ec547"},"schema_version":"1.0"},"canonical_sha256":"98c7d178b044397d9f88279f48b1c6b9d07178cd15c9eb28960f3c0d3a95af6a","source":{"kind":"arxiv","id":"2509.05367","version":5},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2509.05367","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"arxiv_version","alias_value":"2509.05367v5","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.05367","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"pith_short_12","alias_value":"TDD5C6FQIQ4X","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"pith_short_16","alias_value":"TDD5C6FQIQ4X3H4I","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"pith_short_8","alias_value":"TDD5C6FQ","created_at":"2026-06-02T01:03:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2025:TDD5C6FQIQ4X3H4IE6PURMOGXH","target":"record","payload":{"canonical_record":{"source":{"id":"2509.05367","kind":"arxiv","version":5},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2025-09-04T05:53:20Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"971f2c64a9c37d9f7d08daf341bddcd43c5bc64cd862cd7ff99777ab4d00af26","abstract_canon_sha256":"bcb52421c65cfe9940fb14955c76d366fe164009c631c90d1c1eec156d1ec547"},"schema_version":"1.0"},"canonical_sha256":"98c7d178b044397d9f88279f48b1c6b9d07178cd15c9eb28960f3c0d3a95af6a","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T01:03:33.491027Z","signature_b64":"9Xi+DLflrvu+gW3D5PvDhsuiYGCuDJrgHeJGBHQXikxYigRpCbU+5uxhhh25hpYRohums6LZx12Ri+GyUAYZDw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"98c7d178b044397d9f88279f48b1c6b9d07178cd15c9eb28960f3c0d3a95af6a","last_reissued_at":"2026-06-02T01:03:33.490546Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T01:03:33.490546Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2509.05367","source_version":5,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T01:03:33Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yUoQAT0/j0Zf70psla6g+1RzSVGZo6eR0EtHt5G6W7YEIm3GYlw2V9ZnTY4bAVKwZ096Gyml+y57+602m34GCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-02T23:28:01.530292Z"},"content_sha256":"0c28e7b78494da4e8963b15c23bb8aaac46f3bc72bcbd9efaf573eaf9603d5e0","schema_version":"1.0","event_id":"sha256:0c28e7b78494da4e8963b15c23bb8aaac46f3bc72bcbd9efaf573eaf9603d5e0"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2025:TDD5C6FQIQ4X3H4IE6PURMOGXH","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Between a Rock and a Hard Place: The Tension Between Ethical Reasoning and Safety Alignment in LLMs","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Ethical reasoning in LLMs opens a vulnerability where harmful requests framed as moral dilemmas can bypass safety alignments.","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Kai Jun Teh, Qibing Ren, Shei Pern Chua, Xiao Li, Xiaolin Hu, Zhen Leng Thai","submitted_at":"2025-09-04T05:53:20Z","abstract_excerpt":"Large Language Model safety alignment predominantly operates on a binary assumption that requests are either safe or unsafe. This classification proves insufficient when models encounter ethical dilemmas, where the capacity to reason through moral trade-offs creates a distinct attack surface. We formalize this vulnerability through TRIAL, a multi-turn red-teaming methodology that embeds harmful requests within ethical framings. TRIAL achieves high attack success rates across most tested models by systematically exploiting the model's ethical reasoning capabilities to frame harmful actions as m"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"TRIAL achieves high attack success rates across most tested models by systematically exploiting the model's ethical reasoning capabilities to frame harmful actions as morally necessary compromises.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That ethical reasoning responses can be reliably partitioned into instrumental (enabling harm) versus explanatory (analyzing without endorsing) categories in a way that preserves overall model utility and does not introduce new failure modes.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Introduces TRIAL, a multi-turn red-teaming method exploiting ethical reasoning to achieve high attack success on LLMs, and ERR, a Layer-Stratified Harm-Gated LoRA defense that separates instrumental harmful responses from explanatory ethical analysis.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Ethical reasoning in LLMs opens a vulnerability where harmful requests framed as moral dilemmas can bypass safety alignments.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"99dd3e0f1fa3e41810f0146c045ad2ecd84c9fe8376f1b033693db3d0833af64"},"source":{"id":"2509.05367","kind":"arxiv","version":5},"verdict":{"id":"0a8481be-b7c9-4291-b096-4de0b5a706d8","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-18T19:32:34.630351Z","strongest_claim":"TRIAL achieves high attack success rates across most tested models by systematically exploiting the model's ethical reasoning capabilities to frame harmful actions as morally necessary compromises.","one_line_summary":"Introduces TRIAL, a multi-turn red-teaming method exploiting ethical reasoning to achieve high attack success on LLMs, and ERR, a Layer-Stratified Harm-Gated LoRA defense that separates instrumental harmful responses from explanatory ethical analysis.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That ethical reasoning responses can be reliably partitioned into instrumental (enabling harm) versus explanatory (analyzing without endorsing) categories in a way that preserves overall model utility and does not introduce new failure modes.","pith_extraction_headline":"Ethical reasoning in LLMs opens a vulnerability where harmful requests framed as moral dilemmas can bypass safety alignments."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2509.05367/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":1,"snapshot_sha256":"94ee95aaa514c3da301eeb1c2f04b7e1cf09e038513d8cb068a5cad391f55bd5"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"0a8481be-b7c9-4291-b096-4de0b5a706d8"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-06-02T01:03:33Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9CBPoXBNttbISNWsKODf/OVtKla/7fLa8KPLezjvlhGh1+obYMkkJPI3db0wbhuCGiMfxHyJAfoaFKG32X7VBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-07-02T23:28:01.530783Z"},"content_sha256":"d7969ea7857d29025d935510e35cd3755996a87e9198e5d644a9354bdff4c5ed","schema_version":"1.0","event_id":"sha256:d7969ea7857d29025d935510e35cd3755996a87e9198e5d644a9354bdff4c5ed"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/TDD5C6FQIQ4X3H4IE6PURMOGXH/bundle.json","state_url":"https://pith.science/pith/TDD5C6FQIQ4X3H4IE6PURMOGXH/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/TDD5C6FQIQ4X3H4IE6PURMOGXH/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-07-02T23:28:01Z","links":{"resolver":"https://pith.science/pith/TDD5C6FQIQ4X3H4IE6PURMOGXH","bundle":"https://pith.science/pith/TDD5C6FQIQ4X3H4IE6PURMOGXH/bundle.json","state":"https://pith.science/pith/TDD5C6FQIQ4X3H4IE6PURMOGXH/state.json","well_known_bundle":"https://pith.science/.well-known/pith/TDD5C6FQIQ4X3H4IE6PURMOGXH/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2025:TDD5C6FQIQ4X3H4IE6PURMOGXH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"bcb52421c65cfe9940fb14955c76d366fe164009c631c90d1c1eec156d1ec547","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2025-09-04T05:53:20Z","title_canon_sha256":"971f2c64a9c37d9f7d08daf341bddcd43c5bc64cd862cd7ff99777ab4d00af26"},"schema_version":"1.0","source":{"id":"2509.05367","kind":"arxiv","version":5}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2509.05367","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"arxiv_version","alias_value":"2509.05367v5","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2509.05367","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"pith_short_12","alias_value":"TDD5C6FQIQ4X","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"pith_short_16","alias_value":"TDD5C6FQIQ4X3H4I","created_at":"2026-06-02T01:03:33Z"},{"alias_kind":"pith_short_8","alias_value":"TDD5C6FQ","created_at":"2026-06-02T01:03:33Z"}],"graph_snapshots":[{"event_id":"sha256:d7969ea7857d29025d935510e35cd3755996a87e9198e5d644a9354bdff4c5ed","target":"graph","created_at":"2026-06-02T01:03:33Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"TRIAL achieves high attack success rates across most tested models by systematically exploiting the model's ethical reasoning capabilities to frame harmful actions as morally necessary compromises."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That ethical reasoning responses can be reliably partitioned into instrumental (enabling harm) versus explanatory (analyzing without endorsing) categories in a way that preserves overall model utility and does not introduce new failure modes."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Introduces TRIAL, a multi-turn red-teaming method exploiting ethical reasoning to achieve high attack success on LLMs, and ERR, a Layer-Stratified Harm-Gated LoRA defense that separates instrumental harmful responses from explanatory ethical analysis."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Ethical reasoning in LLMs opens a vulnerability where harmful requests framed as moral dilemmas can bypass safety alignments."}],"snapshot_sha256":"99dd3e0f1fa3e41810f0146c045ad2ecd84c9fe8376f1b033693db3d0833af64"},"formal_canon":{"evidence_count":1,"snapshot_sha256":"94ee95aaa514c3da301eeb1c2f04b7e1cf09e038513d8cb068a5cad391f55bd5"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2509.05367/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Large Language Model safety alignment predominantly operates on a binary assumption that requests are either safe or unsafe. This classification proves insufficient when models encounter ethical dilemmas, where the capacity to reason through moral trade-offs creates a distinct attack surface. We formalize this vulnerability through TRIAL, a multi-turn red-teaming methodology that embeds harmful requests within ethical framings. TRIAL achieves high attack success rates across most tested models by systematically exploiting the model's ethical reasoning capabilities to frame harmful actions as m","authors_text":"Kai Jun Teh, Qibing Ren, Shei Pern Chua, Xiao Li, Xiaolin Hu, Zhen Leng Thai","cross_cats":["cs.AI"],"headline":"Ethical reasoning in LLMs opens a vulnerability where harmful requests framed as moral dilemmas can bypass safety alignments.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2025-09-04T05:53:20Z","title":"Between a Rock and a Hard Place: The Tension Between Ethical Reasoning and Safety Alignment in LLMs"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2509.05367","kind":"arxiv","version":5},"verdict":{"created_at":"2026-05-18T19:32:34.630351Z","id":"0a8481be-b7c9-4291-b096-4de0b5a706d8","model_set":{"reader":"grok-4.3"},"one_line_summary":"Introduces TRIAL, a multi-turn red-teaming method exploiting ethical reasoning to achieve high attack success on LLMs, and ERR, a Layer-Stratified Harm-Gated LoRA defense that separates instrumental harmful responses from explanatory ethical analysis.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Ethical reasoning in LLMs opens a vulnerability where harmful requests framed as moral dilemmas can bypass safety alignments.","strongest_claim":"TRIAL achieves high attack success rates across most tested models by systematically exploiting the model's ethical reasoning capabilities to frame harmful actions as morally necessary compromises.","weakest_assumption":"That ethical reasoning responses can be reliably partitioned into instrumental (enabling harm) versus explanatory (analyzing without endorsing) categories in a way that preserves overall model utility and does not introduce new failure modes."}},"verdict_id":"0a8481be-b7c9-4291-b096-4de0b5a706d8"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:0c28e7b78494da4e8963b15c23bb8aaac46f3bc72bcbd9efaf573eaf9603d5e0","target":"record","created_at":"2026-06-02T01:03:33Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"bcb52421c65cfe9940fb14955c76d366fe164009c631c90d1c1eec156d1ec547","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2025-09-04T05:53:20Z","title_canon_sha256":"971f2c64a9c37d9f7d08daf341bddcd43c5bc64cd862cd7ff99777ab4d00af26"},"schema_version":"1.0","source":{"id":"2509.05367","kind":"arxiv","version":5}},"canonical_sha256":"98c7d178b044397d9f88279f48b1c6b9d07178cd15c9eb28960f3c0d3a95af6a","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"98c7d178b044397d9f88279f48b1c6b9d07178cd15c9eb28960f3c0d3a95af6a","first_computed_at":"2026-06-02T01:03:33.490546Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-02T01:03:33.490546Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"9Xi+DLflrvu+gW3D5PvDhsuiYGCuDJrgHeJGBHQXikxYigRpCbU+5uxhhh25hpYRohums6LZx12Ri+GyUAYZDw==","signature_status":"signed_v1","signed_at":"2026-06-02T01:03:33.491027Z","signed_message":"canonical_sha256_bytes"},"source_id":"2509.05367","source_kind":"arxiv","source_version":5}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:0c28e7b78494da4e8963b15c23bb8aaac46f3bc72bcbd9efaf573eaf9603d5e0","sha256:d7969ea7857d29025d935510e35cd3755996a87e9198e5d644a9354bdff4c5ed"],"state_sha256":"4ecc28ded96b54725dab8bbb15d9cb4123dee8a21ff45a86e5eb4ffdbe59e82b"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"9UaY9vHp+HR73lbFS/nxWwKdnbb8/LDoqyd94GZ80K0mGxj1a/t9wNFBmuTsYBMqDcI5bmG5pGlpj9iGoQKaDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-07-02T23:28:01.533078Z","bundle_sha256":"0cba6eb43275d8ba08bdb3ae92ed3b39c570fbd1ca0c45d468415bae86c5ca21"}}