{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:OBCY4VANEY2NYRJSKCHRUP4AZG","short_pith_number":"pith:OBCY4VAN","canonical_record":{"source":{"id":"2605.05704","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-05-07T05:50:45Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"d2a4aac0c28d5323ed860c47914c6dafa55c7d5e7a5f3e1071b5a73749d29459","abstract_canon_sha256":"fdc8601bab076071299419f12107c07906e0c1fe3cbdf46bcea5e206d4efb7d3"},"schema_version":"1.0"},"canonical_sha256":"70458e540d2634dc4532508f1a3f80c9b248a688a922bd61ebf1b1f9b90f6362","source":{"kind":"arxiv","id":"2605.05704","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.05704","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"arxiv_version","alias_value":"2605.05704v2","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.05704","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"pith_short_12","alias_value":"OBCY4VANEY2N","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"pith_short_16","alias_value":"OBCY4VANEY2NYRJS","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"pith_short_8","alias_value":"OBCY4VAN","created_at":"2026-05-25T02:01:21Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:OBCY4VANEY2NYRJSKCHRUP4AZG","target":"record","payload":{"canonical_record":{"source":{"id":"2605.05704","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-05-07T05:50:45Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"d2a4aac0c28d5323ed860c47914c6dafa55c7d5e7a5f3e1071b5a73749d29459","abstract_canon_sha256":"fdc8601bab076071299419f12107c07906e0c1fe3cbdf46bcea5e206d4efb7d3"},"schema_version":"1.0"},"canonical_sha256":"70458e540d2634dc4532508f1a3f80c9b248a688a922bd61ebf1b1f9b90f6362","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-25T02:01:21.941048Z","signature_b64":"NE5ACXXzxNuwQbU+EH9sp82Cq4Th4Ii2qwBZ9Yk1/YE2JExZ02DfDzDfk/IWia03V5Kyw5RzqnwAaUoAB/QeDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"70458e540d2634dc4532508f1a3f80c9b248a688a922bd61ebf1b1f9b90f6362","last_reissued_at":"2026-05-25T02:01:21.940352Z","signature_status":"signed_v1","first_computed_at":"2026-05-25T02:01:21.940352Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.05704","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-25T02:01:21Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yGGESnQrjgNnwoqh24MjPn+wl6ZomlV9ZYpbsJuuMa6rK1O3J1IvIQ+tl+TT+g5SY3VvPVlIUCwbLXEr/WH3BA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T15:08:52.069200Z"},"content_sha256":"79cf390ee85bc4d2f6ccf3f99d282c0ab7ddd9db58f59e179a8e52129b999f77","schema_version":"1.0","event_id":"sha256:79cf390ee85bc4d2f6ccf3f99d282c0ab7ddd9db58f59e179a8e52129b999f77"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:OBCY4VANEY2NYRJSKCHRUP4AZG","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"SafeHarbor: Hierarchical Memory-Augmented Guardrail for LLM Agent Safety","license":"http://creativecommons.org/licenses/by/4.0/","headline":"SafeHarbor uses hierarchical memory to extract and evolve context-aware rules that let LLM agents refuse harmful tool use while handling ambiguous benign tasks.","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Deyue Zhang, Dongdong Yang, Hao Peng, Quanchen Zou, Wenxin Zhang, Xiangzheng Zhang, Zhe Liu, Zonghao Ying","submitted_at":"2026-05-07T05:50:45Z","abstract_excerpt":"Recent advances in foundation models have transformed LLMs from passive conversational systems into autonomous agents capable of reasoning and tool execution. While these capabilities unlock substantial practical value, they also introduce new security risks, as adversaries can manipulate agents into performing harmful actions in real-world environments. Existing defense strategies mitigate such threats but frequently struggle to balance safety and utility, resulting in over-refusal of benign user requests. To mitigate this trade-off, we propose SafeHarbor, a novel framework designed to establ"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"SafeHarbor achieves state-of-the-art performance on both ambiguous benign tasks and explicit malicious attacks, notably attaining a peak benign utility of 63.6% on GPT-4o while maintaining a robust refusal rate exceeding 93% against harmful requests.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That context-aware rules extracted via enhanced adversarial generation plus entropy-based node splitting and merging will maintain precise decision boundaries across unseen tasks and models without introducing new failure modes or requiring per-deployment tuning.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SafeHarbor uses hierarchical memory with adversarial rule extraction and entropy-driven self-evolution to achieve over 93% refusal on harmful requests while reaching 63.6% benign utility on GPT-4o.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"SafeHarbor uses hierarchical memory to extract and evolve context-aware rules that let LLM agents refuse harmful tool use while handling ambiguous benign tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"82c7a01dc65411905206f6b031620b22c7eb28d9c41735f61628dd5747cc5dcd"},"source":{"id":"2605.05704","kind":"arxiv","version":2},"verdict":{"id":"ccc21973-ca4d-441d-83c3-7f5985e197b0","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-08T09:27:19.494741Z","strongest_claim":"SafeHarbor achieves state-of-the-art performance on both ambiguous benign tasks and explicit malicious attacks, notably attaining a peak benign utility of 63.6% on GPT-4o while maintaining a robust refusal rate exceeding 93% against harmful requests.","one_line_summary":"SafeHarbor uses hierarchical memory with adversarial rule extraction and entropy-driven self-evolution to achieve over 93% refusal on harmful requests while reaching 63.6% benign utility on GPT-4o.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That context-aware rules extracted via enhanced adversarial generation plus entropy-based node splitting and merging will maintain precise decision boundaries across unseen tasks and models without introducing new failure modes or requiring per-deployment tuning.","pith_extraction_headline":"SafeHarbor uses hierarchical memory to extract and evolve context-aware rules that let LLM agents refuse harmful tool use while handling ambiguous benign tasks."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.05704/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"claim_evidence","ran_at":"2026-05-20T13:42:04.721662Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-20T09:36:28.319936Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_title_agreement","ran_at":"2026-05-19T20:01:19.638802Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T13:19:50.079763Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"90c551d6f9d93dd1397b5504329c9071b2e77690687fdf5d3f9de5d64a81f6f9"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"ccc21973-ca4d-441d-83c3-7f5985e197b0"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-25T02:01:21Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"EGkv9NH+ReTsQzEl9+1j5BQQjfAzxUHg+PNxRhffxMUSheRQ0yt73FCdzpBug7l/R+Nleeiq2p+SgYxg/TMtDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T15:08:52.070224Z"},"content_sha256":"c8d1b85e14016913773cac6539019bbef35a2e0dbce4d49c8e319ecbbdb1d43e","schema_version":"1.0","event_id":"sha256:c8d1b85e14016913773cac6539019bbef35a2e0dbce4d49c8e319ecbbdb1d43e"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/OBCY4VANEY2NYRJSKCHRUP4AZG/bundle.json","state_url":"https://pith.science/pith/OBCY4VANEY2NYRJSKCHRUP4AZG/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/OBCY4VANEY2NYRJSKCHRUP4AZG/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T15:08:52Z","links":{"resolver":"https://pith.science/pith/OBCY4VANEY2NYRJSKCHRUP4AZG","bundle":"https://pith.science/pith/OBCY4VANEY2NYRJSKCHRUP4AZG/bundle.json","state":"https://pith.science/pith/OBCY4VANEY2NYRJSKCHRUP4AZG/state.json","well_known_bundle":"https://pith.science/.well-known/pith/OBCY4VANEY2NYRJSKCHRUP4AZG/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:OBCY4VANEY2NYRJSKCHRUP4AZG","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"fdc8601bab076071299419f12107c07906e0c1fe3cbdf46bcea5e206d4efb7d3","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-05-07T05:50:45Z","title_canon_sha256":"d2a4aac0c28d5323ed860c47914c6dafa55c7d5e7a5f3e1071b5a73749d29459"},"schema_version":"1.0","source":{"id":"2605.05704","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.05704","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"arxiv_version","alias_value":"2605.05704v2","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.05704","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"pith_short_12","alias_value":"OBCY4VANEY2N","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"pith_short_16","alias_value":"OBCY4VANEY2NYRJS","created_at":"2026-05-25T02:01:21Z"},{"alias_kind":"pith_short_8","alias_value":"OBCY4VAN","created_at":"2026-05-25T02:01:21Z"}],"graph_snapshots":[{"event_id":"sha256:c8d1b85e14016913773cac6539019bbef35a2e0dbce4d49c8e319ecbbdb1d43e","target":"graph","created_at":"2026-05-25T02:01:21Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"SafeHarbor achieves state-of-the-art performance on both ambiguous benign tasks and explicit malicious attacks, notably attaining a peak benign utility of 63.6% on GPT-4o while maintaining a robust refusal rate exceeding 93% against harmful requests."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That context-aware rules extracted via enhanced adversarial generation plus entropy-based node splitting and merging will maintain precise decision boundaries across unseen tasks and models without introducing new failure modes or requiring per-deployment tuning."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SafeHarbor uses hierarchical memory with adversarial rule extraction and entropy-driven self-evolution to achieve over 93% refusal on harmful requests while reaching 63.6% benign utility on GPT-4o."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"SafeHarbor uses hierarchical memory to extract and evolve context-aware rules that let LLM agents refuse harmful tool use while handling ambiguous benign tasks."}],"snapshot_sha256":"82c7a01dc65411905206f6b031620b22c7eb28d9c41735f61628dd5747cc5dcd"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"claim_evidence","ran_at":"2026-05-20T13:42:04.721662Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"ai_meta_artifact","ran_at":"2026-05-20T09:36:28.319936Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_title_agreement","ran_at":"2026-05-19T20:01:19.638802Z","status":"completed","version":"1.0.0"},{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-19T13:19:50.079763Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2605.05704/integrity.json","findings":[],"snapshot_sha256":"90c551d6f9d93dd1397b5504329c9071b2e77690687fdf5d3f9de5d64a81f6f9","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Recent advances in foundation models have transformed LLMs from passive conversational systems into autonomous agents capable of reasoning and tool execution. While these capabilities unlock substantial practical value, they also introduce new security risks, as adversaries can manipulate agents into performing harmful actions in real-world environments. Existing defense strategies mitigate such threats but frequently struggle to balance safety and utility, resulting in over-refusal of benign user requests. To mitigate this trade-off, we propose SafeHarbor, a novel framework designed to establ","authors_text":"Deyue Zhang, Dongdong Yang, Hao Peng, Quanchen Zou, Wenxin Zhang, Xiangzheng Zhang, Zhe Liu, Zonghao Ying","cross_cats":["cs.AI"],"headline":"SafeHarbor uses hierarchical memory to extract and evolve context-aware rules that let LLM agents refuse harmful tool use while handling ambiguous benign tasks.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-05-07T05:50:45Z","title":"SafeHarbor: Hierarchical Memory-Augmented Guardrail for LLM Agent Safety"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.05704","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-08T09:27:19.494741Z","id":"ccc21973-ca4d-441d-83c3-7f5985e197b0","model_set":{"reader":"grok-4.3"},"one_line_summary":"SafeHarbor uses hierarchical memory with adversarial rule extraction and entropy-driven self-evolution to achieve over 93% refusal on harmful requests while reaching 63.6% benign utility on GPT-4o.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"SafeHarbor uses hierarchical memory to extract and evolve context-aware rules that let LLM agents refuse harmful tool use while handling ambiguous benign tasks.","strongest_claim":"SafeHarbor achieves state-of-the-art performance on both ambiguous benign tasks and explicit malicious attacks, notably attaining a peak benign utility of 63.6% on GPT-4o while maintaining a robust refusal rate exceeding 93% against harmful requests.","weakest_assumption":"That context-aware rules extracted via enhanced adversarial generation plus entropy-based node splitting and merging will maintain precise decision boundaries across unseen tasks and models without introducing new failure modes or requiring per-deployment tuning."}},"verdict_id":"ccc21973-ca4d-441d-83c3-7f5985e197b0"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:79cf390ee85bc4d2f6ccf3f99d282c0ab7ddd9db58f59e179a8e52129b999f77","target":"record","created_at":"2026-05-25T02:01:21Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"fdc8601bab076071299419f12107c07906e0c1fe3cbdf46bcea5e206d4efb7d3","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-05-07T05:50:45Z","title_canon_sha256":"d2a4aac0c28d5323ed860c47914c6dafa55c7d5e7a5f3e1071b5a73749d29459"},"schema_version":"1.0","source":{"id":"2605.05704","kind":"arxiv","version":2}},"canonical_sha256":"70458e540d2634dc4532508f1a3f80c9b248a688a922bd61ebf1b1f9b90f6362","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"70458e540d2634dc4532508f1a3f80c9b248a688a922bd61ebf1b1f9b90f6362","first_computed_at":"2026-05-25T02:01:21.940352Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-25T02:01:21.940352Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"NE5ACXXzxNuwQbU+EH9sp82Cq4Th4Ii2qwBZ9Yk1/YE2JExZ02DfDzDfk/IWia03V5Kyw5RzqnwAaUoAB/QeDg==","signature_status":"signed_v1","signed_at":"2026-05-25T02:01:21.941048Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.05704","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:79cf390ee85bc4d2f6ccf3f99d282c0ab7ddd9db58f59e179a8e52129b999f77","sha256:c8d1b85e14016913773cac6539019bbef35a2e0dbce4d49c8e319ecbbdb1d43e"],"state_sha256":"969cbf3023ad354393b6d6a27237d48ea63cd64318cc41f458dc22f623516b62"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"7Mssv+H02u85GEdtZGzWoDolN2q8pP8y2MTe0BVb7eHoOsNu1KlzwWjLnKQoaFMsIhgu2ZQKFN68DCbShsIVAQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T15:08:52.074690Z","bundle_sha256":"e47af768242cc836c85ceb085ced4b87b019ff0c5f68dfeebb4a7ff61be874d9"}}