{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:76DPSGXMOWWHM6WLVLLTQ5QWDU","short_pith_number":"pith:76DPSGXM","schema_version":"1.0","canonical_sha256":"ff86f91aec75ac767acbaad73876161d00d977ea794c9085178f91385ce9e355","source":{"kind":"arxiv","id":"2406.18495","version":3},"attestation_state":"computed","paper":{"title":"WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs","license":"http://creativecommons.org/licenses/by/4.0/","headline":"WildGuard is an open moderation tool that detects malicious prompts, response risks, and refusal behaviors in LLMs with accuracy matching or exceeding GPT-4 on key tasks.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Allyson Ettinger, Bill Yuchen Lin, Kavel Rao, Liwei Jiang, Nathan Lambert, Nouha Dziri, Seungju Han, Yejin Choi","submitted_at":"2024-06-26T16:58:20Z","abstract_excerpt":"We introduce WildGuard -- an open, light-weight moderation tool for LLM safety that achieves three goals: (1) identifying malicious intent in user prompts, (2) detecting safety risks of model responses, and (3) determining model refusal rate. Together, WildGuard serves the increasing needs for automatic safety moderation and evaluation of LLM interactions, providing a one-stop tool with enhanced accuracy and broad coverage across 13 risk categories. While existing open moderation tools such as Llama-Guard2 score reasonably well in classifying straightforward model interactions, they lag far be"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":true},"canonical_record":{"source":{"id":"2406.18495","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2024-06-26T16:58:20Z","cross_cats_sorted":[],"title_canon_sha256":"f05660010b34991862d4ad5d0ef784d10c5a15542480b9ea3cca4333ffd5706b","abstract_canon_sha256":"076b5fdce0133c1dbc9fa3d56c2226b1e5cc3ceb3b521afa3011331488f4a91d"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:13.601163Z","signature_b64":"qMR1oa3zWyorAIwTo4ggD0r3myqsfwNeYTqFawWj+pZKJB1erbleUDOC1vDWgMxKs/xbhkONXjsbnIKrXHmsCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ff86f91aec75ac767acbaad73876161d00d977ea794c9085178f91385ce9e355","last_reissued_at":"2026-05-17T23:38:13.600412Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:13.600412Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"WildGuard: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of LLMs","license":"http://creativecommons.org/licenses/by/4.0/","headline":"WildGuard is an open moderation tool that detects malicious prompts, response risks, and refusal behaviors in LLMs with accuracy matching or exceeding GPT-4 on key tasks.","cross_cats":[],"primary_cat":"cs.CL","authors_text":"Allyson Ettinger, Bill Yuchen Lin, Kavel Rao, Liwei Jiang, Nathan Lambert, Nouha Dziri, Seungju Han, Yejin Choi","submitted_at":"2024-06-26T16:58:20Z","abstract_excerpt":"We introduce WildGuard -- an open, light-weight moderation tool for LLM safety that achieves three goals: (1) identifying malicious intent in user prompts, (2) detecting safety risks of model responses, and (3) determining model refusal rate. Together, WildGuard serves the increasing needs for automatic safety moderation and evaluation of LLM interactions, providing a one-stop tool with enhanced accuracy and broad coverage across 13 risk categories. While existing open moderation tools such as Llama-Guard2 score reasonably well in classifying straightforward model interactions, they lag far be"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"WildGuard establishes state-of-the-art performance in open-source safety moderation across all the three tasks compared to ten strong existing open-source moderation models (e.g., up to 26.4% improvement on refusal detection). Importantly, WildGuard matches and sometimes exceeds GPT-4 performance (e.g., up to 3.9% improvement on prompt harmfulness identification). WildGuard serves as a highly effective safety moderator in an LLM interface, reducing the success rate of jailbreak attacks from 79.8% to 2.4%.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The human-annotated WildGuardTest set of 5K items and the broader WildGuardMix dataset are representative of real-world user prompts, adversarial jailbreaks, and model behaviors, and that performance gains will generalize beyond the ten public benchmarks evaluated.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"WildGuard is a new open moderation model and dataset for LLM safety that identifies harmful prompts, risky responses, and refusal rates, achieving SOTA open-source performance and sometimes exceeding GPT-4 while cutting jailbreak success from 79.8% to 2.4%.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"WildGuard is an open moderation tool that detects malicious prompts, response risks, and refusal behaviors in LLMs with accuracy matching or exceeding GPT-4 on key tasks.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"5c0103e38097947a2d5fdb8273f4bcb334880538e35b26a43340c989693090d3"},"source":{"id":"2406.18495","kind":"arxiv","version":3},"verdict":{"id":"0b3c3aeb-a2f2-4e3b-b5aa-1d35cbe34f51","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-17T16:19:49.016828Z","strongest_claim":"WildGuard establishes state-of-the-art performance in open-source safety moderation across all the three tasks compared to ten strong existing open-source moderation models (e.g., up to 26.4% improvement on refusal detection). Importantly, WildGuard matches and sometimes exceeds GPT-4 performance (e.g., up to 3.9% improvement on prompt harmfulness identification). WildGuard serves as a highly effective safety moderator in an LLM interface, reducing the success rate of jailbreak attacks from 79.8% to 2.4%.","one_line_summary":"WildGuard is a new open moderation model and dataset for LLM safety that identifies harmful prompts, risky responses, and refusal rates, achieving SOTA open-source performance and sometimes exceeding GPT-4 while cutting jailbreak success from 79.8% to 2.4%.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The human-annotated WildGuardTest set of 5K items and the broader WildGuardMix dataset are representative of real-world user prompts, adversarial jailbreaks, and model behaviors, and that performance gains will generalize beyond the ten public benchmarks evaluated.","pith_extraction_headline":"WildGuard is an open moderation tool that detects malicious prompts, response risks, and refusal behaviors in LLMs with accuracy matching or exceeding GPT-4 on key tasks."},"references":{"count":63,"sample":[{"doi":"","year":2023,"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","ref_index":1,"cited_arxiv_id":"2303.08774","is_internal_anchor":true},{"doi":"","year":2024,"title":"Llama 3 model card","work_id":"c7ab4b73-84eb-419d-9567-20e065974941","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"The claude 3 model family: Opus, sonnet, haiku","work_id":"9f159deb-2679-42f7-b329-e85ade88cb92","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"Transactions on Machine Learning Research , author=","work_id":"da63c249-19b0-4d11-94bf-033eeaa2d43a","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Training a helpful and harmless assistant with reinforcement learning from human feedback, 2022","work_id":"6b9cf002-ab59-4c61-ae20-2d2f7b0eecaf","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":63,"snapshot_sha256":"ca3febef57e702ff33568432713572223e865c2763303544b094592be9c202bc","internal_anchors":9},"formal_canon":{"evidence_count":2,"snapshot_sha256":"270d535674f34295ca4c877594d35ff8028fb82af14f4975d095d8f9fe9bb636"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2406.18495","created_at":"2026-05-17T23:38:13.600541+00:00"},{"alias_kind":"arxiv_version","alias_value":"2406.18495v3","created_at":"2026-05-17T23:38:13.600541+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2406.18495","created_at":"2026-05-17T23:38:13.600541+00:00"},{"alias_kind":"pith_short_12","alias_value":"76DPSGXMOWWH","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"76DPSGXMOWWHM6WL","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"76DPSGXM","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":17,"sample":[{"citing_arxiv_id":"2410.18451","citing_title":"Skywork-Reward: Bag of Tricks for Reward Modeling in LLMs","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12843","citing_title":"Bayesian Model Merging","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12869","citing_title":"Quantifying LLM Safety Degradation Under Repeated Attacks Using Survival Analysis","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2604.06233","citing_title":"Blind Refusal: Language Models Refuse to Help Users Evade Unjust, Absurd, and Illegitimate Rules","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2605.11268","citing_title":"Context-Aware Spear Phishing: Generative AI-Enabled Attacks Against Individuals via Public Social Media Data","ref_index":41,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05277","citing_title":"GLiNER Guard: Unified Encoder Family for Production LLM Safety and Privacy","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00269","citing_title":"How Language Models Process Out-of-Distribution Inputs: A Two-Pathway Framework","ref_index":84,"is_internal_anchor":true},{"citing_arxiv_id":"2604.11943","citing_title":"ProbeLogits: Kernel-Level LLM Inference Primitives for AI-Native Operating Systems","ref_index":26,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07655","citing_title":"Guardian-as-an-Advisor: Advancing Next-Generation Guardian Models for Trustworthy LLMs","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2604.09189","citing_title":"Do LLMs Follow Their Own Rules? A Reflexive Audit of Self-Stated Safety Policies","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2605.07982","citing_title":"GLiGuard: Schema-Conditioned Classification for LLM Safeguard","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.07709","citing_title":"IatroBench: Pre-Registered Evidence of Iatrogenic Harm from AI Safety Measures","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2604.14414","citing_title":"The Autocorrelation Blind Spot: Why 42% of Turn-Level Findings in LLM Conversation Analysis May Be Spurious","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2604.16870","citing_title":"Governed MCP: Kernel-Level Tool Governance for AI Agents via Logit-Based Safety Primitives","ref_index":12,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18473","citing_title":"Train Separately, Merge Together: Modular Post-Training with Mixture-of-Experts","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2604.18976","citing_title":"STAR-Teaming: A Strategy-Response Multiplex Network Approach to Automated LLM Red Teaming","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03226","citing_title":"Self-Mined Hardness for Safety Fine-Tuning","ref_index":7,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU","json":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU.json","graph_json":"https://pith.science/api/pith-number/76DPSGXMOWWHM6WLVLLTQ5QWDU/graph.json","events_json":"https://pith.science/api/pith-number/76DPSGXMOWWHM6WLVLLTQ5QWDU/events.json","paper":"https://pith.science/paper/76DPSGXM"},"agent_actions":{"view_html":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU","download_json":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU.json","view_paper":"https://pith.science/paper/76DPSGXM","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2406.18495&json=true","fetch_graph":"https://pith.science/api/pith-number/76DPSGXMOWWHM6WLVLLTQ5QWDU/graph.json","fetch_events":"https://pith.science/api/pith-number/76DPSGXMOWWHM6WLVLLTQ5QWDU/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU/action/timestamp_anchor","attest_storage":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU/action/storage_attestation","attest_author":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU/action/author_attestation","sign_citation":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU/action/citation_signature","submit_replication":"https://pith.science/pith/76DPSGXMOWWHM6WLVLLTQ5QWDU/action/replication_record"}},"created_at":"2026-05-17T23:38:13.600541+00:00","updated_at":"2026-05-17T23:38:13.600541+00:00"}