{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2023:URDEUTTPEZSNEKU74DMPXLITWV","short_pith_number":"pith:URDEUTTP","schema_version":"1.0","canonical_sha256":"a4464a4e6f2664d22a9fe0d8fbad13b5702b39cc81a492dc2df5e2984b365536","source":{"kind":"arxiv","id":"2309.10253","version":4},"attestation_state":"computed","paper":{"title":"GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Jiahao Yu, Xingwei Lin, Xinyu Xing, Zheng Yu","submitted_at":"2023-09-19T02:19:48Z","abstract_excerpt":"Large language models (LLMs) have recently experienced tremendous popularity and are widely used from casual conversations to AI-driven programming. However, despite their considerable success, LLMs are not entirely reliable and can give detailed guidance on how to conduct harmful or illegal activities. While safety measures can reduce the risk of such outputs, adversarial jailbreak attacks can still exploit LLMs to produce harmful content. These jailbreak templates are typically manually crafted, making large-scale testing challenging.\n  In this paper, we introduce GPTFuzz, a novel black-box "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2309.10253","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2023-09-19T02:19:48Z","cross_cats_sorted":[],"title_canon_sha256":"f270ad3b4049ff8714ccc1bdce3f02e3b264f619e828ae2c1dcfe6a3327c3c48","abstract_canon_sha256":"69e45c26e863bbee04523b9590fe0cf2cbca3f95364bb2b45816b96e13050533"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.271825Z","signature_b64":"UXBbAwfu+hZfX3kcCXE5PsHSSXkUIHtTulPO7Ln7XwqjNLDOBrcXg1uLVweQ+0dkKSLoN1DxtA/eahr091AzAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a4464a4e6f2664d22a9fe0d8fbad13b5702b39cc81a492dc2df5e2984b365536","last_reissued_at":"2026-05-17T23:38:53.271145Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.271145Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Jiahao Yu, Xingwei Lin, Xinyu Xing, Zheng Yu","submitted_at":"2023-09-19T02:19:48Z","abstract_excerpt":"Large language models (LLMs) have recently experienced tremendous popularity and are widely used from casual conversations to AI-driven programming. However, despite their considerable success, LLMs are not entirely reliable and can give detailed guidance on how to conduct harmful or illegal activities. While safety measures can reduce the risk of such outputs, adversarial jailbreak attacks can still exploit LLMs to produce harmful content. These jailbreak templates are typically manually crafted, making large-scale testing challenging.\n  In this paper, we introduce GPTFuzz, a novel black-box "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"GPTFuzz achieves over 90% attack success rates against ChatGPT and Llama-2 models, even with suboptimal initial seed templates.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The judgment model reliably determines jailbreak success without significant false positives or negatives that would inflate reported rates.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GPTFuzz is a black-box fuzzing framework that mutates seed jailbreak templates to automatically generate effective attacks, achieving over 90% success rates on models including ChatGPT and Llama-2.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"0d6fd090272ed0b896953dcab76cdc2d6c59eb4ba3c20be7ddc8280197526cdc"},"source":{"id":"2309.10253","kind":"arxiv","version":4},"verdict":{"id":"c229318d-d20b-44ea-9c96-f95964c5d0e3","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T06:20:52.596251Z","strongest_claim":"GPTFuzz achieves over 90% attack success rates against ChatGPT and Llama-2 models, even with suboptimal initial seed templates.","one_line_summary":"GPTFuzz is a black-box fuzzing framework that mutates seed jailbreak templates to automatically generate effective attacks, achieving over 90% success rates on models including ChatGPT and Llama-2.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The judgment model reliably determines jailbreak success without significant false positives or negatives that would inflate reported rates.","pith_extraction_headline":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent."},"references":{"count":79,"sample":[{"doi":"","year":2023,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","ref_index":1,"cited_arxiv_id":"2305.10403","is_internal_anchor":true},{"doi":"","year":2023,"title":"Introducing claude","work_id":"f58a083e-ea58-4b6f-b253-e406a7bd09bd","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2002,"title":"Finite-time analysis of the multiarmed bandit problem","work_id":"630cb0ba-3ba4-4d2f-b0bb-047985564d41","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Efficient greybox fuzzing to detect memory errors","work_id":"d90dfe04-acd7-4145-9977-b593f0501504","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Spinning language models: Risks of propaganda-as-a-service and countermeasures","work_id":"5177b50c-64a6-478b-9c74-54752a42f98a","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":79,"snapshot_sha256":"84d053092674e20d3a387e740712731f9f64a3efece840bc72a99f3ace4f14aa","internal_anchors":19},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2309.10253","created_at":"2026-05-17T23:38:53.271259+00:00"},{"alias_kind":"arxiv_version","alias_value":"2309.10253v4","created_at":"2026-05-17T23:38:53.271259+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2309.10253","created_at":"2026-05-17T23:38:53.271259+00:00"},{"alias_kind":"pith_short_12","alias_value":"URDEUTTPEZSN","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_16","alias_value":"URDEUTTPEZSNEKU7","created_at":"2026-05-18T12:33:37.589309+00:00"},{"alias_kind":"pith_short_8","alias_value":"URDEUTTP","created_at":"2026-05-18T12:33:37.589309+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":45,"internal_anchor_count":45,"sample":[{"citing_arxiv_id":"2605.23723","citing_title":"MemAudit: Post-hoc Auditing of Poisoned Agent Memory via Causal Attribution and Structural Anomaly Detection","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2405.13068","citing_title":"Uncovering Logit Suppression Vulnerabilities in LLM Safety Alignment","ref_index":27,"is_internal_anchor":true},{"citing_arxiv_id":"2409.18169","citing_title":"Harmful Fine-tuning Attacks and Defenses for Large Language Models: A Survey","ref_index":173,"is_internal_anchor":true},{"citing_arxiv_id":"2502.05206","citing_title":"Safety at Scale: A Comprehensive Survey of Large Model and Agent Safety","ref_index":84,"is_internal_anchor":true},{"citing_arxiv_id":"2503.02574","citing_title":"LLM-Safety Evaluations Lack Robustness","ref_index":58,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21999","citing_title":"Toward Understanding Adversarial Distillation: Why Robust Teachers Fail","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2510.07239","citing_title":"Red-Bandit: Test-Time Adaptation for LLM Red-Teaming via Bandit-Guided LoRA Experts","ref_index":46,"is_internal_anchor":true},{"citing_arxiv_id":"2511.12710","citing_title":"Evolve the Method, Not the Prompts: Evolutionary Synthesis of Jailbreak Attacks on LLMs","ref_index":56,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21362","citing_title":"LASH: Adaptive Semantic Hybridization for Black-Box Jailbreaking of Large Language Models","ref_index":37,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15598","citing_title":"Compositional Jailbreaking: An Empirical Analysis of Mutator Chain Interactions in Aligned LLMs","ref_index":14,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16471","citing_title":"From AI-Generated Content to Agentic Action: Security and Safety Threats in Generative AI","ref_index":150,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16551","citing_title":"PQR: A Framework to Generate Diverse and Realistic User Queries that Elicit QA Agent Failures","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18133","citing_title":"An Empirical Study of Privacy Leakage Chains via Prompt Injection in Black-Box Chatbot Environments","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"2506.12382","citing_title":"Exploring the Secondary Risks of Large Language Models","ref_index":51,"is_internal_anchor":true},{"citing_arxiv_id":"2508.00555","citing_title":"Activation-Guided Local Editing for Jailbreaking Attacks","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2508.11222","citing_title":"ORFuzz: Fuzzing the \"Other Side\" of LLM Safety -- Testing Over-Refusal","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2510.09093","citing_title":"Exploiting Web Search Tools of AI Agents for Data Exfiltration","ref_index":23,"is_internal_anchor":true},{"citing_arxiv_id":"2404.01833","citing_title":"Great, Now Write an Article About That: The Crescendo Multi-Turn LLM Jailbreak Attack","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2511.02356","citing_title":"ASTRA: An Automated Framework for Strategy Discovery, Retrieval, and Evolution for Jailbreaking LLMs","ref_index":55,"is_internal_anchor":true},{"citing_arxiv_id":"2402.10260","citing_title":"A StrongREJECT for Empty Jailbreaks","ref_index":40,"is_internal_anchor":true},{"citing_arxiv_id":"2512.22753","citing_title":"From Rookie to Expert: Manipulating LLMs for Automated Vulnerability Exploitation in Enterprise Software","ref_index":35,"is_internal_anchor":true},{"citing_arxiv_id":"2601.02670","citing_title":"Break Me If You Can: Self-Jailbreaking of Aligned LLMs via Lexical Insertion Prompting","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2602.02280","citing_title":"RACC: Representation-Aware Coverage Criteria for LLM Safety Testing","ref_index":61,"is_internal_anchor":true},{"citing_arxiv_id":"2404.01318","citing_title":"JailbreakBench: An Open Robustness Benchmark for Jailbreaking Large Language Models","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2407.04295","citing_title":"Jailbreak Attacks and Defenses Against Large Language Models: A Survey","ref_index":107,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV","json":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV.json","graph_json":"https://pith.science/api/pith-number/URDEUTTPEZSNEKU74DMPXLITWV/graph.json","events_json":"https://pith.science/api/pith-number/URDEUTTPEZSNEKU74DMPXLITWV/events.json","paper":"https://pith.science/paper/URDEUTTP"},"agent_actions":{"view_html":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV","download_json":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV.json","view_paper":"https://pith.science/paper/URDEUTTP","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2309.10253&json=true","fetch_graph":"https://pith.science/api/pith-number/URDEUTTPEZSNEKU74DMPXLITWV/graph.json","fetch_events":"https://pith.science/api/pith-number/URDEUTTPEZSNEKU74DMPXLITWV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/action/storage_attestation","attest_author":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/action/author_attestation","sign_citation":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/action/citation_signature","submit_replication":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/action/replication_record"}},"created_at":"2026-05-17T23:38:53.271259+00:00","updated_at":"2026-05-17T23:38:53.271259+00:00"}