{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:URDEUTTPEZSNEKU74DMPXLITWV","short_pith_number":"pith:URDEUTTP","canonical_record":{"source":{"id":"2309.10253","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2023-09-19T02:19:48Z","cross_cats_sorted":[],"title_canon_sha256":"f270ad3b4049ff8714ccc1bdce3f02e3b264f619e828ae2c1dcfe6a3327c3c48","abstract_canon_sha256":"69e45c26e863bbee04523b9590fe0cf2cbca3f95364bb2b45816b96e13050533"},"schema_version":"1.0"},"canonical_sha256":"a4464a4e6f2664d22a9fe0d8fbad13b5702b39cc81a492dc2df5e2984b365536","source":{"kind":"arxiv","id":"2309.10253","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2309.10253","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2309.10253v4","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2309.10253","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"URDEUTTPEZSN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"URDEUTTPEZSNEKU7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"URDEUTTP","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:URDEUTTPEZSNEKU74DMPXLITWV","target":"record","payload":{"canonical_record":{"source":{"id":"2309.10253","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2023-09-19T02:19:48Z","cross_cats_sorted":[],"title_canon_sha256":"f270ad3b4049ff8714ccc1bdce3f02e3b264f619e828ae2c1dcfe6a3327c3c48","abstract_canon_sha256":"69e45c26e863bbee04523b9590fe0cf2cbca3f95364bb2b45816b96e13050533"},"schema_version":"1.0"},"canonical_sha256":"a4464a4e6f2664d22a9fe0d8fbad13b5702b39cc81a492dc2df5e2984b365536","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:53.271825Z","signature_b64":"UXBbAwfu+hZfX3kcCXE5PsHSSXkUIHtTulPO7Ln7XwqjNLDOBrcXg1uLVweQ+0dkKSLoN1DxtA/eahr091AzAw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"a4464a4e6f2664d22a9fe0d8fbad13b5702b39cc81a492dc2df5e2984b365536","last_reissued_at":"2026-05-17T23:38:53.271145Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:53.271145Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2309.10253","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"b58KioAo2SV2rar0/B1bKXS9gz12/eNMccKv7bG+CU2GaPPLY7TWJbomrvoyjoaGgy9wQycuG2Zr2/kaGUBqAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T12:50:53.028574Z"},"content_sha256":"70dd550a3be15376bb5bfad31b9cbff266fe89f8e3ee3dcfe6c492ef170c3c3e","schema_version":"1.0","event_id":"sha256:70dd550a3be15376bb5bfad31b9cbff266fe89f8e3ee3dcfe6c492ef170c3c3e"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:URDEUTTPEZSNEKU74DMPXLITWV","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent.","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Jiahao Yu, Xingwei Lin, Xinyu Xing, Zheng Yu","submitted_at":"2023-09-19T02:19:48Z","abstract_excerpt":"Large language models (LLMs) have recently experienced tremendous popularity and are widely used from casual conversations to AI-driven programming. However, despite their considerable success, LLMs are not entirely reliable and can give detailed guidance on how to conduct harmful or illegal activities. While safety measures can reduce the risk of such outputs, adversarial jailbreak attacks can still exploit LLMs to produce harmful content. These jailbreak templates are typically manually crafted, making large-scale testing challenging.\n  In this paper, we introduce GPTFuzz, a novel black-box "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"GPTFuzz achieves over 90% attack success rates against ChatGPT and Llama-2 models, even with suboptimal initial seed templates.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The judgment model reliably determines jailbreak success without significant false positives or negatives that would inflate reported rates.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"GPTFuzz is a black-box fuzzing framework that mutates seed jailbreak templates to automatically generate effective attacks, achieving over 90% success rates on models including ChatGPT and Llama-2.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"0d6fd090272ed0b896953dcab76cdc2d6c59eb4ba3c20be7ddc8280197526cdc"},"source":{"id":"2309.10253","kind":"arxiv","version":4},"verdict":{"id":"c229318d-d20b-44ea-9c96-f95964c5d0e3","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T06:20:52.596251Z","strongest_claim":"GPTFuzz achieves over 90% attack success rates against ChatGPT and Llama-2 models, even with suboptimal initial seed templates.","one_line_summary":"GPTFuzz is a black-box fuzzing framework that mutates seed jailbreak templates to automatically generate effective attacks, achieving over 90% success rates on models including ChatGPT and Llama-2.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The judgment model reliably determines jailbreak success without significant false positives or negatives that would inflate reported rates.","pith_extraction_headline":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent."},"references":{"count":79,"sample":[{"doi":"","year":2023,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","ref_index":1,"cited_arxiv_id":"2305.10403","is_internal_anchor":true},{"doi":"","year":2023,"title":"Introducing claude","work_id":"f58a083e-ea58-4b6f-b253-e406a7bd09bd","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2002,"title":"Finite-time analysis of the multiarmed bandit problem","work_id":"630cb0ba-3ba4-4d2f-b0bb-047985564d41","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Efficient greybox fuzzing to detect memory errors","work_id":"d90dfe04-acd7-4145-9977-b593f0501504","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"Spinning language models: Risks of propaganda-as-a-service and countermeasures","work_id":"5177b50c-64a6-478b-9c74-54752a42f98a","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":79,"snapshot_sha256":"84d053092674e20d3a387e740712731f9f64a3efece840bc72a99f3ace4f14aa","internal_anchors":19},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"c229318d-d20b-44ea-9c96-f95964c5d0e3"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:53Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"A/FFXug6C5OwX4WAqeigrbowfn/Xta95yUv5SByShRnsZ/qQ/+B2j58OLWU3/BsNpLtoQ2+7fDIUOf1qMdrpCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-05T12:50:53.029109Z"},"content_sha256":"9f04f5cc3e02d0d641e323f9cc2551ef6ad6f06c6ae635ab406060cb71e8bb5b","schema_version":"1.0","event_id":"sha256:9f04f5cc3e02d0d641e323f9cc2551ef6ad6f06c6ae635ab406060cb71e8bb5b"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/bundle.json","state_url":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/URDEUTTPEZSNEKU74DMPXLITWV/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-05T12:50:53Z","links":{"resolver":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV","bundle":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/bundle.json","state":"https://pith.science/pith/URDEUTTPEZSNEKU74DMPXLITWV/state.json","well_known_bundle":"https://pith.science/.well-known/pith/URDEUTTPEZSNEKU74DMPXLITWV/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:URDEUTTPEZSNEKU74DMPXLITWV","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"69e45c26e863bbee04523b9590fe0cf2cbca3f95364bb2b45816b96e13050533","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2023-09-19T02:19:48Z","title_canon_sha256":"f270ad3b4049ff8714ccc1bdce3f02e3b264f619e828ae2c1dcfe6a3327c3c48"},"schema_version":"1.0","source":{"id":"2309.10253","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2309.10253","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"arxiv_version","alias_value":"2309.10253v4","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2309.10253","created_at":"2026-05-17T23:38:53Z"},{"alias_kind":"pith_short_12","alias_value":"URDEUTTPEZSN","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"URDEUTTPEZSNEKU7","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"URDEUTTP","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:9f04f5cc3e02d0d641e323f9cc2551ef6ad6f06c6ae635ab406060cb71e8bb5b","target":"graph","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"GPTFuzz achieves over 90% attack success rates against ChatGPT and Llama-2 models, even with suboptimal initial seed templates."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The judgment model reliably determines jailbreak success without significant false positives or negatives that would inflate reported rates."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"GPTFuzz is a black-box fuzzing framework that mutates seed jailbreak templates to automatically generate effective attacks, achieving over 90% success rates on models including ChatGPT and Llama-2."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent."}],"snapshot_sha256":"0d6fd090272ed0b896953dcab76cdc2d6c59eb4ba3c20be7ddc8280197526cdc"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"paper":{"abstract_excerpt":"Large language models (LLMs) have recently experienced tremendous popularity and are widely used from casual conversations to AI-driven programming. However, despite their considerable success, LLMs are not entirely reliable and can give detailed guidance on how to conduct harmful or illegal activities. While safety measures can reduce the risk of such outputs, adversarial jailbreak attacks can still exploit LLMs to produce harmful content. These jailbreak templates are typically manually crafted, making large-scale testing challenging.\n  In this paper, we introduce GPTFuzz, a novel black-box ","authors_text":"Jiahao Yu, Xingwei Lin, Xinyu Xing, Zheng Yu","cross_cats":[],"headline":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2023-09-19T02:19:48Z","title":"GPTFUZZER: Red Teaming Large Language Models with Auto-Generated Jailbreak Prompts"},"references":{"count":79,"internal_anchors":19,"resolved_work":79,"sample":[{"cited_arxiv_id":"2305.10403","doi":"","is_internal_anchor":true,"ref_index":1,"title":"PaLM 2 Technical Report","work_id":"905ee9a7-ea61-4a94-bd62-2600cbe3e315","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Introducing claude","work_id":"f58a083e-ea58-4b6f-b253-e406a7bd09bd","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Finite-time analysis of the multiarmed bandit problem","work_id":"630cb0ba-3ba4-4d2f-b0bb-047985564d41","year":2002},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Efficient greybox fuzzing to detect memory errors","work_id":"d90dfe04-acd7-4145-9977-b593f0501504","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Spinning language models: Risks of propaganda-as-a-service and countermeasures","work_id":"5177b50c-64a6-478b-9c74-54752a42f98a","year":2022}],"snapshot_sha256":"84d053092674e20d3a387e740712731f9f64a3efece840bc72a99f3ace4f14aa"},"source":{"id":"2309.10253","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-15T06:20:52.596251Z","id":"c229318d-d20b-44ea-9c96-f95964c5d0e3","model_set":{"reader":"grok-4.3"},"one_line_summary":"GPTFuzz is a black-box fuzzing framework that mutates seed jailbreak templates to automatically generate effective attacks, achieving over 90% success rates on models including ChatGPT and Llama-2.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Automated fuzzing of human-written jailbreak seeds produces templates that succeed against ChatGPT and Llama-2 at rates above 90 percent.","strongest_claim":"GPTFuzz achieves over 90% attack success rates against ChatGPT and Llama-2 models, even with suboptimal initial seed templates.","weakest_assumption":"The judgment model reliably determines jailbreak success without significant false positives or negatives that would inflate reported rates."}},"verdict_id":"c229318d-d20b-44ea-9c96-f95964c5d0e3"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:70dd550a3be15376bb5bfad31b9cbff266fe89f8e3ee3dcfe6c492ef170c3c3e","target":"record","created_at":"2026-05-17T23:38:53Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"69e45c26e863bbee04523b9590fe0cf2cbca3f95364bb2b45816b96e13050533","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.AI","submitted_at":"2023-09-19T02:19:48Z","title_canon_sha256":"f270ad3b4049ff8714ccc1bdce3f02e3b264f619e828ae2c1dcfe6a3327c3c48"},"schema_version":"1.0","source":{"id":"2309.10253","kind":"arxiv","version":4}},"canonical_sha256":"a4464a4e6f2664d22a9fe0d8fbad13b5702b39cc81a492dc2df5e2984b365536","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"a4464a4e6f2664d22a9fe0d8fbad13b5702b39cc81a492dc2df5e2984b365536","first_computed_at":"2026-05-17T23:38:53.271145Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:53.271145Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"UXBbAwfu+hZfX3kcCXE5PsHSSXkUIHtTulPO7Ln7XwqjNLDOBrcXg1uLVweQ+0dkKSLoN1DxtA/eahr091AzAw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:53.271825Z","signed_message":"canonical_sha256_bytes"},"source_id":"2309.10253","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:70dd550a3be15376bb5bfad31b9cbff266fe89f8e3ee3dcfe6c492ef170c3c3e","sha256:9f04f5cc3e02d0d641e323f9cc2551ef6ad6f06c6ae635ab406060cb71e8bb5b"],"state_sha256":"180c91cfb83f6701416bd2dfd1c3fe56b046b79416d34b353e63156cb835af61"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"O+1GR4zmAVoFx8cSkMT6E+Q4lnqyWEnb8yOhcKzBca5LgNLzsNSlNqjrKHlfdo3DH2ippLh58lEWtblNSyT2Bw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-05T12:50:53.031437Z","bundle_sha256":"76c3a61d22528dbcb4bf2c2155514a8a7485563e7d3ad7160e8e13f7820d3b04"}}