{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2023:K2QZLT2REVSUB3AKRWQNHHV3QG","short_pith_number":"pith:K2QZLT2R","canonical_record":{"source":{"id":"2310.03684","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-10-05T17:01:53Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"89fc6da0f5aff8ded674be064675018faded353b0b657723c5963c1f63e1f125","abstract_canon_sha256":"ae3f73f4124184aa1324e4578cd1c2228d3f9a62f7c6ffbe48b6caee68cb70d8"},"schema_version":"1.0"},"canonical_sha256":"56a195cf51256540ec0a8da0d39ebb81ab9c96f184cadfe0760f34be92bd8147","source":{"kind":"arxiv","id":"2310.03684","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2310.03684","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"arxiv_version","alias_value":"2310.03684v4","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.03684","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"pith_short_12","alias_value":"K2QZLT2REVSU","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"K2QZLT2REVSUB3AK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"K2QZLT2R","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2023:K2QZLT2REVSUB3AKRWQNHHV3QG","target":"record","payload":{"canonical_record":{"source":{"id":"2310.03684","kind":"arxiv","version":4},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-10-05T17:01:53Z","cross_cats_sorted":["cs.AI","stat.ML"],"title_canon_sha256":"89fc6da0f5aff8ded674be064675018faded353b0b657723c5963c1f63e1f125","abstract_canon_sha256":"ae3f73f4124184aa1324e4578cd1c2228d3f9a62f7c6ffbe48b6caee68cb70d8"},"schema_version":"1.0"},"canonical_sha256":"56a195cf51256540ec0a8da0d39ebb81ab9c96f184cadfe0760f34be92bd8147","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:39:22.348925Z","signature_b64":"3xIOSlwmNIBSzaNak1TM66LLfDDxxOsf/hV3K4QLy0OGdO5Is6vTsUszQlDAmY87IYN+QcB0YzvedfxlZq5aCw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"56a195cf51256540ec0a8da0d39ebb81ab9c96f184cadfe0760f34be92bd8147","last_reissued_at":"2026-05-17T23:39:22.348164Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:39:22.348164Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2310.03684","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:22Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"e/2seR0W93EHOxbuVV7l2nnJ6qOuEXOjR708HN2WUmoSOY8WBx0sgFyCBWlR7yroMzuXWSHJUgeNYt4sjwBpDg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T15:07:16.845293Z"},"content_sha256":"15d6930ff75af9c5b489c392452fe8471403155e043e17bff7a1f528ba233c87","schema_version":"1.0","event_id":"sha256:15d6930ff75af9c5b489c392452fe8471403155e043e17bff7a1f528ba233c87"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2023:K2QZLT2REVSUB3AKRWQNHHV3QG","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"SmoothLLM defends large language models against jailbreaking by perturbing input prompts at the character level and aggregating multiple responses.","cross_cats":["cs.AI","stat.ML"],"primary_cat":"cs.LG","authors_text":"Alexander Robey, Eric Wong, George J. Pappas, Hamed Hassani","submitted_at":"2023-10-05T17:01:53Z","abstract_excerpt":"Despite efforts to align large language models (LLMs) with human intentions, widely-used LLMs such as GPT, Llama, and Claude are susceptible to jailbreaking attacks, wherein an adversary fools a targeted LLM into generating objectionable content. To address this vulnerability, we propose SmoothLLM, the first algorithm designed to mitigate jailbreaking attacks. Based on our finding that adversarially-generated prompts are brittle to character-level changes, our defense randomly perturbs multiple copies of a given input prompt, and then aggregates the corresponding predictions to detect adversar"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Across a range of popular LLMs, SmoothLLM sets the state-of-the-art for robustness against the GCG, PAIR, RandomSearch, and AmpleGCG jailbreaks.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"Adversarially-generated prompts are brittle to character-level changes, which is the core empirical finding used to justify random perturbation and aggregation.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SmoothLLM mitigates jailbreaking attacks on LLMs by randomly perturbing multiple copies of a prompt at the character level and aggregating the outputs to detect adversarial inputs.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"SmoothLLM defends large language models against jailbreaking by perturbing input prompts at the character level and aggregating multiple responses.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"793c580c3c59968310459d493ce95a6d2cf89bb537d5efd5929a73d0aea503aa"},"source":{"id":"2310.03684","kind":"arxiv","version":4},"verdict":{"id":"36bd2177-17e8-4757-b4a4-86798714b5be","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T17:05:31.423908Z","strongest_claim":"Across a range of popular LLMs, SmoothLLM sets the state-of-the-art for robustness against the GCG, PAIR, RandomSearch, and AmpleGCG jailbreaks.","one_line_summary":"SmoothLLM mitigates jailbreaking attacks on LLMs by randomly perturbing multiple copies of a prompt at the character level and aggregating the outputs to detect adversarial inputs.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"Adversarially-generated prompts are brittle to character-level changes, which is the core empirical finding used to justify random perturbation and aggregation.","pith_extraction_headline":"SmoothLLM defends large language models against jailbreaking by perturbing input prompts at the character level and aggregating multiple responses."},"references":{"count":91,"sample":[{"doi":"","year":2009,"title":"RealToxicityPrompts: Evaluating Neural Toxic Degeneration in Language Models","work_id":"6a137b3a-68fe-4f2e-aad1-ca042346408f","ref_index":1,"cited_arxiv_id":"2009.11462","is_internal_anchor":true},{"doi":"","year":2016,"title":"The ai alignment problem: why it is hard, and where to start","work_id":"afbc50a2-46bb-4a39-9aca-47eeb613457a","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Artificial intelligence, values, and alignment","work_id":"d9c231dd-dfd0-4b73-bda4-8fc8c7ad2a5f","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"The alignment problem: Machine learning and human values","work_id":"13d2d97d-4163-4819-9541-d3968ab50a98","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Regulating chatgpt and other large generative ai models","work_id":"2dcc9e2c-5741-43a9-8f9f-c90551ceb9aa","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":91,"snapshot_sha256":"99b01a8e219fb2b6416486402c9771452dae43eea4271e1604b827848230119b","internal_anchors":23},"formal_canon":{"evidence_count":2,"snapshot_sha256":"498243ea3a4e56c45c2fc2e8d519270374d343ca0189021052f3c8335a926eae"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"36bd2177-17e8-4757-b4a4-86798714b5be"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:39:22Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"tClyD48wMV8euUG9KfrhOS8uZyVYZgnu2/t9YQVp6tRFoV9hr2pQm5jYd358BxdlR3VgnrQnlvrPt4JGro1rCA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-02T15:07:16.845851Z"},"content_sha256":"596efe95e69703fc7e880a5a04f3e6f270511deaa4b4ff122a07e423081c0c5a","schema_version":"1.0","event_id":"sha256:596efe95e69703fc7e880a5a04f3e6f270511deaa4b4ff122a07e423081c0c5a"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/K2QZLT2REVSUB3AKRWQNHHV3QG/bundle.json","state_url":"https://pith.science/pith/K2QZLT2REVSUB3AKRWQNHHV3QG/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/K2QZLT2REVSUB3AKRWQNHHV3QG/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-02T15:07:16Z","links":{"resolver":"https://pith.science/pith/K2QZLT2REVSUB3AKRWQNHHV3QG","bundle":"https://pith.science/pith/K2QZLT2REVSUB3AKRWQNHHV3QG/bundle.json","state":"https://pith.science/pith/K2QZLT2REVSUB3AKRWQNHHV3QG/state.json","well_known_bundle":"https://pith.science/.well-known/pith/K2QZLT2REVSUB3AKRWQNHHV3QG/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2023:K2QZLT2REVSUB3AKRWQNHHV3QG","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"ae3f73f4124184aa1324e4578cd1c2228d3f9a62f7c6ffbe48b6caee68cb70d8","cross_cats_sorted":["cs.AI","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-10-05T17:01:53Z","title_canon_sha256":"89fc6da0f5aff8ded674be064675018faded353b0b657723c5963c1f63e1f125"},"schema_version":"1.0","source":{"id":"2310.03684","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2310.03684","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"arxiv_version","alias_value":"2310.03684v4","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2310.03684","created_at":"2026-05-17T23:39:22Z"},{"alias_kind":"pith_short_12","alias_value":"K2QZLT2REVSU","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"K2QZLT2REVSUB3AK","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"K2QZLT2R","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:596efe95e69703fc7e880a5a04f3e6f270511deaa4b4ff122a07e423081c0c5a","target":"graph","created_at":"2026-05-17T23:39:22Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across a range of popular LLMs, SmoothLLM sets the state-of-the-art for robustness against the GCG, PAIR, RandomSearch, and AmpleGCG jailbreaks."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"Adversarially-generated prompts are brittle to character-level changes, which is the core empirical finding used to justify random perturbation and aggregation."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"SmoothLLM mitigates jailbreaking attacks on LLMs by randomly perturbing multiple copies of a prompt at the character level and aggregating the outputs to detect adversarial inputs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"SmoothLLM defends large language models against jailbreaking by perturbing input prompts at the character level and aggregating multiple responses."}],"snapshot_sha256":"793c580c3c59968310459d493ce95a6d2cf89bb537d5efd5929a73d0aea503aa"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"498243ea3a4e56c45c2fc2e8d519270374d343ca0189021052f3c8335a926eae"},"paper":{"abstract_excerpt":"Despite efforts to align large language models (LLMs) with human intentions, widely-used LLMs such as GPT, Llama, and Claude are susceptible to jailbreaking attacks, wherein an adversary fools a targeted LLM into generating objectionable content. To address this vulnerability, we propose SmoothLLM, the first algorithm designed to mitigate jailbreaking attacks. Based on our finding that adversarially-generated prompts are brittle to character-level changes, our defense randomly perturbs multiple copies of a given input prompt, and then aggregates the corresponding predictions to detect adversar","authors_text":"Alexander Robey, Eric Wong, George J. Pappas, Hamed Hassani","cross_cats":["cs.AI","stat.ML"],"headline":"SmoothLLM defends large language models against jailbreaking by perturbing input prompts at the character level and aggregating multiple responses.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-10-05T17:01:53Z","title":"SmoothLLM: Defending Large Language Models Against Jailbreaking Attacks"},"references":{"count":91,"internal_anchors":23,"resolved_work":91,"sample":[{"cited_arxiv_id":"2009.11462","doi":"","is_internal_anchor":true,"ref_index":1,"title":"RealToxicityPrompts: Evaluating Neural Toxic Degeneration in Language Models","work_id":"6a137b3a-68fe-4f2e-aad1-ca042346408f","year":2009},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"The ai alignment problem: why it is hard, and where to start","work_id":"afbc50a2-46bb-4a39-9aca-47eeb613457a","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Artificial intelligence, values, and alignment","work_id":"d9c231dd-dfd0-4b73-bda4-8fc8c7ad2a5f","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"The alignment problem: Machine learning and human values","work_id":"13d2d97d-4163-4819-9541-d3968ab50a98","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Regulating chatgpt and other large generative ai models","work_id":"2dcc9e2c-5741-43a9-8f9f-c90551ceb9aa","year":2023}],"snapshot_sha256":"99b01a8e219fb2b6416486402c9771452dae43eea4271e1604b827848230119b"},"source":{"id":"2310.03684","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-14T17:05:31.423908Z","id":"36bd2177-17e8-4757-b4a4-86798714b5be","model_set":{"reader":"grok-4.3"},"one_line_summary":"SmoothLLM mitigates jailbreaking attacks on LLMs by randomly perturbing multiple copies of a prompt at the character level and aggregating the outputs to detect adversarial inputs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"SmoothLLM defends large language models against jailbreaking by perturbing input prompts at the character level and aggregating multiple responses.","strongest_claim":"Across a range of popular LLMs, SmoothLLM sets the state-of-the-art for robustness against the GCG, PAIR, RandomSearch, and AmpleGCG jailbreaks.","weakest_assumption":"Adversarially-generated prompts are brittle to character-level changes, which is the core empirical finding used to justify random perturbation and aggregation."}},"verdict_id":"36bd2177-17e8-4757-b4a4-86798714b5be"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:15d6930ff75af9c5b489c392452fe8471403155e043e17bff7a1f528ba233c87","target":"record","created_at":"2026-05-17T23:39:22Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"ae3f73f4124184aa1324e4578cd1c2228d3f9a62f7c6ffbe48b6caee68cb70d8","cross_cats_sorted":["cs.AI","stat.ML"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2023-10-05T17:01:53Z","title_canon_sha256":"89fc6da0f5aff8ded674be064675018faded353b0b657723c5963c1f63e1f125"},"schema_version":"1.0","source":{"id":"2310.03684","kind":"arxiv","version":4}},"canonical_sha256":"56a195cf51256540ec0a8da0d39ebb81ab9c96f184cadfe0760f34be92bd8147","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"56a195cf51256540ec0a8da0d39ebb81ab9c96f184cadfe0760f34be92bd8147","first_computed_at":"2026-05-17T23:39:22.348164Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:39:22.348164Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"3xIOSlwmNIBSzaNak1TM66LLfDDxxOsf/hV3K4QLy0OGdO5Is6vTsUszQlDAmY87IYN+QcB0YzvedfxlZq5aCw==","signature_status":"signed_v1","signed_at":"2026-05-17T23:39:22.348925Z","signed_message":"canonical_sha256_bytes"},"source_id":"2310.03684","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:15d6930ff75af9c5b489c392452fe8471403155e043e17bff7a1f528ba233c87","sha256:596efe95e69703fc7e880a5a04f3e6f270511deaa4b4ff122a07e423081c0c5a"],"state_sha256":"d2a9feb16158be49113e46422ae7dbf4b29ec818aede486048f2f68d7f607468"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"PLjRc2+2B3lP35iQfb8if2AvC9D4V0gzL2EYA8hmbhXGg/i8URjvYoFqi/MrLiHXeQmDFiuykWQqWutvbZVmDg==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-02T15:07:16.848454Z","bundle_sha256":"cff95af69478200b9754d4266e3f628618b80bc40dad9e6790a978f70d0e1111"}}