{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:DX2EIEY3QADJUUPE6IRCNWFJ27","short_pith_number":"pith:DX2EIEY3","schema_version":"1.0","canonical_sha256":"1df444131b80069a51e4f22226d8a9d7f5c813d743d5e0ae524930eb827ae30d","source":{"kind":"arxiv","id":"2410.02832","version":2},"attestation_state":"computed","paper":{"title":"FlipAttack: Jailbreak LLMs via Flipping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Bryan Hooi, Jiaheng Zhang, Jinlan Fu, Miao Xiong, Shumin Deng, Xiaoxin He, Yingwei Ma, Yue Liu","submitted_at":"2024-10-02T08:41:23Z","abstract_excerpt":"This paper proposes a simple yet effective jailbreak attack named FlipAttack against black-box LLMs. First, from the autoregressive nature, we reveal that LLMs tend to understand the text from left to right and find that they struggle to comprehend the text when noise is added to the left side. Motivated by these insights, we propose to disguise the harmful prompt by constructing left-side noise merely based on the prompt itself, then generalize this idea to 4 flipping modes. Second, we verify the strong ability of LLMs to perform the text-flipping task, and then develop 4 variants to guide LL"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2410.02832","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CR","submitted_at":"2024-10-02T08:41:23Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"395d5c3cbb936b2e2cccc30ec075fdfece8c37b5c1a37a2b6bc1ff59533e9724","abstract_canon_sha256":"09dc5f03364e409e108f1d31052f1993e840a4fad2c6d6db29aeb7e708fe87f1"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:16.429638Z","signature_b64":"qa4Akmmup1+ughAjVayjAy+d8myoBImQPG4NEBy8sb6TjQqb8Nkn6IfDX7URl5ENfjJ2GDtOGBDhTUpvUBJCAQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"1df444131b80069a51e4f22226d8a9d7f5c813d743d5e0ae524930eb827ae30d","last_reissued_at":"2026-05-20T00:00:16.428898Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:16.428898Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"FlipAttack: Jailbreak LLMs via Flipping","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Bryan Hooi, Jiaheng Zhang, Jinlan Fu, Miao Xiong, Shumin Deng, Xiaoxin He, Yingwei Ma, Yue Liu","submitted_at":"2024-10-02T08:41:23Z","abstract_excerpt":"This paper proposes a simple yet effective jailbreak attack named FlipAttack against black-box LLMs. First, from the autoregressive nature, we reveal that LLMs tend to understand the text from left to right and find that they struggle to comprehend the text when noise is added to the left side. Motivated by these insights, we propose to disguise the harmful prompt by constructing left-side noise merely based on the prompt itself, then generalize this idea to 4 flipping modes. Second, we verify the strong ability of LLMs to perform the text-flipping task, and then develop 4 variants to guide LL"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2410.02832","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2410.02832/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2410.02832","created_at":"2026-05-20T00:00:16.429004+00:00"},{"alias_kind":"arxiv_version","alias_value":"2410.02832v2","created_at":"2026-05-20T00:00:16.429004+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.02832","created_at":"2026-05-20T00:00:16.429004+00:00"},{"alias_kind":"pith_short_12","alias_value":"DX2EIEY3QADJ","created_at":"2026-05-20T00:00:16.429004+00:00"},{"alias_kind":"pith_short_16","alias_value":"DX2EIEY3QADJUUPE","created_at":"2026-05-20T00:00:16.429004+00:00"},{"alias_kind":"pith_short_8","alias_value":"DX2EIEY3","created_at":"2026-05-20T00:00:16.429004+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":7,"internal_anchor_count":7,"sample":[{"citing_arxiv_id":"2605.21362","citing_title":"LASH: Adaptive Semantic Hybridization for Black-Box Jailbreaking of Large Language Models","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2605.17288","citing_title":"When Efficiency Backfires: Cascading LLMs Trigger Cascade Failure under Adversarial Attack","ref_index":47,"is_internal_anchor":true},{"citing_arxiv_id":"2509.10546","citing_title":"Learning to Conceal Risk: Controllable Multi-turn Red Teaming for LLMs in the Financial Domain","ref_index":21,"is_internal_anchor":true},{"citing_arxiv_id":"2510.20129","citing_title":"SAID: Safety-Aware Intent Defense via Prefix Probing for Large Language Models","ref_index":17,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12813","citing_title":"REALISTA: Realistic Latent Adversarial Attacks that Elicit LLM Hallucinations","ref_index":50,"is_internal_anchor":true},{"citing_arxiv_id":"2604.24082","citing_title":"Jailbreaking Frontier Foundation Models Through Intention Deception","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.05058","citing_title":"SoK: Robustness in Large Language Models against Jailbreak Attacks","ref_index":48,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27","json":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27.json","graph_json":"https://pith.science/api/pith-number/DX2EIEY3QADJUUPE6IRCNWFJ27/graph.json","events_json":"https://pith.science/api/pith-number/DX2EIEY3QADJUUPE6IRCNWFJ27/events.json","paper":"https://pith.science/paper/DX2EIEY3"},"agent_actions":{"view_html":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27","download_json":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27.json","view_paper":"https://pith.science/paper/DX2EIEY3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2410.02832&json=true","fetch_graph":"https://pith.science/api/pith-number/DX2EIEY3QADJUUPE6IRCNWFJ27/graph.json","fetch_events":"https://pith.science/api/pith-number/DX2EIEY3QADJUUPE6IRCNWFJ27/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27/action/timestamp_anchor","attest_storage":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27/action/storage_attestation","attest_author":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27/action/author_attestation","sign_citation":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27/action/citation_signature","submit_replication":"https://pith.science/pith/DX2EIEY3QADJUUPE6IRCNWFJ27/action/replication_record"}},"created_at":"2026-05-20T00:00:16.429004+00:00","updated_at":"2026-05-20T00:00:16.429004+00:00"}