{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:ZFOZZZP7LMECGCNMCJHQ46TN53","short_pith_number":"pith:ZFOZZZP7","schema_version":"1.0","canonical_sha256":"c95d9ce5ff5b082309ac124f0e7a6deef92057d54fb060d1c743dd089ef43c28","source":{"kind":"arxiv","id":"2604.01473","version":3},"attestation_state":"computed","paper":{"title":"SelfGrader: LLM Jailbreak Detection via Anchored Token-Level Logits","license":"http://creativecommons.org/licenses/by/4.0/","headline":"SelfGrader detects jailbreaks by grading queries with logits over digits 0-9","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Jiahao Xu, Olivera Kotevska, Rui Hu, Zikai Zhang","submitted_at":"2026-04-01T23:29:12Z","abstract_excerpt":"Large Language Models (LLMs) are powerful tools for answering user queries, yet they remain highly vulnerable to jailbreak attacks. Existing guardrail methods typically rely on internal features or textual responses to detect malicious queries, which either introduce substantial latency or suffer from randomness in text generation. To overcome these limitations, we propose SelfGrader, a lightweight guardrail method that formulates jailbreak detection as a numerical grading problem using anchored token-level logits. Specifically, SelfGrader evaluates the safety of a user query within a compact "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":true},"canonical_record":{"source":{"id":"2604.01473","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CR","submitted_at":"2026-04-01T23:29:12Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"3a5d1bbce9842ed427c5a1c3ad2092e21d42b6c59d525b8fb56d38e4ae1f4a0e","abstract_canon_sha256":"63e7128930bcca93eca5aaaa87c62467681600e5d0567086b2166f3d6f636b96"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-29T01:05:08.582201Z","signature_b64":"HT3AX3ypMcAbzQ+CSzAyNwjDjFvUeOyxtWKxzbADKjqadkmOIdqg7v5J2EytB1OG3gDuobEL515xxbPQim1xCQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"c95d9ce5ff5b082309ac124f0e7a6deef92057d54fb060d1c743dd089ef43c28","last_reissued_at":"2026-05-29T01:05:08.581411Z","signature_status":"signed_v1","first_computed_at":"2026-05-29T01:05:08.581411Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"SelfGrader: LLM Jailbreak Detection via Anchored Token-Level Logits","license":"http://creativecommons.org/licenses/by/4.0/","headline":"SelfGrader detects jailbreaks by grading queries with logits over digits 0-9","cross_cats":["cs.AI"],"primary_cat":"cs.CR","authors_text":"Jiahao Xu, Olivera Kotevska, Rui Hu, Zikai Zhang","submitted_at":"2026-04-01T23:29:12Z","abstract_excerpt":"Large Language Models (LLMs) are powerful tools for answering user queries, yet they remain highly vulnerable to jailbreak attacks. Existing guardrail methods typically rely on internal features or textual responses to detect malicious queries, which either introduce substantial latency or suffer from randomness in text generation. To overcome these limitations, we propose SelfGrader, a lightweight guardrail method that formulates jailbreak detection as a numerical grading problem using anchored token-level logits. Specifically, SelfGrader evaluates the safety of a user query within a compact "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"SelfGrader achieves up to a 22.66% reduction in ASR on LLaMA-3-8B, while maintaining significantly lower memory overhead (up to 173x) and latency (up to 26x).","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the logit distribution over a fixed set of numerical tokens (0-9) provides a stable, human-aligned signal of query maliciousness without requiring full response generation or access to internal model features.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"SelfGrader detects LLM jailbreaks by interpreting logit distributions on numerical tokens with a dual maliciousness-benignness score, cutting attack success rates up to 22.66% while using up to 173x less memory and 26x less latency.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"SelfGrader detects jailbreaks by grading queries with logits over digits 0-9","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"53fc80ea8bf0b4d3f75a5b8f4ddd023a3cec0c3243e44ae34c0dde725c1ca134"},"source":{"id":"2604.01473","kind":"arxiv","version":3},"verdict":{"id":"827d5bbd-5c18-4ebe-9325-3af4073e8ebf","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-13T21:40:42.490187Z","strongest_claim":"SelfGrader achieves up to a 22.66% reduction in ASR on LLaMA-3-8B, while maintaining significantly lower memory overhead (up to 173x) and latency (up to 26x).","one_line_summary":"SelfGrader detects LLM jailbreaks by interpreting logit distributions on numerical tokens with a dual maliciousness-benignness score, cutting attack success rates up to 22.66% while using up to 173x less memory and 26x less latency.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the logit distribution over a fixed set of numerical tokens (0-9) provides a stable, human-aligned signal of query maliciousness without requiring full response generation or access to internal model features.","pith_extraction_headline":"SelfGrader detects jailbreaks by grading queries with logits over digits 0-9"},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.01473/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"470a584ae5ec187ad78b1794880ec11c27697af2068ecf303ffddf292fa960f3"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2604.01473","created_at":"2026-05-29T01:05:08.581533+00:00"},{"alias_kind":"arxiv_version","alias_value":"2604.01473v3","created_at":"2026-05-29T01:05:08.581533+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.01473","created_at":"2026-05-29T01:05:08.581533+00:00"},{"alias_kind":"pith_short_12","alias_value":"ZFOZZZP7LMEC","created_at":"2026-05-29T01:05:08.581533+00:00"},{"alias_kind":"pith_short_16","alias_value":"ZFOZZZP7LMECGCNM","created_at":"2026-05-29T01:05:08.581533+00:00"},{"alias_kind":"pith_short_8","alias_value":"ZFOZZZP7","created_at":"2026-05-29T01:05:08.581533+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":2,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53","json":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53.json","graph_json":"https://pith.science/api/pith-number/ZFOZZZP7LMECGCNMCJHQ46TN53/graph.json","events_json":"https://pith.science/api/pith-number/ZFOZZZP7LMECGCNMCJHQ46TN53/events.json","paper":"https://pith.science/paper/ZFOZZZP7"},"agent_actions":{"view_html":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53","download_json":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53.json","view_paper":"https://pith.science/paper/ZFOZZZP7","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2604.01473&json=true","fetch_graph":"https://pith.science/api/pith-number/ZFOZZZP7LMECGCNMCJHQ46TN53/graph.json","fetch_events":"https://pith.science/api/pith-number/ZFOZZZP7LMECGCNMCJHQ46TN53/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53/action/storage_attestation","attest_author":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53/action/author_attestation","sign_citation":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53/action/citation_signature","submit_replication":"https://pith.science/pith/ZFOZZZP7LMECGCNMCJHQ46TN53/action/replication_record"}},"created_at":"2026-05-29T01:05:08.581533+00:00","updated_at":"2026-05-29T01:05:08.581533+00:00"}