{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:MSOHZ3U3YVK73336KDB63ARSYN","short_pith_number":"pith:MSOHZ3U3","schema_version":"1.0","canonical_sha256":"649c7cee9bc555fdef7e50c3ed8232c376b00f92e6a1ca7c93a0b01ce81a6f6e","source":{"kind":"arxiv","id":"2606.25487","version":1},"attestation_state":"computed","paper":{"title":"How Reliable Is Your Jailbreak Judge? Calibration and Adversarial Robustness of Automated ASR Scoring","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CR","cs.LG"],"primary_cat":"cs.CL","authors_text":"Yang Gao (Veyon Solutions)","submitted_at":"2026-06-24T07:14:17Z","abstract_excerpt":"Almost every paper on LLM jailbreaks and prompt injection reports an attack-success rate (ASR), and that number is assigned not by people but by an automated judge: either a safety classifier trained for the task, or a general chat model prompted to grade. The judge is rarely checked. We check it. Using 596 human-labeled completions from the HarmBench classifier validation set, we compare the two judge families against human majority votes and then attack them. The two families fail in opposite ways. The dedicated classifier over-flags (precision 0.835, recall 0.974); three different LLM-as-ju"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.25487","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-24T07:14:17Z","cross_cats_sorted":["cs.CR","cs.LG"],"title_canon_sha256":"cb5e08266c076433ac86a93f1ae9b4fbcc47899b16086f6c5a182c927c0cd335","abstract_canon_sha256":"14820fb28566d62f73c61ac7b5b5db9ca24dc15eb15aa2d65f88b8fe5eb4258b"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-25T01:18:06.656154Z","signature_b64":"+wxnJ+r6dpyYaTiL0iWrQn2bVo4LgIl9qNv7zJaCmbkezp+1OFlKFtBfDvEOkva++BgDaVfYStfXiQUPI0OBBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"649c7cee9bc555fdef7e50c3ed8232c376b00f92e6a1ca7c93a0b01ce81a6f6e","last_reissued_at":"2026-06-25T01:18:06.655756Z","signature_status":"signed_v1","first_computed_at":"2026-06-25T01:18:06.655756Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"How Reliable Is Your Jailbreak Judge? Calibration and Adversarial Robustness of Automated ASR Scoring","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.CR","cs.LG"],"primary_cat":"cs.CL","authors_text":"Yang Gao (Veyon Solutions)","submitted_at":"2026-06-24T07:14:17Z","abstract_excerpt":"Almost every paper on LLM jailbreaks and prompt injection reports an attack-success rate (ASR), and that number is assigned not by people but by an automated judge: either a safety classifier trained for the task, or a general chat model prompted to grade. The judge is rarely checked. We check it. Using 596 human-labeled completions from the HarmBench classifier validation set, we compare the two judge families against human majority votes and then attack them. The two families fail in opposite ways. The dedicated classifier over-flags (precision 0.835, recall 0.974); three different LLM-as-ju"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.25487","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.25487/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.25487","created_at":"2026-06-25T01:18:06.655826+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.25487v1","created_at":"2026-06-25T01:18:06.655826+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.25487","created_at":"2026-06-25T01:18:06.655826+00:00"},{"alias_kind":"pith_short_12","alias_value":"MSOHZ3U3YVK7","created_at":"2026-06-25T01:18:06.655826+00:00"},{"alias_kind":"pith_short_16","alias_value":"MSOHZ3U3YVK73336","created_at":"2026-06-25T01:18:06.655826+00:00"},{"alias_kind":"pith_short_8","alias_value":"MSOHZ3U3","created_at":"2026-06-25T01:18:06.655826+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN","json":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN.json","graph_json":"https://pith.science/api/pith-number/MSOHZ3U3YVK73336KDB63ARSYN/graph.json","events_json":"https://pith.science/api/pith-number/MSOHZ3U3YVK73336KDB63ARSYN/events.json","paper":"https://pith.science/paper/MSOHZ3U3"},"agent_actions":{"view_html":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN","download_json":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN.json","view_paper":"https://pith.science/paper/MSOHZ3U3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.25487&json=true","fetch_graph":"https://pith.science/api/pith-number/MSOHZ3U3YVK73336KDB63ARSYN/graph.json","fetch_events":"https://pith.science/api/pith-number/MSOHZ3U3YVK73336KDB63ARSYN/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN/action/timestamp_anchor","attest_storage":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN/action/storage_attestation","attest_author":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN/action/author_attestation","sign_citation":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN/action/citation_signature","submit_replication":"https://pith.science/pith/MSOHZ3U3YVK73336KDB63ARSYN/action/replication_record"}},"created_at":"2026-06-25T01:18:06.655826+00:00","updated_at":"2026-06-25T01:18:06.655826+00:00"}