{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:6QGFT45FHVLF4BMFA4OC2QMLR5","short_pith_number":"pith:6QGFT45F","schema_version":"1.0","canonical_sha256":"f40c59f3a53d565e0585071c2d418b8f54f470302c2299c0d93120e5d4ef9c7c","source":{"kind":"arxiv","id":"2606.08044","version":1},"attestation_state":"computed","paper":{"title":"When Behavioral Safety Evaluation Fails: A Representation-Level Perspective","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Anders Gj{\\o}lbye, Enyi Jiang, Sanmi Koyejo, Yibo Jacky Zhang","submitted_at":"2026-06-06T08:10:56Z","abstract_excerpt":"Large Language Model (LLM) safety has often been evaluated at the behavior level, which provides limited evidence of internal robustness, as these evaluations target outputs rather than representation-level vulnerability under intervention. We formalize this discrepancy as the audit gap: the difference between behavioral safety and robustness under intervention. To study this gap, we construct dissociated models that preserve safe outward behavior while remaining vulnerable in the latent space. We introduce an intervention-based evaluation framework to test model robustness through soft interv"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.08044","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-06T08:10:56Z","cross_cats_sorted":["cs.AI","cs.CL"],"title_canon_sha256":"9d7dd8d2b38696e6e60f17a6cae9f2f183f6622bf52880217498bbd3c4fd3833","abstract_canon_sha256":"556e1c4aae084ba071779e956758a5bcda175856c95bcc780756c134daa8d5ff"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T01:05:24.237070Z","signature_b64":"+XTlk69gxc3XJ6OsfM/wgNQCSQsRAn2s/Jc835/J9C+k6KHk7gs9+o9U/S1iKL57Sapnq13I6k3jm0Sdk4pZDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f40c59f3a53d565e0585071c2d418b8f54f470302c2299c0d93120e5d4ef9c7c","last_reissued_at":"2026-06-09T01:05:24.236505Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T01:05:24.236505Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"When Behavioral Safety Evaluation Fails: A Representation-Level Perspective","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI","cs.CL"],"primary_cat":"cs.LG","authors_text":"Anders Gj{\\o}lbye, Enyi Jiang, Sanmi Koyejo, Yibo Jacky Zhang","submitted_at":"2026-06-06T08:10:56Z","abstract_excerpt":"Large Language Model (LLM) safety has often been evaluated at the behavior level, which provides limited evidence of internal robustness, as these evaluations target outputs rather than representation-level vulnerability under intervention. We formalize this discrepancy as the audit gap: the difference between behavioral safety and robustness under intervention. To study this gap, we construct dissociated models that preserve safe outward behavior while remaining vulnerable in the latent space. We introduce an intervention-based evaluation framework to test model robustness through soft interv"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.08044","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.08044/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.08044","created_at":"2026-06-09T01:05:24.236611+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.08044v1","created_at":"2026-06-09T01:05:24.236611+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.08044","created_at":"2026-06-09T01:05:24.236611+00:00"},{"alias_kind":"pith_short_12","alias_value":"6QGFT45FHVLF","created_at":"2026-06-09T01:05:24.236611+00:00"},{"alias_kind":"pith_short_16","alias_value":"6QGFT45FHVLF4BMF","created_at":"2026-06-09T01:05:24.236611+00:00"},{"alias_kind":"pith_short_8","alias_value":"6QGFT45F","created_at":"2026-06-09T01:05:24.236611+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5","json":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5.json","graph_json":"https://pith.science/api/pith-number/6QGFT45FHVLF4BMFA4OC2QMLR5/graph.json","events_json":"https://pith.science/api/pith-number/6QGFT45FHVLF4BMFA4OC2QMLR5/events.json","paper":"https://pith.science/paper/6QGFT45F"},"agent_actions":{"view_html":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5","download_json":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5.json","view_paper":"https://pith.science/paper/6QGFT45F","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.08044&json=true","fetch_graph":"https://pith.science/api/pith-number/6QGFT45FHVLF4BMFA4OC2QMLR5/graph.json","fetch_events":"https://pith.science/api/pith-number/6QGFT45FHVLF4BMFA4OC2QMLR5/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5/action/storage_attestation","attest_author":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5/action/author_attestation","sign_citation":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5/action/citation_signature","submit_replication":"https://pith.science/pith/6QGFT45FHVLF4BMFA4OC2QMLR5/action/replication_record"}},"created_at":"2026-06-09T01:05:24.236611+00:00","updated_at":"2026-06-09T01:05:24.236611+00:00"}