{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:C6EUDN3U25TMIQ5TUX2KTOBO64","short_pith_number":"pith:C6EUDN3U","canonical_record":{"source":{"id":"2604.18970","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-04-21T01:39:57Z","cross_cats_sorted":["cs.CR"],"title_canon_sha256":"56b7e575bff4e3ee2ccf1990fe2e81f631958aa86b9fc592e8c9c4aefe84045c","abstract_canon_sha256":"6b94b944d469a720af3d789c59b1219d65cef2bbb7b5f2631fe28224c7ac6106"},"schema_version":"1.0"},"canonical_sha256":"178941b774d766c443b3a5f4a9b82ef73dbb0038b0af31a1b3d564e10ed3230f","source":{"kind":"arxiv","id":"2604.18970","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.18970","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"arxiv_version","alias_value":"2604.18970v2","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.18970","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"pith_short_12","alias_value":"C6EUDN3U25TM","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"pith_short_16","alias_value":"C6EUDN3U25TMIQ5T","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"pith_short_8","alias_value":"C6EUDN3U","created_at":"2026-05-26T02:04:11Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:C6EUDN3U25TMIQ5TUX2KTOBO64","target":"record","payload":{"canonical_record":{"source":{"id":"2604.18970","kind":"arxiv","version":2},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-04-21T01:39:57Z","cross_cats_sorted":["cs.CR"],"title_canon_sha256":"56b7e575bff4e3ee2ccf1990fe2e81f631958aa86b9fc592e8c9c4aefe84045c","abstract_canon_sha256":"6b94b944d469a720af3d789c59b1219d65cef2bbb7b5f2631fe28224c7ac6106"},"schema_version":"1.0"},"canonical_sha256":"178941b774d766c443b3a5f4a9b82ef73dbb0038b0af31a1b3d564e10ed3230f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-26T02:04:11.073011Z","signature_b64":"I0oYN1WsPCnazljzIJ22gw5MkqqgUAuWUSc0gHX2mBPzQ4MVhBfokUQyQFVFasQk6agufesJmSRCvlIzKlb8Aw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"178941b774d766c443b3a5f4a9b82ef73dbb0038b0af31a1b3d564e10ed3230f","last_reissued_at":"2026-05-26T02:04:11.071869Z","signature_status":"signed_v1","first_computed_at":"2026-05-26T02:04:11.071869Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2604.18970","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T02:04:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"agiX6yp5LS7VEizpjV6Jodhi6fFOokxHw7pFq+yPMMMVsBFXzbP3ANZfj7Q5xax6PpSX4NDYinYJQFTBNlAdBA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T10:35:37.314018Z"},"content_sha256":"00ee1caa1b78bed7d331303796d34faec22e0b3ee53a4040dbe28ba5c939631c","schema_version":"1.0","event_id":"sha256:00ee1caa1b78bed7d331303796d34faec22e0b3ee53a4040dbe28ba5c939631c"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:C6EUDN3U25TMIQ5TUX2KTOBO64","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Mechanistic Anomaly Detection via Functional Attribution","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A neural network's output can be checked for anomalous internal mechanisms by measuring how much it depends on a small trusted reference set.","cross_cats":["cs.CR"],"primary_cat":"cs.LG","authors_text":"Christopher Leckie, Hugo Lyons Keenan, Sarah Erfani","submitted_at":"2026-04-21T01:39:57Z","abstract_excerpt":"We can often verify the correctness of neural network outputs using ground truth labels, but we cannot reliably determine whether the output was produced by normal or anomalous internal mechanisms. Mechanistic anomaly detection (MAD) aims to flag these cases, but existing methods either depend on latent space analysis, which is vulnerable to obfuscation, or are specific to particular architectures and modalities. We reframe MAD as a functional attribution problem: asking to what extent samples from a trusted set can explain the model's output, where attribution failure signals anomalous behavi"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We reframe MAD as a functional attribution problem: asking to what extent samples from a trusted set can explain the model's output, where attribution failure signals anomalous behavior. We operationalize this using influence functions... For backdoors in vision models, our method achieves state-of-the-art detection on BackdoorBench, with an average Defense Effectiveness Rating (DER) of 0.93 across seven attacks and four datasets (next best 0.83).","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That failure of influence-function-based attribution to a trusted reference set reliably indicates anomalous internal mechanisms rather than other causes such as high model uncertainty or distribution shift.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Functional attribution with influence functions detects anomalous mechanisms in neural networks, achieving SOTA backdoor detection (average DER 0.93) on vision benchmarks and improvements on LLMs.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A neural network's output can be checked for anomalous internal mechanisms by measuring how much it depends on a small trusted reference set.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"ff606b44e7f207fcd56fe438f8155b77034c651f0d55b942545ddd4b596a499e"},"source":{"id":"2604.18970","kind":"arxiv","version":2},"verdict":{"id":"30d444cb-b155-41b5-93d9-9b1f2015f109","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-10T02:56:21.895971Z","strongest_claim":"We reframe MAD as a functional attribution problem: asking to what extent samples from a trusted set can explain the model's output, where attribution failure signals anomalous behavior. We operationalize this using influence functions... For backdoors in vision models, our method achieves state-of-the-art detection on BackdoorBench, with an average Defense Effectiveness Rating (DER) of 0.93 across seven attacks and four datasets (next best 0.83).","one_line_summary":"Functional attribution with influence functions detects anomalous mechanisms in neural networks, achieving SOTA backdoor detection (average DER 0.93) on vision benchmarks and improvements on LLMs.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That failure of influence-function-based attribution to a trusted reference set reliably indicates anomalous internal mechanisms rather than other causes such as high model uncertainty or distribution shift.","pith_extraction_headline":"A neural network's output can be checked for anomalous internal mechanisms by measuring how much it depends on a small trusted reference set."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2604.18970/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_compliance","ran_at":"2026-05-20T03:26:06.383502Z","status":"completed","version":"1.0.0","findings_count":0}],"snapshot_sha256":"bd450f722ec6f478c5901878244fdb269d1cc9f26ed47c6e85f3f4520ecac2b6"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"30d444cb-b155-41b5-93d9-9b1f2015f109"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-26T02:04:11Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"e4+plzAAoxUCXz2qD06r2AnpJvdMxzhRVcJG0WHUsvQEkt7+TyGegsPzOeqF55Zl/iiglsJGaSKbtFLelx6gAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-01T10:35:37.314517Z"},"content_sha256":"a49cbe86796486fa0b58b58cc139fdc00ca0a3ad8873f41c1c405b7d56e956bb","schema_version":"1.0","event_id":"sha256:a49cbe86796486fa0b58b58cc139fdc00ca0a3ad8873f41c1c405b7d56e956bb"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/C6EUDN3U25TMIQ5TUX2KTOBO64/bundle.json","state_url":"https://pith.science/pith/C6EUDN3U25TMIQ5TUX2KTOBO64/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/C6EUDN3U25TMIQ5TUX2KTOBO64/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-01T10:35:37Z","links":{"resolver":"https://pith.science/pith/C6EUDN3U25TMIQ5TUX2KTOBO64","bundle":"https://pith.science/pith/C6EUDN3U25TMIQ5TUX2KTOBO64/bundle.json","state":"https://pith.science/pith/C6EUDN3U25TMIQ5TUX2KTOBO64/state.json","well_known_bundle":"https://pith.science/.well-known/pith/C6EUDN3U25TMIQ5TUX2KTOBO64/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:C6EUDN3U25TMIQ5TUX2KTOBO64","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6b94b944d469a720af3d789c59b1219d65cef2bbb7b5f2631fe28224c7ac6106","cross_cats_sorted":["cs.CR"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-04-21T01:39:57Z","title_canon_sha256":"56b7e575bff4e3ee2ccf1990fe2e81f631958aa86b9fc592e8c9c4aefe84045c"},"schema_version":"1.0","source":{"id":"2604.18970","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2604.18970","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"arxiv_version","alias_value":"2604.18970v2","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2604.18970","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"pith_short_12","alias_value":"C6EUDN3U25TM","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"pith_short_16","alias_value":"C6EUDN3U25TMIQ5T","created_at":"2026-05-26T02:04:11Z"},{"alias_kind":"pith_short_8","alias_value":"C6EUDN3U","created_at":"2026-05-26T02:04:11Z"}],"graph_snapshots":[{"event_id":"sha256:a49cbe86796486fa0b58b58cc139fdc00ca0a3ad8873f41c1c405b7d56e956bb","target":"graph","created_at":"2026-05-26T02:04:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We reframe MAD as a functional attribution problem: asking to what extent samples from a trusted set can explain the model's output, where attribution failure signals anomalous behavior. We operationalize this using influence functions... For backdoors in vision models, our method achieves state-of-the-art detection on BackdoorBench, with an average Defense Effectiveness Rating (DER) of 0.93 across seven attacks and four datasets (next best 0.83)."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That failure of influence-function-based attribution to a trusted reference set reliably indicates anomalous internal mechanisms rather than other causes such as high model uncertainty or distribution shift."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Functional attribution with influence functions detects anomalous mechanisms in neural networks, achieving SOTA backdoor detection (average DER 0.93) on vision benchmarks and improvements on LLMs."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A neural network's output can be checked for anomalous internal mechanisms by measuring how much it depends on a small trusted reference set."}],"snapshot_sha256":"ff606b44e7f207fcd56fe438f8155b77034c651f0d55b942545ddd4b596a499e"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[{"findings_count":0,"name":"doi_compliance","ran_at":"2026-05-20T03:26:06.383502Z","status":"completed","version":"1.0.0"}],"endpoint":"/pith/2604.18970/integrity.json","findings":[],"snapshot_sha256":"bd450f722ec6f478c5901878244fdb269d1cc9f26ed47c6e85f3f4520ecac2b6","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"We can often verify the correctness of neural network outputs using ground truth labels, but we cannot reliably determine whether the output was produced by normal or anomalous internal mechanisms. Mechanistic anomaly detection (MAD) aims to flag these cases, but existing methods either depend on latent space analysis, which is vulnerable to obfuscation, or are specific to particular architectures and modalities. We reframe MAD as a functional attribution problem: asking to what extent samples from a trusted set can explain the model's output, where attribution failure signals anomalous behavi","authors_text":"Christopher Leckie, Hugo Lyons Keenan, Sarah Erfani","cross_cats":["cs.CR"],"headline":"A neural network's output can be checked for anomalous internal mechanisms by measuring how much it depends on a small trusted reference set.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-04-21T01:39:57Z","title":"Mechanistic Anomaly Detection via Functional Attribution"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2604.18970","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-10T02:56:21.895971Z","id":"30d444cb-b155-41b5-93d9-9b1f2015f109","model_set":{"reader":"grok-4.3"},"one_line_summary":"Functional attribution with influence functions detects anomalous mechanisms in neural networks, achieving SOTA backdoor detection (average DER 0.93) on vision benchmarks and improvements on LLMs.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A neural network's output can be checked for anomalous internal mechanisms by measuring how much it depends on a small trusted reference set.","strongest_claim":"We reframe MAD as a functional attribution problem: asking to what extent samples from a trusted set can explain the model's output, where attribution failure signals anomalous behavior. We operationalize this using influence functions... For backdoors in vision models, our method achieves state-of-the-art detection on BackdoorBench, with an average Defense Effectiveness Rating (DER) of 0.93 across seven attacks and four datasets (next best 0.83).","weakest_assumption":"That failure of influence-function-based attribution to a trusted reference set reliably indicates anomalous internal mechanisms rather than other causes such as high model uncertainty or distribution shift."}},"verdict_id":"30d444cb-b155-41b5-93d9-9b1f2015f109"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:00ee1caa1b78bed7d331303796d34faec22e0b3ee53a4040dbe28ba5c939631c","target":"record","created_at":"2026-05-26T02:04:11Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6b94b944d469a720af3d789c59b1219d65cef2bbb7b5f2631fe28224c7ac6106","cross_cats_sorted":["cs.CR"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-04-21T01:39:57Z","title_canon_sha256":"56b7e575bff4e3ee2ccf1990fe2e81f631958aa86b9fc592e8c9c4aefe84045c"},"schema_version":"1.0","source":{"id":"2604.18970","kind":"arxiv","version":2}},"canonical_sha256":"178941b774d766c443b3a5f4a9b82ef73dbb0038b0af31a1b3d564e10ed3230f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"178941b774d766c443b3a5f4a9b82ef73dbb0038b0af31a1b3d564e10ed3230f","first_computed_at":"2026-05-26T02:04:11.071869Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-26T02:04:11.071869Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"I0oYN1WsPCnazljzIJ22gw5MkqqgUAuWUSc0gHX2mBPzQ4MVhBfokUQyQFVFasQk6agufesJmSRCvlIzKlb8Aw==","signature_status":"signed_v1","signed_at":"2026-05-26T02:04:11.073011Z","signed_message":"canonical_sha256_bytes"},"source_id":"2604.18970","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:00ee1caa1b78bed7d331303796d34faec22e0b3ee53a4040dbe28ba5c939631c","sha256:a49cbe86796486fa0b58b58cc139fdc00ca0a3ad8873f41c1c405b7d56e956bb"],"state_sha256":"fcb923b62e84bf3bac7eabbc7b7f9a979b369d6a8efca98afd766e30a3c04acf"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"OsiFbFPUfCuEnBK3e31rYFiWyGmlD06uinuHJEAJNzxkbXJbBn/qB3cz37qs5Igc4WFwUtqYbvSLg/mRutO+Aw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-01T10:35:37.316719Z","bundle_sha256":"f3d8ab13a54ef3e1cccaf5091396648eac079e6d571c184d9c63489607cd54ad"}}