{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2024:IA2V3ZBSUPZJVBTXTVJ7MKDOQS","short_pith_number":"pith:IA2V3ZBS","schema_version":"1.0","canonical_sha256":"40355de432a3f29a86779d53f6286e84a66de360acc42cc846275ad9ae7a958d","source":{"kind":"arxiv","id":"2404.14082","version":3},"attestation_state":"computed","paper":{"title":"Mechanistic Interpretability for AI Safety -- A Review","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Efstratios Gavves, Leonard Bereska","submitted_at":"2024-04-22T11:01:51Z","abstract_excerpt":"Understanding AI systems' inner workings is critical for ensuring value alignment and safety. This review explores mechanistic interpretability: reverse engineering the computational mechanisms and representations learned by neural networks into human-understandable algorithms and concepts to provide a granular, causal understanding. We establish foundational concepts such as features encoding knowledge within neural activations and hypotheses about their representation and computation. We survey methodologies for causally dissecting model behaviors and assess the relevance of mechanistic inte"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2404.14082","kind":"arxiv","version":3},"metadata":{"license":"http://creativecommons.org/licenses/by-sa/4.0/","primary_cat":"cs.AI","submitted_at":"2024-04-22T11:01:51Z","cross_cats_sorted":[],"title_canon_sha256":"d09de4d41aef7c9cb9e614c76e9bcd3bc085b40b144c35c5feda348312f6f4d9","abstract_canon_sha256":"2ac7b25eafee0ebcde1ec8556a2bace039c5d31e1cd3289e24faedb5d18c2d12"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-22T14:16:37.904289Z","signature_b64":"cOHZIp2+IWtMtnBbg6PZcUxHFGCKOfa7E1ryefoKNS9oyok/gX5ZhHp58hBYdtBSbCq6ntCPxsYyjViG7EoIDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"40355de432a3f29a86779d53f6286e84a66de360acc42cc846275ad9ae7a958d","last_reissued_at":"2026-05-22T14:16:37.901546Z","signature_status":"signed_v1","first_computed_at":"2026-05-22T14:16:37.901546Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Mechanistic Interpretability for AI Safety -- A Review","license":"http://creativecommons.org/licenses/by-sa/4.0/","headline":"","cross_cats":[],"primary_cat":"cs.AI","authors_text":"Efstratios Gavves, Leonard Bereska","submitted_at":"2024-04-22T11:01:51Z","abstract_excerpt":"Understanding AI systems' inner workings is critical for ensuring value alignment and safety. This review explores mechanistic interpretability: reverse engineering the computational mechanisms and representations learned by neural networks into human-understandable algorithms and concepts to provide a granular, causal understanding. We establish foundational concepts such as features encoding knowledge within neural activations and hypotheses about their representation and computation. We survey methodologies for causally dissecting model behaviors and assess the relevance of mechanistic inte"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2404.14082","kind":"arxiv","version":3},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2404.14082/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2404.14082","created_at":"2026-05-22T14:16:37.901675+00:00"},{"alias_kind":"arxiv_version","alias_value":"2404.14082v3","created_at":"2026-05-22T14:16:37.901675+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2404.14082","created_at":"2026-05-22T14:16:37.901675+00:00"},{"alias_kind":"pith_short_12","alias_value":"IA2V3ZBSUPZJ","created_at":"2026-05-22T14:16:37.901675+00:00"},{"alias_kind":"pith_short_16","alias_value":"IA2V3ZBSUPZJVBTX","created_at":"2026-05-22T14:16:37.901675+00:00"},{"alias_kind":"pith_short_8","alias_value":"IA2V3ZBS","created_at":"2026-05-22T14:16:37.901675+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":30,"internal_anchor_count":30,"sample":[{"citing_arxiv_id":"2605.22902","citing_title":"Transcoders Trace Visual Grounding and Hallucinations in Vision-Language Models","ref_index":3,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23565","citing_title":"Understanding Goal Generalisation in Sequential Reinforcement Learning","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23778","citing_title":"The physics of AI weather models","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2408.12935","citing_title":"AI Safety Landscape for Large Language Models: Taxonomy, State-of-the-art, and Future Directions","ref_index":62,"is_internal_anchor":true},{"citing_arxiv_id":"2503.02574","citing_title":"LLM-Safety Evaluations Lack Robustness","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2505.13510","citing_title":"On the definition and importance of interpretability in scientific machine learning","ref_index":5,"is_internal_anchor":true},{"citing_arxiv_id":"2605.21849","citing_title":"Geometry-Adaptive Explainer for Faithful Dictionary-Based Interpretability under Distribution Shift","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2506.18852","citing_title":"Mechanistic Interpretability Needs Philosophy","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2503.11926","citing_title":"Monitoring Reasoning Models for Misbehavior and the Risks of Promoting Obfuscation","ref_index":64,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16374","citing_title":"Lost or Hidden? A Concept-Level Forgetting in Supervised Continual Learning","ref_index":2,"is_internal_anchor":true},{"citing_arxiv_id":"2605.15328","citing_title":"From Weight Perturbation to Feature Attribution for Explaining Fully Connected Neural Networks","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2508.05463","citing_title":"Task complexity shapes internal representations and robustness in neural networks","ref_index":9,"is_internal_anchor":true},{"citing_arxiv_id":"2509.13316","citing_title":"Do Activation Verbalization Methods Convey Privileged Information?","ref_index":10,"is_internal_anchor":true},{"citing_arxiv_id":"2601.14004","citing_title":"Locate, Steer, and Improve: A Practical Survey of Actionable Mechanistic Interpretability in Large Language Models","ref_index":18,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00200","citing_title":"Confidence Estimation in Automatic Short Answer Grading with LLMs","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03976","citing_title":"Quantifying Trust: Financial Risk Management for Trustworthy AI Agents","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2605.12412","citing_title":"Stories in Space: In-Context Learning Trajectories in Conceptual Belief Space","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2605.03598","citing_title":"Unifying Dynamical Systems and Graph Theory to Mechanistically Understand Computation in Neural Networks","ref_index":29,"is_internal_anchor":true},{"citing_arxiv_id":"2605.09129","citing_title":"Data-driven Circuit Discovery for Interpretability of Language Models","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.08910","citing_title":"Enhancing Adversarial Robustness in Network Intrusion Detection: A Layer-wise Adaptive Regularization Approach","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2604.25119","citing_title":"Evaluation without Generation: Non-Generative Assessment of Harmful Model Specialization with Applications to CSAM","ref_index":1,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06610","citing_title":"SoftSAE: Dynamic Top-K Selection for Adaptive Sparse Autoencoders","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.00200","citing_title":"Confidence Estimation in Automatic Short Answer Grading with LLMs","ref_index":4,"is_internal_anchor":true},{"citing_arxiv_id":"2604.19083","citing_title":"ProjLens: Unveiling the Role of Projectors in Multimodal Model Safety","ref_index":124,"is_internal_anchor":true},{"citing_arxiv_id":"2605.06610","citing_title":"SoftSAE: Dynamic Top-K Selection for Adaptive Sparse Autoencoders","ref_index":11,"is_internal_anchor":true}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS","json":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS.json","graph_json":"https://pith.science/api/pith-number/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/graph.json","events_json":"https://pith.science/api/pith-number/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/events.json","paper":"https://pith.science/paper/IA2V3ZBS"},"agent_actions":{"view_html":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS","download_json":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS.json","view_paper":"https://pith.science/paper/IA2V3ZBS","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2404.14082&json=true","fetch_graph":"https://pith.science/api/pith-number/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/graph.json","fetch_events":"https://pith.science/api/pith-number/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/action/timestamp_anchor","attest_storage":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/action/storage_attestation","attest_author":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/action/author_attestation","sign_citation":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/action/citation_signature","submit_replication":"https://pith.science/pith/IA2V3ZBSUPZJVBTXTVJ7MKDOQS/action/replication_record"}},"created_at":"2026-05-22T14:16:37.901675+00:00","updated_at":"2026-05-22T14:16:37.901675+00:00"}