{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:ITV4VLB3MIMLB264T7EQA7J4FY","short_pith_number":"pith:ITV4VLB3","schema_version":"1.0","canonical_sha256":"44ebcaac3b6218b0ebdc9fc9007d3c2e016503bb0be261eb21f01da36f321dd5","source":{"kind":"arxiv","id":"1905.09418","version":2},"attestation_state":"computed","paper":{"title":"Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting, the Rest Can Be Pruned","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"David Talbot, Elena Voita, Fedor Moiseev, Ivan Titov, Rico Sennrich","submitted_at":"2019-05-23T01:13:24Z","abstract_excerpt":"Multi-head self-attention is a key component of the Transformer, a state-of-the-art architecture for neural machine translation. In this work we evaluate the contribution made by individual attention heads in the encoder to the overall performance of the model and analyze the roles played by them. We find that the most important and confident heads play consistent and often linguistically-interpretable roles. When pruning heads using a method based on stochastic gates and a differentiable relaxation of the L0 penalty, we observe that specialized heads are last to be pruned. Our novel pruning m"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1905.09418","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-05-23T01:13:24Z","cross_cats_sorted":[],"title_canon_sha256":"2db19ded3f1868d17795aa5c88183110f3268147151218aae2b2622d76035109","abstract_canon_sha256":"8182ce05840b7a58d9cec218aa4a9cc9be49170aa1db636ce8e4510046c5e4aa"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:43:56.044948Z","signature_b64":"PeKkZ1WI6yYRaD5SoBDBqxLLz18Du68rnzDW7LqSWs3a2SyqXJjbMSQOPi7EEFzbooa/nHG7nLe04aUc14boBw==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"44ebcaac3b6218b0ebdc9fc9007d3c2e016503bb0be261eb21f01da36f321dd5","last_reissued_at":"2026-05-17T23:43:56.044278Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:43:56.044278Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting, the Rest Can Be Pruned","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.CL","authors_text":"David Talbot, Elena Voita, Fedor Moiseev, Ivan Titov, Rico Sennrich","submitted_at":"2019-05-23T01:13:24Z","abstract_excerpt":"Multi-head self-attention is a key component of the Transformer, a state-of-the-art architecture for neural machine translation. In this work we evaluate the contribution made by individual attention heads in the encoder to the overall performance of the model and analyze the roles played by them. We find that the most important and confident heads play consistent and often linguistically-interpretable roles. When pruning heads using a method based on stochastic gates and a differentiable relaxation of the L0 penalty, we observe that specialized heads are last to be pruned. Our novel pruning m"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1905.09418","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1905.09418","created_at":"2026-05-17T23:43:56.044392+00:00"},{"alias_kind":"arxiv_version","alias_value":"1905.09418v2","created_at":"2026-05-17T23:43:56.044392+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1905.09418","created_at":"2026-05-17T23:43:56.044392+00:00"},{"alias_kind":"pith_short_12","alias_value":"ITV4VLB3MIML","created_at":"2026-05-18T12:33:18.533446+00:00"},{"alias_kind":"pith_short_16","alias_value":"ITV4VLB3MIMLB264","created_at":"2026-05-18T12:33:18.533446+00:00"},{"alias_kind":"pith_short_8","alias_value":"ITV4VLB3","created_at":"2026-05-18T12:33:18.533446+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":17,"internal_anchor_count":13,"sample":[{"citing_arxiv_id":"1907.00570","citing_title":"Do Transformer Attention Heads Provide Transparency in Abstractive Summarization?","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2305.07759","citing_title":"TinyStories: How Small Can Language Models Be and Still Speak Coherent English?","ref_index":30,"is_internal_anchor":true},{"citing_arxiv_id":"1907.04614","citing_title":"Let's measure run time! Extending the IR replicability infrastructure to include performance aspects","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2505.13742","citing_title":"Understanding Task Representations in Neural Networks via Bayesian Ablation","ref_index":11,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14738","citing_title":"TAPIOCA: Why Task- Aware Pruning Improves OOD model Capability","ref_index":67,"is_internal_anchor":true},{"citing_arxiv_id":"2602.08686","citing_title":"CompilerKV: Risk-Adaptive KV Compression via Offline Experience Compilation","ref_index":16,"is_internal_anchor":true},{"citing_arxiv_id":"2605.20271","citing_title":"Multi-Head Attention as Ensemble Nadaraya-Watson Estimation: Variance Reduction, Decorrelation, and Optimal Head Diversity","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2605.16470","citing_title":"Strategic Over-Parameterization for Generalizable Low-Rank Adaptation","ref_index":24,"is_internal_anchor":true},{"citing_arxiv_id":"2605.18898","citing_title":"A Two-Parameter Weibull Framework for Diagnosing Transformer Weight Distributions","ref_index":8,"is_internal_anchor":true},{"citing_arxiv_id":"2510.22767","citing_title":"TELL-TALE: Task Efficient LLMs with Task Aware Layer Elimination","ref_index":19,"is_internal_anchor":true},{"citing_arxiv_id":"2512.19219","citing_title":"Selective LoRA for Visual Tokens and Attention Heads","ref_index":22,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14110","citing_title":"SToRe3D: Sparse Token Relevance in ViTs for Efficient Multi-View 3D Object Detection","ref_index":48,"is_internal_anchor":true},{"citing_arxiv_id":"2401.15077","citing_title":"EAGLE: Speculative Sampling Requires Rethinking Feature Uncertainty","ref_index":76,"is_internal_anchor":true},{"citing_arxiv_id":"2505.06708","citing_title":"Gated Attention for Large Language Models: Non-linearity, Sparsity, and Attention-Sink-Free","ref_index":27,"is_internal_anchor":false},{"citing_arxiv_id":"2605.05741","citing_title":"HyperLens: Quantifying Cognitive Effort in LLMs with Fine-grained Confidence Trajectory","ref_index":30,"is_internal_anchor":false},{"citing_arxiv_id":"2209.11895","citing_title":"In-context Learning and Induction Heads","ref_index":9,"is_internal_anchor":false},{"citing_arxiv_id":"2604.06694","citing_title":"AudioKV: KV Cache Eviction in Efficient Large Audio Language Models","ref_index":24,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY","json":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY.json","graph_json":"https://pith.science/api/pith-number/ITV4VLB3MIMLB264T7EQA7J4FY/graph.json","events_json":"https://pith.science/api/pith-number/ITV4VLB3MIMLB264T7EQA7J4FY/events.json","paper":"https://pith.science/paper/ITV4VLB3"},"agent_actions":{"view_html":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY","download_json":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY.json","view_paper":"https://pith.science/paper/ITV4VLB3","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1905.09418&json=true","fetch_graph":"https://pith.science/api/pith-number/ITV4VLB3MIMLB264T7EQA7J4FY/graph.json","fetch_events":"https://pith.science/api/pith-number/ITV4VLB3MIMLB264T7EQA7J4FY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY/action/storage_attestation","attest_author":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY/action/author_attestation","sign_citation":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY/action/citation_signature","submit_replication":"https://pith.science/pith/ITV4VLB3MIMLB264T7EQA7J4FY/action/replication_record"}},"created_at":"2026-05-17T23:43:56.044392+00:00","updated_at":"2026-05-17T23:43:56.044392+00:00"}