{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2019:5ZTLN3WA5STZQHC6VVLFHCP7EZ","short_pith_number":"pith:5ZTLN3WA","schema_version":"1.0","canonical_sha256":"ee66b6eec0eca7981c5ead565389ff2655810031a5b94b637601de03046bafef","source":{"kind":"arxiv","id":"1906.04284","version":2},"attestation_state":"computed","paper":{"title":"Analyzing the Structure of Attention in a Transformer Language Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.CL","authors_text":"Jesse Vig, Yonatan Belinkov","submitted_at":"2019-06-07T13:58:49Z","abstract_excerpt":"The Transformer is a fully attention-based alternative to recurrent networks that has achieved state-of-the-art results across a range of NLP tasks. In this paper, we analyze the structure of attention in a Transformer language model, the GPT-2 small pretrained model. We visualize attention for individual instances and analyze the interaction between attention and syntax over a large corpus. We find that attention targets different parts of speech at different layer depths within the model, and that attention aligns with dependency relations most strongly in the middle layers. We also find tha"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"1906.04284","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2019-06-07T13:58:49Z","cross_cats_sorted":["cs.LG","stat.ML"],"title_canon_sha256":"8eeafd9f63e820923c5b67a556ced550aeef57939d10499e97e63c89bb7d538e","abstract_canon_sha256":"d69fae2c12483da60cf2b6f747d2ab7cd8880202cbd983dee46b3ba6e1456232"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:42:58.032307Z","signature_b64":"iAfszis3fU6qxXIg4eqytPsERpd+MBElMbM7rI1Lgzg8FkBPvD7xfhotfCZCAYWOjsfY3LyLo0oFTk0HpAa6Ag==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ee66b6eec0eca7981c5ead565389ff2655810031a5b94b637601de03046bafef","last_reissued_at":"2026-05-17T23:42:58.031617Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:42:58.031617Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Analyzing the Structure of Attention in a Transformer Language Model","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":["cs.LG","stat.ML"],"primary_cat":"cs.CL","authors_text":"Jesse Vig, Yonatan Belinkov","submitted_at":"2019-06-07T13:58:49Z","abstract_excerpt":"The Transformer is a fully attention-based alternative to recurrent networks that has achieved state-of-the-art results across a range of NLP tasks. In this paper, we analyze the structure of attention in a Transformer language model, the GPT-2 small pretrained model. We visualize attention for individual instances and analyze the interaction between attention and syntax over a large corpus. We find that attention targets different parts of speech at different layer depths within the model, and that attention aligns with dependency relations most strongly in the middle layers. We also find tha"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"1906.04284","kind":"arxiv","version":2},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"1906.04284","created_at":"2026-05-17T23:42:58.031746+00:00"},{"alias_kind":"arxiv_version","alias_value":"1906.04284v2","created_at":"2026-05-17T23:42:58.031746+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.1906.04284","created_at":"2026-05-17T23:42:58.031746+00:00"},{"alias_kind":"pith_short_12","alias_value":"5ZTLN3WA5STZ","created_at":"2026-05-18T12:33:10.108867+00:00"},{"alias_kind":"pith_short_16","alias_value":"5ZTLN3WA5STZQHC6","created_at":"2026-05-18T12:33:10.108867+00:00"},{"alias_kind":"pith_short_8","alias_value":"5ZTLN3WA","created_at":"2026-05-18T12:33:10.108867+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":13,"internal_anchor_count":6,"sample":[{"citing_arxiv_id":"2605.23040","citing_title":"Steered Generation via Gradient-Based Optimization on Sparse Query Features","ref_index":43,"is_internal_anchor":true},{"citing_arxiv_id":"2605.23640","citing_title":"CachePrune: Privacy-Aware and Fine-Grained KV Cache Sharing for Efficient LLM Inference","ref_index":57,"is_internal_anchor":true},{"citing_arxiv_id":"2510.10129","citing_title":"CacheClip: Accelerating RAG with Effective KV Cache Reuse","ref_index":33,"is_internal_anchor":true},{"citing_arxiv_id":"2507.05387","citing_title":"The Generalization Ridge: Information Flow in Natural Language Generation","ref_index":15,"is_internal_anchor":true},{"citing_arxiv_id":"2509.25699","citing_title":"AIM-CoT: Active Information-driven Multimodal Chain-of-Thought for Vision-Language Reasoning","ref_index":13,"is_internal_anchor":true},{"citing_arxiv_id":"2605.14192","citing_title":"Why Retrieval-Augmented Generation Fails: A Graph Perspective","ref_index":44,"is_internal_anchor":true},{"citing_arxiv_id":"2604.03957","citing_title":"BWTA: Accurate and Efficient Binarized Transformer by Algorithm-Hardware Co-design","ref_index":1,"is_internal_anchor":false},{"citing_arxiv_id":"2604.28157","citing_title":"FlashRT: Towards Computationally and Memory Efficient Red-Teaming for Prompt Injection and Knowledge Corruption","ref_index":50,"is_internal_anchor":false},{"citing_arxiv_id":"2009.14794","citing_title":"Rethinking Attention with Performers","ref_index":156,"is_internal_anchor":false},{"citing_arxiv_id":"2605.09271","citing_title":"Shaping Schema via Language Representation as the Next Frontier for LLM Intelligence Expanding","ref_index":96,"is_internal_anchor":false},{"citing_arxiv_id":"2605.06611","citing_title":"The Structural Origin of Attention Sink: Variance Discrepancy, Super Neurons, and Dimension Disparity","ref_index":26,"is_internal_anchor":false},{"citing_arxiv_id":"2604.19826","citing_title":"Co-Located Tests, Better AI Code: How Test Syntax Structure Affects Foundation Model Code Generation","ref_index":36,"is_internal_anchor":false},{"citing_arxiv_id":"2605.00847","citing_title":"H-Probes: Extracting Hierarchical Structures From Latent Representations of Language Models","ref_index":20,"is_internal_anchor":false}]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ","json":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ.json","graph_json":"https://pith.science/api/pith-number/5ZTLN3WA5STZQHC6VVLFHCP7EZ/graph.json","events_json":"https://pith.science/api/pith-number/5ZTLN3WA5STZQHC6VVLFHCP7EZ/events.json","paper":"https://pith.science/paper/5ZTLN3WA"},"agent_actions":{"view_html":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ","download_json":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ.json","view_paper":"https://pith.science/paper/5ZTLN3WA","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=1906.04284&json=true","fetch_graph":"https://pith.science/api/pith-number/5ZTLN3WA5STZQHC6VVLFHCP7EZ/graph.json","fetch_events":"https://pith.science/api/pith-number/5ZTLN3WA5STZQHC6VVLFHCP7EZ/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ/action/timestamp_anchor","attest_storage":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ/action/storage_attestation","attest_author":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ/action/author_attestation","sign_citation":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ/action/citation_signature","submit_replication":"https://pith.science/pith/5ZTLN3WA5STZQHC6VVLFHCP7EZ/action/replication_record"}},"created_at":"2026-05-17T23:42:58.031746+00:00","updated_at":"2026-05-17T23:42:58.031746+00:00"}