{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:TVUCJQGMRXM3TGG3S374KNKC5O","short_pith_number":"pith:TVUCJQGM","canonical_record":{"source":{"id":"2410.10781","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-14T17:50:28Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"d94b2f2cd6602889efecb1db67bf2c4693e4447abef13036e2cea27a37dca6bf","abstract_canon_sha256":"7c80d83b8a3966af2e289441d1d38931b96ecd5a368fa4d60804305042bffc1e"},"schema_version":"1.0"},"canonical_sha256":"9d6824c0cc8dd9b998db96ffc53542eba5f42e8716777a66b04f72875b43c9d0","source":{"kind":"arxiv","id":"2410.10781","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.10781","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2410.10781v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.10781","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"TVUCJQGMRXM3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"TVUCJQGMRXM3TGG3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"TVUCJQGM","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:TVUCJQGMRXM3TGG3S374KNKC5O","target":"record","payload":{"canonical_record":{"source":{"id":"2410.10781","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-14T17:50:28Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"d94b2f2cd6602889efecb1db67bf2c4693e4447abef13036e2cea27a37dca6bf","abstract_canon_sha256":"7c80d83b8a3966af2e289441d1d38931b96ecd5a368fa4d60804305042bffc1e"},"schema_version":"1.0"},"canonical_sha256":"9d6824c0cc8dd9b998db96ffc53542eba5f42e8716777a66b04f72875b43c9d0","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:47.095666Z","signature_b64":"6i3JYS8lfur2DevB2/zEGwICHODXGIv8iQYkxtt4+l3c6W2bQooCWAcMbKtpUgWdDhVUA0U0MjTdFhj4J3t8Cg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9d6824c0cc8dd9b998db96ffc53542eba5f42e8716777a66b04f72875b43c9d0","last_reissued_at":"2026-05-17T23:38:47.095115Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:47.095115Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2410.10781","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"4rRwqdkSAO5zFAPIzMHdbYmqjG80J6NioCbTrHQvo5zfB6WBJ7YyN14cm1Q9uWFN7u1ecmTqEWsb+ExuNebZAg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T23:19:25.668989Z"},"content_sha256":"5469393bb0cfa160e052c0ecf42cd89ff75daf3a3c15ecd447470bcbb4587b05","schema_version":"1.0","event_id":"sha256:5469393bb0cfa160e052c0ecf42cd89ff75daf3a3c15ecd447470bcbb4587b05"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:TVUCJQGMRXM3TGG3S374KNKC5O","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"When Attention Sink Emerges in Language Models: An Empirical View","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Attention sinks in language models emerge from softmax normalization and act as key biases storing non-informative scores.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Chao Du, Cunxiao Du, Fengzhuo Zhang, Min Lin, Qian Liu, Tianyu Pang, Xiangming Gu, Ye Wang","submitted_at":"2024-10-14T17:50:28Z","abstract_excerpt":"Language Models (LMs) assign significant attention to the first token, even if it is not semantically important, which is known as attention sink. This phenomenon has been widely adopted in applications such as streaming/long context generation, KV cache optimization, inference acceleration, model quantization, and others. Despite its widespread use, a deep understanding of attention sink in LMs is still lacking. In this work, we first demonstrate that attention sinks exist universally in LMs with various inputs, even in small models. Furthermore, attention sink is observed to emerge during th"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We find that attention sink acts more like key biases, storing extra attention scores, which could be non-informative and not contribute to the value computation. We also observe that this phenomenon (at least partially) stems from tokens' inner dependence on attention scores as a result of softmax normalization. After relaxing such dependence by replacing softmax attention with other attention operations, such as sigmoid attention without normalization, attention sinks do not emerge in LMs up to 1B parameters.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the lack of attention sinks observed with sigmoid attention in models up to 1B parameters will hold for larger models and will not degrade overall language modeling performance or capabilities.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Attention sinks emerge in language models from softmax-induced token dependence on attention scores and do not appear when using sigmoid attention without normalization in models up to 1B parameters.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Attention sinks in language models emerge from softmax normalization and act as key biases storing non-informative scores.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"af591eb976b8c114ef3aaf342785e0cdfd06411bad71bd9d108b4fdcadee57e9"},"source":{"id":"2410.10781","kind":"arxiv","version":2},"verdict":{"id":"1af5a9b6-0ec6-46a5-b9b2-235e039bb29a","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T17:37:06.531295Z","strongest_claim":"We find that attention sink acts more like key biases, storing extra attention scores, which could be non-informative and not contribute to the value computation. We also observe that this phenomenon (at least partially) stems from tokens' inner dependence on attention scores as a result of softmax normalization. After relaxing such dependence by replacing softmax attention with other attention operations, such as sigmoid attention without normalization, attention sinks do not emerge in LMs up to 1B parameters.","one_line_summary":"Attention sinks emerge in language models from softmax-induced token dependence on attention scores and do not appear when using sigmoid attention without normalization in models up to 1B parameters.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the lack of attention sinks observed with sigmoid attention in models up to 1B parameters will hold for larger models and will not degrade overall language modeling performance or capabilities.","pith_extraction_headline":"Attention sinks in language models emerge from softmax normalization and act as key biases storing non-informative scores."},"references":{"count":64,"sample":[{"doi":"","year":2016,"title":"Layer Normalization","work_id":"20a2d720-0046-4c7c-bcd6-327ec8143f69","ref_index":1,"cited_arxiv_id":"1607.06450","is_internal_anchor":true},{"doi":"","year":2023,"title":"Pythia: A suite for analyzing large language models across training and scaling","work_id":"1b00948d-e7ad-48f6-b8b0-3c85545d533a","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2023,"title":"Quantizable transformers: Removing outliers by helping attention heads do nothing","work_id":"91169195-3dc7-4415-9eab-faaaffd157bf","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2020,"title":"Language models are few-shot learners","work_id":"bf2aec88-6d9e-4f77-a6a3-9a828d524cc0","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2024,"title":"URL https://arxiv","work_id":"7acfb02e-83f1-4e11-b68f-a198807fcfcb","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":64,"snapshot_sha256":"7f25552db18cd366f200db49ddad42abceb86ffbe7036be7193e210000fc4e5d","internal_anchors":22},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d5546f34c8f18b157b0520e24f2662d92c798d9ae4ad27b3d27f995e67e55da1"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"1af5a9b6-0ec6-46a5-b9b2-235e039bb29a"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:47Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"nUqnsTGqICjuANmiy4TiUyIWCu7pH+iU6mVyfMuA3/zoens3WiItuYO6DFgewlSUcChCHqHIbDUouLGbXswgCw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-06-08T23:19:25.669511Z"},"content_sha256":"b93a335eff7a8130b17536348eb0f5d0db49777c9a328e067977e8d37fc20020","schema_version":"1.0","event_id":"sha256:b93a335eff7a8130b17536348eb0f5d0db49777c9a328e067977e8d37fc20020"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/TVUCJQGMRXM3TGG3S374KNKC5O/bundle.json","state_url":"https://pith.science/pith/TVUCJQGMRXM3TGG3S374KNKC5O/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/TVUCJQGMRXM3TGG3S374KNKC5O/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-06-08T23:19:25Z","links":{"resolver":"https://pith.science/pith/TVUCJQGMRXM3TGG3S374KNKC5O","bundle":"https://pith.science/pith/TVUCJQGMRXM3TGG3S374KNKC5O/bundle.json","state":"https://pith.science/pith/TVUCJQGMRXM3TGG3S374KNKC5O/state.json","well_known_bundle":"https://pith.science/.well-known/pith/TVUCJQGMRXM3TGG3S374KNKC5O/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:TVUCJQGMRXM3TGG3S374KNKC5O","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"7c80d83b8a3966af2e289441d1d38931b96ecd5a368fa4d60804305042bffc1e","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-14T17:50:28Z","title_canon_sha256":"d94b2f2cd6602889efecb1db67bf2c4693e4447abef13036e2cea27a37dca6bf"},"schema_version":"1.0","source":{"id":"2410.10781","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2410.10781","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"arxiv_version","alias_value":"2410.10781v2","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2410.10781","created_at":"2026-05-17T23:38:47Z"},{"alias_kind":"pith_short_12","alias_value":"TVUCJQGMRXM3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"TVUCJQGMRXM3TGG3","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"TVUCJQGM","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:b93a335eff7a8130b17536348eb0f5d0db49777c9a328e067977e8d37fc20020","target":"graph","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We find that attention sink acts more like key biases, storing extra attention scores, which could be non-informative and not contribute to the value computation. We also observe that this phenomenon (at least partially) stems from tokens' inner dependence on attention scores as a result of softmax normalization. After relaxing such dependence by replacing softmax attention with other attention operations, such as sigmoid attention without normalization, attention sinks do not emerge in LMs up to 1B parameters."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the lack of attention sinks observed with sigmoid attention in models up to 1B parameters will hold for larger models and will not degrade overall language modeling performance or capabilities."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Attention sinks emerge in language models from softmax-induced token dependence on attention scores and do not appear when using sigmoid attention without normalization in models up to 1B parameters."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Attention sinks in language models emerge from softmax normalization and act as key biases storing non-informative scores."}],"snapshot_sha256":"af591eb976b8c114ef3aaf342785e0cdfd06411bad71bd9d108b4fdcadee57e9"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"d5546f34c8f18b157b0520e24f2662d92c798d9ae4ad27b3d27f995e67e55da1"},"paper":{"abstract_excerpt":"Language Models (LMs) assign significant attention to the first token, even if it is not semantically important, which is known as attention sink. This phenomenon has been widely adopted in applications such as streaming/long context generation, KV cache optimization, inference acceleration, model quantization, and others. Despite its widespread use, a deep understanding of attention sink in LMs is still lacking. In this work, we first demonstrate that attention sinks exist universally in LMs with various inputs, even in small models. Furthermore, attention sink is observed to emerge during th","authors_text":"Chao Du, Cunxiao Du, Fengzhuo Zhang, Min Lin, Qian Liu, Tianyu Pang, Xiangming Gu, Ye Wang","cross_cats":["cs.AI","cs.LG"],"headline":"Attention sinks in language models emerge from softmax normalization and act as key biases storing non-informative scores.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-14T17:50:28Z","title":"When Attention Sink Emerges in Language Models: An Empirical View"},"references":{"count":64,"internal_anchors":22,"resolved_work":64,"sample":[{"cited_arxiv_id":"1607.06450","doi":"","is_internal_anchor":true,"ref_index":1,"title":"Layer Normalization","work_id":"20a2d720-0046-4c7c-bcd6-327ec8143f69","year":2016},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Pythia: A suite for analyzing large language models across training and scaling","work_id":"1b00948d-e7ad-48f6-b8b0-3c85545d533a","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Quantizable transformers: Removing outliers by helping attention heads do nothing","work_id":"91169195-3dc7-4415-9eab-faaaffd157bf","year":2023},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Language models are few-shot learners","work_id":"bf2aec88-6d9e-4f77-a6a3-9a828d524cc0","year":2020},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"URL https://arxiv","work_id":"7acfb02e-83f1-4e11-b68f-a198807fcfcb","year":2024}],"snapshot_sha256":"7f25552db18cd366f200db49ddad42abceb86ffbe7036be7193e210000fc4e5d"},"source":{"id":"2410.10781","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T17:37:06.531295Z","id":"1af5a9b6-0ec6-46a5-b9b2-235e039bb29a","model_set":{"reader":"grok-4.3"},"one_line_summary":"Attention sinks emerge in language models from softmax-induced token dependence on attention scores and do not appear when using sigmoid attention without normalization in models up to 1B parameters.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Attention sinks in language models emerge from softmax normalization and act as key biases storing non-informative scores.","strongest_claim":"We find that attention sink acts more like key biases, storing extra attention scores, which could be non-informative and not contribute to the value computation. We also observe that this phenomenon (at least partially) stems from tokens' inner dependence on attention scores as a result of softmax normalization. After relaxing such dependence by replacing softmax attention with other attention operations, such as sigmoid attention without normalization, attention sinks do not emerge in LMs up to 1B parameters.","weakest_assumption":"That the lack of attention sinks observed with sigmoid attention in models up to 1B parameters will hold for larger models and will not degrade overall language modeling performance or capabilities."}},"verdict_id":"1af5a9b6-0ec6-46a5-b9b2-235e039bb29a"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:5469393bb0cfa160e052c0ecf42cd89ff75daf3a3c15ecd447470bcbb4587b05","target":"record","created_at":"2026-05-17T23:38:47Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"7c80d83b8a3966af2e289441d1d38931b96ecd5a368fa4d60804305042bffc1e","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-10-14T17:50:28Z","title_canon_sha256":"d94b2f2cd6602889efecb1db67bf2c4693e4447abef13036e2cea27a37dca6bf"},"schema_version":"1.0","source":{"id":"2410.10781","kind":"arxiv","version":2}},"canonical_sha256":"9d6824c0cc8dd9b998db96ffc53542eba5f42e8716777a66b04f72875b43c9d0","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"9d6824c0cc8dd9b998db96ffc53542eba5f42e8716777a66b04f72875b43c9d0","first_computed_at":"2026-05-17T23:38:47.095115Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:47.095115Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"6i3JYS8lfur2DevB2/zEGwICHODXGIv8iQYkxtt4+l3c6W2bQooCWAcMbKtpUgWdDhVUA0U0MjTdFhj4J3t8Cg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:47.095666Z","signed_message":"canonical_sha256_bytes"},"source_id":"2410.10781","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:5469393bb0cfa160e052c0ecf42cd89ff75daf3a3c15ecd447470bcbb4587b05","sha256:b93a335eff7a8130b17536348eb0f5d0db49777c9a328e067977e8d37fc20020"],"state_sha256":"a0dfba8b1009b615dc2dde8fa356785f58a1900a19cb706e1e73b7ad1c530261"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"hNkt/QpBdBveiwCmvM3d/xLY0+8dNlYA9JNZlqa6jMb3KXvDbn4ZEOohWB8bBCAYFCLDTaDhzH61IvDo5qjDCA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-06-08T23:19:25.671847Z","bundle_sha256":"f0a1930e094ae4a43c6efc260155242c2a474dd94840d23d59889b5de00f4cbd"}}