{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2024:MDJKPL5S3IEMCXYQ4Z5CCVQHIW","short_pith_number":"pith:MDJKPL5S","canonical_record":{"source":{"id":"2402.17762","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-02-27T18:55:17Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"1375592bd25780fa45da9e4a454856fb6a1918f2dfb6bdb9df98135e4a994fe3","abstract_canon_sha256":"161a0a2b92c9dbee51eb5242b3c2633f8c7a752a67326dc218027ce16a6a8324"},"schema_version":"1.0"},"canonical_sha256":"60d2a7afb2da08c15f10e67a215607459bca6ed57194e20a0f3dbc5b94bfe664","source":{"kind":"arxiv","id":"2402.17762","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2402.17762","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2402.17762v2","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2402.17762","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"MDJKPL5S3IEM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MDJKPL5S3IEMCXYQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MDJKPL5S","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2024:MDJKPL5S3IEMCXYQ4Z5CCVQHIW","target":"record","payload":{"canonical_record":{"source":{"id":"2402.17762","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-02-27T18:55:17Z","cross_cats_sorted":["cs.LG"],"title_canon_sha256":"1375592bd25780fa45da9e4a454856fb6a1918f2dfb6bdb9df98135e4a994fe3","abstract_canon_sha256":"161a0a2b92c9dbee51eb5242b3c2633f8c7a752a67326dc218027ce16a6a8324"},"schema_version":"1.0"},"canonical_sha256":"60d2a7afb2da08c15f10e67a215607459bca6ed57194e20a0f3dbc5b94bfe664","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:48.755502Z","signature_b64":"GgFiq7mdE76RFcNs1uUbR1Z0tly4e5FEesYi+dyI23wKDnMVTBq6qtEeuT17oHQAS8CV5SJZb7faiD71Gy1qDQ==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"60d2a7afb2da08c15f10e67a215607459bca6ed57194e20a0f3dbc5b94bfe664","last_reissued_at":"2026-05-17T23:38:48.754966Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:48.754966Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2402.17762","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"a1BE30tI9zj9NuKOu7P0tmtLkyLpQGh1ixFFfVNhj05dTp0BsaoXKKC9DFbvpI4ndXHi6erVdFGXe5U39NbwDQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T13:34:39.262242Z"},"content_sha256":"d14ec2ac97124c038b036a7c68466edbfa65125e944eee8509b3283ea25ff546","schema_version":"1.0","event_id":"sha256:d14ec2ac97124c038b036a7c68466edbfa65125e944eee8509b3283ea25ff546"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2024:MDJKPL5S3IEMCXYQ4Z5CCVQHIW","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Massive Activations in Large Language Models","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"Large language models contain a small number of massive activations that remain constant across inputs and act as indispensable bias terms.","cross_cats":["cs.LG"],"primary_cat":"cs.CL","authors_text":"J. Zico Kolter, Mingjie Sun, Xinlei Chen, Zhuang Liu","submitted_at":"2024-02-27T18:55:17Z","abstract_excerpt":"We observe an empirical phenomenon in Large Language Models (LLMs) -- very few activations exhibit significantly larger values than others (e.g., 100,000 times larger). We call them massive activations. First, we demonstrate the widespread existence of massive activations across various LLMs and characterize their locations. Second, we find their values largely stay constant regardless of the input, and they function as indispensable bias terms in LLMs. Third, these massive activations lead to the concentration of attention probabilities to their corresponding tokens, and further, implicit bia"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"very few activations exhibit significantly larger values than others (e.g., 100,000 times larger). We call them massive activations... their values largely stay constant regardless of the input, and they function as indispensable bias terms in LLMs... these massive activations lead to the concentration of attention probabilities to their corresponding tokens.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the observed constancy of massive activation values and their role as indispensable bias terms generalize across all LLMs, inputs, and architectures based on the limited set of models and characterizations performed.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Massive activations are constant large values in LLMs that function as indispensable bias terms and concentrate attention probabilities on specific tokens.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Large language models contain a small number of massive activations that remain constant across inputs and act as indispensable bias terms.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"3f0879792246673acf58211c8ccf21e12ff53c73017da131da2ffc3fde0c743c"},"source":{"id":"2402.17762","kind":"arxiv","version":2},"verdict":{"id":"5b852ca6-442a-49a6-a777-e5fa78bd9382","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-16T06:59:19.393219Z","strongest_claim":"very few activations exhibit significantly larger values than others (e.g., 100,000 times larger). We call them massive activations... their values largely stay constant regardless of the input, and they function as indispensable bias terms in LLMs... these massive activations lead to the concentration of attention probabilities to their corresponding tokens.","one_line_summary":"Massive activations are constant large values in LLMs that function as indispensable bias terms and concentrate attention probabilities on specific tokens.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the observed constancy of massive activation values and their role as indispensable bias terms generalize across all LLMs, inputs, and architectures based on the limited set of models and characterizations performed.","pith_extraction_headline":"Large language models contain a small number of massive activations that remain constant across inputs and act as indispensable bias terms."},"references":{"count":159,"sample":[{"doi":"","year":2022,"title":"Exploring Length Generalization in Large Language Models","work_id":"2c9271b4-93c3-4ef2-953e-9d6b8a9c41c0","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2009,"title":"Computational complexity: a modern approach","work_id":"03206498-04bf-40ab-82ce-6bec266dc024","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"URLhttps://arxiv.org/pdf/2202.05826","work_id":"25bc4b88-d8d5-459f-a6ee-3871f05ce731","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2022,"title":"arXiv preprint arXiv:2207.08799 , year=","work_id":"92192172-5c98-475d-ab81-1f83e1a2d120","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":1986,"title":"Mix Barrington","work_id":"9e91a5eb-4082-4686-b17a-c95c104f0867","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":159,"snapshot_sha256":"f5ef4ad595f606821b612a050b320025d4de37887ac8f710fa2503c7a66fd6c2","internal_anchors":47},"formal_canon":{"evidence_count":2,"snapshot_sha256":"9b8749389cf2bcf67e418ad8d841e5a7202cd0f867a718bfb966b6053667e8ad"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"5b852ca6-442a-49a6-a777-e5fa78bd9382"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"rhesmJmquJIKsqYj3Ij7ej0orAk1Pt27WhWfwTDdEUnWlyNmVUxSILMgdDF8DPz3OV+UHt/hQfmz9fdxKQesAw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-23T13:34:39.263278Z"},"content_sha256":"dd487b37bfdd6c1dff8c800af07dedd39f27b58ed69b8c7d56367848942abe69","schema_version":"1.0","event_id":"sha256:dd487b37bfdd6c1dff8c800af07dedd39f27b58ed69b8c7d56367848942abe69"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/MDJKPL5S3IEMCXYQ4Z5CCVQHIW/bundle.json","state_url":"https://pith.science/pith/MDJKPL5S3IEMCXYQ4Z5CCVQHIW/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/MDJKPL5S3IEMCXYQ4Z5CCVQHIW/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-23T13:34:39Z","links":{"resolver":"https://pith.science/pith/MDJKPL5S3IEMCXYQ4Z5CCVQHIW","bundle":"https://pith.science/pith/MDJKPL5S3IEMCXYQ4Z5CCVQHIW/bundle.json","state":"https://pith.science/pith/MDJKPL5S3IEMCXYQ4Z5CCVQHIW/state.json","well_known_bundle":"https://pith.science/.well-known/pith/MDJKPL5S3IEMCXYQ4Z5CCVQHIW/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2024:MDJKPL5S3IEMCXYQ4Z5CCVQHIW","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"161a0a2b92c9dbee51eb5242b3c2633f8c7a752a67326dc218027ce16a6a8324","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-02-27T18:55:17Z","title_canon_sha256":"1375592bd25780fa45da9e4a454856fb6a1918f2dfb6bdb9df98135e4a994fe3"},"schema_version":"1.0","source":{"id":"2402.17762","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2402.17762","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"arxiv_version","alias_value":"2402.17762v2","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2402.17762","created_at":"2026-05-17T23:38:48Z"},{"alias_kind":"pith_short_12","alias_value":"MDJKPL5S3IEM","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"MDJKPL5S3IEMCXYQ","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"MDJKPL5S","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:dd487b37bfdd6c1dff8c800af07dedd39f27b58ed69b8c7d56367848942abe69","target":"graph","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"very few activations exhibit significantly larger values than others (e.g., 100,000 times larger). We call them massive activations... their values largely stay constant regardless of the input, and they function as indispensable bias terms in LLMs... these massive activations lead to the concentration of attention probabilities to their corresponding tokens."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the observed constancy of massive activation values and their role as indispensable bias terms generalize across all LLMs, inputs, and architectures based on the limited set of models and characterizations performed."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Massive activations are constant large values in LLMs that function as indispensable bias terms and concentrate attention probabilities on specific tokens."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Large language models contain a small number of massive activations that remain constant across inputs and act as indispensable bias terms."}],"snapshot_sha256":"3f0879792246673acf58211c8ccf21e12ff53c73017da131da2ffc3fde0c743c"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"9b8749389cf2bcf67e418ad8d841e5a7202cd0f867a718bfb966b6053667e8ad"},"paper":{"abstract_excerpt":"We observe an empirical phenomenon in Large Language Models (LLMs) -- very few activations exhibit significantly larger values than others (e.g., 100,000 times larger). We call them massive activations. First, we demonstrate the widespread existence of massive activations across various LLMs and characterize their locations. Second, we find their values largely stay constant regardless of the input, and they function as indispensable bias terms in LLMs. Third, these massive activations lead to the concentration of attention probabilities to their corresponding tokens, and further, implicit bia","authors_text":"J. Zico Kolter, Mingjie Sun, Xinlei Chen, Zhuang Liu","cross_cats":["cs.LG"],"headline":"Large language models contain a small number of massive activations that remain constant across inputs and act as indispensable bias terms.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-02-27T18:55:17Z","title":"Massive Activations in Large Language Models"},"references":{"count":159,"internal_anchors":47,"resolved_work":159,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Exploring Length Generalization in Large Language Models","work_id":"2c9271b4-93c3-4ef2-953e-9d6b8a9c41c0","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":2,"title":"Computational complexity: a modern approach","work_id":"03206498-04bf-40ab-82ce-6bec266dc024","year":2009},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"URLhttps://arxiv.org/pdf/2202.05826","work_id":"25bc4b88-d8d5-459f-a6ee-3871f05ce731","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"arXiv preprint arXiv:2207.08799 , year=","work_id":"92192172-5c98-475d-ab81-1f83e1a2d120","year":2022},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":5,"title":"Mix Barrington","work_id":"9e91a5eb-4082-4686-b17a-c95c104f0867","year":1986}],"snapshot_sha256":"f5ef4ad595f606821b612a050b320025d4de37887ac8f710fa2503c7a66fd6c2"},"source":{"id":"2402.17762","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-16T06:59:19.393219Z","id":"5b852ca6-442a-49a6-a777-e5fa78bd9382","model_set":{"reader":"grok-4.3"},"one_line_summary":"Massive activations are constant large values in LLMs that function as indispensable bias terms and concentrate attention probabilities on specific tokens.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Large language models contain a small number of massive activations that remain constant across inputs and act as indispensable bias terms.","strongest_claim":"very few activations exhibit significantly larger values than others (e.g., 100,000 times larger). We call them massive activations... their values largely stay constant regardless of the input, and they function as indispensable bias terms in LLMs... these massive activations lead to the concentration of attention probabilities to their corresponding tokens.","weakest_assumption":"That the observed constancy of massive activation values and their role as indispensable bias terms generalize across all LLMs, inputs, and architectures based on the limited set of models and characterizations performed."}},"verdict_id":"5b852ca6-442a-49a6-a777-e5fa78bd9382"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:d14ec2ac97124c038b036a7c68466edbfa65125e944eee8509b3283ea25ff546","target":"record","created_at":"2026-05-17T23:38:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"161a0a2b92c9dbee51eb5242b3c2633f8c7a752a67326dc218027ce16a6a8324","cross_cats_sorted":["cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2024-02-27T18:55:17Z","title_canon_sha256":"1375592bd25780fa45da9e4a454856fb6a1918f2dfb6bdb9df98135e4a994fe3"},"schema_version":"1.0","source":{"id":"2402.17762","kind":"arxiv","version":2}},"canonical_sha256":"60d2a7afb2da08c15f10e67a215607459bca6ed57194e20a0f3dbc5b94bfe664","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"60d2a7afb2da08c15f10e67a215607459bca6ed57194e20a0f3dbc5b94bfe664","first_computed_at":"2026-05-17T23:38:48.754966Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:48.754966Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"GgFiq7mdE76RFcNs1uUbR1Z0tly4e5FEesYi+dyI23wKDnMVTBq6qtEeuT17oHQAS8CV5SJZb7faiD71Gy1qDQ==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:48.755502Z","signed_message":"canonical_sha256_bytes"},"source_id":"2402.17762","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:d14ec2ac97124c038b036a7c68466edbfa65125e944eee8509b3283ea25ff546","sha256:dd487b37bfdd6c1dff8c800af07dedd39f27b58ed69b8c7d56367848942abe69"],"state_sha256":"a4526c0d29cd59e30b3ffd8a23b801046e1cd440b6e68ea0dc1524a6b18ddc44"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"lQT1tbk9wRbODRlngYug1Oo8teRViaPIIqy1nplNK4uvFQWY8UI3UXumiuL/mR+M5zmW/S5VC4q4vsBkvpUCBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-23T13:34:39.267739Z","bundle_sha256":"ab9fef6d9fe80ae748b8e5bbe82e3103f3dfd00f41209f2c3ef8a112cd216f69"}}