{"work":{"id":"db2b0911-2758-4a2a-99dc-15b14b91bd5e","openalex_id":null,"doi":null,"arxiv_id":"2209.11895","raw_key":null,"title":"In-context Learning and Induction Heads","authors":null,"authors_text":"Catherine Olsson, Nelson Elhage, Neel Nanda, Nicholas Joseph, Nova DasSarma, Tom Henighan","year":2022,"venue":"cs.LG","abstract":"\"Induction heads\" are attention heads that implement a simple algorithm to complete token sequences like [A][B] ... [A] -> [B]. In this work, we present preliminary and indirect evidence for a hypothesis that induction heads might constitute the mechanism for the majority of all \"in-context learning\" in large transformer models (i.e. decreasing loss at increasing token indices). We find that induction heads develop at precisely the same point as a sudden sharp increase in in-context learning ability, visible as a bump in the training loss. We present six complementary lines of evidence, arguing that induction heads may be the mechanistic source of general in-context learning in transformer models of any size. For small attention-only models, we present strong, causal evidence; for larger models with MLPs, we present correlational evidence.","external_url":"https://arxiv.org/abs/2209.11895","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-25T05:03:46.870339+00:00","pith_arxiv_id":"2209.11895","created_at":"2026-05-08T18:33:58.787979+00:00","updated_at":"2026-06-05T21:23:00.469572+00:00","title_quality_ok":true,"display_title":"In-context Learning and Induction Heads","render_title":"In-context Learning and Induction Heads"},"hub":{"state":{"work_id":"db2b0911-2758-4a2a-99dc-15b14b91bd5e","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":94,"external_cited_by_count":null,"distinct_field_count":9,"first_pith_cited_at":"2022-11-01T17:08:44+00:00","last_pith_cited_at":"2026-05-21T00:46:01+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-06-10T12:06:26.248330+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":18},{"context_role":"dataset","n":1},{"context_role":"other","n":1}],"polarity_counts":[{"context_polarity":"background","n":16},{"context_polarity":"unclear","n":2},{"context_polarity":"support","n":1},{"context_polarity":"use_dataset","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T12:40:43.891705+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":11},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":11},{"title":"Toy Models of Superposition","work_id":"43875dbe-bc2d-4ab5-af63-744411533ff7","shared_citers":11},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":8},{"title":"Interpretability in the Wild: a Circuit for Indirect Object Identification in GPT-2 small","work_id":"d1167c73-3f2a-472b-8bf5-0ec282d7988a","shared_citers":7},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":7},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":7},{"title":"Representation Engineering: A Top-Down Approach to AI Transparency","work_id":"45b326e2-e962-41a5-a542-2559e103a19b","shared_citers":7},{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":7},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":6},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":6},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":6},{"title":"Sparse Autoencoders Find Highly Interpretable Features in Language Models","work_id":"51960d72-c69f-4db8-8efd-e90e8b4d9524","shared_citers":6},{"title":"The Geometry of Truth: Emergent Linear Structure in Large Language Model Representations of True/False Datasets","work_id":"400e017f-8643-4166-b6da-a75d4446da80","shared_citers":6},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":6},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":6},{"title":"Efficient Streaming Language Models with Attention Sinks","work_id":"a8d25452-c237-48c9-88a4-682717c3979a","shared_citers":5},{"title":"Eliciting Latent Predictions from Transformers with the Tuned Lens","work_id":"a127314f-7424-488f-b6d7-8214650c420f","shared_citers":5},{"title":"Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets","work_id":"a3c30ead-1625-4c18-a9c1-e4928dcd0da6","shared_citers":5},{"title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","work_id":"4ee75248-1199-492c-a52f-6661e0f4adff","shared_citers":5},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":5},{"title":"Steering Language Models With Activation Engineering","work_id":"d525fe06-5560-4e97-86fc-7a0e551f5b17","shared_citers":5},{"title":"Understanding intermediate layers using linear classifier probes","work_id":"bdc944db-4be2-44f7-950b-eaef12fab00e","shared_citers":5},{"title":"arXiv preprint arXiv:2111.02080 , year=","work_id":"566fd534-0735-4411-b050-f7b8d112c1f8","shared_citers":4}],"time_series":[{"n":1,"year":2022},{"n":1,"year":2023},{"n":4,"year":2024},{"n":49,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T12:40:47.843078+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T12:40:54.906058+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"In-context Learning and Induction Heads","claims":[{"claim_text":"\"Induction heads\" are attention heads that implement a simple algorithm to complete token sequences like [A][B] ... [A] -> [B]. In this work, we present preliminary and indirect evidence for a hypothesis that induction heads might constitute the mechanism for the majority of all \"in-context learning\" in large transformer models (i.e. decreasing loss at increasing token indices). We find that induction heads develop at precisely the same point as a sudden sharp increase in in-context learning ability, visible as a bump in the training loss. We present six complementary lines of evidence, arguin","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks In-context Learning and Induction Heads because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T12:40:36.081957+00:00"}},"summary":{"title":"In-context Learning and Induction Heads","claims":[{"claim_text":"\"Induction heads\" are attention heads that implement a simple algorithm to complete token sequences like [A][B] ... [A] -> [B]. In this work, we present preliminary and indirect evidence for a hypothesis that induction heads might constitute the mechanism for the majority of all \"in-context learning\" in large transformer models (i.e. decreasing loss at increasing token indices). We find that induction heads develop at precisely the same point as a sudden sharp increase in in-context learning ability, visible as a bump in the training loss. We present six complementary lines of evidence, arguin","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks In-context Learning and Induction Heads because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":11},{"title":"Scaling Laws for Neural Language Models","work_id":"b7dd8749-9c45-4977-ab9b-64478dce1ae8","shared_citers":11},{"title":"Toy Models of Superposition","work_id":"43875dbe-bc2d-4ab5-af63-744411533ff7","shared_citers":11},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":8},{"title":"Interpretability in the Wild: a Circuit for Indirect Object Identification in GPT-2 small","work_id":"d1167c73-3f2a-472b-8bf5-0ec282d7988a","shared_citers":7},{"title":"LLaMA: Open and Efficient Foundation Language Models","work_id":"c018fc23-6f3f-4035-9d02-28a2173b2b9d","shared_citers":7},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":7},{"title":"Representation Engineering: A Top-Down Approach to AI Transparency","work_id":"45b326e2-e962-41a5-a542-2559e103a19b","shared_citers":7},{"title":"Training Compute-Optimal Large Language Models","work_id":"b2faf28d-86b7-429c-bc42-469458efc246","shared_citers":7},{"title":"DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning","work_id":"e6b75ad5-2877-4168-97c8-710407094d20","shared_citers":6},{"title":"GPT-4 Technical Report","work_id":"b928e041-6991-4c08-8c81-0359e4097c7b","shared_citers":6},{"title":"Llama 2: Open Foundation and Fine-Tuned Chat Models","work_id":"68a5177f-d644-44c1-bd4f-4e5278c22f5d","shared_citers":6},{"title":"Sparse Autoencoders Find Highly Interpretable Features in Language Models","work_id":"51960d72-c69f-4db8-8efd-e90e8b4d9524","shared_citers":6},{"title":"The Geometry of Truth: Emergent Linear Structure in Large Language Model Representations of True/False Datasets","work_id":"400e017f-8643-4166-b6da-a75d4446da80","shared_citers":6},{"title":"Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge","work_id":"28ea1282-d657-4c61-a83c-f1249be6d6b1","shared_citers":6},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":6},{"title":"Efficient Streaming Language Models with Attention Sinks","work_id":"a8d25452-c237-48c9-88a4-682717c3979a","shared_citers":5},{"title":"Eliciting Latent Predictions from Transformers with the Tuned Lens","work_id":"a127314f-7424-488f-b6d7-8214650c420f","shared_citers":5},{"title":"Grokking: Generalization Beyond Overfitting on Small Algorithmic Datasets","work_id":"a3c30ead-1625-4c18-a9c1-e4928dcd0da6","shared_citers":5},{"title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","work_id":"4ee75248-1199-492c-a52f-6661e0f4adff","shared_citers":5},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":5},{"title":"Steering Language Models With Activation Engineering","work_id":"d525fe06-5560-4e97-86fc-7a0e551f5b17","shared_citers":5},{"title":"Understanding intermediate layers using linear classifier probes","work_id":"bdc944db-4be2-44f7-950b-eaef12fab00e","shared_citers":5},{"title":"arXiv preprint arXiv:2111.02080 , year=","work_id":"566fd534-0735-4411-b050-f7b8d112c1f8","shared_citers":4}],"time_series":[{"n":1,"year":2022},{"n":1,"year":2023},{"n":4,"year":2024},{"n":49,"year":2026}],"dependency_candidates":[]},"authors":[]}}