{"work":{"id":"3317feaa-e788-45fc-95aa-4ea20028b55b","openalex_id":null,"doi":null,"arxiv_id":"2312.06681","raw_key":null,"title":"Steering Llama 2 via Contrastive Activation Addition","authors":null,"authors_text":"Nina Panickssery, Nick Gabrieli, Julian Schulz, Meg Tong, Evan Hubinger, Alexander Matt Turner","year":2023,"venue":"cs.CL","abstract":"We introduce Contrastive Activation Addition (CAA), an innovative method for steering language models by modifying their activations during forward passes. CAA computes \"steering vectors\" by averaging the difference in residual stream activations between pairs of positive and negative examples of a particular behavior, such as factual versus hallucinatory responses. During inference, these steering vectors are added at all token positions after the user's prompt with either a positive or negative coefficient, allowing precise control over the degree of the targeted behavior. We evaluate CAA's effectiveness on Llama 2 Chat using multiple-choice behavioral question datasets and open-ended generation tasks. We demonstrate that CAA significantly alters model behavior, is effective over and on top of traditional methods like finetuning and system prompt design, and minimally reduces capabilities. Moreover, we gain deeper insights into CAA's mechanisms by employing various activation space interpretation methods. CAA accurately steers model outputs and sheds light on how high-level concepts are represented in Large Language Models (LLMs).","external_url":"https://arxiv.org/abs/2312.06681","cited_by_count":null,"metadata_source":"pith","metadata_fetched_at":"2026-05-21T21:45:40.574594+00:00","pith_arxiv_id":"2312.06681","created_at":"2026-05-09T06:57:24.289875+00:00","updated_at":"2026-05-21T21:45:40.574594+00:00","title_quality_ok":true,"display_title":"Steering Llama 2 via Contrastive Activation Addition","render_title":"Steering Llama 2 via Contrastive Activation Addition"},"hub":{"state":{"work_id":"3317feaa-e788-45fc-95aa-4ea20028b55b","tier":"hub","tier_reason":"10+ Pith inbound or 1,000+ external citations","pith_inbound_count":49,"external_cited_by_count":null,"distinct_field_count":4,"first_pith_cited_at":"2024-06-17T16:36:12+00:00","last_pith_cited_at":"2026-05-18T18:17:06+00:00","author_build_status":"not_needed","summary_status":"needed","contexts_status":"needed","graph_status":"needed","ask_index_status":"not_needed","reader_status":"not_needed","recognition_status":"not_needed","updated_at":"2026-05-26T03:06:09.861845+00:00","tier_text":"hub"},"tier":"hub","role_counts":[{"context_role":"background","n":7},{"context_role":"method","n":4},{"context_role":"baseline","n":1}],"polarity_counts":[{"context_polarity":"background","n":7},{"context_polarity":"use_method","n":4},{"context_polarity":"baseline","n":1}],"runs":{"context_extract":{"job_type":"context_extract","status":"succeeded","result":{"enqueued_papers":25},"error":null,"updated_at":"2026-05-14T17:59:31.415811+00:00"},"graph_features":{"job_type":"graph_features","status":"succeeded","result":{"co_cited":[{"title":"Steering Language Models With Activation Engineering","work_id":"d525fe06-5560-4e97-86fc-7a0e551f5b17","shared_citers":18},{"title":"Representation Engineering: A Top-Down Approach to AI Transparency","work_id":"45b326e2-e962-41a5-a542-2559e103a19b","shared_citers":16},{"title":"The Linear Representation Hypothesis and the Geometry of Large Language Models","work_id":"a7b44adc-f2c2-4420-a27d-8ade97dd3b75","shared_citers":11},{"title":"Sparse Autoencoders Find Highly Interpretable Features in Language Models","work_id":"51960d72-c69f-4db8-8efd-e90e8b4d9524","shared_citers":8},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":8},{"title":"J., Geiger, A., and Nanda, N","work_id":"6cb3c7a7-3301-449f-97b9-7e047edafdf9","shared_citers":6},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":6},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":6},{"title":"The Geometry of Truth: Emergent Linear Structure in Large Language Model Representations of True/False Datasets","work_id":"400e017f-8643-4166-b6da-a75d4446da80","shared_citers":6},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":6},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":5},{"title":"HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal","work_id":"b0b0303f-2444-4789-a979-8153624312ff","shared_citers":5},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":5},{"title":"Refusal in Language Models Is Mediated by a Single Direction","work_id":"fbb9538d-8e58-4902-9fbd-b11f044bc2d5","shared_citers":5},{"title":"Universal and Transferable Adversarial Attacks on Aligned Language Models","work_id":"3322fa86-1768-4677-8425-dd326b45e078","shared_citers":5},{"title":"2024 , month = feb, number =","work_id":"dbc5264a-aabf-4346-8cbe-123e943b13b8","shared_citers":4},{"title":"Discovering latent knowledge in language models without supervision","work_id":"a12b68bd-76a4-4837-ac7c-3ed5a60010d3","shared_citers":4},{"title":"Eliciting Latent Predictions from Transformers with the Tuned Lens","work_id":"a127314f-7424-488f-b6d7-8214650c420f","shared_citers":4},{"title":"LoRA: Low-Rank Adaptation of Large Language Models","work_id":"0426219a-789e-4964-adc8-a04538510818","shared_citers":4},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":4},{"title":"Persona Vectors: Monitoring and Controlling Character Traits in Language Models","work_id":"cf32dbef-9132-4648-abcb-0ebf3ac3af80","shared_citers":4},{"title":"Towards Understanding Sycophancy in Language Models","work_id":"aeefec9a-6ad5-4743-92b9-de6983895e21","shared_citers":4},{"title":"AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models","work_id":"3b676de6-edef-4976-a8b5-082d4ff50867","shared_citers":3},{"title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models","work_id":"d1cf6693-a082-403c-ada9-dac7b96341f9","shared_citers":3}],"time_series":[{"n":1,"year":2024},{"n":1,"year":2025},{"n":33,"year":2026}],"dependency_candidates":[]},"error":null,"updated_at":"2026-05-14T17:59:44.961563+00:00"},"identity_refresh":{"job_type":"identity_refresh","status":"succeeded","result":{"items":[{"title":"Qwen3 Technical Report","outcome":"unchanged","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","resolver":"local_arxiv","confidence":0.98,"old_work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e"}],"counts":{"fixed":0,"merged":0,"unchanged":1,"quarantined":0,"needs_external_resolution":0},"errors":[],"attempted":1},"error":null,"updated_at":"2026-05-14T17:59:53.981261+00:00"},"summary_claims":{"job_type":"summary_claims","status":"succeeded","result":{"title":"Steering Llama 2 via Contrastive Activation Addition","claims":[{"claim_text":"We introduce Contrastive Activation Addition (CAA), an innovative method for steering language models by modifying their activations during forward passes. CAA computes \"steering vectors\" by averaging the difference in residual stream activations between pairs of positive and negative examples of a particular behavior, such as factual versus hallucinatory responses. During inference, these steering vectors are added at all token positions after the user's prompt with either a positive or negative coefficient, allowing precise control over the degree of the targeted behavior. We evaluate CAA's ","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Steering Llama 2 via Contrastive Activation Addition because it crossed a citation-hub threshold.","role_counts":[]},"error":null,"updated_at":"2026-05-14T17:59:19.544444+00:00"}},"summary":{"title":"Steering Llama 2 via Contrastive Activation Addition","claims":[{"claim_text":"We introduce Contrastive Activation Addition (CAA), an innovative method for steering language models by modifying their activations during forward passes. CAA computes \"steering vectors\" by averaging the difference in residual stream activations between pairs of positive and negative examples of a particular behavior, such as factual versus hallucinatory responses. During inference, these steering vectors are added at all token positions after the user's prompt with either a positive or negative coefficient, allowing precise control over the degree of the targeted behavior. We evaluate CAA's ","claim_type":"abstract","evidence_strength":"source_metadata"}],"why_cited":"Pith tracks Steering Llama 2 via Contrastive Activation Addition because it crossed a citation-hub threshold.","role_counts":[]},"graph":{"co_cited":[{"title":"Steering Language Models With Activation Engineering","work_id":"d525fe06-5560-4e97-86fc-7a0e551f5b17","shared_citers":18},{"title":"Representation Engineering: A Top-Down Approach to AI Transparency","work_id":"45b326e2-e962-41a5-a542-2559e103a19b","shared_citers":16},{"title":"The Linear Representation Hypothesis and the Geometry of Large Language Models","work_id":"a7b44adc-f2c2-4420-a27d-8ade97dd3b75","shared_citers":11},{"title":"Sparse Autoencoders Find Highly Interpretable Features in Language Models","work_id":"51960d72-c69f-4db8-8efd-e90e8b4d9524","shared_citers":8},{"title":"The Llama 3 Herd of Models","work_id":"1549a635-88af-4ac1-acfe-51ae7bb53345","shared_citers":8},{"title":"J., Geiger, A., and Nanda, N","work_id":"6cb3c7a7-3301-449f-97b9-7e047edafdf9","shared_citers":6},{"title":"Qwen2.5 Technical Report","work_id":"d8432992-4980-4a81-85c7-9fa2c2b87f85","shared_citers":6},{"title":"Qwen3 Technical Report","work_id":"25a4e30c-1232-48e7-9925-02fa12ba7c9e","shared_citers":6},{"title":"The Geometry of Truth: Emergent Linear Structure in Large Language Model Representations of True/False Datasets","work_id":"400e017f-8643-4166-b6da-a75d4446da80","shared_citers":6},{"title":"Training Verifiers to Solve Math Word Problems","work_id":"acab1aa8-b4d6-40e0-a3ee-25341701dca2","shared_citers":6},{"title":"DeepSeek-V3 Technical Report","work_id":"57d2791d-2219-4c31-a077-afc04b12a75c","shared_citers":5},{"title":"HarmBench: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal","work_id":"b0b0303f-2444-4789-a979-8153624312ff","shared_citers":5},{"title":"Measuring Massive Multitask Language Understanding","work_id":"e87ec49a-544b-4ec8-8991-75298c64ff5e","shared_citers":5},{"title":"Refusal in Language Models Is Mediated by a Single Direction","work_id":"fbb9538d-8e58-4902-9fbd-b11f044bc2d5","shared_citers":5},{"title":"Universal and Transferable Adversarial Attacks on Aligned Language Models","work_id":"3322fa86-1768-4677-8425-dd326b45e078","shared_citers":5},{"title":"2024 , month = feb, number =","work_id":"dbc5264a-aabf-4346-8cbe-123e943b13b8","shared_citers":4},{"title":"Discovering latent knowledge in language models without supervision","work_id":"a12b68bd-76a4-4837-ac7c-3ed5a60010d3","shared_citers":4},{"title":"Eliciting Latent Predictions from Transformers with the Tuned Lens","work_id":"a127314f-7424-488f-b6d7-8214650c420f","shared_citers":4},{"title":"LoRA: Low-Rank Adaptation of Large Language Models","work_id":"0426219a-789e-4964-adc8-a04538510818","shared_citers":4},{"title":"Mistral 7B","work_id":"eb5e1305-ad11-4875-ad8d-ad8b8f697599","shared_citers":4},{"title":"Persona Vectors: Monitoring and Controlling Character Traits in Language Models","work_id":"cf32dbef-9132-4648-abcb-0ebf3ac3af80","shared_citers":4},{"title":"Towards Understanding Sycophancy in Language Models","work_id":"aeefec9a-6ad5-4743-92b9-de6983895e21","shared_citers":4},{"title":"AutoDAN: Generating Stealthy Jailbreak Prompts on Aligned Large Language Models","work_id":"3b676de6-edef-4976-a8b5-082d4ff50867","shared_citers":3},{"title":"Chain-of-Thought Prompting Elicits Reasoning in Large Language Models","work_id":"d1cf6693-a082-403c-ada9-dac7b96341f9","shared_citers":3}],"time_series":[{"n":1,"year":2024},{"n":1,"year":2025},{"n":33,"year":2026}],"dependency_candidates":[]},"authors":[]}}