{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:GAMIXU6DMHREQTBUGVEHEIWZHY","short_pith_number":"pith:GAMIXU6D","schema_version":"1.0","canonical_sha256":"30188bd3c361e2484c3435487222d93e371eeccad8b35bab4b5ed2d3ccaa0941","source":{"kind":"arxiv","id":"2606.08682","version":1},"attestation_state":"computed","paper":{"title":"Activation Steering Induces Emergent Misalignment: A More Comprehensive Evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Anh Tuan Luu, Dan Li, Jian Lou, Meiting Liu, Qi Cao, See-kiong Ng, Wenjie Feng","submitted_at":"2026-06-07T15:34:59Z","abstract_excerpt":"Activation steering has emerged as a popular inference-time technique for modulating the behavior of large language models (LLMs). By constructing a steering vector from examples of a target behavior and injecting it into intermediate activations during inference, activation steering enables flexible behavioral control while avoiding the permanent parameter updates required by finetuning. Meanwhile, recent work has identified emergent misalignment (EM) as a significant safety concern, wherein models finetuned on unsafe examples from a narrow task may unexpectedly generalize to broadly unsafe b"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.08682","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-06-07T15:34:59Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"82870301059dfb33a44019269bc4196f5053f6cfb151c2d92c671d4a27005c7b","abstract_canon_sha256":"d9127108bfdc0f657f99d56e68b81108f924a553a6153563bab2bcd7f71aaa97"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-09T01:05:47.329584Z","signature_b64":"F/OeulpQQ6yoXDg6coNAturuMJEGW/z7+UNu02YYuu/RZj7nSdKHNRTZE+f2K5D6zoTRuHOHwRg3WgyxlFE3CA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"30188bd3c361e2484c3435487222d93e371eeccad8b35bab4b5ed2d3ccaa0941","last_reissued_at":"2026-06-09T01:05:47.329007Z","signature_status":"signed_v1","first_computed_at":"2026-06-09T01:05:47.329007Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Activation Steering Induces Emergent Misalignment: A More Comprehensive Evaluation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.LG","authors_text":"Anh Tuan Luu, Dan Li, Jian Lou, Meiting Liu, Qi Cao, See-kiong Ng, Wenjie Feng","submitted_at":"2026-06-07T15:34:59Z","abstract_excerpt":"Activation steering has emerged as a popular inference-time technique for modulating the behavior of large language models (LLMs). By constructing a steering vector from examples of a target behavior and injecting it into intermediate activations during inference, activation steering enables flexible behavioral control while avoiding the permanent parameter updates required by finetuning. Meanwhile, recent work has identified emergent misalignment (EM) as a significant safety concern, wherein models finetuned on unsafe examples from a narrow task may unexpectedly generalize to broadly unsafe b"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.08682","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.08682/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.08682","created_at":"2026-06-09T01:05:47.329067+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.08682v1","created_at":"2026-06-09T01:05:47.329067+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.08682","created_at":"2026-06-09T01:05:47.329067+00:00"},{"alias_kind":"pith_short_12","alias_value":"GAMIXU6DMHRE","created_at":"2026-06-09T01:05:47.329067+00:00"},{"alias_kind":"pith_short_16","alias_value":"GAMIXU6DMHREQTBU","created_at":"2026-06-09T01:05:47.329067+00:00"},{"alias_kind":"pith_short_8","alias_value":"GAMIXU6D","created_at":"2026-06-09T01:05:47.329067+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY","json":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY.json","graph_json":"https://pith.science/api/pith-number/GAMIXU6DMHREQTBUGVEHEIWZHY/graph.json","events_json":"https://pith.science/api/pith-number/GAMIXU6DMHREQTBUGVEHEIWZHY/events.json","paper":"https://pith.science/paper/GAMIXU6D"},"agent_actions":{"view_html":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY","download_json":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY.json","view_paper":"https://pith.science/paper/GAMIXU6D","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.08682&json=true","fetch_graph":"https://pith.science/api/pith-number/GAMIXU6DMHREQTBUGVEHEIWZHY/graph.json","fetch_events":"https://pith.science/api/pith-number/GAMIXU6DMHREQTBUGVEHEIWZHY/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY/action/timestamp_anchor","attest_storage":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY/action/storage_attestation","attest_author":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY/action/author_attestation","sign_citation":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY/action/citation_signature","submit_replication":"https://pith.science/pith/GAMIXU6DMHREQTBUGVEHEIWZHY/action/replication_record"}},"created_at":"2026-06-09T01:05:47.329067+00:00","updated_at":"2026-06-09T01:05:47.329067+00:00"}