{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:53I72JDJ3YOHUVBXSVUNM2JII5","short_pith_number":"pith:53I72JDJ","canonical_record":{"source":{"id":"2605.13339","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:57:37Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"3f390671a8ed420207080c00efc3d0de30108a11c5efba3fc9cfe67400e7869e","abstract_canon_sha256":"6fe760520f2841531bf19dc72c1b1f484d7fd6d120aa6862a3e878d5d9d5b20a"},"schema_version":"1.0"},"canonical_sha256":"eed1fd2469de1c7a54379568d6692847513a3ecb45ae3a9a8ad5b460c9684bd0","source":{"kind":"arxiv","id":"2605.13339","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13339","created_at":"2026-05-18T02:44:48Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13339v1","created_at":"2026-05-18T02:44:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13339","created_at":"2026-05-18T02:44:48Z"},{"alias_kind":"pith_short_12","alias_value":"53I72JDJ3YOH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"53I72JDJ3YOHUVBX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"53I72JDJ","created_at":"2026-05-18T12:33:37Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:53I72JDJ3YOHUVBXSVUNM2JII5","target":"record","payload":{"canonical_record":{"source":{"id":"2605.13339","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:57:37Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"3f390671a8ed420207080c00efc3d0de30108a11c5efba3fc9cfe67400e7869e","abstract_canon_sha256":"6fe760520f2841531bf19dc72c1b1f484d7fd6d120aa6862a3e878d5d9d5b20a"},"schema_version":"1.0"},"canonical_sha256":"eed1fd2469de1c7a54379568d6692847513a3ecb45ae3a9a8ad5b460c9684bd0","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-18T02:44:48.414962Z","signature_b64":"4KBODB07rjpwwedyDkm2poGVDyCBWt8wykWSZpqegeklhKppDENLRBrV7pUvedRfS0ZSMlxgIufENRvaQxZ5Dg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"eed1fd2469de1c7a54379568d6692847513a3ecb45ae3a9a8ad5b460c9684bd0","last_reissued_at":"2026-05-18T02:44:48.414574Z","signature_status":"signed_v1","first_computed_at":"2026-05-18T02:44:48.414574Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.13339","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:44:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"oq19m2VEEhMobMdIlEPB/7H/xxaXsOvgk67CWmhYev9y2RbLgJtOrlqqW8YvasjcdbcY6o0LrQ2x9npllN84Bg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T00:24:59.534056Z"},"content_sha256":"6078140dc1e3262dcf49baf9c835a5878c3948095cb001a20e72091f3169aa76","schema_version":"1.0","event_id":"sha256:6078140dc1e3262dcf49baf9c835a5878c3948095cb001a20e72091f3169aa76"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:53I72JDJ3YOHUVBXSVUNM2JII5","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Probing Persona-Dependent Preferences in Language Models","license":"http://creativecommons.org/licenses/by/4.0/","headline":"A single linear direction in residual-stream activations predicts and steers task preferences across LLM personas, including opposing ones.","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"Daniel Paleka, Oscar Gilg, Patrick Butlin, Pierre Beckmann","submitted_at":"2026-05-13T10:57:37Z","abstract_excerpt":"Large language models (LLMs) can be said to have preferences: they reliably pick certain tasks and outputs over others, and preferences shaped by post-training and system prompts appear to shape much of their behaviour. But models can also adopt different personas which have radically different preferences. How is this implemented internally? Does each persona run on its own preference machinery, or is something shared underneath? We train linear probes on residual-stream activations of Gemma-3-27B and Qwen-3.5-122B to predict revealed pairwise task choices, and identify a genuine preference v"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"This preference representation is largely shared across personas: a probe trained on the helpful assistant predicts and steers the choices of qualitatively different personas, including an evil persona whose preferences anti-correlate with those of the Assistant.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That the linear probe direction identified in residual-stream activations represents the genuine causal preference mechanism rather than a correlated but non-causal feature, and that steering along it produces clean changes to choices without major unintended effects on other capabilities.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Linear probes on residual-stream activations extract a preference vector that tracks and steers pairwise task choices across personas in Gemma-3-27B and Qwen-3.5-122B, including anti-correlated evil personas.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A single linear direction in residual-stream activations predicts and steers task preferences across LLM personas, including opposing ones.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"7388bde6c9402fbf834fdef637728408350c41d196308cb9eaac8ed9f1f1e743"},"source":{"id":"2605.13339","kind":"arxiv","version":1},"verdict":{"id":"8e924f54-99f1-493c-8410-626dafde9ce6","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-14T20:10:03.418038Z","strongest_claim":"This preference representation is largely shared across personas: a probe trained on the helpful assistant predicts and steers the choices of qualitatively different personas, including an evil persona whose preferences anti-correlate with those of the Assistant.","one_line_summary":"Linear probes on residual-stream activations extract a preference vector that tracks and steers pairwise task choices across personas in Gemma-3-27B and Qwen-3.5-122B, including anti-correlated evil personas.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That the linear probe direction identified in residual-stream activations represents the genuine causal preference mechanism rather than a correlated but non-causal feature, and that steering along it produces clean changes to choices without major unintended effects on other capabilities.","pith_extraction_headline":"A single linear direction in residual-stream activations predicts and steers task preferences across LLM personas, including opposing ones."},"references":{"count":12,"sample":[{"doi":"","year":2022,"title":"Language models as agent models","work_id":"41d8b9d0-c84c-4a1c-ad5f-1bb1767fc23a","ref_index":1,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1038/s41586-025-09937-5","year":2025,"title":"Emergent Misalignment: Narrow finetuning can pro- duce broadly misaligned LLMs.Nature, 649(8097):584–589, January 2026","work_id":"4c08e462-6101-4261-b7dc-0c965156a549","ref_index":2,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1111/phpr.12395","year":null,"title":"doi: 10.1111/phpr.12395. David J. Chalmers. What we talk to when we talk to language models. PhilArchive, https://philpapers. org/archive/CHAWWT-8.pdf,","work_id":"c212592c-90d4-40b6-929b-cad082da9600","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":2026,"title":"Sam Marks, Jack Lindsey, and Christopher Olah","work_id":"6d8e3336-aeae-4ffe-af4f-19686adf5ee0","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"10.1038/s41586-023-06647-8","year":null,"title":"2023 , journal =","work_id":"7fea6a52-fd4e-4020-b0a6-75129cc9f06a","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":12,"snapshot_sha256":"fd0beef5d09c45fc4185c2db406ed7e5614a3aec24e1c9b2f5d2c5dca8ca9572","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"5efa26aabdb469e457b10718414c62497cde23f18acd9b1fc589179feccbcb9c"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"8e924f54-99f1-493c-8410-626dafde9ce6"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-18T02:44:48Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"YEqILah5n/mIDRVEjW3i2X6ev3pO2zfjm40hEXow9hHR+pQYZRhO17SxCo1wQMmFTvD2HW3abHWbUqhbonH4Bg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-28T00:24:59.534728Z"},"content_sha256":"9ce3acab5941b967ba37001ef175d6f364f60c750e8e3e7cf2cbc30d6bab5af3","schema_version":"1.0","event_id":"sha256:9ce3acab5941b967ba37001ef175d6f364f60c750e8e3e7cf2cbc30d6bab5af3"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/53I72JDJ3YOHUVBXSVUNM2JII5/bundle.json","state_url":"https://pith.science/pith/53I72JDJ3YOHUVBXSVUNM2JII5/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/53I72JDJ3YOHUVBXSVUNM2JII5/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-28T00:24:59Z","links":{"resolver":"https://pith.science/pith/53I72JDJ3YOHUVBXSVUNM2JII5","bundle":"https://pith.science/pith/53I72JDJ3YOHUVBXSVUNM2JII5/bundle.json","state":"https://pith.science/pith/53I72JDJ3YOHUVBXSVUNM2JII5/state.json","well_known_bundle":"https://pith.science/.well-known/pith/53I72JDJ3YOHUVBXSVUNM2JII5/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:53I72JDJ3YOHUVBXSVUNM2JII5","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"6fe760520f2841531bf19dc72c1b1f484d7fd6d120aa6862a3e878d5d9d5b20a","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:57:37Z","title_canon_sha256":"3f390671a8ed420207080c00efc3d0de30108a11c5efba3fc9cfe67400e7869e"},"schema_version":"1.0","source":{"id":"2605.13339","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.13339","created_at":"2026-05-18T02:44:48Z"},{"alias_kind":"arxiv_version","alias_value":"2605.13339v1","created_at":"2026-05-18T02:44:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.13339","created_at":"2026-05-18T02:44:48Z"},{"alias_kind":"pith_short_12","alias_value":"53I72JDJ3YOH","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_16","alias_value":"53I72JDJ3YOHUVBX","created_at":"2026-05-18T12:33:37Z"},{"alias_kind":"pith_short_8","alias_value":"53I72JDJ","created_at":"2026-05-18T12:33:37Z"}],"graph_snapshots":[{"event_id":"sha256:9ce3acab5941b967ba37001ef175d6f364f60c750e8e3e7cf2cbc30d6bab5af3","target":"graph","created_at":"2026-05-18T02:44:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"This preference representation is largely shared across personas: a probe trained on the helpful assistant predicts and steers the choices of qualitatively different personas, including an evil persona whose preferences anti-correlate with those of the Assistant."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That the linear probe direction identified in residual-stream activations represents the genuine causal preference mechanism rather than a correlated but non-causal feature, and that steering along it produces clean changes to choices without major unintended effects on other capabilities."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"Linear probes on residual-stream activations extract a preference vector that tracks and steers pairwise task choices across personas in Gemma-3-27B and Qwen-3.5-122B, including anti-correlated evil personas."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A single linear direction in residual-stream activations predicts and steers task preferences across LLM personas, including opposing ones."}],"snapshot_sha256":"7388bde6c9402fbf834fdef637728408350c41d196308cb9eaac8ed9f1f1e743"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"5efa26aabdb469e457b10718414c62497cde23f18acd9b1fc589179feccbcb9c"},"paper":{"abstract_excerpt":"Large language models (LLMs) can be said to have preferences: they reliably pick certain tasks and outputs over others, and preferences shaped by post-training and system prompts appear to shape much of their behaviour. But models can also adopt different personas which have radically different preferences. How is this implemented internally? Does each persona run on its own preference machinery, or is something shared underneath? We train linear probes on residual-stream activations of Gemma-3-27B and Qwen-3.5-122B to predict revealed pairwise task choices, and identify a genuine preference v","authors_text":"Daniel Paleka, Oscar Gilg, Patrick Butlin, Pierre Beckmann","cross_cats":["cs.AI"],"headline":"A single linear direction in residual-stream activations predicts and steers task preferences across LLM personas, including opposing ones.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:57:37Z","title":"Probing Persona-Dependent Preferences in Language Models"},"references":{"count":12,"internal_anchors":0,"resolved_work":12,"sample":[{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":1,"title":"Language models as agent models","work_id":"41d8b9d0-c84c-4a1c-ad5f-1bb1767fc23a","year":2022},{"cited_arxiv_id":"","doi":"10.1038/s41586-025-09937-5","is_internal_anchor":false,"ref_index":2,"title":"Emergent Misalignment: Narrow finetuning can pro- duce broadly misaligned LLMs.Nature, 649(8097):584–589, January 2026","work_id":"4c08e462-6101-4261-b7dc-0c965156a549","year":2025},{"cited_arxiv_id":"","doi":"10.1111/phpr.12395","is_internal_anchor":false,"ref_index":3,"title":"doi: 10.1111/phpr.12395. David J. Chalmers. What we talk to when we talk to language models. PhilArchive, https://philpapers. org/archive/CHAWWT-8.pdf,","work_id":"c212592c-90d4-40b6-929b-cad082da9600","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":4,"title":"Sam Marks, Jack Lindsey, and Christopher Olah","work_id":"6d8e3336-aeae-4ffe-af4f-19686adf5ee0","year":2026},{"cited_arxiv_id":"","doi":"10.1038/s41586-023-06647-8","is_internal_anchor":false,"ref_index":5,"title":"2023 , journal =","work_id":"7fea6a52-fd4e-4020-b0a6-75129cc9f06a","year":null}],"snapshot_sha256":"fd0beef5d09c45fc4185c2db406ed7e5614a3aec24e1c9b2f5d2c5dca8ca9572"},"source":{"id":"2605.13339","kind":"arxiv","version":1},"verdict":{"created_at":"2026-05-14T20:10:03.418038Z","id":"8e924f54-99f1-493c-8410-626dafde9ce6","model_set":{"reader":"grok-4.3"},"one_line_summary":"Linear probes on residual-stream activations extract a preference vector that tracks and steers pairwise task choices across personas in Gemma-3-27B and Qwen-3.5-122B, including anti-correlated evil personas.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A single linear direction in residual-stream activations predicts and steers task preferences across LLM personas, including opposing ones.","strongest_claim":"This preference representation is largely shared across personas: a probe trained on the helpful assistant predicts and steers the choices of qualitatively different personas, including an evil persona whose preferences anti-correlate with those of the Assistant.","weakest_assumption":"That the linear probe direction identified in residual-stream activations represents the genuine causal preference mechanism rather than a correlated but non-causal feature, and that steering along it produces clean changes to choices without major unintended effects on other capabilities."}},"verdict_id":"8e924f54-99f1-493c-8410-626dafde9ce6"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:6078140dc1e3262dcf49baf9c835a5878c3948095cb001a20e72091f3169aa76","target":"record","created_at":"2026-05-18T02:44:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"6fe760520f2841531bf19dc72c1b1f484d7fd6d120aa6862a3e878d5d9d5b20a","cross_cats_sorted":["cs.AI"],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-05-13T10:57:37Z","title_canon_sha256":"3f390671a8ed420207080c00efc3d0de30108a11c5efba3fc9cfe67400e7869e"},"schema_version":"1.0","source":{"id":"2605.13339","kind":"arxiv","version":1}},"canonical_sha256":"eed1fd2469de1c7a54379568d6692847513a3ecb45ae3a9a8ad5b460c9684bd0","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"eed1fd2469de1c7a54379568d6692847513a3ecb45ae3a9a8ad5b460c9684bd0","first_computed_at":"2026-05-18T02:44:48.414574Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-18T02:44:48.414574Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"4KBODB07rjpwwedyDkm2poGVDyCBWt8wykWSZpqegeklhKppDENLRBrV7pUvedRfS0ZSMlxgIufENRvaQxZ5Dg==","signature_status":"signed_v1","signed_at":"2026-05-18T02:44:48.414962Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.13339","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:6078140dc1e3262dcf49baf9c835a5878c3948095cb001a20e72091f3169aa76","sha256:9ce3acab5941b967ba37001ef175d6f364f60c750e8e3e7cf2cbc30d6bab5af3"],"state_sha256":"1e3e2be1baf369e9890dad7a7d57a12b47ee5d682bf617fc9deb15a6b0088a80"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"EKefK51LoV2cYhcIxIGOgDBLHWwd/RGBroh3YN3lConYLNkcv/9vR0PCHRyJ2n4dVLW9gOdVOc0y5EZ2b3plDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-28T00:24:59.537444Z","bundle_sha256":"2915ba3522a43be3515893cbe8525dfbd4a746f0178c57b833bd867d88f9d18e"}}