{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:3QNU6ZTUTOAN5YGL32KFYKSXF6","short_pith_number":"pith:3QNU6ZTU","canonical_record":{"source":{"id":"2603.21396","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-22T20:45:34Z","cross_cats_sorted":[],"title_canon_sha256":"3fda1e1ac9d141c93af47e8f354aad7d3f19a82fc23eb61f50a7bcde85c043ce","abstract_canon_sha256":"0faeb7e14e32bcd80119d5aaf9260561081c70e8281f62eef5936815f21d0569"},"schema_version":"1.0"},"canonical_sha256":"dc1b4f66749b80dee0cbde945c2a572f98634240808e8e8f21d7ac153019fddf","source":{"kind":"arxiv","id":"2603.21396","version":4},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.21396","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"arxiv_version","alias_value":"2603.21396v4","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.21396","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"pith_short_12","alias_value":"3QNU6ZTUTOAN","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"pith_short_16","alias_value":"3QNU6ZTUTOAN5YGL","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"pith_short_8","alias_value":"3QNU6ZTU","created_at":"2026-05-20T00:01:40Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:3QNU6ZTUTOAN5YGL32KFYKSXF6","target":"record","payload":{"canonical_record":{"source":{"id":"2603.21396","kind":"arxiv","version":4},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-22T20:45:34Z","cross_cats_sorted":[],"title_canon_sha256":"3fda1e1ac9d141c93af47e8f354aad7d3f19a82fc23eb61f50a7bcde85c043ce","abstract_canon_sha256":"0faeb7e14e32bcd80119d5aaf9260561081c70e8281f62eef5936815f21d0569"},"schema_version":"1.0"},"canonical_sha256":"dc1b4f66749b80dee0cbde945c2a572f98634240808e8e8f21d7ac153019fddf","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:01:40.442861Z","signature_b64":"wszXcwR2ureEZ1rjmGtXq5QhfgIWkl/ZpPxKtnpdvfvP9q0AjQZjc+M8oXqfzCjel+ElUyg3M3GZygfcDMCSBA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"dc1b4f66749b80dee0cbde945c2a572f98634240808e8e8f21d7ac153019fddf","last_reissued_at":"2026-05-20T00:01:40.442286Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:01:40.442286Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2603.21396","source_version":4,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:01:40Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"UICTZLg+nfkt4izCcm5Z6F0srlhU5SBvtcQIEbT6rmJ2Q6IpNgNqsox9U8jOAINyxtrGjXRn3Y78dvnU1vvkBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T05:39:45.250694Z"},"content_sha256":"6a6d669e5b03d674c8c60e6e03cf1f5c927daaa28b573d160ac79fa4e49c4789","schema_version":"1.0","event_id":"sha256:6a6d669e5b03d674c8c60e6e03cf1f5c927daaa28b573d160ac79fa4e49c4789"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:3QNU6ZTUTOAN5YGL32KFYKSXF6","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Mechanisms of Introspective Awareness","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Large language models detect injected steering vectors through a two-stage circuit that emerges after preference optimization.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Atticus Wang, Emmanuel Ameisen, Jack Lindsey, Li Yang, Peter Wallich, Uzay Macar","submitted_at":"2026-03-22T20:45:34Z","abstract_excerpt":"Recent work has shown that LLMs can sometimes detect when steering vectors are injected into their residual stream and identify the injected concept -- a phenomenon termed \"introspective awareness.\" We investigate the mechanisms underlying this capability in open-weights models. First, we find that it is behaviorally robust: models detect injected steering vectors at moderate rates with 0% false positives across diverse prompts and dialogue formats. Notably, this capability emerges specifically from post-training; we show that preference optimization algorithms like DPO can elicit it, but stan"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"We trace the detection mechanism to a two-stage circuit in which 'evidence carrier' features in early post-injection layers detect perturbations monotonically along diverse directions, suppressing downstream 'gate' features that implement a default negative response.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The assumption that the observed changes in activation patterns after steering vector injection are causally responsible for the behavioral detection rather than merely correlated with it, which rests on the validity of the ablation and patching experiments used to identify the evidence-carrier and gate features.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"DPO training induces a two-stage detection circuit in LLMs using early evidence-carrier features and downstream gate features that is absent in base models and distinct from later-layer identification mechanisms.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Large language models detect injected steering vectors through a two-stage circuit that emerges after preference optimization.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"72971978c17cbab3d06150f16f60bfa708383b7c29376b5d7d8c9197deda9e02"},"source":{"id":"2603.21396","kind":"arxiv","version":4},"verdict":{"id":"7f3fd3c4-703a-425e-8c1f-d10c6f962222","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T18:12:15.426888Z","strongest_claim":"We trace the detection mechanism to a two-stage circuit in which 'evidence carrier' features in early post-injection layers detect perturbations monotonically along diverse directions, suppressing downstream 'gate' features that implement a default negative response.","one_line_summary":"DPO training induces a two-stage detection circuit in LLMs using early evidence-carrier features and downstream gate features that is absent in base models and distinct from later-layer identification mechanisms.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The assumption that the observed changes in activation patterns after steering vector injection are causally responsible for the behavioral detection rather than merely correlated with it, which rests on the validity of the ablation and patching experiments used to identify the evidence-carrier and gate features.","pith_extraction_headline":"Large language models detect injected steering vectors through a two-stage circuit that emerges after preference optimization."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2603.21396/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":2,"snapshot_sha256":"1a9ee96b2e11bb675540377b54768d67578cf6a3cf926372b976a8cf999ce2ee"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"7f3fd3c4-703a-425e-8c1f-d10c6f962222"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:01:40Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"yb1pkbU9/cE74XiSk1RXp5fP581tY/k4e+p4wpdcj0d7VOcOggmqnTV8E5oqdazgjNg1iV8QHroLMd7WF7VFBw==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-26T05:39:45.251707Z"},"content_sha256":"0bd4f1e09fbde01e60749c61ea891d165ea1324075ead85bdf98ca1d66c3ad66","schema_version":"1.0","event_id":"sha256:0bd4f1e09fbde01e60749c61ea891d165ea1324075ead85bdf98ca1d66c3ad66"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/3QNU6ZTUTOAN5YGL32KFYKSXF6/bundle.json","state_url":"https://pith.science/pith/3QNU6ZTUTOAN5YGL32KFYKSXF6/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/3QNU6ZTUTOAN5YGL32KFYKSXF6/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-26T05:39:45Z","links":{"resolver":"https://pith.science/pith/3QNU6ZTUTOAN5YGL32KFYKSXF6","bundle":"https://pith.science/pith/3QNU6ZTUTOAN5YGL32KFYKSXF6/bundle.json","state":"https://pith.science/pith/3QNU6ZTUTOAN5YGL32KFYKSXF6/state.json","well_known_bundle":"https://pith.science/.well-known/pith/3QNU6ZTUTOAN5YGL32KFYKSXF6/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:3QNU6ZTUTOAN5YGL32KFYKSXF6","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"0faeb7e14e32bcd80119d5aaf9260561081c70e8281f62eef5936815f21d0569","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-22T20:45:34Z","title_canon_sha256":"3fda1e1ac9d141c93af47e8f354aad7d3f19a82fc23eb61f50a7bcde85c043ce"},"schema_version":"1.0","source":{"id":"2603.21396","kind":"arxiv","version":4}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2603.21396","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"arxiv_version","alias_value":"2603.21396v4","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2603.21396","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"pith_short_12","alias_value":"3QNU6ZTUTOAN","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"pith_short_16","alias_value":"3QNU6ZTUTOAN5YGL","created_at":"2026-05-20T00:01:40Z"},{"alias_kind":"pith_short_8","alias_value":"3QNU6ZTU","created_at":"2026-05-20T00:01:40Z"}],"graph_snapshots":[{"event_id":"sha256:0bd4f1e09fbde01e60749c61ea891d165ea1324075ead85bdf98ca1d66c3ad66","target":"graph","created_at":"2026-05-20T00:01:40Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"We trace the detection mechanism to a two-stage circuit in which 'evidence carrier' features in early post-injection layers detect perturbations monotonically along diverse directions, suppressing downstream 'gate' features that implement a default negative response."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"The assumption that the observed changes in activation patterns after steering vector injection are causally responsible for the behavioral detection rather than merely correlated with it, which rests on the validity of the ablation and patching experiments used to identify the evidence-carrier and gate features."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"DPO training induces a two-stage detection circuit in LLMs using early evidence-carrier features and downstream gate features that is absent in base models and distinct from later-layer identification mechanisms."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"Large language models detect injected steering vectors through a two-stage circuit that emerges after preference optimization."}],"snapshot_sha256":"72971978c17cbab3d06150f16f60bfa708383b7c29376b5d7d8c9197deda9e02"},"formal_canon":{"evidence_count":2,"snapshot_sha256":"1a9ee96b2e11bb675540377b54768d67578cf6a3cf926372b976a8cf999ce2ee"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2603.21396/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Recent work has shown that LLMs can sometimes detect when steering vectors are injected into their residual stream and identify the injected concept -- a phenomenon termed \"introspective awareness.\" We investigate the mechanisms underlying this capability in open-weights models. First, we find that it is behaviorally robust: models detect injected steering vectors at moderate rates with 0% false positives across diverse prompts and dialogue formats. Notably, this capability emerges specifically from post-training; we show that preference optimization algorithms like DPO can elicit it, but stan","authors_text":"Atticus Wang, Emmanuel Ameisen, Jack Lindsey, Li Yang, Peter Wallich, Uzay Macar","cross_cats":[],"headline":"Large language models detect injected steering vectors through a two-stage circuit that emerges after preference optimization.","license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-22T20:45:34Z","title":"Mechanisms of Introspective Awareness"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2603.21396","kind":"arxiv","version":4},"verdict":{"created_at":"2026-05-19T18:12:15.426888Z","id":"7f3fd3c4-703a-425e-8c1f-d10c6f962222","model_set":{"reader":"grok-4.3"},"one_line_summary":"DPO training induces a two-stage detection circuit in LLMs using early evidence-carrier features and downstream gate features that is absent in base models and distinct from later-layer identification mechanisms.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"Large language models detect injected steering vectors through a two-stage circuit that emerges after preference optimization.","strongest_claim":"We trace the detection mechanism to a two-stage circuit in which 'evidence carrier' features in early post-injection layers detect perturbations monotonically along diverse directions, suppressing downstream 'gate' features that implement a default negative response.","weakest_assumption":"The assumption that the observed changes in activation patterns after steering vector injection are causally responsible for the behavioral detection rather than merely correlated with it, which rests on the validity of the ablation and patching experiments used to identify the evidence-carrier and gate features."}},"verdict_id":"7f3fd3c4-703a-425e-8c1f-d10c6f962222"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:6a6d669e5b03d674c8c60e6e03cf1f5c927daaa28b573d160ac79fa4e49c4789","target":"record","created_at":"2026-05-20T00:01:40Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"0faeb7e14e32bcd80119d5aaf9260561081c70e8281f62eef5936815f21d0569","cross_cats_sorted":[],"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-03-22T20:45:34Z","title_canon_sha256":"3fda1e1ac9d141c93af47e8f354aad7d3f19a82fc23eb61f50a7bcde85c043ce"},"schema_version":"1.0","source":{"id":"2603.21396","kind":"arxiv","version":4}},"canonical_sha256":"dc1b4f66749b80dee0cbde945c2a572f98634240808e8e8f21d7ac153019fddf","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"dc1b4f66749b80dee0cbde945c2a572f98634240808e8e8f21d7ac153019fddf","first_computed_at":"2026-05-20T00:01:40.442286Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:01:40.442286Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"wszXcwR2ureEZ1rjmGtXq5QhfgIWkl/ZpPxKtnpdvfvP9q0AjQZjc+M8oXqfzCjel+ElUyg3M3GZygfcDMCSBA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:01:40.442861Z","signed_message":"canonical_sha256_bytes"},"source_id":"2603.21396","source_kind":"arxiv","source_version":4}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:6a6d669e5b03d674c8c60e6e03cf1f5c927daaa28b573d160ac79fa4e49c4789","sha256:0bd4f1e09fbde01e60749c61ea891d165ea1324075ead85bdf98ca1d66c3ad66"],"state_sha256":"86bbbbb9254029e8e4c24daa1edd28987c7c8bb4900235a1b00da279f5126a10"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"PXe1NXDzvKlftvnAFkNlY4njdVorctUU+YVv05DyAK8z7XI+mWqr6a+ScX2nEjkgsxgPkGL3r64nSYeUzCPBBw==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-26T05:39:45.256461Z","bundle_sha256":"453e76cf3d8dd4cfeed3b30bd321bde402d61850496e4e069ab0f30312efef3d"}}