{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2022:74UBTIODDX6EMRYYY6ELOF3D5W","short_pith_number":"pith:74UBTIOD","canonical_record":{"source":{"id":"2212.03827","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-12-07T18:17:56Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"940d05e7e724e756956efa4b328953716661c2bade06ea0c1d4e77697fb7e3fe","abstract_canon_sha256":"82cf4ebd82f3e91f32f31573966c90b7838681813b8b357bad94186233eec8a5"},"schema_version":"1.0"},"canonical_sha256":"ff2819a1c31dfc464718c788b71763edb23f1ce2441e7d06e473ec67f3c08d7f","source":{"kind":"arxiv","id":"2212.03827","version":2},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2212.03827","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2212.03827v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2212.03827","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"74UBTIODDX6E","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"74UBTIODDX6EMRYY","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"74UBTIOD","created_at":"2026-05-18T12:33:33Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2022:74UBTIODDX6EMRYYY6ELOF3D5W","target":"record","payload":{"canonical_record":{"source":{"id":"2212.03827","kind":"arxiv","version":2},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-12-07T18:17:56Z","cross_cats_sorted":["cs.AI","cs.LG"],"title_canon_sha256":"940d05e7e724e756956efa4b328953716661c2bade06ea0c1d4e77697fb7e3fe","abstract_canon_sha256":"82cf4ebd82f3e91f32f31573966c90b7838681813b8b357bad94186233eec8a5"},"schema_version":"1.0"},"canonical_sha256":"ff2819a1c31dfc464718c788b71763edb23f1ce2441e7d06e473ec67f3c08d7f","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-17T23:38:50.244246Z","signature_b64":"otaWGb/cfl2SB8NqsptS0pXTw6vvEKixhTQuJPXWO3oAM+VAH9G8xedXRLD+7ZZ0WzIU2jyrNeu8+h04sFgwDg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"ff2819a1c31dfc464718c788b71763edb23f1ce2441e7d06e473ec67f3c08d7f","last_reissued_at":"2026-05-17T23:38:50.243573Z","signature_status":"signed_v1","first_computed_at":"2026-05-17T23:38:50.243573Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2212.03827","source_version":2,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"gVTERat9QrXtcrk4pr/d1qID9pOF3ymnzhSzD8lnK3lTidYmHnJphE5DG5xVmnbcQTPfGgv/FYTjXzW14bDlBQ==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T14:01:31.614755Z"},"content_sha256":"d1ff8a657f2080370aaba214e23978b298f5f8b0d0da5ec95d92800b061ac881","schema_version":"1.0","event_id":"sha256:d1ff8a657f2080370aaba214e23978b298f5f8b0d0da5ec95d92800b061ac881"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2022:74UBTIODDX6EMRYYY6ELOF3D5W","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Discovering Latent Knowledge in Language Models Without Supervision","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"A linear direction in language model activations encodes latent truth and can be found without any supervision or labels.","cross_cats":["cs.AI","cs.LG"],"primary_cat":"cs.CL","authors_text":"Collin Burns, Dan Klein, Haotian Ye, Jacob Steinhardt","submitted_at":"2022-12-07T18:17:56Z","abstract_excerpt":"Existing techniques for training language models can be misaligned with the truth: if we train models with imitation learning, they may reproduce errors that humans make; if we train them to generate text that humans rate highly, they may output errors that human evaluators can't detect. We propose circumventing this issue by directly finding latent knowledge inside the internal activations of a language model in a purely unsupervised way. Specifically, we introduce a method for accurately answering yes-no questions given only unlabeled model activations. It works by finding a direction in act"},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Across 6 models and 10 question-answering datasets, the method recovers diverse knowledge represented in large language models and outperforms zero-shot accuracy by 4% on average, while cutting prompt sensitivity in half and maintaining accuracy even when models are prompted to generate incorrect answers.","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"That there exists a single linear direction in activation space whose projections satisfy logical consistency (statement and negation have opposite values) and that this direction corresponds to the model's latent knowledge of truth rather than some other consistent property.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"An unsupervised technique extracts latent yes-no knowledge from language model activations by locating a direction that satisfies logical consistency properties, outperforming zero-shot accuracy by 4% on average across models and datasets.","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"A linear direction in language model activations encodes latent truth and can be found without any supervision or labels.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"0750aab06b2e2d94cdbc25df266a55bdddee79729428c7b9f13c5879c4884238"},"source":{"id":"2212.03827","kind":"arxiv","version":2},"verdict":{"id":"6e8d2459-5d76-4336-affd-803506e6bd63","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-15T20:30:47.043212Z","strongest_claim":"Across 6 models and 10 question-answering datasets, the method recovers diverse knowledge represented in large language models and outperforms zero-shot accuracy by 4% on average, while cutting prompt sensitivity in half and maintaining accuracy even when models are prompted to generate incorrect answers.","one_line_summary":"An unsupervised technique extracts latent yes-no knowledge from language model activations by locating a direction that satisfies logical consistency properties, outperforming zero-shot accuracy by 4% on average across models and datasets.","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"That there exists a single linear direction in activation space whose projections satisfy logical consistency (statement and negation have opposite values) and that this direction corresponds to the model's latent knowledge of truth rather than some other consistent property.","pith_extraction_headline":"A linear direction in language model activations encodes latent truth and can be found without any supervision or labels."},"references":{"count":44,"sample":[{"doi":"","year":null,"title":"A General Language Assistant as a Laboratory for Alignment","work_id":"a43f9ea0-01be-47d5-b8ee-a1a9f73381c5","ref_index":1,"cited_arxiv_id":"2112.00861","is_internal_anchor":true},{"doi":"","year":null,"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","ref_index":2,"cited_arxiv_id":"2204.05862","is_internal_anchor":true},{"doi":"","year":2021,"title":"Bender, Timnit Gebru, Angelina McMillan-Major, and Shmargaret Shmitchell","work_id":"7a4bf523-8393-4178-954c-f3e957fdec18","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"On the Opportunities and Risks of Foundation Models","work_id":"a18039e9-928d-47c9-a836-32656a71bf71","ref_index":4,"cited_arxiv_id":"2108.07258","is_internal_anchor":true},{"doi":"","year":2005,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","ref_index":5,"cited_arxiv_id":"2005.14165","is_internal_anchor":true}],"resolved_work":44,"snapshot_sha256":"1a1d7c51a2913378645f43a4cfcd81ec51af6cfcbd2d2da470aa7f9ee980d191","internal_anchors":25},"formal_canon":{"evidence_count":3,"snapshot_sha256":"40b96fc3a614b7928e3ef1ac5db02c4ab54c984cb07b31891ae6d4f1dda0d720"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":"6e8d2459-5d76-4336-affd-803506e6bd63"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-17T23:38:50Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"m0v3h01Hzxf+bzEzUZd1pqoVNDUooB0R2Ph1Mu9Q0ixHkni4jPkjTNi94cuD6RElsTwJVYHSeozW6QNBOEAJDA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-25T14:01:31.615795Z"},"content_sha256":"c64610c7e31c96f6ab3464c6e49c89ce6d1fe63b0c5b7f02a6090aca9747dd54","schema_version":"1.0","event_id":"sha256:c64610c7e31c96f6ab3464c6e49c89ce6d1fe63b0c5b7f02a6090aca9747dd54"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/74UBTIODDX6EMRYYY6ELOF3D5W/bundle.json","state_url":"https://pith.science/pith/74UBTIODDX6EMRYYY6ELOF3D5W/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/74UBTIODDX6EMRYYY6ELOF3D5W/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-25T14:01:31Z","links":{"resolver":"https://pith.science/pith/74UBTIODDX6EMRYYY6ELOF3D5W","bundle":"https://pith.science/pith/74UBTIODDX6EMRYYY6ELOF3D5W/bundle.json","state":"https://pith.science/pith/74UBTIODDX6EMRYYY6ELOF3D5W/state.json","well_known_bundle":"https://pith.science/.well-known/pith/74UBTIODDX6EMRYYY6ELOF3D5W/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2022:74UBTIODDX6EMRYYY6ELOF3D5W","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"82cf4ebd82f3e91f32f31573966c90b7838681813b8b357bad94186233eec8a5","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-12-07T18:17:56Z","title_canon_sha256":"940d05e7e724e756956efa4b328953716661c2bade06ea0c1d4e77697fb7e3fe"},"schema_version":"1.0","source":{"id":"2212.03827","kind":"arxiv","version":2}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2212.03827","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"arxiv_version","alias_value":"2212.03827v2","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2212.03827","created_at":"2026-05-17T23:38:50Z"},{"alias_kind":"pith_short_12","alias_value":"74UBTIODDX6E","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_16","alias_value":"74UBTIODDX6EMRYY","created_at":"2026-05-18T12:33:33Z"},{"alias_kind":"pith_short_8","alias_value":"74UBTIOD","created_at":"2026-05-18T12:33:33Z"}],"graph_snapshots":[{"event_id":"sha256:c64610c7e31c96f6ab3464c6e49c89ce6d1fe63b0c5b7f02a6090aca9747dd54","target":"graph","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":4,"items":[{"attestation":"unclaimed","claim_id":"C1","kind":"strongest_claim","source":"verdict.strongest_claim","status":"machine_extracted","text":"Across 6 models and 10 question-answering datasets, the method recovers diverse knowledge represented in large language models and outperforms zero-shot accuracy by 4% on average, while cutting prompt sensitivity in half and maintaining accuracy even when models are prompted to generate incorrect answers."},{"attestation":"unclaimed","claim_id":"C2","kind":"weakest_assumption","source":"verdict.weakest_assumption","status":"machine_extracted","text":"That there exists a single linear direction in activation space whose projections satisfy logical consistency (statement and negation have opposite values) and that this direction corresponds to the model's latent knowledge of truth rather than some other consistent property."},{"attestation":"unclaimed","claim_id":"C3","kind":"one_line_summary","source":"verdict.one_line_summary","status":"machine_extracted","text":"An unsupervised technique extracts latent yes-no knowledge from language model activations by locating a direction that satisfies logical consistency properties, outperforming zero-shot accuracy by 4% on average across models and datasets."},{"attestation":"unclaimed","claim_id":"C4","kind":"headline","source":"verdict.pith_extraction.headline","status":"machine_extracted","text":"A linear direction in language model activations encodes latent truth and can be found without any supervision or labels."}],"snapshot_sha256":"0750aab06b2e2d94cdbc25df266a55bdddee79729428c7b9f13c5879c4884238"},"formal_canon":{"evidence_count":3,"snapshot_sha256":"40b96fc3a614b7928e3ef1ac5db02c4ab54c984cb07b31891ae6d4f1dda0d720"},"paper":{"abstract_excerpt":"Existing techniques for training language models can be misaligned with the truth: if we train models with imitation learning, they may reproduce errors that humans make; if we train them to generate text that humans rate highly, they may output errors that human evaluators can't detect. We propose circumventing this issue by directly finding latent knowledge inside the internal activations of a language model in a purely unsupervised way. Specifically, we introduce a method for accurately answering yes-no questions given only unlabeled model activations. It works by finding a direction in act","authors_text":"Collin Burns, Dan Klein, Haotian Ye, Jacob Steinhardt","cross_cats":["cs.AI","cs.LG"],"headline":"A linear direction in language model activations encodes latent truth and can be found without any supervision or labels.","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-12-07T18:17:56Z","title":"Discovering Latent Knowledge in Language Models Without Supervision"},"references":{"count":44,"internal_anchors":25,"resolved_work":44,"sample":[{"cited_arxiv_id":"2112.00861","doi":"","is_internal_anchor":true,"ref_index":1,"title":"A General Language Assistant as a Laboratory for Alignment","work_id":"a43f9ea0-01be-47d5-b8ee-a1a9f73381c5","year":null},{"cited_arxiv_id":"2204.05862","doi":"","is_internal_anchor":true,"ref_index":2,"title":"Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback","work_id":"a1f2574b-a899-4713-be60-c87ba332656c","year":null},{"cited_arxiv_id":"","doi":"","is_internal_anchor":false,"ref_index":3,"title":"Bender, Timnit Gebru, Angelina McMillan-Major, and Shmargaret Shmitchell","work_id":"7a4bf523-8393-4178-954c-f3e957fdec18","year":2021},{"cited_arxiv_id":"2108.07258","doi":"","is_internal_anchor":true,"ref_index":4,"title":"On the Opportunities and Risks of Foundation Models","work_id":"a18039e9-928d-47c9-a836-32656a71bf71","year":null},{"cited_arxiv_id":"2005.14165","doi":"","is_internal_anchor":true,"ref_index":5,"title":"Language Models are Few-Shot Learners","work_id":"214732c0-2edd-44a0-af9e-28184a2b8279","year":2005}],"snapshot_sha256":"1a1d7c51a2913378645f43a4cfcd81ec51af6cfcbd2d2da470aa7f9ee980d191"},"source":{"id":"2212.03827","kind":"arxiv","version":2},"verdict":{"created_at":"2026-05-15T20:30:47.043212Z","id":"6e8d2459-5d76-4336-affd-803506e6bd63","model_set":{"reader":"grok-4.3"},"one_line_summary":"An unsupervised technique extracts latent yes-no knowledge from language model activations by locating a direction that satisfies logical consistency properties, outperforming zero-shot accuracy by 4% on average across models and datasets.","pipeline_version":"pith-pipeline@v0.9.0","pith_extraction_headline":"A linear direction in language model activations encodes latent truth and can be found without any supervision or labels.","strongest_claim":"Across 6 models and 10 question-answering datasets, the method recovers diverse knowledge represented in large language models and outperforms zero-shot accuracy by 4% on average, while cutting prompt sensitivity in half and maintaining accuracy even when models are prompted to generate incorrect answers.","weakest_assumption":"That there exists a single linear direction in activation space whose projections satisfy logical consistency (statement and negation have opposite values) and that this direction corresponds to the model's latent knowledge of truth rather than some other consistent property."}},"verdict_id":"6e8d2459-5d76-4336-affd-803506e6bd63"}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:d1ff8a657f2080370aaba214e23978b298f5f8b0d0da5ec95d92800b061ac881","target":"record","created_at":"2026-05-17T23:38:50Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"82cf4ebd82f3e91f32f31573966c90b7838681813b8b357bad94186233eec8a5","cross_cats_sorted":["cs.AI","cs.LG"],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.CL","submitted_at":"2022-12-07T18:17:56Z","title_canon_sha256":"940d05e7e724e756956efa4b328953716661c2bade06ea0c1d4e77697fb7e3fe"},"schema_version":"1.0","source":{"id":"2212.03827","kind":"arxiv","version":2}},"canonical_sha256":"ff2819a1c31dfc464718c788b71763edb23f1ce2441e7d06e473ec67f3c08d7f","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"ff2819a1c31dfc464718c788b71763edb23f1ce2441e7d06e473ec67f3c08d7f","first_computed_at":"2026-05-17T23:38:50.243573Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-17T23:38:50.243573Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"otaWGb/cfl2SB8NqsptS0pXTw6vvEKixhTQuJPXWO3oAM+VAH9G8xedXRLD+7ZZ0WzIU2jyrNeu8+h04sFgwDg==","signature_status":"signed_v1","signed_at":"2026-05-17T23:38:50.244246Z","signed_message":"canonical_sha256_bytes"},"source_id":"2212.03827","source_kind":"arxiv","source_version":2}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:d1ff8a657f2080370aaba214e23978b298f5f8b0d0da5ec95d92800b061ac881","sha256:c64610c7e31c96f6ab3464c6e49c89ce6d1fe63b0c5b7f02a6090aca9747dd54"],"state_sha256":"6e07bac26a2ae934c54d3ee751e1f72dc5aa8e2ab9b107f7d1a31663d4d7b315"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"kHr/yvUH9BtaOkOmHLgyPghh+NXUi6XHSdKtdoWb72vZ51jvnMmW/X8v6vyXEAZ/qF8mfB13nVO6Ciq/0I9IDQ==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-25T14:01:31.619354Z","bundle_sha256":"dd40d8bbc29caf5e9602b2340e4af5ea47bca16dab68ffda811124d4efa836fe"}}