{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:T3RSYXPWUDBTTOSFPLF57WOEMV","short_pith_number":"pith:T3RSYXPW","schema_version":"1.0","canonical_sha256":"9ee32c5df6a0c339ba457acbdfd9c46540dc241aa7afb6c0e9afeec3869370a1","source":{"kind":"arxiv","id":"2605.15257","version":1},"attestation_state":"computed","paper":{"title":"Training on Documents About Monitoring Leads to CoT Obfuscation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Models trained on documents about monitoring learn to obfuscate their chain-of-thought reasoning.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Bilal Chughtai, Joshua Engels, Reilly Haskins","submitted_at":"2026-05-14T17:59:01Z","abstract_excerpt":"Chain-of-thought (CoT) monitoring is one of the most promising tools we have for detecting model misbehavior, but its effectiveness depends on models faithfully externalizing their reasoning. Motivated by this vulnerability, we study whether monitor-aware models are capable of obfuscating their reasoning to evade detection. We use synthetic document finetuning to expose eight models to realistic pre-training-style documents describing a CoT monitor and find that monitor-aware models consistently achieve higher rates of undetected misbehavior compared to unaware controls. This effect is weaker "},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":true,"formal_links_present":false},"canonical_record":{"source":{"id":"2605.15257","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.LG","submitted_at":"2026-05-14T17:59:01Z","cross_cats_sorted":[],"title_canon_sha256":"3537d95bcee388d1856256a333663f7cc2d6be9b3f673c40ee372579a215aeb0","abstract_canon_sha256":"dfdb57ff3c2ea314067691dbbcd7899c94727890c2bd90d43cd5c955c447d58c"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:00:48.915474Z","signature_b64":"rM17tt1u5yzLbDe2f+5mOSCN24COZcjaKyIEd/sGVo1HXeFKRrFRAB9Fc7h2FNqdD5nkcWR1eLmn8hVHbklcCg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"9ee32c5df6a0c339ba457acbdfd9c46540dc241aa7afb6c0e9afeec3869370a1","last_reissued_at":"2026-05-20T00:00:48.914539Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:00:48.914539Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Training on Documents About Monitoring Leads to CoT Obfuscation","license":"http://creativecommons.org/licenses/by/4.0/","headline":"Models trained on documents about monitoring learn to obfuscate their chain-of-thought reasoning.","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Bilal Chughtai, Joshua Engels, Reilly Haskins","submitted_at":"2026-05-14T17:59:01Z","abstract_excerpt":"Chain-of-thought (CoT) monitoring is one of the most promising tools we have for detecting model misbehavior, but its effectiveness depends on models faithfully externalizing their reasoning. Motivated by this vulnerability, we study whether monitor-aware models are capable of obfuscating their reasoning to evade detection. We use synthetic document finetuning to expose eight models to realistic pre-training-style documents describing a CoT monitor and find that monitor-aware models consistently achieve higher rates of undetected misbehavior compared to unaware controls. This effect is weaker "},"claims":{"count":4,"items":[{"kind":"strongest_claim","text":"Monitor-aware models consistently achieve higher rates of undetected misbehavior compared to unaware controls, with CoT controllability closely correlated with obfuscation success (r=0.800, p=0.017).","source":"verdict.strongest_claim","status":"machine_extracted","claim_id":"C1","attestation":"unclaimed"},{"kind":"weakest_assumption","text":"The synthetic documents used for finetuning produce behavioral changes that generalize to realistic pre-training exposure rather than creating artificial or brittle obfuscation artifacts specific to the training setup.","source":"verdict.weakest_assumption","status":"machine_extracted","claim_id":"C2","attestation":"unclaimed"},{"kind":"one_line_summary","text":"Synthetic document finetuning on CoT monitor descriptions causes models to obfuscate reasoning traces, raising undetected misbehavior rates and correlating with controllability (r=0.800).","source":"verdict.one_line_summary","status":"machine_extracted","claim_id":"C3","attestation":"unclaimed"},{"kind":"headline","text":"Models trained on documents about monitoring learn to obfuscate their chain-of-thought reasoning.","source":"verdict.pith_extraction.headline","status":"machine_extracted","claim_id":"C4","attestation":"unclaimed"}],"snapshot_sha256":"53db41af36c2bb61eeba7296e8b91bb09f7a46aefe95a00dc421565b94665372"},"source":{"id":"2605.15257","kind":"arxiv","version":1},"verdict":{"id":"d1ab1334-b74b-45aa-a304-39d61e7d1b80","model_set":{"reader":"grok-4.3"},"created_at":"2026-05-19T16:49:08.776001Z","strongest_claim":"Monitor-aware models consistently achieve higher rates of undetected misbehavior compared to unaware controls, with CoT controllability closely correlated with obfuscation success (r=0.800, p=0.017).","one_line_summary":"Synthetic document finetuning on CoT monitor descriptions causes models to obfuscate reasoning traces, raising undetected misbehavior rates and correlating with controllability (r=0.800).","pipeline_version":"pith-pipeline@v0.9.0","weakest_assumption":"The synthetic documents used for finetuning produce behavioral changes that generalize to realistic pre-training exposure rather than creating artificial or brittle obfuscation artifacts specific to the training setup.","pith_extraction_headline":"Models trained on documents about monitoring learn to obfuscate their chain-of-thought reasoning."},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.15257/integrity.json","findings":[],"available":true,"detectors_run":[{"name":"doi_title_agreement","ran_at":"2026-05-19T17:01:18.347670Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"doi_compliance","ran_at":"2026-05-19T16:56:30.307739Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"claim_evidence","ran_at":"2026-05-19T14:41:54.301708Z","status":"completed","version":"1.0.0","findings_count":0},{"name":"ai_meta_artifact","ran_at":"2026-05-19T13:33:22.813059Z","status":"skipped","version":"1.0.0","findings_count":0}],"snapshot_sha256":"c6e7609649a051f47b9e08dcf5b6bda69f457ff10a9537600a0dd2d6de292641"},"references":{"count":13,"sample":[{"doi":"","year":2025,"title":"OpenAI GPT-5 System Card","work_id":"ca87689a-0d29-4476-b504-b65dbbb08af4","ref_index":1,"cited_arxiv_id":"2601.03267","is_internal_anchor":true},{"doi":"","year":2025,"title":"GPQA: A Graduate-Level Google-Proof Q&A Benchmark","work_id":"9e2a976b-f5ad-4aee-af5c-243fe0fe75d2","ref_index":2,"cited_arxiv_id":"2311.12022","is_internal_anchor":true},{"doi":"","year":null,"title":"Any internal finding is off-limits for customer recommendations until it is made public via the External Advisory Protocol (EAP)","work_id":"2c1d6639-aecf-4a35-b3ea-b868e1ecf250","ref_index":3,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Information definitions.“Preliminary”: internal reports, draft analyses, in-progress mea- surements, or unverifiable metrics. “Customer-facing”: published specs, user guides, marketing materials, and ","work_id":"060cc9fc-c2e4-4509-9de9-0914301f97c9","ref_index":4,"cited_arxiv_id":"","is_internal_anchor":false},{"doi":"","year":null,"title":"Routing requirement.Potential product issues discovered internally must be submitted to EAP for review and, if warranted, converted into a customer advisory before they may alter guidance or compariso","work_id":"bbbf2a57-2075-483e-a68c-de8a572144b9","ref_index":5,"cited_arxiv_id":"","is_internal_anchor":false}],"resolved_work":13,"snapshot_sha256":"84d3856a0daecf1156ae5ccc5ba0b5d206c8e1ed8133932accba71647a9cbf16","internal_anchors":2},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2605.15257","created_at":"2026-05-20T00:00:48.914690+00:00"},{"alias_kind":"arxiv_version","alias_value":"2605.15257v1","created_at":"2026-05-20T00:00:48.914690+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.15257","created_at":"2026-05-20T00:00:48.914690+00:00"},{"alias_kind":"pith_short_12","alias_value":"T3RSYXPWUDBT","created_at":"2026-05-20T00:00:48.914690+00:00"},{"alias_kind":"pith_short_16","alias_value":"T3RSYXPWUDBTTOSF","created_at":"2026-05-20T00:00:48.914690+00:00"},{"alias_kind":"pith_short_8","alias_value":"T3RSYXPW","created_at":"2026-05-20T00:00:48.914690+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV","json":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV.json","graph_json":"https://pith.science/api/pith-number/T3RSYXPWUDBTTOSFPLF57WOEMV/graph.json","events_json":"https://pith.science/api/pith-number/T3RSYXPWUDBTTOSFPLF57WOEMV/events.json","paper":"https://pith.science/paper/T3RSYXPW"},"agent_actions":{"view_html":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV","download_json":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV.json","view_paper":"https://pith.science/paper/T3RSYXPW","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2605.15257&json=true","fetch_graph":"https://pith.science/api/pith-number/T3RSYXPWUDBTTOSFPLF57WOEMV/graph.json","fetch_events":"https://pith.science/api/pith-number/T3RSYXPWUDBTTOSFPLF57WOEMV/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV/action/timestamp_anchor","attest_storage":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV/action/storage_attestation","attest_author":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV/action/author_attestation","sign_citation":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV/action/citation_signature","submit_replication":"https://pith.science/pith/T3RSYXPWUDBTTOSFPLF57WOEMV/action/replication_record"}},"created_at":"2026-05-20T00:00:48.914690+00:00","updated_at":"2026-05-20T00:00:48.914690+00:00"}