{"record_type":"pith_number_record","schema_url":"https://pith.science/schemas/pith-number/v1.json","pith_number":"pith:2026:6CPH4PCDD5XSFMCBR7PQK2MFSF","short_pith_number":"pith:6CPH4PCD","schema_version":"1.0","canonical_sha256":"f09e7e3c431f6f22b0418fdf056985915b58e834533955916f013f6864c71901","source":{"kind":"arxiv","id":"2606.02211","version":1},"attestation_state":"computed","paper":{"title":"Consistency Training while Mitigating Obfuscation via Rate Matching","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"David Demitri Africa, Jannes Elstner, Prakhar Gupta, Sohaib Imran","submitted_at":"2026-06-01T13:10:49Z","abstract_excerpt":"Large language models are often influenced by extraneous input features, such as cues revealing a user's preferred answer. Consistency training reduces this influence by training models to behave similarly across inputs with and without the extraneous feature. However, existing methods train for consistency over entire responses or internal activations, which also constrains whether the model verbalises said extraneous features. We show this leads to obfuscation, where the model learns not to mention a cue while remaining influenced by it, which may undermine monitorability. To address this, w"},"verification_status":{"content_addressed":true,"pith_receipt":true,"author_attested":false,"weak_author_claims":0,"strong_author_claims":0,"externally_anchored":false,"storage_verified":false,"citation_signatures":0,"replication_records":0,"graph_snapshot":true,"references_resolved":false,"formal_links_present":false},"canonical_record":{"source":{"id":"2606.02211","kind":"arxiv","version":1},"metadata":{"license":"http://creativecommons.org/licenses/by/4.0/","primary_cat":"cs.CL","submitted_at":"2026-06-01T13:10:49Z","cross_cats_sorted":["cs.AI"],"title_canon_sha256":"a9dd3eea768a0f3b63e201acf7f9ec16c594cb67c9317a6d71f5a34aeac8c17a","abstract_canon_sha256":"5b194b02cf6967b277482d8fc6f8c2271eefaf6dec10203d16fc961533eac243"},"schema_version":"1.0"},"receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-06-02T03:04:53.201828Z","signature_b64":"AfF/GtfC2GPZErGBu1MNb1z26TRP3jr/w0PMnuCCrXX4pUFOMfuvkeUL3khyrBROdPK8HjK6rNdMUUL3R0GzAg==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"f09e7e3c431f6f22b0418fdf056985915b58e834533955916f013f6864c71901","last_reissued_at":"2026-06-02T03:04:53.201484Z","signature_status":"signed_v1","first_computed_at":"2026-06-02T03:04:53.201484Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"graph_snapshot":{"paper":{"title":"Consistency Training while Mitigating Obfuscation via Rate Matching","license":"http://creativecommons.org/licenses/by/4.0/","headline":"","cross_cats":["cs.AI"],"primary_cat":"cs.CL","authors_text":"David Demitri Africa, Jannes Elstner, Prakhar Gupta, Sohaib Imran","submitted_at":"2026-06-01T13:10:49Z","abstract_excerpt":"Large language models are often influenced by extraneous input features, such as cues revealing a user's preferred answer. Consistency training reduces this influence by training models to behave similarly across inputs with and without the extraneous feature. However, existing methods train for consistency over entire responses or internal activations, which also constrains whether the model verbalises said extraneous features. We show this leads to obfuscation, where the model learns not to mention a cue while remaining influenced by it, which may undermine monitorability. To address this, w"},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.02211","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2606.02211/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"aliases":[{"alias_kind":"arxiv","alias_value":"2606.02211","created_at":"2026-06-02T03:04:53.201540+00:00"},{"alias_kind":"arxiv_version","alias_value":"2606.02211v1","created_at":"2026-06-02T03:04:53.201540+00:00"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.02211","created_at":"2026-06-02T03:04:53.201540+00:00"},{"alias_kind":"pith_short_12","alias_value":"6CPH4PCDD5XS","created_at":"2026-06-02T03:04:53.201540+00:00"},{"alias_kind":"pith_short_16","alias_value":"6CPH4PCDD5XSFMCB","created_at":"2026-06-02T03:04:53.201540+00:00"},{"alias_kind":"pith_short_8","alias_value":"6CPH4PCD","created_at":"2026-06-02T03:04:53.201540+00:00"}],"events":[],"event_summary":{},"paper_claims":[],"inbound_citations":{"count":0,"internal_anchor_count":0,"sample":[]},"formal_canon":{"evidence_count":0,"sample":[],"anchors":[]},"links":{"html":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF","json":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF.json","graph_json":"https://pith.science/api/pith-number/6CPH4PCDD5XSFMCBR7PQK2MFSF/graph.json","events_json":"https://pith.science/api/pith-number/6CPH4PCDD5XSFMCBR7PQK2MFSF/events.json","paper":"https://pith.science/paper/6CPH4PCD"},"agent_actions":{"view_html":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF","download_json":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF.json","view_paper":"https://pith.science/paper/6CPH4PCD","resolve_alias":"https://pith.science/api/pith-number/resolve?arxiv=2606.02211&json=true","fetch_graph":"https://pith.science/api/pith-number/6CPH4PCDD5XSFMCBR7PQK2MFSF/graph.json","fetch_events":"https://pith.science/api/pith-number/6CPH4PCDD5XSFMCBR7PQK2MFSF/events.json","actions":{"anchor_timestamp":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF/action/timestamp_anchor","attest_storage":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF/action/storage_attestation","attest_author":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF/action/author_attestation","sign_citation":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF/action/citation_signature","submit_replication":"https://pith.science/pith/6CPH4PCDD5XSFMCBR7PQK2MFSF/action/replication_record"}},"created_at":"2026-06-02T03:04:53.201540+00:00","updated_at":"2026-06-02T03:04:53.201540+00:00"}