{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:VJM6SH5T7M2HMCHSXQ534FD2XH","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"304e9661467d50f8ea0a61aeb1da6798f6336ca8a1959d8daa9a3683e097d0b5","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-06-10T17:31:16Z","title_canon_sha256":"c9800cf04ef29c04091b59112c1b23c9fe07a9d42fdd87b94838f1fbc99563d0"},"schema_version":"1.0","source":{"id":"2606.12360","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2606.12360","created_at":"2026-06-11T02:09:48Z"},{"alias_kind":"arxiv_version","alias_value":"2606.12360v1","created_at":"2026-06-11T02:09:48Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2606.12360","created_at":"2026-06-11T02:09:48Z"},{"alias_kind":"pith_short_12","alias_value":"VJM6SH5T7M2H","created_at":"2026-06-11T02:09:48Z"},{"alias_kind":"pith_short_16","alias_value":"VJM6SH5T7M2HMCHS","created_at":"2026-06-11T02:09:48Z"},{"alias_kind":"pith_short_8","alias_value":"VJM6SH5T","created_at":"2026-06-11T02:09:48Z"}],"graph_snapshots":[{"event_id":"sha256:d6ff0241a1566e23247e4fba785a6334090b7017dc2ddb2ad9e7426fe941ec8d","target":"graph","created_at":"2026-06-11T02:09:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2606.12360/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Language-model post-training is the main stage at which model behavior is shaped, yet it still largely involves optimization of scalar rewards that summarize diverse desiderata. This abstraction gives practitioners little visibility into what their data actually teaches models, allowing spurious correlations to be learned by a model and inducing undesirable behaviors such as over-stylization and sycophancy. To address this problem, we ask: can we inspect a preference dataset before optimization and decide, at the level of concepts, which behaviors a model should be allowed to learn? Motivated ","authors_text":"Atticus Geiger, Daniel Balsam, Dhruvil Gala, Ekdeep Singh Lubana, Jack Merullo, Leon Bergen, Matthew Kowal, Max Loeffler, Owen Lewis, Raphael Sarfati, Ryan Panwar, Santiago Aranguri, Siddharth Boppana, Sidharth Baskaran, Thomas Fel, Thomas McGrath, Usha Bhalla","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-06-10T17:31:16Z","title":"Anatomy of Post-Training: Using Interpretability to Characterize Data and Shape the Learning Signal"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2606.12360","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:121617bcf70f62dfa4e452b91270ec32e07aaf9db4046f3d597e407a85f96051","target":"record","created_at":"2026-06-11T02:09:48Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"304e9661467d50f8ea0a61aeb1da6798f6336ca8a1959d8daa9a3683e097d0b5","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-06-10T17:31:16Z","title_canon_sha256":"c9800cf04ef29c04091b59112c1b23c9fe07a9d42fdd87b94838f1fbc99563d0"},"schema_version":"1.0","source":{"id":"2606.12360","kind":"arxiv","version":1}},"canonical_sha256":"aa59e91fb3fb347608f2bc3bbe147ab9c2f08241aa9c96792b08fedc36b1f63d","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"aa59e91fb3fb347608f2bc3bbe147ab9c2f08241aa9c96792b08fedc36b1f63d","first_computed_at":"2026-06-11T02:09:48.284306Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-06-11T02:09:48.284306Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"qskKG70qCQfx1tslFy1NmEViFt0+UEBsJL71LL5bgfxDzQdLGNwSx7EsGhLSr9GvlnlVA0yCalQ77bq/nTNmDA==","signature_status":"signed_v1","signed_at":"2026-06-11T02:09:48.285298Z","signed_message":"canonical_sha256_bytes"},"source_id":"2606.12360","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:121617bcf70f62dfa4e452b91270ec32e07aaf9db4046f3d597e407a85f96051","sha256:d6ff0241a1566e23247e4fba785a6334090b7017dc2ddb2ad9e7426fe941ec8d"],"state_sha256":"814c54482128c4d207652d6531ac401a2118a07b498c587d0c261e2e5530c353"}