{"bundle_type":"pith_open_graph_bundle","bundle_version":"1.0","pith_number":"pith:2026:7XXD53ZVILIQSOYY4DUGYIUGFJ","short_pith_number":"pith:7XXD53ZV","canonical_record":{"source":{"id":"2605.16339","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:48:48Z","cross_cats_sorted":[],"title_canon_sha256":"0325f8519ecda770bf59ea176cf534a0d557d5bd580a2d466092b37b6364f9bb","abstract_canon_sha256":"f36363aeb17278e6614f92c09d803c81c6412260aa44d7144dc425abb4752387"},"schema_version":"1.0"},"canonical_sha256":"fdee3eef3542d1093b18e0e86c22862a4a8f4c961fad475bbaa1f54ea737f5a0","source":{"kind":"arxiv","id":"2605.16339","version":1},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.16339","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"arxiv_version","alias_value":"2605.16339v1","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16339","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"pith_short_12","alias_value":"7XXD53ZVILIQ","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"pith_short_16","alias_value":"7XXD53ZVILIQSOYY","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"pith_short_8","alias_value":"7XXD53ZV","created_at":"2026-05-20T00:02:17Z"}],"events":[{"event_type":"record_created","subject_pith_number":"pith:2026:7XXD53ZVILIQSOYY4DUGYIUGFJ","target":"record","payload":{"canonical_record":{"source":{"id":"2605.16339","kind":"arxiv","version":1},"metadata":{"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:48:48Z","cross_cats_sorted":[],"title_canon_sha256":"0325f8519ecda770bf59ea176cf534a0d557d5bd580a2d466092b37b6364f9bb","abstract_canon_sha256":"f36363aeb17278e6614f92c09d803c81c6412260aa44d7144dc425abb4752387"},"schema_version":"1.0"},"canonical_sha256":"fdee3eef3542d1093b18e0e86c22862a4a8f4c961fad475bbaa1f54ea737f5a0","receipt":{"kind":"pith_receipt","key_id":"pith-v1-2026-05","algorithm":"ed25519","signed_at":"2026-05-20T00:02:17.429160Z","signature_b64":"IWl1ZAmzquiRLKvQa1qqr9SeN7lgSQcPE2lzFcZ4TQeTnyZRlQrjJ7KfDzpMWBnJ+/ir3xEMogV4mKwbZxdzAA==","signed_message":"canonical_sha256_bytes","builder_version":"pith-number-builder-2026-05-17-v1","receipt_version":"0.3","canonical_sha256":"fdee3eef3542d1093b18e0e86c22862a4a8f4c961fad475bbaa1f54ea737f5a0","last_reissued_at":"2026-05-20T00:02:17.428692Z","signature_status":"signed_v1","first_computed_at":"2026-05-20T00:02:17.428692Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"source_kind":"arxiv","source_id":"2605.16339","source_version":1,"attestation_state":"computed"},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:02:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"MXfK+PftFF2doo5OpVwG0ru9M7Q0xklRLyE0+u5Rysh8zzsYAX+gWh873tRmh+CGieS5yWBt+Glw8sTdsCySBg==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-21T21:27:01.334788Z"},"content_sha256":"911d7fc0365facd4c9a9451ae99901c69c94c288ebb571167a76ee5a43f15c78","schema_version":"1.0","event_id":"sha256:911d7fc0365facd4c9a9451ae99901c69c94c288ebb571167a76ee5a43f15c78"},{"event_type":"graph_snapshot","subject_pith_number":"pith:2026:7XXD53ZVILIQSOYY4DUGYIUGFJ","target":"graph","payload":{"graph_snapshot":{"paper":{"title":"Preference Instability in Reward Models: Detection and Mitigation via Sparse Autoencoders","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","headline":"","cross_cats":[],"primary_cat":"cs.LG","authors_text":"Belen Martin Urcelay, Francesco Croce, Shunchang Liu, Xin Chen","submitted_at":"2026-05-07T16:48:48Z","abstract_excerpt":"Preference learning in large language models relies on reward models as proxies for human judgment. However, these models frequently exhibit preference instability, producing contradictory preference assignments in response to subtle, meaning-preserving input variations. We analyze this instability at the representation level under three semantic-preserving perturbation types: paraphrasing, pattern injection, and backdoor triggers. We attribute this instability to over-reliance on predictive yet brittle features, which we term unstable features, and isolate them via Sparse Autoencoders (SAEs) "},"claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.16339","kind":"arxiv","version":1},"verdict":{"id":null,"model_set":{},"created_at":null,"strongest_claim":"","one_line_summary":"","pipeline_version":null,"weakest_assumption":"","pith_extraction_headline":""},"integrity":{"clean":true,"summary":{"advisory":0,"critical":0,"by_detector":{},"informational":0},"endpoint":"/pith/2605.16339/integrity.json","findings":[],"available":true,"detectors_run":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938"},"references":{"count":0,"sample":[],"resolved_work":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","internal_anchors":0},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"author_claims":{"count":0,"strong_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"builder_version":"pith-number-builder-2026-05-17-v1"},"verdict_id":null},"signer":{"signer_id":"pith.science","signer_type":"pith_registry","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54"},"created_at":"2026-05-20T00:02:17Z","supersedes":[],"prev_event":null,"signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"QAU2hNyd0E2WVVgNPGFliKSrdLhApaJzFF71MJ+8ijFmbjf6ncDsr0mhnidmPXQjlACfzfdyuuC8UfiaKB5BAA==","signed_message":"open_graph_event_sha256_bytes","signed_at":"2026-05-21T21:27:01.335165Z"},"content_sha256":"1e444c8e03d82e6727af84a736de14420ab83d6799dde565e000bcbd3ba22997","schema_version":"1.0","event_id":"sha256:1e444c8e03d82e6727af84a736de14420ab83d6799dde565e000bcbd3ba22997"}],"timestamp_proofs":[],"mirror_hints":[{"mirror_type":"https","name":"Pith Resolver","base_url":"https://pith.science","bundle_url":"https://pith.science/pith/7XXD53ZVILIQSOYY4DUGYIUGFJ/bundle.json","state_url":"https://pith.science/pith/7XXD53ZVILIQSOYY4DUGYIUGFJ/state.json","well_known_bundle_url":"https://pith.science/.well-known/pith/7XXD53ZVILIQSOYY4DUGYIUGFJ/bundle.json","status":"primary"}],"public_keys":[{"key_id":"pith-v1-2026-05","algorithm":"ed25519","format":"raw","public_key_b64":"stVStoiQhXFxp4s2pdzPNoqVNBMojDU/fJ2db5S3CbM=","public_key_hex":"b2d552b68890857171a78b36a5dccf368a953413288c353f7c9d9d6f94b709b3","fingerprint_sha256_b32_first128bits":"RVFV5Z2OI2J3ZUO7ERDEBCYNKS","fingerprint_sha256_hex":"8d4b5ee74e4693bcd1df2446408b0d54","rotates_at":null,"url":"https://pith.science/pith-signing-key.json","notes":"Pith uses this Ed25519 key to sign canonical record SHA-256 digests. Verify with: ed25519_verify(public_key, message=canonical_sha256_bytes, signature=base64decode(signature_b64))."}],"merge_version":"pith-open-graph-merge-v1","built_at":"2026-05-21T21:27:01Z","links":{"resolver":"https://pith.science/pith/7XXD53ZVILIQSOYY4DUGYIUGFJ","bundle":"https://pith.science/pith/7XXD53ZVILIQSOYY4DUGYIUGFJ/bundle.json","state":"https://pith.science/pith/7XXD53ZVILIQSOYY4DUGYIUGFJ/state.json","well_known_bundle":"https://pith.science/.well-known/pith/7XXD53ZVILIQSOYY4DUGYIUGFJ/bundle.json"},"state":{"state_type":"pith_open_graph_state","state_version":"1.0","pith_number":"pith:2026:7XXD53ZVILIQSOYY4DUGYIUGFJ","merge_version":"pith-open-graph-merge-v1","event_count":2,"valid_event_count":2,"invalid_event_count":0,"equivocation_count":0,"current":{"canonical_record":{"metadata":{"abstract_canon_sha256":"f36363aeb17278e6614f92c09d803c81c6412260aa44d7144dc425abb4752387","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:48:48Z","title_canon_sha256":"0325f8519ecda770bf59ea176cf534a0d557d5bd580a2d466092b37b6364f9bb"},"schema_version":"1.0","source":{"id":"2605.16339","kind":"arxiv","version":1}},"source_aliases":[{"alias_kind":"arxiv","alias_value":"2605.16339","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"arxiv_version","alias_value":"2605.16339v1","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"doi","alias_value":"10.48550/arxiv.2605.16339","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"pith_short_12","alias_value":"7XXD53ZVILIQ","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"pith_short_16","alias_value":"7XXD53ZVILIQSOYY","created_at":"2026-05-20T00:02:17Z"},{"alias_kind":"pith_short_8","alias_value":"7XXD53ZV","created_at":"2026-05-20T00:02:17Z"}],"graph_snapshots":[{"event_id":"sha256:1e444c8e03d82e6727af84a736de14420ab83d6799dde565e000bcbd3ba22997","target":"graph","created_at":"2026-05-20T00:02:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"graph_snapshot":{"author_claims":{"count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57","strong_count":0},"builder_version":"pith-number-builder-2026-05-17-v1","claims":{"count":0,"items":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"formal_canon":{"evidence_count":0,"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"integrity":{"available":true,"clean":true,"detectors_run":[],"endpoint":"/pith/2605.16339/integrity.json","findings":[],"snapshot_sha256":"c28c3603d3b5d939e8dc4c7e95fa8dfce3d595e45f758748cecf8e644a296938","summary":{"advisory":0,"by_detector":{},"critical":0,"informational":0}},"paper":{"abstract_excerpt":"Preference learning in large language models relies on reward models as proxies for human judgment. However, these models frequently exhibit preference instability, producing contradictory preference assignments in response to subtle, meaning-preserving input variations. We analyze this instability at the representation level under three semantic-preserving perturbation types: paraphrasing, pattern injection, and backdoor triggers. We attribute this instability to over-reliance on predictive yet brittle features, which we term unstable features, and isolate them via Sparse Autoencoders (SAEs) ","authors_text":"Belen Martin Urcelay, Francesco Croce, Shunchang Liu, Xin Chen","cross_cats":[],"headline":"","license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:48:48Z","title":"Preference Instability in Reward Models: Detection and Mitigation via Sparse Autoencoders"},"references":{"count":0,"internal_anchors":0,"resolved_work":0,"sample":[],"snapshot_sha256":"258153158e38e3291e3d48162225fcdb2d5a3ed65a07baac614ab91432fd4f57"},"source":{"id":"2605.16339","kind":"arxiv","version":1},"verdict":{"created_at":null,"id":null,"model_set":{},"one_line_summary":"","pipeline_version":null,"pith_extraction_headline":"","strongest_claim":"","weakest_assumption":""}},"verdict_id":null}}],"author_attestations":[],"timestamp_anchors":[],"storage_attestations":[],"citation_signatures":[],"replication_records":[],"corrections":[],"mirror_hints":[],"record_created":{"event_id":"sha256:911d7fc0365facd4c9a9451ae99901c69c94c288ebb571167a76ee5a43f15c78","target":"record","created_at":"2026-05-20T00:02:17Z","signer":{"key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signer_id":"pith.science","signer_type":"pith_registry"},"payload":{"attestation_state":"computed","canonical_record":{"metadata":{"abstract_canon_sha256":"f36363aeb17278e6614f92c09d803c81c6412260aa44d7144dc425abb4752387","cross_cats_sorted":[],"license":"http://arxiv.org/licenses/nonexclusive-distrib/1.0/","primary_cat":"cs.LG","submitted_at":"2026-05-07T16:48:48Z","title_canon_sha256":"0325f8519ecda770bf59ea176cf534a0d557d5bd580a2d466092b37b6364f9bb"},"schema_version":"1.0","source":{"id":"2605.16339","kind":"arxiv","version":1}},"canonical_sha256":"fdee3eef3542d1093b18e0e86c22862a4a8f4c961fad475bbaa1f54ea737f5a0","receipt":{"algorithm":"ed25519","builder_version":"pith-number-builder-2026-05-17-v1","canonical_sha256":"fdee3eef3542d1093b18e0e86c22862a4a8f4c961fad475bbaa1f54ea737f5a0","first_computed_at":"2026-05-20T00:02:17.428692Z","key_id":"pith-v1-2026-05","kind":"pith_receipt","last_reissued_at":"2026-05-20T00:02:17.428692Z","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","receipt_version":"0.3","signature_b64":"IWl1ZAmzquiRLKvQa1qqr9SeN7lgSQcPE2lzFcZ4TQeTnyZRlQrjJ7KfDzpMWBnJ+/ir3xEMogV4mKwbZxdzAA==","signature_status":"signed_v1","signed_at":"2026-05-20T00:02:17.429160Z","signed_message":"canonical_sha256_bytes"},"source_id":"2605.16339","source_kind":"arxiv","source_version":1}}},"equivocations":[],"invalid_events":[],"applied_event_ids":["sha256:911d7fc0365facd4c9a9451ae99901c69c94c288ebb571167a76ee5a43f15c78","sha256:1e444c8e03d82e6727af84a736de14420ab83d6799dde565e000bcbd3ba22997"],"state_sha256":"f2fdd60c354a5423f288499263282e3c54507df962817556715147c0c0ef7994"},"bundle_signature":{"signature_status":"signed_v1","algorithm":"ed25519","key_id":"pith-v1-2026-05","public_key_fingerprint":"8d4b5ee74e4693bcd1df2446408b0d54","signature_b64":"SaiWCYdAuGK8vBkXI1gRsHs51js54sJ/z3iYhXZJx5wmBoJKgPBf4Tv0K6sSZk9jKwTQuEtZVbN/cCI5BcInDA==","signed_message":"bundle_sha256_bytes","signed_at":"2026-05-21T21:27:01.337441Z","bundle_sha256":"3c4b776c00c248003a58e6950a4acc4b42b6c1b3a4cf639847a9a155ea9d7b7c"}}